; MathgeomGLS — SSE/SSE2 vector & matrix math routines
; (excerpt of a 4662-line, 144.6 KB source file)
1; MacOS64 uses the System V AMD64 ABI:
2; * First 6 (up to) 64-bit int/pointer parameters in RDI, RSI, RDX, RCX, R8, R9
3; * First 8 float parameters in XMM0-XMM7
4; * (Up to) 64-bit int return value: RAX
5; * (Up to) 128-bit int return value: RAX, RDX
6; * Float return value(s): XMM0, XMM1. This applies to records with float values
; as well: the first 2 Single values are returned in XMM0 and the next two in
8; XMM1 (so for a TVector4, XMM0 contains X and Y, and XMM1 contains Z and W).
9; Use "movhlps xmm1, xmm0" to copy the upper 2 floats from xmm0 to the lower
10; 2 floats of xmm1
11; * For return values larger than 128 bits, the first parameter (RDI) will be
12; set by the caller to the address of the return value (and all other
13; parameters move one up).
14; * RBX, RBP and R12-R15 must be saved
15; * RAX, RCX, RDX, RSI, RDI, R8-R11 can be modified
; * All XMM registers can be modified
17; * For leaf-node functions (that don't call other functions), the 128 bytes
18; below the stack pointer (the red-zone) can be freely used.
19; * The ".data" segment is aligned, so you can use "movaps" and friends
20; * Parameter pointers do *not* have to be aligned, so you should use "movups"
21
BITS 64

section .data

; Keep every constant 16-byte aligned so they can be fetched with movaps
; and used directly as memory operands of packed SSE instructions.
ALIGN 16

; SSE rounding modes (bits in MXCSR register)
%define SSE_ROUND_MASK 0xFFFF9FFF
%define SSE_ROUND_NEAREST 0x00000000
%define SSE_ROUND_DOWN 0x00002000
%define SSE_ROUND_UP 0x00004000
%define SSE_ROUND_TRUNC 0x00006000

; These constants fit in a single XMM register. These values represent
; sign-bits as used by 32-bit floating-point values.
; XOR'ing a floating-point value with 0x80000000 swaps the sign.
; XOR'ing a floating-point value with 0x00000000 leaves the value unchanged.
kSSE_MASK_SIGN:
dd 0x80000000, 0x80000000, 0x80000000, 0x80000000

; Negate/Pass masks: N = flip sign of that lane, P = leave it unchanged.
kSSE_MASK_NPNP:
dd 0x80000000, 0x00000000, 0x80000000, 0x00000000

kSSE_MASK_PNPN:
dd 0x00000000, 0x80000000, 0x00000000, 0x80000000

; Keeps the first three lanes, clears the fourth (W) lane.
kSSE_MASK_0FFF:
dd 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000

; These constants mask off an element of the binary representation of a
; 32-bit floating-point value.
kSSE_MASK_FRACTION:
dd 0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF

kSSE_MASK_EXPONENT:
dd 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000

; AND'ing with this mask clears the sign bit => Abs(x).
kSSE_MASK_ABS_VAL:
dd 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF

; Commonly used floating-point values
kSSE_ONE_HALF:
dd 0.5, 0.5, 0.5, 0.5

kSSE_ONE:
dd 1.0, 1.0, 1.0, 1.0

kSSE_TWO:
dd 2.0, 2.0, 2.0, 2.0

kSSE_THREE:
dd 3.0, 3.0, 3.0, 3.0

; Pi / 180 — degrees-to-radians conversion factor.
kSSE_PI_OVER_180:
dd 0.01745329251994329576923690768489, 0.01745329251994329576923690768489, 0.01745329251994329576923690768489, 0.01745329251994329576923690768489

; 180 / Pi — radians-to-degrees conversion factor.
kSSE_180_OVER_PI:
dd 57.295779513082320876798154814105, 57.295779513082320876798154814105, 57.295779513082320876798154814105, 57.295779513082320876798154814105

kSSE_NEG_INFINITY:
dd -__Infinity__, -__Infinity__, -__Infinity__, -__Infinity__

kSSE_PI_OVER_4:
dd 0.78539816339744830961566084581988, 0.78539816339744830961566084581988, 0.78539816339744830961566084581988, 0.78539816339744830961566084581988

; Commonly used integer values
kSSE_INT_ONE:
dd 1, 1, 1, 1

kSSE_INT_NOT_ONE:
dd 0xFFFFFFFE, 0xFFFFFFFE, 0xFFFFFFFE, 0xFFFFFFFE

kSSE_INT_TWO:
dd 2, 2, 2, 2

kSSE_INT_FOUR:
dd 4, 4, 4, 4

; Constants for approximating trigonometric functions
; (Cephes-style range reduction and minimax polynomials.)
; FOPI = 4 / Pi, used to reduce the argument to an octant index.
kSSE_FOPI:
dd 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516

kSSE_SINCOF_P0:
dd -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4

kSSE_SINCOF_P1:
dd 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3

kSSE_SINCOF_P2:
dd -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1

kSSE_COSCOF_P0:
dd 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005

kSSE_COSCOF_P1:
dd -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003

kSSE_COSCOF_P2:
dd 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002

; Constants for the fast exp approximation.
; NOTE(review): values look like a Schraudolph-style exponent trick
; (A2 = 1065353216.0 = float bit pattern of 1.0f as an integer) — confirm
; against the _fast_exp_* implementations before relying on this reading.
kSSE_EXP_A1:
dd 12102203.1615614, 12102203.1615614, 12102203.1615614, 12102203.1615614

kSSE_EXP_A2:
dd 1065353216.0, 1065353216.0, 1065353216.0, 1065353216.0

kSSE_EXP_CST:
dd 2139095040.0, 2139095040.0, 2139095040.0, 2139095040.0

; Polynomial coefficients for the fast exp approximation.
kSSE_EXP_F1:
dd 0.509964287281036376953125, 0.509964287281036376953125, 0.509964287281036376953125, 0.509964287281036376953125

kSSE_EXP_F2:
dd 0.3120158612728118896484375, 0.3120158612728118896484375, 0.3120158612728118896484375, 0.3120158612728118896484375

kSSE_EXP_F3:
dd 0.1666135489940643310546875, 0.1666135489940643310546875, 0.1666135489940643310546875, 0.1666135489940643310546875

kSSE_EXP_F4:
dd -2.12528370320796966552734375e-3, -2.12528370320796966552734375e-3, -2.12528370320796966552734375e-3, -2.12528370320796966552734375e-3

kSSE_EXP_F5:
dd 1.3534179888665676116943359375e-2, 1.3534179888665676116943359375e-2, 1.3534179888665676116943359375e-2, 1.3534179888665676116943359375e-2

; 0x3F800000 = bit pattern of 1.0f (used for exponent manipulation).
kSSE_EXP_I1:
dd 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000

; Constants for the fast natural-logarithm approximation.
kSSE_LN_CST:
dd -89.93423858, -89.93423858, -89.93423858, -89.93423858

kSSE_LN_F1:
dd 3.3977745, 3.3977745, 3.3977745, 3.3977745

kSSE_LN_F2:
dd 2.2744832, 2.2744832, 2.2744832, 2.2744832

kSSE_LN_F3:
dd 0.024982445, 0.024982445, 0.024982445, 0.024982445

kSSE_LN_F4:
dd 0.24371102, 0.24371102, 0.24371102, 0.24371102

; Ln(2)
kSSE_LN_F5:
dd 0.69314718055995, 0.69314718055995, 0.69314718055995, 0.69314718055995

; Constants for the fast log2 approximation.
; 0x3F000000 = bit pattern of 0.5f.
kSSE_LOG2_I1:
dd 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000

; 1/2^23 — scales a raw float bit pattern down to its exponent+mantissa value.
kSSE_LOG2_F1:
dd 1.1920928955078125e-7, 1.1920928955078125e-7, 1.1920928955078125e-7, 1.1920928955078125e-7

kSSE_LOG2_F2:
dd 124.22551499, 124.22551499, 124.22551499, 124.22551499

kSSE_LOG2_F3:
dd 1.498030302, 1.498030302, 1.498030302, 1.498030302

kSSE_LOG2_F4:
dd 1.72587999, 1.72587999, 1.72587999, 1.72587999

kSSE_LOG2_F5:
dd 0.3520887068, 0.3520887068, 0.3520887068, 0.3520887068

; Constants for the fast exp2 approximation.
kSSE_EXP2_F1:
dd 121.2740575, 121.2740575, 121.2740575, 121.2740575

kSSE_EXP2_F2:
dd 27.7280233, 27.7280233, 27.7280233, 27.7280233

kSSE_EXP2_F3:
dd 4.84252568, 4.84252568, 4.84252568, 4.84252568

kSSE_EXP2_F4:
dd 1.49012907, 1.49012907, 1.49012907, 1.49012907

; 2^23 — used to rebuild a float from an integer exponent.
kSSE_EXP2_F5:
dd 8388608.0, 8388608.0, 8388608.0, 8388608.00000
section .text

; SysV AMD64 integer/pointer argument registers (see ABI notes at top of file).
%define Param1 rdi
%define Param2 rsi
%define Param3 rdx
%define Self rdi
; Scratch slots inside the 128-byte red zone below rsp.
; Only safe in leaf functions (none of these routines call out).
%define OldFlags rsp-32
%define NewFlags rsp-48

; Exported entry points. The leading underscore is the Mach-O (macOS)
; C symbol mangling convention.
global _radians_vector2, _radians_vector3, _radians_vector4
global _degrees_vector2, _degrees_vector3, _degrees_vector4
global _sqrt_single, _sqrt_vector2, _sqrt_vector3, _sqrt_vector4
global _inverse_sqrt_single, _inverse_sqrt_vector2, _inverse_sqrt_vector3, _inverse_sqrt_vector4
global _fast_sin_single, _fast_sin_vector2, _fast_sin_vector3, _fast_sin_vector4
global _fast_cos_single, _fast_cos_vector2, _fast_cos_vector3, _fast_cos_vector4
global _fast_sin_cos_single, _fast_sin_cos_vector2, _fast_sin_cos_vector3, _fast_sin_cos_vector4
global _fast_exp_single, _fast_exp_vector2, _fast_exp_vector3, _fast_exp_vector4
global _fast_ln_single, _fast_ln_vector2, _fast_ln_vector3, _fast_ln_vector4
global _fast_log2_single, _fast_log2_vector2, _fast_log2_vector3, _fast_log2_vector4
global _fast_exp2_single, _fast_exp2_vector2, _fast_exp2_vector3, _fast_exp2_vector4
global _abs_vector3, _abs_vector4
global _sign_single, _sign_vector2, _sign_vector3, _sign_vector4
global _floor_single, _floor_vector2, _floor_vector3, _floor_vector4
global _trunc_single, _trunc_vector2, _trunc_vector3, _trunc_vector4
global _round_single,_round_vector2, _round_vector3, _round_vector4
global _ceil_single, _ceil_vector2, _ceil_vector3, _ceil_vector4
global _frac_vector2, _frac_vector3, _frac_vector4
global _fmod_vector2_single, _fmod_vector3_single, _fmod_vector4_single
global _fmod_vector2, _fmod_vector3, _fmod_vector4
global _modf_vector2, _modf_vector3, _modf_vector4
global _min_vector2_single, _min_vector3_single, _min_vector4_single
global _min_vector2, _min_vector3, _min_vector4
global _max_vector2_single, _max_vector3_single, _max_vector4_single
global _max_vector2, _max_vector3, _max_vector4
global _ensure_range_single
global _ensure_range_vector2_single, _ensure_range_vector3_single, _ensure_range_vector4_single
global _ensure_range_vector2, _ensure_range_vector3, _ensure_range_vector4
global _mix_vector3_single, _mix_vector4_single
global _mix_vector3, _mix_vector4
global _step_single_vector2, _step_single_vector3, _step_single_vector4
global _step_vector2, _step_vector3, _step_vector4
global _smooth_step_single_vector3, _smooth_step_single_vector4
global _smooth_step_vector3, _smooth_step_vector4
global _fma_vector2, _fma_vector3, _fma_vector4
global _outer_product_matrix2, _outer_product_matrix3, _outer_product_matrix4
global _vector2_div_single, _single_div_vector2, _vector2_div_vector2
global _vector2_normalize_fast, _vector2_set_normalized_fast
global _vector3_add_single, _single_add_vector3, _vector3_add_vector3
global _vector3_sub_single, _single_sub_vector3, _vector3_sub_vector3
global _vector3_mul_single, _single_mul_vector3, _vector3_mul_vector3
global _vector3_div_single, _single_div_vector3, _vector3_div_vector3
global _vector3_distance, _vector3_distance_squared
global _vector3_get_length, _vector3_get_length_squared
global _vector3_normalize_fast, _vector3_set_normalized_fast
global _vector3_reflect, _vector3_refract
global _vector4_add_single, _single_add_vector4, _vector4_add_vector4
global _vector4_sub_single, _single_sub_vector4, _vector4_sub_vector4
global _vector4_mul_single, _single_mul_vector4, _vector4_mul_vector4
global _vector4_div_single, _single_div_vector4, _vector4_div_vector4
global _vector4_negative
global _vector4_distance, _vector4_distance_squared
global _vector4_face_forward
global _vector4_get_length, _vector4_get_length_squared
global _vector4_normalize_fast, _vector4_set_normalized_fast
global _vector4_reflect, _vector4_refract
global _matrix3_add_single, _single_add_matrix3, _matrix3_add_matrix3
global _matrix3_sub_single, _single_sub_matrix3, _matrix3_sub_matrix3
global _matrix3_mul_single, _single_mul_matrix3, _matrix3_comp_mult
global _matrix3_mul_vector3, _vector3_mul_matrix3, _matrix3_mul_matrix3
global _matrix3_div_single, _single_div_matrix3
global _matrix3_negative, _matrix3_transpose, _matrix3_set_transposed
global _matrix4_add_single, _single_add_matrix4, _matrix4_add_matrix4
global _matrix4_sub_single, _single_sub_matrix4, _matrix4_sub_matrix4
global _matrix4_mul_single, _single_mul_matrix4, _matrix4_comp_mult
global _matrix4_mul_vector4, _vector4_mul_matrix4, _matrix4_mul_matrix4
global _matrix4_div_single, _single_div_matrix4
global _matrix4_negative, _matrix4_inverse, _matrix4_set_inversed
global _matrix4_transpose, _matrix4_set_transposed
279;****************************************************************************
280; Angle and Trigonometry Functions
281;****************************************************************************
282
_radians_vector2:
    ; TVector2 := ADegrees * Pi/180
    ; In:  Param1 -> 2 unaligned singles. Out: xmm0 (low 2 lanes).
    movq    xmm0, [Param1]                ; load X,Y; upper lanes zeroed
    mulps   xmm0, [rel kSSE_PI_OVER_180]  ; .data is aligned: direct memory operand
    ret
288
_radians_vector3:
    ; TVector3 := ADegrees * Pi/180
    ; In:  Param1 -> 3 unaligned singles.
    ; Out: xmm0 = X,Y (low 2 lanes); xmm1 = Z (low lane), per the return
    ;      convention described at the top of the file.
    movsd   xmm0, [Param1]                ; load X,Y; upper qword zeroed
    movss   xmm1, [Param1+8]              ; load Z
    mulps   xmm0, [rel kSSE_PI_OVER_180]  ; scale X,Y
    mulss   xmm1, [rel kSSE_PI_OVER_180]  ; scale Z
    ret
296
_radians_vector4:
    ; TVector4 := ADegrees * Pi/180
    ; In:  Param1 -> 4 unaligned singles.
    ; Out: xmm0 = X,Y; xmm1 = Z,W (moved down with movhlps).
    movups  xmm0, [Param1]                ; parameter pointer may be unaligned
    mulps   xmm0, [rel kSSE_PI_OVER_180]  ; .data is aligned: direct memory operand
    movhlps xmm1, xmm0                    ; copy Z,W into xmm1's low 2 lanes
    ret
303
_degrees_vector2:
    ; TVector2 := ARadians * 180/Pi
    ; In:  Param1 -> 2 unaligned singles. Out: xmm0 (low 2 lanes).
    movq    xmm0, [Param1]                ; load X,Y; upper lanes zeroed
    mulps   xmm0, [rel kSSE_180_OVER_PI]  ; .data is aligned: direct memory operand
    ret
309
_degrees_vector3:
    ; TVector3 := ARadians * 180/Pi
    ; In:  Param1 -> 3 unaligned singles.
    ; Out: xmm0 = X,Y (low 2 lanes); xmm1 = Z (low lane).
    movsd   xmm0, [Param1]                ; load X,Y; upper qword zeroed
    movss   xmm1, [Param1+8]              ; load Z
    mulps   xmm0, [rel kSSE_180_OVER_PI]  ; scale X,Y
    mulss   xmm1, [rel kSSE_180_OVER_PI]  ; scale Z
    ret
317
_degrees_vector4:
    ; TVector4 := ARadians * 180/Pi
    ; In:  Param1 -> 4 unaligned singles.
    ; Out: xmm0 = X,Y; xmm1 = Z,W.
    movups  xmm0, [Param1]                ; parameter pointer may be unaligned
    mulps   xmm0, [rel kSSE_180_OVER_PI]  ; .data is aligned: direct memory operand
    movhlps xmm1, xmm0                    ; copy Z,W into xmm1's low 2 lanes
    ret
324
325;****************************************************************************
326; Exponential Functions
327;****************************************************************************
328
; Single := Sqrt(A). In/out: xmm0 (scalar single).
_sqrt_single:
sqrtss xmm0, xmm0
ret
332
_sqrt_vector2:
    ; TVector2 := Sqrt(A), element-wise.
    ; In:  Param1 -> 2 unaligned singles. Out: xmm0 (low 2 lanes).
    movq    xmm0, [Param1]   ; load X,Y; upper lanes zeroed (sqrt(0)=0 there)
    sqrtps  xmm0, xmm0
    ret
337
_sqrt_vector3:
    ; TVector3 := Sqrt(A), element-wise.
    ; In:  Param1 -> 3 unaligned singles.
    ; Out: xmm0 = Sqrt(X),Sqrt(Y); xmm1 = Sqrt(Z) in the low lane.
    movsd   xmm0, [Param1]    ; X,Y
    movss   xmm1, [Param1+8]  ; Z (upper lanes zeroed by movss load)
    sqrtps  xmm0, xmm0
    sqrtss  xmm1, xmm1        ; scalar: only lane 0 of xmm1 is meaningful
    ret
344
; TVector4 := Sqrt(A), element-wise.
; In:  Param1 -> 4 unaligned singles.
; Out: xmm0 = Sqrt(X),Sqrt(Y); xmm1 = Sqrt(Z),Sqrt(W).
_sqrt_vector4:
movups xmm0, [Param1]
sqrtps xmm0, xmm0
movhlps xmm1, xmm0
ret
350
; Single := 1/Sqrt(A), fast approximation.
; rsqrtss gives roughly 12 bits of precision (no Newton refinement here).
; In/out: xmm0 (scalar single).
_inverse_sqrt_single:
rsqrtss xmm0, xmm0
ret
354
_inverse_sqrt_vector2:
    ; TVector2 := 1/Sqrt(A), element-wise fast approximation (~12 bits).
    ; In:  Param1 -> 2 unaligned singles. Out: xmm0 (low 2 lanes).
    movq    xmm0, [Param1]   ; load X,Y; upper lanes zeroed
    rsqrtps xmm0, xmm0
    ret
359
_inverse_sqrt_vector3:
    ; TVector3 := 1/Sqrt(A), element-wise fast approximation (~12 bits).
    ; In:  Param1 -> 3 unaligned singles.
    ; Out: xmm0 = result X,Y; xmm1 = result Z in the low lane.
    movsd   xmm0, [Param1]    ; X,Y
    movss   xmm1, [Param1+8]  ; Z
    rsqrtps xmm0, xmm0
    rsqrtss xmm1, xmm1        ; scalar: only lane 0 of xmm1 is meaningful
    ret
366
; TVector4 := 1/Sqrt(A), element-wise fast approximation (~12 bits).
; In:  Param1 -> 4 unaligned singles.
; Out: xmm0 = result X,Y; xmm1 = result Z,W.
_inverse_sqrt_vector4:
movups xmm0, [Param1]
rsqrtps xmm0, xmm0
movhlps xmm1, xmm0
ret
372
373;****************************************************************************
374; Fast approximate Functions
375;****************************************************************************
376
;-----------------------------------------------------------------------
; Single := FastSin(ARadians)
; Cephes-style approximation: reduce the argument to an octant
; (J := round(|x| * 4/Pi), forced even), evaluate a sine or cosine
; minimax polynomial depending on the octant, then restore the sign.
; In:   xmm0 = ARadians
; Out:  xmm0 = approximate sine
; Uses: xmm1-xmm7 (all volatile under SysV AMD64)
; NOTE(review): cvtps2dq rounds per MXCSR (nearest by default) although
; the comment says Trunc — confirm whether cvttps2dq was intended.
;-----------------------------------------------------------------------
_fast_sin_single:
movss xmm2, [rel kSSE_MASK_ABS_VAL]
movaps xmm1, xmm0
movss xmm3, [rel kSSE_MASK_SIGN]
andps xmm0, xmm2 ; (xmm0) X := Abs(ARadians)
andps xmm1, xmm3 ; (xmm1) SignBit
movaps xmm2, xmm0
movss xmm4, [rel kSSE_FOPI]
movss xmm5, [rel kSSE_INT_ONE]
mulss xmm2, xmm4
movss xmm6, [rel kSSE_INT_NOT_ONE]
cvtps2dq xmm2, xmm2 ; J := Trunc(X * FOPI)
movss xmm7, [rel kSSE_INT_FOUR]
paddd xmm2, xmm5
pand xmm2, xmm6 ; (xmm2) J := (J + 1) and (not 1)
movss xmm6, [rel kSSE_INT_TWO]
cvtdq2ps xmm4, xmm2 ; (xmm4) Y := J
movaps xmm5, xmm2
pand xmm2, xmm6 ; J and 2
pand xmm5, xmm7 ; J and 4
pxor xmm7, xmm7
pslld xmm5, 29 ; (xmm5) SwapSignBit := (J and 4) shl 29
pcmpeqd xmm2, xmm7 ; (xmm2) PolyMask := ((J and 2) = 0)? Yes: 0xFFFFFFFF, No: 0x00000000
movss xmm6, [rel kSSE_PI_OVER_4]
pxor xmm1, xmm5 ; (xmm1) SignBit := SignBit xor SwapSignBit
mulss xmm4, xmm6 ; Y * Pi / 4
movss xmm3, [rel kSSE_COSCOF_P0]
subss xmm0, xmm4 ; (xmm0) X := X - (Y * Pi / 4)
movss xmm4, [rel kSSE_COSCOF_P1]
movaps xmm7, xmm0
movss xmm6, [rel kSSE_COSCOF_P2]
mulss xmm7, xmm7 ; (xmm7) Z := X * X
movss xmm5, [rel kSSE_SINCOF_P1]
mulss xmm3, xmm7 ; COSCOF_P0 * Z
addss xmm3, xmm4 ; Y := COSCOF_P0 * Z + COSCOF_P1
movss xmm4, [rel kSSE_ONE_HALF]
mulss xmm3, xmm7 ; Y * Z
mulss xmm4, xmm7 ; Z * 0.5
; NOTE(review): addps on a scalar path — only lane 0 matters, harmless.
addps xmm3, xmm6 ; Y := (Y * Z) + COSCOF_P2
movss xmm6, [rel kSSE_ONE]
mulss xmm3, xmm7 ; Y * Z
mulss xmm3, xmm7 ; Y := Y * (Z * Z)
subss xmm3, xmm4 ; Y - Z * 0.5
movss xmm4, [rel kSSE_SINCOF_P0]
addps xmm3, xmm6 ; (xmm3) Y := Y - Z * 0.5 + 1
movss xmm6, [rel kSSE_SINCOF_P2]
mulss xmm4, xmm7 ; SINCOF_P0 * Z
addss xmm4, xmm5 ; Y2 := SINCOF_P0 * Z + SINCOF_P1
movaps xmm5, xmm2
mulss xmm4, xmm7 ; Y2 * Z
addss xmm4, xmm6 ; Y2 := (Y2 * Z) + SINCOF_P2
mulss xmm4, xmm7 ; Y2 * Z
mulss xmm4, xmm0 ; Y2 * (Z * X)
addss xmm4, xmm0 ; (xmm4) Y2 := Y2 * (Z * X) + X
andps xmm4, xmm2 ; Y2 := ((J and 2) = 0)? Yes: Y2, No: 0
andnps xmm5, xmm3 ; Y := ((J and 2) = 0)? Yes: 0 , No: Y
addss xmm4, xmm5
xorps xmm4, xmm1 ; (Y + Y2) xor SignBit
movss xmm0, xmm4
ret
437
;-----------------------------------------------------------------------
; TVector2 := FastSin(ARadians), element-wise.
; Packed (2-lane) version of _fast_sin_single — see that routine for the
; algorithm walkthrough. Constants are loaded with movlps (low 8 bytes).
; In:   Param1 -> 2 unaligned singles
; Out:  xmm0 = approximate sines (low 2 lanes)
; Uses: xmm1-xmm7
;-----------------------------------------------------------------------
_fast_sin_vector2:
movlps xmm0, [Param1]
movlps xmm2, [rel kSSE_MASK_ABS_VAL]
movaps xmm1, xmm0
movlps xmm3, [rel kSSE_MASK_SIGN]
andps xmm0, xmm2 ; (xmm0) X := Abs(ARadians)
andps xmm1, xmm3 ; (xmm1) SignBit
movaps xmm2, xmm0
movlps xmm4, [rel kSSE_FOPI]
movlps xmm5, [rel kSSE_INT_ONE]
mulps xmm2, xmm4
movlps xmm6, [rel kSSE_INT_NOT_ONE]
cvtps2dq xmm2, xmm2 ; J := Trunc(X * FOPI)
movlps xmm7, [rel kSSE_INT_FOUR]
paddd xmm2, xmm5
pand xmm2, xmm6 ; (xmm2) J := (J + 1) and (not 1)
movlps xmm6, [rel kSSE_INT_TWO]
cvtdq2ps xmm4, xmm2 ; (xmm4) Y := J
movaps xmm5, xmm2
pand xmm2, xmm6 ; J and 2
pand xmm5, xmm7 ; J and 4
pxor xmm7, xmm7
pslld xmm5, 29 ; (xmm5) SwapSignBit := (J and 4) shl 29
pcmpeqd xmm2, xmm7 ; (xmm2) PolyMask := ((J and 2) = 0)? Yes: 0xFFFFFFFF, No: 0x00000000
movlps xmm6, [rel kSSE_PI_OVER_4]
pxor xmm1, xmm5 ; (xmm1) SignBit := SignBit xor SwapSignBit
mulps xmm4, xmm6 ; Y * Pi / 4
movlps xmm3, [rel kSSE_COSCOF_P0]
subps xmm0, xmm4 ; (xmm0) X := X - (Y * Pi / 4)
movlps xmm4, [rel kSSE_COSCOF_P1]
movaps xmm7, xmm0
movlps xmm6, [rel kSSE_COSCOF_P2]
mulps xmm7, xmm7 ; (xmm7) Z := X * X
movlps xmm5, [rel kSSE_SINCOF_P1]
mulps xmm3, xmm7 ; COSCOF_P0 * Z
addps xmm3, xmm4 ; Y := COSCOF_P0 * Z + COSCOF_P1
movlps xmm4, [rel kSSE_ONE_HALF]
mulps xmm3, xmm7 ; Y * Z
mulps xmm4, xmm7 ; Z * 0.5
addps xmm3, xmm6 ; Y := (Y * Z) + COSCOF_P2
movlps xmm6, [rel kSSE_ONE]
mulps xmm3, xmm7 ; Y * Z
mulps xmm3, xmm7 ; Y := Y * (Z * Z)
subps xmm3, xmm4 ; Y - Z * 0.5
movlps xmm4, [rel kSSE_SINCOF_P0]
addps xmm3, xmm6 ; (xmm3) Y := Y - Z * 0.5 + 1
movlps xmm6, [rel kSSE_SINCOF_P2]
mulps xmm4, xmm7 ; SINCOF_P0 * Z
addps xmm4, xmm5 ; Y2 := SINCOF_P0 * Z + SINCOF_P1
movaps xmm5, xmm2
mulps xmm4, xmm7 ; Y2 * Z
addps xmm4, xmm6 ; Y2 := (Y2 * Z) + SINCOF_P2
mulps xmm4, xmm7 ; Y2 * Z
mulps xmm4, xmm0 ; Y2 * (Z * X)
addps xmm4, xmm0 ; (xmm4) Y2 := Y2 * (Z * X) + X
andps xmm4, xmm2 ; Y2 := ((J and 2) = 0)? Yes: Y2, No: 0
andnps xmm5, xmm3 ; Y := ((J and 2) = 0)? Yes: 0 , No: Y
addps xmm4, xmm5
xorps xmm4, xmm1 ; (Y + Y2) xor SignBit
movaps xmm0, xmm4
ret
499
;-----------------------------------------------------------------------
; TVector3 := FastSin(ARadians), element-wise.
; Packed (3-lane) version of _fast_sin_single — see that routine for the
; algorithm walkthrough. Z is packed into the high half with movlhps so
; all three lanes are processed in one pass.
; In:   Param1 -> 3 unaligned singles
; Out:  xmm0 = sin(X),sin(Y); xmm1 = sin(Z) (moved down with movhlps)
; Uses: xmm2-xmm7
;-----------------------------------------------------------------------
_fast_sin_vector3:
movq xmm0, [Param1]
movss xmm1, [Param1+8]
movlhps xmm0, xmm1
movaps xmm2, [rel kSSE_MASK_ABS_VAL]
movaps xmm1, xmm0
movaps xmm3, [rel kSSE_MASK_SIGN]
andps xmm0, xmm2 ; (xmm0) X := Abs(ARadians)
andps xmm1, xmm3 ; (xmm1) SignBit
movaps xmm2, xmm0
movaps xmm4, [rel kSSE_FOPI]
movaps xmm5, [rel kSSE_INT_ONE]
mulps xmm2, xmm4
movaps xmm6, [rel kSSE_INT_NOT_ONE]
cvtps2dq xmm2, xmm2 ; J := Trunc(X * FOPI)
movaps xmm7, [rel kSSE_INT_FOUR]
paddd xmm2, xmm5
pand xmm2, xmm6 ; (xmm2) J := (J + 1) and (not 1)
movaps xmm6, [rel kSSE_INT_TWO]
cvtdq2ps xmm4, xmm2 ; (xmm4) Y := J
movaps xmm5, xmm2
pand xmm2, xmm6 ; J and 2
pand xmm5, xmm7 ; J and 4
pxor xmm7, xmm7
pslld xmm5, 29 ; (xmm5) SwapSignBit := (J and 4) shl 29
pcmpeqd xmm2, xmm7 ; (xmm2) PolyMask := ((J and 2) = 0)? Yes: 0xFFFFFFFF, No: 0x00000000
movaps xmm6, [rel kSSE_PI_OVER_4]
pxor xmm1, xmm5 ; (xmm1) SignBit := SignBit xor SwapSignBit
mulps xmm4, xmm6 ; Y * Pi / 4
movaps xmm3, [rel kSSE_COSCOF_P0]
subps xmm0, xmm4 ; (xmm0) X := X - (Y * Pi / 4)
movaps xmm4, [rel kSSE_COSCOF_P1]
movaps xmm7, xmm0
movaps xmm6, [rel kSSE_COSCOF_P2]
mulps xmm7, xmm7 ; (xmm7) Z := X * X
movaps xmm5, [rel kSSE_SINCOF_P1]
mulps xmm3, xmm7 ; COSCOF_P0 * Z
addps xmm3, xmm4 ; Y := COSCOF_P0 * Z + COSCOF_P1
movaps xmm4, [rel kSSE_ONE_HALF]
mulps xmm3, xmm7 ; Y * Z
mulps xmm4, xmm7 ; Z * 0.5
addps xmm3, xmm6 ; Y := (Y * Z) + COSCOF_P2
movaps xmm6, [rel kSSE_ONE]
mulps xmm3, xmm7 ; Y * Z
mulps xmm3, xmm7 ; Y := Y * (Z * Z)
subps xmm3, xmm4 ; Y - Z * 0.5
movaps xmm4, [rel kSSE_SINCOF_P0]
addps xmm3, xmm6 ; (xmm3) Y := Y - Z * 0.5 + 1
movaps xmm6, [rel kSSE_SINCOF_P2]
mulps xmm4, xmm7 ; SINCOF_P0 * Z
addps xmm4, xmm5 ; Y2 := SINCOF_P0 * Z + SINCOF_P1
movaps xmm5, xmm2
mulps xmm4, xmm7 ; Y2 * Z
addps xmm4, xmm6 ; Y2 := (Y2 * Z) + SINCOF_P2
mulps xmm4, xmm7 ; Y2 * Z
mulps xmm4, xmm0 ; Y2 * (Z * X)
addps xmm4, xmm0 ; (xmm4) Y2 := Y2 * (Z * X) + X
andps xmm4, xmm2 ; Y2 := ((J and 2) = 0)? Yes: Y2, No: 0
andnps xmm5, xmm3 ; Y := ((J and 2) = 0)? Yes: 0 , No: Y
addps xmm4, xmm5
xorps xmm4, xmm1 ; (Y + Y2) xor SignBit
movaps xmm0, xmm4
movhlps xmm1, xmm4
ret
564
;-----------------------------------------------------------------------
; TVector4 := FastSin(ARadians), element-wise.
; Packed (4-lane) version of _fast_sin_single — see that routine for the
; algorithm walkthrough.
; In:   Param1 -> 4 unaligned singles
; Out:  xmm0 = sin(X),sin(Y); xmm1 = sin(Z),sin(W)
; Uses: xmm2-xmm7
;-----------------------------------------------------------------------
_fast_sin_vector4:
movups xmm0, [Param1]
movaps xmm2, [rel kSSE_MASK_ABS_VAL]
movaps xmm1, xmm0
movaps xmm3, [rel kSSE_MASK_SIGN]
andps xmm0, xmm2 ; (xmm0) X := Abs(ARadians)
andps xmm1, xmm3 ; (xmm1) SignBit
movaps xmm2, xmm0
movaps xmm4, [rel kSSE_FOPI]
movaps xmm5, [rel kSSE_INT_ONE]
mulps xmm2, xmm4
movaps xmm6, [rel kSSE_INT_NOT_ONE]
cvtps2dq xmm2, xmm2 ; J := Trunc(X * FOPI)
movaps xmm7, [rel kSSE_INT_FOUR]
paddd xmm2, xmm5
pand xmm2, xmm6 ; (xmm2) J := (J + 1) and (not 1)
movaps xmm6, [rel kSSE_INT_TWO]
cvtdq2ps xmm4, xmm2 ; (xmm4) Y := J
movaps xmm5, xmm2
pand xmm2, xmm6 ; J and 2
pand xmm5, xmm7 ; J and 4
pxor xmm7, xmm7
pslld xmm5, 29 ; (xmm5) SwapSignBit := (J and 4) shl 29
pcmpeqd xmm2, xmm7 ; (xmm2) PolyMask := ((J and 2) = 0)? Yes: 0xFFFFFFFF, No: 0x00000000
movaps xmm6, [rel kSSE_PI_OVER_4]
pxor xmm1, xmm5 ; (xmm1) SignBit := SignBit xor SwapSignBit
mulps xmm4, xmm6 ; Y * Pi / 4
movaps xmm3, [rel kSSE_COSCOF_P0]
subps xmm0, xmm4 ; (xmm0) X := X - (Y * Pi / 4)
movaps xmm4, [rel kSSE_COSCOF_P1]
movaps xmm7, xmm0
movaps xmm6, [rel kSSE_COSCOF_P2]
mulps xmm7, xmm7 ; (xmm7) Z := X * X
movaps xmm5, [rel kSSE_SINCOF_P1]
mulps xmm3, xmm7 ; COSCOF_P0 * Z
addps xmm3, xmm4 ; Y := COSCOF_P0 * Z + COSCOF_P1
movaps xmm4, [rel kSSE_ONE_HALF]
mulps xmm3, xmm7 ; Y * Z
mulps xmm4, xmm7 ; Z * 0.5
addps xmm3, xmm6 ; Y := (Y * Z) + COSCOF_P2
movaps xmm6, [rel kSSE_ONE]
mulps xmm3, xmm7 ; Y * Z
mulps xmm3, xmm7 ; Y := Y * (Z * Z)
subps xmm3, xmm4 ; Y - Z * 0.5
movaps xmm4, [rel kSSE_SINCOF_P0]
addps xmm3, xmm6 ; (xmm3) Y := Y - Z * 0.5 + 1
movaps xmm6, [rel kSSE_SINCOF_P2]
mulps xmm4, xmm7 ; SINCOF_P0 * Z
addps xmm4, xmm5 ; Y2 := SINCOF_P0 * Z + SINCOF_P1
movaps xmm5, xmm2
mulps xmm4, xmm7 ; Y2 * Z
addps xmm4, xmm6 ; Y2 := (Y2 * Z) + SINCOF_P2
mulps xmm4, xmm7 ; Y2 * Z
mulps xmm4, xmm0 ; Y2 * (Z * X)
addps xmm4, xmm0 ; (xmm4) Y2 := Y2 * (Z * X) + X
andps xmm4, xmm2 ; Y2 := ((J and 2) = 0)? Yes: Y2, No: 0
andnps xmm5, xmm3 ; Y := ((J and 2) = 0)? Yes: 0 , No: Y
addps xmm4, xmm5
xorps xmm4, xmm1 ; (Y + Y2) xor SignBit
movaps xmm0, xmm4
movhlps xmm1, xmm4
ret
627
;-----------------------------------------------------------------------
; Single := FastCos(ARadians)
; Cephes-style approximation, same octant reduction as _fast_sin_single
; but with the octant index shifted by 2 (cos(x) = sin(x + Pi/2)).
; In:   xmm0 = ARadians
; Out:  xmm0 = approximate cosine
; Uses: xmm1-xmm7
; NOTE(review): cvtps2dq rounds per MXCSR (nearest by default) although
; the comment says Trunc — confirm whether cvttps2dq was intended.
;-----------------------------------------------------------------------
_fast_cos_single:
movss xmm1, [rel kSSE_MASK_ABS_VAL]
movss xmm2, [rel kSSE_FOPI]
andps xmm0, xmm1 ; (xmm0) X := Abs(ARadians)
movss xmm3, [rel kSSE_INT_NOT_ONE]
movaps xmm1, xmm0
movss xmm4, [rel kSSE_INT_FOUR]
mulss xmm1, xmm2
movss xmm2, [rel kSSE_INT_ONE]
cvtps2dq xmm1, xmm1 ; J := Trunc(X * FOPI)
pxor xmm6, xmm6
paddd xmm1, xmm2
pand xmm1, xmm3 ; (xmm1) J := (J + 1) and (not 1)
movss xmm3, [rel kSSE_INT_TWO]
cvtdq2ps xmm2, xmm1 ; (xmm2) Y := J
psubd xmm1, xmm3 ; J - 2
movaps xmm5, xmm1
pandn xmm1, xmm4 ; (not (J - 2)) and 4
pand xmm5, xmm3 ; (J - 2) and 2
pslld xmm1, 29 ; (xmm1) SignBit := ((not (J - 2)) and 4) shl 29
movss xmm3, [rel kSSE_PI_OVER_4]
pcmpeqd xmm5, xmm6 ; (xmm5) PolyMask := ((J-2) and 2)=0)? Yes: 0xFFFFFFFF, No: 0x00000000
mulss xmm2, xmm3 ; Y * Pi / 4
movss xmm3, [rel kSSE_COSCOF_P1]
subss xmm0, xmm2 ; (xmm0) X := X - (Y * Pi / 4)
movss xmm2, [rel kSSE_COSCOF_P0]
movss xmm4, [rel kSSE_COSCOF_P2]
movaps xmm6, xmm0
mulss xmm6, xmm6 ; (xmm6) Z := X * X
mulss xmm2, xmm6 ; COSCOF_P0 * Z
; NOTE(review): addps on a scalar path — only lane 0 matters, harmless.
addps xmm2, xmm3 ; Y := COSCOF_P0 * Z + COSCOF_P1
movss xmm3, [rel kSSE_ONE_HALF]
mulss xmm2, xmm6 ; Y * Z
mulss xmm3, xmm6 ; Z * 0.5
addss xmm2, xmm4 ; Y := (Y * Z) + COSCOF_P2
movss xmm7, [rel kSSE_ONE]
mulss xmm2, xmm6
movss xmm4, [rel kSSE_SINCOF_P1]
mulss xmm2, xmm6 ; Y := Y * (Z * Z)
subss xmm2, xmm3 ; Y - Z * 0.5
addss xmm2, xmm7 ; (xmm2) Y := Y - Z * 0.5 + 1
movss xmm3, [rel kSSE_SINCOF_P0]
movss xmm7, [rel kSSE_SINCOF_P2]
mulss xmm3, xmm6 ; SINCOF_P0 * Z
addss xmm3, xmm4 ; Y2 := SINCOF_P0 * Z + SINCOF_P1
mulss xmm3, xmm6 ; Y2 * Z
addss xmm3, xmm7 ; Y2 := (Y2 * Z) + SINCOF_P2
mulss xmm3, xmm6 ; Y2 * Z
mulss xmm3, xmm0 ; Y2 * (Z * X)
addss xmm3, xmm0 ; Y2 := Y2 * (Z * X) + X
andps xmm3, xmm5 ; ((J-2) and 2) = 0)? Yes: Y2, No: 0
andnps xmm5, xmm2 ; ((J-2) and 2) = 0)? Yes: 0 , No: Y
addss xmm3, xmm5
xorps xmm3, xmm1 ; (Y + Y2) xor SignBit
movss xmm0, xmm3
ret
684
;-----------------------------------------------------------------------
; TVector2 := FastCos(ARadians), element-wise.
; Packed (2-lane) version of _fast_cos_single — see that routine for the
; algorithm walkthrough.
; In:   Param1 -> 2 unaligned singles
; Out:  xmm0 = approximate cosines (low 2 lanes)
; Uses: xmm1-xmm7
;-----------------------------------------------------------------------
_fast_cos_vector2:
movlps xmm0, [Param1]
movlps xmm1, [rel kSSE_MASK_ABS_VAL]
movlps xmm2, [rel kSSE_FOPI]
andps xmm0, xmm1 ; (xmm0) X := Abs(ARadians)
movlps xmm3, [rel kSSE_INT_NOT_ONE]
movaps xmm1, xmm0
movlps xmm4, [rel kSSE_INT_FOUR]
mulps xmm1, xmm2
movlps xmm2, [rel kSSE_INT_ONE]
cvtps2dq xmm1, xmm1 ; J := Trunc(X * FOPI)
pxor xmm6, xmm6
paddd xmm1, xmm2
pand xmm1, xmm3 ; (xmm1) J := (J + 1) and (not 1)
movlps xmm3, [rel kSSE_INT_TWO]
cvtdq2ps xmm2, xmm1 ; (xmm2) Y := J
psubd xmm1, xmm3 ; J - 2
movaps xmm5, xmm1
pandn xmm1, xmm4 ; (not (J - 2)) and 4
pand xmm5, xmm3 ; (J - 2) and 2
pslld xmm1, 29 ; (xmm1) SignBit := ((not (J - 2)) and 4) shl 29
movlps xmm3, [rel kSSE_PI_OVER_4]
pcmpeqd xmm5, xmm6 ; (xmm5) PolyMask := ((J-2) and 2)=0)? Yes: 0xFFFFFFFF, No: 0x00000000
mulps xmm2, xmm3 ; Y * Pi / 4
movlps xmm3, [rel kSSE_COSCOF_P1]
subps xmm0, xmm2 ; (xmm0) X := X - (Y * Pi / 4)
movlps xmm2, [rel kSSE_COSCOF_P0]
movlps xmm4, [rel kSSE_COSCOF_P2]
movaps xmm6, xmm0
mulps xmm6, xmm6 ; (xmm6) Z := X * X
mulps xmm2, xmm6 ; COSCOF_P0 * Z
addps xmm2, xmm3 ; Y := COSCOF_P0 * Z + COSCOF_P1
movlps xmm3, [rel kSSE_ONE_HALF]
mulps xmm2, xmm6 ; Y * Z
mulps xmm3, xmm6 ; Z * 0.5
addps xmm2, xmm4 ; Y := (Y * Z) + COSCOF_P2
movlps xmm7, [rel kSSE_ONE]
mulps xmm2, xmm6
movlps xmm4, [rel kSSE_SINCOF_P1]
mulps xmm2, xmm6 ; Y := Y * (Z * Z)
subps xmm2, xmm3 ; Y - Z * 0.5
addps xmm2, xmm7 ; (xmm2) Y := Y - Z * 0.5 + 1
movlps xmm3, [rel kSSE_SINCOF_P0]
movlps xmm7, [rel kSSE_SINCOF_P2]
mulps xmm3, xmm6 ; SINCOF_P0 * Z
addps xmm3, xmm4 ; Y2 := SINCOF_P0 * Z + SINCOF_P1
mulps xmm3, xmm6 ; Y2 * Z
addps xmm3, xmm7 ; Y2 := (Y2 * Z) + SINCOF_P2
mulps xmm3, xmm6 ; Y2 * Z
mulps xmm3, xmm0 ; Y2 * (Z * X)
addps xmm3, xmm0 ; Y2 := Y2 * (Z * X) + X
andps xmm3, xmm5 ; ((J-2) and 2) = 0)? Yes: Y2, No: 0
andnps xmm5, xmm2 ; ((J-2) and 2) = 0)? Yes: 0 , No: Y
addps xmm3, xmm5
xorps xmm3, xmm1 ; (Y + Y2) xor SignBit
movaps xmm0, xmm3
ret
742
;-----------------------------------------------------------------------
; TVector3 := FastCos(ARadians), element-wise.
; Packed (3-lane) version of _fast_cos_single — see that routine for the
; algorithm walkthrough. Z is packed into the high half with movlhps.
; In:   Param1 -> 3 unaligned singles
; Out:  xmm0 = cos(X),cos(Y); xmm1 = cos(Z) (moved down with movhlps)
; Uses: xmm2-xmm7
;-----------------------------------------------------------------------
_fast_cos_vector3:
movq xmm0, [Param1]
movss xmm1, [Param1+8]
movlhps xmm0, xmm1
movaps xmm1, [rel kSSE_MASK_ABS_VAL]
movaps xmm2, [rel kSSE_FOPI]
andps xmm0, xmm1 ; (xmm0) X := Abs(ARadians)
movaps xmm3, [rel kSSE_INT_NOT_ONE]
movaps xmm1, xmm0
movaps xmm4, [rel kSSE_INT_FOUR]
mulps xmm1, xmm2
movaps xmm2, [rel kSSE_INT_ONE]
cvtps2dq xmm1, xmm1 ; J := Trunc(X * FOPI)
pxor xmm6, xmm6
paddd xmm1, xmm2
pand xmm1, xmm3 ; (xmm1) J := (J + 1) and (not 1)
movaps xmm3, [rel kSSE_INT_TWO]
cvtdq2ps xmm2, xmm1 ; (xmm2) Y := J
psubd xmm1, xmm3 ; J - 2
movaps xmm5, xmm1
pandn xmm1, xmm4 ; (not (J - 2)) and 4
pand xmm5, xmm3 ; (J - 2) and 2
pslld xmm1, 29 ; (xmm1) SignBit := ((not (J - 2)) and 4) shl 29
movaps xmm3, [rel kSSE_PI_OVER_4]
pcmpeqd xmm5, xmm6 ; (xmm5) PolyMask := ((J-2) and 2)=0)? Yes: 0xFFFFFFFF, No: 0x00000000
mulps xmm2, xmm3 ; Y * Pi / 4
movaps xmm3, [rel kSSE_COSCOF_P1]
subps xmm0, xmm2 ; (xmm0) X := X - (Y * Pi / 4)
movaps xmm2, [rel kSSE_COSCOF_P0]
movaps xmm4, [rel kSSE_COSCOF_P2]
movaps xmm6, xmm0
mulps xmm6, xmm6 ; (xmm6) Z := X * X
mulps xmm2, xmm6 ; COSCOF_P0 * Z
addps xmm2, xmm3 ; Y := COSCOF_P0 * Z + COSCOF_P1
movaps xmm3, [rel kSSE_ONE_HALF]
mulps xmm2, xmm6 ; Y * Z
mulps xmm3, xmm6 ; Z * 0.5
addps xmm2, xmm4 ; Y := (Y * Z) + COSCOF_P2
movaps xmm7, [rel kSSE_ONE]
mulps xmm2, xmm6
movaps xmm4, [rel kSSE_SINCOF_P1]
mulps xmm2, xmm6 ; Y := Y * (Z * Z)
subps xmm2, xmm3 ; Y - Z * 0.5
addps xmm2, xmm7 ; (xmm2) Y := Y - Z * 0.5 + 1
movaps xmm3, [rel kSSE_SINCOF_P0]
movaps xmm7, [rel kSSE_SINCOF_P2]
mulps xmm3, xmm6 ; SINCOF_P0 * Z
addps xmm3, xmm4 ; Y2 := SINCOF_P0 * Z + SINCOF_P1
mulps xmm3, xmm6 ; Y2 * Z
addps xmm3, xmm7 ; Y2 := (Y2 * Z) + SINCOF_P2
mulps xmm3, xmm6 ; Y2 * Z
mulps xmm3, xmm0 ; Y2 * (Z * X)
addps xmm3, xmm0 ; Y2 := Y2 * (Z * X) + X
andps xmm3, xmm5 ; ((J-2) and 2) = 0)? Yes: Y2, No: 0
andnps xmm5, xmm2 ; ((J-2) and 2) = 0)? Yes: 0 , No: Y
addps xmm3, xmm5
xorps xmm3, xmm1 ; (Y + Y2) xor SignBit
movaps xmm0, xmm3
movhlps xmm1, xmm3
ret
803
;-----------------------------------------------------------------------
; TVector4 := FastCos(ARadians), element-wise.
; Packed (4-lane) version of _fast_cos_single — see that routine for the
; algorithm walkthrough.
; In:   Param1 -> 4 unaligned singles
; Out:  xmm0 = cos(X),cos(Y); xmm1 = cos(Z),cos(W)
; Uses: xmm2-xmm7
;-----------------------------------------------------------------------
_fast_cos_vector4:
movups xmm0, [Param1]
movaps xmm1, [rel kSSE_MASK_ABS_VAL]
movaps xmm2, [rel kSSE_FOPI]
andps xmm0, xmm1 ; (xmm0) X := Abs(ARadians)
movaps xmm3, [rel kSSE_INT_NOT_ONE]
movaps xmm1, xmm0
movaps xmm4, [rel kSSE_INT_FOUR]
mulps xmm1, xmm2
movaps xmm2, [rel kSSE_INT_ONE]
cvtps2dq xmm1, xmm1 ; J := Trunc(X * FOPI)
pxor xmm6, xmm6
paddd xmm1, xmm2
pand xmm1, xmm3 ; (xmm1) J := (J + 1) and (not 1)
movaps xmm3, [rel kSSE_INT_TWO]
cvtdq2ps xmm2, xmm1 ; (xmm2) Y := J
psubd xmm1, xmm3 ; J - 2
movaps xmm5, xmm1
pandn xmm1, xmm4 ; (not (J - 2)) and 4
pand xmm5, xmm3 ; (J - 2) and 2
pslld xmm1, 29 ; (xmm1) SignBit := ((not (J - 2)) and 4) shl 29
movaps xmm3, [rel kSSE_PI_OVER_4]
pcmpeqd xmm5, xmm6 ; (xmm5) PolyMask := ((J-2) and 2)=0)? Yes: 0xFFFFFFFF, No: 0x00000000
mulps xmm2, xmm3 ; Y * Pi / 4
movaps xmm3, [rel kSSE_COSCOF_P1]
subps xmm0, xmm2 ; (xmm0) X := X - (Y * Pi / 4)
movaps xmm2, [rel kSSE_COSCOF_P0]
movaps xmm4, [rel kSSE_COSCOF_P2]
movaps xmm6, xmm0
mulps xmm6, xmm6 ; (xmm6) Z := X * X
mulps xmm2, xmm6 ; COSCOF_P0 * Z
addps xmm2, xmm3 ; Y := COSCOF_P0 * Z + COSCOF_P1
movaps xmm3, [rel kSSE_ONE_HALF]
mulps xmm2, xmm6 ; Y * Z
mulps xmm3, xmm6 ; Z * 0.5
addps xmm2, xmm4 ; Y := (Y * Z) + COSCOF_P2
movaps xmm7, [rel kSSE_ONE]
mulps xmm2, xmm6
movaps xmm4, [rel kSSE_SINCOF_P1]
mulps xmm2, xmm6 ; Y := Y * (Z * Z)
subps xmm2, xmm3 ; Y - Z * 0.5
addps xmm2, xmm7 ; (xmm2) Y := Y - Z * 0.5 + 1
movaps xmm3, [rel kSSE_SINCOF_P0]
movaps xmm7, [rel kSSE_SINCOF_P2]
mulps xmm3, xmm6 ; SINCOF_P0 * Z
addps xmm3, xmm4 ; Y2 := SINCOF_P0 * Z + SINCOF_P1
mulps xmm3, xmm6 ; Y2 * Z
addps xmm3, xmm7 ; Y2 := (Y2 * Z) + SINCOF_P2
mulps xmm3, xmm6 ; Y2 * Z
mulps xmm3, xmm0 ; Y2 * (Z * X)
addps xmm3, xmm0 ; Y2 := Y2 * (Z * X) + X
andps xmm3, xmm5 ; ((J-2) and 2) = 0)? Yes: Y2, No: 0
andnps xmm5, xmm2 ; ((J-2) and 2) = 0)? Yes: 0 , No: Y
addps xmm3, xmm5
xorps xmm3, xmm1 ; (Y + Y2) xor SignBit
movaps xmm0, xmm3
movhlps xmm1, xmm3
ret
862
;-----------------------------------------------------------------------
; FastSinCos(ARadians, out ASin, out ACos)
; Computes the sine and cosine approximations in one pass: both
; polynomials share the same range reduction and Z = X*X, then the
; PolyMask selects which polynomial feeds which output.
; In:   xmm0   = ARadians
;       Param1 = pointer to ASin (written as a single)
;       Param2 = pointer to ACos (written as a single)
; Out:  [Param1] = approximate sine, [Param2] = approximate cosine
; Uses: xmm1-xmm7
; NOTE(review): cvtps2dq rounds per MXCSR (nearest by default) although
; the comment says Trunc — confirm whether cvttps2dq was intended.
;-----------------------------------------------------------------------
_fast_sin_cos_single:
movss xmm2, [rel kSSE_MASK_SIGN]
movss xmm3, [rel kSSE_MASK_ABS_VAL]
movaps xmm1, xmm0
pand xmm0, xmm3 ; (xmm0) X := Abs(ARadians)
pand xmm1, xmm2 ; (xmm1) SignBitSin
movaps xmm4, xmm0
movss xmm5, [rel kSSE_FOPI]
movss xmm6, [rel kSSE_INT_ONE]
mulss xmm4, xmm5
movss xmm7, [rel kSSE_INT_NOT_ONE]
cvtps2dq xmm4, xmm4 ; (xmm4) J := Trunc(X * FOPI)
movss xmm5, [rel kSSE_INT_FOUR]
paddd xmm4, xmm6
pand xmm4, xmm7 ; (xmm4) J := (J + 1) and (not 1)
movss xmm7, [rel kSSE_INT_TWO]
cvtdq2ps xmm2, xmm4 ; (xmm2) Y := J
movaps xmm3, xmm4
movaps xmm6, xmm4 ; (xmm6) J
pand xmm3, xmm5 ; J and 4
pand xmm4, xmm7 ; J and 2
pxor xmm5, xmm5
pslld xmm3, 29 ; (xmm3) SwapSignBitSin := (J and 4) shl 29
movss xmm7, [rel kSSE_PI_OVER_4]
pcmpeqd xmm4, xmm5 ; (xmm4) PolyMask := ((J and 2) = 0)? Yes: 0xFFFFFFFF, No: 0x00000000
mulss xmm2, xmm7 ; Y * Pi / 4
movss xmm5, [rel kSSE_INT_TWO]
subss xmm0, xmm2 ; (xmm0) X := X - (Y * Pi / 4)
psubd xmm6, xmm5 ; J - 2
movss xmm7, [rel kSSE_INT_FOUR]
pxor xmm1, xmm3 ; (xmm1) SignBitSin := SignBitSin xor SwapSignBitSin
andnps xmm6, xmm7 ; (not (J - 2)) and 4
movaps xmm3, xmm0
pslld xmm6, 29 ; (xmm6) SignBitCos := ((not (J - 2)) and 4) shl 29
mulss xmm3, xmm3 ; (xmm3) Z := X * X
movss xmm2, [rel kSSE_COSCOF_P0]
movss xmm5, [rel kSSE_COSCOF_P1]
movss xmm7, [rel kSSE_COSCOF_P2]
mulss xmm2, xmm3 ; COSCOF_P0 * Z
addss xmm2, xmm5 ; Y := COSCOF_P0 * Z + COSCOF_P1
movss xmm5, [rel kSSE_ONE_HALF]
mulss xmm2, xmm3 ; Y * Z
addss xmm2, xmm7 ; Y := (Y * Z) + COSCOF_P2
movss xmm7, [rel kSSE_ONE]
mulss xmm2, xmm3 ; Y * Z
mulss xmm5, xmm3 ; 0.5 * Z
mulss xmm2, xmm3 ; Y * (Z * Z)
subss xmm2, xmm5 ; Y - 0.5 * Z
movss xmm5, [rel kSSE_SINCOF_P0]
addss xmm2, xmm7 ; (xmm2) Y := Y - 0.5 * Z + 1
movss xmm7, [rel kSSE_SINCOF_P1]
mulss xmm5, xmm3 ; SINCOF_P0 * Z
addss xmm5, xmm7 ; Y2 := SINCOF_P0 * Z + SINCOF_P1
mulss xmm5, xmm3 ; Y2 * Z
movss xmm7, [rel kSSE_SINCOF_P2]
addss xmm5, xmm7 ; Y2 := Y2 * Z + SINCOF_P2
mulss xmm5, xmm3 ; Y2 * Z
mulss xmm5, xmm0 ; Y2 * (Z * X)
addss xmm5, xmm0 ; (xmm5) Y2 := Y2 * (Z * X) + X
; Select polynomial per output: sine gets Y2 when (J and 2)=0, else Y;
; cosine gets the other one.
movaps xmm0, xmm2 ; Y
movaps xmm3, xmm5 ; Y2
andps xmm5, xmm4 ; ((J and 2) = 0)? Yes: Y2, No: 0
andnps xmm4, xmm2 ; ((J and 2) = 0)? Yes: 0 , No: Y
subss xmm3, xmm5 ; ((J and 2) = 0)? Yes: 0 , No: Y2
subss xmm0, xmm4 ; ((J and 2) = 0)? Yes: Y , No: 0
addps xmm4, xmm5 ; ((J and 2) = 0)? Yes: Y2, No: Y
addps xmm3, xmm0 ; ((J and 2) = 0)? Yes: Y , No: Y2
xorps xmm4, xmm1 ; Sin
xorps xmm3, xmm6 ; Cos
movss [Param1], xmm4
movss [Param2], xmm3
ret
935
;---------------------------------------------------------------------------
; Fast Sin+Cos of a 2-element vector (same Cephes-style algorithm as the
; scalar version, applied to the low two SIMD lanes; upper lanes hold
; don't-care data since constants are loaded with MOVLPS).
; In:   Param1 = pointer to TVector2 input (radians)
;       Param2 = address receiving the 2-element Sin result
;       Param3 = address receiving the 2-element Cos result
; Clob: XMM0-XMM7, flags. Leaf function (SysV AMD64).
;---------------------------------------------------------------------------
_fast_sin_cos_vector2:
movlps xmm0, [Param1]
movlps xmm2, [rel kSSE_MASK_SIGN]
movlps xmm3, [rel kSSE_MASK_ABS_VAL]
movaps xmm1, xmm0
pand xmm0, xmm3 ; (xmm0) X := Abs(ARadians)
pand xmm1, xmm2 ; (xmm1) SignBitSin
movaps xmm4, xmm0
movlps xmm5, [rel kSSE_FOPI]
movlps xmm6, [rel kSSE_INT_ONE]
mulps xmm4, xmm5 ; X * (4 / Pi)
movlps xmm7, [rel kSSE_INT_NOT_ONE]
cvtps2dq xmm4, xmm4 ; (xmm4) J := Trunc(X * FOPI)
movlps xmm5, [rel kSSE_INT_FOUR]
paddd xmm4, xmm6
pand xmm4, xmm7 ; (xmm4) J := (J + 1) and (not 1)
movlps xmm7, [rel kSSE_INT_TWO]
cvtdq2ps xmm2, xmm4 ; (xmm2) Y := J
movaps xmm3, xmm4
movaps xmm6, xmm4 ; (xmm6) J
pand xmm3, xmm5 ; J and 4
pand xmm4, xmm7 ; J and 2
pxor xmm5, xmm5
pslld xmm3, 29 ; (xmm3) SwapSignBitSin := (J and 4) shl 29
movlps xmm7, [rel kSSE_PI_OVER_4]
pcmpeqd xmm4, xmm5 ; (xmm4) PolyMask := ((J and 2) = 0)? Yes: 0xFFFFFFFF, No: 0x00000000
mulps xmm2, xmm7 ; Y * Pi / 4
movlps xmm5, [rel kSSE_INT_TWO]
subps xmm0, xmm2 ; (xmm0) X := X - (Y * Pi / 4)
psubd xmm6, xmm5 ; J - 2
movlps xmm7, [rel kSSE_INT_FOUR]
pxor xmm1, xmm3 ; (xmm1) SignBitSin := SignBitSin xor SwapSignBitSin
andnps xmm6, xmm7 ; (not (J - 2)) and 4
movaps xmm3, xmm0
pslld xmm6, 29 ; (xmm6) SignBitCos := ((not (J - 2)) and 4) shl 29
mulps xmm3, xmm3 ; (xmm3) Z := X * X
; Cosine polynomial in Z (Horner form):
movlps xmm2, [rel kSSE_COSCOF_P0]
movlps xmm5, [rel kSSE_COSCOF_P1]
movlps xmm7, [rel kSSE_COSCOF_P2]
mulps xmm2, xmm3 ; COSCOF_P0 * Z
addps xmm2, xmm5 ; Y := COSCOF_P0 * Z + COSCOF_P1
movlps xmm5, [rel kSSE_ONE_HALF]
mulps xmm2, xmm3 ; Y * Z
addps xmm2, xmm7 ; Y := (Y * Z) + COSCOF_P2
movlps xmm7, [rel kSSE_ONE]
mulps xmm2, xmm3 ; Y * Z
mulps xmm5, xmm3 ; 0.5 * Z
mulps xmm2, xmm3 ; Y * (Z * Z)
subps xmm2, xmm5 ; Y - 0.5 * Z
movlps xmm5, [rel kSSE_SINCOF_P0]
addps xmm2, xmm7 ; (xmm2) Y := Y - 0.5 * Z + 1
; Sine polynomial in Z (Horner form):
movlps xmm7, [rel kSSE_SINCOF_P1]
mulps xmm5, xmm3 ; SINCOF_P0 * Z
addps xmm5, xmm7 ; Y2 := SINCOF_P0 * Z + SINCOF_P1
mulps xmm5, xmm3 ; Y2 * Z
movlps xmm7, [rel kSSE_SINCOF_P2]
addps xmm5, xmm7 ; Y2 := Y2 * Z + SINCOF_P2
mulps xmm5, xmm3 ; Y2 * Z
mulps xmm5, xmm0 ; Y2 * (Z * X)
addps xmm5, xmm0 ; (xmm5) Y2 := Y2 * (Z * X) + X
; Select Y vs Y2 per lane using PolyMask, then apply the sign bits:
movaps xmm0, xmm2 ; Y
movaps xmm3, xmm5 ; Y2
andps xmm5, xmm4 ; ((J and 2) = 0)? Yes: Y2, No: 0
andnps xmm4, xmm2 ; ((J and 2) = 0)? Yes: 0 , No: Y
subps xmm3, xmm5 ; ((J and 2) = 0)? Yes: 0 , No: Y2
subps xmm0, xmm4 ; ((J and 2) = 0)? Yes: Y , No: 0
addps xmm4, xmm5 ; ((J and 2) = 0)? Yes: Y2, No: Y
addps xmm3, xmm0 ; ((J and 2) = 0)? Yes: Y , No: Y2
xorps xmm4, xmm1 ; Sin
xorps xmm3, xmm6 ; Cos
movlps [Param2], xmm4
movlps [Param3], xmm3
ret
1009
;---------------------------------------------------------------------------
; Fast Sin+Cos of a 3-element vector. The three Singles are packed into one
; XMM register (lane 3 is don't-care) and run through the same Cephes-style
; algorithm as the scalar version; results are stored back as 8+4 bytes.
; In:   Param1 = pointer to TVector3 input (radians)
;       Param2 = address receiving the 3-element Sin result
;       Param3 = address receiving the 3-element Cos result
; Clob: XMM0-XMM7, flags. Leaf function (SysV AMD64).
;---------------------------------------------------------------------------
_fast_sin_cos_vector3:
movq xmm0, [Param1]
movss xmm1, [Param1+8]
movlhps xmm0, xmm1 ; pack (X, Y, Z, -)
movaps xmm2, [rel kSSE_MASK_SIGN]
movaps xmm3, [rel kSSE_MASK_ABS_VAL]
movaps xmm1, xmm0
pand xmm0, xmm3 ; (xmm0) X := Abs(ARadians)
pand xmm1, xmm2 ; (xmm1) SignBitSin
movaps xmm4, xmm0
movaps xmm5, [rel kSSE_FOPI]
movaps xmm6, [rel kSSE_INT_ONE]
mulps xmm4, xmm5 ; X * (4 / Pi)
movaps xmm7, [rel kSSE_INT_NOT_ONE]
cvtps2dq xmm4, xmm4 ; (xmm4) J := Trunc(X * FOPI)
movaps xmm5, [rel kSSE_INT_FOUR]
paddd xmm4, xmm6
pand xmm4, xmm7 ; (xmm4) J := (J + 1) and (not 1)
movaps xmm7, [rel kSSE_INT_TWO]
cvtdq2ps xmm2, xmm4 ; (xmm2) Y := J
movaps xmm3, xmm4
movaps xmm6, xmm4 ; (xmm6) J
pand xmm3, xmm5 ; J and 4
pand xmm4, xmm7 ; J and 2
pxor xmm5, xmm5
pslld xmm3, 29 ; (xmm3) SwapSignBitSin := (J and 4) shl 29
movaps xmm7, [rel kSSE_PI_OVER_4]
pcmpeqd xmm4, xmm5 ; (xmm4) PolyMask := ((J and 2) = 0)? Yes: 0xFFFFFFFF, No: 0x00000000
mulps xmm2, xmm7 ; Y * Pi / 4
movaps xmm5, [rel kSSE_INT_TWO]
subps xmm0, xmm2 ; (xmm0) X := X - (Y * Pi / 4)
psubd xmm6, xmm5 ; J - 2
movaps xmm7, [rel kSSE_INT_FOUR]
pxor xmm1, xmm3 ; (xmm1) SignBitSin := SignBitSin xor SwapSignBitSin
andnps xmm6, xmm7 ; (not (J - 2)) and 4
movaps xmm3, xmm0
pslld xmm6, 29 ; (xmm6) SignBitCos := ((not (J - 2)) and 4) shl 29
mulps xmm3, xmm3 ; (xmm3) Z := X * X
; Cosine polynomial in Z (Horner form):
movaps xmm2, [rel kSSE_COSCOF_P0]
movaps xmm5, [rel kSSE_COSCOF_P1]
movaps xmm7, [rel kSSE_COSCOF_P2]
mulps xmm2, xmm3 ; COSCOF_P0 * Z
addps xmm2, xmm5 ; Y := COSCOF_P0 * Z + COSCOF_P1
movaps xmm5, [rel kSSE_ONE_HALF]
mulps xmm2, xmm3 ; Y * Z
addps xmm2, xmm7 ; Y := (Y * Z) + COSCOF_P2
movaps xmm7, [rel kSSE_ONE]
mulps xmm2, xmm3 ; Y * Z
mulps xmm5, xmm3 ; 0.5 * Z
mulps xmm2, xmm3 ; Y * (Z * Z)
subps xmm2, xmm5 ; Y - 0.5 * Z
movaps xmm5, [rel kSSE_SINCOF_P0]
addps xmm2, xmm7 ; (xmm2) Y := Y - 0.5 * Z + 1
; Sine polynomial in Z (Horner form):
movaps xmm7, [rel kSSE_SINCOF_P1]
mulps xmm5, xmm3 ; SINCOF_P0 * Z
addps xmm5, xmm7 ; Y2 := SINCOF_P0 * Z + SINCOF_P1
mulps xmm5, xmm3 ; Y2 * Z
movaps xmm7, [rel kSSE_SINCOF_P2]
addps xmm5, xmm7 ; Y2 := Y2 * Z + SINCOF_P2
mulps xmm5, xmm3 ; Y2 * Z
mulps xmm5, xmm0 ; Y2 * (Z * X)
addps xmm5, xmm0 ; (xmm5) Y2 := Y2 * (Z * X) + X
; Select Y vs Y2 per lane using PolyMask, then apply the sign bits:
movaps xmm0, xmm2 ; Y
movaps xmm3, xmm5 ; Y2
andps xmm5, xmm4 ; ((J and 2) = 0)? Yes: Y2, No: 0
andnps xmm4, xmm2 ; ((J and 2) = 0)? Yes: 0 , No: Y
subps xmm3, xmm5 ; ((J and 2) = 0)? Yes: 0 , No: Y2
subps xmm0, xmm4 ; ((J and 2) = 0)? Yes: Y , No: 0
addps xmm4, xmm5 ; ((J and 2) = 0)? Yes: Y2, No: Y
addps xmm3, xmm0 ; ((J and 2) = 0)? Yes: Y , No: Y2
xorps xmm4, xmm1 ; Sin
xorps xmm3, xmm6 ; Cos
; Store three lanes each: 8 bytes + 4 bytes (Z lane via MOVHLPS).
movhlps xmm5, xmm4
movhlps xmm2, xmm3
movq [Param2], xmm4
movss [Param2+8], xmm5
movq [Param3], xmm3
movss [Param3+8], xmm2
ret
1089
;---------------------------------------------------------------------------
; Fast Sin+Cos of a 4-element vector (all four SIMD lanes active; same
; Cephes-style algorithm as the scalar version).
; In:   Param1 = pointer to TVector4 input (radians, unaligned)
;       Param2 = address receiving the 4-element Sin result
;       Param3 = address receiving the 4-element Cos result
; Clob: XMM0-XMM7, flags. Leaf function (SysV AMD64).
;---------------------------------------------------------------------------
_fast_sin_cos_vector4:
movups xmm0, [Param1]
movaps xmm2, [rel kSSE_MASK_SIGN]
movaps xmm3, [rel kSSE_MASK_ABS_VAL]
movaps xmm1, xmm0
pand xmm0, xmm3 ; (xmm0) X := Abs(ARadians)
pand xmm1, xmm2 ; (xmm1) SignBitSin
movaps xmm4, xmm0
movaps xmm5, [rel kSSE_FOPI]
movaps xmm6, [rel kSSE_INT_ONE]
mulps xmm4, xmm5 ; X * (4 / Pi)
movaps xmm7, [rel kSSE_INT_NOT_ONE]
cvtps2dq xmm4, xmm4 ; (xmm4) J := Trunc(X * FOPI)
movaps xmm5, [rel kSSE_INT_FOUR]
paddd xmm4, xmm6
pand xmm4, xmm7 ; (xmm4) J := (J + 1) and (not 1)
movaps xmm7, [rel kSSE_INT_TWO]
cvtdq2ps xmm2, xmm4 ; (xmm2) Y := J
movaps xmm3, xmm4
movaps xmm6, xmm4 ; (xmm6) J
pand xmm3, xmm5 ; J and 4
pand xmm4, xmm7 ; J and 2
pxor xmm5, xmm5
pslld xmm3, 29 ; (xmm3) SwapSignBitSin := (J and 4) shl 29
movaps xmm7, [rel kSSE_PI_OVER_4]
pcmpeqd xmm4, xmm5 ; (xmm4) PolyMask := ((J and 2) = 0)? Yes: 0xFFFFFFFF, No: 0x00000000
mulps xmm2, xmm7 ; Y * Pi / 4
movaps xmm5, [rel kSSE_INT_TWO]
subps xmm0, xmm2 ; (xmm0) X := X - (Y * Pi / 4)
psubd xmm6, xmm5 ; J - 2
movaps xmm7, [rel kSSE_INT_FOUR]
pxor xmm1, xmm3 ; (xmm1) SignBitSin := SignBitSin xor SwapSignBitSin
andnps xmm6, xmm7 ; (not (J - 2)) and 4
movaps xmm3, xmm0
pslld xmm6, 29 ; (xmm6) SignBitCos := ((not (J - 2)) and 4) shl 29
mulps xmm3, xmm3 ; (xmm3) Z := X * X
; Cosine polynomial in Z (Horner form):
movaps xmm2, [rel kSSE_COSCOF_P0]
movaps xmm5, [rel kSSE_COSCOF_P1]
movaps xmm7, [rel kSSE_COSCOF_P2]
mulps xmm2, xmm3 ; COSCOF_P0 * Z
addps xmm2, xmm5 ; Y := COSCOF_P0 * Z + COSCOF_P1
movaps xmm5, [rel kSSE_ONE_HALF]
mulps xmm2, xmm3 ; Y * Z
addps xmm2, xmm7 ; Y := (Y * Z) + COSCOF_P2
movaps xmm7, [rel kSSE_ONE]
mulps xmm2, xmm3 ; Y * Z
mulps xmm5, xmm3 ; 0.5 * Z
mulps xmm2, xmm3 ; Y * (Z * Z)
subps xmm2, xmm5 ; Y - 0.5 * Z
movaps xmm5, [rel kSSE_SINCOF_P0]
addps xmm2, xmm7 ; (xmm2) Y := Y - 0.5 * Z + 1
; Sine polynomial in Z (Horner form):
movaps xmm7, [rel kSSE_SINCOF_P1]
mulps xmm5, xmm3 ; SINCOF_P0 * Z
addps xmm5, xmm7 ; Y2 := SINCOF_P0 * Z + SINCOF_P1
mulps xmm5, xmm3 ; Y2 * Z
movaps xmm7, [rel kSSE_SINCOF_P2]
addps xmm5, xmm7 ; Y2 := Y2 * Z + SINCOF_P2
mulps xmm5, xmm3 ; Y2 * Z
mulps xmm5, xmm0 ; Y2 * (Z * X)
addps xmm5, xmm0 ; (xmm5) Y2 := Y2 * (Z * X) + X
; Select Y vs Y2 per lane using PolyMask, then apply the sign bits:
movaps xmm0, xmm2 ; Y
movaps xmm3, xmm5 ; Y2
andps xmm5, xmm4 ; ((J and 2) = 0)? Yes: Y2, No: 0
andnps xmm4, xmm2 ; ((J and 2) = 0)? Yes: 0 , No: Y
subps xmm3, xmm5 ; ((J and 2) = 0)? Yes: 0 , No: Y2
subps xmm0, xmm4 ; ((J and 2) = 0)? Yes: Y , No: 0
addps xmm4, xmm5 ; ((J and 2) = 0)? Yes: Y2, No: Y
addps xmm3, xmm0 ; ((J and 2) = 0)? Yes: Y , No: Y2
xorps xmm4, xmm1 ; Sin
xorps xmm3, xmm6 ; Cos
movups [Param2], xmm4
movups [Param3], xmm3
ret
1163
;---------------------------------------------------------------------------
; Fast approximate Exp(A) for a Single: builds the float's exponent field
; from a scaled-and-biased integer, then refines the mantissa with a
; degree-4 polynomial in B (the fraction rebuilt into [1..2)).
; In:   XMM0 = A
; Out:  XMM0 ~= Exp(A); input is clamped at EXP_CST above, result is 0 for
;       large negative A (IVal clamped at 0 below).
; Clob: XMM0-XMM7, flags. Leaf function (SysV AMD64).
; NOTE(review): "IVal := Trunc(Val)" uses CVTPS2DQ, which rounds per MXCSR
; (nearest by default) rather than truncating -- confirm intended.
;---------------------------------------------------------------------------
_fast_exp_single:
movss xmm1, [rel kSSE_EXP_A1]
movss xmm2, [rel kSSE_EXP_A2]

; Val := 12102203.1615614 * A + 1065353216.0
mulss xmm0, xmm1
movss xmm3, [rel kSSE_EXP_CST]
addss xmm0, xmm2

; if (Val >= EXP_CST) then Val := EXP_CST
movss xmm1, xmm0
cmpltss xmm0, xmm3 ; (Val < EXP_CST)? Yes: 0xFFFFFFFF, No: 0x00000000
andps xmm1, xmm0 ; (Val < EXP_CST)? Yes: Val, No: 0
andnps xmm0, xmm3 ; (Val < EXP_CST)? Yes: 0, No: EXP_CST
orps xmm0, xmm1 ; (Val < EXP_CST)? Yes: Val, No: EXP_CST

; IVal := Trunc(Val)
xorps xmm3, xmm3
cvtps2dq xmm1, xmm0

; if (IVal < 0) then I := 0
movss xmm2, [rel kSSE_MASK_EXPONENT]
movdqa xmm0, xmm1 ; IVal
pcmpgtd xmm1, xmm3 ; (IVal > 0)? Yes: 0xFFFFFFFF, No: 0x00000000
movss xmm3, [rel kSSE_MASK_FRACTION]
pand xmm0, xmm1 ; (IVal > 0)? Yes: IVal, No: 0

; XU.I := IVal and 0x7F800000
movss xmm4, [rel kSSE_EXP_I1]
movss xmm1, xmm0
pand xmm0, xmm2 ; XU.I / XU.S

; XU2.I := (IVal and 0x007FFFFF) or 0x3F800000;
pand xmm1, xmm3
movss xmm6, [rel kSSE_EXP_F5]
por xmm1, xmm4 ; XU2.I / XU2.S

; Result := XU.S *
; ( 0.509964287281036376953125 + B *
; ( 0.3120158612728118896484375 + B *
; ( 0.1666135489940643310546875 + B *
; (-2.12528370320796966552734375e-3 + B *
; 1.3534179888665676116943359375e-2))));
movss xmm5, [rel kSSE_EXP_F4]
movss xmm7, xmm1 ; (xmm7) B := XU2.S

mulss xmm1, xmm6
movss xmm4, [rel kSSE_EXP_F3]
addss xmm1, xmm5
movss xmm3, [rel kSSE_EXP_F2]
mulss xmm1, xmm7
movss xmm2, [rel kSSE_EXP_F1]
addss xmm1, xmm4
mulss xmm1, xmm7
addss xmm1, xmm3
mulss xmm1, xmm7
addss xmm1, xmm2
mulss xmm1, xmm0 ; scale polynomial by XU.S (the 2^k factor)

movss xmm0, xmm1
ret
1225
;---------------------------------------------------------------------------
; Fast approximate Exp() of a 2-element vector (same bit-trick + polynomial
; as _fast_exp_single on the low two SIMD lanes; constants loaded with
; MOVLPS, so upper lanes are don't-care).
; In:   Param1 = pointer to TVector2 input
; Out:  XMM0 = (Exp(X), Exp(Y)) in the low two lanes
; Clob: XMM0-XMM7, flags. Leaf function (SysV AMD64).
;---------------------------------------------------------------------------
_fast_exp_vector2:
movlps xmm0, [Param1]
movlps xmm1, [rel kSSE_EXP_A1]
movlps xmm2, [rel kSSE_EXP_A2]

; Val := 12102203.1615614 * A + 1065353216.0
mulps xmm0, xmm1
movlps xmm3, [rel kSSE_EXP_CST]
addps xmm0, xmm2

; if (Val >= EXP_CST) then Val := EXP_CST
movaps xmm1, xmm0
cmpltps xmm0, xmm3 ; (Val < EXP_CST)? Yes: 0xFFFFFFFF, No: 0x00000000
andps xmm1, xmm0 ; (Val < EXP_CST)? Yes: Val, No: 0
andnps xmm0, xmm3 ; (Val < EXP_CST)? Yes: 0, No: EXP_CST
orps xmm0, xmm1 ; (Val < EXP_CST)? Yes: Val, No: EXP_CST

; IVal := Trunc(Val)
xorps xmm3, xmm3
cvtps2dq xmm1, xmm0

; if (IVal < 0) then I := 0
movlps xmm2, [rel kSSE_MASK_EXPONENT]
movdqa xmm0, xmm1 ; IVal
pcmpgtd xmm1, xmm3 ; (IVal > 0)? Yes: 0xFFFFFFFF, No: 0x00000000
movlps xmm3, [rel kSSE_MASK_FRACTION]
pand xmm0, xmm1 ; (IVal > 0)? Yes: IVal, No: 0

; XU.I := IVal and 0x7F800000
movlps xmm4, [rel kSSE_EXP_I1]
movdqa xmm1, xmm0
pand xmm0, xmm2 ; XU.I / XU.S

; XU2.I := (IVal and 0x007FFFFF) or 0x3F800000;
pand xmm1, xmm3
movlps xmm6, [rel kSSE_EXP_F5]
por xmm1, xmm4 ; XU2.I / XU2.S

; Result := XU.S *
; ( 0.509964287281036376953125 + B *
; ( 0.3120158612728118896484375 + B *
; ( 0.1666135489940643310546875 + B *
; (-2.12528370320796966552734375e-3 + B *
; 1.3534179888665676116943359375e-2))));
movlps xmm5, [rel kSSE_EXP_F4]
movaps xmm7, xmm1 ; (xmm7) B := XU2.S

mulps xmm1, xmm6
movlps xmm4, [rel kSSE_EXP_F3]
addps xmm1, xmm5
movlps xmm3, [rel kSSE_EXP_F2]
mulps xmm1, xmm7
movlps xmm2, [rel kSSE_EXP_F1]
addps xmm1, xmm4
mulps xmm1, xmm7
addps xmm1, xmm3
mulps xmm1, xmm7
addps xmm1, xmm2
mulps xmm1, xmm0 ; scale polynomial by XU.S (the 2^k factor)
movaps xmm0, xmm1
ret
1287
;---------------------------------------------------------------------------
; Fast approximate Exp() of a 3-element vector (packed into one XMM
; register, lane 3 don't-care; same algorithm as _fast_exp_single).
; In:   Param1 = pointer to TVector3 input
; Out:  XMM0 = (Exp(X), Exp(Y)), XMM1 = Exp(Z) in its low lane
;       (per the record-return convention described at the top of the file)
; Clob: XMM0-XMM7, flags. Leaf function (SysV AMD64).
;---------------------------------------------------------------------------
_fast_exp_vector3:
movq xmm0, [Param1]
movss xmm1, [Param1+8]
movlhps xmm0, xmm1 ; pack (X, Y, Z, -)
movaps xmm1, [rel kSSE_EXP_A1]
movaps xmm2, [rel kSSE_EXP_A2]

; Val := 12102203.1615614 * A + 1065353216.0
mulps xmm0, xmm1
movaps xmm3, [rel kSSE_EXP_CST]
addps xmm0, xmm2

; if (Val >= EXP_CST) then Val := EXP_CST
movaps xmm1, xmm0
cmpltps xmm0, xmm3 ; (Val < EXP_CST)? Yes: 0xFFFFFFFF, No: 0x00000000
andps xmm1, xmm0 ; (Val < EXP_CST)? Yes: Val, No: 0
andnps xmm0, xmm3 ; (Val < EXP_CST)? Yes: 0, No: EXP_CST
orps xmm0, xmm1 ; (Val < EXP_CST)? Yes: Val, No: EXP_CST

; IVal := Trunc(Val)
xorps xmm3, xmm3
cvtps2dq xmm1, xmm0

; if (IVal < 0) then I := 0
movaps xmm2, [rel kSSE_MASK_EXPONENT]
movdqa xmm0, xmm1 ; IVal
pcmpgtd xmm1, xmm3 ; (IVal > 0)? Yes: 0xFFFFFFFF, No: 0x00000000
movaps xmm3, [rel kSSE_MASK_FRACTION]
pand xmm0, xmm1 ; (IVal > 0)? Yes: IVal, No: 0

; XU.I := IVal and 0x7F800000
movaps xmm4, [rel kSSE_EXP_I1]
movdqa xmm1, xmm0
pand xmm0, xmm2 ; XU.I / XU.S

; XU2.I := (IVal and 0x007FFFFF) or 0x3F800000;
pand xmm1, xmm3
movaps xmm6, [rel kSSE_EXP_F5]
por xmm1, xmm4 ; XU2.I / XU2.S

; Result := XU.S *
; ( 0.509964287281036376953125 + B *
; ( 0.3120158612728118896484375 + B *
; ( 0.1666135489940643310546875 + B *
; (-2.12528370320796966552734375e-3 + B *
; 1.3534179888665676116943359375e-2))));
movaps xmm5, [rel kSSE_EXP_F4]
movaps xmm7, xmm1 ; (xmm7) B := XU2.S

mulps xmm1, xmm6
movaps xmm4, [rel kSSE_EXP_F3]
addps xmm1, xmm5
movaps xmm3, [rel kSSE_EXP_F2]
mulps xmm1, xmm7
movaps xmm2, [rel kSSE_EXP_F1]
addps xmm1, xmm4
mulps xmm1, xmm7
addps xmm1, xmm3
mulps xmm1, xmm7
addps xmm1, xmm2
mulps xmm1, xmm0 ; scale polynomial by XU.S (the 2^k factor)
movaps xmm0, xmm1
movhlps xmm1, xmm1 ; move Z result into XMM1's low lanes for the return
ret
1352
;---------------------------------------------------------------------------
; Fast approximate Exp() of a 4-element vector (all lanes active; same
; algorithm as _fast_exp_single).
; In:   Param1 = pointer to TVector4 input (unaligned)
; Out:  XMM0 = (Exp(X), Exp(Y)), XMM1 = (Exp(Z), Exp(W))
;       (per the record-return convention described at the top of the file)
; Clob: XMM0-XMM7, flags. Leaf function (SysV AMD64).
;---------------------------------------------------------------------------
_fast_exp_vector4:
movups xmm0, [Param1]
movaps xmm1, [rel kSSE_EXP_A1]
movaps xmm2, [rel kSSE_EXP_A2]

; Val := 12102203.1615614 * A + 1065353216.0
mulps xmm0, xmm1
movaps xmm3, [rel kSSE_EXP_CST]
addps xmm0, xmm2

; if (Val >= EXP_CST) then Val := EXP_CST
movaps xmm1, xmm0
cmpltps xmm0, xmm3 ; (Val < EXP_CST)? Yes: 0xFFFFFFFF, No: 0x00000000
andps xmm1, xmm0 ; (Val < EXP_CST)? Yes: Val, No: 0
andnps xmm0, xmm3 ; (Val < EXP_CST)? Yes: 0, No: EXP_CST
orps xmm0, xmm1 ; (Val < EXP_CST)? Yes: Val, No: EXP_CST

; IVal := Trunc(Val)
xorps xmm3, xmm3
cvtps2dq xmm1, xmm0

; if (IVal < 0) then I := 0
movaps xmm2, [rel kSSE_MASK_EXPONENT]
movdqa xmm0, xmm1 ; IVal
pcmpgtd xmm1, xmm3 ; (IVal > 0)? Yes: 0xFFFFFFFF, No: 0x00000000
movaps xmm3, [rel kSSE_MASK_FRACTION]
pand xmm0, xmm1 ; (IVal > 0)? Yes: IVal, No: 0

; XU.I := IVal and 0x7F800000
movaps xmm4, [rel kSSE_EXP_I1]
movdqa xmm1, xmm0
pand xmm0, xmm2 ; XU.I / XU.S

; XU2.I := (IVal and 0x007FFFFF) or 0x3F800000;
pand xmm1, xmm3
movaps xmm6, [rel kSSE_EXP_F5]
por xmm1, xmm4 ; XU2.I / XU2.S

; Result := XU.S *
; ( 0.509964287281036376953125 + B *
; ( 0.3120158612728118896484375 + B *
; ( 0.1666135489940643310546875 + B *
; (-2.12528370320796966552734375e-3 + B *
; 1.3534179888665676116943359375e-2))));
movaps xmm5, [rel kSSE_EXP_F4]
movaps xmm7, xmm1 ; (xmm7) B := XU2.S

mulps xmm1, xmm6
movaps xmm4, [rel kSSE_EXP_F3]
addps xmm1, xmm5
movaps xmm3, [rel kSSE_EXP_F2]
mulps xmm1, xmm7
movaps xmm2, [rel kSSE_EXP_F1]
addps xmm1, xmm4
mulps xmm1, xmm7
addps xmm1, xmm3
mulps xmm1, xmm7
addps xmm1, xmm2
mulps xmm1, xmm0 ; scale polynomial by XU.S (the 2^k factor)
movaps xmm0, xmm1
movhlps xmm1, xmm1 ; move Z,W results into XMM1's low lanes for the return
ret
1415
;---------------------------------------------------------------------------
; Fast approximate natural logarithm Ln(A) for a Single: extracts the
; biased exponent (Exp * Ln(2) term), rebuilds the mantissa into [1..2) and
; evaluates a small polynomial in it; a large negative constant cancels the
; exponent bias.
; In:   XMM0 = A
; Out:  XMM0 ~= Ln(A); -INF is selected into the additive constant when
;       A <= 0 (so the result is dominated by -INF in that case)
; Clob: XMM0-XMM7, flags. Leaf function (SysV AMD64).
;---------------------------------------------------------------------------
_fast_ln_single:
xorps xmm2, xmm2
movss xmm1, xmm0
movss xmm3, [rel kSSE_LN_CST]
movss xmm4, [rel kSSE_NEG_INFINITY]

; Exp := Val.I shr 23
psrld xmm0, 23
movss xmm5, xmm1
cvtdq2ps xmm0, xmm0 ; xmm0=Exp

; if (A > 0) then AddCst := -89.93423858 else AddCst := NegInfinity
cmpnless xmm1, xmm2 ; (A > 0)? Yes: 0xFFFFFFFF, No: 0x00000000
movss xmm2, [rel kSSE_MASK_FRACTION]
andps xmm3, xmm1 ; (A > 0)? Yes: -89.93423858, No: 0
andnps xmm1, xmm4 ; (A > 0)? Yes: 0, No: NegInfinity
movss xmm4, [rel kSSE_EXP_I1]
orps xmm1, xmm3 ; (A > 0)? Yes: -89.93423858, No: NegInfinity

; Val.I := (Val.I and 0x007FFFFF) or 0x3F800000
pand xmm5, xmm2
movss xmm2, [rel kSSE_LN_F5]
por xmm5, xmm4
movss xmm6, [rel kSSE_LN_F3]
movss xmm3, xmm5 ; xmm3=X
mulss xmm5, xmm5 ; xmm5=X2

movss xmm4, xmm3
movss xmm7, [rel kSSE_LN_F4]
mulss xmm4, xmm6
mulss xmm0, xmm2 ; xmm0 = Exp * 0.69314718055995
subss xmm4, xmm7
movss xmm7, [rel kSSE_LN_F2]
movss xmm6, xmm3
mulss xmm4, xmm5 ; xmm4 = X2 * (0.024982445 * X - 0.24371102)
subss xmm6, xmm7
movss xmm2, [rel kSSE_LN_F1]
addss xmm4, xmm6 ; xmm4 = (X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102)
mulss xmm3, xmm2
mulss xmm4, xmm5 ; xmm4 = X2 * ((X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102))
addss xmm3, xmm1 ; xmm3 = (3.3977745 * X + AddCst)
addss xmm4, xmm0
addss xmm3, xmm4 ; sum all terms

movss xmm0, xmm3
ret
1462
;---------------------------------------------------------------------------
; Fast approximate Ln() of a 2-element vector (same algorithm as
; _fast_ln_single on the low two SIMD lanes; constants loaded with MOVLPS,
; upper lanes don't-care).
; In:   Param1 = pointer to TVector2 input
; Out:  XMM0 = (Ln(X), Ln(Y)) in the low two lanes
; Clob: XMM0-XMM7, flags. Leaf function (SysV AMD64).
;---------------------------------------------------------------------------
_fast_ln_vector2:
movlps xmm0, [Param1]
xorps xmm2, xmm2
movaps xmm1, xmm0
movlps xmm3, [rel kSSE_LN_CST]
movlps xmm4, [rel kSSE_NEG_INFINITY]

; Exp := Val.I shr 23
psrld xmm0, 23
movaps xmm5, xmm1
cvtdq2ps xmm0, xmm0 ; xmm0=Exp

; if (A > 0) then AddCst := -89.93423858 else AddCst := NegInfinity
cmpnleps xmm1, xmm2 ; (A > 0)? Yes: 0xFFFFFFFF, No: 0x00000000
movlps xmm2, [rel kSSE_MASK_FRACTION]
andps xmm3, xmm1 ; (A > 0)? Yes: -89.93423858, No: 0
andnps xmm1, xmm4 ; (A > 0)? Yes: 0, No: NegInfinity
movlps xmm4, [rel kSSE_EXP_I1]
orps xmm1, xmm3 ; (A > 0)? Yes: -89.93423858, No: NegInfinity

; Val.I := (Val.I and 0x007FFFFF) or 0x3F800000
pand xmm5, xmm2
movlps xmm2, [rel kSSE_LN_F5]
por xmm5, xmm4
movlps xmm6, [rel kSSE_LN_F3]
movaps xmm3, xmm5 ; xmm3=X
mulps xmm5, xmm5 ; xmm5=X2

movaps xmm4, xmm3
movlps xmm7, [rel kSSE_LN_F4]
mulps xmm4, xmm6
mulps xmm0, xmm2 ; xmm0 = Exp * 0.69314718055995
subps xmm4, xmm7
movlps xmm7, [rel kSSE_LN_F2]
movaps xmm6, xmm3
mulps xmm4, xmm5 ; xmm4 = X2 * (0.024982445 * X - 0.24371102)
subps xmm6, xmm7
movlps xmm2, [rel kSSE_LN_F1]
addps xmm4, xmm6 ; xmm4 = (X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102)
mulps xmm3, xmm2
mulps xmm4, xmm5 ; xmm4 = X2 * ((X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102))
addps xmm3, xmm1 ; xmm3 = (3.3977745 * X + AddCst)
addps xmm4, xmm0
addps xmm3, xmm4 ; sum all terms

movaps xmm0, xmm3
ret
1510
;---------------------------------------------------------------------------
; Fast approximate Ln() of a 3-element vector (packed into one XMM
; register, lane 3 don't-care; same algorithm as _fast_ln_single).
; In:   Param1 = pointer to TVector3 input
; Out:  XMM0 = (Ln(X), Ln(Y)), XMM1 = Ln(Z) in its low lane
;       (per the record-return convention described at the top of the file)
; Clob: XMM0-XMM7, flags. Leaf function (SysV AMD64).
;---------------------------------------------------------------------------
_fast_ln_vector3:
movq xmm0, [Param1]
movss xmm1, [Param1+8]
movlhps xmm0, xmm1 ; pack (X, Y, Z, -)
xorps xmm2, xmm2
movaps xmm1, xmm0
movaps xmm3, [rel kSSE_LN_CST]
movaps xmm4, [rel kSSE_NEG_INFINITY]

; Exp := Val.I shr 23
psrld xmm0, 23
movaps xmm5, xmm1
cvtdq2ps xmm0, xmm0 ; xmm0=Exp

; if (A > 0) then AddCst := -89.93423858 else AddCst := NegInfinity
cmpnleps xmm1, xmm2 ; (A > 0)? Yes: 0xFFFFFFFF, No: 0x00000000
movaps xmm2, [rel kSSE_MASK_FRACTION]
andps xmm3, xmm1 ; (A > 0)? Yes: -89.93423858, No: 0
andnps xmm1, xmm4 ; (A > 0)? Yes: 0, No: NegInfinity
movaps xmm4, [rel kSSE_EXP_I1]
orps xmm1, xmm3 ; (A > 0)? Yes: -89.93423858, No: NegInfinity

; Val.I := (Val.I and 0x007FFFFF) or 0x3F800000
pand xmm5, xmm2
movaps xmm2, [rel kSSE_LN_F5]
por xmm5, xmm4
movaps xmm6, [rel kSSE_LN_F3]
movaps xmm3, xmm5 ; xmm3=X
mulps xmm5, xmm5 ; xmm5=X2

movaps xmm4, xmm3
movaps xmm7, [rel kSSE_LN_F4]
mulps xmm4, xmm6
mulps xmm0, xmm2 ; xmm0 = Exp * 0.69314718055995
subps xmm4, xmm7
movaps xmm7, [rel kSSE_LN_F2]
movaps xmm6, xmm3
mulps xmm4, xmm5 ; xmm4 = X2 * (0.024982445 * X - 0.24371102)
subps xmm6, xmm7
movaps xmm2, [rel kSSE_LN_F1]
addps xmm4, xmm6 ; xmm4 = (X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102)
mulps xmm3, xmm2
mulps xmm4, xmm5 ; xmm4 = X2 * ((X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102))
addps xmm3, xmm1 ; xmm3 = (3.3977745 * X + AddCst)
addps xmm4, xmm0
addps xmm3, xmm4 ; sum all terms

movaps xmm0, xmm3
movhlps xmm1, xmm3 ; move Z result into XMM1's low lanes for the return
ret
1561
;---------------------------------------------------------------------------
; Fast approximate Ln() of a 4-element vector (all lanes active; same
; algorithm as _fast_ln_single).
; In:   Param1 = pointer to TVector4 input (unaligned)
; Out:  XMM0 = (Ln(X), Ln(Y)), XMM1 = (Ln(Z), Ln(W))
;       (per the record-return convention described at the top of the file)
; Clob: XMM0-XMM7, flags. Leaf function (SysV AMD64).
;---------------------------------------------------------------------------
_fast_ln_vector4:
movups xmm0, [Param1]
xorps xmm2, xmm2
movaps xmm1, xmm0
movaps xmm3, [rel kSSE_LN_CST]
movaps xmm4, [rel kSSE_NEG_INFINITY]

; Exp := Val.I shr 23
psrld xmm0, 23
movaps xmm5, xmm1
cvtdq2ps xmm0, xmm0 ; xmm0=Exp

; if (A > 0) then AddCst := -89.93423858 else AddCst := NegInfinity
cmpnleps xmm1, xmm2 ; (A > 0)? Yes: 0xFFFFFFFF, No: 0x00000000
movaps xmm2, [rel kSSE_MASK_FRACTION]
andps xmm3, xmm1 ; (A > 0)? Yes: -89.93423858, No: 0
andnps xmm1, xmm4 ; (A > 0)? Yes: 0, No: NegInfinity
movaps xmm4, [rel kSSE_EXP_I1]
orps xmm1, xmm3 ; (A > 0)? Yes: -89.93423858, No: NegInfinity

; Val.I := (Val.I and 0x007FFFFF) or 0x3F800000
pand xmm5, xmm2
movaps xmm2, [rel kSSE_LN_F5]
por xmm5, xmm4
movaps xmm6, [rel kSSE_LN_F3]
movaps xmm3, xmm5 ; xmm3=X
mulps xmm5, xmm5 ; xmm5=X2

movaps xmm4, xmm3
movaps xmm7, [rel kSSE_LN_F4]
mulps xmm4, xmm6
mulps xmm0, xmm2 ; xmm0 = Exp * 0.69314718055995
subps xmm4, xmm7
movaps xmm7, [rel kSSE_LN_F2]
movaps xmm6, xmm3
mulps xmm4, xmm5 ; xmm4 = X2 * (0.024982445 * X - 0.24371102)
subps xmm6, xmm7
movaps xmm2, [rel kSSE_LN_F1]
addps xmm4, xmm6 ; xmm4 = (X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102)
mulps xmm3, xmm2
mulps xmm4, xmm5 ; xmm4 = X2 * ((X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102))
addps xmm3, xmm1 ; xmm3 = (3.3977745 * X + AddCst)
addps xmm4, xmm0
addps xmm3, xmm4 ; sum all terms

movaps xmm0, xmm3
movhlps xmm1, xmm3 ; move Z,W results into XMM1's low lanes for the return
ret
1610
;---------------------------------------------------------------------------
; Fast approximate Log2(A) for a Single (Mineiro-style: treat the float's
; bit pattern as an integer for the coarse log, then correct with a small
; rational term in the mantissa rebuilt into [0.5..1)).
; In:   XMM0 = A
; Out:  XMM0 ~= Log2(A)
; Clob: XMM0-XMM4, flags. Leaf function (SysV AMD64).
;---------------------------------------------------------------------------
_fast_log2_single:
movss xmm2, [rel kSSE_MASK_FRACTION]
movss xmm1, xmm0 ; keep raw bits of A for the integer view VX.I

; MX.I := (VX.I and 0x007FFFFF) or 0x3F000000
movss xmm3, [rel kSSE_LOG2_I1]
pand xmm0, xmm2
cvtdq2ps xmm1, xmm1 ; VX.I as a float
movss xmm4, [rel kSSE_LOG2_F1]
por xmm0, xmm3 ; (xmm0) MX.S

movss xmm2, [rel kSSE_LOG2_F2]
mulss xmm1, xmm4 ; VX.I * 1.1920928955078125e-7
movss xmm3, [rel kSSE_LOG2_F3]
subss xmm1, xmm2 ; Result - 124.22551499
mulss xmm3, xmm0
movss xmm4, [rel kSSE_LOG2_F5]
subss xmm1, xmm3 ; Result - 124.22551499 - 1.498030302 * MX.S
movss xmm2, [rel kSSE_LOG2_F4]
addss xmm0, xmm4
divss xmm2, xmm0
subss xmm1, xmm2 ; Result - 124.22551499 - 1.498030302 * MX.S - 1.72587999 / (0.3520887068 + MX.S)

movss xmm0, xmm1
ret
1636
;---------------------------------------------------------------------------
; Fast approximate Log2() of a 2-element vector (same algorithm as
; _fast_log2_single on the low two SIMD lanes; constants loaded with
; MOVLPS, upper lanes don't-care).
; In:   Param1 = pointer to TVector2 input
; Out:  XMM0 = (Log2(X), Log2(Y)) in the low two lanes
; Clob: XMM0-XMM4, flags. Leaf function (SysV AMD64).
;---------------------------------------------------------------------------
_fast_log2_vector2:
movlps xmm0, [Param1]
movlps xmm2, [rel kSSE_MASK_FRACTION]
movaps xmm1, xmm0 ; keep raw bits for the integer view VX.I

; MX.I := (VX.I and 0x007FFFFF) or 0x3F000000
movlps xmm3, [rel kSSE_LOG2_I1]
pand xmm0, xmm2
cvtdq2ps xmm1, xmm1 ; VX.I as floats
movlps xmm4, [rel kSSE_LOG2_F1]
por xmm0, xmm3 ; (xmm0) MX.S

movlps xmm2, [rel kSSE_LOG2_F2]
mulps xmm1, xmm4 ; VX.I * 1.1920928955078125e-7
movlps xmm3, [rel kSSE_LOG2_F3]
subps xmm1, xmm2 ; Result - 124.22551499
mulps xmm3, xmm0
movlps xmm4, [rel kSSE_LOG2_F5]
subps xmm1, xmm3 ; Result - 124.22551499 - 1.498030302 * MX.S
movlps xmm2, [rel kSSE_LOG2_F4]
addps xmm0, xmm4
divps xmm2, xmm0
subps xmm1, xmm2 ; Result - 124.22551499 - 1.498030302 * MX.S - 1.72587999 / (0.3520887068 + MX.S)

movaps xmm0, xmm1
ret
1663
;---------------------------------------------------------------------------
; Fast approximate Log2() of a 3-element vector (packed into one XMM
; register, lane 3 don't-care; same algorithm as _fast_log2_single).
; In:   Param1 = pointer to TVector3 input
; Out:  XMM0 = (Log2(X), Log2(Y)), XMM1 = Log2(Z) in its low lane
;       (per the record-return convention described at the top of the file)
; Clob: XMM0-XMM4, flags. Leaf function (SysV AMD64).
;---------------------------------------------------------------------------
_fast_log2_vector3:
movq xmm0, [Param1]
movss xmm1, [Param1+8]
movlhps xmm0, xmm1 ; pack (X, Y, Z, -)
movaps xmm2, [rel kSSE_MASK_FRACTION]
movaps xmm1, xmm0 ; keep raw bits for the integer view VX.I

; MX.I := (VX.I and 0x007FFFFF) or 0x3F000000
movaps xmm3, [rel kSSE_LOG2_I1]
pand xmm0, xmm2
cvtdq2ps xmm1, xmm1 ; VX.I as floats
movaps xmm4, [rel kSSE_LOG2_F1]
por xmm0, xmm3 ; (xmm0) MX.S

movaps xmm2, [rel kSSE_LOG2_F2]
mulps xmm1, xmm4 ; VX.I * 1.1920928955078125e-7
movaps xmm3, [rel kSSE_LOG2_F3]
subps xmm1, xmm2 ; Result - 124.22551499
mulps xmm3, xmm0
movaps xmm4, [rel kSSE_LOG2_F5]
subps xmm1, xmm3 ; Result - 124.22551499 - 1.498030302 * MX.S
movaps xmm2, [rel kSSE_LOG2_F4]
addps xmm0, xmm4
divps xmm2, xmm0
subps xmm1, xmm2 ; Result - 124.22551499 - 1.498030302 * MX.S - 1.72587999 / (0.3520887068 + MX.S)

movaps xmm0, xmm1
movhlps xmm1, xmm1 ; move Z result into XMM1's low lanes for the return
ret
1693
;---------------------------------------------------------------------------
; Fast approximate Log2() of a 4-element vector (all lanes active; same
; algorithm as _fast_log2_single).
; In:   Param1 = pointer to TVector4 input (unaligned)
; Out:  XMM0 = (Log2(X), Log2(Y)), XMM1 = (Log2(Z), Log2(W))
;       (per the record-return convention described at the top of the file)
; Clob: XMM0-XMM4, flags. Leaf function (SysV AMD64).
;---------------------------------------------------------------------------
_fast_log2_vector4:
movups xmm0, [Param1]
movaps xmm2, [rel kSSE_MASK_FRACTION]
movaps xmm1, xmm0 ; keep raw bits for the integer view VX.I

; MX.I := (VX.I and 0x007FFFFF) or 0x3F000000
movaps xmm3, [rel kSSE_LOG2_I1]
pand xmm0, xmm2
cvtdq2ps xmm1, xmm1 ; VX.I as floats
movaps xmm4, [rel kSSE_LOG2_F1]
por xmm0, xmm3 ; (xmm0) MX.S

movaps xmm2, [rel kSSE_LOG2_F2]
mulps xmm1, xmm4 ; VX.I * 1.1920928955078125e-7
movaps xmm3, [rel kSSE_LOG2_F3]
subps xmm1, xmm2 ; Result - 124.22551499
mulps xmm3, xmm0
movaps xmm4, [rel kSSE_LOG2_F5]
subps xmm1, xmm3 ; Result - 124.22551499 - 1.498030302 * MX.S
movaps xmm2, [rel kSSE_LOG2_F4]
addps xmm0, xmm4
divps xmm2, xmm0
subps xmm1, xmm2 ; Result - 124.22551499 - 1.498030302 * MX.S - 1.72587999 / (0.3520887068 + MX.S)

movaps xmm0, xmm1
movhlps xmm1, xmm1 ; move Z,W results into XMM1's low lanes for the return
ret
1721
;---------------------------------------------------------------------------
; Fast approximate 2^A for a Single (Mineiro-style: splits A into integer
; and fractional parts under round-down, then synthesizes the result's bit
; pattern directly -- the final CVTPS2DQ output is *reinterpreted* as a
; Single, not converted back).
; In:   XMM0 = A
; Out:  XMM0 ~= Exp2(A)
; Clob: XMM0-XMM5, ECX, flags; temporarily changes the MXCSR rounding mode.
; NOTE(review): OldFlags/NewFlags appear to be globals defined elsewhere in
; the file -- if so this routine is not thread-safe; confirm.
;---------------------------------------------------------------------------
_fast_exp2_single:
; Set rounding mode to Round Down (toward -infinity, SSE_ROUND_DOWN)
stmxcsr [OldFlags]
mov ecx, [OldFlags]
xorps xmm1, xmm1
and ecx, SSE_ROUND_MASK
movss xmm3, xmm0
or ecx, SSE_ROUND_DOWN
movss xmm5, xmm0
mov [NewFlags], ecx

movss xmm1, [rel kSSE_EXP2_F1]
ldmxcsr [NewFlags]

; Z := A - RoundDown(A)
cvtps2dq xmm3, xmm3 ; RoundDown(A) -- CVTPS2DQ now rounds toward -inf
addss xmm1, xmm5 ; A + 121.2740575
cvtdq2ps xmm3, xmm3
movss xmm2, [rel kSSE_EXP2_F2]
subss xmm0, xmm3

movss xmm3, [rel kSSE_EXP2_F3]
movss xmm4, [rel kSSE_EXP2_F4]
subss xmm3, xmm0 ; (4.84252568 - Z)
mulss xmm0, xmm4 ; 1.49012907 * Z
divss xmm2, xmm3
movss xmm5, [rel kSSE_EXP2_F5]
addss xmm1, xmm2 ; A + 121.2740575 + 27.7280233 / (4.84252568 - Z)
subss xmm1, xmm0 ; A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z
mulss xmm1, xmm5 ; (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
cvtps2dq xmm1, xmm1 ; integer value *is* the result's float bit pattern

; Restore rounding mode
ldmxcsr [OldFlags]

movss xmm0, xmm1
ret
1759
;---------------------------------------------------------------------------
; Fast approximate Exp2() of a 2-element vector (same algorithm as
; _fast_exp2_single on the low two SIMD lanes; constants loaded with
; MOVLPS, upper lanes don't-care).
; In:   Param1 = pointer to TVector2 input
; Out:  XMM0 = (Exp2(X), Exp2(Y)) in the low two lanes
; Clob: XMM0-XMM5, ECX, flags; temporarily changes the MXCSR rounding mode.
; NOTE(review): OldFlags/NewFlags appear to be globals defined elsewhere in
; the file -- if so this routine is not thread-safe; confirm.
;---------------------------------------------------------------------------
_fast_exp2_vector2:
; Set rounding mode to Round Down (toward -infinity, SSE_ROUND_DOWN)
stmxcsr [OldFlags]
movlps xmm0, [Param1]
mov ecx, [OldFlags]
xorps xmm1, xmm1
and ecx, SSE_ROUND_MASK
movaps xmm3, xmm0
or ecx, SSE_ROUND_DOWN
movaps xmm5, xmm0
mov [NewFlags], ecx

movlps xmm1, [rel kSSE_EXP2_F1]
ldmxcsr [NewFlags]

; Z := A - RoundDown(A)
cvtps2dq xmm3, xmm3 ; RoundDown(A) -- CVTPS2DQ now rounds toward -inf
addps xmm1, xmm5 ; A + 121.2740575
cvtdq2ps xmm3, xmm3
movlps xmm2, [rel kSSE_EXP2_F2]
subps xmm0, xmm3

movlps xmm3, [rel kSSE_EXP2_F3]
movlps xmm4, [rel kSSE_EXP2_F4]
subps xmm3, xmm0 ; (4.84252568 - Z)
mulps xmm0, xmm4 ; 1.49012907 * Z
divps xmm2, xmm3
movlps xmm5, [rel kSSE_EXP2_F5]
addps xmm1, xmm2 ; A + 121.2740575 + 27.7280233 / (4.84252568 - Z)
subps xmm1, xmm0 ; A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z
mulps xmm1, xmm5 ; (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
cvtps2dq xmm1, xmm1 ; integer values *are* the results' float bit patterns

; Restore rounding mode
ldmxcsr [OldFlags]

movaps xmm0, xmm1
ret
1798
;---------------------------------------------------------------------------
; Fast approximate Exp2() of a 3-element vector (packed into one XMM
; register, lane 3 don't-care; same algorithm as _fast_exp2_single).
; In:   Param1 = pointer to TVector3 input
; Out:  XMM0 = (Exp2(X), Exp2(Y)), XMM1 = Exp2(Z) in its low lane
;       (per the record-return convention described at the top of the file)
; Clob: XMM0-XMM5, EDX, flags; temporarily changes the MXCSR rounding mode.
; NOTE(review): OldFlags/NewFlags appear to be globals defined elsewhere in
; the file -- if so this routine is not thread-safe; confirm.
;---------------------------------------------------------------------------
_fast_exp2_vector3:
; Set rounding mode to Round Down (toward -infinity, SSE_ROUND_DOWN)
stmxcsr [OldFlags]
movq xmm0, [Param1]
movss xmm1, [Param1+8]
movlhps xmm0, xmm1 ; pack (X, Y, Z, -)
mov edx, [OldFlags]
xorps xmm1, xmm1
and edx, SSE_ROUND_MASK
movaps xmm3, xmm0
or edx, SSE_ROUND_DOWN
movaps xmm5, xmm0
mov [NewFlags], edx

movaps xmm1, [rel kSSE_EXP2_F1]
ldmxcsr [NewFlags]

; Z := A - RoundDown(A)
cvtps2dq xmm3, xmm3 ; RoundDown(A) -- CVTPS2DQ now rounds toward -inf
addps xmm1, xmm5 ; A + 121.2740575
cvtdq2ps xmm3, xmm3
movaps xmm2, [rel kSSE_EXP2_F2]
subps xmm0, xmm3

movaps xmm3, [rel kSSE_EXP2_F3]
movaps xmm4, [rel kSSE_EXP2_F4]
subps xmm3, xmm0 ; (4.84252568 - Z)
mulps xmm0, xmm4 ; 1.49012907 * Z
divps xmm2, xmm3
movaps xmm5, [rel kSSE_EXP2_F5]
addps xmm1, xmm2 ; A + 121.2740575 + 27.7280233 / (4.84252568 - Z)
subps xmm1, xmm0 ; A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z
mulps xmm1, xmm5 ; (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
cvtps2dq xmm1, xmm1 ; integer values *are* the results' float bit patterns

; Restore rounding mode
ldmxcsr [OldFlags]

movaps xmm0, xmm1
movhlps xmm1, xmm1 ; move Z result into XMM1's low lanes for the return
ret
1840
;---------------------------------------------------------------------------
; Fast approximate Exp2() of a 4-element vector (all lanes active; same
; algorithm as _fast_exp2_single).
; In:   Param1 = pointer to TVector4 input (unaligned)
; Out:  XMM0 = (Exp2(X), Exp2(Y)), XMM1 = (Exp2(Z), Exp2(W))
;       (per the record-return convention described at the top of the file)
; Clob: XMM0-XMM5, EDX, flags; temporarily changes the MXCSR rounding mode.
; NOTE(review): OldFlags/NewFlags appear to be globals defined elsewhere in
; the file -- if so this routine is not thread-safe; confirm.
;---------------------------------------------------------------------------
_fast_exp2_vector4:
; Set rounding mode to Round Down (toward -infinity, SSE_ROUND_DOWN)
stmxcsr [OldFlags]
movups xmm0, [Param1]
mov edx, [OldFlags]
xorps xmm1, xmm1
and edx, SSE_ROUND_MASK
movaps xmm3, xmm0
or edx, SSE_ROUND_DOWN
movaps xmm5, xmm0
mov [NewFlags], edx

movaps xmm1, [rel kSSE_EXP2_F1]
ldmxcsr [NewFlags]

; Z := A - RoundDown(A)
cvtps2dq xmm3, xmm3 ; RoundDown(A) -- CVTPS2DQ now rounds toward -inf
addps xmm1, xmm5 ; A + 121.2740575
cvtdq2ps xmm3, xmm3
movaps xmm2, [rel kSSE_EXP2_F2]
subps xmm0, xmm3

movaps xmm3, [rel kSSE_EXP2_F3]
movaps xmm4, [rel kSSE_EXP2_F4]
subps xmm3, xmm0 ; (4.84252568 - Z)
mulps xmm0, xmm4 ; 1.49012907 * Z
divps xmm2, xmm3
movaps xmm5, [rel kSSE_EXP2_F5]
addps xmm1, xmm2 ; A + 121.2740575 + 27.7280233 / (4.84252568 - Z)
subps xmm1, xmm0 ; A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z
mulps xmm1, xmm5 ; (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
cvtps2dq xmm1, xmm1 ; integer values *are* the results' float bit patterns

; Restore rounding mode
ldmxcsr [OldFlags]

movaps xmm0, xmm1
movhlps xmm1, xmm1 ; move Z,W results into XMM1's low lanes for the return
ret
1880
1881;****************************************************************************
1882; Common Functions
1883;****************************************************************************
1884
;----------------------------------------------------------------------------
; Abs(const A: TVector3): TVector3 — clears the sign bit of each component.
; In:  Param1 -> A (3 Singles)
; Out: xmm0 = |X| |Y|, xmm1 = |Z|
; Note: uses andps for both halves (the original mixed pand/andps, which
; crosses the SSE integer/float execution domains for no benefit; the
; bitwise result is identical). Consistent with _abs_vector4.
;----------------------------------------------------------------------------
_abs_vector3:
movq xmm0, [Param1]
movss xmm1, [Param1+8]
movaps xmm2, [rel kSSE_MASK_ABS_VAL]
andps xmm0, xmm2
andps xmm1, xmm2
ret
1892
;----------------------------------------------------------------------------
; Abs(const A: TVector4): TVector4 — clears the sign bit of all 4 components.
; In:  Param1 -> A (4 Singles, unaligned)
; Out: xmm0 = |X| |Y|, xmm1 = |Z| |W|
;----------------------------------------------------------------------------
_abs_vector4:
movaps xmm2, [rel kSSE_MASK_ABS_VAL] ; 0x7FFFFFFF (4x)
movups xmm0, [Param1]                ; W Z Y X
andps xmm0, xmm2                     ; |W| |Z| |Y| |X|
movhlps xmm1, xmm0                   ; Z/W into low half of xmm1
ret
1899
;----------------------------------------------------------------------------
; Sign(A: Single): Single — returns -1 if A < 0, 0 if A = 0, 1 if A > 0.
; Branch-free: extract the sign bit, OR it into 1.0 to get +/-1, then mask
; the result to 0 where A = 0.
; In: xmm0 = A. Out: xmm0 = Sign(A).
;----------------------------------------------------------------------------
_sign_single:
movss xmm1, [rel kSSE_ONE]       ; 1.0
movss xmm2, xmm0                 ; copy of A for the zero test
movss xmm3, [rel kSSE_MASK_SIGN] ; 0x80000000

andps xmm0, xmm3 ; (A < 0)? Yes: 0x80000000, No: 0x00000000
xorps xmm4, xmm4 ; 0.0
orps xmm0, xmm1 ; (A < 0)? Yes: -1, No: 1
cmpneqss xmm2, xmm4 ; (A = 0)? Yes: 0x00000000, No: 0xFFFFFFFF
andps xmm0, xmm2 ; (A = 0)? Yes: 0, No: -1 or 1
ret

;----------------------------------------------------------------------------
; Sign(const A: TVector2): TVector2 — per-component Sign, same scheme.
; In: Param1 -> A. Out: xmm0 = Sign(A).XY.
;----------------------------------------------------------------------------
_sign_vector2:
movlps xmm0, [Param1]
movlps xmm1, [rel kSSE_ONE]
movaps xmm2, xmm0
movlps xmm3, [rel kSSE_MASK_SIGN]

andps xmm0, xmm3 ; (A < 0)? Yes: 0x80000000, No: 0x00000000
xorps xmm4, xmm4
orps xmm0, xmm1 ; (A < 0)? Yes: -1, No: 1
cmpneqps xmm2, xmm4 ; (A = 0)? Yes: 0x00000000, No: 0xFFFFFFFF
andps xmm0, xmm2 ; (A = 0)? Yes: 0, No: -1 or 1
ret

;----------------------------------------------------------------------------
; Sign(const A: TVector3): TVector3 — per-component Sign.
; In: Param1 -> A. Out: xmm0 = Sign(A).XY, xmm1 = Sign(A).Z.
;----------------------------------------------------------------------------
_sign_vector3:
movq xmm0, [Param1]     ; 0 0 Y X
movss xmm1, [Param1+8]  ; 0 0 0 Z
movlhps xmm0, xmm1      ; 0 Z Y X
movaps xmm1, [rel kSSE_ONE]
movaps xmm2, xmm0
movaps xmm3, [rel kSSE_MASK_SIGN]

andps xmm0, xmm3 ; (A < 0)? Yes: 0x80000000, No: 0x00000000
xorps xmm4, xmm4
orps xmm0, xmm1 ; (A < 0)? Yes: -1, No: 1
cmpneqps xmm2, xmm4 ; (A = 0)? Yes: 0x00000000, No: 0xFFFFFFFF
andps xmm0, xmm2 ; (A = 0)? Yes: 0, No: -1 or 1
movhlps xmm1, xmm0 ; Z into xmm1 per return convention
ret

;----------------------------------------------------------------------------
; Sign(const A: TVector4): TVector4 — per-component Sign.
; In: Param1 -> A. Out: xmm0 = Sign(A).XY, xmm1 = Sign(A).ZW.
;----------------------------------------------------------------------------
_sign_vector4:
movups xmm0, [Param1]
movaps xmm1, [rel kSSE_ONE]
movaps xmm2, xmm0
movaps xmm3, [rel kSSE_MASK_SIGN]

andps xmm0, xmm3 ; (A < 0)? Yes: 0x80000000, No: 0x00000000
xorps xmm4, xmm4
orps xmm0, xmm1 ; (A < 0)? Yes: -1, No: 1
cmpneqps xmm2, xmm4 ; (A = 0)? Yes: 0x00000000, No: 0xFFFFFFFF
andps xmm0, xmm2 ; (A = 0)? Yes: 0, No: -1 or 1
movhlps xmm1, xmm0
ret
1954
;----------------------------------------------------------------------------
; Floor(A: Single): Int64 — rounds toward -infinity.
; Temporarily sets the MXCSR rounding mode to Round Down so cvtss2si
; performs the floor; the caller's rounding mode is restored afterwards.
; In: xmm0 = A. Out: rax = Floor(A).
;----------------------------------------------------------------------------
_floor_single:
; Set rounding mode to Round Down
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_DOWN
mov [NewFlags], eax
ldmxcsr [NewFlags]

cvtss2si rax, xmm0

; Restore rounding mode
ldmxcsr [OldFlags]
ret

;----------------------------------------------------------------------------
; Floor(const A: TVector2) — per-component floor; two Int32's packed in rax.
;----------------------------------------------------------------------------
_floor_vector2:
; Set rounding mode to Round Down
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_DOWN
mov [NewFlags], eax
movlps xmm0, [Param1]
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0 ; rounds using current MXCSR mode (= down)

; Restore rounding mode
ldmxcsr [OldFlags]

movq rax, xmm0
ret

;----------------------------------------------------------------------------
; Floor(const A: TVector3) — per-component floor; X,Y in rax, Z in rdx.
;----------------------------------------------------------------------------
_floor_vector3:
; Set rounding mode to Round Down
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_DOWN
mov [NewFlags], eax
movq xmm0, [Param1]     ; 0 0 Y X
movss xmm1, [Param1+8]  ; 0 0 0 Z
movlhps xmm0, xmm1      ; 0 Z Y X
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

; Restore rounding mode
ldmxcsr [OldFlags]

movhlps xmm1, xmm0
movq rax, xmm0
movq rdx, xmm1
ret

;----------------------------------------------------------------------------
; Floor(const A: TVector4) — per-component floor; X,Y in rax, Z,W in rdx.
;----------------------------------------------------------------------------
_floor_vector4:
; Set rounding mode to Round Down
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_DOWN
mov [NewFlags], eax
movups xmm0, [Param1]
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

; Restore rounding mode
ldmxcsr [OldFlags]

movhlps xmm1, xmm0
movq rax, xmm0
movq rdx, xmm1
ret
2029
;----------------------------------------------------------------------------
; Trunc(A: Single): Int64 — rounds toward zero.
; Temporarily sets the MXCSR rounding mode to Truncate for the conversion,
; then restores the caller's mode.
; In: xmm0 = A. Out: rax = Trunc(A).
;----------------------------------------------------------------------------
_trunc_single:
; Set rounding mode to Truncate
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_TRUNC
mov [NewFlags], eax
ldmxcsr [NewFlags]

cvtss2si rax, xmm0

; Restore rounding mode
ldmxcsr [OldFlags]
ret

;----------------------------------------------------------------------------
; Trunc(const A: TVector2) — per-component trunc; two Int32's packed in rax.
;----------------------------------------------------------------------------
_trunc_vector2:
; Set rounding mode to Truncate
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_TRUNC
mov [NewFlags], eax
movlps xmm0, [Param1]
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0 ; rounds using current MXCSR mode (= truncate)

; Restore rounding mode
ldmxcsr [OldFlags]

movq rax, xmm0
ret

;----------------------------------------------------------------------------
; Trunc(const A: TVector3) — per-component trunc; X,Y in rax, Z in rdx.
;----------------------------------------------------------------------------
_trunc_vector3:
; Set rounding mode to Truncate
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_TRUNC
mov [NewFlags], eax
movq xmm0, [Param1]     ; 0 0 Y X
movss xmm1, [Param1+8]  ; 0 0 0 Z
movlhps xmm0, xmm1      ; 0 Z Y X
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

; Restore rounding mode
ldmxcsr [OldFlags]

movhlps xmm1, xmm0
movq rax, xmm0
movq rdx, xmm1
ret

;----------------------------------------------------------------------------
; Trunc(const A: TVector4) — per-component trunc; X,Y in rax, Z,W in rdx.
;----------------------------------------------------------------------------
_trunc_vector4:
; Set rounding mode to Truncate
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_TRUNC
mov [NewFlags], eax
movups xmm0, [Param1]
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

; Restore rounding mode
ldmxcsr [OldFlags]

movhlps xmm1, xmm0
movq rax, xmm0
movq rdx, xmm1
ret
2104
;----------------------------------------------------------------------------
; Round(A: Single): Int64 — round to nearest (even), the MXCSR default.
; NOTE(review): assumes the caller has not changed the MXCSR rounding mode;
; the other rounding routines here always restore it, so that holds.
; In: xmm0 = A. Out: rax = Round(A).
;----------------------------------------------------------------------------
_round_single:
; Rounding mode defaults to round-to-nearest
cvtss2si rax, xmm0
ret

;----------------------------------------------------------------------------
; Round(const A: TVector2) — per-component round; two Int32's packed in rax.
;----------------------------------------------------------------------------
_round_vector2:
; Rounding mode defaults to round-to-nearest
movlps xmm0, [Param1]
cvtps2dq xmm0, xmm0
movq rax, xmm0
ret

;----------------------------------------------------------------------------
; Round(const A: TVector3) — per-component round; X,Y in rax, Z in rdx.
;----------------------------------------------------------------------------
_round_vector3:
; Rounding mode defaults to round-to-nearest
movq xmm0, [Param1]     ; 0 0 Y X
movss xmm1, [Param1+8]  ; 0 0 0 Z
movlhps xmm0, xmm1      ; 0 Z Y X
cvtps2dq xmm0, xmm0
movhlps xmm1, xmm0
movq rax, xmm0
movq rdx, xmm1
ret

;----------------------------------------------------------------------------
; Round(const A: TVector4) — per-component round; X,Y in rax, Z,W in rdx.
;----------------------------------------------------------------------------
_round_vector4:
; Rounding mode defaults to round-to-nearest
movups xmm0, [Param1]
cvtps2dq xmm0, xmm0
movhlps xmm1, xmm0
movq rax, xmm0
movq rdx, xmm1
ret
2136
;----------------------------------------------------------------------------
; Ceil(A: Single): Int64 — rounds toward +infinity.
; Temporarily sets the MXCSR rounding mode to Round Up for the conversion,
; then restores the caller's mode.
; In: xmm0 = A. Out: rax = Ceil(A).
;----------------------------------------------------------------------------
_ceil_single:
; Set rounding mode to Round Up
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_UP
mov [NewFlags], eax
ldmxcsr [NewFlags]

cvtss2si rax, xmm0

; Restore rounding mode
ldmxcsr [OldFlags]
ret

;----------------------------------------------------------------------------
; Ceil(const A: TVector2) — per-component ceil; two Int32's packed in rax.
;----------------------------------------------------------------------------
_ceil_vector2:
; Set rounding mode to Round Up
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_UP
mov [NewFlags], eax
movlps xmm0, [Param1]
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0 ; rounds using current MXCSR mode (= up)

; Restore rounding mode
ldmxcsr [OldFlags]

movq rax, xmm0
ret

;----------------------------------------------------------------------------
; Ceil(const A: TVector3) — per-component ceil; X,Y in rax, Z in rdx.
;----------------------------------------------------------------------------
_ceil_vector3:
; Set rounding mode to Round Up
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_UP
mov [NewFlags], eax
movq xmm0, [Param1]     ; 0 0 Y X
movss xmm1, [Param1+8]  ; 0 0 0 Z
movlhps xmm0, xmm1      ; 0 Z Y X
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

; Restore rounding mode
ldmxcsr [OldFlags]

movhlps xmm1, xmm0
movq rax, xmm0
movq rdx, xmm1
ret

;----------------------------------------------------------------------------
; Ceil(const A: TVector4) — per-component ceil; X,Y in rax, Z,W in rdx.
;----------------------------------------------------------------------------
_ceil_vector4:
; Set rounding mode to Round Up
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_UP
mov [NewFlags], eax
movups xmm0, [Param1]
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

; Restore rounding mode
ldmxcsr [OldFlags]

movhlps xmm1, xmm0
movq rax, xmm0
movq rdx, xmm1
ret
2211
;----------------------------------------------------------------------------
; Frac(const A: TVector2): TVector2 — fractional part: A - Trunc(A).
; Temporarily sets the MXCSR rounding mode to Truncate for the conversion,
; then restores the caller's mode.
; In: Param1 -> A. Out: xmm0 = Frac(A).XY.
;----------------------------------------------------------------------------
_frac_vector2:
; Set rounding mode to Truncate
stmxcsr [OldFlags]
mov edx, [OldFlags]
and edx, SSE_ROUND_MASK
or edx, SSE_ROUND_TRUNC
movlps xmm0, [Param1]
mov [NewFlags], edx
movaps xmm1, xmm0 ; keep original A
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0
ldmxcsr [OldFlags] ; restore rounding mode
cvtdq2ps xmm0, xmm0
subps xmm1, xmm0 ; A - Trunc(A)

movaps xmm0, xmm1
ret

;----------------------------------------------------------------------------
; Frac(const A: TVector3): TVector3 — fractional part: A - Trunc(A).
; Out: xmm0 = Frac(A).XY, xmm1 = Frac(A).Z.
;----------------------------------------------------------------------------
_frac_vector3:
; Set rounding mode to Truncate
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_TRUNC
movq xmm0, [Param1]     ; 0 0 Y X
movss xmm1, [Param1+8]  ; 0 0 0 Z
movlhps xmm0, xmm1      ; 0 Z Y X
mov [NewFlags], eax
movaps xmm1, xmm0 ; keep original A
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0
ldmxcsr [OldFlags] ; restore rounding mode
cvtdq2ps xmm0, xmm0
subps xmm1, xmm0 ; A - Trunc(A)

movaps xmm0, xmm1
movhlps xmm1, xmm1
ret

;----------------------------------------------------------------------------
; Frac(const A: TVector4): TVector4 — fractional part: A - Trunc(A).
; Out: xmm0 = Frac(A).XY, xmm1 = Frac(A).ZW.
;----------------------------------------------------------------------------
_frac_vector4:
; Set rounding mode to Truncate
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_TRUNC
movups xmm0, [Param1]
mov [NewFlags], eax
movaps xmm1, xmm0 ; keep original A
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0
ldmxcsr [OldFlags] ; restore rounding mode
cvtdq2ps xmm0, xmm0
subps xmm1, xmm0 ; A - Trunc(A)

movaps xmm0, xmm1
movhlps xmm1, xmm1
ret
2272
;----------------------------------------------------------------------------
; FMod(const A: TVector2; B: Single): TVector2
; Result := A - (B * Trunc(A / B)), computed per component with B replicated.
; Temporarily sets the MXCSR rounding mode to Truncate for the conversion,
; then restores the caller's mode.
; In: Param1 -> A, xmm0 = B. Out: xmm0 = Result.XY.
;----------------------------------------------------------------------------
_fmod_vector2_single:
; Set rounding mode to Truncate
movss xmm1, xmm0
movlps xmm0, [Param1]
stmxcsr [OldFlags]
mov ecx, [OldFlags]
shufps xmm1, xmm1, 0x00 ; Replicate B
and ecx, SSE_ROUND_MASK
movaps xmm2, xmm0
or ecx, SSE_ROUND_TRUNC
movaps xmm3, xmm1
mov [NewFlags], ecx
divps xmm2, xmm3 ; A / B
ldmxcsr [NewFlags]

cvtps2dq xmm2, xmm2
cvtdq2ps xmm2, xmm2 ; Trunc(A / B)
mulps xmm2, xmm1
subps xmm0, xmm2 ; A - (B * Trunc(A / B))

; Restore rounding mode
ldmxcsr [OldFlags]
ret

;----------------------------------------------------------------------------
; FMod(const A: TVector3; B: Single): TVector3
; In: Param1 -> A, xmm0 = B. Out: xmm0 = Result.XY, xmm1 = Result.Z.
;----------------------------------------------------------------------------
_fmod_vector3_single:
; Set rounding mode to Truncate
movss xmm1, xmm0
movq xmm0, [Param1]     ; 0 0 Y X
movss xmm2, [Param1+8]  ; 0 0 0 Z
movlhps xmm0, xmm2      ; 0 Z Y X
stmxcsr [OldFlags]
mov edx, [OldFlags]
shufps xmm1, xmm1, 0x00 ; Replicate B
and edx, SSE_ROUND_MASK
movaps xmm2, xmm0
or edx, SSE_ROUND_TRUNC
movaps xmm3, xmm1
mov [NewFlags], edx
divps xmm2, xmm3 ; A / B
ldmxcsr [NewFlags]

cvtps2dq xmm2, xmm2
cvtdq2ps xmm2, xmm2 ; Trunc(A / B)
mulps xmm2, xmm1
subps xmm0, xmm2 ; A - (B * Trunc(A / B))

; Restore rounding mode
ldmxcsr [OldFlags]

movhlps xmm1, xmm0
ret

;----------------------------------------------------------------------------
; FMod(const A: TVector4; B: Single): TVector4
; In: Param1 -> A, xmm0 = B. Out: xmm0 = Result.XY, xmm1 = Result.ZW.
;----------------------------------------------------------------------------
_fmod_vector4_single:
; Set rounding mode to Truncate
movss xmm1, xmm0
movups xmm0, [Param1]
stmxcsr [OldFlags]
mov edx, [OldFlags]
shufps xmm1, xmm1, 0x00 ; Replicate B
and edx, SSE_ROUND_MASK
movaps xmm2, xmm0
or edx, SSE_ROUND_TRUNC
movaps xmm3, xmm1
mov [NewFlags], edx
divps xmm2, xmm3 ; A / B
ldmxcsr [NewFlags]

cvtps2dq xmm2, xmm2
cvtdq2ps xmm2, xmm2 ; Trunc(A / B)
mulps xmm2, xmm1
subps xmm0, xmm2 ; A - (B * Trunc(A / B))

; Restore rounding mode
ldmxcsr [OldFlags]

movhlps xmm1, xmm0
ret

;----------------------------------------------------------------------------
; FMod(const A, B: TVector2): TVector2 — per-component A mod B.
; In: Param1 -> A, Param2 -> B. Out: xmm0 = Result.XY.
;----------------------------------------------------------------------------
_fmod_vector2:
; Set rounding mode to Truncate
movlps xmm0, [Param1]
stmxcsr [OldFlags]
movlps xmm1, [Param2]
mov edx, [OldFlags]
movaps xmm2, xmm0
and edx, SSE_ROUND_MASK
movaps xmm3, xmm1
or edx, SSE_ROUND_TRUNC
divps xmm2, xmm3 ; A / B
mov [NewFlags], edx
ldmxcsr [NewFlags]

cvtps2dq xmm2, xmm2
cvtdq2ps xmm2, xmm2 ; Trunc(A / B)
mulps xmm2, xmm1
subps xmm0, xmm2 ; A - (B * Trunc(A / B))

; Restore rounding mode
ldmxcsr [OldFlags]
ret

;----------------------------------------------------------------------------
; FMod(const A, B: TVector3): TVector3 — per-component A mod B.
; In: Param1 -> A, Param2 -> B. Out: xmm0 = Result.XY, xmm1 = Result.Z.
;----------------------------------------------------------------------------
_fmod_vector3:
; Set rounding mode to Truncate
movq xmm0, [Param1]
movss xmm1, [Param1+8]
movlhps xmm0, xmm1
stmxcsr [OldFlags]
movq xmm1, [Param2]
movss xmm2, [Param2+8]
movlhps xmm1, xmm2
mov edx, [OldFlags]
movaps xmm2, xmm0
and edx, SSE_ROUND_MASK
movaps xmm3, xmm1
or edx, SSE_ROUND_TRUNC
divps xmm2, xmm3 ; A / B
mov [NewFlags], edx
ldmxcsr [NewFlags]

cvtps2dq xmm2, xmm2
cvtdq2ps xmm2, xmm2 ; Trunc(A / B)
mulps xmm2, xmm1
subps xmm0, xmm2 ; A - (B * Trunc(A / B))

; Restore rounding mode
ldmxcsr [OldFlags]

movhlps xmm1, xmm0
ret

;----------------------------------------------------------------------------
; FMod(const A, B: TVector4): TVector4 — per-component A mod B.
; In: Param1 -> A, Param2 -> B. Out: xmm0 = Result.XY, xmm1 = Result.ZW.
;----------------------------------------------------------------------------
_fmod_vector4:
; Set rounding mode to Truncate
movups xmm0, [Param1]
stmxcsr [OldFlags]
movups xmm1, [Param2]
mov edx, [OldFlags]
movaps xmm2, xmm0
and edx, SSE_ROUND_MASK
movaps xmm3, xmm1
or edx, SSE_ROUND_TRUNC
divps xmm2, xmm3 ; A / B
mov [NewFlags], edx
ldmxcsr [NewFlags]

cvtps2dq xmm2, xmm2
cvtdq2ps xmm2, xmm2 ; Trunc(A / B)
mulps xmm2, xmm1
subps xmm0, xmm2 ; A - (B * Trunc(A / B))

; Restore rounding mode
ldmxcsr [OldFlags]

movhlps xmm1, xmm0
ret
2427
;----------------------------------------------------------------------------
; ModF(const A: TVector2; out B): TVector2
; Splits A into integer and fractional parts: stores Trunc(A) (as Int32's)
; into the buffer at Param2 and returns the fractional part A - Trunc(A).
; Temporarily sets the MXCSR rounding mode to Truncate, then restores it.
; In: Param1 -> A, Param2 -> out B. Out: xmm0 = Frac(A).XY.
;----------------------------------------------------------------------------
_modf_vector2:
movlps xmm0, [Param1]

; Set rounding mode to Truncate
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_TRUNC
mov [NewFlags], eax
ldmxcsr [NewFlags]

movaps xmm1, xmm0 ; keep original A
cvtps2dq xmm0, xmm0
movlps [Param2], xmm0 ; B = Trunc(A)
cvtdq2ps xmm0, xmm0
subps xmm1, xmm0 ; A - Trunc(A)

; Restore rounding mode
ldmxcsr [OldFlags]

movaps xmm0, xmm1
ret

;----------------------------------------------------------------------------
; ModF(const A: TVector3; out B): TVector3 — as above, for 3 components.
; Out: xmm0 = Frac(A).XY, xmm1 = Frac(A).Z; [Param2] = Trunc(A).
;----------------------------------------------------------------------------
_modf_vector3:
movq xmm0, [Param1]     ; 0 0 Y X
movss xmm1, [Param1+8]  ; 0 0 0 Z
movlhps xmm0, xmm1      ; 0 Z Y X

; Set rounding mode to Truncate
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_TRUNC
mov [NewFlags], eax
ldmxcsr [NewFlags]

movaps xmm1, xmm0 ; keep original A
cvtps2dq xmm0, xmm0
movhlps xmm2, xmm0
movq [Param2], xmm0 ; B = Trunc(A)
movd [Param2+8], xmm2
cvtdq2ps xmm0, xmm0
subps xmm1, xmm0 ; A - Trunc(A)

; Restore rounding mode
ldmxcsr [OldFlags]

movaps xmm0, xmm1
movhlps xmm1, xmm1
ret

;----------------------------------------------------------------------------
; ModF(const A: TVector4; out B): TVector4 — as above, for 4 components.
; Out: xmm0 = Frac(A).XY, xmm1 = Frac(A).ZW; [Param2] = Trunc(A).
;----------------------------------------------------------------------------
_modf_vector4:
movups xmm0, [Param1]

; Set rounding mode to Truncate
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_TRUNC
mov [NewFlags], eax
ldmxcsr [NewFlags]

movaps xmm1, xmm0 ; keep original A
cvtps2dq xmm0, xmm0
movups [Param2], xmm0 ; B = Trunc(A)
cvtdq2ps xmm0, xmm0
subps xmm1, xmm0 ; A - Trunc(A)

; Restore rounding mode
ldmxcsr [OldFlags]

movaps xmm0, xmm1
movhlps xmm1, xmm1
ret
2502
;----------------------------------------------------------------------------
; Min(const A: TVector2; B: Single): TVector2 — per-component minimum of A
; and scalar B. In: Param1 -> A, xmm0 = B. Out: xmm0 = Result.XY.
;----------------------------------------------------------------------------
_min_vector2_single:
shufps xmm0, xmm0, 0x00 ; Replicate B
movlps xmm1, [Param1]
minps xmm0, xmm1
ret

;----------------------------------------------------------------------------
; Min(const A: TVector3; B: Single): TVector3
; In: Param1 -> A, xmm0 = B. Out: xmm0 = Result.XY, xmm1 = Result.Z.
;----------------------------------------------------------------------------
_min_vector3_single:
shufps xmm0, xmm0, 0x00 ; Replicate B
movq xmm1, [Param1]     ; 0 0 Y X
movss xmm2, [Param1+8]  ; 0 0 0 Z
movlhps xmm1, xmm2      ; 0 Z Y X
minps xmm0, xmm1
movhlps xmm1, xmm0
ret

;----------------------------------------------------------------------------
; Min(const A: TVector4; B: Single): TVector4
; In: Param1 -> A, xmm0 = B. Out: xmm0 = Result.XY, xmm1 = Result.ZW.
;----------------------------------------------------------------------------
_min_vector4_single:
shufps xmm0, xmm0, 0x00 ; Replicate B
movups xmm1, [Param1]
minps xmm0, xmm1
movhlps xmm1, xmm0
ret

;----------------------------------------------------------------------------
; Min(const A, B: TVector2): TVector2 — per-component minimum.
;----------------------------------------------------------------------------
_min_vector2:
movlps xmm0, [Param1]
movlps xmm1, [Param2]
minps xmm0, xmm1
ret

;----------------------------------------------------------------------------
; Min(const A, B: TVector3): TVector3 — per-component minimum.
;----------------------------------------------------------------------------
_min_vector3:
movq xmm0, [Param1]
movss xmm1, [Param1+8]
movlhps xmm0, xmm1
movq xmm1, [Param2]
movss xmm2, [Param2+8]
movlhps xmm1, xmm2
minps xmm0, xmm1
movhlps xmm1, xmm0
ret

;----------------------------------------------------------------------------
; Min(const A, B: TVector4): TVector4 — per-component minimum.
;----------------------------------------------------------------------------
_min_vector4:
movups xmm0, [Param1]
movups xmm1, [Param2]
minps xmm0, xmm1
movhlps xmm1, xmm0
ret
2548
;----------------------------------------------------------------------------
; Max(const A: TVector2; B: Single): TVector2 — per-component maximum of A
; and scalar B. In: Param1 -> A, xmm0 = B. Out: xmm0 = Result.XY.
;----------------------------------------------------------------------------
_max_vector2_single:
shufps xmm0, xmm0, 0x00 ; Replicate B
movlps xmm1, [Param1]
maxps xmm0, xmm1
ret

;----------------------------------------------------------------------------
; Max(const A: TVector3; B: Single): TVector3
; In: Param1 -> A, xmm0 = B. Out: xmm0 = Result.XY, xmm1 = Result.Z.
;----------------------------------------------------------------------------
_max_vector3_single:
shufps xmm0, xmm0, 0x00 ; Replicate B
movq xmm1, [Param1]     ; 0 0 Y X
movss xmm2, [Param1+8]  ; 0 0 0 Z
movlhps xmm1, xmm2      ; 0 Z Y X
maxps xmm0, xmm1
movhlps xmm1, xmm0
ret

;----------------------------------------------------------------------------
; Max(const A: TVector4; B: Single): TVector4
; In: Param1 -> A, xmm0 = B. Out: xmm0 = Result.XY, xmm1 = Result.ZW.
;----------------------------------------------------------------------------
_max_vector4_single:
shufps xmm0, xmm0, 0x00 ; Replicate B
movups xmm1, [Param1]
maxps xmm0, xmm1
movhlps xmm1, xmm0
ret

;----------------------------------------------------------------------------
; Max(const A, B: TVector2): TVector2 — per-component maximum.
;----------------------------------------------------------------------------
_max_vector2:
movlps xmm0, [Param1]
movlps xmm1, [Param2]
maxps xmm0, xmm1
ret

;----------------------------------------------------------------------------
; Max(const A, B: TVector3): TVector3 — per-component maximum.
;----------------------------------------------------------------------------
_max_vector3:
movq xmm0, [Param1]
movss xmm1, [Param1+8]
movlhps xmm0, xmm1
movq xmm1, [Param2]
movss xmm2, [Param2+8]
movlhps xmm1, xmm2
maxps xmm0, xmm1
movhlps xmm1, xmm0
ret

;----------------------------------------------------------------------------
; Max(const A, B: TVector4): TVector4 — per-component maximum.
;----------------------------------------------------------------------------
_max_vector4:
movups xmm0, [Param1]
movups xmm1, [Param2]
maxps xmm0, xmm1
movhlps xmm1, xmm0
ret
2594
;----------------------------------------------------------------------------
; EnsureRange(A, AMin, AMax: Single): Single — clamps A to [AMin, AMax].
; In: xmm0 = A, xmm1 = AMin, xmm2 = AMax. Out: xmm0 = clamped A.
;----------------------------------------------------------------------------
_ensure_range_single:
maxss xmm0, xmm1 ; Max(A, AMin)
minss xmm0, xmm2 ; Min(..., AMax)
ret

;----------------------------------------------------------------------------
; EnsureRange(const A: TVector2; AMin, AMax: Single): TVector2
; In: Param1 -> A, xmm0 = AMin, xmm1 = AMax. Out: xmm0 = Result.XY.
;----------------------------------------------------------------------------
_ensure_range_vector2_single:
shufps xmm0, xmm0, 0x00 ; Replicate AMin
shufps xmm1, xmm1, 0x00 ; Replicate AMax
movlps xmm2, [Param1]
minps xmm2, xmm1 ; Min(A, AMax)
maxps xmm0, xmm2 ; Max(AMin, ...)
ret

;----------------------------------------------------------------------------
; EnsureRange(const A: TVector3; AMin, AMax: Single): TVector3
; In: Param1 -> A, xmm0 = AMin, xmm1 = AMax.
; Out: xmm0 = Result.XY, xmm1 = Result.Z.
;----------------------------------------------------------------------------
_ensure_range_vector3_single:
shufps xmm0, xmm0, 0x00 ; Replicate AMin
shufps xmm1, xmm1, 0x00 ; Replicate AMax
movq xmm2, [Param1]     ; 0 0 Y X
movss xmm3, [Param1+8]  ; 0 0 0 Z
movlhps xmm2, xmm3      ; 0 Z Y X
minps xmm2, xmm1 ; Min(A, AMax)
maxps xmm0, xmm2 ; Max(AMin, ...)
movhlps xmm1, xmm0
ret

;----------------------------------------------------------------------------
; EnsureRange(const A: TVector4; AMin, AMax: Single): TVector4
; In: Param1 -> A, xmm0 = AMin, xmm1 = AMax.
; Out: xmm0 = Result.XY, xmm1 = Result.ZW.
;----------------------------------------------------------------------------
_ensure_range_vector4_single:
shufps xmm0, xmm0, 0x00 ; Replicate AMin
shufps xmm1, xmm1, 0x00 ; Replicate AMax
movups xmm2, [Param1]
minps xmm2, xmm1 ; Min(A, AMax)
maxps xmm0, xmm2 ; Max(AMin, ...)
movhlps xmm1, xmm0
ret

;----------------------------------------------------------------------------
; EnsureRange(const A, AMin, AMax: TVector2): TVector2 — per-component clamp.
;----------------------------------------------------------------------------
_ensure_range_vector2:
movlps xmm0, [Param1]
movlps xmm1, [Param2]
movlps xmm2, [Param3]
maxps xmm0, xmm1 ; Max(A, AMin)
minps xmm0, xmm2 ; Min(..., AMax)
ret

;----------------------------------------------------------------------------
; EnsureRange(const A, AMin, AMax: TVector3): TVector3 — per-component clamp.
;----------------------------------------------------------------------------
_ensure_range_vector3:
movq xmm0, [Param1]
movss xmm1, [Param1+8]
movlhps xmm0, xmm1
movq xmm1, [Param2]
movss xmm2, [Param2+8]
movlhps xmm1, xmm2
movq xmm2, [Param3]
movss xmm3, [Param3+8]
movlhps xmm2, xmm3
maxps xmm0, xmm1 ; Max(A, AMin)
minps xmm0, xmm2 ; Min(..., AMax)
movhlps xmm1, xmm0
ret

;----------------------------------------------------------------------------
; EnsureRange(const A, AMin, AMax: TVector4): TVector4 — per-component clamp.
;----------------------------------------------------------------------------
_ensure_range_vector4:
movups xmm0, [Param1]
movups xmm1, [Param2]
movups xmm2, [Param3]
maxps xmm0, xmm1 ; Max(A, AMin)
minps xmm0, xmm2 ; Min(..., AMax)
movhlps xmm1, xmm0
ret
2659
;----------------------------------------------------------------------------
; Mix(const A, B: TVector3; T: Single): TVector3 — linear interpolation
; Result := A + (T * (B - A)), with scalar T replicated to all lanes.
; In: Param1 -> A, Param2 -> B, xmm0 = T.
; Out: xmm0 = Result.XY, xmm1 = Result.Z.
;----------------------------------------------------------------------------
_mix_vector3_single:
movq xmm4, [Param1]     ; 0 0 A.Y A.X
movss xmm1, [Param1+8]  ; 0 0 0 A.Z
movlhps xmm4, xmm1      ; 0 A.Z A.Y A.X
movq xmm1, [Param2]
movss xmm2, [Param2+8]
movlhps xmm1, xmm2      ; 0 B.Z B.Y B.X
shufps xmm0, xmm0, 0x00 ; Replicate T
subps xmm1, xmm4 ; B - A
mulps xmm1, xmm0 ; T * (B - A)
addps xmm4, xmm1 ; A + (T * (B - A))
movhlps xmm1, xmm4
movaps xmm0, xmm4
ret

;----------------------------------------------------------------------------
; Mix(const A, B: TVector4; T: Single): TVector4 — linear interpolation.
; In: Param1 -> A, Param2 -> B, xmm0 = T.
; Out: xmm0 = Result.XY, xmm1 = Result.ZW.
;----------------------------------------------------------------------------
_mix_vector4_single:
movups xmm4, [Param1]
movups xmm1, [Param2]
shufps xmm0, xmm0, 0x00 ; Replicate T
subps xmm1, xmm4 ; B - A
mulps xmm1, xmm0 ; T * (B - A)
addps xmm4, xmm1 ; A + (T * (B - A))
movaps xmm0, xmm4
movhlps xmm1, xmm4
ret

;----------------------------------------------------------------------------
; Mix(const A, B, T: TVector3): TVector3 — per-component interpolation
; with a vector of weights T.
;----------------------------------------------------------------------------
_mix_vector3:
movq xmm0, [Param1]
movss xmm1, [Param1+8]
movlhps xmm0, xmm1
movq xmm1, [Param2]
movss xmm2, [Param2+8]
movlhps xmm1, xmm2
movq xmm2, [Param3]
movss xmm3, [Param3+8]
movlhps xmm2, xmm3
subps xmm1, xmm0 ; B - A
mulps xmm1, xmm2 ; T * (B - A)
addps xmm0, xmm1 ; A + (T * (B - A))
movhlps xmm1, xmm0
ret

;----------------------------------------------------------------------------
; Mix(const A, B, T: TVector4): TVector4 — per-component interpolation.
;----------------------------------------------------------------------------
_mix_vector4:
movups xmm0, [Param1]
movups xmm1, [Param2]
movups xmm2, [Param3]
subps xmm1, xmm0 ; B - A
mulps xmm1, xmm2 ; T * (B - A)
addps xmm0, xmm1 ; A + (T * (B - A))
movhlps xmm1, xmm0
ret
2711
;----------------------------------------------------------------------------
; Step(AEdge: Single; const A: TVector2): TVector2 — per component:
; 0.0 if A < AEdge, else 1.0.
; In: xmm0 = AEdge, Param1 -> A. Out: xmm0 = Result.XY.
;----------------------------------------------------------------------------
_step_single_vector2:
movlps xmm1, [Param1]
shufps xmm0, xmm0, 0x00 ; Replicate AEdge
movlps xmm2, [rel kSSE_ONE]
cmpnltps xmm1, xmm0 ; (A >= AEdge)? Yes: 0xFFFFFFFF, No: 0x00000000
andps xmm1, xmm2 ; (A >= AEdge)? Yes: 1, No: 0
movaps xmm0, xmm1
ret

;----------------------------------------------------------------------------
; Step(AEdge: Single; const A: TVector3): TVector3
; In: xmm0 = AEdge, Param1 -> A. Out: xmm0 = Result.XY, xmm1 = Result.Z.
;----------------------------------------------------------------------------
_step_single_vector3:
movq xmm3, [Param1]     ; 0 0 Y X
movss xmm2, [Param1+8]  ; 0 0 0 Z
movlhps xmm3, xmm2      ; 0 Z Y X
shufps xmm0, xmm0, 0x00 ; Replicate AEdge
movaps xmm2, [rel kSSE_ONE]
cmpnltps xmm3, xmm0 ; (A >= AEdge)? Yes: 0xFFFFFFFF, No: 0x00000000
andps xmm3, xmm2 ; (A >= AEdge)? Yes: 1, No: 0
movaps xmm0, xmm3
movhlps xmm1, xmm3
ret

;----------------------------------------------------------------------------
; Step(AEdge: Single; const A: TVector4): TVector4
; In: xmm0 = AEdge, Param1 -> A. Out: xmm0 = Result.XY, xmm1 = Result.ZW.
;----------------------------------------------------------------------------
_step_single_vector4:
movups xmm3, [Param1]
shufps xmm0, xmm0, 0x00 ; Replicate AEdge
movaps xmm2, [rel kSSE_ONE]
cmpnltps xmm3, xmm0 ; (A >= AEdge)? Yes: 0xFFFFFFFF, No: 0x00000000
andps xmm3, xmm2 ; (A >= AEdge)? Yes: 1, No: 0
movaps xmm0, xmm3
movhlps xmm1, xmm3
ret

;----------------------------------------------------------------------------
; Step(const AEdge, A: TVector2): TVector2 — per-component edge vector.
; In: Param1 -> AEdge, Param2 -> A. Out: xmm0 = Result.XY.
;----------------------------------------------------------------------------
_step_vector2:
movlps xmm0, [Param1]
movlps xmm1, [Param2]
movlps xmm2, [rel kSSE_ONE]
cmpnltps xmm1, xmm0 ; (A >= AEdge)? Yes: 0xFFFFFFFF, No: 0x00000000
andps xmm1, xmm2 ; (A >= AEdge)? Yes: 1, No: 0
movaps xmm0, xmm1
ret

;----------------------------------------------------------------------------
; Step(const AEdge, A: TVector3): TVector3 — per-component edge vector.
;----------------------------------------------------------------------------
_step_vector3:
movq xmm0, [Param1]
movss xmm1, [Param1+8]
movlhps xmm0, xmm1
movq xmm1, [Param2]
movss xmm2, [Param2+8]
movlhps xmm1, xmm2
movaps xmm2, [rel kSSE_ONE]
cmpnltps xmm1, xmm0 ; (A >= AEdge)? Yes: 0xFFFFFFFF, No: 0x00000000
andps xmm1, xmm2 ; (A >= AEdge)? Yes: 1, No: 0
movaps xmm0, xmm1
movhlps xmm1, xmm1
ret

;----------------------------------------------------------------------------
; Step(const AEdge, A: TVector4): TVector4 — per-component edge vector.
;----------------------------------------------------------------------------
_step_vector4:
movups xmm0, [Param1]
movups xmm1, [Param2]
movaps xmm2, [rel kSSE_ONE]
cmpnltps xmm1, xmm0 ; (A >= AEdge)? Yes: 0xFFFFFFFF, No: 0x00000000
andps xmm1, xmm2 ; (A >= AEdge)? Yes: 1, No: 0
movaps xmm0, xmm1
movhlps xmm1, xmm1
ret
2775
;----------------------------------------------------------------------------
; SmoothStep(AEdge0, AEdge1: Single; const A: TVector3): TVector3
; Hermite interpolation: 0 where A < AEdge0, 1 where A > AEdge1, otherwise
;   Temp := (A - AEdge0) / (AEdge1 - AEdge0); Result := Temp^2 * (3 - 2*Temp)
; In: xmm0 = AEdge0, xmm1 = AEdge1, Param1 -> A.
; Out: xmm0 = Result.XY, xmm1 = Result.Z.
; Note: a dead "movaps xmm5, xmm2" was removed; xmm5 was unconditionally
; overwritten by "movaps xmm5, xmm4" before any use.
;----------------------------------------------------------------------------
_smooth_step_single_vector3:
movq xmm2, [Param1]     ; 0 0 Y X
movss xmm3, [Param1+8]  ; 0 0 0 Z
movlhps xmm2, xmm3      ; 0 Z Y X
shufps xmm0, xmm0, 0x00 ; Replicate AEdge0
shufps xmm1, xmm1, 0x00 ; Replicate AEdge1
movaps xmm3, xmm2
movaps xmm4, xmm2
movaps xmm6, [rel kSSE_ONE]

cmpnltps xmm3, xmm0 ; (A >= AEdge0)? Yes: 0xFFFFFFFF, No: 0x00000000
cmpleps xmm4, xmm1 ; (A <= AEdge1)? Yes: 0xFFFFFFFF, No: 0x00000000
subps xmm1, xmm0 ; AEdge1 - AEdge0
movaps xmm5, xmm4
subps xmm2, xmm0 ; A - AEdge0
andnps xmm5, xmm6 ; (A > AEdge1)? Yes: 1.0, No: 0.0

movaps xmm6, [rel kSSE_TWO]
divps xmm2, xmm1 ; Temp := (A - AEdge0) / (AEdge1 - AEdge0)
movaps xmm7, [rel kSSE_THREE]
mulps xmm6, xmm2 ; 2 * Temp
subps xmm7, xmm6 ; 3 - (2 * Temp)
mulps xmm7, xmm2
mulps xmm7, xmm2 ; Result := Temp * Temp * (3 - (2 * Temp))
andps xmm7, xmm3 ; (A < AEdge0)? Yes: 0, No: Result
andps xmm7, xmm4 ; (A > AEdge1)? Yes: 0, No: Result
orps xmm7, xmm5 ; (A > AEdge1)? Yes: 1, No: Result

movaps xmm0, xmm7
movhlps xmm1, xmm7
ret
2808
;----------------------------------------------------------------------------
; SmoothStep(AEdge0, AEdge1: Single; const A: TVector4): TVector4
; Hermite interpolation: 0 where A < AEdge0, 1 where A > AEdge1, otherwise
;   Temp := (A - AEdge0) / (AEdge1 - AEdge0); Result := Temp^2 * (3 - 2*Temp)
; In: xmm0 = AEdge0, xmm1 = AEdge1, Param1 -> A.
; Out: xmm0 = Result.XY, xmm1 = Result.ZW.
; Note: a dead "movaps xmm5, xmm2" was removed; xmm5 was unconditionally
; overwritten by "movaps xmm5, xmm4" before any use.
;----------------------------------------------------------------------------
_smooth_step_single_vector4:
movups xmm2, [Param1]
shufps xmm0, xmm0, 0x00 ; Replicate AEdge0
shufps xmm1, xmm1, 0x00 ; Replicate AEdge1
movaps xmm3, xmm2
movaps xmm4, xmm2
movaps xmm6, [rel kSSE_ONE]

cmpnltps xmm3, xmm0 ; (A >= AEdge0)? Yes: 0xFFFFFFFF, No: 0x00000000
cmpleps xmm4, xmm1 ; (A <= AEdge1)? Yes: 0xFFFFFFFF, No: 0x00000000
subps xmm1, xmm0 ; AEdge1 - AEdge0
movaps xmm5, xmm4
subps xmm2, xmm0 ; A - AEdge0
andnps xmm5, xmm6 ; (A > AEdge1)? Yes: 1.0, No: 0.0

movaps xmm6, [rel kSSE_TWO]
divps xmm2, xmm1 ; Temp := (A - AEdge0) / (AEdge1 - AEdge0)
movaps xmm7, [rel kSSE_THREE]
mulps xmm6, xmm2 ; 2 * Temp
subps xmm7, xmm6 ; 3 - (2 * Temp)
mulps xmm7, xmm2
mulps xmm7, xmm2 ; Result := Temp * Temp * (3 - (2 * Temp))
andps xmm7, xmm3 ; (A < AEdge0)? Yes: 0, No: Result
andps xmm7, xmm4 ; (A > AEdge1)? Yes: 0, No: Result
orps xmm7, xmm5 ; (A > AEdge1)? Yes: 1, No: Result

movaps xmm0, xmm7
movhlps xmm1, xmm7
ret
2839
;----------------------------------------------------------------------------
; SmoothStep(const AEdge0, AEdge1, A: TVector3): TVector3
; Hermite interpolation with per-component edges (see scalar-edge variant).
; In: Param1 -> AEdge0, Param2 -> AEdge1, Param3 -> A.
; Out: xmm0 = Result.XY, xmm1 = Result.Z.
; Note: a dead "movaps xmm5, xmm2" was removed; xmm5 was unconditionally
; overwritten by "movaps xmm5, xmm4" before any use.
;----------------------------------------------------------------------------
_smooth_step_vector3:
movq xmm2, [Param3]     ; A
movss xmm3, [Param3+8]
movlhps xmm2, xmm3
movq xmm0, [Param1]     ; AEdge0
movss xmm1, [Param1+8]
movlhps xmm0, xmm1
movq xmm1, [Param2]     ; AEdge1
movss xmm3, [Param2+8]
movlhps xmm1, xmm3

movaps xmm3, xmm2
movaps xmm4, xmm2
movaps xmm6, [rel kSSE_ONE]

cmpnltps xmm3, xmm0 ; (A >= AEdge0)? Yes: 0xFFFFFFFF, No: 0x00000000
cmpleps xmm4, xmm1 ; (A <= AEdge1)? Yes: 0xFFFFFFFF, No: 0x00000000
subps xmm1, xmm0 ; AEdge1 - AEdge0
movaps xmm5, xmm4
subps xmm2, xmm0 ; A - AEdge0
andnps xmm5, xmm6 ; (A > AEdge1)? Yes: 1.0, No: 0.0

movaps xmm6, [rel kSSE_TWO]
divps xmm2, xmm1 ; Temp := (A - AEdge0) / (AEdge1 - AEdge0)
movaps xmm7, [rel kSSE_THREE]
mulps xmm6, xmm2 ; 2 * Temp
subps xmm7, xmm6 ; 3 - (2 * Temp)
mulps xmm7, xmm2
mulps xmm7, xmm2 ; Result := Temp * Temp * (3 - (2 * Temp))
andps xmm7, xmm3 ; (A < AEdge0)? Yes: 0, No: Result
andps xmm7, xmm4 ; (A > AEdge1)? Yes: 0, No: Result
orps xmm7, xmm5 ; (A > AEdge1)? Yes: 1, No: Result

movaps xmm0, xmm7
movhlps xmm1, xmm7
ret
2877
;----------------------------------------------------------------------------
; SmoothStep(const AEdge0, AEdge1, A: TVector4): TVector4
; Hermite interpolation with per-component edges (see scalar-edge variant).
; In: Param1 -> AEdge0, Param2 -> AEdge1, Param3 -> A.
; Out: xmm0 = Result.XY, xmm1 = Result.ZW.
; Note: a dead "movaps xmm5, xmm2" was removed; xmm5 was unconditionally
; overwritten by "movaps xmm5, xmm4" before any use.
;----------------------------------------------------------------------------
_smooth_step_vector4:
movups xmm2, [Param3] ; A
movups xmm0, [Param1] ; AEdge0
movups xmm1, [Param2] ; AEdge1
movaps xmm3, xmm2
movaps xmm4, xmm2
movaps xmm6, [rel kSSE_ONE]

cmpnltps xmm3, xmm0 ; (A >= AEdge0)? Yes: 0xFFFFFFFF, No: 0x00000000
cmpleps xmm4, xmm1 ; (A <= AEdge1)? Yes: 0xFFFFFFFF, No: 0x00000000
subps xmm1, xmm0 ; AEdge1 - AEdge0
movaps xmm5, xmm4
subps xmm2, xmm0 ; A - AEdge0
andnps xmm5, xmm6 ; (A > AEdge1)? Yes: 1.0, No: 0.0

movaps xmm6, [rel kSSE_TWO]
divps xmm2, xmm1 ; Temp := (A - AEdge0) / (AEdge1 - AEdge0)
movaps xmm7, [rel kSSE_THREE]
mulps xmm6, xmm2 ; 2 * Temp
subps xmm7, xmm6 ; 3 - (2 * Temp)
mulps xmm7, xmm2
mulps xmm7, xmm2 ; Result := Temp * Temp * (3 - (2 * Temp))
andps xmm7, xmm3 ; (A < AEdge0)? Yes: 0, No: Result
andps xmm7, xmm4 ; (A > AEdge1)? Yes: 0, No: Result
orps xmm7, xmm5 ; (A > AEdge1)? Yes: 1, No: Result

movaps xmm0, xmm7
movhlps xmm1, xmm7
ret
2908
;----------------------------------------------------------------------------
; FMA(const A, B, C: TVector2): TVector2 — Result := (A * B) + C.
; NOTE(review): implemented as separate mul + add (not a fused FMA
; instruction), so it rounds twice like ordinary arithmetic.
;----------------------------------------------------------------------------
_fma_vector2:
movlps xmm0, [Param1]
movlps xmm1, [Param2]
movlps xmm2, [Param3]
mulps xmm0, xmm1 ; A * B
addps xmm0, xmm2 ; (A * B) + C
ret

;----------------------------------------------------------------------------
; FMA(const A, B, C: TVector3): TVector3 — Result := (A * B) + C.
;----------------------------------------------------------------------------
_fma_vector3:
movq xmm0, [Param1]
movss xmm1, [Param1+8]
movlhps xmm0, xmm1
movq xmm1, [Param2]
movss xmm2, [Param2+8]
movlhps xmm1, xmm2
movq xmm2, [Param3]
movss xmm3, [Param3+8]
movlhps xmm2, xmm3
mulps xmm0, xmm1 ; A * B
addps xmm0, xmm2 ; (A * B) + C
movhlps xmm1, xmm0
ret

;----------------------------------------------------------------------------
; FMA(const A, B, C: TVector4): TVector4 — Result := (A * B) + C.
;----------------------------------------------------------------------------
_fma_vector4:
movups xmm0, [Param1]
movups xmm1, [Param2]
movups xmm2, [Param3]
mulps xmm0, xmm1 ; A * B
addps xmm0, xmm2 ; (A * B) + C
movhlps xmm1, xmm0
ret
2940
2941;****************************************************************************
2942; Matrix Functions
2943;****************************************************************************
2944
;----------------------------------------------------------------------------
; OuterProduct(const C, R: TVector2): TMatrix2 — M[i,j] := C[i] * R[j].
; The FM_COLUMN_MAJOR define swaps which parameter is treated as the column.
; Out: the 2x2 matrix (4 Singles) in xmm0 (row 0) and xmm1 (row 1).
;----------------------------------------------------------------------------
_outer_product_matrix2:
%ifdef FM_COLUMN_MAJOR
movlps xmm0, [Param2]
movlps xmm1, [Param1]
%else
movlps xmm0, [Param1] ; # # C.Y C.X
movlps xmm1, [Param2] ; # # R.Y R.X
%endif

shufps xmm0, xmm0, 0x50 ; C.Y C.X C.Y C.X
shufps xmm1, xmm1, 0x44 ; R.Y R.Y R.X R.X

mulps xmm0, xmm1 ; (C.Y*R.Y) (C.X*R.Y) (C.Y*R.X) (C.X*R.X)

; Store as matrix
movhlps xmm1, xmm0
ret
2962
;----------------------------------------------------------------------------
; OuterProduct(const C, R: TVector3): TMatrix3 — M[i,j] := C[i] * R[j].
; The result (> 128 bits) is written through Param1 per the ABI; the column
; and row operands are then Param2/Param3 (swapped under FM_COLUMN_MAJOR).
; Rows are stored as 3 consecutive Singles each (stride 12 bytes).
;----------------------------------------------------------------------------
_outer_product_matrix3:
%ifdef FM_COLUMN_MAJOR
movq xmm0, [Param2]
movss xmm1, [Param2+8]
movlhps xmm0, xmm1
movq xmm1, [Param3]
movss xmm2, [Param3+8]
%else
movq xmm0, [Param3]
movss xmm1, [Param3+8]
movlhps xmm0, xmm1
movq xmm1, [Param2]
movss xmm2, [Param2+8]
%endif
movlhps xmm1, xmm2
movaps xmm2, xmm1
movaps xmm3, xmm1

shufps xmm1, xmm1, 0x00 ; C.X (4x)
shufps xmm2, xmm2, 0x55 ; C.Y (4x)
shufps xmm3, xmm3, 0xAA ; C.Z (4x)

mulps xmm1, xmm0 ; R * C.X
mulps xmm2, xmm0 ; R * C.Y
mulps xmm3, xmm0 ; R * C.Z

; Store as matrix
movhlps xmm0, xmm1 ; Z components into low lanes for the movss stores
movhlps xmm4, xmm2
movhlps xmm5, xmm3
movq [Param1+0x00], xmm1 ; row 0: X Y
movss [Param1+0x08], xmm0 ; row 0: Z
movq [Param1+0x0C], xmm2 ; row 1: X Y
movss [Param1+0x14], xmm4 ; row 1: Z
movq [Param1+0x18], xmm3 ; row 2: X Y
movss [Param1+0x20], xmm5 ; row 2: Z
ret
3000
;----------------------------------------------------------------------------
; OuterProduct(const C, R: TVector4): TMatrix4 — M[i,j] := C[i] * R[j].
; The result (> 128 bits) is written through Param1 per the ABI; the column
; and row operands are then Param2/Param3 (swapped under FM_COLUMN_MAJOR).
;----------------------------------------------------------------------------
_outer_product_matrix4:
%ifdef FM_COLUMN_MAJOR
movups xmm0, [Param2]
movups xmm1, [Param3]
%else
movups xmm0, [Param3]
movups xmm1, [Param2]
%endif
movaps xmm2, xmm1
movaps xmm3, xmm1
movaps xmm4, xmm1

shufps xmm1, xmm1, 0x00 ; C.X (4x)
shufps xmm2, xmm2, 0x55 ; C.Y (4x)
shufps xmm3, xmm3, 0xAA ; C.Z (4x)
shufps xmm4, xmm4, 0xFF ; C.W (4x)

mulps xmm1, xmm0 ; R * C.X
mulps xmm2, xmm0 ; R * C.Y
mulps xmm3, xmm0 ; R * C.Z
mulps xmm4, xmm0 ; R * C.W

; Store as matrix
movups [Param1 + 0x00], xmm1
movups [Param1 + 0x10], xmm2
movups [Param1 + 0x20], xmm3
movups [Param1 + 0x30], xmm4
ret
3029
3030;****************************************************************************
3031; TVector2
3032;****************************************************************************
3033
;----------------------------------------------------------------------------
; TVector2 / Single — divides both components of A by scalar B.
; In: Param1 -> A, xmm0 = B. Out: xmm0 = Result.XY.
;----------------------------------------------------------------------------
_vector2_div_single:
shufps xmm0, xmm0, 0 ; Replicate B
movlps xmm1, [Param1]
divps xmm1, xmm0 ; A / B
movaps xmm0, xmm1
ret

;----------------------------------------------------------------------------
; Single / TVector2 — divides scalar A by each component of B.
; In: xmm0 = A, Param1 -> B. Out: xmm0 = Result.XY.
;----------------------------------------------------------------------------
_single_div_vector2:
movlps xmm1, [Param1]
shufps xmm0, xmm0, 0 ; Replicate A
divps xmm0, xmm1 ; A / B
ret

;----------------------------------------------------------------------------
; TVector2 / TVector2 — per-component division.
; In: Param1 -> A, Param2 -> B. Out: xmm0 = Result.XY.
;----------------------------------------------------------------------------
_vector2_div_vector2:
movlps xmm0, [Param1]
movlps xmm1, [Param2]
divps xmm0, xmm1
ret
3052
;----------------------------------------------------------------------------
; TVector2.NormalizeFast: TVector2 — approximate normalization using the
; low-precision rsqrtps reciprocal square root (~12 bits of precision).
; In: Self -> vector. Out: xmm0 = normalized XY.
;----------------------------------------------------------------------------
_vector2_normalize_fast:
movlps xmm0, [Self] ; Y X
movaps xmm2, xmm0
mulps xmm0, xmm0 ; Y*Y X*X
pshufd xmm1, xmm0, 0x01; X*X Y*Y
addps xmm0, xmm1 ; (X*X+Y*Y) (2x)
rsqrtps xmm0, xmm0 ; (1 / Sqrt(X*X + Y*Y)) (4x)
mulps xmm0, xmm2 ; A * (1 / Sqrt(Dot(A, A)))
ret

;----------------------------------------------------------------------------
; TVector2.SetNormalizedFast — same as NormalizeFast but writes the result
; back into Self instead of only returning it.
;----------------------------------------------------------------------------
_vector2_set_normalized_fast:
movlps xmm0, [Self] ; Y X
movaps xmm2, xmm0
mulps xmm0, xmm0 ; Y*Y X*X
pshufd xmm1, xmm0, 0x01; X*X Y*Y
addps xmm0, xmm1 ; (X*X+Y*Y) (2x)
rsqrtps xmm0, xmm0 ; (1 / Sqrt(X*X + Y*Y)) (4x)
mulps xmm0, xmm2 ; A * (1 / Sqrt(Dot(A, A)))
movlps [Self], xmm0 ; store back into Self
ret
3073
3074;****************************************************************************
3075; TVector3
3076;****************************************************************************
3077
;----------------------------------------------------------------------------
; TVector3 arithmetic. Convention for all routines below:
; a TVector3 is loaded as X,Y via movq and Z via movss; results are
; returned with X,Y in xmm0 and Z in xmm1 (per the ABI notes at file top).
;----------------------------------------------------------------------------

; TVector3 + Single
_vector3_add_single:
movq xmm2, [Param1] ; Load 3 floating-point values
movss xmm1, [Param1+8]
shufps xmm0, xmm0, 0 ; Replicate B
addps xmm2, xmm0 ; A + B
addss xmm1, xmm0
movaps xmm0, xmm2 ; Store result
ret

; Single + TVector3 (commutative; same computation as above)
_single_add_vector3:
movq xmm2, [Param1]
movss xmm1, [Param1+8]
shufps xmm0, xmm0, 0
addps xmm2, xmm0
addss xmm1, xmm0
movaps xmm0, xmm2
ret

; TVector3 + TVector3
_vector3_add_vector3:
movq xmm0, [Param1]
movss xmm1, [Param1+8]
movq xmm2, [Param2]
movss xmm3, [Param2+8]
addps xmm0, xmm2
addss xmm1, xmm3
ret

; TVector3 - Single
_vector3_sub_single:
movq xmm2, [Param1] ; Load 3 floating-point values
movss xmm1, [Param1+8]
shufps xmm0, xmm0, 0 ; Replicate B
subps xmm2, xmm0 ; A - B
subss xmm1, xmm0
movaps xmm0, xmm2 ; Store result
ret

; Single - TVector3
_single_sub_vector3:
movq xmm4, [Param1]
movss xmm2, [Param1+8]
movss xmm1, xmm0 ; copy of B for the Z lane
shufps xmm0, xmm0, 0
subps xmm0, xmm4 ; B - A (X, Y)
subss xmm1, xmm2 ; B - A (Z)
ret

; TVector3 - TVector3
_vector3_sub_vector3:
movq xmm0, [Param1]
movss xmm1, [Param1+8]
movq xmm2, [Param2]
movss xmm3, [Param2+8]
subps xmm0, xmm2
subss xmm1, xmm3
ret

; TVector3 * Single
_vector3_mul_single:
movq xmm2, [Param1]
movss xmm1, [Param1+8]
shufps xmm0, xmm0, 0
mulps xmm2, xmm0
mulss xmm1, xmm0
movaps xmm0, xmm2
ret

; Single * TVector3 (commutative; same computation as above)
_single_mul_vector3:
movq xmm2, [Param1]
movss xmm1, [Param1+8]
shufps xmm0, xmm0, 0
mulps xmm2, xmm0
mulss xmm1, xmm0
movaps xmm0, xmm2
ret

; TVector3 * TVector3 (component-wise)
_vector3_mul_vector3:
movq xmm0, [Param1]
movss xmm1, [Param1+8]
movq xmm2, [Param2]
movss xmm3, [Param2+8]
mulps xmm0, xmm2
mulss xmm1, xmm3
ret

; TVector3 / Single
_vector3_div_single:
movq xmm2, [Param1]
movss xmm1, [Param1+8]
shufps xmm0, xmm0, 0
divps xmm2, xmm0
divss xmm1, xmm0
movaps xmm0, xmm2
ret

; Single / TVector3
_single_div_vector3:
movq xmm3, [Param1]
movss xmm2, [Param1+8]
movss xmm1, xmm0 ; copy of B for the Z lane
shufps xmm0, xmm0, 0
divps xmm0, xmm3 ; B / A (X, Y)
divss xmm1, xmm2 ; B / A (Z)
ret

; TVector3 / TVector3 (component-wise)
_vector3_div_vector3:
movq xmm0, [Param1]
movss xmm1, [Param1+8]
movq xmm2, [Param2]
movss xmm3, [Param2+8]
divps xmm0, xmm2
divss xmm1, xmm3
ret
3185
3186_vector3_distance:
3187movq xmm0, [Self]
3188movss xmm1, [Self+8]
3189movq xmm2, [Param2]
3190movss xmm3, [Param2+8]
3191movlhps xmm0, xmm1
3192movlhps xmm2, xmm3
3193subps xmm0, xmm2 ; A - B
3194
3195; (A - B).Length
3196mulps xmm0, xmm0
3197pshufd xmm1, xmm0, 0x0E
3198addps xmm0, xmm1
3199pshufd xmm1, xmm0, 0x01
3200addss xmm0, xmm1
3201sqrtss xmm0, xmm0
3202ret
3203
3204_vector3_distance_squared:
3205movq xmm0, [Self]
3206movss xmm1, [Self+8]
3207movq xmm2, [Param2]
3208movss xmm3, [Param2+8]
3209movlhps xmm0, xmm1
3210movlhps xmm2, xmm3
3211subps xmm0, xmm2 ; A - B
3212
3213; (A - B).Length
3214mulps xmm0, xmm0
3215pshufd xmm1, xmm0, 0x0E
3216addps xmm0, xmm1
3217pshufd xmm1, xmm0, 0x01
3218addss xmm0, xmm1
3219ret
3220
;----------------------------------------------------------------------------
; TVector3.Length: Sqrt(X*X + Y*Y + Z*Z)
; In:  Self = pointer to the vector (3 singles)
; Out: xmm0 = scalar length
;----------------------------------------------------------------------------
_vector3_get_length:
movq xmm0, [Self] ; 0 0 Y X
movss xmm1, [Self+8] ; 0 0 0 Z
movlhps xmm0, xmm1 ; 0 Z Y X
mulps xmm0, xmm0 ; 0 Z*Z Y*Y X*X
pshufd xmm1, xmm0, 0x0E; Y*Y X*X 0 Z*Z
addps xmm0, xmm1 ; # # (Y*Y) (X*X+Z*Z)
pshufd xmm1, xmm0, 0x01; (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y)
addss xmm0, xmm1 ; (X*X + Y*Y + Z*Z)
sqrtss xmm0, xmm0 ; Sqrt(X*X + Y*Y + Z*Z)
ret
3232
;----------------------------------------------------------------------------
; TVector3.LengthSquared: X*X + Y*Y + Z*Z (no square root)
; In:  Self = pointer to the vector
; Out: xmm0 = scalar squared length
;----------------------------------------------------------------------------
_vector3_get_length_squared:
movq xmm0, [Self] ; 0 0 Y X
movss xmm1, [Self+8] ; 0 0 0 Z
movlhps xmm0, xmm1 ; 0 Z Y X
mulps xmm0, xmm0 ; 0 Z*Z Y*Y X*X
pshufd xmm1, xmm0, 0x0E; Y*Y X*X 0 Z*Z
addps xmm0, xmm1 ; # # (Y*Y) (X*X+Z*Z)
pshufd xmm1, xmm0, 0x01; (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y)
addss xmm0, xmm1 ; (X*X + Y*Y + Z*Z)
ret
3243
;----------------------------------------------------------------------------
; TVector3.NormalizeFast: A * (1/Sqrt(Dot(A,A))) using the approximate
; rsqrtps instruction (fast, reduced precision).
; In:  Self = pointer to the vector
; Out: xmm0 = (X, Y), xmm1 = Z of the normalized vector
;----------------------------------------------------------------------------
_vector3_normalize_fast:
movq xmm0, [Self] ; 0 0 Y X
movss xmm1, [Self+8] ; 0 0 0 Z
movlhps xmm0, xmm1 ; 0 Z Y X
movaps xmm2, xmm0 ; keep a copy of A

; Dot(A, A)
mulps xmm0, xmm0 ; 0 Z*Z Y*Y X*X
pshufd xmm1, xmm0, 0x4E; Y*Y X*X 0 Z*Z
addps xmm0, xmm1 ; (Y*Y) (X*X+Z*Z) (Y*Y) (X*X+Z*Z)
pshufd xmm1, xmm0, 0x11; (X*X+Z*Z) (Y*Y) (X*X+Z*Z) (Y*Y)
addps xmm0, xmm1 ; (X*X + Y*Y + Z*Z) (4x)

rsqrtps xmm0, xmm0 ; (1 / Sqrt(X*X + Y*Y + Z*Z)) (4x)
mulps xmm0, xmm2 ; A * (1 / Sqrt(Dot(A, A)))
movhlps xmm1, xmm0 ; Z into xmm1 for the record-return ABI
ret
3261
;----------------------------------------------------------------------------
; TVector3.SetNormalizedFast: normalize in place using the approximate
; rsqrtps instruction, writing the result back to [Self].
; In:  Self = pointer to the vector (read and written)
;----------------------------------------------------------------------------
_vector3_set_normalized_fast:
movq xmm0, [Self] ; 0 0 Y X
movss xmm1, [Self+8] ; 0 0 0 Z
movlhps xmm0, xmm1 ; 0 Z Y X
movaps xmm2, xmm0 ; keep a copy of A

; Dot(A, A)
mulps xmm0, xmm0 ; 0 Z*Z Y*Y X*X
pshufd xmm1, xmm0, 0x4E; Y*Y X*X 0 Z*Z
addps xmm0, xmm1 ; (Y*Y) (X*X+Z*Z) (Y*Y) (X*X+Z*Z)
pshufd xmm1, xmm0, 0x11; (X*X+Z*Z) (Y*Y) (X*X+Z*Z) (Y*Y)
addps xmm0, xmm1 ; (X*X + Y*Y + Z*Z) (4x)

rsqrtps xmm0, xmm0 ; (1 / Sqrt(X*X + Y*Y + Z*Z)) (4x)
mulps xmm0, xmm2 ; A * (1 / Sqrt(Dot(A, A)))
movhlps xmm1, xmm0 ; Z into low lane of xmm1
movq [Self], xmm0 ; store X, Y (12-byte vector: movq + movss)
movss [Self+8], xmm1 ; store Z
ret
3281
;----------------------------------------------------------------------------
; TVector3.Reflect(N): I - 2 * Dot(N, I) * N
; In:  Self = I (incident vector), Param2 = N (normal), 3 singles each
; Out: xmm0 = (X, Y), xmm1 = Z of the reflected vector
;----------------------------------------------------------------------------
_vector3_reflect:
movq xmm0, [Self]
movss xmm2, [Self+8]
movq xmm1, [Param2]
movss xmm3, [Param2+8]
movlhps xmm0, xmm2 ; I = 0 Iz Iy Ix
movlhps xmm1, xmm3 ; N = 0 Nz Ny Nx
movaps xmm2, xmm0 ; keep a copy of I
movaps xmm3, [rel kSSE_TWO] ; constant lives in the aligned .data section

; Dot(N, I)
mulps xmm0, xmm1
mulps xmm3, xmm1 ; N * 2
pshufd xmm1, xmm0, 0x4E
addps xmm0, xmm1
pshufd xmm1, xmm0, 0x11
addps xmm0, xmm1 ; Dot(N, I) replicated in all 4 lanes

; (2 * Dot(N, I)) * N
mulps xmm0, xmm3

; I - ((2 * Dot(N, I)) * N)
subps xmm2, xmm0
movaps xmm0, xmm2
movhlps xmm1, xmm2
ret
3308
;----------------------------------------------------------------------------
; TVector3.Refract(N, Eta)
; In:  Self = I (incident vector), Param2 = N (normal), 3 singles each;
;      xmm0 = Eta (ratio of refraction indices)
; Out: xmm0 = (X, Y), xmm1 = Z of the refraction vector, or the zero
;      vector on total internal reflection (K < 0).
;----------------------------------------------------------------------------
_vector3_refract:
movq xmm3, [Self]
movss xmm2, [Self+8]
movq xmm1, [Param2]
movss xmm4, [Param2+8]
movlhps xmm3, xmm2 ; I = 0 Iz Iy Ix
movlhps xmm1, xmm4 ; N = 0 Nz Ny Nx
movaps xmm7, xmm3 ; keep a copy of I (register copy: movaps)
movss xmm2, [rel kSSE_ONE]

; D := Dot(N, I)
mulps xmm3, xmm1
movss xmm4, xmm2 ; 1
pshufd xmm1, xmm3, 0x4E
movss xmm5, xmm0 ; Eta
addps xmm3, xmm1
mulss xmm5, xmm5 ; Eta * Eta
pshufd xmm1, xmm3, 0x11
addss xmm3, xmm1

; K := 1 - Eta * Eta * (1 - D * D)
movss xmm6, xmm3 ; D
mulss xmm3, xmm3 ; D * D
subss xmm4, xmm3 ; 1 - D * D
mulss xmm4, xmm5 ; Eta * Eta * (1 - D * D)
xorps xmm5, xmm5 ; 0
subss xmm2, xmm4 ; K := 1 - Eta * Eta * (1 - D * D)

; if (K < 0) then
comiss xmm2, xmm5

jb _set_null_vec3

; K >= 0
mulss xmm6, xmm0 ; Eta * D
shufps xmm0, xmm0, 0 ; Replicate Eta (4x)
mulps xmm7, xmm0 ; Eta * I
sqrtss xmm2, xmm2 ; Sqrt(K)
addss xmm6, xmm2 ; Eta * D + Sqrt(K)
shufps xmm6, xmm6, 0 ; Replicate Eta * D + Sqrt(K) (4x)
movq xmm1, [Param2] ; Reload N with a 12-byte load; a 16-byte movups
movss xmm4, [Param2+8] ; could read past the end of a TVector3
movlhps xmm1, xmm4
mulps xmm6, xmm1 ; ((Eta * D + Sqrt(K)) * N)
subps xmm7, xmm6 ; (Eta * I) - ((Eta * D + Sqrt(K)) * N)
movaps xmm0, xmm7
movhlps xmm1, xmm7
ret

_set_null_vec3:
; K < 0: total internal reflection, Result := Vector3(0, 0, 0).
; xmm5 is zero here. Both return registers must be cleared: movlhps
; would leave the low lane of xmm1 (the returned Z) unchanged.
movaps xmm0, xmm5
movaps xmm1, xmm5
ret
3361
3362;****************************************************************************
3363; TVector4
3364;****************************************************************************
3365
;----------------------------------------------------------------------------
; TVector4 + Single: add replicated B to every component.
; In:  Param1 = pointer to vector (4 singles), xmm0 = scalar B
; Out: xmm0 = (X, Y), xmm1 = (Z, W)
;----------------------------------------------------------------------------
_vector4_add_single:
shufps xmm0, xmm0, 0    ; B B B B
movups xmm1, [Param1]   ; W Z Y X (unaligned-safe load)
addps xmm1, xmm0        ; A + B
movaps xmm0, xmm1
movhlps xmm1, xmm0      ; upper half into xmm1 for the record-return ABI
ret
3373
;----------------------------------------------------------------------------
; Single + TVector4 (commutative with TVector4 + Single)
;----------------------------------------------------------------------------
_single_add_vector4:
shufps xmm0, xmm0, 0    ; replicate A
movups xmm3, [Param1]
addps xmm3, xmm0        ; B + A
movaps xmm0, xmm3
movhlps xmm1, xmm3
ret
3381
;----------------------------------------------------------------------------
; Component-wise TVector4 + TVector4
;----------------------------------------------------------------------------
_vector4_add_vector4:
movups xmm0, [Param1]   ; A
movups xmm4, [Param2]   ; B
addps xmm0, xmm4        ; A + B
movhlps xmm1, xmm0      ; (Z, W) into xmm1
ret
3388
;----------------------------------------------------------------------------
; TVector4 - Single: subtract replicated B from every component.
;----------------------------------------------------------------------------
_vector4_sub_single:
shufps xmm0, xmm0, 0    ; B B B B
movups xmm5, [Param1]
subps xmm5, xmm0        ; A - B
movaps xmm0, xmm5
movhlps xmm1, xmm5
ret
3396
;----------------------------------------------------------------------------
; Single - TVector4 (operand order matters for subtraction)
;----------------------------------------------------------------------------
_single_sub_vector4:
shufps xmm0, xmm0, 0    ; A A A A
movups xmm2, [Param1]
subps xmm0, xmm2        ; A - B, component-wise
movhlps xmm1, xmm0
ret
3403
;----------------------------------------------------------------------------
; Component-wise TVector4 - TVector4
;----------------------------------------------------------------------------
_vector4_sub_vector4:
movups xmm0, [Param1]   ; A
movups xmm3, [Param2]   ; B
subps xmm0, xmm3        ; A - B
movhlps xmm1, xmm0
ret
3410
;----------------------------------------------------------------------------
; TVector4 * Single: scale every component by B.
;----------------------------------------------------------------------------
_vector4_mul_single:
shufps xmm0, xmm0, 0    ; B B B B
movups xmm1, [Param1]
mulps xmm1, xmm0        ; A * B
movaps xmm0, xmm1
movhlps xmm1, xmm0
ret
3418
;----------------------------------------------------------------------------
; Single * TVector4 (commutative with TVector4 * Single)
;----------------------------------------------------------------------------
_single_mul_vector4:
movups xmm3, [Param1]
shufps xmm0, xmm0, 0    ; replicate A
mulps xmm0, xmm3
movhlps xmm1, xmm0
ret
3425
;----------------------------------------------------------------------------
; Component-wise TVector4 * TVector4
;----------------------------------------------------------------------------
_vector4_mul_vector4:
movups xmm0, [Param1]
movups xmm2, [Param2]
mulps xmm0, xmm2
movhlps xmm1, xmm0
ret
3432
;----------------------------------------------------------------------------
; TVector4 / Single: divide every component by B.
;----------------------------------------------------------------------------
_vector4_div_single:
shufps xmm0, xmm0, 0    ; B B B B
movups xmm4, [Param1]
divps xmm4, xmm0        ; A / B
movaps xmm0, xmm4
movhlps xmm1, xmm4
ret
3440
;----------------------------------------------------------------------------
; Single / TVector4 (component-wise; NOT commutative)
;----------------------------------------------------------------------------
_single_div_vector4:
shufps xmm0, xmm0, 0    ; A A A A
movups xmm2, [Param1]
divps xmm0, xmm2        ; A / B, component-wise
movhlps xmm1, xmm0
ret
3447
;----------------------------------------------------------------------------
; Component-wise TVector4 / TVector4
;----------------------------------------------------------------------------
_vector4_div_vector4:
movups xmm0, [Param1]   ; numerator
movups xmm5, [Param2]   ; denominator
divps xmm0, xmm5
movhlps xmm1, xmm0
ret
3454
;----------------------------------------------------------------------------
; -TVector4: negate all 4 components by flipping the IEEE sign bits.
;----------------------------------------------------------------------------
_vector4_negative:
movups xmm0, [Param1]
xorps xmm0, [rel kSSE_MASK_SIGN] ; constant is 16-byte aligned (.data)
movhlps xmm1, xmm0
ret
3461
;----------------------------------------------------------------------------
; TVector4.Distance(B): Sqrt(Dot(A-B, A-B))
; In:  Self = pointer to A, Param2 = pointer to B (4 singles each)
; Out: xmm0 = scalar distance
;----------------------------------------------------------------------------
_vector4_distance:
movups xmm0, [Self]
movups xmm1, [Param2]
subps xmm0, xmm1 ; A - B

; (A - B).Length
mulps xmm0, xmm0         ; Dw² Dz² Dy² Dx²
pshufd xmm1, xmm0, 0x0E  ; bring upper pair down
addps xmm0, xmm1
pshufd xmm1, xmm0, 0x01
addss xmm0, xmm1         ; Dx² + Dy² + Dz² + Dw²
sqrtss xmm0, xmm0
ret
3475
;----------------------------------------------------------------------------
; TVector4.DistanceSquared(B): Dot(A-B, A-B) (no square root)
; In:  Self = pointer to A, Param2 = pointer to B
; Out: xmm0 = scalar squared distance
;----------------------------------------------------------------------------
_vector4_distance_squared:
movups xmm0, [Self]
movups xmm1, [Param2]
subps xmm0, xmm1 ; A - B

; (A - B).LengthSquared
mulps xmm0, xmm0
pshufd xmm1, xmm0, 0x0E
addps xmm0, xmm1
pshufd xmm1, xmm0, 0x01
addss xmm0, xmm1
ret
3488
;----------------------------------------------------------------------------
; TVector4.FaceForward(I, NRef): returns N with its sign flipped when
; Dot(NRef, I) >= 0, so the result points against I.
; In:  Self = N, Param2 = I, Param3 = NRef
; Out: xmm0 = (X, Y), xmm1 = (Z, W)
;----------------------------------------------------------------------------
_vector4_face_forward:
movups xmm0, [Self]
movups xmm1, [Param2]
movups xmm2, [Param3]
xorps xmm3, xmm3 ; 0
movaps xmm4, [rel kSSE_MASK_SIGN]

; Dot(NRef, I)
mulps xmm2, xmm1
pshufd xmm1, xmm2, 0x4E
addps xmm2, xmm1
pshufd xmm1, xmm2, 0x11
addps xmm2, xmm1 ; dot replicated in all 4 lanes

; Dot(NRef, I) >= 0? Yes: 0xFFFFFFFF, No: 0x00000000
cmpnltps xmm2, xmm3
andps xmm2, xmm4 ; Yes: 0x80000000, No: 0x00000000

; Flip sign of N if (Dot(NRef, I) >= 0)
xorps xmm0, xmm2
movhlps xmm1, xmm0
ret
3511
;----------------------------------------------------------------------------
; TVector4.Length: Sqrt(X*X + Y*Y + Z*Z + W*W)
; In:  Self = pointer to the vector (4 singles)
; Out: xmm0 = scalar length
;----------------------------------------------------------------------------
_vector4_get_length:
movups xmm0, [Self] ; W Z Y X
mulps xmm0, xmm0 ; W*W Z*Z Y*Y X*X
pshufd xmm1, xmm0, 0x0E; Y*Y X*X W*W Z*Z
addps xmm0, xmm1 ; # # (Y*Y+W*W) (X*X+Z*Z)
pshufd xmm1, xmm0, 0x01; (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y+W*W)
addss xmm0, xmm1 ; (X*X + Y*Y + Z*Z + W*W)
sqrtss xmm0, xmm0 ; Sqrt(X*X + Y*Y + Z*Z + W*W)
ret
3521
;----------------------------------------------------------------------------
; TVector4.LengthSquared: X*X + Y*Y + Z*Z + W*W (no square root)
; In:  Self = pointer to the vector
; Out: xmm0 = scalar squared length
;----------------------------------------------------------------------------
_vector4_get_length_squared:
movups xmm0, [Self] ; W Z Y X
mulps xmm0, xmm0 ; W*W Z*Z Y*Y X*X
pshufd xmm1, xmm0, 0x0E; Y*Y X*X W*W Z*Z
addps xmm0, xmm1 ; # # (Y*Y+W*W) (X*X+Z*Z)
pshufd xmm1, xmm0, 0x01; (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y+W*W)
addss xmm0, xmm1 ; (X*X + Y*Y + Z*Z + W*W)
ret
3530
;----------------------------------------------------------------------------
; TVector4.NormalizeFast: A * (1/Sqrt(Dot(A,A))) using the approximate
; rsqrtps instruction (fast, reduced precision).
; In:  Self = pointer to the vector
; Out: xmm0 = (X, Y), xmm1 = (Z, W)
;----------------------------------------------------------------------------
_vector4_normalize_fast:
movups xmm0, [Self] ; W Z Y X
movaps xmm2, xmm0 ; keep a copy of A

; Dot(A, A)
mulps xmm0, xmm0 ; W*W Z*Z Y*Y X*X
pshufd xmm1, xmm0, 0x4E; Y*Y X*X W*W Z*Z
addps xmm0, xmm1 ; (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z)
pshufd xmm1, xmm0, 0x11; (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W)
addps xmm0, xmm1 ; (X*X + Y*Y + Z*Z + W*W) (4x)

rsqrtps xmm0, xmm0 ; (1 / Sqrt(X*X + Y*Y + Z*Z + W*W)) (4x)
mulps xmm0, xmm2 ; A * (1 / Sqrt(Dot(A, A)))
movhlps xmm1, xmm0
ret
3546
;----------------------------------------------------------------------------
; TVector4.SetNormalizedFast: normalize in place using the approximate
; rsqrtps instruction, writing the result back to [Self].
; In:  Self = pointer to the vector (read and written)
;----------------------------------------------------------------------------
_vector4_set_normalized_fast:
movups xmm0, [Self] ; W Z Y X
movaps xmm2, xmm0 ; keep a copy of A

; Dot(A, A)
mulps xmm0, xmm0 ; W*W Z*Z Y*Y X*X
pshufd xmm1, xmm0, 0x4E; Y*Y X*X W*W Z*Z
addps xmm0, xmm1 ; (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z)
pshufd xmm1, xmm0, 0x11; (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W)
addps xmm0, xmm1 ; (X*X + Y*Y + Z*Z + W*W) (4x)

rsqrtps xmm0, xmm0 ; (1 / Sqrt(X*X + Y*Y + Z*Z + W*W)) (4x)
mulps xmm0, xmm2 ; A * (1 / Sqrt(Dot(A, A)))
movups [Self], xmm0 ; store all 4 components back
ret
3562
;----------------------------------------------------------------------------
; TVector4.Reflect(N): I - 2 * Dot(N, I) * N
; In:  Self = I (incident vector), Param2 = N (normal)
; Out: xmm0 = (X, Y), xmm1 = (Z, W) of the reflected vector
;----------------------------------------------------------------------------
_vector4_reflect:
movups xmm0, [Self]
movups xmm1, [Param2]
movaps xmm2, xmm0 ; keep a copy of I
movaps xmm3, [rel kSSE_TWO]

; Dot(N, I)
mulps xmm0, xmm1
mulps xmm3, xmm1 ; N * 2
pshufd xmm1, xmm0, 0x4E
addps xmm0, xmm1
pshufd xmm1, xmm0, 0x11
addps xmm0, xmm1 ; dot replicated in all 4 lanes

; (2 * Dot(N, I)) * N
mulps xmm0, xmm3

; I - ((2 * Dot(N, I)) * N)
subps xmm2, xmm0
movaps xmm0, xmm2
movhlps xmm1, xmm2
ret
3585
;----------------------------------------------------------------------------
; TVector4.Refract(N, Eta)
; In:  Self = I (incident vector), Param2 = N (normal); xmm0 = Eta
; Out: xmm0/xmm1 = refraction vector, or (0,0,0,0) on total internal
;      reflection (K < 0).
;----------------------------------------------------------------------------
_vector4_refract:
movups xmm3, [Self]
movups xmm1, [Param2]
movaps xmm7, xmm3 ; keep a copy of I (register copy: movaps)
movss xmm2, [rel kSSE_ONE]

; D := Dot(N, I)
mulps xmm3, xmm1
movss xmm4, xmm2 ; 1
pshufd xmm1, xmm3, 0x4E
movss xmm5, xmm0 ; Eta
addps xmm3, xmm1
mulss xmm5, xmm5 ; Eta * Eta
pshufd xmm1, xmm3, 0x11
addss xmm3, xmm1

; K := 1 - Eta * Eta * (1 - D * D)
movss xmm6, xmm3 ; D
mulss xmm3, xmm3 ; D * D
subss xmm4, xmm3 ; 1 - D * D
mulss xmm4, xmm5 ; Eta * Eta * (1 - D * D)
xorps xmm5, xmm5 ; 0
subss xmm2, xmm4 ; K := 1 - Eta * Eta * (1 - D * D)

; if (K < 0) then
comiss xmm2, xmm5

jb _set_null_vec4

; K >= 0
mulss xmm6, xmm0 ; Eta * D
shufps xmm0, xmm0, 0 ; Replicate Eta (4x)
mulps xmm7, xmm0 ; Eta * I
sqrtss xmm2, xmm2 ; Sqrt(K)
addss xmm6, xmm2 ; Eta * D + Sqrt(K)
shufps xmm6, xmm6, 0 ; Replicate Eta * D + Sqrt(K) (4x)
movups xmm1, [Param2] ; reload N (xmm1 was clobbered above)
mulps xmm6, xmm1 ; ((Eta * D + Sqrt(K)) * N)
subps xmm7, xmm6 ; (Eta * I) - ((Eta * D + Sqrt(K)) * N)
movaps xmm0, xmm7
movhlps xmm1, xmm7
ret

_set_null_vec4:
; K < 0: Result := Vector4(0, 0, 0, 0). xmm5 is zero here; movhlps
; clears the low half of xmm1 (the returned Z, W).
movaps xmm0, xmm5
movhlps xmm1, xmm5
ret
3634
3635;****************************************************************************
3636; TMatrix3
3637;****************************************************************************
3638
;----------------------------------------------------------------------------
; TMatrix3 + Single: add replicated B to all 9 elements (handled as 4+4+1).
; In:  Param1 = result pointer, Param2 = matrix pointer, xmm0 = B
;----------------------------------------------------------------------------
_matrix3_add_single:
shufps xmm0, xmm0, 0 ; B B B B
movups xmm1, [Param2 + 0x00]
addps xmm1, xmm0
movups xmm2, [Param2 + 0x10]
addps xmm2, xmm0
movss xmm3, [Param2 + 0x20]
addss xmm3, xmm0
movups [Param1 + 0x00], xmm1
movups [Param1 + 0x10], xmm2
movss [Param1 + 0x20], xmm3
ret
3651
;----------------------------------------------------------------------------
; Single + TMatrix3 (commutative with TMatrix3 + Single)
;----------------------------------------------------------------------------
_single_add_matrix3:
shufps xmm0, xmm0, 0 ; A A A A
movups xmm4, [Param2 + 0x00]
addps xmm4, xmm0
movups xmm5, [Param2 + 0x10]
addps xmm5, xmm0
movss xmm2, [Param2 + 0x20]
addss xmm2, xmm0
movups [Param1 + 0x00], xmm4
movups [Param1 + 0x10], xmm5
movss [Param1 + 0x20], xmm2
ret
3664
;----------------------------------------------------------------------------
; Element-wise sum of two 3x3 matrices (9 singles, handled as 4+4+1).
; In:  Param1 = result, Param2 = A, Param3 = B
;----------------------------------------------------------------------------
_matrix3_add_matrix3:
movups xmm0, [Param2 + 0x00]
movups xmm4, [Param3 + 0x00]
addps xmm0, xmm4
movups xmm1, [Param2 + 0x10]
movups xmm5, [Param3 + 0x10]
addps xmm1, xmm5
movss xmm2, [Param2 + 0x20]
movss xmm3, [Param3 + 0x20]
addss xmm2, xmm3
movups [Param1 + 0x00], xmm0
movups [Param1 + 0x10], xmm1
movss [Param1 + 0x20], xmm2
ret
3679
;----------------------------------------------------------------------------
; TMatrix3 - Single: subtract replicated B from all 9 elements.
;----------------------------------------------------------------------------
_matrix3_sub_single:
shufps xmm0, xmm0, 0 ; B B B B
movups xmm4, [Param2 + 0x00]
subps xmm4, xmm0
movups xmm5, [Param2 + 0x10]
subps xmm5, xmm0
movss xmm1, [Param2 + 0x20]
subss xmm1, xmm0
movups [Param1 + 0x00], xmm4
movups [Param1 + 0x10], xmm5
movss [Param1 + 0x20], xmm1
ret
3692
;----------------------------------------------------------------------------
; Single - TMatrix3: subtract every element from replicated A
; (operand order matters for subtraction).
;----------------------------------------------------------------------------
_single_sub_matrix3:
shufps xmm0, xmm0, 0 ; A A A A
movaps xmm1, xmm0
movaps xmm2, xmm0
movups xmm3, [Param2 + 0x00]
subps xmm0, xmm3 ; A - first 4 elements
movups xmm4, [Param2 + 0x10]
subps xmm1, xmm4 ; A - next 4 elements
movss xmm5, [Param2 + 0x20]
subss xmm2, xmm5 ; A - last element
movups [Param1 + 0x00], xmm0
movups [Param1 + 0x10], xmm1
movss [Param1 + 0x20], xmm2
ret
3707
;----------------------------------------------------------------------------
; Element-wise difference of two 3x3 matrices (A - B).
; In:  Param1 = result, Param2 = A, Param3 = B
;----------------------------------------------------------------------------
_matrix3_sub_matrix3:
movups xmm0, [Param2 + 0x00]
movups xmm4, [Param3 + 0x00]
subps xmm0, xmm4
movups xmm1, [Param2 + 0x10]
movups xmm5, [Param3 + 0x10]
subps xmm1, xmm5
movss xmm2, [Param2 + 0x20]
movss xmm3, [Param3 + 0x20]
subss xmm2, xmm3
movups [Param1 + 0x00], xmm0
movups [Param1 + 0x10], xmm1
movss [Param1 + 0x20], xmm2
ret
3722
;----------------------------------------------------------------------------
; TMatrix3 * Single: scale all 9 elements by B.
;----------------------------------------------------------------------------
_matrix3_mul_single:
shufps xmm0, xmm0, 0 ; B B B B
movups xmm4, [Param2 + 0x00]
mulps xmm4, xmm0
movups xmm5, [Param2 + 0x10]
mulps xmm5, xmm0
movss xmm2, [Param2 + 0x20]
mulss xmm2, xmm0
movups [Param1 + 0x00], xmm4
movups [Param1 + 0x10], xmm5
movss [Param1 + 0x20], xmm2
ret
3735
;----------------------------------------------------------------------------
; Single * TMatrix3 (commutative with TMatrix3 * Single)
;----------------------------------------------------------------------------
_single_mul_matrix3:
shufps xmm0, xmm0, 0 ; A A A A
movups xmm1, [Param2 + 0x00]
mulps xmm1, xmm0
movups xmm2, [Param2 + 0x10]
mulps xmm2, xmm0
movss xmm3, [Param2 + 0x20]
mulss xmm3, xmm0
movups [Param1 + 0x00], xmm1
movups [Param1 + 0x10], xmm2
movss [Param1 + 0x20], xmm3
ret
3748
;----------------------------------------------------------------------------
; Component-wise (Hadamard) product of two 3x3 matrices.
; In:  Param1 = result, Param2 = Self, Param3 = AOther
;----------------------------------------------------------------------------
_matrix3_comp_mult:
movups xmm0, [Param2 + 0x00] ; Self, first 4 elements
movups xmm4, [Param3 + 0x00] ; AOther, first 4 elements
mulps xmm0, xmm4
movups xmm1, [Param2 + 0x10] ; next 4 elements
movups xmm5, [Param3 + 0x10]
mulps xmm1, xmm5
movss xmm2, [Param2 + 0x20] ; last element
movss xmm3, [Param3 + 0x20]
mulss xmm2, xmm3
movups [Param1 + 0x00], xmm0
movups [Param1 + 0x10], xmm1
movss [Param1 + 0x20], xmm2
ret
3767
; M3_MUL_V3 %1=matrix pointer, %2=vector pointer.
; Emits a complete function body (ends with ret) that multiplies a packed
; 3x3 matrix (rows 12 bytes apart) by a 3-component vector: one mulps per
; row, then a transpose of the three partial-product registers so the row
; sums can be formed with two addps. Result: xmm0 = (X, Y), xmm1 = Z.
%macro M3_MUL_V3 2
movq xmm0, [%2] ; Load vector
movss xmm1, [%2+8]
movlhps xmm0, xmm1

movq xmm4, [%1 + 0x00] ; Load 3 rows
movss xmm1, [%1 + 0x08]
movlhps xmm4, xmm1

movaps xmm1, xmm0
movaps xmm2, xmm0

movq xmm5, [%1 + 0x0C]
movss xmm6, [%1 + 0x14]
movlhps xmm5, xmm6

movq xmm6, [%1 + 0x18]
movss xmm3, [%1 + 0x20]
movlhps xmm6, xmm3

mulps xmm0, xmm4 ; ###, (Az * B02), (Ay * B01), (Ax * B00)
mulps xmm1, xmm5 ; ###, (Az * B12), (Ay * B11), (Ax * B10)
mulps xmm2, xmm6 ; ###, (Az * B22), (Ay * B21), (Ax * B20)
xorps xmm3, xmm3 ; 000

; Transpose xmm0-xmm2
movaps xmm4, xmm2
unpcklps xmm2, xmm3 ; 000 B21 000 B20
unpckhps xmm4, xmm3 ; 000 ### 000 B22

movaps xmm3, xmm0
unpcklps xmm0, xmm1 ; B11 B01 B10 B00
unpckhps xmm3, xmm1 ; ### ### B12 B02

movaps xmm1, xmm0
unpcklpd xmm0, xmm2 ; 000 B20 B10 B00
unpckhpd xmm1, xmm2 ; 000 B21 B11 B01

unpcklpd xmm3, xmm4 ; 000 B22 B12 B02

addps xmm0, xmm1 ; Add rows
addps xmm0, xmm3
movhlps xmm1, xmm0
ret
%endmacro
3813
; V3_MUL_M3 %1=vector pointer, %2=matrix pointer.
; Emits a complete function body (ends with ret): each vector component is
; broadcast and multiplied with one matrix row (rows packed, 12 bytes
; apart), then the three products are summed. Result: xmm0 = (X, Y),
; xmm1 = Z.
%macro V3_MUL_M3 2
movq xmm0, [%1] ; Load vector
movss xmm1, [%1+8]
movlhps xmm0, xmm1

movq xmm4, [%2 + 0x00] ; Load 3 rows
movss xmm1, [%2 + 0x08]
movlhps xmm4, xmm1

movaps xmm1, xmm0
movaps xmm2, xmm0
shufps xmm0, xmm0, 0x00 ; Bx Bx Bx Bx
shufps xmm1, xmm1, 0x55 ; By By By By
shufps xmm2, xmm2, 0xAA ; Bz Bz Bz Bz

movq xmm5, [%2 + 0x0C]
movss xmm3, [%2 + 0x14]
movlhps xmm5, xmm3

movq xmm6, [%2 + 0x18]
movss xmm3, [%2 + 0x20]
movlhps xmm6, xmm3

mulps xmm0, xmm4 ; (A00 * Bx), (A01 * Bx), (A02 * Bx), #
mulps xmm1, xmm5 ; (A10 * By), (A11 * By), (A12 * By), #
mulps xmm2, xmm6 ; (A20 * Bz), (A21 * Bz), (A22 * Bz), #
addps xmm0, xmm1 ; Add rows
addps xmm0, xmm2
movhlps xmm1, xmm0
ret
%endmacro
3845
; M3_MUL_M3 %1=pointer to matrix A, %2=pointer to matrix B.
; Emits a complete function body (ends with ret) computing the 3x3 product
; into [Param1]. B's three rows are loaded once into xmm4-xmm6; each row of
; A is broadcast component-by-component and accumulated against them.
; Rows are packed (12 bytes apart), so each row store is movq + movss.
%macro M3_MUL_M3 2
; A.R[0] * B
movq xmm0, [%1 + 0x00]
movss xmm1, [%1 + 0x08]
movlhps xmm0, xmm1

movq xmm4, [%2 + 0x00]
movss xmm1, [%2 + 0x08]
movlhps xmm4, xmm1

movaps xmm1, xmm0
movaps xmm2, xmm0
shufps xmm0, xmm0, 0x00
shufps xmm1, xmm1, 0x55
shufps xmm2, xmm2, 0xAA

movq xmm5, [%2 + 0x0C]
movss xmm3, [%2 + 0x14]
movlhps xmm5, xmm3

movq xmm6, [%2 + 0x18]
movss xmm3, [%2 + 0x20]
movlhps xmm6, xmm3

mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
addps xmm0, xmm1
addps xmm0, xmm2
movhlps xmm1, xmm0
movq [Param1 + 0x00], xmm0
movss [Param1 + 0x08], xmm1

; A.R[1] * B
movq xmm0, [%1 + 0x0C]
movss xmm1, [%1 + 0x14]
movlhps xmm0, xmm1

movaps xmm1, xmm0
movaps xmm2, xmm0
shufps xmm0, xmm0, 0x00
shufps xmm1, xmm1, 0x55
shufps xmm2, xmm2, 0xAA
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
addps xmm0, xmm1
addps xmm0, xmm2
movhlps xmm1, xmm0
movq [Param1 + 0x0C], xmm0
movss [Param1 + 0x14], xmm1

; A.R[2] * B
movq xmm0, [%1 + 0x18]
movss xmm1, [%1 + 0x20]
movlhps xmm0, xmm1

movaps xmm1, xmm0
movaps xmm2, xmm0
shufps xmm0, xmm0, 0x00
shufps xmm1, xmm1, 0x55
shufps xmm2, xmm2, 0xAA
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
addps xmm0, xmm1
addps xmm0, xmm2
movhlps xmm1, xmm0
movq [Param1 + 0x18], xmm0
movss [Param1 + 0x20], xmm1
ret
%endmacro
3918
; Instantiate the TMatrix3 multiply entry points. In column-major mode the
; operand order is swapped (M*V becomes V*M and A*B becomes B*A), so the
; same two macros implement both storage layouts.
%ifdef FM_COLUMN_MAJOR
_matrix3_mul_vector3:
V3_MUL_M3 Param2, Param1

_vector3_mul_matrix3:
M3_MUL_V3 Param2, Param1

_matrix3_mul_matrix3:
M3_MUL_M3 Param3, Param2
%else
_matrix3_mul_vector3:
M3_MUL_V3 Param1, Param2

_vector3_mul_matrix3:
V3_MUL_M3 Param1, Param2

_matrix3_mul_matrix3:
M3_MUL_M3 Param2, Param3
%endif
3938
;----------------------------------------------------------------------------
; TMatrix3 / Single: divide all 9 elements by B.
;----------------------------------------------------------------------------
_matrix3_div_single:
shufps xmm0, xmm0, 0 ; B B B B
movups xmm4, [Param2 + 0x00]
divps xmm4, xmm0
movups xmm5, [Param2 + 0x10]
divps xmm5, xmm0
movss xmm2, [Param2 + 0x20]
divss xmm2, xmm0
movups [Param1 + 0x00], xmm4
movups [Param1 + 0x10], xmm5
movss [Param1 + 0x20], xmm2
ret
3951
;----------------------------------------------------------------------------
; Single / TMatrix3: divide replicated A by each element
; (operand order matters for division).
;----------------------------------------------------------------------------
_single_div_matrix3:
shufps xmm0, xmm0, 0 ; A A A A
movaps xmm1, xmm0
movaps xmm2, xmm0
movups xmm4, [Param2 + 0x00]
divps xmm0, xmm4 ; A / first 4 elements
movups xmm5, [Param2 + 0x10]
divps xmm1, xmm5 ; A / next 4 elements
movss xmm3, [Param2 + 0x20]
divss xmm2, xmm3 ; A / last element
movups [Param1 + 0x00], xmm0
movups [Param1 + 0x10], xmm1
movss [Param1 + 0x20], xmm2
ret
3966
;----------------------------------------------------------------------------
; -TMatrix3: negate all 9 elements by flipping the IEEE sign bits.
; In:  Param1 = result pointer, Param2 = matrix pointer
;----------------------------------------------------------------------------
_matrix3_negative:
movaps xmm0, [rel kSSE_MASK_SIGN] ; .data is aligned, so movaps is safe
movups xmm1, [Param2 + 0x00] ; Load 3 rows
movups xmm2, [Param2 + 0x10]
movss xmm3, [Param2 + 0x20]
xorps xmm1, xmm0 ; Flip sign bits of each element in each row
xorps xmm2, xmm0
xorps xmm3, xmm0 ; xorps (not pxor) keeps this in the float domain
movups [Param1 + 0x00], xmm1
movups [Param1 + 0x10], xmm2
movss [Param1 + 0x20], xmm3
ret
3979
;----------------------------------------------------------------------------
; TMatrix3.Transpose: write the transpose of [Param2] to [Param1], one
; scalar at a time (rows are packed, 12 bytes apart).
; NOTE(review): stores to Param1 are interleaved with later loads from
; Param2 (e.g. [Param1+0x0C] is written before [Param2+0x0C] is read), so
; this assumes Param1 and Param2 do not alias — TODO confirm callers never
; transpose in place via this entry point.
;----------------------------------------------------------------------------
_matrix3_transpose:
movss xmm0, [Param2 + 0x00] ; row 0
movss xmm1, [Param2 + 0x04]
movss xmm2, [Param2 + 0x08]

movss [Param1 + 0x00], xmm0 ; becomes column 0
movss [Param1 + 0x0C], xmm1
movss [Param1 + 0x18], xmm2

movss xmm0, [Param2 + 0x0C] ; row 1
movss xmm1, [Param2 + 0x10]
movss xmm2, [Param2 + 0x14]

movss [Param1 + 0x04], xmm0 ; becomes column 1
movss [Param1 + 0x10], xmm1
movss [Param1 + 0x1C], xmm2

movss xmm0, [Param2 + 0x18] ; row 2
movss xmm1, [Param2 + 0x1C]
movss xmm2, [Param2 + 0x20]

movss [Param1 + 0x08], xmm0 ; becomes column 2
movss [Param1 + 0x14], xmm1
movss [Param1 + 0x20], xmm2
ret
4005
;----------------------------------------------------------------------------
; TMatrix3.SetTransposed: transpose [Param1] in place by swapping the three
; off-diagonal element pairs. All six values are loaded before any store,
; so the swaps are safe; the diagonal is untouched.
;----------------------------------------------------------------------------
_matrix3_set_transposed:
movss xmm1, [Param1 + 0x04] ; M01
movss xmm2, [Param1 + 0x08] ; M02

movss xmm3, [Param1 + 0x0C] ; M10
movss xmm5, [Param1 + 0x14] ; M12

movss xmm6, [Param1 + 0x18] ; M20
movss xmm7, [Param1 + 0x1C] ; M21

movss [Param1 + 0x0C], xmm1 ; M10 := M01
movss [Param1 + 0x18], xmm2 ; M20 := M02

movss [Param1 + 0x04], xmm3 ; M01 := M10
movss [Param1 + 0x1C], xmm5 ; M21 := M12

movss [Param1 + 0x08], xmm6 ; M02 := M20
movss [Param1 + 0x14], xmm7 ; M12 := M21
ret
4025
4026;****************************************************************************
4027; TMatrix4
4028;****************************************************************************
4029
;----------------------------------------------------------------------------
; TMatrix4 + Single: add replicated B to all 16 elements.
; In:  Param1 = result pointer, Param2 = matrix pointer, xmm0 = B
;----------------------------------------------------------------------------
_matrix4_add_single:
shufps xmm0, xmm0, 0 ; B B B B
movups xmm1, [Param2 + 0x00]
addps xmm1, xmm0
movups xmm2, [Param2 + 0x10]
addps xmm2, xmm0
movups xmm3, [Param2 + 0x20]
addps xmm3, xmm0
movups xmm4, [Param2 + 0x30]
addps xmm4, xmm0
movups [Param1 + 0x00], xmm1
movups [Param1 + 0x10], xmm2
movups [Param1 + 0x20], xmm3
movups [Param1 + 0x30], xmm4
ret
4045
;----------------------------------------------------------------------------
; Single + TMatrix4 (commutative with TMatrix4 + Single)
;----------------------------------------------------------------------------
_single_add_matrix4:
shufps xmm0, xmm0, 0 ; A A A A
movups xmm4, [Param2 + 0x00]
addps xmm4, xmm0
movups xmm5, [Param2 + 0x10]
addps xmm5, xmm0
movups xmm2, [Param2 + 0x20]
addps xmm2, xmm0
movups xmm3, [Param2 + 0x30]
addps xmm3, xmm0
movups [Param1 + 0x00], xmm4
movups [Param1 + 0x10], xmm5
movups [Param1 + 0x20], xmm2
movups [Param1 + 0x30], xmm3
ret
4061
;----------------------------------------------------------------------------
; Element-wise sum of two 4x4 matrices.
; In:  Param1 = result, Param2 = A, Param3 = B
;----------------------------------------------------------------------------
_matrix4_add_matrix4:
movups xmm0, [Param2 + 0x00]
movups xmm4, [Param3 + 0x00]
addps xmm0, xmm4
movups xmm1, [Param2 + 0x10]
movups xmm5, [Param3 + 0x10]
addps xmm1, xmm5
movups xmm2, [Param2 + 0x20]
movups xmm4, [Param3 + 0x20] ; reuse xmm4/xmm5 for the last two rows
addps xmm2, xmm4
movups xmm3, [Param2 + 0x30]
movups xmm5, [Param3 + 0x30]
addps xmm3, xmm5
movups [Param1 + 0x00], xmm0
movups [Param1 + 0x10], xmm1
movups [Param1 + 0x20], xmm2
movups [Param1 + 0x30], xmm3
ret
4080
;----------------------------------------------------------------------------
; TMatrix4 - Single: subtract replicated B from all 16 elements.
;----------------------------------------------------------------------------
_matrix4_sub_single:
shufps xmm0, xmm0, 0 ; B B B B
movups xmm1, [Param2 + 0x00]
subps xmm1, xmm0
movups xmm2, [Param2 + 0x10]
subps xmm2, xmm0
movups xmm3, [Param2 + 0x20]
subps xmm3, xmm0
movups xmm4, [Param2 + 0x30]
subps xmm4, xmm0
movups [Param1 + 0x00], xmm1
movups [Param1 + 0x10], xmm2
movups [Param1 + 0x20], xmm3
movups [Param1 + 0x30], xmm4
ret
4096
;----------------------------------------------------------------------------
; Single - TMatrix4: subtract every element from replicated A
; (operand order matters for subtraction).
;----------------------------------------------------------------------------
_single_sub_matrix4:
shufps xmm0, xmm0, 0 ; A A A A
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
movups xmm4, [Param2 + 0x00]
subps xmm0, xmm4 ; A - row 0
movups xmm5, [Param2 + 0x10]
subps xmm1, xmm5 ; A - row 1
movups xmm4, [Param2 + 0x20]
subps xmm2, xmm4 ; A - row 2
movups xmm5, [Param2 + 0x30]
subps xmm3, xmm5 ; A - row 3
movups [Param1 + 0x00], xmm0
movups [Param1 + 0x10], xmm1
movups [Param1 + 0x20], xmm2
movups [Param1 + 0x30], xmm3
ret
4115
;----------------------------------------------------------------------------
; Element-wise difference of two 4x4 matrices (A - B).
; In:  Param1 = result, Param2 = A, Param3 = B
;----------------------------------------------------------------------------
_matrix4_sub_matrix4:
movups xmm0, [Param2 + 0x00]
movups xmm4, [Param3 + 0x00]
subps xmm0, xmm4
movups xmm1, [Param2 + 0x10]
movups xmm5, [Param3 + 0x10]
subps xmm1, xmm5
movups xmm2, [Param2 + 0x20]
movups xmm4, [Param3 + 0x20] ; reuse xmm4/xmm5 for the last two rows
subps xmm2, xmm4
movups xmm3, [Param2 + 0x30]
movups xmm5, [Param3 + 0x30]
subps xmm3, xmm5
movups [Param1 + 0x00], xmm0
movups [Param1 + 0x10], xmm1
movups [Param1 + 0x20], xmm2
movups [Param1 + 0x30], xmm3
ret
4134
;----------------------------------------------------------------------------
; TMatrix4 * Single: scale all 16 elements by B.
;----------------------------------------------------------------------------
_matrix4_mul_single:
shufps xmm0, xmm0, 0 ; B B B B
movups xmm1, [Param2 + 0x00]
mulps xmm1, xmm0
movups xmm2, [Param2 + 0x10]
mulps xmm2, xmm0
movups xmm3, [Param2 + 0x20]
mulps xmm3, xmm0
movups xmm4, [Param2 + 0x30]
mulps xmm4, xmm0
movups [Param1 + 0x00], xmm1
movups [Param1 + 0x10], xmm2
movups [Param1 + 0x20], xmm3
movups [Param1 + 0x30], xmm4
ret
4150
;----------------------------------------------------------------------------
; Single * TMatrix4 (commutative with TMatrix4 * Single)
;----------------------------------------------------------------------------
_single_mul_matrix4:
shufps xmm0, xmm0, 0 ; A A A A
movups xmm4, [Param2 + 0x00]
mulps xmm4, xmm0
movups xmm5, [Param2 + 0x10]
mulps xmm5, xmm0
movups xmm2, [Param2 + 0x20]
mulps xmm2, xmm0
movups xmm3, [Param2 + 0x30]
mulps xmm3, xmm0
movups [Param1 + 0x00], xmm4
movups [Param1 + 0x10], xmm5
movups [Param1 + 0x20], xmm2
movups [Param1 + 0x30], xmm3
ret
4166
;----------------------------------------------------------------------------
; Component-wise (Hadamard) product of two 4x4 matrices.
; In:  Param1 = result, Param2 = Self, Param3 = AOther
;----------------------------------------------------------------------------
_matrix4_comp_mult:
movups xmm0, [Param2 + 0x00] ; Self[0]
movups xmm4, [Param3 + 0x00] ; AOther[0]
mulps xmm0, xmm4
movups xmm1, [Param2 + 0x10] ; Self[1]
movups xmm5, [Param3 + 0x10] ; AOther[1]
mulps xmm1, xmm5
movups xmm2, [Param2 + 0x20] ; Self[2]
movups xmm4, [Param3 + 0x20] ; AOther[2] (xmm4/xmm5 reused)
mulps xmm2, xmm4
movups xmm3, [Param2 + 0x30] ; Self[3]
movups xmm5, [Param3 + 0x30] ; AOther[3]
mulps xmm3, xmm5
movups [Param1 + 0x00], xmm0
movups [Param1 + 0x10], xmm1
movups [Param1 + 0x20], xmm2
movups [Param1 + 0x30], xmm3
ret
4189
; M4_MUL_V4 %1=matrix pointer, %2=vector pointer.
; Emits a complete function body (ends with ret) that multiplies a 4x4
; matrix by a 4-component vector: one mulps per row, then a 4x4 transpose
; of the partial-product registers so the row sums can be formed with
; three addps. Result: xmm0 = (X, Y), xmm1 = (Z, W).
%macro M4_MUL_V4 2
movups xmm0, [%2] ; Load vector
movups xmm4, [%1 + 0x00] ; Load 4 rows
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
movups xmm5, [%1 + 0x10]
mulps xmm0, xmm4 ; (Ax * B00), (Ay * B01), (Az * B02), (Aw * B03)
mulps xmm1, xmm5 ; (Ax * B10), (Ay * B11), (Az * B12), (Aw * B13)
movups xmm4, [%1 + 0x20]
movups xmm5, [%1 + 0x30]
mulps xmm2, xmm4 ; (Ax * B20), (Ay * B21), (Az * B22), (Aw * B23)
mulps xmm3, xmm5 ; (Ax * B30), (Ay * B31), (Az * B32), (Aw * B33)

; Transpose xmm0-xmm3
movaps xmm4, xmm2
unpcklps xmm2, xmm3 ; B32 B22 B33 B23
unpckhps xmm4, xmm3 ; B30 B20 B31 B21

movaps xmm3, xmm0
unpcklps xmm0, xmm1 ; B12 B02 B13 B03
unpckhps xmm3, xmm1 ; B10 B00 B11 B01

movaps xmm1, xmm0
unpcklpd xmm0, xmm2 ; B33 B23 B13 B03
unpckhpd xmm1, xmm2 ; B32 B22 B12 B02

movaps xmm2, xmm3
unpcklpd xmm2, xmm4 ; B31 B21 B11 B01
unpckhpd xmm3, xmm4 ; B30 B20 B10 B00

addps xmm0, xmm1 ; Add rows
addps xmm2, xmm3
addps xmm0, xmm2
movhlps xmm1, xmm0
ret
%endmacro
4227
; V4_MUL_M4 %1=vector pointer, %2=matrix pointer.
; Emits a complete function body (ends with ret): each vector component is
; broadcast and multiplied with one matrix row, then the four products are
; summed. Result: xmm0 = (X, Y), xmm1 = (Z, W).
%macro V4_MUL_M4 2
movups xmm0, [%1] ; Load vector
movups xmm4, [%2 + 0x00] ; Load 4 rows
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, 0x00 ; Bx Bx Bx Bx
shufps xmm1, xmm1, 0x55 ; By By By By
shufps xmm2, xmm2, 0xAA ; Bz Bz Bz Bz
shufps xmm3, xmm3, 0xFF ; Bw Bw Bw Bw
movups xmm5, [%2 + 0x10]
mulps xmm0, xmm4 ; (A00 * Bx), (A01 * Bx), (A02 * Bx), (A03 * Bx)
mulps xmm1, xmm5 ; (A10 * By), (A11 * By), (A12 * By), (A13 * By)
movups xmm4, [%2 + 0x20]
movups xmm5, [%2 + 0x30]
mulps xmm2, xmm4 ; (A20 * Bz), (A21 * Bz), (A22 * Bz), (A23 * Bz)
mulps xmm3, xmm5 ; (A30 * Bw), (A31 * Bw), (A32 * Bw), (A33 * Bw)
addps xmm0, xmm1 ; Add rows
addps xmm2, xmm3
addps xmm0, xmm2
movhlps xmm1, xmm0
ret
%endmacro
4251
; M4_MUL_M4 %1=pointer to matrix A, %2=pointer to matrix B.
; Emits a complete function body (ends with ret) computing the 4x4 product
; into [Param1]. B's four rows stay resident in xmm4-xmm7; each row of A is
; broadcast component-by-component and accumulated against them.
%macro M4_MUL_M4 2
; A.R[0] * B
movups xmm0, [%1 + 0x00]
movups xmm4, [%2 + 0x00]
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, 0x00
shufps xmm1, xmm1, 0x55
shufps xmm2, xmm2, 0xAA
shufps xmm3, xmm3, 0xFF
movups xmm5, [%2 + 0x10]
movups xmm6, [%2 + 0x20]
movups xmm7, [%2 + 0x30]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movups [Param1 + 0x00], xmm0

; A.R[1] * B
movups xmm0, [%1 + 0x10]
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, 0x00
shufps xmm1, xmm1, 0x55
shufps xmm2, xmm2, 0xAA
shufps xmm3, xmm3, 0xFF
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movups [Param1 + 0x10], xmm0

; A.R[2] * B
movups xmm0, [%1 + 0x20]
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, 0x00
shufps xmm1, xmm1, 0x55
shufps xmm2, xmm2, 0xAA
shufps xmm3, xmm3, 0xFF
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movups [Param1 + 0x20], xmm0

; A.R[3] * B
movups xmm0, [%1 + 0x30]
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, 0x00
shufps xmm1, xmm1, 0x55
shufps xmm2, xmm2, 0xAA
shufps xmm3, xmm3, 0xFF
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movups [Param1 + 0x30], xmm0
ret
%endmacro
4330
; Instantiate the TMatrix4 multiply entry points. In column-major mode the
; operand order is swapped (M*V becomes V*M and A*B becomes B*A), so the
; same two macros implement both storage layouts.
%ifdef FM_COLUMN_MAJOR
_matrix4_mul_vector4:
V4_MUL_M4 Param2, Param1

_vector4_mul_matrix4:
M4_MUL_V4 Param2, Param1

_matrix4_mul_matrix4:
M4_MUL_M4 Param3, Param2
%else
_matrix4_mul_vector4:
M4_MUL_V4 Param1, Param2

_vector4_mul_matrix4:
V4_MUL_M4 Param1, Param2

_matrix4_mul_matrix4:
M4_MUL_M4 Param2, Param3
%endif
4350
;----------------------------------------------------------------------------
; TMatrix4 / Single: divide all 16 elements by B.
; NOTE: multiplying by a reciprocal from rcpps would be faster, but rcpps
; is only an approximation, so true division is used for accuracy.
;----------------------------------------------------------------------------
_matrix4_div_single:
shufps xmm0, xmm0, 0 ; B B B B
movups xmm1, [Param2 + 0x00]
divps xmm1, xmm0
movups xmm2, [Param2 + 0x10]
divps xmm2, xmm0
movups xmm3, [Param2 + 0x20]
divps xmm3, xmm0
movups xmm4, [Param2 + 0x30]
divps xmm4, xmm0
movups [Param1 + 0x00], xmm1
movups [Param1 + 0x10], xmm2
movups [Param1 + 0x20], xmm3
movups [Param1 + 0x30], xmm4
ret
4367
;----------------------------------------------------------------------------
; Single / TMatrix4: divide replicated A by each element
; (operand order matters for division).
;----------------------------------------------------------------------------
_single_div_matrix4:
shufps xmm0, xmm0, 0 ; A A A A
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
movups xmm4, [Param2 + 0x00]
divps xmm0, xmm4 ; A / row 0
movups xmm5, [Param2 + 0x10]
divps xmm1, xmm5 ; A / row 1
movups xmm4, [Param2 + 0x20]
divps xmm2, xmm4 ; A / row 2
movups xmm5, [Param2 + 0x30]
divps xmm3, xmm5 ; A / row 3
movups [Param1 + 0x00], xmm0
movups [Param1 + 0x10], xmm1
movups [Param1 + 0x20], xmm2
movups [Param1 + 0x30], xmm3
ret
4386
;----------------------------------------------------------------------------
; -TMatrix4: negate all 16 elements by flipping the IEEE sign bits.
;----------------------------------------------------------------------------
_matrix4_negative:
movaps xmm4, [rel kSSE_MASK_SIGN] ; 4 x 0x80000000 (aligned constant)
movups xmm0, [Param2 + 0x00]
xorps xmm0, xmm4
movups xmm1, [Param2 + 0x10]
xorps xmm1, xmm4
movups xmm2, [Param2 + 0x20]
xorps xmm2, xmm4
movups xmm3, [Param2 + 0x30]
xorps xmm3, xmm4
movups [Param1 + 0x00], xmm0
movups [Param1 + 0x10], xmm1
movups [Param1 + 0x20], xmm2
movups [Param1 + 0x30], xmm3
ret
4402
;-----------------------------------------------------------------------
; M4_INVERSE %1, %2 — 4x4 single-precision matrix inverse via cofactor
; (adjugate) expansion, the classic GLM-style SSE formulation.
;   %1 = destination pointer, %2 = source pointer (may be the same
;        register; the source is fully consumed before %1 is written).
; Builds the six cofactor-pair vectors F0..F5 (kept in xmm8-xmm13),
; combines them with the broadcast vectors V0..V3 into the four
; adjugate rows I0..I3, applies the +/- checkerboard sign masks, then
; scales by 1/determinant (determinant = dot(row0 of A, row0 of adj)).
; Clobbers xmm0-xmm13 (all XMM regs are volatile under SysV).
; NOTE: the macro body ends with RET, so invoking labels need no
; trailing code. Division by the determinant is a true DIVPS (no
; reciprocal approximation); a singular matrix yields Inf/NaN.
; Uses kSSE_ONE / kSSE_MASK_PNPN / kSSE_MASK_NPNP from .data.
;-----------------------------------------------------------------------
%macro M4_INVERSE 2
movups xmm1, [%2 + 0x10] ; M[1]
movups xmm2, [%2 + 0x20] ; M[2]
movups xmm3, [%2 + 0x30] ; M[3]

; C00 := (A.M[2,2] * A.M[3,3]) - (A.M[3,2] * A.M[2,3]);
; C02 := (A.M[1,2] * A.M[3,3]) - (A.M[3,2] * A.M[1,3]);
; C03 := (A.M[1,2] * A.M[2,3]) - (A.M[2,2] * A.M[1,3]);
; F0 := Vector4(C00, C00, C02, C03);
; (lane comments below list lanes high-to-low: lane3 lane2 lane1 lane0)
movaps xmm5, xmm2 ; M[2]
movaps xmm7, xmm2 ; M[2]
movaps xmm0, xmm3 ; M[3]
movaps xmm6, xmm3 ; M[3]
shufps xmm6, xmm2, 0xAA ; M22 M22 M32 M32
shufps xmm0, xmm2, 0xFF ; M23 M23 M33 M33
shufps xmm7, xmm1, 0xFF ; M13 M13 M23 M23
pshufd xmm4, xmm0, 0x80 ; M23 M33 M33 M33
shufps xmm5, xmm1, 0xAA ; M12 M12 M22 M22
pshufd xmm0, xmm6, 0x80 ; M22 M32 M32 M32
mulps xmm5, xmm4 ; (M12 * M23) (M12 * M33) (M22 * M33) (M22 * M33)
mulps xmm7, xmm0 ; (M22 * M13) (M32 * M13) (M32 * M23) (M32 * M23)
subps xmm5, xmm7 ; C03=(M12*M23)-(M22*M13), C02=(M12*M33)-(M32*M13), C00=(M22*M33)-(M32*M23), C00=(M22*M33)-(M32*M23)
movups xmm8, xmm5 ; Save F0 in xmm8 (kept live through the whole macro)

; C04 := (A.M[2,1] * A.M[3,3]) - (A.M[3,1] * A.M[2,3]);
; C06 := (A.M[1,1] * A.M[3,3]) - (A.M[3,1] * A.M[1,3]);
; C07 := (A.M[1,1] * A.M[2,3]) - (A.M[2,1] * A.M[1,3]);
; F1 := Vector4(C04, C04, C06, C07);
movaps xmm5, xmm2 ; M[2]
movaps xmm7, xmm2 ; M[2]
movaps xmm0, xmm3 ; M[3]
movaps xmm6, xmm3 ; M[3]
shufps xmm6, xmm2, 0x55 ; M21 M21 M31 M31
shufps xmm0, xmm2, 0xFF ; M23 M23 M33 M33
shufps xmm7, xmm1, 0xFF ; M13 M13 M23 M23
pshufd xmm4, xmm0, 0x80 ; M23 M33 M33 M33
shufps xmm5, xmm1, 0x55 ; M11 M11 M21 M21
pshufd xmm0, xmm6, 0x80 ; M21 M31 M31 M31
mulps xmm5, xmm4 ; (M11 * M23) (M11 * M33) (M21 * M33) (M21 * M33)
mulps xmm7, xmm0 ; (M21 * M13) (M31 * M13) (M31 * M23) (M31 * M23)
subps xmm5, xmm7 ; C07=(M11*M23)-(M21*M13), C06=(M11*M33)-(M31*M13), C04=(M21*M33)-(M31*M23), C04=(M21*M33)-(M31*M23)
movups xmm9, xmm5 ; Save F1 in xmm9

; C08 := (A.M[2,1] * A.M[3,2]) - (A.M[3,1] * A.M[2,2]);
; C10 := (A.M[1,1] * A.M[3,2]) - (A.M[3,1] * A.M[1,2]);
; C11 := (A.M[1,1] * A.M[2,2]) - (A.M[2,1] * A.M[1,2]);
; F2 := Vector4(C08, C08, C10, C11);
movaps xmm5, xmm2 ; M[2]
movaps xmm7, xmm2 ; M[2]
movaps xmm0, xmm3 ; M[3]
movaps xmm6, xmm3 ; M[3]
shufps xmm6, xmm2, 0x55 ; M21 M21 M31 M31
shufps xmm0, xmm2, 0xAA ; M22 M22 M32 M32
shufps xmm7, xmm1, 0xAA ; M12 M12 M22 M22
pshufd xmm4, xmm0, 0x80 ; M22 M32 M32 M32
shufps xmm5, xmm1, 0x55 ; M11 M11 M21 M21
pshufd xmm0, xmm6, 0x80 ; M21 M31 M31 M31
mulps xmm5, xmm4 ; (M11 * M22) (M11 * M32) (M21 * M32) (M21 * M32)
mulps xmm7, xmm0 ; (M21 * M12) (M31 * M12) (M31 * M22) (M31 * M22)
subps xmm5, xmm7 ; C11=(M11*M22)-(M21*M12), C10=(M11*M32)-(M31*M12), C08=(M21*M32)-(M31*M22), C08=(M21*M32)-(M31*M22)
movups xmm10, xmm5 ; Save F2 in xmm10

; C12 := (A.M[2,0] * A.M[3,3]) - (A.M[3,0] * A.M[2,3]);
; C14 := (A.M[1,0] * A.M[3,3]) - (A.M[3,0] * A.M[1,3]);
; C15 := (A.M[1,0] * A.M[2,3]) - (A.M[2,0] * A.M[1,3]);
; F3 := Vector4(C12, C12, C14, C15);
movaps xmm5, xmm2 ; M[2]
movaps xmm7, xmm2 ; M[2]
movaps xmm0, xmm3 ; M[3]
movaps xmm6, xmm3 ; M[3]
shufps xmm6, xmm2, 0x00 ; M20 M20 M30 M30
shufps xmm0, xmm2, 0xFF ; M23 M23 M33 M33
shufps xmm7, xmm1, 0xFF ; M13 M13 M23 M23
pshufd xmm4, xmm0, 0x80 ; M23 M33 M33 M33
shufps xmm5, xmm1, 0x00 ; M10 M10 M20 M20
pshufd xmm0, xmm6, 0x80 ; M20 M30 M30 M30
mulps xmm5, xmm4 ; (M10 * M23) (M10 * M33) (M20 * M33) (M20 * M33)
mulps xmm7, xmm0 ; (M20 * M13) (M30 * M13) (M30 * M23) (M30 * M23)
subps xmm5, xmm7 ; C15=(M10*M23)-(M20*M13), C14=(M10*M33)-(M30*M13), C12=(M20*M33)-(M30*M23), C12=(M20*M33)-(M30*M23)
movups xmm11, xmm5 ; Save F3 in xmm11

; C16 := (A.M[2,0] * A.M[3,2]) - (A.M[3,0] * A.M[2,2]);
; C18 := (A.M[1,0] * A.M[3,2]) - (A.M[3,0] * A.M[1,2]);
; C19 := (A.M[1,0] * A.M[2,2]) - (A.M[2,0] * A.M[1,2]);
; F4 := Vector4(C16, C16, C18, C19);
movaps xmm5, xmm2 ; M[2]
movaps xmm7, xmm2 ; M[2]
movaps xmm0, xmm3 ; M[3]
movaps xmm6, xmm3 ; M[3]
shufps xmm6, xmm2, 0x00 ; M20 M20 M30 M30
shufps xmm0, xmm2, 0xAA ; M22 M22 M32 M32
shufps xmm7, xmm1, 0xAA ; M12 M12 M22 M22
pshufd xmm4, xmm0, 0x80 ; M22 M32 M32 M32
shufps xmm5, xmm1, 0x00 ; M10 M10 M20 M20
pshufd xmm0, xmm6, 0x80 ; M20 M30 M30 M30
mulps xmm5, xmm4 ; (M10 * M22) (M10 * M32) (M20 * M32) (M20 * M32)
mulps xmm7, xmm0 ; (M20 * M12) (M30 * M12) (M30 * M22) (M30 * M22)
subps xmm5, xmm7 ; C19=(M10*M22)-(M20*M12), C18=(M10*M32)-(M30*M12), C16=(M20*M32)-(M30*M22), C16=(M20*M32)-(M30*M22)
movups xmm12, xmm5 ; Save F4 in xmm12

; C20 := (A.M[2,0] * A.M[3,1]) - (A.M[3,0] * A.M[2,1]);
; C22 := (A.M[1,0] * A.M[3,1]) - (A.M[3,0] * A.M[1,1]);
; C23 := (A.M[1,0] * A.M[2,1]) - (A.M[2,0] * A.M[1,1]);
; F5 := Vector4(C20, C20, C22, C23);
movaps xmm5, xmm2 ; M[2]
movaps xmm7, xmm2 ; M[2]
movaps xmm0, xmm3 ; M[3]
movaps xmm6, xmm3 ; M[3]
shufps xmm6, xmm2, 0x00 ; M20 M20 M30 M30
shufps xmm0, xmm2, 0x55 ; M21 M21 M31 M31
shufps xmm7, xmm1, 0x55 ; M11 M11 M21 M21
pshufd xmm4, xmm0, 0x80 ; M21 M31 M31 M31
shufps xmm5, xmm1, 0x00 ; M10 M10 M20 M20
pshufd xmm0, xmm6, 0x80 ; M20 M30 M30 M30
mulps xmm5, xmm4 ; (M10 * M21) (M10 * M31) (M20 * M31) (M20 * M31)
mulps xmm7, xmm0 ; (M20 * M11) (M30 * M11) (M30 * M21) (M30 * M21)
subps xmm5, xmm7 ; C23=(M10*M21)-(M20*M11), C22=(M10*M31)-(M30*M11), C20=(M20*M31)-(M30*M21), C20=(M20*M31)-(M30*M21)
movups xmm13, xmm5 ; Save F5 in xmm13

; V0 := Vector4(A.M[1,0], A.M[0,0], A.M[0,0], A.M[0,0]);
; V1 := Vector4(A.M[1,1], A.M[0,1], A.M[0,1], A.M[0,1]);
; V2 := Vector4(A.M[1,2], A.M[0,2], A.M[0,2], A.M[0,2]);
; V3 := Vector4(A.M[1,3], A.M[0,3], A.M[0,3], A.M[0,3]);
movups xmm0, [%2 + 0x00] ; M[0] (first access to row 0; %1 may alias %2)
movaps xmm4, xmm1 ; M[1]
movaps xmm5, xmm1 ; M[1]
movaps xmm6, xmm1 ; M[1]
movaps xmm7, xmm1 ; M[1]

shufps xmm4, xmm0, 0x00 ; M00 M00 M10 M10
shufps xmm5, xmm0, 0x55 ; M01 M01 M11 M11
shufps xmm6, xmm0, 0xAA ; M02 M02 M12 M12
shufps xmm7, xmm0, 0xFF ; M03 M03 M13 M13

pshufd xmm4, xmm4, 0xA8 ; V0=M00 M00 M00 M10
pshufd xmm5, xmm5, 0xA8 ; V1=M01 M01 M01 M11
pshufd xmm6, xmm6, 0xA8 ; V2=M02 M02 M02 M12
pshufd xmm7, xmm7, 0xA8 ; V3=M03 M03 M03 M13

; I0 := (V1 * F0) - (V2 * F1) + (V3 * F2);
; I1 := (V0 * F0) - (V2 * F3) + (V3 * F4);
; I2 := (V0 * F1) - (V1 * F3) + (V3 * F5);
; I3 := (V0 * F2) - (V1 * F4) + (V2 * F5);
movaps xmm0, xmm5 ; V1
movaps xmm1, xmm6 ; V2
movaps xmm2, xmm7 ; V3
mulps xmm0, xmm8 ; V1 * F0
mulps xmm1, xmm9 ; V2 * F1
mulps xmm2, xmm10 ; V3 * F2
subps xmm0, xmm1 ; (V1 * F0) - (V2 * F1)
movaps xmm1, xmm4 ; V0
addps xmm0, xmm2 ; I0=(V1 * F0) - (V2 * F1) + (V3 * F2)

movaps xmm2, xmm6 ; V2
movaps xmm3, xmm7 ; V3
mulps xmm1, xmm8 ; V0 * F0
mulps xmm2, xmm11 ; V2 * F3
mulps xmm3, xmm12 ; V3 * F4
subps xmm1, xmm2 ; (V0 * F0) - (V2 * F3)
movaps xmm2, xmm4 ; V0
addps xmm1, xmm3 ; I1=(V0 * F0) - (V2 * F3) + (V3 * F4)

movaps xmm3, xmm5 ; V1
mulps xmm2, xmm9 ; V0 * F1
mulps xmm3, xmm11 ; V1 * F3
mulps xmm7, xmm13 ; V3 * F5 (V3 no longer needed; destroy in place)
subps xmm2, xmm3 ; (V0 * F1) - (V1 * F3)
mulps xmm4, xmm10 ; V0 * F2 (V0 no longer needed; destroy in place)
addps xmm2, xmm7 ; I2=(V0 * F1) - (V1 * F3) + (V3 * F5)

mulps xmm5, xmm12 ; V1 * F4
mulps xmm6, xmm13 ; V2 * F5
subps xmm4, xmm5 ; (V0 * F2) - (V1 * F4)
addps xmm4, xmm6 ; I3=(V0 * F2) - (V1 * F4) + (V2 * F5)

; SA := Vector4(+1, -1, +1, -1);
; SB := Vector4(-1, +1, -1, +1);
; Inv := Matrix4(I0 * SA, I1 * SB, I2 * SA, I3 * SB);
; (sign flips done with XOR masks instead of multiplies)

movaps xmm6, [rel kSSE_MASK_PNPN] ; SA
movaps xmm7, [rel kSSE_MASK_NPNP] ; SB
xorps xmm0, xmm6 ; Inv[0] = I0 * SA
xorps xmm1, xmm7 ; Inv[1] = I1 * SB
xorps xmm2, xmm6 ; Inv[2] = I2 * SA
xorps xmm4, xmm7 ; Inv[3] = I3 * SB

; Row := Vector4(Inv[0,0], Inv[1,0], Inv[2,0], Inv[3,0]);
movaps xmm3, xmm0
movaps xmm5, xmm2
movaps xmm6, xmm1

unpcklps xmm3, xmm1 ; Inv[1,1] Inv[0,1] Inv[1,0] Inv[0,0]
unpcklps xmm5, xmm4 ; Inv[3,1] Inv[2,1] Inv[3,0] Inv[2,0]
movups xmm6, [%2 + 0x00] ; A.C[0]
movlhps xmm3, xmm5 ; Inv[3,0] Inv[2,0] Inv[1,0] Inv[0,0]

; Dot := A.C[0] * Row;
mulps xmm3, xmm6 ; Dot.W Dot.Z Dot.Y Dot.X

; OneOverDeterminant := 1 / ((Dot.X + Dot.Y) + (Dot.Z + Dot.W));
; (horizontal add via two shuffle+add rounds; all 4 lanes end up equal)
pshufd xmm6, xmm3, 0x4E ; Dot.Y Dot.X Dot.W Dot.Z
addps xmm3, xmm6 ; W+Y Z+X Y+W X+Z
pshufd xmm6, xmm3, 0x11 ; X+Z Y+X X+Z Y+W
movaps xmm5, [rel kSSE_ONE] ; 1.0 (4x)
addps xmm3, xmm6 ; X+Y+Z+W (4x)
divps xmm5, xmm3 ; OneOverDeterminant (4x)

; Result := Inv * OneOverDeterminant;
mulps xmm0, xmm5
mulps xmm1, xmm5
mulps xmm2, xmm5
mulps xmm4, xmm5

movups [%1 + 0x00], xmm0
movups [%1 + 0x10], xmm1
movups [%1 + 0x20], xmm2
movups [%1 + 0x30], xmm4
ret
%endmacro
4622
; Matrix4 inverse, result written to a separate destination.
; [Param1] := inverse of [Param2]. The macro body ends with RET.
_matrix4_inverse:
M4_INVERSE Param1, Param2
4625
; In-place Matrix4 inverse: [Param1] := inverse of [Param1].
; Safe because M4_INVERSE reads all source data before its first store.
_matrix4_set_inversed:
M4_INVERSE Param1, Param1
4628
;-----------------------------------------------------------------------
; M4_TRANSPOSE %1, %2 — 4x4 single-precision matrix transpose using
; the standard unpcklps/unpckhps + unpcklpd/unpckhpd 4x4 shuffle.
;   %1 = destination pointer, %2 = source pointer (may alias: all
;        loads complete before the first store).
; Clobbers xmm0-xmm4. The macro body ends with RET.
; Lane comments list lanes high-to-low (lane3 lane2 lane1 lane0).
;-----------------------------------------------------------------------
%macro M4_TRANSPOSE 2
movups xmm0, [%2 + 0x00] ; A03 A02 A01 A00
movups xmm1, [%2 + 0x10] ; A13 A12 A11 A10
movups xmm2, [%2 + 0x20] ; A23 A22 A21 A20
movups xmm3, [%2 + 0x30] ; A33 A32 A31 A30

; Interleave rows 2/3 into 32-bit pairs
movaps xmm4, xmm2
unpcklps xmm2, xmm3 ; A31 A21 A30 A20
unpckhps xmm4, xmm3 ; A33 A23 A32 A22

; Interleave rows 0/1 into 32-bit pairs
movaps xmm3, xmm0
unpcklps xmm0, xmm1 ; A11 A01 A10 A00
unpckhps xmm3, xmm1 ; A13 A03 A12 A02

; Combine 64-bit halves to form transposed columns 0 and 1
movaps xmm1, xmm0
unpcklpd xmm0, xmm2 ; A30 A20 A10 A00
unpckhpd xmm1, xmm2 ; A31 A21 A11 A01

; Combine 64-bit halves to form transposed columns 2 and 3
movaps xmm2, xmm3
unpcklpd xmm2, xmm4 ; A32 A22 A12 A02
unpckhpd xmm3, xmm4 ; A33 A23 A13 A03

movups [%1 + 0x00], xmm0
movups [%1 + 0x10], xmm1
movups [%1 + 0x20], xmm2
movups [%1 + 0x30], xmm3
ret
%endmacro
4657
; Matrix4 transpose, result written to a separate destination.
; [Param1] := transpose of [Param2]. The macro body ends with RET.
_matrix4_transpose:
M4_TRANSPOSE Param1, Param2
4660
; In-place Matrix4 transpose: [Param1] := transpose of [Param1].
; Safe because M4_TRANSPOSE loads all 4 rows before its first store.
_matrix4_set_transposed:
M4_TRANSPOSE Param1, Param1