; MathgeomGLS - Fork 0 - 4662 lines, 144.6 KB
; MacOS64 uses the System V AMD64 ABI:
; * First 6 (up to) 64-bit int/pointer parameters in RDI, RSI, RDX, RCX, R8, R9
; * First 8 float parameters in XMM0-XMM7
; * (Up to) 64-bit int return value: RAX
; * (Up to) 128-bit int return value: RAX, RDX
; * Float return value(s): XMM0, XMM1. This applies to records with float values
;   as well: the first 2 Single values are returned in XMM0 and the next two in
;   XMM1 (so for a TVector4, XMM0 contains X and Y, and XMM1 contains Z and W).
;   Use "movhlps xmm1, xmm0" to copy the upper 2 floats from xmm0 to the lower
;   2 floats of xmm1
; * For return values larger than 128 bits, the first parameter (RDI) will be
;   set by the caller to the address of the return value (and all other
;   parameters move one up).
; * RBX, RBP and R12-R15 must be saved
; * RAX, RCX, RDX, RSI, RDI, R8-R11 can be modified
; * All XMM registers can be modified
; * For leaf-node functions (that don't call other functions), the 128 bytes
;   below the stack pointer (the red-zone) can be freely used.
; * The ".data" segment is aligned, so you can use "movaps" and friends
; * Parameter pointers do *not* have to be aligned, so you should use "movups"

BITS 64

section .data

; Every constant below is exactly 16 bytes (4 dwords), so with this single
; ALIGN each label stays 16-byte aligned and can be read with "movaps".
ALIGN 16

; SSE rounding modes (bits in MXCSR register)
%define SSE_ROUND_MASK 0xFFFF9FFF
%define SSE_ROUND_NEAREST 0x00000000
%define SSE_ROUND_DOWN 0x00002000
%define SSE_ROUND_UP 0x00004000
%define SSE_ROUND_TRUNC 0x00006000

; These constants fit in a single XMM register. These values represent
; sign-bits as used by 32-bit floating-point values.
; XOR'ing a floating-point value with 0x80000000 swaps the sign.
; XOR'ing a floating-point value with 0x00000000 leaves the value unchanged.
kSSE_MASK_SIGN:
  times 4 dd 0x80000000

kSSE_MASK_NPNP:
  dd 0x80000000, 0x00000000, 0x80000000, 0x00000000

kSSE_MASK_PNPN:
  dd 0x00000000, 0x80000000, 0x00000000, 0x80000000

; All bits set in lanes 0-2, all bits clear in lane 3.
kSSE_MASK_0FFF:
  dd 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000

; These constants mask off an element of the binary representation of a
; 32-bit floating-point value.
kSSE_MASK_FRACTION:
  times 4 dd 0x007FFFFF

kSSE_MASK_EXPONENT:
  times 4 dd 0x7F800000

kSSE_MASK_ABS_VAL:
  times 4 dd 0x7FFFFFFF

; Commonly used floating-point values
kSSE_ONE_HALF:
  times 4 dd 0.5

kSSE_ONE:
  times 4 dd 1.0

kSSE_TWO:
  times 4 dd 2.0

kSSE_THREE:
  times 4 dd 3.0

kSSE_PI_OVER_180:
  times 4 dd 0.01745329251994329576923690768489

kSSE_180_OVER_PI:
  times 4 dd 57.295779513082320876798154814105

kSSE_NEG_INFINITY:
  times 4 dd -__Infinity__

kSSE_PI_OVER_4:
  times 4 dd 0.78539816339744830961566084581988

; Commonly used integer values
kSSE_INT_ONE:
  times 4 dd 1

kSSE_INT_NOT_ONE:
  times 4 dd 0xFFFFFFFE

kSSE_INT_TWO:
  times 4 dd 2

kSSE_INT_FOUR:
  times 4 dd 4

; Constants for approximating trigonometric functions
kSSE_FOPI:
  times 4 dd 1.27323954473516

kSSE_SINCOF_P0:
  times 4 dd -1.9515295891E-4

kSSE_SINCOF_P1:
  times 4 dd 8.3321608736E-3

kSSE_SINCOF_P2:
  times 4 dd -1.6666654611E-1

kSSE_COSCOF_P0:
  times 4 dd 2.443315711809948E-005

kSSE_COSCOF_P1:
  times 4 dd -1.388731625493765E-003

kSSE_COSCOF_P2:
  times 4 dd 4.166664568298827E-002

kSSE_EXP_A1:
  times 4 dd 12102203.1615614

kSSE_EXP_A2:
  times 4 dd 1065353216.0

kSSE_EXP_CST:
  times 4 dd 2139095040.0

kSSE_EXP_F1:
  times 4 dd 0.509964287281036376953125

kSSE_EXP_F2:
  times 4 dd 0.3120158612728118896484375

kSSE_EXP_F3:
  times 4 dd 0.1666135489940643310546875

kSSE_EXP_F4:
  times 4 dd -2.12528370320796966552734375e-3

kSSE_EXP_F5:
  times 4 dd 1.3534179888665676116943359375e-2

kSSE_EXP_I1:
  times 4 dd 0x3F800000

kSSE_LN_CST:
  times 4 dd -89.93423858

kSSE_LN_F1:
  times 4 dd 3.3977745

kSSE_LN_F2:
  times 4 dd 2.2744832

kSSE_LN_F3:
  times 4 dd 0.024982445

kSSE_LN_F4:
  times 4 dd 0.24371102

kSSE_LN_F5:
  times 4 dd 0.69314718055995

kSSE_LOG2_I1:
  times 4 dd 0x3F000000

kSSE_LOG2_F1:
  times 4 dd 1.1920928955078125e-7

kSSE_LOG2_F2:
  times 4 dd 124.22551499

kSSE_LOG2_F3:
  times 4 dd 1.498030302

kSSE_LOG2_F4:
  times 4 dd 1.72587999

kSSE_LOG2_F5:
  times 4 dd 0.3520887068

kSSE_EXP2_F1:
  times 4 dd 121.2740575

kSSE_EXP2_F2:
  times 4 dd 27.7280233

kSSE_EXP2_F3:
  times 4 dd 4.84252568

kSSE_EXP2_F4:
  times 4 dd 1.49012907

kSSE_EXP2_F5:
  times 4 dd 8388608.0

section .text

; Integer/pointer argument registers, in System V AMD64 order.
; "Self" is an alias for the first argument register.
%define Param1 rdi
%define Param2 rsi
%define Param3 rdx
%define Self rdi

; Scratch slots below the stack pointer. These are plain text substitutions,
; so they must be used inside brackets, e.g. [OldFlags].
; NOTE(review): presumably used to save/modify MXCSR with the SSE_ROUND_*
; values; the users are not in this part of the file - confirm.
%define OldFlags rsp-32
%define NewFlags rsp-48

; Exported symbols (leading underscore as required by Mach-O).
global _radians_vector2, _radians_vector3, _radians_vector4
global _degrees_vector2, _degrees_vector3, _degrees_vector4
global _sqrt_single, _sqrt_vector2, _sqrt_vector3, _sqrt_vector4
global _inverse_sqrt_single, _inverse_sqrt_vector2, _inverse_sqrt_vector3, _inverse_sqrt_vector4
global _fast_sin_single, _fast_sin_vector2, _fast_sin_vector3, _fast_sin_vector4
global _fast_cos_single, _fast_cos_vector2, _fast_cos_vector3, _fast_cos_vector4
global _fast_sin_cos_single, _fast_sin_cos_vector2, _fast_sin_cos_vector3, _fast_sin_cos_vector4
global _fast_exp_single, _fast_exp_vector2, _fast_exp_vector3, _fast_exp_vector4
global _fast_ln_single, _fast_ln_vector2, _fast_ln_vector3, _fast_ln_vector4
global _fast_log2_single, _fast_log2_vector2, _fast_log2_vector3, _fast_log2_vector4
global _fast_exp2_single, _fast_exp2_vector2, _fast_exp2_vector3, _fast_exp2_vector4
global _abs_vector3, _abs_vector4
global _sign_single, _sign_vector2, _sign_vector3, _sign_vector4
global _floor_single, _floor_vector2, _floor_vector3, _floor_vector4
global _trunc_single, _trunc_vector2, _trunc_vector3, _trunc_vector4
global _round_single, _round_vector2, _round_vector3, _round_vector4
global _ceil_single, _ceil_vector2, _ceil_vector3, _ceil_vector4
global _frac_vector2, _frac_vector3, _frac_vector4
global _fmod_vector2_single, _fmod_vector3_single, _fmod_vector4_single
global _fmod_vector2, _fmod_vector3, _fmod_vector4
global _modf_vector2, _modf_vector3, _modf_vector4
global _min_vector2_single, _min_vector3_single, _min_vector4_single
global _min_vector2, _min_vector3, _min_vector4
global _max_vector2_single, _max_vector3_single, _max_vector4_single
global _max_vector2, _max_vector3, _max_vector4
global _ensure_range_single
global _ensure_range_vector2_single, _ensure_range_vector3_single, _ensure_range_vector4_single
global _ensure_range_vector2, _ensure_range_vector3, _ensure_range_vector4
global _mix_vector3_single, _mix_vector4_single
global _mix_vector3, _mix_vector4
global _step_single_vector2, _step_single_vector3, _step_single_vector4
global _step_vector2, _step_vector3, _step_vector4
global _smooth_step_single_vector3, _smooth_step_single_vector4
global _smooth_step_vector3, _smooth_step_vector4
global _fma_vector2, _fma_vector3, _fma_vector4
global _outer_product_matrix2, _outer_product_matrix3, _outer_product_matrix4
global _vector2_div_single, _single_div_vector2, _vector2_div_vector2
global _vector2_normalize_fast, _vector2_set_normalized_fast
global _vector3_add_single, _single_add_vector3, _vector3_add_vector3
global _vector3_sub_single, _single_sub_vector3, _vector3_sub_vector3
global _vector3_mul_single, _single_mul_vector3, _vector3_mul_vector3
global _vector3_div_single, _single_div_vector3, _vector3_div_vector3
global _vector3_distance, _vector3_distance_squared
global _vector3_get_length, _vector3_get_length_squared
global _vector3_normalize_fast, _vector3_set_normalized_fast
global _vector3_reflect, _vector3_refract
global _vector4_add_single, _single_add_vector4, _vector4_add_vector4
global _vector4_sub_single, _single_sub_vector4, _vector4_sub_vector4
global _vector4_mul_single, _single_mul_vector4, _vector4_mul_vector4
global _vector4_div_single, _single_div_vector4, _vector4_div_vector4
global _vector4_negative
global _vector4_distance, _vector4_distance_squared
global _vector4_face_forward
global _vector4_get_length, _vector4_get_length_squared
global _vector4_normalize_fast, _vector4_set_normalized_fast
global _vector4_reflect, _vector4_refract
global _matrix3_add_single, _single_add_matrix3, _matrix3_add_matrix3
global _matrix3_sub_single, _single_sub_matrix3, _matrix3_sub_matrix3
global _matrix3_mul_single, _single_mul_matrix3, _matrix3_comp_mult
global _matrix3_mul_vector3, _vector3_mul_matrix3, _matrix3_mul_matrix3
global _matrix3_div_single, _single_div_matrix3
global _matrix3_negative, _matrix3_transpose, _matrix3_set_transposed
global _matrix4_add_single, _single_add_matrix4, _matrix4_add_matrix4
global _matrix4_sub_single, _single_sub_matrix4, _matrix4_sub_matrix4
global _matrix4_mul_single, _single_mul_matrix4, _matrix4_comp_mult
global _matrix4_mul_vector4, _vector4_mul_matrix4, _matrix4_mul_matrix4
global _matrix4_div_single, _single_div_matrix4
global _matrix4_negative, _matrix4_inverse, _matrix4_set_inversed
global _matrix4_transpose, _matrix4_set_transposed

;****************************************************************************
; Angle and Trigonometry Functions
;****************************************************************************

;----------------------------------------------------------------------------
; _radians_vector2: multiplies two Singles by pi/180.
; In:    Param1 = pointer to 2 Singles (X, Y); may be unaligned
; Out:   xmm0   = X, Y in the low two lanes
; Clobb: xmm0, xmm1
;----------------------------------------------------------------------------
_radians_vector2:
  movq      xmm0, [Param1]           ; X, Y; zeroes the upper lanes so mulps
                                     ; never sees stale caller data
  movaps    xmm1, [rel kSSE_PI_OVER_180]
  mulps     xmm0, xmm1
  ret

;----------------------------------------------------------------------------
; _radians_vector3: multiplies three Singles by pi/180.
; In:    Param1 = pointer to 3 Singles (X, Y, Z); may be unaligned
; Out:   xmm0   = X, Y in the low two lanes
;        xmm1   = Z in the low lane
; Clobb: xmm0, xmm1, xmm2
;----------------------------------------------------------------------------
_radians_vector3:
  movq      xmm0, [Param1]           ; X, Y; upper lanes zeroed
  movss     xmm1, [Param1+8]         ; Z
  movaps    xmm2, [rel kSSE_PI_OVER_180]
  mulps     xmm0, xmm2
  mulss     xmm1, xmm2
  ret

;----------------------------------------------------------------------------
; _radians_vector4: multiplies four Singles by pi/180.
; In:    Param1 = pointer to 4 Singles (X, Y, Z, W); may be unaligned
; Out:   xmm0   = X, Y in the low two lanes
;        xmm1   = Z, W in the low two lanes
; Clobb: xmm0, xmm1
;----------------------------------------------------------------------------
_radians_vector4:
  movups    xmm0, [Param1]
  movaps    xmm1, [rel kSSE_PI_OVER_180]
  mulps     xmm0, xmm1
  movhlps   xmm1, xmm0               ; Z, W -> low half of xmm1
  ret

;----------------------------------------------------------------------------
; _degrees_vector2: multiplies two Singles by 180/pi.
; In:    Param1 = pointer to 2 Singles (X, Y); may be unaligned
; Out:   xmm0   = X, Y in the low two lanes
; Clobb: xmm0, xmm1
;----------------------------------------------------------------------------
_degrees_vector2:
  movq      xmm0, [Param1]           ; X, Y; zeroes the upper lanes so mulps
                                     ; never sees stale caller data
  movaps    xmm1, [rel kSSE_180_OVER_PI]
  mulps     xmm0, xmm1
  ret

;----------------------------------------------------------------------------
; _degrees_vector3: multiplies three Singles by 180/pi.
; In:    Param1 = pointer to 3 Singles (X, Y, Z); may be unaligned
; Out:   xmm0   = X, Y in the low two lanes
;        xmm1   = Z in the low lane
; Clobb: xmm0, xmm1, xmm2
;----------------------------------------------------------------------------
_degrees_vector3:
  movq      xmm0, [Param1]           ; X, Y; upper lanes zeroed
  movss     xmm1, [Param1+8]         ; Z
  movaps    xmm2, [rel kSSE_180_OVER_PI]
  mulps     xmm0, xmm2
  mulss     xmm1, xmm2
  ret

;----------------------------------------------------------------------------
; _degrees_vector4: multiplies four Singles by 180/pi.
; In:    Param1 = pointer to 4 Singles (X, Y, Z, W); may be unaligned
; Out:   xmm0   = X, Y in the low two lanes
;        xmm1   = Z, W in the low two lanes
; Clobb: xmm0, xmm1
;----------------------------------------------------------------------------
_degrees_vector4:
  movups    xmm0, [Param1]
  movaps    xmm1, [rel kSSE_180_OVER_PI]
  mulps     xmm0, xmm1
  movhlps   xmm1, xmm0               ; Z, W -> low half of xmm1
  ret

;****************************************************************************
; Exponential Functions
;****************************************************************************

;----------------------------------------------------------------------------
; _sqrt_single: square root of one Single.
; In:    xmm0 = value (low lane)
; Out:   xmm0 = sqrt(value) (low lane)
; Clobb: xmm0
;----------------------------------------------------------------------------
_sqrt_single:
  sqrtss    xmm0, xmm0
  ret

;----------------------------------------------------------------------------
; _sqrt_vector2: square root of two Singles.
; In:    Param1 = pointer to 2 Singles (X, Y); may be unaligned
; Out:   xmm0   = sqrt(X), sqrt(Y) in the low two lanes
; Clobb: xmm0
;----------------------------------------------------------------------------
_sqrt_vector2:
  movq      xmm0, [Param1]           ; X, Y; upper lanes zeroed so the packed
                                     ; sqrt never sees stale caller data
  sqrtps    xmm0, xmm0
  ret

;----------------------------------------------------------------------------
; _sqrt_vector3: square root of three Singles.
; In:    Param1 = pointer to 3 Singles (X, Y, Z); may be unaligned
; Out:   xmm0   = sqrt(X), sqrt(Y) in the low two lanes
;        xmm1   = sqrt(Z) in the low lane
; Clobb: xmm0, xmm1
;----------------------------------------------------------------------------
_sqrt_vector3:
  movq      xmm0, [Param1]           ; X, Y; upper lanes zeroed
  movss     xmm1, [Param1+8]         ; Z
  sqrtps    xmm0, xmm0
  sqrtss    xmm1, xmm1               ; only the low lane holds data
  ret

;----------------------------------------------------------------------------
; _sqrt_vector4: square root of four Singles.
; In:    Param1 = pointer to 4 Singles (X, Y, Z, W); may be unaligned
; Out:   xmm0   = results for X, Y in the low two lanes
;        xmm1   = results for Z, W in the low two lanes
; Clobb: xmm0, xmm1
;----------------------------------------------------------------------------
_sqrt_vector4:
  movups    xmm0, [Param1]
  sqrtps    xmm0, xmm0
  movhlps   xmm1, xmm0               ; Z, W -> low half of xmm1
  ret

;----------------------------------------------------------------------------
; _inverse_sqrt_single: approximate reciprocal square root of one Single
; (hardware rsqrtss estimate, no refinement step).
; In:    xmm0 = value (low lane)
; Out:   xmm0 = approx. 1/sqrt(value) (low lane)
; Clobb: xmm0
;----------------------------------------------------------------------------
_inverse_sqrt_single:
  rsqrtss   xmm0, xmm0
  ret

;----------------------------------------------------------------------------
; _inverse_sqrt_vector2: approximate reciprocal square root of two Singles
; (hardware rsqrtps estimate, no refinement step).
; In:    Param1 = pointer to 2 Singles (X, Y); may be unaligned
; Out:   xmm0   = results for X, Y in the low two lanes
; Clobb: xmm0
;----------------------------------------------------------------------------
_inverse_sqrt_vector2:
  movq      xmm0, [Param1]           ; X, Y; upper lanes zeroed instead of
                                     ; inheriting stale caller data
  rsqrtps   xmm0, xmm0
  ret

;----------------------------------------------------------------------------
; _inverse_sqrt_vector3: approximate reciprocal square root of three Singles
; (hardware rsqrtps estimate, no refinement step).
; In:    Param1 = pointer to 3 Singles (X, Y, Z); may be unaligned
; Out:   xmm0   = results for X, Y in the low two lanes
;        xmm1   = result for Z in the low lane
; Clobb: xmm0, xmm1
;----------------------------------------------------------------------------
_inverse_sqrt_vector3:
  movq      xmm0, [Param1]           ; X, Y; upper lanes zeroed
  movss     xmm1, [Param1+8]         ; Z; movss zeroes the upper lanes
  rsqrtps   xmm0, xmm0
  rsqrtps   xmm1, xmm1               ; packed form kept as in the original;
                                     ; only the low lane is meaningful
  ret

;----------------------------------------------------------------------------
; _inverse_sqrt_vector4: approximate reciprocal square root of four Singles
; (hardware rsqrtps estimate, no refinement step).
; In:    Param1 = pointer to 4 Singles (X, Y, Z, W); may be unaligned
; Out:   xmm0   = results for X, Y in the low two lanes
;        xmm1   = results for Z, W in the low two lanes
; Clobb: xmm0, xmm1
;----------------------------------------------------------------------------
_inverse_sqrt_vector4:
  movups    xmm0, [Param1]
  rsqrtps   xmm0, xmm0
  movhlps   xmm1, xmm0               ; Z, W -> low half of xmm1
  ret

;****************************************************************************
; Fast approximate Functions
;****************************************************************************

;----------------------------------------------------------------------------
; _fast_sin_single: approximate sine of one Single.
; Works on Abs(ARadians), derives an even octant index J from X * FOPI,
; subtracts J * Pi/4, evaluates both the COSCOF and the SINCOF polynomial
; and selects one of them per (J and 2). The sign is the input sign xor'ed
; with a swap bit derived from (J and 4).
; In:    xmm0 = ARadians (low lane)
; Out:   xmm0 = approx. sin(ARadians) (low lane)
; Clobb: xmm0-xmm7
; All arithmetic is scalar; every constant is loaded with movss, which
; zeroes the upper lanes.
;----------------------------------------------------------------------------
_fast_sin_single:
  movss     xmm2, [rel kSSE_MASK_ABS_VAL]
  movaps    xmm1, xmm0
  movss     xmm3, [rel kSSE_MASK_SIGN]
  andps     xmm0, xmm2               ; (xmm0) X := Abs(ARadians)
  andps     xmm1, xmm3               ; (xmm1) SignBit
  movaps    xmm2, xmm0
  movss     xmm4, [rel kSSE_FOPI]
  movss     xmm5, [rel kSSE_INT_ONE]
  mulss     xmm2, xmm4
  movss     xmm6, [rel kSSE_INT_NOT_ONE]
  ; NOTE(review): cvtps2dq converts using the current MXCSR rounding mode
  ; (round-to-nearest unless changed), not truncation; cvttps2dq would
  ; truncate. Left unchanged - confirm the intended behavior.
  cvtps2dq  xmm2, xmm2               ; J := Trunc(X * FOPI)
  movss     xmm7, [rel kSSE_INT_FOUR]
  paddd     xmm2, xmm5
  pand      xmm2, xmm6               ; (xmm2) J := (J + 1) and (not 1)
  movss     xmm6, [rel kSSE_INT_TWO]
  cvtdq2ps  xmm4, xmm2               ; (xmm4) Y := J
  movaps    xmm5, xmm2
  pand      xmm2, xmm6               ; J and 2
  pand      xmm5, xmm7               ; J and 4
  pxor      xmm7, xmm7
  pslld     xmm5, 29                 ; (xmm5) SwapSignBit := (J and 4) shl 29
  pcmpeqd   xmm2, xmm7               ; (xmm2) PolyMask := ((J and 2) = 0)? Yes: 0xFFFFFFFF, No: 0x00000000
  movss     xmm6, [rel kSSE_PI_OVER_4]
  pxor      xmm1, xmm5               ; (xmm1) SignBit := SignBit xor SwapSignBit
  mulss     xmm4, xmm6               ; Y * Pi / 4
  movss     xmm3, [rel kSSE_COSCOF_P0]
  subss     xmm0, xmm4               ; (xmm0) X := X - (Y * Pi / 4)
  movss     xmm4, [rel kSSE_COSCOF_P1]
  movaps    xmm7, xmm0
  movss     xmm6, [rel kSSE_COSCOF_P2]
  mulss     xmm7, xmm7               ; (xmm7) Z := X * X
  movss     xmm5, [rel kSSE_SINCOF_P1]
  mulss     xmm3, xmm7               ; COSCOF_P0 * Z
  addss     xmm3, xmm4               ; Y := COSCOF_P0 * Z + COSCOF_P1
  movss     xmm4, [rel kSSE_ONE_HALF]
  mulss     xmm3, xmm7               ; Y * Z
  mulss     xmm4, xmm7               ; Z * 0.5
  addss     xmm3, xmm6               ; Y := (Y * Z) + COSCOF_P2
  movss     xmm6, [rel kSSE_ONE]
  mulss     xmm3, xmm7               ; Y * Z
  mulss     xmm3, xmm7               ; Y := Y * (Z * Z)
  subss     xmm3, xmm4               ; Y - Z * 0.5
  movss     xmm4, [rel kSSE_SINCOF_P0]
  addss     xmm3, xmm6               ; (xmm3) Y := Y - Z * 0.5 + 1
  movss     xmm6, [rel kSSE_SINCOF_P2]
  mulss     xmm4, xmm7               ; SINCOF_P0 * Z
  addss     xmm4, xmm5               ; Y2 := SINCOF_P0 * Z + SINCOF_P1
  movaps    xmm5, xmm2
  mulss     xmm4, xmm7               ; Y2 * Z
  addss     xmm4, xmm6               ; Y2 := (Y2 * Z) + SINCOF_P2
  mulss     xmm4, xmm7               ; Y2 * Z
  mulss     xmm4, xmm0               ; Y2 * (Z * X)
  addss     xmm4, xmm0               ; (xmm4) Y2 := Y2 * (Z * X) + X
  andps     xmm4, xmm2               ; Y2 := ((J and 2) = 0)? Yes: Y2, No: 0
  andnps    xmm5, xmm3               ; Y  := ((J and 2) = 0)? Yes: 0 , No: Y
  addss     xmm4, xmm5
  xorps     xmm4, xmm1               ; (Y + Y2) xor SignBit
  movss     xmm0, xmm4
  ret

;----------------------------------------------------------------------------
; _fast_sin_vector2: approximate sine of two Singles.
; Same algorithm as _fast_sin_single, applied per lane.
; In:    Param1 = pointer to 2 Singles (X, Y); may be unaligned
; Out:   xmm0   = results for X, Y in the low two lanes
; Clobb: xmm0-xmm7
; The input is loaded with movq (upper lanes zeroed) and the constants with
; movaps (all four lanes), so the two unused lanes compute on zeros rather
; than on whatever the caller left in the registers.
;----------------------------------------------------------------------------
_fast_sin_vector2:
  movq      xmm0, [Param1]
  movaps    xmm2, [rel kSSE_MASK_ABS_VAL]
  movaps    xmm1, xmm0
  movaps    xmm3, [rel kSSE_MASK_SIGN]
  andps     xmm0, xmm2               ; (xmm0) X := Abs(ARadians)
  andps     xmm1, xmm3               ; (xmm1) SignBit
  movaps    xmm2, xmm0
  movaps    xmm4, [rel kSSE_FOPI]
  movaps    xmm5, [rel kSSE_INT_ONE]
  mulps     xmm2, xmm4
  movaps    xmm6, [rel kSSE_INT_NOT_ONE]
  ; NOTE(review): cvtps2dq converts using the current MXCSR rounding mode
  ; (round-to-nearest unless changed), not truncation; cvttps2dq would
  ; truncate. Left unchanged - confirm the intended behavior.
  cvtps2dq  xmm2, xmm2               ; J := Trunc(X * FOPI)
  movaps    xmm7, [rel kSSE_INT_FOUR]
  paddd     xmm2, xmm5
  pand      xmm2, xmm6               ; (xmm2) J := (J + 1) and (not 1)
  movaps    xmm6, [rel kSSE_INT_TWO]
  cvtdq2ps  xmm4, xmm2               ; (xmm4) Y := J
  movaps    xmm5, xmm2
  pand      xmm2, xmm6               ; J and 2
  pand      xmm5, xmm7               ; J and 4
  pxor      xmm7, xmm7
  pslld     xmm5, 29                 ; (xmm5) SwapSignBit := (J and 4) shl 29
  pcmpeqd   xmm2, xmm7               ; (xmm2) PolyMask := ((J and 2) = 0)? Yes: 0xFFFFFFFF, No: 0x00000000
  movaps    xmm6, [rel kSSE_PI_OVER_4]
  pxor      xmm1, xmm5               ; (xmm1) SignBit := SignBit xor SwapSignBit
  mulps     xmm4, xmm6               ; Y * Pi / 4
  movaps    xmm3, [rel kSSE_COSCOF_P0]
  subps     xmm0, xmm4               ; (xmm0) X := X - (Y * Pi / 4)
  movaps    xmm4, [rel kSSE_COSCOF_P1]
  movaps    xmm7, xmm0
  movaps    xmm6, [rel kSSE_COSCOF_P2]
  mulps     xmm7, xmm7               ; (xmm7) Z := X * X
  movaps    xmm5, [rel kSSE_SINCOF_P1]
  mulps     xmm3, xmm7               ; COSCOF_P0 * Z
  addps     xmm3, xmm4               ; Y := COSCOF_P0 * Z + COSCOF_P1
  movaps    xmm4, [rel kSSE_ONE_HALF]
  mulps     xmm3, xmm7               ; Y * Z
  mulps     xmm4, xmm7               ; Z * 0.5
  addps     xmm3, xmm6               ; Y := (Y * Z) + COSCOF_P2
  movaps    xmm6, [rel kSSE_ONE]
  mulps     xmm3, xmm7               ; Y * Z
  mulps     xmm3, xmm7               ; Y := Y * (Z * Z)
  subps     xmm3, xmm4               ; Y - Z * 0.5
  movaps    xmm4, [rel kSSE_SINCOF_P0]
  addps     xmm3, xmm6               ; (xmm3) Y := Y - Z * 0.5 + 1
  movaps    xmm6, [rel kSSE_SINCOF_P2]
  mulps     xmm4, xmm7               ; SINCOF_P0 * Z
  addps     xmm4, xmm5               ; Y2 := SINCOF_P0 * Z + SINCOF_P1
  movaps    xmm5, xmm2
  mulps     xmm4, xmm7               ; Y2 * Z
  addps     xmm4, xmm6               ; Y2 := (Y2 * Z) + SINCOF_P2
  mulps     xmm4, xmm7               ; Y2 * Z
  mulps     xmm4, xmm0               ; Y2 * (Z * X)
  addps     xmm4, xmm0               ; (xmm4) Y2 := Y2 * (Z * X) + X
  andps     xmm4, xmm2               ; Y2 := ((J and 2) = 0)? Yes: Y2, No: 0
  andnps    xmm5, xmm3               ; Y  := ((J and 2) = 0)? Yes: 0 , No: Y
  addps     xmm4, xmm5
  xorps     xmm4, xmm1               ; (Y + Y2) xor SignBit
  movaps    xmm0, xmm4
  ret

;----------------------------------------------------------------------------
; _fast_sin_vector3: approximate sine of three Singles.
; Same algorithm as _fast_sin_single, applied per lane.
; In:    Param1 = pointer to 3 Singles (X, Y, Z); may be unaligned
; Out:   xmm0   = results for X, Y in the low two lanes
;        xmm1   = result for Z in the low lane
; Clobb: xmm0-xmm7
;----------------------------------------------------------------------------
_fast_sin_vector3:
  movq      xmm0, [Param1]           ; X, Y, 0, 0
  movss     xmm1, [Param1+8]         ; Z, 0, 0, 0
  movlhps   xmm0, xmm1               ; X, Y, Z, 0
  movaps    xmm2, [rel kSSE_MASK_ABS_VAL]
  movaps    xmm1, xmm0
  movaps    xmm3, [rel kSSE_MASK_SIGN]
  andps     xmm0, xmm2               ; (xmm0) X := Abs(ARadians)
  andps     xmm1, xmm3               ; (xmm1) SignBit
  movaps    xmm2, xmm0
  movaps    xmm4, [rel kSSE_FOPI]
  movaps    xmm5, [rel kSSE_INT_ONE]
  mulps     xmm2, xmm4
  movaps    xmm6, [rel kSSE_INT_NOT_ONE]
  ; NOTE(review): cvtps2dq converts using the current MXCSR rounding mode
  ; (round-to-nearest unless changed), not truncation; cvttps2dq would
  ; truncate. Left unchanged - confirm the intended behavior.
  cvtps2dq  xmm2, xmm2               ; J := Trunc(X * FOPI)
  movaps    xmm7, [rel kSSE_INT_FOUR]
  paddd     xmm2, xmm5
  pand      xmm2, xmm6               ; (xmm2) J := (J + 1) and (not 1)
  movaps    xmm6, [rel kSSE_INT_TWO]
  cvtdq2ps  xmm4, xmm2               ; (xmm4) Y := J
  movaps    xmm5, xmm2
  pand      xmm2, xmm6               ; J and 2
  pand      xmm5, xmm7               ; J and 4
  pxor      xmm7, xmm7
  pslld     xmm5, 29                 ; (xmm5) SwapSignBit := (J and 4) shl 29
  pcmpeqd   xmm2, xmm7               ; (xmm2) PolyMask := ((J and 2) = 0)? Yes: 0xFFFFFFFF, No: 0x00000000
  movaps    xmm6, [rel kSSE_PI_OVER_4]
  pxor      xmm1, xmm5               ; (xmm1) SignBit := SignBit xor SwapSignBit
  mulps     xmm4, xmm6               ; Y * Pi / 4
  movaps    xmm3, [rel kSSE_COSCOF_P0]
  subps     xmm0, xmm4               ; (xmm0) X := X - (Y * Pi / 4)
  movaps    xmm4, [rel kSSE_COSCOF_P1]
  movaps    xmm7, xmm0
  movaps    xmm6, [rel kSSE_COSCOF_P2]
  mulps     xmm7, xmm7               ; (xmm7) Z := X * X
  movaps    xmm5, [rel kSSE_SINCOF_P1]
  mulps     xmm3, xmm7               ; COSCOF_P0 * Z
  addps     xmm3, xmm4               ; Y := COSCOF_P0 * Z + COSCOF_P1
  movaps    xmm4, [rel kSSE_ONE_HALF]
  mulps     xmm3, xmm7               ; Y * Z
  mulps     xmm4, xmm7               ; Z * 0.5
  addps     xmm3, xmm6               ; Y := (Y * Z) + COSCOF_P2
  movaps    xmm6, [rel kSSE_ONE]
  mulps     xmm3, xmm7               ; Y * Z
  mulps     xmm3, xmm7               ; Y := Y * (Z * Z)
  subps     xmm3, xmm4               ; Y - Z * 0.5
  movaps    xmm4, [rel kSSE_SINCOF_P0]
  addps     xmm3, xmm6               ; (xmm3) Y := Y - Z * 0.5 + 1
  movaps    xmm6, [rel kSSE_SINCOF_P2]
  mulps     xmm4, xmm7               ; SINCOF_P0 * Z
  addps     xmm4, xmm5               ; Y2 := SINCOF_P0 * Z + SINCOF_P1
  movaps    xmm5, xmm2
  mulps     xmm4, xmm7               ; Y2 * Z
  addps     xmm4, xmm6               ; Y2 := (Y2 * Z) + SINCOF_P2
  mulps     xmm4, xmm7               ; Y2 * Z
  mulps     xmm4, xmm0               ; Y2 * (Z * X)
  addps     xmm4, xmm0               ; (xmm4) Y2 := Y2 * (Z * X) + X
  andps     xmm4, xmm2               ; Y2 := ((J and 2) = 0)? Yes: Y2, No: 0
  andnps    xmm5, xmm3               ; Y  := ((J and 2) = 0)? Yes: 0 , No: Y
  addps     xmm4, xmm5
  xorps     xmm4, xmm1               ; (Y + Y2) xor SignBit
  movaps    xmm0, xmm4
  movhlps   xmm1, xmm4               ; Z -> low lane of xmm1
  ret

;----------------------------------------------------------------------------
; _fast_sin_vector4: approximate sine of four Singles.
; Same algorithm as _fast_sin_single, applied per lane.
; In:    Param1 = pointer to 4 Singles (X, Y, Z, W); may be unaligned
; Out:   xmm0   = results for X, Y in the low two lanes
;        xmm1   = results for Z, W in the low two lanes
; Clobb: xmm0-xmm7
;----------------------------------------------------------------------------
_fast_sin_vector4:
  movups    xmm0, [Param1]
  movaps    xmm2, [rel kSSE_MASK_ABS_VAL]
  movaps    xmm1, xmm0
  movaps    xmm3, [rel kSSE_MASK_SIGN]
  andps     xmm0, xmm2               ; (xmm0) X := Abs(ARadians)
  andps     xmm1, xmm3               ; (xmm1) SignBit
  movaps    xmm2, xmm0
  movaps    xmm4, [rel kSSE_FOPI]
  movaps    xmm5, [rel kSSE_INT_ONE]
  mulps     xmm2, xmm4
  movaps    xmm6, [rel kSSE_INT_NOT_ONE]
  ; NOTE(review): cvtps2dq converts using the current MXCSR rounding mode
  ; (round-to-nearest unless changed), not truncation; cvttps2dq would
  ; truncate. Left unchanged - confirm the intended behavior.
  cvtps2dq  xmm2, xmm2               ; J := Trunc(X * FOPI)
  movaps    xmm7, [rel kSSE_INT_FOUR]
  paddd     xmm2, xmm5
  pand      xmm2, xmm6               ; (xmm2) J := (J + 1) and (not 1)
  movaps    xmm6, [rel kSSE_INT_TWO]
  cvtdq2ps  xmm4, xmm2               ; (xmm4) Y := J
  movaps    xmm5, xmm2
  pand      xmm2, xmm6               ; J and 2
  pand      xmm5, xmm7               ; J and 4
  pxor      xmm7, xmm7
  pslld     xmm5, 29                 ; (xmm5) SwapSignBit := (J and 4) shl 29
  pcmpeqd   xmm2, xmm7               ; (xmm2) PolyMask := ((J and 2) = 0)? Yes: 0xFFFFFFFF, No: 0x00000000
  movaps    xmm6, [rel kSSE_PI_OVER_4]
  pxor      xmm1, xmm5               ; (xmm1) SignBit := SignBit xor SwapSignBit
  mulps     xmm4, xmm6               ; Y * Pi / 4
  movaps    xmm3, [rel kSSE_COSCOF_P0]
  subps     xmm0, xmm4               ; (xmm0) X := X - (Y * Pi / 4)
  movaps    xmm4, [rel kSSE_COSCOF_P1]
  movaps    xmm7, xmm0
  movaps    xmm6, [rel kSSE_COSCOF_P2]
  mulps     xmm7, xmm7               ; (xmm7) Z := X * X
  movaps    xmm5, [rel kSSE_SINCOF_P1]
  mulps     xmm3, xmm7               ; COSCOF_P0 * Z
  addps     xmm3, xmm4               ; Y := COSCOF_P0 * Z + COSCOF_P1
  movaps    xmm4, [rel kSSE_ONE_HALF]
  mulps     xmm3, xmm7               ; Y * Z
  mulps     xmm4, xmm7               ; Z * 0.5
  addps     xmm3, xmm6               ; Y := (Y * Z) + COSCOF_P2
  movaps    xmm6, [rel kSSE_ONE]
  mulps     xmm3, xmm7               ; Y * Z
  mulps     xmm3, xmm7               ; Y := Y * (Z * Z)
  subps     xmm3, xmm4               ; Y - Z * 0.5
  movaps    xmm4, [rel kSSE_SINCOF_P0]
  addps     xmm3, xmm6               ; (xmm3) Y := Y - Z * 0.5 + 1
  movaps    xmm6, [rel kSSE_SINCOF_P2]
  mulps     xmm4, xmm7               ; SINCOF_P0 * Z
  addps     xmm4, xmm5               ; Y2 := SINCOF_P0 * Z + SINCOF_P1
  movaps    xmm5, xmm2
  mulps     xmm4, xmm7               ; Y2 * Z
  addps     xmm4, xmm6               ; Y2 := (Y2 * Z) + SINCOF_P2
  mulps     xmm4, xmm7               ; Y2 * Z
  mulps     xmm4, xmm0               ; Y2 * (Z * X)
  addps     xmm4, xmm0               ; (xmm4) Y2 := Y2 * (Z * X) + X
  andps     xmm4, xmm2               ; Y2 := ((J and 2) = 0)? Yes: Y2, No: 0
  andnps    xmm5, xmm3               ; Y  := ((J and 2) = 0)? Yes: 0 , No: Y
  addps     xmm4, xmm5
  xorps     xmm4, xmm1               ; (Y + Y2) xor SignBit
  movaps    xmm0, xmm4
  movhlps   xmm1, xmm4               ; Z, W -> low half of xmm1
  ret

;----------------------------------------------------------------------------
; _fast_cos_single: approximate cosine of one Single.
; Works on Abs(ARadians), derives an even octant index J from X * FOPI,
; subtracts J * Pi/4, evaluates both the COSCOF and the SINCOF polynomial
; and selects one of them per ((J - 2) and 2). The sign bit comes from
; ((not (J - 2)) and 4).
; In:    xmm0 = ARadians (low lane)
; Out:   xmm0 = approx. cos(ARadians) (low lane)
; Clobb: xmm0-xmm7
; All arithmetic is scalar; every constant is loaded with movss, which
; zeroes the upper lanes.
;----------------------------------------------------------------------------
_fast_cos_single:
  movss     xmm1, [rel kSSE_MASK_ABS_VAL]
  movss     xmm2, [rel kSSE_FOPI]
  andps     xmm0, xmm1               ; (xmm0) X := Abs(ARadians)
  movss     xmm3, [rel kSSE_INT_NOT_ONE]
  movaps    xmm1, xmm0
  movss     xmm4, [rel kSSE_INT_FOUR]
  mulss     xmm1, xmm2
  movss     xmm2, [rel kSSE_INT_ONE]
  ; NOTE(review): cvtps2dq converts using the current MXCSR rounding mode
  ; (round-to-nearest unless changed), not truncation; cvttps2dq would
  ; truncate. Left unchanged - confirm the intended behavior.
  cvtps2dq  xmm1, xmm1               ; J := Trunc(X * FOPI)
  pxor      xmm6, xmm6
  paddd     xmm1, xmm2
  pand      xmm1, xmm3               ; (xmm1) J := (J + 1) and (not 1)
  movss     xmm3, [rel kSSE_INT_TWO]
  cvtdq2ps  xmm2, xmm1               ; (xmm2) Y := J
  psubd     xmm1, xmm3               ; J - 2
  movaps    xmm5, xmm1
  pandn     xmm1, xmm4               ; (not (J - 2)) and 4
  pand      xmm5, xmm3               ; (J - 2) and 2
  pslld     xmm1, 29                 ; (xmm1) SignBit := ((not (J - 2)) and 4) shl 29
  movss     xmm3, [rel kSSE_PI_OVER_4]
  pcmpeqd   xmm5, xmm6               ; (xmm5) PolyMask := ((J-2) and 2)=0)? Yes: 0xFFFFFFFF, No: 0x00000000
  mulss     xmm2, xmm3               ; Y * Pi / 4
  movss     xmm3, [rel kSSE_COSCOF_P1]
  subss     xmm0, xmm2               ; (xmm0) X := X - (Y * Pi / 4)
  movss     xmm2, [rel kSSE_COSCOF_P0]
  movss     xmm4, [rel kSSE_COSCOF_P2]
  movaps    xmm6, xmm0
  mulss     xmm6, xmm6               ; (xmm6) Z := X * X
  mulss     xmm2, xmm6               ; COSCOF_P0 * Z
  addss     xmm2, xmm3               ; Y := COSCOF_P0 * Z + COSCOF_P1
  movss     xmm3, [rel kSSE_ONE_HALF]
  mulss     xmm2, xmm6               ; Y * Z
  mulss     xmm3, xmm6               ; Z * 0.5
  addss     xmm2, xmm4               ; Y := (Y * Z) + COSCOF_P2
  movss     xmm7, [rel kSSE_ONE]
  mulss     xmm2, xmm6
  movss     xmm4, [rel kSSE_SINCOF_P1]
  mulss     xmm2, xmm6               ; Y := Y * (Z * Z)
  subss     xmm2, xmm3               ; Y - Z * 0.5
  addss     xmm2, xmm7               ; (xmm2) Y := Y - Z * 0.5 + 1
  movss     xmm3, [rel kSSE_SINCOF_P0]
  movss     xmm7, [rel kSSE_SINCOF_P2]
  mulss     xmm3, xmm6               ; SINCOF_P0 * Z
  addss     xmm3, xmm4               ; Y2 := SINCOF_P0 * Z + SINCOF_P1
  mulss     xmm3, xmm6               ; Y2 * Z
  addss     xmm3, xmm7               ; Y2 := (Y2 * Z) + SINCOF_P2
  mulss     xmm3, xmm6               ; Y2 * Z
  mulss     xmm3, xmm0               ; Y2 * (Z * X)
  addss     xmm3, xmm0               ; Y2 := Y2 * (Z * X) + X
  andps     xmm3, xmm5               ; ((J-2) and 2) = 0)? Yes: Y2, No: 0
  andnps    xmm5, xmm2               ; ((J-2) and 2) = 0)? Yes: 0 , No: Y
  addss     xmm3, xmm5
  xorps     xmm3, xmm1               ; (Y + Y2) xor SignBit
  movss     xmm0, xmm3
  ret

;----------------------------------------------------------------------------
; _fast_cos_vector2: approximate cosine of two Singles.
; Same algorithm as _fast_cos_single, applied per lane.
; In:    Param1 = pointer to 2 Singles (X, Y); may be unaligned
; Out:   xmm0   = results for X, Y in the low two lanes
; Clobb: xmm0-xmm7
; The input is loaded with movq (upper lanes zeroed) and the constants with
; movaps (all four lanes), so the two unused lanes compute on zeros rather
; than on whatever the caller left in the registers.
;----------------------------------------------------------------------------
_fast_cos_vector2:
  movq      xmm0, [Param1]
  movaps    xmm1, [rel kSSE_MASK_ABS_VAL]
  movaps    xmm2, [rel kSSE_FOPI]
  andps     xmm0, xmm1               ; (xmm0) X := Abs(ARadians)
  movaps    xmm3, [rel kSSE_INT_NOT_ONE]
  movaps    xmm1, xmm0
  movaps    xmm4, [rel kSSE_INT_FOUR]
  mulps     xmm1, xmm2
  movaps    xmm2, [rel kSSE_INT_ONE]
  ; NOTE(review): cvtps2dq converts using the current MXCSR rounding mode
  ; (round-to-nearest unless changed), not truncation; cvttps2dq would
  ; truncate. Left unchanged - confirm the intended behavior.
  cvtps2dq  xmm1, xmm1               ; J := Trunc(X * FOPI)
  pxor      xmm6, xmm6
  paddd     xmm1, xmm2
  pand      xmm1, xmm3               ; (xmm1) J := (J + 1) and (not 1)
  movaps    xmm3, [rel kSSE_INT_TWO]
  cvtdq2ps  xmm2, xmm1               ; (xmm2) Y := J
  psubd     xmm1, xmm3               ; J - 2
  movaps    xmm5, xmm1
  pandn     xmm1, xmm4               ; (not (J - 2)) and 4
  pand      xmm5, xmm3               ; (J - 2) and 2
  pslld     xmm1, 29                 ; (xmm1) SignBit := ((not (J - 2)) and 4) shl 29
  movaps    xmm3, [rel kSSE_PI_OVER_4]
  pcmpeqd   xmm5, xmm6               ; (xmm5) PolyMask := ((J-2) and 2)=0)? Yes: 0xFFFFFFFF, No: 0x00000000
  mulps     xmm2, xmm3               ; Y * Pi / 4
  movaps    xmm3, [rel kSSE_COSCOF_P1]
  subps     xmm0, xmm2               ; (xmm0) X := X - (Y * Pi / 4)
  movaps    xmm2, [rel kSSE_COSCOF_P0]
  movaps    xmm4, [rel kSSE_COSCOF_P2]
  movaps    xmm6, xmm0
  mulps     xmm6, xmm6               ; (xmm6) Z := X * X
  mulps     xmm2, xmm6               ; COSCOF_P0 * Z
  addps     xmm2, xmm3               ; Y := COSCOF_P0 * Z + COSCOF_P1
  movaps    xmm3, [rel kSSE_ONE_HALF]
  mulps     xmm2, xmm6               ; Y * Z
  mulps     xmm3, xmm6               ; Z * 0.5
  addps     xmm2, xmm4               ; Y := (Y * Z) + COSCOF_P2
  movaps    xmm7, [rel kSSE_ONE]
  mulps     xmm2, xmm6
  movaps    xmm4, [rel kSSE_SINCOF_P1]
  mulps     xmm2, xmm6               ; Y := Y * (Z * Z)
  subps     xmm2, xmm3               ; Y - Z * 0.5
  addps     xmm2, xmm7               ; (xmm2) Y := Y - Z * 0.5 + 1
  movaps    xmm3, [rel kSSE_SINCOF_P0]
  movaps    xmm7, [rel kSSE_SINCOF_P2]
  mulps     xmm3, xmm6               ; SINCOF_P0 * Z
  addps     xmm3, xmm4               ; Y2 := SINCOF_P0 * Z + SINCOF_P1
  mulps     xmm3, xmm6               ; Y2 * Z
  addps     xmm3, xmm7               ; Y2 := (Y2 * Z) + SINCOF_P2
  mulps     xmm3, xmm6               ; Y2 * Z
  mulps     xmm3, xmm0               ; Y2 * (Z * X)
  addps     xmm3, xmm0               ; Y2 := Y2 * (Z * X) + X
  andps     xmm3, xmm5               ; ((J-2) and 2) = 0)? Yes: Y2, No: 0
  andnps    xmm5, xmm2               ; ((J-2) and 2) = 0)? Yes: 0 , No: Y
  addps     xmm3, xmm5
  xorps     xmm3, xmm1               ; (Y + Y2) xor SignBit
  movaps    xmm0, xmm3
  ret

;-----------------------------------------------------------------------
; _fast_cos_vector3
; Fast cosine approximation for three packed Single values.
;
; In:    [Param1] = 3 consecutive Singles (ARadians). Loaded with
;                   movq + movss, so only 12 bytes are read and no
;                   alignment is required.
; Out:   xmm0 = result lanes 0..3, xmm1 (low half) = result lanes 2..3
; Clobb: xmm0-xmm7
;
; Per lane: X := Abs(A); J := (Trunc(X * FOPI) + 1) and (not 1);
; X := X - J * Pi/4; then either the sine or the cosine polynomial is
; selected and the sign bit is applied.
;
; The octant index is computed with cvttps2dq (truncating conversion).
; cvtps2dq would follow the rounding mode in MXCSR; with
; round-to-nearest the reduced argument can reach 1.5 * Pi/4 instead of
; staying within Pi/4, which costs accuracy.
;-----------------------------------------------------------------------
_fast_cos_vector3:
  movq      xmm0, [Param1]           ; lanes 0..1 := A[0..1], lanes 2..3 := 0
  movss     xmm1, [Param1+8]         ; lane 0 := A[2], other lanes := 0
  movlhps   xmm0, xmm1               ; (xmm0) [A0, A1, A2, 0]
  movaps    xmm1, [rel kSSE_MASK_ABS_VAL]
  movaps    xmm2, [rel kSSE_FOPI]
  andps     xmm0, xmm1               ; (xmm0) X := Abs(ARadians)
  movaps    xmm3, [rel kSSE_INT_NOT_ONE]
  movaps    xmm1, xmm0
  movaps    xmm4, [rel kSSE_INT_FOUR]
  mulps     xmm1, xmm2               ; X * FOPI
  movaps    xmm2, [rel kSSE_INT_ONE]
  cvttps2dq xmm1, xmm1               ; J := Trunc(X * FOPI), independent of MXCSR
  pxor      xmm6, xmm6               ; (xmm6) 0, compared against below
  paddd     xmm1, xmm2
  pand      xmm1, xmm3               ; (xmm1) J := (J + 1) and (not 1)
  movaps    xmm3, [rel kSSE_INT_TWO]
  cvtdq2ps  xmm2, xmm1               ; (xmm2) Y := J
  psubd     xmm1, xmm3               ; J - 2
  movaps    xmm5, xmm1
  pandn     xmm1, xmm4               ; (not (J - 2)) and 4
  pand      xmm5, xmm3               ; (J - 2) and 2
  pslld     xmm1, 29                 ; (xmm1) SignBit := ((not (J - 2)) and 4) shl 29
  movaps    xmm3, [rel kSSE_PI_OVER_4]
  pcmpeqd   xmm5, xmm6               ; (xmm5) PolyMask := (((J-2) and 2)=0)? Yes: 0xFFFFFFFF, No: 0x00000000
  mulps     xmm2, xmm3               ; Y * Pi / 4
  movaps    xmm3, [rel kSSE_COSCOF_P1]
  subps     xmm0, xmm2               ; (xmm0) X := X - (Y * Pi / 4)
  movaps    xmm2, [rel kSSE_COSCOF_P0]
  movaps    xmm4, [rel kSSE_COSCOF_P2]
  movaps    xmm6, xmm0
  mulps     xmm6, xmm6               ; (xmm6) Z := X * X

  ; Cosine polynomial: Y := ((P0*Z + P1)*Z + P2) * Z*Z - 0.5*Z + 1
  mulps     xmm2, xmm6               ; COSCOF_P0 * Z
  addps     xmm2, xmm3               ; Y := COSCOF_P0 * Z + COSCOF_P1
  movaps    xmm3, [rel kSSE_ONE_HALF]
  mulps     xmm2, xmm6               ; Y * Z
  mulps     xmm3, xmm6               ; Z * 0.5
  addps     xmm2, xmm4               ; Y := (Y * Z) + COSCOF_P2
  movaps    xmm7, [rel kSSE_ONE]
  mulps     xmm2, xmm6
  movaps    xmm4, [rel kSSE_SINCOF_P1]
  mulps     xmm2, xmm6               ; Y := Y * (Z * Z)
  subps     xmm2, xmm3               ; Y - Z * 0.5
  addps     xmm2, xmm7               ; (xmm2) Y := Y - Z * 0.5 + 1

  ; Sine polynomial: Y2 := ((P0*Z + P1)*Z + P2) * Z * X + X
  movaps    xmm3, [rel kSSE_SINCOF_P0]
  movaps    xmm7, [rel kSSE_SINCOF_P2]
  mulps     xmm3, xmm6               ; SINCOF_P0 * Z
  addps     xmm3, xmm4               ; Y2 := SINCOF_P0 * Z + SINCOF_P1
  mulps     xmm3, xmm6               ; Y2 * Z
  addps     xmm3, xmm7               ; Y2 := (Y2 * Z) + SINCOF_P2
  mulps     xmm3, xmm6               ; Y2 * Z
  mulps     xmm3, xmm0               ; Y2 * (Z * X)
  addps     xmm3, xmm0               ; Y2 := Y2 * (Z * X) + X

  ; Per-lane select between the two polynomials, then apply the sign
  andps     xmm3, xmm5               ; (((J-2) and 2) = 0)? Yes: Y2, No: 0
  andnps    xmm5, xmm2               ; (((J-2) and 2) = 0)? Yes: 0 , No: Y
  addps     xmm3, xmm5
  xorps     xmm3, xmm1               ; (Y + Y2) xor SignBit
  movaps    xmm0, xmm3               ; return lanes 0..1 (and 2..3) in xmm0
  movhlps   xmm1, xmm3               ; return lanes 2..3 in the low half of xmm1
  ret
803
    
804
;-----------------------------------------------------------------------
; _fast_cos_vector4
; Fast cosine approximation for four packed Single values.
;
; In:    [Param1] = 4 consecutive Singles (ARadians); loaded with
;                   movups, so no alignment is required.
; Out:   xmm0 = result lanes 0..3, xmm1 (low half) = result lanes 2..3
; Clobb: xmm0-xmm7
;
; Per lane: X := Abs(A); J := (Trunc(X * FOPI) + 1) and (not 1);
; X := X - J * Pi/4; then either the sine or the cosine polynomial is
; selected and the sign bit is applied.
;
; The octant index is computed with cvttps2dq (truncating conversion).
; cvtps2dq would follow the rounding mode in MXCSR; with
; round-to-nearest the reduced argument can reach 1.5 * Pi/4 instead of
; staying within Pi/4, which costs accuracy.
;-----------------------------------------------------------------------
_fast_cos_vector4:
  movups    xmm0, [Param1]
  movaps    xmm1, [rel kSSE_MASK_ABS_VAL]
  movaps    xmm2, [rel kSSE_FOPI]
  andps     xmm0, xmm1               ; (xmm0) X := Abs(ARadians)
  movaps    xmm3, [rel kSSE_INT_NOT_ONE]
  movaps    xmm1, xmm0
  movaps    xmm4, [rel kSSE_INT_FOUR]
  mulps     xmm1, xmm2               ; X * FOPI
  movaps    xmm2, [rel kSSE_INT_ONE]
  cvttps2dq xmm1, xmm1               ; J := Trunc(X * FOPI), independent of MXCSR
  pxor      xmm6, xmm6               ; (xmm6) 0, compared against below
  paddd     xmm1, xmm2
  pand      xmm1, xmm3               ; (xmm1) J := (J + 1) and (not 1)
  movaps    xmm3, [rel kSSE_INT_TWO]
  cvtdq2ps  xmm2, xmm1               ; (xmm2) Y := J
  psubd     xmm1, xmm3               ; J - 2
  movaps    xmm5, xmm1
  pandn     xmm1, xmm4               ; (not (J - 2)) and 4
  pand      xmm5, xmm3               ; (J - 2) and 2
  pslld     xmm1, 29                 ; (xmm1) SignBit := ((not (J - 2)) and 4) shl 29
  movaps    xmm3, [rel kSSE_PI_OVER_4]
  pcmpeqd   xmm5, xmm6               ; (xmm5) PolyMask := (((J-2) and 2)=0)? Yes: 0xFFFFFFFF, No: 0x00000000
  mulps     xmm2, xmm3               ; Y * Pi / 4
  movaps    xmm3, [rel kSSE_COSCOF_P1]
  subps     xmm0, xmm2               ; (xmm0) X := X - (Y * Pi / 4)
  movaps    xmm2, [rel kSSE_COSCOF_P0]
  movaps    xmm4, [rel kSSE_COSCOF_P2]
  movaps    xmm6, xmm0
  mulps     xmm6, xmm6               ; (xmm6) Z := X * X

  ; Cosine polynomial: Y := ((P0*Z + P1)*Z + P2) * Z*Z - 0.5*Z + 1
  mulps     xmm2, xmm6               ; COSCOF_P0 * Z
  addps     xmm2, xmm3               ; Y := COSCOF_P0 * Z + COSCOF_P1
  movaps    xmm3, [rel kSSE_ONE_HALF]
  mulps     xmm2, xmm6               ; Y * Z
  mulps     xmm3, xmm6               ; Z * 0.5
  addps     xmm2, xmm4               ; Y := (Y * Z) + COSCOF_P2
  movaps    xmm7, [rel kSSE_ONE]
  mulps     xmm2, xmm6
  movaps    xmm4, [rel kSSE_SINCOF_P1]
  mulps     xmm2, xmm6               ; Y := Y * (Z * Z)
  subps     xmm2, xmm3               ; Y - Z * 0.5
  addps     xmm2, xmm7               ; (xmm2) Y := Y - Z * 0.5 + 1

  ; Sine polynomial: Y2 := ((P0*Z + P1)*Z + P2) * Z * X + X
  movaps    xmm3, [rel kSSE_SINCOF_P0]
  movaps    xmm7, [rel kSSE_SINCOF_P2]
  mulps     xmm3, xmm6               ; SINCOF_P0 * Z
  addps     xmm3, xmm4               ; Y2 := SINCOF_P0 * Z + SINCOF_P1
  mulps     xmm3, xmm6               ; Y2 * Z
  addps     xmm3, xmm7               ; Y2 := (Y2 * Z) + SINCOF_P2
  mulps     xmm3, xmm6               ; Y2 * Z
  mulps     xmm3, xmm0               ; Y2 * (Z * X)
  addps     xmm3, xmm0               ; Y2 := Y2 * (Z * X) + X

  ; Per-lane select between the two polynomials, then apply the sign
  andps     xmm3, xmm5               ; (((J-2) and 2) = 0)? Yes: Y2, No: 0
  andnps    xmm5, xmm2               ; (((J-2) and 2) = 0)? Yes: 0 , No: Y
  addps     xmm3, xmm5
  xorps     xmm3, xmm1               ; (Y + Y2) xor SignBit
  movaps    xmm0, xmm3               ; return lanes 0..1 (and 2..3) in xmm0
  movhlps   xmm1, xmm3               ; return lanes 2..3 in the low half of xmm1
  ret
862
  
863
;-----------------------------------------------------------------------
; _fast_sin_cos_single
; Fast combined sine/cosine approximation for one Single value.
;
; In:    xmm0 (lane 0) = ARadians
;        Param1        = address that receives the sine   (4 bytes)
;        Param2        = address that receives the cosine (4 bytes)
; Out:   [Param1] := Sin, [Param2] := Cos
; Clobb: xmm0-xmm7
;
; X := Abs(A); J := (Trunc(X * FOPI) + 1) and (not 1); X := X - J * Pi/4.
; Both polynomials are evaluated; PolyMask decides which one is the sine
; and which one is the cosine, and each gets its own sign bit.
;
; The octant index is computed with cvttps2dq (truncating conversion).
; cvtps2dq would follow the rounding mode in MXCSR; with
; round-to-nearest the reduced argument can reach 1.5 * Pi/4 instead of
; staying within Pi/4, which costs accuracy.
;-----------------------------------------------------------------------
_fast_sin_cos_single:
  movss     xmm2, [rel kSSE_MASK_SIGN]
  movss     xmm3, [rel kSSE_MASK_ABS_VAL]
  movaps    xmm1, xmm0
  pand      xmm0, xmm3               ; (xmm0) X := Abs(ARadians)
  pand      xmm1, xmm2               ; (xmm1) SignBitSin
  movaps    xmm4, xmm0
  movss     xmm5, [rel kSSE_FOPI]
  movss     xmm6, [rel kSSE_INT_ONE]
  mulss     xmm4, xmm5               ; X * FOPI
  movss     xmm7, [rel kSSE_INT_NOT_ONE]
  cvttps2dq xmm4, xmm4               ; (xmm4) J := Trunc(X * FOPI), independent of MXCSR
  movss     xmm5, [rel kSSE_INT_FOUR]
  paddd     xmm4, xmm6
  pand      xmm4, xmm7               ; (xmm4) J := (J + 1) and (not 1)
  movss     xmm7, [rel kSSE_INT_TWO]
  cvtdq2ps  xmm2, xmm4               ; (xmm2) Y := J
  movaps    xmm3, xmm4
  movaps    xmm6, xmm4               ; (xmm6) J
  pand      xmm3, xmm5               ; J and 4
  pand      xmm4, xmm7               ; J and 2
  pxor      xmm5, xmm5
  pslld     xmm3, 29                 ; (xmm3) SwapSignBitSin := (J and 4) shl 29
  movss     xmm7, [rel kSSE_PI_OVER_4]
  pcmpeqd   xmm4, xmm5               ; (xmm4) PolyMask := ((J and 2) = 0)? Yes: 0xFFFFFFFF, No: 0x00000000
  mulss     xmm2, xmm7               ; Y * Pi / 4
  movss     xmm5, [rel kSSE_INT_TWO]
  subss     xmm0, xmm2               ; (xmm0) X := X - (Y * Pi / 4)
  psubd     xmm6, xmm5               ; J - 2
  movss     xmm7, [rel kSSE_INT_FOUR]
  pxor      xmm1, xmm3               ; (xmm1) SignBitSin := SignBitSin xor SwapSignBitSin
  andnps    xmm6, xmm7               ; (not (J - 2)) and 4
  movaps    xmm3, xmm0
  pslld     xmm6, 29                 ; (xmm6) SignBitCos := ((not (J - 2)) and 4) shl 29
  mulss     xmm3, xmm3               ; (xmm3) Z := X * X

  ; Cosine polynomial: Y := ((P0*Z + P1)*Z + P2) * Z*Z - 0.5*Z + 1
  movss     xmm2, [rel kSSE_COSCOF_P0]
  movss     xmm5, [rel kSSE_COSCOF_P1]
  movss     xmm7, [rel kSSE_COSCOF_P2]
  mulss     xmm2, xmm3               ; COSCOF_P0 * Z
  addss     xmm2, xmm5               ; Y := COSCOF_P0 * Z + COSCOF_P1
  movss     xmm5, [rel kSSE_ONE_HALF]
  mulss     xmm2, xmm3               ; Y * Z
  addss     xmm2, xmm7               ; Y := (Y * Z) + COSCOF_P2
  movss     xmm7, [rel kSSE_ONE]
  mulss     xmm2, xmm3               ; Y * Z
  mulss     xmm5, xmm3               ; 0.5 * Z
  mulss     xmm2, xmm3               ; Y * (Z * Z)
  subss     xmm2, xmm5               ; Y - 0.5 * Z
  movss     xmm5, [rel kSSE_SINCOF_P0]
  addss     xmm2, xmm7               ; (xmm2) Y := Y - 0.5 * Z + 1

  ; Sine polynomial: Y2 := ((P0*Z + P1)*Z + P2) * Z * X + X
  movss     xmm7, [rel kSSE_SINCOF_P1]
  mulss     xmm5, xmm3               ; SINCOF_P0 * Z
  addss     xmm5, xmm7               ; Y2 := SINCOF_P0 * Z + SINCOF_P1
  mulss     xmm5, xmm3               ; Y2 * Z
  movss     xmm7, [rel kSSE_SINCOF_P2]
  addss     xmm5, xmm7               ; Y2 := Y2 * Z + SINCOF_P2
  mulss     xmm5, xmm3               ; Y2 * Z
  mulss     xmm5, xmm0               ; Y2 * (Z * X)
  addss     xmm5, xmm0               ; (xmm5) Y2 := Y2 * (Z * X) + X

  ; Route Y / Y2 to the sine and cosine outputs according to PolyMask
  movaps    xmm0, xmm2               ; Y
  movaps    xmm3, xmm5               ; Y2
  andps     xmm5, xmm4               ; ((J and 2) = 0)? Yes: Y2, No: 0
  andnps    xmm4, xmm2               ; ((J and 2) = 0)? Yes: 0 , No: Y
  subss     xmm3, xmm5               ; ((J and 2) = 0)? Yes: 0 , No: Y2
  subss     xmm0, xmm4               ; ((J and 2) = 0)? Yes: Y , No: 0
  addps     xmm4, xmm5               ; ((J and 2) = 0)? Yes: Y2, No: Y
  addps     xmm3, xmm0               ; ((J and 2) = 0)? Yes: Y , No: Y2
  xorps     xmm4, xmm1               ; Sin
  xorps     xmm3, xmm6               ; Cos
  movss     [Param1], xmm4           ; store sine
  movss     [Param2], xmm3           ; store cosine
  ret
935
  
936
;-----------------------------------------------------------------------
; _fast_sin_cos_vector2
; Fast combined sine/cosine approximation for two packed Single values.
;
; In:    [Param1] = 2 consecutive Singles (ARadians)
;        Param2   = address that receives the 2 sines   (8 bytes)
;        Param3   = address that receives the 2 cosines (8 bytes)
; Out:   [Param2] := Sin, [Param3] := Cos
; Clobb: xmm0-xmm7
;
; All loads use movlps, so only lanes 0..1 carry meaningful data; the
; upper lanes hold whatever the registers contained and are never stored.
;
; Per lane: X := Abs(A); J := (Trunc(X * FOPI) + 1) and (not 1);
; X := X - J * Pi/4. Both polynomials are evaluated; PolyMask decides
; which one is the sine and which one is the cosine.
;
; The octant index is computed with cvttps2dq (truncating conversion).
; cvtps2dq would follow the rounding mode in MXCSR; with
; round-to-nearest the reduced argument can reach 1.5 * Pi/4 instead of
; staying within Pi/4, which costs accuracy.
;-----------------------------------------------------------------------
_fast_sin_cos_vector2:
  movlps    xmm0, [Param1]
  movlps    xmm2, [rel kSSE_MASK_SIGN]
  movlps    xmm3, [rel kSSE_MASK_ABS_VAL]
  movaps    xmm1, xmm0
  pand      xmm0, xmm3               ; (xmm0) X := Abs(ARadians)
  pand      xmm1, xmm2               ; (xmm1) SignBitSin
  movaps    xmm4, xmm0
  movlps    xmm5, [rel kSSE_FOPI]
  movlps    xmm6, [rel kSSE_INT_ONE]
  mulps     xmm4, xmm5               ; X * FOPI
  movlps    xmm7, [rel kSSE_INT_NOT_ONE]
  cvttps2dq xmm4, xmm4               ; (xmm4) J := Trunc(X * FOPI), independent of MXCSR
  movlps    xmm5, [rel kSSE_INT_FOUR]
  paddd     xmm4, xmm6
  pand      xmm4, xmm7               ; (xmm4) J := (J + 1) and (not 1)
  movlps    xmm7, [rel kSSE_INT_TWO]
  cvtdq2ps  xmm2, xmm4               ; (xmm2) Y := J
  movaps    xmm3, xmm4
  movaps    xmm6, xmm4               ; (xmm6) J
  pand      xmm3, xmm5               ; J and 4
  pand      xmm4, xmm7               ; J and 2
  pxor      xmm5, xmm5
  pslld     xmm3, 29                 ; (xmm3) SwapSignBitSin := (J and 4) shl 29
  movlps    xmm7, [rel kSSE_PI_OVER_4]
  pcmpeqd   xmm4, xmm5               ; (xmm4) PolyMask := ((J and 2) = 0)? Yes: 0xFFFFFFFF, No: 0x00000000
  mulps     xmm2, xmm7               ; Y * Pi / 4
  movlps    xmm5, [rel kSSE_INT_TWO]
  subps     xmm0, xmm2               ; (xmm0) X := X - (Y * Pi / 4)
  psubd     xmm6, xmm5               ; J - 2
  movlps    xmm7, [rel kSSE_INT_FOUR]
  pxor      xmm1, xmm3               ; (xmm1) SignBitSin := SignBitSin xor SwapSignBitSin
  andnps    xmm6, xmm7               ; (not (J - 2)) and 4
  movaps    xmm3, xmm0
  pslld     xmm6, 29                 ; (xmm6) SignBitCos := ((not (J - 2)) and 4) shl 29
  mulps     xmm3, xmm3               ; (xmm3) Z := X * X

  ; Cosine polynomial: Y := ((P0*Z + P1)*Z + P2) * Z*Z - 0.5*Z + 1
  movlps    xmm2, [rel kSSE_COSCOF_P0]
  movlps    xmm5, [rel kSSE_COSCOF_P1]
  movlps    xmm7, [rel kSSE_COSCOF_P2]
  mulps     xmm2, xmm3               ; COSCOF_P0 * Z
  addps     xmm2, xmm5               ; Y := COSCOF_P0 * Z + COSCOF_P1
  movlps    xmm5, [rel kSSE_ONE_HALF]
  mulps     xmm2, xmm3               ; Y * Z
  addps     xmm2, xmm7               ; Y := (Y * Z) + COSCOF_P2
  movlps    xmm7, [rel kSSE_ONE]
  mulps     xmm2, xmm3               ; Y * Z
  mulps     xmm5, xmm3               ; 0.5 * Z
  mulps     xmm2, xmm3               ; Y * (Z * Z)
  subps     xmm2, xmm5               ; Y - 0.5 * Z
  movlps    xmm5, [rel kSSE_SINCOF_P0]
  addps     xmm2, xmm7               ; (xmm2) Y := Y - 0.5 * Z + 1

  ; Sine polynomial: Y2 := ((P0*Z + P1)*Z + P2) * Z * X + X
  movlps    xmm7, [rel kSSE_SINCOF_P1]
  mulps     xmm5, xmm3               ; SINCOF_P0 * Z
  addps     xmm5, xmm7               ; Y2 := SINCOF_P0 * Z + SINCOF_P1
  mulps     xmm5, xmm3               ; Y2 * Z
  movlps    xmm7, [rel kSSE_SINCOF_P2]
  addps     xmm5, xmm7               ; Y2 := Y2 * Z + SINCOF_P2
  mulps     xmm5, xmm3               ; Y2 * Z
  mulps     xmm5, xmm0               ; Y2 * (Z * X)
  addps     xmm5, xmm0               ; (xmm5) Y2 := Y2 * (Z * X) + X

  ; Route Y / Y2 to the sine and cosine outputs according to PolyMask
  movaps    xmm0, xmm2               ; Y
  movaps    xmm3, xmm5               ; Y2
  andps     xmm5, xmm4               ; ((J and 2) = 0)? Yes: Y2, No: 0
  andnps    xmm4, xmm2               ; ((J and 2) = 0)? Yes: 0 , No: Y
  subps     xmm3, xmm5               ; ((J and 2) = 0)? Yes: 0 , No: Y2
  subps     xmm0, xmm4               ; ((J and 2) = 0)? Yes: Y , No: 0
  addps     xmm4, xmm5               ; ((J and 2) = 0)? Yes: Y2, No: Y
  addps     xmm3, xmm0               ; ((J and 2) = 0)? Yes: Y , No: Y2
  xorps     xmm4, xmm1               ; Sin
  xorps     xmm3, xmm6               ; Cos
  movlps    [Param2], xmm4           ; store 2 sines
  movlps    [Param3], xmm3           ; store 2 cosines
  ret
1009
    
1010
;-----------------------------------------------------------------------
; _fast_sin_cos_vector3
; Fast combined sine/cosine approximation for three packed Single values.
;
; In:    [Param1] = 3 consecutive Singles (ARadians); 12 bytes are read
;        Param2   = address that receives the 3 sines   (12 bytes)
;        Param3   = address that receives the 3 cosines (12 bytes)
; Out:   [Param2] := Sin, [Param3] := Cos (exactly 12 bytes each are
;        written: movq for lanes 0..1, movss for lane 2)
; Clobb: xmm0-xmm7
;
; Per lane: X := Abs(A); J := (Trunc(X * FOPI) + 1) and (not 1);
; X := X - J * Pi/4. Both polynomials are evaluated; PolyMask decides
; which one is the sine and which one is the cosine.
;
; The octant index is computed with cvttps2dq (truncating conversion).
; cvtps2dq would follow the rounding mode in MXCSR; with
; round-to-nearest the reduced argument can reach 1.5 * Pi/4 instead of
; staying within Pi/4, which costs accuracy.
;-----------------------------------------------------------------------
_fast_sin_cos_vector3:
  movq      xmm0, [Param1]           ; lanes 0..1 := A[0..1], lanes 2..3 := 0
  movss     xmm1, [Param1+8]         ; lane 0 := A[2], other lanes := 0
  movlhps   xmm0, xmm1               ; (xmm0) [A0, A1, A2, 0]
  movaps    xmm2, [rel kSSE_MASK_SIGN]
  movaps    xmm3, [rel kSSE_MASK_ABS_VAL]
  movaps    xmm1, xmm0
  pand      xmm0, xmm3               ; (xmm0) X := Abs(ARadians)
  pand      xmm1, xmm2               ; (xmm1) SignBitSin
  movaps    xmm4, xmm0
  movaps    xmm5, [rel kSSE_FOPI]
  movaps    xmm6, [rel kSSE_INT_ONE]
  mulps     xmm4, xmm5               ; X * FOPI
  movaps    xmm7, [rel kSSE_INT_NOT_ONE]
  cvttps2dq xmm4, xmm4               ; (xmm4) J := Trunc(X * FOPI), independent of MXCSR
  movaps    xmm5, [rel kSSE_INT_FOUR]
  paddd     xmm4, xmm6
  pand      xmm4, xmm7               ; (xmm4) J := (J + 1) and (not 1)
  movaps    xmm7, [rel kSSE_INT_TWO]
  cvtdq2ps  xmm2, xmm4               ; (xmm2) Y := J
  movaps    xmm3, xmm4
  movaps    xmm6, xmm4               ; (xmm6) J
  pand      xmm3, xmm5               ; J and 4
  pand      xmm4, xmm7               ; J and 2
  pxor      xmm5, xmm5
  pslld     xmm3, 29                 ; (xmm3) SwapSignBitSin := (J and 4) shl 29
  movaps    xmm7, [rel kSSE_PI_OVER_4]
  pcmpeqd   xmm4, xmm5               ; (xmm4) PolyMask := ((J and 2) = 0)? Yes: 0xFFFFFFFF, No: 0x00000000
  mulps     xmm2, xmm7               ; Y * Pi / 4
  movaps    xmm5, [rel kSSE_INT_TWO]
  subps     xmm0, xmm2               ; (xmm0) X := X - (Y * Pi / 4)
  psubd     xmm6, xmm5               ; J - 2
  movaps    xmm7, [rel kSSE_INT_FOUR]
  pxor      xmm1, xmm3               ; (xmm1) SignBitSin := SignBitSin xor SwapSignBitSin
  andnps    xmm6, xmm7               ; (not (J - 2)) and 4
  movaps    xmm3, xmm0
  pslld     xmm6, 29                 ; (xmm6) SignBitCos := ((not (J - 2)) and 4) shl 29
  mulps     xmm3, xmm3               ; (xmm3) Z := X * X

  ; Cosine polynomial: Y := ((P0*Z + P1)*Z + P2) * Z*Z - 0.5*Z + 1
  movaps    xmm2, [rel kSSE_COSCOF_P0]
  movaps    xmm5, [rel kSSE_COSCOF_P1]
  movaps    xmm7, [rel kSSE_COSCOF_P2]
  mulps     xmm2, xmm3               ; COSCOF_P0 * Z
  addps     xmm2, xmm5               ; Y := COSCOF_P0 * Z + COSCOF_P1
  movaps    xmm5, [rel kSSE_ONE_HALF]
  mulps     xmm2, xmm3               ; Y * Z
  addps     xmm2, xmm7               ; Y := (Y * Z) + COSCOF_P2
  movaps    xmm7, [rel kSSE_ONE]
  mulps     xmm2, xmm3               ; Y * Z
  mulps     xmm5, xmm3               ; 0.5 * Z
  mulps     xmm2, xmm3               ; Y * (Z * Z)
  subps     xmm2, xmm5               ; Y - 0.5 * Z
  movaps    xmm5, [rel kSSE_SINCOF_P0]
  addps     xmm2, xmm7               ; (xmm2) Y := Y - 0.5 * Z + 1

  ; Sine polynomial: Y2 := ((P0*Z + P1)*Z + P2) * Z * X + X
  movaps    xmm7, [rel kSSE_SINCOF_P1]
  mulps     xmm5, xmm3               ; SINCOF_P0 * Z
  addps     xmm5, xmm7               ; Y2 := SINCOF_P0 * Z + SINCOF_P1
  mulps     xmm5, xmm3               ; Y2 * Z
  movaps    xmm7, [rel kSSE_SINCOF_P2]
  addps     xmm5, xmm7               ; Y2 := Y2 * Z + SINCOF_P2
  mulps     xmm5, xmm3               ; Y2 * Z
  mulps     xmm5, xmm0               ; Y2 * (Z * X)
  addps     xmm5, xmm0               ; (xmm5) Y2 := Y2 * (Z * X) + X

  ; Route Y / Y2 to the sine and cosine outputs according to PolyMask
  movaps    xmm0, xmm2               ; Y
  movaps    xmm3, xmm5               ; Y2
  andps     xmm5, xmm4               ; ((J and 2) = 0)? Yes: Y2, No: 0
  andnps    xmm4, xmm2               ; ((J and 2) = 0)? Yes: 0 , No: Y
  subps     xmm3, xmm5               ; ((J and 2) = 0)? Yes: 0 , No: Y2
  subps     xmm0, xmm4               ; ((J and 2) = 0)? Yes: Y , No: 0
  addps     xmm4, xmm5               ; ((J and 2) = 0)? Yes: Y2, No: Y
  addps     xmm3, xmm0               ; ((J and 2) = 0)? Yes: Y , No: Y2
  xorps     xmm4, xmm1               ; Sin
  xorps     xmm3, xmm6               ; Cos

  ; Store 3 lanes each without touching the 4th Single of the targets
  movhlps   xmm5, xmm4               ; (xmm5 lane 0) Sin lane 2
  movhlps   xmm2, xmm3               ; (xmm2 lane 0) Cos lane 2
  movq      [Param2], xmm4
  movss     [Param2+8], xmm5
  movq      [Param3], xmm3
  movss     [Param3+8], xmm2
  ret
1089
    
1090
;-----------------------------------------------------------------------
; _fast_sin_cos_vector4
; Fast combined sine/cosine approximation for four packed Single values.
;
; In:    [Param1] = 4 consecutive Singles (ARadians)
;        Param2   = address that receives the 4 sines   (16 bytes)
;        Param3   = address that receives the 4 cosines (16 bytes)
;        All three are accessed with movups, so no alignment is needed.
; Out:   [Param2] := Sin, [Param3] := Cos
; Clobb: xmm0-xmm7
;
; Per lane: X := Abs(A); J := (Trunc(X * FOPI) + 1) and (not 1);
; X := X - J * Pi/4. Both polynomials are evaluated; PolyMask decides
; which one is the sine and which one is the cosine.
;
; The octant index is computed with cvttps2dq (truncating conversion).
; cvtps2dq would follow the rounding mode in MXCSR; with
; round-to-nearest the reduced argument can reach 1.5 * Pi/4 instead of
; staying within Pi/4, which costs accuracy.
;-----------------------------------------------------------------------
_fast_sin_cos_vector4:
  movups    xmm0, [Param1]
  movaps    xmm2, [rel kSSE_MASK_SIGN]
  movaps    xmm3, [rel kSSE_MASK_ABS_VAL]
  movaps    xmm1, xmm0
  pand      xmm0, xmm3               ; (xmm0) X := Abs(ARadians)
  pand      xmm1, xmm2               ; (xmm1) SignBitSin
  movaps    xmm4, xmm0
  movaps    xmm5, [rel kSSE_FOPI]
  movaps    xmm6, [rel kSSE_INT_ONE]
  mulps     xmm4, xmm5               ; X * FOPI
  movaps    xmm7, [rel kSSE_INT_NOT_ONE]
  cvttps2dq xmm4, xmm4               ; (xmm4) J := Trunc(X * FOPI), independent of MXCSR
  movaps    xmm5, [rel kSSE_INT_FOUR]
  paddd     xmm4, xmm6
  pand      xmm4, xmm7               ; (xmm4) J := (J + 1) and (not 1)
  movaps    xmm7, [rel kSSE_INT_TWO]
  cvtdq2ps  xmm2, xmm4               ; (xmm2) Y := J
  movaps    xmm3, xmm4
  movaps    xmm6, xmm4               ; (xmm6) J
  pand      xmm3, xmm5               ; J and 4
  pand      xmm4, xmm7               ; J and 2
  pxor      xmm5, xmm5
  pslld     xmm3, 29                 ; (xmm3) SwapSignBitSin := (J and 4) shl 29
  movaps    xmm7, [rel kSSE_PI_OVER_4]
  pcmpeqd   xmm4, xmm5               ; (xmm4) PolyMask := ((J and 2) = 0)? Yes: 0xFFFFFFFF, No: 0x00000000
  mulps     xmm2, xmm7               ; Y * Pi / 4
  movaps    xmm5, [rel kSSE_INT_TWO]
  subps     xmm0, xmm2               ; (xmm0) X := X - (Y * Pi / 4)
  psubd     xmm6, xmm5               ; J - 2
  movaps    xmm7, [rel kSSE_INT_FOUR]
  pxor      xmm1, xmm3               ; (xmm1) SignBitSin := SignBitSin xor SwapSignBitSin
  andnps    xmm6, xmm7               ; (not (J - 2)) and 4
  movaps    xmm3, xmm0
  pslld     xmm6, 29                 ; (xmm6) SignBitCos := ((not (J - 2)) and 4) shl 29
  mulps     xmm3, xmm3               ; (xmm3) Z := X * X

  ; Cosine polynomial: Y := ((P0*Z + P1)*Z + P2) * Z*Z - 0.5*Z + 1
  movaps    xmm2, [rel kSSE_COSCOF_P0]
  movaps    xmm5, [rel kSSE_COSCOF_P1]
  movaps    xmm7, [rel kSSE_COSCOF_P2]
  mulps     xmm2, xmm3               ; COSCOF_P0 * Z
  addps     xmm2, xmm5               ; Y := COSCOF_P0 * Z + COSCOF_P1
  movaps    xmm5, [rel kSSE_ONE_HALF]
  mulps     xmm2, xmm3               ; Y * Z
  addps     xmm2, xmm7               ; Y := (Y * Z) + COSCOF_P2
  movaps    xmm7, [rel kSSE_ONE]
  mulps     xmm2, xmm3               ; Y * Z
  mulps     xmm5, xmm3               ; 0.5 * Z
  mulps     xmm2, xmm3               ; Y * (Z * Z)
  subps     xmm2, xmm5               ; Y - 0.5 * Z
  movaps    xmm5, [rel kSSE_SINCOF_P0]
  addps     xmm2, xmm7               ; (xmm2) Y := Y - 0.5 * Z + 1

  ; Sine polynomial: Y2 := ((P0*Z + P1)*Z + P2) * Z * X + X
  movaps    xmm7, [rel kSSE_SINCOF_P1]
  mulps     xmm5, xmm3               ; SINCOF_P0 * Z
  addps     xmm5, xmm7               ; Y2 := SINCOF_P0 * Z + SINCOF_P1
  mulps     xmm5, xmm3               ; Y2 * Z
  movaps    xmm7, [rel kSSE_SINCOF_P2]
  addps     xmm5, xmm7               ; Y2 := Y2 * Z + SINCOF_P2
  mulps     xmm5, xmm3               ; Y2 * Z
  mulps     xmm5, xmm0               ; Y2 * (Z * X)
  addps     xmm5, xmm0               ; (xmm5) Y2 := Y2 * (Z * X) + X

  ; Route Y / Y2 to the sine and cosine outputs according to PolyMask
  movaps    xmm0, xmm2               ; Y
  movaps    xmm3, xmm5               ; Y2
  andps     xmm5, xmm4               ; ((J and 2) = 0)? Yes: Y2, No: 0
  andnps    xmm4, xmm2               ; ((J and 2) = 0)? Yes: 0 , No: Y
  subps     xmm3, xmm5               ; ((J and 2) = 0)? Yes: 0 , No: Y2
  subps     xmm0, xmm4               ; ((J and 2) = 0)? Yes: Y , No: 0
  addps     xmm4, xmm5               ; ((J and 2) = 0)? Yes: Y2, No: Y
  addps     xmm3, xmm0               ; ((J and 2) = 0)? Yes: Y , No: Y2
  xorps     xmm4, xmm1               ; Sin
  xorps     xmm3, xmm6               ; Cos
  movups    [Param2], xmm4           ; store 4 sines
  movups    [Param3], xmm3           ; store 4 cosines
  ret
1163
  
1164
;-----------------------------------------------------------------------
; _fast_exp_single
; Fast exponential approximation for one Single value.
;
; In:    xmm0 (lane 0) = A
; Out:   xmm0 (lane 0) = approximation of Exp(A)
; Clobb: xmm0-xmm7
;
; The value A * EXP_A1 + EXP_A2 is converted to an integer whose
; exponent bits scale the result and whose fraction bits feed a
; 4th-degree polynomial.
;-----------------------------------------------------------------------
_fast_exp_single:
  ; --- Val := EXP_A1 * A + EXP_A2
  ;     (12102203.1615614 * A + 1065353216.0 per the source notes)
  movss     xmm1, [rel kSSE_EXP_A1]
  mulss     xmm0, xmm1
  movss     xmm2, [rel kSSE_EXP_A2]
  addss     xmm0, xmm2

  ; --- Clamp from above: if not (Val < EXP_CST) then Val := EXP_CST
  movss     xmm3, [rel kSSE_EXP_CST]
  movss     xmm1, xmm0 ; keep Val
  cmpltss   xmm0, xmm3 ; Mask := (Val < EXP_CST)? 0xFFFFFFFF : 0
  andps     xmm1, xmm0 ; Mask? Val : 0
  andnps    xmm0, xmm3 ; Mask? 0   : EXP_CST
  orps      xmm0, xmm1 ; Mask? Val : EXP_CST

  ; --- IVal := Int(Val); negative or zero IVal becomes 0.
  ;     Note: cvtps2dq rounds according to the MXCSR rounding mode.
  cvtps2dq  xmm1, xmm0
  xorps     xmm3, xmm3
  movdqa    xmm0, xmm1 ; IVal
  pcmpgtd   xmm1, xmm3 ; (IVal > 0)? 0xFFFFFFFF : 0
  pand      xmm0, xmm1 ; (IVal > 0)? IVal : 0

  ; --- Split IVal into its exponent and fraction bit fields
  movss     xmm2, [rel kSSE_MASK_EXPONENT]
  movss     xmm3, [rel kSSE_MASK_FRACTION]
  movss     xmm4, [rel kSSE_EXP_I1]
  movss     xmm1, xmm0
  pand      xmm0, xmm2 ; (xmm0) XU := IVal and MASK_EXPONENT  (0x7F800000)
  pand      xmm1, xmm3 ; IVal and MASK_FRACTION               (0x007FFFFF)
  por       xmm1, xmm4 ; (xmm1) B := (...) or EXP_I1          (0x3F800000)

  ; --- Result := XU * (F1 + B * (F2 + B * (F3 + B * (F4 + B * F5))))
  ;     Source notes give F1..F5 as 0.509964287281036376953125,
  ;     0.3120158612728118896484375, 0.1666135489940643310546875,
  ;     -2.12528370320796966552734375e-3, 1.3534179888665676116943359375e-2
  movss     xmm7, xmm1 ; (xmm7) B
  movss     xmm6, [rel kSSE_EXP_F5]
  mulss     xmm1, xmm6
  movss     xmm5, [rel kSSE_EXP_F4]
  addss     xmm1, xmm5
  mulss     xmm1, xmm7
  movss     xmm4, [rel kSSE_EXP_F3]
  addss     xmm1, xmm4
  mulss     xmm1, xmm7
  movss     xmm3, [rel kSSE_EXP_F2]
  addss     xmm1, xmm3
  mulss     xmm1, xmm7
  movss     xmm2, [rel kSSE_EXP_F1]
  addss     xmm1, xmm2
  mulss     xmm1, xmm0 ; scale by XU

  movss     xmm0, xmm1 ; return value in lane 0 of xmm0
  ret
1225
  
1226
;-----------------------------------------------------------------------
; _fast_exp_vector2
; Fast exponential approximation for two packed Single values.
;
; In:    [Param1] = 2 consecutive Singles (A)
; Out:   xmm0 (lanes 0..1) = approximation of Exp(A) per lane
; Clobb: xmm0-xmm7
;
; Every load uses movlps, so only lanes 0..1 carry meaningful data; the
; upper lanes keep whatever the registers held before.
;-----------------------------------------------------------------------
_fast_exp_vector2:
  ; --- Val := EXP_A1 * A + EXP_A2
  ;     (12102203.1615614 * A + 1065353216.0 per the source notes)
  movlps    xmm0, [Param1]
  movlps    xmm1, [rel kSSE_EXP_A1]
  mulps     xmm0, xmm1
  movlps    xmm2, [rel kSSE_EXP_A2]
  addps     xmm0, xmm2

  ; --- Clamp from above: if not (Val < EXP_CST) then Val := EXP_CST
  movlps    xmm3, [rel kSSE_EXP_CST]
  movaps    xmm1, xmm0 ; keep Val
  cmpltps   xmm0, xmm3 ; Mask := (Val < EXP_CST)? 0xFFFFFFFF : 0
  andps     xmm1, xmm0 ; Mask? Val : 0
  andnps    xmm0, xmm3 ; Mask? 0   : EXP_CST
  orps      xmm0, xmm1 ; Mask? Val : EXP_CST

  ; --- IVal := Int(Val); negative or zero IVal becomes 0.
  ;     Note: cvtps2dq rounds according to the MXCSR rounding mode.
  cvtps2dq  xmm1, xmm0
  xorps     xmm3, xmm3
  movdqa    xmm0, xmm1 ; IVal
  pcmpgtd   xmm1, xmm3 ; (IVal > 0)? 0xFFFFFFFF : 0
  pand      xmm0, xmm1 ; (IVal > 0)? IVal : 0

  ; --- Split IVal into its exponent and fraction bit fields
  movlps    xmm2, [rel kSSE_MASK_EXPONENT]
  movlps    xmm3, [rel kSSE_MASK_FRACTION]
  movlps    xmm4, [rel kSSE_EXP_I1]
  movdqa    xmm1, xmm0
  pand      xmm0, xmm2 ; (xmm0) XU := IVal and MASK_EXPONENT  (0x7F800000)
  pand      xmm1, xmm3 ; IVal and MASK_FRACTION               (0x007FFFFF)
  por       xmm1, xmm4 ; (xmm1) B := (...) or EXP_I1          (0x3F800000)

  ; --- Result := XU * (F1 + B * (F2 + B * (F3 + B * (F4 + B * F5))))
  ;     Source notes give F1..F5 as 0.509964287281036376953125,
  ;     0.3120158612728118896484375, 0.1666135489940643310546875,
  ;     -2.12528370320796966552734375e-3, 1.3534179888665676116943359375e-2
  movaps    xmm7, xmm1 ; (xmm7) B
  movlps    xmm6, [rel kSSE_EXP_F5]
  mulps     xmm1, xmm6
  movlps    xmm5, [rel kSSE_EXP_F4]
  addps     xmm1, xmm5
  mulps     xmm1, xmm7
  movlps    xmm4, [rel kSSE_EXP_F3]
  addps     xmm1, xmm4
  mulps     xmm1, xmm7
  movlps    xmm3, [rel kSSE_EXP_F2]
  addps     xmm1, xmm3
  mulps     xmm1, xmm7
  movlps    xmm2, [rel kSSE_EXP_F1]
  addps     xmm1, xmm2
  mulps     xmm1, xmm0 ; scale by XU

  movaps    xmm0, xmm1 ; return value in lanes 0..1 of xmm0
  ret
1287
  
1288
;-----------------------------------------------------------------------
; _fast_exp_vector3
; Fast exponential approximation for three packed Single values.
;
; In:    [Param1] = 3 consecutive Singles (A); 12 bytes are read
; Out:   xmm0 = result lanes 0..3, xmm1 (low half) = result lanes 2..3
; Clobb: xmm0-xmm7
;-----------------------------------------------------------------------
_fast_exp_vector3:
  ; --- Gather the 3 inputs as [A0, A1, A2, 0]
  movq      xmm0, [Param1]
  movss     xmm1, [Param1+8]
  movlhps   xmm0, xmm1

  ; --- Val := EXP_A1 * A + EXP_A2
  ;     (12102203.1615614 * A + 1065353216.0 per the source notes)
  movaps    xmm1, [rel kSSE_EXP_A1]
  mulps     xmm0, xmm1
  movaps    xmm2, [rel kSSE_EXP_A2]
  addps     xmm0, xmm2

  ; --- Clamp from above: if not (Val < EXP_CST) then Val := EXP_CST
  movaps    xmm3, [rel kSSE_EXP_CST]
  movaps    xmm1, xmm0 ; keep Val
  cmpltps   xmm0, xmm3 ; Mask := (Val < EXP_CST)? 0xFFFFFFFF : 0
  andps     xmm1, xmm0 ; Mask? Val : 0
  andnps    xmm0, xmm3 ; Mask? 0   : EXP_CST
  orps      xmm0, xmm1 ; Mask? Val : EXP_CST

  ; --- IVal := Int(Val); negative or zero IVal becomes 0.
  ;     Note: cvtps2dq rounds according to the MXCSR rounding mode.
  cvtps2dq  xmm1, xmm0
  xorps     xmm3, xmm3
  movdqa    xmm0, xmm1 ; IVal
  pcmpgtd   xmm1, xmm3 ; (IVal > 0)? 0xFFFFFFFF : 0
  pand      xmm0, xmm1 ; (IVal > 0)? IVal : 0

  ; --- Split IVal into its exponent and fraction bit fields
  movaps    xmm2, [rel kSSE_MASK_EXPONENT]
  movaps    xmm3, [rel kSSE_MASK_FRACTION]
  movaps    xmm4, [rel kSSE_EXP_I1]
  movdqa    xmm1, xmm0
  pand      xmm0, xmm2 ; (xmm0) XU := IVal and MASK_EXPONENT  (0x7F800000)
  pand      xmm1, xmm3 ; IVal and MASK_FRACTION               (0x007FFFFF)
  por       xmm1, xmm4 ; (xmm1) B := (...) or EXP_I1          (0x3F800000)

  ; --- Result := XU * (F1 + B * (F2 + B * (F3 + B * (F4 + B * F5))))
  ;     Source notes give F1..F5 as 0.509964287281036376953125,
  ;     0.3120158612728118896484375, 0.1666135489940643310546875,
  ;     -2.12528370320796966552734375e-3, 1.3534179888665676116943359375e-2
  movaps    xmm7, xmm1 ; (xmm7) B
  movaps    xmm6, [rel kSSE_EXP_F5]
  mulps     xmm1, xmm6
  movaps    xmm5, [rel kSSE_EXP_F4]
  addps     xmm1, xmm5
  mulps     xmm1, xmm7
  movaps    xmm4, [rel kSSE_EXP_F3]
  addps     xmm1, xmm4
  mulps     xmm1, xmm7
  movaps    xmm3, [rel kSSE_EXP_F2]
  addps     xmm1, xmm3
  mulps     xmm1, xmm7
  movaps    xmm2, [rel kSSE_EXP_F1]
  addps     xmm1, xmm2
  mulps     xmm1, xmm0 ; scale by XU

  ; --- Return: all lanes in xmm0, lanes 2..3 duplicated into low xmm1
  movaps    xmm0, xmm1
  movhlps   xmm1, xmm1
  ret
1352
  
1353
;-----------------------------------------------------------------------
; _fast_exp_vector4
; Fast exponential approximation for four packed Single values.
;
; In:    [Param1] = 4 consecutive Singles (A); loaded with movups, so
;                   no alignment is required
; Out:   xmm0 = result lanes 0..3, xmm1 (low half) = result lanes 2..3
; Clobb: xmm0-xmm7
;-----------------------------------------------------------------------
_fast_exp_vector4:
  ; --- Val := EXP_A1 * A + EXP_A2
  ;     (12102203.1615614 * A + 1065353216.0 per the source notes)
  movups    xmm0, [Param1]
  movaps    xmm1, [rel kSSE_EXP_A1]
  mulps     xmm0, xmm1
  movaps    xmm2, [rel kSSE_EXP_A2]
  addps     xmm0, xmm2

  ; --- Clamp from above: if not (Val < EXP_CST) then Val := EXP_CST
  movaps    xmm3, [rel kSSE_EXP_CST]
  movaps    xmm1, xmm0 ; keep Val
  cmpltps   xmm0, xmm3 ; Mask := (Val < EXP_CST)? 0xFFFFFFFF : 0
  andps     xmm1, xmm0 ; Mask? Val : 0
  andnps    xmm0, xmm3 ; Mask? 0   : EXP_CST
  orps      xmm0, xmm1 ; Mask? Val : EXP_CST

  ; --- IVal := Int(Val); negative or zero IVal becomes 0.
  ;     Note: cvtps2dq rounds according to the MXCSR rounding mode.
  cvtps2dq  xmm1, xmm0
  xorps     xmm3, xmm3
  movdqa    xmm0, xmm1 ; IVal
  pcmpgtd   xmm1, xmm3 ; (IVal > 0)? 0xFFFFFFFF : 0
  pand      xmm0, xmm1 ; (IVal > 0)? IVal : 0

  ; --- Split IVal into its exponent and fraction bit fields
  movaps    xmm2, [rel kSSE_MASK_EXPONENT]
  movaps    xmm3, [rel kSSE_MASK_FRACTION]
  movaps    xmm4, [rel kSSE_EXP_I1]
  movdqa    xmm1, xmm0
  pand      xmm0, xmm2 ; (xmm0) XU := IVal and MASK_EXPONENT  (0x7F800000)
  pand      xmm1, xmm3 ; IVal and MASK_FRACTION               (0x007FFFFF)
  por       xmm1, xmm4 ; (xmm1) B := (...) or EXP_I1          (0x3F800000)

  ; --- Result := XU * (F1 + B * (F2 + B * (F3 + B * (F4 + B * F5))))
  ;     Source notes give F1..F5 as 0.509964287281036376953125,
  ;     0.3120158612728118896484375, 0.1666135489940643310546875,
  ;     -2.12528370320796966552734375e-3, 1.3534179888665676116943359375e-2
  movaps    xmm7, xmm1 ; (xmm7) B
  movaps    xmm6, [rel kSSE_EXP_F5]
  mulps     xmm1, xmm6
  movaps    xmm5, [rel kSSE_EXP_F4]
  addps     xmm1, xmm5
  mulps     xmm1, xmm7
  movaps    xmm4, [rel kSSE_EXP_F3]
  addps     xmm1, xmm4
  mulps     xmm1, xmm7
  movaps    xmm3, [rel kSSE_EXP_F2]
  addps     xmm1, xmm3
  mulps     xmm1, xmm7
  movaps    xmm2, [rel kSSE_EXP_F1]
  addps     xmm1, xmm2
  mulps     xmm1, xmm0 ; scale by XU

  ; --- Return: all lanes in xmm0, lanes 2..3 duplicated into low xmm1
  movaps    xmm0, xmm1
  movhlps   xmm1, xmm1
  ret
1415
  
1416
;-----------------------------------------------------------------------
; _fast_ln_single
; Fast natural-logarithm approximation for one Single value.
;
; In:    xmm0 (lane 0) = A
; Out:   xmm0 (lane 0) = approximation of Ln(A)
; Clobb: xmm0-xmm7
;
; Result := Exp * LN_F5
;         + (LN_F1 * X + AddCst)
;         + X2 * ((X - LN_F2) + X2 * (LN_F3 * X - LN_F4))
; where Exp is the raw bit pattern of A shifted right by 23, X is the
; fraction of A re-biased with EXP_I1, and X2 = X * X.
; When A is not greater than 0, AddCst is kSSE_NEG_INFINITY instead of
; kSSE_LN_CST.
;-----------------------------------------------------------------------
_fast_ln_single:
  ; --- Keep two copies of A: one for the sign test, one for the fraction
  movss     xmm1, xmm0
  movss     xmm5, xmm1

  ; --- (xmm0) Exp * LN_F5, with Exp := Val.I shr 23
  ;     (LN_F5 is 0.69314718055995 per the source notes)
  psrld     xmm0, 23
  cvtdq2ps  xmm0, xmm0 ; xmm0 = Exp
  movss     xmm2, [rel kSSE_LN_F5]
  mulss     xmm0, xmm2

  ; --- (xmm1) AddCst := (A > 0)? LN_CST : NEG_INFINITY
  ;     (LN_CST is -89.93423858 per the source notes)
  xorps     xmm2, xmm2
  movss     xmm3, [rel kSSE_LN_CST]
  movss     xmm4, [rel kSSE_NEG_INFINITY]
  cmpnless  xmm1, xmm2 ; not (A <= 0)? 0xFFFFFFFF : 0
  andps     xmm3, xmm1 ; Mask? LN_CST : 0
  andnps    xmm1, xmm4 ; Mask? 0 : NEG_INFINITY
  orps      xmm1, xmm3 ; Mask? LN_CST : NEG_INFINITY

  ; --- (xmm3) X := (Val.I and MASK_FRACTION) or EXP_I1, (xmm5) X2 := X * X
  ;     (masks are 0x007FFFFF and 0x3F800000 per the source notes)
  movss     xmm2, [rel kSSE_MASK_FRACTION]
  movss     xmm4, [rel kSSE_EXP_I1]
  pand      xmm5, xmm2
  por       xmm5, xmm4
  movss     xmm3, xmm5 ; xmm3 = X
  mulss     xmm5, xmm5 ; xmm5 = X2

  ; --- (xmm4) X2 * ((X - LN_F2) + X2 * (LN_F3 * X - LN_F4))
  ;     (source notes: F2 = 2.2744832, F3 = 0.024982445, F4 = 0.24371102)
  movss     xmm4, xmm3
  movss     xmm6, [rel kSSE_LN_F3]
  mulss     xmm4, xmm6 ; LN_F3 * X
  movss     xmm7, [rel kSSE_LN_F4]
  subss     xmm4, xmm7 ; LN_F3 * X - LN_F4
  mulss     xmm4, xmm5 ; X2 * (LN_F3 * X - LN_F4)
  movss     xmm6, xmm3
  movss     xmm7, [rel kSSE_LN_F2]
  subss     xmm6, xmm7 ; X - LN_F2
  addss     xmm4, xmm6 ; (X - LN_F2) + X2 * (LN_F3 * X - LN_F4)
  mulss     xmm4, xmm5 ; X2 * (...)

  ; --- (xmm3) LN_F1 * X + AddCst   (source notes: F1 = 3.3977745)
  movss     xmm2, [rel kSSE_LN_F1]
  mulss     xmm3, xmm2
  addss     xmm3, xmm1

  ; --- Sum the three terms and return in lane 0 of xmm0
  addss     xmm4, xmm0
  addss     xmm3, xmm4
  movss     xmm0, xmm3
  ret
1462
  
1463
_fast_ln_vector2:
  ; Fast approximation of the natural logarithm of a 2-component vector.
  ; In:  [Param1] = two Singles (Param1 is a macro defined elsewhere in this file)
  ; Out: xmm0 lanes 0-1 = approximate Ln of each component
  ; Scratch: xmm1-xmm7. Input and constants are loaded with movlps, so the
  ; upper two lanes of every register hold unspecified data.
  movlps    xmm0, [Param1]

  ; AddCst := (A > 0) ? -89.93423858 : NegInfinity
  xorps     xmm2, xmm2 ; 0.0 to compare against
  movaps    xmm1, xmm0 ; xmm1 = A (turned into the compare mask below)
  movaps    xmm5, xmm1 ; xmm5 = A (mantissa source)
  movlps    xmm3, [rel kSSE_LN_CST]
  movlps    xmm4, [rel kSSE_NEG_INFINITY]
  cmpnleps  xmm1, xmm2 ; not (A <= 0): all ones, otherwise all zeros
  andps     xmm3, xmm1 ; mask set: LN_CST, otherwise 0
  andnps    xmm1, xmm4 ; mask set: 0, otherwise NegInfinity
  orps      xmm1, xmm3 ; xmm1 = AddCst

  ; Exp := Val.I shr 23
  psrld     xmm0, 23
  cvtdq2ps  xmm0, xmm0 ; xmm0 = Exp

  ; X.I := (Val.I and 0x007FFFFF) or 0x3F800000
  movlps    xmm2, [rel kSSE_MASK_FRACTION]
  movlps    xmm4, [rel kSSE_EXP_I1]
  pand      xmm5, xmm2
  por       xmm5, xmm4 ; xmm5 = X

  movlps    xmm2, [rel kSSE_LN_F5]
  mulps     xmm0, xmm2 ; xmm0 = Exp * 0.69314718055995

  ; Polynomial in X
  movaps    xmm3, xmm5 ; xmm3 = X
  mulps     xmm5, xmm5 ; xmm5 = X2
  movaps    xmm4, xmm3
  movlps    xmm6, [rel kSSE_LN_F3]
  mulps     xmm4, xmm6
  movlps    xmm7, [rel kSSE_LN_F4]
  subps     xmm4, xmm7
  mulps     xmm4, xmm5 ; xmm4 = X2 * (0.024982445 * X - 0.24371102)
  movaps    xmm6, xmm3
  movlps    xmm7, [rel kSSE_LN_F2]
  subps     xmm6, xmm7 ; xmm6 = X - 2.2744832
  addps     xmm4, xmm6
  mulps     xmm4, xmm5 ; xmm4 = X2 * ((X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102))
  movlps    xmm2, [rel kSSE_LN_F1]
  mulps     xmm3, xmm2
  addps     xmm3, xmm1 ; xmm3 = 3.3977745 * X + AddCst

  ; Sum the three partial results
  addps     xmm4, xmm0
  addps     xmm3, xmm4
  movaps    xmm0, xmm3
  ret
1510
  
1511
_fast_ln_vector3:
  ; Fast approximation of the natural logarithm of a 3-component vector.
  ; In:  [Param1] = three Singles (Param1 is a macro defined elsewhere in this file)
  ; Out: xmm0 = results for X, Y (plus Z and lane 3 in its upper half),
  ;      xmm1 low half = result for Z and lane 3
  ; Scratch: xmm1-xmm7.
  movq      xmm0, [Param1]
  movss     xmm1, [Param1+8] ; Z; the load clears the other lanes
  movlhps   xmm0, xmm1 ; xmm0 = (X, Y, Z, 0)

  ; AddCst := (A > 0) ? -89.93423858 : NegInfinity
  xorps     xmm2, xmm2 ; 0.0 to compare against
  movaps    xmm1, xmm0 ; xmm1 = A (turned into the compare mask below)
  movaps    xmm5, xmm1 ; xmm5 = A (mantissa source)
  movaps    xmm3, [rel kSSE_LN_CST]
  movaps    xmm4, [rel kSSE_NEG_INFINITY]
  cmpnleps  xmm1, xmm2 ; not (A <= 0): all ones, otherwise all zeros
  andps     xmm3, xmm1 ; mask set: LN_CST, otherwise 0
  andnps    xmm1, xmm4 ; mask set: 0, otherwise NegInfinity
  orps      xmm1, xmm3 ; xmm1 = AddCst

  ; Exp := Val.I shr 23
  psrld     xmm0, 23
  cvtdq2ps  xmm0, xmm0 ; xmm0 = Exp

  ; X.I := (Val.I and 0x007FFFFF) or 0x3F800000
  movaps    xmm2, [rel kSSE_MASK_FRACTION]
  movaps    xmm4, [rel kSSE_EXP_I1]
  pand      xmm5, xmm2
  por       xmm5, xmm4 ; xmm5 = X

  movaps    xmm2, [rel kSSE_LN_F5]
  mulps     xmm0, xmm2 ; xmm0 = Exp * 0.69314718055995

  ; Polynomial in X
  movaps    xmm3, xmm5 ; xmm3 = X
  mulps     xmm5, xmm5 ; xmm5 = X2
  movaps    xmm4, xmm3
  movaps    xmm6, [rel kSSE_LN_F3]
  mulps     xmm4, xmm6
  movaps    xmm7, [rel kSSE_LN_F4]
  subps     xmm4, xmm7
  mulps     xmm4, xmm5 ; xmm4 = X2 * (0.024982445 * X - 0.24371102)
  movaps    xmm6, xmm3
  movaps    xmm7, [rel kSSE_LN_F2]
  subps     xmm6, xmm7 ; xmm6 = X - 2.2744832
  addps     xmm4, xmm6
  mulps     xmm4, xmm5 ; xmm4 = X2 * ((X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102))
  movaps    xmm2, [rel kSSE_LN_F1]
  mulps     xmm3, xmm2
  addps     xmm3, xmm1 ; xmm3 = 3.3977745 * X + AddCst

  ; Sum the three partial results and split them over xmm0/xmm1
  addps     xmm4, xmm0
  addps     xmm3, xmm4
  movaps    xmm0, xmm3
  movhlps   xmm1, xmm3 ; upper two results -> low half of xmm1
  ret
1561
  
1562
_fast_ln_vector4:
  ; Fast approximation of the natural logarithm of a 4-component vector.
  ; In:  [Param1] = four Singles, read unaligned (Param1 is a macro defined
  ;      elsewhere in this file)
  ; Out: xmm0 = all four results, xmm1 low half = results for Z, W
  ; Scratch: xmm1-xmm7.
  movups    xmm0, [Param1]

  ; AddCst := (A > 0) ? -89.93423858 : NegInfinity
  xorps     xmm2, xmm2 ; 0.0 to compare against
  movaps    xmm1, xmm0 ; xmm1 = A (turned into the compare mask below)
  movaps    xmm5, xmm1 ; xmm5 = A (mantissa source)
  movaps    xmm3, [rel kSSE_LN_CST]
  movaps    xmm4, [rel kSSE_NEG_INFINITY]
  cmpnleps  xmm1, xmm2 ; not (A <= 0): all ones, otherwise all zeros
  andps     xmm3, xmm1 ; mask set: LN_CST, otherwise 0
  andnps    xmm1, xmm4 ; mask set: 0, otherwise NegInfinity
  orps      xmm1, xmm3 ; xmm1 = AddCst

  ; Exp := Val.I shr 23
  psrld     xmm0, 23
  cvtdq2ps  xmm0, xmm0 ; xmm0 = Exp

  ; X.I := (Val.I and 0x007FFFFF) or 0x3F800000
  movaps    xmm2, [rel kSSE_MASK_FRACTION]
  movaps    xmm4, [rel kSSE_EXP_I1]
  pand      xmm5, xmm2
  por       xmm5, xmm4 ; xmm5 = X

  movaps    xmm2, [rel kSSE_LN_F5]
  mulps     xmm0, xmm2 ; xmm0 = Exp * 0.69314718055995

  ; Polynomial in X
  movaps    xmm3, xmm5 ; xmm3 = X
  mulps     xmm5, xmm5 ; xmm5 = X2
  movaps    xmm4, xmm3
  movaps    xmm6, [rel kSSE_LN_F3]
  mulps     xmm4, xmm6
  movaps    xmm7, [rel kSSE_LN_F4]
  subps     xmm4, xmm7
  mulps     xmm4, xmm5 ; xmm4 = X2 * (0.024982445 * X - 0.24371102)
  movaps    xmm6, xmm3
  movaps    xmm7, [rel kSSE_LN_F2]
  subps     xmm6, xmm7 ; xmm6 = X - 2.2744832
  addps     xmm4, xmm6
  mulps     xmm4, xmm5 ; xmm4 = X2 * ((X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102))
  movaps    xmm2, [rel kSSE_LN_F1]
  mulps     xmm3, xmm2
  addps     xmm3, xmm1 ; xmm3 = 3.3977745 * X + AddCst

  ; Sum the three partial results and split them over xmm0/xmm1
  addps     xmm4, xmm0
  addps     xmm3, xmm4
  movaps    xmm0, xmm3
  movhlps   xmm1, xmm3 ; upper two results -> low half of xmm1
  ret
1610

1611
_fast_log2_single:
  ; Fast approximation of the base-2 logarithm of one Single.
  ; In:  xmm0 (lane 0) = A
  ; Out: xmm0 (lane 0) = approximate Log2(A)
  ; Scratch: xmm1-xmm4. Only lane 0 is meaningful.

  ; Result := VX.I * 1.1920928955078125e-7 (raw bits of A read as an integer)
  movss     xmm1, xmm0
  cvtdq2ps  xmm1, xmm1
  movss     xmm4, [rel kSSE_LOG2_F1]
  mulss     xmm1, xmm4

  ; MX.I := (VX.I and 0x007FFFFF) or 0x3F000000
  movss     xmm2, [rel kSSE_MASK_FRACTION]
  movss     xmm3, [rel kSSE_LOG2_I1]
  pand      xmm0, xmm2
  por       xmm0, xmm3 ; xmm0 = MX.S

  movss     xmm2, [rel kSSE_LOG2_F2]
  subss     xmm1, xmm2 ; Result - 124.22551499

  movss     xmm3, [rel kSSE_LOG2_F3]
  mulss     xmm3, xmm0
  subss     xmm1, xmm3 ; ... - 1.498030302 * MX.S

  movss     xmm4, [rel kSSE_LOG2_F5]
  addss     xmm0, xmm4 ; 0.3520887068 + MX.S
  movss     xmm2, [rel kSSE_LOG2_F4]
  divss     xmm2, xmm0
  subss     xmm1, xmm2 ; ... - 1.72587999 / (0.3520887068 + MX.S)

  movss     xmm0, xmm1
  ret
1636
  
1637
_fast_log2_vector2:
  ; Fast approximation of the base-2 logarithm of a 2-component vector.
  ; In:  [Param1] = two Singles (Param1 is a macro defined elsewhere in this file)
  ; Out: xmm0 lanes 0-1 = approximate Log2 of each component
  ; Scratch: xmm1-xmm4. Everything is loaded with movlps, so the upper two
  ; lanes of every register hold unspecified data.
  movlps    xmm0, [Param1]

  ; Result := VX.I * 1.1920928955078125e-7 (raw bits of A read as integers)
  movaps    xmm1, xmm0
  cvtdq2ps  xmm1, xmm1
  movlps    xmm4, [rel kSSE_LOG2_F1]
  mulps     xmm1, xmm4

  ; MX.I := (VX.I and 0x007FFFFF) or 0x3F000000
  movlps    xmm2, [rel kSSE_MASK_FRACTION]
  movlps    xmm3, [rel kSSE_LOG2_I1]
  pand      xmm0, xmm2
  por       xmm0, xmm3 ; xmm0 = MX.S

  movlps    xmm2, [rel kSSE_LOG2_F2]
  subps     xmm1, xmm2 ; Result - 124.22551499

  movlps    xmm3, [rel kSSE_LOG2_F3]
  mulps     xmm3, xmm0
  subps     xmm1, xmm3 ; ... - 1.498030302 * MX.S

  movlps    xmm4, [rel kSSE_LOG2_F5]
  addps     xmm0, xmm4 ; 0.3520887068 + MX.S
  movlps    xmm2, [rel kSSE_LOG2_F4]
  divps     xmm2, xmm0
  subps     xmm1, xmm2 ; ... - 1.72587999 / (0.3520887068 + MX.S)

  movaps    xmm0, xmm1
  ret
1663
  
1664
_fast_log2_vector3:
  ; Fast approximation of the base-2 logarithm of a 3-component vector.
  ; In:  [Param1] = three Singles (Param1 is a macro defined elsewhere in this file)
  ; Out: xmm0 = results for X, Y (plus Z and lane 3 in its upper half),
  ;      xmm1 low half = result for Z and lane 3
  ; Scratch: xmm1-xmm4.
  movq      xmm0, [Param1]
  movss     xmm1, [Param1+8] ; Z; the load clears the other lanes
  movlhps   xmm0, xmm1 ; xmm0 = (X, Y, Z, 0)

  ; Result := VX.I * 1.1920928955078125e-7 (raw bits of A read as integers)
  movaps    xmm1, xmm0
  cvtdq2ps  xmm1, xmm1
  movaps    xmm4, [rel kSSE_LOG2_F1]
  mulps     xmm1, xmm4

  ; MX.I := (VX.I and 0x007FFFFF) or 0x3F000000
  movaps    xmm2, [rel kSSE_MASK_FRACTION]
  movaps    xmm3, [rel kSSE_LOG2_I1]
  pand      xmm0, xmm2
  por       xmm0, xmm3 ; xmm0 = MX.S

  movaps    xmm2, [rel kSSE_LOG2_F2]
  subps     xmm1, xmm2 ; Result - 124.22551499

  movaps    xmm3, [rel kSSE_LOG2_F3]
  mulps     xmm3, xmm0
  subps     xmm1, xmm3 ; ... - 1.498030302 * MX.S

  movaps    xmm4, [rel kSSE_LOG2_F5]
  addps     xmm0, xmm4 ; 0.3520887068 + MX.S
  movaps    xmm2, [rel kSSE_LOG2_F4]
  divps     xmm2, xmm0
  subps     xmm1, xmm2 ; ... - 1.72587999 / (0.3520887068 + MX.S)

  movaps    xmm0, xmm1
  movhlps   xmm1, xmm1 ; upper two results -> low half of xmm1
  ret
1693
  
1694
_fast_log2_vector4:
  ; Fast approximation of the base-2 logarithm of a 4-component vector.
  ; In:  [Param1] = four Singles, read unaligned (Param1 is a macro defined
  ;      elsewhere in this file)
  ; Out: xmm0 = all four results, xmm1 low half = results for Z, W
  ; Scratch: xmm1-xmm4.
  movups    xmm0, [Param1]

  ; Result := VX.I * 1.1920928955078125e-7 (raw bits of A read as integers)
  movaps    xmm1, xmm0
  cvtdq2ps  xmm1, xmm1
  movaps    xmm4, [rel kSSE_LOG2_F1]
  mulps     xmm1, xmm4

  ; MX.I := (VX.I and 0x007FFFFF) or 0x3F000000
  movaps    xmm2, [rel kSSE_MASK_FRACTION]
  movaps    xmm3, [rel kSSE_LOG2_I1]
  pand      xmm0, xmm2
  por       xmm0, xmm3 ; xmm0 = MX.S

  movaps    xmm2, [rel kSSE_LOG2_F2]
  subps     xmm1, xmm2 ; Result - 124.22551499

  movaps    xmm3, [rel kSSE_LOG2_F3]
  mulps     xmm3, xmm0
  subps     xmm1, xmm3 ; ... - 1.498030302 * MX.S

  movaps    xmm4, [rel kSSE_LOG2_F5]
  addps     xmm0, xmm4 ; 0.3520887068 + MX.S
  movaps    xmm2, [rel kSSE_LOG2_F4]
  divps     xmm2, xmm0
  subps     xmm1, xmm2 ; ... - 1.72587999 / (0.3520887068 + MX.S)

  movaps    xmm0, xmm1
  movhlps   xmm1, xmm1 ; upper two results -> low half of xmm1
  ret
1721
  
1722
_fast_exp2_single:
  ; Fast approximation of 2^A for one Single.
  ; In:  xmm0 (lane 0) = A
  ; Out: xmm0 (lane 0) = bit pattern produced by converting
  ;      (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
  ;      to an integer, where Z = A - RoundDown(A)
  ; Scratch: ecx, xmm1-xmm5, and the OldFlags/NewFlags dwords (macros defined
  ; elsewhere in this file) used to save and rebuild MXCSR.

  ; Build an MXCSR value with rounding set to round down (toward negative infinity)
  stmxcsr   [OldFlags]
  mov       ecx, [OldFlags]
  and       ecx, SSE_ROUND_MASK
  or        ecx, SSE_ROUND_DOWN
  mov       [NewFlags], ecx

  movss     xmm3, xmm0 ; copy of A for the rounding step
  movss     xmm5, xmm0 ; copy of A for the sum
  movss     xmm1, [rel kSSE_EXP2_F1] ; the load clears the upper lanes, no pre-zeroing needed
  ldmxcsr   [NewFlags] ; everything up to the restore runs in round-down mode

  ; Z := A - RoundDown(A)
  cvtps2dq  xmm3, xmm3
  cvtdq2ps  xmm3, xmm3
  subss     xmm0, xmm3 ; xmm0 = Z

  addss     xmm1, xmm5 ; A + 121.2740575

  movss     xmm2, [rel kSSE_EXP2_F2]
  movss     xmm3, [rel kSSE_EXP2_F3]
  subss     xmm3, xmm0 ; (4.84252568 - Z)
  divss     xmm2, xmm3
  addss     xmm1, xmm2 ; A + 121.2740575 + 27.7280233 / (4.84252568 - Z)

  movss     xmm4, [rel kSSE_EXP2_F4]
  mulss     xmm0, xmm4 ; 1.49012907 * Z
  subss     xmm1, xmm0 ; ... - 1.49012907 * Z

  movss     xmm5, [rel kSSE_EXP2_F5]
  mulss     xmm1, xmm5 ; (1 shl 23) * (...)
  cvtps2dq  xmm1, xmm1 ; integer result doubles as the float bit pattern

  ; Restore rounding mode
  ldmxcsr   [OldFlags]

  movss     xmm0, xmm1
  ret
1759
  
1760
_fast_exp2_vector2:
  ; Fast approximation of 2^A for a 2-component vector.
  ; In:  [Param1] = two Singles (Param1 is a macro defined elsewhere in this file)
  ; Out: xmm0 lanes 0-1 = bit patterns produced by converting
  ;      (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
  ;      to integers, where Z = A - RoundDown(A)
  ; Scratch: ecx, xmm1-xmm5, and the OldFlags/NewFlags dwords (macros defined
  ; elsewhere in this file). Upper two lanes hold unspecified data.

  ; Build an MXCSR value with rounding set to round down (toward negative infinity)
  stmxcsr   [OldFlags]
  mov       ecx, [OldFlags]
  and       ecx, SSE_ROUND_MASK
  or        ecx, SSE_ROUND_DOWN
  mov       [NewFlags], ecx

  movlps    xmm0, [Param1]
  movaps    xmm3, xmm0 ; copy of A for the rounding step
  movaps    xmm5, xmm0 ; copy of A for the sum
  xorps     xmm1, xmm1 ; movlps below only writes the low half
  movlps    xmm1, [rel kSSE_EXP2_F1]
  ldmxcsr   [NewFlags] ; everything up to the restore runs in round-down mode

  ; Z := A - RoundDown(A)
  cvtps2dq  xmm3, xmm3
  cvtdq2ps  xmm3, xmm3
  subps     xmm0, xmm3 ; xmm0 = Z

  addps     xmm1, xmm5 ; A + 121.2740575

  movlps    xmm2, [rel kSSE_EXP2_F2]
  movlps    xmm3, [rel kSSE_EXP2_F3]
  subps     xmm3, xmm0 ; (4.84252568 - Z)
  divps     xmm2, xmm3
  addps     xmm1, xmm2 ; A + 121.2740575 + 27.7280233 / (4.84252568 - Z)

  movlps    xmm4, [rel kSSE_EXP2_F4]
  mulps     xmm0, xmm4 ; 1.49012907 * Z
  subps     xmm1, xmm0 ; ... - 1.49012907 * Z

  movlps    xmm5, [rel kSSE_EXP2_F5]
  mulps     xmm1, xmm5 ; (1 shl 23) * (...)
  cvtps2dq  xmm1, xmm1 ; integer results double as the float bit patterns

  ; Restore rounding mode
  ldmxcsr   [OldFlags]

  movaps    xmm0, xmm1
  ret
1798
  
1799
_fast_exp2_vector3:
  ; Fast approximation of 2^A for a 3-component vector.
  ; In:  [Param1] = three Singles (Param1 is a macro defined elsewhere in this file)
  ; Out: xmm0 = results for X, Y (plus Z and lane 3 in its upper half),
  ;      xmm1 low half = result for Z and lane 3. Each result is the bit pattern of
  ;      (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
  ;      converted to an integer, where Z = A - RoundDown(A)
  ; Scratch: edx, xmm1-xmm5, and the OldFlags/NewFlags dwords (macros defined
  ; elsewhere in this file).

  ; Build an MXCSR value with rounding set to round down (toward negative infinity)
  stmxcsr   [OldFlags]
  mov       edx, [OldFlags]
  and       edx, SSE_ROUND_MASK
  or        edx, SSE_ROUND_DOWN
  mov       [NewFlags], edx

  movq      xmm0, [Param1]
  movss     xmm1, [Param1+8] ; Z; the load clears the other lanes
  movlhps   xmm0, xmm1 ; xmm0 = (X, Y, Z, 0)
  movaps    xmm3, xmm0 ; copy of A for the rounding step
  movaps    xmm5, xmm0 ; copy of A for the sum
  movaps    xmm1, [rel kSSE_EXP2_F1] ; full overwrite, no pre-zeroing needed
  ldmxcsr   [NewFlags] ; everything up to the restore runs in round-down mode

  ; Z := A - RoundDown(A)
  cvtps2dq  xmm3, xmm3
  cvtdq2ps  xmm3, xmm3
  subps     xmm0, xmm3 ; xmm0 = Z

  addps     xmm1, xmm5 ; A + 121.2740575

  movaps    xmm2, [rel kSSE_EXP2_F2]
  movaps    xmm3, [rel kSSE_EXP2_F3]
  subps     xmm3, xmm0 ; (4.84252568 - Z)
  divps     xmm2, xmm3
  addps     xmm1, xmm2 ; A + 121.2740575 + 27.7280233 / (4.84252568 - Z)

  movaps    xmm4, [rel kSSE_EXP2_F4]
  mulps     xmm0, xmm4 ; 1.49012907 * Z
  subps     xmm1, xmm0 ; ... - 1.49012907 * Z

  movaps    xmm5, [rel kSSE_EXP2_F5]
  mulps     xmm1, xmm5 ; (1 shl 23) * (...)
  cvtps2dq  xmm1, xmm1 ; integer results double as the float bit patterns

  ; Restore rounding mode
  ldmxcsr   [OldFlags]

  movaps    xmm0, xmm1
  movhlps   xmm1, xmm1 ; upper two results -> low half of xmm1
  ret
1840
  
1841
_fast_exp2_vector4:
  ; Fast approximation of 2^A for a 4-component vector.
  ; In:  [Param1] = four Singles, read unaligned (Param1 is a macro defined
  ;      elsewhere in this file)
  ; Out: xmm0 = all four results, xmm1 low half = results for Z, W. Each result
  ;      is the bit pattern of
  ;      (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
  ;      converted to an integer, where Z = A - RoundDown(A)
  ; Scratch: edx, xmm1-xmm5, and the OldFlags/NewFlags dwords (macros defined
  ; elsewhere in this file).

  ; Build an MXCSR value with rounding set to round down (toward negative infinity)
  stmxcsr   [OldFlags]
  mov       edx, [OldFlags]
  and       edx, SSE_ROUND_MASK
  or        edx, SSE_ROUND_DOWN
  mov       [NewFlags], edx

  movups    xmm0, [Param1]
  movaps    xmm3, xmm0 ; copy of A for the rounding step
  movaps    xmm5, xmm0 ; copy of A for the sum
  movaps    xmm1, [rel kSSE_EXP2_F1] ; full overwrite, no pre-zeroing needed
  ldmxcsr   [NewFlags] ; everything up to the restore runs in round-down mode

  ; Z := A - RoundDown(A)
  cvtps2dq  xmm3, xmm3
  cvtdq2ps  xmm3, xmm3
  subps     xmm0, xmm3 ; xmm0 = Z

  addps     xmm1, xmm5 ; A + 121.2740575

  movaps    xmm2, [rel kSSE_EXP2_F2]
  movaps    xmm3, [rel kSSE_EXP2_F3]
  subps     xmm3, xmm0 ; (4.84252568 - Z)
  divps     xmm2, xmm3
  addps     xmm1, xmm2 ; A + 121.2740575 + 27.7280233 / (4.84252568 - Z)

  movaps    xmm4, [rel kSSE_EXP2_F4]
  mulps     xmm0, xmm4 ; 1.49012907 * Z
  subps     xmm1, xmm0 ; ... - 1.49012907 * Z

  movaps    xmm5, [rel kSSE_EXP2_F5]
  mulps     xmm1, xmm5 ; (1 shl 23) * (...)
  cvtps2dq  xmm1, xmm1 ; integer results double as the float bit patterns

  ; Restore rounding mode
  ldmxcsr   [OldFlags]

  movaps    xmm0, xmm1
  movhlps   xmm1, xmm1 ; upper two results -> low half of xmm1
  ret
1880
  
1881
;****************************************************************************
1882
; Common Functions
1883
;****************************************************************************
1884
  
1885
_abs_vector3:
  ; Component-wise absolute value of a 3-component vector.
  ; In:  [Param1] = three Singles (Param1 is a macro defined elsewhere in this file)
  ; Out: xmm0 = |X|, |Y| ; xmm1 lane 0 = |Z|
  ; Scratch: xmm2.
  movaps    xmm2, [rel kSSE_MASK_ABS_VAL]
  movq      xmm0, [Param1] ; X, Y
  andps     xmm0, xmm2
  movss     xmm1, [Param1+8] ; Z
  pand      xmm1, xmm2
  ret
1892
  
1893
_abs_vector4:
  ; Component-wise absolute value of a 4-component vector.
  ; In:  [Param1] = four Singles, read unaligned (Param1 is a macro defined
  ;      elsewhere in this file)
  ; Out: xmm0 = all four results, xmm1 low half = |Z|, |W|
  movaps    xmm1, [rel kSSE_MASK_ABS_VAL]
  movups    xmm0, [Param1]
  andps     xmm0, xmm1 ; apply the mask to every lane
  movhlps   xmm1, xmm0 ; upper two results -> low half of xmm1
  ret
1899
  
1900
_sign_single:
  ; Sign of one Single: -1, 0 or 1.
  ; In:  xmm0 (lane 0) = A
  ; Out: xmm0 (lane 0) = 0 when A compares equal to 0, otherwise 1.0 carrying A's sign bit
  ; Scratch: xmm1-xmm4. Only lane 0 is meaningful.

  ; Non-zero mask
  movss     xmm2, xmm0
  xorps     xmm4, xmm4
  cmpneqss  xmm2, xmm4 ; (A = 0)? Yes: 0x00000000, No: 0xFFFFFFFF

  ; +/-1 carrying the sign bit of A
  movss     xmm3, [rel kSSE_MASK_SIGN]
  andps     xmm0, xmm3 ; (A < 0)? Yes: 0x80000000, No: 0x00000000
  movss     xmm1, [rel kSSE_ONE]
  orps      xmm0, xmm1 ; (A < 0)? Yes: -1, No: 1

  andps     xmm0, xmm2 ; (A = 0)? Yes: 0, No: -1 or 1
  ret
1911
  
1912
_sign_vector2:
  ; Component-wise sign (-1, 0 or 1) of a 2-component vector.
  ; In:  [Param1] = two Singles (Param1 is a macro defined elsewhere in this file)
  ; Out: xmm0 lanes 0-1 = sign of each component
  ; Scratch: xmm1-xmm4. Upper two lanes hold unspecified data.
  movlps    xmm0, [Param1]

  ; Non-zero mask
  movaps    xmm2, xmm0
  xorps     xmm4, xmm4
  cmpneqps  xmm2, xmm4 ; (A = 0)? Yes: 0x00000000, No: 0xFFFFFFFF

  ; +/-1 carrying the sign bit of A
  movlps    xmm3, [rel kSSE_MASK_SIGN]
  andps     xmm0, xmm3 ; (A < 0)? Yes: 0x80000000, No: 0x00000000
  movlps    xmm1, [rel kSSE_ONE]
  orps      xmm0, xmm1 ; (A < 0)? Yes: -1, No: 1

  andps     xmm0, xmm2 ; (A = 0)? Yes: 0, No: -1 or 1
  ret
1924
  
1925
_sign_vector3:
  ; Component-wise sign (-1, 0 or 1) of a 3-component vector.
  ; In:  [Param1] = three Singles (Param1 is a macro defined elsewhere in this file)
  ; Out: xmm0 = results for X, Y (plus Z and lane 3 in its upper half),
  ;      xmm1 low half = result for Z and lane 3
  ; Scratch: xmm1-xmm4.
  movq      xmm0, [Param1]
  movss     xmm1, [Param1+8] ; Z; the load clears the other lanes
  movlhps   xmm0, xmm1 ; xmm0 = (X, Y, Z, 0)

  ; Non-zero mask
  movaps    xmm2, xmm0
  xorps     xmm4, xmm4
  cmpneqps  xmm2, xmm4 ; (A = 0)? Yes: 0x00000000, No: 0xFFFFFFFF

  ; +/-1 carrying the sign bit of A
  movaps    xmm3, [rel kSSE_MASK_SIGN]
  andps     xmm0, xmm3 ; (A < 0)? Yes: 0x80000000, No: 0x00000000
  movaps    xmm1, [rel kSSE_ONE]
  orps      xmm0, xmm1 ; (A < 0)? Yes: -1, No: 1

  andps     xmm0, xmm2 ; (A = 0)? Yes: 0, No: -1 or 1
  movhlps   xmm1, xmm0 ; upper two results -> low half of xmm1
  ret
1940
  
1941
_sign_vector4:
  ; Component-wise sign (-1, 0 or 1) of a 4-component vector.
  ; In:  [Param1] = four Singles, read unaligned (Param1 is a macro defined
  ;      elsewhere in this file)
  ; Out: xmm0 = all four results, xmm1 low half = results for Z, W
  ; Scratch: xmm1-xmm4.
  movups    xmm0, [Param1]

  ; Non-zero mask
  movaps    xmm2, xmm0
  xorps     xmm4, xmm4
  cmpneqps  xmm2, xmm4 ; (A = 0)? Yes: 0x00000000, No: 0xFFFFFFFF

  ; +/-1 carrying the sign bit of A
  movaps    xmm3, [rel kSSE_MASK_SIGN]
  andps     xmm0, xmm3 ; (A < 0)? Yes: 0x80000000, No: 0x00000000
  movaps    xmm1, [rel kSSE_ONE]
  orps      xmm0, xmm1 ; (A < 0)? Yes: -1, No: 1

  andps     xmm0, xmm2 ; (A = 0)? Yes: 0, No: -1 or 1
  movhlps   xmm1, xmm0 ; upper two results -> low half of xmm1
  ret
1954
  
1955
_floor_single:
  ; Floor of one Single, returned as an integer.
  ; In:  xmm0 (lane 0) = A
  ; Out: rax = A converted to a 64-bit integer with rounding toward negative infinity
  ; Scratch: the OldFlags/NewFlags dwords (macros defined elsewhere in this file).

  ; Save MXCSR and build a copy with the rounding field set to round down
  stmxcsr   [OldFlags]
  mov       eax, [OldFlags]
  and       eax, SSE_ROUND_MASK
  or        eax, SSE_ROUND_DOWN
  mov       [NewFlags], eax
  ldmxcsr   [NewFlags]

  ; cvtss2si honours the MXCSR rounding mode
  cvtss2si  rax, xmm0

  ; Put the caller's MXCSR back
  ldmxcsr   [OldFlags]
  ret
1969

1970
 _floor_vector2:
  ; Component-wise floor of a 2-component vector, returned as integers.
  ; In:  [Param1] = two Singles (Param1 is a macro defined elsewhere in this file)
  ; Out: rax = two packed 32-bit integers (X in the low dword, Y in the high dword)
  ; Scratch: xmm0 and the OldFlags/NewFlags dwords (macros defined elsewhere
  ; in this file).
  movlps    xmm0, [Param1]

  ; Save MXCSR and build a copy with the rounding field set to round down
  stmxcsr   [OldFlags]
  mov       eax, [OldFlags]
  and       eax, SSE_ROUND_MASK
  or        eax, SSE_ROUND_DOWN
  mov       [NewFlags], eax
  ldmxcsr   [NewFlags]

  ; cvtps2dq honours the MXCSR rounding mode
  cvtps2dq  xmm0, xmm0

  ; Put the caller's MXCSR back
  ldmxcsr   [OldFlags]

  movq      rax, xmm0
  ret
1987
  
1988
_floor_vector3:
  ; Component-wise floor of a 3-component vector, returned as integers.
  ; In:  [Param1] = three Singles (Param1 is a macro defined elsewhere in this file)
  ; Out: rax = X, Y as packed 32-bit integers; rdx = Z in the low dword
  ;      (the high dword is the converted zero lane)
  ; Scratch: xmm0, xmm1 and the OldFlags/NewFlags dwords (macros defined
  ; elsewhere in this file).
  movq      xmm0, [Param1]
  movss     xmm1, [Param1+8] ; Z; the load clears the other lanes
  movlhps   xmm0, xmm1 ; xmm0 = (X, Y, Z, 0)

  ; Save MXCSR and build a copy with the rounding field set to round down
  stmxcsr   [OldFlags]
  mov       eax, [OldFlags]
  and       eax, SSE_ROUND_MASK
  or        eax, SSE_ROUND_DOWN
  mov       [NewFlags], eax
  ldmxcsr   [NewFlags]

  ; cvtps2dq honours the MXCSR rounding mode
  cvtps2dq  xmm0, xmm0

  ; Put the caller's MXCSR back
  ldmxcsr   [OldFlags]

  movq      rax, xmm0 ; lanes 0-1
  movhlps   xmm1, xmm0
  movq      rdx, xmm1 ; lanes 2-3
  ret
2009
  
2010
_floor_vector4:
  ; Component-wise floor of a 4-component vector, returned as integers.
  ; In:  [Param1] = four Singles, read unaligned (Param1 is a macro defined
  ;      elsewhere in this file)
  ; Out: rax = X, Y and rdx = Z, W as packed 32-bit integers
  ; Scratch: xmm0, xmm1 and the OldFlags/NewFlags dwords (macros defined
  ; elsewhere in this file).
  movups    xmm0, [Param1]

  ; Save MXCSR and build a copy with the rounding field set to round down
  stmxcsr   [OldFlags]
  mov       eax, [OldFlags]
  and       eax, SSE_ROUND_MASK
  or        eax, SSE_ROUND_DOWN
  mov       [NewFlags], eax
  ldmxcsr   [NewFlags]

  ; cvtps2dq honours the MXCSR rounding mode
  cvtps2dq  xmm0, xmm0

  ; Put the caller's MXCSR back
  ldmxcsr   [OldFlags]

  movq      rax, xmm0 ; lanes 0-1
  movhlps   xmm1, xmm0
  movq      rdx, xmm1 ; lanes 2-3
  ret
2029
  
2030
_trunc_single:
  ; Truncation (round toward zero) of one Single, returned as an integer.
  ; In:  xmm0 (lane 0) = A
  ; Out: rax = A converted to a 64-bit integer with truncation
  ;
  ; cvttss2si always truncates, whatever rounding mode MXCSR selects, so
  ; MXCSR does not have to be saved, modified and restored.
  cvttss2si rax, xmm0
  ret
2044
  
2045
_trunc_vector2:
  ; Component-wise truncation (round toward zero) of a 2-component vector.
  ; In:  [Param1] = two Singles (Param1 is a macro defined elsewhere in this file)
  ; Out: rax = two packed 32-bit integers (X in the low dword, Y in the high dword)
  ; Scratch: xmm0.
  ;
  ; cvttps2dq always truncates, whatever rounding mode MXCSR selects, so
  ; MXCSR does not have to be saved, modified and restored.
  movlps    xmm0, [Param1]
  cvttps2dq xmm0, xmm0
  movq      rax, xmm0
  ret
2062
  
2063
_trunc_vector3:
  ; Component-wise truncation (round toward zero) of a 3-component vector.
  ; In:  [Param1] = three Singles (Param1 is a macro defined elsewhere in this file)
  ; Out: rax = X, Y as packed 32-bit integers; rdx = Z in the low dword
  ;      (the high dword is the converted zero lane)
  ; Scratch: xmm0, xmm1.
  ;
  ; cvttps2dq always truncates, whatever rounding mode MXCSR selects, so
  ; MXCSR does not have to be saved, modified and restored.
  movq      xmm0, [Param1]
  movss     xmm1, [Param1+8] ; Z; the load clears the other lanes
  movlhps   xmm0, xmm1 ; xmm0 = (X, Y, Z, 0)
  cvttps2dq xmm0, xmm0

  movhlps   xmm1, xmm0
  movq      rax, xmm0 ; lanes 0-1
  movq      rdx, xmm1 ; lanes 2-3
  ret
2084
  
2085
_trunc_vector4:
  ; Component-wise truncation (round toward zero) of a 4-component vector.
  ; In:  [Param1] = four Singles, read unaligned (Param1 is a macro defined
  ;      elsewhere in this file)
  ; Out: rax = X, Y and rdx = Z, W as packed 32-bit integers
  ; Scratch: xmm0, xmm1.
  ;
  ; cvttps2dq always truncates, whatever rounding mode MXCSR selects, so
  ; MXCSR does not have to be saved, modified and restored.
  movups    xmm0, [Param1]
  cvttps2dq xmm0, xmm0

  movhlps   xmm1, xmm0
  movq      rax, xmm0 ; lanes 0-1
  movq      rdx, xmm1 ; lanes 2-3
  ret
2104
  
2105
_round_single:
  ; Rounds one Single to an integer.
  ; In:  xmm0 (lane 0) = A
  ; Out: rax = A converted to a 64-bit integer
  ; The conversion uses whatever rounding mode MXCSR currently selects; this
  ; routine relies on that being the default, round-to-nearest.
  cvtss2si  rax, xmm0
  ret
2109
  
2110
_round_vector2:
  ; Component-wise rounding of a 2-component vector to integers.
  ; In:  [Param1] = two Singles (Param1 is a macro defined elsewhere in this file)
  ; Out: rax = two packed 32-bit integers (X in the low dword, Y in the high dword)
  ; The conversion uses whatever rounding mode MXCSR currently selects; this
  ; routine relies on that being the default, round-to-nearest.
  movlps    xmm0, [Param1]
  cvtps2dq  xmm0, xmm0
  movq      rax, xmm0
  ret
2116
  
2117
_round_vector3:
  ; Component-wise rounding of a 3-component vector to integers.
  ; In:  [Param1] = three Singles (Param1 is a macro defined elsewhere in this file)
  ; Out: rax = X, Y as packed 32-bit integers; rdx = Z in the low dword
  ;      (the high dword is the converted zero lane)
  ; The conversion uses whatever rounding mode MXCSR currently selects; this
  ; routine relies on that being the default, round-to-nearest.
  movss     xmm1, [Param1+8] ; Z; the load clears the other lanes
  movq      xmm0, [Param1]
  movlhps   xmm0, xmm1 ; xmm0 = (X, Y, Z, 0)
  cvtps2dq  xmm0, xmm0

  movq      rax, xmm0 ; lanes 0-1
  movhlps   xmm1, xmm0
  movq      rdx, xmm1 ; lanes 2-3
  ret
2127
  
2128
_round_vector4:
  ; Component-wise rounding of a 4-component vector to integers.
  ; In:  [Param1] = four Singles, read unaligned (Param1 is a macro defined
  ;      elsewhere in this file)
  ; Out: rax = X, Y and rdx = Z, W as packed 32-bit integers
  ; The conversion uses whatever rounding mode MXCSR currently selects; this
  ; routine relies on that being the default, round-to-nearest.
  movups    xmm0, [Param1]
  cvtps2dq  xmm0, xmm0

  movq      rax, xmm0 ; lanes 0-1
  movhlps   xmm1, xmm0
  movq      rdx, xmm1 ; lanes 2-3
  ret
2136
  
2137
_ceil_single:
  ; Ceiling of one Single, returned as an integer.
  ; In:  xmm0 (lane 0) = A
  ; Out: rax = A converted to a 64-bit integer with rounding toward positive infinity
  ; Scratch: the OldFlags/NewFlags dwords (macros defined elsewhere in this file).

  ; Save MXCSR and build a copy with the rounding field set to round up
  stmxcsr   [OldFlags]
  mov       eax, [OldFlags]
  and       eax, SSE_ROUND_MASK
  or        eax, SSE_ROUND_UP
  mov       [NewFlags], eax
  ldmxcsr   [NewFlags]

  ; cvtss2si honours the MXCSR rounding mode
  cvtss2si  rax, xmm0

  ; Put the caller's MXCSR back
  ldmxcsr   [OldFlags]
  ret
2151
  
2152
_ceil_vector2:
  ; Component-wise ceiling of a 2-component vector, returned as integers.
  ; In:  [Param1] = two Singles (Param1 is a macro defined elsewhere in this file)
  ; Out: rax = two packed 32-bit integers (X in the low dword, Y in the high dword)
  ; Scratch: xmm0 and the OldFlags/NewFlags dwords (macros defined elsewhere
  ; in this file).
  movlps    xmm0, [Param1]

  ; Save MXCSR and build a copy with the rounding field set to round up
  stmxcsr   [OldFlags]
  mov       eax, [OldFlags]
  and       eax, SSE_ROUND_MASK
  or        eax, SSE_ROUND_UP
  mov       [NewFlags], eax
  ldmxcsr   [NewFlags]

  ; cvtps2dq honours the MXCSR rounding mode
  cvtps2dq  xmm0, xmm0

  ; Put the caller's MXCSR back
  ldmxcsr   [OldFlags]

  movq      rax, xmm0
  ret
2169
  
2170
_ceil_vector3:
  ; Component-wise ceiling of a 3-component vector, returned as integers.
  ; In:  [Param1] = three Singles (Param1 is a macro defined elsewhere in this file)
  ; Out: rax = X, Y as packed 32-bit integers; rdx = Z in the low dword
  ;      (the high dword is the converted zero lane)
  ; Scratch: xmm0, xmm1 and the OldFlags/NewFlags dwords (macros defined
  ; elsewhere in this file).
  movq      xmm0, [Param1]
  movss     xmm1, [Param1+8] ; Z; the load clears the other lanes
  movlhps   xmm0, xmm1 ; xmm0 = (X, Y, Z, 0)

  ; Save MXCSR and build a copy with the rounding field set to round up
  stmxcsr   [OldFlags]
  mov       eax, [OldFlags]
  and       eax, SSE_ROUND_MASK
  or        eax, SSE_ROUND_UP
  mov       [NewFlags], eax
  ldmxcsr   [NewFlags]

  ; cvtps2dq honours the MXCSR rounding mode
  cvtps2dq  xmm0, xmm0

  ; Put the caller's MXCSR back
  ldmxcsr   [OldFlags]

  movq      rax, xmm0 ; lanes 0-1
  movhlps   xmm1, xmm0
  movq      rdx, xmm1 ; lanes 2-3
  ret
2191
  
2192
_ceil_vector4:
  ; Component-wise ceiling of a 4-component vector, returned as integers.
  ; In:  [Param1] = four Singles, read unaligned (Param1 is a macro defined
  ;      elsewhere in this file)
  ; Out: rax = X, Y and rdx = Z, W as packed 32-bit integers
  ; Scratch: xmm0, xmm1 and the OldFlags/NewFlags dwords (macros defined
  ; elsewhere in this file).
  movups    xmm0, [Param1]

  ; Save MXCSR and build a copy with the rounding field set to round up
  stmxcsr   [OldFlags]
  mov       eax, [OldFlags]
  and       eax, SSE_ROUND_MASK
  or        eax, SSE_ROUND_UP
  mov       [NewFlags], eax
  ldmxcsr   [NewFlags]

  ; cvtps2dq honours the MXCSR rounding mode
  cvtps2dq  xmm0, xmm0

  ; Put the caller's MXCSR back
  ldmxcsr   [OldFlags]

  movq      rax, xmm0 ; lanes 0-1
  movhlps   xmm1, xmm0
  movq      rdx, xmm1 ; lanes 2-3
  ret
2211
  
2212
_frac_vector2:
  ; Component-wise fractional part, A - Trunc(A), of a 2-component vector.
  ; In:  [Param1] = two Singles (Param1 is a macro defined elsewhere in this file)
  ; Out: xmm0 lanes 0-1 = fractional parts (upper lanes unspecified)
  ; Scratch: xmm1.
  ;
  ; cvttps2dq always truncates, whatever rounding mode MXCSR selects, so
  ; MXCSR does not have to be saved, modified and restored. Trunc goes
  ; through a 32-bit integer, so the result is only meaningful while A fits
  ; in that range.
  movlps    xmm0, [Param1]
  movaps    xmm1, xmm0 ; keep A
  cvttps2dq xmm0, xmm0
  cvtdq2ps  xmm0, xmm0 ; Trunc(A) as float
  subps     xmm1, xmm0 ; A - Trunc(A)

  movaps    xmm0, xmm1
  ret
2230
  
2231
_frac_vector3:
  ; Component-wise fractional part, A - Trunc(A), of a 3-component vector.
  ; In:  [Param1] = three Singles (Param1 is a macro defined elsewhere in this file)
  ; Out: xmm0 = results for X, Y (plus Z and lane 3 in its upper half),
  ;      xmm1 low half = result for Z and lane 3
  ;
  ; cvttps2dq always truncates, whatever rounding mode MXCSR selects, so
  ; MXCSR does not have to be saved, modified and restored. Trunc goes
  ; through a 32-bit integer, so the result is only meaningful while A fits
  ; in that range.
  movq      xmm0, [Param1]
  movss     xmm1, [Param1+8] ; Z; the load clears the other lanes
  movlhps   xmm0, xmm1 ; xmm0 = (X, Y, Z, 0)
  movaps    xmm1, xmm0 ; keep A
  cvttps2dq xmm0, xmm0
  cvtdq2ps  xmm0, xmm0 ; Trunc(A) as float
  subps     xmm1, xmm0 ; A - Trunc(A)

  movaps    xmm0, xmm1
  movhlps   xmm1, xmm1 ; upper two results -> low half of xmm1
  ret
2252
  
2253
_frac_vector4:
  ; Component-wise fractional part, A - Trunc(A), of a 4-component vector.
  ; In:  [Param1] = four Singles, read unaligned (Param1 is a macro defined
  ;      elsewhere in this file)
  ; Out: xmm0 = all four results, xmm1 low half = results for Z, W
  ;
  ; cvttps2dq always truncates, whatever rounding mode MXCSR selects, so
  ; MXCSR does not have to be saved, modified and restored. Trunc goes
  ; through a 32-bit integer, so the result is only meaningful while A fits
  ; in that range.
  movups    xmm0, [Param1]
  movaps    xmm1, xmm0 ; keep A
  cvttps2dq xmm0, xmm0
  cvtdq2ps  xmm0, xmm0 ; Trunc(A) as float
  subps     xmm1, xmm0 ; A - Trunc(A)

  movaps    xmm0, xmm1
  movhlps   xmm1, xmm1 ; upper two results -> low half of xmm1
  ret
2272
  
2273
_fmod_vector2_single:
  ; Component-wise remainder A - (B * Trunc(A / B)) of a 2-component vector
  ; A and a scalar B.
  ; In:  [Param1] = A, two Singles (Param1 is a macro defined elsewhere in
  ;      this file); xmm0 (lane 0) = B
  ; Out: xmm0 lanes 0-1 = remainders (upper lanes unspecified)
  ; Scratch: xmm1, xmm2.
  ;
  ; cvttps2dq truncates regardless of MXCSR, so the rounding mode is never
  ; switched and the multiply/subtract run in the caller's rounding mode
  ; instead of being rounded toward zero.
  movss     xmm1, xmm0
  shufps    xmm1, xmm1, 0x00 ; Replicate B
  movlps    xmm0, [Param1]

  movaps    xmm2, xmm0
  divps     xmm2, xmm1 ; A / B
  cvttps2dq xmm2, xmm2
  cvtdq2ps  xmm2, xmm2 ; Trunc(A / B)
  mulps     xmm2, xmm1
  subps     xmm0, xmm2 ; A - (B * Trunc(A / B))
  ret
2296
  
2297
_fmod_vector3_single:
  ; Component-wise remainder A - (B * Trunc(A / B)) of a 3-component vector
  ; A and a scalar B.
  ; In:  [Param1] = A, three Singles (Param1 is a macro defined elsewhere in
  ;      this file); xmm0 (lane 0) = B
  ; Out: xmm0 = results for X, Y (plus Z and lane 3 in its upper half),
  ;      xmm1 low half = result for Z and lane 3
  ; Scratch: xmm2.
  ;
  ; cvttps2dq truncates regardless of MXCSR, so the rounding mode is never
  ; switched and the multiply/subtract run in the caller's rounding mode
  ; instead of being rounded toward zero.
  movss     xmm1, xmm0
  shufps    xmm1, xmm1, 0x00 ; Replicate B
  movq      xmm0, [Param1]
  movss     xmm2, [Param1+8] ; Z; the load clears the other lanes
  movlhps   xmm0, xmm2 ; xmm0 = (X, Y, Z, 0)

  movaps    xmm2, xmm0
  divps     xmm2, xmm1 ; A / B
  cvttps2dq xmm2, xmm2
  cvtdq2ps  xmm2, xmm2 ; Trunc(A / B)
  mulps     xmm2, xmm1
  subps     xmm0, xmm2 ; A - (B * Trunc(A / B))

  movhlps   xmm1, xmm0 ; upper two results -> low half of xmm1
  ret
2324
  
2325
_fmod_vector4_single:
  ; Component-wise remainder A - (B * Trunc(A / B)) of a 4-component vector
  ; A and a scalar B.
  ; In:  [Param1] = A, four Singles, read unaligned (Param1 is a macro
  ;      defined elsewhere in this file); xmm0 (lane 0) = B
  ; Out: xmm0 = all four results, xmm1 low half = results for Z, W
  ; Scratch: xmm2.
  ;
  ; cvttps2dq truncates regardless of MXCSR, so the rounding mode is never
  ; switched and the multiply/subtract run in the caller's rounding mode
  ; instead of being rounded toward zero.
  movss     xmm1, xmm0
  shufps    xmm1, xmm1, 0x00 ; Replicate B
  movups    xmm0, [Param1]

  movaps    xmm2, xmm0
  divps     xmm2, xmm1 ; A / B
  cvttps2dq xmm2, xmm2
  cvtdq2ps  xmm2, xmm2 ; Trunc(A / B)
  mulps     xmm2, xmm1
  subps     xmm0, xmm2 ; A - (B * Trunc(A / B))

  movhlps   xmm1, xmm0 ; upper two results -> low half of xmm1
  ret
2350
  
2351
_fmod_vector2:
  ; Component-wise remainder A - (B * Trunc(A / B)) of two 2-component vectors.
  ; In:  [Param1] = A, [Param2] = B, two Singles each (Param1/Param2 are
  ;      macros defined elsewhere in this file)
  ; Out: xmm0 lanes 0-1 = remainders (upper lanes unspecified)
  ; Scratch: xmm1, xmm2.
  ;
  ; cvttps2dq truncates regardless of MXCSR, so the rounding mode is never
  ; switched and the multiply/subtract run in the caller's rounding mode
  ; instead of being rounded toward zero.
  movlps    xmm0, [Param1]
  movlps    xmm1, [Param2]

  movaps    xmm2, xmm0
  divps     xmm2, xmm1 ; A / B
  cvttps2dq xmm2, xmm2
  cvtdq2ps  xmm2, xmm2 ; Trunc(A / B)
  mulps     xmm2, xmm1
  subps     xmm0, xmm2 ; A - (B * Trunc(A / B))
  ret
2373
  
2374
_fmod_vector3:
  ; Component-wise remainder A - (B * Trunc(A / B)) of two 3-component vectors.
  ; In:  [Param1] = A, [Param2] = B, three Singles each (Param1/Param2 are
  ;      macros defined elsewhere in this file)
  ; Out: xmm0 = results for X, Y (plus Z and lane 3 in its upper half),
  ;      xmm1 low half = result for Z and lane 3
  ; Scratch: xmm2.
  ;
  ; cvttps2dq truncates regardless of MXCSR, so the rounding mode is never
  ; switched and the multiply/subtract run in the caller's rounding mode
  ; instead of being rounded toward zero.
  movq      xmm0, [Param1]
  movss     xmm1, [Param1+8] ; A.Z; the load clears the other lanes
  movlhps   xmm0, xmm1 ; xmm0 = A = (X, Y, Z, 0)
  movq      xmm1, [Param2]
  movss     xmm2, [Param2+8] ; B.Z
  movlhps   xmm1, xmm2 ; xmm1 = B = (X, Y, Z, 0)

  movaps    xmm2, xmm0
  divps     xmm2, xmm1 ; A / B
  cvttps2dq xmm2, xmm2
  cvtdq2ps  xmm2, xmm2 ; Trunc(A / B)
  mulps     xmm2, xmm1
  subps     xmm0, xmm2 ; A - (B * Trunc(A / B))

  movhlps   xmm1, xmm0 ; upper two results -> low half of xmm1
  ret
2402
  
2403
_fmod_vector4:
  ; Component-wise remainder A - (B * Trunc(A / B)) of two 4-component vectors.
  ; In:  [Param1] = A, [Param2] = B, four Singles each, read unaligned
  ;      (Param1/Param2 are macros defined elsewhere in this file)
  ; Out: xmm0 = all four results, xmm1 low half = results for Z, W
  ; Scratch: xmm2.
  ;
  ; cvttps2dq truncates regardless of MXCSR, so the rounding mode is never
  ; switched and the multiply/subtract run in the caller's rounding mode
  ; instead of being rounded toward zero.
  movups    xmm0, [Param1]
  movups    xmm1, [Param2]

  movaps    xmm2, xmm0
  divps     xmm2, xmm1 ; A / B
  cvttps2dq xmm2, xmm2
  cvtdq2ps  xmm2, xmm2 ; Trunc(A / B)
  mulps     xmm2, xmm1
  subps     xmm0, xmm2 ; A - (B * Trunc(A / B))

  movhlps   xmm1, xmm0 ; upper two results -> low half of xmm1
  ret
2427
  
2428
_modf_vector2:
  ; Splits a 2-component vector into integer and fractional parts.
  ; In:  [Param1] = A, two Singles; [Param2] = destination for the integer
  ;      parts (Param1/Param2 are macros defined elsewhere in this file)
  ; Out: [Param2] = Trunc(A) stored as two 32-bit integers
  ;      xmm0 lanes 0-1 = A - Trunc(A) (upper lanes unspecified)
  ; Scratch: xmm1.
  ;
  ; cvttps2dq always truncates, whatever rounding mode MXCSR selects, so
  ; MXCSR does not have to be saved, modified and restored.
  movlps    xmm0, [Param1]
  movaps    xmm1, xmm0 ; keep A
  cvttps2dq xmm0, xmm0
  movlps    [Param2], xmm0  ; B = Trunc(A)
  cvtdq2ps  xmm0, xmm0
  subps     xmm1, xmm0 ; A - Trunc(A)

  movaps    xmm0, xmm1
  ret
2450
  
2451
_modf_vector3:
  ; Splits a 3-component vector into integer and fractional parts.
  ; In:  [Param1] = A, three Singles; [Param2] = destination for the integer
  ;      parts (Param1/Param2 are macros defined elsewhere in this file)
  ; Out: [Param2] = Trunc(A) stored as three 32-bit integers (12 bytes)
  ;      xmm0 = fractional parts of X, Y (plus Z and lane 3 in its upper half),
  ;      xmm1 low half = fractional part of Z and lane 3
  ; Scratch: xmm2.
  ;
  ; cvttps2dq always truncates, whatever rounding mode MXCSR selects, so
  ; MXCSR does not have to be saved, modified and restored.
  movq      xmm0, [Param1]
  movss     xmm1, [Param1+8] ; Z; the load clears the other lanes
  movlhps   xmm0, xmm1 ; xmm0 = (X, Y, Z, 0)

  movaps    xmm1, xmm0 ; keep A
  cvttps2dq xmm0, xmm0
  movhlps   xmm2, xmm0 ; integer Z -> lane 0 of xmm2
  movq      [Param2], xmm0  ; B = Trunc(A)
  movd      [Param2+8], xmm2
  cvtdq2ps  xmm0, xmm0
  subps     xmm1, xmm0 ; A - Trunc(A)

  movaps    xmm0, xmm1
  movhlps   xmm1, xmm1 ; upper two results -> low half of xmm1
  ret
2478
  
2479
; ModF for 4-float vectors: stores Trunc(A) as four 32-bit integers at
; [Param2] and returns A - Trunc(A). A is read from [Param1].
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_modf_vector4:
  movups    xmm0, [Param1]
  movaps    xmm1, xmm0          ; Keep A for the final subtraction

  ; Set rounding mode to Truncate
  stmxcsr   [OldFlags]
  mov       eax, [OldFlags]
  and       eax, SSE_ROUND_MASK
  or        eax, SSE_ROUND_TRUNC
  mov       [NewFlags], eax
  ldmxcsr   [NewFlags]

  cvtps2dq  xmm0, xmm0          ; Trunc(A) as integers
  movups    [Param2], xmm0      ; B = Trunc(A)
  cvtdq2ps  xmm0, xmm0
  subps     xmm1, xmm0          ; A - Trunc(A)

  ; Restore rounding mode
  ldmxcsr   [OldFlags]

  movaps    xmm0, xmm1
  movhlps   xmm1, xmm1
  ret
2502
  
2503
; Per-lane minimum of the scalar B (xmm0) and the 2-float vector at [Param1].
; Result in the low two lanes of xmm0.
_min_vector2_single:
  movlps    xmm1, [Param1]      ; A
  shufps    xmm0, xmm0, 0x00    ; Replicate B
  minps     xmm0, xmm1
  ret
2508
  
2509
; Per-lane minimum of the scalar B (xmm0) and the 3-float vector at [Param1].
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_min_vector3_single:
  movq      xmm1, [Param1]      ; 0 0 Y X
  movss     xmm2, [Param1+8]    ; 0 0 0 Z
  shufps    xmm0, xmm0, 0x00    ; Replicate B
  movlhps   xmm1, xmm2          ; 0 Z Y X
  minps     xmm0, xmm1
  movhlps   xmm1, xmm0
  ret
2517
  
2518
; Per-lane minimum of the scalar B (xmm0) and the 4-float vector at [Param1].
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_min_vector4_single:
  movups    xmm1, [Param1]      ; A
  shufps    xmm0, xmm0, 0x00    ; Replicate B
  minps     xmm0, xmm1
  movhlps   xmm1, xmm0
  ret
2524
  
2525
; Per-lane minimum of the 2-float vectors at [Param1] and [Param2].
; Result in the low two lanes of xmm0.
_min_vector2:
  movlps    xmm1, [Param2]      ; B
  movlps    xmm0, [Param1]      ; A
  minps     xmm0, xmm1
  ret
2530
  
2531
; Per-lane minimum of the 3-float vectors at [Param1] and [Param2].
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_min_vector3:
  movq      xmm0, [Param1]
  movss     xmm1, [Param1+8]
  movss     xmm2, [Param2+8]
  movlhps   xmm0, xmm1          ; A = 0 Z Y X
  movq      xmm1, [Param2]
  movlhps   xmm1, xmm2          ; B = 0 Z Y X
  minps     xmm0, xmm1
  movhlps   xmm1, xmm0
  ret
2541
  
2542
; Per-lane minimum of the 4-float vectors at [Param1] and [Param2].
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_min_vector4:
  movups    xmm1, [Param2]      ; B
  movups    xmm0, [Param1]      ; A
  minps     xmm0, xmm1
  movhlps   xmm1, xmm0
  ret
2548
  
2549
; Per-lane maximum of the scalar B (xmm0) and the 2-float vector at [Param1].
; Result in the low two lanes of xmm0.
_max_vector2_single:
  movlps    xmm1, [Param1]      ; A
  shufps    xmm0, xmm0, 0x00    ; Replicate B
  maxps     xmm0, xmm1
  ret
2554
  
2555
; Per-lane maximum of the scalar B (xmm0) and the 3-float vector at [Param1].
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_max_vector3_single:
  movq      xmm1, [Param1]      ; 0 0 Y X
  movss     xmm2, [Param1+8]    ; 0 0 0 Z
  shufps    xmm0, xmm0, 0x00    ; Replicate B
  movlhps   xmm1, xmm2          ; 0 Z Y X
  maxps     xmm0, xmm1
  movhlps   xmm1, xmm0
  ret
2563
  
2564
; Per-lane maximum of the scalar B (xmm0) and the 4-float vector at [Param1].
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_max_vector4_single:
  movups    xmm1, [Param1]      ; A
  shufps    xmm0, xmm0, 0x00    ; Replicate B
  maxps     xmm0, xmm1
  movhlps   xmm1, xmm0
  ret
2570
  
2571
; Per-lane maximum of the 2-float vectors at [Param1] and [Param2].
; Result in the low two lanes of xmm0.
_max_vector2:
  movlps    xmm1, [Param2]      ; B
  movlps    xmm0, [Param1]      ; A
  maxps     xmm0, xmm1
  ret
2576
  
2577
; Per-lane maximum of the 3-float vectors at [Param1] and [Param2].
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_max_vector3:
  movq      xmm0, [Param1]
  movss     xmm1, [Param1+8]
  movss     xmm2, [Param2+8]
  movlhps   xmm0, xmm1          ; A = 0 Z Y X
  movq      xmm1, [Param2]
  movlhps   xmm1, xmm2          ; B = 0 Z Y X
  maxps     xmm0, xmm1
  movhlps   xmm1, xmm0
  ret
2587
  
2588
; Per-lane maximum of the 4-float vectors at [Param1] and [Param2].
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_max_vector4:
  movups    xmm1, [Param2]      ; B
  movups    xmm0, [Param1]      ; A
  maxps     xmm0, xmm1
  movhlps   xmm1, xmm0
  ret
2594
  
2595
; Clamps the scalar in xmm0 between xmm1 (lower) and xmm2 (upper):
; Min(Max(A, AMin), AMax). Result in the low lane of xmm0.
_ensure_range_single:
  maxss     xmm0, xmm1          ; Raise to the lower bound
  minss     xmm0, xmm2          ; Cap at the upper bound
  ret
2599
  
2600
; Clamps each lane of the 2-float vector at [Param1] between the scalars
; AMin (xmm0) and AMax (xmm1): Max(AMin, Min(A, AMax)).
; Result in the low two lanes of xmm0.
_ensure_range_vector2_single:
  movlps    xmm2, [Param1]      ; A
  shufps    xmm1, xmm1, 0x00    ; Replicate AMax
  shufps    xmm0, xmm0, 0x00    ; Replicate AMin
  minps     xmm2, xmm1          ; Min(A, AMax)
  maxps     xmm0, xmm2          ; Max(AMin, Min(A, AMax))
  ret
2607
  
2608
; Clamps each lane of the 3-float vector at [Param1] between the scalars
; AMin (xmm0) and AMax (xmm1): Max(AMin, Min(A, AMax)).
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_ensure_range_vector3_single:
  movq      xmm2, [Param1]
  movss     xmm3, [Param1+8]
  shufps    xmm1, xmm1, 0x00    ; Replicate AMax
  shufps    xmm0, xmm0, 0x00    ; Replicate AMin
  movlhps   xmm2, xmm3          ; A = 0 Z Y X
  minps     xmm2, xmm1          ; Min(A, AMax)
  maxps     xmm0, xmm2          ; Max(AMin, Min(A, AMax))
  movhlps   xmm1, xmm0
  ret
2618
  
2619
; Clamps each lane of the 4-float vector at [Param1] between the scalars
; AMin (xmm0) and AMax (xmm1): Max(AMin, Min(A, AMax)).
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_ensure_range_vector4_single:
  movups    xmm2, [Param1]      ; A
  shufps    xmm1, xmm1, 0x00    ; Replicate AMax
  shufps    xmm0, xmm0, 0x00    ; Replicate AMin
  minps     xmm2, xmm1          ; Min(A, AMax)
  maxps     xmm0, xmm2          ; Max(AMin, Min(A, AMax))
  movhlps   xmm1, xmm0
  ret
2627
  
2628
; Per-lane clamp of the 2-float vector at [Param1] between the vectors at
; [Param2] (lower) and [Param3] (upper): Min(Max(A, AMin), AMax).
; Result in the low two lanes of xmm0.
_ensure_range_vector2:
  movlps    xmm0, [Param1]      ; A
  movlps    xmm1, [Param2]      ; AMin
  maxps     xmm0, xmm1
  movlps    xmm2, [Param3]      ; AMax
  minps     xmm0, xmm2
  ret
2635
  
2636
; Per-lane clamp of the 3-float vector at [Param1] between the vectors at
; [Param2] (lower) and [Param3] (upper): Min(Max(A, AMin), AMax).
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_ensure_range_vector3:
  movq      xmm0, [Param1]
  movss     xmm1, [Param1+8]
  movlhps   xmm0, xmm1          ; A
  movq      xmm1, [Param2]
  movss     xmm2, [Param2+8]
  movlhps   xmm1, xmm2          ; AMin
  maxps     xmm0, xmm1
  movq      xmm2, [Param3]
  movss     xmm3, [Param3+8]
  movlhps   xmm2, xmm3          ; AMax
  minps     xmm0, xmm2
  movhlps   xmm1, xmm0
  ret
2650
  
2651
; Per-lane clamp of the 4-float vector at [Param1] between the vectors at
; [Param2] (lower) and [Param3] (upper): Min(Max(A, AMin), AMax).
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_ensure_range_vector4:
  movups    xmm0, [Param1]      ; A
  movups    xmm1, [Param2]      ; AMin
  maxps     xmm0, xmm1
  movups    xmm2, [Param3]      ; AMax
  minps     xmm0, xmm2
  movhlps   xmm1, xmm0
  ret
2659
  
2660
; Linear blend of the 3-float vectors A ([Param1]) and B ([Param2]) by the
; scalar T in xmm0: A + T * (B - A).
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_mix_vector3_single:
  shufps    xmm0, xmm0, 0x00    ; Replicate T
  movq      xmm4, [Param1]
  movss     xmm1, [Param1+8]
  movlhps   xmm4, xmm1          ; A
  movq      xmm1, [Param2]
  movss     xmm2, [Param2+8]
  movlhps   xmm1, xmm2          ; B
  subps     xmm1, xmm4          ; B - A
  mulps     xmm1, xmm0          ; T * (B - A)
  addps     xmm4, xmm1          ; A + (T * (B - A))
  movaps    xmm0, xmm4
  movhlps   xmm1, xmm4
  ret
2674
  
2675
; Linear blend of the 4-float vectors A ([Param1]) and B ([Param2]) by the
; scalar T in xmm0: A + T * (B - A).
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_mix_vector4_single:
  shufps    xmm0, xmm0, 0x00    ; Replicate T
  movups    xmm4, [Param1]      ; A
  movups    xmm1, [Param2]      ; B
  subps     xmm1, xmm4          ; B - A
  mulps     xmm1, xmm0          ; T * (B - A)
  addps     xmm4, xmm1          ; A + (T * (B - A))
  movhlps   xmm1, xmm4
  movaps    xmm0, xmm4
  ret
2685
  
2686
; Per-lane linear blend of the 3-float vectors A ([Param1]) and B ([Param2])
; by the vector T ([Param3]): A + T * (B - A).
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_mix_vector3:
  movq      xmm0, [Param1]
  movss     xmm1, [Param1+8]
  movlhps   xmm0, xmm1          ; A
  movq      xmm1, [Param2]
  movss     xmm2, [Param2+8]
  movlhps   xmm1, xmm2          ; B
  subps     xmm1, xmm0          ; B - A
  movq      xmm2, [Param3]
  movss     xmm3, [Param3+8]
  movlhps   xmm2, xmm3          ; T
  mulps     xmm1, xmm2          ; T * (B - A)
  addps     xmm0, xmm1          ; A + (T * (B - A))
  movhlps   xmm1, xmm0
  ret
2701
  
2702
; Per-lane linear blend of the 4-float vectors A ([Param1]) and B ([Param2])
; by the vector T ([Param3]): A + T * (B - A).
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_mix_vector4:
  movups    xmm0, [Param1]      ; A
  movups    xmm1, [Param2]      ; B
  subps     xmm1, xmm0          ; B - A
  movups    xmm2, [Param3]      ; T
  mulps     xmm1, xmm2          ; T * (B - A)
  addps     xmm0, xmm1          ; A + (T * (B - A))
  movhlps   xmm1, xmm0
  ret
2711
  
2712
; Step for a 2-float vector A ([Param1]) against the scalar AEdge (xmm0):
; each lane becomes 1.0 where A >= AEdge and 0.0 otherwise.
; Result in the low two lanes of xmm0.
_step_single_vector2:
  shufps    xmm0, xmm0, 0x00    ; Replicate AEdge
  movlps    xmm1, [Param1]      ; A
  movlps    xmm2, [rel kSSE_ONE]
  cmpnltps  xmm1, xmm0          ; (A >= AEdge)? 0xFFFFFFFF : 0x00000000
  andps     xmm1, xmm2          ; (A >= AEdge)? 1 : 0
  movaps    xmm0, xmm1
  ret
2720
  
2721
; Step for a 3-float vector A ([Param1]) against the scalar AEdge (xmm0):
; each lane becomes 1.0 where A >= AEdge and 0.0 otherwise.
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_step_single_vector3:
  shufps    xmm0, xmm0, 0x00    ; Replicate AEdge
  movq      xmm3, [Param1]
  movss     xmm2, [Param1+8]
  movlhps   xmm3, xmm2          ; A
  movaps    xmm2, [rel kSSE_ONE]
  cmpnltps  xmm3, xmm0          ; (A >= AEdge)? 0xFFFFFFFF : 0x00000000
  andps     xmm3, xmm2          ; (A >= AEdge)? 1 : 0
  movhlps   xmm1, xmm3
  movaps    xmm0, xmm3
  ret
2732
  
2733
; Step for a 4-float vector A ([Param1]) against the scalar AEdge (xmm0):
; each lane becomes 1.0 where A >= AEdge and 0.0 otherwise.
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_step_single_vector4:
  shufps    xmm0, xmm0, 0x00    ; Replicate AEdge
  movups    xmm3, [Param1]      ; A
  movaps    xmm2, [rel kSSE_ONE]
  cmpnltps  xmm3, xmm0          ; (A >= AEdge)? 0xFFFFFFFF : 0x00000000
  andps     xmm3, xmm2          ; (A >= AEdge)? 1 : 0
  movhlps   xmm1, xmm3
  movaps    xmm0, xmm3
  ret
2742
  
2743
; Per-lane Step for 2-float vectors: AEdge at [Param1], A at [Param2].
; Each lane becomes 1.0 where A >= AEdge and 0.0 otherwise.
; Result in the low two lanes of xmm0.
_step_vector2:
  movlps    xmm0, [Param1]      ; AEdge
  movlps    xmm1, [Param2]      ; A
  cmpnltps  xmm1, xmm0          ; (A >= AEdge)? 0xFFFFFFFF : 0x00000000
  movlps    xmm2, [rel kSSE_ONE]
  andps     xmm1, xmm2          ; (A >= AEdge)? 1 : 0
  movaps    xmm0, xmm1
  ret
2751
  
2752
; Per-lane Step for 3-float vectors: AEdge at [Param1], A at [Param2].
; Each lane becomes 1.0 where A >= AEdge and 0.0 otherwise.
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_step_vector3:
  movq      xmm0, [Param1]
  movss     xmm1, [Param1+8]
  movlhps   xmm0, xmm1          ; AEdge
  movq      xmm1, [Param2]
  movss     xmm2, [Param2+8]
  movlhps   xmm1, xmm2          ; A
  cmpnltps  xmm1, xmm0          ; (A >= AEdge)? 0xFFFFFFFF : 0x00000000
  movaps    xmm2, [rel kSSE_ONE]
  andps     xmm1, xmm2          ; (A >= AEdge)? 1 : 0
  movaps    xmm0, xmm1
  movhlps   xmm1, xmm1
  ret
2765
  
2766
; Per-lane Step for 4-float vectors: AEdge at [Param1], A at [Param2].
; Each lane becomes 1.0 where A >= AEdge and 0.0 otherwise.
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_step_vector4:
  movups    xmm0, [Param1]      ; AEdge
  movups    xmm1, [Param2]      ; A
  cmpnltps  xmm1, xmm0          ; (A >= AEdge)? 0xFFFFFFFF : 0x00000000
  movaps    xmm2, [rel kSSE_ONE]
  andps     xmm1, xmm2          ; (A >= AEdge)? 1 : 0
  movaps    xmm0, xmm1
  movhlps   xmm1, xmm1
  ret
2775
  
2776
; SmoothStep for a 3-float vector A ([Param1]) with scalar edges AEdge0 (xmm0)
; and AEdge1 (xmm1). Per lane:
;   Temp   := (A - AEdge0) / (AEdge1 - AEdge0)
;   Result := 0 if A < AEdge0, 1 if A > AEdge1, else Temp*Temp*(3 - 2*Temp)
; Result: X,Y in xmm0, Z in the low lane of xmm1. Uses xmm2-xmm7 as scratch.
_smooth_step_single_vector3:
  movq      xmm2, [Param1]
  movss     xmm3, [Param1+8]
  movlhps   xmm2, xmm3          ; A = 0 Z Y X
  shufps    xmm0, xmm0, 0x00    ; Replicate AEdge0
  shufps    xmm1, xmm1, 0x00    ; Replicate AEdge1
  movaps    xmm3, xmm2
  movaps    xmm4, xmm2
  movaps    xmm6, [rel kSSE_ONE]

  cmpnltps  xmm3, xmm0          ; (A >= AEdge0)? Yes: 0xFFFFFFFF, No: 0x00000000
  cmpleps   xmm4, xmm1          ; (A <= AEdge1)? Yes: 0xFFFFFFFF, No: 0x00000000
  subps     xmm1, xmm0          ; AEdge1 - AEdge0
  movaps    xmm5, xmm4
  subps     xmm2, xmm0          ; A - AEdge0
  andnps    xmm5, xmm6          ; (A >  AEdge1)? Yes: 1.0, No: 0.0

  movaps    xmm6, [rel kSSE_TWO]
  divps     xmm2, xmm1          ; Temp := (A - AEdge0) / (AEdge1 - AEdge0)
  movaps    xmm7, [rel kSSE_THREE]
  mulps     xmm6, xmm2          ; 2 * Temp
  subps     xmm7, xmm6          ; 3 - (2 * Temp)
  mulps     xmm7, xmm2
  mulps     xmm7, xmm2          ; Result := Temp * Temp * (3 - (2 * Temp))
  andps     xmm7, xmm3          ; (A < AEdge0)? Yes: 0, No: Result
  andps     xmm7, xmm4          ; (A > AEdge1)? Yes: 0, No: Result
  orps      xmm7, xmm5          ; (A > AEdge1)? Yes: 1, No: Result

  movaps    xmm0, xmm7
  movhlps   xmm1, xmm7
  ret
2808
  
2809
; SmoothStep for a 4-float vector A ([Param1]) with scalar edges AEdge0 (xmm0)
; and AEdge1 (xmm1). Per lane:
;   Temp   := (A - AEdge0) / (AEdge1 - AEdge0)
;   Result := 0 if A < AEdge0, 1 if A > AEdge1, else Temp*Temp*(3 - 2*Temp)
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
; Uses xmm2-xmm7 as scratch.
_smooth_step_single_vector4:
  movups    xmm2, [Param1]      ; A
  shufps    xmm0, xmm0, 0x00    ; Replicate AEdge0
  shufps    xmm1, xmm1, 0x00    ; Replicate AEdge1
  movaps    xmm3, xmm2
  movaps    xmm4, xmm2
  movaps    xmm6, [rel kSSE_ONE]

  cmpnltps  xmm3, xmm0          ; (A >= AEdge0)? Yes: 0xFFFFFFFF, No: 0x00000000
  cmpleps   xmm4, xmm1          ; (A <= AEdge1)? Yes: 0xFFFFFFFF, No: 0x00000000
  subps     xmm1, xmm0          ; AEdge1 - AEdge0
  movaps    xmm5, xmm4
  subps     xmm2, xmm0          ; A - AEdge0
  andnps    xmm5, xmm6          ; (A >  AEdge1)? Yes: 1.0, No: 0.0

  movaps    xmm6, [rel kSSE_TWO]
  divps     xmm2, xmm1          ; Temp := (A - AEdge0) / (AEdge1 - AEdge0)
  movaps    xmm7, [rel kSSE_THREE]
  mulps     xmm6, xmm2          ; 2 * Temp
  subps     xmm7, xmm6          ; 3 - (2 * Temp)
  mulps     xmm7, xmm2
  mulps     xmm7, xmm2          ; Result := Temp * Temp * (3 - (2 * Temp))
  andps     xmm7, xmm3          ; (A < AEdge0)? Yes: 0, No: Result
  andps     xmm7, xmm4          ; (A > AEdge1)? Yes: 0, No: Result
  orps      xmm7, xmm5          ; (A > AEdge1)? Yes: 1, No: Result

  movaps    xmm0, xmm7
  movhlps   xmm1, xmm7
  ret
2839
  
2840
; Per-lane SmoothStep for 3-float vectors: AEdge0 at [Param1], AEdge1 at
; [Param2], A at [Param3]. Per lane:
;   Temp   := (A - AEdge0) / (AEdge1 - AEdge0)
;   Result := 0 if A < AEdge0, 1 if A > AEdge1, else Temp*Temp*(3 - 2*Temp)
; Result: X,Y in xmm0, Z in the low lane of xmm1. Uses xmm2-xmm7 as scratch.
_smooth_step_vector3:
  movq      xmm2, [Param3]
  movss     xmm3, [Param3+8]
  movlhps   xmm2, xmm3          ; A
  movq      xmm0, [Param1]
  movss     xmm1, [Param1+8]
  movlhps   xmm0, xmm1          ; AEdge0
  movq      xmm1, [Param2]
  movss     xmm3, [Param2+8]
  movlhps   xmm1, xmm3          ; AEdge1

  movaps    xmm3, xmm2
  movaps    xmm4, xmm2
  movaps    xmm6, [rel kSSE_ONE]

  cmpnltps  xmm3, xmm0          ; (A >= AEdge0)? Yes: 0xFFFFFFFF, No: 0x00000000
  cmpleps   xmm4, xmm1          ; (A <= AEdge1)? Yes: 0xFFFFFFFF, No: 0x00000000
  subps     xmm1, xmm0          ; AEdge1 - AEdge0
  movaps    xmm5, xmm4
  subps     xmm2, xmm0          ; A - AEdge0
  andnps    xmm5, xmm6          ; (A >  AEdge1)? Yes: 1.0, No: 0.0

  movaps    xmm6, [rel kSSE_TWO]
  divps     xmm2, xmm1          ; Temp := (A - AEdge0) / (AEdge1 - AEdge0)
  movaps    xmm7, [rel kSSE_THREE]
  mulps     xmm6, xmm2          ; 2 * Temp
  subps     xmm7, xmm6          ; 3 - (2 * Temp)
  mulps     xmm7, xmm2
  mulps     xmm7, xmm2          ; Result := Temp * Temp * (3 - (2 * Temp))
  andps     xmm7, xmm3          ; (A < AEdge0)? Yes: 0, No: Result
  andps     xmm7, xmm4          ; (A > AEdge1)? Yes: 0, No: Result
  orps      xmm7, xmm5          ; (A > AEdge1)? Yes: 1, No: Result

  movaps    xmm0, xmm7
  movhlps   xmm1, xmm7
  ret
2877
  
2878
; Per-lane SmoothStep for 4-float vectors: AEdge0 at [Param1], AEdge1 at
; [Param2], A at [Param3]. Per lane:
;   Temp   := (A - AEdge0) / (AEdge1 - AEdge0)
;   Result := 0 if A < AEdge0, 1 if A > AEdge1, else Temp*Temp*(3 - 2*Temp)
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
; Uses xmm2-xmm7 as scratch.
_smooth_step_vector4:
  movups    xmm2, [Param3]      ; A
  movups    xmm0, [Param1]      ; AEdge0
  movups    xmm1, [Param2]      ; AEdge1
  movaps    xmm3, xmm2
  movaps    xmm4, xmm2
  movaps    xmm6, [rel kSSE_ONE]

  cmpnltps  xmm3, xmm0          ; (A >= AEdge0)? Yes: 0xFFFFFFFF, No: 0x00000000
  cmpleps   xmm4, xmm1          ; (A <= AEdge1)? Yes: 0xFFFFFFFF, No: 0x00000000
  subps     xmm1, xmm0          ; AEdge1 - AEdge0
  movaps    xmm5, xmm4
  subps     xmm2, xmm0          ; A - AEdge0
  andnps    xmm5, xmm6          ; (A >  AEdge1)? Yes: 1.0, No: 0.0

  movaps    xmm6, [rel kSSE_TWO]
  divps     xmm2, xmm1          ; Temp := (A - AEdge0) / (AEdge1 - AEdge0)
  movaps    xmm7, [rel kSSE_THREE]
  mulps     xmm6, xmm2          ; 2 * Temp
  subps     xmm7, xmm6          ; 3 - (2 * Temp)
  mulps     xmm7, xmm2
  mulps     xmm7, xmm2          ; Result := Temp * Temp * (3 - (2 * Temp))
  andps     xmm7, xmm3          ; (A < AEdge0)? Yes: 0, No: Result
  andps     xmm7, xmm4          ; (A > AEdge1)? Yes: 0, No: Result
  orps      xmm7, xmm5          ; (A > AEdge1)? Yes: 1, No: Result

  movaps    xmm0, xmm7
  movhlps   xmm1, xmm7
  ret
2908
  
2909
; Per-lane (A * B) + C for the 2-float vectors at [Param1], [Param2] and
; [Param3], computed as a separate multiply and add.
; Result in the low two lanes of xmm0.
_fma_vector2:
  movlps    xmm0, [Param1]      ; A
  movlps    xmm1, [Param2]      ; B
  mulps     xmm0, xmm1          ; A * B
  movlps    xmm2, [Param3]      ; C
  addps     xmm0, xmm2          ; (A * B) + C
  ret
2916
  
2917
; Per-lane (A * B) + C for the 3-float vectors at [Param1], [Param2] and
; [Param3], computed as a separate multiply and add.
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_fma_vector3:
  movq      xmm0, [Param1]
  movss     xmm1, [Param1+8]
  movlhps   xmm0, xmm1          ; A
  movq      xmm1, [Param2]
  movss     xmm2, [Param2+8]
  movlhps   xmm1, xmm2          ; B
  mulps     xmm0, xmm1          ; A * B
  movq      xmm2, [Param3]
  movss     xmm3, [Param3+8]
  movlhps   xmm2, xmm3          ; C
  addps     xmm0, xmm2          ; (A * B) + C
  movhlps   xmm1, xmm0
  ret
2931
  
2932
; Per-lane (A * B) + C for the 4-float vectors at [Param1], [Param2] and
; [Param3], computed as a separate multiply and add.
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_fma_vector4:
  movups    xmm0, [Param1]      ; A
  movups    xmm1, [Param2]      ; B
  mulps     xmm0, xmm1          ; A * B
  movups    xmm2, [Param3]      ; C
  addps     xmm0, xmm2          ; (A * B) + C
  movhlps   xmm1, xmm0
  ret
2940
  
2941
;****************************************************************************
2942
; Matrix Functions
2943
;****************************************************************************
2944

2945
; Outer product of two 2-float vectors, returned as four floats:
; lanes 0-1 in xmm0 and lanes 2-3 in the low half of xmm1.
; With FM_COLUMN_MAJOR defined the two operands are loaded in swapped order.
; Lane comments are written high-to-low and use the names of the default
; (non column-major) branch: C = [Param1], R = [Param2].
_outer_product_matrix2:
%ifdef FM_COLUMN_MAJOR
  movlps    xmm0, [Param2]
  movlps    xmm1, [Param1]
%else
  movlps    xmm0, [Param1]      ; # # C.Y C.X
  movlps    xmm1, [Param2]      ; # # R.Y R.X
%endif

  shufps    xmm0, xmm0, 0x50    ; C.Y C.Y C.X C.X
  shufps    xmm1, xmm1, 0x44    ; R.Y R.X R.Y R.X

  mulps     xmm0, xmm1          ; (C.Y*R.Y) (C.Y*R.X) (C.X*R.Y) (C.X*R.X)

  ; Return the upper two products in xmm1
  movhlps   xmm1, xmm0
  ret
2962
  
2963
; Outer product of two 3-float vectors ([Param2] and [Param3]); the nine
; products are written as three consecutive 12-byte rows at [Param1].
; FM_COLUMN_MAJOR selects which operand is broadcast (xmm1) and which is the
; row pattern (xmm0). Comments use the default-branch naming (C broadcast, R row).
_outer_product_matrix3:
%ifdef FM_COLUMN_MAJOR
  movq      xmm0, [Param2]
  movss     xmm1, [Param2+8]
  movlhps   xmm0, xmm1
  movq      xmm1, [Param3]
  movss     xmm2, [Param3+8]
%else
  movq      xmm0, [Param3]
  movss     xmm1, [Param3+8]
  movlhps   xmm0, xmm1
  movq      xmm1, [Param2]
  movss     xmm2, [Param2+8]
%endif
  movlhps   xmm1, xmm2
  movaps    xmm2, xmm1
  movaps    xmm3, xmm1

  shufps    xmm1, xmm1, 0x00    ; C.X (4x)
  shufps    xmm2, xmm2, 0x55    ; C.Y (4x)
  shufps    xmm3, xmm3, 0xAA    ; C.Z (4x)

  mulps     xmm1, xmm0          ; R * C.X
  mulps     xmm2, xmm0          ; R * C.Y
  mulps     xmm3, xmm0          ; R * C.Z

  ; Store as matrix: each row is 8 bytes (movq) plus 4 bytes (movss)
  movhlps   xmm0, xmm1          ; Third element of row 0
  movhlps   xmm4, xmm2          ; Third element of row 1
  movhlps   xmm5, xmm3          ; Third element of row 2
  movq      [Param1+0x00], xmm1
  movss     [Param1+0x08], xmm0
  movq      [Param1+0x0C], xmm2
  movss     [Param1+0x14], xmm4
  movq      [Param1+0x18], xmm3
  movss     [Param1+0x20], xmm5
  ret
3000
  
3001
; Outer product of two 4-float vectors ([Param2] and [Param3]); the sixteen
; products are written as four consecutive 16-byte rows at [Param1].
; FM_COLUMN_MAJOR selects which operand is broadcast (xmm1) and which is the
; row pattern (xmm0). Comments use the default-branch naming (C broadcast, R row).
_outer_product_matrix4:
%ifdef FM_COLUMN_MAJOR
  movups    xmm0, [Param2]
  movups    xmm1, [Param3]
%else
  movups    xmm0, [Param3]
  movups    xmm1, [Param2]
%endif
  movaps    xmm2, xmm1
  movaps    xmm3, xmm1
  movaps    xmm4, xmm1

  shufps    xmm1, xmm1, 0x00    ; C.X (4x)
  shufps    xmm2, xmm2, 0x55    ; C.Y (4x)
  shufps    xmm3, xmm3, 0xAA    ; C.Z (4x)
  shufps    xmm4, xmm4, 0xFF    ; C.W (4x)

  mulps     xmm1, xmm0          ; R * C.X
  mulps     xmm2, xmm0          ; R * C.Y
  mulps     xmm3, xmm0          ; R * C.Z
  mulps     xmm4, xmm0          ; R * C.W

  ; Store as matrix (unaligned stores, one row each)
  movups    [Param1 + 0x00], xmm1
  movups    [Param1 + 0x10], xmm2
  movups    [Param1 + 0x20], xmm3
  movups    [Param1 + 0x30], xmm4
  ret
3029
  
3030
;****************************************************************************
3031
; TVector2
3032
;****************************************************************************
3033
  
3034
; Divides each lane of the 2-float vector at [Param1] by the scalar in xmm0.
; Result in the low two lanes of xmm0.
_vector2_div_single:
  movlps    xmm1, [Param1]      ; A
  shufps    xmm0, xmm0, 0       ; Replicate B
  divps     xmm1, xmm0          ; A / B
  movaps    xmm0, xmm1
  ret
3040
  
3041
; Divides the scalar in xmm0 by each lane of the 2-float vector at [Param1].
; Result in the low two lanes of xmm0.
_single_div_vector2:
  shufps    xmm0, xmm0, 0       ; Replicate A
  movlps    xmm1, [Param1]      ; B
  divps     xmm0, xmm1          ; A / B
  ret
3046
  
3047
; Per-lane division of the 2-float vector at [Param1] by the one at [Param2].
; Result in the low two lanes of xmm0.
_vector2_div_vector2:
  movlps    xmm1, [Param2]      ; B
  movlps    xmm0, [Param1]      ; A
  divps     xmm0, xmm1          ; A / B
  ret
3052
  
3053
; Approximate normalization of the 2-float vector at [Self] using rsqrtps:
; A * (1 / Sqrt(Dot(A, A))). Result in the low two lanes of xmm0.
_vector2_normalize_fast:
  movlps    xmm0, [Self]        ; # # Y X
  movaps    xmm2, xmm0          ; Keep A
  mulps     xmm0, xmm0          ; # # Y*Y X*X
  pshufd    xmm1, xmm0, 0x01    ; # # X*X Y*Y
  addps     xmm0, xmm1          ; (X*X+Y*Y) in lanes 0 and 1
  rsqrtps   xmm0, xmm0          ; Approximate 1 / Sqrt(X*X + Y*Y)
  mulps     xmm0, xmm2          ; A * (1 / Sqrt(Dot(A, A)))
  ret
3062
  
3063
; In-place approximate normalization of the 2-float vector at [Self] using
; rsqrtps: A := A * (1 / Sqrt(Dot(A, A))). The result is written to [Self].
_vector2_set_normalized_fast:
  movlps    xmm0, [Self]        ; # # Y X
  movaps    xmm2, xmm0          ; Keep A
  mulps     xmm0, xmm0          ; # # Y*Y X*X
  pshufd    xmm1, xmm0, 0x01    ; # # X*X Y*Y
  addps     xmm0, xmm1          ; (X*X+Y*Y) in lanes 0 and 1
  rsqrtps   xmm0, xmm0          ; Approximate 1 / Sqrt(X*X + Y*Y)
  mulps     xmm0, xmm2          ; A * (1 / Sqrt(Dot(A, A)))
  movlps    [Self], xmm0        ; Write back X and Y
  ret
3073
  
3074
;****************************************************************************
3075
; TVector3
3076
;****************************************************************************
3077

3078
; Adds the scalar B (xmm0) to each lane of the 3-float vector at [Param1].
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_vector3_add_single:
  shufps    xmm0, xmm0, 0       ; Replicate B
  movq      xmm2, [Param1]      ; A.X, A.Y
  movss     xmm1, [Param1+8]    ; A.Z
  addps     xmm2, xmm0          ; (X, Y) + B
  addss     xmm1, xmm0          ; Z + B
  movaps    xmm0, xmm2
  ret
3086
  
3087
; Adds the scalar A (xmm0) to each lane of the 3-float vector at [Param1].
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_single_add_vector3:
  shufps    xmm0, xmm0, 0       ; Replicate A
  movq      xmm2, [Param1]      ; B.X, B.Y
  movss     xmm1, [Param1+8]    ; B.Z
  addps     xmm2, xmm0          ; (X, Y) + A
  addss     xmm1, xmm0          ; Z + A
  movaps    xmm0, xmm2
  ret
3095
  
3096
; Per-lane sum of the 3-float vectors at [Param1] and [Param2].
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_vector3_add_vector3:
  movq      xmm0, [Param1]      ; A.X, A.Y
  movq      xmm2, [Param2]      ; B.X, B.Y
  movss     xmm1, [Param1+8]    ; A.Z
  movss     xmm3, [Param2+8]    ; B.Z
  addps     xmm0, xmm2
  addss     xmm1, xmm3
  ret
3104
  
3105
; Subtracts the scalar B (xmm0) from each lane of the 3-float vector at
; [Param1]. Result: X,Y in xmm0, Z in the low lane of xmm1.
_vector3_sub_single:
  shufps    xmm0, xmm0, 0       ; Replicate B
  movq      xmm2, [Param1]      ; A.X, A.Y
  movss     xmm1, [Param1+8]    ; A.Z
  subps     xmm2, xmm0          ; (X, Y) - B
  subss     xmm1, xmm0          ; Z - B
  movaps    xmm0, xmm2
  ret
3113
  
3114
; Subtracts each lane of the 3-float vector at [Param1] from the scalar A
; (xmm0). Result: X,Y in xmm0, Z in the low lane of xmm1.
_single_sub_vector3:
  movss     xmm1, xmm0          ; Scalar copy of A for the Z lane
  movq      xmm4, [Param1]      ; B.X, B.Y
  movss     xmm2, [Param1+8]    ; B.Z
  shufps    xmm0, xmm0, 0       ; Replicate A
  subss     xmm1, xmm2          ; A - Z
  subps     xmm0, xmm4          ; A - (X, Y)
  ret
3122
  
3123
; Per-lane difference of the 3-float vectors at [Param1] and [Param2] (A - B).
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_vector3_sub_vector3:
  movq      xmm0, [Param1]      ; A.X, A.Y
  movq      xmm2, [Param2]      ; B.X, B.Y
  movss     xmm1, [Param1+8]    ; A.Z
  movss     xmm3, [Param2+8]    ; B.Z
  subps     xmm0, xmm2
  subss     xmm1, xmm3
  ret
3131

3132
; Multiplies each lane of the 3-float vector at [Param1] by the scalar B
; (xmm0). Result: X,Y in xmm0, Z in the low lane of xmm1.
_vector3_mul_single:
  shufps    xmm0, xmm0, 0       ; Replicate B
  movq      xmm2, [Param1]      ; A.X, A.Y
  movss     xmm1, [Param1+8]    ; A.Z
  mulps     xmm2, xmm0          ; (X, Y) * B
  mulss     xmm1, xmm0          ; Z * B
  movaps    xmm0, xmm2
  ret
3140
  
3141
; Multiplies the scalar A (xmm0) with each lane of the 3-float vector at
; [Param1]. Result: X,Y in xmm0, Z in the low lane of xmm1.
_single_mul_vector3:
  shufps    xmm0, xmm0, 0       ; Replicate A
  movq      xmm2, [Param1]      ; B.X, B.Y
  movss     xmm1, [Param1+8]    ; B.Z
  mulps     xmm2, xmm0          ; (X, Y) * A
  mulss     xmm1, xmm0          ; Z * A
  movaps    xmm0, xmm2
  ret
3149
  
3150
; Per-lane product of the 3-float vectors at [Param1] and [Param2].
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_vector3_mul_vector3:
  movq      xmm0, [Param1]      ; A.X, A.Y
  movq      xmm2, [Param2]      ; B.X, B.Y
  movss     xmm1, [Param1+8]    ; A.Z
  movss     xmm3, [Param2+8]    ; B.Z
  mulps     xmm0, xmm2
  mulss     xmm1, xmm3
  ret
3158
  
3159
; Divides each lane of the 3-float vector at [Param1] by the scalar B (xmm0).
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_vector3_div_single:
  shufps    xmm0, xmm0, 0       ; Replicate B
  movq      xmm2, [Param1]      ; A.X, A.Y
  movss     xmm1, [Param1+8]    ; A.Z
  divps     xmm2, xmm0          ; (X, Y) / B
  divss     xmm1, xmm0          ; Z / B
  movaps    xmm0, xmm2
  ret
3167
  
3168
; Divides the scalar A (xmm0) by each lane of the 3-float vector at [Param1].
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_single_div_vector3:
  movss     xmm1, xmm0          ; Scalar copy of A for the Z lane
  movq      xmm3, [Param1]      ; B.X, B.Y
  movss     xmm2, [Param1+8]    ; B.Z
  shufps    xmm0, xmm0, 0       ; Replicate A
  divss     xmm1, xmm2          ; A / Z
  divps     xmm0, xmm3          ; A / (X, Y)
  ret
3176
  
3177
; Per-lane quotient of the 3-float vectors at [Param1] and [Param2] (A / B).
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_vector3_div_vector3:
  movq      xmm0, [Param1]      ; A.X, A.Y
  movq      xmm2, [Param2]      ; B.X, B.Y
  movss     xmm1, [Param1+8]    ; A.Z
  movss     xmm3, [Param2+8]    ; B.Z
  divps     xmm0, xmm2
  divss     xmm1, xmm3
  ret
3185
  
3186
; Euclidean distance between the 3-float vectors at [Self] and [Param2]:
; Sqrt(Dot(A - B, A - B)). Scalar result in the low lane of xmm0.
_vector3_distance:
  movq      xmm0, [Self]
  movss     xmm1, [Self+8]
  movlhps   xmm0, xmm1          ; A = 0 Z Y X
  movq      xmm2, [Param2]
  movss     xmm3, [Param2+8]
  movlhps   xmm2, xmm3          ; B = 0 Z Y X
  subps     xmm0, xmm2          ; D = A - B

  ; D.Length
  mulps     xmm0, xmm0          ; 0 Z*Z Y*Y X*X
  pshufd    xmm1, xmm0, 0x0E    ; # # 0 Z*Z
  addps     xmm0, xmm1          ; # # (Y*Y) (X*X+Z*Z)
  pshufd    xmm1, xmm0, 0x01    ; Lane 0 = (Y*Y)
  addss     xmm0, xmm1          ; X*X + Y*Y + Z*Z
  sqrtss    xmm0, xmm0
  ret
3203
  
3204
; Squared Euclidean distance between the 3-float vectors at [Self] and
; [Param2]: Dot(A - B, A - B). Scalar result in the low lane of xmm0.
_vector3_distance_squared:
  movq      xmm0, [Self]
  movss     xmm1, [Self+8]
  movlhps   xmm0, xmm1          ; A = 0 Z Y X
  movq      xmm2, [Param2]
  movss     xmm3, [Param2+8]
  movlhps   xmm2, xmm3          ; B = 0 Z Y X
  subps     xmm0, xmm2          ; D = A - B

  ; D.LengthSquared
  mulps     xmm0, xmm0          ; 0 Z*Z Y*Y X*X
  pshufd    xmm1, xmm0, 0x0E    ; # # 0 Z*Z
  addps     xmm0, xmm1          ; # # (Y*Y) (X*X+Z*Z)
  pshufd    xmm1, xmm0, 0x01    ; Lane 0 = (Y*Y)
  addss     xmm0, xmm1          ; X*X + Y*Y + Z*Z
  ret
3220
  
3221
; Length of the 3-float vector at [Self]: Sqrt(X*X + Y*Y + Z*Z).
; Scalar result in the low lane of xmm0.
_vector3_get_length:
  movq      xmm0, [Self]        ; 0 0 Y X
  movss     xmm1, [Self+8]      ; 0 0 0 Z
  movlhps   xmm0, xmm1          ; 0 Z Y X
  mulps     xmm0, xmm0          ; 0 Z*Z Y*Y X*X
  pshufd    xmm1, xmm0, 0x0E    ; # # 0 Z*Z
  addps     xmm0, xmm1          ; # # (Y*Y) (X*X+Z*Z)
  pshufd    xmm1, xmm0, 0x01    ; Lane 0 = (Y*Y)
  addss     xmm0, xmm1          ; (X*X + Y*Y + Z*Z)
  sqrtss    xmm0, xmm0          ; Sqrt(X*X + Y*Y + Z*Z)
  ret
3232
  
3233
; Squared length of the 3-float vector at [Self]: X*X + Y*Y + Z*Z.
; Scalar result in the low lane of xmm0.
_vector3_get_length_squared:
  movq      xmm0, [Self]        ; 0 0 Y X
  movss     xmm1, [Self+8]      ; 0 0 0 Z
  movlhps   xmm0, xmm1          ; 0 Z Y X
  mulps     xmm0, xmm0          ; 0 Z*Z Y*Y X*X
  pshufd    xmm1, xmm0, 0x0E    ; # # 0 Z*Z
  addps     xmm0, xmm1          ; # # (Y*Y) (X*X+Z*Z)
  pshufd    xmm1, xmm0, 0x01    ; Lane 0 = (Y*Y)
  addss     xmm0, xmm1          ; (X*X + Y*Y + Z*Z)
  ret
3243

3244
; Approximate normalization of the 3-float vector at [Self] using rsqrtps:
; A * (1 / Sqrt(Dot(A, A))).
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_vector3_normalize_fast:
  movq      xmm0, [Self]        ; 0 0 Y X
  movss     xmm1, [Self+8]      ; 0 0 0 Z
  movlhps   xmm0, xmm1          ; 0 Z Y X
  movaps    xmm2, xmm0          ; Keep A

  ; Dot(A, A), broadcast to every lane
  mulps     xmm0, xmm0          ; 0 Z*Z Y*Y X*X
  pshufd    xmm1, xmm0, 0x4E    ; Y*Y X*X 0 Z*Z
  addps     xmm0, xmm1          ; (Y*Y) (X*X+Z*Z) (Y*Y) (X*X+Z*Z)
  pshufd    xmm1, xmm0, 0x11    ; (X*X+Z*Z) (Y*Y) (X*X+Z*Z) (Y*Y)
  addps     xmm0, xmm1          ; (X*X + Y*Y + Z*Z) (4x)

  rsqrtps   xmm0, xmm0          ; Approximate 1 / Sqrt(Dot(A, A)) (4x)
  mulps     xmm0, xmm2          ; A * (1 / Sqrt(Dot(A, A)))
  movhlps   xmm1, xmm0
  ret
3261
  
3262
; In-place approximate normalization of the 3-float vector at [Self] using
; rsqrtps: A := A * (1 / Sqrt(Dot(A, A))). The three floats are written back
; to [Self] as an 8-byte store followed by a 4-byte store.
_vector3_set_normalized_fast:
  movq      xmm0, [Self]        ; 0 0 Y X
  movss     xmm1, [Self+8]      ; 0 0 0 Z
  movlhps   xmm0, xmm1          ; 0 Z Y X
  movaps    xmm2, xmm0          ; Keep A

  ; Dot(A, A), broadcast to every lane
  mulps     xmm0, xmm0          ; 0 Z*Z Y*Y X*X
  pshufd    xmm1, xmm0, 0x4E    ; Y*Y X*X 0 Z*Z
  addps     xmm0, xmm1          ; (Y*Y) (X*X+Z*Z) (Y*Y) (X*X+Z*Z)
  pshufd    xmm1, xmm0, 0x11    ; (X*X+Z*Z) (Y*Y) (X*X+Z*Z) (Y*Y)
  addps     xmm0, xmm1          ; (X*X + Y*Y + Z*Z) (4x)

  rsqrtps   xmm0, xmm0          ; Approximate 1 / Sqrt(Dot(A, A)) (4x)
  mulps     xmm0, xmm2          ; A * (1 / Sqrt(Dot(A, A)))
  movhlps   xmm1, xmm0          ; Z into the low lane of xmm1
  movq      [Self], xmm0        ; Write X and Y
  movss     [Self+8], xmm1      ; Write Z
  ret
3281
  
3282
; Reflect for 3-float vectors: I at [Self], N at [Param2].
; Computes I - ((2 * Dot(N, I)) * N).
; Result: X,Y in xmm0, Z in the low lane of xmm1.
_vector3_reflect:
  movq      xmm0, [Self]
  movss     xmm2, [Self+8]
  movq      xmm1, [Param2]
  movss     xmm3, [Param2+8]
  movlhps   xmm0, xmm2          ; I = 0 Z Y X
  movlhps   xmm1, xmm3          ; N = 0 Z Y X
  movaps    xmm2, xmm0          ; Keep I
  movups    xmm3, [rel kSSE_TWO]

  ; Dot(N, I), broadcast to every lane
  mulps     xmm0, xmm1          ; I * N per lane
  mulps     xmm3, xmm1          ; N * 2
  pshufd    xmm1, xmm0, 0x4E
  addps     xmm0, xmm1
  pshufd    xmm1, xmm0, 0x11
  addps     xmm0, xmm1

  ; (2 * Dot(N, I)) * N
  mulps     xmm0, xmm3

  ; I - ((2 * Dot(N, I)) * N)
  subps     xmm2, xmm0
  movhlps   xmm1, xmm2
  movaps    xmm0, xmm2
  ret
3308
  
3309
; Refract for 3-float vectors: I at [Self], N at [Param2], Eta in xmm0.
;   D := Dot(N, I)
;   K := 1 - Eta * Eta * (1 - D * D)
;   Result := (K < 0) ? (0, 0, 0) : (Eta * I) - ((Eta * D + Sqrt(K)) * N)
; Result: X,Y in xmm0, Z in the low lane of xmm1. Uses xmm2-xmm7 as scratch.
;
; N is kept in xmm1 for the whole computation so it never has to be re-read
; with a 16-byte load (the vector is only 12 bytes long); xmm4 serves as the
; shuffle temporary for the dot product instead.
_vector3_refract:
  movq      xmm3, [Self]
  movss     xmm2, [Self+8]
  movq      xmm1, [Param2]
  movss     xmm4, [Param2+8]
  movlhps   xmm3, xmm2          ; I = 0 Z Y X
  movlhps   xmm1, xmm4          ; N = 0 Z Y X (stays live until the end)
  movaps    xmm7, xmm3          ; Keep I
  movss     xmm2, [rel kSSE_ONE]

  ; D := Dot(N, I)
  mulps     xmm3, xmm1
  movss     xmm5, xmm0          ; Eta
  pshufd    xmm4, xmm3, 0x4E
  mulss     xmm5, xmm5          ; Eta * Eta
  addps     xmm3, xmm4
  pshufd    xmm4, xmm3, 0x11
  addss     xmm3, xmm4          ; D in lane 0

  ; K := 1 - Eta * Eta * (1 - D * D)
  movss     xmm4, xmm2          ; 1
  movss     xmm6, xmm3          ; D
  mulss     xmm3, xmm3          ; D * D
  subss     xmm4, xmm3          ; 1 - D * D
  mulss     xmm4, xmm5          ; Eta * Eta * (1 - D * D)
  xorps     xmm5, xmm5          ; 0
  subss     xmm2, xmm4          ; K := 1 - Eta * Eta * (1 - D * D)

  ; if (K < 0) then return the null vector (jb is also taken when K is NaN)
  comiss    xmm2, xmm5
  jb        _set_null_vec3

  ; K >= 0
  mulss     xmm6, xmm0          ; Eta * D
  shufps    xmm0, xmm0, 0       ; Replicate Eta (4x)
  mulps     xmm7, xmm0          ; Eta * I
  sqrtss    xmm2, xmm2          ; Sqrt(K)
  addss     xmm6, xmm2          ; Eta * D + Sqrt(K)
  shufps    xmm6, xmm6, 0       ; Replicate Eta * D + Sqrt(K) (4x)
  mulps     xmm6, xmm1          ; ((Eta * D + Sqrt(K)) * N)
  subps     xmm7, xmm6          ; (Eta * I) - ((Eta * D + Sqrt(K)) * N)
  movaps    xmm0, xmm7
  movhlps   xmm1, xmm7
  ret

_set_null_vec3:
  ; K < 0: Result := Vector3(0, 0, 0). xmm5 is all zeroes here; both return
  ; registers are cleared so the Z lane (low lane of xmm1) is zero as well.
  movaps    xmm0, xmm5
  movaps    xmm1, xmm5
  ret
3361
  
3362
;****************************************************************************
3363
; TVector4
3364
;****************************************************************************
3365
  
3366
; Adds the scalar B (xmm0) to each lane of the 4-float vector at [Param1].
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_vector4_add_single:
  shufps    xmm0, xmm0, 0       ; Replicate B
  movups    xmm2, [Param1]      ; A
  addps     xmm2, xmm0          ; A + B
  movhlps   xmm1, xmm2
  movaps    xmm0, xmm2
  ret
3373
  
3374
; Adds the scalar A (xmm0) to each lane of the 4-float vector at [Param1].
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_single_add_vector4:
  shufps    xmm0, xmm0, 0       ; Replicate A
  movups    xmm1, [Param1]      ; B
  addps     xmm1, xmm0          ; B + A
  movaps    xmm0, xmm1
  movhlps   xmm1, xmm1
  ret
3381
  
3382
; Per-lane sum of the 4-float vectors at [Param1] and [Param2].
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_vector4_add_vector4:
  movups    xmm1, [Param2]      ; B
  movups    xmm0, [Param1]      ; A
  addps     xmm0, xmm1          ; A + B
  movhlps   xmm1, xmm0
  ret
3388
  
3389
; Subtracts the scalar B (xmm0) from each lane of the 4-float vector at
; [Param1]. Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_vector4_sub_single:
  shufps    xmm0, xmm0, 0       ; Replicate B
  movups    xmm2, [Param1]      ; A
  subps     xmm2, xmm0          ; A - B
  movhlps   xmm1, xmm2
  movaps    xmm0, xmm2
  ret
3396
  
3397
; Subtracts each lane of the 4-float vector at [Param1] from the scalar A
; (xmm0). Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_single_sub_vector4:
  shufps    xmm0, xmm0, 0       ; Replicate A
  movups    xmm1, [Param1]      ; B
  subps     xmm0, xmm1          ; A - B
  movhlps   xmm1, xmm0
  ret
3403
  
3404
; Per-lane difference of the 4-float vectors at [Param1] and [Param2] (A - B).
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_vector4_sub_vector4:
  movups    xmm1, [Param2]      ; B
  movups    xmm0, [Param1]      ; A
  subps     xmm0, xmm1          ; A - B
  movhlps   xmm1, xmm0
  ret
3410
  
3411
; Multiplies each lane of the 4-float vector at [Param1] by the scalar B
; (xmm0). Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_vector4_mul_single:
  shufps    xmm0, xmm0, 0       ; Replicate B
  movups    xmm2, [Param1]      ; A
  mulps     xmm2, xmm0          ; A * B
  movhlps   xmm1, xmm2
  movaps    xmm0, xmm2
  ret
3418
  
3419
; Multiplies the scalar A (xmm0) with each lane of the 4-float vector at
; [Param1]. Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_single_mul_vector4:
  shufps    xmm0, xmm0, 0       ; Replicate A
  movups    xmm1, [Param1]      ; B
  mulps     xmm0, xmm1          ; A * B
  movhlps   xmm1, xmm0
  ret
3425
  
3426
; Per-lane product of the 4-float vectors at [Param1] and [Param2].
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_vector4_mul_vector4:
  movups    xmm1, [Param2]      ; B
  movups    xmm0, [Param1]      ; A
  mulps     xmm0, xmm1          ; A * B
  movhlps   xmm1, xmm0
  ret
3432
  
3433
; Divides each lane of the 4-float vector at [Param1] by the scalar B (xmm0).
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_vector4_div_single:
  shufps    xmm0, xmm0, 0       ; Replicate B
  movups    xmm2, [Param1]      ; A
  divps     xmm2, xmm0          ; A / B
  movhlps   xmm1, xmm2
  movaps    xmm0, xmm2
  ret
3440
  
3441
; Divides the scalar A (xmm0) by each lane of the 4-float vector at [Param1].
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_single_div_vector4:
  shufps    xmm0, xmm0, 0       ; Replicate A
  movups    xmm1, [Param1]      ; B
  divps     xmm0, xmm1          ; A / B
  movhlps   xmm1, xmm0
  ret
3447
  
3448
; Per-lane quotient of the 4-float vectors at [Param1] and [Param2] (A / B).
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_vector4_div_vector4:
  movups    xmm1, [Param2]      ; B
  movups    xmm0, [Param1]      ; A
  divps     xmm0, xmm1          ; A / B
  movhlps   xmm1, xmm0
  ret
3454

3455
; Negates the 4-float vector at [Param1] by XOR-ing every lane with the
; sign-bit mask. Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_vector4_negative:
  movups    xmm1, [Param1]             ; A
  movaps    xmm0, [rel kSSE_MASK_SIGN] ; 0x80000000 in each lane
  xorps     xmm0, xmm1                 ; Flip sign bit
  movhlps   xmm1, xmm0
  ret
3461
  
3462
; Euclidean distance between the 4-float vectors at [Self] and [Param2]:
; Sqrt(Dot(A - B, A - B)). Scalar result in the low lane of xmm0.
_vector4_distance:
  movups    xmm1, [Param2]      ; B
  movups    xmm0, [Self]        ; A
  subps     xmm0, xmm1          ; D = A - B

  ; D.Length
  mulps     xmm0, xmm0          ; W*W Z*Z Y*Y X*X
  pshufd    xmm1, xmm0, 0x0E    ; # # W*W Z*Z
  addps     xmm0, xmm1          ; # # (Y*Y+W*W) (X*X+Z*Z)
  pshufd    xmm1, xmm0, 0x01    ; Lane 0 = (Y*Y+W*W)
  addss     xmm0, xmm1          ; X*X + Y*Y + Z*Z + W*W
  sqrtss    xmm0, xmm0
  ret
3475
  
3476
; Squared Euclidean distance between the 4-float vectors at [Self] and
; [Param2]: Dot(A - B, A - B). Scalar result in the low lane of xmm0.
_vector4_distance_squared:
  movups    xmm1, [Param2]      ; B
  movups    xmm0, [Self]        ; A
  subps     xmm0, xmm1          ; D = A - B

  ; D.LengthSquared
  mulps     xmm0, xmm0          ; W*W Z*Z Y*Y X*X
  pshufd    xmm1, xmm0, 0x0E    ; # # W*W Z*Z
  addps     xmm0, xmm1          ; # # (Y*Y+W*W) (X*X+Z*Z)
  pshufd    xmm1, xmm0, 0x01    ; Lane 0 = (Y*Y+W*W)
  addss     xmm0, xmm1          ; X*X + Y*Y + Z*Z + W*W
  ret
3488
  
3489
; FaceForward for 4-float vectors: N at [Self], I at [Param2], NRef at
; [Param3]. Returns N with its sign flipped when Dot(NRef, I) >= 0, and N
; unchanged otherwise.
; Result: lanes 0-1 in xmm0, lanes 2-3 in the low half of xmm1.
_vector4_face_forward:
  movups    xmm0, [Self]        ; N
  movups    xmm1, [Param2]      ; I
  movups    xmm2, [Param3]      ; NRef
  movaps    xmm4, [rel kSSE_MASK_SIGN]
  xorps     xmm3, xmm3          ; 0

  ; Dot(NRef, I), broadcast to every lane
  mulps     xmm2, xmm1
  pshufd    xmm1, xmm2, 0x4E
  addps     xmm2, xmm1
  pshufd    xmm1, xmm2, 0x11
  addps     xmm2, xmm1

  ; Dot(NRef, I) >= 0?  Yes: 0xFFFFFFFF, No: 0x00000000
  cmpnltps  xmm2, xmm3
  andps     xmm2, xmm4          ; Yes: 0x80000000, No: 0x00000000

  ; Flip sign of N if (Dot(NRef, I) >= 0)
  xorps     xmm0, xmm2
  movhlps   xmm1, xmm0
  ret
3511
  
3512
; Length of the 4-float vector at [Self]: Sqrt(X*X + Y*Y + Z*Z + W*W).
; Scalar result in the low lane of xmm0.
_vector4_get_length:
  movups    xmm0, [Self]        ; W Z Y X
  mulps     xmm0, xmm0          ; W*W Z*Z Y*Y X*X
  pshufd    xmm1, xmm0, 0x0E    ; # # W*W Z*Z
  addps     xmm0, xmm1          ; # # (Y*Y+W*W) (X*X+Z*Z)
  pshufd    xmm1, xmm0, 0x01    ; Lane 0 = (Y*Y+W*W)
  addss     xmm0, xmm1          ; (X*X + Y*Y + Z*Z + W*W)
  sqrtss    xmm0, xmm0          ; Sqrt(X*X + Y*Y + Z*Z + W*W)
  ret
3521
  
3522
; Squared length of the 4-float vector at [Self]: X*X + Y*Y + Z*Z + W*W.
; Scalar result in the low lane of xmm0.
_vector4_get_length_squared:
  movups    xmm0, [Self]        ; W Z Y X
  mulps     xmm0, xmm0          ; W*W Z*Z Y*Y X*X
  pshufd    xmm1, xmm0, 0x0E    ; # # W*W Z*Z
  addps     xmm0, xmm1          ; # # (Y*Y+W*W) (X*X+Z*Z)
  pshufd    xmm1, xmm0, 0x01    ; Lane 0 = (Y*Y+W*W)
  addss     xmm0, xmm1          ; (X*X + Y*Y + Z*Z + W*W)
  ret
3530
 
3531
; _vector4_normalize_fast
;   Scales the 4-float vector at [Self] by the approximate reciprocal square
;   root (rsqrtps) of its squared length and returns the scaled vector.
;   Out: xmm0 = result (all four lanes), xmm1 low half = upper two lanes
_vector4_normalize_fast:
  movups    xmm2, [Self]            ; A, kept for the final scale
  movaps    xmm0, xmm2

  ; Dot(A, A), broadcast into every lane
  mulps     xmm0, xmm0              ; per-lane squares
  pshufd    xmm1, xmm0, 0x4E        ; swap the two 64-bit halves
  addps     xmm0, xmm1              ; pairwise sums
  pshufd    xmm1, xmm0, 0x11        ; swap neighbours within each half
  addps     xmm0, xmm1              ; 4x (X*X + Y*Y + Z*Z + W*W)

  rsqrtps   xmm0, xmm0              ; 4x approx 1 / Sqrt(Dot(A, A))
  mulps     xmm0, xmm2              ; scale A
  movhlps   xmm1, xmm0              ; upper two floats go back in xmm1
  ret
3546
  
3547
; _vector4_set_normalized_fast
;   In-place variant of the fast normalize: scales the 4-float vector at
;   [Self] by the approximate reciprocal square root (rsqrtps) of its squared
;   length and writes the result back to [Self].
_vector4_set_normalized_fast:
  movups    xmm2, [Self]            ; A, kept for the final scale
  movaps    xmm0, xmm2

  ; Dot(A, A), broadcast into every lane
  mulps     xmm0, xmm0              ; per-lane squares
  pshufd    xmm1, xmm0, 0x4E        ; swap the two 64-bit halves
  addps     xmm0, xmm1              ; pairwise sums
  pshufd    xmm1, xmm0, 0x11        ; swap neighbours within each half
  addps     xmm0, xmm1              ; 4x (X*X + Y*Y + Z*Z + W*W)

  rsqrtps   xmm0, xmm0              ; 4x approx 1 / Sqrt(Dot(A, A))
  mulps     xmm0, xmm2              ; scale A
  movups    [Self], xmm0            ; overwrite the source vector
  ret
3562
  
3563
; _vector4_reflect
;   Computes I - (2 * Dot(N, I)) * N.
;   In:  [Self] = I, [Param2] = N
;   Out: xmm0 = result (all four lanes), xmm1 low half = upper two lanes
_vector4_reflect:
  movups    xmm4, [Param2]          ; N
  movups    xmm2, [Self]            ; I, kept for the final subtraction
  movaps    xmm3, [rel kSSE_TWO]
  movaps    xmm0, xmm2

  mulps     xmm0, xmm4              ; I * N per lane
  mulps     xmm3, xmm4              ; kSSE_TWO * N

  ; Horizontal sum -> Dot(N, I) in every lane of xmm0
  pshufd    xmm1, xmm0, 0x4E
  addps     xmm0, xmm1
  pshufd    xmm1, xmm0, 0x11
  addps     xmm0, xmm1

  mulps     xmm0, xmm3              ; Dot(N, I) * (2 * N)
  subps     xmm2, xmm0              ; I - that product

  movaps    xmm0, xmm2
  movhlps   xmm1, xmm2              ; upper two floats go back in xmm1
  ret
3585
  
3586
; _vector4_refract
;   D := Dot(N, I)
;   K := 1 - Eta * Eta * (1 - D * D)
;   Result := (K < 0) ? (0, 0, 0, 0) : Eta * I - (Eta * D + Sqrt(K)) * N
;   In:  [Self] = I, [Param2] = N, xmm0 (low single) = Eta
;   Out: xmm0 = result (all four lanes), xmm1 low half = upper two lanes
;   N stays in xmm8 for the whole routine so it is read from memory only once;
;   register-to-register copies use movaps like the rest of the file.
_vector4_refract:
  movups    xmm8, [Param2]          ; N
  movups    xmm3, [Self]            ; I
  movss     xmm2, [rel kSSE_ONE]    ; presumably 1.0 (constant defined elsewhere)
  movaps    xmm7, xmm3              ; keep I for the final expression

  ; D := Dot(N, I); the Eta * Eta computation is interleaved with it
  mulps     xmm3, xmm8
  movss     xmm4, xmm2              ; 1
  pshufd    xmm1, xmm3, 0x4E
  movss     xmm5, xmm0              ; Eta
  addps     xmm3, xmm1
  mulss     xmm5, xmm5              ; Eta * Eta
  pshufd    xmm1, xmm3, 0x11
  addss     xmm3, xmm1              ; lane0 = D

  ; K := 1 - Eta * Eta * (1 - D * D)
  movss     xmm6, xmm3              ; D
  mulss     xmm3, xmm3              ; D * D
  subss     xmm4, xmm3              ; 1 - D * D
  mulss     xmm4, xmm5              ; Eta * Eta * (1 - D * D)
  xorps     xmm5, xmm5              ; 0 (also the null result below)
  subss     xmm2, xmm4              ; K

  ; K < 0 (or unordered, which also sets CF) -> return the null vector
  comiss    xmm2, xmm5
  jb        _set_null_vec4

  ; K >= 0
  mulss     xmm6, xmm0              ; Eta * D
  shufps    xmm0, xmm0, 0           ; replicate Eta (4x)
  mulps     xmm7, xmm0              ; Eta * I
  sqrtss    xmm2, xmm2              ; Sqrt(K)
  addss     xmm6, xmm2              ; Eta * D + Sqrt(K)
  shufps    xmm6, xmm6, 0           ; replicate it (4x)
  mulps     xmm6, xmm8              ; (Eta * D + Sqrt(K)) * N
  subps     xmm7, xmm6              ; Eta * I - (Eta * D + Sqrt(K)) * N
  movaps    xmm0, xmm7
  movhlps   xmm1, xmm7              ; upper two floats go back in xmm1
  ret

_set_null_vec4:
  ; K < 0: Result := Vector4(0, 0, 0, 0); xmm5 is all zeros here
  movaps    xmm0, xmm5
  movhlps   xmm1, xmm5
  ret
3634
  
3635
;****************************************************************************
3636
; TMatrix3
3637
;****************************************************************************
3638

3639
; _matrix3_add_single
;   [Param1] := A + B for every element of a 9-float matrix.
;   In:  [Param2] = A, xmm0 (low single) = B
;   The matrix is handled as 4 + 4 + 1 floats; all loads precede the stores.
_matrix3_add_single:
  shufps    xmm0, xmm0, 0           ; B in all four lanes
  movups    xmm1, [Param2 + 0x00]   ; floats 0..3
  movups    xmm2, [Param2 + 0x10]   ; floats 4..7
  movss     xmm3, [Param2 + 0x20]   ; float 8

  addps     xmm1, xmm0
  addps     xmm2, xmm0
  addss     xmm3, xmm0

  movups    [Param1 + 0x00], xmm1
  movups    [Param1 + 0x10], xmm2
  movss     [Param1 + 0x20], xmm3
  ret
3651

3652
; _single_add_matrix3
;   [Param1] := B + A for every element of a 9-float matrix.
;   In:  xmm0 (low single) = A, [Param2] = B
;   The matrix is handled as 4 + 4 + 1 floats; all loads precede the stores.
_single_add_matrix3:
  movups    xmm2, [Param2 + 0x00]   ; floats 0..3
  movups    xmm3, [Param2 + 0x10]   ; floats 4..7
  movss     xmm4, [Param2 + 0x20]   ; float 8
  shufps    xmm0, xmm0, 0           ; A in all four lanes

  addps     xmm2, xmm0
  addps     xmm3, xmm0
  addss     xmm4, xmm0

  movups    [Param1 + 0x00], xmm2
  movups    [Param1 + 0x10], xmm3
  movss     [Param1 + 0x20], xmm4
  ret
3664

3665
; _matrix3_add_matrix3
;   [Param1] := A + B, element by element, for 9-float matrices.
;   In:  [Param2] = A, [Param3] = B
;   Each matrix is handled as 4 + 4 + 1 floats; all loads precede the stores.
_matrix3_add_matrix3:
  movups    xmm3, [Param3 + 0x00]   ; B, floats 0..3
  movups    xmm4, [Param3 + 0x10]   ; B, floats 4..7
  movss     xmm5, [Param3 + 0x20]   ; B, float 8
  movups    xmm0, [Param2 + 0x00]   ; A, floats 0..3
  movups    xmm1, [Param2 + 0x10]   ; A, floats 4..7
  movss     xmm2, [Param2 + 0x20]   ; A, float 8

  addps     xmm0, xmm3
  addps     xmm1, xmm4
  addss     xmm2, xmm5

  movups    [Param1 + 0x00], xmm0
  movups    [Param1 + 0x10], xmm1
  movss     [Param1 + 0x20], xmm2
  ret
3679
  
3680
; _matrix3_sub_single
;   [Param1] := A - B for every element of a 9-float matrix.
;   In:  [Param2] = A, xmm0 (low single) = B
;   The matrix is handled as 4 + 4 + 1 floats; all loads precede the stores.
_matrix3_sub_single:
  shufps    xmm0, xmm0, 0           ; B in all four lanes
  movups    xmm2, [Param2 + 0x00]   ; floats 0..3
  movups    xmm3, [Param2 + 0x10]   ; floats 4..7
  movss     xmm4, [Param2 + 0x20]   ; float 8

  subps     xmm2, xmm0
  subps     xmm3, xmm0
  subss     xmm4, xmm0

  movups    [Param1 + 0x00], xmm2
  movups    [Param1 + 0x10], xmm3
  movss     [Param1 + 0x20], xmm4
  ret
3692
  
3693
; _single_sub_matrix3
;   [Param1] := A - B for every element of a 9-float matrix B.
;   In:  xmm0 (low single) = A, [Param2] = B
;   A is broadcast and copied once per chunk because it is the minuend.
_single_sub_matrix3:
  movups    xmm3, [Param2 + 0x00]   ; B, floats 0..3
  movups    xmm4, [Param2 + 0x10]   ; B, floats 4..7
  movss     xmm5, [Param2 + 0x20]   ; B, float 8

  shufps    xmm0, xmm0, 0           ; A in all four lanes
  movaps    xmm1, xmm0
  movaps    xmm2, xmm0

  subps     xmm0, xmm3
  subps     xmm1, xmm4
  subss     xmm2, xmm5

  movups    [Param1 + 0x00], xmm0
  movups    [Param1 + 0x10], xmm1
  movss     [Param1 + 0x20], xmm2
  ret
3707
  
3708
; _matrix3_sub_matrix3
;   [Param1] := A - B, element by element, for 9-float matrices.
;   In:  [Param2] = A, [Param3] = B
;   Each matrix is handled as 4 + 4 + 1 floats; all loads precede the stores.
_matrix3_sub_matrix3:
  movups    xmm3, [Param3 + 0x00]   ; B, floats 0..3
  movups    xmm4, [Param3 + 0x10]   ; B, floats 4..7
  movss     xmm5, [Param3 + 0x20]   ; B, float 8
  movups    xmm0, [Param2 + 0x00]   ; A, floats 0..3
  movups    xmm1, [Param2 + 0x10]   ; A, floats 4..7
  movss     xmm2, [Param2 + 0x20]   ; A, float 8

  subps     xmm0, xmm3
  subps     xmm1, xmm4
  subss     xmm2, xmm5

  movups    [Param1 + 0x00], xmm0
  movups    [Param1 + 0x10], xmm1
  movss     [Param1 + 0x20], xmm2
  ret
3722
  
3723
; _matrix3_mul_single
;   [Param1] := A * B for every element of a 9-float matrix.
;   In:  [Param2] = A, xmm0 (low single) = B
;   The matrix is handled as 4 + 4 + 1 floats; all loads precede the stores.
_matrix3_mul_single:
  shufps    xmm0, xmm0, 0           ; B in all four lanes
  movups    xmm2, [Param2 + 0x00]   ; floats 0..3
  movups    xmm3, [Param2 + 0x10]   ; floats 4..7
  movss     xmm4, [Param2 + 0x20]   ; float 8

  mulps     xmm2, xmm0
  mulps     xmm3, xmm0
  mulss     xmm4, xmm0

  movups    [Param1 + 0x00], xmm2
  movups    [Param1 + 0x10], xmm3
  movss     [Param1 + 0x20], xmm4
  ret
3735
  
3736
; _single_mul_matrix3
;   [Param1] := B * A for every element of a 9-float matrix.
;   In:  xmm0 (low single) = A, [Param2] = B
;   The matrix is handled as 4 + 4 + 1 floats; all loads precede the stores.
_single_mul_matrix3:
  movups    xmm1, [Param2 + 0x00]   ; floats 0..3
  movups    xmm2, [Param2 + 0x10]   ; floats 4..7
  movss     xmm3, [Param2 + 0x20]   ; float 8
  shufps    xmm0, xmm0, 0           ; A in all four lanes

  mulps     xmm1, xmm0
  mulps     xmm2, xmm0
  mulss     xmm3, xmm0

  movups    [Param1 + 0x00], xmm1
  movups    [Param1 + 0x10], xmm2
  movss     [Param1 + 0x20], xmm3
  ret
3748
  
3749
; _matrix3_comp_mult
;   Component-wise product of two 9-float matrices (not a matrix product).
;   In:  [Param2] = Self, [Param3] = AOther
;   Out: [Param1] = Self[i] * AOther[i] for each of the 9 floats
_matrix3_comp_mult:
  movups    xmm3, [Param3 + 0x00]   ; AOther, floats 0..3
  movups    xmm4, [Param3 + 0x10]   ; AOther, floats 4..7
  movss     xmm5, [Param3 + 0x20]   ; AOther, float 8
  movups    xmm0, [Param2 + 0x00]   ; Self, floats 0..3
  movups    xmm1, [Param2 + 0x10]   ; Self, floats 4..7
  movss     xmm2, [Param2 + 0x20]   ; Self, float 8

  mulps     xmm0, xmm3
  mulps     xmm1, xmm4
  mulss     xmm2, xmm5

  movups    [Param1 + 0x00], xmm0
  movups    [Param1 + 0x10], xmm1
  movss     [Param1 + 0x20], xmm2
  ret
3767
  
3768
; M3_MUL_V3 <matrix ptr>, <vector ptr>
;   Emits a complete routine (including the final ret) that returns, for each
;   of the three 3-float rows of the matrix at [%1], the dot product of that
;   row with the 3-float vector at [%2].
;   Out: xmm0 = (d0, d1, d2, 0) low lane first, xmm1 low half = (d2, 0)
;   Every operand is assembled from an 8-byte and a 4-byte load, so exactly
;   12 bytes are read per row/vector. Clobbers xmm0-xmm6.
%macro M3_MUL_V3 2
  ; Three matrix rows -> xmm4, xmm5, xmm6 (4th lane is zero)
  movq      xmm4, [%1 + 0x00]
  movss     xmm3, [%1 + 0x08]
  movlhps   xmm4, xmm3

  movq      xmm5, [%1 + 0x0C]
  movss     xmm3, [%1 + 0x14]
  movlhps   xmm5, xmm3

  movq      xmm6, [%1 + 0x18]
  movss     xmm3, [%1 + 0x20]
  movlhps   xmm6, xmm3

  ; Vector -> xmm0, with copies in xmm1 and xmm2
  movq      xmm0, [%2]
  movss     xmm3, [%2 + 8]
  movlhps   xmm0, xmm3
  movaps    xmm1, xmm0
  movaps    xmm2, xmm0

  ; P0, P1, P2 = per-lane products of the vector with rows 0, 1, 2
  mulps     xmm0, xmm4
  mulps     xmm1, xmm5
  mulps     xmm2, xmm6
  xorps     xmm3, xmm3              ; zeros, used as the missing 4th row

  ; Transpose (P0, P1, P2, 0); lane listings are low lane first
  movaps    xmm4, xmm2
  unpcklps  xmm2, xmm3              ; P2.x, 0, P2.y, 0
  unpckhps  xmm4, xmm3              ; P2.z, 0, 0, 0

  movaps    xmm3, xmm0
  unpcklps  xmm0, xmm1              ; P0.x, P1.x, P0.y, P1.y
  unpckhps  xmm3, xmm1              ; P0.z, P1.z, 0, 0

  movaps    xmm1, xmm0
  unpcklpd  xmm0, xmm2              ; P0.x, P1.x, P2.x, 0
  unpckhpd  xmm1, xmm2              ; P0.y, P1.y, P2.y, 0

  unpcklpd  xmm3, xmm4              ; P0.z, P1.z, P2.z, 0

  ; Column sums = the three dot products
  addps     xmm0, xmm1
  addps     xmm0, xmm3
  movhlps   xmm1, xmm0              ; third float goes back in xmm1
  ret
%endmacro
3813

3814
; V3_MUL_M3 <vector ptr>, <matrix ptr>
;   Emits a complete routine (including the final ret) that returns
;   V.x * Row0 + V.y * Row1 + V.z * Row2, where V is the 3-float vector at
;   [%1] and Row0..Row2 are the 3-float rows of the matrix at [%2].
;   Out: xmm0 = result (4th lane is 0), xmm1 low half = lanes 2 and 3
;   Clobbers xmm0-xmm6.
%macro V3_MUL_M3 2
  ; Three matrix rows -> xmm4, xmm5, xmm6 (4th lane is zero)
  movq      xmm4, [%2 + 0x00]
  movss     xmm3, [%2 + 0x08]
  movlhps   xmm4, xmm3

  movq      xmm5, [%2 + 0x0C]
  movss     xmm3, [%2 + 0x14]
  movlhps   xmm5, xmm3

  movq      xmm6, [%2 + 0x18]
  movss     xmm3, [%2 + 0x20]
  movlhps   xmm6, xmm3

  ; Vector -> xmm0, then one broadcast register per component
  movq      xmm0, [%1]
  movss     xmm1, [%1 + 8]
  movlhps   xmm0, xmm1
  movaps    xmm1, xmm0
  movaps    xmm2, xmm0
  shufps    xmm0, xmm0, 0x00        ; 4x V.x
  shufps    xmm1, xmm1, 0x55        ; 4x V.y
  shufps    xmm2, xmm2, 0xAA        ; 4x V.z

  mulps     xmm0, xmm4              ; V.x * Row0
  mulps     xmm1, xmm5              ; V.y * Row1
  mulps     xmm2, xmm6              ; V.z * Row2
  addps     xmm0, xmm1
  addps     xmm0, xmm2
  movhlps   xmm1, xmm0              ; third float goes back in xmm1
  ret
%endmacro
3845

3846
; M3_MUL_M3_ROW <A ptr>, <row byte offset>
;   Helper for M3_MUL_M3. Expects the three rows of B in xmm4, xmm5, xmm6.
;   Reads the 3-float row of A at [%1 + %2], computes
;   a.x * B.R[0] + a.y * B.R[1] + a.z * B.R[2] and stores the 3 resulting
;   floats at [Param1 + %2]. Clobbers xmm0-xmm2.
%macro M3_MUL_M3_ROW 2
  movq      xmm0, [%1 + %2]
  movss     xmm1, [%1 + %2 + 0x08]
  movlhps   xmm0, xmm1

  movaps    xmm1, xmm0
  movaps    xmm2, xmm0
  shufps    xmm0, xmm0, 0x00        ; 4x a.x
  shufps    xmm1, xmm1, 0x55        ; 4x a.y
  shufps    xmm2, xmm2, 0xAA        ; 4x a.z

  mulps     xmm0, xmm4
  mulps     xmm1, xmm5
  mulps     xmm2, xmm6
  addps     xmm0, xmm1
  addps     xmm0, xmm2

  movhlps   xmm1, xmm0
  movq      [Param1 + %2], xmm0
  movss     [Param1 + %2 + 0x08], xmm1
%endmacro

; M3_MUL_M3 <A ptr>, <B ptr>
;   Emits a complete routine (including the final ret) that writes, for each
;   row i of the 3x3 matrix at [%1], the combination
;   A[i].x * B.R[0] + A[i].y * B.R[1] + A[i].z * B.R[2] to row i of [Param1].
;   All of B is loaded before the first store, and row i of A is read before
;   row i of the result is written.
%macro M3_MUL_M3 2
  ; Rows of B -> xmm4, xmm5, xmm6 (4th lane is zero)
  movq      xmm4, [%2 + 0x00]
  movss     xmm3, [%2 + 0x08]
  movlhps   xmm4, xmm3

  movq      xmm5, [%2 + 0x0C]
  movss     xmm3, [%2 + 0x14]
  movlhps   xmm5, xmm3

  movq      xmm6, [%2 + 0x18]
  movss     xmm3, [%2 + 0x20]
  movlhps   xmm6, xmm3

  M3_MUL_M3_ROW %1, 0x00            ; A.R[0] * B
  M3_MUL_M3_ROW %1, 0x0C            ; A.R[1] * B
  M3_MUL_M3_ROW %1, 0x18            ; A.R[2] * B
  ret
%endmacro
3918
  
3919
; Matrix3 multiplication entry points.
; Each label is emitted once; FM_COLUMN_MAJOR selects which macro expands
; under it and in which order the parameter pointers are passed. Every macro
; ends in its own ret.
_matrix3_mul_vector3:
%ifdef FM_COLUMN_MAJOR
  V3_MUL_M3 Param2, Param1
%else
  M3_MUL_V3 Param1, Param2
%endif

_vector3_mul_matrix3:
%ifdef FM_COLUMN_MAJOR
  M3_MUL_V3 Param2, Param1
%else
  V3_MUL_M3 Param1, Param2
%endif

_matrix3_mul_matrix3:
%ifdef FM_COLUMN_MAJOR
  M3_MUL_M3 Param3, Param2
%else
  M3_MUL_M3 Param2, Param3
%endif
3938

3939
; _matrix3_div_single
;   [Param1] := A / B for every element of a 9-float matrix.
;   In:  [Param2] = A, xmm0 (low single) = B
;   The matrix is handled as 4 + 4 + 1 floats; all loads precede the stores.
_matrix3_div_single:
  shufps    xmm0, xmm0, 0           ; B in all four lanes
  movups    xmm2, [Param2 + 0x00]   ; floats 0..3
  movups    xmm3, [Param2 + 0x10]   ; floats 4..7
  movss     xmm4, [Param2 + 0x20]   ; float 8

  divps     xmm2, xmm0
  divps     xmm3, xmm0
  divss     xmm4, xmm0

  movups    [Param1 + 0x00], xmm2
  movups    [Param1 + 0x10], xmm3
  movss     [Param1 + 0x20], xmm4
  ret
3951
  
3952
; _single_div_matrix3
;   [Param1] := A / B for every element of a 9-float matrix B.
;   In:  xmm0 (low single) = A, [Param2] = B
;   A is broadcast and copied once per chunk because it is the dividend.
_single_div_matrix3:
  movups    xmm3, [Param2 + 0x00]   ; B, floats 0..3
  movups    xmm4, [Param2 + 0x10]   ; B, floats 4..7
  movss     xmm5, [Param2 + 0x20]   ; B, float 8

  shufps    xmm0, xmm0, 0           ; A in all four lanes
  movaps    xmm1, xmm0
  movaps    xmm2, xmm0

  divps     xmm0, xmm3
  divps     xmm1, xmm4
  divss     xmm2, xmm5

  movups    [Param1 + 0x00], xmm0
  movups    [Param1 + 0x10], xmm1
  movss     [Param1 + 0x20], xmm2
  ret
3966
  
3967
; _matrix3_negative
;   [Param1] := -A for every element of a 9-float matrix, by flipping each
;   element's sign bit.
;   In:  [Param2] = A
;   The sign mask is loaded with movaps and applied with xorps throughout,
;   matching _matrix4_negative; the mask sits behind an ALIGN 16 in .data.
_matrix3_negative:
  movaps    xmm0, [rel kSSE_MASK_SIGN]  ; 4x 0x80000000
  movups    xmm1, [Param2 + 0x00]       ; floats 0..3
  movups    xmm2, [Param2 + 0x10]       ; floats 4..7
  movss     xmm3, [Param2 + 0x20]       ; float 8

  xorps     xmm1, xmm0                  ; flip the sign bit of every lane
  xorps     xmm2, xmm0
  xorps     xmm3, xmm0                  ; only the low lane is stored below

  movups    [Param1 + 0x00], xmm1
  movups    [Param1 + 0x10], xmm2
  movss     [Param1 + 0x20], xmm3
  ret
3979
  
3980
; _matrix3_transpose
;   [Param1] := transpose of the 3x3 float matrix at [Param2].
;   Element (r, c) at byte offset 12*r + 4*c is written to offset 12*c + 4*r.
;   All nine elements are loaded before the first store, so the result is
;   still correct if Param1 and Param2 point at the same matrix.
;   Clobbers xmm0-xmm8.
_matrix3_transpose:
  movss     xmm0, [Param2 + 0x00]   ; M[0,0]
  movss     xmm1, [Param2 + 0x04]   ; M[0,1]
  movss     xmm2, [Param2 + 0x08]   ; M[0,2]
  movss     xmm3, [Param2 + 0x0C]   ; M[1,0]
  movss     xmm4, [Param2 + 0x10]   ; M[1,1]
  movss     xmm5, [Param2 + 0x14]   ; M[1,2]
  movss     xmm6, [Param2 + 0x18]   ; M[2,0]
  movss     xmm7, [Param2 + 0x1C]   ; M[2,1]
  movss     xmm8, [Param2 + 0x20]   ; M[2,2]

  movss     [Param1 + 0x00], xmm0   ; R[0,0] = M[0,0]
  movss     [Param1 + 0x0C], xmm1   ; R[1,0] = M[0,1]
  movss     [Param1 + 0x18], xmm2   ; R[2,0] = M[0,2]
  movss     [Param1 + 0x04], xmm3   ; R[0,1] = M[1,0]
  movss     [Param1 + 0x10], xmm4   ; R[1,1] = M[1,1]
  movss     [Param1 + 0x1C], xmm5   ; R[2,1] = M[1,2]
  movss     [Param1 + 0x08], xmm6   ; R[0,2] = M[2,0]
  movss     [Param1 + 0x14], xmm7   ; R[1,2] = M[2,1]
  movss     [Param1 + 0x20], xmm8   ; R[2,2] = M[2,2]
  ret
4005
  
4006
; _matrix3_set_transposed
;   Transposes the 3x3 float matrix at [Param1] in place by swapping the
;   three off-diagonal pairs; the diagonal elements are left untouched.
_matrix3_set_transposed:
  ; M[0,1] <-> M[1,0]
  movss     xmm0, [Param1 + 0x04]
  movss     xmm1, [Param1 + 0x0C]
  movss     [Param1 + 0x0C], xmm0
  movss     [Param1 + 0x04], xmm1

  ; M[0,2] <-> M[2,0]
  movss     xmm0, [Param1 + 0x08]
  movss     xmm1, [Param1 + 0x18]
  movss     [Param1 + 0x18], xmm0
  movss     [Param1 + 0x08], xmm1

  ; M[1,2] <-> M[2,1]
  movss     xmm0, [Param1 + 0x14]
  movss     xmm1, [Param1 + 0x1C]
  movss     [Param1 + 0x1C], xmm0
  movss     [Param1 + 0x14], xmm1
  ret
4025
  
4026
;****************************************************************************
4027
; TMatrix4
4028
;****************************************************************************
4029
  
4030
; _matrix4_add_single
;   [Param1] := A + B for every element of a 4x4 float matrix.
;   In:  [Param2] = A, xmm0 (low single) = B
;   One 4-float row per register; all loads precede the stores.
_matrix4_add_single:
  shufps    xmm0, xmm0, 0           ; B in all four lanes
  movups    xmm4, [Param2 + 0x00]
  movups    xmm5, [Param2 + 0x10]
  movups    xmm6, [Param2 + 0x20]
  movups    xmm7, [Param2 + 0x30]

  addps     xmm4, xmm0
  addps     xmm5, xmm0
  addps     xmm6, xmm0
  addps     xmm7, xmm0

  movups    [Param1 + 0x00], xmm4
  movups    [Param1 + 0x10], xmm5
  movups    [Param1 + 0x20], xmm6
  movups    [Param1 + 0x30], xmm7
  ret
4045
  
4046
; _single_add_matrix4
;   [Param1] := B + A for every element of a 4x4 float matrix.
;   In:  xmm0 (low single) = A, [Param2] = B
;   One 4-float row per register; all loads precede the stores.
_single_add_matrix4:
  movups    xmm4, [Param2 + 0x00]
  movups    xmm5, [Param2 + 0x10]
  movups    xmm6, [Param2 + 0x20]
  movups    xmm7, [Param2 + 0x30]
  shufps    xmm0, xmm0, 0           ; A in all four lanes

  addps     xmm4, xmm0
  addps     xmm5, xmm0
  addps     xmm6, xmm0
  addps     xmm7, xmm0

  movups    [Param1 + 0x00], xmm4
  movups    [Param1 + 0x10], xmm5
  movups    [Param1 + 0x20], xmm6
  movups    [Param1 + 0x30], xmm7
  ret
4061
  
4062
; _matrix4_add_matrix4
;   [Param1] := A + B, element by element, for 4x4 float matrices.
;   In:  [Param2] = A, [Param3] = B
;   All eight rows are loaded before anything is stored.
_matrix4_add_matrix4:
  movups    xmm4, [Param3 + 0x00]   ; rows of B
  movups    xmm5, [Param3 + 0x10]
  movups    xmm6, [Param3 + 0x20]
  movups    xmm7, [Param3 + 0x30]
  movups    xmm0, [Param2 + 0x00]   ; rows of A
  movups    xmm1, [Param2 + 0x10]
  movups    xmm2, [Param2 + 0x20]
  movups    xmm3, [Param2 + 0x30]

  addps     xmm0, xmm4
  addps     xmm1, xmm5
  addps     xmm2, xmm6
  addps     xmm3, xmm7

  movups    [Param1 + 0x00], xmm0
  movups    [Param1 + 0x10], xmm1
  movups    [Param1 + 0x20], xmm2
  movups    [Param1 + 0x30], xmm3
  ret
4080
  
4081
; _matrix4_sub_single
;   [Param1] := A - B for every element of a 4x4 float matrix.
;   In:  [Param2] = A, xmm0 (low single) = B
;   One 4-float row per register; all loads precede the stores.
_matrix4_sub_single:
  shufps    xmm0, xmm0, 0           ; B in all four lanes
  movups    xmm4, [Param2 + 0x00]
  movups    xmm5, [Param2 + 0x10]
  movups    xmm6, [Param2 + 0x20]
  movups    xmm7, [Param2 + 0x30]

  subps     xmm4, xmm0
  subps     xmm5, xmm0
  subps     xmm6, xmm0
  subps     xmm7, xmm0

  movups    [Param1 + 0x00], xmm4
  movups    [Param1 + 0x10], xmm5
  movups    [Param1 + 0x20], xmm6
  movups    [Param1 + 0x30], xmm7
  ret
4096
  
4097
; _single_sub_matrix4
;   [Param1] := A - B for every element of a 4x4 float matrix B.
;   In:  xmm0 (low single) = A, [Param2] = B
;   A is broadcast and copied once per row because it is the minuend.
_single_sub_matrix4:
  movups    xmm4, [Param2 + 0x00]   ; rows of B
  movups    xmm5, [Param2 + 0x10]
  movups    xmm6, [Param2 + 0x20]
  movups    xmm7, [Param2 + 0x30]

  shufps    xmm0, xmm0, 0           ; A in all four lanes
  movaps    xmm1, xmm0
  movaps    xmm2, xmm0
  movaps    xmm3, xmm0

  subps     xmm0, xmm4
  subps     xmm1, xmm5
  subps     xmm2, xmm6
  subps     xmm3, xmm7

  movups    [Param1 + 0x00], xmm0
  movups    [Param1 + 0x10], xmm1
  movups    [Param1 + 0x20], xmm2
  movups    [Param1 + 0x30], xmm3
  ret
4115
  
4116
; _matrix4_sub_matrix4
;   [Param1] := A - B, element by element, for 4x4 float matrices.
;   In:  [Param2] = A, [Param3] = B
;   All eight rows are loaded before anything is stored.
_matrix4_sub_matrix4:
  movups    xmm4, [Param3 + 0x00]   ; rows of B
  movups    xmm5, [Param3 + 0x10]
  movups    xmm6, [Param3 + 0x20]
  movups    xmm7, [Param3 + 0x30]
  movups    xmm0, [Param2 + 0x00]   ; rows of A
  movups    xmm1, [Param2 + 0x10]
  movups    xmm2, [Param2 + 0x20]
  movups    xmm3, [Param2 + 0x30]

  subps     xmm0, xmm4
  subps     xmm1, xmm5
  subps     xmm2, xmm6
  subps     xmm3, xmm7

  movups    [Param1 + 0x00], xmm0
  movups    [Param1 + 0x10], xmm1
  movups    [Param1 + 0x20], xmm2
  movups    [Param1 + 0x30], xmm3
  ret
4134
  
4135
; _matrix4_mul_single
;   [Param1] := A * B for every element of a 4x4 float matrix.
;   In:  [Param2] = A, xmm0 (low single) = B
;   One 4-float row per register; all loads precede the stores.
_matrix4_mul_single:
  shufps    xmm0, xmm0, 0           ; B in all four lanes
  movups    xmm4, [Param2 + 0x00]
  movups    xmm5, [Param2 + 0x10]
  movups    xmm6, [Param2 + 0x20]
  movups    xmm7, [Param2 + 0x30]

  mulps     xmm4, xmm0
  mulps     xmm5, xmm0
  mulps     xmm6, xmm0
  mulps     xmm7, xmm0

  movups    [Param1 + 0x00], xmm4
  movups    [Param1 + 0x10], xmm5
  movups    [Param1 + 0x20], xmm6
  movups    [Param1 + 0x30], xmm7
  ret
4150
  
4151
; _single_mul_matrix4
;   [Param1] := B * A for every element of a 4x4 float matrix.
;   In:  xmm0 (low single) = A, [Param2] = B
;   One 4-float row per register; all loads precede the stores.
_single_mul_matrix4:
  movups    xmm4, [Param2 + 0x00]
  movups    xmm5, [Param2 + 0x10]
  movups    xmm6, [Param2 + 0x20]
  movups    xmm7, [Param2 + 0x30]
  shufps    xmm0, xmm0, 0           ; A in all four lanes

  mulps     xmm4, xmm0
  mulps     xmm5, xmm0
  mulps     xmm6, xmm0
  mulps     xmm7, xmm0

  movups    [Param1 + 0x00], xmm4
  movups    [Param1 + 0x10], xmm5
  movups    [Param1 + 0x20], xmm6
  movups    [Param1 + 0x30], xmm7
  ret
4166
  
4167
; _matrix4_comp_mult
;   Component-wise product of two 4x4 float matrices (not a matrix product).
;   In:  [Param2] = Self, [Param3] = AOther
;   Out: [Param1] = Self[i] * AOther[i] for each row i
_matrix4_comp_mult:
  movups    xmm4, [Param3 + 0x00]   ; AOther[0..3]
  movups    xmm5, [Param3 + 0x10]
  movups    xmm6, [Param3 + 0x20]
  movups    xmm7, [Param3 + 0x30]
  movups    xmm0, [Param2 + 0x00]   ; Self[0..3]
  movups    xmm1, [Param2 + 0x10]
  movups    xmm2, [Param2 + 0x20]
  movups    xmm3, [Param2 + 0x30]

  mulps     xmm0, xmm4
  mulps     xmm1, xmm5
  mulps     xmm2, xmm6
  mulps     xmm3, xmm7

  movups    [Param1 + 0x00], xmm0
  movups    [Param1 + 0x10], xmm1
  movups    [Param1 + 0x20], xmm2
  movups    [Param1 + 0x30], xmm3
  ret
4189
  
4190
; M4_MUL_V4 <matrix ptr>, <vector ptr>
;   Emits a complete routine (including the final ret) that returns, for each
;   of the four rows of the 4x4 matrix at [%1], the dot product of that row
;   with the 4-float vector at [%2].
;   Out: xmm0 = (d0, d1, d2, d3) low lane first, xmm1 low half = (d2, d3)
;   Clobbers xmm0-xmm7.
%macro M4_MUL_V4 2
  movups    xmm4, [%1 + 0x00]       ; matrix rows 0..3
  movups    xmm5, [%1 + 0x10]
  movups    xmm6, [%1 + 0x20]
  movups    xmm7, [%1 + 0x30]
  movups    xmm0, [%2]              ; vector, one copy per row
  movaps    xmm1, xmm0
  movaps    xmm2, xmm0
  movaps    xmm3, xmm0

  ; P0..P3 = per-lane products of the vector with rows 0..3
  mulps     xmm0, xmm4
  mulps     xmm1, xmm5
  mulps     xmm2, xmm6
  mulps     xmm3, xmm7

  ; Transpose P0..P3; lane listings are low lane first
  movaps    xmm4, xmm2
  unpcklps  xmm2, xmm3              ; P2.x, P3.x, P2.y, P3.y
  unpckhps  xmm4, xmm3              ; P2.z, P3.z, P2.w, P3.w

  movaps    xmm3, xmm0
  unpcklps  xmm0, xmm1              ; P0.x, P1.x, P0.y, P1.y
  unpckhps  xmm3, xmm1              ; P0.z, P1.z, P0.w, P1.w

  movaps    xmm1, xmm0
  unpcklpd  xmm0, xmm2              ; P0.x, P1.x, P2.x, P3.x
  unpckhpd  xmm1, xmm2              ; P0.y, P1.y, P2.y, P3.y

  movaps    xmm2, xmm3
  unpcklpd  xmm2, xmm4              ; P0.z, P1.z, P2.z, P3.z
  unpckhpd  xmm3, xmm4              ; P0.w, P1.w, P2.w, P3.w

  ; Column sums = the four dot products
  addps     xmm0, xmm1
  addps     xmm2, xmm3
  addps     xmm0, xmm2
  movhlps   xmm1, xmm0              ; upper two floats go back in xmm1
  ret
%endmacro
4227

4228
; V4_MUL_M4 <vector ptr>, <matrix ptr>
;   Emits a complete routine (including the final ret) that returns
;   V.x * Row0 + V.y * Row1 + V.z * Row2 + V.w * Row3, where V is the 4-float
;   vector at [%1] and Row0..Row3 are the rows of the 4x4 matrix at [%2].
;   Out: xmm0 = result (all four lanes), xmm1 low half = upper two lanes
;   Clobbers xmm0-xmm7.
%macro V4_MUL_M4 2
  movups    xmm4, [%2 + 0x00]       ; matrix rows 0..3
  movups    xmm5, [%2 + 0x10]
  movups    xmm6, [%2 + 0x20]
  movups    xmm7, [%2 + 0x30]

  movups    xmm0, [%1]              ; vector
  movaps    xmm1, xmm0
  movaps    xmm2, xmm0
  movaps    xmm3, xmm0
  shufps    xmm0, xmm0, 0x00        ; 4x V.x
  shufps    xmm1, xmm1, 0x55        ; 4x V.y
  shufps    xmm2, xmm2, 0xAA        ; 4x V.z
  shufps    xmm3, xmm3, 0xFF        ; 4x V.w

  mulps     xmm0, xmm4              ; V.x * Row0
  mulps     xmm1, xmm5              ; V.y * Row1
  mulps     xmm2, xmm6              ; V.z * Row2
  mulps     xmm3, xmm7              ; V.w * Row3

  addps     xmm0, xmm1
  addps     xmm2, xmm3
  addps     xmm0, xmm2
  movhlps   xmm1, xmm0              ; upper two floats go back in xmm1
  ret
%endmacro
4251
  
4252
; M4_MUL_M4_ROW <A ptr>, <row byte offset>
;   Helper for M4_MUL_M4. Expects the four rows of B in xmm4..xmm7.
;   Reads the row of A at [%1 + %2], computes
;   a.x * B.R[0] + a.y * B.R[1] + a.z * B.R[2] + a.w * B.R[3] and stores it at
;   [Param1 + %2]. Clobbers xmm0-xmm3.
%macro M4_MUL_M4_ROW 2
  movups    xmm0, [%1 + %2]
  movaps    xmm1, xmm0
  movaps    xmm2, xmm0
  movaps    xmm3, xmm0
  shufps    xmm0, xmm0, 0x00        ; 4x a.x
  shufps    xmm1, xmm1, 0x55        ; 4x a.y
  shufps    xmm2, xmm2, 0xAA        ; 4x a.z
  shufps    xmm3, xmm3, 0xFF        ; 4x a.w

  mulps     xmm0, xmm4
  mulps     xmm1, xmm5
  mulps     xmm2, xmm6
  mulps     xmm3, xmm7

  addps     xmm0, xmm1
  addps     xmm2, xmm3
  addps     xmm0, xmm2
  movups    [Param1 + %2], xmm0
%endmacro

; M4_MUL_M4 <A ptr>, <B ptr>
;   Emits a complete routine (including the final ret) that writes, for each
;   row i of the 4x4 matrix at [%1], the combination
;   A[i].x * B.R[0] + A[i].y * B.R[1] + A[i].z * B.R[2] + A[i].w * B.R[3]
;   to row i of [Param1]. All of B is loaded before the first store, and
;   row i of A is read before row i of the result is written.
%macro M4_MUL_M4 2
  movups    xmm4, [%2 + 0x00]       ; rows of B
  movups    xmm5, [%2 + 0x10]
  movups    xmm6, [%2 + 0x20]
  movups    xmm7, [%2 + 0x30]

  M4_MUL_M4_ROW %1, 0x00            ; A.R[0] * B
  M4_MUL_M4_ROW %1, 0x10            ; A.R[1] * B
  M4_MUL_M4_ROW %1, 0x20            ; A.R[2] * B
  M4_MUL_M4_ROW %1, 0x30            ; A.R[3] * B
  ret
%endmacro
4330

4331
; Matrix4 multiplication entry points.
; Each label is emitted once; FM_COLUMN_MAJOR selects which macro expands
; under it and in which order the parameter pointers are passed. Every macro
; ends in its own ret.
_matrix4_mul_vector4:
%ifdef FM_COLUMN_MAJOR
  V4_MUL_M4 Param2, Param1
%else
  M4_MUL_V4 Param1, Param2
%endif

_vector4_mul_matrix4:
%ifdef FM_COLUMN_MAJOR
  M4_MUL_V4 Param2, Param1
%else
  V4_MUL_M4 Param1, Param2
%endif

_matrix4_mul_matrix4:
%ifdef FM_COLUMN_MAJOR
  M4_MUL_M4 Param3, Param2
%else
  M4_MUL_M4 Param2, Param3
%endif
4350

4351
; _matrix4_div_single
;   [Param1] := A / B for every element of a 4x4 float matrix.
;   In:  [Param2] = A, xmm0 (low single) = B
;   NOTE: multiplying by 1/B obtained from "rcpps" would be faster, but that
;   instruction only approximates the reciprocal, so real divisions are used
;   to keep full accuracy.
_matrix4_div_single:
  shufps    xmm0, xmm0, 0           ; B in all four lanes
  movups    xmm4, [Param2 + 0x00]
  movups    xmm5, [Param2 + 0x10]
  movups    xmm6, [Param2 + 0x20]
  movups    xmm7, [Param2 + 0x30]

  divps     xmm4, xmm0
  divps     xmm5, xmm0
  divps     xmm6, xmm0
  divps     xmm7, xmm0

  movups    [Param1 + 0x00], xmm4
  movups    [Param1 + 0x10], xmm5
  movups    [Param1 + 0x20], xmm6
  movups    [Param1 + 0x30], xmm7
  ret
4367
  
4368
; _single_div_matrix4
;   [Param1] := A / B for every element of a 4x4 float matrix B.
;   In:  xmm0 (low single) = A, [Param2] = B
;   A is broadcast and copied once per row because it is the dividend.
_single_div_matrix4:
  movups    xmm4, [Param2 + 0x00]   ; rows of B
  movups    xmm5, [Param2 + 0x10]
  movups    xmm6, [Param2 + 0x20]
  movups    xmm7, [Param2 + 0x30]

  shufps    xmm0, xmm0, 0           ; A in all four lanes
  movaps    xmm1, xmm0
  movaps    xmm2, xmm0
  movaps    xmm3, xmm0

  divps     xmm0, xmm4
  divps     xmm1, xmm5
  divps     xmm2, xmm6
  divps     xmm3, xmm7

  movups    [Param1 + 0x00], xmm0
  movups    [Param1 + 0x10], xmm1
  movups    [Param1 + 0x20], xmm2
  movups    [Param1 + 0x30], xmm3
  ret
4386
  
4387
; _matrix4_negative
;   [Param1] := -A for every element of a 4x4 float matrix, by flipping each
;   element's sign bit.
;   In:  [Param2] = A
_matrix4_negative:
  movups    xmm0, [Param2 + 0x00]       ; rows of A
  movups    xmm1, [Param2 + 0x10]
  movups    xmm2, [Param2 + 0x20]
  movups    xmm3, [Param2 + 0x30]
  movaps    xmm4, [rel kSSE_MASK_SIGN]  ; 4x 0x80000000

  xorps     xmm0, xmm4                  ; toggle the sign bit of every lane
  xorps     xmm1, xmm4
  xorps     xmm2, xmm4
  xorps     xmm3, xmm4

  movups    [Param1 + 0x00], xmm0
  movups    [Param1 + 0x10], xmm1
  movups    [Param1 + 0x20], xmm2
  movups    [Param1 + 0x30], xmm3
  ret
4402
  
4403
%macro M4_INVERSE 2  
4404
  movups    xmm1, [%2 + 0x10]      ; M[1]
4405
  movups    xmm2, [%2 + 0x20]      ; M[2]
4406
  movups    xmm3, [%2 + 0x30]      ; M[3]
4407

4408
  ;  C00 := (A.M[2,2] * A.M[3,3]) - (A.M[3,2] * A.M[2,3]);
4409
  ;  C02 := (A.M[1,2] * A.M[3,3]) - (A.M[3,2] * A.M[1,3]);
4410
  ;  C03 := (A.M[1,2] * A.M[2,3]) - (A.M[2,2] * A.M[1,3]);
4411
  ;  F0 := Vector4(C00, C00, C02, C03);
4412
  movaps    xmm5, xmm2             ; M[2]
4413
  movaps    xmm7, xmm2             ; M[2]
4414
  movaps    xmm0, xmm3             ; M[3]
4415
  movaps    xmm6, xmm3             ; M[3]
4416
  shufps    xmm6, xmm2, 0xAA       ; M22 M22 M32 M32
4417
  shufps    xmm0, xmm2, 0xFF       ; M23 M23 M33 M33
4418
  shufps    xmm7, xmm1, 0xFF       ; M13 M13 M23 M23
4419
  pshufd    xmm4, xmm0, 0x80       ; M23 M33 M33 M33
4420
  shufps    xmm5, xmm1, 0xAA       ; M12 M12 M22 M22
4421
  pshufd    xmm0, xmm6, 0x80       ; M22 M32 M32 M32
4422
  mulps     xmm5, xmm4             ; (M12 * M23) (M12 * M33) (M22 * M33) (M22 * M33)
4423
  mulps     xmm7, xmm0             ; (M22 * M13) (M32 * M13) (M32 * M23) (M32 * M23)
4424
  subps     xmm5, xmm7             ; C03=(M12*M23)-(M22*M13), C02=(M12*M33)-(M32*M13), C00=(M22*M33)-(M32*M23), C00=(M22*M33)-(M32*M23)
4425
  movups    xmm8, xmm5
4426

4427
  ;  C04 := (A.M[2,1] * A.M[3,3]) - (A.M[3,1] * A.M[2,3]);
4428
  ;  C06 := (A.M[1,1] * A.M[3,3]) - (A.M[3,1] * A.M[1,3]);
4429
  ;  C07 := (A.M[1,1] * A.M[2,3]) - (A.M[2,1] * A.M[1,3]);
4430
  ;  F1 := Vector4(C04, C04, C06, C07);
4431
  movaps    xmm5, xmm2             ; M[2]
4432
  movaps    xmm7, xmm2             ; M[2]
4433
  movaps    xmm0, xmm3             ; M[3]
4434
  movaps    xmm6, xmm3             ; M[3]
4435
  shufps    xmm6, xmm2, 0x55       ; M21 M21 M31 M31
4436
  shufps    xmm0, xmm2, 0xFF       ; M23 M23 M33 M33
4437
  shufps    xmm7, xmm1, 0xFF       ; M13 M13 M23 M23
4438
  pshufd    xmm4, xmm0, 0x80       ; M23 M33 M33 M33
4439
  shufps    xmm5, xmm1, 0x55       ; M11 M11 M21 M21
4440
  pshufd    xmm0, xmm6, 0x80       ; M21 M31 M31 M31
4441
  mulps     xmm5, xmm4             ; (M11 * M23) (M11 * M33) (M21 * M33) (M21 * M33)
4442
  mulps     xmm7, xmm0             ; (M21 * M13) (M31 * M13) (M31 * M23) (M31 * M23)
4443
  subps     xmm5, xmm7             ; C07=(M11*M23)-(M21*M13), C06=(M11*M33)-(M31*M13), C04=(M21*M33)-(M31*M23), C04=(M21*M33)-(M31*M23)
4444
  movups    xmm9, xmm5
4445

4446
  ;  C08 := (A.M[2,1] * A.M[3,2]) - (A.M[3,1] * A.M[2,2]);
4447
  ;  C10 := (A.M[1,1] * A.M[3,2]) - (A.M[3,1] * A.M[1,2]);
4448
  ;  C11 := (A.M[1,1] * A.M[2,2]) - (A.M[2,1] * A.M[1,2]);
4449
  ;  F2 := Vector4(C08, C08, C10, C11);
4450
  movaps    xmm5, xmm2             ; M[2]
4451
  movaps    xmm7, xmm2             ; M[2]
4452
  movaps    xmm0, xmm3             ; M[3]
4453
  movaps    xmm6, xmm3             ; M[3]
4454
  shufps    xmm6, xmm2, 0x55       ; M21 M21 M31 M31
4455
  shufps    xmm0, xmm2, 0xAA       ; M22 M22 M32 M32
4456
  shufps    xmm7, xmm1, 0xAA       ; M12 M12 M22 M22
4457
  pshufd    xmm4, xmm0, 0x80       ; M22 M32 M32 M32
4458
  shufps    xmm5, xmm1, 0x55       ; M11 M11 M21 M21
4459
  pshufd    xmm0, xmm6, 0x80       ; M21 M31 M31 M31
4460
  mulps     xmm5, xmm4             ; (M11 * M22) (M11 * M32) (M21 * M32) (M21 * M32)
4461
  mulps     xmm7, xmm0             ; (M21 * M12) (M31 * M12) (M31 * M22) (M32 * M22)
4462
  subps     xmm5, xmm7             ; C11=(M11*M22)-(M21*M12), C10=(M11*M32)-(M31*M12), C08=(M21*M32)-(M31*M22), C08=(M21*M32)-(M31*M22)
4463
  movups    xmm10, xmm5
4464

4465
  ;  C12 := (A.M[2,0] * A.M[3,3]) - (A.M[3,0] * A.M[2,3]);
4466
  ;  C14 := (A.M[1,0] * A.M[3,3]) - (A.M[3,0] * A.M[1,3]);
4467
  ;  C15 := (A.M[1,0] * A.M[2,3]) - (A.M[2,0] * A.M[1,3]);
4468
  ;  F3 := Vector4(C12, C12, C14, C15);
4469
  movaps    xmm5, xmm2             ; M[2]
4470
  movaps    xmm7, xmm2             ; M[2]
4471
  movaps    xmm0, xmm3             ; M[3]
4472
  movaps    xmm6, xmm3             ; M[3]
4473
  shufps    xmm6, xmm2, 0x00       ; M20 M20 M30 M30
4474
  shufps    xmm0, xmm2, 0xFF       ; M23 M23 M33 M33
4475
  shufps    xmm7, xmm1, 0xFF       ; M13 M13 M23 M23
4476
  pshufd    xmm4, xmm0, 0x80       ; M23 M33 M33 M33
4477
  shufps    xmm5, xmm1, 0x00       ; M10 M10 M20 M20
4478
  pshufd    xmm0, xmm6, 0x80       ; M20 M30 M30 M30
4479
  mulps     xmm5, xmm4             ; (M10 * M23) (M10 * M33) (M20 * M33) (M20 * M33)
4480
  mulps     xmm7, xmm0             ; (M20 * M13) (M30 * M13) (M30 * M23) (M30 * M23)
4481
  subps     xmm5, xmm7             ; C15=(M10*M23)-(M20*M13), C14=(M10*M33)-(M30*M13), C12=(M20*M33)-(M30*M23), C12=(M20*M33)-(M30*M23)
4482
  movups    xmm11, xmm5
4483

4484
  ;  C16 := (A.M[2,0] * A.M[3,2]) - (A.M[3,0] * A.M[2,2]);
4485
  ;  C18 := (A.M[1,0] * A.M[3,2]) - (A.M[3,0] * A.M[1,2]);
4486
  ;  C19 := (A.M[1,0] * A.M[2,2]) - (A.M[2,0] * A.M[1,2]);
4487
  ;  F4 := Vector4(C16, C16, C18, C19);
4488
  movaps    xmm5, xmm2             ; M[2]
4489
  movaps    xmm7, xmm2             ; M[2]
4490
  movaps    xmm0, xmm3             ; M[3]
4491
  movaps    xmm6, xmm3             ; M[3]
4492
  shufps    xmm6, xmm2, 0x00       ; M20 M20 M30 M30
4493
  shufps    xmm0, xmm2, 0xAA       ; M22 M22 M32 M32
4494
  shufps    xmm7, xmm1, 0xAA       ; M12 M12 M22 M22
4495
  pshufd    xmm4, xmm0, 0x80       ; M22 M32 M32 M32
4496
  shufps    xmm5, xmm1, 0x00       ; M10 M10 M20 M20
4497
  pshufd    xmm0, xmm6, 0x80       ; M20 M30 M30 M30
4498
  mulps     xmm5, xmm4             ; (M10 * M22) (M10 * M32) (M20 * M32) (M20 * M32)
4499
  mulps     xmm7, xmm0             ; (M20 * M12) (M30 * M12) (M30 * M22) (M30 * M22)
4500
  subps     xmm5, xmm7             ; C19=(M10*M22)-(M20*M12), C18=(M10*M32)-(M30*M12), C16=(M20*M32)-(M30*M22), C16=(M20*M32)-(M30*M22)
4501
  movups    xmm12, xmm5
4502

4503
  ;  C20 := (A.M[2,0] * A.M[3,1]) - (A.M[3,0] * A.M[2,1]);
4504
  ;  C22 := (A.M[1,0] * A.M[3,1]) - (A.M[3,0] * A.M[1,1]);
4505
  ;  C23 := (A.M[1,0] * A.M[2,1]) - (A.M[2,0] * A.M[1,1]);
4506
  ;  F5 := Vector4(C20, C20, C22, C23);
4507
  movaps    xmm5, xmm2             ; M[2]
4508
  movaps    xmm7, xmm2             ; M[2]
4509
  movaps    xmm0, xmm3             ; M[3]
4510
  movaps    xmm6, xmm3             ; M[3]
4511
  shufps    xmm6, xmm2, 0x00       ; M20 M20 M30 M30
4512
  shufps    xmm0, xmm2, 0x55       ; M21 M21 M31 M31
4513
  shufps    xmm7, xmm1, 0x55       ; M11 M11 M21 M21
4514
  pshufd    xmm4, xmm0, 0x80       ; M21 M31 M31 M31
4515
  shufps    xmm5, xmm1, 0x00       ; M10 M10 M20 M20
4516
  pshufd    xmm0, xmm6, 0x80       ; M20 M30 M30 M30
4517
  mulps     xmm5, xmm4             ; (M10 * M21) (M10 * M31) (M20 * M31) (M20 * M31)
4518
  mulps     xmm7, xmm0             ; (M20 * M11) (M30 * M11) (M30 * M21) (M30 * M21)
4519
  subps     xmm5, xmm7             ; C23=(M10*M21)-(M20*M11), C22=(M10*M31)-(M30*M11), C20=(M20*M31)-(M30*M21), C20=(M20*M31)-(M30*M21)
4520
  movups    xmm13, xmm5
4521

4522
  ;  V0 := Vector4(A.M[1,0], A.M[0,0], A.M[0,0], A.M[0,0]);
4523
  ;  V1 := Vector4(A.M[1,1], A.M[0,1], A.M[0,1], A.M[0,1]);
4524
  ;  V2 := Vector4(A.M[1,2], A.M[0,2], A.M[0,2], A.M[0,2]);
4525
  ;  V3 := Vector4(A.M[1,3], A.M[0,3], A.M[0,3], A.M[0,3]);
4526
  movups    xmm0, [%2 + 0x00]      ; M[0]
4527
  movaps    xmm4, xmm1             ; M[1]
4528
  movaps    xmm5, xmm1             ; M[1]
4529
  movaps    xmm6, xmm1             ; M[1]
4530
  movaps    xmm7, xmm1             ; M[1]
4531

4532
  shufps    xmm4, xmm0, 0x00       ; M00 M00 M10 M10
4533
  shufps    xmm5, xmm0, 0x55       ; M01 M01 M11 M11
4534
  shufps    xmm6, xmm0, 0xAA       ; M02 M02 M12 M12
4535
  shufps    xmm7, xmm0, 0xFF       ; M03 M03 M13 M13
4536

4537
  pshufd    xmm4, xmm4, 0xA8       ; V0=M00 M00 M00 M10
4538
  pshufd    xmm5, xmm5, 0xA8       ; V1=M01 M01 M01 M11
4539
  pshufd    xmm6, xmm6, 0xA8       ; V2=M02 M02 M02 M12
4540
  pshufd    xmm7, xmm7, 0xA8       ; V3=M03 M03 M03 M13
4541

4542
  ;  I0 := (V1 * F0) - (V2 * F1) + (V3 * F2);
4543
  ;  I1 := (V0 * F0) - (V2 * F3) + (V3 * F4);
4544
  ;  I2 := (V0 * F1) - (V1 * F3) + (V3 * F5);
4545
  ;  I3 := (V0 * F2) - (V1 * F4) + (V2 * F5);
4546
  movaps    xmm0, xmm5             ; V1
4547
  movaps    xmm1, xmm6             ; V2
4548
  movaps    xmm2, xmm7             ; V3
4549
  mulps     xmm0, xmm8             ; V1 * F0
4550
  mulps     xmm1, xmm9             ; V2 * F1
4551
  mulps     xmm2, xmm10            ; V3 * F2
4552
  subps     xmm0, xmm1             ; (V1 * F0) - (V2 * F1)
4553
  movaps    xmm1, xmm4             ; V0
4554
  addps     xmm0, xmm2             ; I0=(V1 * F0) - (V2 * F1) + (V3 * F2)
4555

4556
  movaps    xmm2, xmm6             ; V2
4557
  movaps    xmm3, xmm7             ; V3
4558
  mulps     xmm1, xmm8             ; V0 * F0
4559
  mulps     xmm2, xmm11            ; V2 * F3
4560
  mulps     xmm3, xmm12            ; V3 * F4
4561
  subps     xmm1, xmm2             ; (V0 * F0) - (V2 * F3)
4562
  movaps    xmm2, xmm4             ; V0
4563
  addps     xmm1, xmm3             ; I1=(V0 * F0) - (V2 * F3) + (V3 * F4)
4564

4565
  movaps    xmm3, xmm5             ; V1
4566
  mulps     xmm2, xmm9             ; V0 * F1
4567
  mulps     xmm3, xmm11            ; V1 * F3
4568
  mulps     xmm7, xmm13            ; V3 * F5
4569
  subps     xmm2, xmm3             ; (V0 * F1) - (V1 * F3)
4570
  mulps     xmm4, xmm10            ; V0 * F2
4571
  addps     xmm2, xmm7             ; I2=(V0 * F1) - (V1 * F3) + (V3 * F5)
4572

4573
  mulps     xmm5, xmm12            ; V1 * F4
4574
  mulps     xmm6, xmm13            ; V2 * F5
4575
  subps     xmm4, xmm5             ; (V0 * F2) - (V1 * F4)
4576
  addps     xmm4, xmm6             ; I3=(V0 * F2) - (V1 * F4) + (V2 * F5)
4577

4578
  ;  SA := Vector4(+1, -1, +1, -1);
4579
  ;  SB := Vector4(-1, +1, -1, +1);
4580
  ;  Inv := Matrix4(I0 * SA, I1 * SB, I2 * SA, I3 * SB);
4581

4582
  movaps    xmm6, [rel kSSE_MASK_PNPN] ; SA
4583
  movaps    xmm7, [rel kSSE_MASK_NPNP] ; SB
4584
  xorps     xmm0, xmm6             ; Inv[0] = I0 * SA
4585
  xorps     xmm1, xmm7             ; Inv[1] = I1 * SB
4586
  xorps     xmm2, xmm6             ; Inv[2] = I2 * SA
4587
  xorps     xmm4, xmm7             ; Inv[3] = I3 * SB
4588

4589
  ;  Row := Vector4(Inv[0,0], Inv[1,0], Inv[2,0], Inv[3,0]);
4590
  movaps    xmm3, xmm0
4591
  movaps    xmm5, xmm2
4592
  movaps    xmm6, xmm1
4593

4594
  unpcklps  xmm3, xmm1             ; Inv[1,1] Inv[0,1] Inv[1,0] Inv[0,0]
4595
  unpcklps  xmm5, xmm4             ; Inv[3,1] Inv[2,1] Inv[3,0] Inv[2,0]
4596
  movups    xmm6, [%2 + 0x00]      ; A.C[0]
4597
  movlhps   xmm3, xmm5             ; Inv[3,0] Inv[2,0] Inv[1,0] Inv[0,0]
4598

4599
  ;  Dot := A.C[0] * Row;
4600
  mulps     xmm3, xmm6             ; Dot.W  Dot.Z  Dot.Y  Dot.X
4601

4602
  ;  OneOverDeterminant := 1 / ((Dot.X + Dot.Y) + (Dot.Z + Dot.W));
4603
  pshufd    xmm6, xmm3, 0x4E       ; Dot.Y  Dot.X  Dot.W  Dot.Z
4604
  addps     xmm3, xmm6             ; W+Y Z+X Y+W X+Z
4605
  pshufd    xmm6, xmm3, 0x11       ; X+Z Y+X X+Z Y+W
4606
  movaps    xmm5, [rel kSSE_ONE]   ; 1.0 (4x)
4607
  addps     xmm3, xmm6             ; X+Y+Z+W (4x)
4608
  divps     xmm5, xmm3             ; OneOverDeterminant (4x)
4609

4610
  ;  Result := Inv * OneOverDeterminant;
4611
  mulps     xmm0, xmm5
4612
  mulps     xmm1, xmm5
4613
  mulps     xmm2, xmm5
4614
  mulps     xmm4, xmm5
4615

4616
  movups    [%1 + 0x00], xmm0
4617
  movups    [%1 + 0x10], xmm1
4618
  movups    [%1 + 0x20], xmm2
4619
  movups    [%1 + 0x30], xmm4
4620
  ret
4621
%endmacro  
4622

4623
; _matrix4_inverse
;   Entry point that expands M4_INVERSE with %1 = Param1 and %2 = Param2.
;   In the visible part of the macro, %2 is used as the base address for the
;   matrix loads ([%2 + 0x00]) and %1 as the base address for the four 16-byte
;   result stores ([%1 + 0x00] .. [%1 + 0x30]).
;   The macro body ends with its own "ret", so execution does not fall through
;   into the next label.
;   Param1/Param2 are defined elsewhere in the file (presumably the first two
;   integer argument registers of the ABI described at the top -- confirm).
;   NOTE(review): the bare number between the label and the macro call looks
;   like a line-number artifact from extraction, not an instruction -- confirm
;   against the upstream file.
_matrix4_inverse:
4624
  M4_INVERSE Param1, Param2
4625

4626
; _matrix4_set_inversed
;   In-place variant: expands M4_INVERSE with the same operand (Param1) as both
;   the load base (%2) and the store base (%1), so the result overwrites the
;   input matrix.
;   In the visible part of the macro every store to [%1 + ...] comes after the
;   last load from [%2 + ...]; the earlier part of the macro is defined above
;   this section and should be checked the same way.
;   The macro body ends with its own "ret".
;   Param1 is defined elsewhere in the file (presumably the first integer
;   argument register -- confirm).
;   NOTE(review): the bare number between the label and the macro call looks
;   like a line-number artifact from extraction, not an instruction -- confirm
;   against the upstream file.
_matrix4_set_inversed:
4627
  M4_INVERSE Param1, Param1
4628
  
4629
;-----------------------------------------------------------------------
; M4_TRANSPOSE dst, src
;   Transposes a 4x4 matrix of packed singles (4 rows of 16 bytes each).
;   %1 = register holding the address the transposed rows are written to
;   %2 = register holding the address the source rows are read from
;   %1 and %2 may be the same register: all four rows are loaded before
;   anything is stored. Unaligned addresses are fine (movups is used).
;   Clobbers xmm0-xmm4. Ends with "ret", so the macro forms a complete
;   function body.
;   Lane comments list elements from the highest lane down to the lowest,
;   Axy = source row x, column y.
;-----------------------------------------------------------------------
%macro M4_TRANSPOSE 2
  ; Fetch the whole source matrix first (required for the in-place case)
  movups    xmm0, [%2 + 0x00]        ; row 0: A03 A02 A01 A00
  movups    xmm1, [%2 + 0x10]        ; row 1: A13 A12 A11 A10
  movups    xmm2, [%2 + 0x20]        ; row 2: A23 A22 A21 A20
  movups    xmm3, [%2 + 0x30]        ; row 3: A33 A32 A31 A30

  ; Interleave rows 0 and 1 element by element
  movaps    xmm4, xmm0               ; keep a copy of row 0 for the high half
  unpcklps  xmm0, xmm1               ; A11 A01 A10 A00
  unpckhps  xmm4, xmm1               ; A13 A03 A12 A02

  ; Interleave rows 2 and 3 (xmm1 is free again and serves as scratch)
  movaps    xmm1, xmm2               ; keep a copy of row 2 for the high half
  unpcklps  xmm2, xmm3               ; A31 A21 A30 A20
  unpckhps  xmm1, xmm3               ; A33 A23 A32 A22

  ; Glue the matching 64-bit halves together: columns 0 and 1
  movaps    xmm3, xmm0
  unpcklpd  xmm0, xmm2               ; A30 A20 A10 A00
  unpckhpd  xmm3, xmm2               ; A31 A21 A11 A01

  ; ... and columns 2 and 3
  movaps    xmm2, xmm4
  unpcklpd  xmm2, xmm1               ; A32 A22 A12 A02
  unpckhpd  xmm4, xmm1               ; A33 A23 A13 A03

  ; Each source column becomes a destination row
  movups    [%1 + 0x00], xmm0
  movups    [%1 + 0x10], xmm3
  movups    [%1 + 0x20], xmm2
  movups    [%1 + 0x30], xmm4
  ret
%endmacro
4657

4658
; _matrix4_transpose
;   Entry point that expands M4_TRANSPOSE with %1 = Param1 and %2 = Param2:
;   the macro loads four 16-byte rows from [Param2], transposes them in
;   registers and stores the four result rows to [Param1].
;   The macro body ends with its own "ret", so execution does not fall through
;   into the next label.
;   Param1/Param2 are defined elsewhere in the file (presumably the first two
;   integer argument registers of the ABI described at the top -- confirm).
;   NOTE(review): the bare number between the label and the macro call looks
;   like a line-number artifact from extraction, not an instruction -- confirm
;   against the upstream file.
_matrix4_transpose:
4659
  M4_TRANSPOSE Param1, Param2
4660
  
4661
; _matrix4_set_transposed
;   In-place variant: expands M4_TRANSPOSE with Param1 as both the load base
;   (%2) and the store base (%1). This is safe because the macro loads all four
;   rows into registers before it stores any of them.
;   The macro body ends with its own "ret".
;   Param1 is defined elsewhere in the file (presumably the first integer
;   argument register -- confirm).
;   NOTE(review): the bare number between the label and the macro call looks
;   like a line-number artifact from extraction, not an instruction -- confirm
;   against the upstream file.
_matrix4_set_transposed:
4662
  M4_TRANSPOSE Param1, Param1

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.