MathgeomGLS

Форк
0
/
Neslib.FastMath.Sse2_32.inc 
5720 строк · 169.9 Кб
1
const
2
  { SSE rounding modes (bits in MXCSR register) }
3
  SSE_ROUND_MASK    = $FFFF9FFF;
4
  SSE_ROUND_NEAREST = $00000000;
5
  SSE_ROUND_DOWN    = $00002000;
6
  SSE_ROUND_UP      = $00004000;
7
  SSE_ROUND_TRUNC   = $00006000;
8

9
  { These constants fit in a single XMM register. These values represent
10
    sign-bits as used by 32-bit floating-point values.
11
    XOR'ing a floating-point value with $80000000 swaps the sign.
12
    XOR'ing a floating-point value with $00000000 leaves the value unchanged. }
13
  SSE_MASK_SIGN: array [0..3] of UInt32 = ($80000000, $80000000, $80000000, $80000000);
14
  SSE_MASK_NPNP: array [0..3] of UInt32 = ($80000000, $00000000, $80000000, $00000000);
15
  SSE_MASK_PNPN: array [0..3] of UInt32 = ($00000000, $80000000, $00000000, $80000000);
16
  SSE_MASK_0FFF: array [0..3] of UInt32 = ($FFFFFFFF, $FFFFFFFF, $FFFFFFFF, $00000000);
17

18
  { These constants mask off an element of the binary representation of a
19
    32-bit floating-point value. }
20
  SSE_MASK_FRACTION: array [0..3] of UInt32 = ($007FFFFF, $007FFFFF, $007FFFFF, $007FFFFF);
21
  SSE_MASK_EXPONENT: array [0..3] of UInt32 = ($7F800000, $7F800000, $7F800000, $7F800000);
22
  SSE_MASK_ABS_VAL : array [0..3] of UInt32 = ($7FFFFFFF, $7FFFFFFF, $7FFFFFFF, $7FFFFFFF);
23

24
  { Commonly used floating-point values }
25
  SSE_ONE_HALF    : array [0..3] of Single = (0.5, 0.5, 0.5, 0.5);
26
  SSE_ONE         : array [0..3] of Single = (1, 1, 1, 1);
27
  SSE_TWO         : array [0..3] of Single = (2, 2, 2, 2);
28
  SSE_THREE       : array [0..3] of Single = (3, 3, 3, 3);
29
  SSE_PI_OVER_180 : array [0..3] of Single = (Pi / 180, Pi / 180, Pi / 180, Pi / 180);
30
  SSE_180_OVER_PI : array [0..3] of Single = (180 / Pi, 180 / Pi, 180 / Pi, 180 / Pi);
31
  SSE_NEG_INFINITY: array [0..3] of Single = (NegInfinity, NegInfinity, NegInfinity, NegInfinity);
32
  SSE_PI_OVER_4   : array [0..3] of Single = (Pi / 4, Pi / 4, Pi / 4, Pi / 4);
33

34
  { Commonly used integer values }
35
  SSE_INT_ONE     : array [0..3] of Integer = (1, 1, 1, 1);
36
  SSE_INT_NOT_ONE : array [0..3] of Cardinal = ($FFFFFFFE, $FFFFFFFE, $FFFFFFFE, $FFFFFFFE);
37
  SSE_INT_TWO     : array [0..3] of Integer = (2, 2, 2, 2);
38
  SSE_INT_FOUR    : array [0..3] of Integer = (4, 4, 4, 4);
39

40
  { Constants for approximating trigonometric functions }
41
  SSE_FOPI: array [0..3] of Single = (1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516);
42
  SSE_SINCOF_P0: array [0..3] of Single = (-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4);
43
  SSE_SINCOF_P1: array [0..3] of Single = (8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3);
44
  SSE_SINCOF_P2: array [0..3] of Single = (-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1);
45
  SSE_COSCOF_P0: array [0..3] of Single = (2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005);
46
  SSE_COSCOF_P1: array [0..3] of Single = (-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003);
47
  SSE_COSCOF_P2: array [0..3] of Single = (4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002);
48

49
  SSE_EXP_A1 : array [0..3] of Single = (12102203.1615614, 12102203.1615614, 12102203.1615614, 12102203.1615614);
50
  SSE_EXP_A2 : array [0..3] of Single = (1065353216, 1065353216, 1065353216, 1065353216);
51
  SSE_EXP_CST: array [0..3] of Single = (2139095040, 2139095040, 2139095040, 2139095040);
52
  SSE_EXP_F1 : array [0..3] of Single = (0.509964287281036376953125, 0.509964287281036376953125, 0.509964287281036376953125, 0.509964287281036376953125);
53
  SSE_EXP_F2 : array [0..3] of Single = (0.3120158612728118896484375, 0.3120158612728118896484375, 0.3120158612728118896484375, 0.3120158612728118896484375);
54
  SSE_EXP_F3 : array [0..3] of Single = (0.1666135489940643310546875, 0.1666135489940643310546875, 0.1666135489940643310546875, 0.1666135489940643310546875);
55
  SSE_EXP_F4 : array [0..3] of Single = (-2.12528370320796966552734375e-3, -2.12528370320796966552734375e-3, -2.12528370320796966552734375e-3, -2.12528370320796966552734375e-3);
56
  SSE_EXP_F5 : array [0..3] of Single = (1.3534179888665676116943359375e-2, 1.3534179888665676116943359375e-2, 1.3534179888665676116943359375e-2, 1.3534179888665676116943359375e-2);
57
  SSE_EXP_I1 : array [0..3] of UInt32 = ($3F800000, $3F800000, $3F800000, $3F800000);
58

59
  SSE_LN_CST: array [0..3] of Single = (-89.93423858, -89.93423858, -89.93423858, -89.93423858);
60
  SSE_LN_F1 : array [0..3] of Single = (3.3977745, 3.3977745, 3.3977745, 3.3977745);
61
  SSE_LN_F2 : array [0..3] of Single = (2.2744832, 2.2744832, 2.2744832, 2.2744832);
62
  SSE_LN_F3 : array [0..3] of Single = (0.024982445, 0.024982445, 0.024982445, 0.024982445);
63
  SSE_LN_F4 : array [0..3] of Single = (0.24371102, 0.24371102, 0.24371102, 0.24371102);
64
  SSE_LN_F5 : array [0..3] of Single = (0.69314718055995, 0.69314718055995, 0.69314718055995, 0.69314718055995);
65

66
  SSE_LOG2_I1: array [0..3] of UInt32 = ($3F000000, $3F000000, $3F000000, $3F000000);
67
  SSE_LOG2_F1: array [0..3] of Single = (1.1920928955078125e-7, 1.1920928955078125e-7, 1.1920928955078125e-7, 1.1920928955078125e-7);
68
  SSE_LOG2_F2: array [0..3] of Single = (124.22551499, 124.22551499, 124.22551499, 124.22551499);
69
  SSE_LOG2_F3: array [0..3] of Single = (1.498030302, 1.498030302, 1.498030302, 1.498030302);
70
  SSE_LOG2_F4: array [0..3] of Single = (1.72587999, 1.72587999, 1.72587999, 1.72587999);
71
  SSE_LOG2_F5: array [0..3] of Single = (0.3520887068, 0.3520887068, 0.3520887068, 0.3520887068);
72

73
  SSE_EXP2_F1: array [0..3] of Single = (121.2740575, 121.2740575, 121.2740575, 121.2740575);
74
  SSE_EXP2_F2: array [0..3] of Single = (27.7280233, 27.7280233, 27.7280233, 27.7280233);
75
  SSE_EXP2_F3: array [0..3] of Single = (4.84252568, 4.84252568, 4.84252568, 4.84252568);
76
  SSE_EXP2_F4: array [0..3] of Single = (1.49012907, 1.49012907, 1.49012907, 1.49012907);
77
  SSE_EXP2_F5: array [0..3] of Single = ($00800000, $00800000, $00800000, $00800000);
78

79
{ Angle and Trigonometry Functions }
80

81
function Radians(const ADegrees: Single): Single;
82
begin
83
  Result := ADegrees * (Pi / 180);
84
end;
85

86
function Radians(const ADegrees: TVector2): TVector2; assembler;
87
asm
88
  movlps xmm0, [ADegrees]
89
  movlps xmm1, QWORD [SSE_PI_OVER_180]
90
  mulps  xmm0, xmm1
91
  movlps [Result], xmm0
92
end;
93

94
function Radians(const ADegrees: TVector3): TVector3; assembler;
95
asm
96
  movq   xmm0, [ADegrees]
97
  movss  xmm1, [ADegrees+8]
98
  movups xmm2, [SSE_PI_OVER_180]
99
  mulps  xmm0, xmm2
100
  mulps  xmm1, xmm2
101
  movq   [Result], xmm0
102
  movss  [Result+8], xmm1
103
end;
104

105
function Radians(const ADegrees: TVector4): TVector4; assembler;
106
asm
107
  movups xmm0, [ADegrees]
108
  movups xmm1, [SSE_PI_OVER_180]
109
  mulps  xmm0, xmm1
110
  movups [Result], xmm0
111
end;
112

113
function Degrees(const ARadians: Single): Single;
114
begin
115
  Result := ARadians * (180 / Pi);
116
end;
117

118
function Degrees(const ARadians: TVector2): TVector2; assembler;
119
asm
120
  movlps xmm0, [ARadians]
121
  movlps xmm1, QWORD [SSE_180_OVER_PI]
122
  mulps  xmm0, xmm1
123
  movlps [Result], xmm0
124
end;
125

126
function Degrees(const ARadians: TVector3): TVector3; assembler;
127
asm
128
  movq   xmm0, [ARadians]
129
  movss  xmm1, [ARadians+8]
130
  movups xmm2, [SSE_180_OVER_PI]
131
  mulps  xmm0, xmm2
132
  mulps  xmm1, xmm2
133
  movq   [Result], xmm0
134
  movss  [Result+8], xmm1
135
end;
136

137
function Degrees(const ARadians: TVector4): TVector4; assembler;
138
asm
139
  movups xmm0, [ARadians]
140
  movups xmm1, [SSE_180_OVER_PI]
141
  mulps  xmm0, xmm1
142
  movups [Result], xmm0
143
end;
144

145
{ Exponential Functions }
146

147
function Sqrt(const A: Single): Single; assembler;
148
asm
149
  movss  xmm0, [A]
150
  sqrtss xmm0, xmm0
151
  movss  [Result], xmm0
152
end;
153

154
function Sqrt(const A: TVector2): TVector2; assembler;
155
asm
156
  movlps xmm0, [A]
157
  sqrtps xmm0, xmm0
158
  movlps [Result], xmm0
159
end;
160

161
function Sqrt(const A: TVector3): TVector3; assembler;
162
asm
163
  movq    xmm0, [A]
164
  movss   xmm1, [A+8]
165
  movlhps xmm0, xmm1
166
  sqrtps  xmm0, xmm0
167
  movhlps xmm1, xmm0
168
  movq    [Result], xmm0
169
  movss   [Result+8], xmm1
170
end;
171

172
function Sqrt(const A: TVector4): TVector4; assembler;
173
asm
174
  movups xmm0, [A]
175
  sqrtps xmm0, xmm0
176
  movups [Result], xmm0
177
end;
178

179
function InverseSqrt(const A: Single): Single; assembler;
180
asm
181
  movss   xmm0, [A]
182
  rsqrtss xmm0, xmm0
183
  movss   [Result], xmm0
184
end;
185

186
function InverseSqrt(const A: TVector2): TVector2;
187
asm
188
  movlps  xmm0, [A]
189
  rsqrtps xmm0, xmm0
190
  movlps  [Result], xmm0
191
end;
192

193
function InverseSqrt(const A: TVector3): TVector3;
194
asm
195
  movq    xmm0, [A]
196
  movss   xmm1, [A+8]
197
  movlhps xmm0, xmm1
198
  rsqrtps xmm0, xmm0
199
  movhlps xmm1, xmm0
200
  movq    [Result], xmm0
201
  movss   [Result+8], xmm1
202
end;
203

204
function InverseSqrt(const A: TVector4): TVector4; assembler;
205
asm
206
  movups  xmm0, [A]
207
  rsqrtps xmm0, xmm0
208
  movups  [Result], xmm0
209
end;
210

211
{ Fast approximate Functions }
212

213
function FastSin(const ARadians: Single): Single; assembler;
214
asm
215
  movss    xmm0, [ARadians]
216
  movss    xmm2, DWORD [SSE_MASK_ABS_VAL]
217
  movaps   xmm1, xmm0
218
  movss    xmm3, DWORD [SSE_MASK_SIGN]
219
  andps    xmm0, xmm2               // (xmm0) X := Abs(ARadians)
220
  andps    xmm1, xmm3               // (xmm1) SignBit
221
  movaps   xmm2, xmm0
222
  movss    xmm4, DWORD [SSE_FOPI]
223
  movss    xmm5, DWORD [SSE_INT_ONE]
224
  mulss    xmm2, xmm4
225
  movss    xmm6, DWORD [SSE_INT_NOT_ONE]
226
  cvtps2dq xmm2, xmm2               // J := Trunc(X * FOPI)
227
  movss    xmm7, DWORD [SSE_INT_FOUR]
228
  paddd    xmm2, xmm5
229
  pand     xmm2, xmm6               // (xmm2) J := (J + 1) and (not 1)
230
  movss    xmm6, DWORD [SSE_INT_TWO]
231
  cvtdq2ps xmm4, xmm2               // (xmm4) Y := J
232
  movaps   xmm5, xmm2
233
  pand     xmm2, xmm6               // J and 2
234
  pand     xmm5, xmm7               // J and 4
235
  pxor     xmm7, xmm7
236
  pslld    xmm5, 29                 // (xmm5) SwapSignBit := (J and 4) shl 29
237
  pcmpeqd  xmm2, xmm7               // (xmm2) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
238
  movss    xmm6, DWORD [SSE_PI_OVER_4]
239
  pxor     xmm1, xmm5               // (xmm1) SignBit := SignBit xor SwapSignBit
240
  mulss    xmm4, xmm6               // Y * Pi / 4
241
  movss    xmm3, DWORD [SSE_COSCOF_P0]
242
  subss    xmm0, xmm4               // (xmm0) X := X - (Y * Pi / 4)
243
  movss    xmm4, DWORD [SSE_COSCOF_P1]
244
  movaps   xmm7, xmm0
245
  movss    xmm6, DWORD [SSE_COSCOF_P2]
246
  mulss    xmm7, xmm7               // (xmm7) Z := X * X
247
  movss    xmm5, DWORD [SSE_SINCOF_P1]
248
  mulss    xmm3, xmm7               // COSCOF_P0 * Z
249
  addss    xmm3, xmm4               // Y := COSCOF_P0 * Z + COSCOF_P1
250
  movss    xmm4, DWORD [SSE_ONE_HALF]
251
  mulss    xmm3, xmm7               // Y * Z
252
  mulss    xmm4, xmm7               // Z * 0.5
253
  addps    xmm3, xmm6               // Y := (Y * Z) + COSCOF_P2
254
  movss    xmm6, DWORD [SSE_ONE]
255
  mulss    xmm3, xmm7               // Y * Z
256
  mulss    xmm3, xmm7               // Y := Y * (Z * Z)
257
  subss    xmm3, xmm4               // Y - Z * 0.5
258
  movss    xmm4, DWORD [SSE_SINCOF_P0]
259
  addps    xmm3, xmm6               // (xmm3) Y := Y - Z * 0.5 + 1
260
  movss    xmm6, DWORD [SSE_SINCOF_P2]
261
  mulss    xmm4, xmm7               // SINCOF_P0 * Z
262
  addss    xmm4, xmm5               // Y2 := SINCOF_P0 * Z + SINCOF_P1
263
  movaps   xmm5, xmm2
264
  mulss    xmm4, xmm7               // Y2 * Z
265
  addss    xmm4, xmm6               // Y2 := (Y2 * Z) + SINCOF_P2
266
  mulss    xmm4, xmm7               // Y2 * Z
267
  mulss    xmm4, xmm0               // Y2 * (Z * X)
268
  addss    xmm4, xmm0               // (xmm4) Y2 := Y2 * (Z * X) + X
269
  andps    xmm4, xmm2               // Y2 := ((J and 2) = 0)? Yes: Y2, No: 0
270
  andnps   xmm5, xmm3               // Y  := ((J and 2) = 0)? Yes: 0 , No: Y
271
  addss    xmm4, xmm5
272
  xorps    xmm4, xmm1               // (Y + Y2) xor SignBit
273
  movss    [Result], xmm4
274
end;
275

276
function FastSin(const ARadians: TVector2): TVector2; assembler;
277
asm
278
  movlps   xmm0, [ARadians]
279
  movlps   xmm2, QWORD [SSE_MASK_ABS_VAL]
280
  movaps   xmm1, xmm0
281
  movlps   xmm3, QWORD [SSE_MASK_SIGN]
282
  andps    xmm0, xmm2               // (xmm0) X := Abs(ARadians)
283
  andps    xmm1, xmm3               // (xmm1) SignBit
284
  movaps   xmm2, xmm0
285
  movlps   xmm4, QWORD [SSE_FOPI]
286
  movlps   xmm5, QWORD [SSE_INT_ONE]
287
  mulps    xmm2, xmm4
288
  movlps   xmm6, QWORD [SSE_INT_NOT_ONE]
289
  cvtps2dq xmm2, xmm2               // J := Trunc(X * FOPI)
290
  movlps   xmm7, QWORD [SSE_INT_FOUR]
291
  paddd    xmm2, xmm5
292
  pand     xmm2, xmm6               // (xmm2) J := (J + 1) and (not 1)
293
  movlps   xmm6, QWORD [SSE_INT_TWO]
294
  cvtdq2ps xmm4, xmm2               // (xmm4) Y := J
295
  movaps   xmm5, xmm2
296
  pand     xmm2, xmm6               // J and 2
297
  pand     xmm5, xmm7               // J and 4
298
  pxor     xmm7, xmm7
299
  pslld    xmm5, 29                 // (xmm5) SwapSignBit := (J and 4) shl 29
300
  pcmpeqd  xmm2, xmm7               // (xmm2) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
301
  movlps   xmm6, QWORD [SSE_PI_OVER_4]
302
  pxor     xmm1, xmm5               // (xmm1) SignBit := SignBit xor SwapSignBit
303
  mulps    xmm4, xmm6               // Y * Pi / 4
304
  movlps   xmm3, QWORD [SSE_COSCOF_P0]
305
  subps    xmm0, xmm4               // (xmm0) X := X - (Y * Pi / 4)
306
  movlps   xmm4, QWORD [SSE_COSCOF_P1]
307
  movaps   xmm7, xmm0
308
  movlps   xmm6, QWORD [SSE_COSCOF_P2]
309
  mulps    xmm7, xmm7               // (xmm7) Z := X * X
310
  movlps   xmm5, QWORD [SSE_SINCOF_P1]
311
  mulps    xmm3, xmm7               // COSCOF_P0 * Z
312
  addps    xmm3, xmm4               // Y := COSCOF_P0 * Z + COSCOF_P1
313
  movlps   xmm4, QWORD [SSE_ONE_HALF]
314
  mulps    xmm3, xmm7               // Y * Z
315
  mulps    xmm4, xmm7               // Z * 0.5
316
  addps    xmm3, xmm6               // Y := (Y * Z) + COSCOF_P2
317
  movlps   xmm6, QWORD [SSE_ONE]
318
  mulps    xmm3, xmm7               // Y * Z
319
  mulps    xmm3, xmm7               // Y := Y * (Z * Z)
320
  subps    xmm3, xmm4               // Y - Z * 0.5
321
  movlps   xmm4, QWORD [SSE_SINCOF_P0]
322
  addps    xmm3, xmm6               // (xmm3) Y := Y - Z * 0.5 + 1
323
  movlps   xmm6, QWORD [SSE_SINCOF_P2]
324
  mulps    xmm4, xmm7               // SINCOF_P0 * Z
325
  addps    xmm4, xmm5               // Y2 := SINCOF_P0 * Z + SINCOF_P1
326
  movaps   xmm5, xmm2
327
  mulps    xmm4, xmm7               // Y2 * Z
328
  addps    xmm4, xmm6               // Y2 := (Y2 * Z) + SINCOF_P2
329
  mulps    xmm4, xmm7               // Y2 * Z
330
  mulps    xmm4, xmm0               // Y2 * (Z * X)
331
  addps    xmm4, xmm0               // (xmm4) Y2 := Y2 * (Z * X) + X
332
  andps    xmm4, xmm2               // Y2 := ((J and 2) = 0)? Yes: Y2, No: 0
333
  andnps   xmm5, xmm3               // Y  := ((J and 2) = 0)? Yes: 0 , No: Y
334
  addps    xmm4, xmm5
335
  xorps    xmm4, xmm1               // (Y + Y2) xor SignBit
336
  movlps   [Result], xmm4
337
end;
338

339
function FastSin(const ARadians: TVector3): TVector3; assembler;
340
asm
341
  movq     xmm0, [ARadians]
342
  movss    xmm1, [ARadians+8]
343
  movlhps  xmm0, xmm1
344
  movups   xmm2, [SSE_MASK_ABS_VAL]
345
  movaps   xmm1, xmm0
346
  movups   xmm3, [SSE_MASK_SIGN]
347
  andps    xmm0, xmm2               // (xmm0) X := Abs(ARadians)
348
  andps    xmm1, xmm3               // (xmm1) SignBit
349
  movaps   xmm2, xmm0
350
  movups   xmm4, [SSE_FOPI]
351
  movups   xmm5, [SSE_INT_ONE]
352
  mulps    xmm2, xmm4
353
  movups   xmm6, [SSE_INT_NOT_ONE]
354
  cvtps2dq xmm2, xmm2               // J := Trunc(X * FOPI)
355
  movups   xmm7, [SSE_INT_FOUR]
356
  paddd    xmm2, xmm5
357
  pand     xmm2, xmm6               // (xmm2) J := (J + 1) and (not 1)
358
  movups   xmm6, [SSE_INT_TWO]
359
  cvtdq2ps xmm4, xmm2               // (xmm4) Y := J
360
  movaps   xmm5, xmm2
361
  pand     xmm2, xmm6               // J and 2
362
  pand     xmm5, xmm7               // J and 4
363
  pxor     xmm7, xmm7
364
  pslld    xmm5, 29                 // (xmm5) SwapSignBit := (J and 4) shl 29
365
  pcmpeqd  xmm2, xmm7               // (xmm2) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
366
  movups   xmm6, [SSE_PI_OVER_4]
367
  pxor     xmm1, xmm5               // (xmm1) SignBit := SignBit xor SwapSignBit
368
  mulps    xmm4, xmm6               // Y * Pi / 4
369
  movups   xmm3, [SSE_COSCOF_P0]
370
  subps    xmm0, xmm4               // (xmm0) X := X - (Y * Pi / 4)
371
  movups   xmm4, [SSE_COSCOF_P1]
372
  movaps   xmm7, xmm0
373
  movups   xmm6, [SSE_COSCOF_P2]
374
  mulps    xmm7, xmm7               // (xmm7) Z := X * X
375
  movups   xmm5, [SSE_SINCOF_P1]
376
  mulps    xmm3, xmm7               // COSCOF_P0 * Z
377
  addps    xmm3, xmm4               // Y := COSCOF_P0 * Z + COSCOF_P1
378
  movups   xmm4, [SSE_ONE_HALF]
379
  mulps    xmm3, xmm7               // Y * Z
380
  mulps    xmm4, xmm7               // Z * 0.5
381
  addps    xmm3, xmm6               // Y := (Y * Z) + COSCOF_P2
382
  movups   xmm6, [SSE_ONE]
383
  mulps    xmm3, xmm7               // Y * Z
384
  mulps    xmm3, xmm7               // Y := Y * (Z * Z)
385
  subps    xmm3, xmm4               // Y - Z * 0.5
386
  movups   xmm4, [SSE_SINCOF_P0]
387
  addps    xmm3, xmm6               // (xmm3) Y := Y - Z * 0.5 + 1
388
  movups   xmm6, [SSE_SINCOF_P2]
389
  mulps    xmm4, xmm7               // SINCOF_P0 * Z
390
  addps    xmm4, xmm5               // Y2 := SINCOF_P0 * Z + SINCOF_P1
391
  movaps   xmm5, xmm2
392
  mulps    xmm4, xmm7               // Y2 * Z
393
  addps    xmm4, xmm6               // Y2 := (Y2 * Z) + SINCOF_P2
394
  mulps    xmm4, xmm7               // Y2 * Z
395
  mulps    xmm4, xmm0               // Y2 * (Z * X)
396
  addps    xmm4, xmm0               // (xmm4) Y2 := Y2 * (Z * X) + X
397
  andps    xmm4, xmm2               // Y2 := ((J and 2) = 0)? Yes: Y2, No: 0
398
  andnps   xmm5, xmm3               // Y  := ((J and 2) = 0)? Yes: 0 , No: Y
399
  addps    xmm4, xmm5
400
  xorps    xmm4, xmm1               // (Y + Y2) xor SignBit
401
  movhlps  xmm5, xmm4
402
  movq     [Result], xmm4
403
  movss    [Result+8], xmm5
404
end;
405

406
function FastSin(const ARadians: TVector4): TVector4; assembler;
407
asm
408
  movups   xmm0, [ARadians]
409
  movups   xmm2, [SSE_MASK_ABS_VAL]
410
  movaps   xmm1, xmm0
411
  movups   xmm3, [SSE_MASK_SIGN]
412
  andps    xmm0, xmm2               // (xmm0) X := Abs(ARadians)
413
  andps    xmm1, xmm3               // (xmm1) SignBit
414
  movaps   xmm2, xmm0
415
  movups   xmm4, [SSE_FOPI]
416
  movups   xmm5, [SSE_INT_ONE]
417
  mulps    xmm2, xmm4
418
  movups   xmm6, [SSE_INT_NOT_ONE]
419
  cvtps2dq xmm2, xmm2               // J := Trunc(X * FOPI)
420
  movups   xmm7, [SSE_INT_FOUR]
421
  paddd    xmm2, xmm5
422
  pand     xmm2, xmm6               // (xmm2) J := (J + 1) and (not 1)
423
  movups   xmm6, [SSE_INT_TWO]
424
  cvtdq2ps xmm4, xmm2               // (xmm4) Y := J
425
  movaps   xmm5, xmm2
426
  pand     xmm2, xmm6               // J and 2
427
  pand     xmm5, xmm7               // J and 4
428
  pxor     xmm7, xmm7
429
  pslld    xmm5, 29                 // (xmm5) SwapSignBit := (J and 4) shl 29
430
  pcmpeqd  xmm2, xmm7               // (xmm2) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
431
  movups   xmm6, [SSE_PI_OVER_4]
432
  pxor     xmm1, xmm5               // (xmm1) SignBit := SignBit xor SwapSignBit
433
  mulps    xmm4, xmm6               // Y * Pi / 4
434
  movups   xmm3, [SSE_COSCOF_P0]
435
  subps    xmm0, xmm4               // (xmm0) X := X - (Y * Pi / 4)
436
  movups   xmm4, [SSE_COSCOF_P1]
437
  movaps   xmm7, xmm0
438
  movups   xmm6, [SSE_COSCOF_P2]
439
  mulps    xmm7, xmm7               // (xmm7) Z := X * X
440
  movups   xmm5, [SSE_SINCOF_P1]
441
  mulps    xmm3, xmm7               // COSCOF_P0 * Z
442
  addps    xmm3, xmm4               // Y := COSCOF_P0 * Z + COSCOF_P1
443
  movups   xmm4, [SSE_ONE_HALF]
444
  mulps    xmm3, xmm7               // Y * Z
445
  mulps    xmm4, xmm7               // Z * 0.5
446
  addps    xmm3, xmm6               // Y := (Y * Z) + COSCOF_P2
447
  movups   xmm6, [SSE_ONE]
448
  mulps    xmm3, xmm7               // Y * Z
449
  mulps    xmm3, xmm7               // Y := Y * (Z * Z)
450
  subps    xmm3, xmm4               // Y - Z * 0.5
451
  movups   xmm4, [SSE_SINCOF_P0]
452
  addps    xmm3, xmm6               // (xmm3) Y := Y - Z * 0.5 + 1
453
  movups   xmm6, [SSE_SINCOF_P2]
454
  mulps    xmm4, xmm7               // SINCOF_P0 * Z
455
  addps    xmm4, xmm5               // Y2 := SINCOF_P0 * Z + SINCOF_P1
456
  movaps   xmm5, xmm2
457
  mulps    xmm4, xmm7               // Y2 * Z
458
  addps    xmm4, xmm6               // Y2 := (Y2 * Z) + SINCOF_P2
459
  mulps    xmm4, xmm7               // Y2 * Z
460
  mulps    xmm4, xmm0               // Y2 * (Z * X)
461
  addps    xmm4, xmm0               // (xmm4) Y2 := Y2 * (Z * X) + X
462
  andps    xmm4, xmm2               // Y2 := ((J and 2) = 0)? Yes: Y2, No: 0
463
  andnps   xmm5, xmm3               // Y  := ((J and 2) = 0)? Yes: 0 , No: Y
464
  addps    xmm4, xmm5
465
  xorps    xmm4, xmm1               // (Y + Y2) xor SignBit
466
  movups   [Result], xmm4
467
end;
468

469
function FastCos(const ARadians: Single): Single; assembler;
470
asm
471
  movss    xmm0, [ARadians]
472
  movss    xmm1, DWORD [SSE_MASK_ABS_VAL]
473
  movss    xmm2, DWORD [SSE_FOPI]
474
  andps    xmm0, xmm1               // (xmm0) X := Abs(ARadians)
475
  movss    xmm3, DWORD [SSE_INT_NOT_ONE]
476
  movaps   xmm1, xmm0
477
  movss    xmm4, DWORD [SSE_INT_FOUR]
478
  mulss    xmm1, xmm2
479
  movss    xmm2, DWORD [SSE_INT_ONE]
480
  cvtps2dq xmm1, xmm1               // J := Trunc(X * FOPI)
481
  pxor     xmm6, xmm6
482
  paddd    xmm1, xmm2
483
  pand     xmm1, xmm3               // (xmm1) J := (J + 1) and (not 1)
484
  movss    xmm3, DWORD [SSE_INT_TWO]
485
  cvtdq2ps xmm2, xmm1               // (xmm2) Y := J
486
  psubd    xmm1, xmm3               // J - 2
487
  movaps   xmm5, xmm1
488
  pandn    xmm1, xmm4               // (not (J - 2)) and 4
489
  pand     xmm5, xmm3               // (J - 2) and 2
490
  pslld    xmm1, 29                 // (xmm1) SignBit := ((not (J - 2)) and 4) shl 29
491
  movss    xmm3, DWORD [SSE_PI_OVER_4]
492
  pcmpeqd  xmm5, xmm6               // (xmm5) PolyMask := ((J-2) and 2)=0)? Yes: $FFFFFFFF, No: $00000000
493
  mulss    xmm2, xmm3               // Y * Pi / 4
494
  movss    xmm3, DWORD [SSE_COSCOF_P1]
495
  subss    xmm0, xmm2               // (xmm0) X := X - (Y * Pi / 4)
496
  movss    xmm2, DWORD [SSE_COSCOF_P0]
497
  movss    xmm4, DWORD [SSE_COSCOF_P2]
498
  movaps   xmm6, xmm0
499
  mulss    xmm6, xmm6               // (xmm6) Z := X * X
500
  mulss    xmm2, xmm6               // COSCOF_P0 * Z
501
  addps    xmm2, xmm3               // Y := COSCOF_P0 * Z + COSCOF_P1
502
  movss    xmm3, DWORD [SSE_ONE_HALF]
503
  mulss    xmm2, xmm6               // Y * Z
504
  mulss    xmm3, xmm6               // Z * 0.5
505
  addss    xmm2, xmm4               // Y := (Y * Z) + COSCOF_P2
506
  movss    xmm7, DWORD [SSE_ONE]
507
  mulss    xmm2, xmm6
508
  movss    xmm4, DWORD [SSE_SINCOF_P1]
509
  mulss    xmm2, xmm6               // Y := Y * (Z * Z)
510
  subss    xmm2, xmm3               // Y - Z * 0.5
511
  addss    xmm2, xmm7               // (xmm2) Y := Y - Z * 0.5 + 1
512
  movss    xmm3, DWORD [SSE_SINCOF_P0]
513
  movss    xmm7, DWORD [SSE_SINCOF_P2]
514
  mulss    xmm3, xmm6               // SINCOF_P0 * Z
515
  addss    xmm3, xmm4               // Y2 := SINCOF_P0 * Z + SINCOF_P1
516
  mulss    xmm3, xmm6               // Y2 * Z
517
  addss    xmm3, xmm7               // Y2 := (Y2 * Z) + SINCOF_P2
518
  mulss    xmm3, xmm6               // Y2 * Z
519
  mulss    xmm3, xmm0               // Y2 * (Z * X)
520
  addss    xmm3, xmm0               // Y2 := Y2 * (Z * X) + X
521
  andps    xmm3, xmm5               // ((J-2) and 2) = 0)? Yes: Y2, No: 0
522
  andnps   xmm5, xmm2               // ((J-2) and 2) = 0)? Yes: 0 , No: Y
523
  addss    xmm3, xmm5
524
  xorps    xmm3, xmm1               // (Y + Y2) xor SignBit
525
  movss    [Result], xmm3
526
end;
527

528
function FastCos(const ARadians: TVector2): TVector2; assembler;
529
asm
530
  movlps   xmm0, [ARadians]
531
  movlps   xmm1, QWORD [SSE_MASK_ABS_VAL]
532
  movlps   xmm2, QWORD [SSE_FOPI]
533
  andps    xmm0, xmm1               // (xmm0) X := Abs(ARadians)
534
  movlps   xmm3, QWORD [SSE_INT_NOT_ONE]
535
  movaps   xmm1, xmm0
536
  movlps   xmm4, QWORD [SSE_INT_FOUR]
537
  mulps    xmm1, xmm2
538
  movlps   xmm2, QWORD [SSE_INT_ONE]
539
  cvtps2dq xmm1, xmm1               // J := Trunc(X * FOPI)
540
  pxor     xmm6, xmm6
541
  paddd    xmm1, xmm2
542
  pand     xmm1, xmm3               // (xmm1) J := (J + 1) and (not 1)
543
  movlps   xmm3, QWORD [SSE_INT_TWO]
544
  cvtdq2ps xmm2, xmm1               // (xmm2) Y := J
545
  psubd    xmm1, xmm3               // J - 2
546
  movaps   xmm5, xmm1
547
  pandn    xmm1, xmm4               // (not (J - 2)) and 4
548
  pand     xmm5, xmm3               // (J - 2) and 2
549
  pslld    xmm1, 29                 // (xmm1) SignBit := ((not (J - 2)) and 4) shl 29
550
  movlps   xmm3, QWORD [SSE_PI_OVER_4]
551
  pcmpeqd  xmm5, xmm6               // (xmm5) PolyMask := ((J-2) and 2)=0)? Yes: $FFFFFFFF, No: $00000000
552
  mulps    xmm2, xmm3               // Y * Pi / 4
553
  movlps   xmm3, QWORD [SSE_COSCOF_P1]
554
  subps    xmm0, xmm2               // (xmm0) X := X - (Y * Pi / 4)
555
  movlps   xmm2, QWORD [SSE_COSCOF_P0]
556
  movlps   xmm4, QWORD [SSE_COSCOF_P2]
557
  movaps   xmm6, xmm0
558
  mulps    xmm6, xmm6               // (xmm6) Z := X * X
559
  mulps    xmm2, xmm6               // COSCOF_P0 * Z
560
  addps    xmm2, xmm3               // Y := COSCOF_P0 * Z + COSCOF_P1
561
  movlps   xmm3, QWORD [SSE_ONE_HALF]
562
  mulps    xmm2, xmm6               // Y * Z
563
  mulps    xmm3, xmm6               // Z * 0.5
564
  addps    xmm2, xmm4               // Y := (Y * Z) + COSCOF_P2
565
  movlps   xmm7, QWORD [SSE_ONE]
566
  mulps    xmm2, xmm6
567
  movlps   xmm4, QWORD [SSE_SINCOF_P1]
568
  mulps    xmm2, xmm6               // Y := Y * (Z * Z)
569
  subps    xmm2, xmm3               // Y - Z * 0.5
570
  addps    xmm2, xmm7               // (xmm2) Y := Y - Z * 0.5 + 1
571
  movlps   xmm3, QWORD [SSE_SINCOF_P0]
572
  movlps   xmm7, QWORD [SSE_SINCOF_P2]
573
  mulps    xmm3, xmm6               // SINCOF_P0 * Z
574
  addps    xmm3, xmm4               // Y2 := SINCOF_P0 * Z + SINCOF_P1
575
  mulps    xmm3, xmm6               // Y2 * Z
576
  addps    xmm3, xmm7               // Y2 := (Y2 * Z) + SINCOF_P2
577
  mulps    xmm3, xmm6               // Y2 * Z
578
  mulps    xmm3, xmm0               // Y2 * (Z * X)
579
  addps    xmm3, xmm0               // Y2 := Y2 * (Z * X) + X
580
  andps    xmm3, xmm5               // ((J-2) and 2) = 0)? Yes: Y2, No: 0
581
  andnps   xmm5, xmm2               // ((J-2) and 2) = 0)? Yes: 0 , No: Y
582
  addps    xmm3, xmm5
583
  xorps    xmm3, xmm1               // (Y + Y2) xor SignBit
584
  movlps   [Result], xmm3
585
end;
586

587
function FastCos(const ARadians: TVector3): TVector3; assembler;
588
asm
589
  movq     xmm0, [ARadians]
590
  movss    xmm1, [ARadians+8]
591
  movlhps  xmm0, xmm1
592
  movups   xmm1, [SSE_MASK_ABS_VAL]
593
  movups   xmm2, [SSE_FOPI]
594
  andps    xmm0, xmm1               // (xmm0) X := Abs(ARadians)
595
  movups   xmm3, [SSE_INT_NOT_ONE]
596
  movaps   xmm1, xmm0
597
  movups   xmm4, [SSE_INT_FOUR]
598
  mulps    xmm1, xmm2
599
  movups   xmm2, [SSE_INT_ONE]
600
  cvtps2dq xmm1, xmm1               // J := Trunc(X * FOPI)
601
  pxor     xmm6, xmm6
602
  paddd    xmm1, xmm2
603
  pand     xmm1, xmm3               // (xmm1) J := (J + 1) and (not 1)
604
  movups   xmm3, [SSE_INT_TWO]
605
  cvtdq2ps xmm2, xmm1               // (xmm2) Y := J
606
  psubd    xmm1, xmm3               // J - 2
607
  movaps   xmm5, xmm1
608
  pandn    xmm1, xmm4               // (not (J - 2)) and 4
609
  pand     xmm5, xmm3               // (J - 2) and 2
610
  pslld    xmm1, 29                 // (xmm1) SignBit := ((not (J - 2)) and 4) shl 29
611
  movups   xmm3, [SSE_PI_OVER_4]
612
  pcmpeqd  xmm5, xmm6               // (xmm5) PolyMask := ((J-2) and 2)=0)? Yes: $FFFFFFFF, No: $00000000
613
  mulps    xmm2, xmm3               // Y * Pi / 4
614
  movups   xmm3, [SSE_COSCOF_P1]
615
  subps    xmm0, xmm2               // (xmm0) X := X - (Y * Pi / 4)
616
  movups   xmm2, [SSE_COSCOF_P0]
617
  movups   xmm4, [SSE_COSCOF_P2]
618
  movaps   xmm6, xmm0
619
  mulps    xmm6, xmm6               // (xmm6) Z := X * X
620
  mulps    xmm2, xmm6               // COSCOF_P0 * Z
621
  addps    xmm2, xmm3               // Y := COSCOF_P0 * Z + COSCOF_P1
622
  movups   xmm3, [SSE_ONE_HALF]
623
  mulps    xmm2, xmm6               // Y * Z
624
  mulps    xmm3, xmm6               // Z * 0.5
625
  addps    xmm2, xmm4               // Y := (Y * Z) + COSCOF_P2
626
  movups   xmm7, [SSE_ONE]
627
  mulps    xmm2, xmm6
628
  movups   xmm4, [SSE_SINCOF_P1]
629
  mulps    xmm2, xmm6               // Y := Y * (Z * Z)
630
  subps    xmm2, xmm3               // Y - Z * 0.5
631
  addps    xmm2, xmm7               // (xmm2) Y := Y - Z * 0.5 + 1
632
  movups   xmm3, [SSE_SINCOF_P0]
633
  movups   xmm7, [SSE_SINCOF_P2]
634
  mulps    xmm3, xmm6               // SINCOF_P0 * Z
635
  addps    xmm3, xmm4               // Y2 := SINCOF_P0 * Z + SINCOF_P1
636
  mulps    xmm3, xmm6               // Y2 * Z
637
  addps    xmm3, xmm7               // Y2 := (Y2 * Z) + SINCOF_P2
638
  mulps    xmm3, xmm6               // Y2 * Z
639
  mulps    xmm3, xmm0               // Y2 * (Z * X)
640
  addps    xmm3, xmm0               // Y2 := Y2 * (Z * X) + X
641
  andps    xmm3, xmm5               // ((J-2) and 2) = 0)? Yes: Y2, No: 0
642
  andnps   xmm5, xmm2               // ((J-2) and 2) = 0)? Yes: 0 , No: Y
643
  addps    xmm3, xmm5
644
  xorps    xmm3, xmm1               // (Y + Y2) xor SignBit
645
  movhlps  xmm4, xmm3
646
  movq     [Result], xmm3
647
  movss    [Result+8], xmm4
648
end;
649

650
function FastCos(const ARadians: TVector4): TVector4; assembler;
651
asm
652
  movups   xmm0, [ARadians]
653
  movups   xmm1, [SSE_MASK_ABS_VAL]
654
  movups   xmm2, [SSE_FOPI]
655
  andps    xmm0, xmm1               // (xmm0) X := Abs(ARadians)
656
  movups   xmm3, [SSE_INT_NOT_ONE]
657
  movaps   xmm1, xmm0
658
  movups   xmm4, [SSE_INT_FOUR]
659
  mulps    xmm1, xmm2
660
  movups   xmm2, [SSE_INT_ONE]
661
  cvtps2dq xmm1, xmm1               // J := Trunc(X * FOPI)
662
  pxor     xmm6, xmm6
663
  paddd    xmm1, xmm2
664
  pand     xmm1, xmm3               // (xmm1) J := (J + 1) and (not 1)
665
  movups   xmm3, [SSE_INT_TWO]
666
  cvtdq2ps xmm2, xmm1               // (xmm2) Y := J
667
  psubd    xmm1, xmm3               // J - 2
668
  movaps   xmm5, xmm1
669
  pandn    xmm1, xmm4               // (not (J - 2)) and 4
670
  pand     xmm5, xmm3               // (J - 2) and 2
671
  pslld    xmm1, 29                 // (xmm1) SignBit := ((not (J - 2)) and 4) shl 29
672
  movups   xmm3, [SSE_PI_OVER_4]
673
  pcmpeqd  xmm5, xmm6               // (xmm5) PolyMask := ((J-2) and 2)=0)? Yes: $FFFFFFFF, No: $00000000
674
  mulps    xmm2, xmm3               // Y * Pi / 4
675
  movups   xmm3, [SSE_COSCOF_P1]
676
  subps    xmm0, xmm2               // (xmm0) X := X - (Y * Pi / 4)
677
  movups   xmm2, [SSE_COSCOF_P0]
678
  movups   xmm4, [SSE_COSCOF_P2]
679
  movaps   xmm6, xmm0
680
  mulps    xmm6, xmm6               // (xmm6) Z := X * X
681
  mulps    xmm2, xmm6               // COSCOF_P0 * Z
682
  addps    xmm2, xmm3               // Y := COSCOF_P0 * Z + COSCOF_P1
683
  movups   xmm3, [SSE_ONE_HALF]
684
  mulps    xmm2, xmm6               // Y * Z
685
  mulps    xmm3, xmm6               // Z * 0.5
686
  addps    xmm2, xmm4               // Y := (Y * Z) + COSCOF_P2
687
  movups   xmm7, [SSE_ONE]
688
  mulps    xmm2, xmm6
689
  movups   xmm4, [SSE_SINCOF_P1]
690
  mulps    xmm2, xmm6               // Y := Y * (Z * Z)
691
  subps    xmm2, xmm3               // Y - Z * 0.5
692
  addps    xmm2, xmm7               // (xmm2) Y := Y - Z * 0.5 + 1
693
  movups   xmm3, [SSE_SINCOF_P0]
694
  movups   xmm7, [SSE_SINCOF_P2]
695
  mulps    xmm3, xmm6               // SINCOF_P0 * Z
696
  addps    xmm3, xmm4               // Y2 := SINCOF_P0 * Z + SINCOF_P1
697
  mulps    xmm3, xmm6               // Y2 * Z
698
  addps    xmm3, xmm7               // Y2 := (Y2 * Z) + SINCOF_P2
699
  mulps    xmm3, xmm6               // Y2 * Z
700
  mulps    xmm3, xmm0               // Y2 * (Z * X)
701
  addps    xmm3, xmm0               // Y2 := Y2 * (Z * X) + X
702
  andps    xmm3, xmm5               // ((J-2) and 2) = 0)? Yes: Y2, No: 0
703
  andnps   xmm5, xmm2               // ((J-2) and 2) = 0)? Yes: 0 , No: Y
704
  addps    xmm3, xmm5
705
  xorps    xmm3, xmm1               // (Y + Y2) xor SignBit
706
  movups   [Result], xmm3
707
end;
708

709
procedure FastSinCos(const ARadians: Single; out ASin, ACos: Single); assembler;
710
asm
711
  movss    xmm0, [ARadians]
712
  movss    xmm2, DWORD [SSE_MASK_SIGN]
713
  movss    xmm3, DWORD [SSE_MASK_ABS_VAL]
714
  movaps   xmm1, xmm0
715
  pand     xmm0, xmm3               // (xmm0) X := Abs(ARadians)
716
  pand     xmm1, xmm2               // (xmm1) SignBitSin
717
  movaps   xmm4, xmm0
718
  movss    xmm5, DWORD [SSE_FOPI]
719
  movss    xmm6, DWORD [SSE_INT_ONE]
720
  mulss    xmm4, xmm5
721
  movss    xmm7, DWORD [SSE_INT_NOT_ONE]
722
  cvtps2dq xmm4, xmm4               // (xmm4) J := Trunc(X * FOPI)
723
  movss    xmm5, DWORD [SSE_INT_FOUR]
724
  paddd    xmm4, xmm6
725
  pand     xmm4, xmm7               // (xmm4) J := (J + 1) and (not 1)
726
  movss    xmm7, DWORD [SSE_INT_TWO]
727
  cvtdq2ps xmm2, xmm4               // (xmm2) Y := J
728
  movaps   xmm3, xmm4
729
  movaps   xmm6, xmm4               // (xmm6) J
730
  pand     xmm3, xmm5               // J and 4
731
  pand     xmm4, xmm7               // J and 2
732
  pxor     xmm5, xmm5
733
  pslld    xmm3, 29                 // (xmm3) SwapSignBitSin := (J and 4) shl 29
734
  movss    xmm7, DWORD [SSE_PI_OVER_4]
735
  pcmpeqd  xmm4, xmm5               // (xmm4) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
736
  mulss    xmm2, xmm7               // Y * Pi / 4
737
  movss    xmm5, DWORD [SSE_INT_TWO]
738
  subss    xmm0, xmm2               // (xmm0) X := X - (Y * Pi / 4)
739
  psubd    xmm6, xmm5               // J - 2
740
  movss    xmm7, DWORD [SSE_INT_FOUR]
741
  pxor     xmm1, xmm3               // (xmm1) SignBitSin := SignBitSin xor SwapSignBitSin
742
  andnps   xmm6, xmm7               // (not (J - 2)) and 4
743
  movaps   xmm3, xmm0
744
  pslld    xmm6, 29                 // (xmm6) SignBitCos := ((not (J - 2)) and 4) shl 29
745
  mulss    xmm3, xmm3               // (xmm3) Z := X * X
746
  movss    xmm2, DWORD [SSE_COSCOF_P0]
747
  movss    xmm5, DWORD [SSE_COSCOF_P1]
748
  movss    xmm7, DWORD [SSE_COSCOF_P2]
749
  mulss    xmm2, xmm3               // COSCOF_P0 * Z
750
  addss    xmm2, xmm5               // Y := COSCOF_P0 * Z + COSCOF_P1
751
  movss    xmm5, DWORD [SSE_ONE_HALF]
752
  mulss    xmm2, xmm3               // Y * Z
753
  addss    xmm2, xmm7               // Y := (Y * Z) + COSCOF_P2
754
  movss    xmm7, DWORD [SSE_ONE]
755
  mulss    xmm2, xmm3               // Y * Z
756
  mulss    xmm5, xmm3               // 0.5 * Z
757
  mulss    xmm2, xmm3               // Y * (Z * Z)
758
  subss    xmm2, xmm5               // Y - 0.5 * Z
759
  movss    xmm5, DWORD [SSE_SINCOF_P0]
760
  addss    xmm2, xmm7               // (xmm2) Y := Y - 0.5 * Z + 1
761
  movss    xmm7, DWORD [SSE_SINCOF_P1]
762
  mulss    xmm5, xmm3               // SINCOF_P0 * Z
763
  addss    xmm5, xmm7               // Y2 := SINCOF_P0 * Z + SINCOF_P1
764
  mulss    xmm5, xmm3               // Y2 * Z
765
  movss    xmm7, DWORD [SSE_SINCOF_P2]
766
  addss    xmm5, xmm7               // Y2 := Y2 * Z + SINCOF_P2
767
  mulss    xmm5, xmm3               // Y2 * Z
768
  mulss    xmm5, xmm0               // Y2 * (Z * X)
769
  addss    xmm5, xmm0               // (xmm5) Y2 := Y2 * (Z * X) + X
770
  movaps   xmm0, xmm2               // Y
771
  movaps   xmm3, xmm5               // Y2
772
  andps    xmm5, xmm4               // ((J and 2) = 0)? Yes: Y2, No: 0
773
  andnps   xmm4, xmm2               // ((J and 2) = 0)? Yes: 0 , No: Y
774
  subss    xmm3, xmm5               // ((J and 2) = 0)? Yes: 0 , No: Y2
775
  subss    xmm0, xmm4               // ((J and 2) = 0)? Yes: Y , No: 0
776
  addps    xmm4, xmm5               // ((J and 2) = 0)? Yes: Y2, No: Y
777
  addps    xmm3, xmm0               // ((J and 2) = 0)? Yes: Y , No: Y2
778
  xorps    xmm4, xmm1               // Sin
779
  xorps    xmm3, xmm6               // Cos
780
  movss    [ASin], xmm4
781
  movss    [ACos], xmm3
782
end;
783

784
procedure FastSinCos(const ARadians: TVector2; out ASin, ACos: TVector2); assembler;
785
asm
786
  movlps   xmm0, [ARadians]
787
  movlps   xmm2, QWORD [SSE_MASK_SIGN]
788
  movlps   xmm3, QWORD [SSE_MASK_ABS_VAL]
789
  movaps   xmm1, xmm0
790
  pand     xmm0, xmm3               // (xmm0) X := Abs(ARadians)
791
  pand     xmm1, xmm2               // (xmm1) SignBitSin
792
  movaps   xmm4, xmm0
793
  movlps   xmm5, QWORD [SSE_FOPI]
794
  movlps   xmm6, QWORD [SSE_INT_ONE]
795
  mulps    xmm4, xmm5
796
  movlps   xmm7, QWORD [SSE_INT_NOT_ONE]
797
  cvtps2dq xmm4, xmm4               // (xmm4) J := Trunc(X * FOPI)
798
  movlps   xmm5, QWORD [SSE_INT_FOUR]
799
  paddd    xmm4, xmm6
800
  pand     xmm4, xmm7               // (xmm4) J := (J + 1) and (not 1)
801
  movlps   xmm7, QWORD [SSE_INT_TWO]
802
  cvtdq2ps xmm2, xmm4               // (xmm2) Y := J
803
  movaps   xmm3, xmm4
804
  movaps   xmm6, xmm4               // (xmm6) J
805
  pand     xmm3, xmm5               // J and 4
806
  pand     xmm4, xmm7               // J and 2
807
  pxor     xmm5, xmm5
808
  pslld    xmm3, 29                 // (xmm3) SwapSignBitSin := (J and 4) shl 29
809
  movlps   xmm7, QWORD [SSE_PI_OVER_4]
810
  pcmpeqd  xmm4, xmm5               // (xmm4) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
811
  mulps    xmm2, xmm7               // Y * Pi / 4
812
  movlps   xmm5, QWORD [SSE_INT_TWO]
813
  subps    xmm0, xmm2               // (xmm0) X := X - (Y * Pi / 4)
814
  psubd    xmm6, xmm5               // J - 2
815
  movlps   xmm7, QWORD [SSE_INT_FOUR]
816
  pxor     xmm1, xmm3               // (xmm1) SignBitSin := SignBitSin xor SwapSignBitSin
817
  andnps   xmm6, xmm7               // (not (J - 2)) and 4
818
  movaps   xmm3, xmm0
819
  pslld    xmm6, 29                 // (xmm6) SignBitCos := ((not (J - 2)) and 4) shl 29
820
  mulps    xmm3, xmm3               // (xmm3) Z := X * X
821
  movlps   xmm2, QWORD [SSE_COSCOF_P0]
822
  movlps   xmm5, QWORD [SSE_COSCOF_P1]
823
  movlps   xmm7, QWORD [SSE_COSCOF_P2]
824
  mulps    xmm2, xmm3               // COSCOF_P0 * Z
825
  addps    xmm2, xmm5               // Y := COSCOF_P0 * Z + COSCOF_P1
826
  movlps   xmm5, QWORD [SSE_ONE_HALF]
827
  mulps    xmm2, xmm3               // Y * Z
828
  addps    xmm2, xmm7               // Y := (Y * Z) + COSCOF_P2
829
  movlps   xmm7, QWORD [SSE_ONE]
830
  mulps    xmm2, xmm3               // Y * Z
831
  mulps    xmm5, xmm3               // 0.5 * Z
832
  mulps    xmm2, xmm3               // Y * (Z * Z)
833
  subps    xmm2, xmm5               // Y - 0.5 * Z
834
  movlps   xmm5, QWORD [SSE_SINCOF_P0]
835
  addps    xmm2, xmm7               // (xmm2) Y := Y - 0.5 * Z + 1
836
  movlps   xmm7, QWORD [SSE_SINCOF_P1]
837
  mulps    xmm5, xmm3               // SINCOF_P0 * Z
838
  addps    xmm5, xmm7               // Y2 := SINCOF_P0 * Z + SINCOF_P1
839
  mulps    xmm5, xmm3               // Y2 * Z
840
  movlps   xmm7, QWORD [SSE_SINCOF_P2]
841
  addps    xmm5, xmm7               // Y2 := Y2 * Z + SINCOF_P2
842
  mulps    xmm5, xmm3               // Y2 * Z
843
  mulps    xmm5, xmm0               // Y2 * (Z * X)
844
  addps    xmm5, xmm0               // (xmm5) Y2 := Y2 * (Z * X) + X
845
  movaps   xmm0, xmm2               // Y
846
  movaps   xmm3, xmm5               // Y2
847
  andps    xmm5, xmm4               // ((J and 2) = 0)? Yes: Y2, No: 0
848
  andnps   xmm4, xmm2               // ((J and 2) = 0)? Yes: 0 , No: Y
849
  subps    xmm3, xmm5               // ((J and 2) = 0)? Yes: 0 , No: Y2
850
  subps    xmm0, xmm4               // ((J and 2) = 0)? Yes: Y , No: 0
851
  addps    xmm4, xmm5               // ((J and 2) = 0)? Yes: Y2, No: Y
852
  addps    xmm3, xmm0               // ((J and 2) = 0)? Yes: Y , No: Y2
853
  xorps    xmm4, xmm1               // Sin
854
  xorps    xmm3, xmm6               // Cos
855
  movlps   [ASin], xmm4
856
  movlps   [ACos], xmm3
857
end;
858

859
procedure FastSinCos(const ARadians: TVector3; out ASin, ACos: TVector3); assembler;
860
asm
861
  movq     xmm0, [ARadians]
862
  movss    xmm1, [ARadians+8]
863
  movlhps  xmm0, xmm1
864
  movups   xmm2, [SSE_MASK_SIGN]
865
  movups   xmm3, [SSE_MASK_ABS_VAL]
866
  movaps   xmm1, xmm0
867
  pand     xmm0, xmm3               // (xmm0) X := Abs(ARadians)
868
  pand     xmm1, xmm2               // (xmm1) SignBitSin
869
  movaps   xmm4, xmm0
870
  movups   xmm5, [SSE_FOPI]
871
  movups   xmm6, [SSE_INT_ONE]
872
  mulps    xmm4, xmm5
873
  movups   xmm7, [SSE_INT_NOT_ONE]
874
  cvtps2dq xmm4, xmm4               // (xmm4) J := Trunc(X * FOPI)
875
  movups   xmm5, [SSE_INT_FOUR]
876
  paddd    xmm4, xmm6
877
  pand     xmm4, xmm7               // (xmm4) J := (J + 1) and (not 1)
878
  movups   xmm7, [SSE_INT_TWO]
879
  cvtdq2ps xmm2, xmm4               // (xmm2) Y := J
880
  movaps   xmm3, xmm4
881
  movaps   xmm6, xmm4               // (xmm6) J
882
  pand     xmm3, xmm5               // J and 4
883
  pand     xmm4, xmm7               // J and 2
884
  pxor     xmm5, xmm5
885
  pslld    xmm3, 29                 // (xmm3) SwapSignBitSin := (J and 4) shl 29
886
  movups   xmm7, [SSE_PI_OVER_4]
887
  pcmpeqd  xmm4, xmm5               // (xmm4) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
888
  mulps    xmm2, xmm7               // Y * Pi / 4
889
  movups   xmm5, [SSE_INT_TWO]
890
  subps    xmm0, xmm2               // (xmm0) X := X - (Y * Pi / 4)
891
  psubd    xmm6, xmm5               // J - 2
892
  movups   xmm7, [SSE_INT_FOUR]
893
  pxor     xmm1, xmm3               // (xmm1) SignBitSin := SignBitSin xor SwapSignBitSin
894
  andnps   xmm6, xmm7               // (not (J - 2)) and 4
895
  movaps   xmm3, xmm0
896
  pslld    xmm6, 29                 // (xmm6) SignBitCos := ((not (J - 2)) and 4) shl 29
897
  mulps    xmm3, xmm3               // (xmm3) Z := X * X
898
  movups   xmm2, [SSE_COSCOF_P0]
899
  movups   xmm5, [SSE_COSCOF_P1]
900
  movups   xmm7, [SSE_COSCOF_P2]
901
  mulps    xmm2, xmm3               // COSCOF_P0 * Z
902
  addps    xmm2, xmm5               // Y := COSCOF_P0 * Z + COSCOF_P1
903
  movups   xmm5, [SSE_ONE_HALF]
904
  mulps    xmm2, xmm3               // Y * Z
905
  addps    xmm2, xmm7               // Y := (Y * Z) + COSCOF_P2
906
  movups   xmm7, [SSE_ONE]
907
  mulps    xmm2, xmm3               // Y * Z
908
  mulps    xmm5, xmm3               // 0.5 * Z
909
  mulps    xmm2, xmm3               // Y * (Z * Z)
910
  subps    xmm2, xmm5               // Y - 0.5 * Z
911
  movups   xmm5, [SSE_SINCOF_P0]
912
  addps    xmm2, xmm7               // (xmm2) Y := Y - 0.5 * Z + 1
913
  movups   xmm7, [SSE_SINCOF_P1]
914
  mulps    xmm5, xmm3               // SINCOF_P0 * Z
915
  addps    xmm5, xmm7               // Y2 := SINCOF_P0 * Z + SINCOF_P1
916
  mulps    xmm5, xmm3               // Y2 * Z
917
  movups   xmm7, [SSE_SINCOF_P2]
918
  addps    xmm5, xmm7               // Y2 := Y2 * Z + SINCOF_P2
919
  mulps    xmm5, xmm3               // Y2 * Z
920
  mulps    xmm5, xmm0               // Y2 * (Z * X)
921
  addps    xmm5, xmm0               // (xmm5) Y2 := Y2 * (Z * X) + X
922
  movaps   xmm0, xmm2               // Y
923
  movaps   xmm3, xmm5               // Y2
924
  andps    xmm5, xmm4               // ((J and 2) = 0)? Yes: Y2, No: 0
925
  andnps   xmm4, xmm2               // ((J and 2) = 0)? Yes: 0 , No: Y
926
  subps    xmm3, xmm5               // ((J and 2) = 0)? Yes: 0 , No: Y2
927
  subps    xmm0, xmm4               // ((J and 2) = 0)? Yes: Y , No: 0
928
  addps    xmm4, xmm5               // ((J and 2) = 0)? Yes: Y2, No: Y
929
  addps    xmm3, xmm0               // ((J and 2) = 0)? Yes: Y , No: Y2
930
  xorps    xmm4, xmm1               // Sin
931
  xorps    xmm3, xmm6               // Cos
932
  movhlps  xmm5, xmm4
933
  movhlps  xmm2, xmm3
934
  movq     [ASin], xmm4
935
  movss    [ASin+8], xmm5
936
  movq     [ACos], xmm3
937
  movss    [ACos+8], xmm2
938
end;
939

940
procedure FastSinCos(const ARadians: TVector4; out ASin, ACos: TVector4); assembler;
941
asm
942
  movups   xmm0, [ARadians]
943
  movups   xmm2, [SSE_MASK_SIGN]
944
  movups   xmm3, [SSE_MASK_ABS_VAL]
945
  movaps   xmm1, xmm0
946
  pand     xmm0, xmm3               // (xmm0) X := Abs(ARadians)
947
  pand     xmm1, xmm2               // (xmm1) SignBitSin
948
  movaps   xmm4, xmm0
949
  movups   xmm5, [SSE_FOPI]
950
  movups   xmm6, [SSE_INT_ONE]
951
  mulps    xmm4, xmm5
952
  movups   xmm7, [SSE_INT_NOT_ONE]
953
  cvtps2dq xmm4, xmm4               // (xmm4) J := Trunc(X * FOPI)
954
  movups   xmm5, [SSE_INT_FOUR]
955
  paddd    xmm4, xmm6
956
  pand     xmm4, xmm7               // (xmm4) J := (J + 1) and (not 1)
957
  movups   xmm7, [SSE_INT_TWO]
958
  cvtdq2ps xmm2, xmm4               // (xmm2) Y := J
959
  movaps   xmm3, xmm4
960
  movaps   xmm6, xmm4               // (xmm6) J
961
  pand     xmm3, xmm5               // J and 4
962
  pand     xmm4, xmm7               // J and 2
963
  pxor     xmm5, xmm5
964
  pslld    xmm3, 29                 // (xmm3) SwapSignBitSin := (J and 4) shl 29
965
  movups   xmm7, [SSE_PI_OVER_4]
966
  pcmpeqd  xmm4, xmm5               // (xmm4) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
967
  mulps    xmm2, xmm7               // Y * Pi / 4
968
  movups   xmm5, [SSE_INT_TWO]
969
  subps    xmm0, xmm2               // (xmm0) X := X - (Y * Pi / 4)
970
  psubd    xmm6, xmm5               // J - 2
971
  movups   xmm7, [SSE_INT_FOUR]
972
  pxor     xmm1, xmm3               // (xmm1) SignBitSin := SignBitSin xor SwapSignBitSin
973
  andnps   xmm6, xmm7               // (not (J - 2)) and 4
974
  movaps   xmm3, xmm0
975
  pslld    xmm6, 29                 // (xmm6) SignBitCos := ((not (J - 2)) and 4) shl 29
976
  mulps    xmm3, xmm3               // (xmm3) Z := X * X
977
  movups   xmm2, [SSE_COSCOF_P0]
978
  movups   xmm5, [SSE_COSCOF_P1]
979
  movups   xmm7, [SSE_COSCOF_P2]
980
  mulps    xmm2, xmm3               // COSCOF_P0 * Z
981
  addps    xmm2, xmm5               // Y := COSCOF_P0 * Z + COSCOF_P1
982
  movups   xmm5, [SSE_ONE_HALF]
983
  mulps    xmm2, xmm3               // Y * Z
984
  addps    xmm2, xmm7               // Y := (Y * Z) + COSCOF_P2
985
  movups   xmm7, [SSE_ONE]
986
  mulps    xmm2, xmm3               // Y * Z
987
  mulps    xmm5, xmm3               // 0.5 * Z
988
  mulps    xmm2, xmm3               // Y * (Z * Z)
989
  subps    xmm2, xmm5               // Y - 0.5 * Z
990
  movups   xmm5, [SSE_SINCOF_P0]
991
  addps    xmm2, xmm7               // (xmm2) Y := Y - 0.5 * Z + 1
992
  movups   xmm7, [SSE_SINCOF_P1]
993
  mulps    xmm5, xmm3               // SINCOF_P0 * Z
994
  addps    xmm5, xmm7               // Y2 := SINCOF_P0 * Z + SINCOF_P1
995
  mulps    xmm5, xmm3               // Y2 * Z
996
  movups   xmm7, [SSE_SINCOF_P2]
997
  addps    xmm5, xmm7               // Y2 := Y2 * Z + SINCOF_P2
998
  mulps    xmm5, xmm3               // Y2 * Z
999
  mulps    xmm5, xmm0               // Y2 * (Z * X)
1000
  addps    xmm5, xmm0               // (xmm5) Y2 := Y2 * (Z * X) + X
1001
  movaps   xmm0, xmm2               // Y
1002
  movaps   xmm3, xmm5               // Y2
1003
  andps    xmm5, xmm4               // ((J and 2) = 0)? Yes: Y2, No: 0
1004
  andnps   xmm4, xmm2               // ((J and 2) = 0)? Yes: 0 , No: Y
1005
  subps    xmm3, xmm5               // ((J and 2) = 0)? Yes: 0 , No: Y2
1006
  subps    xmm0, xmm4               // ((J and 2) = 0)? Yes: Y , No: 0
1007
  addps    xmm4, xmm5               // ((J and 2) = 0)? Yes: Y2, No: Y
1008
  addps    xmm3, xmm0               // ((J and 2) = 0)? Yes: Y , No: Y2
1009
  xorps    xmm4, xmm1               // Sin
1010
  xorps    xmm3, xmm6               // Cos
1011
  movups   [ASin], xmm4
1012
  movups   [ACos], xmm3
1013
end;
1014

1015
function FastExp(const A: Single): Single; assembler;
1016
asm
1017
  movss    xmm0, [A]
1018
  movss    xmm1, DWORD [SSE_EXP_A1]
1019
  movss    xmm2, DWORD [SSE_EXP_A2]
1020

1021
  // Val := 12102203.1615614 * A + 1065353216.0
1022
  mulss    xmm0, xmm1
1023
  movss    xmm3, DWORD [SSE_EXP_CST]
1024
  addss    xmm0, xmm2
1025

1026
  // if (Val >= EXP_CST) then Val := EXP_CST
1027
  movss    xmm1, xmm0
1028
  cmpltss  xmm0, xmm3 // (Val < EXP_CST)? Yes: $FFFFFFFF, No: $00000000
1029
  andps    xmm1, xmm0 // (Val < EXP_CST)? Yes: Val, No: 0
1030
  andnps   xmm0, xmm3 // (Val < EXP_CST)? Yes: 0, No: EXP_CST
1031
  orps     xmm0, xmm1 // (Val < EXP_CST)? Yes: Val, No: EXP_CST
1032

1033
  // IVal := Trunc(Val)
1034
  xorps    xmm3, xmm3
1035
  cvtps2dq xmm1, xmm0
1036

1037
  // if (IVal < 0) then I := 0
1038
  movss    xmm2, DWORD [SSE_MASK_EXPONENT]
1039
  movdqa   xmm0, xmm1 // IVal
1040
  pcmpgtd  xmm1, xmm3 // (IVal > 0)? Yes: $FFFFFFFF, No: $00000000
1041
  movss    xmm3, DWORD [SSE_MASK_FRACTION]
1042
  pand     xmm0, xmm1 // (IVal > 0)? Yes: IVal, No: 0
1043

1044
  // XU.I := IVal and $7F800000
1045
  movss    xmm4, DWORD [SSE_EXP_I1]
1046
  movss    xmm1, xmm0
1047
  pand     xmm0, xmm2 // XU.I / XU.S
1048

1049
  // XU2.I := (IVal and $007FFFFF) or $3F800000;
1050
  pand     xmm1, xmm3
1051
  movss    xmm6, DWORD [SSE_EXP_F5]
1052
  por      xmm1, xmm4 // XU2.I / XU2.S
1053

1054
  //  Result := XU.S *
1055
  //    ( 0.509964287281036376953125 + B *
1056
  //    ( 0.3120158612728118896484375 + B *
1057
  //    ( 0.1666135489940643310546875 + B *
1058
  //    (-2.12528370320796966552734375e-3 + B *
1059
  //      1.3534179888665676116943359375e-2))));
1060
  movss    xmm5, DWORD [SSE_EXP_F4]
1061
  movss    xmm7, xmm1
1062

1063
  mulss    xmm1, xmm6
1064
  movss    xmm4, DWORD [SSE_EXP_F3]
1065
  addss    xmm1, xmm5
1066
  movss    xmm3, DWORD [SSE_EXP_F2]
1067
  mulss    xmm1, xmm7
1068
  movss    xmm2, DWORD [SSE_EXP_F1]
1069
  addss    xmm1, xmm4
1070
  mulss    xmm1, xmm7
1071
  addss    xmm1, xmm3
1072
  mulss    xmm1, xmm7
1073
  addss    xmm1, xmm2
1074
  mulss    xmm1, xmm0
1075

1076
  movss    [Result], xmm1
1077
end;
1078

1079
function FastExp(const A: TVector2): TVector2;
1080
asm
1081
  movlps   xmm0, [A]
1082
  movlps   xmm1, QWORD [SSE_EXP_A1]
1083
  movlps   xmm2, QWORD [SSE_EXP_A2]
1084

1085
  // Val := 12102203.1615614 * A + 1065353216.0
1086
  mulps    xmm0, xmm1
1087
  movlps   xmm3, QWORD [SSE_EXP_CST]
1088
  addps    xmm0, xmm2
1089

1090
  // if (Val >= EXP_CST) then Val := EXP_CST
1091
  movaps   xmm1, xmm0
1092
  cmpltps  xmm0, xmm3 // (Val < EXP_CST)? Yes: $FFFFFFFF, No: $00000000
1093
  andps    xmm1, xmm0 // (Val < EXP_CST)? Yes: Val, No: 0
1094
  andnps   xmm0, xmm3 // (Val < EXP_CST)? Yes: 0, No: EXP_CST
1095
  orps     xmm0, xmm1 // (Val < EXP_CST)? Yes: Val, No: EXP_CST
1096

1097
  // IVal := Trunc(Val)
1098
  xorps    xmm3, xmm3
1099
  cvtps2dq xmm1, xmm0
1100

1101
  // if (IVal < 0) then I := 0
1102
  movlps   xmm2, QWORD [SSE_MASK_EXPONENT]
1103
  movdqa   xmm0, xmm1 // IVal
1104
  pcmpgtd  xmm1, xmm3 // (IVal > 0)? Yes: $FFFFFFFF, No: $00000000
1105
  movlps   xmm3, QWORD [SSE_MASK_FRACTION]
1106
  pand     xmm0, xmm1 // (IVal > 0)? Yes: IVal, No: 0
1107

1108
  // XU.I := IVal and $7F800000
1109
  movlps   xmm4, QWORD [SSE_EXP_I1]
1110
  movdqa   xmm1, xmm0
1111
  pand     xmm0, xmm2 // XU.I / XU.S
1112

1113
  // XU2.I := (IVal and $007FFFFF) or $3F800000;
1114
  pand     xmm1, xmm3
1115
  movlps   xmm6, QWORD [SSE_EXP_F5]
1116
  por      xmm1, xmm4 // XU2.I / XU2.S
1117

1118
  //  Result := XU.S *
1119
  //    ( 0.509964287281036376953125 + B *
1120
  //    ( 0.3120158612728118896484375 + B *
1121
  //    ( 0.1666135489940643310546875 + B *
1122
  //    (-2.12528370320796966552734375e-3 + B *
1123
  //      1.3534179888665676116943359375e-2))));
1124
  movlps   xmm5, QWORD [SSE_EXP_F4]
1125
  movaps   xmm7, xmm1
1126

1127
  mulps    xmm1, xmm6
1128
  movlps   xmm4, QWORD [SSE_EXP_F3]
1129
  addps    xmm1, xmm5
1130
  movlps   xmm3, QWORD [SSE_EXP_F2]
1131
  mulps    xmm1, xmm7
1132
  movlps   xmm2, QWORD [SSE_EXP_F1]
1133
  addps    xmm1, xmm4
1134
  mulps    xmm1, xmm7
1135
  addps    xmm1, xmm3
1136
  mulps    xmm1, xmm7
1137
  addps    xmm1, xmm2
1138
  mulps    xmm1, xmm0
1139

1140
  movlps   [Result], xmm1
1141
end;
1142

1143
function FastExp(const A: TVector3): TVector3;
1144
asm
1145
  movq     xmm0, [A]
1146
  movss    xmm1, [A+8]
1147
  movlhps  xmm0, xmm1
1148
  movups   xmm1, [SSE_EXP_A1]
1149
  movups   xmm2, [SSE_EXP_A2]
1150

1151
  // Val := 12102203.1615614 * A + 1065353216.0
1152
  mulps    xmm0, xmm1
1153
  movups   xmm3, [SSE_EXP_CST]
1154
  addps    xmm0, xmm2
1155

1156
  // if (Val >= EXP_CST) then Val := EXP_CST
1157
  movaps   xmm1, xmm0
1158
  cmpltps  xmm0, xmm3 // (Val < EXP_CST)? Yes: $FFFFFFFF, No: $00000000
1159
  andps    xmm1, xmm0 // (Val < EXP_CST)? Yes: Val, No: 0
1160
  andnps   xmm0, xmm3 // (Val < EXP_CST)? Yes: 0, No: EXP_CST
1161
  orps     xmm0, xmm1 // (Val < EXP_CST)? Yes: Val, No: EXP_CST
1162

1163
  // IVal := Trunc(Val)
1164
  xorps    xmm3, xmm3
1165
  cvtps2dq xmm1, xmm0
1166

1167
  // if (IVal < 0) then I := 0
1168
  movups   xmm2, [SSE_MASK_EXPONENT]
1169
  movdqa   xmm0, xmm1 // IVal
1170
  pcmpgtd  xmm1, xmm3 // (IVal > 0)? Yes: $FFFFFFFF, No: $00000000
1171
  movups   xmm3, [SSE_MASK_FRACTION]
1172
  pand     xmm0, xmm1 // (IVal > 0)? Yes: IVal, No: 0
1173

1174
  // XU.I := IVal and $7F800000
1175
  movups   xmm4, [SSE_EXP_I1]
1176
  movdqa   xmm1, xmm0
1177
  pand     xmm0, xmm2 // XU.I / XU.S
1178

1179
  // XU2.I := (IVal and $007FFFFF) or $3F800000;
1180
  pand     xmm1, xmm3
1181
  movups   xmm6, [SSE_EXP_F5]
1182
  por      xmm1, xmm4 // XU2.I / XU2.S
1183

1184
  //  Result := XU.S *
1185
  //    ( 0.509964287281036376953125 + B *
1186
  //    ( 0.3120158612728118896484375 + B *
1187
  //    ( 0.1666135489940643310546875 + B *
1188
  //    (-2.12528370320796966552734375e-3 + B *
1189
  //      1.3534179888665676116943359375e-2))));
1190
  movups   xmm5, [SSE_EXP_F4]
1191
  movaps   xmm7, xmm1
1192

1193
  mulps    xmm1, xmm6
1194
  movups   xmm4, [SSE_EXP_F3]
1195
  addps    xmm1, xmm5
1196
  movups   xmm3, [SSE_EXP_F2]
1197
  mulps    xmm1, xmm7
1198
  movups   xmm2, [SSE_EXP_F1]
1199
  addps    xmm1, xmm4
1200
  mulps    xmm1, xmm7
1201
  addps    xmm1, xmm3
1202
  mulps    xmm1, xmm7
1203
  addps    xmm1, xmm2
1204
  mulps    xmm1, xmm0
1205

1206
  movhlps  xmm0, xmm1
1207
  movq     [Result], xmm1
1208
  movss    [Result+8], xmm0
1209
end;
1210

1211
function FastExp(const A: TVector4): TVector4;
1212
asm
1213
  movups   xmm0, [A]
1214
  movups   xmm1, [SSE_EXP_A1]
1215
  movups   xmm2, [SSE_EXP_A2]
1216

1217
  // Val := 12102203.1615614 * A + 1065353216.0
1218
  mulps    xmm0, xmm1
1219
  movups   xmm3, [SSE_EXP_CST]
1220
  addps    xmm0, xmm2
1221

1222
  // if (Val >= EXP_CST) then Val := EXP_CST
1223
  movaps   xmm1, xmm0
1224
  cmpltps  xmm0, xmm3 // (Val < EXP_CST)? Yes: $FFFFFFFF, No: $00000000
1225
  andps    xmm1, xmm0 // (Val < EXP_CST)? Yes: Val, No: 0
1226
  andnps   xmm0, xmm3 // (Val < EXP_CST)? Yes: 0, No: EXP_CST
1227
  orps     xmm0, xmm1 // (Val < EXP_CST)? Yes: Val, No: EXP_CST
1228

1229
  // IVal := Trunc(Val)
1230
  xorps    xmm3, xmm3
1231
  cvtps2dq xmm1, xmm0
1232

1233
  // if (IVal < 0) then I := 0
1234
  movups   xmm2, [SSE_MASK_EXPONENT]
1235
  movdqa   xmm0, xmm1 // IVal
1236
  pcmpgtd  xmm1, xmm3 // (IVal > 0)? Yes: $FFFFFFFF, No: $00000000
1237
  movups   xmm3, [SSE_MASK_FRACTION]
1238
  pand     xmm0, xmm1 // (IVal > 0)? Yes: IVal, No: 0
1239

1240
  // XU.I := IVal and $7F800000
1241
  movups   xmm4, [SSE_EXP_I1]
1242
  movdqa   xmm1, xmm0
1243
  pand     xmm0, xmm2 // XU.I / XU.S
1244

1245
  // XU2.I := (IVal and $007FFFFF) or $3F800000;
1246
  pand     xmm1, xmm3
1247
  movups   xmm6, [SSE_EXP_F5]
1248
  por      xmm1, xmm4 // XU2.I / XU2.S
1249

1250
  //  Result := XU.S *
1251
  //    ( 0.509964287281036376953125 + B *
1252
  //    ( 0.3120158612728118896484375 + B *
1253
  //    ( 0.1666135489940643310546875 + B *
1254
  //    (-2.12528370320796966552734375e-3 + B *
1255
  //      1.3534179888665676116943359375e-2))));
1256
  movups   xmm5, [SSE_EXP_F4]
1257
  movaps   xmm7, xmm1
1258

1259
  mulps    xmm1, xmm6
1260
  movups   xmm4, [SSE_EXP_F3]
1261
  addps    xmm1, xmm5
1262
  movups   xmm3, [SSE_EXP_F2]
1263
  mulps    xmm1, xmm7
1264
  movups   xmm2, [SSE_EXP_F1]
1265
  addps    xmm1, xmm4
1266
  mulps    xmm1, xmm7
1267
  addps    xmm1, xmm3
1268
  mulps    xmm1, xmm7
1269
  addps    xmm1, xmm2
1270
  mulps    xmm1, xmm0
1271

1272
  movups   [Result], xmm1
1273
end;
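
{ Reference sketch (not part of the original SSE path): a plain-Pascal scalar
  version of the approximation that the FastExp overloads above vectorize,
  reconstructed from the asm comments. It builds the exponent with the
  12102203.1615614 * A + 1065353216.0 trick, clamps against SSE_EXP_CST,
  splits the integer bits into an exponent part (XU) and a mantissa part
  (XU2), and refines the mantissa with the polynomial shown above. The name
  FastExpScalarSketch is illustrative only. }
{
function FastExpScalarSketch(const A: Single): Single;
var
  Val: Single;
  IVal: Integer;
  XU, XU2: record
    case Byte of
      0: (I: Cardinal);
      1: (S: Single);
  end;
  B: Single;
begin
  // Val := 12102203.1615614 * A + 1065353216.0
  Val := 12102203.1615614 * A + 1065353216.0;

  // if (Val >= EXP_CST) then Val := EXP_CST
  if (Val >= SSE_EXP_CST[0]) then
    Val := SSE_EXP_CST[0];

  // IVal := Trunc(Val); if (IVal < 0) then IVal := 0
  IVal := System.Trunc(Val);
  if (IVal < 0) then
    IVal := 0;

  // XU.I := IVal and $7F800000 (exponent bits)
  XU.I := Cardinal(IVal) and $7F800000;

  // XU2.I := (IVal and $007FFFFF) or $3F800000 (mantissa mapped into [1, 2))
  XU2.I := (Cardinal(IVal) and $007FFFFF) or $3F800000;
  B := XU2.S;

  Result := XU.S *
    ( 0.509964287281036376953125 + B *
    ( 0.3120158612728118896484375 + B *
    ( 0.1666135489940643310546875 + B *
    (-2.12528370320796966552734375e-3 + B *
      1.3534179888665676116943359375e-2))));
end;
}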
1274

1275
function FastLn(const A: Single): Single; assembler;
1276
asm
1277
  movss    xmm0, [A]
1278
  xorps    xmm2, xmm2
1279
  movss    xmm1, xmm0
1280
  movss    xmm3, DWORD [SSE_LN_CST]
1281
  movss    xmm4, DWORD [SSE_NEG_INFINITY]
1282

1283
  // Exp := Val.I shr 23
1284
  psrld    xmm0, 23
1285
  movss    xmm5, xmm1
1286
  cvtdq2ps xmm0, xmm0 // xmm0=Exp
1287

1288
  // if (A > 0) then AddCst := -89.93423858 else AddCst := NegInfinity
1289
  cmpnless xmm1, xmm2 // (A > 0)? Yes: $FFFFFFFF, No: $00000000
1290
  movss    xmm2, DWORD [SSE_MASK_FRACTION]
1291
  andps    xmm3, xmm1 // (A > 0)? Yes: -89.93423858, No: 0
1292
  andnps   xmm1, xmm4 // (A > 0)? Yes: 0, No: NegInfinity
1293
  movss    xmm4, DWORD [SSE_EXP_I1]
1294
  orps     xmm1, xmm3 // (A > 0)? Yes: -89.93423858, No: NegInfinity
1295

1296
  // Val.I := (Val.I and $007FFFFF) or $3F800000
1297
  pand     xmm5, xmm2
1298
  movss    xmm2, DWORD [SSE_LN_F5]
1299
  por      xmm5, xmm4
1300
  movss    xmm6, DWORD [SSE_LN_F3]
1301
  movss    xmm3, xmm5 // xmm3=X
1302
  mulss    xmm5, xmm5 // xmm5=X2
1303

1304
  movss    xmm4, xmm3
1305
  movss    xmm7, DWORD [SSE_LN_F4]
1306
  mulss    xmm4, xmm6
1307
  mulss    xmm0, xmm2 // xmm0 = Exp * 0.69314718055995
1308
  subss    xmm4, xmm7
1309
  movss    xmm7, DWORD [SSE_LN_F2]
1310
  movss    xmm6, xmm3
1311
  mulss    xmm4, xmm5 // xmm4 = X2 * (0.024982445 * X - 0.24371102)
1312
  subss    xmm6, xmm7
1313
  movss    xmm2, DWORD [SSE_LN_F1]
1314
  addss    xmm4, xmm6 // xmm4 = (X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102)
1315
  mulss    xmm3, xmm2
1316
  mulss    xmm4, xmm5 // xmm4 = X2 * ((X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102))
1317
  addss    xmm3, xmm1 // xmm3 = (3.3977745 * X + AddCst)
1318
  addss    xmm4, xmm0
1319
  addss    xmm3, xmm4
1320

1321
  movss    [Result], xmm3
1322
end;
1323

1324
function FastLn(const A: TVector2): TVector2; assembler;
1325
asm
1326
  movlps   xmm0, [A]
1327
  xorps    xmm2, xmm2
1328
  movaps   xmm1, xmm0
1329
  movlps   xmm3, QWORD [SSE_LN_CST]
1330
  movlps   xmm4, QWORD [SSE_NEG_INFINITY]
1331

1332
  // Exp := Val.I shr 23
1333
  psrld    xmm0, 23
1334
  movaps   xmm5, xmm1
1335
  cvtdq2ps xmm0, xmm0 // xmm0=Exp
1336

1337
  // if (A > 0) then AddCst := -89.93423858 else AddCst := NegInfinity
1338
  cmpnleps xmm1, xmm2 // (A > 0)? Yes: $FFFFFFFF, No: $00000000
1339
  movlps   xmm2, QWORD [SSE_MASK_FRACTION]
1340
  andps    xmm3, xmm1 // (A > 0)? Yes: -89.93423858, No: 0
1341
  andnps   xmm1, xmm4 // (A > 0)? Yes: 0, No: NegInfinity
1342
  movlps   xmm4, QWORD [SSE_EXP_I1]
1343
  orps     xmm1, xmm3 // (A > 0)? Yes: -89.93423858, No: NegInfinity
1344

1345
  // Val.I := (Val.I and $007FFFFF) or $3F800000
1346
  pand     xmm5, xmm2
1347
  movlps   xmm2, QWORD [SSE_LN_F5]
1348
  por      xmm5, xmm4
1349
  movlps   xmm6, QWORD [SSE_LN_F3]
1350
  movaps   xmm3, xmm5 // xmm3=X
1351
  mulps    xmm5, xmm5 // xmm5=X2
1352

1353
  movaps   xmm4, xmm3
1354
  movlps   xmm7, QWORD [SSE_LN_F4]
1355
  mulps    xmm4, xmm6
1356
  mulps    xmm0, xmm2 // xmm0 = Exp * 0.69314718055995
1357
  subps    xmm4, xmm7
1358
  movlps   xmm7, QWORD [SSE_LN_F2]
1359
  movaps   xmm6, xmm3
1360
  mulps    xmm4, xmm5 // xmm4 = X2 * (0.024982445 * X - 0.24371102)
1361
  subps    xmm6, xmm7
1362
  movlps   xmm2, QWORD [SSE_LN_F1]
1363
  addps    xmm4, xmm6 // xmm4 = (X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102)
1364
  mulps    xmm3, xmm2
1365
  mulps    xmm4, xmm5 // xmm4 = X2 * ((X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102))
1366
  addps    xmm3, xmm1 // xmm3 = (3.3977745 * X + AddCst)
1367
  addps    xmm4, xmm0
1368
  addps    xmm3, xmm4
1369

1370
  movlps   [Result], xmm3
1371
end;
1372

1373
function FastLn(const A: TVector3): TVector3; assembler;
1374
asm
1375
  movq     xmm0, [A]
1376
  movss    xmm1, [A+8]
1377
  movlhps  xmm0, xmm1
1378
  xorps    xmm2, xmm2
1379
  movaps   xmm1, xmm0
1380
  movups   xmm3, [SSE_LN_CST]
1381
  movups   xmm4, [SSE_NEG_INFINITY]
1382

1383
  // Exp := Val.I shr 23
1384
  psrld    xmm0, 23
1385
  movaps   xmm5, xmm1
1386
  cvtdq2ps xmm0, xmm0 // xmm0=Exp
1387

1388
  // if (A > 0) then AddCst := -89.93423858 else AddCst := NegInfinity
1389
  cmpnleps xmm1, xmm2 // (A > 0)? Yes: $FFFFFFFF, No: $00000000
1390
  movups   xmm2, [SSE_MASK_FRACTION]
1391
  andps    xmm3, xmm1 // (A > 0)? Yes: -89.93423858, No: 0
1392
  andnps   xmm1, xmm4 // (A > 0)? Yes: 0, No: NegInfinity
1393
  movups   xmm4, [SSE_EXP_I1]
1394
  orps     xmm1, xmm3 // (A > 0)? Yes: -89.93423858, No: NegInfinity
1395

1396
  // Val.I := (Val.I and $007FFFFF) or $3F800000
1397
  pand     xmm5, xmm2
1398
  movups   xmm2, [SSE_LN_F5]
1399
  por      xmm5, xmm4
1400
  movups   xmm6, [SSE_LN_F3]
1401
  movaps   xmm3, xmm5 // xmm3=X
1402
  mulps    xmm5, xmm5 // xmm5=X2
1403

1404
  movaps   xmm4, xmm3
1405
  movups   xmm7, [SSE_LN_F4]
1406
  mulps    xmm4, xmm6
1407
  mulps    xmm0, xmm2 // xmm0 = Exp * 0.69314718055995
1408
  subps    xmm4, xmm7
1409
  movups   xmm7, [SSE_LN_F2]
1410
  movaps   xmm6, xmm3
1411
  mulps    xmm4, xmm5 // xmm4 = X2 * (0.024982445 * X - 0.24371102)
1412
  subps    xmm6, xmm7
1413
  movups   xmm2, [SSE_LN_F1]
1414
  addps    xmm4, xmm6 // xmm4 = (X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102)
1415
  mulps    xmm3, xmm2
1416
  mulps    xmm4, xmm5 // xmm4 = X2 * ((X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102))
1417
  addps    xmm3, xmm1 // xmm3 = (3.3977745 * X + AddCst)
1418
  addps    xmm4, xmm0
1419
  addps    xmm3, xmm4
1420

1421
  movhlps  xmm4, xmm3
1422
  movq     [Result], xmm3
1423
  movss    [Result+8], xmm4
1424
end;
1425

1426
function FastLn(const A: TVector4): TVector4; assembler;
1427
asm
1428
  movups   xmm0, [A]
1429
  xorps    xmm2, xmm2
1430
  movaps   xmm1, xmm0
1431
  movups   xmm3, [SSE_LN_CST]
1432
  movups   xmm4, [SSE_NEG_INFINITY]
1433

1434
  // Exp := Val.I shr 23
1435
  psrld    xmm0, 23
1436
  movaps   xmm5, xmm1
1437
  cvtdq2ps xmm0, xmm0 // xmm0=Exp
1438

1439
  // if (A > 0) then AddCst := -89.93423858 else AddCst := NegInfinity
1440
  cmpnleps xmm1, xmm2 // (A > 0)? Yes: $FFFFFFFF, No: $00000000
1441
  movups   xmm2, [SSE_MASK_FRACTION]
1442
  andps    xmm3, xmm1 // (A > 0)? Yes: -89.93423858, No: 0
1443
  andnps   xmm1, xmm4 // (A > 0)? Yes: 0, No: NegInfinity
1444
  movups   xmm4, [SSE_EXP_I1]
1445
  orps     xmm1, xmm3 // (A > 0)? Yes: -89.93423858, No: NegInfinity
1446

1447
  // Val.I := (Val.I and $007FFFFF) or $3F800000
1448
  pand     xmm5, xmm2
1449
  movups   xmm2, [SSE_LN_F5]
1450
  por      xmm5, xmm4
1451
  movups   xmm6, [SSE_LN_F3]
1452
  movaps   xmm3, xmm5 // xmm3=X
1453
  mulps    xmm5, xmm5 // xmm5=X2
1454

1455
  movaps   xmm4, xmm3
1456
  movups   xmm7, [SSE_LN_F4]
1457
  mulps    xmm4, xmm6
1458
  mulps    xmm0, xmm2 // xmm0 = Exp * 0.69314718055995
1459
  subps    xmm4, xmm7
1460
  movups   xmm7, [SSE_LN_F2]
1461
  movaps   xmm6, xmm3
1462
  mulps    xmm4, xmm5 // xmm4 = X2 * (0.024982445 * X - 0.24371102)
1463
  subps    xmm6, xmm7
1464
  movups   xmm2, [SSE_LN_F1]
1465
  addps    xmm4, xmm6 // xmm4 = (X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102)
1466
  mulps    xmm3, xmm2
1467
  mulps    xmm4, xmm5 // xmm4 = X2 * ((X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102))
1468
  addps    xmm3, xmm1 // xmm3 = (3.3977745 * X + AddCst)
1469
  addps    xmm4, xmm0
1470
  addps    xmm3, xmm4
1471

1472
  movups   [Result], xmm3
1473
end;
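
{ Reference sketch (not part of the original source): the scalar computation
  that the FastLn overloads above vectorize, reconstructed from the asm
  comments. NegInfinity is the same value the SSE path loads via
  SSE_NEG_INFINITY. The name FastLnScalarSketch is illustrative only. }
{
function FastLnScalarSketch(const A: Single): Single;
var
  Val: record
    case Byte of
      0: (I: Cardinal);
      1: (S: Single);
  end;
  Expo, AddCst, X, X2: Single;
begin
  Val.S := A;

  // Exp := Val.I shr 23 (converted to floating point, as cvtdq2ps does)
  Expo := Val.I shr 23;

  // if (A > 0) then AddCst := -89.93423858 else AddCst := NegInfinity
  if (A > 0) then
    AddCst := -89.93423858
  else
    AddCst := NegInfinity;

  // Val.I := (Val.I and $007FFFFF) or $3F800000
  Val.I := (Val.I and $007FFFFF) or $3F800000;
  X := Val.S;
  X2 := X * X;

  Result := (3.3977745 * X + AddCst)
          + X2 * ((X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102))
          + Expo * 0.69314718055995;
end;
}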
1474

1475
function FastLog2(const A: Single): Single; assembler;
1476
asm
1477
  movss    xmm0, [A]
1478
  movss    xmm2, DWORD [SSE_MASK_FRACTION]
1479
  movss    xmm1, xmm0
1480

1481
  // MX.I := (VX.I and $007FFFFF) or $3F000000
1482
  movss    xmm3, DWORD [SSE_LOG2_I1]
1483
  pand     xmm0, xmm2
1484
  cvtdq2ps xmm1, xmm1
1485
  movss    xmm4, DWORD [SSE_LOG2_F1]
1486
  por      xmm0, xmm3
1487

1488
  movss    xmm2, DWORD [SSE_LOG2_F2]
1489
  mulss    xmm1, xmm4 // VX.I * 1.1920928955078125e-7
1490
  movss    xmm3, DWORD [SSE_LOG2_F3]
1491
  subss    xmm1, xmm2 // Result - 124.22551499
1492
  mulss    xmm3, xmm0
1493
  movss    xmm4, DWORD [SSE_LOG2_F5]
1494
  subss    xmm1, xmm3 // Result - 124.22551499 - 1.498030302 * MX.S
1495
  movss    xmm2, DWORD [SSE_LOG2_F4]
1496
  addss    xmm0, xmm4
1497
  divss    xmm2, xmm0
1498
  subss    xmm1, xmm2 // Result - 124.22551499 - 1.498030302 * MX.S - 1.72587999 / (0.3520887068 + MX.S)
1499

1500
  movss    [Result], xmm1
1501
end;
1502

1503
function FastLog2(const A: TVector2): TVector2; assembler;
1504
asm
1505
  movlps   xmm0, [A]
1506
  movlps   xmm2, QWORD [SSE_MASK_FRACTION]
1507
  movaps   xmm1, xmm0
1508

1509
  // MX.I := (VX.I and $007FFFFF) or $3F000000
1510
  movlps   xmm3, QWORD [SSE_LOG2_I1]
1511
  pand     xmm0, xmm2
1512
  cvtdq2ps xmm1, xmm1
1513
  movlps   xmm4, QWORD [SSE_LOG2_F1]
1514
  por      xmm0, xmm3
1515

1516
  movlps   xmm2, QWORD [SSE_LOG2_F2]
1517
  mulps    xmm1, xmm4 // VX.I * 1.1920928955078125e-7
1518
  movlps   xmm3, QWORD [SSE_LOG2_F3]
1519
  subps    xmm1, xmm2 // Result - 124.22551499
1520
  mulps    xmm3, xmm0
1521
  movlps   xmm4, QWORD [SSE_LOG2_F5]
1522
  subps    xmm1, xmm3 // Result - 124.22551499 - 1.498030302 * MX.S
1523
  movlps   xmm2, QWORD [SSE_LOG2_F4]
1524
  addps    xmm0, xmm4
1525
  divps    xmm2, xmm0
1526
  subps    xmm1, xmm2 // Result - 124.22551499 - 1.498030302 * MX.S - 1.72587999 / (0.3520887068 + MX.S)
1527

1528
  movlps   [Result], xmm1
1529
end;
1530

1531
function FastLog2(const A: TVector3): TVector3; assembler;
1532
asm
1533
  movq     xmm0, [A]
1534
  movss    xmm1, [A+8]
1535
  movlhps  xmm0, xmm1
1536
  movups   xmm2, [SSE_MASK_FRACTION]
1537
  movaps   xmm1, xmm0
1538

1539
  // MX.I := (VX.I and $007FFFFF) or $3F000000
1540
  movups   xmm3, [SSE_LOG2_I1]
1541
  pand     xmm0, xmm2
1542
  cvtdq2ps xmm1, xmm1
1543
  movups   xmm4, [SSE_LOG2_F1]
1544
  por      xmm0, xmm3
1545

1546
  movups   xmm2, [SSE_LOG2_F2]
1547
  mulps    xmm1, xmm4 // VX.I * 1.1920928955078125e-7
1548
  movups   xmm3, [SSE_LOG2_F3]
1549
  subps    xmm1, xmm2 // Result - 124.22551499
1550
  mulps    xmm3, xmm0
1551
  movups   xmm4, [SSE_LOG2_F5]
1552
  subps    xmm1, xmm3 // Result - 124.22551499 - 1.498030302 * MX.S
1553
  movups   xmm2, [SSE_LOG2_F4]
1554
  addps    xmm0, xmm4
1555
  divps    xmm2, xmm0
1556
  subps    xmm1, xmm2 // Result - 124.22551499 - 1.498030302 * MX.S - 1.72587999 / (0.3520887068 + MX.S)
1557

1558
  movhlps  xmm0, xmm1
1559
  movq     [Result], xmm1
1560
  movss    [Result+8], xmm0
1561
end;
1562

1563
function FastLog2(const A: TVector4): TVector4; assembler;
1564
asm
1565
  movups   xmm0, [A]
1566
  movups   xmm2, [SSE_MASK_FRACTION]
1567
  movaps   xmm1, xmm0
1568

1569
  // MX.I := (VX.I and $007FFFFF) or $3F000000
1570
  movups   xmm3, [SSE_LOG2_I1]
1571
  pand     xmm0, xmm2
1572
  cvtdq2ps xmm1, xmm1
1573
  movups   xmm4, [SSE_LOG2_F1]
1574
  por      xmm0, xmm3
1575

1576
  movups   xmm2, [SSE_LOG2_F2]
1577
  mulps    xmm1, xmm4 // VX.I * 1.1920928955078125e-7
1578
  movups   xmm3, [SSE_LOG2_F3]
1579
  subps    xmm1, xmm2 // Result - 124.22551499
1580
  mulps    xmm3, xmm0
1581
  movups   xmm4, [SSE_LOG2_F5]
1582
  subps    xmm1, xmm3 // Result - 124.22551499 - 1.498030302 * MX.S
1583
  movups   xmm2, [SSE_LOG2_F4]
1584
  addps    xmm0, xmm4
1585
  divps    xmm2, xmm0
1586
  subps    xmm1, xmm2 // Result - 124.22551499 - 1.498030302 * MX.S - 1.72587999 / (0.3520887068 + MX.S)
1587

1588
  movups   [Result], xmm1
1589
end;
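
{ Reference sketch (not part of the original source): the per-component
  computation behind the FastLog2 overloads above, taken from the asm
  comments; VX holds the raw bits of A and MX is its mantissa remapped onto
  the exponent of 0.5 ($3F000000). The name FastLog2ScalarSketch is
  illustrative only. }
{
function FastLog2ScalarSketch(const A: Single): Single;
var
  VX, MX: record
    case Byte of
      0: (I: Cardinal);
      1: (S: Single);
  end;
begin
  VX.S := A;

  // MX.I := (VX.I and $007FFFFF) or $3F000000
  MX.I := (VX.I and $007FFFFF) or $3F000000;

  Result := VX.I * 1.1920928955078125e-7
          - 124.22551499
          - 1.498030302 * MX.S
          - 1.72587999 / (0.3520887068 + MX.S);
end;
}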
1590

1591
function FastExp2(const A: Single): Single; assembler;
1592
var
1593
  OldFlags, NewFlags: UInt32;
1594
asm
1595
  // Set rounding mode to Round Down (toward negative infinity)
1596
  stmxcsr  [OldFlags]
1597
  movss    xmm0, [A]
1598
  mov      ecx, [OldFlags]
1599
  xorps    xmm1, xmm1
1600
  and      ecx, SSE_ROUND_MASK
1601
  movss    xmm3, xmm0
1602
  or       ecx, SSE_ROUND_DOWN
1603
  movss    xmm5, xmm0
1604
  mov      [NewFlags], ecx
1605

1606
  movss    xmm1, DWORD [SSE_EXP2_F1]
1607
  ldmxcsr  [NewFlags]
1608

1609
  // Z := A - RoundDown(A)
1610
  cvtps2dq xmm3, xmm3
1611
  addss    xmm1, xmm5 // A + 121.2740575
1612
  cvtdq2ps xmm3, xmm3
1613
  movss    xmm2, DWORD [SSE_EXP2_F2]
1614
  subss    xmm0, xmm3
1615

1616
  movss    xmm3, DWORD [SSE_EXP2_F3]
1617
  movss    xmm4, DWORD [SSE_EXP2_F4]
1618
  subss    xmm3, xmm0 // (4.84252568 - Z)
1619
  mulss    xmm0, xmm4 // 1.49012907 * Z
1620
  divss    xmm2, xmm3
1621
  movss    xmm5, DWORD [SSE_EXP2_F5]
1622
  addss    xmm1, xmm2 // A + 121.2740575 + 27.7280233 / (4.84252568 - Z)
1623
  subss    xmm1, xmm0 // A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z
1624
  mulss    xmm1, xmm5 // (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
1625
  cvtps2dq xmm1, xmm1
1626

1627
  // Restore rounding mode
1628
  ldmxcsr  [OldFlags]
1629

1630
  movss    [Result], xmm1
1631
end;
1632

1633
function FastExp2(const A: TVector2): TVector2; assembler;
1634
var
1635
  OldFlags, NewFlags: UInt32;
1636
asm
1637
  // Set rounding mode to Round Down (toward negative infinity)
1638
  stmxcsr  [OldFlags]
1639
  movlps   xmm0, [A]
1640
  mov      ecx, [OldFlags]
1641
  xorps    xmm1, xmm1
1642
  and      ecx, SSE_ROUND_MASK
1643
  movaps   xmm3, xmm0
1644
  or       ecx, SSE_ROUND_DOWN
1645
  movaps   xmm5, xmm0
1646
  mov      [NewFlags], ecx
1647

1648
  movlps   xmm1, QWORD [SSE_EXP2_F1]
1649
  ldmxcsr  [NewFlags]
1650

1651
  // Z := A - RoundDown(A)
1652
  cvtps2dq xmm3, xmm3
1653
  addps    xmm1, xmm5 // A + 121.2740575
1654
  cvtdq2ps xmm3, xmm3
1655
  movlps   xmm2, QWORD [SSE_EXP2_F2]
1656
  subps    xmm0, xmm3
1657

1658
  movlps   xmm3, QWORD [SSE_EXP2_F3]
1659
  movlps   xmm4, QWORD [SSE_EXP2_F4]
1660
  subps    xmm3, xmm0 // (4.84252568 - Z)
1661
  mulps    xmm0, xmm4 // 1.49012907 * Z
1662
  divps    xmm2, xmm3
1663
  movlps   xmm5, QWORD [SSE_EXP2_F5]
1664
  addps    xmm1, xmm2 // A + 121.2740575 + 27.7280233 / (4.84252568 - Z)
1665
  subps    xmm1, xmm0 // A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z
1666
  mulps    xmm1, xmm5 // (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
1667
  cvtps2dq xmm1, xmm1
1668

1669
  // Restore rounding mode
1670
  ldmxcsr  [OldFlags]
1671

1672
  movlps   [Result], xmm1
1673
end;
1674

1675
function FastExp2(const A: TVector3): TVector3; assembler;
1676
var
1677
  OldFlags, NewFlags: UInt32;
1678
asm
1679
  // Set rounding mode to Round Down (toward negative infinity)
1680
  stmxcsr  [OldFlags]
1681
  movq     xmm0, [A]
1682
  movss    xmm1, [A+8]
1683
  movlhps  xmm0, xmm1
1684
  mov      ecx, [OldFlags]
1685
  xorps    xmm1, xmm1
1686
  and      ecx, SSE_ROUND_MASK
1687
  movaps   xmm3, xmm0
1688
  or       ecx, SSE_ROUND_DOWN
1689
  movaps   xmm5, xmm0
1690
  mov      [NewFlags], ecx
1691

1692
  movups   xmm1, [SSE_EXP2_F1]
1693
  ldmxcsr  [NewFlags]
1694

1695
  // Z := A - RoundDown(A)
1696
  cvtps2dq xmm3, xmm3
1697
  addps    xmm1, xmm5 // A + 121.2740575
1698
  cvtdq2ps xmm3, xmm3
1699
  movups   xmm2, [SSE_EXP2_F2]
1700
  subps    xmm0, xmm3
1701

1702
  movups   xmm3, [SSE_EXP2_F3]
1703
  movups   xmm4, [SSE_EXP2_F4]
1704
  subps    xmm3, xmm0 // (4.84252568 - Z)
1705
  mulps    xmm0, xmm4 // 1.49012907 * Z
1706
  divps    xmm2, xmm3
1707
  movups   xmm5, [SSE_EXP2_F5]
1708
  addps    xmm1, xmm2 // A + 121.2740575 + 27.7280233 / (4.84252568 - Z)
1709
  subps    xmm1, xmm0 // A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z
1710
  mulps    xmm1, xmm5 // (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
1711
  cvtps2dq xmm1, xmm1
1712

1713
  // Restore rounding mode
1714
  ldmxcsr  [OldFlags]
1715

1716
  movhlps  xmm0, xmm1
1717
  movq     [Result], xmm1
1718
  movss    [Result+8], xmm0
1719
end;
1720

1721
function FastExp2(const A: TVector4): TVector4; assembler;
1722
var
1723
  OldFlags, NewFlags: UInt32;
1724
asm
1725
  // Set rounding mode to Round Down (toward negative infinity)
1726
  stmxcsr  [OldFlags]
1727
  movups   xmm0, [A]
1728
  mov      ecx, [OldFlags]
1729
  xorps    xmm1, xmm1
1730
  and      ecx, SSE_ROUND_MASK
1731
  movaps   xmm3, xmm0
1732
  or       ecx, SSE_ROUND_DOWN
1733
  movaps   xmm5, xmm0
1734
  mov      [NewFlags], ecx
1735

1736
  movups   xmm1, [SSE_EXP2_F1]
1737
  ldmxcsr  [NewFlags]
1738

1739
  // Z := A - RoundDown(A)
1740
  cvtps2dq xmm3, xmm3
1741
  addps    xmm1, xmm5 // A + 121.2740575
1742
  cvtdq2ps xmm3, xmm3
1743
  movups   xmm2, [SSE_EXP2_F2]
1744
  subps    xmm0, xmm3
1745

1746
  movups   xmm3, [SSE_EXP2_F3]
1747
  movups   xmm4, [SSE_EXP2_F4]
1748
  subps    xmm3, xmm0 // (4.84252568 - Z)
1749
  mulps    xmm0, xmm4 // 1.49012907 * Z
1750
  divps    xmm2, xmm3
1751
  movups   xmm5, [SSE_EXP2_F5]
1752
  addps    xmm1, xmm2 // A + 121.2740575 + 27.7280233 / (4.84252568 - Z)
1753
  subps    xmm1, xmm0 // A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z
1754
  mulps    xmm1, xmm5 // (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
1755
  cvtps2dq xmm1, xmm1
1756

1757
  // Restore rounding mode
1758
  ldmxcsr  [OldFlags]
1759

1760
  movups   [Result], xmm1
1761
end;
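
{ Reference sketch (not part of the original source): the scalar idea behind
  the FastExp2 overloads above. The MXCSR round-down mode used by the asm is
  replaced here by an explicit Floor, and the scaled sum (1 shl 23) * (...)
  is written straight into the bits of the result. Overflow handling for very
  large A is ignored. The name FastExp2ScalarSketch is illustrative only. }
{
function FastExp2ScalarSketch(const A: Single): Single;
var
  Val: record
    case Byte of
      0: (I: Cardinal);
      1: (S: Single);
  end;
  Z, T: Single;
begin
  // Z := A - RoundDown(A)
  Z := A - Floor(A);

  // (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
  T := (1 shl 23) *
    (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z);
  Val.I := Cardinal(Floor(T));
  Result := Val.S;
end;
}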
1762

1763
{ Common Functions }
1764

1765
function Abs(const A: Single): Single;
1766
begin
1767
  Result := System.Abs(A);
1768
end;
1769

1770
function Abs(const A: TVector2): TVector2;
1771
begin
1772
  Result.Init(System.Abs(A.X), System.Abs(A.Y));
1773
end;
1774

1775
function Abs(const A: TVector3): TVector3; assembler;
1776
asm
1777
  movq     xmm0, [A]
1778
  movss    xmm1, [A+8]
1779
  movups   xmm2, [SSE_MASK_ABS_VAL]
1780
  andps    xmm0, xmm2
1781
  pand     xmm1, xmm2
1782
  movq     [Result], xmm0
1783
  movss    [Result+8], xmm1
1784
end;
1785

1786
function Abs(const A: TVector4): TVector4; assembler;
1787
asm
1788
  movups   xmm0, [A]
1789
  movups   xmm1, [SSE_MASK_ABS_VAL]
1790
  andps    xmm0, xmm1
1791
  movups   [Result], xmm0
1792
end;
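
{ Note (sketch, not from the original source): the vector Abs overloads above
  clear the sign bit with the $7FFFFFFF mask in SSE_MASK_ABS_VAL. The same
  trick for a single component, with an illustrative name: }
{
function AbsMaskSketch(const A: Single): Single;
var
  V: record
    case Byte of
      0: (I: Cardinal);
      1: (S: Single);
  end;
begin
  V.S := A;
  V.I := V.I and $7FFFFFFF; // clear the IEEE-754 sign bit
  Result := V.S;
end;
}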
1793

1794
function Sign(const A: Single): Single; assembler;
1795
asm
1796
  movss    xmm0, [A]
1797
  movss    xmm1, DWORD [SSE_ONE]
1798
  movss    xmm2, xmm0
1799
  movss    xmm3, DWORD [SSE_MASK_SIGN]
1800

1801
  andps    xmm0, xmm3 // (A < 0)? Yes: $80000000, No: $00000000
1802
  xorps    xmm4, xmm4
1803
  orps     xmm0, xmm1 // (A < 0)? Yes: -1, No: 1
1804
  cmpneqss xmm2, xmm4 // (A = 0)? Yes: $00000000, No: $FFFFFFFF
1805
  andps    xmm0, xmm2 // (A = 0)? Yes: 0, No: -1 or 1
1806
  movss    [Result], xmm0
1807
end;
1808

1809
function Sign(const A: TVector2): TVector2; assembler;
1810
asm
1811
  movlps   xmm0, [A]
1812
  movlps   xmm1, QWORD [SSE_ONE]
1813
  movaps   xmm2, xmm0
1814
  movlps   xmm3, QWORD [SSE_MASK_SIGN]
1815

1816
  andps    xmm0, xmm3 // (A < 0)? Yes: $80000000, No: $00000000
1817
  xorps    xmm4, xmm4
1818
  orps     xmm0, xmm1 // (A < 0)? Yes: -1, No: 1
1819
  cmpneqps xmm2, xmm4 // (A = 0)? Yes: $00000000, No: $FFFFFFFF
1820
  andps    xmm0, xmm2 // (A = 0)? Yes: 0, No: -1 or 1
1821
  movlps   [Result], xmm0
1822
end;
1823

1824
function Sign(const A: TVector3): TVector3; assembler;
1825
asm
1826
  movq     xmm0, [A]
1827
  movss    xmm1, [A+8]
1828
  movlhps  xmm0, xmm1
1829
  movups   xmm1, [SSE_ONE]
1830
  movaps   xmm2, xmm0
1831
  movups   xmm3, [SSE_MASK_SIGN]
1832

1833
  andps    xmm0, xmm3 // (A < 0)? Yes: $80000000, No: $00000000
1834
  xorps    xmm4, xmm4
1835
  orps     xmm0, xmm1 // (A < 0)? Yes: -1, No: 1
1836
  cmpneqps xmm2, xmm4 // (A = 0)? Yes: $00000000, No: $FFFFFFFF
1837
  andps    xmm0, xmm2 // (A = 0)? Yes: 0, No: -1 or 1
1838
  movhlps  xmm1, xmm0
1839
  movq     [Result], xmm0
1840
  movss    [Result+8], xmm1
1841
end;
1842

1843
function Sign(const A: TVector4): TVector4; assembler;
1844
asm
1845
  movups   xmm0, [A]
1846
  movups   xmm1, [SSE_ONE]
1847
  movaps   xmm2, xmm0
1848
  movups   xmm3, [SSE_MASK_SIGN]
1849

1850
  andps    xmm0, xmm3 // (A < 0)? Yes: $80000000, No: $00000000
1851
  xorps    xmm4, xmm4
1852
  orps     xmm0, xmm1 // (A < 0)? Yes: -1, No: 1
1853
  cmpneqps xmm2, xmm4 // (A = 0)? Yes: $00000000, No: $FFFFFFFF
1854
  andps    xmm0, xmm2 // (A = 0)? Yes: 0, No: -1 or 1
1855
  movups   [Result], xmm0
1856
end;
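
{ Note (sketch, not from the original source): the Sign overloads above work
  branch-free by copying the sign bit of A onto 1.0 and then zeroing the
  result where A = 0. The equivalent scalar mapping, with an illustrative
  name: }
{
function SignSketch(const A: Single): Single;
begin
  if A = 0 then
    Result := 0
  else if A < 0 then
    Result := -1
  else
    Result := 1;
end;
}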
1857

1858
function Floor(const A: Single): Integer; assembler;
1859
var
1860
  OldFlags, NewFlags: UInt32;
1861
asm
1862
  // Set rounding mode to Round Down
1863
  stmxcsr  [OldFlags]
1864
  mov      ecx, [OldFlags]
1865
  and      ecx, SSE_ROUND_MASK
1866
  or       ecx, SSE_ROUND_DOWN
1867
  mov      [NewFlags], ecx
1868
  movss    xmm0, [A]
1869
  ldmxcsr  [NewFlags]
1870

1871
  cvtps2dq xmm0, xmm0
1872

1873
  // Restore rounding mode
1874
  ldmxcsr  [OldFlags]
1875

1876
  movd     eax, xmm0
1877
end;
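
{ Note (sketch, not from the original source): Floor, Trunc, Ceil, Frac and
  FMod in this file all follow the same pattern: save MXCSR, force the wanted
  rounding mode, let cvtps2dq convert in that mode, then restore MXCSR. For
  cross-checking the scalar Floor above, a plain-Pascal equivalent with an
  illustrative name: }
{
function FloorSketch(const A: Single): Integer;
begin
  Result := System.Trunc(A);
  if A < Result then
    Dec(Result); // truncation rounds toward zero, so step down for negative non-integers
end;
}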
1878

1879
function Floor(const A: TVector2): TIVector2; assembler;
1880
var
1881
  OldFlags, NewFlags: UInt32;
1882
asm
1883
  // Set rounding mode to Round Down
1884
  stmxcsr  [OldFlags]
1885
  mov      ecx, [OldFlags]
1886
  and      ecx, SSE_ROUND_MASK
1887
  or       ecx, SSE_ROUND_DOWN
1888
  mov      [NewFlags], ecx
1889
  movlps   xmm0, [A]
1890
  ldmxcsr  [NewFlags]
1891

1892
  cvtps2dq xmm0, xmm0
1893

1894
  // Restore rounding mode
1895
  ldmxcsr  [OldFlags]
1896

1897
  movlps   [Result], xmm0
1898
end;
1899

1900
function Floor(const A: TVector3): TIVector3; assembler;
1901
var
1902
  OldFlags, NewFlags: UInt32;
1903
asm
1904
  // Set rounding mode to Round Down
1905
  stmxcsr  [OldFlags]
1906
  mov      ecx, [OldFlags]
1907
  and      ecx, SSE_ROUND_MASK
1908
  or       ecx, SSE_ROUND_DOWN
1909
  mov      [NewFlags], ecx
1910
  movq     xmm0, [A]
1911
  movss    xmm1, [A+8]
1912
  movlhps  xmm0, xmm1
1913
  ldmxcsr  [NewFlags]
1914

1915
  cvtps2dq xmm0, xmm0
1916

1917
  // Restore rounding mode
1918
  ldmxcsr  [OldFlags]
1919

1920
  movhlps  xmm1, xmm0
1921
  movq     [Result], xmm0
1922
  movss    [Result+8], xmm1
1923
end;
1924

1925
function Floor(const A: TVector4): TIVector4; assembler;
1926
var
1927
  OldFlags, NewFlags: UInt32;
1928
asm
1929
  // Set rounding mode to Round Down
1930
  stmxcsr  [OldFlags]
1931
  mov      ecx, [OldFlags]
1932
  and      ecx, SSE_ROUND_MASK
1933
  or       ecx, SSE_ROUND_DOWN
1934
  mov      [NewFlags], ecx
1935
  movups   xmm0, [A]
1936
  ldmxcsr  [NewFlags]
1937

1938
  cvtps2dq xmm0, xmm0
1939

1940
  // Restore rounding mode
1941
  ldmxcsr  [OldFlags]
1942

1943
  movups   [Result], xmm0
1944
end;
1945

1946
function Trunc(const A: Single): Integer;
1947
begin
1948
  Result := System.Trunc(A);
1949
end;
1950
{function Trunc(const A: Single): Integer; assembler;
1951
var
1952
  OldFlags, NewFlags: UInt32;
1953
asm
1954
  // Set rounding mode to Truncate
1955
  stmxcsr  [OldFlags]
1956
  mov      ecx, [OldFlags]
1957
  and      ecx, SSE_ROUND_MASK
1958
  or       ecx, SSE_ROUND_TRUNC
1959
  mov      [NewFlags], ecx
1960
  movss    xmm0, [A]
1961
  ldmxcsr  [NewFlags]
1962

1963
  cvtps2dq xmm0, xmm0
1964

1965
  // Restore rounding mode
1966
  ldmxcsr  [OldFlags]
1967

1968
  movd     eax, xmm0
1969
end;}
1970

1971
function Trunc(const A: TVector2): TIVector2; assembler;
1972
var
1973
  OldFlags, NewFlags: UInt32;
1974
asm
1975
  // Set rounding mode to Truncate
1976
  stmxcsr  [OldFlags]
1977
  mov      ecx, [OldFlags]
1978
  and      ecx, SSE_ROUND_MASK
1979
  or       ecx, SSE_ROUND_TRUNC
1980
  mov      [NewFlags], ecx
1981
  movlps   xmm0, [A]
1982
  ldmxcsr  [NewFlags]
1983

1984
  cvtps2dq xmm0, xmm0
1985

1986
  // Restore rounding mode
1987
  ldmxcsr  [OldFlags]
1988

1989
  movlps   [Result], xmm0
1990
end;
1991

1992
function Trunc(const A: TVector3): TIVector3; assembler;
1993
var
1994
  OldFlags, NewFlags: UInt32;
1995
asm
1996
  // Set rounding mode to Truncate
1997
  stmxcsr  [OldFlags]
1998
  mov      ecx, [OldFlags]
1999
  and      ecx, SSE_ROUND_MASK
2000
  or       ecx, SSE_ROUND_TRUNC
2001
  mov      [NewFlags], ecx
2002
  movq     xmm0, [A]
2003
  movss    xmm1, [A+8]
2004
  movlhps  xmm0, xmm1
2005
  ldmxcsr  [NewFlags]
2006

2007
  cvtps2dq xmm0, xmm0
2008

2009
  // Restore rounding mode
2010
  ldmxcsr  [OldFlags]
2011

2012
  movhlps  xmm1, xmm0
2013
  movq     [Result], xmm0
2014
  movss    [Result+8], xmm1
2015
end;
2016

2017
function Trunc(const A: TVector4): TIVector4; assembler;
2018
var
2019
  OldFlags, NewFlags: UInt32;
2020
asm
2021
  // Set rounding mode to Truncate
2022
  stmxcsr  [OldFlags]
2023
  mov      ecx, [OldFlags]
2024
  and      ecx, SSE_ROUND_MASK
2025
  or       ecx, SSE_ROUND_TRUNC
2026
  mov      [NewFlags], ecx
2027
  movups   xmm0, [A]
2028
  ldmxcsr  [NewFlags]
2029

2030
  cvtps2dq xmm0, xmm0
2031

2032
  // Restore rounding mode
2033
  ldmxcsr  [OldFlags]
2034

2035
  movups   [Result], xmm0
2036
end;
2037

2038
function Round(const A: Single): Integer;
2039
begin
2040
  Result := System.Round(A);
2041
end;
2042

2043
function Round(const A: TVector2): TIVector2; assembler;
2044
asm
2045
  // Rounding mode defaults to round-to-nearest
2046
  movlps   xmm0, [A]
2047
  cvtps2dq xmm0, xmm0
2048
  movlps   [Result], xmm0
2049
end;
2050

2051
function Round(const A: TVector3): TIVector3; assembler;
2052
asm
2053
  // Rounding mode defaults to round-to-nearest
2054
  movq     xmm0, [A]
2055
  movss    xmm1, [A+8]
2056
  movlhps  xmm0, xmm1
2057
  cvtps2dq xmm0, xmm0
2058
  movhlps  xmm1, xmm0
2059
  movq     [Result], xmm0
2060
  movss    [Result+8], xmm1
2061
end;
2062

2063
function Round(const A: TVector4): TIVector4; assembler;
2064
asm
2065
  // Rounding mode defaults to round-to-nearest
2066
  movups   xmm0, [A]
2067
  cvtps2dq xmm0, xmm0
2068
  movups   [Result], xmm0
2069
end;
2070

2071
function Ceil(const A: Single): Integer; assembler;
2072
var
2073
  OldFlags, NewFlags: UInt32;
2074
asm
2075
  // Set rounding mode to Ceil
2076
  stmxcsr  [OldFlags]
2077
  mov      ecx, [OldFlags]
2078
  and      ecx, SSE_ROUND_MASK
2079
  or       ecx, SSE_ROUND_UP
2080
  mov      [NewFlags], ecx
2081
  movss    xmm0, [A]
2082
  ldmxcsr  [NewFlags]
2083

2084
  cvtps2dq xmm0, xmm0
2085

2086
  // Restore rounding mode
2087
  ldmxcsr  [OldFlags]
2088

2089
  movd     eax, xmm0
2090
end;
2091

2092
function Ceil(const A: TVector2): TIVector2; assembler;
2093
var
2094
  OldFlags, NewFlags: UInt32;
2095
asm
2096
  // Set rounding mode to Ceil
2097
  stmxcsr  [OldFlags]
2098
  mov      ecx, [OldFlags]
2099
  and      ecx, SSE_ROUND_MASK
2100
  or       ecx, SSE_ROUND_UP
2101
  mov      [NewFlags], ecx
2102
  movlps   xmm0, [A]
2103
  ldmxcsr  [NewFlags]
2104

2105
  cvtps2dq xmm0, xmm0
2106

2107
  // Restore rounding mode
2108
  ldmxcsr  [OldFlags]
2109

2110
  movlps   [Result], xmm0
2111
end;
2112

2113
function Ceil(const A: TVector3): TIVector3; assembler;
2114
var
2115
  OldFlags, NewFlags: UInt32;
2116
asm
2117
  // Set rounding mode to Ceil
2118
  stmxcsr  [OldFlags]
2119
  mov      ecx, [OldFlags]
2120
  and      ecx, SSE_ROUND_MASK
2121
  or       ecx, SSE_ROUND_UP
2122
  mov      [NewFlags], ecx
2123
  movq     xmm0, [A]
2124
  movss    xmm1, [A+8]
2125
  movlhps  xmm0, xmm1
2126
  ldmxcsr  [NewFlags]
2127

2128
  cvtps2dq xmm0, xmm0
2129

2130
  // Restore rounding mode
2131
  ldmxcsr  [OldFlags]
2132

2133
  movhlps  xmm1, xmm0
2134
  movq     [Result], xmm0
2135
  movss    [Result+8], xmm1
2136
end;
2137

2138
function Ceil(const A: TVector4): TIVector4; assembler;
2139
var
2140
  OldFlags, NewFlags: UInt32;
2141
asm
2142
  // Set rounding mode to Ceil
2143
  stmxcsr  [OldFlags]
2144
  mov      ecx, [OldFlags]
2145
  and      ecx, SSE_ROUND_MASK
2146
  or       ecx, SSE_ROUND_UP
2147
  mov      [NewFlags], ecx
2148
  movups   xmm0, [A]
2149
  ldmxcsr  [NewFlags]
2150

2151
  cvtps2dq xmm0, xmm0
2152

2153
  // Restore rounding mode
2154
  ldmxcsr  [OldFlags]
2155

2156
  movups   [Result], xmm0
2157
end;
2158

2159
function Frac(const A: Single): Single; assembler;
2160
var
2161
  OldFlags, NewFlags: UInt32;
2162
asm
2163
  // Set rounding mode to Truncate
2164
  stmxcsr  [OldFlags]
2165
  mov      ecx, [OldFlags]
2166
  and      ecx, SSE_ROUND_MASK
2167
  or       ecx, SSE_ROUND_TRUNC
2168
  movss    xmm0, [A]
2169
  mov      [NewFlags], ecx
2170
  movss    xmm1, xmm0
2171
  ldmxcsr  [NewFlags]
2172

2173
  cvtps2dq xmm0, xmm0
2174
  ldmxcsr  [OldFlags]
2175
  cvtdq2ps xmm0, xmm0
2176
  subss    xmm1, xmm0 // A - Trunc(A)
2177

2178
  movss    [Result], xmm1
2179
end;
2180

2181
function Frac(const A: TVector2): TVector2; assembler;
2182
var
2183
  OldFlags, NewFlags: UInt32;
2184
asm
2185
  // Set rounding mode to Truncate
2186
  stmxcsr  [OldFlags]
2187
  mov      ecx, [OldFlags]
2188
  and      ecx, SSE_ROUND_MASK
2189
  or       ecx, SSE_ROUND_TRUNC
2190
  movlps   xmm0, [A]
2191
  mov      [NewFlags], ecx
2192
  movaps   xmm1, xmm0
2193
  ldmxcsr  [NewFlags]
2194

2195
  cvtps2dq xmm0, xmm0
2196
  ldmxcsr  [OldFlags]
2197
  cvtdq2ps xmm0, xmm0
2198
  subps    xmm1, xmm0 // A - Trunc(A)
2199

2200
  movlps   [Result], xmm1
2201
end;
2202

2203
function Frac(const A: TVector3): TVector3; assembler;
2204
var
2205
  OldFlags, NewFlags: UInt32;
2206
asm
2207
  // Set rounding mode to Truncate
2208
  stmxcsr  [OldFlags]
2209
  mov      ecx, [OldFlags]
2210
  and      ecx, SSE_ROUND_MASK
2211
  or       ecx, SSE_ROUND_TRUNC
2212
  movq     xmm0, [A]
2213
  movss    xmm1, [A+8]
2214
  movlhps  xmm0, xmm1
2215
  mov      [NewFlags], ecx
2216
  movaps   xmm1, xmm0
2217
  ldmxcsr  [NewFlags]
2218

2219
  cvtps2dq xmm0, xmm0
2220
  ldmxcsr  [OldFlags]
2221
  cvtdq2ps xmm0, xmm0
2222
  subps    xmm1, xmm0 // A - Trunc(A)
2223

2224
  movhlps  xmm0, xmm1
2225
  movq     [Result], xmm1
2226
  movss    [Result+8], xmm0
2227
end;
2228

2229
function Frac(const A: TVector4): TVector4; assembler;
2230
var
2231
  OldFlags, NewFlags: UInt32;
2232
asm
2233
  // Set rounding mode to Truncate
2234
  stmxcsr  [OldFlags]
2235
  mov      ecx, [OldFlags]
2236
  and      ecx, SSE_ROUND_MASK
2237
  or       ecx, SSE_ROUND_TRUNC
2238
  movups   xmm0, [A]
2239
  mov      [NewFlags], ecx
2240
  movaps   xmm1, xmm0
2241
  ldmxcsr  [NewFlags]
2242

2243
  cvtps2dq xmm0, xmm0
2244
  ldmxcsr  [OldFlags]
2245
  cvtdq2ps xmm0, xmm0
2246
  subps    xmm1, xmm0 // A - Trunc(A)
2247

2248
  movups   [Result], xmm1
2249
end;
2250

2251
function FMod(const A, B: Single): Single;
2252
begin
2253
  Result := A - (B * Trunc(A / B));
2254
end;
2255
{function FMod(const A, B: Single): Single; assembler;
2256
var
2257
  OldFlags, NewFlags: UInt32;
2258
asm
2259
  // Set rounding mode to Truncate
2260
  movss    xmm0, [A]
2261
  stmxcsr  [OldFlags]
2262
  movss    xmm1, [B]
2263
  mov      edx, [OldFlags]
2264
  movss    xmm2, xmm0
2265
  and      edx, SSE_ROUND_MASK
2266
  movss    xmm3, xmm1
2267
  or       edx, SSE_ROUND_TRUNC
2268
  divss    xmm2, xmm3 // A / B
2269
  mov      [NewFlags], edx
2270
  ldmxcsr  [NewFlags]
2271

2272
  cvtps2dq xmm2, xmm2
2273
  cvtdq2ps xmm2, xmm2 // Trunc(A / B)
2274
  mulss    xmm2, xmm1
2275
  subss    xmm0, xmm2 // A - (B * Trunc(A / B))
2276

2277
  // Restore rounding mode
2278
  ldmxcsr  [OldFlags]
2279

2280
  movss    [Result], xmm0
2281
end;}
2282

2283
function FMod(const A: TVector2; const B: Single): TVector2; assembler;
2284
var
2285
  OldFlags, NewFlags: UInt32;
2286
asm
2287
  // Set rounding mode to Truncate
2288
  movlps   xmm0, [A]
2289
  stmxcsr  [OldFlags]
2290
  movss    xmm1, [B]
2291
  mov      ecx, [OldFlags]
2292
  shufps   xmm1, xmm1, $00 // Replicate B
2293
  and      ecx, SSE_ROUND_MASK
2294
  movaps   xmm2, xmm0
2295
  or       ecx, SSE_ROUND_TRUNC
2296
  movaps   xmm3, xmm1
2297
  mov      [NewFlags], ecx
2298
  divps    xmm2, xmm3 // A / B
2299
  ldmxcsr  [NewFlags]
2300

2301
  cvtps2dq xmm2, xmm2
2302
  cvtdq2ps xmm2, xmm2 // Trunc(A / B)
2303
  mulps    xmm2, xmm1
2304
  subps    xmm0, xmm2 // A - (B * Trunc(A / B))
2305

2306
  // Restore rounding mode
2307
  ldmxcsr  [OldFlags]
2308

2309
  movlps   [Result], xmm0
2310
end;
2311

2312
function FMod(const A, B: TVector2): TVector2; assembler;
2313
var
2314
  OldFlags, NewFlags: UInt32;
2315
asm
2316
  // Set rounding mode to Truncate
2317
  movlps   xmm0, [A]
2318
  stmxcsr  [OldFlags]
2319
  movlps   xmm1, [B]
2320
  mov      edx, [OldFlags]
2321
  movaps   xmm2, xmm0
2322
  and      edx, SSE_ROUND_MASK
2323
  movaps   xmm3, xmm1
2324
  or       edx, SSE_ROUND_TRUNC
2325
  divps    xmm2, xmm3 // A / B
2326
  mov      [NewFlags], edx
2327
  ldmxcsr  [NewFlags]
2328

2329
  cvtps2dq xmm2, xmm2
2330
  cvtdq2ps xmm2, xmm2 // Trunc(A / B)
2331
  mulps    xmm2, xmm1
2332
  subps    xmm0, xmm2 // A - (B * Trunc(A / B))
2333

2334
  // Restore rounding mode
2335
  ldmxcsr  [OldFlags]
2336

2337
  movlps   [Result], xmm0
2338
end;
2339

2340
function FMod(const A: TVector3; const B: Single): TVector3; assembler;
2341
var
2342
  OldFlags, NewFlags: UInt32;
2343
asm
2344
  // Set rounding mode to Truncate
2345
  movq     xmm0, [A]
2346
  movss    xmm1, [A+8]
2347
  movlhps  xmm0, xmm1
2348
  stmxcsr  [OldFlags]
2349
  movss    xmm1, [B]
2350
  mov      ecx, [OldFlags]
2351
  shufps   xmm1, xmm1, $00 // Replicate B
2352
  and      ecx, SSE_ROUND_MASK
2353
  movaps   xmm2, xmm0
2354
  or       ecx, SSE_ROUND_TRUNC
2355
  movaps   xmm3, xmm1
2356
  mov      [NewFlags], ecx
2357
  divps    xmm2, xmm3 // A / B
2358
  ldmxcsr  [NewFlags]
2359

2360
  cvtps2dq xmm2, xmm2
2361
  cvtdq2ps xmm2, xmm2 // Trunc(A / B)
2362
  mulps    xmm2, xmm1
2363
  subps    xmm0, xmm2 // A - (B * Trunc(A / B))
2364

2365
  // Restore rounding mode
2366
  ldmxcsr  [OldFlags]
2367

2368
  movhlps  xmm1, xmm0
2369
  movq     [Result], xmm0
2370
  movss    [Result+8], xmm1
2371
end;
2372

2373
function FMod(const A, B: TVector3): TVector3; assembler;
2374
var
2375
  OldFlags, NewFlags: UInt32;
2376
asm
2377
  // Set rounding mode to Truncate
2378
  movq     xmm0, [A]
2379
  movss    xmm1, [A+8]
2380
  movlhps  xmm0, xmm1
2381
  stmxcsr  [OldFlags]
2382
  movq     xmm1, [B]
2383
  movss    xmm2, [B+8]
2384
  movlhps  xmm1, xmm2
2385
  mov      edx, [OldFlags]
2386
  movaps   xmm2, xmm0
2387
  and      edx, SSE_ROUND_MASK
2388
  movaps   xmm3, xmm1
2389
  or       edx, SSE_ROUND_TRUNC
2390
  divps    xmm2, xmm3 // A / B
2391
  mov      [NewFlags], edx
2392
  ldmxcsr  [NewFlags]
2393

2394
  cvtps2dq xmm2, xmm2
2395
  cvtdq2ps xmm2, xmm2 // Trunc(A / B)
2396
  mulps    xmm2, xmm1
2397
  subps    xmm0, xmm2 // A - (B * Trunc(A / B))
2398

2399
  // Restore rounding mode
2400
  ldmxcsr  [OldFlags]
2401

2402
  movhlps  xmm1, xmm0
2403
  movq     [Result], xmm0
2404
  movss    [Result+8], xmm1
2405
end;
2406

2407
function FMod(const A: TVector4; const B: Single): TVector4; assembler;
2408
var
2409
  OldFlags, NewFlags: UInt32;
2410
asm
2411
  // Set rounding mode to Truncate
2412
  movups   xmm0, [A]
2413
  stmxcsr  [OldFlags]
2414
  movss    xmm1, [B]
2415
  mov      ecx, [OldFlags]
2416
  shufps   xmm1, xmm1, $00 // Replicate B
2417
  and      ecx, SSE_ROUND_MASK
2418
  movaps   xmm2, xmm0
2419
  or       ecx, SSE_ROUND_TRUNC
2420
  movaps   xmm3, xmm1
2421
  mov      [NewFlags], ecx
2422
  divps    xmm2, xmm3 // A / B
2423
  ldmxcsr  [NewFlags]
2424

2425
  cvtps2dq xmm2, xmm2
2426
  cvtdq2ps xmm2, xmm2 // Trunc(A / B)
2427
  mulps    xmm2, xmm1
2428
  subps    xmm0, xmm2 // A - (B * Trunc(A / B))
2429

2430
  // Restore rounding mode
2431
  ldmxcsr  [OldFlags]
2432

2433
  movups   [Result], xmm0
2434
end;
2435

2436
function FMod(const A, B: TVector4): TVector4; assembler;
2437
var
2438
  OldFlags, NewFlags: UInt32;
2439
asm
2440
  // Set rounding mode to Truncate
2441
  movups   xmm0, [A]
2442
  stmxcsr  [OldFlags]
2443
  movups   xmm1, [B]
2444
  mov      edx, [OldFlags]
2445
  movaps   xmm2, xmm0
2446
  and      edx, SSE_ROUND_MASK
2447
  movaps   xmm3, xmm1
2448
  or       edx, SSE_ROUND_TRUNC
2449
  divps    xmm2, xmm3 // A / B
2450
  mov      [NewFlags], edx
2451
  ldmxcsr  [NewFlags]
2452

2453
  cvtps2dq xmm2, xmm2
2454
  cvtdq2ps xmm2, xmm2 // Trunc(A / B)
2455
  mulps    xmm2, xmm1
2456
  subps    xmm0, xmm2 // A - (B * Trunc(A / B))
2457

2458
  // Restore rounding mode
2459
  ldmxcsr  [OldFlags]
2460

2461
  movups   [Result], xmm0
2462
end;
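
{ Usage note (illustrative, not from the original source): like C's fmod,
  these FMod overloads use truncated division, so the result keeps the sign
  of A. A small self-check using the TVector2.Init constructor seen elsewhere
  in this file; FModUsageSketch is an illustrative name. }
{
procedure FModUsageSketch;
var
  V: TVector2;
begin
  V.Init(-5.5, 5.5);
  V := FMod(V, 3.0);
  // Now V.X = -2.5 (keeps the sign of A) and V.Y = 2.5
end;
}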
2463

2464
function ModF(const A: Single; out B: Integer): Single; assembler;
2465
var
2466
  OldFlags, NewFlags: UInt32;
2467
asm
2468
  movss    xmm0, [A]
2469

2470
  // Set rounding mode to Truncate
2471
  stmxcsr  [OldFlags]
2472
  mov      edx, [OldFlags]
2473
  and      edx, SSE_ROUND_MASK
2474
  or       edx, SSE_ROUND_TRUNC
2475
  mov      [NewFlags], edx
2476
  ldmxcsr  [NewFlags]
2477

2478
  movss    xmm1, xmm0
2479
  cvtps2dq xmm0, xmm0
2480
  movss    [B], xmm0  // B = Trunc(A)
2481
  cvtdq2ps xmm0, xmm0
2482
  subss    xmm1, xmm0 // A - Trunc(A)
2483

2484
  // Restore rounding mode
2485
  ldmxcsr  [OldFlags]
2486

2487
  movss    [Result], xmm1
2488
end;
2489

2490
function ModF(const A: TVector2; out B: TIVector2): TVector2; assembler;
2491
var
2492
  OldFlags, NewFlags: UInt32;
2493
asm
2494
  movlps   xmm0, [A]
2495

2496
  // Set rounding mode to Truncate
2497
  stmxcsr  [OldFlags]
2498
  mov      eax, [OldFlags]
2499
  and      eax, SSE_ROUND_MASK
2500
  or       eax, SSE_ROUND_TRUNC
2501
  mov      [NewFlags], eax
2502
  ldmxcsr  [NewFlags]
2503

2504
  movaps   xmm1, xmm0
2505
  cvtps2dq xmm0, xmm0
2506
  movlps   [B], xmm0  // B = Trunc(A)
2507
  cvtdq2ps xmm0, xmm0
2508
  subps    xmm1, xmm0 // A - Trunc(A)
2509

2510
  // Restore rounding mode
2511
  ldmxcsr  [OldFlags]
2512

2513
  movlps   [Result], xmm1
2514
end;
2515

2516
function ModF(const A: TVector3; out B: TIVector3): TVector3; assembler;
2517
var
2518
  OldFlags, NewFlags: UInt32;
2519
asm
2520
  movq     xmm0, [A]
2521
  movss    xmm1, [A+8]
2522
  movlhps  xmm0, xmm1
2523

2524
  // Set rounding mode to Truncate
2525
  stmxcsr  [OldFlags]
2526
  mov      eax, [OldFlags]
2527
  and      eax, SSE_ROUND_MASK
2528
  or       eax, SSE_ROUND_TRUNC
2529
  mov      [NewFlags], eax
2530
  ldmxcsr  [NewFlags]
2531

2532
  movaps   xmm1, xmm0
2533
  cvtps2dq xmm0, xmm0
2534
  movhlps  xmm2, xmm0
2535
  movq     [B], xmm0  // B = Trunc(A)
2536
  movd     [B+8], xmm2
2537
  cvtdq2ps xmm0, xmm0
2538
  subps    xmm1, xmm0 // A - Trunc(A)
2539

2540
  // Restore rounding mode
2541
  ldmxcsr  [OldFlags]
2542

2543
  movhlps  xmm0, xmm1
2544
  movq     [Result], xmm1
2545
  movss    [Result+8], xmm0
2546
end;
2547

2548
function ModF(const A: TVector4; out B: TIVector4): TVector4; assembler;
2549
var
2550
  OldFlags, NewFlags: UInt32;
2551
asm
2552
  movups   xmm0, [A]
2553

2554
  // Set rounding mode to Truncate
2555
  stmxcsr  [OldFlags]
2556
  mov      eax, [OldFlags]
2557
  and      eax, SSE_ROUND_MASK
2558
  or       eax, SSE_ROUND_TRUNC
2559
  mov      [NewFlags], eax
2560
  ldmxcsr  [NewFlags]
2561

2562
  movaps   xmm1, xmm0
2563
  cvtps2dq xmm0, xmm0
2564
  movups   [B], xmm0  // B = Trunc(A)
2565
  cvtdq2ps xmm0, xmm0
2566
  subps    xmm1, xmm0 // A - Trunc(A)
2567

2568
  // Restore rounding mode
2569
  ldmxcsr  [OldFlags]
2570

2571
  movups   [Result], xmm1
2572
end;
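
{ Reference sketch (not from the original source): what the ModF overloads
  above compute per component; the integral part is truncated toward zero.
  The name ModFSketch is illustrative only. }
{
function ModFSketch(const A: Single; out B: Integer): Single;
begin
  B := System.Trunc(A); // integral part, truncated toward zero
  Result := A - B;      // fractional part, keeps the sign of A
end;
}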
2573

2574
function Min(const A: TVector2; const B: Single): TVector2; assembler;
2575
asm
2576
  movss  xmm1, [B]
2577
  movlps xmm0, [A]
2578
  shufps xmm1, xmm1, $00 // Replicate B
2579
  minps  xmm0, xmm1
2580
  movlps [Result], xmm0
2581
end;
2582

2583
function Min(const A, B: TVector2): TVector2; assembler;
2584
asm
2585
  movlps xmm0, [A]
2586
  movlps xmm1, [B]
2587
  minps  xmm0, xmm1
2588
  movlps [Result], xmm0
2589
end;
2590

2591
function Min(const A: TVector3; const B: Single): TVector3; assembler;
2592
asm
2593
  movss    xmm1, [B]
2594
  movq     xmm0, [A]
2595
  movss    xmm2, [A+8]
2596
  movlhps  xmm0, xmm2
2597
  shufps   xmm1, xmm1, $00 // Replicate B
2598
  minps    xmm0, xmm1
2599
  movhlps  xmm1, xmm0
2600
  movq     [Result], xmm0
2601
  movss    [Result+8], xmm1
2602
end;
2603

2604
function Min(const A, B: TVector3): TVector3; assembler;
2605
asm
2606
  movq     xmm0, [A]
2607
  movss    xmm1, [A+8]
2608
  movlhps  xmm0, xmm1
2609
  movq     xmm1, [B]
2610
  movss    xmm2, [B+8]
2611
  movlhps  xmm1, xmm2
2612
  minps    xmm0, xmm1
2613
  movhlps  xmm1, xmm0
2614
  movq     [Result], xmm0
2615
  movss    [Result+8], xmm1
2616
end;
2617

2618
function Min(const A: TVector4; const B: Single): TVector4; assembler;
2619
asm
2620
  movss  xmm1, [B]
2621
  movups xmm0, [A]
2622
  shufps xmm1, xmm1, $00 // Replicate B
2623
  minps  xmm0, xmm1
2624
  movups [Result], xmm0
2625
end;
2626

2627
function Min(const A, B: TVector4): TVector4; assembler;
2628
asm
2629
  movups xmm0, [A]
2630
  movups xmm1, [B]
2631
  minps  xmm0, xmm1
2632
  movups [Result], xmm0
2633
end;
2634

2635
function Max(const A: TVector2; const B: Single): TVector2; assembler;
2636
asm
2637
  movss  xmm1, [B]
2638
  movlps xmm0, [A]
2639
  shufps xmm1, xmm1, $00 // Replicate B
2640
  maxps  xmm0, xmm1
2641
  movlps [Result], xmm0
2642
end;
2643

2644
function Max(const A, B: TVector2): TVector2; assembler;
2645
asm
2646
  movlps xmm0, [A]
2647
  movlps xmm1, [B]
2648
  maxps  xmm0, xmm1
2649
  movlps [Result], xmm0
2650
end;
2651

2652
function Max(const A: TVector3; const B: Single): TVector3; assembler;
2653
asm
2654
  movss    xmm1, [B]
2655
  movq     xmm0, [A]
2656
  movss    xmm2, [A+8]
2657
  movlhps  xmm0, xmm2
2658
  shufps   xmm1, xmm1, $00 // Replicate B
2659
  maxps    xmm0, xmm1
2660
  movhlps  xmm1, xmm0
2661
  movq     [Result], xmm0
2662
  movss    [Result+8], xmm1
2663
end;
2664

2665
function Max(const A, B: TVector3): TVector3; assembler;
2666
asm
2667
  movq     xmm0, [A]
2668
  movss    xmm1, [A+8]
2669
  movlhps  xmm0, xmm1
2670
  movq     xmm1, [B]
2671
  movss    xmm2, [B+8]
2672
  movlhps  xmm1, xmm2
2673
  maxps    xmm0, xmm1
2674
  movhlps  xmm1, xmm0
2675
  movq     [Result], xmm0
2676
  movss    [Result+8], xmm1
2677
end;
2678

2679
function Max(const A: TVector4; const B: Single): TVector4; assembler;
2680
asm
2681
  movss  xmm1, [B]
2682
  movups xmm0, [A]
2683
  shufps xmm1, xmm1, $00 // Replicate B
2684
  maxps  xmm0, xmm1
2685
  movups [Result], xmm0
2686
end;
2687

2688
function Max(const A, B: TVector4): TVector4; assembler;
2689
asm
2690
  movups xmm0, [A]
2691
  movups xmm1, [B]
2692
  maxps  xmm0, xmm1
2693
  movups [Result], xmm0
2694
end;
2695

2696
function EnsureRange(const A, AMin, AMax: Single): Single; assembler;
2697
asm
2698
  movss  xmm0, [A]
2699
  movss  xmm1, [AMin]
2700
  movss  xmm2, [AMax]
2701
  maxss  xmm0, xmm1
2702
  minss  xmm0, xmm2
2703
  movss  [Result], xmm0
2704
end;
2705

2706
function EnsureRange(const A: TVector2; const AMin, AMax: Single): TVector2; assembler;
2707
asm
2708
  movlps xmm0, [A]
2709
  movss  xmm1, [AMin]
2710
  movss  xmm2, [AMax]
2711
  shufps xmm1, xmm1, $00 // Replicate AMin
2712
  shufps xmm2, xmm2, $00 // Replicate AMax
2713
  maxps  xmm0, xmm1
2714
  minps  xmm0, xmm2
2715
  movlps [Result], xmm0
2716
end;
2717

2718
function EnsureRange(const A, AMin, AMax: TVector2): TVector2; assembler;
2719
asm
2720
  movlps xmm0, [A]
2721
  movlps xmm1, [AMin]
2722
  movlps xmm2, [AMax]
2723
  maxps  xmm0, xmm1
2724
  mov    eax, [Result]
2725
  minps  xmm0, xmm2
2726
  movlps [eax], xmm0
2727
end;
2728

2729
function EnsureRange(const A: TVector3; const AMin, AMax: Single): TVector3; assembler;
2730
asm
2731
  movq     xmm0, [A]
2732
  movss    xmm1, [A+8]
2733
  movlhps  xmm0, xmm1
2734
  movss    xmm1, [AMin]
2735
  movss    xmm2, [AMax]
2736
  shufps   xmm1, xmm1, $00 // Replicate AMin
2737
  shufps   xmm2, xmm2, $00 // Replicate AMax
2738
  maxps    xmm0, xmm1
2739
  minps    xmm0, xmm2
2740
  movhlps  xmm1, xmm0
2741
  movq     [Result], xmm0
2742
  movss    [Result+8], xmm1
2743
end;
2744

2745
function EnsureRange(const A, AMin, AMax: TVector3): TVector3; assembler;
2746
asm
2747
  movq     xmm0, [A]
2748
  movss    xmm1, [A+8]
2749
  movlhps  xmm0, xmm1
2750
  movq     xmm1, [AMin]
2751
  movss    xmm2, [AMin+8]
2752
  movlhps  xmm1, xmm2
2753
  movq     xmm2, [AMax]
2754
  movss    xmm3, [AMax+8]
2755
  movlhps  xmm2, xmm3
2756
  maxps    xmm0, xmm1
2757
  mov      eax, [Result]
2758
  minps    xmm0, xmm2
2759
  movhlps  xmm1, xmm0
2760
  movq     [eax], xmm0
2761
  movss    [eax+8], xmm1
2762
end;
2763

2764
function EnsureRange(const A: TVector4; const AMin, AMax: Single): TVector4; assembler;
2765
asm
2766
  movups xmm0, [A]
2767
  movss  xmm1, [AMin]
2768
  movss  xmm2, [AMax]
2769
  shufps xmm1, xmm1, $00 // Replicate AMin
2770
  shufps xmm2, xmm2, $00 // Replicate AMax
2771
  maxps  xmm0, xmm1
2772
  minps  xmm0, xmm2
2773
  movups [Result], xmm0
2774
end;
2775

2776
function EnsureRange(const A, AMin, AMax: TVector4): TVector4; assembler;
2777
asm
2778
  movups xmm0, [A]
2779
  movups xmm1, [AMin]
2780
  movups xmm2, [AMax]
2781
  maxps  xmm0, xmm1
2782
  mov    eax, [Result]
2783
  minps  xmm0, xmm2
2784
  movups [eax], xmm0
2785
end;
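
{ Note (sketch, not from the original source): the EnsureRange overloads
  clamp with maxps followed by minps, so when AMin > AMax the upper bound
  wins. Scalar equivalent, with an illustrative name: }
{
function EnsureRangeSketch(const A, AMin, AMax: Single): Single;
begin
  Result := A;
  if Result < AMin then Result := AMin; // maxss/maxps
  if Result > AMax then Result := AMax; // minss/minps, applied last
end;
}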
2786

2787
function Mix(const A, B: TVector2; const T: Single): TVector2;
2788
begin
2789
  Result.Init(Mix(A.X, B.X, T), Mix(A.Y, B.Y, T));
2790
end;
2791

2792
function Mix(const A, B, T: TVector2): TVector2;
2793
begin
2794
  Result.Init(Mix(A.X, B.X, T.X), Mix(A.Y, B.Y, T.Y));
2795
end;
2796

2797
function Mix(const A, B: TVector3; const T: Single): TVector3; assembler;
2798
asm
2799
  movss    xmm2, [T]
2800
  movq     xmm0, [A]
2801
  movss    xmm1, [A+8]
2802
  movlhps  xmm0, xmm1
2803
  movq     xmm1, [B]
2804
  movss    xmm3, [B+8]
2805
  movlhps  xmm1, xmm3
2806
  shufps   xmm2, xmm2, $00 // Replicate T
2807
  subps    xmm1, xmm0
2808
  mulps    xmm1, xmm2
2809
  addps    xmm0, xmm1 // A + (T * (B - A))
2810
  movhlps  xmm1, xmm0
2811
  movq     [Result], xmm0
2812
  movss    [Result+8], xmm1
2813
end;
2814

2815
function Mix(const A, B, T: TVector3): TVector3; assembler;
2816
asm
2817
  movq     xmm0, [A]
2818
  movss    xmm1, [A+8]
2819
  movlhps  xmm0, xmm1
2820
  movq     xmm1, [B]
2821
  movss    xmm2, [B+8]
2822
  movlhps  xmm1, xmm2
2823
  movq     xmm2, [T]
2824
  movss    xmm3, [T+8]
2825
  movlhps  xmm2, xmm3
2826
  subps    xmm1, xmm0
2827
  mulps    xmm1, xmm2
2828
  mov      eax, [Result]
2829
  addps    xmm0, xmm1 // A + (T * (B - A))
2830
  movhlps  xmm1, xmm0
2831
  movq     [eax], xmm0
2832
  movss    [eax+8], xmm1
2833
end;
2834

2835
function Mix(const A, B: TVector4; const T: Single): TVector4; assembler;
2836
asm
2837
  movss  xmm2, [T]
2838
  movups xmm0, [A]
2839
  movups xmm1, [B]
2840
  shufps xmm2, xmm2, $00 // Replicate T
2841
  subps  xmm1, xmm0
2842
  mulps  xmm1, xmm2
2843
  addps  xmm0, xmm1 // A + (T * (B - A))
2844
  movups [Result], xmm0
2845
end;
2846

2847
function Mix(const A, B, T: TVector4): TVector4; assembler;
2848
asm
2849
  movups xmm0, [A]
2850
  movups xmm1, [B]
2851
  movups xmm2, [T]
2852
  subps  xmm1, xmm0
2853
  mulps  xmm1, xmm2
2854
  mov    eax, [Result]
2855
  addps  xmm0, xmm1 // A + (T * (B - A))
2856
  movups [eax], xmm0
2857
end;
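
{ Reference sketch (illustrative): every Mix overload evaluates the linear
  interpolation shown in the asm comments, A + T * (B - A). The name
  MixSketch is illustrative only. }
{
function MixSketch(const A, B, T: Single): Single;
begin
  Result := A + T * (B - A); // T = 0 gives A, T = 1 gives B
end;
}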
2858

2859
function Step(const AEdge: Single; const A: TVector2): TVector2; assembler;
2860
asm
2861
  movss    xmm0, [AEdge]
2862
  movlps   xmm1, [A]
2863
  shufps   xmm0, xmm0, $00 // Replicate AEdge
2864
  movlps   xmm2, QWORD [SSE_ONE]
2865
  cmpnltps xmm1, xmm0      // (A >= AEdge)? Yes: $FFFFFFFF, No: $00000000
2866
  andps    xmm1, xmm2      // (A >= AEdge)? Yes: 1, No: 0
2867
  movlps   [Result], xmm1
2868
end;
2869

2870
function Step(const AEdge, A: TVector2): TVector2; assembler;
2871
asm
2872
  movlps   xmm0, [AEdge]
2873
  movlps   xmm1, [A]
2874
  movlps   xmm2, QWORD [SSE_ONE]
2875
  cmpnltps xmm1, xmm0 // (A >= AEdge)? Yes: $FFFFFFFF, No: $00000000
2876
  andps    xmm1, xmm2 // (A >= AEdge)? Yes: 1, No: 0
2877
  movlps   [Result], xmm1
2878
end;
2879

2880
function Step(const AEdge: Single; const A: TVector3): TVector3; assembler;
2881
asm
2882
  movss    xmm0, [AEdge]
2883
  movq     xmm1, [A]
2884
  movss    xmm2, [A+8]
2885
  movlhps  xmm1, xmm2
2886
  shufps   xmm0, xmm0, $00 // Replicate AEdge
2887
  movups   xmm2, [SSE_ONE]
2888
  cmpnltps xmm1, xmm0      // (A >= AEdge)? Yes: $FFFFFFFF, No: $00000000
2889
  andps    xmm1, xmm2      // (A >= AEdge)? Yes: 1, No: 0
2890
  movhlps  xmm0, xmm1
2891
  movq     [Result], xmm1
2892
  movss    [Result+8], xmm0
2893
end;
2894

2895
function Step(const AEdge, A: TVector3): TVector3; assembler;
2896
asm
2897
  movq     xmm0, [AEdge]
2898
  movss    xmm1, [AEdge+8]
2899
  movlhps  xmm0, xmm1
2900
  movq     xmm1, [A]
2901
  movss    xmm2, [A+8]
2902
  movlhps  xmm1, xmm2
2903
  movups   xmm2, [SSE_ONE]
2904
  cmpnltps xmm1, xmm0 // (A >= AEdge)? Yes: $FFFFFFFF, No: $00000000
2905
  andps    xmm1, xmm2 // (A >= AEdge)? Yes: 1, No: 0
2906
  movhlps  xmm0, xmm1
2907
  movq     [Result], xmm1
2908
  movss    [Result+8], xmm0
2909
end;
2910

2911
function Step(const AEdge: Single; const A: TVector4): TVector4; assembler;
2912
asm
2913
  movss    xmm0, [AEdge]
2914
  movups   xmm1, [A]
2915
  shufps   xmm0, xmm0, $00 // Replicate AEdge
2916
  movups   xmm2, [SSE_ONE]
2917
  cmpnltps xmm1, xmm0      // (A >= AEdge)? Yes: $FFFFFFFF, No: $00000000
2918
  andps    xmm1, xmm2      // (A >= AEdge)? Yes: 1, No: 0
2919
  movups   [Result], xmm1
2920
end;
2921

2922
function Step(const AEdge, A: TVector4): TVector4; assembler;
2923
asm
2924
  movups   xmm0, [AEdge]
2925
  movups   xmm1, [A]
2926
  movups   xmm2, [SSE_ONE]
2927
  cmpnltps xmm1, xmm0 // (A >= AEdge)? Yes: $FFFFFFFF, No: $00000000
2928
  andps    xmm1, xmm2 // (A >= AEdge)? Yes: 1, No: 0
2929
  movups   [Result], xmm1
2930
end;
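
{ Reference sketch (illustrative): each Step overload compares component-wise
  and yields 1 where A >= AEdge, otherwise 0. The name StepSketch is
  illustrative only. }
{
function StepSketch(const AEdge, A: Single): Single;
begin
  if A >= AEdge then
    Result := 1
  else
    Result := 0;
end;
}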
2931

2932
function SmoothStep(const AEdge0, AEdge1: Single; const A: TVector2): TVector2;
2933
begin
2934
  Result.Init(SmoothStep(AEdge0, AEdge1, A.X), SmoothStep(AEdge0, AEdge1, A.Y));
2935
end;
2936

2937
function SmoothStep(const AEdge0, AEdge1, A: TVector2): TVector2;
2938
begin
2939
  Result.Init(SmoothStep(AEdge0.X, AEdge1.X, A.X), SmoothStep(AEdge0.Y, AEdge1.Y, A.Y));
2940
end;
2941

2942
function SmoothStep(const AEdge0, AEdge1: Single; const A: TVector3): TVector3;
2943
begin
2944
  Result.Init(SmoothStep(AEdge0, AEdge1, A.X), SmoothStep(AEdge0, AEdge1, A.Y), SmoothStep(AEdge0, AEdge1, A.Z));
2945
end;
2946
{function SmoothStep(const AEdge0, AEdge1: Single; const A: TVector3): TVector3; assembler;
2947
asm
2948
  movq     xmm2, [A]
2949
  movss    xmm1, [A+8]
2950
  movlhps  xmm2, xmm1
2951
  movss    xmm0, [AEdge0]
2952
  movss    xmm1, [AEdge1]
2953
  shufps   xmm0, xmm0, $00 // Replicate AEdge0
2954
  shufps   xmm1, xmm1, $00 // Replicate AEdge1
2955
  movaps   xmm3, xmm2
2956
  movaps   xmm4, xmm2
2957
  movaps   xmm5, xmm2
2958
  movups   xmm6, [SSE_ONE]
2959

2960
  cmpnltps xmm3, xmm0 // (A >= AEdge0)? Yes: $FFFFFFFF, No: $00000000
2961
  cmpleps  xmm4, xmm1 // (A <= AEdge1)? Yes: $FFFFFFFF, No: $00000000
2962
  subps    xmm1, xmm0
2963
  movaps   xmm5, xmm4
2964
  subps    xmm2, xmm0
2965
  andnps   xmm5, xmm6 // (A >  AEdge1)? Yes: 1.0, No: 0.0
2966

2967
  movups   xmm6, [SSE_TWO]
2968
  divps    xmm2, xmm1 // Temp := (A - AEdge0) / (AEdge1 - AEdge0)
2969
  movups   xmm7, [SSE_THREE]
2970
  mulps    xmm6, xmm2 // 2 * Temp
2971
  subps    xmm7, xmm6 // 3 - (2 * Temp)
2972
  mulps    xmm7, xmm2
2973
  mulps    xmm7, xmm2 // Result := Temp * Temp * (3 - (2 * Temp))
2974
  andps    xmm7, xmm3 // (A < AEdge0)? Yes: 0, No: Result
2975
  andps    xmm7, xmm4 // (A > AEdge1)? Yes: 0, No: Result
2976
  orps     xmm7, xmm5 // (A > AEdge1)? Yes: 1, No: Result
2977

2978
  movhlps  xmm6, xmm7
2979
  movq     [Result], xmm7
2980
  movss    [Result+8], xmm6
2981
end;}
2982

2983
function SmoothStep(const AEdge0, AEdge1, A: TVector3): TVector3; assembler;
2984
asm
2985
  movq     xmm2, [A]
2986
  movss    xmm1, [A+8]
2987
  movlhps  xmm2, xmm1
2988
  movq     xmm0, [AEdge0]
2989
  movss    xmm1, [AEdge0+8]
2990
  movlhps  xmm0, xmm1
2991
  movq     xmm1, [AEdge1]
2992
  movss    xmm3, [AEdge1+8]
2993
  movlhps  xmm1, xmm3
2994
  movaps   xmm3, xmm2
2995
  movaps   xmm4, xmm2
2996
  movaps   xmm5, xmm2
2997
  movups   xmm6, [SSE_ONE]
2998

2999
  cmpnltps xmm3, xmm0 // (A >= AEdge0)? Yes: $FFFFFFFF, No: $00000000
3000
  cmpleps  xmm4, xmm1 // (A <= AEdge1)? Yes: $FFFFFFFF, No: $00000000
3001
  subps    xmm1, xmm0
3002
  movaps   xmm5, xmm4
3003
  subps    xmm2, xmm0
3004
  andnps   xmm5, xmm6 // (A >  AEdge1)? Yes: 1.0, No: 0.0
3005

3006
  movups   xmm6, [SSE_TWO]
3007
  divps    xmm2, xmm1 // Temp := (A - AEdge0) / (AEdge1 - AEdge0)
3008
  movups   xmm7, [SSE_THREE]
3009
  mulps    xmm6, xmm2 // 2 * Temp
3010
  subps    xmm7, xmm6 // 3 - (2 * Temp)
3011
  mulps    xmm7, xmm2
3012
  mulps    xmm7, xmm2 // Result := Temp * Temp * (3 - (2 * Temp))
3013
  andps    xmm7, xmm3 // (A < AEdge0)? Yes: 0, No: Result
3014
  andps    xmm7, xmm4 // (A > AEdge1)? Yes: 0, No: Result
3015
  orps     xmm7, xmm5 // (A > AEdge1)? Yes: 1, No: Result
3016

3017
  mov      eax, [Result]
3018
  movhlps  xmm6, xmm7
3019
  movq     [eax], xmm7
3020
  movss    [eax+8], xmm6
3021
end;
3022

3023
function SmoothStep(const AEdge0, AEdge1: Single; const A: TVector4): TVector4; assembler;
3024
asm
3025
  movups   xmm2, [A]
3026
  movss    xmm0, [AEdge0]
3027
  movss    xmm1, [AEdge1]
3028
  shufps   xmm0, xmm0, $00 // Replicate AEdge0
3029
  shufps   xmm1, xmm1, $00 // Replicate AEdge1
3030
  movaps   xmm3, xmm2
3031
  movaps   xmm4, xmm2
3032
  movaps   xmm5, xmm2
3033
  movups   xmm6, [SSE_ONE]
3034

3035
  cmpnltps xmm3, xmm0 // (A >= AEdge0)? Yes: $FFFFFFFF, No: $00000000
3036
  cmpleps  xmm4, xmm1 // (A <= AEdge1)? Yes: $FFFFFFFF, No: $00000000
3037
  subps    xmm1, xmm0
3038
  movaps   xmm5, xmm4
3039
  subps    xmm2, xmm0
3040
  andnps   xmm5, xmm6 // (A >  AEdge1)? Yes: 1.0, No: 0.0
3041

3042
  movups   xmm6, [SSE_TWO]
3043
  divps    xmm2, xmm1 // Temp := (A - AEdge0) / (AEdge1 - AEdge0)
3044
  movups   xmm7, [SSE_THREE]
3045
  mulps    xmm6, xmm2 // 2 * Temp
3046
  subps    xmm7, xmm6 // 3 - (2 * Temp)
3047
  mulps    xmm7, xmm2
3048
  mulps    xmm7, xmm2 // Result := Temp * Temp * (3 - (2 * Temp))
3049
  andps    xmm7, xmm3 // (A < AEdge0)? Yes: 0, No: Result
3050
  andps    xmm7, xmm4 // (A > AEdge1)? Yes: 0, No: Result
3051
  orps     xmm7, xmm5 // (A > AEdge1)? Yes: 1, No: Result
3052

3053
  movups   [Result], xmm7
3054
end;
3055

3056
function SmoothStep(const AEdge0, AEdge1, A: TVector4): TVector4; assembler;
3057
asm
3058
  movups   xmm2, [A]
3059
  movups   xmm0, [AEdge0]
3060
  movups   xmm1, [AEdge1]
3061
  movaps   xmm3, xmm2
3062
  movaps   xmm4, xmm2
3063
  movaps   xmm5, xmm2
3064
  movups   xmm6, [SSE_ONE]
3065

3066
  cmpnltps xmm3, xmm0 // (A >= AEdge0)? Yes: $FFFFFFFF, No: $00000000
3067
  cmpleps  xmm4, xmm1 // (A <= AEdge1)? Yes: $FFFFFFFF, No: $00000000
3068
  subps    xmm1, xmm0
3069
  movaps   xmm5, xmm4
3070
  subps    xmm2, xmm0
3071
  andnps   xmm5, xmm6 // (A >  AEdge1)? Yes: 1.0, No: 0.0
3072

3073
  movups   xmm6, [SSE_TWO]
3074
  divps    xmm2, xmm1 // Temp := (A - AEdge0) / (AEdge1 - AEdge0)
3075
  movups   xmm7, [SSE_THREE]
3076
  mulps    xmm6, xmm2 // 2 * Temp
3077
  subps    xmm7, xmm6 // 3 - (2 * Temp)
3078
  mulps    xmm7, xmm2
3079
  mulps    xmm7, xmm2 // Result := Temp * Temp * (3 - (2 * Temp))
3080
  andps    xmm7, xmm3 // (A < AEdge0)? Yes: 0, No: Result
3081
  andps    xmm7, xmm4 // (A > AEdge1)? Yes: 0, No: Result
3082
  orps     xmm7, xmm5 // (A > AEdge1)? Yes: 1, No: Result
3083

3084
  mov      eax, [Result]
3085
  movups   [eax], xmm7
3086
end;
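
{ Reference sketch (illustrative, not the library's scalar overload): the
  Hermite interpolation the SmoothStep asm above evaluates; results below
  AEdge0 are clamped to 0, results above AEdge1 to 1, and in between
  Temp * Temp * (3 - 2 * Temp) is used. The name SmoothStepSketch is
  illustrative only. }
{
function SmoothStepSketch(const AEdge0, AEdge1, A: Single): Single;
var
  Temp: Single;
begin
  if A <= AEdge0 then
    Result := 0
  else if A >= AEdge1 then
    Result := 1
  else
  begin
    // Temp := (A - AEdge0) / (AEdge1 - AEdge0)
    Temp := (A - AEdge0) / (AEdge1 - AEdge0);
    Result := Temp * Temp * (3 - 2 * Temp);
  end;
end;
}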
3087

3088
function FMA(const A, B, C: TVector2): TVector2; assembler;
3089
asm
3090
  movlps xmm0, [A]
3091
  movlps xmm1, [B]
3092
  movlps xmm2, [C]
3093
  mulps  xmm0, xmm1
3094
  addps  xmm0, xmm2
3095
  mov    eax, [Result]
3096
  movlps [eax], xmm0
3097
end;
3098

3099
function FMA(const A, B, C: TVector3): TVector3; assembler;
3100
asm
3101
  movq     xmm0, [A]
3102
  movss    xmm1, [A+8]
3103
  movlhps  xmm0, xmm1
3104
  movq     xmm1, [B]
3105
  movss    xmm2, [B+8]
3106
  movlhps  xmm1, xmm2
3107
  movq     xmm2, [C]
3108
  movss    xmm3, [C+8]
3109
  movlhps  xmm2, xmm3
3110
  mulps    xmm0, xmm1
3111
  addps    xmm0, xmm2
3112
  mov      eax, [Result]
3113
  movhlps  xmm1, xmm0
3114
  movq     [eax], xmm0
3115
  movss    [eax+8], xmm1
3116
end;
3117

3118
function FMA(const A, B, C: TVector4): TVector4; assembler;
3119
asm
3120
  movups xmm0, [A]
3121
  movups xmm1, [B]
3122
  movups xmm2, [C]
3123
  mulps  xmm0, xmm1
3124
  addps  xmm0, xmm2
3125
  mov    eax, [Result]
3126
  movups [eax], xmm0
3127
end;
3128
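
{ Note: these SSE2 "FMA" routines are implemented as a separate multiply
  followed by an add (mulps + addps), so unlike a true fused multiply-add
  instruction the intermediate product is rounded once before the addition.
  Per component they compute the following (scalar sketch, illustrative only):

function FMARef(const A, B, C: Single): Single;
begin
  Result := (A * B) + C; // two roundings, not a single fused one
end;
}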

3129
{ Matrix functions }
3130

3131
{$IFDEF FM_COLUMN_MAJOR}
3132
function OuterProduct(const C, R: TVector2): TMatrix2; assembler;
3133
asm
3134
  movlps xmm0, [R]
3135
  movlps xmm1, [C]
3136

3137
  shufps xmm0, xmm0, $50
3138
  shufps xmm1, xmm1, $44
3139

3140
  mulps  xmm1, xmm0
3141

3142
  // Store as matrix
3143
  movups [Result], xmm1
3144
end;
3145

3146
function OuterProduct(const C, R: TVector3): TMatrix3; assembler;
3147
asm
3148
  movq     xmm0, [C]
3149
  movss    xmm1, [C+8]
3150
  movlhps  xmm0, xmm1
3151
  movq     xmm1, [R]
3152
  movss    xmm2, [R+8]
3153
  movlhps  xmm1, xmm2
3154
  movaps   xmm2, xmm1
3155
  movaps   xmm3, xmm1
3156

3157
  shufps   xmm1, xmm1, $00
3158
  shufps   xmm2, xmm2, $55
3159
  shufps   xmm3, xmm3, $AA
3160

3161
  mulps    xmm1, xmm0
3162
  mulps    xmm2, xmm0
3163
  mulps    xmm3, xmm0
3164

3165
  // Store as matrix
3166
  movhlps  xmm0, xmm1
3167
  movhlps  xmm4, xmm2
3168
  movhlps  xmm5, xmm3
3169
  movq     QWORD [Result+$00], xmm1
3170
  movss    [Result+$08], xmm0
3171
  movq     QWORD [Result+$0C], xmm2
3172
  movss    [Result+$14], xmm4
3173
  movq     QWORD [Result+$18], xmm3
3174
  movss    [Result+$20], xmm5
3175
end;
3176

3177
function OuterProduct(const C, R: TVector4): TMatrix4; assembler;
3178
asm
3179
  movups xmm0, [C]
3180
  movups xmm1, [R]
3181
  movaps xmm2, xmm1
3182
  movaps xmm3, xmm1
3183
  movaps xmm4, xmm1
3184

3185
  shufps xmm1, xmm1, $00
3186
  shufps xmm2, xmm2, $55
3187
  shufps xmm3, xmm3, $AA
3188
  shufps xmm4, xmm4, $FF
3189

3190
  mulps  xmm1, xmm0
3191
  mulps  xmm2, xmm0
3192
  mulps  xmm3, xmm0
3193
  mulps  xmm4, xmm0
3194

3195
  // Store as matrix
3196
  movups DQWORD [Result + $00], xmm1
3197
  movups DQWORD [Result + $10], xmm2
3198
  movups DQWORD [Result + $20], xmm3
3199
  movups DQWORD [Result + $30], xmm4
3200
end;
3201
{$ELSE}
3202
function OuterProduct(const C, R: TVector2): TMatrix2; assembler;
3203
asm
3204
  movlps xmm0, [C]       // # # C.Y C.X
3205
  movlps xmm1, [R]       // # # R.Y R.X
3206

3207
  shufps xmm0, xmm0, $50 // C.Y C.Y C.X C.X
  shufps xmm1, xmm1, $44 // R.Y R.X R.Y R.X

  mulps  xmm1, xmm0      // (C.Y*R.Y) (C.Y*R.X) (C.X*R.Y) (C.X*R.X)
3211

3212
  // Store as matrix
3213
  movups [Result], xmm1
3214
end;
3215

3216
function OuterProduct(const C, R: TVector3): TMatrix3; assembler;
3217
asm
3218
  movq     xmm0, [R]
3219
  movss    xmm1, [R+8]
3220
  movlhps  xmm0, xmm1
3221
  movq     xmm1, [C]
3222
  movss    xmm2, [C+8]
3223
  movlhps  xmm1, xmm2
3224
  movaps   xmm2, xmm1
3225
  movaps   xmm3, xmm1
3226

3227
  shufps   xmm1, xmm1, $00 // C.X (4x)
3228
  shufps   xmm2, xmm2, $55 // C.Y (4x)
3229
  shufps   xmm3, xmm3, $AA // C.Z (4x)
3230

3231
  mulps    xmm1, xmm0      // R * C.X
3232
  mulps    xmm2, xmm0      // R * C.Y
3233
  mulps    xmm3, xmm0      // R * C.Z
3234

3235
  // Store as matrix
3236
  movhlps  xmm0, xmm1
3237
  movhlps  xmm4, xmm2
3238
  movhlps  xmm5, xmm3
3239
  movq     QWORD [Result+$00], xmm1
3240
  movss    [Result+$08], xmm0
3241
  movq     QWORD [Result+$0C], xmm2
3242
  movss    [Result+$14], xmm4
3243
  movq     QWORD [Result+$18], xmm3
3244
  movss    [Result+$20], xmm5
3245
end;
3246

3247
function OuterProduct(const C, R: TVector4): TMatrix4; assembler;
3248
asm
3249
  movups xmm0, [R]
3250
  movups xmm1, [C]
3251
  movaps xmm2, xmm1
3252
  movaps xmm3, xmm1
3253
  movaps xmm4, xmm1
3254

3255
  shufps xmm1, xmm1, $00 // C.X (4x)
3256
  shufps xmm2, xmm2, $55 // C.Y (4x)
3257
  shufps xmm3, xmm3, $AA // C.Z (4x)
3258
  shufps xmm4, xmm4, $FF // C.W (4x)
3259

3260
  mulps  xmm1, xmm0      // R * C.X
3261
  mulps  xmm2, xmm0      // R * C.Y
3262
  mulps  xmm3, xmm0      // R * C.Z
3263
  mulps  xmm4, xmm0      // R * C.W
3264

3265
  // Store as matrix
3266
  movups DQWORD [Result + $00], xmm1
3267
  movups DQWORD [Result + $10], xmm2
3268
  movups DQWORD [Result + $20], xmm3
3269
  movups DQWORD [Result + $30], xmm4
3270
end;
3271
{$ENDIF}
3272
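
{ For reference: the routines above build the outer product of a (column)
  vector C and a (row) vector R, so every element of the result is the
  product of one component of C with one component of R. A scalar sketch of
  the 2x2 row-major case, writing the elements in the same order as the SSE
  code above stores them (the name OuterProductRef is illustrative only):

function OuterProductRef(const C, R: TVector2): TMatrix2;
begin
  Result.M[0,0] := C.X * R.X;  Result.M[0,1] := C.X * R.Y;
  Result.M[1,0] := C.Y * R.X;  Result.M[1,1] := C.Y * R.Y;
end;
}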

3273
{ TVector2 }

{ These SIMD versions are similar to the ones for TVector4. The main difference
  is the use of the "movlps" instruction (which loads 2 values) instead of the
  "movups" instruction (which loads 4 values). }
3278

3279
class operator TVector2.Add(const A: TVector2; const B: Single): TVector2;
3280
begin
3281
  Result.X := A.X + B;
3282
  Result.Y := A.Y + B;
3283
end;
3284

3285
class operator TVector2.Add(const A: Single; const B: TVector2): TVector2;
3286
begin
3287
  Result.X := A + B.X;
3288
  Result.Y := A + B.Y;
3289
end;
3290

3291
class operator TVector2.Add(const A, B: TVector2): TVector2;
3292
begin
3293
  Result.X := A.X + B.X;
3294
  Result.Y := A.Y + B.Y;
3295
end;
3296

3297
function TVector2.Distance(const AOther: TVector2): Single;
3298
begin
3299
  Result := (Self - AOther).Length;
3300
end;
3301

3302
function TVector2.DistanceSquared(const AOther: TVector2): Single;
3303
begin
3304
  Result := (Self - AOther).LengthSquared;
3305
end;
3306

3307
class operator TVector2.Divide(const A: TVector2; const B: Single): TVector2;
3308
var
3309
  InvB: Single;
3310
begin
3311
  InvB := 1 / B;
3312
  Result.X := A.X * InvB;
3313
  Result.Y := A.Y * InvB;
3314
end;
3315

3316
class operator TVector2.Divide(const A: Single; const B: TVector2): TVector2; assembler;
3317
asm
3318
  movss  xmm0, [A]
3319
  movlps xmm1, [B]
3320
  shufps xmm0, xmm0, 0
3321
  divps  xmm0, xmm1
3322
  movlps [Result], xmm0
3323
end;
3324

3325
class operator TVector2.Divide(const A, B: TVector2): TVector2; assembler;
3326
asm
3327
  movlps xmm0, [A]
3328
  movlps xmm1, [B]
3329
  divps  xmm0, xmm1
3330
  movlps [Result], xmm0
3331
end;
3332

3333
function TVector2.Dot(const AOther: TVector2): Single;
3334
begin
3335
  Result := (X * AOther.X) + (Y * AOther.Y);
3336
end;
3337

3338
function TVector2.FaceForward(const I, NRef: TVector2): TVector2;
3339
begin
3340
  if (NRef.Dot(I) < 0) then
3341
    Result := Self
3342
  else
3343
    Result := -Self;
3344
end;
3345

3346
function TVector2.GetLength: Single;
3347
begin
3348
  Result := Sqrt((X * X) + (Y * Y));
3349
end;
3350

3351
function TVector2.GetLengthSquared: Single;
3352
begin
3353
  Result := (X * X) + (Y * Y);
3354
end;
3355

3356
class operator TVector2.Multiply(const A: TVector2; const B: Single): TVector2;
3357
begin
3358
  Result.X := A.X * B;
3359
  Result.Y := A.Y * B;
3360
end;
3361

3362
class operator TVector2.Multiply(const A: Single; const B: TVector2): TVector2;
3363
begin
3364
  Result.X := A * B.X;
3365
  Result.Y := A * B.Y;
3366
end;
3367

3368
class operator TVector2.Multiply(const A, B: TVector2): TVector2;
3369
begin
3370
  Result.X := A.X * B.X;
3371
  Result.Y := A.Y * B.Y;
3372
end;
3373

3374
function TVector2.NormalizeFast: TVector2; assembler;
3375
asm
3376
  movlps  xmm0, [Self]    // Y X
3377
  movaps  xmm2, xmm0
3378
  mulps   xmm0, xmm0      // Y*Y X*X
3379
  pshufd  xmm1, xmm0, $01 // X*X Y*Y
3380
  addps   xmm0, xmm1      // (X*X+Y*Y) (2x)
3381
  rsqrtps xmm0, xmm0      // (1 / Sqrt(X*X + Y*Y)) (4x)
3382
  mulps   xmm0, xmm2      // A * (1 / Sqrt(Dot(A, A)))
3383
  movlps  [Result], xmm0
3384
end;
3385
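
{ Note: the NormalizeFast / SetNormalizedFast routines use the rsqrtps
  approximation instruction, which is accurate to roughly 12 bits of
  mantissa. When full precision is required, an exact normalization can be
  written in plain Pascal like this (illustrative sketch only, not part of
  this unit):

function NormalizeExactRef(const A: TVector2): TVector2;
var
  InvLen: Single;
begin
  // Exact reciprocal length via a full-precision square root
  InvLen := 1 / Sqrt((A.X * A.X) + (A.Y * A.Y));
  Result.X := A.X * InvLen;
  Result.Y := A.Y * InvLen;
end;
}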

3386
function TVector2.Reflect(const N: TVector2): TVector2;
3387
begin
3388
  Result := Self - ((2 * N.Dot(Self)) * N);
3389
end;
3390

3391
function TVector2.Refract(const N: TVector2; const Eta: Single): TVector2;
3392
var
3393
  D, K: Single;
3394
begin
3395
  D := N.Dot(Self);
3396
  K := 1 - Eta * Eta * (1 - D * D);
3397
  if (K < 0) then
3398
    Result.Init
3399
  else
3400
    Result := (Eta * Self) - ((Eta * D + Sqrt(K)) * N);
3401
end;
3402

3403
procedure TVector2.SetNormalizedFast; assembler;
3404
asm
3405
  movlps  xmm0, [Self]    // Y X
3406
  movaps  xmm2, xmm0
3407
  mulps   xmm0, xmm0      // Y*Y X*X
3408
  pshufd  xmm1, xmm0, $01 // X*X Y*Y
3409
  addps   xmm0, xmm1      // (X*X+Y*Y) (2x)
3410
  rsqrtps xmm0, xmm0      // (1 / Sqrt(X*X + Y*Y)) (4x)
3411
  mulps   xmm0, xmm2      // A * (1 / Sqrt(Dot(A, A)))
3412
  movlps  [Self], xmm0
3413
end;
3414

3415
class operator TVector2.Subtract(const A: TVector2; const B: Single): TVector2;
3416
begin
3417
  Result.X := A.X - B;
3418
  Result.Y := A.Y - B;
3419
end;
3420

3421
class operator TVector2.Subtract(const A: Single; const B: TVector2): TVector2;
3422
begin
3423
  Result.X := A - B.X;
3424
  Result.Y := A - B.Y;
3425
end;
3426

3427
class operator TVector2.Subtract(const A, B: TVector2): TVector2;
3428
begin
3429
  Result.X := A.X - B.X;
3430
  Result.Y := A.Y - B.Y;
3431
end;
3432

3433
{ TVector3 }
3434

3435
class operator TVector3.Add(const A: TVector3; const B: Single): TVector3; assembler;
3436
asm
3437
  movss  xmm2, [B]      // Load single floating-point value
3438
  movq   xmm0, [A]      // Load 3 floating-point values
3439
  movss  xmm1, [A+8]
3440
  shufps xmm2, xmm2, 0  // Replicate B
3441
  addps  xmm0, xmm2     // A + B
3442
  addss  xmm1, xmm2
3443
  movq   [Result], xmm0
3444
  movss  [Result+8], xmm1
3445
end;
3446

3447
class operator TVector3.Add(const A: Single; const B: TVector3): TVector3; assembler;
3448
asm
3449
  movss  xmm2, [A]
3450
  movq   xmm0, [B]
3451
  movss  xmm1, [B+8]
3452
  shufps xmm2, xmm2, 0
3453
  addps  xmm0, xmm2
3454
  addss  xmm1, xmm2
3455
  movq   [Result], xmm0
3456
  movss  [Result+8], xmm1
3457
end;
3458

3459
class operator TVector3.Add(const A, B: TVector3): TVector3;
3460
begin
3461
  Result.X := A.X + B.X;
3462
  Result.Y := A.Y + B.Y;
3463
  Result.Z := A.Z + B.Z;
3464
end;
3465
{class operator TVector3.Add(const A, B: TVector3): TVector3; assembler;
3466
asm
3467
  movq   xmm0, [A]
3468
  movss  xmm1, [A+8]
3469
  movq   xmm2, [B]
3470
  movss  xmm3, [B+8]
3471
  addps  xmm0, xmm2
3472
  addss  xmm1, xmm3
3473
  movq   [Result], xmm0
3474
  movss  [Result+8], xmm1
3475
end;}
3476

3477
function TVector3.Distance(const AOther: TVector3): Single; assembler;
3478
asm
3479
  movq    xmm0, [Self]
3480
  movss   xmm1, [Self+8]
3481
  movq    xmm2, [AOther]
3482
  movss   xmm3, [AOther+8]
3483
  movlhps xmm0, xmm1
3484
  movlhps xmm2, xmm3
3485
  subps   xmm0, xmm2 // A - B
3486

3487
  // (A - B).Length
3488
  mulps   xmm0, xmm0
3489
  pshufd  xmm1, xmm0, $0E
3490
  addps   xmm0, xmm1
3491
  pshufd  xmm1, xmm0, $01
3492
  addss   xmm0, xmm1
3493
  sqrtss  xmm0, xmm0
3494
  movss   [Result], xmm0
3495
end;
3496

3497
function TVector3.DistanceSquared(const AOther: TVector3): Single; assembler;
3498
asm
3499
  movq    xmm0, [Self]
3500
  movss   xmm1, [Self+8]
3501
  movq    xmm2, [AOther]
3502
  movss   xmm3, [AOther+8]
3503
  movlhps xmm0, xmm1
3504
  movlhps xmm2, xmm3
3505
  subps   xmm0, xmm2 // A - B
3506

3507
  // (A - B).LengthSquared
3508
  mulps   xmm0, xmm0
3509
  pshufd  xmm1, xmm0, $0E
3510
  addps   xmm0, xmm1
3511
  pshufd  xmm1, xmm0, $01
3512
  addss   xmm0, xmm1
3513
  movss   [Result], xmm0
3514
end;
3515

3516
class operator TVector3.Divide(const A: TVector3; const B: Single): TVector3;
3517
var
3518
  InvB: Single;
3519
begin
3520
  InvB := 1 / B;
3521
  Result.X := A.X * InvB;
3522
  Result.Y := A.Y * InvB;
3523
  Result.Z := A.Z * InvB;
3524
end;
3525

3526
class operator TVector3.Divide(const A: Single; const B: TVector3): TVector3; assembler;
3527
asm
3528
  movss  xmm0, [A]
3529
  movq   xmm1, [B]
3530
  movss  xmm2, [B+8]
3531
  movss  xmm3, xmm0
3532
  shufps xmm0, xmm0, 0
3533
  divps  xmm0, xmm1
3534
  divss  xmm3, xmm2
3535
  movq   [Result], xmm0
3536
  movss  [Result+8], xmm3
3537
end;
3538

3539
class operator TVector3.Divide(const A, B: TVector3): TVector3; assembler;
3540
asm
3541
  movq   xmm0, [A]
3542
  movss  xmm1, [A+8]
3543
  movq   xmm2, [B]
3544
  movss  xmm3, [B+8]
3545
  divps  xmm0, xmm2
3546
  divss  xmm1, xmm3
3547
  movq   [Result], xmm0
3548
  movss  [Result+8], xmm1
3549
end;
3550

3551
function TVector3.Cross(const AOther: TVector3): TVector3;
3552
begin
3553
  Result.X := (Y * AOther.Z) - (AOther.Y * Z);
3554
  Result.Y := (Z * AOther.X) - (AOther.Z * X);
3555
  Result.Z := (X * AOther.Y) - (AOther.X * Y);
3556
end;
3557

3558
function TVector3.Dot(const AOther: TVector3): Single;
3559
begin
3560
  Result := (X * AOther.X) + (Y * AOther.Y) + (Z * AOther.Z);
3561
end;
3562

3563
function TVector3.FaceForward(const I, NRef: TVector3): TVector3;
3564
begin
3565
  if (NRef.Dot(I) < 0) then
3566
    Result := Self
3567
  else
3568
    Result := -Self;
3569
end;
3570

3571
function TVector3.GetLength: Single; assembler;
3572
asm
3573
  movq    xmm0, [Self]    // 0 0 Y X
3574
  movss   xmm1, [Self+8]  // 0 0 0 Z
3575
  movlhps xmm0, xmm1      // 0 Z Y X
3576
  mulps   xmm0, xmm0      //  0  Z*Z Y*Y X*X
3577
  pshufd  xmm1, xmm0, $0E // Y*Y X*X  0  Z*Z
3578
  addps   xmm0, xmm1      //     #         #     (Y*Y)     (X*X+Z*Z)
3579
  pshufd  xmm1, xmm0, $01 // (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y)
3580
  addss   xmm0, xmm1      // (X*X + Y*Y + Z*Z)
3581
  sqrtss  xmm0, xmm0      // Sqrt(X*X + Y*Y + Z*Z)
3582
  movss   [Result], xmm0
3583
end;
3584

3585
function TVector3.GetLengthSquared: Single;
3586
begin
3587
  Result := (X * X) + (Y * Y) + (Z * Z);
3588
end;
3589
{function TVector3.GetLengthSquared: Single; assembler;
3590
asm
3591
  movq    xmm0, [Self]    // 0 0 Y X
3592
  movss   xmm1, [Self+8]  // 0 0 0 Z
3593
  movlhps xmm0, xmm1      // 0 Z Y X
3594
  mulps   xmm0, xmm0      //  0  Z*Z Y*Y X*X
3595
  pshufd  xmm1, xmm0, $0E // Y*Y X*X  0  Z*Z
3596
  addps   xmm0, xmm1      //     #         #     (Y*Y)     (X*X+Z*Z)
3597
  pshufd  xmm1, xmm0, $01 // (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y)
3598
  addss   xmm0, xmm1      // (X*X + Y*Y + Z*Z)
3599
  movss   [Result], xmm0
3600
end;}
3601

3602
class operator TVector3.Multiply(const A: TVector3; const B: Single): TVector3; assembler;
3603
asm
3604
  movss  xmm2, [B]
3605
  movq   xmm0, [A]
3606
  movss  xmm1, [A+8]
3607
  shufps xmm2, xmm2, 0
3608
  mulps  xmm0, xmm2
3609
  mulss  xmm1, xmm2
3610
  movq   [Result], xmm0
3611
  movss  [Result+8], xmm1
3612
end;
3613

3614
class operator TVector3.Multiply(const A: Single; const B: TVector3): TVector3; assembler;
3615
asm
3616
  movss  xmm2, [A]
3617
  movq   xmm0, [B]
3618
  movss  xmm1, [B+8]
3619
  shufps xmm2, xmm2, 0
3620
  mulps  xmm0, xmm2
3621
  mulss  xmm1, xmm2
3622
  movq   [Result], xmm0
3623
  movss  [Result+8], xmm1
3624
end;
3625

3626
class operator TVector3.Multiply(const A, B: TVector3): TVector3;
3627
begin
3628
  Result.X := A.X * B.X;
3629
  Result.Y := A.Y * B.Y;
3630
  Result.Z := A.Z * B.Z;
3631
end;
3632
{class operator TVector3.Multiply(const A, B: TVector3): TVector3; assembler;
3633
asm
3634
  movq   xmm0, [A]
3635
  movss  xmm1, [A+8]
3636
  movq   xmm2, [B]
3637
  movss  xmm3, [B+8]
3638
  mulps  xmm0, xmm2
3639
  mulss  xmm1, xmm3
3640
  movq   [Result], xmm0
3641
  movss  [Result+8], xmm1
3642
end;}
3643

3644
class operator TVector3.Negative(const A: TVector3): TVector3;
3645
begin
3646
  Result.X := -A.X;
3647
  Result.Y := -A.Y;
3648
  Result.Z := -A.Z;
3649
end;
3650
{class operator TVector3.Negative(const A: TVector3): TVector3; assembler;
3651
asm
3652
  movups xmm0, [SSE_MASK_SIGN] // Load mask with 4 sign (upper) bits
3653
  movq   xmm1, [A]
3654
  movss  xmm2, [A+8]
3655
  xorps  xmm1, xmm0            // Flip sign bit
3656
  xorps  xmm2, xmm0
3657
  movq   [Result], xmm1
3658
  movss  [Result+8], xmm2
3659
end;}
3660

3661
function TVector3.NormalizeFast: TVector3; assembler;
3662
asm
3663
  movq    xmm0, [Self]    // 0 0 Y X
3664
  movss   xmm1, [Self+8]  // 0 0 0 Z
3665
  movlhps xmm0, xmm1      // 0 Z Y X
3666
  movaps  xmm2, xmm0
3667

3668
  // Dot(A, A)
3669
  mulps   xmm0, xmm0      //  0  Z*Z Y*Y X*X
3670
  pshufd  xmm1, xmm0, $4E // Y*Y X*X  0  Z*Z
3671
  addps   xmm0, xmm1      //   (Y*Y) (X*X+Z*Z) (Y*Y) (X*X+Z*Z)
3672
  pshufd  xmm1, xmm0, $11 // (X*X+Z*Z) (Y*Y) (X*X+Z*Z) (Y*Y)
3673
  addps   xmm0, xmm1      // (X*X + Y*Y + Z*Z) (4x)
3674

3675
  rsqrtps xmm0, xmm0      // (1 / Sqrt(X*X + Y*Y + Z*Z)) (4x)
3676
  mulps   xmm0, xmm2      // A * (1 / Sqrt(Dot(A, A)))
3677
  movhlps xmm1, xmm0
3678
  movq    [Result], xmm0
3679
  movss   [Result+8], xmm1
3680
end;
3681

3682
function TVector3.Reflect(const N: TVector3): TVector3; assembler;
3683
asm
3684
  movq     xmm0, [Self]
3685
  movss    xmm2, [Self+8]
3686
  movq     xmm1, [N]
3687
  movss    xmm3, [N+8]
3688
  movlhps  xmm0, xmm2
3689
  movlhps  xmm1, xmm3
3690
  movaps   xmm2, xmm0
3691
  movups   xmm3, [SSE_TWO]
3692

3693
  // Dot(N, I)
3694
  mulps    xmm0, xmm1
3695
  mulps    xmm3, xmm1 // N * 2
3696
  pshufd   xmm1, xmm0, $4E
3697
  addps    xmm0, xmm1
3698
  pshufd   xmm1, xmm0, $11
3699
  addps    xmm0, xmm1
3700

3701
  // (2 * Dot(N, I)) * N
3702
  mulps    xmm0, xmm3
3703

3704
  // I - ((2 * Dot(N, I)) * N)
3705
  subps    xmm2, xmm0
3706
  movhlps  xmm3, xmm2
3707
  movq     [Result], xmm2
3708
  movss    [Result+8], xmm3
3709
end;
3710

3711
function TVector3.Refract(const N: TVector3; const Eta: Single): TVector3; assembler;
3712
asm
3713
  movq     xmm0, [Self]
3714
  movss    xmm2, [Self+8]
3715
  movq     xmm1, [N]
3716
  movss    xmm3, [N+8]
3717
  movlhps  xmm0, xmm2
3718
  movlhps  xmm1, xmm3
3719
  movups   xmm7, xmm0
3720
  movss    xmm2, [Eta]
3721
  movss    xmm3, DWORD [SSE_ONE]
3722

3723
  // D := Dot(N, I)
3724
  mulps    xmm0, xmm1
3725
  movss    xmm4, xmm3 // 1
3726
  pshufd   xmm1, xmm0, $4E
3727
  movss    xmm5, xmm2 // Eta
3728
  addps    xmm0, xmm1
3729
  mulss    xmm5, xmm5 // Eta * Eta
3730
  pshufd   xmm1, xmm0, $11
3731
  addss    xmm0, xmm1
3732

3733
  // K := 1 - Eta * Eta * (1 - D * D)
3734
  movss    xmm6, xmm0  // D
3735
  mulss    xmm0, xmm0  // D * D
3736
  subss    xmm4, xmm0  // 1 - D * D
3737
  mulss    xmm4, xmm5  // Eta * Eta * (1 - D * D)
3738
  xorps    xmm5, xmm5  // 0
3739
  subss    xmm3, xmm4  // K := 1 - Eta * Eta * (1 - D * D)
3740

3741
  // if (K < 0) then
3742
  comiss   xmm3, xmm5
3743

3744
  jb       @KLessThanZero
3745

3746
  // K >= 0
3747
  mulss    xmm6, xmm2    // Eta * D
3748
  shufps   xmm2, xmm2, 0 // Replicate Eta (4x)
3749
  mulps    xmm7, xmm2    // Eta * I
3750
  sqrtss   xmm3, xmm3    // Sqrt(K)
3751
  addss    xmm6, xmm3    // Eta * D + Sqrt(K)
3752
  shufps   xmm6, xmm6, 0 // Replicate Eta * D + Sqrt(K) (4x)
3753
  movups   xmm1, [N]
3754
  mulps    xmm6, xmm1    // ((Eta * D + Sqrt(K)) * N)
3755
  subps    xmm7, xmm6    // (Eta * I) - ((Eta * D + Sqrt(K)) * N)
3756
  movhlps  xmm0, xmm7
3757
  movq     [Result], xmm7
3758
  movss    [Result+8], xmm0
3759
  jmp      @Finish
3760

3761
@KLessThanZero:
3762
  // K < 0: Result := Vector3(0, 0, 0)
3763
  movlhps  xmm6, xmm5
3764
  movq     [Result], xmm5
3765
  movss    [Result+8], xmm6
3766

3767
@Finish:
3768
end;
3769
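
{ For reference: the SSE code above implements the same refraction formula as
  the plain Pascal TVector2.Refract earlier in this file, extended to three
  components (illustrative sketch only; it assumes TVector3.Init clears the
  vector, as TVector2.Init does above):

function RefractRef(const I, N: TVector3; const Eta: Single): TVector3;
var
  D, K: Single;
begin
  D := N.Dot(I);
  K := 1 - Eta * Eta * (1 - D * D);
  if (K < 0) then
    Result.Init // total internal reflection: zero vector
  else
    Result := (Eta * I) - ((Eta * D + Sqrt(K)) * N);
end;
}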

3770
procedure TVector3.SetNormalizedFast; assembler;
3771
asm
3772
  movq    xmm0, [Self]    // 0 0 Y X
3773
  movss   xmm1, [Self+8]  // 0 0 0 Z
3774
  movlhps xmm0, xmm1      // 0 Z Y X
3775
  movaps  xmm2, xmm0
3776

3777
  // Dot(A, A)
3778
  mulps   xmm0, xmm0      //  0  Z*Z Y*Y X*X
3779
  pshufd  xmm1, xmm0, $4E // Y*Y X*X  0  Z*Z
3780
  addps   xmm0, xmm1      //   (Y*Y) (X*X+Z*Z) (Y*Y) (X*X+Z*Z)
3781
  pshufd  xmm1, xmm0, $11 // (X*X+Z*Z) (Y*Y) (X*X+Z*Z) (Y*Y)
3782
  addps   xmm0, xmm1      // (X*X + Y*Y + Z*Z) (4x)
3783

3784
  rsqrtps xmm0, xmm0      // (1 / Sqrt(X*X + Y*Y + Z*Z)) (4x)
3785
  mulps   xmm0, xmm2      // A * (1 / Sqrt(Dot(A, A)))
3786
  movhlps xmm1, xmm0
3787
  movq    [Self], xmm0
3788
  movss   [Self+8], xmm1
3789
end;
3790

3791
class operator TVector3.Subtract(const A: TVector3; const B: Single): TVector3; assembler;
3792
asm
3793
  movss  xmm2, [B]
3794
  movq   xmm0, [A]
3795
  movss  xmm1, [A+8]
3796
  shufps xmm2, xmm2, 0
3797
  subps  xmm0, xmm2
3798
  subss  xmm1, xmm2
3799
  movq   [Result], xmm0
3800
  movss  [Result+8], xmm1
3801
end;
3802

3803
class operator TVector3.Subtract(const A: Single; const B: TVector3): TVector3; assembler;
3804
asm
3805
  movss  xmm0, [A]
3806
  movq   xmm1, [B]
3807
  movss  xmm2, [B+8]
3808
  movss  xmm3, xmm0
3809
  shufps xmm0, xmm0, 0
3810
  subps  xmm0, xmm1
3811
  subss  xmm3, xmm2
3812
  movq   [Result], xmm0
3813
  movss  [Result+8], xmm3
3814
end;
3815

3816
class operator TVector3.Subtract(const A, B: TVector3): TVector3;
3817
begin
3818
  Result.X := A.X - B.X;
3819
  Result.Y := A.Y - B.Y;
3820
  Result.Z := A.Z - B.Z;
3821
end;
3822
{class operator TVector3.Subtract(const A, B: TVector3): TVector3; assembler;
3823
asm
3824
  movq   xmm0, [A]
3825
  movss  xmm1, [A+8]
3826
  movq   xmm2, [B]
3827
  movss  xmm3, [B+8]
3828
  subps  xmm0, xmm2
3829
  subss  xmm1, xmm3
3830
  movq   [Result], xmm0
3831
  movss  [Result+8], xmm1
3832
end;}
3833

3834
{ TVector4 }
3835

3836
class operator TVector4.Add(const A: TVector4; const B: Single): TVector4; assembler;
3837
asm
3838
  movss  xmm1, [B]      // Load single floating-point value
3839
  movups xmm0, [A]      // Load 4 floating-point values
3840
  shufps xmm1, xmm1, 0  // Replicate B
3841
  addps  xmm0, xmm1     // A + B
3842
  movups [Result], xmm0 // Store result
3843
end;
3844

3845
class operator TVector4.Add(const A: Single; const B: TVector4): TVector4; assembler;
3846
asm
3847
  movss  xmm1, [A]
3848
  movups xmm0, [B]
3849
  shufps xmm1, xmm1, 0
3850
  addps  xmm0, xmm1
3851
  movups [Result], xmm0
3852
end;
3853

3854
class operator TVector4.Add(const A, B: TVector4): TVector4; assembler;
3855
asm
3856
  movups xmm0, [A]
3857
  movups xmm1, [B]
3858
  addps  xmm0, xmm1
3859
  movups [Result], xmm0
3860
end;
3861

3862
function TVector4.Distance(const AOther: TVector4): Single; assembler;
3863
asm
3864
  movups xmm0, [Self]
3865
  movups xmm1, [AOther]
3866
  subps  xmm0, xmm1 // A - B
3867

3868
  // (A - B).Length
3869
  mulps  xmm0, xmm0
3870
  pshufd xmm1, xmm0, $0E
3871
  addps  xmm0, xmm1
3872
  pshufd xmm1, xmm0, $01
3873
  addss  xmm0, xmm1
3874
  sqrtss xmm0, xmm0
3875
  movss  [Result], xmm0
3876
end;
3877

3878
function TVector4.DistanceSquared(const AOther: TVector4): Single; assembler;
3879
asm
3880
  movups xmm0, [Self]
3881
  movups xmm1, [AOther]
3882
  subps  xmm0, xmm1 // A - B
3883

3884
  // (A - B).LengthSquared
3885
  mulps  xmm0, xmm0
3886
  pshufd xmm1, xmm0, $0E
3887
  addps  xmm0, xmm1
3888
  pshufd xmm1, xmm0, $01
3889
  addss  xmm0, xmm1
3890
  movss  [Result], xmm0
3891
end;
3892

3893
class operator TVector4.Divide(const A: TVector4; const B: Single): TVector4; assembler;
3894
asm
3895
  movss  xmm1, [B]
3896
  movups xmm0, [A]
3897
  shufps xmm1, xmm1, 0
3898
  divps  xmm0, xmm1
3899
  movups [Result], xmm0
3900
end;
3901

3902
class operator TVector4.Divide(const A: Single; const B: TVector4): TVector4; assembler;
3903
asm
3904
  movss  xmm0, [A]
3905
  movups xmm1, [B]
3906
  shufps xmm0, xmm0, 0
3907
  divps  xmm0, xmm1
3908
  movups [Result], xmm0
3909
end;
3910

3911
class operator TVector4.Divide(const A, B: TVector4): TVector4; assembler;
3912
asm
3913
  movups xmm0, [A]
3914
  movups xmm1, [B]
3915
  divps  xmm0, xmm1
3916
  movups [Result], xmm0
3917
end;
3918

3919
function TVector4.Dot(const AOther: TVector4): Single;
3920
begin
3921
  Result := (X * AOther.X) + (Y * AOther.Y) + (Z * AOther.Z) + (W * AOther.W);
3922
end;
3923

3924
function TVector4.FaceForward(const I, NRef: TVector4): TVector4; assembler;
3925
asm
3926
  movups   xmm0, [Self]
3927
  movups   xmm1, [I]
3928
  movups   xmm2, [NRef]
3929
  xorps    xmm3, xmm3 // 0
3930
  movups   xmm4, [SSE_MASK_SIGN]
3931

3932
  // Dot(NRef, I)
3933
  mulps    xmm2, xmm1
3934
  pshufd   xmm1, xmm2, $4E
3935
  addps    xmm2, xmm1
3936
  pshufd   xmm1, xmm2, $11
3937
  addps    xmm2, xmm1
3938

3939
  // Dot(NRef, I) >= 0?  Yes: $FFFFFFFF, No: $00000000
3940
  cmpnltps xmm2, xmm3
3941
  andps    xmm2, xmm4 // Yes: $80000000, No: $00000000
3942

3943
  // Flip sign of N if (Dot(NRef, I) >= 0)
3944
  mov      edx, [Result]
3945
  xorps    xmm0, xmm2
3946
  movups   [edx], xmm0
3947
end;
3948
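
{ For reference: this is the same test as the plain Pascal
  TVector2.FaceForward earlier in this file; the SSE version applies the sign
  flip branch-free by XOR'ing the sign bits with the comparison mask. A
  scalar sketch (illustrative only):

function FaceForwardRef(const V, I, NRef: TVector4): TVector4;
begin
  if (NRef.Dot(I) < 0) then
    Result := V
  else
    Result := -V;
end;
}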

3949
function TVector4.GetLength: Single; assembler;
3950
asm
3951
  movups xmm0, [Self]    // W Z Y X
3952
  mulps  xmm0, xmm0      // W*W Z*Z Y*Y X*X
3953
  pshufd xmm1, xmm0, $0E // Y*Y X*X W*W Z*Z
3954
  addps  xmm0, xmm1      //     #         #     (Y*Y+W*W) (X*X+Z*Z)
3955
  pshufd xmm1, xmm0, $01 // (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y+W*W)
3956
  addss  xmm0, xmm1      // (X*X + Y*Y + Z*Z + W*W)
3957
  sqrtss xmm0, xmm0      // Sqrt(X*X + Y*Y + Z*Z + W*W)
3958
  movss  [Result], xmm0
3959
end;
3960

3961
function TVector4.GetLengthSquared: Single;
3962
begin
3963
  Result := (X * X) + (Y * Y) + (Z * Z) + (W * W);
3964
end;
3965
{function TVector4.GetLengthSquared: Single; assembler;
3966
asm
3967
  movups xmm0, [Self]    // W Z Y X
3968
  mulps  xmm0, xmm0      // W*W Z*Z Y*Y X*X
3969
  pshufd xmm1, xmm0, $0E // Y*Y X*X W*W Z*Z
3970
  addps  xmm0, xmm1      //     #         #     (Y*Y+W*W) (X*X+Z*Z)
3971
  pshufd xmm1, xmm0, $01 // (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y+W*W)
3972
  addss  xmm0, xmm1      // (X*X + Y*Y + Z*Z + W*W)
3973
  movss  [Result], xmm0
3974
end;}
3975

3976
class operator TVector4.Multiply(const A: TVector4; const B: Single): TVector4; assembler;
3977
asm
3978
  movss  xmm1, [B]
3979
  movups xmm0, [A]
3980
  shufps xmm1, xmm1, 0
3981
  mulps  xmm0, xmm1
3982
  movups [Result], xmm0
3983
end;
3984

3985
class operator TVector4.Multiply(const A: Single; const B: TVector4): TVector4; assembler;
3986
asm
3987
  movss  xmm0, [A]
3988
  movups xmm1, [B]
3989
  shufps xmm0, xmm0, 0
3990
  mulps  xmm0, xmm1
3991
  movups [Result], xmm0
3992
end;
3993

3994
class operator TVector4.Multiply(const A, B: TVector4): TVector4; assembler;
3995
asm
3996
  movups xmm0, [A]
3997
  movups xmm1, [B]
3998
  mulps  xmm0, xmm1
3999
  movups [Result], xmm0
4000
end;
4001

4002
class operator TVector4.Negative(const A: TVector4): TVector4; assembler;
4003
asm
4004
  movups xmm0, [SSE_MASK_SIGN] // Load mask with 4 sign (upper) bits
4005
  movups xmm1, [A]
4006
  xorps  xmm0, xmm1            // Flip sign bit
4007
  movups [Result], xmm0
4008
end;
4009
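
{ The Negative operator above relies on the fact that XOR'ing a 32-bit float
  with $80000000 (the SSE_MASK_SIGN pattern) flips only its sign bit. A
  scalar sketch of the same trick (illustrative only, not part of this unit):

function NegateBySignBit(const A: Single): Single;
var
  Bits: UInt32;
begin
  Bits := PUInt32(@A)^ xor $80000000; // flip only the sign bit
  Result := PSingle(@Bits)^;          // reinterpret the bits as a Single
end;
}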

4010
function TVector4.NormalizeFast: TVector4;
4011
asm
4012
  movups  xmm0, [Self]    // W Z Y X
4013
  movaps  xmm2, xmm0
4014

4015
  // Dot(A, A)
4016
  mulps   xmm0, xmm0      // W*W Z*Z Y*Y X*X
4017
  pshufd  xmm1, xmm0, $4E // Y*Y X*X W*W Z*Z
4018
  addps   xmm0, xmm1      // (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z)
4019
  pshufd  xmm1, xmm0, $11 // (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W)
4020
  addps   xmm0, xmm1      // (X*X + Y*Y + Z*Z + W*W) (4x)
4021

4022
  rsqrtps xmm0, xmm0      // (1 / Sqrt(X*X + Y*Y + Z*Z + W*W)) (4x)
4023
  mulps   xmm0, xmm2      // A * (1 / Sqrt(Dot(A, A)))
4024
  movups  [Result], xmm0
4025
end;
4026

4027
function TVector4.Reflect(const N: TVector4): TVector4; assembler;
4028
asm
4029
  movups   xmm0, [Self]
4030
  movups   xmm1, [N]
4031
  movaps   xmm2, xmm0
4032
  movups   xmm3, [SSE_TWO]
4033

4034
  // Dot(N, I)
4035
  mulps    xmm0, xmm1
4036
  mulps    xmm3, xmm1 // N * 2
4037
  pshufd   xmm1, xmm0, $4E
4038
  addps    xmm0, xmm1
4039
  pshufd   xmm1, xmm0, $11
4040
  addps    xmm0, xmm1
4041

4042
  // (2 * Dot(N, I)) * N
4043
  mulps    xmm0, xmm3
4044

4045
  // I - ((2 * Dot(N, I)) * N)
4046
  subps    xmm2, xmm0
4047
  movups   [Result], xmm2
4048
end;
4049

4050
function TVector4.Refract(const N: TVector4; const Eta: Single): TVector4; assembler;
4051
asm
4052
  movups   xmm0, [Self]
4053
  movups   xmm1, [N]
4054
  movups   xmm7, xmm0
4055
  movss    xmm2, [Eta]
4056
  movss    xmm3, DWORD [SSE_ONE]
4057

4058
  // D := Dot(N, I)
4059
  mulps    xmm0, xmm1
4060
  movss    xmm4, xmm3 // 1
4061
  pshufd   xmm1, xmm0, $4E
4062
  movss    xmm5, xmm2 // Eta
4063
  addps    xmm0, xmm1
4064
  mulss    xmm5, xmm5 // Eta * Eta
4065
  pshufd   xmm1, xmm0, $11
4066
  addss    xmm0, xmm1
4067

4068
  // K := 1 - Eta * Eta * (1 - D * D)
4069
  movss    xmm6, xmm0  // D
4070
  mulss    xmm0, xmm0  // D * D
4071
  subss    xmm4, xmm0  // 1 - D * D
4072
  mulss    xmm4, xmm5  // Eta * Eta * (1 - D * D)
4073
  xorps    xmm5, xmm5  // 0
4074
  subss    xmm3, xmm4  // K := 1 - Eta * Eta * (1 - D * D)
4075

4076
  // if (K < 0) then
4077
  comiss   xmm3, xmm5
4078

4079
  jb       @KLessThanZero
4080

4081
  // K >= 0
4082
  mulss    xmm6, xmm2    // Eta * D
4083
  shufps   xmm2, xmm2, 0 // Replicate Eta (4x)
4084
  mulps    xmm7, xmm2    // Eta * I
4085
  sqrtss   xmm3, xmm3    // Sqrt(K)
4086
  addss    xmm6, xmm3    // Eta * D + Sqrt(K)
4087
  shufps   xmm6, xmm6, 0 // Replicate Eta * D + Sqrt(K) (4x)
4088
  movups   xmm1, [N]
4089
  mulps    xmm6, xmm1    // ((Eta * D + Sqrt(K)) * N)
4090
  subps    xmm7, xmm6    // (Eta * I) - ((Eta * D + Sqrt(K)) * N)
4091
  movups   [Result], xmm7
4092
  jmp      @Finish
4093

4094
@KLessThanZero:
4095
  // K < 0: Result := Vector4(0, 0, 0, 0)
4096
  movups   [Result], xmm5
4097

4098
@Finish:
4099
end;
4100

4101
procedure TVector4.SetNormalizedFast; assembler;
4102
asm
4103
  movups  xmm0, [Self]    // W Z Y X
4104
  movaps  xmm2, xmm0
4105

4106
  // Dot(A, A)
4107
  mulps   xmm0, xmm0      // W*W Z*Z Y*Y X*X
4108
  pshufd  xmm1, xmm0, $4E // Y*Y X*X W*W Z*Z
4109
  addps   xmm0, xmm1      // (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z)
4110
  pshufd  xmm1, xmm0, $11 // (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W)
4111
  addps   xmm0, xmm1      // (X*X + Y*Y + Z*Z + W*W) (4x)
4112

4113
  rsqrtps xmm0, xmm0      // (1 / Sqrt(X*X + Y*Y + Z*Z + W*W)) (4x)
4114
  mulps   xmm0, xmm2      // A * (1 / Sqrt(Dot(A, A)))
4115
  movups  [Self], xmm0
4116
end;
4117

4118
class operator TVector4.Subtract(const A: TVector4; const B: Single): TVector4; assembler;
4119
asm
4120
  movss  xmm1, [B]
4121
  movups xmm0, [A]
4122
  shufps xmm1, xmm1, 0
4123
  subps  xmm0, xmm1
4124
  movups [Result], xmm0
4125
end;
4126

4127
class operator TVector4.Subtract(const A: Single; const B: TVector4): TVector4; assembler;
4128
asm
4129
  movss  xmm0, [A]
4130
  movups xmm1, [B]
4131
  shufps xmm0, xmm0, 0
4132
  subps  xmm0, xmm1
4133
  movups [Result], xmm0
4134
end;
4135

4136
class operator TVector4.Subtract(const A, B: TVector4): TVector4; assembler;
4137
asm
4138
  movups xmm0, [A]
4139
  movups xmm1, [B]
4140
  subps  xmm0, xmm1
4141
  movups [Result], xmm0
4142
end;
4143

4144
{ TQuaternion }
4145

4146
class operator TQuaternion.Add(const A, B: TQuaternion): TQuaternion;
4147
asm
4148
  movups xmm0, [A]
4149
  movups xmm1, [B]
4150
  addps  xmm0, xmm1
4151
  movups [Result], xmm0
4152
end;
4153

4154
function TQuaternion.GetLength: Single;
4155
asm
4156
  movups xmm0, [Self]    // W Z Y X
4157
  mulps  xmm0, xmm0      // W*W Z*Z Y*Y X*X
4158
  pshufd xmm1, xmm0, $0E // Y*Y X*X W*W Z*Z
4159
  addps  xmm0, xmm1      //     #         #     (Y*Y+W*W) (X*X+Z*Z)
4160
  pshufd xmm1, xmm0, $01 // (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y+W*W)
4161
  addss  xmm0, xmm1      // (X*X + Y*Y + Z*Z + W*W)
4162
  sqrtss xmm0, xmm0      // Sqrt(X*X + Y*Y + Z*Z + W*W)
4163
  movss  [Result], xmm0
4164
end;
4165

4166
function TQuaternion.GetLengthSquared: Single;
4167
asm
4168
  movups xmm0, [Self]    // W Z Y X
4169
  mulps  xmm0, xmm0      // W*W Z*Z Y*Y X*X
4170
  pshufd xmm1, xmm0, $0E // Y*Y X*X W*W Z*Z
4171
  addps  xmm0, xmm1      //     #         #     (Y*Y+W*W) (X*X+Z*Z)
4172
  pshufd xmm1, xmm0, $01 // (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y+W*W)
4173
  addss  xmm0, xmm1      // (X*X + Y*Y + Z*Z + W*W)
4174
  movss  [Result], xmm0
4175
end;
4176

4177
class operator TQuaternion.Multiply(const A: TQuaternion; const B: Single): TQuaternion;
4178
asm
4179
  movss  xmm1, [B]
4180
  movups xmm0, [A]
4181
  shufps xmm1, xmm1, 0
4182
  mulps  xmm0, xmm1
4183
  movups [Result], xmm0
4184
end;
4185

4186
class operator TQuaternion.Multiply(const A: Single; const B: TQuaternion): TQuaternion;
4187
asm
4188
  movss  xmm0, [A]
4189
  movups xmm1, [B]
4190
  shufps xmm0, xmm0, 0
4191
  mulps  xmm0, xmm1
4192
  movups [Result], xmm0
4193
end;
4194

4195
class operator TQuaternion.Multiply(const A, B: TQuaternion): TQuaternion;
4196
begin
4197
  Result.X := (A.W * B.X) + (A.X * B.W) + (A.Y * B.Z) - (A.Z * B.Y);
4198
  Result.Y := (A.W * B.Y) + (A.Y * B.W) + (A.Z * B.X) - (A.X * B.Z);
4199
  Result.Z := (A.W * B.Z) + (A.Z * B.W) + (A.X * B.Y) - (A.Y * B.X);
4200
  Result.W := (A.W * B.W) - (A.X * B.X) - (A.Y * B.Y) - (A.Z * B.Z);
4201
end;
4202
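
{ Note: this is the Hamilton product, which is not commutative, so the order
  of the operands matters. A small usage sketch (illustrative only; with the
  usual convention of rotating a vector V as Q * V * Conjugate(Q), applying
  Q1 first and then Q2 corresponds to the product Q2 * Q1):

procedure CombineRotations(const Q1, Q2: TQuaternion; out Combined: TQuaternion);
begin
  Combined := Q2 * Q1;        // Q1 is applied first, then Q2
  Combined.SetNormalizedFast; // counter accumulated rounding error
end;
}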

4203
function TQuaternion.NormalizeFast: TQuaternion;
4204
asm
4205
  movups  xmm0, [Self]    // W Z Y X
4206
  movaps  xmm2, xmm0
4207

4208
  // Dot(A, A)
4209
  mulps   xmm0, xmm0      // W*W Z*Z Y*Y X*X
4210
  pshufd  xmm1, xmm0, $4E // Y*Y X*X W*W Z*Z
4211
  addps   xmm0, xmm1      // (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z)
4212
  pshufd  xmm1, xmm0, $11 // (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W)
4213
  addps   xmm0, xmm1      // (X*X + Y*Y + Z*Z + W*W) (4x)
4214

4215
  rsqrtps xmm0, xmm0      // (1 / Sqrt(X*X + Y*Y + Z*Z + W*W)) (4x)
4216
  mulps   xmm0, xmm2      // A * (1 / Sqrt(Dot(A, A)))
4217
  movups  [Result], xmm0
4218
end;
4219

4220
procedure TQuaternion.SetNormalizedFast;
4221
asm
4222
  movups  xmm0, [Self]    // W Z Y X
4223
  movaps  xmm2, xmm0
4224

4225
  // Dot(A, A)
4226
  mulps   xmm0, xmm0      // W*W Z*Z Y*Y X*X
4227
  pshufd  xmm1, xmm0, $4E // Y*Y X*X W*W Z*Z
4228
  addps   xmm0, xmm1      // (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z)
4229
  pshufd  xmm1, xmm0, $11 // (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W)
4230
  addps   xmm0, xmm1      // (X*X + Y*Y + Z*Z + W*W) (4x)
4231

4232
  rsqrtps xmm0, xmm0      // (1 / Sqrt(X*X + Y*Y + Z*Z + W*W)) (4x)
4233
  mulps   xmm0, xmm2      // A * (1 / Sqrt(Dot(A, A)))
4234
  movups  [Self], xmm0
4235
end;
4236

4237
{ TMatrix2 }
4238

4239
class operator TMatrix2.Add(const A: TMatrix2; const B: Single): TMatrix2; assembler;
4240
asm
4241
  movss  xmm0, [B]              // Load single floating-point value
4242
  movups xmm1, [A]              // Load matrix
4243
  shufps xmm0, xmm0, 0          // Replicate B
4244
  addps  xmm1, xmm0             // Add B
4245
  movups [Result], xmm1
4246
end;
4247

4248
class operator TMatrix2.Add(const A: Single; const B: TMatrix2): TMatrix2; assembler;
4249
asm
4250
  movss  xmm0, [A]              // Load single floating-point value
4251
  movups xmm1, [B]              // Load matrix
4252
  shufps xmm0, xmm0, 0          // Replicate A
4253
  addps  xmm1, xmm0             // Add A
4254
  movups [Result], xmm1
4255
end;
4256

4257
class operator TMatrix2.Add(const A, B: TMatrix2): TMatrix2; assembler;
4258
asm
4259
  movups xmm0, [A]   // Load A
4260
  movups xmm1, [B]   // Load B
4261
  addps  xmm0, xmm1  // Add
4262
  movups [Result], xmm0
4263
end;
4264

4265
function TMatrix2.CompMult(const AOther: TMatrix2): TMatrix2; assembler;
4266
asm
4267
  movups xmm0, [Self]
4268
  movups xmm1, [AOther]
4269

4270
  // Component-wise multiplication
4271
  mulps  xmm0, xmm1
4272

4273
  // Store result
4274
  movups [Result], xmm0
4275
end;
4276

4277
class operator TMatrix2.Divide(const A: TMatrix2; const B: Single): TMatrix2; assembler;
4278
asm
4279
  movss  xmm0, [B]              // Load single floating-point value
4280
  movups xmm1, [A]              // Load matrix
4281
  shufps xmm0, xmm0, 0          // Replicate B
4282
  divps  xmm1, xmm0             // Divide each element by B
4283
  movups [Result], xmm1
4284
end;
4285

4286
class operator TMatrix2.Divide(const A: Single; const B: TMatrix2): TMatrix2; assembler;
4287
asm
4288
  movss  xmm0, [A]              // Load single floating-point value
4289
  movups xmm1, [B]              // Load matrix
4290
  shufps xmm0, xmm0, 0          // Replicate A
4291
  divps  xmm0, xmm1             // Divide A by each element of B
4292
  movups [Result], xmm0
4293
end;
4294

4295
class operator TMatrix2.Multiply(const A: TMatrix2; const B: Single): TMatrix2; assembler;
4296
asm
4297
  movss  xmm0, [B]              // Load single floating-point value
4298
  movups xmm1, [A]              // Load matrix
4299
  shufps xmm0, xmm0, 0          // Replicate B
4300
  mulps  xmm1, xmm0             // Multiply
4301
  movups [Result], xmm1
4302
end;
4303

4304
class operator TMatrix2.Multiply(const A: Single; const B: TMatrix2): TMatrix2; assembler;
4305
asm
4306
  movss  xmm0, [A]              // Load single floating-point value
4307
  movups xmm1, [B]              // Load matrix
4308
  shufps xmm0, xmm0, 0          // Replicate A
4309
  mulps  xmm1, xmm0             // Multiply
4310
  movups [Result], xmm1
4311
end;
4312

4313
class operator TMatrix2.Multiply(const A: TVector2; const B: TMatrix2): TVector2;
4314
begin
4315
  Result.X := (A.X * B.M[0,0]) + (A.Y * B.M[0,1]);
4316
  Result.Y := (A.X * B.M[1,0]) + (A.Y * B.M[1,1]);
4317
end;
4318

4319
class operator TMatrix2.Multiply(const A: TMatrix2; const B: TVector2): TVector2;
4320
begin
4321
  Result.X := (A.M[0,0] * B.X) + (A.M[1,0] * B.Y);
4322
  Result.Y := (A.M[0,1] * B.X) + (A.M[1,1] * B.Y);
4323
end;
4324
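
{ Note: as the two overloads above show, multiplying a vector on the left
  (V * M) and on the right (M * V) use the matrix elements transposed with
  respect to each other, so the two forms are related through Transpose.
  A small check sketch (illustrative only; the name CheckVectorMatrixOrder
  is not part of this unit):

procedure CheckVectorMatrixOrder(const V: TVector2; const M: TMatrix2);
var
  L, R: TVector2;
begin
  L := V * M;           // vector on the left
  R := M.Transpose * V; // same result with the matrix transposed
  Assert(Abs(L.X - R.X) < 1E-5);
  Assert(Abs(L.Y - R.Y) < 1E-5);
end;
}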

4325
class operator TMatrix2.Multiply(const A, B: TMatrix2): TMatrix2;
4326
begin
4327
  Result.M[0,0] := (A.M[0,0] * B.M[0,0]) + (A.M[1,0] * B.M[0,1]);
4328
  Result.M[0,1] := (A.M[0,1] * B.M[0,0]) + (A.M[1,1] * B.M[0,1]);
4329
  Result.M[1,0] := (A.M[0,0] * B.M[1,0]) + (A.M[1,0] * B.M[1,1]);
4330
  Result.M[1,1] := (A.M[0,1] * B.M[1,0]) + (A.M[1,1] * B.M[1,1]);
4331
end;
4332

4333
class operator TMatrix2.Negative(const A: TMatrix2): TMatrix2; assembler;
4334
asm
4335
  movups xmm0, [SSE_MASK_SIGN]  // Load mask with 4 sign (upper) bits
4336
  movups xmm1, [A]              // Load matrix
4337
  xorps  xmm1, xmm0             // Flip sign bits
4338
  movups [Result], xmm1
4339
end;
4340

4341
procedure TMatrix2.SetTransposed;
4342
begin
4343
  Self := Transpose;
4344
end;
4345

4346
class operator TMatrix2.Subtract(const A: TMatrix2; const B: Single): TMatrix2; assembler;
4347
asm
4348
  movss  xmm0, [B]              // Load single floating-point value
4349
  movups xmm1, [A]              // Load matrix
4350
  shufps xmm0, xmm0, 0          // Replicate B
4351
  subps  xmm1, xmm0             // Subtract B
4352
  movups [Result], xmm1
4353
end;
4354

4355
class operator TMatrix2.Subtract(const A: Single; const B: TMatrix2): TMatrix2; assembler;
4356
asm
4357
  movss  xmm0, [A]              // Load single floating-point value
4358
  movups xmm1, [B]              // Load matrix
4359
  shufps xmm0, xmm0, 0          // Replicate A
4360
  subps  xmm0, xmm1             // Subtract each element of B from A
4361
  movups [Result], xmm0
4362
end;
4363

4364
class operator TMatrix2.Subtract(const A, B: TMatrix2): TMatrix2; assembler;
4365
asm
4366
  movups xmm0, [A]   // Load A
4367
  movups xmm1, [B]   // Load B
4368
  subps  xmm0, xmm1  // Subtract
4369
  movups [Result], xmm0
4370
end;
4371

4372
function TMatrix2.Transpose: TMatrix2;
4373
begin
4374
  Result.M[0,0] := M[0,0];
4375
  Result.M[0,1] := M[1,0];
4376

4377
  Result.M[1,0] := M[0,1];
4378
  Result.M[1,1] := M[1,1];
4379
end;
4380

4381
{ TMatrix3 }
4382

4383
class operator TMatrix3.Add(const A: TMatrix3; const B: Single): TMatrix3; assembler;
4384
asm
4385
  movss  xmm0, [B]              // Load single floating-point value
4386
  movups xmm1, DQWORD [A + $00] // Load 3 rows
4387
  shufps xmm0, xmm0, 0          // Replicate B
4388
  movups xmm2, DQWORD [A + $10]
4389
  movss  xmm3, DWORD [A + $20]
4390
  addps  xmm1, xmm0             // Add B to each row
4391
  addps  xmm2, xmm0
4392
  addss  xmm3, xmm0
4393
  movups DQWORD [Result + $00], xmm1
4394
  movups DQWORD [Result + $10], xmm2
4395
  movss  DWORD [Result + $20], xmm3
4396
end;
4397

4398
class operator TMatrix3.Add(const A: Single; const B: TMatrix3): TMatrix3; assembler;
4399
asm
4400
  movss  xmm0, [A]              // Load single floating-point value
4401
  movups xmm1, DQWORD [B + $00] // Load 3 rows
4402
  shufps xmm0, xmm0, 0          // Replicate A
4403
  movups xmm2, DQWORD [B + $10]
4404
  movss  xmm3, DWORD [B + $20]
4405
  addps  xmm1, xmm0             // Add A to each row
4406
  addps  xmm2, xmm0
4407
  addss  xmm3, xmm0
4408
  movups DQWORD [Result + $00], xmm1
4409
  movups DQWORD [Result + $10], xmm2
4410
  movss  DWORD [Result + $20], xmm3
4411
end;
4412

4413
class operator TMatrix3.Add(const A, B: TMatrix3): TMatrix3; assembler;
4414
asm
4415
  movups xmm0, DQWORD [A + $00] // Load 3 rows of A
4416
  movups xmm1, DQWORD [A + $10]
4417
  movss  xmm2, DWORD [A + $20]
4418
  movups xmm4, DQWORD [B + $00] // Load 3 rows of B
4419
  movups xmm5, DQWORD [B + $10]
4420
  movss  xmm6, DWORD [B + $20]
4421
  addps  xmm0, xmm4             // Add rows
4422
  addps  xmm1, xmm5
4423
  addss  xmm2, xmm6
4424
  movups DQWORD [Result + $00], xmm0
4425
  movups DQWORD [Result + $10], xmm1
4426
  movss  DWORD [Result + $20], xmm2
4427
end;
4428

4429
function TMatrix3.CompMult(const AOther: TMatrix3): TMatrix3; assembler;
4430
asm
4431
  movups xmm0, DQWORD[Self + $00]   // Self[0]
4432
  movups xmm1, DQWORD[Self + $10]   // Self[1]
4433
  movss  xmm2, DWORD[Self + $20]    // Self[2]
4434
  movups xmm4, DQWORD[AOther + $00] // AOther[0]
4435
  movups xmm5, DQWORD[AOther + $10] // AOther[1]
4436
  movss  xmm6, DWORD[AOther + $20]  // AOther[2]
4437

4438
  // Component-wise multiplication
4439
  mulps  xmm0, xmm4
4440
  mulps  xmm1, xmm5
4441
  mulss  xmm2, xmm6
4442

4443
  // Store result
4444
  movups DQWORD [Result + $00], xmm0
4445
  movups DQWORD [Result + $10], xmm1
4446
  movss  DWORD [Result + $20], xmm2
4447
end;
4448

4449
class operator TMatrix3.Divide(const A: Single; const B: TMatrix3): TMatrix3; assembler;
4450
asm
4451
  movss  xmm0, [A]              // Load single floating-point value
4452
  movups xmm4, DQWORD [B + $00] // Load 3 rows
4453
  shufps xmm0, xmm0, 0          // Replicate A
4454
  movups xmm5, DQWORD [B + $10]
4455
  movaps xmm1, xmm0
4456
  movaps xmm2, xmm0
4457
  movss  xmm6, DWORD [B + $20]
4458
  divps  xmm0, xmm4             // Divide A by each row
4459
  divps  xmm1, xmm5
4460
  divss  xmm2, xmm6
4461
  movups DQWORD [Result + $00], xmm0
4462
  movups DQWORD [Result + $10], xmm1
4463
  movss  DWORD [Result + $20], xmm2
4464
end;
4465

4466
class operator TMatrix3.Divide(const A: TMatrix3; const B: Single): TMatrix3; assembler;
4467
asm
4468
  movss  xmm0, [B]              // Load single floating-point value
4469
  movups xmm1, DQWORD [A + $00] // Load 3 rows
4470
  shufps xmm0, xmm0, 0          // Replicate B
4471
  movups xmm2, DQWORD [A + $10]
4472
  movss  xmm3, DWORD [A + $20]
4473
  divps  xmm1, xmm0             // Divide each row by B
4474
  divps  xmm2, xmm0
4475
  divps  xmm3, xmm0
4476
  movups DQWORD [Result + $00], xmm1
4477
  movups DQWORD [Result + $10], xmm2
4478
  movss  DWORD [Result + $20], xmm3
4479
end;
4480

4481
class operator TMatrix3.Multiply(const A: Single; const B: TMatrix3): TMatrix3; assembler;
4482
asm
4483
  movss  xmm0, [A]              // Load single floating-point value
4484
  movups xmm1, DQWORD [B + $00] // Load 3 rows
4485
  shufps xmm0, xmm0, 0          // Replicate A
4486
  movups xmm2, DQWORD [B + $10]
4487
  movss  xmm3, DWORD [B + $20]
4488
  mulps  xmm1, xmm0             // Multiply each row by A
4489
  mulps  xmm2, xmm0
4490
  mulss  xmm3, xmm0
4491
  movups DQWORD [Result + $00], xmm1
4492
  movups DQWORD [Result + $10], xmm2
4493
  movss  DWORD [Result + $20], xmm3
4494
end;
4495

4496
class operator TMatrix3.Multiply(const A: TMatrix3; const B: Single): TMatrix3; assembler;
4497
asm
4498
  movss  xmm0, [B]              // Load single floating-point value
4499
  movups xmm1, DQWORD [A + $00] // Load 3 rows
4500
  shufps xmm0, xmm0, 0          // Replicate B
4501
  movups xmm2, DQWORD [A + $10]
4502
  movss  xmm3, DWORD [A + $20]
4503
  mulps  xmm1, xmm0             // Multiply each row by B
4504
  mulps  xmm2, xmm0
4505
  mulss  xmm3, xmm0
4506
  movups DQWORD [Result + $00], xmm1
4507
  movups DQWORD [Result + $10], xmm2
4508
  movss  DWORD [Result + $20], xmm3
4509
end;
4510

4511
{$IFDEF FM_COLUMN_MAJOR}
4512
class operator TMatrix3.Multiply(const A: TMatrix3; const B: TVector3): TVector3; assembler;
4513
asm
4514
  movq    xmm0, [B]
4515
  movss   xmm1, [B+8]
4516
  movlhps xmm0, xmm1
4517

4518
  movq    xmm4, QWORD [A + $00]
4519
  movss   xmm1, DWORD [A + $08]
4520
  movlhps xmm4, xmm1
4521

4522
  movaps  xmm1, xmm0
4523
  movaps  xmm2, xmm0
4524
  shufps  xmm0, xmm0, $00
4525
  shufps  xmm1, xmm1, $55
4526
  shufps  xmm2, xmm2, $AA
4527

4528
  movq    xmm5, QWORD [A + $0C]
4529
  movss   xmm3, DWORD [A + $14]
4530
  movlhps xmm5, xmm3
4531

4532
  movq    xmm6, QWORD [A + $18]
4533
  movss   xmm3, DWORD [A + $20]
4534
  movlhps xmm6, xmm3
4535

4536
  mulps   xmm0, xmm4
4537
  mulps   xmm1, xmm5
4538
  mulps   xmm2, xmm6
4539
  addps   xmm0, xmm1
4540
  addps   xmm0, xmm2
4541
  movhlps xmm1, xmm0
4542
  movq    [Result], xmm0
4543
  movss   [Result+8], xmm1
4544
end;
4545

4546
class operator TMatrix3.Multiply(const A: TVector3; const B: TMatrix3): TVector3; assembler;
4547
asm
4548
  movq     xmm0, [A]
4549
  movss    xmm1, [A+8]
4550
  movlhps  xmm0, xmm1
4551

4552
  movq     xmm4, QWORD [B + $00]
4553
  movss    xmm1, DWORD [B + $08]
4554
  movlhps  xmm4, xmm1
4555

4556
  movaps   xmm1, xmm0
4557
  movaps   xmm2, xmm0
4558

4559
  movq     xmm5, QWORD [B + $0C]
4560
  movss    xmm6, DWORD [B + $14]
4561
  movlhps  xmm5, xmm6
4562

4563
  movq     xmm6, QWORD [B + $18]
4564
  movss    xmm3, DWORD [B + $20]
4565
  movlhps  xmm6, xmm3
4566

4567
  mulps    xmm0, xmm4
4568
  mulps    xmm1, xmm5
4569
  mulps    xmm2, xmm6
4570
  xorps    xmm3, xmm3
4571

4572
  { Transpose xmm0-xmm2 }
4573
  movaps   xmm4, xmm2
4574
  unpcklps xmm2, xmm3
4575
  unpckhps xmm4, xmm3
4576

4577
  movaps   xmm3, xmm0
4578
  unpcklps xmm0, xmm1
4579
  unpckhps xmm3, xmm1
4580

4581
  movaps   xmm1, xmm0
4582
  unpcklpd xmm0, xmm2
4583
  unpckhpd xmm1, xmm2
4584

4585
  unpcklpd xmm3, xmm4
4586

4587
  addps    xmm0, xmm1
4588
  addps    xmm0, xmm3
4589
  movhlps  xmm1, xmm0
4590
  movq     [Result], xmm0
4591
  movss    [Result+8], xmm1
4592
end;
4593

4594
class operator TMatrix3.Multiply(const A, B: TMatrix3): TMatrix3; assembler;
4595
{ Code below consists of 3 Vector*Matrix calculations }
4596
asm
4597
  movq    xmm0, QWORD [B + $00]
4598
  movss   xmm1, DWORD [B + $08]
4599
  movlhps xmm0, xmm1
4600

4601
  movq    xmm4, QWORD [A + $00]
4602
  movss   xmm1, DWORD [A + $08]
4603
  movlhps xmm4, xmm1
4604

4605
  movaps  xmm1, xmm0
4606
  movaps  xmm2, xmm0
4607
  shufps  xmm0, xmm0, $00
4608
  shufps  xmm1, xmm1, $55
4609
  shufps  xmm2, xmm2, $AA
4610

4611
  movq    xmm5, QWORD [A + $0C]
4612
  movss   xmm3, DWORD [A + $14]
4613
  movlhps xmm5, xmm3
4614

4615
  movq    xmm6, QWORD [A + $18]
4616
  movss   xmm3, DWORD [A + $20]
4617
  movlhps xmm6, xmm3
4618

4619
  mulps   xmm0, xmm4
4620
  mulps   xmm1, xmm5
4621
  mulps   xmm2, xmm6
4622
  addps   xmm0, xmm1
4623
  addps   xmm0, xmm2
4624
  movhlps xmm1, xmm0
4625
  movq    QWORD [Result + $00], xmm0
4626
  movss   DWORD [Result + $08], xmm1
4627

4628
  movq    xmm0, QWORD [B + $0C]
4629
  movss   xmm1, DWORD [B + $14]
4630
  movlhps xmm0, xmm1
4631

4632
  movaps  xmm1, xmm0
4633
  movaps  xmm2, xmm0
4634
  shufps  xmm0, xmm0, $00
4635
  shufps  xmm1, xmm1, $55
4636
  shufps  xmm2, xmm2, $AA
4637
  mulps   xmm0, xmm4
4638
  mulps   xmm1, xmm5
4639
  mulps   xmm2, xmm6
4640
  addps   xmm0, xmm1
4641
  addps   xmm0, xmm2
4642
  movhlps xmm1, xmm0
4643
  movq    QWORD [Result + $0C], xmm0
4644
  movss   DWORD [Result + $14], xmm1
4645

4646
  movq    xmm0, QWORD [B + $18]
4647
  movss   xmm1, DWORD [B + $20]
4648
  movlhps xmm0, xmm1
4649

4650
  movaps  xmm1, xmm0
4651
  movaps  xmm2, xmm0
4652
  shufps  xmm0, xmm0, $00
4653
  shufps  xmm1, xmm1, $55
4654
  shufps  xmm2, xmm2, $AA
4655
  mulps   xmm0, xmm4
4656
  mulps   xmm1, xmm5
4657
  mulps   xmm2, xmm6
4658
  addps   xmm0, xmm1
4659
  addps   xmm0, xmm2
4660
  movhlps xmm1, xmm0
4661
  movq    QWORD [Result + $18], xmm0
4662
  movss   DWORD [Result + $20], xmm1
4663
end;
4664
{$ELSE}
4665
class operator TMatrix3.Multiply(const A: TMatrix3; const B: TVector3): TVector3; assembler;
4666
asm
4667
  movq     xmm0, [B]              // Load vector
4668
  movss    xmm1, [B+8]
4669
  movlhps  xmm0, xmm1
4670

4671
  movq     xmm4, QWORD [A + $00]  // Load 3 rows
4672
  movss    xmm1, DWORD [A + $08]
4673
  movlhps  xmm4, xmm1
4674

4675
  movaps   xmm1, xmm0
4676
  movaps   xmm2, xmm0
4677

4678
  movq     xmm5, QWORD [A + $0C]
4679
  movss    xmm6, DWORD [A + $14]
4680
  movlhps  xmm5, xmm6
4681

4682
  movq     xmm6, QWORD [A + $18]
4683
  movss    xmm3, DWORD [A + $20]
4684
  movlhps  xmm6, xmm3
4685

4686
  mulps    xmm0, xmm4             // ###, (Az * B02), (Ay * B01), (Ax * B00)
4687
  mulps    xmm1, xmm5             // ###, (Az * B12), (Ay * B11), (Ax * B10)
4688
  mulps    xmm2, xmm6             // ###, (Az * B22), (Ay * B21), (Ax * B20)
4689
  xorps    xmm3, xmm3             // 000
4690

4691
  { Transpose xmm0-xmm2 }
4692
  movaps   xmm4, xmm2
4693
  unpcklps xmm2, xmm3             // 000 B21 000 B20
4694
  unpckhps xmm4, xmm3             // 000 ### 000 B22
4695

4696
  movaps   xmm3, xmm0
4697
  unpcklps xmm0, xmm1             // B11 B01 B10 B00
4698
  unpckhps xmm3, xmm1             // ### ### B12 B02
4699

4700
  movaps   xmm1, xmm0
4701
  unpcklpd xmm0, xmm2             // 000 B20 B10 B00
4702
  unpckhpd xmm1, xmm2             // 000 B21 B11 B01
4703

4704
  unpcklpd xmm3, xmm4             // 000 B22 B12 B02
4705

4706
  addps    xmm0, xmm1             // Add rows
4707
  addps    xmm0, xmm3
4708
  movhlps  xmm1, xmm0
4709
  movq     [Result], xmm0
4710
  movss    [Result+8], xmm1
4711
end;
4712

4713
class operator TMatrix3.Multiply(const A: TVector3; const B: TMatrix3): TVector3; assembler;
4714
asm
4715
  movq    xmm0, [A]              // Load vector
4716
  movss   xmm1, [A+8]
4717
  movlhps xmm0, xmm1
4718

4719
  movq    xmm4, QWORD [B + $00]  // Load 3 rows
4720
  movss   xmm1, DWORD [B + $08]
4721
  movlhps xmm4, xmm1
4722

4723
  movaps  xmm1, xmm0
4724
  movaps  xmm2, xmm0
4725
  shufps  xmm0, xmm0, $00        // Bx Bx Bx Bx
4726
  shufps  xmm1, xmm1, $55        // By By By By
4727
  shufps  xmm2, xmm2, $AA        // Bz Bz Bz Bz
4728

4729
  movq    xmm5, QWORD [B + $0C]
4730
  movss   xmm3, DWORD [B + $14]
4731
  movlhps xmm5, xmm3
4732

4733
  movq    xmm6, QWORD [B + $18]
4734
  movss   xmm3, DWORD [B + $20]
4735
  movlhps xmm6, xmm3
4736

4737
  mulps   xmm0, xmm4             // (A00 * Bx), (A01 * Bx), (A02 * Bx), #
4738
  mulps   xmm1, xmm5             // (A10 * By), (A11 * By), (A12 * By), #
4739
  mulps   xmm2, xmm6             // (A20 * Bz), (A21 * Bz), (A22 * Bz), #
4740
  addps   xmm0, xmm1             // Add rows
4741
  addps   xmm0, xmm2
4742
  movhlps xmm1, xmm0
4743
  movq    [Result], xmm0
4744
  movss   [Result+8], xmm1
4745
end;
4746

4747
class operator TMatrix3.Multiply(const A, B: TMatrix3): TMatrix3; assembler;
4748
{ Code below consists of 3 Vector*Matrix calculations }
4749
asm
4750
  { A.R[0] * B }
4751
  movq    xmm0, QWORD [A + $00]
4752
  movss   xmm1, DWORD [A + $08]
4753
  movlhps xmm0, xmm1
4754

4755
  movq    xmm4, QWORD [B + $00]
4756
  movss   xmm1, DWORD [B + $08]
4757
  movlhps xmm4, xmm1
4758

4759
  movaps  xmm1, xmm0
4760
  movaps  xmm2, xmm0
4761
  shufps  xmm0, xmm0, $00
4762
  shufps  xmm1, xmm1, $55
4763
  shufps  xmm2, xmm2, $AA
4764

4765
  movq    xmm5, QWORD [B + $0C]
4766
  movss   xmm3, DWORD [B + $14]
4767
  movlhps xmm5, xmm3
4768

4769
  movq    xmm6, QWORD [B + $18]
4770
  movss   xmm3, DWORD [B + $20]
4771
  movlhps xmm6, xmm3
4772

4773
  mulps   xmm0, xmm4
4774
  mulps   xmm1, xmm5
4775
  mulps   xmm2, xmm6
4776
  addps   xmm0, xmm1
4777
  addps   xmm0, xmm2
4778
  movhlps xmm1, xmm0
4779
  movq    QWORD [Result + $00], xmm0
4780
  movss   DWORD [Result + $08], xmm1
4781

4782
  { A.R[1] * B }
4783
  movq    xmm0, QWORD [A + $0C]
4784
  movss   xmm1, DWORD [A + $14]
4785
  movlhps xmm0, xmm1
4786

4787
  movaps  xmm1, xmm0
4788
  movaps  xmm2, xmm0
4789
  shufps  xmm0, xmm0, $00
4790
  shufps  xmm1, xmm1, $55
4791
  shufps  xmm2, xmm2, $AA
4792
  mulps   xmm0, xmm4
4793
  mulps   xmm1, xmm5
4794
  mulps   xmm2, xmm6
4795
  addps   xmm0, xmm1
4796
  addps   xmm0, xmm2
4797
  movhlps xmm1, xmm0
4798
  movq    QWORD [Result + $0C], xmm0
4799
  movss   DWORD [Result + $14], xmm1
4800

4801
  { A.R[2] * B }
4802
  movq    xmm0, QWORD [A + $18]
4803
  movss   xmm1, DWORD [A + $20]
4804
  movlhps xmm0, xmm1
4805

4806
  movaps  xmm1, xmm0
4807
  movaps  xmm2, xmm0
4808
  shufps  xmm0, xmm0, $00
4809
  shufps  xmm1, xmm1, $55
4810
  shufps  xmm2, xmm2, $AA
4811
  mulps   xmm0, xmm4
4812
  mulps   xmm1, xmm5
4813
  mulps   xmm2, xmm6
4814
  addps   xmm0, xmm1
4815
  addps   xmm0, xmm2
4816
  movhlps xmm1, xmm0
4817
  movq    QWORD [Result + $18], xmm0
4818
  movss   DWORD [Result + $20], xmm1
4819
end;
4820
{$ENDIF}
4821
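
{ For reference: as the comments in the row-major TMatrix3 multiplication
  above indicate, the product is built one row at a time as a
  vector-times-matrix product. A Pascal sketch of the same structure
  (illustrative only; it assumes the R[] row accessor referred to in the
  comments above is readable and writable):

function MatrixMulRef(const A, B: TMatrix3): TMatrix3;
var
  I: Integer;
begin
  for I := 0 to 2 do
    Result.R[I] := A.R[I] * B; // one TVector3 * TMatrix3 product per row
end;
}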

4822
class operator TMatrix3.Negative(const A: TMatrix3): TMatrix3; assembler;
4823
asm
4824
  movups xmm0, [SSE_MASK_SIGN]  // Load mask with 4 sign (upper) bits
4825
  movups xmm1, DQWORD [A + $00] // Load 3 rows
4826
  movups xmm2, DQWORD [A + $10]
4827
  movss  xmm3, DWORD [A + $20]
4828
  xorps  xmm1, xmm0             // Flip sign bits of each element in each row
4829
  xorps  xmm2, xmm0
4830
  pxor   xmm3, xmm0
4831
  movups DQWORD [Result + $00], xmm1
4832
  movups DQWORD [Result + $10], xmm2
4833
  movss  DWORD [Result + $20], xmm3
4834
end;
4835

4836
procedure TMatrix3.SetTransposed; assembler;
4837
asm
4838
  movss  xmm0, DWORD [Self + $04]
4839
  movss  xmm1, DWORD [Self + $08]
4840

4841
  movss  xmm2, DWORD [Self + $0C]
4842
  movss  xmm3, DWORD [Self + $14]
4843

4844
  movss  xmm4, DWORD [Self + $18]
4845
  movss  xmm5, DWORD [Self + $1C]
4846

4847
  movss  DWORD [Self + $0C], xmm0
4848
  movss  DWORD [Self + $18], xmm1
4849

4850
  movss  DWORD [Self + $04], xmm2
4851
  movss  DWORD [Self + $1C], xmm3
4852

4853
  movss  DWORD [Self + $08], xmm4
4854
  movss  DWORD [Self + $14], xmm5
4855
end;
4856
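
{ For reference: SetTransposed above swaps the three off-diagonal element
  pairs of the 3x3 matrix in place and leaves the diagonal untouched. A
  scalar sketch of the same operation (illustrative only; it assumes an
  M[,] element accessor like the one TMatrix2 exposes; since each swap is
  symmetric, it does not matter which of the two indices is the row):

procedure SetTransposedRef(var M: TMatrix3);
var
  Tmp: Single;
begin
  Tmp := M.M[0,1]; M.M[0,1] := M.M[1,0]; M.M[1,0] := Tmp;
  Tmp := M.M[0,2]; M.M[0,2] := M.M[2,0]; M.M[2,0] := Tmp;
  Tmp := M.M[1,2]; M.M[1,2] := M.M[2,1]; M.M[2,1] := Tmp;
end;
}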

4857
class operator TMatrix3.Subtract(const A: TMatrix3; const B: Single): TMatrix3; assembler;
4858
asm
4859
  movss  xmm0, [B]              // Load single floating-point value
4860
  movups xmm1, DQWORD [A + $00] // Load 3 rows
4861
  shufps xmm0, xmm0, 0          // Replicate B
4862
  movups xmm2, DQWORD [A + $10]
4863
  movss  xmm3, DWORD [A + $20]
4864
  subps  xmm1, xmm0             // Subtract B from each row
4865
  subps  xmm2, xmm0
4866
  subps  xmm3, xmm0
4867
  movups DQWORD [Result + $00], xmm1
4868
  movups DQWORD [Result + $10], xmm2
4869
  movss  DWORD [Result + $20], xmm3
4870
end;
4871

4872
class operator TMatrix3.Subtract(const A: Single; const B: TMatrix3): TMatrix3; assembler;
4873
asm
4874
  movss  xmm0, [A]              // Load single floating-point value
4875
  movups xmm4, DQWORD [B + $00] // Load 3 rows
4876
  shufps xmm0, xmm0, 0          // Replicate A
4877
  movups xmm5, DQWORD [B + $10]
4878
  movaps xmm1, xmm0
4879
  movaps xmm2, xmm0
4880
  movss  xmm6, DWORD [B + $20]
4881
  subps  xmm0, xmm4             // Subtract each row from A
4882
  subps  xmm1, xmm5
4883
  subss  xmm2, xmm6
4884
  movups DQWORD [Result + $00], xmm0
4885
  movups DQWORD [Result + $10], xmm1
4886
  movss  DWORD [Result + $20], xmm2
4887
end;
4888

4889
class operator TMatrix3.Subtract(const A, B: TMatrix3): TMatrix3; assembler;
4890
asm
4891
  movups xmm0, DQWORD [A + $00] // Load 3 rows of A
4892
  movups xmm1, DQWORD [A + $10]
4893
  movss  xmm2, DWORD [A + $20]
4894
  movups xmm4, DQWORD [B + $00] // Load 3 rows of B
4895
  movups xmm5, DQWORD [B + $10]
4896
  movss  xmm6, DWORD [B + $20]
4897
  subps  xmm0, xmm4             // Subtract rows
4898
  subps  xmm1, xmm5
4899
  subss  xmm2, xmm6
4900
  movups DQWORD [Result + $00], xmm0
4901
  movups DQWORD [Result + $10], xmm1
4902
  movss  DWORD [Result + $20], xmm2
4903
end;
4904

4905
function TMatrix3.Transpose: TMatrix3; assembler;
4906
asm
4907
  movss xmm0, DWORD [Self + $00]
4908
  movss xmm1, DWORD [Self + $04]
4909
  movss xmm2, DWORD [Self + $08]
4910

4911
  movss DWORD [Result + $00], xmm0
4912
  movss DWORD [Result + $0C], xmm1
4913
  movss DWORD [Result + $18], xmm2
4914

4915
  movss xmm0, DWORD [Self + $0C]
4916
  movss xmm1, DWORD [Self + $10]
4917
  movss xmm2, DWORD [Self + $14]
4918

4919
  movss DWORD [Result + $04], xmm0
4920
  movss DWORD [Result + $10], xmm1
4921
  movss DWORD [Result + $1C], xmm2
4922

4923
  movss xmm0, DWORD [Self + $18]
4924
  movss xmm1, DWORD [Self + $1C]
4925
  movss xmm2, DWORD [Self + $20]
4926

4927
  movss DWORD [Result + $08], xmm0
4928
  movss DWORD [Result + $14], xmm1
4929
  movss DWORD [Result + $20], xmm2
4930
end;
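
{ For reference: TMatrix3 stores its 9 Singles contiguously as three consecutive
  rows of three elements, so element [Row,Col] lives at byte offset
  4 * (3 * Row + Col); that is how the offsets $00..$20 above map to matrix
  positions. A scalar sketch of the same operation (illustration only; M[,]
  stands for a row/column element accessor and is not necessarily the exact API
  of this type):

    for I := 0 to 2 do
      for J := 0 to 2 do
        Result.M[J,I] := Self.M[I,J];
}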

{ TMatrix 4 }

class operator TMatrix4.Add(const A: TMatrix4; const B: Single): TMatrix4; assembler;
asm
  movss  xmm0, [B]              // Load single floating-point value
  movups xmm1, DQWORD [A + $00] // Load 4 rows
  shufps xmm0, xmm0, 0          // Replicate B
  movups xmm2, DQWORD [A + $10]
  movups xmm3, DQWORD [A + $20]
  movups xmm4, DQWORD [A + $30]
  addps  xmm1, xmm0             // Add B to each row
  addps  xmm2, xmm0
  addps  xmm3, xmm0
  addps  xmm4, xmm0
  movups DQWORD [Result + $00], xmm1
  movups DQWORD [Result + $10], xmm2
  movups DQWORD [Result + $20], xmm3
  movups DQWORD [Result + $30], xmm4
end;

class operator TMatrix4.Add(const A: Single; const B: TMatrix4): TMatrix4; assembler;
asm
  movss  xmm0, [A]              // Load single floating-point value
  movups xmm1, DQWORD [B + $00] // Load 4 rows
  shufps xmm0, xmm0, 0          // Replicate A
  movups xmm2, DQWORD [B + $10]
  movups xmm3, DQWORD [B + $20]
  movups xmm4, DQWORD [B + $30]
  addps  xmm1, xmm0             // Add A to each row
  addps  xmm2, xmm0
  addps  xmm3, xmm0
  addps  xmm4, xmm0
  movups DQWORD [Result + $00], xmm1
  movups DQWORD [Result + $10], xmm2
  movups DQWORD [Result + $20], xmm3
  movups DQWORD [Result + $30], xmm4
end;

class operator TMatrix4.Add(const A, B: TMatrix4): TMatrix4; assembler;
asm
  movups xmm0, DQWORD [A + $00] // Load 4 rows of A
  movups xmm1, DQWORD [A + $10]
  movups xmm2, DQWORD [A + $20]
  movups xmm3, DQWORD [A + $30]
  movups xmm4, DQWORD [B + $00] // Load 4 rows of B
  movups xmm5, DQWORD [B + $10]
  movups xmm6, DQWORD [B + $20]
  movups xmm7, DQWORD [B + $30]
  addps  xmm0, xmm4             // Add rows
  addps  xmm1, xmm5
  addps  xmm2, xmm6
  addps  xmm3, xmm7
  movups DQWORD [Result + $00], xmm0
  movups DQWORD [Result + $10], xmm1
  movups DQWORD [Result + $20], xmm2
  movups DQWORD [Result + $30], xmm3
end;

function TMatrix4.CompMult(const AOther: TMatrix4): TMatrix4; assembler;
asm
  movups xmm0, DQWORD[Self + $00]   // Self[0]
  movups xmm1, DQWORD[Self + $10]   // Self[1]
  movups xmm2, DQWORD[Self + $20]   // Self[2]
  movups xmm3, DQWORD[Self + $30]   // Self[3]
  movups xmm4, DQWORD[AOther + $00] // AOther[0]
  movups xmm5, DQWORD[AOther + $10] // AOther[1]
  movups xmm6, DQWORD[AOther + $20] // AOther[2]
  movups xmm7, DQWORD[AOther + $30] // AOther[3]

  // Component-wise multiplication
  mulps  xmm0, xmm4
  mulps  xmm1, xmm5
  mulps  xmm2, xmm6
  mulps  xmm3, xmm7

  // Store result
  movups DQWORD [Result + $00], xmm0
  movups DQWORD [Result + $10], xmm1
  movups DQWORD [Result + $20], xmm2
  movups DQWORD [Result + $30], xmm3
end;
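
{ CompMult is the component-wise (Hadamard) product, not a matrix product: each
  mulps above multiplies matching rows, so in scalar terms
  Result[I,J] = Self[I,J] * AOther[I,J] for every I, J in 0..3. }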

class operator TMatrix4.Divide(const A: Single; const B: TMatrix4): TMatrix4; assembler;
asm
  movss  xmm0, [A]              // Load single floating-point value
  movups xmm4, DQWORD [B + $00] // Load 4 rows
  shufps xmm0, xmm0, 0          // Replicate A
  movups xmm5, DQWORD [B + $10]
  movaps xmm1, xmm0
  movaps xmm2, xmm0
  movaps xmm3, xmm0
  movups xmm6, DQWORD [B + $20]
  movups xmm7, DQWORD [B + $30]
  divps  xmm0, xmm4             // Divide A by each row
  divps  xmm1, xmm5
  divps  xmm2, xmm6
  divps  xmm3, xmm7
  movups DQWORD [Result + $00], xmm0
  movups DQWORD [Result + $10], xmm1
  movups DQWORD [Result + $20], xmm2
  movups DQWORD [Result + $30], xmm3
end;

class operator TMatrix4.Divide(const A: TMatrix4; const B: Single): TMatrix4; assembler;
asm
  movss  xmm0, [B]              // Load single floating-point value
  movups xmm1, DQWORD [A + $00] // Load 4 rows
  shufps xmm0, xmm0, 0          // Replicate B
  movups xmm2, DQWORD [A + $10]
  movups xmm3, DQWORD [A + $20]
  movups xmm4, DQWORD [A + $30]
  divps  xmm1, xmm0             // Divide each row by B
  divps  xmm2, xmm0             // NOTE: We could speed it up by multiplying by
  divps  xmm3, xmm0             // 1/B instead, using the "rcpps" instruction,
  divps  xmm4, xmm0             // but that instruction is an approximation,
                                // so we lose accuracy.
  movups DQWORD [Result + $00], xmm1
  movups DQWORD [Result + $10], xmm2
  movups DQWORD [Result + $20], xmm3
  movups DQWORD [Result + $30], xmm4
end;
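
{ A sketch of the faster but less precise alternative mentioned in the note
  above (illustration only; this library deliberately keeps divps). RCPPS only
  provides an approximation of the reciprocal (roughly 12 bits), so it is
  usually refined with one Newton-Raphson step, R := R * (2 - B * R), before
  multiplying the rows:

    movss  xmm0, [B]
    shufps xmm0, xmm0, 0        // Replicate B
    rcpps  xmm5, xmm0           // R ~ 1 / B (approximate)
    movups xmm6, [SSE_TWO]
    mulps  xmm0, xmm5           // B * R
    subps  xmm6, xmm0           // 2 - B * R
    mulps  xmm5, xmm6           // R := R * (2 - B * R), refined reciprocal
    // ...then replace each "divps xmmN, xmm0" with "mulps xmmN, xmm5".
}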

function TMatrix4.Inverse: TMatrix4; assembler;
type
  TStack = record
    case Byte of
      0: (WorkSpace: array [0..6] of TVector4);
      1: (F0, F1, F2, F3, F4, F5, Padding: TVector4);
  end;
var
  Stack: TStack;
asm
  // Align stack to 16-byte boundary
  push   ebp
  add    ebp, 16
  and    ebp, not 15

  movups xmm1, DQWORD[Self + $10] // M[1]
  movups xmm2, DQWORD[Self + $20] // M[2]
  movups xmm3, DQWORD[Self + $30] // M[3]

  //  C00 := (A.M[2,2] * A.M[3,3]) - (A.M[3,2] * A.M[2,3]);
  //  C02 := (A.M[1,2] * A.M[3,3]) - (A.M[3,2] * A.M[1,3]);
  //  C03 := (A.M[1,2] * A.M[2,3]) - (A.M[2,2] * A.M[1,3]);
  //  F0 := Vector4(C00, C00, C02, C03);
  movaps xmm5, xmm2            // M[2]
  movaps xmm7, xmm2            // M[2]
  movaps xmm0, xmm3            // M[3]
  movaps xmm6, xmm3            // M[3]
  shufps xmm6, xmm2, $AA       // M22 M22 M32 M32
  shufps xmm0, xmm2, $FF       // M23 M23 M33 M33
  shufps xmm7, xmm1, $FF       // M13 M13 M23 M23
  pshufd xmm4, xmm0, $80       // M23 M33 M33 M33
  shufps xmm5, xmm1, $AA       // M12 M12 M22 M22
  pshufd xmm0, xmm6, $80       // M22 M32 M32 M32
  mulps  xmm5, xmm4            // (M12 * M23) (M12 * M33) (M22 * M33) (M22 * M33)
  mulps  xmm7, xmm0            // (M22 * M13) (M32 * M13) (M32 * M23) (M32 * M23)
  subps  xmm5, xmm7            // C03=(M12*M23)-(M22*M13), C02=(M12*M33)-(M32*M13), C00=(M22*M33)-(M32*M23), C00=(M22*M33)-(M32*M23)
  movups [Stack.F0], xmm5

  //  C04 := (A.M[2,1] * A.M[3,3]) - (A.M[3,1] * A.M[2,3]);
  //  C06 := (A.M[1,1] * A.M[3,3]) - (A.M[3,1] * A.M[1,3]);
  //  C07 := (A.M[1,1] * A.M[2,3]) - (A.M[2,1] * A.M[1,3]);
  //  F1 := Vector4(C04, C04, C06, C07);
  movaps xmm5, xmm2            // M[2]
  movaps xmm7, xmm2            // M[2]
  movaps xmm0, xmm3            // M[3]
  movaps xmm6, xmm3            // M[3]
  shufps xmm6, xmm2, $55       // M21 M21 M31 M31
  shufps xmm0, xmm2, $FF       // M23 M23 M33 M33
  shufps xmm7, xmm1, $FF       // M13 M13 M23 M23
  pshufd xmm4, xmm0, $80       // M23 M33 M33 M33
  shufps xmm5, xmm1, $55       // M11 M11 M21 M21
  pshufd xmm0, xmm6, $80       // M21 M31 M31 M31
  mulps  xmm5, xmm4            // (M11 * M23) (M11 * M33) (M21 * M33) (M21 * M33)
  mulps  xmm7, xmm0            // (M21 * M13) (M31 * M13) (M31 * M23) (M31 * M23)
  subps  xmm5, xmm7            // C07=(M11*M23)-(M21*M13), C06=(M11*M33)-(M31*M13), C04=(M21*M33)-(M31*M23), C04=(M21*M33)-(M31*M23)
  movups [Stack.F1], xmm5

  //  C08 := (A.M[2,1] * A.M[3,2]) - (A.M[3,1] * A.M[2,2]);
  //  C10 := (A.M[1,1] * A.M[3,2]) - (A.M[3,1] * A.M[1,2]);
  //  C11 := (A.M[1,1] * A.M[2,2]) - (A.M[2,1] * A.M[1,2]);
  //  F2 := Vector4(C08, C08, C10, C11);
  movaps xmm5, xmm2            // M[2]
  movaps xmm7, xmm2            // M[2]
  movaps xmm0, xmm3            // M[3]
  movaps xmm6, xmm3            // M[3]
  shufps xmm6, xmm2, $55       // M21 M21 M31 M31
  shufps xmm0, xmm2, $AA       // M22 M22 M32 M32
  shufps xmm7, xmm1, $AA       // M12 M12 M22 M22
  pshufd xmm4, xmm0, $80       // M22 M32 M32 M32
  shufps xmm5, xmm1, $55       // M11 M11 M21 M21
  pshufd xmm0, xmm6, $80       // M21 M31 M31 M31
  mulps  xmm5, xmm4            // (M11 * M22) (M11 * M32) (M21 * M32) (M21 * M32)
  mulps  xmm7, xmm0            // (M21 * M12) (M31 * M12) (M31 * M22) (M31 * M22)
  subps  xmm5, xmm7            // C11=(M11*M22)-(M21*M12), C10=(M11*M32)-(M31*M12), C08=(M21*M32)-(M31*M22), C08=(M21*M32)-(M31*M22)
  movups [Stack.F2], xmm5

  //  C12 := (A.M[2,0] * A.M[3,3]) - (A.M[3,0] * A.M[2,3]);
  //  C14 := (A.M[1,0] * A.M[3,3]) - (A.M[3,0] * A.M[1,3]);
  //  C15 := (A.M[1,0] * A.M[2,3]) - (A.M[2,0] * A.M[1,3]);
  //  F3 := Vector4(C12, C12, C14, C15);
  movaps xmm5, xmm2            // M[2]
  movaps xmm7, xmm2            // M[2]
  movaps xmm0, xmm3            // M[3]
  movaps xmm6, xmm3            // M[3]
  shufps xmm6, xmm2, $00       // M20 M20 M30 M30
  shufps xmm0, xmm2, $FF       // M23 M23 M33 M33
  shufps xmm7, xmm1, $FF       // M13 M13 M23 M23
  pshufd xmm4, xmm0, $80       // M23 M33 M33 M33
  shufps xmm5, xmm1, $00       // M10 M10 M20 M20
  pshufd xmm0, xmm6, $80       // M20 M30 M30 M30
  mulps  xmm5, xmm4            // (M10 * M23) (M10 * M33) (M20 * M33) (M20 * M33)
  mulps  xmm7, xmm0            // (M20 * M13) (M30 * M13) (M30 * M23) (M30 * M23)
  subps  xmm5, xmm7            // C15=(M10*M23)-(M20*M13), C14=(M10*M33)-(M30*M13), C12=(M20*M33)-(M30*M23), C12=(M20*M33)-(M30*M23)
  movups [Stack.F3], xmm5

  //  C16 := (A.M[2,0] * A.M[3,2]) - (A.M[3,0] * A.M[2,2]);
  //  C18 := (A.M[1,0] * A.M[3,2]) - (A.M[3,0] * A.M[1,2]);
  //  C19 := (A.M[1,0] * A.M[2,2]) - (A.M[2,0] * A.M[1,2]);
  //  F4 := Vector4(C16, C16, C18, C19);
  movaps xmm5, xmm2            // M[2]
  movaps xmm7, xmm2            // M[2]
  movaps xmm0, xmm3            // M[3]
  movaps xmm6, xmm3            // M[3]
  shufps xmm6, xmm2, $00       // M20 M20 M30 M30
  shufps xmm0, xmm2, $AA       // M22 M22 M32 M32
  shufps xmm7, xmm1, $AA       // M12 M12 M22 M22
  pshufd xmm4, xmm0, $80       // M22 M32 M32 M32
  shufps xmm5, xmm1, $00       // M10 M10 M20 M20
  pshufd xmm0, xmm6, $80       // M20 M30 M30 M30
  mulps  xmm5, xmm4            // (M10 * M22) (M10 * M32) (M20 * M32) (M20 * M32)
  mulps  xmm7, xmm0            // (M20 * M12) (M30 * M12) (M30 * M22) (M30 * M22)
  subps  xmm5, xmm7            // C19=(M10*M22)-(M20*M12), C18=(M10*M32)-(M30*M12), C16=(M20*M32)-(M30*M22), C16=(M20*M32)-(M30*M22)
  movups [Stack.F4], xmm5

  //  C20 := (A.M[2,0] * A.M[3,1]) - (A.M[3,0] * A.M[2,1]);
  //  C22 := (A.M[1,0] * A.M[3,1]) - (A.M[3,0] * A.M[1,1]);
  //  C23 := (A.M[1,0] * A.M[2,1]) - (A.M[2,0] * A.M[1,1]);
  //  F5 := Vector4(C20, C20, C22, C23);
  movaps xmm5, xmm2            // M[2]
  movaps xmm7, xmm2            // M[2]
  movaps xmm0, xmm3            // M[3]
  movaps xmm6, xmm3            // M[3]
  shufps xmm6, xmm2, $00       // M20 M20 M30 M30
  shufps xmm0, xmm2, $55       // M21 M21 M31 M31
  shufps xmm7, xmm1, $55       // M11 M11 M21 M21
  pshufd xmm4, xmm0, $80       // M21 M31 M31 M31
  shufps xmm5, xmm1, $00       // M10 M10 M20 M20
  pshufd xmm0, xmm6, $80       // M20 M30 M30 M30
  mulps  xmm5, xmm4            // (M10 * M21) (M10 * M31) (M20 * M31) (M20 * M31)
  mulps  xmm7, xmm0            // (M20 * M11) (M30 * M11) (M30 * M21) (M30 * M21)
  subps  xmm5, xmm7            // C23=(M10*M21)-(M20*M11), C22=(M10*M31)-(M30*M11), C20=(M20*M31)-(M30*M21), C20=(M20*M31)-(M30*M21)
  movups [Stack.F5], xmm5

  //  V0 := Vector4(A.M[1,0], A.M[0,0], A.M[0,0], A.M[0,0]);
  //  V1 := Vector4(A.M[1,1], A.M[0,1], A.M[0,1], A.M[0,1]);
  //  V2 := Vector4(A.M[1,2], A.M[0,2], A.M[0,2], A.M[0,2]);
  //  V3 := Vector4(A.M[1,3], A.M[0,3], A.M[0,3], A.M[0,3]);
  movups xmm0, DQWORD[Self + $00] // M[0]
  movaps xmm4, xmm1            // M[1]
  movaps xmm5, xmm1            // M[1]
  movaps xmm6, xmm1            // M[1]
  movaps xmm7, xmm1            // M[1]

  shufps xmm4, xmm0, $00       // M00 M00 M10 M10
  shufps xmm5, xmm0, $55       // M01 M01 M11 M11
  shufps xmm6, xmm0, $AA       // M02 M02 M12 M12
  shufps xmm7, xmm0, $FF       // M03 M03 M13 M13

  pshufd xmm4, xmm4, $A8       // V0=M00 M00 M00 M10
  pshufd xmm5, xmm5, $A8       // V1=M01 M01 M01 M11
  pshufd xmm6, xmm6, $A8       // V2=M02 M02 M02 M12
  pshufd xmm7, xmm7, $A8       // V3=M03 M03 M03 M13

  //  I0 := (V1 * F0) - (V2 * F1) + (V3 * F2);
  //  I1 := (V0 * F0) - (V2 * F3) + (V3 * F4);
  //  I2 := (V0 * F1) - (V1 * F3) + (V3 * F5);
  //  I3 := (V0 * F2) - (V1 * F4) + (V2 * F5);
  movaps xmm0, xmm5            // V1
  movaps xmm1, xmm6            // V2
  movaps xmm2, xmm7            // V3
  mulps  xmm0, [Stack.F0]      // V1 * F0
  mulps  xmm1, [Stack.F1]      // V2 * F1
  mulps  xmm2, [Stack.F2]      // V3 * F2
  subps  xmm0, xmm1            // (V1 * F0) - (V2 * F1)
  movaps xmm1, xmm4            // V0
  addps  xmm0, xmm2            // I0=(V1 * F0) - (V2 * F1) + (V3 * F2)

  movaps xmm2, xmm6            // V2
  movaps xmm3, xmm7            // V3
  mulps  xmm1, [Stack.F0]      // V0 * F0
  mulps  xmm2, [Stack.F3]      // V2 * F3
  mulps  xmm3, [Stack.F4]      // V3 * F4
  subps  xmm1, xmm2            // (V0 * F0) - (V2 * F3)
  movaps xmm2, xmm4            // V0
  addps  xmm1, xmm3            // I1=(V0 * F0) - (V2 * F3) + (V3 * F4)

  movaps xmm3, xmm5            // V1
  mulps  xmm2, [Stack.F1]      // V0 * F1
  mulps  xmm3, [Stack.F3]      // V1 * F3
  mulps  xmm7, [Stack.F5]      // V3 * F5
  subps  xmm2, xmm3            // (V0 * F1) - (V1 * F3)
  mulps  xmm4, [Stack.F2]      // V0 * F2
  addps  xmm2, xmm7            // I2=(V0 * F1) - (V1 * F3) + (V3 * F5)

  mulps  xmm5, [Stack.F4]      // V1 * F4
  mulps  xmm6, [Stack.F5]      // V2 * F5
  subps  xmm4, xmm5            // (V0 * F2) - (V1 * F4)
  addps  xmm4, xmm6            // I3=(V0 * F2) - (V1 * F4) + (V2 * F5)

  //  SA := Vector4(+1, -1, +1, -1);
  //  SB := Vector4(-1, +1, -1, +1);
  //  Inv := Matrix4(I0 * SA, I1 * SB, I2 * SA, I3 * SB);

  movups xmm6, [SSE_MASK_PNPN] // SA
  movups xmm7, [SSE_MASK_NPNP] // SB
  xorps  xmm0, xmm6            // Inv[0] = I0 * SA
  xorps  xmm1, xmm7            // Inv[1] = I1 * SB
  xorps  xmm2, xmm6            // Inv[2] = I2 * SA
  xorps  xmm4, xmm7            // Inv[3] = I3 * SB

  //  Row := Vector4(Inv[0,0], Inv[1,0], Inv[2,0], Inv[3,0]);
  movaps   xmm3, xmm0
  movaps   xmm5, xmm2
  movaps   xmm6, xmm1

  unpcklps xmm3, xmm1          // Inv[1,1] Inv[0,1] Inv[1,0] Inv[0,0]
  unpcklps xmm5, xmm4          // Inv[3,1] Inv[2,1] Inv[3,0] Inv[2,0]
  movups   xmm6, DQWORD[Self + $00] // A.C[0]
  movlhps  xmm3, xmm5          // Inv[3,0] Inv[2,0] Inv[1,0] Inv[0,0]

  //  Dot := A.C[0] * Row;
  mulps    xmm3, xmm6          // Dot.W  Dot.Z  Dot.Y  Dot.X

  //  OneOverDeterminant := 1 / ((Dot.X + Dot.Y) + (Dot.Z + Dot.W));
  pshufd   xmm6, xmm3, $4E     // Dot.Y  Dot.X  Dot.W  Dot.Z
  addps    xmm3, xmm6          // W+Y Z+X Y+W X+Z
  pshufd   xmm6, xmm3, $11     // X+Z Y+W X+Z Y+W
  movups   xmm5, [SSE_ONE]     // 1.0 (4x)
  addps    xmm3, xmm6          // X+Y+Z+W (4x)
  divps    xmm5, xmm3          // OneOverDeterminant (4x)

  //  Result := Inv * OneOverDeterminant;
  mulps    xmm0, xmm5
  mulps    xmm1, xmm5
  mulps    xmm2, xmm5
  mulps    xmm4, xmm5

  movups   DQWORD[Result + $00], xmm0
  movups   DQWORD[Result + $10], xmm1
  movups   DQWORD[Result + $20], xmm2
  movups   DQWORD[Result + $30], xmm4

  pop      ebp
end;
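
{ Scalar outline of the algorithm above, gathered from the inline comments
  (a reference sketch only; Vector4/Matrix4 are used here as informal
  constructors, not as calls into this unit):

    F0..F5 : vectors of 2x2 sub-determinants built from rows 1..3, e.g.
             F0 = (C00, C00, C02, C03) with C00 = M[2,2]*M[3,3] - M[3,2]*M[2,3].
    V0..V3 : VJ = (M[1,J], M[0,J], M[0,J], M[0,J]).
    I0 = V1*F0 - V2*F1 + V3*F2        I1 = V0*F0 - V2*F3 + V3*F4
    I2 = V0*F1 - V1*F3 + V3*F5        I3 = V0*F2 - V1*F4 + V2*F5
    Inv = Matrix4(I0*SA, I1*SB, I2*SA, I3*SB), SA = (+1,-1,+1,-1), SB = (-1,+1,-1,+1)
    Det = Dot(vector stored at Self + $00, (Inv[0,0], Inv[1,0], Inv[2,0], Inv[3,0]))
    Result = Inv * (1 / Det)

  The xorps with SSE_MASK_PNPN / SSE_MASK_NPNP applies the SA / SB sign
  patterns, and the pshufd/addps pairs near the end form the horizontal sum
  for the determinant. }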

class operator TMatrix4.Multiply(const A: Single; const B: TMatrix4): TMatrix4; assembler;
asm
  movss  xmm0, [A]              // Load single floating-point value
  movups xmm1, DQWORD [B + $00] // Load 4 rows
  shufps xmm0, xmm0, 0          // Replicate A
  movups xmm2, DQWORD [B + $10]
  movups xmm3, DQWORD [B + $20]
  movups xmm4, DQWORD [B + $30]
  mulps  xmm1, xmm0             // Multiply each row by A
  mulps  xmm2, xmm0
  mulps  xmm3, xmm0
  mulps  xmm4, xmm0
  movups DQWORD [Result + $00], xmm1
  movups DQWORD [Result + $10], xmm2
  movups DQWORD [Result + $20], xmm3
  movups DQWORD [Result + $30], xmm4
end;

class operator TMatrix4.Multiply(const A: TMatrix4; const B: Single): TMatrix4; assembler;
asm
  movss  xmm0, [B]              // Load single floating-point value
  movups xmm1, DQWORD [A + $00] // Load 4 rows
  shufps xmm0, xmm0, 0          // Replicate B
  movups xmm2, DQWORD [A + $10]
  movups xmm3, DQWORD [A + $20]
  movups xmm4, DQWORD [A + $30]
  mulps  xmm1, xmm0             // Multiply each row by B
  mulps  xmm2, xmm0
  mulps  xmm3, xmm0
  mulps  xmm4, xmm0
  movups DQWORD [Result + $00], xmm1
  movups DQWORD [Result + $10], xmm2
  movups DQWORD [Result + $20], xmm3
  movups DQWORD [Result + $30], xmm4
end;

{$IFDEF FM_COLUMN_MAJOR}
class operator TMatrix4.Multiply(const A: TMatrix4; const B: TVector4): TVector4; assembler;
asm
  movups xmm0, [B]
  movups xmm4, DQWORD [A + $00]
  movaps xmm1, xmm0
  movaps xmm2, xmm0
  movaps xmm3, xmm0
  shufps xmm0, xmm0, $00
  shufps xmm1, xmm1, $55
  shufps xmm2, xmm2, $AA
  shufps xmm3, xmm3, $FF
  movups xmm5, DQWORD [A + $10]
  movups xmm6, DQWORD [A + $20]
  movups xmm7, DQWORD [A + $30]
  mulps  xmm0, xmm4
  mulps  xmm1, xmm5
  mulps  xmm2, xmm6
  mulps  xmm3, xmm7
  addps  xmm0, xmm1
  addps  xmm2, xmm3
  addps  xmm0, xmm2
  movups [Result], xmm0
end;

class operator TMatrix4.Multiply(const A: TVector4; const B: TMatrix4): TVector4; assembler;
asm
  movups   xmm0, [A]
  movups   xmm4, DQWORD [B + $00]
  movaps   xmm1, xmm0
  movaps   xmm2, xmm0
  movaps   xmm3, xmm0
  movups   xmm5, DQWORD [B + $10]
  movups   xmm6, DQWORD [B + $20]
  movups   xmm7, DQWORD [B + $30]
  mulps    xmm0, xmm4
  mulps    xmm1, xmm5
  mulps    xmm2, xmm6
  mulps    xmm3, xmm7

  { Transpose xmm0-xmm3 }
  movaps   xmm4, xmm2
  unpcklps xmm2, xmm3
  unpckhps xmm4, xmm3

  movaps   xmm3, xmm0
  unpcklps xmm0, xmm1
  unpckhps xmm3, xmm1

  movaps   xmm1, xmm0
  unpcklpd xmm0, xmm2
  unpckhpd xmm1, xmm2

  movaps   xmm2, xmm3
  unpcklpd xmm2, xmm4
  unpckhpd xmm3, xmm4

  addps    xmm0, xmm1
  addps    xmm2, xmm3
  addps    xmm0, xmm2
  movups   [Result], xmm0
end;

class operator TMatrix4.Multiply(const A, B: TMatrix4): TMatrix4; assembler;
{ Code below consists of 4 Vector*Matrix calculations }
asm
  movups xmm0, DQWORD [B + $00]
  movups xmm4, DQWORD [A + $00]
  movaps xmm1, xmm0
  movaps xmm2, xmm0
  movaps xmm3, xmm0
  shufps xmm0, xmm0, $00
  shufps xmm1, xmm1, $55
  shufps xmm2, xmm2, $AA
  shufps xmm3, xmm3, $FF
  movups xmm5, DQWORD [A + $10]
  movups xmm6, DQWORD [A + $20]
  movups xmm7, DQWORD [A + $30]
  mulps  xmm0, xmm4
  mulps  xmm1, xmm5
  mulps  xmm2, xmm6
  mulps  xmm3, xmm7
  addps  xmm0, xmm1
  addps  xmm2, xmm3
  addps  xmm0, xmm2
  movups DQWORD [Result + $00], xmm0

  movups xmm0, DQWORD [B + $10]
  movaps xmm1, xmm0
  movaps xmm2, xmm0
  movaps xmm3, xmm0
  shufps xmm0, xmm0, $00
  shufps xmm1, xmm1, $55
  shufps xmm2, xmm2, $AA
  shufps xmm3, xmm3, $FF
  mulps  xmm0, xmm4
  mulps  xmm1, xmm5
  mulps  xmm2, xmm6
  mulps  xmm3, xmm7
  addps  xmm0, xmm1
  addps  xmm2, xmm3
  addps  xmm0, xmm2
  movups DQWORD [Result + $10], xmm0

  movups xmm0, DQWORD [B + $20]
  movaps xmm1, xmm0
  movaps xmm2, xmm0
  movaps xmm3, xmm0
  shufps xmm0, xmm0, $00
  shufps xmm1, xmm1, $55
  shufps xmm2, xmm2, $AA
  shufps xmm3, xmm3, $FF
  mulps  xmm0, xmm4
  mulps  xmm1, xmm5
  mulps  xmm2, xmm6
  mulps  xmm3, xmm7
  addps  xmm0, xmm1
  addps  xmm2, xmm3
  addps  xmm0, xmm2
  movups DQWORD [Result + $20], xmm0

  movups xmm0, DQWORD [B + $30]
  movaps xmm1, xmm0
  movaps xmm2, xmm0
  movaps xmm3, xmm0
  shufps xmm0, xmm0, $00
  shufps xmm1, xmm1, $55
  shufps xmm2, xmm2, $AA
  shufps xmm3, xmm3, $FF
  mulps  xmm0, xmm4
  mulps  xmm1, xmm5
  mulps  xmm2, xmm6
  mulps  xmm3, xmm7
  addps  xmm0, xmm1
  addps  xmm2, xmm3
  addps  xmm0, xmm2
  movups DQWORD [Result + $30], xmm0
end;
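
{ In this column-major branch each 16-byte block is one column, so the code
  above builds the product column by column: every column of the result is a
  linear combination of A's columns weighted by the corresponding column of B,
  i.e. (restating the code, with [Row,Col] element indexing)

    Result column J = A column 0 * B[0,J] + A column 1 * B[1,J] +
                      A column 2 * B[2,J] + A column 3 * B[3,J]
}
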
{$ELSE}
class operator TMatrix4.Multiply(const A: TMatrix4; const B: TVector4): TVector4; assembler;
asm
  movups   xmm0, [B]              // Load vector
  movups   xmm4, DQWORD [A + $00] // Load 4 rows
  movaps   xmm1, xmm0
  movaps   xmm2, xmm0
  movaps   xmm3, xmm0
  movups   xmm5, DQWORD [A + $10]
  movups   xmm6, DQWORD [A + $20]
  movups   xmm7, DQWORD [A + $30]
  mulps    xmm0, xmm4             // (A00 * Bx), (A01 * By), (A02 * Bz), (A03 * Bw)
  mulps    xmm1, xmm5             // (A10 * Bx), (A11 * By), (A12 * Bz), (A13 * Bw)
  mulps    xmm2, xmm6             // (A20 * Bx), (A21 * By), (A22 * Bz), (A23 * Bw)
  mulps    xmm3, xmm7             // (A30 * Bx), (A31 * By), (A32 * Bz), (A33 * Bw)

  { Transpose xmm0-xmm3 }
  movaps   xmm4, xmm2
  unpcklps xmm2, xmm3             // A31 A21 A30 A20
  unpckhps xmm4, xmm3             // A33 A23 A32 A22

  movaps   xmm3, xmm0
  unpcklps xmm0, xmm1             // A11 A01 A10 A00
  unpckhps xmm3, xmm1             // A13 A03 A12 A02

  movaps   xmm1, xmm0
  unpcklpd xmm0, xmm2             // A30 A20 A10 A00
  unpckhpd xmm1, xmm2             // A31 A21 A11 A01

  movaps   xmm2, xmm3
  unpcklpd xmm2, xmm4             // A32 A22 A12 A02
  unpckhpd xmm3, xmm4             // A33 A23 A13 A03

  addps    xmm0, xmm1             // Add rows
  addps    xmm2, xmm3
  addps    xmm0, xmm2
  movups   [Result], xmm0
end;
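
{ With the row-major layout used in this branch each 16-byte block above is a
  row of A, so after the multiply / transpose / add sequence every component of
  the result is a row dot product (a restatement of the code, with [Row,Col]
  indexing):

    Result.X = A[0,0]*B.X + A[0,1]*B.Y + A[0,2]*B.Z + A[0,3]*B.W
    Result.Y = A[1,0]*B.X + A[1,1]*B.Y + A[1,2]*B.Z + A[1,3]*B.W
    ...and likewise for Z and W with rows 2 and 3. }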

class operator TMatrix4.Multiply(const A: TVector4; const B: TMatrix4): TVector4; assembler;
asm
  movups xmm0, [A]              // Load vector
  movups xmm4, DQWORD [B + $00] // Load 4 rows
  movaps xmm1, xmm0
  movaps xmm2, xmm0
  movaps xmm3, xmm0
  shufps xmm0, xmm0, $00        // Ax Ax Ax Ax
  shufps xmm1, xmm1, $55        // Ay Ay Ay Ay
  shufps xmm2, xmm2, $AA        // Az Az Az Az
  shufps xmm3, xmm3, $FF        // Aw Aw Aw Aw
  movups xmm5, DQWORD [B + $10]
  movups xmm6, DQWORD [B + $20]
  movups xmm7, DQWORD [B + $30]
  mulps  xmm0, xmm4             // (B00 * Ax), (B01 * Ax), (B02 * Ax), (B03 * Ax)
  mulps  xmm1, xmm5             // (B10 * Ay), (B11 * Ay), (B12 * Ay), (B13 * Ay)
  mulps  xmm2, xmm6             // (B20 * Az), (B21 * Az), (B22 * Az), (B23 * Az)
  mulps  xmm3, xmm7             // (B30 * Aw), (B31 * Aw), (B32 * Aw), (B33 * Aw)
  addps  xmm0, xmm1             // Add rows
  addps  xmm2, xmm3
  addps  xmm0, xmm2
  movups [Result], xmm0
end;

class operator TMatrix4.Multiply(const A, B: TMatrix4): TMatrix4; assembler;
{ Code below consists of 4 Vector*Matrix calculations }
asm
  { A.R[0] * B }
  movups xmm0, DQWORD [A + $00]
  movups xmm4, DQWORD [B + $00]
  movaps xmm1, xmm0
  movaps xmm2, xmm0
  movaps xmm3, xmm0
  shufps xmm0, xmm0, $00
  shufps xmm1, xmm1, $55
  shufps xmm2, xmm2, $AA
  shufps xmm3, xmm3, $FF
  movups xmm5, DQWORD [B + $10]
  movups xmm6, DQWORD [B + $20]
  movups xmm7, DQWORD [B + $30]
  mulps  xmm0, xmm4
  mulps  xmm1, xmm5
  mulps  xmm2, xmm6
  mulps  xmm3, xmm7
  addps  xmm0, xmm1
  addps  xmm2, xmm3
  addps  xmm0, xmm2
  movups DQWORD [Result + $00], xmm0

  { A.R[1] * B }
  movups xmm0, DQWORD [A + $10]
  movaps xmm1, xmm0
  movaps xmm2, xmm0
  movaps xmm3, xmm0
  shufps xmm0, xmm0, $00
  shufps xmm1, xmm1, $55
  shufps xmm2, xmm2, $AA
  shufps xmm3, xmm3, $FF
  mulps  xmm0, xmm4
  mulps  xmm1, xmm5
  mulps  xmm2, xmm6
  mulps  xmm3, xmm7
  addps  xmm0, xmm1
  addps  xmm2, xmm3
  addps  xmm0, xmm2
  movups DQWORD [Result + $10], xmm0

  { A.R[2] * B }
  movups xmm0, DQWORD [A + $20]
  movaps xmm1, xmm0
  movaps xmm2, xmm0
  movaps xmm3, xmm0
  shufps xmm0, xmm0, $00
  shufps xmm1, xmm1, $55
  shufps xmm2, xmm2, $AA
  shufps xmm3, xmm3, $FF
  mulps  xmm0, xmm4
  mulps  xmm1, xmm5
  mulps  xmm2, xmm6
  mulps  xmm3, xmm7
  addps  xmm0, xmm1
  addps  xmm2, xmm3
  addps  xmm0, xmm2
  movups DQWORD [Result + $20], xmm0

  { A.R[3] * B }
  movups xmm0, DQWORD [A + $30]
  movaps xmm1, xmm0
  movaps xmm2, xmm0
  movaps xmm3, xmm0
  shufps xmm0, xmm0, $00
  shufps xmm1, xmm1, $55
  shufps xmm2, xmm2, $AA
  shufps xmm3, xmm3, $FF
  mulps  xmm0, xmm4
  mulps  xmm1, xmm5
  mulps  xmm2, xmm6
  mulps  xmm3, xmm7
  addps  xmm0, xmm1
  addps  xmm2, xmm3
  addps  xmm0, xmm2
  movups DQWORD [Result + $30], xmm0
end;
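
{ Each of the four blocks above performs one of the Vector*Matrix calculations
  mentioned at the top of the routine: row I of A is broadcast one element at a
  time and used to weight the rows of B, so in scalar terms

    Result row I = B row 0 * A[I,0] + B row 1 * A[I,1] +
                   B row 2 * A[I,2] + B row 3 * A[I,3]

  which is the usual Result[I,J] = Sum over K of A[I,K] * B[K,J]. }
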
{$ENDIF}

class operator TMatrix4.Negative(const A: TMatrix4): TMatrix4; assembler;
asm
  movups xmm0, [SSE_MASK_SIGN]  // Load mask with 4 sign (upper) bits
  movups xmm1, DQWORD [A + $00] // Load 4 rows
  movups xmm2, DQWORD [A + $10]
  movups xmm3, DQWORD [A + $20]
  movups xmm4, DQWORD [A + $30]
  xorps  xmm1, xmm0             // Flip sign bits of each element in each row
  xorps  xmm2, xmm0
  xorps  xmm3, xmm0
  xorps  xmm4, xmm0
  movups DQWORD [Result + $00], xmm1
  movups DQWORD [Result + $10], xmm2
  movups DQWORD [Result + $20], xmm3
  movups DQWORD [Result + $30], xmm4
end;

procedure TMatrix4.SetInversed;
begin
  Self := Inverse;
end;

procedure TMatrix4.SetTransposed;
begin
  Self := Transpose;
end;

class operator TMatrix4.Subtract(const A: TMatrix4; const B: Single): TMatrix4; assembler;
asm
  movss  xmm0, [B]              // Load single floating-point value
  movups xmm1, DQWORD [A + $00] // Load 4 rows
  shufps xmm0, xmm0, 0          // Replicate B
  movups xmm2, DQWORD [A + $10]
  movups xmm3, DQWORD [A + $20]
  movups xmm4, DQWORD [A + $30]
  subps  xmm1, xmm0             // Subtract B from each row
  subps  xmm2, xmm0
  subps  xmm3, xmm0
  subps  xmm4, xmm0
  movups DQWORD [Result + $00], xmm1
  movups DQWORD [Result + $10], xmm2
  movups DQWORD [Result + $20], xmm3
  movups DQWORD [Result + $30], xmm4
end;

class operator TMatrix4.Subtract(const A: Single; const B: TMatrix4): TMatrix4; assembler;
asm
  movss  xmm0, [A]              // Load single floating-point value
  movups xmm4, DQWORD [B + $00] // Load 4 rows
  shufps xmm0, xmm0, 0          // Replicate A
  movups xmm5, DQWORD [B + $10]
  movaps xmm1, xmm0
  movaps xmm2, xmm0
  movaps xmm3, xmm0
  movups xmm6, DQWORD [B + $20]
  movups xmm7, DQWORD [B + $30]
  subps  xmm0, xmm4             // Subtract each row from A
  subps  xmm1, xmm5
  subps  xmm2, xmm6
  subps  xmm3, xmm7
  movups DQWORD [Result + $00], xmm0
  movups DQWORD [Result + $10], xmm1
  movups DQWORD [Result + $20], xmm2
  movups DQWORD [Result + $30], xmm3
end;

class operator TMatrix4.Subtract(const A, B: TMatrix4): TMatrix4; assembler;
asm
  movups xmm0, DQWORD [A + $00] // Load 4 rows of A
  movups xmm1, DQWORD [A + $10]
  movups xmm2, DQWORD [A + $20]
  movups xmm3, DQWORD [A + $30]
  movups xmm4, DQWORD [B + $00] // Load 4 rows of B
  movups xmm5, DQWORD [B + $10]
  movups xmm6, DQWORD [B + $20]
  movups xmm7, DQWORD [B + $30]
  subps  xmm0, xmm4             // Subtract rows
  subps  xmm1, xmm5
  subps  xmm2, xmm6
  subps  xmm3, xmm7
  movups DQWORD [Result + $00], xmm0
  movups DQWORD [Result + $10], xmm1
  movups DQWORD [Result + $20], xmm2
  movups DQWORD [Result + $30], xmm3
end;

function TMatrix4.Transpose: TMatrix4; assembler;
asm
  movups   xmm0, DQWORD[Self + $00] // A03 A02 A01 A00
  movups   xmm1, DQWORD[Self + $10] // A13 A12 A11 A10
  movups   xmm2, DQWORD[Self + $20] // A23 A22 A21 A20
  movups   xmm3, DQWORD[Self + $30] // A33 A32 A31 A30

  movaps   xmm4, xmm2
  unpcklps xmm2, xmm3               // A31 A21 A30 A20
  unpckhps xmm4, xmm3               // A33 A23 A32 A22

  movaps   xmm3, xmm0
  unpcklps xmm0, xmm1               // A11 A01 A10 A00
  unpckhps xmm3, xmm1               // A13 A03 A12 A02

  movaps   xmm1, xmm0
  unpcklpd xmm0, xmm2               // A30 A20 A10 A00
  unpckhpd xmm1, xmm2               // A31 A21 A11 A01

  movaps   xmm2, xmm3
  unpcklpd xmm2, xmm4               // A32 A22 A12 A02
  unpckhpd xmm3, xmm4               // A33 A23 A13 A03

  movups   DQWORD[Result + $00], xmm0
  movups   DQWORD[Result + $10], xmm1
  movups   DQWORD[Result + $20], xmm2
  movups   DQWORD[Result + $30], xmm3
end;
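
{ The shuffle sequence above is the standard in-register 4x4 transpose (the
  same idea as Intel's _MM_TRANSPOSE4_PS): the unpcklps/unpckhps steps
  interleave pairs of rows, and the unpcklpd/unpckhpd steps then combine
  matching 64-bit halves, so each XMM register ends up holding one column of
  the original matrix, which is the corresponding row of the result. }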
