{ MathgeomGLS — SSE-accelerated vector/matrix math (excerpt).
  Full unit: 5720 lines, 169.9 KB. }
1const
2{ SSE rounding modes (bits in MXCSR register) }
3SSE_ROUND_MASK = $FFFF9FFF;
4SSE_ROUND_NEAREST = $00000000;
5SSE_ROUND_DOWN = $00002000;
6SSE_ROUND_UP = $00004000;
7SSE_ROUND_TRUNC = $00006000;
8
9{ These constants fit in a single XMM register. These values represent
10sign-bits as used by 32-bit floating-point values.
11XOR'ing a floating-point value with $80000000 swaps the sign.
12XOR'ing a floating-point value with $00000000 leaves the value unchanged. }
13SSE_MASK_SIGN: array [0..3] of UInt32 = ($80000000, $80000000, $80000000, $80000000);
14SSE_MASK_NPNP: array [0..3] of UInt32 = ($80000000, $00000000, $80000000, $00000000);
15SSE_MASK_PNPN: array [0..3] of UInt32 = ($00000000, $80000000, $00000000, $80000000);
16SSE_MASK_0FFF: array [0..3] of UInt32 = ($FFFFFFFF, $FFFFFFFF, $FFFFFFFF, $00000000);
17
18{ These constants mask off an element of the binary representation of a
1932-bit floating-point value. }
20SSE_MASK_FRACTION: array [0..3] of UInt32 = ($007FFFFF, $007FFFFF, $007FFFFF, $007FFFFF);
21SSE_MASK_EXPONENT: array [0..3] of UInt32 = ($7F800000, $7F800000, $7F800000, $7F800000);
22SSE_MASK_ABS_VAL : array [0..3] of UInt32 = ($7FFFFFFF, $7FFFFFFF, $7FFFFFFF, $7FFFFFFF);
23
24{ Commonly used floating-point values }
25SSE_ONE_HALF : array [0..3] of Single = (0.5, 0.5, 0.5, 0.5);
26SSE_ONE : array [0..3] of Single = (1, 1, 1, 1);
27SSE_TWO : array [0..3] of Single = (2, 2, 2, 2);
28SSE_THREE : array [0..3] of Single = (3, 3, 3, 3);
29SSE_PI_OVER_180 : array [0..3] of Single = (Pi / 180, Pi / 180, Pi / 180, Pi / 180);
30SSE_180_OVER_PI : array [0..3] of Single = (180 / Pi, 180 / Pi, 180 / Pi, 180 / Pi);
31SSE_NEG_INFINITY: array [0..3] of Single = (NegInfinity, NegInfinity, NegInfinity, NegInfinity);
32SSE_PI_OVER_4 : array [0..3] of Single = (Pi / 4, Pi / 4, Pi / 4, Pi / 4);
33
34{ Commonly used integer values }
35SSE_INT_ONE : array [0..3] of Integer = (1, 1, 1, 1);
36SSE_INT_NOT_ONE : array [0..3] of Cardinal = ($FFFFFFFE, $FFFFFFFE, $FFFFFFFE, $FFFFFFFE);
37SSE_INT_TWO : array [0..3] of Integer = (2, 2, 2, 2);
38SSE_INT_FOUR : array [0..3] of Integer = (4, 4, 4, 4);
39
40{ Constants for approximating trigonometric functions }
41SSE_FOPI: array [0..3] of Single = (1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516);
42SSE_SINCOF_P0: array [0..3] of Single = (-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4);
43SSE_SINCOF_P1: array [0..3] of Single = (8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3);
44SSE_SINCOF_P2: array [0..3] of Single = (-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1);
45SSE_COSCOF_P0: array [0..3] of Single = (2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005);
46SSE_COSCOF_P1: array [0..3] of Single = (-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003);
47SSE_COSCOF_P2: array [0..3] of Single = (4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002);
48
49SSE_EXP_A1 : array [0..3] of Single = (12102203.1615614, 12102203.1615614, 12102203.1615614, 12102203.1615614);
50SSE_EXP_A2 : array [0..3] of Single = (1065353216, 1065353216, 1065353216, 1065353216);
51SSE_EXP_CST: array [0..3] of Single = (2139095040, 2139095040, 2139095040, 2139095040);
52SSE_EXP_F1 : array [0..3] of Single = (0.509964287281036376953125, 0.509964287281036376953125, 0.509964287281036376953125, 0.509964287281036376953125);
53SSE_EXP_F2 : array [0..3] of Single = (0.3120158612728118896484375, 0.3120158612728118896484375, 0.3120158612728118896484375, 0.3120158612728118896484375);
54SSE_EXP_F3 : array [0..3] of Single = (0.1666135489940643310546875, 0.1666135489940643310546875, 0.1666135489940643310546875, 0.1666135489940643310546875);
55SSE_EXP_F4 : array [0..3] of Single = (-2.12528370320796966552734375e-3, -2.12528370320796966552734375e-3, -2.12528370320796966552734375e-3, -2.12528370320796966552734375e-3);
56SSE_EXP_F5 : array [0..3] of Single = (1.3534179888665676116943359375e-2, 1.3534179888665676116943359375e-2, 1.3534179888665676116943359375e-2, 1.3534179888665676116943359375e-2);
57SSE_EXP_I1 : array [0..3] of UInt32 = ($3F800000, $3F800000, $3F800000, $3F800000);
58
59SSE_LN_CST: array [0..3] of Single = (-89.93423858, -89.93423858, -89.93423858, -89.93423858);
60SSE_LN_F1 : array [0..3] of Single = (3.3977745, 3.3977745, 3.3977745, 3.3977745);
61SSE_LN_F2 : array [0..3] of Single = (2.2744832, 2.2744832, 2.2744832, 2.2744832);
62SSE_LN_F3 : array [0..3] of Single = (0.024982445, 0.024982445, 0.024982445, 0.024982445);
63SSE_LN_F4 : array [0..3] of Single = (0.24371102, 0.24371102, 0.24371102, 0.24371102);
64SSE_LN_F5 : array [0..3] of Single = (0.69314718055995, 0.69314718055995, 0.69314718055995, 0.69314718055995);
65
66SSE_LOG2_I1: array [0..3] of UInt32 = ($3F000000, $3F000000, $3F000000, $3F000000);
67SSE_LOG2_F1: array [0..3] of Single = (1.1920928955078125e-7, 1.1920928955078125e-7, 1.1920928955078125e-7, 1.1920928955078125e-7);
68SSE_LOG2_F2: array [0..3] of Single = (124.22551499, 124.22551499, 124.22551499, 124.22551499);
69SSE_LOG2_F3: array [0..3] of Single = (1.498030302, 1.498030302, 1.498030302, 1.498030302);
70SSE_LOG2_F4: array [0..3] of Single = (1.72587999, 1.72587999, 1.72587999, 1.72587999);
71SSE_LOG2_F5: array [0..3] of Single = (0.3520887068, 0.3520887068, 0.3520887068, 0.3520887068);
72
73SSE_EXP2_F1: array [0..3] of Single = (121.2740575, 121.2740575, 121.2740575, 121.2740575);
74SSE_EXP2_F2: array [0..3] of Single = (27.7280233, 27.7280233, 27.7280233, 27.7280233);
75SSE_EXP2_F3: array [0..3] of Single = (4.84252568, 4.84252568, 4.84252568, 4.84252568);
76SSE_EXP2_F4: array [0..3] of Single = (1.49012907, 1.49012907, 1.49012907, 1.49012907);
77SSE_EXP2_F5: array [0..3] of Single = ($00800000, $00800000, $00800000, $00800000);
78
79{ Angle and Trigonometry Functions }
80
{ Converts an angle from degrees to radians. }
function Radians(const ADegrees: Single): Single;
const
  DegToRad = Pi / 180;
begin
  Result := DegToRad * ADegrees;
end;
85
{ Converts a 2-component vector of angles from degrees to radians (SSE). }
function Radians(const ADegrees: TVector2): TVector2; assembler;
asm
movlps xmm0, [ADegrees]               // load both Singles into low half of xmm0
movlps xmm1, QWORD [SSE_PI_OVER_180]  // Pi/180 broadcast (low two lanes)
mulps xmm0, xmm1                      // degrees * Pi/180
movlps [Result], xmm0                 // store low two lanes
end;
93
{ Converts a 3-component vector of angles from degrees to radians (SSE).
  X/Y travel in xmm0, Z separately in xmm1; lane 3 is unused. }
function Radians(const ADegrees: TVector3): TVector3; assembler;
asm
movq xmm0, [ADegrees]        // load X, Y
movss xmm1, [ADegrees+8]     // load Z
movups xmm2, [SSE_PI_OVER_180]
mulps xmm0, xmm2             // X, Y * Pi/180
mulps xmm1, xmm2             // Z * Pi/180 (lane 0 only is stored)
movq [Result], xmm0
movss [Result+8], xmm1
end;
104
{ Converts a 4-component vector of angles from degrees to radians (SSE). }
function Radians(const ADegrees: TVector4): TVector4; assembler;
asm
movups xmm0, [ADegrees]        // unaligned load of all 4 components
movups xmm1, [SSE_PI_OVER_180]
mulps xmm0, xmm1               // degrees * Pi/180, all 4 lanes
movups [Result], xmm0
end;
112
{ Converts an angle from radians to degrees. }
function Degrees(const ARadians: Single): Single;
const
  RadToDeg = 180 / Pi;
begin
  Result := RadToDeg * ARadians;
end;
117
{ Converts a 2-component vector of angles from radians to degrees (SSE). }
function Degrees(const ARadians: TVector2): TVector2; assembler;
asm
movlps xmm0, [ARadians]               // load both Singles into low half of xmm0
movlps xmm1, QWORD [SSE_180_OVER_PI]  // 180/Pi broadcast (low two lanes)
mulps xmm0, xmm1                      // radians * 180/Pi
movlps [Result], xmm0
end;
125
{ Converts a 3-component vector of angles from radians to degrees (SSE).
  X/Y travel in xmm0, Z separately in xmm1; lane 3 is unused. }
function Degrees(const ARadians: TVector3): TVector3; assembler;
asm
movq xmm0, [ARadians]        // load X, Y
movss xmm1, [ARadians+8]     // load Z
movups xmm2, [SSE_180_OVER_PI]
mulps xmm0, xmm2             // X, Y * 180/Pi
mulps xmm1, xmm2             // Z * 180/Pi (lane 0 only is stored)
movq [Result], xmm0
movss [Result+8], xmm1
end;
136
{ Converts a 4-component vector of angles from radians to degrees (SSE). }
function Degrees(const ARadians: TVector4): TVector4; assembler;
asm
movups xmm0, [ARadians]        // unaligned load of all 4 components
movups xmm1, [SSE_180_OVER_PI]
mulps xmm0, xmm1               // radians * 180/Pi, all 4 lanes
movups [Result], xmm0
end;
144
145{ Exponential Functions }
146
{ Scalar square root via the SSE sqrtss instruction (full precision). }
function Sqrt(const A: Single): Single; assembler;
asm
movss xmm0, [A]
sqrtss xmm0, xmm0      // exact (correctly rounded) sqrt of lane 0
movss [Result], xmm0
end;
153
{ Per-component square root of a 2-component vector (SSE). }
function Sqrt(const A: TVector2): TVector2; assembler;
asm
movlps xmm0, [A]       // load both components into low half
sqrtps xmm0, xmm0      // packed sqrt (upper lanes are don't-care)
movlps [Result], xmm0
end;
160
{ Per-component square root of a 3-component vector (SSE).
  Packs X/Y into the low half and Z into lane 2 via movlhps, so one
  sqrtps covers all three; lane 3 is unused. }
function Sqrt(const A: TVector3): TVector3; assembler;
asm
movq xmm0, [A]         // load X, Y
movss xmm1, [A+8]      // load Z
movlhps xmm0, xmm1     // xmm0 = [X, Y, Z, 0]
sqrtps xmm0, xmm0
movhlps xmm1, xmm0     // bring lane 2 (sqrt Z) down to lane 0 of xmm1
movq [Result], xmm0
movss [Result+8], xmm1
end;
171
{ Per-component square root of a 4-component vector (SSE). }
function Sqrt(const A: TVector4): TVector4; assembler;
asm
movups xmm0, [A]
sqrtps xmm0, xmm0
movups [Result], xmm0
end;
178
{ Fast approximate 1/Sqrt(A) via the SSE rsqrtss instruction.
  NOTE: rsqrtss is an approximation (relative error <= 1.5 * 2^-12,
  about 12 bits); use 1 / Sqrt(A) where full precision is required. }
function InverseSqrt(const A: Single): Single; assembler;
asm
movss xmm0, [A]
rsqrtss xmm0, xmm0
movss [Result], xmm0
end;
185
{ Fast approximate per-component 1/Sqrt of a 2-component vector.
  Fix: added the missing `assembler` directive for consistency with
  every other InverseSqrt/Sqrt overload in this unit (the routine body
  is pure asm, like its siblings).
  NOTE: rsqrtps is an approximation (relative error <= 1.5 * 2^-12). }
function InverseSqrt(const A: TVector2): TVector2; assembler;
asm
movlps xmm0, [A]       // load both components into low half
rsqrtps xmm0, xmm0     // approximate reciprocal sqrt, packed
movlps [Result], xmm0
end;
192
{ Fast approximate per-component 1/Sqrt of a 3-component vector.
  Fix: added the missing `assembler` directive for consistency with
  every other InverseSqrt/Sqrt overload in this unit.
  NOTE: rsqrtps is an approximation (relative error <= 1.5 * 2^-12). }
function InverseSqrt(const A: TVector3): TVector3; assembler;
asm
movq xmm0, [A]         // load X, Y
movss xmm1, [A+8]      // load Z
movlhps xmm0, xmm1     // xmm0 = [X, Y, Z, 0]
rsqrtps xmm0, xmm0
movhlps xmm1, xmm0     // bring lane 2 down to lane 0 of xmm1
movq [Result], xmm0
movss [Result+8], xmm1
end;
203
{ Fast approximate per-component 1/Sqrt of a 4-component vector.
  NOTE: rsqrtps is an approximation (relative error <= 1.5 * 2^-12). }
function InverseSqrt(const A: TVector4): TVector4; assembler;
asm
movups xmm0, [A]
rsqrtps xmm0, xmm0
movups [Result], xmm0
end;
210
211{ Fast approximate Functions }
212
{ Fast approximate Sin using a Cephes-style minimax polynomial after
  octant range reduction (J := Round(Abs(X) * 4/Pi), then a COSCOF or
  SINCOF polynomial is selected per octant and the sign restored).
  Fixes: two `addps` in this purely scalar routine replaced by `addss` —
  lane 0 (the result) is unchanged, but packed FP adds on upper lanes
  holding masks/garbage were inconsistent with the rest of the routine
  and could set spurious (masked) FP status flags. }
function FastSin(const ARadians: Single): Single; assembler;
asm
movss xmm0, [ARadians]
movss xmm2, DWORD [SSE_MASK_ABS_VAL]
movaps xmm1, xmm0
movss xmm3, DWORD [SSE_MASK_SIGN]
andps xmm0, xmm2 // (xmm0) X := Abs(ARadians)
andps xmm1, xmm3 // (xmm1) SignBit
movaps xmm2, xmm0
movss xmm4, DWORD [SSE_FOPI]
movss xmm5, DWORD [SSE_INT_ONE]
mulss xmm2, xmm4
movss xmm6, DWORD [SSE_INT_NOT_ONE]
cvtps2dq xmm2, xmm2 // J := Round(X * FOPI); cvtps2dq rounds per MXCSR (nearest by default), not Trunc
movss xmm7, DWORD [SSE_INT_FOUR]
paddd xmm2, xmm5
pand xmm2, xmm6 // (xmm2) J := (J + 1) and (not 1)
movss xmm6, DWORD [SSE_INT_TWO]
cvtdq2ps xmm4, xmm2 // (xmm4) Y := J
movaps xmm5, xmm2
pand xmm2, xmm6 // J and 2
pand xmm5, xmm7 // J and 4
pxor xmm7, xmm7
pslld xmm5, 29 // (xmm5) SwapSignBit := (J and 4) shl 29
pcmpeqd xmm2, xmm7 // (xmm2) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
movss xmm6, DWORD [SSE_PI_OVER_4]
pxor xmm1, xmm5 // (xmm1) SignBit := SignBit xor SwapSignBit
mulss xmm4, xmm6 // Y * Pi / 4
movss xmm3, DWORD [SSE_COSCOF_P0]
subss xmm0, xmm4 // (xmm0) X := X - (Y * Pi / 4)
movss xmm4, DWORD [SSE_COSCOF_P1]
movaps xmm7, xmm0
movss xmm6, DWORD [SSE_COSCOF_P2]
mulss xmm7, xmm7 // (xmm7) Z := X * X
movss xmm5, DWORD [SSE_SINCOF_P1]
mulss xmm3, xmm7 // COSCOF_P0 * Z
addss xmm3, xmm4 // Y := COSCOF_P0 * Z + COSCOF_P1
movss xmm4, DWORD [SSE_ONE_HALF]
mulss xmm3, xmm7 // Y * Z
mulss xmm4, xmm7 // Z * 0.5
addss xmm3, xmm6 // Y := (Y * Z) + COSCOF_P2 (was addps; scalar op for scalar routine)
movss xmm6, DWORD [SSE_ONE]
mulss xmm3, xmm7 // Y * Z
mulss xmm3, xmm7 // Y := Y * (Z * Z)
subss xmm3, xmm4 // Y - Z * 0.5
movss xmm4, DWORD [SSE_SINCOF_P0]
addss xmm3, xmm6 // (xmm3) Y := Y - Z * 0.5 + 1 (was addps; scalar op for scalar routine)
movss xmm6, DWORD [SSE_SINCOF_P2]
mulss xmm4, xmm7 // SINCOF_P0 * Z
addss xmm4, xmm5 // Y2 := SINCOF_P0 * Z + SINCOF_P1
movaps xmm5, xmm2
mulss xmm4, xmm7 // Y2 * Z
addss xmm4, xmm6 // Y2 := (Y2 * Z) + SINCOF_P2
mulss xmm4, xmm7 // Y2 * Z
mulss xmm4, xmm0 // Y2 * (Z * X)
addss xmm4, xmm0 // (xmm4) Y2 := Y2 * (Z * X) + X
andps xmm4, xmm2 // Y2 := ((J and 2) = 0)? Yes: Y2, No: 0
andnps xmm5, xmm3 // Y := ((J and 2) = 0)? Yes: 0 , No: Y
addss xmm4, xmm5
xorps xmm4, xmm1 // (Y + Y2) xor SignBit
movss [Result], xmm4
end;
275
{ Fast approximate Sin of both components of a 2-component vector.
  Same Cephes-style algorithm as FastSin(Single): octant range reduction,
  per-octant selection between a COSCOF and a SINCOF polynomial, then
  sign restoration. Only the low two lanes are meaningful. }
function FastSin(const ARadians: TVector2): TVector2; assembler;
asm
movlps xmm0, [ARadians]
movlps xmm2, QWORD [SSE_MASK_ABS_VAL]
movaps xmm1, xmm0
movlps xmm3, QWORD [SSE_MASK_SIGN]
andps xmm0, xmm2 // (xmm0) X := Abs(ARadians)
andps xmm1, xmm3 // (xmm1) SignBit
movaps xmm2, xmm0
movlps xmm4, QWORD [SSE_FOPI]
movlps xmm5, QWORD [SSE_INT_ONE]
mulps xmm2, xmm4
movlps xmm6, QWORD [SSE_INT_NOT_ONE]
cvtps2dq xmm2, xmm2 // J := Round(X * FOPI); cvtps2dq rounds per MXCSR (nearest by default), not Trunc
movlps xmm7, QWORD [SSE_INT_FOUR]
paddd xmm2, xmm5
pand xmm2, xmm6 // (xmm2) J := (J + 1) and (not 1)
movlps xmm6, QWORD [SSE_INT_TWO]
cvtdq2ps xmm4, xmm2 // (xmm4) Y := J
movaps xmm5, xmm2
pand xmm2, xmm6 // J and 2
pand xmm5, xmm7 // J and 4
pxor xmm7, xmm7
pslld xmm5, 29 // (xmm5) SwapSignBit := (J and 4) shl 29
pcmpeqd xmm2, xmm7 // (xmm2) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
movlps xmm6, QWORD [SSE_PI_OVER_4]
pxor xmm1, xmm5 // (xmm1) SignBit := SignBit xor SwapSignBit
mulps xmm4, xmm6 // Y * Pi / 4
movlps xmm3, QWORD [SSE_COSCOF_P0]
subps xmm0, xmm4 // (xmm0) X := X - (Y * Pi / 4)
movlps xmm4, QWORD [SSE_COSCOF_P1]
movaps xmm7, xmm0
movlps xmm6, QWORD [SSE_COSCOF_P2]
mulps xmm7, xmm7 // (xmm7) Z := X * X
movlps xmm5, QWORD [SSE_SINCOF_P1]
mulps xmm3, xmm7 // COSCOF_P0 * Z
addps xmm3, xmm4 // Y := COSCOF_P0 * Z + COSCOF_P1
movlps xmm4, QWORD [SSE_ONE_HALF]
mulps xmm3, xmm7 // Y * Z
mulps xmm4, xmm7 // Z * 0.5
addps xmm3, xmm6 // Y := (Y * Z) + COSCOF_P2
movlps xmm6, QWORD [SSE_ONE]
mulps xmm3, xmm7 // Y * Z
mulps xmm3, xmm7 // Y := Y * (Z * Z)
subps xmm3, xmm4 // Y - Z * 0.5
movlps xmm4, QWORD [SSE_SINCOF_P0]
addps xmm3, xmm6 // (xmm3) Y := Y - Z * 0.5 + 1
movlps xmm6, QWORD [SSE_SINCOF_P2]
mulps xmm4, xmm7 // SINCOF_P0 * Z
addps xmm4, xmm5 // Y2 := SINCOF_P0 * Z + SINCOF_P1
movaps xmm5, xmm2
mulps xmm4, xmm7 // Y2 * Z
addps xmm4, xmm6 // Y2 := (Y2 * Z) + SINCOF_P2
mulps xmm4, xmm7 // Y2 * Z
mulps xmm4, xmm0 // Y2 * (Z * X)
addps xmm4, xmm0 // (xmm4) Y2 := Y2 * (Z * X) + X
andps xmm4, xmm2 // Y2 := ((J and 2) = 0)? Yes: Y2, No: 0
andnps xmm5, xmm3 // Y := ((J and 2) = 0)? Yes: 0 , No: Y
addps xmm4, xmm5
xorps xmm4, xmm1 // (Y + Y2) xor SignBit
movlps [Result], xmm4
end;
338
{ Fast approximate Sin of all three components of a 3-component vector.
  Same Cephes-style algorithm as FastSin(Single); the three components
  are packed as [X, Y, Z, 0] via movlhps so one SIMD pass covers all,
  then unpacked on store. Lane 3 is computed but discarded. }
function FastSin(const ARadians: TVector3): TVector3; assembler;
asm
movq xmm0, [ARadians]
movss xmm1, [ARadians+8]
movlhps xmm0, xmm1
movups xmm2, [SSE_MASK_ABS_VAL]
movaps xmm1, xmm0
movups xmm3, [SSE_MASK_SIGN]
andps xmm0, xmm2 // (xmm0) X := Abs(ARadians)
andps xmm1, xmm3 // (xmm1) SignBit
movaps xmm2, xmm0
movups xmm4, [SSE_FOPI]
movups xmm5, [SSE_INT_ONE]
mulps xmm2, xmm4
movups xmm6, [SSE_INT_NOT_ONE]
cvtps2dq xmm2, xmm2 // J := Round(X * FOPI); cvtps2dq rounds per MXCSR (nearest by default), not Trunc
movups xmm7, [SSE_INT_FOUR]
paddd xmm2, xmm5
pand xmm2, xmm6 // (xmm2) J := (J + 1) and (not 1)
movups xmm6, [SSE_INT_TWO]
cvtdq2ps xmm4, xmm2 // (xmm4) Y := J
movaps xmm5, xmm2
pand xmm2, xmm6 // J and 2
pand xmm5, xmm7 // J and 4
pxor xmm7, xmm7
pslld xmm5, 29 // (xmm5) SwapSignBit := (J and 4) shl 29
pcmpeqd xmm2, xmm7 // (xmm2) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
movups xmm6, [SSE_PI_OVER_4]
pxor xmm1, xmm5 // (xmm1) SignBit := SignBit xor SwapSignBit
mulps xmm4, xmm6 // Y * Pi / 4
movups xmm3, [SSE_COSCOF_P0]
subps xmm0, xmm4 // (xmm0) X := X - (Y * Pi / 4)
movups xmm4, [SSE_COSCOF_P1]
movaps xmm7, xmm0
movups xmm6, [SSE_COSCOF_P2]
mulps xmm7, xmm7 // (xmm7) Z := X * X
movups xmm5, [SSE_SINCOF_P1]
mulps xmm3, xmm7 // COSCOF_P0 * Z
addps xmm3, xmm4 // Y := COSCOF_P0 * Z + COSCOF_P1
movups xmm4, [SSE_ONE_HALF]
mulps xmm3, xmm7 // Y * Z
mulps xmm4, xmm7 // Z * 0.5
addps xmm3, xmm6 // Y := (Y * Z) + COSCOF_P2
movups xmm6, [SSE_ONE]
mulps xmm3, xmm7 // Y * Z
mulps xmm3, xmm7 // Y := Y * (Z * Z)
subps xmm3, xmm4 // Y - Z * 0.5
movups xmm4, [SSE_SINCOF_P0]
addps xmm3, xmm6 // (xmm3) Y := Y - Z * 0.5 + 1
movups xmm6, [SSE_SINCOF_P2]
mulps xmm4, xmm7 // SINCOF_P0 * Z
addps xmm4, xmm5 // Y2 := SINCOF_P0 * Z + SINCOF_P1
movaps xmm5, xmm2
mulps xmm4, xmm7 // Y2 * Z
addps xmm4, xmm6 // Y2 := (Y2 * Z) + SINCOF_P2
mulps xmm4, xmm7 // Y2 * Z
mulps xmm4, xmm0 // Y2 * (Z * X)
addps xmm4, xmm0 // (xmm4) Y2 := Y2 * (Z * X) + X
andps xmm4, xmm2 // Y2 := ((J and 2) = 0)? Yes: Y2, No: 0
andnps xmm5, xmm3 // Y := ((J and 2) = 0)? Yes: 0 , No: Y
addps xmm4, xmm5
xorps xmm4, xmm1 // (Y + Y2) xor SignBit
movhlps xmm5, xmm4 // bring lane 2 (Z result) down to lane 0 of xmm5
movq [Result], xmm4
movss [Result+8], xmm5
end;
405
{ Fast approximate Sin of all four components of a 4-component vector.
  Same Cephes-style algorithm as FastSin(Single), fully vectorized. }
function FastSin(const ARadians: TVector4): TVector4; assembler;
asm
movups xmm0, [ARadians]
movups xmm2, [SSE_MASK_ABS_VAL]
movaps xmm1, xmm0
movups xmm3, [SSE_MASK_SIGN]
andps xmm0, xmm2 // (xmm0) X := Abs(ARadians)
andps xmm1, xmm3 // (xmm1) SignBit
movaps xmm2, xmm0
movups xmm4, [SSE_FOPI]
movups xmm5, [SSE_INT_ONE]
mulps xmm2, xmm4
movups xmm6, [SSE_INT_NOT_ONE]
cvtps2dq xmm2, xmm2 // J := Round(X * FOPI); cvtps2dq rounds per MXCSR (nearest by default), not Trunc
movups xmm7, [SSE_INT_FOUR]
paddd xmm2, xmm5
pand xmm2, xmm6 // (xmm2) J := (J + 1) and (not 1)
movups xmm6, [SSE_INT_TWO]
cvtdq2ps xmm4, xmm2 // (xmm4) Y := J
movaps xmm5, xmm2
pand xmm2, xmm6 // J and 2
pand xmm5, xmm7 // J and 4
pxor xmm7, xmm7
pslld xmm5, 29 // (xmm5) SwapSignBit := (J and 4) shl 29
pcmpeqd xmm2, xmm7 // (xmm2) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
movups xmm6, [SSE_PI_OVER_4]
pxor xmm1, xmm5 // (xmm1) SignBit := SignBit xor SwapSignBit
mulps xmm4, xmm6 // Y * Pi / 4
movups xmm3, [SSE_COSCOF_P0]
subps xmm0, xmm4 // (xmm0) X := X - (Y * Pi / 4)
movups xmm4, [SSE_COSCOF_P1]
movaps xmm7, xmm0
movups xmm6, [SSE_COSCOF_P2]
mulps xmm7, xmm7 // (xmm7) Z := X * X
movups xmm5, [SSE_SINCOF_P1]
mulps xmm3, xmm7 // COSCOF_P0 * Z
addps xmm3, xmm4 // Y := COSCOF_P0 * Z + COSCOF_P1
movups xmm4, [SSE_ONE_HALF]
mulps xmm3, xmm7 // Y * Z
mulps xmm4, xmm7 // Z * 0.5
addps xmm3, xmm6 // Y := (Y * Z) + COSCOF_P2
movups xmm6, [SSE_ONE]
mulps xmm3, xmm7 // Y * Z
mulps xmm3, xmm7 // Y := Y * (Z * Z)
subps xmm3, xmm4 // Y - Z * 0.5
movups xmm4, [SSE_SINCOF_P0]
addps xmm3, xmm6 // (xmm3) Y := Y - Z * 0.5 + 1
movups xmm6, [SSE_SINCOF_P2]
mulps xmm4, xmm7 // SINCOF_P0 * Z
addps xmm4, xmm5 // Y2 := SINCOF_P0 * Z + SINCOF_P1
movaps xmm5, xmm2
mulps xmm4, xmm7 // Y2 * Z
addps xmm4, xmm6 // Y2 := (Y2 * Z) + SINCOF_P2
mulps xmm4, xmm7 // Y2 * Z
mulps xmm4, xmm0 // Y2 * (Z * X)
addps xmm4, xmm0 // (xmm4) Y2 := Y2 * (Z * X) + X
andps xmm4, xmm2 // Y2 := ((J and 2) = 0)? Yes: Y2, No: 0
andnps xmm5, xmm3 // Y := ((J and 2) = 0)? Yes: 0 , No: Y
addps xmm4, xmm5
xorps xmm4, xmm1 // (Y + Y2) xor SignBit
movups [Result], xmm4
end;
468
{ Fast approximate Cos using a Cephes-style minimax polynomial after
  octant range reduction; Cos reuses the Sin machinery with the octant
  index shifted by 2 (Cos(x) = Sin(x + Pi/2)).
  Fix: one `addps` in this purely scalar routine replaced by `addss` —
  lane 0 (the result) is unchanged, but a packed FP add on upper lanes
  holding garbage was inconsistent with the rest of the routine and
  could set spurious (masked) FP status flags. }
function FastCos(const ARadians: Single): Single; assembler;
asm
movss xmm0, [ARadians]
movss xmm1, DWORD [SSE_MASK_ABS_VAL]
movss xmm2, DWORD [SSE_FOPI]
andps xmm0, xmm1 // (xmm0) X := Abs(ARadians)
movss xmm3, DWORD [SSE_INT_NOT_ONE]
movaps xmm1, xmm0
movss xmm4, DWORD [SSE_INT_FOUR]
mulss xmm1, xmm2
movss xmm2, DWORD [SSE_INT_ONE]
cvtps2dq xmm1, xmm1 // J := Round(X * FOPI); cvtps2dq rounds per MXCSR (nearest by default), not Trunc
pxor xmm6, xmm6
paddd xmm1, xmm2
pand xmm1, xmm3 // (xmm1) J := (J + 1) and (not 1)
movss xmm3, DWORD [SSE_INT_TWO]
cvtdq2ps xmm2, xmm1 // (xmm2) Y := J
psubd xmm1, xmm3 // J - 2
movaps xmm5, xmm1
pandn xmm1, xmm4 // (not (J - 2)) and 4
pand xmm5, xmm3 // (J - 2) and 2
pslld xmm1, 29 // (xmm1) SignBit := ((not (J - 2)) and 4) shl 29
movss xmm3, DWORD [SSE_PI_OVER_4]
pcmpeqd xmm5, xmm6 // (xmm5) PolyMask := ((J-2) and 2)=0)? Yes: $FFFFFFFF, No: $00000000
mulss xmm2, xmm3 // Y * Pi / 4
movss xmm3, DWORD [SSE_COSCOF_P1]
subss xmm0, xmm2 // (xmm0) X := X - (Y * Pi / 4)
movss xmm2, DWORD [SSE_COSCOF_P0]
movss xmm4, DWORD [SSE_COSCOF_P2]
movaps xmm6, xmm0
mulss xmm6, xmm6 // (xmm6) Z := X * X
mulss xmm2, xmm6 // COSCOF_P0 * Z
addss xmm2, xmm3 // Y := COSCOF_P0 * Z + COSCOF_P1 (was addps; scalar op for scalar routine)
movss xmm3, DWORD [SSE_ONE_HALF]
mulss xmm2, xmm6 // Y * Z
mulss xmm3, xmm6 // Z * 0.5
addss xmm2, xmm4 // Y := (Y * Z) + COSCOF_P2
movss xmm7, DWORD [SSE_ONE]
mulss xmm2, xmm6
movss xmm4, DWORD [SSE_SINCOF_P1]
mulss xmm2, xmm6 // Y := Y * (Z * Z)
subss xmm2, xmm3 // Y - Z * 0.5
addss xmm2, xmm7 // (xmm2) Y := Y - Z * 0.5 + 1
movss xmm3, DWORD [SSE_SINCOF_P0]
movss xmm7, DWORD [SSE_SINCOF_P2]
mulss xmm3, xmm6 // SINCOF_P0 * Z
addss xmm3, xmm4 // Y2 := SINCOF_P0 * Z + SINCOF_P1
mulss xmm3, xmm6 // Y2 * Z
addss xmm3, xmm7 // Y2 := (Y2 * Z) + SINCOF_P2
mulss xmm3, xmm6 // Y2 * Z
mulss xmm3, xmm0 // Y2 * (Z * X)
addss xmm3, xmm0 // Y2 := Y2 * (Z * X) + X
andps xmm3, xmm5 // ((J-2) and 2) = 0)? Yes: Y2, No: 0
andnps xmm5, xmm2 // ((J-2) and 2) = 0)? Yes: 0 , No: Y
addss xmm3, xmm5
xorps xmm3, xmm1 // (Y + Y2) xor SignBit
movss [Result], xmm3
end;
527
{ Fast approximate Cos of both components of a 2-component vector.
  Same algorithm as FastCos(Single): octant range reduction with the
  index shifted by 2 relative to Sin. Only the low two lanes matter. }
function FastCos(const ARadians: TVector2): TVector2; assembler;
asm
movlps xmm0, [ARadians]
movlps xmm1, QWORD [SSE_MASK_ABS_VAL]
movlps xmm2, QWORD [SSE_FOPI]
andps xmm0, xmm1 // (xmm0) X := Abs(ARadians)
movlps xmm3, QWORD [SSE_INT_NOT_ONE]
movaps xmm1, xmm0
movlps xmm4, QWORD [SSE_INT_FOUR]
mulps xmm1, xmm2
movlps xmm2, QWORD [SSE_INT_ONE]
cvtps2dq xmm1, xmm1 // J := Round(X * FOPI); cvtps2dq rounds per MXCSR (nearest by default), not Trunc
pxor xmm6, xmm6
paddd xmm1, xmm2
pand xmm1, xmm3 // (xmm1) J := (J + 1) and (not 1)
movlps xmm3, QWORD [SSE_INT_TWO]
cvtdq2ps xmm2, xmm1 // (xmm2) Y := J
psubd xmm1, xmm3 // J - 2
movaps xmm5, xmm1
pandn xmm1, xmm4 // (not (J - 2)) and 4
pand xmm5, xmm3 // (J - 2) and 2
pslld xmm1, 29 // (xmm1) SignBit := ((not (J - 2)) and 4) shl 29
movlps xmm3, QWORD [SSE_PI_OVER_4]
pcmpeqd xmm5, xmm6 // (xmm5) PolyMask := ((J-2) and 2)=0)? Yes: $FFFFFFFF, No: $00000000
mulps xmm2, xmm3 // Y * Pi / 4
movlps xmm3, QWORD [SSE_COSCOF_P1]
subps xmm0, xmm2 // (xmm0) X := X - (Y * Pi / 4)
movlps xmm2, QWORD [SSE_COSCOF_P0]
movlps xmm4, QWORD [SSE_COSCOF_P2]
movaps xmm6, xmm0
mulps xmm6, xmm6 // (xmm6) Z := X * X
mulps xmm2, xmm6 // COSCOF_P0 * Z
addps xmm2, xmm3 // Y := COSCOF_P0 * Z + COSCOF_P1
movlps xmm3, QWORD [SSE_ONE_HALF]
mulps xmm2, xmm6 // Y * Z
mulps xmm3, xmm6 // Z * 0.5
addps xmm2, xmm4 // Y := (Y * Z) + COSCOF_P2
movlps xmm7, QWORD [SSE_ONE]
mulps xmm2, xmm6
movlps xmm4, QWORD [SSE_SINCOF_P1]
mulps xmm2, xmm6 // Y := Y * (Z * Z)
subps xmm2, xmm3 // Y - Z * 0.5
addps xmm2, xmm7 // (xmm2) Y := Y - Z * 0.5 + 1
movlps xmm3, QWORD [SSE_SINCOF_P0]
movlps xmm7, QWORD [SSE_SINCOF_P2]
mulps xmm3, xmm6 // SINCOF_P0 * Z
addps xmm3, xmm4 // Y2 := SINCOF_P0 * Z + SINCOF_P1
mulps xmm3, xmm6 // Y2 * Z
addps xmm3, xmm7 // Y2 := (Y2 * Z) + SINCOF_P2
mulps xmm3, xmm6 // Y2 * Z
mulps xmm3, xmm0 // Y2 * (Z * X)
addps xmm3, xmm0 // Y2 := Y2 * (Z * X) + X
andps xmm3, xmm5 // ((J-2) and 2) = 0)? Yes: Y2, No: 0
andnps xmm5, xmm2 // ((J-2) and 2) = 0)? Yes: 0 , No: Y
addps xmm3, xmm5
xorps xmm3, xmm1 // (Y + Y2) xor SignBit
movlps [Result], xmm3
end;
586
{ Fast approximate Cos of all three components of a 3-component vector.
  Same algorithm as FastCos(Single); the three components are packed as
  [X, Y, Z, 0] via movlhps, then unpacked on store. Lane 3 discarded. }
function FastCos(const ARadians: TVector3): TVector3; assembler;
asm
movq xmm0, [ARadians]
movss xmm1, [ARadians+8]
movlhps xmm0, xmm1
movups xmm1, [SSE_MASK_ABS_VAL]
movups xmm2, [SSE_FOPI]
andps xmm0, xmm1 // (xmm0) X := Abs(ARadians)
movups xmm3, [SSE_INT_NOT_ONE]
movaps xmm1, xmm0
movups xmm4, [SSE_INT_FOUR]
mulps xmm1, xmm2
movups xmm2, [SSE_INT_ONE]
cvtps2dq xmm1, xmm1 // J := Round(X * FOPI); cvtps2dq rounds per MXCSR (nearest by default), not Trunc
pxor xmm6, xmm6
paddd xmm1, xmm2
pand xmm1, xmm3 // (xmm1) J := (J + 1) and (not 1)
movups xmm3, [SSE_INT_TWO]
cvtdq2ps xmm2, xmm1 // (xmm2) Y := J
psubd xmm1, xmm3 // J - 2
movaps xmm5, xmm1
pandn xmm1, xmm4 // (not (J - 2)) and 4
pand xmm5, xmm3 // (J - 2) and 2
pslld xmm1, 29 // (xmm1) SignBit := ((not (J - 2)) and 4) shl 29
movups xmm3, [SSE_PI_OVER_4]
pcmpeqd xmm5, xmm6 // (xmm5) PolyMask := ((J-2) and 2)=0)? Yes: $FFFFFFFF, No: $00000000
mulps xmm2, xmm3 // Y * Pi / 4
movups xmm3, [SSE_COSCOF_P1]
subps xmm0, xmm2 // (xmm0) X := X - (Y * Pi / 4)
movups xmm2, [SSE_COSCOF_P0]
movups xmm4, [SSE_COSCOF_P2]
movaps xmm6, xmm0
mulps xmm6, xmm6 // (xmm6) Z := X * X
mulps xmm2, xmm6 // COSCOF_P0 * Z
addps xmm2, xmm3 // Y := COSCOF_P0 * Z + COSCOF_P1
movups xmm3, [SSE_ONE_HALF]
mulps xmm2, xmm6 // Y * Z
mulps xmm3, xmm6 // Z * 0.5
addps xmm2, xmm4 // Y := (Y * Z) + COSCOF_P2
movups xmm7, [SSE_ONE]
mulps xmm2, xmm6
movups xmm4, [SSE_SINCOF_P1]
mulps xmm2, xmm6 // Y := Y * (Z * Z)
subps xmm2, xmm3 // Y - Z * 0.5
addps xmm2, xmm7 // (xmm2) Y := Y - Z * 0.5 + 1
movups xmm3, [SSE_SINCOF_P0]
movups xmm7, [SSE_SINCOF_P2]
mulps xmm3, xmm6 // SINCOF_P0 * Z
addps xmm3, xmm4 // Y2 := SINCOF_P0 * Z + SINCOF_P1
mulps xmm3, xmm6 // Y2 * Z
addps xmm3, xmm7 // Y2 := (Y2 * Z) + SINCOF_P2
mulps xmm3, xmm6 // Y2 * Z
mulps xmm3, xmm0 // Y2 * (Z * X)
addps xmm3, xmm0 // Y2 := Y2 * (Z * X) + X
andps xmm3, xmm5 // ((J-2) and 2) = 0)? Yes: Y2, No: 0
andnps xmm5, xmm2 // ((J-2) and 2) = 0)? Yes: 0 , No: Y
addps xmm3, xmm5
xorps xmm3, xmm1 // (Y + Y2) xor SignBit
movhlps xmm4, xmm3 // bring lane 2 (Z result) down to lane 0 of xmm4
movq [Result], xmm3
movss [Result+8], xmm4
end;
649
{ Fast approximate Cos of all four components of a 4-component vector.
  Same algorithm as FastCos(Single), fully vectorized. }
function FastCos(const ARadians: TVector4): TVector4; assembler;
asm
movups xmm0, [ARadians]
movups xmm1, [SSE_MASK_ABS_VAL]
movups xmm2, [SSE_FOPI]
andps xmm0, xmm1 // (xmm0) X := Abs(ARadians)
movups xmm3, [SSE_INT_NOT_ONE]
movaps xmm1, xmm0
movups xmm4, [SSE_INT_FOUR]
mulps xmm1, xmm2
movups xmm2, [SSE_INT_ONE]
cvtps2dq xmm1, xmm1 // J := Round(X * FOPI); cvtps2dq rounds per MXCSR (nearest by default), not Trunc
pxor xmm6, xmm6
paddd xmm1, xmm2
pand xmm1, xmm3 // (xmm1) J := (J + 1) and (not 1)
movups xmm3, [SSE_INT_TWO]
cvtdq2ps xmm2, xmm1 // (xmm2) Y := J
psubd xmm1, xmm3 // J - 2
movaps xmm5, xmm1
pandn xmm1, xmm4 // (not (J - 2)) and 4
pand xmm5, xmm3 // (J - 2) and 2
pslld xmm1, 29 // (xmm1) SignBit := ((not (J - 2)) and 4) shl 29
movups xmm3, [SSE_PI_OVER_4]
pcmpeqd xmm5, xmm6 // (xmm5) PolyMask := ((J-2) and 2)=0)? Yes: $FFFFFFFF, No: $00000000
mulps xmm2, xmm3 // Y * Pi / 4
movups xmm3, [SSE_COSCOF_P1]
subps xmm0, xmm2 // (xmm0) X := X - (Y * Pi / 4)
movups xmm2, [SSE_COSCOF_P0]
movups xmm4, [SSE_COSCOF_P2]
movaps xmm6, xmm0
mulps xmm6, xmm6 // (xmm6) Z := X * X
mulps xmm2, xmm6 // COSCOF_P0 * Z
addps xmm2, xmm3 // Y := COSCOF_P0 * Z + COSCOF_P1
movups xmm3, [SSE_ONE_HALF]
mulps xmm2, xmm6 // Y * Z
mulps xmm3, xmm6 // Z * 0.5
addps xmm2, xmm4 // Y := (Y * Z) + COSCOF_P2
movups xmm7, [SSE_ONE]
mulps xmm2, xmm6
movups xmm4, [SSE_SINCOF_P1]
mulps xmm2, xmm6 // Y := Y * (Z * Z)
subps xmm2, xmm3 // Y - Z * 0.5
addps xmm2, xmm7 // (xmm2) Y := Y - Z * 0.5 + 1
movups xmm3, [SSE_SINCOF_P0]
movups xmm7, [SSE_SINCOF_P2]
mulps xmm3, xmm6 // SINCOF_P0 * Z
addps xmm3, xmm4 // Y2 := SINCOF_P0 * Z + SINCOF_P1
mulps xmm3, xmm6 // Y2 * Z
addps xmm3, xmm7 // Y2 := (Y2 * Z) + SINCOF_P2
mulps xmm3, xmm6 // Y2 * Z
mulps xmm3, xmm0 // Y2 * (Z * X)
addps xmm3, xmm0 // Y2 := Y2 * (Z * X) + X
andps xmm3, xmm5 // ((J-2) and 2) = 0)? Yes: Y2, No: 0
andnps xmm5, xmm2 // ((J-2) and 2) = 0)? Yes: 0 , No: Y
addps xmm3, xmm5
xorps xmm3, xmm1 // (Y + Y2) xor SignBit
movups [Result], xmm3
end;
708
{ Computes fast approximate Sin and Cos of the same angle in one pass;
  both polynomials share the octant reduction, so this is cheaper than
  calling FastSin and FastCos separately.
  Fixes: two `addps` in this purely scalar routine replaced by `addss`
  in the final result selection — lane 0 (the stored result) is
  unchanged, but packed FP adds on upper lanes holding masks/garbage
  were inconsistent with the rest of the routine and could set spurious
  (masked) FP status flags. }
procedure FastSinCos(const ARadians: Single; out ASin, ACos: Single); assembler;
asm
movss xmm0, [ARadians]
movss xmm2, DWORD [SSE_MASK_SIGN]
movss xmm3, DWORD [SSE_MASK_ABS_VAL]
movaps xmm1, xmm0
pand xmm0, xmm3 // (xmm0) X := Abs(ARadians)
pand xmm1, xmm2 // (xmm1) SignBitSin
movaps xmm4, xmm0
movss xmm5, DWORD [SSE_FOPI]
movss xmm6, DWORD [SSE_INT_ONE]
mulss xmm4, xmm5
movss xmm7, DWORD [SSE_INT_NOT_ONE]
cvtps2dq xmm4, xmm4 // (xmm4) J := Round(X * FOPI); cvtps2dq rounds per MXCSR (nearest by default), not Trunc
movss xmm5, DWORD [SSE_INT_FOUR]
paddd xmm4, xmm6
pand xmm4, xmm7 // (xmm4) J := (J + 1) and (not 1)
movss xmm7, DWORD [SSE_INT_TWO]
cvtdq2ps xmm2, xmm4 // (xmm2) Y := J
movaps xmm3, xmm4
movaps xmm6, xmm4 // (xmm6) J
pand xmm3, xmm5 // J and 4
pand xmm4, xmm7 // J and 2
pxor xmm5, xmm5
pslld xmm3, 29 // (xmm3) SwapSignBitSin := (J and 4) shl 29
movss xmm7, DWORD [SSE_PI_OVER_4]
pcmpeqd xmm4, xmm5 // (xmm4) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
mulss xmm2, xmm7 // Y * Pi / 4
movss xmm5, DWORD [SSE_INT_TWO]
subss xmm0, xmm2 // (xmm0) X := X - (Y * Pi / 4)
psubd xmm6, xmm5 // J - 2
movss xmm7, DWORD [SSE_INT_FOUR]
pxor xmm1, xmm3 // (xmm1) SignBitSin := SignBitSin xor SwapSignBitSin
andnps xmm6, xmm7 // (not (J - 2)) and 4
movaps xmm3, xmm0
pslld xmm6, 29 // (xmm6) SignBitCos := ((not (J - 2)) and 4) shl 29
mulss xmm3, xmm3 // (xmm3) Z := X * X
movss xmm2, DWORD [SSE_COSCOF_P0]
movss xmm5, DWORD [SSE_COSCOF_P1]
movss xmm7, DWORD [SSE_COSCOF_P2]
mulss xmm2, xmm3 // COSCOF_P0 * Z
addss xmm2, xmm5 // Y := COSCOF_P0 * Z + COSCOF_P1
movss xmm5, DWORD [SSE_ONE_HALF]
mulss xmm2, xmm3 // Y * Z
addss xmm2, xmm7 // Y := (Y * Z) + COSCOF_P2
movss xmm7, DWORD [SSE_ONE]
mulss xmm2, xmm3 // Y * Z
mulss xmm5, xmm3 // 0.5 * Z
mulss xmm2, xmm3 // Y * (Z * Z)
subss xmm2, xmm5 // Y - 0.5 * Z
movss xmm5, DWORD [SSE_SINCOF_P0]
addss xmm2, xmm7 // (xmm2) Y := Y - 0.5 * Z + 1
movss xmm7, DWORD [SSE_SINCOF_P1]
mulss xmm5, xmm3 // SINCOF_P0 * Z
addss xmm5, xmm7 // Y2 := SINCOF_P0 * Z + SINCOF_P1
mulss xmm5, xmm3 // Y2 * Z
movss xmm7, DWORD [SSE_SINCOF_P2]
addss xmm5, xmm7 // Y2 := Y2 * Z + SINCOF_P2
mulss xmm5, xmm3 // Y2 * Z
mulss xmm5, xmm0 // Y2 * (Z * X)
addss xmm5, xmm0 // (xmm5) Y2 := Y2 * (Z * X) + X
movaps xmm0, xmm2 // Y
movaps xmm3, xmm5 // Y2
andps xmm5, xmm4 // ((J and 2) = 0)? Yes: Y2, No: 0
andnps xmm4, xmm2 // ((J and 2) = 0)? Yes: 0 , No: Y
subss xmm3, xmm5 // ((J and 2) = 0)? Yes: 0 , No: Y2
subss xmm0, xmm4 // ((J and 2) = 0)? Yes: Y , No: 0
addss xmm4, xmm5 // ((J and 2) = 0)? Yes: Y2, No: Y (was addps; scalar op for scalar routine)
addss xmm3, xmm0 // ((J and 2) = 0)? Yes: Y , No: Y2 (was addps; scalar op for scalar routine)
xorps xmm4, xmm1 // Sin
xorps xmm3, xmm6 // Cos
movss [ASin], xmm4
movss [ACos], xmm3
end;
783
{ Computes fast approximate Sin and Cos of both components of a
  2-component vector in one pass. Same algorithm as FastSinCos(Single);
  only the low two lanes are meaningful. }
procedure FastSinCos(const ARadians: TVector2; out ASin, ACos: TVector2); assembler;
asm
movlps xmm0, [ARadians]
movlps xmm2, QWORD [SSE_MASK_SIGN]
movlps xmm3, QWORD [SSE_MASK_ABS_VAL]
movaps xmm1, xmm0
pand xmm0, xmm3 // (xmm0) X := Abs(ARadians)
pand xmm1, xmm2 // (xmm1) SignBitSin
movaps xmm4, xmm0
movlps xmm5, QWORD [SSE_FOPI]
movlps xmm6, QWORD [SSE_INT_ONE]
mulps xmm4, xmm5
movlps xmm7, QWORD [SSE_INT_NOT_ONE]
cvtps2dq xmm4, xmm4 // (xmm4) J := Round(X * FOPI); cvtps2dq rounds per MXCSR (nearest by default), not Trunc
movlps xmm5, QWORD [SSE_INT_FOUR]
paddd xmm4, xmm6
pand xmm4, xmm7 // (xmm4) J := (J + 1) and (not 1)
movlps xmm7, QWORD [SSE_INT_TWO]
cvtdq2ps xmm2, xmm4 // (xmm2) Y := J
movaps xmm3, xmm4
movaps xmm6, xmm4 // (xmm6) J
pand xmm3, xmm5 // J and 4
pand xmm4, xmm7 // J and 2
pxor xmm5, xmm5
pslld xmm3, 29 // (xmm3) SwapSignBitSin := (J and 4) shl 29
movlps xmm7, QWORD [SSE_PI_OVER_4]
pcmpeqd xmm4, xmm5 // (xmm4) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
mulps xmm2, xmm7 // Y * Pi / 4
movlps xmm5, QWORD [SSE_INT_TWO]
subps xmm0, xmm2 // (xmm0) X := X - (Y * Pi / 4)
psubd xmm6, xmm5 // J - 2
movlps xmm7, QWORD [SSE_INT_FOUR]
pxor xmm1, xmm3 // (xmm1) SignBitSin := SignBitSin xor SwapSignBitSin
andnps xmm6, xmm7 // (not (J - 2)) and 4
movaps xmm3, xmm0
pslld xmm6, 29 // (xmm6) SignBitCos := ((not (J - 2)) and 4) shl 29
mulps xmm3, xmm3 // (xmm3) Z := X * X
movlps xmm2, QWORD [SSE_COSCOF_P0]
movlps xmm5, QWORD [SSE_COSCOF_P1]
movlps xmm7, QWORD [SSE_COSCOF_P2]
mulps xmm2, xmm3 // COSCOF_P0 * Z
addps xmm2, xmm5 // Y := COSCOF_P0 * Z + COSCOF_P1
movlps xmm5, QWORD [SSE_ONE_HALF]
mulps xmm2, xmm3 // Y * Z
addps xmm2, xmm7 // Y := (Y * Z) + COSCOF_P2
movlps xmm7, QWORD [SSE_ONE]
mulps xmm2, xmm3 // Y * Z
mulps xmm5, xmm3 // 0.5 * Z
mulps xmm2, xmm3 // Y * (Z * Z)
subps xmm2, xmm5 // Y - 0.5 * Z
movlps xmm5, QWORD [SSE_SINCOF_P0]
addps xmm2, xmm7 // (xmm2) Y := Y - 0.5 * Z + 1
movlps xmm7, QWORD [SSE_SINCOF_P1]
mulps xmm5, xmm3 // SINCOF_P0 * Z
addps xmm5, xmm7 // Y2 := SINCOF_P0 * Z + SINCOF_P1
mulps xmm5, xmm3 // Y2 * Z
movlps xmm7, QWORD [SSE_SINCOF_P2]
addps xmm5, xmm7 // Y2 := Y2 * Z + SINCOF_P2
mulps xmm5, xmm3 // Y2 * Z
mulps xmm5, xmm0 // Y2 * (Z * X)
addps xmm5, xmm0 // (xmm5) Y2 := Y2 * (Z * X) + X
movaps xmm0, xmm2 // Y
movaps xmm3, xmm5 // Y2
andps xmm5, xmm4 // ((J and 2) = 0)? Yes: Y2, No: 0
andnps xmm4, xmm2 // ((J and 2) = 0)? Yes: 0 , No: Y
subps xmm3, xmm5 // ((J and 2) = 0)? Yes: 0 , No: Y2
subps xmm0, xmm4 // ((J and 2) = 0)? Yes: Y , No: 0
addps xmm4, xmm5 // ((J and 2) = 0)? Yes: Y2, No: Y
addps xmm3, xmm0 // ((J and 2) = 0)? Yes: Y , No: Y2
xorps xmm4, xmm1 // Sin
xorps xmm3, xmm6 // Cos
movlps [ASin], xmm4
movlps [ACos], xmm3
end;
858
{ Computes Sin and Cos of all 3 components of ARadians simultaneously.
  Appears to follow the classic Cephes / sse_mathfun sincos_ps scheme:
  each angle is reduced to an octant index J (units of Pi/4), a cosine and a
  sine minimax polynomial are both evaluated in Z = X*X, PolyMask selects the
  correct polynomial per lane, and the final sign bits are rebuilt from J.
  NOTE(review): SSE_FOPI, SSE_INT_*, SSE_COSCOF_* and SSE_SINCOF_* are
  declared elsewhere in this unit (outside this excerpt) — verify there. }
procedure FastSinCos(const ARadians: TVector3; out ASin, ACos: TVector3); assembler;
asm
// Pack X,Y (low quad) and Z (high quad) of the 3-vector into xmm0
movq xmm0, [ARadians]
movss xmm1, [ARadians+8]
movlhps xmm0, xmm1
movups xmm2, [SSE_MASK_SIGN]
movups xmm3, [SSE_MASK_ABS_VAL]
movaps xmm1, xmm0
pand xmm0, xmm3 // (xmm0) X := Abs(ARadians)
pand xmm1, xmm2 // (xmm1) SignBitSin
movaps xmm4, xmm0
movups xmm5, [SSE_FOPI]
movups xmm6, [SSE_INT_ONE]
mulps xmm4, xmm5
movups xmm7, [SSE_INT_NOT_ONE]
cvtps2dq xmm4, xmm4 // (xmm4) J := Trunc(X * FOPI)
movups xmm5, [SSE_INT_FOUR]
paddd xmm4, xmm6
pand xmm4, xmm7 // (xmm4) J := (J + 1) and (not 1)
movups xmm7, [SSE_INT_TWO]
cvtdq2ps xmm2, xmm4 // (xmm2) Y := J
movaps xmm3, xmm4
movaps xmm6, xmm4 // (xmm6) J
pand xmm3, xmm5 // J and 4
pand xmm4, xmm7 // J and 2
pxor xmm5, xmm5
pslld xmm3, 29 // (xmm3) SwapSignBitSin := (J and 4) shl 29
movups xmm7, [SSE_PI_OVER_4]
pcmpeqd xmm4, xmm5 // (xmm4) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
mulps xmm2, xmm7 // Y * Pi / 4
movups xmm5, [SSE_INT_TWO]
subps xmm0, xmm2 // (xmm0) X := X - (Y * Pi / 4)
psubd xmm6, xmm5 // J - 2
movups xmm7, [SSE_INT_FOUR]
pxor xmm1, xmm3 // (xmm1) SignBitSin := SignBitSin xor SwapSignBitSin
andnps xmm6, xmm7 // (not (J - 2)) and 4
movaps xmm3, xmm0
pslld xmm6, 29 // (xmm6) SignBitCos := ((not (J - 2)) and 4) shl 29
mulps xmm3, xmm3 // (xmm3) Z := X * X
movups xmm2, [SSE_COSCOF_P0]
movups xmm5, [SSE_COSCOF_P1]
movups xmm7, [SSE_COSCOF_P2]
mulps xmm2, xmm3 // COSCOF_P0 * Z
addps xmm2, xmm5 // Y := COSCOF_P0 * Z + COSCOF_P1
movups xmm5, [SSE_ONE_HALF]
mulps xmm2, xmm3 // Y * Z
addps xmm2, xmm7 // Y := (Y * Z) + COSCOF_P2
movups xmm7, [SSE_ONE]
mulps xmm2, xmm3 // Y * Z
mulps xmm5, xmm3 // 0.5 * Z
mulps xmm2, xmm3 // Y * (Z * Z)
subps xmm2, xmm5 // Y - 0.5 * Z
movups xmm5, [SSE_SINCOF_P0]
addps xmm2, xmm7 // (xmm2) Y := Y - 0.5 * Z + 1
movups xmm7, [SSE_SINCOF_P1]
mulps xmm5, xmm3 // SINCOF_P0 * Z
addps xmm5, xmm7 // Y2 := SINCOF_P0 * Z + SINCOF_P1
mulps xmm5, xmm3 // Y2 * Z
movups xmm7, [SSE_SINCOF_P2]
addps xmm5, xmm7 // Y2 := Y2 * Z + SINCOF_P2
mulps xmm5, xmm3 // Y2 * Z
mulps xmm5, xmm0 // Y2 * (Z * X)
addps xmm5, xmm0 // (xmm5) Y2 := Y2 * (Z * X) + X
movaps xmm0, xmm2 // Y
movaps xmm3, xmm5 // Y2
// Branchless per-lane select between the two polynomials using PolyMask
andps xmm5, xmm4 // ((J and 2) = 0)? Yes: Y2, No: 0
andnps xmm4, xmm2 // ((J and 2) = 0)? Yes: 0 , No: Y
subps xmm3, xmm5 // ((J and 2) = 0)? Yes: 0 , No: Y2
subps xmm0, xmm4 // ((J and 2) = 0)? Yes: Y , No: 0
addps xmm4, xmm5 // ((J and 2) = 0)? Yes: Y2, No: Y
addps xmm3, xmm0 // ((J and 2) = 0)? Yes: Y , No: Y2
xorps xmm4, xmm1 // Sin
xorps xmm3, xmm6 // Cos
// Unpack the 4-lane results back into two 3-component out-parameters
movhlps xmm5, xmm4
movhlps xmm2, xmm3
movq [ASin], xmm4
movss [ASin+8], xmm5
movq [ACos], xmm3
movss [ACos+8], xmm2
end;
939
{ Computes Sin and Cos of all 4 components of ARadians simultaneously.
  Same Cephes / sse_mathfun-style algorithm as the TVector3 overload, but
  operates on a full 4-lane register with a single unaligned load/store.
  NOTE(review): SSE_FOPI, SSE_INT_*, SSE_COSCOF_* and SSE_SINCOF_* are
  declared elsewhere in this unit (outside this excerpt) — verify there. }
procedure FastSinCos(const ARadians: TVector4; out ASin, ACos: TVector4); assembler;
asm
movups xmm0, [ARadians]
movups xmm2, [SSE_MASK_SIGN]
movups xmm3, [SSE_MASK_ABS_VAL]
movaps xmm1, xmm0
pand xmm0, xmm3 // (xmm0) X := Abs(ARadians)
pand xmm1, xmm2 // (xmm1) SignBitSin
movaps xmm4, xmm0
movups xmm5, [SSE_FOPI]
movups xmm6, [SSE_INT_ONE]
mulps xmm4, xmm5
movups xmm7, [SSE_INT_NOT_ONE]
cvtps2dq xmm4, xmm4 // (xmm4) J := Trunc(X * FOPI)
movups xmm5, [SSE_INT_FOUR]
paddd xmm4, xmm6
pand xmm4, xmm7 // (xmm4) J := (J + 1) and (not 1)
movups xmm7, [SSE_INT_TWO]
cvtdq2ps xmm2, xmm4 // (xmm2) Y := J
movaps xmm3, xmm4
movaps xmm6, xmm4 // (xmm6) J
pand xmm3, xmm5 // J and 4
pand xmm4, xmm7 // J and 2
pxor xmm5, xmm5
pslld xmm3, 29 // (xmm3) SwapSignBitSin := (J and 4) shl 29
movups xmm7, [SSE_PI_OVER_4]
pcmpeqd xmm4, xmm5 // (xmm4) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
mulps xmm2, xmm7 // Y * Pi / 4
movups xmm5, [SSE_INT_TWO]
subps xmm0, xmm2 // (xmm0) X := X - (Y * Pi / 4)
psubd xmm6, xmm5 // J - 2
movups xmm7, [SSE_INT_FOUR]
pxor xmm1, xmm3 // (xmm1) SignBitSin := SignBitSin xor SwapSignBitSin
andnps xmm6, xmm7 // (not (J - 2)) and 4
movaps xmm3, xmm0
pslld xmm6, 29 // (xmm6) SignBitCos := ((not (J - 2)) and 4) shl 29
mulps xmm3, xmm3 // (xmm3) Z := X * X
movups xmm2, [SSE_COSCOF_P0]
movups xmm5, [SSE_COSCOF_P1]
movups xmm7, [SSE_COSCOF_P2]
mulps xmm2, xmm3 // COSCOF_P0 * Z
addps xmm2, xmm5 // Y := COSCOF_P0 * Z + COSCOF_P1
movups xmm5, [SSE_ONE_HALF]
mulps xmm2, xmm3 // Y * Z
addps xmm2, xmm7 // Y := (Y * Z) + COSCOF_P2
movups xmm7, [SSE_ONE]
mulps xmm2, xmm3 // Y * Z
mulps xmm5, xmm3 // 0.5 * Z
mulps xmm2, xmm3 // Y * (Z * Z)
subps xmm2, xmm5 // Y - 0.5 * Z
movups xmm5, [SSE_SINCOF_P0]
addps xmm2, xmm7 // (xmm2) Y := Y - 0.5 * Z + 1
movups xmm7, [SSE_SINCOF_P1]
mulps xmm5, xmm3 // SINCOF_P0 * Z
addps xmm5, xmm7 // Y2 := SINCOF_P0 * Z + SINCOF_P1
mulps xmm5, xmm3 // Y2 * Z
movups xmm7, [SSE_SINCOF_P2]
addps xmm5, xmm7 // Y2 := Y2 * Z + SINCOF_P2
mulps xmm5, xmm3 // Y2 * Z
mulps xmm5, xmm0 // Y2 * (Z * X)
addps xmm5, xmm0 // (xmm5) Y2 := Y2 * (Z * X) + X
movaps xmm0, xmm2 // Y
movaps xmm3, xmm5 // Y2
// Branchless per-lane select between the two polynomials using PolyMask
andps xmm5, xmm4 // ((J and 2) = 0)? Yes: Y2, No: 0
andnps xmm4, xmm2 // ((J and 2) = 0)? Yes: 0 , No: Y
subps xmm3, xmm5 // ((J and 2) = 0)? Yes: 0 , No: Y2
subps xmm0, xmm4 // ((J and 2) = 0)? Yes: Y , No: 0
addps xmm4, xmm5 // ((J and 2) = 0)? Yes: Y2, No: Y
addps xmm3, xmm0 // ((J and 2) = 0)? Yes: Y , No: Y2
xorps xmm4, xmm1 // Sin
xorps xmm3, xmm6 // Cos
movups [ASin], xmm4
movups [ACos], xmm3
end;
1014
{ Fast approximate Exp(A) for a scalar Single.
  Appears to use a Schraudolph-style trick: Val := 12102203.16 * A +
  1065353216 builds the exponent bits of 2^(A/ln 2) directly in integer
  form, then the mantissa is refined with a degree-4 polynomial in B.
  Negative IVal is clamped to 0, so large negative A yields 0 rather than
  wrapping. NOTE(review): SSE_EXP_A*, SSE_EXP_CST, SSE_EXP_I1 and
  SSE_EXP_F* are declared elsewhere in this unit — verify there. }
function FastExp(const A: Single): Single; assembler;
asm
movss xmm0, [A]
movss xmm1, DWORD [SSE_EXP_A1]
movss xmm2, DWORD [SSE_EXP_A2]

// Val := 12102203.1615614 * A + 1065353216.0
mulss xmm0, xmm1
movss xmm3, DWORD [SSE_EXP_CST]
addss xmm0, xmm2

// if (Val >= EXP_CST) then Val := EXP_CST
movss xmm1, xmm0
cmpltss xmm0, xmm3 // (Val < EXP_CST)? Yes: $FFFFFFFF, No: $00000000
andps xmm1, xmm0 // (Val < EXP_CST)? Yes: Val, No: 0
andnps xmm0, xmm3 // (Val < EXP_CST)? Yes: 0, No: EXP_CST
orps xmm0, xmm1 // (Val < EXP_CST)? Yes: Val, No: EXP_CST

// IVal := Trunc(Val)
xorps xmm3, xmm3
cvtps2dq xmm1, xmm0

// if (IVal < 0) then I := 0
movss xmm2, DWORD [SSE_MASK_EXPONENT]
movdqa xmm0, xmm1 // IVal
pcmpgtd xmm1, xmm3 // (IVal > 0)? Yes: $FFFFFFFF, No: $00000000
movss xmm3, DWORD [SSE_MASK_FRACTION]
pand xmm0, xmm1 // (IVal > 0)? Yes: IVal, No: 0

// XU.I := IVal and $7F800000
movss xmm4, DWORD [SSE_EXP_I1]
movss xmm1, xmm0
pand xmm0, xmm2 // XU.I / XU.S

// XU2.I := (IVal and $007FFFFF) or $3F800000;
pand xmm1, xmm3
movss xmm6, DWORD [SSE_EXP_F5]
por xmm1, xmm4 // XU2.I / XU2.S

// Result := XU.S *
// ( 0.509964287281036376953125 + B *
// ( 0.3120158612728118896484375 + B *
// ( 0.1666135489940643310546875 + B *
// (-2.12528370320796966552734375e-3 + B *
// 1.3534179888665676116943359375e-2))));
movss xmm5, DWORD [SSE_EXP_F4]
movss xmm7, xmm1

mulss xmm1, xmm6
movss xmm4, DWORD [SSE_EXP_F3]
addss xmm1, xmm5
movss xmm3, DWORD [SSE_EXP_F2]
mulss xmm1, xmm7
movss xmm2, DWORD [SSE_EXP_F1]
addss xmm1, xmm4
mulss xmm1, xmm7
addss xmm1, xmm3
mulss xmm1, xmm7
addss xmm1, xmm2
mulss xmm1, xmm0

movss [Result], xmm1
end;
1078
{ Fast approximate component-wise Exp for a 2D vector.
  Same Schraudolph-style algorithm as the scalar FastExp overload, applied
  to both lanes at once (only the low 64 bits of the XMM registers carry
  meaningful data). NOTE(review): SSE_EXP_* constants are declared elsewhere
  in this unit — verify there.
  Fix: added the 'assembler;' directive for consistency with every other
  asm routine in this unit (the body was already pure asm). }
function FastExp(const A: TVector2): TVector2; assembler;
asm
movlps xmm0, [A]
movlps xmm1, QWORD [SSE_EXP_A1]
movlps xmm2, QWORD [SSE_EXP_A2]

// Val := 12102203.1615614 * A + 1065353216.0
mulps xmm0, xmm1
movlps xmm3, QWORD [SSE_EXP_CST]
addps xmm0, xmm2

// if (Val >= EXP_CST) then Val := EXP_CST
movaps xmm1, xmm0
cmpltps xmm0, xmm3 // (Val < EXP_CST)? Yes: $FFFFFFFF, No: $00000000
andps xmm1, xmm0 // (Val < EXP_CST)? Yes: Val, No: 0
andnps xmm0, xmm3 // (Val < EXP_CST)? Yes: 0, No: EXP_CST
orps xmm0, xmm1 // (Val < EXP_CST)? Yes: Val, No: EXP_CST

// IVal := Trunc(Val)
xorps xmm3, xmm3
cvtps2dq xmm1, xmm0

// if (IVal < 0) then I := 0
movlps xmm2, QWORD [SSE_MASK_EXPONENT]
movdqa xmm0, xmm1 // IVal
pcmpgtd xmm1, xmm3 // (IVal > 0)? Yes: $FFFFFFFF, No: $00000000
movlps xmm3, QWORD [SSE_MASK_FRACTION]
pand xmm0, xmm1 // (IVal > 0)? Yes: IVal, No: 0

// XU.I := IVal and $7F800000
movlps xmm4, QWORD [SSE_EXP_I1]
movdqa xmm1, xmm0
pand xmm0, xmm2 // XU.I / XU.S

// XU2.I := (IVal and $007FFFFF) or $3F800000;
pand xmm1, xmm3
movlps xmm6, QWORD [SSE_EXP_F5]
por xmm1, xmm4 // XU2.I / XU2.S

// Result := XU.S *
// ( 0.509964287281036376953125 + B *
// ( 0.3120158612728118896484375 + B *
// ( 0.1666135489940643310546875 + B *
// (-2.12528370320796966552734375e-3 + B *
// 1.3534179888665676116943359375e-2))));
movlps xmm5, QWORD [SSE_EXP_F4]
movaps xmm7, xmm1

mulps xmm1, xmm6
movlps xmm4, QWORD [SSE_EXP_F3]
addps xmm1, xmm5
movlps xmm3, QWORD [SSE_EXP_F2]
mulps xmm1, xmm7
movlps xmm2, QWORD [SSE_EXP_F1]
addps xmm1, xmm4
mulps xmm1, xmm7
addps xmm1, xmm3
mulps xmm1, xmm7
addps xmm1, xmm2
mulps xmm1, xmm0

movlps [Result], xmm1
end;
1142
{ Fast approximate component-wise Exp for a 3D vector.
  Same Schraudolph-style algorithm as the scalar FastExp overload; the
  3 components are packed into one XMM register (X,Y in the low quad,
  Z in the high quad) and unpacked again on store. NOTE(review):
  SSE_EXP_* constants are declared elsewhere in this unit — verify there.
  Fix: added the 'assembler;' directive for consistency with every other
  asm routine in this unit (the body was already pure asm). }
function FastExp(const A: TVector3): TVector3; assembler;
asm
movq xmm0, [A]
movss xmm1, [A+8]
movlhps xmm0, xmm1
movups xmm1, [SSE_EXP_A1]
movups xmm2, [SSE_EXP_A2]

// Val := 12102203.1615614 * A + 1065353216.0
mulps xmm0, xmm1
movups xmm3, [SSE_EXP_CST]
addps xmm0, xmm2

// if (Val >= EXP_CST) then Val := EXP_CST
movaps xmm1, xmm0
cmpltps xmm0, xmm3 // (Val < EXP_CST)? Yes: $FFFFFFFF, No: $00000000
andps xmm1, xmm0 // (Val < EXP_CST)? Yes: Val, No: 0
andnps xmm0, xmm3 // (Val < EXP_CST)? Yes: 0, No: EXP_CST
orps xmm0, xmm1 // (Val < EXP_CST)? Yes: Val, No: EXP_CST

// IVal := Trunc(Val)
xorps xmm3, xmm3
cvtps2dq xmm1, xmm0

// if (IVal < 0) then I := 0
movups xmm2, [SSE_MASK_EXPONENT]
movdqa xmm0, xmm1 // IVal
pcmpgtd xmm1, xmm3 // (IVal > 0)? Yes: $FFFFFFFF, No: $00000000
movups xmm3, [SSE_MASK_FRACTION]
pand xmm0, xmm1 // (IVal > 0)? Yes: IVal, No: 0

// XU.I := IVal and $7F800000
movups xmm4, [SSE_EXP_I1]
movdqa xmm1, xmm0
pand xmm0, xmm2 // XU.I / XU.S

// XU2.I := (IVal and $007FFFFF) or $3F800000;
pand xmm1, xmm3
movups xmm6, [SSE_EXP_F5]
por xmm1, xmm4 // XU2.I / XU2.S

// Result := XU.S *
// ( 0.509964287281036376953125 + B *
// ( 0.3120158612728118896484375 + B *
// ( 0.1666135489940643310546875 + B *
// (-2.12528370320796966552734375e-3 + B *
// 1.3534179888665676116943359375e-2))));
movups xmm5, [SSE_EXP_F4]
movaps xmm7, xmm1

mulps xmm1, xmm6
movups xmm4, [SSE_EXP_F3]
addps xmm1, xmm5
movups xmm3, [SSE_EXP_F2]
mulps xmm1, xmm7
movups xmm2, [SSE_EXP_F1]
addps xmm1, xmm4
mulps xmm1, xmm7
addps xmm1, xmm3
mulps xmm1, xmm7
addps xmm1, xmm2
mulps xmm1, xmm0

movhlps xmm0, xmm1
movq [Result], xmm1
movss [Result+8], xmm0
end;
1210
{ Fast approximate component-wise Exp for a 4D vector.
  Same Schraudolph-style algorithm as the scalar FastExp overload, applied
  to all four lanes with unaligned 128-bit loads/stores. NOTE(review):
  SSE_EXP_* constants are declared elsewhere in this unit — verify there.
  Fix: added the 'assembler;' directive for consistency with every other
  asm routine in this unit (the body was already pure asm). }
function FastExp(const A: TVector4): TVector4; assembler;
asm
movups xmm0, [A]
movups xmm1, [SSE_EXP_A1]
movups xmm2, [SSE_EXP_A2]

// Val := 12102203.1615614 * A + 1065353216.0
mulps xmm0, xmm1
movups xmm3, [SSE_EXP_CST]
addps xmm0, xmm2

// if (Val >= EXP_CST) then Val := EXP_CST
movaps xmm1, xmm0
cmpltps xmm0, xmm3 // (Val < EXP_CST)? Yes: $FFFFFFFF, No: $00000000
andps xmm1, xmm0 // (Val < EXP_CST)? Yes: Val, No: 0
andnps xmm0, xmm3 // (Val < EXP_CST)? Yes: 0, No: EXP_CST
orps xmm0, xmm1 // (Val < EXP_CST)? Yes: Val, No: EXP_CST

// IVal := Trunc(Val)
xorps xmm3, xmm3
cvtps2dq xmm1, xmm0

// if (IVal < 0) then I := 0
movups xmm2, [SSE_MASK_EXPONENT]
movdqa xmm0, xmm1 // IVal
pcmpgtd xmm1, xmm3 // (IVal > 0)? Yes: $FFFFFFFF, No: $00000000
movups xmm3, [SSE_MASK_FRACTION]
pand xmm0, xmm1 // (IVal > 0)? Yes: IVal, No: 0

// XU.I := IVal and $7F800000
movups xmm4, [SSE_EXP_I1]
movdqa xmm1, xmm0
pand xmm0, xmm2 // XU.I / XU.S

// XU2.I := (IVal and $007FFFFF) or $3F800000;
pand xmm1, xmm3
movups xmm6, [SSE_EXP_F5]
por xmm1, xmm4 // XU2.I / XU2.S

// Result := XU.S *
// ( 0.509964287281036376953125 + B *
// ( 0.3120158612728118896484375 + B *
// ( 0.1666135489940643310546875 + B *
// (-2.12528370320796966552734375e-3 + B *
// 1.3534179888665676116943359375e-2))));
movups xmm5, [SSE_EXP_F4]
movaps xmm7, xmm1

mulps xmm1, xmm6
movups xmm4, [SSE_EXP_F3]
addps xmm1, xmm5
movups xmm3, [SSE_EXP_F2]
mulps xmm1, xmm7
movups xmm2, [SSE_EXP_F1]
addps xmm1, xmm4
mulps xmm1, xmm7
addps xmm1, xmm3
mulps xmm1, xmm7
addps xmm1, xmm2
mulps xmm1, xmm0

movups [Result], xmm1
end;
1274
{ Fast approximate natural logarithm of a scalar Single.
  Splits A into exponent (I shr 23) and mantissa X normalized via
  (I and $007FFFFF) or $3F800000, then combines Exp * ln(2) with a small
  polynomial in X. For A <= 0 the additive constant becomes NegInfinity,
  so the result is -Inf instead of a garbage value. NOTE(review):
  SSE_LN_CST and SSE_LN_F1..F5 are declared elsewhere in this unit. }
function FastLn(const A: Single): Single; assembler;
asm
movss xmm0, [A]
xorps xmm2, xmm2
movss xmm1, xmm0
movss xmm3, DWORD [SSE_LN_CST]
movss xmm4, DWORD [SSE_NEG_INFINITY]

// Exp := Val.I shr 23
psrld xmm0, 23
movss xmm5, xmm1
cvtdq2ps xmm0, xmm0 // xmm0=Exp

// if (A > 0) then AddCst := -89.93423858 else AddCst := NegInfinity
cmpnless xmm1, xmm2 // (A > 0)? Yes: $FFFFFFFF, No: $00000000
movss xmm2, DWORD [SSE_MASK_FRACTION]
andps xmm3, xmm1 // (A > 0)? Yes: -89.93423858, No: 0
andnps xmm1, xmm4 // (A > 0)? Yes: 0, No: NegInfinity
movss xmm4, DWORD [SSE_EXP_I1]
orps xmm1, xmm3 // (A > 0)? Yes: -89.93423858, No: NegInfinity

// Val.I := (Val.I and $007FFFFF) or $3F800000
pand xmm5, xmm2
movss xmm2, DWORD [SSE_LN_F5]
por xmm5, xmm4
movss xmm6, DWORD [SSE_LN_F3]
movss xmm3, xmm5 // xmm3=X
mulss xmm5, xmm5 // xmm5=X2

movss xmm4, xmm3
movss xmm7, DWORD [SSE_LN_F4]
mulss xmm4, xmm6
mulss xmm0, xmm2 // xmm0 = Exp * 0.69314718055995
subss xmm4, xmm7
movss xmm7, DWORD [SSE_LN_F2]
movss xmm6, xmm3
mulss xmm4, xmm5 // xmm4 = X2 * (0.024982445 * X - 0.24371102)
subss xmm6, xmm7
movss xmm2, DWORD [SSE_LN_F1]
addss xmm4, xmm6 // xmm4 = (X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102)
mulss xmm3, xmm2
mulss xmm4, xmm5 // xmm4 = X2 * ((X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102))
addss xmm3, xmm1 // xmm3 = (3.3977745 * X + AddCst)
addss xmm4, xmm0
addss xmm3, xmm4

movss [Result], xmm3
end;
1323
{ Fast approximate component-wise natural logarithm for a 2D vector.
  Same exponent/mantissa-split algorithm as the scalar FastLn; only the low
  64 bits of the registers carry meaningful lanes. Components <= 0 yield
  NegInfinity via the AddCst selection. NOTE(review): SSE_LN_CST and
  SSE_LN_F1..F5 are declared elsewhere in this unit. }
function FastLn(const A: TVector2): TVector2; assembler;
asm
movlps xmm0, [A]
xorps xmm2, xmm2
movaps xmm1, xmm0
movlps xmm3, QWORD [SSE_LN_CST]
movlps xmm4, QWORD [SSE_NEG_INFINITY]

// Exp := Val.I shr 23
psrld xmm0, 23
movaps xmm5, xmm1
cvtdq2ps xmm0, xmm0 // xmm0=Exp

// if (A > 0) then AddCst := -89.93423858 else AddCst := NegInfinity
cmpnleps xmm1, xmm2 // (A > 0)? Yes: $FFFFFFFF, No: $00000000
movlps xmm2, QWORD [SSE_MASK_FRACTION]
andps xmm3, xmm1 // (A > 0)? Yes: -89.93423858, No: 0
andnps xmm1, xmm4 // (A > 0)? Yes: 0, No: NegInfinity
movlps xmm4, QWORD [SSE_EXP_I1]
orps xmm1, xmm3 // (A > 0)? Yes: -89.93423858, No: NegInfinity

// Val.I := (Val.I and $007FFFFF) or $3F800000
pand xmm5, xmm2
movlps xmm2, QWORD [SSE_LN_F5]
por xmm5, xmm4
movlps xmm6, QWORD [SSE_LN_F3]
movaps xmm3, xmm5 // xmm3=X
mulps xmm5, xmm5 // xmm5=X2

movaps xmm4, xmm3
movlps xmm7, QWORD [SSE_LN_F4]
mulps xmm4, xmm6
mulps xmm0, xmm2 // xmm0 = Exp * 0.69314718055995
subps xmm4, xmm7
movlps xmm7, QWORD [SSE_LN_F2]
movaps xmm6, xmm3
mulps xmm4, xmm5 // xmm4 = X2 * (0.024982445 * X - 0.24371102)
subps xmm6, xmm7
movlps xmm2, QWORD [SSE_LN_F1]
addps xmm4, xmm6 // xmm4 = (X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102)
mulps xmm3, xmm2
mulps xmm4, xmm5 // xmm4 = X2 * ((X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102))
addps xmm3, xmm1 // xmm3 = (3.3977745 * X + AddCst)
addps xmm4, xmm0
addps xmm3, xmm4

movlps [Result], xmm3
end;
1372
{ Fast approximate component-wise natural logarithm for a 3D vector.
  Same exponent/mantissa-split algorithm as the scalar FastLn; the three
  components are packed into one XMM register and unpacked on store.
  Components <= 0 yield NegInfinity via the AddCst selection.
  NOTE(review): SSE_LN_CST and SSE_LN_F1..F5 are declared elsewhere. }
function FastLn(const A: TVector3): TVector3; assembler;
asm
// Pack X,Y (low quad) and Z (high quad) into xmm0
movq xmm0, [A]
movss xmm1, [A+8]
movlhps xmm0, xmm1
xorps xmm2, xmm2
movaps xmm1, xmm0
movups xmm3, [SSE_LN_CST]
movups xmm4, [SSE_NEG_INFINITY]

// Exp := Val.I shr 23
psrld xmm0, 23
movaps xmm5, xmm1
cvtdq2ps xmm0, xmm0 // xmm0=Exp

// if (A > 0) then AddCst := -89.93423858 else AddCst := NegInfinity
cmpnleps xmm1, xmm2 // (A > 0)? Yes: $FFFFFFFF, No: $00000000
movups xmm2, [SSE_MASK_FRACTION]
andps xmm3, xmm1 // (A > 0)? Yes: -89.93423858, No: 0
andnps xmm1, xmm4 // (A > 0)? Yes: 0, No: NegInfinity
movups xmm4, [SSE_EXP_I1]
orps xmm1, xmm3 // (A > 0)? Yes: -89.93423858, No: NegInfinity

// Val.I := (Val.I and $007FFFFF) or $3F800000
pand xmm5, xmm2
movups xmm2, [SSE_LN_F5]
por xmm5, xmm4
movups xmm6, [SSE_LN_F3]
movaps xmm3, xmm5 // xmm3=X
mulps xmm5, xmm5 // xmm5=X2

movaps xmm4, xmm3
movups xmm7, [SSE_LN_F4]
mulps xmm4, xmm6
mulps xmm0, xmm2 // xmm0 = Exp * 0.69314718055995
subps xmm4, xmm7
movups xmm7, [SSE_LN_F2]
movaps xmm6, xmm3
mulps xmm4, xmm5 // xmm4 = X2 * (0.024982445 * X - 0.24371102)
subps xmm6, xmm7
movups xmm2, [SSE_LN_F1]
addps xmm4, xmm6 // xmm4 = (X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102)
mulps xmm3, xmm2
mulps xmm4, xmm5 // xmm4 = X2 * ((X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102))
addps xmm3, xmm1 // xmm3 = (3.3977745 * X + AddCst)
addps xmm4, xmm0
addps xmm3, xmm4

// Unpack the 4-lane result back into 3 components
movhlps xmm4, xmm3
movq [Result], xmm3
movss [Result+8], xmm4
end;
1425
{ Fast approximate component-wise natural logarithm for a 4D vector.
  Same exponent/mantissa-split algorithm as the scalar FastLn, applied to
  all four lanes with unaligned 128-bit loads/stores. Components <= 0 yield
  NegInfinity via the AddCst selection. NOTE(review): SSE_LN_CST and
  SSE_LN_F1..F5 are declared elsewhere in this unit. }
function FastLn(const A: TVector4): TVector4; assembler;
asm
movups xmm0, [A]
xorps xmm2, xmm2
movaps xmm1, xmm0
movups xmm3, [SSE_LN_CST]
movups xmm4, [SSE_NEG_INFINITY]

// Exp := Val.I shr 23
psrld xmm0, 23
movaps xmm5, xmm1
cvtdq2ps xmm0, xmm0 // xmm0=Exp

// if (A > 0) then AddCst := -89.93423858 else AddCst := NegInfinity
cmpnleps xmm1, xmm2 // (A > 0)? Yes: $FFFFFFFF, No: $00000000
movups xmm2, [SSE_MASK_FRACTION]
andps xmm3, xmm1 // (A > 0)? Yes: -89.93423858, No: 0
andnps xmm1, xmm4 // (A > 0)? Yes: 0, No: NegInfinity
movups xmm4, [SSE_EXP_I1]
orps xmm1, xmm3 // (A > 0)? Yes: -89.93423858, No: NegInfinity

// Val.I := (Val.I and $007FFFFF) or $3F800000
pand xmm5, xmm2
movups xmm2, [SSE_LN_F5]
por xmm5, xmm4
movups xmm6, [SSE_LN_F3]
movaps xmm3, xmm5 // xmm3=X
mulps xmm5, xmm5 // xmm5=X2

movaps xmm4, xmm3
movups xmm7, [SSE_LN_F4]
mulps xmm4, xmm6
mulps xmm0, xmm2 // xmm0 = Exp * 0.69314718055995
subps xmm4, xmm7
movups xmm7, [SSE_LN_F2]
movaps xmm6, xmm3
mulps xmm4, xmm5 // xmm4 = X2 * (0.024982445 * X - 0.24371102)
subps xmm6, xmm7
movups xmm2, [SSE_LN_F1]
addps xmm4, xmm6 // xmm4 = (X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102)
mulps xmm3, xmm2
mulps xmm4, xmm5 // xmm4 = X2 * ((X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102))
addps xmm3, xmm1 // xmm3 = (3.3977745 * X + AddCst)
addps xmm4, xmm0
addps xmm3, xmm4

movups [Result], xmm3
end;
1474
{ Fast approximate base-2 logarithm of a scalar Single.
  Mineiro-style fastlog2: treats the float's bit pattern as an integer
  (VX.I * 2^-23 approximates the exponent+mantissa) and corrects with a
  rational term in the mantissa MX.S normalized to [0.5, 1) via
  (VX.I and $007FFFFF) or $3F000000. NOTE(review): SSE_LOG2_I1 and
  SSE_LOG2_F1..F5 are declared elsewhere in this unit — verify there. }
function FastLog2(const A: Single): Single; assembler;
asm
movss xmm0, [A]
movss xmm2, DWORD [SSE_MASK_FRACTION]
movss xmm1, xmm0

// MX.I := (VX.I and $007FFFFF) or $3F000000
movss xmm3, DWORD [SSE_LOG2_I1]
pand xmm0, xmm2
cvtdq2ps xmm1, xmm1 // reinterpret-as-int trick: convert VX.I (bit pattern) to float
movss xmm4, DWORD [SSE_LOG2_F1]
por xmm0, xmm3

movss xmm2, DWORD [SSE_LOG2_F2]
mulss xmm1, xmm4 // VX.I * 1.1920928955078125e-7
movss xmm3, DWORD [SSE_LOG2_F3]
subss xmm1, xmm2 // Result - 124.22551499
mulss xmm3, xmm0
movss xmm4, DWORD [SSE_LOG2_F5]
subss xmm1, xmm3 // Result - 124.22551499 - 1.498030302 * MX.S
movss xmm2, DWORD [SSE_LOG2_F4]
addss xmm0, xmm4
divss xmm2, xmm0
subss xmm1, xmm2 // Result - 124.22551499 - 1.498030302 * MX.S - 1.72587999 / (0.3520887068 + MX.S)

movss [Result], xmm1
end;
1502
{ Fast approximate component-wise base-2 logarithm for a 2D vector.
  Same Mineiro-style fastlog2 algorithm as the scalar overload; only the
  low 64 bits of the registers carry meaningful lanes. NOTE(review):
  SSE_LOG2_I1 and SSE_LOG2_F1..F5 are declared elsewhere in this unit. }
function FastLog2(const A: TVector2): TVector2; assembler;
asm
movlps xmm0, [A]
movlps xmm2, QWORD [SSE_MASK_FRACTION]
movaps xmm1, xmm0

// MX.I := (VX.I and $007FFFFF) or $3F000000
movlps xmm3, QWORD [SSE_LOG2_I1]
pand xmm0, xmm2
cvtdq2ps xmm1, xmm1 // reinterpret-as-int trick: convert VX.I (bit pattern) to float
movlps xmm4, QWORD [SSE_LOG2_F1]
por xmm0, xmm3

movlps xmm2, QWORD [SSE_LOG2_F2]
mulps xmm1, xmm4 // VX.I * 1.1920928955078125e-7
movlps xmm3, QWORD [SSE_LOG2_F3]
subps xmm1, xmm2 // Result - 124.22551499
mulps xmm3, xmm0
movlps xmm4, QWORD [SSE_LOG2_F5]
subps xmm1, xmm3 // Result - 124.22551499 - 1.498030302 * MX.S
movlps xmm2, QWORD [SSE_LOG2_F4]
addps xmm0, xmm4
divps xmm2, xmm0
subps xmm1, xmm2 // Result - 124.22551499 - 1.498030302 * MX.S - 1.72587999 / (0.3520887068 + MX.S)

movlps [Result], xmm1
end;
1530
{ Fast approximate component-wise base-2 logarithm for a 3D vector.
  Same Mineiro-style fastlog2 algorithm as the scalar overload; the three
  components are packed into one XMM register and unpacked on store.
  NOTE(review): SSE_LOG2_I1 and SSE_LOG2_F1..F5 are declared elsewhere. }
function FastLog2(const A: TVector3): TVector3; assembler;
asm
// Pack X,Y (low quad) and Z (high quad) into xmm0
movq xmm0, [A]
movss xmm1, [A+8]
movlhps xmm0, xmm1
movups xmm2, [SSE_MASK_FRACTION]
movaps xmm1, xmm0

// MX.I := (VX.I and $007FFFFF) or $3F000000
movups xmm3, [SSE_LOG2_I1]
pand xmm0, xmm2
cvtdq2ps xmm1, xmm1 // reinterpret-as-int trick: convert VX.I (bit pattern) to float
movups xmm4, [SSE_LOG2_F1]
por xmm0, xmm3

movups xmm2, [SSE_LOG2_F2]
mulps xmm1, xmm4 // VX.I * 1.1920928955078125e-7
movups xmm3, [SSE_LOG2_F3]
subps xmm1, xmm2 // Result - 124.22551499
mulps xmm3, xmm0
movups xmm4, [SSE_LOG2_F5]
subps xmm1, xmm3 // Result - 124.22551499 - 1.498030302 * MX.S
movups xmm2, [SSE_LOG2_F4]
addps xmm0, xmm4
divps xmm2, xmm0
subps xmm1, xmm2 // Result - 124.22551499 - 1.498030302 * MX.S - 1.72587999 / (0.3520887068 + MX.S)

// Unpack the 4-lane result back into 3 components
movhlps xmm0, xmm1
movq [Result], xmm1
movss [Result+8], xmm0
end;
1562
{ Fast approximate component-wise base-2 logarithm for a 4D vector.
  Same Mineiro-style fastlog2 algorithm as the scalar overload, applied to
  all four lanes with unaligned 128-bit loads/stores. NOTE(review):
  SSE_LOG2_I1 and SSE_LOG2_F1..F5 are declared elsewhere in this unit. }
function FastLog2(const A: TVector4): TVector4; assembler;
asm
movups xmm0, [A]
movups xmm2, [SSE_MASK_FRACTION]
movaps xmm1, xmm0

// MX.I := (VX.I and $007FFFFF) or $3F000000
movups xmm3, [SSE_LOG2_I1]
pand xmm0, xmm2
cvtdq2ps xmm1, xmm1 // reinterpret-as-int trick: convert VX.I (bit pattern) to float
movups xmm4, [SSE_LOG2_F1]
por xmm0, xmm3

movups xmm2, [SSE_LOG2_F2]
mulps xmm1, xmm4 // VX.I * 1.1920928955078125e-7
movups xmm3, [SSE_LOG2_F3]
subps xmm1, xmm2 // Result - 124.22551499
mulps xmm3, xmm0
movups xmm4, [SSE_LOG2_F5]
subps xmm1, xmm3 // Result - 124.22551499 - 1.498030302 * MX.S
movups xmm2, [SSE_LOG2_F4]
addps xmm0, xmm4
divps xmm2, xmm0
subps xmm1, xmm2 // Result - 124.22551499 - 1.498030302 * MX.S - 1.72587999 / (0.3520887068 + MX.S)

movups [Result], xmm1
end;
1590
{ Fast approximate 2^A for a scalar Single.
  Temporarily switches MXCSR to round-down so cvtps2dq floors, giving
  Z := A - Floor(A) in [0,1); the fractional part is corrected with a
  rational term, scaled by 1 shl 23 and reinterpreted as a float's bit
  pattern via the final cvtps2dq into Result. NOTE(review):
  SSE_EXP2_F1..F5 are declared elsewhere in this unit — verify there. }
function FastExp2(const A: Single): Single; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set MXCSR rounding mode to Round Down (toward -Inf), saving the old mode
stmxcsr [OldFlags]
movss xmm0, [A]
mov ecx, [OldFlags]
xorps xmm1, xmm1
and ecx, SSE_ROUND_MASK
movss xmm3, xmm0
or ecx, SSE_ROUND_DOWN
movss xmm5, xmm0
mov [NewFlags], ecx

movss xmm1, DWORD [SSE_EXP2_F1]
ldmxcsr [NewFlags]

// Z := A - RoundDown(A)
cvtps2dq xmm3, xmm3
addss xmm1, xmm5 // A + 121.2740575
cvtdq2ps xmm3, xmm3
movss xmm2, DWORD [SSE_EXP2_F2]
subss xmm0, xmm3

movss xmm3, DWORD [SSE_EXP2_F3]
movss xmm4, DWORD [SSE_EXP2_F4]
subss xmm3, xmm0 // (4.84252568 - Z)
mulss xmm0, xmm4 // 1.49012907 * Z
divss xmm2, xmm3
movss xmm5, DWORD [SSE_EXP2_F5]
addss xmm1, xmm2 // A + 121.2740575 + 27.7280233 / (4.84252568 - Z)
subss xmm1, xmm0 // A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z
mulss xmm1, xmm5 // (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
cvtps2dq xmm1, xmm1

// Restore rounding mode
ldmxcsr [OldFlags]

movss [Result], xmm1
end;
1632
{ Fast approximate component-wise 2^A for a 2D vector.
  Same rounding-mode / bit-pattern algorithm as the scalar FastExp2; only
  the low 64 bits of the registers carry meaningful lanes. NOTE(review):
  SSE_EXP2_F1..F5 are declared elsewhere in this unit — verify there. }
function FastExp2(const A: TVector2): TVector2; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set MXCSR rounding mode to Round Down (toward -Inf), saving the old mode
stmxcsr [OldFlags]
movlps xmm0, [A]
mov ecx, [OldFlags]
xorps xmm1, xmm1
and ecx, SSE_ROUND_MASK
movaps xmm3, xmm0
or ecx, SSE_ROUND_DOWN
movaps xmm5, xmm0
mov [NewFlags], ecx

movlps xmm1, QWORD [SSE_EXP2_F1]
ldmxcsr [NewFlags]

// Z := A - RoundDown(A)
cvtps2dq xmm3, xmm3
addps xmm1, xmm5 // A + 121.2740575
cvtdq2ps xmm3, xmm3
movlps xmm2, QWORD [SSE_EXP2_F2]
subps xmm0, xmm3

movlps xmm3, QWORD [SSE_EXP2_F3]
movlps xmm4, QWORD [SSE_EXP2_F4]
subps xmm3, xmm0 // (4.84252568 - Z)
mulps xmm0, xmm4 // 1.49012907 * Z
divps xmm2, xmm3
movlps xmm5, QWORD [SSE_EXP2_F5]
addps xmm1, xmm2 // A + 121.2740575 + 27.7280233 / (4.84252568 - Z)
subps xmm1, xmm0 // A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z
mulps xmm1, xmm5 // (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
cvtps2dq xmm1, xmm1

// Restore rounding mode
ldmxcsr [OldFlags]

movlps [Result], xmm1
end;
1674
{ Fast approximate component-wise 2^A for a 3D vector.
  Same rounding-mode / bit-pattern algorithm as the scalar FastExp2; the
  three components are packed into one XMM register and unpacked on store.
  NOTE(review): SSE_EXP2_F1..F5 are declared elsewhere in this unit. }
function FastExp2(const A: TVector3): TVector3; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set MXCSR rounding mode to Round Down (toward -Inf), saving the old mode
stmxcsr [OldFlags]
movq xmm0, [A]
movss xmm1, [A+8]
movlhps xmm0, xmm1
mov ecx, [OldFlags]
xorps xmm1, xmm1
and ecx, SSE_ROUND_MASK
movaps xmm3, xmm0
or ecx, SSE_ROUND_DOWN
movaps xmm5, xmm0
mov [NewFlags], ecx

movups xmm1, [SSE_EXP2_F1]
ldmxcsr [NewFlags]

// Z := A - RoundDown(A)
cvtps2dq xmm3, xmm3
addps xmm1, xmm5 // A + 121.2740575
cvtdq2ps xmm3, xmm3
movups xmm2, [SSE_EXP2_F2]
subps xmm0, xmm3

movups xmm3, [SSE_EXP2_F3]
movups xmm4, [SSE_EXP2_F4]
subps xmm3, xmm0 // (4.84252568 - Z)
mulps xmm0, xmm4 // 1.49012907 * Z
divps xmm2, xmm3
movups xmm5, [SSE_EXP2_F5]
addps xmm1, xmm2 // A + 121.2740575 + 27.7280233 / (4.84252568 - Z)
subps xmm1, xmm0 // A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z
mulps xmm1, xmm5 // (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
cvtps2dq xmm1, xmm1

// Restore rounding mode
ldmxcsr [OldFlags]

// Unpack the 4-lane result back into 3 components
movhlps xmm0, xmm1
movq [Result], xmm1
movss [Result+8], xmm0
end;
1720
{ Fast approximate component-wise 2^A for a 4D vector.
  Same rounding-mode / bit-pattern algorithm as the scalar FastExp2,
  applied to all four lanes with unaligned 128-bit loads/stores.
  NOTE(review): SSE_EXP2_F1..F5 are declared elsewhere in this unit. }
function FastExp2(const A: TVector4): TVector4; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set MXCSR rounding mode to Round Down (toward -Inf), saving the old mode
stmxcsr [OldFlags]
movups xmm0, [A]
mov ecx, [OldFlags]
xorps xmm1, xmm1
and ecx, SSE_ROUND_MASK
movaps xmm3, xmm0
or ecx, SSE_ROUND_DOWN
movaps xmm5, xmm0
mov [NewFlags], ecx

movups xmm1, [SSE_EXP2_F1]
ldmxcsr [NewFlags]

// Z := A - RoundDown(A)
cvtps2dq xmm3, xmm3
addps xmm1, xmm5 // A + 121.2740575
cvtdq2ps xmm3, xmm3
movups xmm2, [SSE_EXP2_F2]
subps xmm0, xmm3

movups xmm3, [SSE_EXP2_F3]
movups xmm4, [SSE_EXP2_F4]
subps xmm3, xmm0 // (4.84252568 - Z)
mulps xmm0, xmm4 // 1.49012907 * Z
divps xmm2, xmm3
movups xmm5, [SSE_EXP2_F5]
addps xmm1, xmm2 // A + 121.2740575 + 27.7280233 / (4.84252568 - Z)
subps xmm1, xmm0 // A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z
mulps xmm1, xmm5 // (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
cvtps2dq xmm1, xmm1

// Restore rounding mode
ldmxcsr [OldFlags]

movups [Result], xmm1
end;
1762
1763{ Common Functions }
1764
{ Scalar absolute value; delegates to System.Abs so -0.0/NaN handling
  matches the RTL exactly. }
function Abs(const A: Single): Single;
begin
Result := System.Abs(A);
end;
1769
{ Component-wise absolute value of a 2D vector; each component goes
  through System.Abs, matching the scalar overload's semantics. }
function Abs(const A: TVector2): TVector2;
var
  AbsX, AbsY: Single;
begin
  AbsX := System.Abs(A.X);
  AbsY := System.Abs(A.Y);
  Result.Init(AbsX, AbsY);
end;
1774
{ Component-wise absolute value of a 3D vector: clears the IEEE-754 sign
  bit of each component with SSE_MASK_ABS_VAL ($7FFFFFFF per lane).
  X,Y are handled in xmm0, the third component separately in xmm1. }
function Abs(const A: TVector3): TVector3; assembler;
asm
movq xmm0, [A]
movss xmm1, [A+8]
movups xmm2, [SSE_MASK_ABS_VAL]
andps xmm0, xmm2
pand xmm1, xmm2
movq [Result], xmm0
movss [Result+8], xmm1
end;
1785
{ Component-wise absolute value of a 4D vector: clears the IEEE-754 sign
  bit of all four lanes with SSE_MASK_ABS_VAL ($7FFFFFFF per lane). }
function Abs(const A: TVector4): TVector4; assembler;
asm
movups xmm0, [A]
movups xmm1, [SSE_MASK_ABS_VAL]
andps xmm0, xmm1
movups [Result], xmm0
end;
1793
{ Branchless scalar sign function: returns -1 for A < 0, +1 for A > 0 and
  0 for A = 0 (including -0.0, since the final compare treats -0 = 0).
  Built by OR-ing A's sign bit onto the constant 1.0, then masking the
  result to zero when A compares equal to zero. }
function Sign(const A: Single): Single; assembler;
asm
movss xmm0, [A]
movss xmm1, DWORD [SSE_ONE]
movss xmm2, xmm0
movss xmm3, DWORD [SSE_MASK_SIGN]

andps xmm0, xmm3 // (A < 0)? Yes: $80000000, No: $00000000
xorps xmm4, xmm4
orps xmm0, xmm1 // (A < 0)? Yes: -1, No: 1
cmpneqss xmm2, xmm4 // (A = 0)? Yes: $00000000, No: $FFFFFFFF
andps xmm0, xmm2 // (A = 0)? Yes: 0, No: -1 or 1
movss [Result], xmm0
end;
1808
{ Branchless component-wise sign for a 2D vector: -1 / 0 / +1 per lane
  (0 is returned for -0.0 too). Same sign-bit + compare-mask technique as
  the scalar Sign overload; only the low 64 bits carry meaningful lanes. }
function Sign(const A: TVector2): TVector2; assembler;
asm
movlps xmm0, [A]
movlps xmm1, QWORD [SSE_ONE]
movaps xmm2, xmm0
movlps xmm3, QWORD [SSE_MASK_SIGN]

andps xmm0, xmm3 // (A < 0)? Yes: $80000000, No: $00000000
xorps xmm4, xmm4
orps xmm0, xmm1 // (A < 0)? Yes: -1, No: 1
cmpneqps xmm2, xmm4 // (A = 0)? Yes: $00000000, No: $FFFFFFFF
andps xmm0, xmm2 // (A = 0)? Yes: 0, No: -1 or 1
movlps [Result], xmm0
end;
1823
{ Branchless component-wise sign for a 3D vector: -1 / 0 / +1 per lane
  (0 is returned for -0.0 too). The three components are packed into one
  XMM register, processed with the same sign-bit + compare-mask technique
  as the scalar overload, then unpacked on store. }
function Sign(const A: TVector3): TVector3; assembler;
asm
movq xmm0, [A]
movss xmm1, [A+8]
movlhps xmm0, xmm1
movups xmm1, [SSE_ONE]
movaps xmm2, xmm0
movups xmm3, [SSE_MASK_SIGN]

andps xmm0, xmm3 // (A < 0)? Yes: $80000000, No: $00000000
xorps xmm4, xmm4
orps xmm0, xmm1 // (A < 0)? Yes: -1, No: 1
cmpneqps xmm2, xmm4 // (A = 0)? Yes: $00000000, No: $FFFFFFFF
andps xmm0, xmm2 // (A = 0)? Yes: 0, No: -1 or 1
movhlps xmm1, xmm0
movq [Result], xmm0
movss [Result+8], xmm1
end;
1842
{ Branchless component-wise sign for a 4D vector: -1 / 0 / +1 per lane
  (0 is returned for -0.0 too). Same sign-bit + compare-mask technique as
  the scalar Sign overload, over all four lanes. }
function Sign(const A: TVector4): TVector4; assembler;
asm
movups xmm0, [A]
movups xmm1, [SSE_ONE]
movaps xmm2, xmm0
movups xmm3, [SSE_MASK_SIGN]

andps xmm0, xmm3 // (A < 0)? Yes: $80000000, No: $00000000
xorps xmm4, xmm4
orps xmm0, xmm1 // (A < 0)? Yes: -1, No: 1
cmpneqps xmm2, xmm4 // (A = 0)? Yes: $00000000, No: $FFFFFFFF
andps xmm0, xmm2 // (A = 0)? Yes: 0, No: -1 or 1
movups [Result], xmm0
end;
1857
{ Floor of a Single to Integer via cvtps2dq with MXCSR temporarily set to
  round-down (toward -Inf); the previous rounding mode is restored before
  returning. Result is placed in EAX (integer function result).
  NOTE(review): no clamping — values outside the Int32 range follow
  cvtps2dq's out-of-range behavior; confirm callers never pass such A. }
function Floor(const A: Single): Integer; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Round Down
stmxcsr [OldFlags]
mov ecx, [OldFlags]
and ecx, SSE_ROUND_MASK
or ecx, SSE_ROUND_DOWN
mov [NewFlags], ecx
movss xmm0, [A]
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

// Restore rounding mode
ldmxcsr [OldFlags]

movd eax, xmm0
end;
1878
1879function Floor(const A: TVector2): TIVector2; assembler;
1880var
1881OldFlags, NewFlags: UInt32;
1882asm
1883// Set rounding mode to Round Down
1884stmxcsr [OldFlags]
1885mov ecx, [OldFlags]
1886and ecx, SSE_ROUND_MASK
1887or ecx, SSE_ROUND_DOWN
1888mov [NewFlags], ecx
1889movlps xmm0, [A]
1890ldmxcsr [NewFlags]
1891
1892cvtps2dq xmm0, xmm0
1893
1894// Restore rounding mode
1895ldmxcsr [OldFlags]
1896
1897movlps [Result], xmm0
1898end;
1899
1900function Floor(const A: TVector3): TIVector3; assembler;
1901var
1902OldFlags, NewFlags: UInt32;
1903asm
1904// Set rounding mode to Round Down
1905stmxcsr [OldFlags]
1906mov ecx, [OldFlags]
1907and ecx, SSE_ROUND_MASK
1908or ecx, SSE_ROUND_DOWN
1909mov [NewFlags], ecx
1910movq xmm0, [A]
1911movss xmm1, [A+8]
1912movlhps xmm0, xmm1
1913ldmxcsr [NewFlags]
1914
1915cvtps2dq xmm0, xmm0
1916
1917// Restore rounding mode
1918ldmxcsr [OldFlags]
1919
1920movhlps xmm1, xmm0
1921movq [Result], xmm0
1922movss [Result+8], xmm1
1923end;
1924
1925function Floor(const A: TVector4): TIVector4; assembler;
1926var
1927OldFlags, NewFlags: UInt32;
1928asm
1929// Set rounding mode to Round Down
1930stmxcsr [OldFlags]
1931mov ecx, [OldFlags]
1932and ecx, SSE_ROUND_MASK
1933or ecx, SSE_ROUND_DOWN
1934mov [NewFlags], ecx
1935movups xmm0, [A]
1936ldmxcsr [NewFlags]
1937
1938cvtps2dq xmm0, xmm0
1939
1940// Restore rounding mode
1941ldmxcsr [OldFlags]
1942
1943movups [Result], xmm0
1944end;
1945
{ Trunc(Single): rounds A toward zero and returns the result as an Integer.
  Delegates to System.Trunc; an SSE variant is kept below, disabled. }
function Trunc(const A: Single): Integer;
begin
Result := System.Trunc(A);
end;
1950{function Trunc(const A: Single): Integer; assembler;
1951var
1952OldFlags, NewFlags: UInt32;
1953asm
1954// Set rounding mode to Truncate
1955stmxcsr [OldFlags]
1956mov ecx, [OldFlags]
1957and ecx, SSE_ROUND_MASK
1958or ecx, SSE_ROUND_TRUNC
1959mov [NewFlags], ecx
1960movss xmm0, [A]
1961ldmxcsr [NewFlags]
1962
1963cvtps2dq xmm0, xmm0
1964
1965// Restore rounding mode
1966ldmxcsr [OldFlags]
1967
1968movd eax, xmm0
1969end;}
1970
{ Trunc(TVector2): component-wise round-toward-zero.
  The MXCSR rounding mode is temporarily forced to truncate for the
  cvtps2dq conversion, then the caller's MXCSR is restored.
  NOTE: NaN / out-of-Int32-range components convert to $80000000. }
function Trunc(const A: TVector2): TIVector2; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
stmxcsr [OldFlags]
mov ecx, [OldFlags]
and ecx, SSE_ROUND_MASK
or ecx, SSE_ROUND_TRUNC
mov [NewFlags], ecx
movlps xmm0, [A]
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

// Restore rounding mode
ldmxcsr [OldFlags]

movlps [Result], xmm0
end;

{ Trunc(TVector3): component-wise round-toward-zero; X/Y loaded as a
  qword, Z as a dword, packed with movlhps (store mirrors the split). }
function Trunc(const A: TVector3): TIVector3; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
stmxcsr [OldFlags]
mov ecx, [OldFlags]
and ecx, SSE_ROUND_MASK
or ecx, SSE_ROUND_TRUNC
mov [NewFlags], ecx
movq xmm0, [A]
movss xmm1, [A+8]
movlhps xmm0, xmm1
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

// Restore rounding mode
ldmxcsr [OldFlags]

movhlps xmm1, xmm0
movq [Result], xmm0
movss [Result+8], xmm1
end;

{ Trunc(TVector4): component-wise round-toward-zero on all four components. }
function Trunc(const A: TVector4): TIVector4; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
stmxcsr [OldFlags]
mov ecx, [OldFlags]
and ecx, SSE_ROUND_MASK
or ecx, SSE_ROUND_TRUNC
mov [NewFlags], ecx
movups xmm0, [A]
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

// Restore rounding mode
ldmxcsr [OldFlags]

movups [Result], xmm0
end;
2037
{ Round(Single): rounds A to the nearest Integer.
  Delegates to System.Round, which breaks ties toward the nearest even
  integer ("banker's rounding") -- matching the SSE default
  round-to-nearest-even mode used by the vector overloads below. }
function Round(const A: Single): Integer;
begin
Result := System.Round(A);
end;
2042
{ Round(TVector2): component-wise round-to-nearest (ties to even).
  Relies on MXCSR being in its default round-to-nearest mode, so no
  mode switch is needed. }
function Round(const A: TVector2): TIVector2; assembler;
asm
// Rounding mode defaults to round-to-nearest
movlps xmm0, [A]
cvtps2dq xmm0, xmm0
movlps [Result], xmm0
end;

{ Round(TVector3): component-wise round-to-nearest (ties to even). }
function Round(const A: TVector3): TIVector3; assembler;
asm
// Rounding mode defaults to round-to-nearest
movq xmm0, [A]
movss xmm1, [A+8]
movlhps xmm0, xmm1
cvtps2dq xmm0, xmm0
movhlps xmm1, xmm0
movq [Result], xmm0
movss [Result+8], xmm1
end;

{ Round(TVector4): component-wise round-to-nearest (ties to even). }
function Round(const A: TVector4): TIVector4; assembler;
asm
// Rounding mode defaults to round-to-nearest
movups xmm0, [A]
cvtps2dq xmm0, xmm0
movups [Result], xmm0
end;
2070
{ Ceil(Single): smallest integer >= A, returned as an Integer.
  The MXCSR rounding mode is temporarily forced to round-up for the
  cvtps2dq conversion, then the caller's MXCSR is restored.
  NOTE: NaN / out-of-Int32-range values convert to $80000000. }
function Ceil(const A: Single): Integer; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Ceil
stmxcsr [OldFlags]
mov ecx, [OldFlags]
and ecx, SSE_ROUND_MASK
or ecx, SSE_ROUND_UP
mov [NewFlags], ecx
movss xmm0, [A]
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

// Restore rounding mode
ldmxcsr [OldFlags]

movd eax, xmm0 // Integer function result in EAX
end;

{ Ceil(TVector2): component-wise Ceil via a temporary round-up MXCSR mode. }
function Ceil(const A: TVector2): TIVector2; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Ceil
stmxcsr [OldFlags]
mov ecx, [OldFlags]
and ecx, SSE_ROUND_MASK
or ecx, SSE_ROUND_UP
mov [NewFlags], ecx
movlps xmm0, [A]
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

// Restore rounding mode
ldmxcsr [OldFlags]

movlps [Result], xmm0
end;

{ Ceil(TVector3): component-wise Ceil; X/Y loaded as a qword, Z as a
  dword, packed with movlhps (store mirrors the split). }
function Ceil(const A: TVector3): TIVector3; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Ceil
stmxcsr [OldFlags]
mov ecx, [OldFlags]
and ecx, SSE_ROUND_MASK
or ecx, SSE_ROUND_UP
mov [NewFlags], ecx
movq xmm0, [A]
movss xmm1, [A+8]
movlhps xmm0, xmm1
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

// Restore rounding mode
ldmxcsr [OldFlags]

movhlps xmm1, xmm0
movq [Result], xmm0
movss [Result+8], xmm1
end;

{ Ceil(TVector4): component-wise Ceil on all four components. }
function Ceil(const A: TVector4): TIVector4; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Ceil
stmxcsr [OldFlags]
mov ecx, [OldFlags]
and ecx, SSE_ROUND_MASK
or ecx, SSE_ROUND_UP
mov [NewFlags], ecx
movups xmm0, [A]
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

// Restore rounding mode
ldmxcsr [OldFlags]

movups [Result], xmm0
end;
2158
{ Frac(Single): fractional part of A, computed as A - Trunc(A), so the
  result carries the sign of A. Truncation is done with cvtps2dq under a
  temporarily-truncating MXCSR mode, converted back with cvtdq2ps.
  NOTE: only meaningful while |A| fits in an Int32; larger magnitudes hit
  the cvtps2dq $80000000 "indefinite" conversion. }
function Frac(const A: Single): Single; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
stmxcsr [OldFlags]
mov ecx, [OldFlags]
and ecx, SSE_ROUND_MASK
or ecx, SSE_ROUND_TRUNC
movss xmm0, [A]
mov [NewFlags], ecx
movss xmm1, xmm0 // keep original A in xmm1
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0
ldmxcsr [OldFlags] // restore caller's rounding mode
cvtdq2ps xmm0, xmm0
subss xmm1, xmm0 // A - Trunc(A)

movss [Result], xmm1
end;

{ Frac(TVector2): component-wise fractional part (A - Trunc(A)). }
function Frac(const A: TVector2): TVector2; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
stmxcsr [OldFlags]
mov ecx, [OldFlags]
and ecx, SSE_ROUND_MASK
or ecx, SSE_ROUND_TRUNC
movlps xmm0, [A]
mov [NewFlags], ecx
movaps xmm1, xmm0 // keep original A in xmm1
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0
ldmxcsr [OldFlags] // restore caller's rounding mode
cvtdq2ps xmm0, xmm0
subps xmm1, xmm0 // A - Trunc(A)

movlps [Result], xmm1
end;

{ Frac(TVector3): component-wise fractional part; X/Y loaded as a qword,
  Z as a dword, packed with movlhps (store mirrors the split). }
function Frac(const A: TVector3): TVector3; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
stmxcsr [OldFlags]
mov ecx, [OldFlags]
and ecx, SSE_ROUND_MASK
or ecx, SSE_ROUND_TRUNC
movq xmm0, [A]
movss xmm1, [A+8]
movlhps xmm0, xmm1
mov [NewFlags], ecx
movaps xmm1, xmm0 // keep original A in xmm1
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0
ldmxcsr [OldFlags] // restore caller's rounding mode
cvtdq2ps xmm0, xmm0
subps xmm1, xmm0 // A - Trunc(A)

movhlps xmm0, xmm1
movq [Result], xmm1
movss [Result+8], xmm0
end;

{ Frac(TVector4): component-wise fractional part on all four components. }
function Frac(const A: TVector4): TVector4; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
stmxcsr [OldFlags]
mov ecx, [OldFlags]
and ecx, SSE_ROUND_MASK
or ecx, SSE_ROUND_TRUNC
movups xmm0, [A]
mov [NewFlags], ecx
movaps xmm1, xmm0 // keep original A in xmm1
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0
ldmxcsr [OldFlags] // restore caller's rounding mode
cvtdq2ps xmm0, xmm0
subps xmm1, xmm0 // A - Trunc(A)

movups [Result], xmm1
end;
2250
{ FMod(Single): floating-point remainder of A / B, using truncated
  division: A - (B * Trunc(A / B)). The result has the sign of A.
  An SSE variant is kept below, disabled. }
function FMod(const A, B: Single): Single;
var
  Quotient: Integer;
begin
  // Whole quotients, rounded toward zero; the remainder follows directly.
  Quotient := Trunc(A / B);
  Result := A - (Quotient * B);
end;
2255{function FMod(const A, B: Single): Single; assembler;
2256var
2257OldFlags, NewFlags: UInt32;
2258asm
2259// Set rounding mode to Truncate
2260movss xmm0, [A]
2261stmxcsr [OldFlags]
2262movss xmm1, [B]
2263mov edx, [OldFlags]
2264movss xmm2, xmm0
2265and edx, SSE_ROUND_MASK
2266movss xmm3, xmm1
2267or edx, SSE_ROUND_TRUNC
2268divss xmm2, xmm3 // A / B
2269mov [NewFlags], edx
2270ldmxcsr [NewFlags]
2271
2272cvtps2dq xmm2, xmm2
2273cvtdq2ps xmm2, xmm2 // Trunc(A / B)
2274mulss xmm2, xmm1
2275subss xmm0, xmm2 // A - (B * Trunc(A / B))
2276
2277// Restore rounding mode
2278ldmxcsr [OldFlags]
2279
2280movss [Result], xmm0
2281end;}
2282
{ FMod(TVector2, Single): component-wise remainder A - (B * Trunc(A / B));
  each result component has the sign of the corresponding A component.
  Trunc is implemented with cvtps2dq/cvtdq2ps under a temporarily
  truncating MXCSR mode; the scalar MXCSR bookkeeping is interleaved with
  the SSE loads. Caller's MXCSR is restored before returning. }
function FMod(const A: TVector2; const B: Single): TVector2; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
movlps xmm0, [A]
stmxcsr [OldFlags]
movss xmm1, [B]
mov ecx, [OldFlags]
shufps xmm1, xmm1, $00 // Replicate B
and ecx, SSE_ROUND_MASK
movaps xmm2, xmm0
or ecx, SSE_ROUND_TRUNC
movaps xmm3, xmm1
mov [NewFlags], ecx
divps xmm2, xmm3 // A / B
ldmxcsr [NewFlags]

cvtps2dq xmm2, xmm2
cvtdq2ps xmm2, xmm2 // Trunc(A / B)
mulps xmm2, xmm1
subps xmm0, xmm2 // A - (B * Trunc(A / B))

// Restore rounding mode
ldmxcsr [OldFlags]

movlps [Result], xmm0
end;

{ FMod(TVector2, TVector2): component-wise remainder A - (B * Trunc(A / B)). }
function FMod(const A, B: TVector2): TVector2; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
movlps xmm0, [A]
stmxcsr [OldFlags]
movlps xmm1, [B]
mov edx, [OldFlags]
movaps xmm2, xmm0
and edx, SSE_ROUND_MASK
movaps xmm3, xmm1
or edx, SSE_ROUND_TRUNC
divps xmm2, xmm3 // A / B
mov [NewFlags], edx
ldmxcsr [NewFlags]

cvtps2dq xmm2, xmm2
cvtdq2ps xmm2, xmm2 // Trunc(A / B)
mulps xmm2, xmm1
subps xmm0, xmm2 // A - (B * Trunc(A / B))

// Restore rounding mode
ldmxcsr [OldFlags]

movlps [Result], xmm0
end;

{ FMod(TVector3, Single): component-wise remainder against a scalar divisor. }
function FMod(const A: TVector3; const B: Single): TVector3; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
movq xmm0, [A]
movss xmm1, [A+8]
movlhps xmm0, xmm1
stmxcsr [OldFlags]
movss xmm1, [B]
mov ecx, [OldFlags]
shufps xmm1, xmm1, $00 // Replicate B
and ecx, SSE_ROUND_MASK
movaps xmm2, xmm0
or ecx, SSE_ROUND_TRUNC
movaps xmm3, xmm1
mov [NewFlags], ecx
divps xmm2, xmm3 // A / B
ldmxcsr [NewFlags]

cvtps2dq xmm2, xmm2
cvtdq2ps xmm2, xmm2 // Trunc(A / B)
mulps xmm2, xmm1
subps xmm0, xmm2 // A - (B * Trunc(A / B))

// Restore rounding mode
ldmxcsr [OldFlags]

movhlps xmm1, xmm0
movq [Result], xmm0
movss [Result+8], xmm1
end;

{ FMod(TVector3, TVector3): component-wise remainder A - (B * Trunc(A / B)). }
function FMod(const A, B: TVector3): TVector3; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
movq xmm0, [A]
movss xmm1, [A+8]
movlhps xmm0, xmm1
stmxcsr [OldFlags]
movq xmm1, [B]
movss xmm2, [B+8]
movlhps xmm1, xmm2
mov edx, [OldFlags]
movaps xmm2, xmm0
and edx, SSE_ROUND_MASK
movaps xmm3, xmm1
or edx, SSE_ROUND_TRUNC
divps xmm2, xmm3 // A / B
mov [NewFlags], edx
ldmxcsr [NewFlags]

cvtps2dq xmm2, xmm2
cvtdq2ps xmm2, xmm2 // Trunc(A / B)
mulps xmm2, xmm1
subps xmm0, xmm2 // A - (B * Trunc(A / B))

// Restore rounding mode
ldmxcsr [OldFlags]

movhlps xmm1, xmm0
movq [Result], xmm0
movss [Result+8], xmm1
end;

{ FMod(TVector4, Single): component-wise remainder against a scalar divisor. }
function FMod(const A: TVector4; const B: Single): TVector4; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
movups xmm0, [A]
stmxcsr [OldFlags]
movss xmm1, [B]
mov ecx, [OldFlags]
shufps xmm1, xmm1, $00 // Replicate B
and ecx, SSE_ROUND_MASK
movaps xmm2, xmm0
or ecx, SSE_ROUND_TRUNC
movaps xmm3, xmm1
mov [NewFlags], ecx
divps xmm2, xmm3 // A / B
ldmxcsr [NewFlags]

cvtps2dq xmm2, xmm2
cvtdq2ps xmm2, xmm2 // Trunc(A / B)
mulps xmm2, xmm1
subps xmm0, xmm2 // A - (B * Trunc(A / B))

// Restore rounding mode
ldmxcsr [OldFlags]

movups [Result], xmm0
end;

{ FMod(TVector4, TVector4): component-wise remainder A - (B * Trunc(A / B)). }
function FMod(const A, B: TVector4): TVector4; assembler;
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
movups xmm0, [A]
stmxcsr [OldFlags]
movups xmm1, [B]
mov edx, [OldFlags]
movaps xmm2, xmm0
and edx, SSE_ROUND_MASK
movaps xmm3, xmm1
or edx, SSE_ROUND_TRUNC
divps xmm2, xmm3 // A / B
mov [NewFlags], edx
ldmxcsr [NewFlags]

cvtps2dq xmm2, xmm2
cvtdq2ps xmm2, xmm2 // Trunc(A / B)
mulps xmm2, xmm1
subps xmm0, xmm2 // A - (B * Trunc(A / B))

// Restore rounding mode
ldmxcsr [OldFlags]

movups [Result], xmm0
end;
2463
{ ModF(Single): splits A into its fractional and integral parts.
  Returns A - Trunc(A) (same sign as A) and stores Trunc(A) in the out
  parameter B. Uses a temporarily truncating MXCSR mode for cvtps2dq;
  the caller's MXCSR is restored before returning. }
function ModF(const A: Single; out B: Integer): Single; assembler;
var
OldFlags, NewFlags: UInt32;
asm
movss xmm0, [A]

// Set rounding mode to Truncate
stmxcsr [OldFlags]
mov edx, [OldFlags]
and edx, SSE_ROUND_MASK
or edx, SSE_ROUND_TRUNC
mov [NewFlags], edx
ldmxcsr [NewFlags]

movss xmm1, xmm0 // keep original A in xmm1
cvtps2dq xmm0, xmm0
movss [B], xmm0 // B = Trunc(A)
cvtdq2ps xmm0, xmm0
subss xmm1, xmm0 // A - Trunc(A)

// Restore rounding mode
ldmxcsr [OldFlags]

movss [Result], xmm1
end;

{ ModF(TVector2): component-wise split into fractional (returned) and
  integral (out-parameter B) parts. }
function ModF(const A: TVector2; out B: TIVector2): TVector2; assembler;
var
OldFlags, NewFlags: UInt32;
asm
movlps xmm0, [A]

// Set rounding mode to Truncate
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_TRUNC
mov [NewFlags], eax
ldmxcsr [NewFlags]

movaps xmm1, xmm0 // keep original A in xmm1
cvtps2dq xmm0, xmm0
movlps [B], xmm0 // B = Trunc(A)
cvtdq2ps xmm0, xmm0
subps xmm1, xmm0 // A - Trunc(A)

// Restore rounding mode
ldmxcsr [OldFlags]

movlps [Result], xmm1
end;

{ ModF(TVector3): component-wise split; X/Y handled as a qword and Z as a
  dword on both the load and the two stores. }
function ModF(const A: TVector3; out B: TIVector3): TVector3; assembler;
var
OldFlags, NewFlags: UInt32;
asm
movq xmm0, [A]
movss xmm1, [A+8]
movlhps xmm0, xmm1

// Set rounding mode to Truncate
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_TRUNC
mov [NewFlags], eax
ldmxcsr [NewFlags]

movaps xmm1, xmm0 // keep original A in xmm1
cvtps2dq xmm0, xmm0
movhlps xmm2, xmm0
movq [B], xmm0 // B = Trunc(A)
movd [B+8], xmm2
cvtdq2ps xmm0, xmm0
subps xmm1, xmm0 // A - Trunc(A)

// Restore rounding mode
ldmxcsr [OldFlags]

movhlps xmm0, xmm1
movq [Result], xmm1
movss [Result+8], xmm0
end;

{ ModF(TVector4): component-wise split on all four components. }
function ModF(const A: TVector4; out B: TIVector4): TVector4; assembler;
var
OldFlags, NewFlags: UInt32;
asm
movups xmm0, [A]

// Set rounding mode to Truncate
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_TRUNC
mov [NewFlags], eax
ldmxcsr [NewFlags]

movaps xmm1, xmm0 // keep original A in xmm1
cvtps2dq xmm0, xmm0
movups [B], xmm0 // B = Trunc(A)
cvtdq2ps xmm0, xmm0
subps xmm1, xmm0 // A - Trunc(A)

// Restore rounding mode
ldmxcsr [OldFlags]

movups [Result], xmm1
end;
2573
{ Min(TVector2, Single): component-wise minimum of A and the scalar B.
  NOTE: minps returns its second operand (here B) when either operand is
  a NaN (documented Intel behavior), so NaN components of A become B. }
function Min(const A: TVector2; const B: Single): TVector2; assembler;
asm
movss xmm1, [B]
movlps xmm0, [A]
shufps xmm1, xmm1, $00 // Replicate B
minps xmm0, xmm1
movlps [Result], xmm0
end;

{ Min(TVector2, TVector2): component-wise minimum. }
function Min(const A, B: TVector2): TVector2; assembler;
asm
movlps xmm0, [A]
movlps xmm1, [B]
minps xmm0, xmm1
movlps [Result], xmm0
end;

{ Min(TVector3, Single): component-wise minimum of A and the scalar B. }
function Min(const A: TVector3; const B: Single): TVector3; assembler;
asm
movss xmm1, [B]
movq xmm0, [A]
movss xmm2, [A+8]
movlhps xmm0, xmm2
shufps xmm1, xmm1, $00 // Replicate B
minps xmm0, xmm1
movhlps xmm1, xmm0
movq [Result], xmm0
movss [Result+8], xmm1
end;

{ Min(TVector3, TVector3): component-wise minimum. }
function Min(const A, B: TVector3): TVector3; assembler;
asm
movq xmm0, [A]
movss xmm1, [A+8]
movlhps xmm0, xmm1
movq xmm1, [B]
movss xmm2, [B+8]
movlhps xmm1, xmm2
minps xmm0, xmm1
movhlps xmm1, xmm0
movq [Result], xmm0
movss [Result+8], xmm1
end;

{ Min(TVector4, Single): component-wise minimum of A and the scalar B. }
function Min(const A: TVector4; const B: Single): TVector4; assembler;
asm
movss xmm1, [B]
movups xmm0, [A]
shufps xmm1, xmm1, $00 // Replicate B
minps xmm0, xmm1
movups [Result], xmm0
end;

{ Min(TVector4, TVector4): component-wise minimum. }
function Min(const A, B: TVector4): TVector4; assembler;
asm
movups xmm0, [A]
movups xmm1, [B]
minps xmm0, xmm1
movups [Result], xmm0
end;
2634
{ Max(TVector2, Single): component-wise maximum of A and the scalar B.
  NOTE: maxps returns its second operand (here B) when either operand is
  a NaN (documented Intel behavior), so NaN components of A become B. }
function Max(const A: TVector2; const B: Single): TVector2; assembler;
asm
movss xmm1, [B]
movlps xmm0, [A]
shufps xmm1, xmm1, $00 // Replicate B
maxps xmm0, xmm1
movlps [Result], xmm0
end;

{ Max(TVector2, TVector2): component-wise maximum. }
function Max(const A, B: TVector2): TVector2; assembler;
asm
movlps xmm0, [A]
movlps xmm1, [B]
maxps xmm0, xmm1
movlps [Result], xmm0
end;

{ Max(TVector3, Single): component-wise maximum of A and the scalar B. }
function Max(const A: TVector3; const B: Single): TVector3; assembler;
asm
movss xmm1, [B]
movq xmm0, [A]
movss xmm2, [A+8]
movlhps xmm0, xmm2
shufps xmm1, xmm1, $00 // Replicate B
maxps xmm0, xmm1
movhlps xmm1, xmm0
movq [Result], xmm0
movss [Result+8], xmm1
end;

{ Max(TVector3, TVector3): component-wise maximum. }
function Max(const A, B: TVector3): TVector3; assembler;
asm
movq xmm0, [A]
movss xmm1, [A+8]
movlhps xmm0, xmm1
movq xmm1, [B]
movss xmm2, [B+8]
movlhps xmm1, xmm2
maxps xmm0, xmm1
movhlps xmm1, xmm0
movq [Result], xmm0
movss [Result+8], xmm1
end;

{ Max(TVector4, Single): component-wise maximum of A and the scalar B. }
function Max(const A: TVector4; const B: Single): TVector4; assembler;
asm
movss xmm1, [B]
movups xmm0, [A]
shufps xmm1, xmm1, $00 // Replicate B
maxps xmm0, xmm1
movups [Result], xmm0
end;

{ Max(TVector4, TVector4): component-wise maximum. }
function Max(const A, B: TVector4): TVector4; assembler;
asm
movups xmm0, [A]
movups xmm1, [B]
maxps xmm0, xmm1
movups [Result], xmm0
end;
2695
{ EnsureRange(Single): clamps A to the range [AMin, AMax].
  Implemented as min(max(A, AMin), AMax); because the max is applied
  first, AMax wins when AMin > AMax. }
function EnsureRange(const A, AMin, AMax: Single): Single; assembler;
asm
movss xmm0, [A]
movss xmm1, [AMin]
movss xmm2, [AMax]
maxss xmm0, xmm1
minss xmm0, xmm2
movss [Result], xmm0
end;

{ EnsureRange(TVector2, scalars): clamps every component to [AMin, AMax]. }
function EnsureRange(const A: TVector2; const AMin, AMax: Single): TVector2; assembler;
asm
movlps xmm0, [A]
movss xmm1, [AMin]
movss xmm2, [AMax]
shufps xmm1, xmm1, $00 // Replicate AMin
shufps xmm2, xmm2, $00 // Replicate AMax
maxps xmm0, xmm1
minps xmm0, xmm2
movlps [Result], xmm0
end;

{ EnsureRange(TVector2, vectors): component-wise clamp to [AMin, AMax]. }
function EnsureRange(const A, AMin, AMax: TVector2): TVector2; assembler;
asm
movlps xmm0, [A]
movlps xmm1, [AMin]
movlps xmm2, [AMax]
maxps xmm0, xmm1
mov eax, [Result] // load address of Result (returned via pointer)
minps xmm0, xmm2
movlps [eax], xmm0
end;

{ EnsureRange(TVector3, scalars): clamps every component to [AMin, AMax]. }
function EnsureRange(const A: TVector3; const AMin, AMax: Single): TVector3; assembler;
asm
movq xmm0, [A]
movss xmm1, [A+8]
movlhps xmm0, xmm1
movss xmm1, [AMin]
movss xmm2, [AMax]
shufps xmm1, xmm1, $00 // Replicate AMin
shufps xmm2, xmm2, $00 // Replicate AMax
maxps xmm0, xmm1
minps xmm0, xmm2
movhlps xmm1, xmm0
movq [Result], xmm0
movss [Result+8], xmm1
end;

{ EnsureRange(TVector3, vectors): component-wise clamp to [AMin, AMax]. }
function EnsureRange(const A, AMin, AMax: TVector3): TVector3; assembler;
asm
movq xmm0, [A]
movss xmm1, [A+8]
movlhps xmm0, xmm1
movq xmm1, [AMin]
movss xmm2, [AMin+8]
movlhps xmm1, xmm2
movq xmm2, [AMax]
movss xmm3, [AMax+8]
movlhps xmm2, xmm3
maxps xmm0, xmm1
mov eax, [Result] // load address of Result (returned via pointer)
minps xmm0, xmm2
movhlps xmm1, xmm0
movq [eax], xmm0
movss [eax+8], xmm1
end;

{ EnsureRange(TVector4, scalars): clamps every component to [AMin, AMax]. }
function EnsureRange(const A: TVector4; const AMin, AMax: Single): TVector4; assembler;
asm
movups xmm0, [A]
movss xmm1, [AMin]
movss xmm2, [AMax]
shufps xmm1, xmm1, $00 // Replicate AMin
shufps xmm2, xmm2, $00 // Replicate AMax
maxps xmm0, xmm1
minps xmm0, xmm2
movups [Result], xmm0
end;

{ EnsureRange(TVector4, vectors): component-wise clamp to [AMin, AMax]. }
function EnsureRange(const A, AMin, AMax: TVector4): TVector4; assembler;
asm
movups xmm0, [A]
movups xmm1, [AMin]
movups xmm2, [AMax]
maxps xmm0, xmm1
mov eax, [Result] // load address of Result (returned via pointer)
minps xmm0, xmm2
movups [eax], xmm0
end;
2786
{ Mix(TVector2, scalar T): component-wise linear interpolation between A
  and B, delegating to the scalar Mix overload for each component. }
function Mix(const A, B: TVector2; const T: Single): TVector2;
begin
Result.Init(Mix(A.X, B.X, T), Mix(A.Y, B.Y, T));
end;

{ Mix(TVector2, vector T): like above, with a per-component weight. }
function Mix(const A, B, T: TVector2): TVector2;
begin
Result.Init(Mix(A.X, B.X, T.X), Mix(A.Y, B.Y, T.Y));
end;
2796
{ Mix(TVector3, scalar T): component-wise linear interpolation,
  computed as A + (T * (B - A)). }
function Mix(const A, B: TVector3; const T: Single): TVector3; assembler;
asm
movss xmm2, [T]
movq xmm0, [A]
movss xmm1, [A+8]
movlhps xmm0, xmm1
movq xmm1, [B]
movss xmm3, [B+8]
movlhps xmm1, xmm3
shufps xmm2, xmm2, $00 // Replicate T
subps xmm1, xmm0
mulps xmm1, xmm2
addps xmm0, xmm1 // A + (T * (B - A))
movhlps xmm1, xmm0
movq [Result], xmm0
movss [Result+8], xmm1
end;

{ Mix(TVector3, vector T): component-wise interpolation with
  per-component weights: A + (T * (B - A)). }
function Mix(const A, B, T: TVector3): TVector3; assembler;
asm
movq xmm0, [A]
movss xmm1, [A+8]
movlhps xmm0, xmm1
movq xmm1, [B]
movss xmm2, [B+8]
movlhps xmm1, xmm2
movq xmm2, [T]
movss xmm3, [T+8]
movlhps xmm2, xmm3
subps xmm1, xmm0
mulps xmm1, xmm2
mov eax, [Result] // load address of Result (returned via pointer)
addps xmm0, xmm1 // A + (T * (B - A))
movhlps xmm1, xmm0
movq [eax], xmm0
movss [eax+8], xmm1
end;

{ Mix(TVector4, scalar T): component-wise linear interpolation. }
function Mix(const A, B: TVector4; const T: Single): TVector4; assembler;
asm
movss xmm2, [T]
movups xmm0, [A]
movups xmm1, [B]
shufps xmm2, xmm2, $00 // Replicate T
subps xmm1, xmm0
mulps xmm1, xmm2
addps xmm0, xmm1 // A + (T * (B - A))
movups [Result], xmm0
end;

{ Mix(TVector4, vector T): component-wise interpolation with
  per-component weights. }
function Mix(const A, B, T: TVector4): TVector4; assembler;
asm
movups xmm0, [A]
movups xmm1, [B]
movups xmm2, [T]
subps xmm1, xmm0
mulps xmm1, xmm2
mov eax, [Result] // load address of Result (returned via pointer)
addps xmm0, xmm1 // A + (T * (B - A))
movups [eax], xmm0
end;
2858
{ Step(scalar edge, TVector2): per component, 0.0 where A < AEdge and
  1.0 where A >= AEdge (GLSL step semantics). Branch-free: a cmpnltps
  mask AND'ed with 1.0. }
function Step(const AEdge: Single; const A: TVector2): TVector2; assembler;
asm
movss xmm0, [AEdge]
movlps xmm1, [A]
shufps xmm0, xmm0, $00 // Replicate AEdge
movlps xmm2, QWORD [SSE_ONE]
cmpnltps xmm1, xmm0 // (A >= AEdge)? Yes: $FFFFFFFF, No: $00000000
andps xmm1, xmm2 // (A >= AEdge)? Yes: 1, No: 0
movlps [Result], xmm1
end;

{ Step(vector edge, TVector2): per-component 0/1 threshold. }
function Step(const AEdge, A: TVector2): TVector2; assembler;
asm
movlps xmm0, [AEdge]
movlps xmm1, [A]
movlps xmm2, QWORD [SSE_ONE]
cmpnltps xmm1, xmm0 // (A >= AEdge)? Yes: $FFFFFFFF, No: $00000000
andps xmm1, xmm2 // (A >= AEdge)? Yes: 1, No: 0
movlps [Result], xmm1
end;

{ Step(scalar edge, TVector3): per-component 0/1 threshold. }
function Step(const AEdge: Single; const A: TVector3): TVector3; assembler;
asm
movss xmm0, [AEdge]
movq xmm1, [A]
movss xmm2, [A+8]
movlhps xmm1, xmm2
shufps xmm0, xmm0, $00 // Replicate AEdge
movups xmm2, [SSE_ONE]
cmpnltps xmm1, xmm0 // (A >= AEdge)? Yes: $FFFFFFFF, No: $00000000
andps xmm1, xmm2 // (A >= AEdge)? Yes: 1, No: 0
movhlps xmm0, xmm1
movq [Result], xmm1
movss [Result+8], xmm0
end;

{ Step(vector edge, TVector3): per-component 0/1 threshold. }
function Step(const AEdge, A: TVector3): TVector3; assembler;
asm
movq xmm0, [AEdge]
movss xmm1, [AEdge+8]
movlhps xmm0, xmm1
movq xmm1, [A]
movss xmm2, [A+8]
movlhps xmm1, xmm2
movups xmm2, [SSE_ONE]
cmpnltps xmm1, xmm0 // (A >= AEdge)? Yes: $FFFFFFFF, No: $00000000
andps xmm1, xmm2 // (A >= AEdge)? Yes: 1, No: 0
movhlps xmm0, xmm1
movq [Result], xmm1
movss [Result+8], xmm0
end;

{ Step(scalar edge, TVector4): per-component 0/1 threshold. }
function Step(const AEdge: Single; const A: TVector4): TVector4; assembler;
asm
movss xmm0, [AEdge]
movups xmm1, [A]
shufps xmm0, xmm0, $00 // Replicate AEdge
movups xmm2, [SSE_ONE]
cmpnltps xmm1, xmm0 // (A >= AEdge)? Yes: $FFFFFFFF, No: $00000000
andps xmm1, xmm2 // (A >= AEdge)? Yes: 1, No: 0
movups [Result], xmm1
end;

{ Step(vector edge, TVector4): per-component 0/1 threshold. }
function Step(const AEdge, A: TVector4): TVector4; assembler;
asm
movups xmm0, [AEdge]
movups xmm1, [A]
movups xmm2, [SSE_ONE]
cmpnltps xmm1, xmm0 // (A >= AEdge)? Yes: $FFFFFFFF, No: $00000000
andps xmm1, xmm2 // (A >= AEdge)? Yes: 1, No: 0
movups [Result], xmm1
end;
2931
{ SmoothStep(scalar edges, TVector2): component-wise Hermite smoothstep
  between AEdge0 and AEdge1, delegating to the scalar overload. }
function SmoothStep(const AEdge0, AEdge1: Single; const A: TVector2): TVector2;
begin
Result.Init(SmoothStep(AEdge0, AEdge1, A.X), SmoothStep(AEdge0, AEdge1, A.Y));
end;

{ SmoothStep(vector edges, TVector2): like above, with per-component edges. }
function SmoothStep(const AEdge0, AEdge1, A: TVector2): TVector2;
begin
Result.Init(SmoothStep(AEdge0.X, AEdge1.X, A.X), SmoothStep(AEdge0.Y, AEdge1.Y, A.Y));
end;

{ SmoothStep(scalar edges, TVector3): component-wise smoothstep; the SSE
  variant is kept below, disabled. }
function SmoothStep(const AEdge0, AEdge1: Single; const A: TVector3): TVector3;
begin
Result.Init(SmoothStep(AEdge0, AEdge1, A.X), SmoothStep(AEdge0, AEdge1, A.Y), SmoothStep(AEdge0, AEdge1, A.Z));
end;
2946{function SmoothStep(const AEdge0, AEdge1: Single; const A: TVector3): TVector3; assembler;
2947asm
2948movq xmm2, [A]
2949movss xmm1, [A+8]
2950movlhps xmm2, xmm1
2951movss xmm0, [AEdge0]
2952movss xmm1, [AEdge1]
2953shufps xmm0, xmm0, $00 // Replicate AEdge0
2954shufps xmm1, xmm1, $00 // Replicate AEdge1
2955movaps xmm3, xmm2
2956movaps xmm4, xmm2
2957movaps xmm5, xmm2
2958movups xmm6, [SSE_ONE]
2959
2960cmpnltps xmm3, xmm0 // (A >= AEdge0)? Yes: $FFFFFFFF, No: $00000000
2961cmpleps xmm4, xmm1 // (A <= AEdge1)? Yes: $FFFFFFFF, No: $00000000
2962subps xmm1, xmm0
2963movaps xmm5, xmm4
2964subps xmm2, xmm0
2965andnps xmm5, xmm6 // (A > AEdge1)? Yes: 1.0, No: 0.0
2966
2967movups xmm6, [SSE_TWO]
2968divps xmm2, xmm1 // Temp := (A - AEdge0) / (AEdge1 - AEdge0)
2969movups xmm7, [SSE_THREE]
2970mulps xmm6, xmm2 // 2 * Temp
2971subps xmm7, xmm6 // 3 - (2 * Temp)
2972mulps xmm7, xmm2
2973mulps xmm7, xmm2 // Result := Temp * Temp * (3 - (2 * Temp))
2974andps xmm7, xmm3 // (A < AEdge0)? Yes: 0, No: Result
2975andps xmm7, xmm4 // (A > AEdge1)? Yes: 0, No: Result
2976orps xmm7, xmm5 // (A > AEdge1)? Yes: 1, No: Result
2977
2978movhlps xmm6, xmm7
2979movq [Result], xmm7
2980movss [Result+8], xmm6
2981end;}
2982
{ SmoothStep(vector edges, TVector3): component-wise Hermite smoothstep.
  Per component: 0 for A <= AEdge0, 1 for A >= AEdge1, otherwise
  T*T*(3 - 2*T) with T = (A - AEdge0) / (AEdge1 - AEdge0). Branch-free:
  comparison masks select between the polynomial, 0 and 1.
  NOTE(review): if AEdge0 = AEdge1 the divps divides by zero; the masks
  still force 0/1 for A outside the (empty) range, but A exactly equal to
  both edges goes through the NaN path -- confirm intended behavior. }
function SmoothStep(const AEdge0, AEdge1, A: TVector3): TVector3; assembler;
asm
movq xmm2, [A]
movss xmm1, [A+8]
movlhps xmm2, xmm1
movq xmm0, [AEdge0]
movss xmm1, [AEdge0+8]
movlhps xmm0, xmm1
movq xmm1, [AEdge1]
movss xmm3, [AEdge1+8]
movlhps xmm1, xmm3
movaps xmm3, xmm2
movaps xmm4, xmm2
movaps xmm5, xmm2 // (dead store: xmm5 is overwritten below before use)
movups xmm6, [SSE_ONE]

cmpnltps xmm3, xmm0 // (A >= AEdge0)? Yes: $FFFFFFFF, No: $00000000
cmpleps xmm4, xmm1 // (A <= AEdge1)? Yes: $FFFFFFFF, No: $00000000
subps xmm1, xmm0
movaps xmm5, xmm4
subps xmm2, xmm0
andnps xmm5, xmm6 // (A > AEdge1)? Yes: 1.0, No: 0.0

movups xmm6, [SSE_TWO]
divps xmm2, xmm1 // Temp := (A - AEdge0) / (AEdge1 - AEdge0)
movups xmm7, [SSE_THREE]
mulps xmm6, xmm2 // 2 * Temp
subps xmm7, xmm6 // 3 - (2 * Temp)
mulps xmm7, xmm2
mulps xmm7, xmm2 // Result := Temp * Temp * (3 - (2 * Temp))
andps xmm7, xmm3 // (A < AEdge0)? Yes: 0, No: Result
andps xmm7, xmm4 // (A > AEdge1)? Yes: 0, No: Result
orps xmm7, xmm5 // (A > AEdge1)? Yes: 1, No: Result

mov eax, [Result] // load address of Result (returned via pointer)
movhlps xmm6, xmm7
movq [eax], xmm7
movss [eax+8], xmm6
end;

{ SmoothStep(scalar edges, TVector4): component-wise Hermite smoothstep,
  same branch-free mask technique as the TVector3 overload. }
function SmoothStep(const AEdge0, AEdge1: Single; const A: TVector4): TVector4; assembler;
asm
movups xmm2, [A]
movss xmm0, [AEdge0]
movss xmm1, [AEdge1]
shufps xmm0, xmm0, $00 // Replicate AEdge0
shufps xmm1, xmm1, $00 // Replicate AEdge1
movaps xmm3, xmm2
movaps xmm4, xmm2
movaps xmm5, xmm2 // (dead store: xmm5 is overwritten below before use)
movups xmm6, [SSE_ONE]

cmpnltps xmm3, xmm0 // (A >= AEdge0)? Yes: $FFFFFFFF, No: $00000000
cmpleps xmm4, xmm1 // (A <= AEdge1)? Yes: $FFFFFFFF, No: $00000000
subps xmm1, xmm0
movaps xmm5, xmm4
subps xmm2, xmm0
andnps xmm5, xmm6 // (A > AEdge1)? Yes: 1.0, No: 0.0

movups xmm6, [SSE_TWO]
divps xmm2, xmm1 // Temp := (A - AEdge0) / (AEdge1 - AEdge0)
movups xmm7, [SSE_THREE]
mulps xmm6, xmm2 // 2 * Temp
subps xmm7, xmm6 // 3 - (2 * Temp)
mulps xmm7, xmm2
mulps xmm7, xmm2 // Result := Temp * Temp * (3 - (2 * Temp))
andps xmm7, xmm3 // (A < AEdge0)? Yes: 0, No: Result
andps xmm7, xmm4 // (A > AEdge1)? Yes: 0, No: Result
orps xmm7, xmm5 // (A > AEdge1)? Yes: 1, No: Result

movups [Result], xmm7
end;

{ SmoothStep(vector edges, TVector4): component-wise Hermite smoothstep
  with per-component edge pairs. }
function SmoothStep(const AEdge0, AEdge1, A: TVector4): TVector4; assembler;
asm
movups xmm2, [A]
movups xmm0, [AEdge0]
movups xmm1, [AEdge1]
movaps xmm3, xmm2
movaps xmm4, xmm2
movaps xmm5, xmm2 // (dead store: xmm5 is overwritten below before use)
movups xmm6, [SSE_ONE]

cmpnltps xmm3, xmm0 // (A >= AEdge0)? Yes: $FFFFFFFF, No: $00000000
cmpleps xmm4, xmm1 // (A <= AEdge1)? Yes: $FFFFFFFF, No: $00000000
subps xmm1, xmm0
movaps xmm5, xmm4
subps xmm2, xmm0
andnps xmm5, xmm6 // (A > AEdge1)? Yes: 1.0, No: 0.0

movups xmm6, [SSE_TWO]
divps xmm2, xmm1 // Temp := (A - AEdge0) / (AEdge1 - AEdge0)
movups xmm7, [SSE_THREE]
mulps xmm6, xmm2 // 2 * Temp
subps xmm7, xmm6 // 3 - (2 * Temp)
mulps xmm7, xmm2
mulps xmm7, xmm2 // Result := Temp * Temp * (3 - (2 * Temp))
andps xmm7, xmm3 // (A < AEdge0)? Yes: 0, No: Result
andps xmm7, xmm4 // (A > AEdge1)? Yes: 0, No: Result
orps xmm7, xmm5 // (A > AEdge1)? Yes: 1, No: Result

mov eax, [Result] // load address of Result (returned via pointer)
movups [eax], xmm7
end;
3087
{ FMA(TVector2): component-wise (A * B) + C.
  NOTE: implemented with separate mulps/addps, so the intermediate
  product is rounded -- this is NOT a true fused multiply-add. }
function FMA(const A, B, C: TVector2): TVector2; assembler;
asm
movlps xmm0, [A]
movlps xmm1, [B]
movlps xmm2, [C]
mulps xmm0, xmm1
addps xmm0, xmm2
mov eax, [Result] // load address of Result (returned via pointer)
movlps [eax], xmm0
end;

{ FMA(TVector3): component-wise (A * B) + C (mul + add, not fused). }
function FMA(const A, B, C: TVector3): TVector3; assembler;
asm
movq xmm0, [A]
movss xmm1, [A+8]
movlhps xmm0, xmm1
movq xmm1, [B]
movss xmm2, [B+8]
movlhps xmm1, xmm2
movq xmm2, [C]
movss xmm3, [C+8]
movlhps xmm2, xmm3
mulps xmm0, xmm1
addps xmm0, xmm2
mov eax, [Result] // load address of Result (returned via pointer)
movhlps xmm1, xmm0
movq [eax], xmm0
movss [eax+8], xmm1
end;

{ FMA(TVector4): component-wise (A * B) + C (mul + add, not fused). }
function FMA(const A, B, C: TVector4): TVector4; assembler;
asm
movups xmm0, [A]
movups xmm1, [B]
movups xmm2, [C]
mulps xmm0, xmm1
addps xmm0, xmm2
mov eax, [Result] // load address of Result (returned via pointer)
movups [eax], xmm0
end;
3128
3129{ Matrix functions }
3130
3131{$IFDEF FM_COLUMN_MAJOR}
{ OuterProduct(TVector2), column-major build: M[i,j] = C[i] * R[j].
  xmm0 becomes (R.X, R.X, R.Y, R.Y) and xmm1 (C.X, C.Y, C.X, C.Y) in
  memory order, so the product stores column 0 = C*R.X, column 1 = C*R.Y. }
function OuterProduct(const C, R: TVector2): TMatrix2; assembler;
asm
movlps xmm0, [R]
movlps xmm1, [C]

shufps xmm0, xmm0, $50 // (R.X, R.X, R.Y, R.Y) in memory order
shufps xmm1, xmm1, $44 // (C.X, C.Y, C.X, C.Y) in memory order

mulps xmm1, xmm0 // column j = C * R[j]

// Store as matrix
movups [Result], xmm1
end;

{ OuterProduct(TVector3), column-major build: M[i,j] = C[i] * R[j].
  Each column j = C * R[j]; columns are stored packed, 12 bytes apart. }
function OuterProduct(const C, R: TVector3): TMatrix3; assembler;
asm
movq xmm0, [C]
movss xmm1, [C+8]
movlhps xmm0, xmm1
movq xmm1, [R]
movss xmm2, [R+8]
movlhps xmm1, xmm2
movaps xmm2, xmm1
movaps xmm3, xmm1

shufps xmm1, xmm1, $00 // R.X (4x)
shufps xmm2, xmm2, $55 // R.Y (4x)
shufps xmm3, xmm3, $AA // R.Z (4x)

mulps xmm1, xmm0 // column 0 = C * R.X
mulps xmm2, xmm0 // column 1 = C * R.Y
mulps xmm3, xmm0 // column 2 = C * R.Z

// Store as matrix (columns at 12-byte stride)
movhlps xmm0, xmm1
movhlps xmm4, xmm2
movhlps xmm5, xmm3
movq QWORD [Result+$00], xmm1
movss [Result+$08], xmm0
movq QWORD [Result+$0C], xmm2
movss [Result+$14], xmm4
movq QWORD [Result+$18], xmm3
movss [Result+$20], xmm5
end;

{ OuterProduct(TVector4), column-major build: M[i,j] = C[i] * R[j].
  Each 16-byte store is one column j = C * R[j]. }
function OuterProduct(const C, R: TVector4): TMatrix4; assembler;
asm
movups xmm0, [C]
movups xmm1, [R]
movaps xmm2, xmm1
movaps xmm3, xmm1
movaps xmm4, xmm1

shufps xmm1, xmm1, $00 // R.X (4x)
shufps xmm2, xmm2, $55 // R.Y (4x)
shufps xmm3, xmm3, $AA // R.Z (4x)
shufps xmm4, xmm4, $FF // R.W (4x)

mulps xmm1, xmm0 // column 0 = C * R.X
mulps xmm2, xmm0 // column 1 = C * R.Y
mulps xmm3, xmm0 // column 2 = C * R.Z
mulps xmm4, xmm0 // column 3 = C * R.W

// Store as matrix
movups DQWORD [Result + $00], xmm1
movups DQWORD [Result + $10], xmm2
movups DQWORD [Result + $20], xmm3
movups DQWORD [Result + $30], xmm4
end;
3201{$ELSE}
function OuterProduct(const C, R: TVector2): TMatrix2; assembler;
{ Outer product of column vector C and row vector R (row-major build):
  element [i,j] = C[j] * R[i], produced with two shuffles and one mulps. }
asm
movlps xmm0, [C] // # # C.Y C.X
movlps xmm1, [R] // # # R.Y R.X

shufps xmm0, xmm0, $50 // C.Y C.X C.Y C.X
shufps xmm1, xmm1, $44 // R.Y R.Y R.X R.X

mulps xmm1, xmm0 // (C.Y*R.Y) (C.X*R.Y) (C.Y*R.X) (C.X*R.X)

// Store as matrix
movups [Result], xmm1
end;
3215
function OuterProduct(const C, R: TVector3): TMatrix3; assembler;
{ Outer product of column vector C and row vector R (row-major build):
  row i of the result is R scaled by C[i]. The 3x3 result is stored as
  three rows of 12 bytes (QWORD pair + scalar) at offsets $00, $0C, $18. }
asm
movq xmm0, [R] // Pack R into xmm0
movss xmm1, [R+8]
movlhps xmm0, xmm1
movq xmm1, [C] // Pack C into xmm1
movss xmm2, [C+8]
movlhps xmm1, xmm2
movaps xmm2, xmm1
movaps xmm3, xmm1

shufps xmm1, xmm1, $00 // C.X (4x)
shufps xmm2, xmm2, $55 // C.Y (4x)
shufps xmm3, xmm3, $AA // C.Z (4x)

mulps xmm1, xmm0 // R * C.X
mulps xmm2, xmm0 // R * C.Y
mulps xmm3, xmm0 // R * C.Z

// Store as matrix
movhlps xmm0, xmm1
movhlps xmm4, xmm2
movhlps xmm5, xmm3
movq QWORD [Result+$00], xmm1
movss [Result+$08], xmm0
movq QWORD [Result+$0C], xmm2
movss [Result+$14], xmm4
movq QWORD [Result+$18], xmm3
movss [Result+$20], xmm5
end;
3246
function OuterProduct(const C, R: TVector4): TMatrix4; assembler;
{ Outer product of column vector C and row vector R (row-major build):
  row i of the result is R scaled by C[i]; four unaligned row stores. }
asm
movups xmm0, [R]
movups xmm1, [C]
movaps xmm2, xmm1
movaps xmm3, xmm1
movaps xmm4, xmm1

shufps xmm1, xmm1, $00 // C.X (4x)
shufps xmm2, xmm2, $55 // C.Y (4x)
shufps xmm3, xmm3, $AA // C.Z (4x)
shufps xmm4, xmm4, $FF // C.W (4x)

mulps xmm1, xmm0 // R * C.X
mulps xmm2, xmm0 // R * C.Y
mulps xmm3, xmm0 // R * C.Z
mulps xmm4, xmm0 // R * C.W

// Store as matrix
movups DQWORD [Result + $00], xmm1
movups DQWORD [Result + $10], xmm2
movups DQWORD [Result + $20], xmm3
movups DQWORD [Result + $30], xmm4
end;
3271{$ENDIF}
3272
3273{ TVector2 }
3274
3275{ These SIMD versions are similar to the ones for TVector4. The main difference
3276is using the "movlps" instruction (to load 2 values) instead of the
3277"movups" instruction (that loads 4 values) }
3278
class operator TVector2.Add(const A: TVector2; const B: Single): TVector2;
{ Adds scalar B to each component of A. }
begin
  Result.Y := B + A.Y;
  Result.X := B + A.X;
end;
3284
class operator TVector2.Add(const A: Single; const B: TVector2): TVector2;
{ Adds scalar A to each component of B. }
begin
  Result.Y := B.Y + A;
  Result.X := B.X + A;
end;
3290
class operator TVector2.Add(const A, B: TVector2): TVector2;
{ Component-wise vector addition. }
begin
  Result.Y := B.Y + A.Y;
  Result.X := B.X + A.X;
end;
3296
function TVector2.Distance(const AOther: TVector2): Single;
{ Euclidean distance between Self and AOther. }
var
  Delta: TVector2;
begin
  Delta := Self - AOther;
  Result := Delta.Length;
end;
3301
function TVector2.DistanceSquared(const AOther: TVector2): Single;
{ Squared Euclidean distance between Self and AOther (avoids the Sqrt). }
var
  Delta: TVector2;
begin
  Delta := Self - AOther;
  Result := Delta.LengthSquared;
end;
3306
class operator TVector2.Divide(const A: TVector2; const B: Single): TVector2;
{ Divides each component of A by scalar B, using one reciprocal and two
  multiplies. B = 0 produces infinities/NaNs rather than raising. }
var
  Scale: Single;
begin
  Scale := 1 / B;
  Result.Y := A.Y * Scale;
  Result.X := A.X * Scale;
end;
3315
class operator TVector2.Divide(const A: Single; const B: TVector2): TVector2; assembler;
{ Result.X := A / B.X; Result.Y := A / B.Y. The upper two lanes divide by
  whatever garbage movlps left there, but only the low two lanes are stored
  (harmless with SSE exceptions masked, the default). }
asm
movss xmm0, [A]
movlps xmm1, [B]
shufps xmm0, xmm0, 0 // Replicate A into all 4 lanes
divps xmm0, xmm1 // A / B (component-wise)
movlps [Result], xmm0
end;
3324
class operator TVector2.Divide(const A, B: TVector2): TVector2; assembler;
{ Component-wise division A / B. Upper-lane garbage is divided too but
  never stored. }
asm
movlps xmm0, [A]
movlps xmm1, [B]
divps xmm0, xmm1 // A / B (component-wise)
movlps [Result], xmm0
end;
3332
function TVector2.Dot(const AOther: TVector2): Single;
{ 2D dot product of Self and AOther. }
begin
  Result := (AOther.X * X) + (AOther.Y * Y);
end;
3337
function TVector2.FaceForward(const I, NRef: TVector2): TVector2;
{ GLSL faceforward: returns Self if NRef.Dot(I) < 0, otherwise -Self.
  (Condition kept as "< 0" so NaN dot products still select -Self.) }
begin
  Result := Self;
  if not (NRef.Dot(I) < 0) then
    Result := -Self;
end;
3345
function TVector2.GetLength: Single;
{ Euclidean length: Sqrt(X*X + Y*Y). }
begin
  Result := Sqrt(Sqr(X) + Sqr(Y));
end;
3350
function TVector2.GetLengthSquared: Single;
{ Squared Euclidean length: X*X + Y*Y. }
begin
  Result := Sqr(X) + Sqr(Y);
end;
3355
class operator TVector2.Multiply(const A: TVector2; const B: Single): TVector2;
{ Scales each component of A by B. }
begin
  Result.Y := B * A.Y;
  Result.X := B * A.X;
end;
3361
class operator TVector2.Multiply(const A: Single; const B: TVector2): TVector2;
{ Scales each component of B by A. }
begin
  Result.Y := B.Y * A;
  Result.X := B.X * A;
end;
3367
class operator TVector2.Multiply(const A, B: TVector2): TVector2;
{ Component-wise (Hadamard) product. }
begin
  Result.Y := B.Y * A.Y;
  Result.X := B.X * A.X;
end;
3373
function TVector2.NormalizeFast: TVector2; assembler;
{ Approximate unit vector: Self * rsqrtps(Dot(Self, Self)). Uses the
  hardware reciprocal-square-root approximation, trading precision for
  speed versus an exact normalize. }
asm
movlps xmm0, [Self] // Y X
movaps xmm2, xmm0
mulps xmm0, xmm0 // Y*Y X*X
pshufd xmm1, xmm0, $01 // X*X Y*Y
addps xmm0, xmm1 // (X*X+Y*Y) (2x)
rsqrtps xmm0, xmm0 // (1 / Sqrt(X*X + Y*Y)) (4x)
mulps xmm0, xmm2 // A * (1 / Sqrt(Dot(A, A)))
movlps [Result], xmm0
end;
3385
function TVector2.Reflect(const N: TVector2): TVector2;
{ Reflects Self about normal N: Self - 2*Dot(N, Self)*N. }
var
  TwiceProjection: Single;
begin
  TwiceProjection := 2 * N.Dot(Self);
  Result := Self - (TwiceProjection * N);
end;
3390
function TVector2.Refract(const N: TVector2; const Eta: Single): TVector2;
{ GLSL-style refraction of Self through normal N with index ratio Eta.
  Returns the zero vector on total internal reflection (K < 0). }
var
  Cosine, K: Single;
begin
  Cosine := N.Dot(Self);
  K := 1 - Eta * Eta * (1 - Cosine * Cosine);
  if (K < 0) then
    Result.Init
  else
    Result := (Eta * Self) - ((Eta * Cosine + Sqrt(K)) * N);
end;
3402
procedure TVector2.SetNormalizedFast; assembler;
{ In-place version of NormalizeFast: Self := Self * rsqrtps(Dot(Self, Self)).
  Approximate, via the hardware reciprocal-square-root instruction. }
asm
movlps xmm0, [Self] // Y X
movaps xmm2, xmm0
mulps xmm0, xmm0 // Y*Y X*X
pshufd xmm1, xmm0, $01 // X*X Y*Y
addps xmm0, xmm1 // (X*X+Y*Y) (2x)
rsqrtps xmm0, xmm0 // (1 / Sqrt(X*X + Y*Y)) (4x)
mulps xmm0, xmm2 // A * (1 / Sqrt(Dot(A, A)))
movlps [Self], xmm0
end;
3414
class operator TVector2.Subtract(const A: TVector2; const B: Single): TVector2;
{ Subtracts scalar B from each component of A. }
begin
  Result.Y := A.Y - B;
  Result.X := A.X - B;
end;
3420
class operator TVector2.Subtract(const A: Single; const B: TVector2): TVector2;
{ Subtracts each component of B from scalar A. }
begin
  Result.Y := A - B.Y;
  Result.X := A - B.X;
end;
3426
class operator TVector2.Subtract(const A, B: TVector2): TVector2;
{ Component-wise vector subtraction. }
begin
  Result.Y := A.Y - B.Y;
  Result.X := A.X - B.X;
end;
3432
3433{ TVector3 }
3434
class operator TVector3.Add(const A: TVector3; const B: Single): TVector3; assembler;
{ Adds scalar B to each component of A. X/Y go through the packed lanes,
  Z through a scalar addss. }
asm
movss xmm2, [B] // Load single floating-point value
movq xmm0, [A] // Load 3 floating-point values
movss xmm1, [A+8]
shufps xmm2, xmm2, 0 // Replicate B
addps xmm0, xmm2 // A + B
addss xmm1, xmm2
movq [Result], xmm0
movss [Result+8], xmm1
end;
3446
class operator TVector3.Add(const A: Single; const B: TVector3): TVector3; assembler;
{ Adds scalar A to each component of B (mirror of the vector+scalar
  overload). }
asm
movss xmm2, [A]
movq xmm0, [B] // B.X, B.Y
movss xmm1, [B+8] // B.Z
shufps xmm2, xmm2, 0 // Replicate A
addps xmm0, xmm2
addss xmm1, xmm2
movq [Result], xmm0
movss [Result+8], xmm1
end;
3458
class operator TVector3.Add(const A, B: TVector3): TVector3;
{ Component-wise vector addition. }
begin
  Result.Z := B.Z + A.Z;
  Result.Y := B.Y + A.Y;
  Result.X := B.X + A.X;
end;
3465{class operator TVector3.Add(const A, B: TVector3): TVector3; assembler;
3466asm
3467movq xmm0, [A]
3468movss xmm1, [A+8]
3469movq xmm2, [B]
3470movss xmm3, [B+8]
3471addps xmm0, xmm2
3472addss xmm1, xmm3
3473movq [Result], xmm0
3474movss [Result+8], xmm1
3475end;}
3476
function TVector3.Distance(const AOther: TVector3): Single; assembler;
{ Euclidean distance: (Self - AOther).Length, computed entirely in SSE.
  Both vectors are packed into single registers before the subtract;
  the horizontal sum uses two shuffle/add steps. }
asm
movq xmm0, [Self]
movss xmm1, [Self+8]
movq xmm2, [AOther]
movss xmm3, [AOther+8]
movlhps xmm0, xmm1 // Pack Self: 0 Z Y X
movlhps xmm2, xmm3 // Pack AOther
subps xmm0, xmm2 // A - B

// (A - B).Length
mulps xmm0, xmm0
pshufd xmm1, xmm0, $0E
addps xmm0, xmm1
pshufd xmm1, xmm0, $01
addss xmm0, xmm1 // Horizontal sum in lane 0
sqrtss xmm0, xmm0
movss [Result], xmm0
end;
3496
function TVector3.DistanceSquared(const AOther: TVector3): Single; assembler;
{ Squared Euclidean distance: (Self - AOther).LengthSquared. Identical to
  Distance except the final sqrtss is omitted. }
asm
movq xmm0, [Self]
movss xmm1, [Self+8]
movq xmm2, [AOther]
movss xmm3, [AOther+8]
movlhps xmm0, xmm1 // Pack Self: 0 Z Y X
movlhps xmm2, xmm3 // Pack AOther
subps xmm0, xmm2 // A - B

// (A - B).LengthSquared
mulps xmm0, xmm0
pshufd xmm1, xmm0, $0E
addps xmm0, xmm1
pshufd xmm1, xmm0, $01
addss xmm0, xmm1 // Horizontal sum in lane 0
movss [Result], xmm0
end;
3515
class operator TVector3.Divide(const A: TVector3; const B: Single): TVector3;
{ Divides each component of A by scalar B via one reciprocal and three
  multiplies. B = 0 produces infinities/NaNs rather than raising. }
var
  Reciprocal: Single;
begin
  Reciprocal := 1 / B;
  Result.Z := A.Z * Reciprocal;
  Result.Y := A.Y * Reciprocal;
  Result.X := A.X * Reciprocal;
end;
3525
class operator TVector3.Divide(const A: Single; const B: TVector3): TVector3; assembler;
{ Result := A / B, component-wise: X/Y via packed divps, Z via scalar
  divss. Upper-lane garbage in the packed divide is never stored. }
asm
movss xmm0, [A]
movq xmm1, [B] // B.X, B.Y
movss xmm2, [B+8] // B.Z
movss xmm3, xmm0 // Keep a scalar copy of A for the Z divide
shufps xmm0, xmm0, 0 // Replicate A
divps xmm0, xmm1
divss xmm3, xmm2
movq [Result], xmm0
movss [Result+8], xmm3
end;
3538
class operator TVector3.Divide(const A, B: TVector3): TVector3; assembler;
{ Component-wise division A / B: X/Y packed, Z scalar. }
asm
movq xmm0, [A]
movss xmm1, [A+8]
movq xmm2, [B]
movss xmm3, [B+8]
divps xmm0, xmm2 // A.XY / B.XY
divss xmm1, xmm3 // A.Z / B.Z
movq [Result], xmm0
movss [Result+8], xmm1
end;
3550
function TVector3.Cross(const AOther: TVector3): TVector3;
{ Cross product Self x AOther. }
begin
  Result.X := Y * AOther.Z - Z * AOther.Y;
  Result.Y := Z * AOther.X - X * AOther.Z;
  Result.Z := X * AOther.Y - Y * AOther.X;
end;
3557
function TVector3.Dot(const AOther: TVector3): Single;
{ 3D dot product of Self and AOther. }
begin
  Result := (AOther.X * X) + (AOther.Y * Y) + (AOther.Z * Z);
end;
3562
function TVector3.FaceForward(const I, NRef: TVector3): TVector3;
{ GLSL faceforward: returns Self if NRef.Dot(I) < 0, otherwise -Self.
  (Condition kept as "< 0" so NaN dot products still select -Self.) }
begin
  Result := Self;
  if not (NRef.Dot(I) < 0) then
    Result := -Self;
end;
3570
function TVector3.GetLength: Single; assembler;
{ Euclidean length Sqrt(X*X + Y*Y + Z*Z), with a two-step horizontal sum. }
asm
movq xmm0, [Self] // 0 0 Y X
movss xmm1, [Self+8] // 0 0 0 Z
movlhps xmm0, xmm1 // 0 Z Y Z
mulps xmm0, xmm0 // 0 Z*Z Y*Y X*X
pshufd xmm1, xmm0, $0E // Y*Y X*X 0 Z*Z
addps xmm0, xmm1 // # # (Y*Y) (X*X+Z*Z)
pshufd xmm1, xmm0, $01 // (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y)
addss xmm0, xmm1 // (X*X + Y*Y + Z*Z)
sqrtss xmm0, xmm0 // Sqrt(X*X + Y*Y + Z*Z)
movss [Result], xmm0
end;
3584
function TVector3.GetLengthSquared: Single;
{ Squared Euclidean length: X*X + Y*Y + Z*Z. }
begin
  Result := Sqr(X) + Sqr(Y) + Sqr(Z);
end;
3589{function TVector3.GetLengthSquared: Single; assembler;
3590asm
3591movq xmm0, [Self] // 0 0 Y X
3592movss xmm1, [Self+8] // 0 0 0 Z
3593movlhps xmm0, xmm1 // 0 Z Y Z
3594mulps xmm0, xmm0 // 0 Z*Z Y*Y X*X
3595pshufd xmm1, xmm0, $0E // Y*Y X*X 0 Z*Z
3596addps xmm0, xmm1 // # # (Y*Y) (X*X+Z*Z)
3597pshufd xmm1, xmm0, $01 // (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y)
3598addss xmm0, xmm1 // (X*X + Y*Y + Z*Z)
3599movss [Result], xmm0
3600end;}
3601
class operator TVector3.Multiply(const A: TVector3; const B: Single): TVector3; assembler;
{ Scales each component of A by B: X/Y packed, Z scalar. }
asm
movss xmm2, [B]
movq xmm0, [A]
movss xmm1, [A+8]
shufps xmm2, xmm2, 0 // Replicate B
mulps xmm0, xmm2
mulss xmm1, xmm2
movq [Result], xmm0
movss [Result+8], xmm1
end;
3613
class operator TVector3.Multiply(const A: Single; const B: TVector3): TVector3; assembler;
{ Scales each component of B by A (mirror of the vector*scalar overload). }
asm
movss xmm2, [A]
movq xmm0, [B]
movss xmm1, [B+8]
shufps xmm2, xmm2, 0 // Replicate A
mulps xmm0, xmm2
mulss xmm1, xmm2
movq [Result], xmm0
movss [Result+8], xmm1
end;
3625
class operator TVector3.Multiply(const A, B: TVector3): TVector3;
{ Component-wise (Hadamard) product. }
begin
  Result.Z := B.Z * A.Z;
  Result.Y := B.Y * A.Y;
  Result.X := B.X * A.X;
end;
3632{class operator TVector3.Multiply(const A, B: TVector3): TVector3; assembler;
3633asm
3634movq xmm0, [A]
3635movss xmm1, [A+8]
3636movq xmm2, [B]
3637movss xmm3, [B+8]
3638mulps xmm0, xmm2
3639mulss xmm1, xmm3
3640movq [Result], xmm0
3641movss [Result+8], xmm1
3642end;}
3643
class operator TVector3.Negative(const A: TVector3): TVector3;
{ Component-wise negation. }
begin
  Result.Z := -A.Z;
  Result.Y := -A.Y;
  Result.X := -A.X;
end;
3650{class operator TVector3.Negative(const A: TVector3): TVector3; assembler;
3651asm
3652movups xmm0, [SSE_MASK_SIGN] // Load mask with 4 sign (upper) bits
3653movq xmm1, [A]
3654movss xmm2, [A+8]
3655xorps xmm1, xmm0 // Flip sign bit
3656xorps xmm2, xmm0
3657movq [Result], xmm1
3658movss [Result+8], xmm2
3659end;}
3660
function TVector3.NormalizeFast: TVector3; assembler;
{ Approximate unit vector: Self * rsqrtps(Dot(Self, Self)). Uses the
  hardware reciprocal-square-root approximation, trading precision for
  speed versus an exact normalize. }
asm
movq xmm0, [Self] // 0 0 Y X
movss xmm1, [Self+8] // 0 0 0 Z
movlhps xmm0, xmm1 // 0 Z Y Z
movaps xmm2, xmm0

// Dot(A, A)
mulps xmm0, xmm0 // 0 Z*Z Y*Y X*X
pshufd xmm1, xmm0, $4E // Y*Y X*X 0 Z*Z
addps xmm0, xmm1 // (Y*Y) (X*X+Z*Z) (Y*Y) (X*X+Z*Z)
pshufd xmm1, xmm0, $11 // (X*X+Z*Z) (Y*Y) (X*X+Z*Z) (Y*Y)
addps xmm0, xmm1 // (X*X + Y*Y + Z*Z) (4x)

rsqrtps xmm0, xmm0 // (1 / Sqrt(X*X + Y*Y + Z*Z)) (4x)
mulps xmm0, xmm2 // A * (1 / Sqrt(Dot(A, A)))
movhlps xmm1, xmm0
movq [Result], xmm0
movss [Result+8], xmm1
end;
3681
function TVector3.Reflect(const N: TVector3): TVector3; assembler;
{ Reflection of Self about normal N: Self - (2 * Dot(N, Self)) * N,
  computed with a broadcast horizontal dot product. }
asm
movq xmm0, [Self]
movss xmm2, [Self+8]
movq xmm1, [N]
movss xmm3, [N+8]
movlhps xmm0, xmm2 // Pack Self
movlhps xmm1, xmm3 // Pack N
movaps xmm2, xmm0 // Keep a copy of Self (the incident vector I)
movups xmm3, [SSE_TWO]

// Dot(N, I)
mulps xmm0, xmm1
mulps xmm3, xmm1 // N * 2
pshufd xmm1, xmm0, $4E
addps xmm0, xmm1
pshufd xmm1, xmm0, $11
addps xmm0, xmm1 // Dot(N, I) broadcast to all lanes

// (2 * Dot(N, I)) * N
mulps xmm0, xmm3

// I - ((2 * Dot(N, I)) * N)
subps xmm2, xmm0
movhlps xmm3, xmm2
movq [Result], xmm2
movss [Result+8], xmm3
end;
3710
function TVector3.Refract(const N: TVector3; const Eta: Single): TVector3; assembler;
{ GLSL-style refraction of Self through normal N with index ratio Eta.
  Computes D := Dot(N, Self) and K := 1 - Eta^2 * (1 - D^2); when K < 0
  (total internal reflection) returns the zero vector, otherwise
  Eta * Self - (Eta * D + Sqrt(K)) * N. }
asm
movq xmm0, [Self]
movss xmm2, [Self+8]
movq xmm1, [N]
movss xmm3, [N+8]
movlhps xmm0, xmm2 // Pack Self (I)
movlhps xmm1, xmm3 // Pack N
movups xmm7, xmm0 // Keep I for the K >= 0 branch
movss xmm2, [Eta]
movss xmm3, DWORD [SSE_ONE]

// D := Dot(N, I)
mulps xmm0, xmm1
movss xmm4, xmm3 // 1
pshufd xmm1, xmm0, $4E
movss xmm5, xmm2 // Eta
addps xmm0, xmm1
mulss xmm5, xmm5 // Eta * Eta
pshufd xmm1, xmm0, $11
addss xmm0, xmm1

// K := 1 - Eta * Eta * (1 - D * D)
movss xmm6, xmm0 // D
mulss xmm0, xmm0 // D * D
subss xmm4, xmm0 // 1 - D * D
mulss xmm4, xmm5 // Eta * Eta * (1 - D * D)
xorps xmm5, xmm5 // 0
subss xmm3, xmm4 // K := 1 - Eta * Eta * (1 - D * D)

// if (K < 0) then
comiss xmm3, xmm5

jb @KLessThanZero

// K >= 0
mulss xmm6, xmm2 // Eta * D
shufps xmm2, xmm2, 0 // Replicate Eta (4x)
mulps xmm7, xmm2 // Eta * I
sqrtss xmm3, xmm3 // Sqrt(K)
addss xmm6, xmm3 // Eta * D + Sqrt(K)
shufps xmm6, xmm6, 0 // Replicate Eta * D + Sqrt(K) (4x)
movups xmm1, [N] // NOTE(review): reads 16 bytes from a 12-byte TVector3 (4-byte over-read past N); extra lane is never stored, but verify N cannot sit at the end of a mapped page
mulps xmm6, xmm1 // ((Eta * D + Sqrt(K)) * N)
subps xmm7, xmm6 // (Eta * I) - ((Eta * D + Sqrt(K)) * N)
movhlps xmm0, xmm7
movq [Result], xmm7
movss [Result+8], xmm0
jmp @Finish

@KLessThanZero:
// K < 0: Result := Vector3(0, 0, 0)
movlhps xmm6, xmm5
movq [Result], xmm5
movss [Result+8], xmm6

@Finish:
end;
3769
procedure TVector3.SetNormalizedFast; assembler;
{ In-place version of NormalizeFast: Self := Self * rsqrtps(Dot(Self, Self)).
  Approximate, via the hardware reciprocal-square-root instruction. }
asm
movq xmm0, [Self] // 0 0 Y X
movss xmm1, [Self+8] // 0 0 0 Z
movlhps xmm0, xmm1 // 0 Z Y Z
movaps xmm2, xmm0

// Dot(A, A)
mulps xmm0, xmm0 // 0 Z*Z Y*Y X*X
pshufd xmm1, xmm0, $4E // Y*Y X*X 0 Z*Z
addps xmm0, xmm1 // (Y*Y) (X*X+Z*Z) (Y*Y) (X*X+Z*Z)
pshufd xmm1, xmm0, $11 // (X*X+Z*Z) (Y*Y) (X*X+Z*Z) (Y*Y)
addps xmm0, xmm1 // (X*X + Y*Y + Z*Z) (4x)

rsqrtps xmm0, xmm0 // (1 / Sqrt(X*X + Y*Y + Z*Z)) (4x)
mulps xmm0, xmm2 // A * (1 / Sqrt(Dot(A, A)))
movhlps xmm1, xmm0
movq [Self], xmm0
movss [Self+8], xmm1
end;
3790
class operator TVector3.Subtract(const A: TVector3; const B: Single): TVector3; assembler;
{ Subtracts scalar B from each component of A: X/Y packed, Z scalar. }
asm
movss xmm2, [B]
movq xmm0, [A]
movss xmm1, [A+8]
shufps xmm2, xmm2, 0 // Replicate B
subps xmm0, xmm2
subss xmm1, xmm2
movq [Result], xmm0
movss [Result+8], xmm1
end;
3802
class operator TVector3.Subtract(const A: Single; const B: TVector3): TVector3; assembler;
{ Result := A - B, component-wise: X/Y packed, Z via a scalar copy of A. }
asm
movss xmm0, [A]
movq xmm1, [B]
movss xmm2, [B+8]
movss xmm3, xmm0 // Keep a scalar copy of A for the Z subtract
shufps xmm0, xmm0, 0 // Replicate A
subps xmm0, xmm1
subss xmm3, xmm2
movq [Result], xmm0
movss [Result+8], xmm3
end;
3815
class operator TVector3.Subtract(const A, B: TVector3): TVector3;
{ Component-wise vector subtraction. }
begin
  Result.Z := A.Z - B.Z;
  Result.Y := A.Y - B.Y;
  Result.X := A.X - B.X;
end;
3822{class operator TVector3.Subtract(const A, B: TVector3): TVector3; assembler;
3823asm
3824movq xmm0, [A]
3825movss xmm1, [A+8]
3826movq xmm2, [B]
3827movss xmm3, [B+8]
3828subps xmm0, xmm2
3829subss xmm1, xmm3
3830movq [Result], xmm0
3831movss [Result+8], xmm1
3832end;}
3833
3834{ TVector4 }
3835
class operator TVector4.Add(const A: TVector4; const B: Single): TVector4; assembler;
{ Adds scalar B to each of the four components of A. }
asm
movss xmm1, [B] // Load single floating-point value
movups xmm0, [A] // Load 4 floating-point values
shufps xmm1, xmm1, 0 // Replicate B
addps xmm0, xmm1 // A + B
movups [Result], xmm0 // Store result
end;
3844
class operator TVector4.Add(const A: Single; const B: TVector4): TVector4; assembler;
{ Adds scalar A to each of the four components of B. }
asm
movss xmm1, [A]
movups xmm0, [B]
shufps xmm1, xmm1, 0 // Replicate A
addps xmm0, xmm1
movups [Result], xmm0
end;
3853
class operator TVector4.Add(const A, B: TVector4): TVector4; assembler;
{ Component-wise vector addition. }
asm
movups xmm0, [A]
movups xmm1, [B]
addps xmm0, xmm1
movups [Result], xmm0
end;
3861
function TVector4.Distance(const AOther: TVector4): Single; assembler;
{ 4D Euclidean distance: (Self - AOther).Length, with a two-step
  horizontal sum. }
asm
movups xmm0, [Self]
movups xmm1, [AOther]
subps xmm0, xmm1 // A - B

// (A - B).Length
mulps xmm0, xmm0
pshufd xmm1, xmm0, $0E
addps xmm0, xmm1
pshufd xmm1, xmm0, $01
addss xmm0, xmm1 // Horizontal sum in lane 0
sqrtss xmm0, xmm0
movss [Result], xmm0
end;
3877
function TVector4.DistanceSquared(const AOther: TVector4): Single; assembler;
{ Squared 4D Euclidean distance: (Self - AOther).LengthSquared — the
  Distance routine without the final sqrtss. }
asm
movups xmm0, [Self]
movups xmm1, [AOther]
subps xmm0, xmm1 // A - B

// (A - B).LengthSquared
mulps xmm0, xmm0
pshufd xmm1, xmm0, $0E
addps xmm0, xmm1
pshufd xmm1, xmm0, $01
addss xmm0, xmm1 // Horizontal sum in lane 0
movss [Result], xmm0
end;
3892
class operator TVector4.Divide(const A: TVector4; const B: Single): TVector4; assembler;
{ Divides each component of A by scalar B (B = 0 yields infinities/NaNs
  with SSE exceptions masked, the default). }
asm
movss xmm1, [B]
movups xmm0, [A]
shufps xmm1, xmm1, 0 // Replicate B
divps xmm0, xmm1
movups [Result], xmm0
end;
3901
class operator TVector4.Divide(const A: Single; const B: TVector4): TVector4; assembler;
{ Result := A / B, component-wise. }
asm
movss xmm0, [A]
movups xmm1, [B]
shufps xmm0, xmm0, 0 // Replicate A
divps xmm0, xmm1
movups [Result], xmm0
end;
3910
class operator TVector4.Divide(const A, B: TVector4): TVector4; assembler;
{ Component-wise division A / B. }
asm
movups xmm0, [A]
movups xmm1, [B]
divps xmm0, xmm1
movups [Result], xmm0
end;
3918
function TVector4.Dot(const AOther: TVector4): Single;
{ 4D dot product of Self and AOther. }
begin
  Result := (AOther.X * X) + (AOther.Y * Y) + (AOther.Z * Z) + (AOther.W * W);
end;
3923
function TVector4.FaceForward(const I, NRef: TVector4): TVector4; assembler;
{ Branch-free faceforward: returns Self when Dot(NRef, I) < 0, otherwise
  -Self. The comparison mask is ANDed with the sign-bit mask and XORed
  into Self, flipping its sign only when Dot(NRef, I) >= 0. }
asm
movups xmm0, [Self]
movups xmm1, [I]
movups xmm2, [NRef]
xorps xmm3, xmm3 // 0
movups xmm4, [SSE_MASK_SIGN]

// Dot(NRef, I)
mulps xmm2, xmm1
pshufd xmm1, xmm2, $4E
addps xmm2, xmm1
pshufd xmm1, xmm2, $11
addps xmm2, xmm1

// Dot(NRef, I) >= 0? Yes: $FFFFFFFF, No: $00000000
cmpnltps xmm2, xmm3
andps xmm2, xmm4 // Yes: $80000000, No: $00000000

// Flip sign of N if (Dot(NRef, I) >= 0)
mov edx, [Result]
xorps xmm0, xmm2
movups [edx], xmm0
end;
3948
function TVector4.GetLength: Single; assembler;
{ 4D Euclidean length Sqrt(X*X + Y*Y + Z*Z + W*W), with a two-step
  horizontal sum. }
asm
movups xmm0, [Self] // W Z Y X
mulps xmm0, xmm0 // W*W Z*Z Y*Y X*X
pshufd xmm1, xmm0, $0E // Y*Y X*X W*W Z*Z
addps xmm0, xmm1 // # # (Y*Y+W*W) (X*X+Z*Z)
pshufd xmm1, xmm0, $01 // (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y+W*W)
addss xmm0, xmm1 // (X*X + Y*Y + Z*Z + W*W)
sqrtss xmm0, xmm0 // Sqrt(X*X + Y*Y + Z*Z + W*W)
movss [Result], xmm0
end;
3960
function TVector4.GetLengthSquared: Single;
{ Squared 4D Euclidean length: X*X + Y*Y + Z*Z + W*W. }
begin
  Result := Sqr(X) + Sqr(Y) + Sqr(Z) + Sqr(W);
end;
3965{function TVector4.GetLengthSquared: Single; assembler;
3966asm
3967movups xmm0, [Self] // W Z Y X
3968mulps xmm0, xmm0 // W*W Z*Z Y*Y X*X
3969pshufd xmm1, xmm0, $0E // Y*Y X*X W*W Z*Z
3970addps xmm0, xmm1 // # # (Y*Y+W*W) (X*X+Z*Z)
3971pshufd xmm1, xmm0, $01 // (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y+W*W)
3972addss xmm0, xmm1 // (X*X + Y*Y + Z*Z + W*W)
3973movss [Result], xmm0
3974end;}
3975
class operator TVector4.Multiply(const A: TVector4; const B: Single): TVector4; assembler;
{ Scales each component of A by B. }
asm
movss xmm1, [B]
movups xmm0, [A]
shufps xmm1, xmm1, 0 // Replicate B
mulps xmm0, xmm1
movups [Result], xmm0
end;
3984
class operator TVector4.Multiply(const A: Single; const B: TVector4): TVector4; assembler;
{ Scales each component of B by A. }
asm
movss xmm0, [A]
movups xmm1, [B]
shufps xmm0, xmm0, 0 // Replicate A
mulps xmm0, xmm1
movups [Result], xmm0
end;
3993
class operator TVector4.Multiply(const A, B: TVector4): TVector4; assembler;
{ Component-wise (Hadamard) product. }
asm
movups xmm0, [A]
movups xmm1, [B]
mulps xmm0, xmm1
movups [Result], xmm0
end;
4001
class operator TVector4.Negative(const A: TVector4): TVector4; assembler;
{ Component-wise negation by XORing the IEEE sign bits. }
asm
movups xmm0, [SSE_MASK_SIGN] // Load mask with 4 sign (upper) bits
movups xmm1, [A]
xorps xmm0, xmm1 // Flip sign bit
movups [Result], xmm0
end;
4009
function TVector4.NormalizeFast: TVector4;
{ Approximate unit vector: Self * rsqrtps(Dot(Self, Self)). Uses the
  hardware reciprocal-square-root approximation, trading precision for
  speed versus an exact normalize. }
asm
movups xmm0, [Self] // W Z Y X
movaps xmm2, xmm0

// Dot(A, A)
mulps xmm0, xmm0 // W*W Z*Z Y*Y X*X
pshufd xmm1, xmm0, $4E // Y*Y X*X W*W Z*Z
addps xmm0, xmm1 // (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z)
pshufd xmm1, xmm0, $11 // (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W)
addps xmm0, xmm1 // (X*X + Y*Y + Z*Z + W*W) (4x)

rsqrtps xmm0, xmm0 // (1 / Sqrt(X*X + Y*Y + Z*Z + W*W)) (4x)
mulps xmm0, xmm2 // A * (1 / Sqrt(Dot(A, A)))
movups [Result], xmm0
end;
4026
function TVector4.Reflect(const N: TVector4): TVector4; assembler;
{ Reflection of Self about normal N: Self - (2 * Dot(N, Self)) * N,
  computed with a broadcast horizontal dot product. }
asm
movups xmm0, [Self]
movups xmm1, [N]
movaps xmm2, xmm0 // Keep a copy of Self (the incident vector I)
movups xmm3, [SSE_TWO]

// Dot(N, I)
mulps xmm0, xmm1
mulps xmm3, xmm1 // N * 2
pshufd xmm1, xmm0, $4E
addps xmm0, xmm1
pshufd xmm1, xmm0, $11
addps xmm0, xmm1 // Dot(N, I) broadcast to all lanes

// (2 * Dot(N, I)) * N
mulps xmm0, xmm3

// I - ((2 * Dot(N, I)) * N)
subps xmm2, xmm0
movups [Result], xmm2
end;
4049
function TVector4.Refract(const N: TVector4; const Eta: Single): TVector4; assembler;
{ GLSL-style refraction of Self through normal N with index ratio Eta.
  Computes D := Dot(N, Self) and K := 1 - Eta^2 * (1 - D^2); when K < 0
  (total internal reflection) returns the zero vector, otherwise
  Eta * Self - (Eta * D + Sqrt(K)) * N. }
asm
movups xmm0, [Self]
movups xmm1, [N]
movups xmm7, xmm0 // Keep I for the K >= 0 branch
movss xmm2, [Eta]
movss xmm3, DWORD [SSE_ONE]

// D := Dot(N, I)
mulps xmm0, xmm1
movss xmm4, xmm3 // 1
pshufd xmm1, xmm0, $4E
movss xmm5, xmm2 // Eta
addps xmm0, xmm1
mulss xmm5, xmm5 // Eta * Eta
pshufd xmm1, xmm0, $11
addss xmm0, xmm1

// K := 1 - Eta * Eta * (1 - D * D)
movss xmm6, xmm0 // D
mulss xmm0, xmm0 // D * D
subss xmm4, xmm0 // 1 - D * D
mulss xmm4, xmm5 // Eta * Eta * (1 - D * D)
xorps xmm5, xmm5 // 0
subss xmm3, xmm4 // K := 1 - Eta * Eta * (1 - D * D)

// if (K < 0) then
comiss xmm3, xmm5

jb @KLessThanZero

// K >= 0
mulss xmm6, xmm2 // Eta * D
shufps xmm2, xmm2, 0 // Replicate Eta (4x)
mulps xmm7, xmm2 // Eta * I
sqrtss xmm3, xmm3 // Sqrt(K)
addss xmm6, xmm3 // Eta * D + Sqrt(K)
shufps xmm6, xmm6, 0 // Replicate Eta * D + Sqrt(K) (4x)
movups xmm1, [N]
mulps xmm6, xmm1 // ((Eta * D + Sqrt(K)) * N)
subps xmm7, xmm6 // (Eta * I) - ((Eta * D + Sqrt(K)) * N)
movups [Result], xmm7
jmp @Finish

@KLessThanZero:
// K < 0: Result := Vector4(0, 0, 0, 0)
movups [Result], xmm5

@Finish:
end;
4100
procedure TVector4.SetNormalizedFast; assembler;
{ In-place version of NormalizeFast: Self := Self * rsqrtps(Dot(Self, Self)).
  Approximate, via the hardware reciprocal-square-root instruction. }
asm
movups xmm0, [Self] // W Z Y X
movaps xmm2, xmm0

// Dot(A, A)
mulps xmm0, xmm0 // W*W Z*Z Y*Y X*X
pshufd xmm1, xmm0, $4E // Y*Y X*X W*W Z*Z
addps xmm0, xmm1 // (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z)
pshufd xmm1, xmm0, $11 // (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W)
addps xmm0, xmm1 // (X*X + Y*Y + Z*Z + W*W) (4x)

rsqrtps xmm0, xmm0 // (1 / Sqrt(X*X + Y*Y + Z*Z + W*W)) (4x)
mulps xmm0, xmm2 // A * (1 / Sqrt(Dot(A, A)))
movups [Self], xmm0
end;
4117
class operator TVector4.Subtract(const A: TVector4; const B: Single): TVector4; assembler;
{ Subtracts scalar B from each component of A. }
asm
movss xmm1, [B]
movups xmm0, [A]
shufps xmm1, xmm1, 0 // Replicate B
subps xmm0, xmm1
movups [Result], xmm0
end;
4126
class operator TVector4.Subtract(const A: Single; const B: TVector4): TVector4; assembler;
{ Result := A - B, component-wise. }
asm
movss xmm0, [A]
movups xmm1, [B]
shufps xmm0, xmm0, 0 // Replicate A
subps xmm0, xmm1
movups [Result], xmm0
end;
4135
class operator TVector4.Subtract(const A, B: TVector4): TVector4; assembler;
{ Component-wise vector subtraction. }
asm
movups xmm0, [A]
movups xmm1, [B]
subps xmm0, xmm1
movups [Result], xmm0
end;
4143
4144{ TQuaternion }
4145
class operator TQuaternion.Add(const A, B: TQuaternion): TQuaternion;
{ Component-wise quaternion addition (X, Y, Z, W). }
asm
movups xmm0, [A]
movups xmm1, [B]
addps xmm0, xmm1
movups [Result], xmm0
end;
4153
function TQuaternion.GetLength: Single;
{ Quaternion norm Sqrt(X*X + Y*Y + Z*Z + W*W), with a two-step
  horizontal sum. }
asm
movups xmm0, [Self] // W Z Y X
mulps xmm0, xmm0 // W*W Z*Z Y*Y X*X
pshufd xmm1, xmm0, $0E // Y*Y X*X W*W Z*Z
addps xmm0, xmm1 // # # (Y*Y+W*W) (X*X+Z*Z)
pshufd xmm1, xmm0, $01 // (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y+W*W)
addss xmm0, xmm1 // (X*X + Y*Y + Z*Z + W*W)
sqrtss xmm0, xmm0 // Sqrt(X*X + Y*Y + Z*Z + W*W)
movss [Result], xmm0
end;
4165
function TQuaternion.GetLengthSquared: Single;
{ Squared quaternion norm: X*X + Y*Y + Z*Z + W*W (no Sqrt). }
asm
movups xmm0, [Self] // W Z Y X
mulps xmm0, xmm0 // W*W Z*Z Y*Y X*X
pshufd xmm1, xmm0, $0E // Y*Y X*X W*W Z*Z
addps xmm0, xmm1 // # # (Y*Y+W*W) (X*X+Z*Z)
pshufd xmm1, xmm0, $01 // (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y+W*W)
addss xmm0, xmm1 // (X*X + Y*Y + Z*Z + W*W)
movss [Result], xmm0
end;
4176
class operator TQuaternion.Multiply(const A: TQuaternion; const B: Single): TQuaternion;
{ Scales each component of A by B. }
asm
movss xmm1, [B]
movups xmm0, [A]
shufps xmm1, xmm1, 0 // Replicate B
mulps xmm0, xmm1
movups [Result], xmm0
end;
4185
class operator TQuaternion.Multiply(const A: Single; const B: TQuaternion): TQuaternion;
{ Scales each component of B by A. }
asm
movss xmm0, [A]
movups xmm1, [B]
shufps xmm0, xmm0, 0 // Replicate A
mulps xmm0, xmm1
movups [Result], xmm0
end;
4194
class operator TQuaternion.Multiply(const A, B: TQuaternion): TQuaternion;
{ Hamilton product A * B (non-commutative quaternion multiplication). }
begin
  Result.W := (B.W * A.W) - (B.X * A.X) - (B.Y * A.Y) - (B.Z * A.Z);
  Result.Z := (B.Z * A.W) + (B.W * A.Z) + (B.Y * A.X) - (B.X * A.Y);
  Result.Y := (B.Y * A.W) + (B.W * A.Y) + (B.X * A.Z) - (B.Z * A.X);
  Result.X := (B.X * A.W) + (B.W * A.X) + (B.Z * A.Y) - (B.Y * A.Z);
end;
4202
function TQuaternion.NormalizeFast: TQuaternion;
{ Approximate unit quaternion: Self * rsqrtps(Dot(Self, Self)). Uses the
  hardware reciprocal-square-root approximation, trading precision for
  speed versus an exact normalize. }
asm
movups xmm0, [Self] // W Z Y X
movaps xmm2, xmm0

// Dot(A, A)
mulps xmm0, xmm0 // W*W Z*Z Y*Y X*X
pshufd xmm1, xmm0, $4E // Y*Y X*X W*W Z*Z
addps xmm0, xmm1 // (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z)
pshufd xmm1, xmm0, $11 // (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W)
addps xmm0, xmm1 // (X*X + Y*Y + Z*Z + W*W) (4x)

rsqrtps xmm0, xmm0 // (1 / Sqrt(X*X + Y*Y + Z*Z + W*W)) (4x)
mulps xmm0, xmm2 // A * (1 / Sqrt(Dot(A, A)))
movups [Result], xmm0
end;
4219
procedure TQuaternion.SetNormalizedFast;
{ In-place version of NormalizeFast: Self := Self * rsqrtps(Dot(Self, Self)).
  Approximate, via the hardware reciprocal-square-root instruction. }
asm
movups xmm0, [Self] // W Z Y X
movaps xmm2, xmm0

// Dot(A, A)
mulps xmm0, xmm0 // W*W Z*Z Y*Y X*X
pshufd xmm1, xmm0, $4E // Y*Y X*X W*W Z*Z
addps xmm0, xmm1 // (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z)
pshufd xmm1, xmm0, $11 // (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W)
addps xmm0, xmm1 // (X*X + Y*Y + Z*Z + W*W) (4x)

rsqrtps xmm0, xmm0 // (1 / Sqrt(X*X + Y*Y + Z*Z + W*W)) (4x)
mulps xmm0, xmm2 // A * (1 / Sqrt(Dot(A, A)))
movups [Self], xmm0
end;
4236
4237{ TMatrix2 }
4238
class operator TMatrix2.Add(const A: TMatrix2; const B: Single): TMatrix2; assembler;
{ Adds scalar B to every element of the 2x2 matrix (all 4 elements fit in
  one XMM register). }
asm
movss xmm0, [B] // Load single floating-point value
movups xmm1, [A] // Load matrix
shufps xmm0, xmm0, 0 // Replicate B
addps xmm1, xmm0 // Add B
movups [Result], xmm1
end;
4247
class operator TMatrix2.Add(const A: Single; const B: TMatrix2): TMatrix2; assembler;
{ Adds scalar A to every element of the 2x2 matrix. }
asm
movss xmm0, [A] // Load single floating-point value
movups xmm1, [B] // Load matrix
shufps xmm0, xmm0, 0 // Replicate A
addps xmm1, xmm0 // Add A
movups [Result], xmm1
end;
4256
class operator TMatrix2.Add(const A, B: TMatrix2): TMatrix2; assembler;
{ Element-wise matrix addition. }
asm
movups xmm0, [A] // Load A
movups xmm1, [B] // Load B
addps xmm0, xmm1 // Add
movups [Result], xmm0
end;
4264
function TMatrix2.CompMult(const AOther: TMatrix2): TMatrix2; assembler;
{ Element-wise (component) product of Self and AOther — not a matrix
  multiply. }
asm
movups xmm0, [Self]
movups xmm1, [AOther]

// Component-wise multiplication
mulps xmm0, xmm1

// Store result
movups [Result], xmm0
end;
4276
class operator TMatrix2.Divide(const A: TMatrix2; const B: Single): TMatrix2; assembler;
{ Divides every element of A by scalar B. }
asm
movss xmm0, [B] // Load single floating-point value
movups xmm1, [A] // Load matrix
shufps xmm0, xmm0, 0 // Replicate B
divps xmm1, xmm0 // Divide B
movups [Result], xmm1
end;
4285
class operator TMatrix2.Divide(const A: Single; const B: TMatrix2): TMatrix2; assembler;
{ Result element [i,j] := A / B[i,j]. }
asm
movss xmm0, [A] // Load single floating-point value
movups xmm1, [B] // Load matrix
shufps xmm0, xmm0, 0 // Replicate A
divps xmm0, xmm1 // Divide B
movups [Result], xmm0
end;
4294
class operator TMatrix2.Multiply(const A: TMatrix2; const B: Single): TMatrix2; assembler;
{ Scales every element of A by B. }
asm
movss xmm0, [B] // Load single floating-point value
movups xmm1, [A] // Load matrix
shufps xmm0, xmm0, 0 // Replicate B
mulps xmm1, xmm0 // Multiply
movups [Result], xmm1
end;
4303
class operator TMatrix2.Multiply(const A: Single; const B: TMatrix2): TMatrix2; assembler;
{ Scales every element of B by A. }
asm
movss xmm0, [A] // Load single floating-point value
movups xmm1, [B] // Load matrix
shufps xmm0, xmm0, 0 // Replicate A
mulps xmm1, xmm0 // Multiply
movups [Result], xmm1
end;
4312
class operator TMatrix2.Multiply(const A: TVector2; const B: TMatrix2): TVector2;
{ Product of 2D vector A and 2x2 matrix B (vector on the left). }
begin
  Result.X := (B.M[0,0] * A.X) + (B.M[0,1] * A.Y);
  Result.Y := (B.M[1,0] * A.X) + (B.M[1,1] * A.Y);
end;
4318
class operator TMatrix2.Multiply(const A: TMatrix2; const B: TVector2): TVector2;
{ Product of 2x2 matrix A and 2D vector B (vector on the right). }
begin
  Result.X := (B.X * A.M[0,0]) + (B.Y * A.M[1,0]);
  Result.Y := (B.X * A.M[0,1]) + (B.Y * A.M[1,1]);
end;
4324
class operator TMatrix2.Multiply(const A, B: TMatrix2): TMatrix2;
{ 2x2 matrix product A * B. }
begin
  Result.M[1,1] := (B.M[1,0] * A.M[0,1]) + (B.M[1,1] * A.M[1,1]);
  Result.M[1,0] := (B.M[1,0] * A.M[0,0]) + (B.M[1,1] * A.M[1,0]);
  Result.M[0,1] := (B.M[0,0] * A.M[0,1]) + (B.M[0,1] * A.M[1,1]);
  Result.M[0,0] := (B.M[0,0] * A.M[0,0]) + (B.M[0,1] * A.M[1,0]);
end;
4332
class operator TMatrix2.Negative(const A: TMatrix2): TMatrix2; assembler;
{ Element-wise negation by XORing the IEEE sign bits. }
asm
movups xmm0, [SSE_MASK_SIGN] // Load mask with 4 sign (upper) bits
movups xmm1, [A] // Load matrix
xorps xmm1, xmm0 // Flip sign bits
movups [Result], xmm1
end;
4340
procedure TMatrix2.SetTransposed;
{ Transposes this matrix in place. }
var
  Flipped: TMatrix2;
begin
  Flipped := Transpose;
  Self := Flipped;
end;
4345
{ Matrix - scalar: subtracts B from each component of A. }
class operator TMatrix2.Subtract(const A: TMatrix2; const B: Single): TMatrix2; assembler;
asm
  movss   xmm0, [B]              // Load single floating-point value
  movups  xmm1, [A]              // Load matrix
  shufps  xmm0, xmm0, 0          // Replicate B into all 4 lanes
  subps   xmm1, xmm0             // Subtract B from each component
  movups  [Result], xmm1
end;
4354
{ Scalar - matrix: subtracts each component of B from A. }
class operator TMatrix2.Subtract(const A: Single; const B: TMatrix2): TMatrix2; assembler;
asm
  movss   xmm0, [A]              // Load single floating-point value
  movups  xmm1, [B]              // Load matrix
  shufps  xmm0, xmm0, 0          // Replicate A into all 4 lanes
  subps   xmm0, xmm1             // A - B, component-wise
  movups  [Result], xmm0
end;
4363
{ Matrix - matrix: component-wise subtraction. }
class operator TMatrix2.Subtract(const A, B: TMatrix2): TMatrix2; assembler;
asm
  movups  xmm0, [A]              // Load A
  movups  xmm1, [B]              // Load B
  subps   xmm0, xmm1             // Subtract component-wise
  movups  [Result], xmm0
end;
4371
{ Returns the transposed matrix (rows and columns exchanged);
  Self is left unmodified. }
function TMatrix2.Transpose: TMatrix2;
var
  Row, Col: Integer;
begin
  for Row := 0 to 1 do
    for Col := 0 to 1 do
      Result.M[Row,Col] := M[Col,Row];
end;
4380
4381{ TMatrix 3 }
4382
{ Matrix + scalar: adds B to each of the 9 components of A.
  A TMatrix3 is processed as two 4-float vectors plus one trailing scalar
  (offsets $00, $10 and $20). }
class operator TMatrix3.Add(const A: TMatrix3; const B: Single): TMatrix3; assembler;
asm
  movss   xmm0, [B]                    // Load single floating-point value
  movups  xmm1, DQWORD [A + $00]       // Load first 4 components
  shufps  xmm0, xmm0, 0                // Replicate B
  movups  xmm2, DQWORD [A + $10]       // Next 4 components
  movss   xmm3, DWORD [A + $20]        // Trailing 9th component
  addps   xmm1, xmm0                   // Add B to each component
  addps   xmm2, xmm0
  addss   xmm3, xmm0                   // Scalar add for the tail element
  movups  DQWORD [Result + $00], xmm1
  movups  DQWORD [Result + $10], xmm2
  movss   DWORD [Result + $20], xmm3
end;
4397
{ Scalar + matrix: adds A to each of the 9 components of B. }
class operator TMatrix3.Add(const A: Single; const B: TMatrix3): TMatrix3; assembler;
asm
  movss   xmm0, [A]                    // Load single floating-point value
  movups  xmm1, DQWORD [B + $00]       // Load first 4 components
  shufps  xmm0, xmm0, 0                // Replicate A
  movups  xmm2, DQWORD [B + $10]       // Next 4 components
  movss   xmm3, DWORD [B + $20]        // Trailing 9th component
  addps   xmm1, xmm0                   // Add A to each component
  addps   xmm2, xmm0
  addss   xmm3, xmm0                   // Scalar add for the tail element
  movups  DQWORD [Result + $00], xmm1
  movups  DQWORD [Result + $10], xmm2
  movss   DWORD [Result + $20], xmm3
end;
4412
{ Matrix + matrix: component-wise addition of all 9 components. }
class operator TMatrix3.Add(const A, B: TMatrix3): TMatrix3; assembler;
asm
  movups  xmm0, DQWORD [A + $00]       // Load A (4 + 4 + 1 components)
  movups  xmm1, DQWORD [A + $10]
  movss   xmm2, DWORD [A + $20]
  movups  xmm4, DQWORD [B + $00]       // Load B (4 + 4 + 1 components)
  movups  xmm5, DQWORD [B + $10]
  movss   xmm6, DWORD [B + $20]
  addps   xmm0, xmm4                   // Add component-wise
  addps   xmm1, xmm5
  addss   xmm2, xmm6                   // Scalar add for the tail element
  movups  DQWORD [Result + $00], xmm0
  movups  DQWORD [Result + $10], xmm1
  movss   DWORD [Result + $20], xmm2
end;
4428
{ Component-wise (Hadamard) multiplication of Self with AOther.
  This is NOT a matrix product: each Result element is Self[i,j] * AOther[i,j]. }
function TMatrix3.CompMult(const AOther: TMatrix3): TMatrix3; assembler;
asm
  movups  xmm0, DQWORD[Self + $00]     // Self, first 4 components
  movups  xmm1, DQWORD[Self + $10]     // Self, next 4 components
  movss   xmm2, DWORD[Self + $20]      // Self, trailing 9th component
  movups  xmm4, DQWORD[AOther + $00]   // AOther, first 4 components
  movups  xmm5, DQWORD[AOther + $10]   // AOther, next 4 components
  movss   xmm6, DWORD[AOther + $20]    // AOther, trailing 9th component

  // Component-wise multiplication
  mulps   xmm0, xmm4
  mulps   xmm1, xmm5
  mulss   xmm2, xmm6                   // Scalar multiply for the tail element

  // Store result
  movups  DQWORD [Result + $00], xmm0
  movups  DQWORD [Result + $10], xmm1
  movss   DWORD [Result + $20], xmm2
end;
4448
{ Scalar / matrix: divides A by each of the 9 components of B.
  The replicated A is copied into three registers because divps/divss
  overwrite the dividend operand. }
class operator TMatrix3.Divide(const A: Single; const B: TMatrix3): TMatrix3; assembler;
asm
  movss   xmm0, [A]                    // Load single floating-point value
  movups  xmm4, DQWORD [B + $00]       // Load first 4 components
  shufps  xmm0, xmm0, 0                // Replicate A
  movups  xmm5, DQWORD [B + $10]
  movaps  xmm1, xmm0                   // Keep copies of A; div destroys dividend
  movaps  xmm2, xmm0
  movss   xmm6, DWORD [B + $20]        // Trailing 9th component
  divps   xmm0, xmm4                   // A / B, component-wise
  divps   xmm1, xmm5
  divss   xmm2, xmm6                   // Scalar divide for the tail element
  movups  DQWORD [Result + $00], xmm0
  movups  DQWORD [Result + $10], xmm1
  movss   DWORD [Result + $20], xmm2
end;
4465
{ Matrix / scalar: divides each of the 9 components of A by B.
  Fix: the trailing 9th component is loaded with movss (upper lanes zeroed),
  so it is divided with divss — matching the addss/mulss/subss scalar-tail
  pattern of the other TMatrix3 operators — instead of divps, which also
  performed three pointless 0/B divisions on the unused upper lanes.
  The stored result is unchanged. }
class operator TMatrix3.Divide(const A: TMatrix3; const B: Single): TMatrix3; assembler;
asm
  movss   xmm0, [B]                    // Load single floating-point value
  movups  xmm1, DQWORD [A + $00]       // Load first 4 components
  shufps  xmm0, xmm0, 0                // Replicate B
  movups  xmm2, DQWORD [A + $10]
  movss   xmm3, DWORD [A + $20]        // Trailing 9th component
  divps   xmm1, xmm0                   // Divide each component by B
  divps   xmm2, xmm0
  divss   xmm3, xmm0                   // Scalar divide for the tail element (was divps)
  movups  DQWORD [Result + $00], xmm1
  movups  DQWORD [Result + $10], xmm2
  movss   DWORD [Result + $20], xmm3
end;
4480
{ Scalar * matrix: multiplies each of the 9 components of B by A. }
class operator TMatrix3.Multiply(const A: Single; const B: TMatrix3): TMatrix3; assembler;
asm
  movss   xmm0, [A]                    // Load single floating-point value
  movups  xmm1, DQWORD [B + $00]       // Load first 4 components
  shufps  xmm0, xmm0, 0                // Replicate A
  movups  xmm2, DQWORD [B + $10]
  movss   xmm3, DWORD [B + $20]        // Trailing 9th component
  mulps   xmm1, xmm0                   // Multiply each component by A
  mulps   xmm2, xmm0
  mulss   xmm3, xmm0                   // Scalar multiply for the tail element
  movups  DQWORD [Result + $00], xmm1
  movups  DQWORD [Result + $10], xmm2
  movss   DWORD [Result + $20], xmm3
end;
4495
{ Matrix * scalar: multiplies each of the 9 components of A by B. }
class operator TMatrix3.Multiply(const A: TMatrix3; const B: Single): TMatrix3; assembler;
asm
  movss   xmm0, [B]                    // Load single floating-point value
  movups  xmm1, DQWORD [A + $00]       // Load first 4 components
  shufps  xmm0, xmm0, 0                // Replicate B
  movups  xmm2, DQWORD [A + $10]
  movss   xmm3, DWORD [A + $20]        // Trailing 9th component
  mulps   xmm1, xmm0                   // Multiply each component by B
  mulps   xmm2, xmm0
  mulss   xmm3, xmm0                   // Scalar multiply for the tail element
  movups  DQWORD [Result + $00], xmm1
  movups  DQWORD [Result + $10], xmm2
  movss   DWORD [Result + $20], xmm3
end;
4510
4511{$IFDEF FM_COLUMN_MAJOR}
{ Column-major variant: matrix * vector. The result is the linear combination
  of A's three 3-float columns (at byte offsets $00, $0C, $18 — the matrix is
  densely packed 3x3) weighted by B.X, B.Y and B.Z. Each 3-float column is
  assembled into an XMM register as movq (2 floats) + movss (1 float) + movlhps. }
class operator TMatrix3.Multiply(const A: TMatrix3; const B: TVector3): TVector3; assembler;
asm
  movq    xmm0, [B]                    // Load vector: ## ## B.Y B.X
  movss   xmm1, [B+8]                  // B.Z
  movlhps xmm0, xmm1                   // ## B.Z B.Y B.X

  movq    xmm4, QWORD [A + $00]        // First 3-float column of A
  movss   xmm1, DWORD [A + $08]
  movlhps xmm4, xmm1

  movaps  xmm1, xmm0
  movaps  xmm2, xmm0
  shufps  xmm0, xmm0, $00              // B.X replicated
  shufps  xmm1, xmm1, $55              // B.Y replicated
  shufps  xmm2, xmm2, $AA              // B.Z replicated

  movq    xmm5, QWORD [A + $0C]        // Second 3-float column of A
  movss   xmm3, DWORD [A + $14]
  movlhps xmm5, xmm3

  movq    xmm6, QWORD [A + $18]        // Third 3-float column of A
  movss   xmm3, DWORD [A + $20]
  movlhps xmm6, xmm3

  mulps   xmm0, xmm4                   // Column0 * B.X
  mulps   xmm1, xmm5                   // Column1 * B.Y
  mulps   xmm2, xmm6                   // Column2 * B.Z
  addps   xmm0, xmm1                   // Sum the weighted columns
  addps   xmm0, xmm2
  movhlps xmm1, xmm0                   // Bring Z component to the low lane
  movq    [Result], xmm0               // Store X, Y
  movss   [Result+8], xmm1             // Store Z
end;
4545
{ Column-major variant: vector * matrix. Computes one dot product of A against
  each 3-float column of B by multiplying A with all three columns and then
  transposing the three products so each lane sum becomes a dot product. }
class operator TMatrix3.Multiply(const A: TVector3; const B: TMatrix3): TVector3; assembler;
asm
  movq    xmm0, [A]                    // Load vector: ## ## A.Y A.X
  movss   xmm1, [A+8]                  // A.Z
  movlhps xmm0, xmm1                   // ## A.Z A.Y A.X

  movq    xmm4, QWORD [B + $00]        // First 3-float column of B
  movss   xmm1, DWORD [B + $08]
  movlhps xmm4, xmm1

  movaps  xmm1, xmm0                   // Two more copies of the vector
  movaps  xmm2, xmm0

  movq    xmm5, QWORD [B + $0C]        // Second 3-float column of B
  movss   xmm6, DWORD [B + $14]
  movlhps xmm5, xmm6

  movq    xmm6, QWORD [B + $18]        // Third 3-float column of B
  movss   xmm3, DWORD [B + $20]
  movlhps xmm6, xmm3

  mulps   xmm0, xmm4                   // A * column0 (per-lane products)
  mulps   xmm1, xmm5                   // A * column1
  mulps   xmm2, xmm6                   // A * column2
  xorps   xmm3, xmm3                   // Zero register used as 4th transpose row

  { Transpose xmm0-xmm2 so each register holds one product term per column }
  movaps  xmm4, xmm2
  unpcklps xmm2, xmm3
  unpckhps xmm4, xmm3

  movaps  xmm3, xmm0
  unpcklps xmm0, xmm1
  unpckhps xmm3, xmm1

  movaps  xmm1, xmm0
  unpcklpd xmm0, xmm2
  unpckhpd xmm1, xmm2

  unpcklpd xmm3, xmm4

  addps   xmm0, xmm1                   // Sum transposed terms -> dot products
  addps   xmm0, xmm3
  movhlps xmm1, xmm0                   // Bring Z component to the low lane
  movq    [Result], xmm0               // Store X, Y
  movss   [Result+8], xmm1             // Store Z
end;
4593
{ Column-major variant: matrix * matrix product, computed as three
  vector-times-matrix operations — one per 3-float column of B (byte offsets
  $00, $0C, $18). A's three columns stay resident in xmm4-xmm6 throughout. }
class operator TMatrix3.Multiply(const A, B: TMatrix3): TMatrix3; assembler;
{ Code below consists of 3 Vector*Matrix calculations }
asm
  // Column 0 of B
  movq    xmm0, QWORD [B + $00]
  movss   xmm1, DWORD [B + $08]
  movlhps xmm0, xmm1

  movq    xmm4, QWORD [A + $00]        // Load A's three columns (kept resident)
  movss   xmm1, DWORD [A + $08]
  movlhps xmm4, xmm1

  movaps  xmm1, xmm0
  movaps  xmm2, xmm0
  shufps  xmm0, xmm0, $00              // Broadcast the three components of B's column
  shufps  xmm1, xmm1, $55
  shufps  xmm2, xmm2, $AA

  movq    xmm5, QWORD [A + $0C]
  movss   xmm3, DWORD [A + $14]
  movlhps xmm5, xmm3

  movq    xmm6, QWORD [A + $18]
  movss   xmm3, DWORD [A + $20]
  movlhps xmm6, xmm3

  mulps   xmm0, xmm4                   // Weighted sum of A's columns
  mulps   xmm1, xmm5
  mulps   xmm2, xmm6
  addps   xmm0, xmm1
  addps   xmm0, xmm2
  movhlps xmm1, xmm0
  movq    QWORD [Result + $00], xmm0   // Store result column 0
  movss   DWORD [Result + $08], xmm1

  // Column 1 of B
  movq    xmm0, QWORD [B + $0C]
  movss   xmm1, DWORD [B + $14]
  movlhps xmm0, xmm1

  movaps  xmm1, xmm0
  movaps  xmm2, xmm0
  shufps  xmm0, xmm0, $00
  shufps  xmm1, xmm1, $55
  shufps  xmm2, xmm2, $AA
  mulps   xmm0, xmm4
  mulps   xmm1, xmm5
  mulps   xmm2, xmm6
  addps   xmm0, xmm1
  addps   xmm0, xmm2
  movhlps xmm1, xmm0
  movq    QWORD [Result + $0C], xmm0   // Store result column 1
  movss   DWORD [Result + $14], xmm1

  // Column 2 of B
  movq    xmm0, QWORD [B + $18]
  movss   xmm1, DWORD [B + $20]
  movlhps xmm0, xmm1

  movaps  xmm1, xmm0
  movaps  xmm2, xmm0
  shufps  xmm0, xmm0, $00
  shufps  xmm1, xmm1, $55
  shufps  xmm2, xmm2, $AA
  mulps   xmm0, xmm4
  mulps   xmm1, xmm5
  mulps   xmm2, xmm6
  addps   xmm0, xmm1
  addps   xmm0, xmm2
  movhlps xmm1, xmm0
  movq    QWORD [Result + $18], xmm0   // Store result column 2
  movss   DWORD [Result + $20], xmm1
end;
4664{$ELSE}
{ Row-major variant: matrix * vector. Multiplies B against each 3-float row of
  A (byte offsets $00, $0C, $18), then transposes the three per-lane products
  so that summing lanes yields the three dot products. }
class operator TMatrix3.Multiply(const A: TMatrix3; const B: TVector3): TVector3; assembler;
asm
  movq    xmm0, [B]                    // Load vector
  movss   xmm1, [B+8]
  movlhps xmm0, xmm1

  movq    xmm4, QWORD [A + $00]        // Load 3 rows
  movss   xmm1, DWORD [A + $08]
  movlhps xmm4, xmm1

  movaps  xmm1, xmm0
  movaps  xmm2, xmm0

  movq    xmm5, QWORD [A + $0C]
  movss   xmm6, DWORD [A + $14]
  movlhps xmm5, xmm6

  movq    xmm6, QWORD [A + $18]
  movss   xmm3, DWORD [A + $20]
  movlhps xmm6, xmm3

  mulps   xmm0, xmm4                   // ###, (Az * B02), (Ay * B01), (Ax * B00)
  mulps   xmm1, xmm5                   // ###, (Az * B12), (Ay * B11), (Ax * B10)
  mulps   xmm2, xmm6                   // ###, (Az * B22), (Ay * B21), (Ax * B20)
  xorps   xmm3, xmm3                   // 000

  { Transpose xmm0-xmm2 }
  movaps  xmm4, xmm2
  unpcklps xmm2, xmm3                  // 000 B21 000 B20
  unpckhps xmm4, xmm3                  // 000 ### 000 B22

  movaps  xmm3, xmm0
  unpcklps xmm0, xmm1                  // B11 B01 B10 B00
  unpckhps xmm3, xmm1                  // ### ### B12 B02

  movaps  xmm1, xmm0
  unpcklpd xmm0, xmm2                  // 000 B20 B10 B00
  unpckhpd xmm1, xmm2                  // 000 B21 B11 B01

  unpcklpd xmm3, xmm4                  // 000 B22 B12 B02

  addps   xmm0, xmm1                   // Add rows -> three dot products
  addps   xmm0, xmm3
  movhlps xmm1, xmm0
  movq    [Result], xmm0               // Store X, Y
  movss   [Result+8], xmm1             // Store Z
end;
4712
{ Row-major variant: vector * matrix. Broadcasts A.X/A.Y/A.Z and forms the
  linear combination of B's three rows. }
class operator TMatrix3.Multiply(const A: TVector3; const B: TMatrix3): TVector3; assembler;
asm
  movq    xmm0, [A]                    // Load vector
  movss   xmm1, [A+8]
  movlhps xmm0, xmm1

  movq    xmm4, QWORD [B + $00]        // Load 3 rows
  movss   xmm1, DWORD [B + $08]
  movlhps xmm4, xmm1

  movaps  xmm1, xmm0
  movaps  xmm2, xmm0
  shufps  xmm0, xmm0, $00              // Bx Bx Bx Bx
  shufps  xmm1, xmm1, $55              // By By By By
  shufps  xmm2, xmm2, $AA              // Bz Bz Bz Bz

  movq    xmm5, QWORD [B + $0C]
  movss   xmm3, DWORD [B + $14]
  movlhps xmm5, xmm3

  movq    xmm6, QWORD [B + $18]
  movss   xmm3, DWORD [B + $20]
  movlhps xmm6, xmm3

  mulps   xmm0, xmm4                   // (A00 * Bx), (A01 * Bx), (A02 * Bx), #
  mulps   xmm1, xmm5                   // (A10 * By), (A11 * By), (A12 * By), #
  mulps   xmm2, xmm6                   // (A20 * Bz), (A21 * Bz), (A22 * Bz), #
  addps   xmm0, xmm1                   // Add rows
  addps   xmm0, xmm2
  movhlps xmm1, xmm0
  movq    [Result], xmm0               // Store X, Y
  movss   [Result+8], xmm1             // Store Z
end;
4746
{ Row-major variant: matrix * matrix product, computed as three
  vector-times-matrix operations — one per 3-float row of A.
  B's rows stay resident in xmm4-xmm6 throughout. }
class operator TMatrix3.Multiply(const A, B: TMatrix3): TMatrix3; assembler;
{ Code below consists of 3 Vector*Matrix calculations }
asm
  { A.R[0] * B }
  movq    xmm0, QWORD [A + $00]
  movss   xmm1, DWORD [A + $08]
  movlhps xmm0, xmm1

  movq    xmm4, QWORD [B + $00]        // Load B's rows (kept resident)
  movss   xmm1, DWORD [B + $08]
  movlhps xmm4, xmm1

  movaps  xmm1, xmm0
  movaps  xmm2, xmm0
  shufps  xmm0, xmm0, $00              // Broadcast row components of A
  shufps  xmm1, xmm1, $55
  shufps  xmm2, xmm2, $AA

  movq    xmm5, QWORD [B + $0C]
  movss   xmm3, DWORD [B + $14]
  movlhps xmm5, xmm3

  movq    xmm6, QWORD [B + $18]
  movss   xmm3, DWORD [B + $20]
  movlhps xmm6, xmm3

  mulps   xmm0, xmm4                   // Linear combination of B's rows
  mulps   xmm1, xmm5
  mulps   xmm2, xmm6
  addps   xmm0, xmm1
  addps   xmm0, xmm2
  movhlps xmm1, xmm0
  movq    QWORD [Result + $00], xmm0   // Store result row 0
  movss   DWORD [Result + $08], xmm1

  { A.R[1] * B }
  movq    xmm0, QWORD [A + $0C]
  movss   xmm1, DWORD [A + $14]
  movlhps xmm0, xmm1

  movaps  xmm1, xmm0
  movaps  xmm2, xmm0
  shufps  xmm0, xmm0, $00
  shufps  xmm1, xmm1, $55
  shufps  xmm2, xmm2, $AA
  mulps   xmm0, xmm4
  mulps   xmm1, xmm5
  mulps   xmm2, xmm6
  addps   xmm0, xmm1
  addps   xmm0, xmm2
  movhlps xmm1, xmm0
  movq    QWORD [Result + $0C], xmm0   // Store result row 1
  movss   DWORD [Result + $14], xmm1

  { A.R[2] * B }
  movq    xmm0, QWORD [A + $18]
  movss   xmm1, DWORD [A + $20]
  movlhps xmm0, xmm1

  movaps  xmm1, xmm0
  movaps  xmm2, xmm0
  shufps  xmm0, xmm0, $00
  shufps  xmm1, xmm1, $55
  shufps  xmm2, xmm2, $AA
  mulps   xmm0, xmm4
  mulps   xmm1, xmm5
  mulps   xmm2, xmm6
  addps   xmm0, xmm1
  addps   xmm0, xmm2
  movhlps xmm1, xmm0
  movq    QWORD [Result + $18], xmm0   // Store result row 2
  movss   DWORD [Result + $20], xmm1
end;
4820{$ENDIF}
4821
{ Unary minus: negates all 9 components by XOR'ing their sign bits.
  Fix: the trailing element used `pxor` (integer-domain XOR) while the other
  rows use `xorps` (floating-point domain). The result bits are identical,
  but mixing domains on float data is inconsistent here and can incur a
  bypass-delay penalty on several CPUs; xorps is the uniform choice. }
class operator TMatrix3.Negative(const A: TMatrix3): TMatrix3; assembler;
asm
  movups  xmm0, [SSE_MASK_SIGN]        // Load mask with 4 sign (upper) bits
  movups  xmm1, DQWORD [A + $00]       // Load first 4 components
  movups  xmm2, DQWORD [A + $10]       // Next 4 components
  movss   xmm3, DWORD [A + $20]        // Trailing 9th component
  xorps   xmm1, xmm0                   // Flip sign bits of each component
  xorps   xmm2, xmm0
  xorps   xmm3, xmm0                   // (was pxor; xorps keeps the FP domain)
  movups  DQWORD [Result + $00], xmm1
  movups  DQWORD [Result + $10], xmm2
  movss   DWORD [Result + $20], xmm3
end;
4835
{ Transposes the matrix in place by swapping the three off-diagonal pairs:
  bytes $04<->$0C (M[0,1]<->M[1,0]), $08<->$18 (M[0,2]<->M[2,0]) and
  $14<->$1C (M[1,2]<->M[2,1]). The diagonal is untouched. }
procedure TMatrix3.SetTransposed; assembler;
asm
  movss   xmm0, DWORD [Self + $04]     // M[0,1]
  movss   xmm1, DWORD [Self + $08]     // M[0,2]

  movss   xmm2, DWORD [Self + $0C]     // M[1,0]
  movss   xmm3, DWORD [Self + $14]     // M[1,2]

  movss   xmm4, DWORD [Self + $18]     // M[2,0]
  movss   xmm5, DWORD [Self + $1C]     // M[2,1]

  movss   DWORD [Self + $0C], xmm0     // M[1,0] := old M[0,1]
  movss   DWORD [Self + $18], xmm1     // M[2,0] := old M[0,2]

  movss   DWORD [Self + $04], xmm2     // M[0,1] := old M[1,0]
  movss   DWORD [Self + $1C], xmm3     // M[2,1] := old M[1,2]

  movss   DWORD [Self + $08], xmm4     // M[0,2] := old M[2,0]
  movss   DWORD [Self + $14], xmm5     // M[1,2] := old M[2,1]
end;
4856
{ Matrix - scalar: subtracts B from each of the 9 components of A.
  Fix: the trailing 9th component is loaded with movss (upper lanes zeroed),
  so it is subtracted with subss — matching the addss/mulss scalar-tail
  pattern of the sibling operators — instead of subps, which also computed
  0 - B in the three unused upper lanes. The stored result is unchanged. }
class operator TMatrix3.Subtract(const A: TMatrix3; const B: Single): TMatrix3; assembler;
asm
  movss   xmm0, [B]                    // Load single floating-point value
  movups  xmm1, DQWORD [A + $00]       // Load first 4 components
  shufps  xmm0, xmm0, 0                // Replicate B
  movups  xmm2, DQWORD [A + $10]
  movss   xmm3, DWORD [A + $20]        // Trailing 9th component
  subps   xmm1, xmm0                   // Subtract B from each component
  subps   xmm2, xmm0
  subss   xmm3, xmm0                   // Scalar subtract for the tail element (was subps)
  movups  DQWORD [Result + $00], xmm1
  movups  DQWORD [Result + $10], xmm2
  movss   DWORD [Result + $20], xmm3
end;
4871
{ Scalar - matrix: subtracts each of the 9 components of B from A.
  A is copied because subps/subss overwrite the minuend operand. }
class operator TMatrix3.Subtract(const A: Single; const B: TMatrix3): TMatrix3; assembler;
asm
  movss   xmm0, [A]                    // Load single floating-point value
  movups  xmm4, DQWORD [B + $00]       // Load first 4 components
  shufps  xmm0, xmm0, 0                // Replicate A
  movups  xmm5, DQWORD [B + $10]
  movaps  xmm1, xmm0                   // Keep copies of A; sub destroys minuend
  movaps  xmm2, xmm0
  movss   xmm6, DWORD [B + $20]        // Trailing 9th component
  subps   xmm0, xmm4                   // A - B, component-wise
  subps   xmm1, xmm5
  subss   xmm2, xmm6                   // Scalar subtract for the tail element
  movups  DQWORD [Result + $00], xmm0
  movups  DQWORD [Result + $10], xmm1
  movss   DWORD [Result + $20], xmm2
end;
4888
{ Matrix - matrix: component-wise subtraction of all 9 components. }
class operator TMatrix3.Subtract(const A, B: TMatrix3): TMatrix3; assembler;
asm
  movups  xmm0, DQWORD [A + $00]       // Load A (4 + 4 + 1 components)
  movups  xmm1, DQWORD [A + $10]
  movss   xmm2, DWORD [A + $20]
  movups  xmm4, DQWORD [B + $00]       // Load B (4 + 4 + 1 components)
  movups  xmm5, DQWORD [B + $10]
  movss   xmm6, DWORD [B + $20]
  subps   xmm0, xmm4                   // Subtract component-wise
  subps   xmm1, xmm5
  subss   xmm2, xmm6                   // Scalar subtract for the tail element
  movups  DQWORD [Result + $00], xmm0
  movups  DQWORD [Result + $10], xmm1
  movss   DWORD [Result + $20], xmm2
end;
4904
{ Returns the transposed matrix; Self is left unmodified. Copies one row of
  Self at a time into the corresponding column positions of Result (element
  size 4 bytes; row stride $0C). }
function TMatrix3.Transpose: TMatrix3; assembler;
asm
  // Row 0 of Self -> column 0 of Result
  movss   xmm0, DWORD [Self + $00]
  movss   xmm1, DWORD [Self + $04]
  movss   xmm2, DWORD [Self + $08]

  movss   DWORD [Result + $00], xmm0
  movss   DWORD [Result + $0C], xmm1
  movss   DWORD [Result + $18], xmm2

  // Row 1 of Self -> column 1 of Result
  movss   xmm0, DWORD [Self + $0C]
  movss   xmm1, DWORD [Self + $10]
  movss   xmm2, DWORD [Self + $14]

  movss   DWORD [Result + $04], xmm0
  movss   DWORD [Result + $10], xmm1
  movss   DWORD [Result + $1C], xmm2

  // Row 2 of Self -> column 2 of Result
  movss   xmm0, DWORD [Self + $18]
  movss   xmm1, DWORD [Self + $1C]
  movss   xmm2, DWORD [Self + $20]

  movss   DWORD [Result + $08], xmm0
  movss   DWORD [Result + $14], xmm1
  movss   DWORD [Result + $20], xmm2
end;
4931
4932{ TMatrix 4 }
4933
{ Matrix + scalar: adds B to each of the 16 components of A.
  A TMatrix4 is processed as four 4-float rows at offsets $00..$30. }
class operator TMatrix4.Add(const A: TMatrix4; const B: Single): TMatrix4; assembler;
asm
  movss   xmm0, [B]                    // Load single floating-point value
  movups  xmm1, DQWORD [A + $00]       // Load 4 rows
  shufps  xmm0, xmm0, 0                // Replicate B
  movups  xmm2, DQWORD [A + $10]
  movups  xmm3, DQWORD [A + $20]
  movups  xmm4, DQWORD [A + $30]
  addps   xmm1, xmm0                   // Add B to each row
  addps   xmm2, xmm0
  addps   xmm3, xmm0
  addps   xmm4, xmm0
  movups  DQWORD [Result + $00], xmm1
  movups  DQWORD [Result + $10], xmm2
  movups  DQWORD [Result + $20], xmm3
  movups  DQWORD [Result + $30], xmm4
end;
4951
{ Scalar + matrix: adds A to each of the 16 components of B. }
class operator TMatrix4.Add(const A: Single; const B: TMatrix4): TMatrix4; assembler;
asm
  movss   xmm0, [A]                    // Load single floating-point value
  movups  xmm1, DQWORD [B + $00]       // Load 4 rows
  shufps  xmm0, xmm0, 0                // Replicate A
  movups  xmm2, DQWORD [B + $10]
  movups  xmm3, DQWORD [B + $20]
  movups  xmm4, DQWORD [B + $30]
  addps   xmm1, xmm0                   // Add A to each row
  addps   xmm2, xmm0
  addps   xmm3, xmm0
  addps   xmm4, xmm0
  movups  DQWORD [Result + $00], xmm1
  movups  DQWORD [Result + $10], xmm2
  movups  DQWORD [Result + $20], xmm3
  movups  DQWORD [Result + $30], xmm4
end;
4969
{ Matrix + matrix: component-wise addition of all 16 components.
  Uses all 8 XMM registers available in 32-bit mode. }
class operator TMatrix4.Add(const A, B: TMatrix4): TMatrix4; assembler;
asm
  movups  xmm0, DQWORD [A + $00]       // Load 4 rows of A
  movups  xmm1, DQWORD [A + $10]
  movups  xmm2, DQWORD [A + $20]
  movups  xmm3, DQWORD [A + $30]
  movups  xmm4, DQWORD [B + $00]       // Load 4 rows of B
  movups  xmm5, DQWORD [B + $10]
  movups  xmm6, DQWORD [B + $20]
  movups  xmm7, DQWORD [B + $30]
  addps   xmm0, xmm4                   // Add rows
  addps   xmm1, xmm5
  addps   xmm2, xmm6
  addps   xmm3, xmm7
  movups  DQWORD [Result + $00], xmm0
  movups  DQWORD [Result + $10], xmm1
  movups  DQWORD [Result + $20], xmm2
  movups  DQWORD [Result + $30], xmm3
end;
4989
{ Component-wise (Hadamard) multiplication of Self with AOther.
  This is NOT a matrix product: each Result element is Self[i,j] * AOther[i,j]. }
function TMatrix4.CompMult(const AOther: TMatrix4): TMatrix4; assembler;
asm
  movups  xmm0, DQWORD[Self + $00]     // Self row 0
  movups  xmm1, DQWORD[Self + $10]     // Self row 1
  movups  xmm2, DQWORD[Self + $20]     // Self row 2
  movups  xmm3, DQWORD[Self + $30]     // Self row 3
  movups  xmm4, DQWORD[AOther + $00]   // AOther row 0
  movups  xmm5, DQWORD[AOther + $10]   // AOther row 1
  movups  xmm6, DQWORD[AOther + $20]   // AOther row 2
  movups  xmm7, DQWORD[AOther + $30]   // AOther row 3

  // Component-wise multiplication
  mulps   xmm0, xmm4
  mulps   xmm1, xmm5
  mulps   xmm2, xmm6
  mulps   xmm3, xmm7

  // Store result
  movups  DQWORD [Result + $00], xmm0
  movups  DQWORD [Result + $10], xmm1
  movups  DQWORD [Result + $20], xmm2
  movups  DQWORD [Result + $30], xmm3
end;
5013
{ Scalar / matrix: divides A by each of the 16 components of B.
  The replicated A is copied into four registers because divps overwrites
  the dividend operand. }
class operator TMatrix4.Divide(const A: Single; const B: TMatrix4): TMatrix4; assembler;
asm
  movss   xmm0, [A]                    // Load single floating-point value
  movups  xmm4, DQWORD [B + $00]       // Load 4 rows
  shufps  xmm0, xmm0, 0                // Replicate A
  movups  xmm5, DQWORD [B + $10]
  movaps  xmm1, xmm0                   // Keep copies of A; div destroys dividend
  movaps  xmm2, xmm0
  movaps  xmm3, xmm0
  movups  xmm6, DQWORD [B + $20]
  movups  xmm7, DQWORD [B + $30]
  divps   xmm0, xmm4                   // A / B, component-wise
  divps   xmm1, xmm5
  divps   xmm2, xmm6
  divps   xmm3, xmm7
  movups  DQWORD [Result + $00], xmm0
  movups  DQWORD [Result + $10], xmm1
  movups  DQWORD [Result + $20], xmm2
  movups  DQWORD [Result + $30], xmm3
end;
5034
{ Matrix / scalar: divides each of the 16 components of A by B. }
class operator TMatrix4.Divide(const A: TMatrix4; const B: Single): TMatrix4; assembler;
asm
  movss   xmm0, [B]                    // Load single floating-point value
  movups  xmm1, DQWORD [A + $00]       // Load 4 rows
  shufps  xmm0, xmm0, 0                // Replicate B
  movups  xmm2, DQWORD [A + $10]
  movups  xmm3, DQWORD [A + $20]
  movups  xmm4, DQWORD [A + $30]
  divps   xmm1, xmm0                   // Divide each row by B
  divps   xmm2, xmm0                   // NOTE: We could speed it up by multiplying by
  divps   xmm3, xmm0                   // 1/B instead, using the "rcpps" instruction,
  divps   xmm4, xmm0                   // but that instruction is an approximation,
                                       // so we lose accuracy.
  movups  DQWORD [Result + $00], xmm1
  movups  DQWORD [Result + $10], xmm2
  movups  DQWORD [Result + $20], xmm3
  movups  DQWORD [Result + $30], xmm4
end;
5053
{ Returns the inverse of this 4x4 matrix via the classical adjugate method:
  builds the six 2x2-subdeterminant factor vectors F0..F5, combines them with
  broadcast column vectors V0..V3 into the cofactor rows I0..I3, applies the
  alternating sign masks, and finally scales by 1/determinant (the determinant
  is the dot product of the first source column with the first cofactor row).
  The interleaved Pascal comments show the scalar formula each SIMD section
  implements.
  NOTE(review): no check for a singular matrix — if the determinant is 0 the
  divps produces infinities/NaNs; callers must ensure the matrix is invertible.
  NOTE(review): the push/add/and on ebp is a stack-alignment trick for the
  local Stack record; the Padding vector appears to provide the slack the
  up-to-15-byte shift needs, and all Stack accesses use unaligned movups —
  confirm against the 32-bit calling convention before touching it. }
function TMatrix4.Inverse: TMatrix4; assembler;
type
  TStack = record
    case Byte of
      0: (WorkSpace: array [0..6] of TVector4);
      1: (F0, F1, F2, F3, F4, F5, Padding: TVector4);
  end;
var
  Stack: TStack;
asm
  // Align stack to 16-byte boundary
  push ebp
  add ebp, 16
  and ebp, not 15

  movups xmm1, DQWORD[Self + $10] // M[1]
  movups xmm2, DQWORD[Self + $20] // M[2]
  movups xmm3, DQWORD[Self + $30] // M[3]

  // C00 := (A.M[2,2] * A.M[3,3]) - (A.M[3,2] * A.M[2,3]);
  // C02 := (A.M[1,2] * A.M[3,3]) - (A.M[3,2] * A.M[1,3]);
  // C03 := (A.M[1,2] * A.M[2,3]) - (A.M[2,2] * A.M[1,3]);
  // F0 := Vector4(C00, C00, C02, C03);
  movaps xmm5, xmm2 // M[2]
  movaps xmm7, xmm2 // M[2]
  movaps xmm0, xmm3 // M[3]
  movaps xmm6, xmm3 // M[3]
  shufps xmm6, xmm2, $AA // M22 M22 M32 M32
  shufps xmm0, xmm2, $FF // M23 M23 M33 M33
  shufps xmm7, xmm1, $FF // M13 M13 M23 M23
  pshufd xmm4, xmm0, $80 // M23 M33 M33 M33
  shufps xmm5, xmm1, $AA // M12 M12 M22 M22
  pshufd xmm0, xmm6, $80 // M22 M32 M32 M32
  mulps xmm5, xmm4 // (M12 * M23) (M12 * M33) (M22 * M33) (M22 * M33)
  mulps xmm7, xmm0 // (M22 * M13) (M32 * M13) (M32 * M23) (M32 * M23)
  subps xmm5, xmm7 // C03=(M12*M23)-(M22*M13), C02=(M12*M33)-(M32*M13), C00=(M22*M33)-(M32*M23), C00=(M22*M33)-(M32*M23)
  movups [Stack.F0], xmm5

  // C04 := (A.M[2,1] * A.M[3,3]) - (A.M[3,1] * A.M[2,3]);
  // C06 := (A.M[1,1] * A.M[3,3]) - (A.M[3,1] * A.M[1,3]);
  // C07 := (A.M[1,1] * A.M[2,3]) - (A.M[2,1] * A.M[1,3]);
  // F1 := Vector4(C04, C04, C06, C07);
  movaps xmm5, xmm2 // M[2]
  movaps xmm7, xmm2 // M[2]
  movaps xmm0, xmm3 // M[3]
  movaps xmm6, xmm3 // M[3]
  shufps xmm6, xmm2, $55 // M21 M21 M31 M31
  shufps xmm0, xmm2, $FF // M23 M23 M33 M33
  shufps xmm7, xmm1, $FF // M13 M13 M23 M23
  pshufd xmm4, xmm0, $80 // M23 M33 M33 M33
  shufps xmm5, xmm1, $55 // M11 M11 M21 M21
  pshufd xmm0, xmm6, $80 // M21 M31 M31 M31
  mulps xmm5, xmm4 // (M11 * M23) (M11 * M33) (M21 * M33) (M21 * M33)
  mulps xmm7, xmm0 // (M21 * M13) (M31 * M13) (M31 * M23) (M31 * M23)
  subps xmm5, xmm7 // C07=(M11*M23)-(M21*M13), C06=(M11*M33)-(M31*M13), C04=(M21*M33)-(M31*M23), C04=(M21*M33)-(M31*M23)
  movups [Stack.F1], xmm5

  // C08 := (A.M[2,1] * A.M[3,2]) - (A.M[3,1] * A.M[2,2]);
  // C10 := (A.M[1,1] * A.M[3,2]) - (A.M[3,1] * A.M[1,2]);
  // C11 := (A.M[1,1] * A.M[2,2]) - (A.M[2,1] * A.M[1,2]);
  // F2 := Vector4(C08, C08, C10, C11);
  movaps xmm5, xmm2 // M[2]
  movaps xmm7, xmm2 // M[2]
  movaps xmm0, xmm3 // M[3]
  movaps xmm6, xmm3 // M[3]
  shufps xmm6, xmm2, $55 // M21 M21 M31 M31
  shufps xmm0, xmm2, $AA // M22 M22 M32 M32
  shufps xmm7, xmm1, $AA // M12 M12 M22 M22
  pshufd xmm4, xmm0, $80 // M22 M32 M32 M32
  shufps xmm5, xmm1, $55 // M11 M11 M21 M21
  pshufd xmm0, xmm6, $80 // M21 M31 M31 M31
  mulps xmm5, xmm4 // (M11 * M22) (M11 * M32) (M21 * M32) (M21 * M32)
  mulps xmm7, xmm0 // (M21 * M12) (M31 * M12) (M31 * M22) (M32 * M22)
  subps xmm5, xmm7 // C11=(M11*M22)-(M21*M12), C10=(M11*M32)-(M31*M12), C08=(M21*M32)-(M31*M22), C08=(M21*M32)-(M31*M22)
  movups [Stack.F2], xmm5

  // C12 := (A.M[2,0] * A.M[3,3]) - (A.M[3,0] * A.M[2,3]);
  // C14 := (A.M[1,0] * A.M[3,3]) - (A.M[3,0] * A.M[1,3]);
  // C15 := (A.M[1,0] * A.M[2,3]) - (A.M[2,0] * A.M[1,3]);
  // F3 := Vector4(C12, C12, C14, C15);
  movaps xmm5, xmm2 // M[2]
  movaps xmm7, xmm2 // M[2]
  movaps xmm0, xmm3 // M[3]
  movaps xmm6, xmm3 // M[3]
  shufps xmm6, xmm2, $00 // M20 M20 M30 M30
  shufps xmm0, xmm2, $FF // M23 M23 M33 M33
  shufps xmm7, xmm1, $FF // M13 M13 M23 M23
  pshufd xmm4, xmm0, $80 // M23 M33 M33 M33
  shufps xmm5, xmm1, $00 // M10 M10 M20 M20
  pshufd xmm0, xmm6, $80 // M20 M30 M30 M30
  mulps xmm5, xmm4 // (M10 * M23) (M10 * M33) (M20 * M33) (M20 * M33)
  mulps xmm7, xmm0 // (M20 * M13) (M30 * M13) (M30 * M23) (M30 * M23)
  subps xmm5, xmm7 // C15=(M10*M23)-(M20*M13), C14=(M10*M33)-(M30*M13), C12=(M20*M33)-(M30*M23), C12=(M20*M33)-(M30*M23)
  movups [Stack.F3], xmm5

  // C16 := (A.M[2,0] * A.M[3,2]) - (A.M[3,0] * A.M[2,2]);
  // C18 := (A.M[1,0] * A.M[3,2]) - (A.M[3,0] * A.M[1,2]);
  // C19 := (A.M[1,0] * A.M[2,2]) - (A.M[2,0] * A.M[1,2]);
  // F4 := Vector4(C16, C16, C18, C19);
  movaps xmm5, xmm2 // M[2]
  movaps xmm7, xmm2 // M[2]
  movaps xmm0, xmm3 // M[3]
  movaps xmm6, xmm3 // M[3]
  shufps xmm6, xmm2, $00 // M20 M20 M30 M30
  shufps xmm0, xmm2, $AA // M22 M22 M32 M32
  shufps xmm7, xmm1, $AA // M12 M12 M22 M22
  pshufd xmm4, xmm0, $80 // M22 M32 M32 M32
  shufps xmm5, xmm1, $00 // M10 M10 M20 M20
  pshufd xmm0, xmm6, $80 // M20 M30 M30 M30
  mulps xmm5, xmm4 // (M10 * M22) (M10 * M32) (M20 * M32) (M20 * M32)
  mulps xmm7, xmm0 // (M20 * M12) (M30 * M12) (M30 * M22) (M30 * M22)
  subps xmm5, xmm7 // C19=(M10*M22)-(M20*M12), C18=(M10*M32)-(M30*M12), C16=(M20*M32)-(M30*M22), C16=(M20*M32)-(M30*M22)
  movups [Stack.F4], xmm5

  // C20 := (A.M[2,0] * A.M[3,1]) - (A.M[3,0] * A.M[2,1]);
  // C22 := (A.M[1,0] * A.M[3,1]) - (A.M[3,0] * A.M[1,1]);
  // C23 := (A.M[1,0] * A.M[2,1]) - (A.M[2,0] * A.M[1,1]);
  // F5 := Vector4(C20, C20, C22, C23);
  movaps xmm5, xmm2 // M[2]
  movaps xmm7, xmm2 // M[2]
  movaps xmm0, xmm3 // M[3]
  movaps xmm6, xmm3 // M[3]
  shufps xmm6, xmm2, $00 // M20 M20 M30 M30
  shufps xmm0, xmm2, $55 // M21 M21 M31 M31
  shufps xmm7, xmm1, $55 // M11 M11 M21 M21
  pshufd xmm4, xmm0, $80 // M21 M31 M31 M31
  shufps xmm5, xmm1, $00 // M10 M10 M20 M20
  pshufd xmm0, xmm6, $80 // M20 M30 M30 M30
  mulps xmm5, xmm4 // (M10 * M21) (M10 * M31) (M20 * M31) (M20 * M31)
  mulps xmm7, xmm0 // (M20 * M11) (M30 * M11) (M30 * M21) (M30 * M21)
  subps xmm5, xmm7 // C23=(M10*M21)-(M20*M11), C22=(M10*M31)-(M30*M11), C20=(M20*M31)-(M30*M21), C20=(M20*M31)-(M30*M21)
  movups [Stack.F5], xmm5

  // V0 := Vector4(A.M[1,0], A.M[0,0], A.M[0,0], A.M[0,0]);
  // V1 := Vector4(A.M[1,1], A.M[0,1], A.M[0,1], A.M[0,1]);
  // V2 := Vector4(A.M[1,2], A.M[0,2], A.M[0,2], A.M[0,2]);
  // V3 := Vector4(A.M[1,3], A.M[0,3], A.M[0,3], A.M[0,3]);
  movups xmm0, DQWORD[Self + $00] // M[0]
  movaps xmm4, xmm1 // M[1]
  movaps xmm5, xmm1 // M[1]
  movaps xmm6, xmm1 // M[1]
  movaps xmm7, xmm1 // M[1]

  shufps xmm4, xmm0, $00 // M00 M00 M10 M10
  shufps xmm5, xmm0, $55 // M01 M01 M11 M11
  shufps xmm6, xmm0, $AA // M02 M02 M12 M12
  shufps xmm7, xmm0, $FF // M03 M03 M13 M13

  pshufd xmm4, xmm4, $A8 // V0=M00 M00 M00 M10
  pshufd xmm5, xmm5, $A8 // V1=M01 M01 M01 M11
  pshufd xmm6, xmm6, $A8 // V2=M02 M02 M02 M12
  pshufd xmm7, xmm7, $A8 // V3=M03 M03 M03 M13

  // I0 := (V1 * F0) - (V2 * F1) + (V3 * F2);
  // I1 := (V0 * F0) - (V2 * F3) + (V3 * F4);
  // I2 := (V0 * F1) - (V1 * F3) + (V3 * F5);
  // I3 := (V0 * F2) - (V1 * F4) + (V2 * F5);
  movaps xmm0, xmm5 // V1
  movaps xmm1, xmm6 // V2
  movaps xmm2, xmm7 // V3
  mulps xmm0, [Stack.F0] // V1 * F0
  mulps xmm1, [Stack.F1] // V2 * F1
  mulps xmm2, [Stack.F2] // V3 * F2
  subps xmm0, xmm1 // (V1 * F0) - (V2 * F1)
  movaps xmm1, xmm4 // V0
  addps xmm0, xmm2 // I0=(V1 * F0) - (V2 * F1) + (V3 * F2)

  movaps xmm2, xmm6 // V2
  movaps xmm3, xmm7 // V3
  mulps xmm1, [Stack.F0] // V0 * F0
  mulps xmm2, [Stack.F3] // V2 * F3
  mulps xmm3, [Stack.F4] // V3 * F4
  subps xmm1, xmm2 // (V0 * F0) - (V2 * F3)
  movaps xmm2, xmm4 // V0
  addps xmm1, xmm3 // I1=(V0 * F0) - (V2 * F3) + (V3 * F4)

  movaps xmm3, xmm5 // V1
  mulps xmm2, [Stack.F1] // V0 * F1
  mulps xmm3, [Stack.F3] // V1 * F3
  mulps xmm7, [Stack.F5] // V3 * F5
  subps xmm2, xmm3 // (V0 * F1) - (V1 * F3)
  mulps xmm4, [Stack.F2] // V0 * F2
  addps xmm2, xmm7 // I2=(V0 * F1) - (V1 * F3) + (V3 * F5)

  mulps xmm5, [Stack.F4] // V1 * F4
  mulps xmm6, [Stack.F5] // V2 * F5
  subps xmm4, xmm5 // (V0 * F2) - (V1 * F4)
  addps xmm4, xmm6 // I3=(V0 * F2) - (V1 * F4) + (V2 * F5)

  // SA := Vector4(+1, -1, +1, -1);
  // SB := Vector4(-1, +1, -1, +1);
  // Inv := Matrix4(I0 * SA, I1 * SB, I2 * SA, I3 * SB);

  movups xmm6, [SSE_MASK_PNPN] // SA
  movups xmm7, [SSE_MASK_NPNP] // SB
  xorps xmm0, xmm6 // Inv[0] = I0 * SA
  xorps xmm1, xmm7 // Inv[1] = I1 * SB
  xorps xmm2, xmm6 // Inv[2] = I2 * SA
  xorps xmm4, xmm7 // Inv[3] = I3 * SB

  // Row := Vector4(Inv[0,0], Inv[1,0], Inv[2,0], Inv[3,0]);
  movaps xmm3, xmm0
  movaps xmm5, xmm2
  movaps xmm6, xmm1

  unpcklps xmm3, xmm1 // Inv[1,1] Inv[0,1] Inv[1,0] Inv[0,0]
  unpcklps xmm5, xmm4 // Inv[3,1] Inv[2,1] Inv[3,0] Inv[2,0]
  movups xmm6, DQWORD[Self + $00] // A.C[0]
  movlhps xmm3, xmm5 // Inv[3,0] Inv[2,0] Inv[1,0] Inv[0,0]

  // Dot := A.C[0] * Row;
  mulps xmm3, xmm6 // Dot.W Dot.Z Dot.Y Dot.X

  // OneOverDeterminant := 1 / ((Dot.X + Dot.Y) + (Dot.Z + Dot.W));
  pshufd xmm6, xmm3, $4E // Dot.Y Dot.X Dot.W Dot.Z
  addps xmm3, xmm6 // W+Y Z+X Y+W X+Z
  pshufd xmm6, xmm3, $11 // X+Z Y+X X+Z Y+W
  movups xmm5, [SSE_ONE] // 1.0 (4x)
  addps xmm3, xmm6 // X+Y+Z+W (4x)
  divps xmm5, xmm3 // OneOverDeterminant (4x)

  // Result := Inv * OneOverDeterminant;
  mulps xmm0, xmm5
  mulps xmm1, xmm5
  mulps xmm2, xmm5
  mulps xmm4, xmm5

  movups DQWORD[Result + $00], xmm0
  movups DQWORD[Result + $10], xmm1
  movups DQWORD[Result + $20], xmm2
  movups DQWORD[Result + $30], xmm4

  pop ebp
end;
5288
{ Scalar * matrix: multiplies each of the 16 components of B by A. }
class operator TMatrix4.Multiply(const A: Single; const B: TMatrix4): TMatrix4; assembler;
asm
  movss   xmm0, [A]                    // Load single floating-point value
  movups  xmm1, DQWORD [B + $00]       // Load 4 rows
  shufps  xmm0, xmm0, 0                // Replicate A
  movups  xmm2, DQWORD [B + $10]
  movups  xmm3, DQWORD [B + $20]
  movups  xmm4, DQWORD [B + $30]
  mulps   xmm1, xmm0                   // Multiply each row by A
  mulps   xmm2, xmm0
  mulps   xmm3, xmm0
  mulps   xmm4, xmm0
  movups  DQWORD [Result + $00], xmm1
  movups  DQWORD [Result + $10], xmm2
  movups  DQWORD [Result + $20], xmm3
  movups  DQWORD [Result + $30], xmm4
end;
5306
{ Matrix * scalar: multiplies each of the 16 components of A by B. }
class operator TMatrix4.Multiply(const A: TMatrix4; const B: Single): TMatrix4; assembler;
asm
  movss   xmm0, [B]                    // Load single floating-point value
  movups  xmm1, DQWORD [A + $00]       // Load 4 rows
  shufps  xmm0, xmm0, 0                // Replicate B
  movups  xmm2, DQWORD [A + $10]
  movups  xmm3, DQWORD [A + $20]
  movups  xmm4, DQWORD [A + $30]
  mulps   xmm1, xmm0                   // Multiply each row by B
  mulps   xmm2, xmm0
  mulps   xmm3, xmm0
  mulps   xmm4, xmm0
  movups  DQWORD [Result + $00], xmm1
  movups  DQWORD [Result + $10], xmm2
  movups  DQWORD [Result + $20], xmm3
  movups  DQWORD [Result + $30], xmm4
end;
5324
5325{$IFDEF FM_COLUMN_MAJOR}
{ Column-major variant: matrix * vector. Broadcasts B.X/B.Y/B.Z/B.W and forms
  the linear combination of A's four columns. }
class operator TMatrix4.Multiply(const A: TMatrix4; const B: TVector4): TVector4; assembler;
asm
  movups  xmm0, [B]                    // Load vector
  movups  xmm4, DQWORD [A + $00]       // Column 0 of A
  movaps  xmm1, xmm0
  movaps  xmm2, xmm0
  movaps  xmm3, xmm0
  shufps  xmm0, xmm0, $00              // B.X replicated
  shufps  xmm1, xmm1, $55              // B.Y replicated
  shufps  xmm2, xmm2, $AA              // B.Z replicated
  shufps  xmm3, xmm3, $FF              // B.W replicated
  movups  xmm5, DQWORD [A + $10]       // Columns 1..3 of A
  movups  xmm6, DQWORD [A + $20]
  movups  xmm7, DQWORD [A + $30]
  mulps   xmm0, xmm4                   // Weighted columns
  mulps   xmm1, xmm5
  mulps   xmm2, xmm6
  mulps   xmm3, xmm7
  addps   xmm0, xmm1                   // Sum the four weighted columns
  addps   xmm2, xmm3
  addps   xmm0, xmm2
  movups  [Result], xmm0
end;
5349
{ Vector * matrix in the FM_COLUMN_MAJOR configuration.
  Forms the element-wise products of A with each 16-byte storage row of B,
  then transposes the four product registers (unpcklps/unpckhps followed by
  unpcklpd/unpckhpd) so each lane of the sums becomes one dot product. }
class operator TMatrix4.Multiply(const A: TVector4; const B: TMatrix4): TVector4; assembler;
asm
movups xmm0, [A] // Load vector A
movups xmm4, DQWORD [B + $00] // Load the 4 storage rows of B
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
movups xmm5, DQWORD [B + $10]
movups xmm6, DQWORD [B + $20]
movups xmm7, DQWORD [B + $30]
mulps xmm0, xmm4 // Element-wise products of A with each storage row
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7

{ Transpose xmm0-xmm3 so corresponding product terms line up per lane }
movaps xmm4, xmm2
unpcklps xmm2, xmm3 // Interleave low halves of products 2 and 3
unpckhps xmm4, xmm3 // Interleave high halves of products 2 and 3

movaps xmm3, xmm0
unpcklps xmm0, xmm1 // Interleave low halves of products 0 and 1
unpckhps xmm3, xmm1 // Interleave high halves of products 0 and 1

movaps xmm1, xmm0
unpcklpd xmm0, xmm2 // First term of each dot product
unpckhpd xmm1, xmm2 // Second term of each dot product

movaps xmm2, xmm3
unpcklpd xmm2, xmm4 // Third term of each dot product
unpckhpd xmm3, xmm4 // Fourth term of each dot product

addps xmm0, xmm1 // Sum the four terms per lane
addps xmm2, xmm3
addps xmm0, xmm2
movups [Result], xmm0
end;
5387
class operator TMatrix4.Multiply(const A, B: TMatrix4): TMatrix4; assembler;
{ 4x4 matrix product in the FM_COLUMN_MAJOR configuration.
  Code below consists of 4 Vector*Matrix calculations: each storage row of B
  is splatted component-by-component and accumulated against the 4 storage
  rows of A, which stay resident in xmm4-xmm7 for all four sections. }
asm
{ Storage row 0 of B against A }
movups xmm0, DQWORD [B + $00]
movups xmm4, DQWORD [A + $00]
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, $00 // Splat element 0
shufps xmm1, xmm1, $55 // Splat element 1
shufps xmm2, xmm2, $AA // Splat element 2
shufps xmm3, xmm3, $FF // Splat element 3
movups xmm5, DQWORD [A + $10]
movups xmm6, DQWORD [A + $20]
movups xmm7, DQWORD [A + $30]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movups DQWORD [Result + $00], xmm0

{ Storage row 1 of B against A (A still resident in xmm4-xmm7) }
movups xmm0, DQWORD [B + $10]
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, $00
shufps xmm1, xmm1, $55
shufps xmm2, xmm2, $AA
shufps xmm3, xmm3, $FF
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movups DQWORD [Result + $10], xmm0

{ Storage row 2 of B against A }
movups xmm0, DQWORD [B + $20]
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, $00
shufps xmm1, xmm1, $55
shufps xmm2, xmm2, $AA
shufps xmm3, xmm3, $FF
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movups DQWORD [Result + $20], xmm0

{ Storage row 3 of B against A }
movups xmm0, DQWORD [B + $30]
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, $00
shufps xmm1, xmm1, $55
shufps xmm2, xmm2, $AA
shufps xmm3, xmm3, $FF
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movups DQWORD [Result + $30], xmm0
end;
5463{$ELSE}
{ Matrix * vector (row-major configuration).
  Each result component is the dot product of one row of A with B: the four
  element-wise row products are computed, then transposed so the terms of
  each dot product line up per lane and can be summed with three addps. }
class operator TMatrix4.Multiply(const A: TMatrix4; const B: TVector4): TVector4; assembler;
asm
movups xmm0, [B] // Load vector
movups xmm4, DQWORD [A + $00] // Load 4 rows
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
movups xmm5, DQWORD [A + $10]
movups xmm6, DQWORD [A + $20]
movups xmm7, DQWORD [A + $30]
mulps xmm0, xmm4 // (Ax * B00), (Ay * B01), (Az * B02), (Aw * B03)
mulps xmm1, xmm5 // (Ax * B10), (Ay * B11), (Az * B12), (Aw * B13)
mulps xmm2, xmm6 // (Ax * B20), (Ay * B21), (Az * B22), (Aw * B23)
mulps xmm3, xmm7 // (Ax * B30), (Ay * B31), (Az * B32), (Aw * B33)

{ Transpose xmm0-xmm3 so each lane holds one term of every dot product }
movaps xmm4, xmm2
unpcklps xmm2, xmm3 // B32 B22 B33 B23
unpckhps xmm4, xmm3 // B30 B20 B31 B21

movaps xmm3, xmm0
unpcklps xmm0, xmm1 // B12 B02 B13 B03
unpckhps xmm3, xmm1 // B10 B00 B11 B01

movaps xmm1, xmm0
unpcklpd xmm0, xmm2 // B33 B23 B13 B03
unpckhpd xmm1, xmm2 // B32 B22 B12 B02

movaps xmm2, xmm3
unpcklpd xmm2, xmm4 // B31 B21 B11 B01
unpckhpd xmm3, xmm4 // B30 B20 B10 B00

addps xmm0, xmm1 // Add rows
addps xmm2, xmm3
addps xmm0, xmm2
movups [Result], xmm0
end;
5501
{ Vector * matrix (row-major configuration).
  Splats each component of A, scales the corresponding row of B with it,
  and sums the four scaled rows — no transpose needed in this orientation. }
class operator TMatrix4.Multiply(const A: TVector4; const B: TMatrix4): TVector4; assembler;
asm
movups xmm0, [A] // Load vector
movups xmm4, DQWORD [B + $00] // Load 4 rows
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, $00 // Bx Bx Bx Bx
shufps xmm1, xmm1, $55 // By By By By
shufps xmm2, xmm2, $AA // Bz Bz Bz Bz
shufps xmm3, xmm3, $FF // Bw Bw Bw Bw
movups xmm5, DQWORD [B + $10]
movups xmm6, DQWORD [B + $20]
movups xmm7, DQWORD [B + $30]
mulps xmm0, xmm4 // (A00 * Bx), (A01 * Bx), (A02 * Bx), (A03 * Bx)
mulps xmm1, xmm5 // (A10 * By), (A11 * By), (A12 * By), (A13 * By)
mulps xmm2, xmm6 // (A20 * Bz), (A21 * Bz), (A22 * Bz), (A23 * Bz)
mulps xmm3, xmm7 // (A30 * Bw), (A31 * Bw), (A32 * Bw), (A33 * Bw)
addps xmm0, xmm1 // Add rows
addps xmm2, xmm3
addps xmm0, xmm2
movups [Result], xmm0
end;
5525
class operator TMatrix4.Multiply(const A, B: TMatrix4): TMatrix4; assembler;
{ 4x4 matrix product (row-major configuration).
  Code below consists of 4 Vector*Matrix calculations: each row of A is
  splatted component-by-component against the rows of B, which are loaded
  once into xmm4-xmm7 and stay resident for all four sections. }
asm
{ A.R[0] * B }
movups xmm0, DQWORD [A + $00]
movups xmm4, DQWORD [B + $00]
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, $00 // Splat A00
shufps xmm1, xmm1, $55 // Splat A01
shufps xmm2, xmm2, $AA // Splat A02
shufps xmm3, xmm3, $FF // Splat A03
movups xmm5, DQWORD [B + $10]
movups xmm6, DQWORD [B + $20]
movups xmm7, DQWORD [B + $30]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movups DQWORD [Result + $00], xmm0

{ A.R[1] * B (B still resident in xmm4-xmm7) }
movups xmm0, DQWORD [A + $10]
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, $00
shufps xmm1, xmm1, $55
shufps xmm2, xmm2, $AA
shufps xmm3, xmm3, $FF
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movups DQWORD [Result + $10], xmm0

{ A.R[2] * B }
movups xmm0, DQWORD [A + $20]
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, $00
shufps xmm1, xmm1, $55
shufps xmm2, xmm2, $AA
shufps xmm3, xmm3, $FF
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movups DQWORD [Result + $20], xmm0

{ A.R[3] * B }
movups xmm0, DQWORD [A + $30]
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, $00
shufps xmm1, xmm1, $55
shufps xmm2, xmm2, $AA
shufps xmm3, xmm3, $FF
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movups DQWORD [Result + $30], xmm0
end;
5605{$ENDIF}
5606
{ Unary minus: negates all 16 elements of A by XOR'ing each row with
  SSE_MASK_SIGN, which flips only the IEEE-754 sign bit of every lane. }
class operator TMatrix4.Negative(const A: TMatrix4): TMatrix4; assembler;
asm
movups xmm0, [SSE_MASK_SIGN] // Load mask with 4 sign (upper) bits
movups xmm1, DQWORD [A + $00] // Load 4 rows
movups xmm2, DQWORD [A + $10]
movups xmm3, DQWORD [A + $20]
movups xmm4, DQWORD [A + $30]
xorps xmm1, xmm0 // Flip sign bits of each element in each row
xorps xmm2, xmm0
xorps xmm3, xmm0
xorps xmm4, xmm0
movups DQWORD [Result + $00], xmm1
movups DQWORD [Result + $10], xmm2
movups DQWORD [Result + $20], xmm3
movups DQWORD [Result + $30], xmm4
end;
5623
procedure TMatrix4.SetInversed;
{ In-place inversion: overwrites this matrix with the value returned by
  its Inverse function. }
begin
  Self := Self.Inverse;
end;
5628
procedure TMatrix4.SetTransposed;
{ In-place transposition: overwrites this matrix with the value returned
  by its Transpose function. }
begin
  Self := Self.Transpose;
end;
5633
{ Matrix - scalar: subtracts B from every element of A. }
class operator TMatrix4.Subtract(const A: TMatrix4; const B: Single): TMatrix4; assembler;
asm
movss xmm0, [B] // Load single floating-point value
movups xmm1, DQWORD [A + $00] // Load 4 rows
shufps xmm0, xmm0, 0 // Replicate B into all 4 lanes
movups xmm2, DQWORD [A + $10]
movups xmm3, DQWORD [A + $20]
movups xmm4, DQWORD [A + $30]
subps xmm1, xmm0 // Subtract B from each row
subps xmm2, xmm0
subps xmm3, xmm0
subps xmm4, xmm0
movups DQWORD [Result + $00], xmm1
movups DQWORD [Result + $10], xmm2
movups DQWORD [Result + $20], xmm3
movups DQWORD [Result + $30], xmm4
end;
5651
{ Scalar - matrix: returns a matrix whose elements are A minus the
  corresponding element of B. The splatted scalar is copied into four
  registers because subps destroys its first operand. }
class operator TMatrix4.Subtract(const A: Single; const B: TMatrix4): TMatrix4; assembler;
asm
movss xmm0, [A] // Load single floating-point value
movups xmm4, DQWORD [B + $00] // Load 4 rows
shufps xmm0, xmm0, 0 // Replicate A into all 4 lanes
movups xmm5, DQWORD [B + $10]
movaps xmm1, xmm0 // One copy of the splatted A per row
movaps xmm2, xmm0
movaps xmm3, xmm0
movups xmm6, DQWORD [B + $20]
movups xmm7, DQWORD [B + $30]
subps xmm0, xmm4 // Subtract each row from A
subps xmm1, xmm5
subps xmm2, xmm6
subps xmm3, xmm7
movups DQWORD [Result + $00], xmm0
movups DQWORD [Result + $10], xmm1
movups DQWORD [Result + $20], xmm2
movups DQWORD [Result + $30], xmm3
end;
5672
{ Element-wise matrix difference: Result[i,j] = A[i,j] - B[i,j]. }
class operator TMatrix4.Subtract(const A, B: TMatrix4): TMatrix4; assembler;
asm
movups xmm0, DQWORD [A + $00] // Load 4 rows of A
movups xmm1, DQWORD [A + $10]
movups xmm2, DQWORD [A + $20]
movups xmm3, DQWORD [A + $30]
movups xmm4, DQWORD [B + $00] // Load 4 rows of B
movups xmm5, DQWORD [B + $10]
movups xmm6, DQWORD [B + $20]
movups xmm7, DQWORD [B + $30]
subps xmm0, xmm4 // Subtract rows
subps xmm1, xmm5
subps xmm2, xmm6
subps xmm3, xmm7
movups DQWORD [Result + $00], xmm0
movups DQWORD [Result + $10], xmm1
movups DQWORD [Result + $20], xmm2
movups DQWORD [Result + $30], xmm3
end;
5692
{ Returns the transpose of this matrix (rows become columns) using the
  classic SSE 4x4 transpose: two rounds of 32-bit interleaves
  (unpcklps/unpckhps) followed by 64-bit interleaves (unpcklpd/unpckhpd).
  Self is not modified; use SetTransposed for in-place transposition.
  Lane comments are written most-significant lane first. }
function TMatrix4.Transpose: TMatrix4; assembler;
asm
movups xmm0, DQWORD[Self + $00] // A03 A02 A01 A00
movups xmm1, DQWORD[Self + $10] // A13 A12 A11 A10
movups xmm2, DQWORD[Self + $20] // A23 A22 A21 A20
movups xmm3, DQWORD[Self + $30] // A33 A32 A31 A30

movaps xmm4, xmm2
unpcklps xmm2, xmm3 // A31 A21 A30 A20
unpckhps xmm4, xmm3 // A33 A23 A32 A22

movaps xmm3, xmm0
unpcklps xmm0, xmm1 // A11 A01 A10 A00
unpckhps xmm3, xmm1 // A13 A03 A12 A02

movaps xmm1, xmm0
unpcklpd xmm0, xmm2 // A30 A20 A10 A00
unpckhpd xmm1, xmm2 // A31 A21 A11 A01

movaps xmm2, xmm3
unpcklpd xmm2, xmm4 // A32 A22 A12 A02
unpckhpd xmm3, xmm4 // A33 A23 A13 A03

movups DQWORD[Result + $00], xmm0
movups DQWORD[Result + $10], xmm1
movups DQWORD[Result + $20], xmm2
movups DQWORD[Result + $30], xmm3
end;
5721