{ MathgeomGLS — 5975 lines · 174.3 KB }
1{ Note about x64 calling convention
2---------------------------------
3
4Delphi uses the Microsoft x64 calling convention:
5* Arguments that fit into 1, 2, 4 or 8 bytes are passed in registers RCX, RDX,
6R8 and R9. This includes TVector2 records (which are 8 bytes). So those need
7to be moved using MOVQ instead of MOVLPS.
8NOTE: Delphi 10.3 Rio changed this behavior for 8-byte records (TVector2):
9the register contains the address of the parameter instead (as is the case
10in 32-bit, so we should use MOVLPS).
11* The same goes for function results. Those go into RAX. Floating-point values
12are returned in XMM0.
13* The first 4 floating-point arguments go into XMM0, XMM1, XMM2 and XMM3.
14* These registers must be preserved:
15R12, R13, R14, R15, RDI, RSI, RBX, RBP, RSP, XMM6-XMM15.
16* At the start of a function, RBP is always aligned to a 16-byte boundary (its
17address always ends in 0).
18* RSP is also always a multiple of 16, but its address always ends in 8 (since
19the return address is pushed to the stack }
20
const
  { SSE rounding modes (bits in MXCSR register).
    SSE_ROUND_MASK clears bits 13-14 (RC field); OR in one of the mode
    constants below to select the rounding behavior. }
  SSE_ROUND_MASK = $FFFF9FFF;
  SSE_ROUND_NEAREST = $00000000;
  SSE_ROUND_DOWN = $00002000;
  SSE_ROUND_UP = $00004000;
  SSE_ROUND_TRUNC = $00006000;

  { These constants fit in a single XMM register. These values represent
    sign-bits as used by 32-bit floating-point values.
    XOR'ing a floating-point value with $80000000 swaps the sign.
    XOR'ing a floating-point value with $00000000 leaves the value unchanged. }
  SSE_MASK_SIGN: array [0..3] of UInt32 = ($80000000, $80000000, $80000000, $80000000);
  SSE_MASK_NPNP: array [0..3] of UInt32 = ($80000000, $00000000, $80000000, $00000000);
  SSE_MASK_PNPN: array [0..3] of UInt32 = ($00000000, $80000000, $00000000, $80000000);
  SSE_MASK_0FFF: array [0..3] of UInt32 = ($FFFFFFFF, $FFFFFFFF, $FFFFFFFF, $00000000);

  { These constants mask off an element of the binary representation of a
    32-bit floating-point value (IEEE 754 single: 23-bit fraction,
    8-bit exponent, and everything but the sign bit respectively). }
  SSE_MASK_FRACTION: array [0..3] of UInt32 = ($007FFFFF, $007FFFFF, $007FFFFF, $007FFFFF);
  SSE_MASK_EXPONENT: array [0..3] of UInt32 = ($7F800000, $7F800000, $7F800000, $7F800000);
  SSE_MASK_ABS_VAL : array [0..3] of UInt32 = ($7FFFFFFF, $7FFFFFFF, $7FFFFFFF, $7FFFFFFF);

  { Commonly used floating-point values, replicated across all 4 lanes so a
    single MOVUPS/MOVLPS load broadcasts the constant. }
  SSE_ONE_HALF : array [0..3] of Single = (0.5, 0.5, 0.5, 0.5);
  SSE_ONE : array [0..3] of Single = (1, 1, 1, 1);
  SSE_TWO : array [0..3] of Single = (2, 2, 2, 2);
  SSE_THREE : array [0..3] of Single = (3, 3, 3, 3);
  SSE_PI_OVER_180 : array [0..3] of Single = (Pi / 180, Pi / 180, Pi / 180, Pi / 180);
  SSE_180_OVER_PI : array [0..3] of Single = (180 / Pi, 180 / Pi, 180 / Pi, 180 / Pi);
  SSE_NEG_INFINITY: array [0..3] of Single = (NegInfinity, NegInfinity, NegInfinity, NegInfinity);
  SSE_PI_OVER_4 : array [0..3] of Single = (Pi / 4, Pi / 4, Pi / 4, Pi / 4);

  { Commonly used integer values (loaded into XMM registers for the integer
    SSE2 instructions PADDD/PAND/PSUBD used in the range-reduction code). }
  SSE_INT_ONE : array [0..3] of Integer = (1, 1, 1, 1);
  SSE_INT_NOT_ONE : array [0..3] of Cardinal = ($FFFFFFFE, $FFFFFFFE, $FFFFFFFE, $FFFFFFFE);
  SSE_INT_TWO : array [0..3] of Integer = (2, 2, 2, 2);
  SSE_INT_FOUR : array [0..3] of Integer = (4, 4, 4, 4);

  { Constants for approximating trigonometric functions.
    SSE_FOPI is 4/Pi; the SINCOF_*/COSCOF_* values are the Cephes-derived
    minimax polynomial coefficients used by FastSin/FastCos/FastSinCos. }
  SSE_FOPI: array [0..3] of Single = (1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516);
  SSE_SINCOF_P0: array [0..3] of Single = (-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4);
  SSE_SINCOF_P1: array [0..3] of Single = (8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3);
  SSE_SINCOF_P2: array [0..3] of Single = (-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1);
  SSE_COSCOF_P0: array [0..3] of Single = (2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005);
  SSE_COSCOF_P1: array [0..3] of Single = (-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003);
  SSE_COSCOF_P2: array [0..3] of Single = (4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002);

  { Constants for the fast exp approximation.
    Note: SSE_EXP_A2 = 1065353216 = $3F800000 (the bit pattern of 1.0) and
    SSE_EXP_CST = 2139095040 = $7F800000 (the bit pattern of +Inf) stored as
    integer-valued Singles — presumably used for the exponent bit-manipulation
    trick; confirm against the exp implementation. }
  SSE_EXP_A1 : array [0..3] of Single = (12102203.1615614, 12102203.1615614, 12102203.1615614, 12102203.1615614);
  SSE_EXP_A2 : array [0..3] of Single = (1065353216, 1065353216, 1065353216, 1065353216);
  SSE_EXP_CST: array [0..3] of Single = (2139095040, 2139095040, 2139095040, 2139095040);
  SSE_EXP_F1 : array [0..3] of Single = (0.509964287281036376953125, 0.509964287281036376953125, 0.509964287281036376953125, 0.509964287281036376953125);
  SSE_EXP_F2 : array [0..3] of Single = (0.3120158612728118896484375, 0.3120158612728118896484375, 0.3120158612728118896484375, 0.3120158612728118896484375);
  SSE_EXP_F3 : array [0..3] of Single = (0.1666135489940643310546875, 0.1666135489940643310546875, 0.1666135489940643310546875, 0.1666135489940643310546875);
  SSE_EXP_F4 : array [0..3] of Single = (-2.12528370320796966552734375e-3, -2.12528370320796966552734375e-3, -2.12528370320796966552734375e-3, -2.12528370320796966552734375e-3);
  SSE_EXP_F5 : array [0..3] of Single = (1.3534179888665676116943359375e-2, 1.3534179888665676116943359375e-2, 1.3534179888665676116943359375e-2, 1.3534179888665676116943359375e-2);
  SSE_EXP_I1 : array [0..3] of UInt32 = ($3F800000, $3F800000, $3F800000, $3F800000);

  { Constants for the fast natural-logarithm approximation. }
  SSE_LN_CST: array [0..3] of Single = (-89.93423858, -89.93423858, -89.93423858, -89.93423858);
  SSE_LN_F1 : array [0..3] of Single = (3.3977745, 3.3977745, 3.3977745, 3.3977745);
  SSE_LN_F2 : array [0..3] of Single = (2.2744832, 2.2744832, 2.2744832, 2.2744832);
  SSE_LN_F3 : array [0..3] of Single = (0.024982445, 0.024982445, 0.024982445, 0.024982445);
  SSE_LN_F4 : array [0..3] of Single = (0.24371102, 0.24371102, 0.24371102, 0.24371102);
  SSE_LN_F5 : array [0..3] of Single = (0.69314718055995, 0.69314718055995, 0.69314718055995, 0.69314718055995);

  { Constants for the fast base-2 logarithm approximation. }
  SSE_LOG2_I1: array [0..3] of UInt32 = ($3F000000, $3F000000, $3F000000, $3F000000);
  SSE_LOG2_F1: array [0..3] of Single = (1.1920928955078125e-7, 1.1920928955078125e-7, 1.1920928955078125e-7, 1.1920928955078125e-7);
  SSE_LOG2_F2: array [0..3] of Single = (124.22551499, 124.22551499, 124.22551499, 124.22551499);
  SSE_LOG2_F3: array [0..3] of Single = (1.498030302, 1.498030302, 1.498030302, 1.498030302);
  SSE_LOG2_F4: array [0..3] of Single = (1.72587999, 1.72587999, 1.72587999, 1.72587999);
  SSE_LOG2_F5: array [0..3] of Single = (0.3520887068, 0.3520887068, 0.3520887068, 0.3520887068);

  { Constants for the fast base-2 exponential approximation.
    SSE_EXP2_F5 = $00800000 = 8388608 = 2^23, stored as a Single. }
  SSE_EXP2_F1: array [0..3] of Single = (121.2740575, 121.2740575, 121.2740575, 121.2740575);
  SSE_EXP2_F2: array [0..3] of Single = (27.7280233, 27.7280233, 27.7280233, 27.7280233);
  SSE_EXP2_F3: array [0..3] of Single = (4.84252568, 4.84252568, 4.84252568, 4.84252568);
  SSE_EXP2_F4: array [0..3] of Single = (1.49012907, 1.49012907, 1.49012907, 1.49012907);
  SSE_EXP2_F5: array [0..3] of Single = ($00800000, $00800000, $00800000, $00800000);
98
99{ Angle and Trigonometry Functions }
100
{ Converts an angle from degrees to radians.
  Parameters:
    ADegrees: the angle in degrees.
  Returns:
    The equivalent angle in radians (ADegrees * Pi/180). }
function Radians(const ADegrees: Single): Single;
const
  // Compile-time constant; same factor the SIMD overloads load from
  // SSE_PI_OVER_180.
  DEG_TO_RAD = Pi / 180;
begin
  Result := DEG_TO_RAD * ADegrees;
end;
105
{ Converts a 2-component vector of degree angles to radians (SSE).
  Both components are multiplied by Pi/180 in one MULPS; the 8-byte result
  is returned in RAX (see the calling-convention note at the top of this
  file). }
function Radians(const ADegrees: TVector2): TVector2; assembler;
asm
  {$IF RTLVersion >= 33}
  // Delphi 10.3+ passes an 8-byte record by reference: load via its address.
  movlps xmm0, [ADegrees]
  {$ELSE}
  // Pre-10.3: the 8-byte record arrives by value in an integer register.
  movq xmm0, ADegrees
  {$ENDIF}
  movlps xmm1, QWORD [SSE_PI_OVER_180] // load Pi/180 into both lanes
  mulps xmm0, xmm1                     // convert X and Y at once
  movq rax, xmm0                       // 8-byte record result goes in RAX
end;
117
{ Converts a 3-component vector of degree angles to radians (SSE).
  X/Y are converted packed, Z scalar; the result is written through the
  hidden Result pointer. }
function Radians(const ADegrees: TVector3): TVector3; assembler;
asm
  movq xmm0, [ADegrees]          // load X,Y
  movss xmm1, DWORD [ADegrees+8] // load Z
  movups xmm2, [SSE_PI_OVER_180] // Pi/180 in all 4 lanes
  mulps xmm0, xmm2               // X,Y * Pi/180
  mulss xmm1, xmm2               // Z * Pi/180 (low lane only)
  movq [Result], xmm0
  movss DWORD [Result+8], xmm1
end;
128
{ Converts a 4-component vector of degree angles to radians (SSE).
  All four components converted in a single MULPS; result written through
  the hidden Result pointer. }
function Radians(const ADegrees: TVector4): TVector4; assembler;
asm
  movups xmm0, [ADegrees]        // unaligned load: TVector4 need not be 16-byte aligned
  movups xmm1, [SSE_PI_OVER_180]
  mulps xmm0, xmm1
  movups [Result], xmm0
end;
136
{ Converts an angle from radians to degrees.
  Parameters:
    ARadians: the angle in radians.
  Returns:
    The equivalent angle in degrees (ARadians * 180/Pi). }
function Degrees(const ARadians: Single): Single;
const
  // Compile-time constant; same factor the SIMD overloads load from
  // SSE_180_OVER_PI.
  RAD_TO_DEG = 180 / Pi;
begin
  Result := RAD_TO_DEG * ARadians;
end;
141
{ Converts a 2-component vector of radian angles to degrees (SSE).
  Both components are multiplied by 180/Pi in one MULPS; the 8-byte result
  is returned in RAX. }
function Degrees(const ARadians: TVector2): TVector2; assembler;
asm
  {$IF RTLVersion >= 33}
  // Delphi 10.3+ passes an 8-byte record by reference: load via its address.
  movlps xmm0, [ARadians]
  {$ELSE}
  // Pre-10.3: the 8-byte record arrives by value in an integer register.
  movq xmm0, ARadians
  {$ENDIF}
  movlps xmm1, QWORD [SSE_180_OVER_PI] // load 180/Pi into both lanes
  mulps xmm0, xmm1                     // convert X and Y at once
  movq rax, xmm0                       // 8-byte record result goes in RAX
end;
153
{ Converts a 3-component vector of radian angles to degrees (SSE).
  X/Y are converted packed, Z scalar; the result is written through the
  hidden Result pointer. }
function Degrees(const ARadians: TVector3): TVector3; assembler;
asm
  movq xmm0, [ARadians]          // load X,Y
  movss xmm1, DWORD [ARadians+8] // load Z
  movups xmm2, [SSE_180_OVER_PI] // 180/Pi in all 4 lanes
  mulps xmm0, xmm2               // X,Y * 180/Pi
  mulss xmm1, xmm2               // Z * 180/Pi (low lane only)
  movq [Result], xmm0
  movss DWORD [Result+8], xmm1
end;
164
{ Converts a 4-component vector of radian angles to degrees (SSE).
  All four components converted in a single MULPS. }
function Degrees(const ARadians: TVector4): TVector4; assembler;
asm
  movups xmm0, [ARadians]        // unaligned load: TVector4 need not be 16-byte aligned
  movups xmm1, [SSE_180_OVER_PI]
  mulps xmm0, xmm1
  movups [Result], xmm0
end;
172
173{ Exponential Functions }
174
{ Scalar square root via the SSE SQRTSS instruction.
  The argument arrives in XMM0 and the result is left there (x64 float
  return register), so the body is a single instruction. }
function Sqrt(const A: Single): Single; assembler;
asm
  sqrtss xmm0, xmm0
end;
179
{ Component-wise square root of a 2-component vector (SSE).
  The 8-byte result is returned in RAX. The upper two lanes of xmm0 hold
  undefined data during SQRTPS but are discarded by the MOVQ store. }
function Sqrt(const A: TVector2): TVector2; assembler;
asm
  {$IF RTLVersion >= 33}
  // Delphi 10.3+ passes an 8-byte record by reference.
  movlps xmm0, [A]
  {$ELSE}
  // Pre-10.3: 8-byte record passed by value in a register.
  movq xmm0, A
  {$ENDIF}
  sqrtps xmm0, xmm0
  movq rax, xmm0
end;
190
{ Component-wise square root of a 3-component vector (SSE).
  X,Y and Z are packed into one register (Z duplicated into the high half
  by MOVLHPS) so a single SQRTPS handles all three. }
function Sqrt(const A: TVector3): TVector3; assembler;
asm
  movq xmm0, [A]           // load X,Y into low half
  movss xmm1, DWORD [A+8]  // load Z
  movlhps xmm0, xmm1       // pack Z into the high half of xmm0
  sqrtps xmm0, xmm0        // sqrt of all lanes at once
  movhlps xmm1, xmm0       // bring Z's result back to the low lane of xmm1
  movq [Result], xmm0
  movss DWORD [Result+8], xmm1
end;
201
{ Component-wise square root of a 4-component vector (SSE). }
function Sqrt(const A: TVector4): TVector4; assembler;
asm
  movups xmm0, [A]  // unaligned load
  sqrtps xmm0, xmm0
  movups [Result], xmm0
end;
208
{ Approximate scalar inverse square root (1/Sqrt(A)) via RSQRTSS.
  RSQRTSS is a hardware approximation with a relative error of at most
  1.5*2^-12 (about 12 bits of precision) — fast, but not exact.
  Argument in XMM0, result left in XMM0. }
function InverseSqrt(const A: Single): Single; assembler;
asm
  rsqrtss xmm0, xmm0
end;
213
{ Approximate component-wise inverse square root of a 2-component vector.
  Uses RSQRTPS, a hardware approximation with at most 1.5*2^-12 relative
  error (about 12 bits of precision). The 8-byte result is returned in RAX.
  Fix: added the 'assembler' directive for consistency with every other
  pure-asm overload in this unit (no behavior change — the directive is
  declarative). }
function InverseSqrt(const A: TVector2): TVector2; assembler;
asm
  {$IF RTLVersion >= 33}
  // Delphi 10.3+ passes an 8-byte record by reference.
  movlps xmm0, [A]
  {$ELSE}
  // Pre-10.3: 8-byte record passed by value in a register.
  movq xmm0, A
  {$ENDIF}
  rsqrtps xmm0, xmm0 // approximate 1/sqrt of both lanes
  movq rax, xmm0     // 8-byte record result goes in RAX
end;
224
{ Approximate component-wise inverse square root of a 3-component vector.
  Uses RSQRTPS (hardware approximation, ~12 bits of precision). X,Y and Z
  are packed into one register so a single RSQRTPS covers all three.
  Fix: added the 'assembler' directive for consistency with every other
  pure-asm overload in this unit (no behavior change — the directive is
  declarative). }
function InverseSqrt(const A: TVector3): TVector3; assembler;
asm
  movq xmm0, [A]           // load X,Y into low half
  movss xmm1, DWORD [A+8]  // load Z
  movlhps xmm0, xmm1       // pack Z into the high half of xmm0
  rsqrtps xmm0, xmm0       // approximate 1/sqrt of all lanes
  movhlps xmm1, xmm0       // bring Z's result back to the low lane of xmm1
  movq [Result], xmm0
  movss DWORD [Result+8], xmm1
end;
235
{ Approximate component-wise inverse square root of a 4-component vector.
  Uses RSQRTPS (hardware approximation, ~12 bits of precision). }
function InverseSqrt(const A: TVector4): TVector4; assembler;
asm
  movups xmm0, [A]   // unaligned load
  rsqrtps xmm0, xmm0
  movups [Result], xmm0
end;
242
243{ Fast approximate Functions }
244
{ Fast approximate sine (Cephes-style SSE algorithm: range-reduce the
  argument to an octant, then evaluate a sine or cosine minimax polynomial
  and fix up the sign). The scalar value is processed in the low lane;
  ...ps instructions on already-masked/duplicated data leave the low-lane
  result identical to the ...ss form. }
function FastSin(const ARadians: Single): Single; assembler;
asm
  // Save nonvolatile xmm6/xmm7 in scratch space below RSP.
  // NOTE(review): Win64 defines no red zone; this relies on the routine
  // being a leaf (no calls) — confirm this stays safe if calls are added.
  movdqa [rsp-24], xmm6
  movdqa [rsp-40], xmm7

  movss xmm2, DWORD [SSE_MASK_ABS_VAL]
  movaps xmm1, xmm0
  movss xmm3, DWORD [SSE_MASK_SIGN]
  andps xmm0, xmm2 // (xmm0) X := Abs(ARadians)
  andps xmm1, xmm3 // (xmm1) SignBit
  movaps xmm2, xmm0
  movss xmm4, DWORD [SSE_FOPI]
  movss xmm5, DWORD [SSE_INT_ONE]
  mulss xmm2, xmm4
  movss xmm6, DWORD [SSE_INT_NOT_ONE]
  cvtps2dq xmm2, xmm2 // J := Trunc(X * FOPI)
  movss xmm7, DWORD [SSE_INT_FOUR]
  paddd xmm2, xmm5
  pand xmm2, xmm6 // (xmm2) J := (J + 1) and (not 1)
  movss xmm6, DWORD [SSE_INT_TWO]
  cvtdq2ps xmm4, xmm2 // (xmm4) Y := J
  movaps xmm5, xmm2
  pand xmm2, xmm6 // J and 2
  pand xmm5, xmm7 // J and 4
  pxor xmm7, xmm7
  pslld xmm5, 29 // (xmm5) SwapSignBit := (J and 4) shl 29
  pcmpeqd xmm2, xmm7 // (xmm2) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
  movss xmm6, DWORD [SSE_PI_OVER_4]
  pxor xmm1, xmm5 // (xmm1) SignBit := SignBit xor SwapSignBit
  mulss xmm4, xmm6 // Y * Pi / 4
  movss xmm3, DWORD [SSE_COSCOF_P0]
  subss xmm0, xmm4 // (xmm0) X := X - (Y * Pi / 4)
  movss xmm4, DWORD [SSE_COSCOF_P1]
  movaps xmm7, xmm0
  movss xmm6, DWORD [SSE_COSCOF_P2]
  mulss xmm7, xmm7 // (xmm7) Z := X * X
  movss xmm5, DWORD [SSE_SINCOF_P1]
  mulss xmm3, xmm7 // COSCOF_P0 * Z
  addss xmm3, xmm4 // Y := COSCOF_P0 * Z + COSCOF_P1
  movss xmm4, DWORD [SSE_ONE_HALF]
  mulss xmm3, xmm7 // Y * Z
  mulss xmm4, xmm7 // Z * 0.5
  addps xmm3, xmm6 // Y := (Y * Z) + COSCOF_P2
  movss xmm6, DWORD [SSE_ONE]
  mulss xmm3, xmm7 // Y * Z
  mulss xmm3, xmm7 // Y := Y * (Z * Z)
  subss xmm3, xmm4 // Y - Z * 0.5
  movss xmm4, DWORD [SSE_SINCOF_P0]
  addps xmm3, xmm6 // (xmm3) Y := Y - Z * 0.5 + 1
  movss xmm6, DWORD [SSE_SINCOF_P2]
  mulss xmm4, xmm7 // SINCOF_P0 * Z
  addss xmm4, xmm5 // Y2 := SINCOF_P0 * Z + SINCOF_P1
  movaps xmm5, xmm2
  mulss xmm4, xmm7 // Y2 * Z
  addss xmm4, xmm6 // Y2 := (Y2 * Z) + SINCOF_P2
  mulss xmm4, xmm7 // Y2 * Z
  mulss xmm4, xmm0 // Y2 * (Z * X)
  addss xmm4, xmm0 // (xmm4) Y2 := Y2 * (Z * X) + X
  andps xmm4, xmm2 // Y2 := ((J and 2) = 0)? Yes: Y2, No: 0
  andnps xmm5, xmm3 // Y := ((J and 2) = 0)? Yes: 0 , No: Y
  addss xmm4, xmm5
  xorps xmm4, xmm1 // (Y + Y2) xor SignBit
  movss xmm0, xmm4 // result returned in XMM0

  // Restore nonvolatile registers.
  movdqa xmm6, [rsp-24]
  movdqa xmm7, [rsp-40]
end;
312
{ Fast approximate sine of both components of a 2-component vector.
  Same Cephes-style octant reduction + minimax polynomial as the scalar
  overload, run on two lanes at once; the 8-byte result is returned in RAX. }
function FastSin(const ARadians: TVector2): TVector2; assembler;
asm
  // Save nonvolatile xmm6/xmm7 below RSP (leaf routine; NOTE(review):
  // Win64 has no red zone — confirm safety if calls are ever added).
  movdqa [rsp-24], xmm6
  movdqa [rsp-40], xmm7

  {$IF RTLVersion >= 33}
  // Delphi 10.3+ passes an 8-byte record by reference.
  movlps xmm0, [ARadians]
  {$ELSE}
  // Pre-10.3: 8-byte record passed by value in a register.
  movq xmm0, ARadians
  {$ENDIF}
  movlps xmm2, QWORD [SSE_MASK_ABS_VAL]
  movaps xmm1, xmm0
  movlps xmm3, QWORD [SSE_MASK_SIGN]
  andps xmm0, xmm2 // (xmm0) X := Abs(ARadians)
  andps xmm1, xmm3 // (xmm1) SignBit
  movaps xmm2, xmm0
  movlps xmm4, QWORD [SSE_FOPI]
  movlps xmm5, QWORD [SSE_INT_ONE]
  mulps xmm2, xmm4
  movlps xmm6, QWORD [SSE_INT_NOT_ONE]
  cvtps2dq xmm2, xmm2 // J := Trunc(X * FOPI)
  movlps xmm7, QWORD [SSE_INT_FOUR]
  paddd xmm2, xmm5
  pand xmm2, xmm6 // (xmm2) J := (J + 1) and (not 1)
  movlps xmm6, QWORD [SSE_INT_TWO]
  cvtdq2ps xmm4, xmm2 // (xmm4) Y := J
  movaps xmm5, xmm2
  pand xmm2, xmm6 // J and 2
  pand xmm5, xmm7 // J and 4
  pxor xmm7, xmm7
  pslld xmm5, 29 // (xmm5) SwapSignBit := (J and 4) shl 29
  pcmpeqd xmm2, xmm7 // (xmm2) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
  movlps xmm6, QWORD [SSE_PI_OVER_4]
  pxor xmm1, xmm5 // (xmm1) SignBit := SignBit xor SwapSignBit
  mulps xmm4, xmm6 // Y * Pi / 4
  movlps xmm3, QWORD [SSE_COSCOF_P0]
  subps xmm0, xmm4 // (xmm0) X := X - (Y * Pi / 4)
  movlps xmm4, QWORD [SSE_COSCOF_P1]
  movaps xmm7, xmm0
  movlps xmm6, QWORD [SSE_COSCOF_P2]
  mulps xmm7, xmm7 // (xmm7) Z := X * X
  movlps xmm5, QWORD [SSE_SINCOF_P1]
  mulps xmm3, xmm7 // COSCOF_P0 * Z
  addps xmm3, xmm4 // Y := COSCOF_P0 * Z + COSCOF_P1
  movlps xmm4, QWORD [SSE_ONE_HALF]
  mulps xmm3, xmm7 // Y * Z
  mulps xmm4, xmm7 // Z * 0.5
  addps xmm3, xmm6 // Y := (Y * Z) + COSCOF_P2
  movlps xmm6, QWORD [SSE_ONE]
  mulps xmm3, xmm7 // Y * Z
  mulps xmm3, xmm7 // Y := Y * (Z * Z)
  subps xmm3, xmm4 // Y - Z * 0.5
  movlps xmm4, QWORD [SSE_SINCOF_P0]
  addps xmm3, xmm6 // (xmm3) Y := Y - Z * 0.5 + 1
  movlps xmm6, QWORD [SSE_SINCOF_P2]
  mulps xmm4, xmm7 // SINCOF_P0 * Z
  addps xmm4, xmm5 // Y2 := SINCOF_P0 * Z + SINCOF_P1
  movaps xmm5, xmm2
  mulps xmm4, xmm7 // Y2 * Z
  addps xmm4, xmm6 // Y2 := (Y2 * Z) + SINCOF_P2
  mulps xmm4, xmm7 // Y2 * Z
  mulps xmm4, xmm0 // Y2 * (Z * X)
  addps xmm4, xmm0 // (xmm4) Y2 := Y2 * (Z * X) + X
  andps xmm4, xmm2 // Y2 := ((J and 2) = 0)? Yes: Y2, No: 0
  andnps xmm5, xmm3 // Y := ((J and 2) = 0)? Yes: 0 , No: Y
  addps xmm4, xmm5
  xorps xmm4, xmm1 // (Y + Y2) xor SignBit
  movq rax, xmm4 // 8-byte record result returned in RAX

  // Restore nonvolatile registers.
  movdqa xmm6, [rsp-24]
  movdqa xmm7, [rsp-40]
end;
385
{ Fast approximate sine of all components of a 3-component vector.
  X,Y,Z are packed into one XMM register (Z duplicated into the high half)
  so the Cephes-style reduction + polynomial runs once for all three. }
function FastSin(const ARadians: TVector3): TVector3; assembler;
asm
  // Save nonvolatile xmm6/xmm7 below RSP (leaf routine; NOTE(review):
  // Win64 has no red zone — confirm safety if calls are ever added).
  movdqa [rsp-24], xmm6
  movdqa [rsp-40], xmm7

  movq xmm0, [ARadians]            // load X,Y
  movss xmm1, DWORD [ARadians+8]   // load Z
  movlhps xmm0, xmm1               // pack Z into the high half
  movups xmm2, [SSE_MASK_ABS_VAL]
  movaps xmm1, xmm0
  movups xmm3, [SSE_MASK_SIGN]
  andps xmm0, xmm2 // (xmm0) X := Abs(ARadians)
  andps xmm1, xmm3 // (xmm1) SignBit
  movaps xmm2, xmm0
  movups xmm4, [SSE_FOPI]
  movups xmm5, [SSE_INT_ONE]
  mulps xmm2, xmm4
  movups xmm6, [SSE_INT_NOT_ONE]
  cvtps2dq xmm2, xmm2 // J := Trunc(X * FOPI)
  movups xmm7, [SSE_INT_FOUR]
  paddd xmm2, xmm5
  pand xmm2, xmm6 // (xmm2) J := (J + 1) and (not 1)
  movups xmm6, [SSE_INT_TWO]
  cvtdq2ps xmm4, xmm2 // (xmm4) Y := J
  movaps xmm5, xmm2
  pand xmm2, xmm6 // J and 2
  pand xmm5, xmm7 // J and 4
  pxor xmm7, xmm7
  pslld xmm5, 29 // (xmm5) SwapSignBit := (J and 4) shl 29
  pcmpeqd xmm2, xmm7 // (xmm2) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
  movups xmm6, [SSE_PI_OVER_4]
  pxor xmm1, xmm5 // (xmm1) SignBit := SignBit xor SwapSignBit
  mulps xmm4, xmm6 // Y * Pi / 4
  movups xmm3, [SSE_COSCOF_P0]
  subps xmm0, xmm4 // (xmm0) X := X - (Y * Pi / 4)
  movups xmm4, [SSE_COSCOF_P1]
  movaps xmm7, xmm0
  movups xmm6, [SSE_COSCOF_P2]
  mulps xmm7, xmm7 // (xmm7) Z := X * X
  movups xmm5, [SSE_SINCOF_P1]
  mulps xmm3, xmm7 // COSCOF_P0 * Z
  addps xmm3, xmm4 // Y := COSCOF_P0 * Z + COSCOF_P1
  movups xmm4, [SSE_ONE_HALF]
  mulps xmm3, xmm7 // Y * Z
  mulps xmm4, xmm7 // Z * 0.5
  addps xmm3, xmm6 // Y := (Y * Z) + COSCOF_P2
  movups xmm6, [SSE_ONE]
  mulps xmm3, xmm7 // Y * Z
  mulps xmm3, xmm7 // Y := Y * (Z * Z)
  subps xmm3, xmm4 // Y - Z * 0.5
  movups xmm4, [SSE_SINCOF_P0]
  addps xmm3, xmm6 // (xmm3) Y := Y - Z * 0.5 + 1
  movups xmm6, [SSE_SINCOF_P2]
  mulps xmm4, xmm7 // SINCOF_P0 * Z
  addps xmm4, xmm5 // Y2 := SINCOF_P0 * Z + SINCOF_P1
  movaps xmm5, xmm2
  mulps xmm4, xmm7 // Y2 * Z
  addps xmm4, xmm6 // Y2 := (Y2 * Z) + SINCOF_P2
  mulps xmm4, xmm7 // Y2 * Z
  mulps xmm4, xmm0 // Y2 * (Z * X)
  addps xmm4, xmm0 // (xmm4) Y2 := Y2 * (Z * X) + X
  andps xmm4, xmm2 // Y2 := ((J and 2) = 0)? Yes: Y2, No: 0
  andnps xmm5, xmm3 // Y := ((J and 2) = 0)? Yes: 0 , No: Y
  addps xmm4, xmm5
  xorps xmm4, xmm1 // (Y + Y2) xor SignBit
  movhlps xmm5, xmm4               // bring Z's result back to the low lane
  movq [Result], xmm4
  movss DWORD [Result+8], xmm5

  // Restore nonvolatile registers.
  movdqa xmm6, [rsp-24]
  movdqa xmm7, [rsp-40]
end;
458
{ Fast approximate sine of all components of a 4-component vector.
  Full 4-lane version of the Cephes-style reduction + minimax polynomial. }
function FastSin(const ARadians: TVector4): TVector4; assembler;
asm
  // Save nonvolatile xmm6/xmm7 below RSP (leaf routine; NOTE(review):
  // Win64 has no red zone — confirm safety if calls are ever added).
  movdqa [rsp-24], xmm6
  movdqa [rsp-40], xmm7

  movups xmm0, [ARadians]          // unaligned load of all 4 components
  movups xmm2, [SSE_MASK_ABS_VAL]
  movaps xmm1, xmm0
  movups xmm3, [SSE_MASK_SIGN]
  andps xmm0, xmm2 // (xmm0) X := Abs(ARadians)
  andps xmm1, xmm3 // (xmm1) SignBit
  movaps xmm2, xmm0
  movups xmm4, [SSE_FOPI]
  movups xmm5, [SSE_INT_ONE]
  mulps xmm2, xmm4
  movups xmm6, [SSE_INT_NOT_ONE]
  cvtps2dq xmm2, xmm2 // J := Trunc(X * FOPI)
  movups xmm7, [SSE_INT_FOUR]
  paddd xmm2, xmm5
  pand xmm2, xmm6 // (xmm2) J := (J + 1) and (not 1)
  movups xmm6, [SSE_INT_TWO]
  cvtdq2ps xmm4, xmm2 // (xmm4) Y := J
  movaps xmm5, xmm2
  pand xmm2, xmm6 // J and 2
  pand xmm5, xmm7 // J and 4
  pxor xmm7, xmm7
  pslld xmm5, 29 // (xmm5) SwapSignBit := (J and 4) shl 29
  pcmpeqd xmm2, xmm7 // (xmm2) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
  movups xmm6, [SSE_PI_OVER_4]
  pxor xmm1, xmm5 // (xmm1) SignBit := SignBit xor SwapSignBit
  mulps xmm4, xmm6 // Y * Pi / 4
  movups xmm3, [SSE_COSCOF_P0]
  subps xmm0, xmm4 // (xmm0) X := X - (Y * Pi / 4)
  movups xmm4, [SSE_COSCOF_P1]
  movaps xmm7, xmm0
  movups xmm6, [SSE_COSCOF_P2]
  mulps xmm7, xmm7 // (xmm7) Z := X * X
  movups xmm5, [SSE_SINCOF_P1]
  mulps xmm3, xmm7 // COSCOF_P0 * Z
  addps xmm3, xmm4 // Y := COSCOF_P0 * Z + COSCOF_P1
  movups xmm4, [SSE_ONE_HALF]
  mulps xmm3, xmm7 // Y * Z
  mulps xmm4, xmm7 // Z * 0.5
  addps xmm3, xmm6 // Y := (Y * Z) + COSCOF_P2
  movups xmm6, [SSE_ONE]
  mulps xmm3, xmm7 // Y * Z
  mulps xmm3, xmm7 // Y := Y * (Z * Z)
  subps xmm3, xmm4 // Y - Z * 0.5
  movups xmm4, [SSE_SINCOF_P0]
  addps xmm3, xmm6 // (xmm3) Y := Y - Z * 0.5 + 1
  movups xmm6, [SSE_SINCOF_P2]
  mulps xmm4, xmm7 // SINCOF_P0 * Z
  addps xmm4, xmm5 // Y2 := SINCOF_P0 * Z + SINCOF_P1
  movaps xmm5, xmm2
  mulps xmm4, xmm7 // Y2 * Z
  addps xmm4, xmm6 // Y2 := (Y2 * Z) + SINCOF_P2
  mulps xmm4, xmm7 // Y2 * Z
  mulps xmm4, xmm0 // Y2 * (Z * X)
  addps xmm4, xmm0 // (xmm4) Y2 := Y2 * (Z * X) + X
  andps xmm4, xmm2 // Y2 := ((J and 2) = 0)? Yes: Y2, No: 0
  andnps xmm5, xmm3 // Y := ((J and 2) = 0)? Yes: 0 , No: Y
  addps xmm4, xmm5
  xorps xmm4, xmm1 // (Y + Y2) xor SignBit
  movups [Result], xmm4

  // Restore nonvolatile registers.
  movdqa xmm6, [rsp-24]
  movdqa xmm7, [rsp-40]
end;
527
{ Fast approximate cosine (Cephes-style SSE algorithm). Works like FastSin
  but reduces with J-2 so the same sine/cosine polynomials yield cosine,
  and the sign fix-up uses (not (J-2)) and 4. The scalar value is processed
  in the low lane. }
function FastCos(const ARadians: Single): Single; assembler;
asm
  // Save nonvolatile xmm6/xmm7 below RSP (leaf routine; NOTE(review):
  // Win64 has no red zone — confirm safety if calls are ever added).
  movdqa [rsp-24], xmm6
  movdqa [rsp-40], xmm7

  movss xmm1, DWORD [SSE_MASK_ABS_VAL]
  movss xmm2, DWORD [SSE_FOPI]
  andps xmm0, xmm1 // (xmm0) X := Abs(ARadians)
  movss xmm3, DWORD [SSE_INT_NOT_ONE]
  movaps xmm1, xmm0
  movss xmm4, DWORD [SSE_INT_FOUR]
  mulss xmm1, xmm2
  movss xmm2, DWORD [SSE_INT_ONE]
  cvtps2dq xmm1, xmm1 // J := Trunc(X * FOPI)
  pxor xmm6, xmm6
  paddd xmm1, xmm2
  pand xmm1, xmm3 // (xmm1) J := (J + 1) and (not 1)
  movss xmm3, DWORD [SSE_INT_TWO]
  cvtdq2ps xmm2, xmm1 // (xmm2) Y := J
  psubd xmm1, xmm3 // J - 2
  movaps xmm5, xmm1
  pandn xmm1, xmm4 // (not (J - 2)) and 4
  pand xmm5, xmm3 // (J - 2) and 2
  pslld xmm1, 29 // (xmm1) SignBit := ((not (J - 2)) and 4) shl 29
  movss xmm3, DWORD [SSE_PI_OVER_4]
  pcmpeqd xmm5, xmm6 // (xmm5) PolyMask := ((J-2) and 2)=0)? Yes: $FFFFFFFF, No: $00000000
  mulss xmm2, xmm3 // Y * Pi / 4
  movss xmm3, DWORD [SSE_COSCOF_P1]
  subss xmm0, xmm2 // (xmm0) X := X - (Y * Pi / 4)
  movss xmm2, DWORD [SSE_COSCOF_P0]
  movss xmm4, DWORD [SSE_COSCOF_P2]
  movaps xmm6, xmm0
  mulss xmm6, xmm6 // (xmm6) Z := X * X
  mulss xmm2, xmm6 // COSCOF_P0 * Z
  addps xmm2, xmm3 // Y := COSCOF_P0 * Z + COSCOF_P1
  movss xmm3, DWORD [SSE_ONE_HALF]
  mulss xmm2, xmm6 // Y * Z
  mulss xmm3, xmm6 // Z * 0.5
  addss xmm2, xmm4 // Y := (Y * Z) + COSCOF_P2
  movss xmm7, DWORD [SSE_ONE]
  mulss xmm2, xmm6
  movss xmm4, DWORD [SSE_SINCOF_P1]
  mulss xmm2, xmm6 // Y := Y * (Z * Z)
  subss xmm2, xmm3 // Y - Z * 0.5
  addss xmm2, xmm7 // (xmm2) Y := Y - Z * 0.5 + 1
  movss xmm3, DWORD [SSE_SINCOF_P0]
  movss xmm7, DWORD [SSE_SINCOF_P2]
  mulss xmm3, xmm6 // SINCOF_P0 * Z
  addss xmm3, xmm4 // Y2 := SINCOF_P0 * Z + SINCOF_P1
  mulss xmm3, xmm6 // Y2 * Z
  addss xmm3, xmm7 // Y2 := (Y2 * Z) + SINCOF_P2
  mulss xmm3, xmm6 // Y2 * Z
  mulss xmm3, xmm0 // Y2 * (Z * X)
  addss xmm3, xmm0 // Y2 := Y2 * (Z * X) + X
  andps xmm3, xmm5 // ((J-2) and 2) = 0)? Yes: Y2, No: 0
  andnps xmm5, xmm2 // ((J-2) and 2) = 0)? Yes: 0 , No: Y
  addss xmm3, xmm5
  xorps xmm3, xmm1 // (Y + Y2) xor SignBit
  movss xmm0, xmm3 // result returned in XMM0

  // Restore nonvolatile registers.
  movdqa xmm6, [rsp-24]
  movdqa xmm7, [rsp-40]
end;
591
{ Fast approximate cosine of both components of a 2-component vector.
  Two-lane version of the scalar FastCos algorithm; the 8-byte result is
  returned in RAX. }
function FastCos(const ARadians: TVector2): TVector2; assembler;
asm
  // Save nonvolatile xmm6/xmm7 below RSP (leaf routine; NOTE(review):
  // Win64 has no red zone — confirm safety if calls are ever added).
  movdqa [rsp-24], xmm6
  movdqa [rsp-40], xmm7

  {$IF RTLVersion >= 33}
  // Delphi 10.3+ passes an 8-byte record by reference.
  movlps xmm0, [ARadians]
  {$ELSE}
  // Pre-10.3: 8-byte record passed by value in a register.
  movq xmm0, ARadians
  {$ENDIF}
  movlps xmm1, QWORD [SSE_MASK_ABS_VAL]
  movlps xmm2, QWORD [SSE_FOPI]
  andps xmm0, xmm1 // (xmm0) X := Abs(ARadians)
  movlps xmm3, QWORD [SSE_INT_NOT_ONE]
  movaps xmm1, xmm0
  movlps xmm4, QWORD [SSE_INT_FOUR]
  mulps xmm1, xmm2
  movlps xmm2, QWORD [SSE_INT_ONE]
  cvtps2dq xmm1, xmm1 // J := Trunc(X * FOPI)
  pxor xmm6, xmm6
  paddd xmm1, xmm2
  pand xmm1, xmm3 // (xmm1) J := (J + 1) and (not 1)
  movlps xmm3, QWORD [SSE_INT_TWO]
  cvtdq2ps xmm2, xmm1 // (xmm2) Y := J
  psubd xmm1, xmm3 // J - 2
  movaps xmm5, xmm1
  pandn xmm1, xmm4 // (not (J - 2)) and 4
  pand xmm5, xmm3 // (J - 2) and 2
  pslld xmm1, 29 // (xmm1) SignBit := ((not (J - 2)) and 4) shl 29
  movlps xmm3, QWORD [SSE_PI_OVER_4]
  pcmpeqd xmm5, xmm6 // (xmm5) PolyMask := ((J-2) and 2)=0)? Yes: $FFFFFFFF, No: $00000000
  mulps xmm2, xmm3 // Y * Pi / 4
  movlps xmm3, QWORD [SSE_COSCOF_P1]
  subps xmm0, xmm2 // (xmm0) X := X - (Y * Pi / 4)
  movlps xmm2, QWORD [SSE_COSCOF_P0]
  movlps xmm4, QWORD [SSE_COSCOF_P2]
  movaps xmm6, xmm0
  mulps xmm6, xmm6 // (xmm6) Z := X * X
  mulps xmm2, xmm6 // COSCOF_P0 * Z
  addps xmm2, xmm3 // Y := COSCOF_P0 * Z + COSCOF_P1
  movlps xmm3, QWORD [SSE_ONE_HALF]
  mulps xmm2, xmm6 // Y * Z
  mulps xmm3, xmm6 // Z * 0.5
  addps xmm2, xmm4 // Y := (Y * Z) + COSCOF_P2
  movlps xmm7, QWORD [SSE_ONE]
  mulps xmm2, xmm6
  movlps xmm4, QWORD [SSE_SINCOF_P1]
  mulps xmm2, xmm6 // Y := Y * (Z * Z)
  subps xmm2, xmm3 // Y - Z * 0.5
  addps xmm2, xmm7 // (xmm2) Y := Y - Z * 0.5 + 1
  movlps xmm3, QWORD [SSE_SINCOF_P0]
  movlps xmm7, QWORD [SSE_SINCOF_P2]
  mulps xmm3, xmm6 // SINCOF_P0 * Z
  addps xmm3, xmm4 // Y2 := SINCOF_P0 * Z + SINCOF_P1
  mulps xmm3, xmm6 // Y2 * Z
  addps xmm3, xmm7 // Y2 := (Y2 * Z) + SINCOF_P2
  mulps xmm3, xmm6 // Y2 * Z
  mulps xmm3, xmm0 // Y2 * (Z * X)
  addps xmm3, xmm0 // Y2 := Y2 * (Z * X) + X
  andps xmm3, xmm5 // ((J-2) and 2) = 0)? Yes: Y2, No: 0
  andnps xmm5, xmm2 // ((J-2) and 2) = 0)? Yes: 0 , No: Y
  addps xmm3, xmm5
  xorps xmm3, xmm1 // (Y + Y2) xor SignBit
  movq rax, xmm3 // 8-byte record result returned in RAX

  // Restore nonvolatile registers.
  movdqa xmm6, [rsp-24]
  movdqa xmm7, [rsp-40]
end;
660
{ Fast approximate cosine of all components of a 3-component vector.
  X,Y,Z are packed into one XMM register (Z duplicated into the high half)
  so the scalar FastCos algorithm runs once for all three. }
function FastCos(const ARadians: TVector3): TVector3; assembler;
asm
  // Save nonvolatile xmm6/xmm7 below RSP (leaf routine; NOTE(review):
  // Win64 has no red zone — confirm safety if calls are ever added).
  movdqa [rsp-24], xmm6
  movdqa [rsp-40], xmm7

  movq xmm0, [ARadians]            // load X,Y
  movss xmm1, DWORD [ARadians+8]   // load Z
  movlhps xmm0, xmm1               // pack Z into the high half
  movups xmm1, [SSE_MASK_ABS_VAL]
  movups xmm2, [SSE_FOPI]
  andps xmm0, xmm1 // (xmm0) X := Abs(ARadians)
  movups xmm3, [SSE_INT_NOT_ONE]
  movaps xmm1, xmm0
  movups xmm4, [SSE_INT_FOUR]
  mulps xmm1, xmm2
  movups xmm2, [SSE_INT_ONE]
  cvtps2dq xmm1, xmm1 // J := Trunc(X * FOPI)
  pxor xmm6, xmm6
  paddd xmm1, xmm2
  pand xmm1, xmm3 // (xmm1) J := (J + 1) and (not 1)
  movups xmm3, [SSE_INT_TWO]
  cvtdq2ps xmm2, xmm1 // (xmm2) Y := J
  psubd xmm1, xmm3 // J - 2
  movaps xmm5, xmm1
  pandn xmm1, xmm4 // (not (J - 2)) and 4
  pand xmm5, xmm3 // (J - 2) and 2
  pslld xmm1, 29 // (xmm1) SignBit := ((not (J - 2)) and 4) shl 29
  movups xmm3, [SSE_PI_OVER_4]
  pcmpeqd xmm5, xmm6 // (xmm5) PolyMask := ((J-2) and 2)=0)? Yes: $FFFFFFFF, No: $00000000
  mulps xmm2, xmm3 // Y * Pi / 4
  movups xmm3, [SSE_COSCOF_P1]
  subps xmm0, xmm2 // (xmm0) X := X - (Y * Pi / 4)
  movups xmm2, [SSE_COSCOF_P0]
  movups xmm4, [SSE_COSCOF_P2]
  movaps xmm6, xmm0
  mulps xmm6, xmm6 // (xmm6) Z := X * X
  mulps xmm2, xmm6 // COSCOF_P0 * Z
  addps xmm2, xmm3 // Y := COSCOF_P0 * Z + COSCOF_P1
  movups xmm3, [SSE_ONE_HALF]
  mulps xmm2, xmm6 // Y * Z
  mulps xmm3, xmm6 // Z * 0.5
  addps xmm2, xmm4 // Y := (Y * Z) + COSCOF_P2
  movups xmm7, [SSE_ONE]
  mulps xmm2, xmm6
  movups xmm4, [SSE_SINCOF_P1]
  mulps xmm2, xmm6 // Y := Y * (Z * Z)
  subps xmm2, xmm3 // Y - Z * 0.5
  addps xmm2, xmm7 // (xmm2) Y := Y - Z * 0.5 + 1
  movups xmm3, [SSE_SINCOF_P0]
  movups xmm7, [SSE_SINCOF_P2]
  mulps xmm3, xmm6 // SINCOF_P0 * Z
  addps xmm3, xmm4 // Y2 := SINCOF_P0 * Z + SINCOF_P1
  mulps xmm3, xmm6 // Y2 * Z
  addps xmm3, xmm7 // Y2 := (Y2 * Z) + SINCOF_P2
  mulps xmm3, xmm6 // Y2 * Z
  mulps xmm3, xmm0 // Y2 * (Z * X)
  addps xmm3, xmm0 // Y2 := Y2 * (Z * X) + X
  andps xmm3, xmm5 // ((J-2) and 2) = 0)? Yes: Y2, No: 0
  andnps xmm5, xmm2 // ((J-2) and 2) = 0)? Yes: 0 , No: Y
  addps xmm3, xmm5
  xorps xmm3, xmm1 // (Y + Y2) xor SignBit
  movhlps xmm4, xmm3               // bring Z's result back to the low lane
  movq [Result], xmm3
  movss DWORD [Result+8], xmm4

  // Restore nonvolatile registers.
  movdqa xmm6, [rsp-24]
  movdqa xmm7, [rsp-40]
end;
729
{ Fast approximate cosine of all components of a 4-component vector.
  Full 4-lane version of the scalar FastCos algorithm. }
function FastCos(const ARadians: TVector4): TVector4; assembler;
asm
  // Save nonvolatile xmm6/xmm7 below RSP (leaf routine; NOTE(review):
  // Win64 has no red zone — confirm safety if calls are ever added).
  movdqa [rsp-24], xmm6
  movdqa [rsp-40], xmm7

  movups xmm0, [ARadians]          // unaligned load of all 4 components
  movups xmm1, [SSE_MASK_ABS_VAL]
  movups xmm2, [SSE_FOPI]
  andps xmm0, xmm1 // (xmm0) X := Abs(ARadians)
  movups xmm3, [SSE_INT_NOT_ONE]
  movaps xmm1, xmm0
  movups xmm4, [SSE_INT_FOUR]
  mulps xmm1, xmm2
  movups xmm2, [SSE_INT_ONE]
  cvtps2dq xmm1, xmm1 // J := Trunc(X * FOPI)
  pxor xmm6, xmm6
  paddd xmm1, xmm2
  pand xmm1, xmm3 // (xmm1) J := (J + 1) and (not 1)
  movups xmm3, [SSE_INT_TWO]
  cvtdq2ps xmm2, xmm1 // (xmm2) Y := J
  psubd xmm1, xmm3 // J - 2
  movaps xmm5, xmm1
  pandn xmm1, xmm4 // (not (J - 2)) and 4
  pand xmm5, xmm3 // (J - 2) and 2
  pslld xmm1, 29 // (xmm1) SignBit := ((not (J - 2)) and 4) shl 29
  movups xmm3, [SSE_PI_OVER_4]
  pcmpeqd xmm5, xmm6 // (xmm5) PolyMask := ((J-2) and 2)=0)? Yes: $FFFFFFFF, No: $00000000
  mulps xmm2, xmm3 // Y * Pi / 4
  movups xmm3, [SSE_COSCOF_P1]
  subps xmm0, xmm2 // (xmm0) X := X - (Y * Pi / 4)
  movups xmm2, [SSE_COSCOF_P0]
  movups xmm4, [SSE_COSCOF_P2]
  movaps xmm6, xmm0
  mulps xmm6, xmm6 // (xmm6) Z := X * X
  mulps xmm2, xmm6 // COSCOF_P0 * Z
  addps xmm2, xmm3 // Y := COSCOF_P0 * Z + COSCOF_P1
  movups xmm3, [SSE_ONE_HALF]
  mulps xmm2, xmm6 // Y * Z
  mulps xmm3, xmm6 // Z * 0.5
  addps xmm2, xmm4 // Y := (Y * Z) + COSCOF_P2
  movups xmm7, [SSE_ONE]
  mulps xmm2, xmm6
  movups xmm4, [SSE_SINCOF_P1]
  mulps xmm2, xmm6 // Y := Y * (Z * Z)
  subps xmm2, xmm3 // Y - Z * 0.5
  addps xmm2, xmm7 // (xmm2) Y := Y - Z * 0.5 + 1
  movups xmm3, [SSE_SINCOF_P0]
  movups xmm7, [SSE_SINCOF_P2]
  mulps xmm3, xmm6 // SINCOF_P0 * Z
  addps xmm3, xmm4 // Y2 := SINCOF_P0 * Z + SINCOF_P1
  mulps xmm3, xmm6 // Y2 * Z
  addps xmm3, xmm7 // Y2 := (Y2 * Z) + SINCOF_P2
  mulps xmm3, xmm6 // Y2 * Z
  mulps xmm3, xmm0 // Y2 * (Z * X)
  addps xmm3, xmm0 // Y2 := Y2 * (Z * X) + X
  andps xmm3, xmm5 // ((J-2) and 2) = 0)? Yes: Y2, No: 0
  andnps xmm5, xmm2 // ((J-2) and 2) = 0)? Yes: 0 , No: Y
  addps xmm3, xmm5
  xorps xmm3, xmm1 // (Y + Y2) xor SignBit
  movups [Result], xmm3

  // Restore nonvolatile registers.
  movdqa xmm6, [rsp-24]
  movdqa xmm7, [rsp-40]
end;
794
{ Computes the approximate sine and cosine of ARadians simultaneously,
  sharing one range reduction and one pair of polynomial evaluations —
  cheaper than calling FastSin and FastCos separately.
  Out parameters:
    ASin: receives the approximate sine.
    ACos: receives the approximate cosine. }
procedure FastSinCos(const ARadians: Single; out ASin, ACos: Single); assembler;
asm
  // Save nonvolatile xmm6/xmm7 below RSP (leaf routine; NOTE(review):
  // Win64 has no red zone — confirm safety if calls are ever added).
  movdqa [rsp-24], xmm6
  movdqa [rsp-40], xmm7

  movss xmm2, DWORD [SSE_MASK_SIGN]
  movss xmm3, DWORD [SSE_MASK_ABS_VAL]
  movaps xmm1, xmm0
  pand xmm0, xmm3 // (xmm0) X := Abs(ARadians)
  pand xmm1, xmm2 // (xmm1) SignBitSin
  movaps xmm4, xmm0
  movss xmm5, DWORD [SSE_FOPI]
  movss xmm6, DWORD [SSE_INT_ONE]
  mulss xmm4, xmm5
  movss xmm7, DWORD [SSE_INT_NOT_ONE]
  cvtps2dq xmm4, xmm4 // (xmm4) J := Trunc(X * FOPI)
  movss xmm5, DWORD [SSE_INT_FOUR]
  paddd xmm4, xmm6
  pand xmm4, xmm7 // (xmm4) J := (J + 1) and (not 1)
  movss xmm7, DWORD [SSE_INT_TWO]
  cvtdq2ps xmm2, xmm4 // (xmm2) Y := J
  movaps xmm3, xmm4
  movaps xmm6, xmm4 // (xmm6) J
  pand xmm3, xmm5 // J and 4
  pand xmm4, xmm7 // J and 2
  pxor xmm5, xmm5
  pslld xmm3, 29 // (xmm3) SwapSignBitSin := (J and 4) shl 29
  movss xmm7, DWORD [SSE_PI_OVER_4]
  pcmpeqd xmm4, xmm5 // (xmm4) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
  mulss xmm2, xmm7 // Y * Pi / 4
  movss xmm5, DWORD [SSE_INT_TWO]
  subss xmm0, xmm2 // (xmm0) X := X - (Y * Pi / 4)
  psubd xmm6, xmm5 // J - 2
  movss xmm7, DWORD [SSE_INT_FOUR]
  pxor xmm1, xmm3 // (xmm1) SignBitSin := SignBitSin xor SwapSignBitSin
  andnps xmm6, xmm7 // (not (J - 2)) and 4
  movaps xmm3, xmm0
  pslld xmm6, 29 // (xmm6) SignBitCos := ((not (J - 2)) and 4) shl 29
  mulss xmm3, xmm3 // (xmm3) Z := X * X
  movss xmm2, DWORD [SSE_COSCOF_P0]
  movss xmm5, DWORD [SSE_COSCOF_P1]
  movss xmm7, DWORD [SSE_COSCOF_P2]
  mulss xmm2, xmm3 // COSCOF_P0 * Z
  addss xmm2, xmm5 // Y := COSCOF_P0 * Z + COSCOF_P1
  movss xmm5, DWORD [SSE_ONE_HALF]
  mulss xmm2, xmm3 // Y * Z
  addss xmm2, xmm7 // Y := (Y * Z) + COSCOF_P2
  movss xmm7, DWORD [SSE_ONE]
  mulss xmm2, xmm3 // Y * Z
  mulss xmm5, xmm3 // 0.5 * Z
  mulss xmm2, xmm3 // Y * (Z * Z)
  subss xmm2, xmm5 // Y - 0.5 * Z
  movss xmm5, DWORD [SSE_SINCOF_P0]
  addss xmm2, xmm7 // (xmm2) Y := Y - 0.5 * Z + 1
  movss xmm7, DWORD [SSE_SINCOF_P1]
  mulss xmm5, xmm3 // SINCOF_P0 * Z
  addss xmm5, xmm7 // Y2 := SINCOF_P0 * Z + SINCOF_P1
  mulss xmm5, xmm3 // Y2 * Z
  movss xmm7, DWORD [SSE_SINCOF_P2]
  addss xmm5, xmm7 // Y2 := Y2 * Z + SINCOF_P2
  mulss xmm5, xmm3 // Y2 * Z
  mulss xmm5, xmm0 // Y2 * (Z * X)
  addss xmm5, xmm0 // (xmm5) Y2 := Y2 * (Z * X) + X
  // Select/blend: sine takes one polynomial, cosine the other, per octant.
  movaps xmm0, xmm2 // Y
  movaps xmm3, xmm5 // Y2
  andps xmm5, xmm4 // ((J and 2) = 0)? Yes: Y2, No: 0
  andnps xmm4, xmm2 // ((J and 2) = 0)? Yes: 0 , No: Y
  subss xmm3, xmm5 // ((J and 2) = 0)? Yes: 0 , No: Y2
  subss xmm0, xmm4 // ((J and 2) = 0)? Yes: Y , No: 0
  addps xmm4, xmm5 // ((J and 2) = 0)? Yes: Y2, No: Y
  addps xmm3, xmm0 // ((J and 2) = 0)? Yes: Y , No: Y2
  xorps xmm4, xmm1 // Sin
  xorps xmm3, xmm6 // Cos
  movss [ASin], xmm4 // out parameters are written through their addresses
  movss [ACos], xmm3

  // Restore nonvolatile registers.
  movdqa xmm6, [rsp-24]
  movdqa xmm7, [rsp-40]
end;
874
{ Computes Sin and Cos of both elements of ARadians in one pass (SSE2).
  Cephes-style algorithm: range-reduce by Pi/4 octants (J := Trunc(X * 4/Pi),
  forced even via (J + 1) and (not 1)), evaluate the COSCOF/SINCOF minimax
  polynomials on Z := X*X, then select and sign-correct per octant.
  NOTE(review): xmm6/xmm7 are callee-saved on Win64 but are spilled to
  [rsp-24]/[rsp-40], i.e. BELOW rsp; Win64 has no red zone, so confirm these
  slots cannot be clobbered asynchronously (pattern used throughout this file). }
procedure FastSinCos(const ARadians: TVector2; out ASin, ACos: TVector2); assembler;
asm
  // Preserve the nonvolatile XMM registers we use (see ABI note at top of file)
  movdqa [rsp-24], xmm6
  movdqa [rsp-40], xmm7

  { Delphi >= 10.3 (Rio) passes an 8-byte record by address; older versions
    pass it in the register itself (see the x64 calling convention note). }
{$IF RTLVersion >= 33}
  movlps xmm0, [ARadians]
{$ELSE}
  movq xmm0, ARadians
{$ENDIF}
  movlps xmm2, QWORD [SSE_MASK_SIGN]
  movlps xmm3, QWORD [SSE_MASK_ABS_VAL]
  movaps xmm1, xmm0
  pand xmm0, xmm3              // (xmm0) X := Abs(ARadians)
  pand xmm1, xmm2              // (xmm1) SignBitSin
  movaps xmm4, xmm0
  movlps xmm5, QWORD [SSE_FOPI]
  movlps xmm6, QWORD [SSE_INT_ONE]
  mulps xmm4, xmm5
  movlps xmm7, QWORD [SSE_INT_NOT_ONE]
  cvtps2dq xmm4, xmm4          // (xmm4) J := Trunc(X * FOPI)
  movlps xmm5, QWORD [SSE_INT_FOUR]
  paddd xmm4, xmm6
  pand xmm4, xmm7              // (xmm4) J := (J + 1) and (not 1)
  movlps xmm7, QWORD [SSE_INT_TWO]
  cvtdq2ps xmm2, xmm4          // (xmm2) Y := J
  movaps xmm3, xmm4
  movaps xmm6, xmm4            // (xmm6) J
  pand xmm3, xmm5              // J and 4
  pand xmm4, xmm7              // J and 2
  pxor xmm5, xmm5
  pslld xmm3, 29               // (xmm3) SwapSignBitSin := (J and 4) shl 29
  movlps xmm7, QWORD [SSE_PI_OVER_4]
  pcmpeqd xmm4, xmm5           // (xmm4) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
  mulps xmm2, xmm7             // Y * Pi / 4
  movlps xmm5, QWORD [SSE_INT_TWO]
  subps xmm0, xmm2             // (xmm0) X := X - (Y * Pi / 4)
  psubd xmm6, xmm5             // J - 2
  movlps xmm7, QWORD [SSE_INT_FOUR]
  pxor xmm1, xmm3              // (xmm1) SignBitSin := SignBitSin xor SwapSignBitSin
  andnps xmm6, xmm7            // (not (J - 2)) and 4
  movaps xmm3, xmm0
  pslld xmm6, 29               // (xmm6) SignBitCos := ((not (J - 2)) and 4) shl 29
  mulps xmm3, xmm3             // (xmm3) Z := X * X

  // Cosine polynomial: Y := ((COSCOF_P0*Z + COSCOF_P1)*Z + COSCOF_P2)*Z*Z - 0.5*Z + 1
  movlps xmm2, QWORD [SSE_COSCOF_P0]
  movlps xmm5, QWORD [SSE_COSCOF_P1]
  movlps xmm7, QWORD [SSE_COSCOF_P2]
  mulps xmm2, xmm3             // COSCOF_P0 * Z
  addps xmm2, xmm5             // Y := COSCOF_P0 * Z + COSCOF_P1
  movlps xmm5, QWORD [SSE_ONE_HALF]
  mulps xmm2, xmm3             // Y * Z
  addps xmm2, xmm7             // Y := (Y * Z) + COSCOF_P2
  movlps xmm7, QWORD [SSE_ONE]
  mulps xmm2, xmm3             // Y * Z
  mulps xmm5, xmm3             // 0.5 * Z
  mulps xmm2, xmm3             // Y * (Z * Z)
  subps xmm2, xmm5             // Y - 0.5 * Z
  movlps xmm5, QWORD [SSE_SINCOF_P0]
  addps xmm2, xmm7             // (xmm2) Y := Y - 0.5 * Z + 1

  // Sine polynomial: Y2 := ((SINCOF_P0*Z + SINCOF_P1)*Z + SINCOF_P2)*Z*X + X
  movlps xmm7, QWORD [SSE_SINCOF_P1]
  mulps xmm5, xmm3             // SINCOF_P0 * Z
  addps xmm5, xmm7             // Y2 := SINCOF_P0 * Z + SINCOF_P1
  mulps xmm5, xmm3             // Y2 * Z
  movlps xmm7, QWORD [SSE_SINCOF_P2]
  addps xmm5, xmm7             // Y2 := Y2 * Z + SINCOF_P2
  mulps xmm5, xmm3             // Y2 * Z
  mulps xmm5, xmm0             // Y2 * (Z * X)
  addps xmm5, xmm0             // (xmm5) Y2 := Y2 * (Z * X) + X

  // Per-octant select between the two polynomials, then apply sign bits
  movaps xmm0, xmm2            // Y
  movaps xmm3, xmm5            // Y2
  andps xmm5, xmm4             // ((J and 2) = 0)? Yes: Y2, No: 0
  andnps xmm4, xmm2            // ((J and 2) = 0)? Yes: 0 , No: Y
  subps xmm3, xmm5             // ((J and 2) = 0)? Yes: 0 , No: Y2
  subps xmm0, xmm4             // ((J and 2) = 0)? Yes: Y , No: 0
  addps xmm4, xmm5             // ((J and 2) = 0)? Yes: Y2, No: Y
  addps xmm3, xmm0             // ((J and 2) = 0)? Yes: Y , No: Y2
  xorps xmm4, xmm1             // Sin
  xorps xmm3, xmm6             // Cos
  movlps [ASin], xmm4
  movlps [ACos], xmm3

  // Restore the preserved XMM registers
  movdqa xmm6, [rsp-24]
  movdqa xmm7, [rsp-40]
end;
959
{ Computes Sin and Cos of all three elements of ARadians in one pass (SSE2).
  Same Cephes-style octant reduction and COSCOF/SINCOF polynomials as the
  other FastSinCos overloads; the 3-vector is loaded as 8 bytes (X,Y) plus
  4 bytes (Z) and stored the same way.
  NOTE(review): xmm6/xmm7 are spilled below RSP ([rsp-24]/[rsp-40]); Win64
  has no red zone — verify these slots are safe from async clobbering. }
procedure FastSinCos(const ARadians: TVector3; out ASin, ACos: TVector3); assembler;
asm
  // Preserve the nonvolatile XMM registers we use
  movdqa [rsp-24], xmm6
  movdqa [rsp-40], xmm7

  // Assemble the 12-byte vector into one XMM register: low quad = X,Y; high = Z
  movq xmm0, [ARadians]
  movss xmm1, DWORD [ARadians+8]
  movlhps xmm0, xmm1
  movups xmm2, [SSE_MASK_SIGN]
  movups xmm3, [SSE_MASK_ABS_VAL]
  movaps xmm1, xmm0
  pand xmm0, xmm3              // (xmm0) X := Abs(ARadians)
  pand xmm1, xmm2              // (xmm1) SignBitSin
  movaps xmm4, xmm0
  movups xmm5, [SSE_FOPI]
  movups xmm6, [SSE_INT_ONE]
  mulps xmm4, xmm5
  movups xmm7, [SSE_INT_NOT_ONE]
  cvtps2dq xmm4, xmm4          // (xmm4) J := Trunc(X * FOPI)
  movups xmm5, [SSE_INT_FOUR]
  paddd xmm4, xmm6
  pand xmm4, xmm7              // (xmm4) J := (J + 1) and (not 1)
  movups xmm7, [SSE_INT_TWO]
  cvtdq2ps xmm2, xmm4          // (xmm2) Y := J
  movaps xmm3, xmm4
  movaps xmm6, xmm4            // (xmm6) J
  pand xmm3, xmm5              // J and 4
  pand xmm4, xmm7              // J and 2
  pxor xmm5, xmm5
  pslld xmm3, 29               // (xmm3) SwapSignBitSin := (J and 4) shl 29
  movups xmm7, [SSE_PI_OVER_4]
  pcmpeqd xmm4, xmm5           // (xmm4) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
  mulps xmm2, xmm7             // Y * Pi / 4
  movups xmm5, [SSE_INT_TWO]
  subps xmm0, xmm2             // (xmm0) X := X - (Y * Pi / 4)
  psubd xmm6, xmm5             // J - 2
  movups xmm7, [SSE_INT_FOUR]
  pxor xmm1, xmm3              // (xmm1) SignBitSin := SignBitSin xor SwapSignBitSin
  andnps xmm6, xmm7            // (not (J - 2)) and 4
  movaps xmm3, xmm0
  pslld xmm6, 29               // (xmm6) SignBitCos := ((not (J - 2)) and 4) shl 29
  mulps xmm3, xmm3             // (xmm3) Z := X * X

  // Cosine polynomial on Z
  movups xmm2, [SSE_COSCOF_P0]
  movups xmm5, [SSE_COSCOF_P1]
  movups xmm7, [SSE_COSCOF_P2]
  mulps xmm2, xmm3             // COSCOF_P0 * Z
  addps xmm2, xmm5             // Y := COSCOF_P0 * Z + COSCOF_P1
  movups xmm5, [SSE_ONE_HALF]
  mulps xmm2, xmm3             // Y * Z
  addps xmm2, xmm7             // Y := (Y * Z) + COSCOF_P2
  movups xmm7, [SSE_ONE]
  mulps xmm2, xmm3             // Y * Z
  mulps xmm5, xmm3             // 0.5 * Z
  mulps xmm2, xmm3             // Y * (Z * Z)
  subps xmm2, xmm5             // Y - 0.5 * Z
  movups xmm5, [SSE_SINCOF_P0]
  addps xmm2, xmm7             // (xmm2) Y := Y - 0.5 * Z + 1

  // Sine polynomial on Z
  movups xmm7, [SSE_SINCOF_P1]
  mulps xmm5, xmm3             // SINCOF_P0 * Z
  addps xmm5, xmm7             // Y2 := SINCOF_P0 * Z + SINCOF_P1
  mulps xmm5, xmm3             // Y2 * Z
  movups xmm7, [SSE_SINCOF_P2]
  addps xmm5, xmm7             // Y2 := Y2 * Z + SINCOF_P2
  mulps xmm5, xmm3             // Y2 * Z
  mulps xmm5, xmm0             // Y2 * (Z * X)
  addps xmm5, xmm0             // (xmm5) Y2 := Y2 * (Z * X) + X

  // Per-octant select, then apply sign bits
  movaps xmm0, xmm2            // Y
  movaps xmm3, xmm5            // Y2
  andps xmm5, xmm4             // ((J and 2) = 0)? Yes: Y2, No: 0
  andnps xmm4, xmm2            // ((J and 2) = 0)? Yes: 0 , No: Y
  subps xmm3, xmm5             // ((J and 2) = 0)? Yes: 0 , No: Y2
  subps xmm0, xmm4             // ((J and 2) = 0)? Yes: Y , No: 0
  addps xmm4, xmm5             // ((J and 2) = 0)? Yes: Y2, No: Y
  addps xmm3, xmm0             // ((J and 2) = 0)? Yes: Y , No: Y2
  xorps xmm4, xmm1             // Sin
  xorps xmm3, xmm6             // Cos

  // Store 12 bytes per result: 8-byte low quad plus the high (third) element
  movhlps xmm5, xmm4
  movhlps xmm2, xmm3
  movq [ASin], xmm4
  movss DWORD [ASin+8], xmm5
  movq [ACos], xmm3
  movss DWORD [ACos+8], xmm2

  // Restore the preserved XMM registers
  movdqa xmm6, [rsp-24]
  movdqa xmm7, [rsp-40]
end;
1046
{ Computes Sin and Cos of all four elements of ARadians in one pass (SSE2).
  Same Cephes-style octant reduction and COSCOF/SINCOF polynomials as the
  other FastSinCos overloads; full 16-byte loads/stores via movups.
  NOTE(review): xmm6/xmm7 are spilled below RSP ([rsp-24]/[rsp-40]); Win64
  has no red zone — verify these slots are safe from async clobbering. }
procedure FastSinCos(const ARadians: TVector4; out ASin, ACos: TVector4); assembler;
asm
  // Preserve the nonvolatile XMM registers we use
  movdqa [rsp-24], xmm6
  movdqa [rsp-40], xmm7

  movups xmm0, [ARadians]
  movups xmm2, [SSE_MASK_SIGN]
  movups xmm3, [SSE_MASK_ABS_VAL]
  movaps xmm1, xmm0
  pand xmm0, xmm3              // (xmm0) X := Abs(ARadians)
  pand xmm1, xmm2              // (xmm1) SignBitSin
  movaps xmm4, xmm0
  movups xmm5, [SSE_FOPI]
  movups xmm6, [SSE_INT_ONE]
  mulps xmm4, xmm5
  movups xmm7, [SSE_INT_NOT_ONE]
  cvtps2dq xmm4, xmm4          // (xmm4) J := Trunc(X * FOPI)
  movups xmm5, [SSE_INT_FOUR]
  paddd xmm4, xmm6
  pand xmm4, xmm7              // (xmm4) J := (J + 1) and (not 1)
  movups xmm7, [SSE_INT_TWO]
  cvtdq2ps xmm2, xmm4          // (xmm2) Y := J
  movaps xmm3, xmm4
  movaps xmm6, xmm4            // (xmm6) J
  pand xmm3, xmm5              // J and 4
  pand xmm4, xmm7              // J and 2
  pxor xmm5, xmm5
  pslld xmm3, 29               // (xmm3) SwapSignBitSin := (J and 4) shl 29
  movups xmm7, [SSE_PI_OVER_4]
  pcmpeqd xmm4, xmm5           // (xmm4) PolyMask := ((J and 2) = 0)? Yes: $FFFFFFFF, No: $00000000
  mulps xmm2, xmm7             // Y * Pi / 4
  movups xmm5, [SSE_INT_TWO]
  subps xmm0, xmm2             // (xmm0) X := X - (Y * Pi / 4)
  psubd xmm6, xmm5             // J - 2
  movups xmm7, [SSE_INT_FOUR]
  pxor xmm1, xmm3              // (xmm1) SignBitSin := SignBitSin xor SwapSignBitSin
  andnps xmm6, xmm7            // (not (J - 2)) and 4
  movaps xmm3, xmm0
  pslld xmm6, 29               // (xmm6) SignBitCos := ((not (J - 2)) and 4) shl 29
  mulps xmm3, xmm3             // (xmm3) Z := X * X

  // Cosine polynomial on Z
  movups xmm2, [SSE_COSCOF_P0]
  movups xmm5, [SSE_COSCOF_P1]
  movups xmm7, [SSE_COSCOF_P2]
  mulps xmm2, xmm3             // COSCOF_P0 * Z
  addps xmm2, xmm5             // Y := COSCOF_P0 * Z + COSCOF_P1
  movups xmm5, [SSE_ONE_HALF]
  mulps xmm2, xmm3             // Y * Z
  addps xmm2, xmm7             // Y := (Y * Z) + COSCOF_P2
  movups xmm7, [SSE_ONE]
  mulps xmm2, xmm3             // Y * Z
  mulps xmm5, xmm3             // 0.5 * Z
  mulps xmm2, xmm3             // Y * (Z * Z)
  subps xmm2, xmm5             // Y - 0.5 * Z
  movups xmm5, [SSE_SINCOF_P0]
  addps xmm2, xmm7             // (xmm2) Y := Y - 0.5 * Z + 1

  // Sine polynomial on Z
  movups xmm7, [SSE_SINCOF_P1]
  mulps xmm5, xmm3             // SINCOF_P0 * Z
  addps xmm5, xmm7             // Y2 := SINCOF_P0 * Z + SINCOF_P1
  mulps xmm5, xmm3             // Y2 * Z
  movups xmm7, [SSE_SINCOF_P2]
  addps xmm5, xmm7             // Y2 := Y2 * Z + SINCOF_P2
  mulps xmm5, xmm3             // Y2 * Z
  mulps xmm5, xmm0             // Y2 * (Z * X)
  addps xmm5, xmm0             // (xmm5) Y2 := Y2 * (Z * X) + X

  // Per-octant select, then apply sign bits
  movaps xmm0, xmm2            // Y
  movaps xmm3, xmm5            // Y2
  andps xmm5, xmm4             // ((J and 2) = 0)? Yes: Y2, No: 0
  andnps xmm4, xmm2            // ((J and 2) = 0)? Yes: 0 , No: Y
  subps xmm3, xmm5             // ((J and 2) = 0)? Yes: 0 , No: Y2
  subps xmm0, xmm4             // ((J and 2) = 0)? Yes: Y , No: 0
  addps xmm4, xmm5             // ((J and 2) = 0)? Yes: Y2, No: Y
  addps xmm3, xmm0             // ((J and 2) = 0)? Yes: Y , No: Y2
  xorps xmm4, xmm1             // Sin
  xorps xmm3, xmm6             // Cos
  movups [ASin], xmm4
  movups [ACos], xmm3

  // Restore the preserved XMM registers
  movdqa xmm6, [rsp-24]
  movdqa xmm7, [rsp-40]
end;
1127
{ Fast approximation of e^A (scalar, SSE2).
  Builds the IEEE-754 bit pattern of the result directly:
  Val := 12102203.1615614 * A + 1065353216.0 (approx. 2^23/Ln(2) scaling plus
  the integer bit pattern of 1.0), clamps Val to EXP_CST, truncates to an
  integer, then splits exponent/fraction bits and refines the mantissa with a
  degree-4 polynomial in B (coefficients SSE_EXP_F1..F5).
  NOTE(review): xmm6/xmm7 are spilled below RSP ([rsp-24]/[rsp-40]); Win64
  has no red zone — verify these slots are safe from async clobbering. }
function FastExp(const A: Single): Single; assembler;
asm
  // Preserve the nonvolatile XMM registers we use
  movdqa [rsp-24], xmm6
  movdqa [rsp-40], xmm7

  movss xmm1, DWORD [SSE_EXP_A1]
  movss xmm2, DWORD [SSE_EXP_A2]

  // Val := 12102203.1615614 * A + 1065353216.0
  mulss xmm0, xmm1
  movss xmm3, DWORD [SSE_EXP_CST]
  addss xmm0, xmm2

  // if (Val >= EXP_CST) then Val := EXP_CST  (branchless clamp)
  movss xmm1, xmm0
  cmpltss xmm0, xmm3           // (Val < EXP_CST)? Yes: $FFFFFFFF, No: $00000000
  andps xmm1, xmm0             // (Val < EXP_CST)? Yes: Val, No: 0
  andnps xmm0, xmm3            // (Val < EXP_CST)? Yes: 0, No: EXP_CST
  orps xmm0, xmm1              // (Val < EXP_CST)? Yes: Val, No: EXP_CST

  // IVal := Trunc(Val)
  xorps xmm3, xmm3
  cvtps2dq xmm1, xmm0

  // if (IVal < 0) then I := 0  (branchless floor at zero)
  movss xmm2, DWORD [SSE_MASK_EXPONENT]
  movdqa xmm0, xmm1            // IVal
  pcmpgtd xmm1, xmm3           // (IVal > 0)? Yes: $FFFFFFFF, No: $00000000
  movss xmm3, DWORD [SSE_MASK_FRACTION]
  pand xmm0, xmm1              // (IVal > 0)? Yes: IVal, No: 0

  // XU.I := IVal and $7F800000  (exponent bits only)
  movss xmm4, DWORD [SSE_EXP_I1]
  movss xmm1, xmm0
  pand xmm0, xmm2              // XU.I / XU.S

  // XU2.I := (IVal and $007FFFFF) or $3F800000  (fraction re-based to [1,2))
  pand xmm1, xmm3
  movss xmm6, DWORD [SSE_EXP_F5]
  por xmm1, xmm4               // XU2.I / XU2.S

  // Result := XU.S *
  //   ( 0.509964287281036376953125 + B *
  //   ( 0.3120158612728118896484375 + B *
  //   ( 0.1666135489940643310546875 + B *
  //   (-2.12528370320796966552734375e-3 + B *
  //     1.3534179888665676116943359375e-2))));
  movss xmm5, DWORD [SSE_EXP_F4]
  movss xmm7, xmm1             // B (kept for the Horner steps below)

  mulss xmm1, xmm6
  movss xmm4, DWORD [SSE_EXP_F3]
  addss xmm1, xmm5
  movss xmm3, DWORD [SSE_EXP_F2]
  mulss xmm1, xmm7
  movss xmm2, DWORD [SSE_EXP_F1]
  addss xmm1, xmm4
  mulss xmm1, xmm7
  addss xmm1, xmm3
  mulss xmm1, xmm7
  addss xmm1, xmm2
  mulss xmm1, xmm0             // scale polynomial by the exponent part XU.S

  movss xmm0, xmm1             // scalar Single result is returned in XMM0

  // Restore the preserved XMM registers
  movdqa xmm6, [rsp-24]
  movdqa xmm7, [rsp-40]
end;
1196
{ Fast approximation of e^A for both elements of a TVector2 (SSE2).
  Same bit-trick algorithm as the scalar FastExp, vectorized with the packed
  (…ps) instruction forms and 64-bit constant loads.
  NOTE(review): the parameter load is RTLVersion-guarded but the result store
  (movq rax) is not; confirm 8-byte record results are still returned in RAX
  on Delphi >= 10.3 (see the calling-convention note at the top of the file).
  NOTE(review): xmm6/xmm7 are spilled below RSP ([rsp-24]/[rsp-40]); Win64
  has no red zone — verify these slots are safe from async clobbering. }
function FastExp(const A: TVector2): TVector2;
asm
  // Preserve the nonvolatile XMM registers we use
  movdqa [rsp-24], xmm6
  movdqa [rsp-40], xmm7

  { Delphi >= 10.3 (Rio) passes an 8-byte record by address }
{$IF RTLVersion >= 33}
  movlps xmm0, [A]
{$ELSE}
  movq xmm0, A
{$ENDIF}
  movlps xmm1, QWORD [SSE_EXP_A1]
  movlps xmm2, QWORD [SSE_EXP_A2]

  // Val := 12102203.1615614 * A + 1065353216.0
  mulps xmm0, xmm1
  movlps xmm3, QWORD [SSE_EXP_CST]
  addps xmm0, xmm2

  // if (Val >= EXP_CST) then Val := EXP_CST  (branchless clamp)
  movaps xmm1, xmm0
  cmpltps xmm0, xmm3           // (Val < EXP_CST)? Yes: $FFFFFFFF, No: $00000000
  andps xmm1, xmm0             // (Val < EXP_CST)? Yes: Val, No: 0
  andnps xmm0, xmm3            // (Val < EXP_CST)? Yes: 0, No: EXP_CST
  orps xmm0, xmm1              // (Val < EXP_CST)? Yes: Val, No: EXP_CST

  // IVal := Trunc(Val)
  xorps xmm3, xmm3
  cvtps2dq xmm1, xmm0

  // if (IVal < 0) then I := 0
  movlps xmm2, QWORD [SSE_MASK_EXPONENT]
  movdqa xmm0, xmm1            // IVal
  pcmpgtd xmm1, xmm3           // (IVal > 0)? Yes: $FFFFFFFF, No: $00000000
  movlps xmm3, QWORD [SSE_MASK_FRACTION]
  pand xmm0, xmm1              // (IVal > 0)? Yes: IVal, No: 0

  // XU.I := IVal and $7F800000
  movlps xmm4, QWORD [SSE_EXP_I1]
  movdqa xmm1, xmm0
  pand xmm0, xmm2              // XU.I / XU.S

  // XU2.I := (IVal and $007FFFFF) or $3F800000;
  pand xmm1, xmm3
  movlps xmm6, QWORD [SSE_EXP_F5]
  por xmm1, xmm4               // XU2.I / XU2.S

  // Result := XU.S *
  //   ( 0.509964287281036376953125 + B *
  //   ( 0.3120158612728118896484375 + B *
  //   ( 0.1666135489940643310546875 + B *
  //   (-2.12528370320796966552734375e-3 + B *
  //     1.3534179888665676116943359375e-2))));
  movlps xmm5, QWORD [SSE_EXP_F4]
  movaps xmm7, xmm1            // B (kept for the Horner steps below)

  mulps xmm1, xmm6
  movlps xmm4, QWORD [SSE_EXP_F3]
  addps xmm1, xmm5
  movlps xmm3, QWORD [SSE_EXP_F2]
  mulps xmm1, xmm7
  movlps xmm2, QWORD [SSE_EXP_F1]
  addps xmm1, xmm4
  mulps xmm1, xmm7
  addps xmm1, xmm3
  mulps xmm1, xmm7
  addps xmm1, xmm2
  mulps xmm1, xmm0             // scale polynomial by the exponent part XU.S

  movq rax, xmm1               // 8-byte record result returned in RAX

  // Restore the preserved XMM registers
  movdqa xmm6, [rsp-24]
  movdqa xmm7, [rsp-40]
end;
1270
{ Fast approximation of e^A for all three elements of a TVector3 (SSE2).
  Same bit-trick algorithm as the scalar FastExp; the 12-byte vector is
  loaded as 8 bytes plus 4 bytes and the result is stored the same way.
  NOTE(review): xmm6/xmm7 are spilled below RSP ([rsp-24]/[rsp-40]); Win64
  has no red zone — verify these slots are safe from async clobbering. }
function FastExp(const A: TVector3): TVector3;
asm
  // Preserve the nonvolatile XMM registers we use
  movdqa [rsp-24], xmm6
  movdqa [rsp-40], xmm7

  // Assemble the 12-byte vector: low quad = X,Y; high = Z
  movq xmm0, [A]
  movss xmm1, DWORD [A+8]
  movlhps xmm0, xmm1
  movups xmm1, [SSE_EXP_A1]
  movups xmm2, [SSE_EXP_A2]

  // Val := 12102203.1615614 * A + 1065353216.0
  mulps xmm0, xmm1
  movups xmm3, [SSE_EXP_CST]
  addps xmm0, xmm2

  // if (Val >= EXP_CST) then Val := EXP_CST  (branchless clamp)
  movaps xmm1, xmm0
  cmpltps xmm0, xmm3           // (Val < EXP_CST)? Yes: $FFFFFFFF, No: $00000000
  andps xmm1, xmm0             // (Val < EXP_CST)? Yes: Val, No: 0
  andnps xmm0, xmm3            // (Val < EXP_CST)? Yes: 0, No: EXP_CST
  orps xmm0, xmm1              // (Val < EXP_CST)? Yes: Val, No: EXP_CST

  // IVal := Trunc(Val)
  xorps xmm3, xmm3
  cvtps2dq xmm1, xmm0

  // if (IVal < 0) then I := 0
  movups xmm2, [SSE_MASK_EXPONENT]
  movdqa xmm0, xmm1            // IVal
  pcmpgtd xmm1, xmm3           // (IVal > 0)? Yes: $FFFFFFFF, No: $00000000
  movups xmm3, [SSE_MASK_FRACTION]
  pand xmm0, xmm1              // (IVal > 0)? Yes: IVal, No: 0

  // XU.I := IVal and $7F800000
  movups xmm4, [SSE_EXP_I1]
  movdqa xmm1, xmm0
  pand xmm0, xmm2              // XU.I / XU.S

  // XU2.I := (IVal and $007FFFFF) or $3F800000;
  pand xmm1, xmm3
  movups xmm6, [SSE_EXP_F5]
  por xmm1, xmm4               // XU2.I / XU2.S

  // Result := XU.S *
  //   ( 0.509964287281036376953125 + B *
  //   ( 0.3120158612728118896484375 + B *
  //   ( 0.1666135489940643310546875 + B *
  //   (-2.12528370320796966552734375e-3 + B *
  //     1.3534179888665676116943359375e-2))));
  movups xmm5, [SSE_EXP_F4]
  movaps xmm7, xmm1            // B (kept for the Horner steps below)

  mulps xmm1, xmm6
  movups xmm4, [SSE_EXP_F3]
  addps xmm1, xmm5
  movups xmm3, [SSE_EXP_F2]
  mulps xmm1, xmm7
  movups xmm2, [SSE_EXP_F1]
  addps xmm1, xmm4
  mulps xmm1, xmm7
  addps xmm1, xmm3
  mulps xmm1, xmm7
  addps xmm1, xmm2
  mulps xmm1, xmm0             // scale polynomial by the exponent part XU.S

  // Store 12 bytes: 8-byte low quad plus the high (third) element
  movhlps xmm0, xmm1
  movq [Result], xmm1
  movss DWORD [Result+8], xmm0

  // Restore the preserved XMM registers
  movdqa xmm6, [rsp-24]
  movdqa xmm7, [rsp-40]
end;
1344
{ Fast approximation of e^A for all four elements of a TVector4 (SSE2).
  Same bit-trick algorithm as the scalar FastExp; full 16-byte load/store.
  NOTE(review): xmm6/xmm7 are spilled below RSP ([rsp-24]/[rsp-40]); Win64
  has no red zone — verify these slots are safe from async clobbering. }
function FastExp(const A: TVector4): TVector4;
asm
  // Preserve the nonvolatile XMM registers we use
  movdqa [rsp-24], xmm6
  movdqa [rsp-40], xmm7

  movups xmm0, [A]
  movups xmm1, [SSE_EXP_A1]
  movups xmm2, [SSE_EXP_A2]

  // Val := 12102203.1615614 * A + 1065353216.0
  mulps xmm0, xmm1
  movups xmm3, [SSE_EXP_CST]
  addps xmm0, xmm2

  // if (Val >= EXP_CST) then Val := EXP_CST  (branchless clamp)
  movaps xmm1, xmm0
  cmpltps xmm0, xmm3           // (Val < EXP_CST)? Yes: $FFFFFFFF, No: $00000000
  andps xmm1, xmm0             // (Val < EXP_CST)? Yes: Val, No: 0
  andnps xmm0, xmm3            // (Val < EXP_CST)? Yes: 0, No: EXP_CST
  orps xmm0, xmm1              // (Val < EXP_CST)? Yes: Val, No: EXP_CST

  // IVal := Trunc(Val)
  xorps xmm3, xmm3
  cvtps2dq xmm1, xmm0

  // if (IVal < 0) then I := 0
  movups xmm2, [SSE_MASK_EXPONENT]
  movdqa xmm0, xmm1            // IVal
  pcmpgtd xmm1, xmm3           // (IVal > 0)? Yes: $FFFFFFFF, No: $00000000
  movups xmm3, [SSE_MASK_FRACTION]
  pand xmm0, xmm1              // (IVal > 0)? Yes: IVal, No: 0

  // XU.I := IVal and $7F800000
  movups xmm4, [SSE_EXP_I1]
  movdqa xmm1, xmm0
  pand xmm0, xmm2              // XU.I / XU.S

  // XU2.I := (IVal and $007FFFFF) or $3F800000;
  pand xmm1, xmm3
  movups xmm6, [SSE_EXP_F5]
  por xmm1, xmm4               // XU2.I / XU2.S

  // Result := XU.S *
  //   ( 0.509964287281036376953125 + B *
  //   ( 0.3120158612728118896484375 + B *
  //   ( 0.1666135489940643310546875 + B *
  //   (-2.12528370320796966552734375e-3 + B *
  //     1.3534179888665676116943359375e-2))));
  movups xmm5, [SSE_EXP_F4]
  movaps xmm7, xmm1            // B (kept for the Horner steps below)

  mulps xmm1, xmm6
  movups xmm4, [SSE_EXP_F3]
  addps xmm1, xmm5
  movups xmm3, [SSE_EXP_F2]
  mulps xmm1, xmm7
  movups xmm2, [SSE_EXP_F1]
  addps xmm1, xmm4
  mulps xmm1, xmm7
  addps xmm1, xmm3
  mulps xmm1, xmm7
  addps xmm1, xmm2
  mulps xmm1, xmm0             // scale polynomial by the exponent part XU.S

  movups [Result], xmm1

  // Restore the preserved XMM registers
  movdqa xmm6, [rsp-24]
  movdqa xmm7, [rsp-40]
end;
1414
{ Fast approximation of Ln(A) (scalar, SSE2).
  Splits A's IEEE-754 bits into exponent (I shr 23) and mantissa (fraction
  re-based into [1,2) via (I and $007FFFFF) or $3F800000), then combines
  Exp * Ln(2) with a small polynomial in the mantissa X.
  For A <= 0 the AddCst select yields NegInfinity, which propagates through
  the final sums so the result is -INF.
  NOTE(review): xmm6/xmm7 are spilled below RSP ([rsp-24]/[rsp-40]); Win64
  has no red zone — verify these slots are safe from async clobbering. }
function FastLn(const A: Single): Single; assembler;
asm
  // Preserve the nonvolatile XMM registers we use
  movdqa [rsp-24], xmm6
  movdqa [rsp-40], xmm7

  xorps xmm2, xmm2
  movss xmm1, xmm0
  movss xmm3, DWORD [SSE_LN_CST]
  movss xmm4, DWORD [SSE_NEG_INFINITY]

  // Exp := Val.I shr 23
  psrld xmm0, 23
  movss xmm5, xmm1
  cvtdq2ps xmm0, xmm0          // xmm0=Exp

  // if (A > 0) then AddCst := -89.93423858 else AddCst := NegInfinity
  cmpnless xmm1, xmm2          // (A > 0)? Yes: $FFFFFFFF, No: $00000000
  movss xmm2, DWORD [SSE_MASK_FRACTION]
  andps xmm3, xmm1             // (A > 0)? Yes: -89.93423858, No: 0
  andnps xmm1, xmm4            // (A > 0)? Yes: 0, No: NegInfinity
  movss xmm4, DWORD [SSE_EXP_I1]
  orps xmm1, xmm3              // (A > 0)? Yes: -89.93423858, No: NegInfinity

  // Val.I := (Val.I and $007FFFFF) or $3F800000
  pand xmm5, xmm2
  movss xmm2, DWORD [SSE_LN_F5]
  por xmm5, xmm4
  movss xmm6, DWORD [SSE_LN_F3]
  movss xmm3, xmm5             // xmm3=X
  mulss xmm5, xmm5             // xmm5=X2

  movss xmm4, xmm3
  movss xmm7, DWORD [SSE_LN_F4]
  mulss xmm4, xmm6
  mulss xmm0, xmm2             // xmm0 = Exp * 0.69314718055995
  subss xmm4, xmm7
  movss xmm7, DWORD [SSE_LN_F2]
  movss xmm6, xmm3
  mulss xmm4, xmm5             // xmm4 = X2 * (0.024982445 * X - 0.24371102)
  subss xmm6, xmm7
  movss xmm2, DWORD [SSE_LN_F1]
  addss xmm4, xmm6             // xmm4 = (X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102)
  mulss xmm3, xmm2
  mulss xmm4, xmm5             // xmm4 = X2 * ((X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102))
  addss xmm3, xmm1             // xmm3 = (3.3977745 * X + AddCst)
  addss xmm4, xmm0
  addss xmm3, xmm4

  movss xmm0, xmm3             // scalar Single result is returned in XMM0

  // Restore the preserved XMM registers
  movdqa xmm6, [rsp-24]
  movdqa xmm7, [rsp-40]
end;
1468
{ Fast approximation of Ln(A) for both elements of a TVector2 (SSE2).
  Same exponent/mantissa split and polynomial as the scalar FastLn,
  vectorized with packed instructions and 64-bit constant loads.
  NOTE(review): the parameter load is RTLVersion-guarded but the result store
  (movq rax) is not; confirm 8-byte record results are still returned in RAX
  on Delphi >= 10.3.
  NOTE(review): xmm6/xmm7 are spilled below RSP ([rsp-24]/[rsp-40]); Win64
  has no red zone — verify these slots are safe from async clobbering. }
function FastLn(const A: TVector2): TVector2; assembler;
asm
  // Preserve the nonvolatile XMM registers we use
  movdqa [rsp-24], xmm6
  movdqa [rsp-40], xmm7

  { Delphi >= 10.3 (Rio) passes an 8-byte record by address }
{$IF RTLVersion >= 33}
  movlps xmm0, [A]
{$ELSE}
  movq xmm0, A
{$ENDIF}
  xorps xmm2, xmm2
  movaps xmm1, xmm0
  movlps xmm3, QWORD [SSE_LN_CST]
  movlps xmm4, QWORD [SSE_NEG_INFINITY]

  // Exp := Val.I shr 23
  psrld xmm0, 23
  movaps xmm5, xmm1
  cvtdq2ps xmm0, xmm0          // xmm0=Exp

  // if (A > 0) then AddCst := -89.93423858 else AddCst := NegInfinity
  cmpnleps xmm1, xmm2          // (A > 0)? Yes: $FFFFFFFF, No: $00000000
  movlps xmm2, QWORD [SSE_MASK_FRACTION]
  andps xmm3, xmm1             // (A > 0)? Yes: -89.93423858, No: 0
  andnps xmm1, xmm4            // (A > 0)? Yes: 0, No: NegInfinity
  movlps xmm4, QWORD [SSE_EXP_I1]
  orps xmm1, xmm3              // (A > 0)? Yes: -89.93423858, No: NegInfinity

  // Val.I := (Val.I and $007FFFFF) or $3F800000
  pand xmm5, xmm2
  movlps xmm2, QWORD [SSE_LN_F5]
  por xmm5, xmm4
  movlps xmm6, QWORD [SSE_LN_F3]
  movaps xmm3, xmm5            // xmm3=X
  mulps xmm5, xmm5             // xmm5=X2

  movaps xmm4, xmm3
  movlps xmm7, QWORD [SSE_LN_F4]
  mulps xmm4, xmm6
  mulps xmm0, xmm2             // xmm0 = Exp * 0.69314718055995
  subps xmm4, xmm7
  movlps xmm7, QWORD [SSE_LN_F2]
  movaps xmm6, xmm3
  mulps xmm4, xmm5             // xmm4 = X2 * (0.024982445 * X - 0.24371102)
  subps xmm6, xmm7
  movlps xmm2, QWORD [SSE_LN_F1]
  addps xmm4, xmm6             // xmm4 = (X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102)
  mulps xmm3, xmm2
  mulps xmm4, xmm5             // xmm4 = X2 * ((X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102))
  addps xmm3, xmm1             // xmm3 = (3.3977745 * X + AddCst)
  addps xmm4, xmm0
  addps xmm3, xmm4

  movq rax, xmm3               // 8-byte record result returned in RAX

  // Restore the preserved XMM registers
  movdqa xmm6, [rsp-24]
  movdqa xmm7, [rsp-40]
end;
1527
{ Fast approximation of Ln(A) for all three elements of a TVector3 (SSE2).
  Same exponent/mantissa split and polynomial as the scalar FastLn; the
  12-byte vector is loaded/stored as 8 bytes plus 4 bytes.
  NOTE(review): xmm6/xmm7 are spilled below RSP ([rsp-24]/[rsp-40]); Win64
  has no red zone — verify these slots are safe from async clobbering. }
function FastLn(const A: TVector3): TVector3; assembler;
asm
  // Preserve the nonvolatile XMM registers we use
  movdqa [rsp-24], xmm6
  movdqa [rsp-40], xmm7

  // Assemble the 12-byte vector: low quad = X,Y; high = Z
  movq xmm0, [A]
  movss xmm1, DWORD [A+8]
  movlhps xmm0, xmm1
  xorps xmm2, xmm2
  movaps xmm1, xmm0
  movups xmm3, [SSE_LN_CST]
  movups xmm4, [SSE_NEG_INFINITY]

  // Exp := Val.I shr 23
  psrld xmm0, 23
  movaps xmm5, xmm1
  cvtdq2ps xmm0, xmm0          // xmm0=Exp

  // if (A > 0) then AddCst := -89.93423858 else AddCst := NegInfinity
  cmpnleps xmm1, xmm2          // (A > 0)? Yes: $FFFFFFFF, No: $00000000
  movups xmm2, [SSE_MASK_FRACTION]
  andps xmm3, xmm1             // (A > 0)? Yes: -89.93423858, No: 0
  andnps xmm1, xmm4            // (A > 0)? Yes: 0, No: NegInfinity
  movups xmm4, [SSE_EXP_I1]
  orps xmm1, xmm3              // (A > 0)? Yes: -89.93423858, No: NegInfinity

  // Val.I := (Val.I and $007FFFFF) or $3F800000
  pand xmm5, xmm2
  movups xmm2, [SSE_LN_F5]
  por xmm5, xmm4
  movups xmm6, [SSE_LN_F3]
  movaps xmm3, xmm5            // xmm3=X
  mulps xmm5, xmm5             // xmm5=X2

  movaps xmm4, xmm3
  movups xmm7, [SSE_LN_F4]
  mulps xmm4, xmm6
  mulps xmm0, xmm2             // xmm0 = Exp * 0.69314718055995
  subps xmm4, xmm7
  movups xmm7, [SSE_LN_F2]
  movaps xmm6, xmm3
  mulps xmm4, xmm5             // xmm4 = X2 * (0.024982445 * X - 0.24371102)
  subps xmm6, xmm7
  movups xmm2, [SSE_LN_F1]
  addps xmm4, xmm6             // xmm4 = (X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102)
  mulps xmm3, xmm2
  mulps xmm4, xmm5             // xmm4 = X2 * ((X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102))
  addps xmm3, xmm1             // xmm3 = (3.3977745 * X + AddCst)
  addps xmm4, xmm0
  addps xmm3, xmm4

  // Store 12 bytes: 8-byte low quad plus the high (third) element
  movhlps xmm2, xmm3
  movq [Result], xmm3
  movss DWORD [Result+8], xmm2

  // Restore the preserved XMM registers
  movdqa xmm6, [rsp-24]
  movdqa xmm7, [rsp-40]
end;
1586
{ Fast approximation of Ln(A) for all four elements of a TVector4 (SSE2).
  Same exponent/mantissa split and polynomial as the scalar FastLn;
  full 16-byte load/store.
  NOTE(review): xmm6/xmm7 are spilled below RSP ([rsp-24]/[rsp-40]); Win64
  has no red zone — verify these slots are safe from async clobbering. }
function FastLn(const A: TVector4): TVector4; assembler;
asm
  // Preserve the nonvolatile XMM registers we use
  movdqa [rsp-24], xmm6
  movdqa [rsp-40], xmm7

  movups xmm0, [A]
  xorps xmm2, xmm2
  movaps xmm1, xmm0
  movups xmm3, [SSE_LN_CST]
  movups xmm4, [SSE_NEG_INFINITY]

  // Exp := Val.I shr 23
  psrld xmm0, 23
  movaps xmm5, xmm1
  cvtdq2ps xmm0, xmm0          // xmm0=Exp

  // if (A > 0) then AddCst := -89.93423858 else AddCst := NegInfinity
  cmpnleps xmm1, xmm2          // (A > 0)? Yes: $FFFFFFFF, No: $00000000
  movups xmm2, [SSE_MASK_FRACTION]
  andps xmm3, xmm1             // (A > 0)? Yes: -89.93423858, No: 0
  andnps xmm1, xmm4            // (A > 0)? Yes: 0, No: NegInfinity
  movups xmm4, [SSE_EXP_I1]
  orps xmm1, xmm3              // (A > 0)? Yes: -89.93423858, No: NegInfinity

  // Val.I := (Val.I and $007FFFFF) or $3F800000
  pand xmm5, xmm2
  movups xmm2, [SSE_LN_F5]
  por xmm5, xmm4
  movups xmm6, [SSE_LN_F3]
  movaps xmm3, xmm5            // xmm3=X
  mulps xmm5, xmm5             // xmm5=X2

  movaps xmm4, xmm3
  movups xmm7, [SSE_LN_F4]
  mulps xmm4, xmm6
  mulps xmm0, xmm2             // xmm0 = Exp * 0.69314718055995
  subps xmm4, xmm7
  movups xmm7, [SSE_LN_F2]
  movaps xmm6, xmm3
  mulps xmm4, xmm5             // xmm4 = X2 * (0.024982445 * X - 0.24371102)
  subps xmm6, xmm7
  movups xmm2, [SSE_LN_F1]
  addps xmm4, xmm6             // xmm4 = (X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102)
  mulps xmm3, xmm2
  mulps xmm4, xmm5             // xmm4 = X2 * ((X - 2.2744832) + X2 * (0.024982445 * X - 0.24371102))
  addps xmm3, xmm1             // xmm3 = (3.3977745 * X + AddCst)
  addps xmm4, xmm0
  addps xmm3, xmm4

  movups [Result], xmm3

  // Restore the preserved XMM registers
  movdqa xmm6, [rsp-24]
  movdqa xmm7, [rsp-40]
end;
1641
{ Fast approximation of Log2(A) (scalar, SSE2).
  Bit-trick: the raw integer bits of A times 1.1920928955078125e-7 (= 2^-23)
  give exponent + mantissa-fraction directly; a correction in the mantissa
  MX — re-based into [0.5,1) via (I and $007FFFFF) or $3F000000 — removes
  most of the remaining error (constants SSE_LOG2_F1..F5).
  Only volatile registers are used, so nothing needs to be saved/restored. }
function FastLog2(const A: Single): Single; assembler;
asm
  movss xmm2, DWORD [SSE_MASK_FRACTION]
  movss xmm1, xmm0

  // MX.I := (VX.I and $007FFFFF) or $3F000000
  movss xmm3, DWORD [SSE_LOG2_I1]
  pand xmm0, xmm2
  cvtdq2ps xmm1, xmm1          // treat A's raw bits as an integer value
  movss xmm4, DWORD [SSE_LOG2_F1]
  por xmm0, xmm3

  movss xmm2, DWORD [SSE_LOG2_F2]
  mulss xmm1, xmm4             // VX.I * 1.1920928955078125e-7
  movss xmm3, DWORD [SSE_LOG2_F3]
  subss xmm1, xmm2             // Result - 124.22551499
  mulss xmm3, xmm0
  movss xmm4, DWORD [SSE_LOG2_F5]
  subss xmm1, xmm3             // Result - 124.22551499 - 1.498030302 * MX.S
  movss xmm2, DWORD [SSE_LOG2_F4]
  addss xmm0, xmm4
  divss xmm2, xmm0
  subss xmm1, xmm2             // Result - 124.22551499 - 1.498030302 * MX.S - 1.72587999 / (0.3520887068 + MX.S)

  movss xmm0, xmm1             // scalar Single result is returned in XMM0
end;
1668
{ Fast approximation of Log2(A) for both elements of a TVector2 (SSE2).
  Same bit-trick as the scalar FastLog2, vectorized.
  NOTE(review): the parameter load is RTLVersion-guarded but the result store
  (movq rax) is not; confirm 8-byte record results are still returned in RAX
  on Delphi >= 10.3. }
function FastLog2(const A: TVector2): TVector2; assembler;
asm
  { Delphi >= 10.3 (Rio) passes an 8-byte record by address }
{$IF RTLVersion >= 33}
  movlps xmm0, [A]
{$ELSE}
  movq xmm0, A
{$ENDIF}
  movlps xmm2, QWORD [SSE_MASK_FRACTION]
  movaps xmm1, xmm0

  // MX.I := (VX.I and $007FFFFF) or $3F000000
  movlps xmm3, QWORD [SSE_LOG2_I1]
  pand xmm0, xmm2
  cvtdq2ps xmm1, xmm1          // treat raw float bits as integer values
  movlps xmm4, QWORD [SSE_LOG2_F1]
  por xmm0, xmm3

  movlps xmm2, QWORD [SSE_LOG2_F2]
  mulps xmm1, xmm4             // VX.I * 1.1920928955078125e-7
  movlps xmm3, QWORD [SSE_LOG2_F3]
  subps xmm1, xmm2             // Result - 124.22551499
  mulps xmm3, xmm0
  movlps xmm4, QWORD [SSE_LOG2_F5]
  subps xmm1, xmm3             // Result - 124.22551499 - 1.498030302 * MX.S
  movlps xmm2, QWORD [SSE_LOG2_F4]
  addps xmm0, xmm4
  divps xmm2, xmm0
  subps xmm1, xmm2             // Result - 124.22551499 - 1.498030302 * MX.S - 1.72587999 / (0.3520887068 + MX.S)

  movq rax, xmm1               // 8-byte record result returned in RAX
end;
1700
{ Fast approximation of Log2(A) for all three elements of a TVector3 (SSE2).
  Same bit-trick as the scalar FastLog2; the 12-byte vector is loaded and
  stored as 8 bytes plus 4 bytes. }
function FastLog2(const A: TVector3): TVector3; assembler;
asm
  // Assemble the 12-byte vector: low quad = X,Y; high = Z
  movq xmm0, [A]
  movss xmm1, DWORD [A+8]
  movlhps xmm0, xmm1
  movups xmm2, [SSE_MASK_FRACTION]
  movaps xmm1, xmm0

  // MX.I := (VX.I and $007FFFFF) or $3F000000
  movups xmm3, [SSE_LOG2_I1]
  pand xmm0, xmm2
  cvtdq2ps xmm1, xmm1          // treat raw float bits as integer values
  movups xmm4, [SSE_LOG2_F1]
  por xmm0, xmm3

  movups xmm2, [SSE_LOG2_F2]
  mulps xmm1, xmm4             // VX.I * 1.1920928955078125e-7
  movups xmm3, [SSE_LOG2_F3]
  subps xmm1, xmm2             // Result - 124.22551499
  mulps xmm3, xmm0
  movups xmm4, [SSE_LOG2_F5]
  subps xmm1, xmm3             // Result - 124.22551499 - 1.498030302 * MX.S
  movups xmm2, [SSE_LOG2_F4]
  addps xmm0, xmm4
  divps xmm2, xmm0
  subps xmm1, xmm2             // Result - 124.22551499 - 1.498030302 * MX.S - 1.72587999 / (0.3520887068 + MX.S)

  // Store 12 bytes: 8-byte low quad plus the high (third) element
  movhlps xmm0, xmm1
  movq [Result], xmm1
  movss DWORD [Result+8], xmm0
end;
1732
{ Fast approximation of Log2(A) for all four elements of a TVector4 (SSE2).
  Same bit-trick as the scalar FastLog2; full 16-byte load/store. }
function FastLog2(const A: TVector4): TVector4; assembler;
asm
  movups xmm0, [A]
  movups xmm2, [SSE_MASK_FRACTION]
  movaps xmm1, xmm0

  // MX.I := (VX.I and $007FFFFF) or $3F000000
  movups xmm3, [SSE_LOG2_I1]
  pand xmm0, xmm2
  cvtdq2ps xmm1, xmm1          // treat raw float bits as integer values
  movups xmm4, [SSE_LOG2_F1]
  por xmm0, xmm3

  movups xmm2, [SSE_LOG2_F2]
  mulps xmm1, xmm4             // VX.I * 1.1920928955078125e-7
  movups xmm3, [SSE_LOG2_F3]
  subps xmm1, xmm2             // Result - 124.22551499
  mulps xmm3, xmm0
  movups xmm4, [SSE_LOG2_F5]
  subps xmm1, xmm3             // Result - 124.22551499 - 1.498030302 * MX.S
  movups xmm2, [SSE_LOG2_F4]
  addps xmm0, xmm4
  divps xmm2, xmm0
  subps xmm1, xmm2             // Result - 124.22551499 - 1.498030302 * MX.S - 1.72587999 / (0.3520887068 + MX.S)

  movups [Result], xmm1
end;
1760
{ Fast approximation of 2^A (scalar, SSE2).
  With MXCSR temporarily forced to round-down (toward -infinity), computes
  the fractional part Z := A - RoundDown(A), approximates 2^Z with a small
  rational expression, scales by 1 shl 23 and reinterprets the integer bits
  as a float. The caller's MXCSR is restored before returning. }
function FastExp2(const A: Single): Single; assembler;
var
  OldFlags, NewFlags: UInt32;
asm
  // Set MXCSR rounding mode to Round Down (toward -infinity);
  // SSE_ROUND_DOWN = $2000, see the rounding-mode constants at the top of the file
  stmxcsr [OldFlags]
  mov ecx, [OldFlags]
  xorps xmm1, xmm1
  and ecx, SSE_ROUND_MASK
  movss xmm3, xmm0
  or ecx, SSE_ROUND_DOWN
  movss xmm5, xmm0
  mov [NewFlags], ecx

  movss xmm1, DWORD [SSE_EXP2_F1]
  ldmxcsr [NewFlags]

  // Z := A - RoundDown(A)  (cvtps2dq now rounds toward -infinity)
  cvtps2dq xmm3, xmm3
  addss xmm1, xmm5             // A + 121.2740575
  cvtdq2ps xmm3, xmm3
  movss xmm2, DWORD [SSE_EXP2_F2]
  subss xmm0, xmm3

  movss xmm3, DWORD [SSE_EXP2_F3]
  movss xmm4, DWORD [SSE_EXP2_F4]
  subss xmm3, xmm0             // (4.84252568 - Z)
  mulss xmm0, xmm4             // 1.49012907 * Z
  divss xmm2, xmm3
  movss xmm5, DWORD [SSE_EXP2_F5]
  addss xmm1, xmm2             // A + 121.2740575 + 27.7280233 / (4.84252568 - Z)
  subss xmm1, xmm0             // A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z
  mulss xmm1, xmm5             // (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
  cvtps2dq xmm1, xmm1          // integer bits ARE the float result

  // Restore the caller's rounding mode
  ldmxcsr [OldFlags]

  movss xmm0, xmm1             // scalar Single result is returned in XMM0
end;
1801
{ Fast approximation of 2^A for both elements of a TVector2 (SSE2).
  Same round-down/bit-build algorithm as the scalar FastExp2.
  NOTE(review): the parameter load is RTLVersion-guarded but the result store
  (movq rax) is not; confirm 8-byte record results are still returned in RAX
  on Delphi >= 10.3. }
function FastExp2(const A: TVector2): TVector2; assembler;
var
  OldFlags, NewFlags: UInt32;
asm
  // Set MXCSR rounding mode to Round Down (toward -infinity)
  stmxcsr [OldFlags]
  { Delphi >= 10.3 (Rio) passes an 8-byte record by address }
{$IF RTLVersion >= 33}
  movlps xmm0, [A]
{$ELSE}
  movq xmm0, A
{$ENDIF}
  mov ecx, [OldFlags]
  xorps xmm1, xmm1
  and ecx, SSE_ROUND_MASK
  movaps xmm3, xmm0
  or ecx, SSE_ROUND_DOWN
  movaps xmm5, xmm0
  mov [NewFlags], ecx

  movlps xmm1, QWORD [SSE_EXP2_F1]
  ldmxcsr [NewFlags]

  // Z := A - RoundDown(A)  (cvtps2dq now rounds toward -infinity)
  cvtps2dq xmm3, xmm3
  addps xmm1, xmm5             // A + 121.2740575
  cvtdq2ps xmm3, xmm3
  movlps xmm2, QWORD [SSE_EXP2_F2]
  subps xmm0, xmm3

  movlps xmm3, QWORD [SSE_EXP2_F3]
  movlps xmm4, QWORD [SSE_EXP2_F4]
  subps xmm3, xmm0             // (4.84252568 - Z)
  mulps xmm0, xmm4             // 1.49012907 * Z
  divps xmm2, xmm3
  movlps xmm5, QWORD [SSE_EXP2_F5]
  addps xmm1, xmm2             // A + 121.2740575 + 27.7280233 / (4.84252568 - Z)
  subps xmm1, xmm0             // A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z
  mulps xmm1, xmm5             // (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
  cvtps2dq xmm1, xmm1          // integer bits ARE the float results

  // Restore the caller's rounding mode
  ldmxcsr [OldFlags]

  movq rax, xmm1               // 8-byte record result returned in RAX
end;
1847
{ Fast approximation of 2^A for all three elements of a TVector3 (SSE2).
  Same round-down/bit-build algorithm as the scalar FastExp2; the 12-byte
  vector is loaded/stored as 8 bytes plus 4 bytes. }
function FastExp2(const A: TVector3): TVector3; assembler;
var
  OldFlags, NewFlags: UInt32;
asm
  // Set MXCSR rounding mode to Round Down (toward -infinity)
  stmxcsr [OldFlags]
  // Assemble the 12-byte vector: low quad = X,Y; high = Z
  movq xmm0, [A]
  movss xmm1, DWORD [A+8]
  movlhps xmm0, xmm1
  mov edx, [OldFlags]
  xorps xmm1, xmm1
  and edx, SSE_ROUND_MASK
  movaps xmm3, xmm0
  or edx, SSE_ROUND_DOWN
  movaps xmm5, xmm0
  mov [NewFlags], edx

  movups xmm1, [SSE_EXP2_F1]
  ldmxcsr [NewFlags]

  // Z := A - RoundDown(A)  (cvtps2dq now rounds toward -infinity)
  cvtps2dq xmm3, xmm3
  addps xmm1, xmm5             // A + 121.2740575
  cvtdq2ps xmm3, xmm3
  movups xmm2, [SSE_EXP2_F2]
  subps xmm0, xmm3

  movups xmm3, [SSE_EXP2_F3]
  movups xmm4, [SSE_EXP2_F4]
  subps xmm3, xmm0             // (4.84252568 - Z)
  mulps xmm0, xmm4             // 1.49012907 * Z
  divps xmm2, xmm3
  movups xmm5, [SSE_EXP2_F5]
  addps xmm1, xmm2             // A + 121.2740575 + 27.7280233 / (4.84252568 - Z)
  subps xmm1, xmm0             // A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z
  mulps xmm1, xmm5             // (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
  cvtps2dq xmm1, xmm1          // integer bits ARE the float results

  // Restore the caller's rounding mode
  ldmxcsr [OldFlags]

  // Store 12 bytes: 8-byte low quad plus the high (third) element
  movhlps xmm0, xmm1
  movq [Result], xmm1
  movss DWORD [Result+8], xmm0
end;
1893
{ Fast approximation of 2^A for all four elements of a TVector4 (SSE2).
  Same round-down/bit-build algorithm as the scalar FastExp2;
  full 16-byte load/store. }
function FastExp2(const A: TVector4): TVector4; assembler;
var
  OldFlags, NewFlags: UInt32;
asm
  // Set MXCSR rounding mode to Round Down (toward -infinity)
  stmxcsr [OldFlags]
  movups xmm0, [A]
  mov edx, [OldFlags]
  xorps xmm1, xmm1
  and edx, SSE_ROUND_MASK
  movaps xmm3, xmm0
  or edx, SSE_ROUND_DOWN
  movaps xmm5, xmm0
  mov [NewFlags], edx

  movups xmm1, [SSE_EXP2_F1]
  ldmxcsr [NewFlags]

  // Z := A - RoundDown(A)  (cvtps2dq now rounds toward -infinity)
  cvtps2dq xmm3, xmm3
  addps xmm1, xmm5             // A + 121.2740575
  cvtdq2ps xmm3, xmm3
  movups xmm2, [SSE_EXP2_F2]
  subps xmm0, xmm3

  movups xmm3, [SSE_EXP2_F3]
  movups xmm4, [SSE_EXP2_F4]
  subps xmm3, xmm0             // (4.84252568 - Z)
  mulps xmm0, xmm4             // 1.49012907 * Z
  divps xmm2, xmm3
  movups xmm5, [SSE_EXP2_F5]
  addps xmm1, xmm2             // A + 121.2740575 + 27.7280233 / (4.84252568 - Z)
  subps xmm1, xmm0             // A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z
  mulps xmm1, xmm5             // (1 shl 23) * (A + 121.2740575 + 27.7280233 / (4.84252568 - Z) - 1.49012907 * Z)
  cvtps2dq xmm1, xmm1          // integer bits ARE the float results

  // Restore the caller's rounding mode
  ldmxcsr [OldFlags]

  movups [Result], xmm1
end;
1935
1936{ Common Functions }
1937
function Abs(const A: Single): Single;
{ Absolute value of a scalar; thin wrapper around System.Abs. }
begin
  Exit(System.Abs(A));
end;
1942
function Abs(const A: TVector2): TVector2;
{ Component-wise absolute value of a 2D vector. }
var
  AbsX, AbsY: Single;
begin
  AbsX := System.Abs(A.X);
  AbsY := System.Abs(A.Y);
  Result.Init(AbsX, AbsY);
end;
1947
function Abs(const A: TVector3): TVector3; assembler;
{ Component-wise absolute value: clears each element's sign bit by AND-ing
with SSE_MASK_ABS_VAL. }
asm
movq xmm0, [A] // load X,Y
movss xmm1, DWORD [A+8] // load Z
movups xmm2, [SSE_MASK_ABS_VAL]
andps xmm0, xmm2
pand xmm1, xmm2 // same mask, integer form; clears Z's sign bit
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
1958
function Abs(const A: TVector4): TVector4; assembler;
{ Component-wise absolute value: clears all four sign bits with one ANDPS. }
asm
movups xmm0, [A]
movups xmm1, [SSE_MASK_ABS_VAL]
andps xmm0, xmm1
movups [Result], xmm0
end;
1966
function Sign(const A: Single): Single; assembler;
{ Returns -1.0 for A < 0, 0.0 for A = 0, +1.0 for A > 0.
Branchless: ORs A's sign bit into 1.0, then masks the result to zero when
A = 0. A arrives in XMM0 (first float argument); result stays in XMM0. }
asm
movss xmm1, DWORD [SSE_ONE]
movss xmm2, xmm0
movss xmm3, DWORD [SSE_MASK_SIGN]

andps xmm0, xmm3 // (A < 0)? Yes: $80000000, No: $00000000
xorps xmm4, xmm4
orps xmm0, xmm1 // (A < 0)? Yes: -1, No: 1
cmpneqss xmm2, xmm4 // (A = 0)? Yes: $00000000, No: $FFFFFFFF
andps xmm0, xmm2 // (A = 0)? Yes: 0, No: -1 or 1
end;
1979
function Sign(const A: TVector2): TVector2; assembler;
{ Component-wise sign (-1, 0 or +1) of a 2D vector.
See the unit header: from Delphi 10.3 Rio (RTLVersion 33) an 8-byte record
parameter is passed by address, before that by value in the register. }
asm
{$IF RTLVersion >= 33}
movlps xmm0, [A]
{$ELSE}
movq xmm0, A
{$ENDIF}
movlps xmm1, QWORD [SSE_ONE]
movaps xmm2, xmm0
movlps xmm3, QWORD [SSE_MASK_SIGN]

andps xmm0, xmm3 // (A < 0)? Yes: $80000000, No: $00000000
xorps xmm4, xmm4
orps xmm0, xmm1 // (A < 0)? Yes: -1, No: 1
cmpneqps xmm2, xmm4 // (A = 0)? Yes: $00000000, No: $FFFFFFFF
andps xmm0, xmm2 // (A = 0)? Yes: 0, No: -1 or 1
movq rax, xmm0 // 8-byte record result is returned in RAX
end;
1998
function Sign(const A: TVector3): TVector3; assembler;
{ Component-wise sign (-1, 0 or +1) of a 3D vector. }
asm
movq xmm0, [A] // load X,Y
movss xmm1, DWORD [A+8] // load Z
movlhps xmm0, xmm1 // pack X,Y,Z into one XMM register
movups xmm1, [SSE_ONE]
movaps xmm2, xmm0
movups xmm3, [SSE_MASK_SIGN]

andps xmm0, xmm3 // (A < 0)? Yes: $80000000, No: $00000000
xorps xmm4, xmm4
orps xmm0, xmm1 // (A < 0)? Yes: -1, No: 1
cmpneqps xmm2, xmm4 // (A = 0)? Yes: $00000000, No: $FFFFFFFF
andps xmm0, xmm2 // (A = 0)? Yes: 0, No: -1 or 1
movhlps xmm1, xmm0
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
2017
function Sign(const A: TVector4): TVector4; assembler;
{ Component-wise sign (-1, 0 or +1) of a 4D vector. }
asm
movups xmm0, [A]
movups xmm1, [SSE_ONE]
movaps xmm2, xmm0
movups xmm3, [SSE_MASK_SIGN]

andps xmm0, xmm3 // (A < 0)? Yes: $80000000, No: $00000000
xorps xmm4, xmm4
orps xmm0, xmm1 // (A < 0)? Yes: -1, No: 1
cmpneqps xmm2, xmm4 // (A = 0)? Yes: $00000000, No: $FFFFFFFF
andps xmm0, xmm2 // (A = 0)? Yes: 0, No: -1 or 1
movups [Result], xmm0
end;
2032
function Floor(const A: Single): Integer;
{ Largest integer <= A; delegates to the RTL implementation. }
begin
  Exit(System.Math.Floor(A));
end;
2037{function Floor(const A: Single): Integer; assembler;
2038var
2039OldFlags, NewFlags: UInt32;
2040asm
2041// Set rounding mode to Round Down
2042stmxcsr [OldFlags]
2043mov ecx, [OldFlags]
2044and ecx, SSE_ROUND_MASK
2045or ecx, SSE_ROUND_DOWN
2046mov [NewFlags], ecx
2047ldmxcsr [NewFlags]
2048
2049cvtps2dq xmm0, xmm0
2050
2051// Restore rounding mode
2052ldmxcsr [OldFlags]
2053
2054movd eax, xmm0
2055end;}
2056
function Floor(const A: TVector2): TIVector2; assembler;
{ Component-wise Floor: temporarily forces the SSE rounding mode to
round-down, converts to integers with CVTPS2DQ, then restores the caller's
MXCSR. }
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Round Down
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_DOWN
mov [NewFlags], eax
{$IF RTLVersion >= 33}
movlps xmm0, [A]
{$ELSE}
movq xmm0, A
{$ENDIF}
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

// Restore rounding mode
ldmxcsr [OldFlags]

movq rax, xmm0 // 8-byte record result is returned in RAX
end;
2081
function Floor(const A: TVector3): TIVector3; assembler;
{ Component-wise Floor via a temporary round-down SSE rounding mode. }
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Round Down
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_DOWN
mov [NewFlags], eax
movq xmm0, [A] // load X,Y
movss xmm1, DWORD [A+8] // load Z
movlhps xmm0, xmm1 // pack X,Y,Z into one XMM register
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

// Restore rounding mode
ldmxcsr [OldFlags]

movhlps xmm1, xmm0
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
2106
function Floor(const A: TVector4): TIVector4; assembler;
{ Component-wise Floor via a temporary round-down SSE rounding mode. }
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Round Down
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_DOWN
mov [NewFlags], eax
movups xmm0, [A]
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

// Restore rounding mode
ldmxcsr [OldFlags]

movups [Result], xmm0
end;
2127
function Trunc(const A: Single): Integer;
{ Integer part of A, rounded toward zero; delegates to System.Trunc. }
begin
  Exit(System.Trunc(A));
end;
2132
2133{function Trunc(const A: Single): Integer; assembler;
2134var
2135OldFlags, NewFlags: UInt32;
2136asm
2137// Set rounding mode to Truncate
2138stmxcsr [OldFlags]
2139mov ecx, [OldFlags]
2140and ecx, SSE_ROUND_MASK
2141or ecx, SSE_ROUND_TRUNC
2142mov [NewFlags], ecx
2143ldmxcsr [NewFlags]
2144
2145cvtps2dq xmm0, xmm0
2146
2147// Restore rounding mode
2148ldmxcsr [OldFlags]
2149
2150movd eax, xmm0
2151end;}
2152
function Trunc(const A: TVector2): TIVector2; assembler;
{ Component-wise Trunc (round toward zero) via a temporary SSE truncate
rounding mode. }
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_TRUNC
mov [NewFlags], eax
{$IF RTLVersion >= 33}
movlps xmm0, [A]
{$ELSE}
movq xmm0, A
{$ENDIF}
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

// Restore rounding mode
ldmxcsr [OldFlags]

movq rax, xmm0 // 8-byte record result is returned in RAX
end;
2177
function Trunc(const A: TVector3): TIVector3; assembler;
{ Component-wise Trunc (round toward zero) via a temporary SSE truncate
rounding mode. }
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_TRUNC
mov [NewFlags], eax
movq xmm0, [A] // load X,Y
movss xmm1, DWORD [A+8] // load Z
movlhps xmm0, xmm1 // pack X,Y,Z into one XMM register
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

// Restore rounding mode
ldmxcsr [OldFlags]

movhlps xmm1, xmm0
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
2202
function Trunc(const A: TVector4): TIVector4; assembler;
{ Component-wise Trunc (round toward zero) via a temporary SSE truncate
rounding mode. }
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_TRUNC
mov [NewFlags], eax
movups xmm0, [A]
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

// Restore rounding mode
ldmxcsr [OldFlags]

movups [Result], xmm0
end;
2223
function Round(const A: Single): Integer;
{ Rounds A to the nearest integer (banker's rounding, as System.Round). }
begin
  Exit(System.Round(A));
end;
2228
function Round(const A: TVector2): TIVector2; assembler;
{ Component-wise Round. Relies on MXCSR being in its default
round-to-nearest(-even) mode, so no mode switch is needed. }
asm
// Rounding mode defaults to round-to-nearest
{$IF RTLVersion >= 33}
movlps xmm0, [A]
{$ELSE}
movq xmm0, A
{$ENDIF}
cvtps2dq xmm0, xmm0
movq rax, xmm0 // 8-byte record result is returned in RAX
end;
2240
function Round(const A: TVector3): TIVector3; assembler;
{ Component-wise Round using the default round-to-nearest SSE mode. }
asm
// Rounding mode defaults to round-to-nearest
movq xmm0, [A] // load X,Y
movss xmm1, DWORD [A+8] // load Z
movlhps xmm0, xmm1 // pack X,Y,Z into one XMM register
cvtps2dq xmm0, xmm0
movhlps xmm1, xmm0
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
2252
function Round(const A: TVector4): TIVector4; assembler;
{ Component-wise Round using the default round-to-nearest SSE mode. }
asm
// Rounding mode defaults to round-to-nearest
movups xmm0, [A]
cvtps2dq xmm0, xmm0
movups [Result], xmm0
end;
2260
function Ceil(const A: Single): Integer;
{ Smallest integer >= A; delegates to the RTL implementation. }
begin
  Exit(System.Math.Ceil(A));
end;
2265{function Ceil(const A: Single): Integer; assembler;
2266var
2267OldFlags, NewFlags: UInt32;
2268asm
2269// Set rounding mode to Ceil Down
2270stmxcsr [OldFlags]
2271mov ecx, [OldFlags]
2272and ecx, SSE_ROUND_MASK
2273or ecx, SSE_ROUND_UP
2274mov [NewFlags], ecx
2275ldmxcsr [NewFlags]
2276
2277cvtps2dq xmm0, xmm0
2278
2279// Restore rounding mode
2280ldmxcsr [OldFlags]
2281
2282movd eax, xmm0
2283end;}
2284
function Ceil(const A: TVector2): TIVector2; assembler;
{ Component-wise Ceil via a temporary round-up SSE rounding mode. }
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Round Up
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_UP
mov [NewFlags], eax
{$IF RTLVersion >= 33}
movlps xmm0, [A]
{$ELSE}
movq xmm0, A
{$ENDIF}
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

// Restore rounding mode
ldmxcsr [OldFlags]

movq rax, xmm0 // 8-byte record result is returned in RAX
end;
2309
function Ceil(const A: TVector3): TIVector3; assembler;
{ Component-wise Ceil via a temporary round-up SSE rounding mode. }
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Round Up
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_UP
mov [NewFlags], eax
movq xmm0, [A] // load X,Y
movss xmm1, DWORD [A+8] // load Z
movlhps xmm0, xmm1 // pack X,Y,Z into one XMM register
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

// Restore rounding mode
ldmxcsr [OldFlags]

movhlps xmm1, xmm0
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
2334
function Ceil(const A: TVector4): TIVector4; assembler;
{ Component-wise Ceil via a temporary round-up SSE rounding mode. }
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Round Up
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_UP
mov [NewFlags], eax
movups xmm0, [A]
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0

// Restore rounding mode
ldmxcsr [OldFlags]

movups [Result], xmm0
end;
2355
function Frac(const A: Single): Single;
{ Fractional part of A (same sign as A); delegates to System.Frac. }
begin
  Exit(System.Frac(A));
end;
2360{function Frac(const A: Single): Single; assembler;
2361var
2362OldFlags, NewFlags: UInt32;
2363asm
2364// Set rounding mode to Truncate
2365stmxcsr [OldFlags]
2366mov ecx, [OldFlags]
2367and ecx, SSE_ROUND_MASK
2368or ecx, SSE_ROUND_TRUNC
2369mov [NewFlags], ecx
2370movss xmm1, xmm0
2371ldmxcsr [NewFlags]
2372
2373cvtps2dq xmm0, xmm0
2374ldmxcsr [OldFlags]
2375cvtdq2ps xmm0, xmm0
2376subss xmm1, xmm0 // A - Trunc(A)
2377
2378movss xmm0, xmm1
2379end;}
2380
function Frac(const A: TVector2): TVector2; assembler;
{ Component-wise Frac: A - Trunc(A), computed with a temporary SSE
truncate rounding mode. }
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
stmxcsr [OldFlags]
mov edx, [OldFlags]
and edx, SSE_ROUND_MASK
or edx, SSE_ROUND_TRUNC
{$IF RTLVersion >= 33}
movlps xmm0, [A]
{$ELSE}
movq xmm0, A
{$ENDIF}
mov [NewFlags], edx
movaps xmm1, xmm0 // keep original A for the subtraction below
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0
ldmxcsr [OldFlags]
cvtdq2ps xmm0, xmm0
subps xmm1, xmm0 // A - Trunc(A)

movq rax, xmm1 // 8-byte record result is returned in RAX
end;
2406
function Frac(const A: TVector3): TVector3; assembler;
{ Component-wise Frac: A - Trunc(A), computed with a temporary SSE
truncate rounding mode. }
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_TRUNC
movq xmm0, [A] // load X,Y
movss xmm1, DWORD [A+8] // load Z
movlhps xmm0, xmm1 // pack X,Y,Z into one XMM register
mov [NewFlags], eax
movaps xmm1, xmm0 // keep original A for the subtraction below
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0
ldmxcsr [OldFlags]
cvtdq2ps xmm0, xmm0
subps xmm1, xmm0 // A - Trunc(A)

movhlps xmm0, xmm1
movq [Result], xmm1
movss DWORD [Result+8], xmm0
end;
2432
function Frac(const A: TVector4): TVector4; assembler;
{ Component-wise Frac: A - Trunc(A), computed with a temporary SSE
truncate rounding mode. }
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_TRUNC
movups xmm0, [A]
mov [NewFlags], eax
movaps xmm1, xmm0 // keep original A for the subtraction below
ldmxcsr [NewFlags]

cvtps2dq xmm0, xmm0
ldmxcsr [OldFlags]
cvtdq2ps xmm0, xmm0
subps xmm1, xmm0 // A - Trunc(A)

movups [Result], xmm1
end;
2454
function FMod(const A, B: Single): Single;
{ Floating-point modulo: the remainder of A / B, with the quotient rounded
toward zero (result has the sign of A). }
var
  Quotient: Integer;
begin
  Quotient := Trunc(A / B);
  Result := A - (B * Quotient);
end;
2459{function FMod(const A, B: Single): Single; assembler;
2460var
2461OldFlags, NewFlags: UInt32;
2462asm
2463// Set rounding mode to Truncate
2464stmxcsr [OldFlags]
2465mov edx, [OldFlags]
2466movss xmm2, xmm0
2467and edx, SSE_ROUND_MASK
2468movss xmm3, xmm1
2469or edx, SSE_ROUND_TRUNC
2470divss xmm2, xmm3 // A / B
2471mov [NewFlags], edx
2472ldmxcsr [NewFlags]
2473
2474cvtps2dq xmm2, xmm2
2475cvtdq2ps xmm2, xmm2 // Trunc(A / B)
2476mulss xmm2, xmm1
2477subss xmm0, xmm2 // A - (B * Trunc(A / B))
2478
2479// Restore rounding mode
2480ldmxcsr [OldFlags]
2481end;}
2482
function FMod(const A: TVector2; const B: Single): TVector2; assembler;
{ Component-wise FMod against a scalar divisor:
Result = A - (B * Trunc(A / B)), using a temporary SSE truncate mode. }
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
{$IF RTLVersion >= 33}
movlps xmm0, [A]
{$ELSE}
movq xmm0, A
{$ENDIF}
stmxcsr [OldFlags]
mov ecx, [OldFlags] // note: A was loaded above, before ECX is clobbered
shufps xmm1, xmm1, $00 // Replicate B
and ecx, SSE_ROUND_MASK
movaps xmm2, xmm0
or ecx, SSE_ROUND_TRUNC
movaps xmm3, xmm1
mov [NewFlags], ecx
divps xmm2, xmm3 // A / B
ldmxcsr [NewFlags]

cvtps2dq xmm2, xmm2
cvtdq2ps xmm2, xmm2 // Trunc(A / B)
mulps xmm2, xmm1
subps xmm0, xmm2 // A - (B * Trunc(A / B))

// Restore rounding mode
ldmxcsr [OldFlags]

movq rax, xmm0 // 8-byte record result is returned in RAX
end;
2514
function FMod(const A, B: TVector2): TVector2; assembler;
{ Component-wise FMod: Result = A - (B * Trunc(A / B)), using a temporary
SSE truncate rounding mode. }
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
{$IF RTLVersion >= 33}
movlps xmm0, [A]
{$ELSE}
movq xmm0, A
{$ENDIF}
stmxcsr [OldFlags]
{$IF RTLVersion >= 33}
movlps xmm1, [B]
{$ELSE}
movq xmm1, B
{$ENDIF}
mov edx, [OldFlags]
movaps xmm2, xmm0
and edx, SSE_ROUND_MASK
movaps xmm3, xmm1
or edx, SSE_ROUND_TRUNC
divps xmm2, xmm3 // A / B
mov [NewFlags], edx
ldmxcsr [NewFlags]

cvtps2dq xmm2, xmm2
cvtdq2ps xmm2, xmm2 // Trunc(A / B)
mulps xmm2, xmm1
subps xmm0, xmm2 // A - (B * Trunc(A / B))

// Restore rounding mode
ldmxcsr [OldFlags]

movq rax, xmm0 // 8-byte record result is returned in RAX
end;
2550
function FMod(const A: TVector3; const B: Single): TVector3; assembler;
{ Component-wise FMod against a scalar divisor:
Result = A - (B * Trunc(A / B)), using a temporary SSE truncate mode. }
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
movq xmm0, [A] // load X,Y
movss xmm1, DWORD [A+8] // load Z
movlhps xmm0, xmm1 // pack X,Y,Z into one XMM register
movss xmm1, B
stmxcsr [OldFlags]
mov edx, [OldFlags]
shufps xmm1, xmm1, $00 // Replicate B
and edx, SSE_ROUND_MASK
movaps xmm2, xmm0
or edx, SSE_ROUND_TRUNC
movaps xmm3, xmm1
mov [NewFlags], edx
divps xmm2, xmm3 // A / B
ldmxcsr [NewFlags]

cvtps2dq xmm2, xmm2
cvtdq2ps xmm2, xmm2 // Trunc(A / B)
mulps xmm2, xmm1
subps xmm0, xmm2 // A - (B * Trunc(A / B))

// Restore rounding mode
ldmxcsr [OldFlags]

movhlps xmm1, xmm0
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
2583
function FMod(const A, B: TVector3): TVector3;
{ Component-wise FMod: each element is A - B * Trunc(A / B).
Delegates to the scalar FMod overload of this unit. }
var
  ModX, ModY, ModZ: Single;
begin
  ModX := Neslib.FastMath.FMod(A.X, B.X);
  ModY := Neslib.FastMath.FMod(A.Y, B.Y);
  ModZ := Neslib.FastMath.FMod(A.Z, B.Z);
  Result.Init(ModX, ModY, ModZ);
end;
2588{function FMod(const A, B: TVector3): TVector3; assembler;
2589var
2590OldFlags, NewFlags: UInt32;
2591asm
2592// Set rounding mode to Truncate
2593movq xmm0, [A]
2594movss xmm1, DWORD [A+8]
2595movlhps xmm0, xmm1
2596stmxcsr [OldFlags]
2597movq xmm1, [B]
2598movss xmm2, DWORD [B+8]
2599movlhps xmm1, xmm2
2600mov edx, [OldFlags]
2601movaps xmm2, xmm0
2602and edx, SSE_ROUND_MASK
2603movaps xmm3, xmm1
2604or edx, SSE_ROUND_TRUNC
2605divps xmm2, xmm3 // A / B
2606mov [NewFlags], edx
2607ldmxcsr [NewFlags]
2608
2609cvtps2dq xmm2, xmm2
2610cvtdq2ps xmm2, xmm2 // Trunc(A / B)
2611mulps xmm2, xmm1
2612subps xmm0, xmm2 // A - (B * Trunc(A / B))
2613
2614// Restore rounding mode
2615ldmxcsr [OldFlags]
2616
2617movhlps xmm1, xmm0
2618movq [Result], xmm0
2619movss DWORD [Result+8], xmm1
2620end;}
2621
function FMod(const A: TVector4; const B: Single): TVector4; assembler;
{ Component-wise FMod against a scalar divisor:
Result = A - (B * Trunc(A / B)), using a temporary SSE truncate mode. }
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
movups xmm0, [A]
movss xmm1, B
stmxcsr [OldFlags]
mov edx, [OldFlags]
shufps xmm1, xmm1, $00 // Replicate B
and edx, SSE_ROUND_MASK
movaps xmm2, xmm0
or edx, SSE_ROUND_TRUNC
movaps xmm3, xmm1
mov [NewFlags], edx
divps xmm2, xmm3 // A / B
ldmxcsr [NewFlags]

cvtps2dq xmm2, xmm2
cvtdq2ps xmm2, xmm2 // Trunc(A / B)
mulps xmm2, xmm1
subps xmm0, xmm2 // A - (B * Trunc(A / B))

// Restore rounding mode
ldmxcsr [OldFlags]

movups [Result], xmm0
end;
2650
function FMod(const A, B: TVector4): TVector4; assembler;
{ Component-wise FMod: Result = A - (B * Trunc(A / B)), using a temporary
SSE truncate rounding mode. }
var
OldFlags, NewFlags: UInt32;
asm
// Set rounding mode to Truncate
movups xmm0, [A]
stmxcsr [OldFlags]
movups xmm1, [B]
mov edx, [OldFlags]
movaps xmm2, xmm0
and edx, SSE_ROUND_MASK
movaps xmm3, xmm1
or edx, SSE_ROUND_TRUNC
divps xmm2, xmm3 // A / B
mov [NewFlags], edx
ldmxcsr [NewFlags]

cvtps2dq xmm2, xmm2
cvtdq2ps xmm2, xmm2 // Trunc(A / B)
mulps xmm2, xmm1
subps xmm0, xmm2 // A - (B * Trunc(A / B))

// Restore rounding mode
ldmxcsr [OldFlags]

movups [Result], xmm0
end;
2678
function ModF(const A: Single; out B: Integer): Single;
{ Splits A into an integer part (returned in B, truncated toward zero) and
a fractional part (the function result). }
begin
  Result := Frac(A);
  B := Trunc(A);
end;
2684{function ModF(const A: Single; out B: Integer): Single; assembler;
2685var
2686OldFlags, NewFlags: UInt32;
2687asm
2688// Set rounding mode to Truncate
2689stmxcsr [OldFlags]
2690mov ecx, [OldFlags]
2691and ecx, SSE_ROUND_MASK
2692or ecx, SSE_ROUND_TRUNC
2693mov [NewFlags], ecx
2694ldmxcsr [NewFlags]
2695
2696movss xmm1, xmm0
2697cvtps2dq xmm0, xmm0
2698movss [B], xmm0 // B = Trunc(A)
2699cvtdq2ps xmm0, xmm0
2700subss xmm1, xmm0 // A - Trunc(A)
2701
2702// Restore rounding mode
2703ldmxcsr [OldFlags]
2704
2705movss xmm0, xmm1
2706end;}
2707
function ModF(const A: TVector2; out B: TIVector2): TVector2; assembler;
{ Component-wise ModF: stores Trunc(A) in B and returns A - Trunc(A),
using a temporary SSE truncate rounding mode. }
var
OldFlags, NewFlags: UInt32;
asm
{$IF RTLVersion >= 33}
movlps xmm0, [A]
{$ELSE}
movq xmm0, A
{$ENDIF}

// Set rounding mode to Truncate
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_TRUNC
mov [NewFlags], eax
ldmxcsr [NewFlags]

movaps xmm1, xmm0 // keep original A for the subtraction below
cvtps2dq xmm0, xmm0
movlps [B], xmm0 // B = Trunc(A)
cvtdq2ps xmm0, xmm0
subps xmm1, xmm0 // A - Trunc(A)

// Restore rounding mode
ldmxcsr [OldFlags]

movq rax, xmm1 // 8-byte record result is returned in RAX
end;
2737
function ModF(const A: TVector3; out B: TIVector3): TVector3; assembler;
{ Component-wise ModF: stores Trunc(A) in B and returns A - Trunc(A),
using a temporary SSE truncate rounding mode. }
var
OldFlags, NewFlags: UInt32;
asm
movq xmm0, [A] // load X,Y
movss xmm1, DWORD [A+8] // load Z
movlhps xmm0, xmm1 // pack X,Y,Z into one XMM register

// Set rounding mode to Truncate
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_TRUNC
mov [NewFlags], eax
ldmxcsr [NewFlags]

movaps xmm1, xmm0 // keep original A for the subtraction below
cvtps2dq xmm0, xmm0
movhlps xmm2, xmm0
movq [B], xmm0 // B = Trunc(A)
movd DWORD [B+8], xmm2
cvtdq2ps xmm0, xmm0
subps xmm1, xmm0 // A - Trunc(A)

// Restore rounding mode
ldmxcsr [OldFlags]

movhlps xmm0, xmm1
movq [Result], xmm1
movss DWORD [Result+8], xmm0
end;
2769
function ModF(const A: TVector4; out B: TIVector4): TVector4; assembler;
{ Component-wise ModF: stores Trunc(A) in B and returns A - Trunc(A),
using a temporary SSE truncate rounding mode. }
var
OldFlags, NewFlags: UInt32;
asm
movups xmm0, [A]

// Set rounding mode to Truncate
stmxcsr [OldFlags]
mov eax, [OldFlags]
and eax, SSE_ROUND_MASK
or eax, SSE_ROUND_TRUNC
mov [NewFlags], eax
ldmxcsr [NewFlags]

movaps xmm1, xmm0 // keep original A for the subtraction below
cvtps2dq xmm0, xmm0
movups [B], xmm0 // B = Trunc(A)
cvtdq2ps xmm0, xmm0
subps xmm1, xmm0 // A - Trunc(A)

// Restore rounding mode
ldmxcsr [OldFlags]

movups [Result], xmm1
end;
2795
function Min(const A: TVector2; const B: Single): TVector2; assembler;
{ Component-wise minimum of a 2D vector and a scalar.
B arrives in XMM1 (second argument slot); it is broadcast to both lanes. }
asm
{$IF RTLVersion >= 33}
movlps xmm0, [A]
{$ELSE}
movq xmm0, A
{$ENDIF}
shufps xmm1, xmm1, $00 // Replicate B
minps xmm0, xmm1
movq rax, xmm0 // 8-byte record result is returned in RAX
end;
2807
function Min(const A, B: TVector2): TVector2; assembler;
{ Component-wise minimum of two 2D vectors. }
asm
{$IF RTLVersion >= 33}
movlps xmm0, [A]
movlps xmm1, [B]
{$ELSE}
movq xmm0, A
movq xmm1, B
{$ENDIF}
minps xmm0, xmm1
movq rax, xmm0 // 8-byte record result is returned in RAX
end;
2820
function Min(const A: TVector3; const B: Single): TVector3; assembler;
{ Component-wise minimum of a 3D vector and a scalar.
B arrives in XMM2 (third argument slot, after the hidden Result pointer
and A's address); it is broadcast to all lanes. }
asm
movq xmm0, [A] // load X,Y
movss xmm1, DWORD [A+8] // load Z
movlhps xmm0, xmm1 // pack X,Y,Z into one XMM register
shufps xmm2, xmm2, $00 // Replicate B
minps xmm0, xmm2
movhlps xmm1, xmm0
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
2832
function Min(const A, B: TVector3): TVector3; assembler;
{ Component-wise minimum of two 3D vectors. }
asm
movq xmm0, [A] // load A.X,A.Y
movss xmm1, DWORD [A+8] // load A.Z
movlhps xmm0, xmm1 // pack A into one XMM register
movq xmm1, [B]
movss xmm2, DWORD [B+8]
movlhps xmm1, xmm2 // pack B into one XMM register
minps xmm0, xmm1
movhlps xmm1, xmm0
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
2846
function Min(const A: TVector4; const B: Single): TVector4; assembler;
{ Component-wise minimum of a 4D vector and a scalar.
B arrives in XMM2 (third argument slot); it is broadcast to all lanes. }
asm
movups xmm0, [A]
shufps xmm2, xmm2, $00 // Replicate B
minps xmm0, xmm2
movups [Result], xmm0
end;
2854
function Min(const A, B: TVector4): TVector4; assembler;
{ Component-wise minimum of two 4D vectors. }
asm
movups xmm0, [A]
movups xmm1, [B]
minps xmm0, xmm1
movups [Result], xmm0
end;
2862
function Max(const A: TVector2; const B: Single): TVector2; assembler;
{ Component-wise maximum of a 2D vector and a scalar.
B arrives in XMM1 (second argument slot); it is broadcast to both lanes. }
asm
{$IF RTLVersion >= 33}
movlps xmm0, [A]
{$ELSE}
movq xmm0, A
{$ENDIF}
shufps xmm1, xmm1, $00 // Replicate B
maxps xmm0, xmm1
movq rax, xmm0 // 8-byte record result is returned in RAX
end;
2874
function Max(const A, B: TVector2): TVector2; assembler;
{ Component-wise maximum of two 2D vectors. }
asm
{$IF RTLVersion >= 33}
movlps xmm0, [A]
movlps xmm1, [B]
{$ELSE}
movq xmm0, A
movq xmm1, B
{$ENDIF}
maxps xmm0, xmm1
movq rax, xmm0 // 8-byte record result is returned in RAX
end;
2887
function Max(const A: TVector3; const B: Single): TVector3; assembler;
{ Component-wise maximum of a 3D vector and a scalar.
B arrives in XMM2 (third argument slot, after the hidden Result pointer
and A's address); it is broadcast to all lanes. }
asm
movq xmm0, [A] // load X,Y
movss xmm1, DWORD [A+8] // load Z
movlhps xmm0, xmm1 // pack X,Y,Z into one XMM register
shufps xmm2, xmm2, $00 // Replicate B
maxps xmm0, xmm2
movhlps xmm1, xmm0
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
2899
function Max(const A, B: TVector3): TVector3; assembler;
{ Component-wise maximum of two 3D vectors. }
asm
movq xmm0, [A] // load A.X,A.Y
movss xmm1, DWORD [A+8] // load A.Z
movlhps xmm0, xmm1 // pack A into one XMM register
movq xmm1, [B]
movss xmm2, DWORD [B+8]
movlhps xmm1, xmm2 // pack B into one XMM register
maxps xmm0, xmm1
movhlps xmm1, xmm0
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
2913
function Max(const A: TVector4; const B: Single): TVector4; assembler;
{ Component-wise maximum of a 4D vector and a scalar.
B arrives in XMM2 (third argument slot); it is broadcast to all lanes. }
asm
movups xmm0, [A]
shufps xmm2, xmm2, $00 // Replicate B
maxps xmm0, xmm2
movups [Result], xmm0
end;
2921
function Max(const A, B: TVector4): TVector4; assembler;
{ Component-wise maximum of two 4D vectors. }
asm
movups xmm0, [A]
movups xmm1, [B]
maxps xmm0, xmm1
movups [Result], xmm0
end;
2929
function EnsureRange(const A, AMin, AMax: Single): Single; assembler;
{ Clamps A into [AMin, AMax]: Result = Min(Max(A, AMin), AMax).
A, AMin, AMax arrive in XMM0..XMM2; the result is returned in XMM0. }
asm
maxss xmm0, xmm1
minss xmm0, xmm2
end;
2935
function EnsureRange(const A: TVector2; const AMin, AMax: Single): TVector2; assembler;
{ Clamps each component of A into the scalar range [AMin, AMax]. }
asm
{$IF RTLVersion >= 33}
movlps xmm0, [A]
{$ELSE}
movq xmm0, A
{$ENDIF}
shufps xmm1, xmm1, $00 // Replicate AMin
shufps xmm2, xmm2, $00 // Replicate AMax
maxps xmm0, xmm1
minps xmm0, xmm2
movq rax, xmm0 // 8-byte record result is returned in RAX
end;
2949
function EnsureRange(const A, AMin, AMax: TVector2): TVector2; assembler;
{ Clamps each component of A into the per-component range [AMin, AMax]. }
asm
{$IF RTLVersion >= 33}
movlps xmm0, [A]
movlps xmm1, [AMin]
movlps xmm2, [AMax]
{$ELSE}
movq xmm0, A
movq xmm1, AMin
movq xmm2, AMax
{$ENDIF}
maxps xmm0, xmm1
minps xmm0, xmm2
movq rax, xmm0 // 8-byte record result is returned in RAX
end;
2965
function EnsureRange(const A: TVector3; const AMin, AMax: Single): TVector3; assembler;
{ Clamps each component of A into the scalar range [AMin, AMax].
AMin/AMax arrive in XMM2/XMM3 (slots after the hidden Result pointer and
A's address). }
asm
movq xmm0, [A] // load X,Y
movss xmm1, DWORD [A+8] // load Z
movlhps xmm0, xmm1 // pack X,Y,Z into one XMM register
shufps xmm2, xmm2, $00 // Replicate AMin
shufps xmm3, xmm3, $00 // Replicate AMax
maxps xmm0, xmm2
minps xmm0, xmm3
movhlps xmm1, xmm0
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
2979
function EnsureRange(const A, AMin, AMax: TVector3): TVector3; assembler;
{ Clamps each component of A into the per-component range [AMin, AMax]. }
asm
movq xmm0, [A] // pack A
movss xmm1, DWORD [A+8]
movlhps xmm0, xmm1
movq xmm1, [AMin] // pack AMin
movss xmm2, DWORD [AMin+8]
movlhps xmm1, xmm2
movq xmm2, [AMax] // pack AMax
movss xmm3, DWORD [AMax+8]
movlhps xmm2, xmm3
maxps xmm0, xmm1
minps xmm0, xmm2
movhlps xmm1, xmm0
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
2997
function EnsureRange(const A: TVector4; const AMin, AMax: Single): TVector4; assembler;
{ Clamps each component of A into the scalar range [AMin, AMax].
AMin/AMax arrive in XMM2/XMM3 (slots after the hidden Result pointer and
A's address). }
asm
movups xmm0, [A]
shufps xmm2, xmm2, $00 // Replicate AMin
shufps xmm3, xmm3, $00 // Replicate AMax
maxps xmm0, xmm2
minps xmm0, xmm3
movups [Result], xmm0
end;
3007
function EnsureRange(const A, AMin, AMax: TVector4): TVector4; assembler;
{ Clamps each component of A into the per-component range [AMin, AMax]. }
asm
movups xmm0, [A]
movups xmm1, [AMin]
movups xmm2, [AMax]
maxps xmm0, xmm1
minps xmm0, xmm2
movups [Result], xmm0
end;
3017
function Mix(const A, B: TVector2; const T: Single): TVector2;
{ Linear interpolation between A and B with a single factor T,
delegating to the scalar Mix overload per component. }
var
  MixedX, MixedY: Single;
begin
  MixedX := Mix(A.X, B.X, T);
  MixedY := Mix(A.Y, B.Y, T);
  Result.Init(MixedX, MixedY);
end;
3022
function Mix(const A, B, T: TVector2): TVector2;
{ Linear interpolation between A and B with per-component factors T,
delegating to the scalar Mix overload per component. }
var
  MixedX, MixedY: Single;
begin
  MixedX := Mix(A.X, B.X, T.X);
  MixedY := Mix(A.Y, B.Y, T.Y);
  Result.Init(MixedX, MixedY);
end;
3027
function Mix(const A, B: TVector3; const T: Single): TVector3; assembler;
{ Component-wise linear interpolation: Result = A + T * (B - A).
T arrives in XMM3 (fourth argument slot) and is broadcast to all lanes. }
asm
movq xmm0, [A] // pack A
movss xmm1, DWORD [A+8]
movlhps xmm0, xmm1
movq xmm1, [B] // pack B
movss xmm2, DWORD [B+8]
movlhps xmm1, xmm2
shufps xmm3, xmm3, $00 // Replicate T
subps xmm1, xmm0
mulps xmm1, xmm3
addps xmm0, xmm1 // A + (T * (B - A))
movhlps xmm1, xmm0
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
3044
function Mix(const A, B, T: TVector3): TVector3; assembler;
{ Component-wise linear interpolation with per-component factors:
Result = A + T * (B - A). }
asm
movq xmm0, [A] // pack A
movss xmm1, DWORD [A+8]
movlhps xmm0, xmm1
movq xmm1, [B] // pack B
movss xmm2, DWORD [B+8]
movlhps xmm1, xmm2
movq xmm2, [T] // pack T
movss xmm3, DWORD [T+8]
movlhps xmm2, xmm3
subps xmm1, xmm0
mulps xmm1, xmm2
addps xmm0, xmm1 // A + (T * (B - A))
movhlps xmm1, xmm0
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
3063
function Mix(const A, B: TVector4; const T: Single): TVector4; assembler;
{ Component-wise linear interpolation: Result = A + T * (B - A).
T arrives in XMM3 (fourth argument slot) and is broadcast to all lanes. }
asm
movups xmm0, [A]
movups xmm1, [B]
shufps xmm3, xmm3, $00 // Replicate T
subps xmm1, xmm0
mulps xmm1, xmm3
addps xmm0, xmm1 // A + (T * (B - A))
movups [Result], xmm0
end;
3074
function Mix(const A, B, T: TVector4): TVector4; assembler;
{ Component-wise linear interpolation with per-component factors:
Result = A + T * (B - A). }
asm
movups xmm0, [A]
movups xmm1, [B]
movups xmm2, [T]
subps xmm1, xmm0
mulps xmm1, xmm2
addps xmm0, xmm1 // A + (T * (B - A))
movups [Result], xmm0
end;
3085
function Step(const AEdge: Single; const A: TVector2): TVector2; assembler;
{ GLSL-style step: 0.0 where A < AEdge, 1.0 where A >= AEdge, per component.
AEdge arrives in XMM0 and is broadcast; the comparison mask selects 1.0. }
asm
{$IF RTLVersion >= 33}
movlps xmm1, [A]
{$ELSE}
movq xmm1, A
{$ENDIF}
shufps xmm0, xmm0, $00 // Replicate AEdge
movlps xmm2, QWORD [SSE_ONE]
cmpnltps xmm1, xmm0 // (A >= AEdge)? Yes: $FFFFFFFF, No: $00000000
andps xmm1, xmm2 // (A >= AEdge)? Yes: 1, No: 0
movq rax, xmm1 // 8-byte record result is returned in RAX
end;
3099
function Step(const AEdge, A: TVector2): TVector2; assembler;
{ GLSL-style step with per-component edges: 0.0 where A < AEdge, else 1.0. }
asm
{$IF RTLVersion >= 33}
movlps xmm0, [AEdge]
movlps xmm1, [A]
{$ELSE}
movq xmm0, AEdge
movq xmm1, A
{$ENDIF}
movlps xmm2, QWORD [SSE_ONE]
cmpnltps xmm1, xmm0 // (A >= AEdge)? Yes: $FFFFFFFF, No: $00000000
andps xmm1, xmm2 // (A >= AEdge)? Yes: 1, No: 0
movq rax, xmm1 // 8-byte record result is returned in RAX
end;
3114
function Step(const AEdge: Single; const A: TVector3): TVector3; assembler;
{ GLSL-style step: 0.0 where A < AEdge, 1.0 where A >= AEdge, per component.
AEdge arrives in XMM1 (slot after the hidden Result pointer) and is
broadcast to all lanes. }
asm
movq xmm0, [A] // load X,Y
movss xmm2, DWORD [A+8] // load Z
movlhps xmm0, xmm2 // pack X,Y,Z into one XMM register
shufps xmm1, xmm1, $00 // Replicate AEdge
movups xmm2, [SSE_ONE]
cmpnltps xmm0, xmm1 // (A >= AEdge)? Yes: $FFFFFFFF, No: $00000000
andps xmm0, xmm2 // (A >= AEdge)? Yes: 1, No: 0
movhlps xmm1, xmm0
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
3128
function Step(const AEdge, A: TVector3): TVector3; assembler;
{ GLSL-style step with per-component edges: 0.0 where A < AEdge, else 1.0. }
asm
movq xmm0, [AEdge] // pack AEdge
movss xmm1, DWORD [AEdge+8]
movlhps xmm0, xmm1
movq xmm1, [A] // pack A
movss xmm2, DWORD [A+8]
movlhps xmm1, xmm2
movups xmm2, [SSE_ONE]
cmpnltps xmm1, xmm0 // (A >= AEdge)? Yes: $FFFFFFFF, No: $00000000
andps xmm1, xmm2 // (A >= AEdge)? Yes: 1, No: 0
movhlps xmm0, xmm1
movq [Result], xmm1
movss DWORD [Result+8], xmm0
end;
3144
function Step(const AEdge: Single; const A: TVector4): TVector4; assembler;
{ GLSL-style step: 0.0 where A < AEdge, 1.0 where A >= AEdge, per component.
AEdge arrives in XMM1 (slot after the hidden Result pointer) and is
broadcast to all lanes. }
asm
movups xmm0, [A]
shufps xmm1, xmm1, $00 // Replicate AEdge
movups xmm2, [SSE_ONE]
cmpnltps xmm0, xmm1 // (A >= AEdge)? Yes: $FFFFFFFF, No: $00000000
andps xmm0, xmm2 // (A >= AEdge)? Yes: 1, No: 0
movups [Result], xmm0
end;
3154
function Step(const AEdge, A: TVector4): TVector4; assembler;
{ GLSL-style step with per-component edges: 0.0 where A < AEdge, else 1.0.
Fix: added the missing "assembler" directive for consistency with every
other Step overload in this unit (the body is pure asm). }
asm
movups xmm0, [AEdge]
movups xmm1, [A]
movups xmm2, [SSE_ONE]
cmpnltps xmm1, xmm0 // (A >= AEdge)? Yes: $FFFFFFFF, No: $00000000
andps xmm1, xmm2 // (A >= AEdge)? Yes: 1, No: 0
movups [Result], xmm1
end;
3164
function SmoothStep(const AEdge0, AEdge1: Single; const A: TVector2): TVector2;
{ Hermite smooth step between scalar edges, applied per component,
delegating to the scalar SmoothStep overload. }
var
  StepX, StepY: Single;
begin
  StepX := SmoothStep(AEdge0, AEdge1, A.X);
  StepY := SmoothStep(AEdge0, AEdge1, A.Y);
  Result.Init(StepX, StepY);
end;
3169
function SmoothStep(const AEdge0, AEdge1, A: TVector2): TVector2;
{ Hermite smooth step with per-component edges, delegating to the scalar
SmoothStep overload. }
var
  StepX, StepY: Single;
begin
  StepX := SmoothStep(AEdge0.X, AEdge1.X, A.X);
  StepY := SmoothStep(AEdge0.Y, AEdge1.Y, A.Y);
  Result.Init(StepX, StepY);
end;
3174
function SmoothStep(const AEdge0, AEdge1: Single; const A: TVector3): TVector3;
{ Hermite smooth step between scalar edges, applied per component,
delegating to the scalar SmoothStep overload. }
var
  StepX, StepY, StepZ: Single;
begin
  StepX := SmoothStep(AEdge0, AEdge1, A.X);
  StepY := SmoothStep(AEdge0, AEdge1, A.Y);
  StepZ := SmoothStep(AEdge0, AEdge1, A.Z);
  Result.Init(StepX, StepY, StepZ);
end;
3179{function SmoothStep(const AEdge0, AEdge1: Single; const A: TVector3): TVector3; assembler;
3180asm
3181movdqa [rsp-24], xmm6
3182movdqa [rsp-40], xmm7
3183
3184movq xmm0, [A]
3185movss xmm3, DWORD [A+8]
3186movlhps xmm0, xmm3
3187shufps xmm1, xmm1, $00 // Replicate AEdge0
3188shufps xmm2, xmm2, $00 // Replicate AEdge1
3189movaps xmm3, xmm0
3190movaps xmm4, xmm0
3191movaps xmm5, xmm0
3192movups xmm6, [SSE_ONE]
3193
3194cmpnltps xmm3, xmm1 // (A >= AEdge0)? Yes: $FFFFFFFF, No: $00000000
3195cmpleps xmm4, xmm2 // (A <= AEdge1)? Yes: $FFFFFFFF, No: $00000000
3196subps xmm2, xmm1
3197movaps xmm5, xmm4
3198subps xmm0, xmm1
3199andnps xmm5, xmm6 // (A > AEdge1)? Yes: 1.0, No: 0.0
3200
3201movups xmm6, [SSE_TWO]
3202divps xmm0, xmm2 // Temp := (A - AEdge0) / (AEdge1 - AEdge0)
3203movups xmm7, [SSE_THREE]
3204mulps xmm6, xmm0 // 2 * Temp
3205subps xmm7, xmm6 // 3 - (2 * Temp)
3206mulps xmm7, xmm0
3207mulps xmm7, xmm0 // Result := Temp * Temp * (3 - (2 * Temp))
3208andps xmm7, xmm3 // (A < AEdge0)? Yes: 0, No: Result
3209andps xmm7, xmm4 // (A > AEdge1)? Yes: 0, No: Result
3210orps xmm7, xmm5 // (A > AEdge1)? Yes: 1, No: Result
3211
3212movhlps xmm6, xmm7
3213movq [Result], xmm7
3214movss DWORD [Result+8], xmm6
3215
3216movdqa xmm6, [rsp-24]
3217movdqa xmm7, [rsp-40]
3218end;}
3219
{ Per-component SmoothStep of TVector3 A with per-component edge vectors:
  0 below AEdge0, 1 above AEdge1, T*T*(3 - 2*T) in between, where
  T = (A - AEdge0) / (AEdge1 - AEdge0). }
function SmoothStep(const AEdge0, AEdge1, A: TVector3): TVector3; assembler;
asm
// Preserve non-volatile XMM6/XMM7. RSP ends in 8 at entry (see unit header),
// so RSP-24 and RSP-40 are 16-byte aligned as MOVDQA requires.
// NOTE(review): these stores are below RSP without adjusting it; Win64 has
// no guaranteed red zone - TODO confirm this is safe in this codebase.
movdqa [rsp-24], xmm6
movdqa [rsp-40], xmm7

// Load the three TVector3 parameters (3 singles each) into full XMM regs
movq xmm2, [A]
movss xmm3, DWORD [A+8]
movlhps xmm2, xmm3
movq xmm0, [AEdge0]
movss xmm1, DWORD [AEdge0+8]
movlhps xmm0, xmm1
movq xmm1, [AEdge1]
movss xmm3, DWORD [AEdge1+8]
movlhps xmm1, xmm3

movaps xmm3, xmm2
movaps xmm4, xmm2
movaps xmm5, xmm2
movups xmm6, [SSE_ONE]

cmpnltps xmm3, xmm0 // (A >= AEdge0)? Yes: $FFFFFFFF, No: $00000000
cmpleps xmm4, xmm1 // (A <= AEdge1)? Yes: $FFFFFFFF, No: $00000000
subps xmm1, xmm0
movaps xmm5, xmm4
subps xmm2, xmm0
andnps xmm5, xmm6 // (A > AEdge1)? Yes: 1.0, No: 0.0

movups xmm6, [SSE_TWO]
divps xmm2, xmm1 // Temp := (A - AEdge0) / (AEdge1 - AEdge0)
movups xmm7, [SSE_THREE]
mulps xmm6, xmm2 // 2 * Temp
subps xmm7, xmm6 // 3 - (2 * Temp)
mulps xmm7, xmm2
mulps xmm7, xmm2 // Result := Temp * Temp * (3 - (2 * Temp))
andps xmm7, xmm3 // (A < AEdge0)? Yes: 0, No: Result
andps xmm7, xmm4 // (A > AEdge1)? Yes: 0, No: Result
orps xmm7, xmm5 // (A > AEdge1)? Yes: 1, No: Result

// Store 3 singles into the TVector3 result
movhlps xmm6, xmm7
movq [Result], xmm7
movss DWORD [Result+8], xmm6

// Restore non-volatile registers
movdqa xmm6, [rsp-24]
movdqa xmm7, [rsp-40]
end;
3265
{ Per-component SmoothStep of TVector4 A between scalar edges AEdge0 (in
  XMM1) and AEdge1 (in XMM2): 0 below AEdge0, 1 above AEdge1, otherwise
  T*T*(3 - 2*T) with T = (A - AEdge0) / (AEdge1 - AEdge0). }
function SmoothStep(const AEdge0, AEdge1: Single; const A: TVector4): TVector4;
asm
// Preserve non-volatile XMM6/XMM7 (RSP-24/-40 are 16-byte aligned here).
// NOTE(review): stores below RSP without adjusting it; Win64 has no
// guaranteed red zone - TODO confirm this is safe in this codebase.
movdqa [rsp-24], xmm6
movdqa [rsp-40], xmm7

movups xmm0, [A]
shufps xmm1, xmm1, $00 // Replicate AEdge0
shufps xmm2, xmm2, $00 // Replicate AEdge1
movaps xmm3, xmm0
movaps xmm4, xmm0
movaps xmm5, xmm0
movups xmm6, [SSE_ONE]

cmpnltps xmm3, xmm1 // (A >= AEdge0)? Yes: $FFFFFFFF, No: $00000000
cmpleps xmm4, xmm2 // (A <= AEdge1)? Yes: $FFFFFFFF, No: $00000000
subps xmm2, xmm1
movaps xmm5, xmm4
subps xmm0, xmm1
andnps xmm5, xmm6 // (A > AEdge1)? Yes: 1.0, No: 0.0

movups xmm6, [SSE_TWO]
divps xmm0, xmm2 // Temp := (A - AEdge0) / (AEdge1 - AEdge0)
movups xmm7, [SSE_THREE]
mulps xmm6, xmm0 // 2 * Temp
subps xmm7, xmm6 // 3 - (2 * Temp)
mulps xmm7, xmm0
mulps xmm7, xmm0 // Result := Temp * Temp * (3 - (2 * Temp))
andps xmm7, xmm3 // (A < AEdge0)? Yes: 0, No: Result
andps xmm7, xmm4 // (A > AEdge1)? Yes: 0, No: Result
orps xmm7, xmm5 // (A > AEdge1)? Yes: 1, No: Result

movups [Result], xmm7

// Restore non-volatile registers
movdqa xmm6, [rsp-24]
movdqa xmm7, [rsp-40]
end;
3302
{ Per-component SmoothStep of TVector4 A with per-component edge vectors:
  0 below AEdge0, 1 above AEdge1, T*T*(3 - 2*T) in between. }
function SmoothStep(const AEdge0, AEdge1, A: TVector4): TVector4; assembler;
asm
// Preserve non-volatile XMM6/XMM7 (RSP-24/-40 are 16-byte aligned here).
// NOTE(review): stores below RSP without adjusting it; Win64 has no
// guaranteed red zone - TODO confirm this is safe in this codebase.
movdqa [rsp-24], xmm6
movdqa [rsp-40], xmm7

movups xmm2, [A]
movups xmm0, [AEdge0]
movups xmm1, [AEdge1]
movaps xmm3, xmm2
movaps xmm4, xmm2
movaps xmm5, xmm2
movups xmm6, [SSE_ONE]

cmpnltps xmm3, xmm0 // (A >= AEdge0)? Yes: $FFFFFFFF, No: $00000000
cmpleps xmm4, xmm1 // (A <= AEdge1)? Yes: $FFFFFFFF, No: $00000000
subps xmm1, xmm0
movaps xmm5, xmm4
subps xmm2, xmm0
andnps xmm5, xmm6 // (A > AEdge1)? Yes: 1.0, No: 0.0

movups xmm6, [SSE_TWO]
divps xmm2, xmm1 // Temp := (A - AEdge0) / (AEdge1 - AEdge0)
movups xmm7, [SSE_THREE]
mulps xmm6, xmm2 // 2 * Temp
subps xmm7, xmm6 // 3 - (2 * Temp)
mulps xmm7, xmm2
mulps xmm7, xmm2 // Result := Temp * Temp * (3 - (2 * Temp))
andps xmm7, xmm3 // (A < AEdge0)? Yes: 0, No: Result
andps xmm7, xmm4 // (A > AEdge1)? Yes: 0, No: Result
orps xmm7, xmm5 // (A > AEdge1)? Yes: 1, No: Result

movups [Result], xmm7

// Restore non-volatile registers
movdqa xmm6, [rsp-24]
movdqa xmm7, [rsp-40]
end;
3339
{ Computes A * B + C per component. Note: implemented as separate MULPS and
  ADDPS, so the intermediate product is rounded - this is NOT a single-
  rounding fused multiply-add despite the name. }
function FMA(const A, B, C: TVector2): TVector2; assembler;
asm
{$IF RTLVersion >= 33}
// Rio+: 8-byte records are passed by reference (see unit header)
movlps xmm0, [A]
movlps xmm1, [B]
movlps xmm2, [C]
{$ELSE}
// Pre-Rio: 8-byte records arrive by value in the integer registers
movq xmm0, A
movq xmm1, B
movq xmm2, C
{$ENDIF}
mulps xmm0, xmm1
addps xmm0, xmm2
movq rax, xmm0 // 8-byte record result is returned in RAX
end;
3355
{ Computes A * B + C per component. Note: implemented as separate MULPS and
  ADDPS, so the intermediate product is rounded - this is NOT a single-
  rounding fused multiply-add despite the name. }
function FMA(const A, B, C: TVector3): TVector3; assembler;
asm
// Load each 12-byte TVector3 as qword + dword to avoid reading out of bounds
movq xmm0, [A]
movss xmm1, DWORD [A+8]
movlhps xmm0, xmm1
movq xmm1, [B]
movss xmm2, DWORD [B+8]
movlhps xmm1, xmm2
movq xmm2, [C]
movss xmm3, DWORD [C+8]
movlhps xmm2, xmm3
mulps xmm0, xmm1
addps xmm0, xmm2
// Store 3 singles into the result
movhlps xmm1, xmm0
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
3373
{ Computes A * B + C per component. Note: implemented as separate MULPS and
  ADDPS, so the intermediate product is rounded - this is NOT a single-
  rounding fused multiply-add despite the name. }
function FMA(const A, B, C: TVector4): TVector4; assembler;
asm
movups xmm0, [A]
movups xmm1, [B]
movups xmm2, [C]
mulps xmm0, xmm1
addps xmm0, xmm2
movups [Result], xmm0
end;
3383
3384{ Matrix functions }
3385
3386{$IFDEF FM_COLUMN_MAJOR}
{ Outer product M = C * R^T of two 2D vectors, stored column-major:
  element [i,j] = C[i] * R[j], columns stored consecutively. }
function OuterProduct(const C, R: TVector2): TMatrix2; assembler;
asm
{$IF RTLVersion >= 33}
movlps xmm0, [R] // # # R.Y R.X
movlps xmm1, [C] // # # C.Y C.X
{$ELSE}
movq xmm0, R // # # R.Y R.X
movq xmm1, C // # # C.Y C.X
{$ENDIF}

shufps xmm0, xmm0, $50 // R.Y R.Y R.X R.X
shufps xmm1, xmm1, $44 // C.Y C.X C.Y C.X

mulps xmm1, xmm0 // (C.Y*R.Y) (C.X*R.Y) (C.Y*R.X) (C.X*R.X)

// Store as matrix (column 0 = C*R.X, column 1 = C*R.Y)
movups [Result], xmm1
end;
3405
{ Outer product M = C * R^T of two 3D vectors, stored column-major:
  column j = C * R[j]. }
function OuterProduct(const C, R: TVector3): TMatrix3; assembler;
asm
movq xmm0, [C]
movss xmm1, DWORD [C+8]
movlhps xmm0, xmm1 // xmm0 = C
movq xmm1, [R]
movss xmm2, DWORD [R+8]
movlhps xmm1, xmm2 // xmm1 = R
movaps xmm2, xmm1
movaps xmm3, xmm1

shufps xmm1, xmm1, $00 // R.X (4x)
shufps xmm2, xmm2, $55 // R.Y (4x)
shufps xmm3, xmm3, $AA // R.Z (4x)

mulps xmm1, xmm0 // C * R.X (column 0)
mulps xmm2, xmm0 // C * R.Y (column 1)
mulps xmm3, xmm0 // C * R.Z (column 2)

// Store as matrix: each column written as qword + dword (3 singles)
movhlps xmm0, xmm1
movhlps xmm4, xmm2
movhlps xmm5, xmm3
movq [Result+$00], xmm1
movss DWORD [Result+$08], xmm0
movq [Result+$0C], xmm2
movss DWORD [Result+$14], xmm4
movq [Result+$18], xmm3
movss DWORD [Result+$20], xmm5
end;
3436
{ Outer product M = C * R^T of two 4D vectors, stored column-major:
  column j = C * R[j]. }
function OuterProduct(const C, R: TVector4): TMatrix4; assembler;
asm
movups xmm0, [C] // xmm0 = C
movups xmm1, [R] // xmm1 = R
movaps xmm2, xmm1
movaps xmm3, xmm1
movaps xmm4, xmm1

shufps xmm1, xmm1, $00 // R.X (4x)
shufps xmm2, xmm2, $55 // R.Y (4x)
shufps xmm3, xmm3, $AA // R.Z (4x)
shufps xmm4, xmm4, $FF // R.W (4x)

mulps xmm1, xmm0 // C * R.X (column 0)
mulps xmm2, xmm0 // C * R.Y (column 1)
mulps xmm3, xmm0 // C * R.Z (column 2)
mulps xmm4, xmm0 // C * R.W (column 3)

// Store as matrix
movups DQWORD [Result + $00], xmm1
movups DQWORD [Result + $10], xmm2
movups DQWORD [Result + $20], xmm3
movups DQWORD [Result + $30], xmm4
end;
3461{$ELSE}
{ Outer product M = C * R^T of two 2D vectors, stored row-major:
  element [i,j] = C[i] * R[j], rows stored consecutively. }
function OuterProduct(const C, R: TVector2): TMatrix2; assembler;
asm
{$IF RTLVersion >= 33}
movlps xmm0, [C] // # # C.Y C.X
movlps xmm1, [R] // # # R.Y R.X
{$ELSE}
movq xmm0, C // # # C.Y C.X
movq xmm1, R // # # R.Y R.X
{$ENDIF}

shufps xmm0, xmm0, $50 // C.Y C.Y C.X C.X
shufps xmm1, xmm1, $44 // R.Y R.X R.Y R.X

mulps xmm1, xmm0 // (C.Y*R.Y) (C.Y*R.X) (C.X*R.Y) (C.X*R.X)

// Store as matrix (row 0 = C.X*R, row 1 = C.Y*R)
movups [Result], xmm1
end;
3480
{ Outer product M = C * R^T of two 3D vectors, stored row-major:
  row i = C[i] * R. }
function OuterProduct(const C, R: TVector3): TMatrix3; assembler;
asm
movq xmm0, [R]
movss xmm1, DWORD [R+8]
movlhps xmm0, xmm1 // xmm0 = R
movq xmm1, [C]
movss xmm2, DWORD [C+8]
movlhps xmm1, xmm2 // xmm1 = C
movaps xmm2, xmm1
movaps xmm3, xmm1

shufps xmm1, xmm1, $00 // C.X (4x)
shufps xmm2, xmm2, $55 // C.Y (4x)
shufps xmm3, xmm3, $AA // C.Z (4x)

mulps xmm1, xmm0 // R * C.X
mulps xmm2, xmm0 // R * C.Y
mulps xmm3, xmm0 // R * C.Z

// Store as matrix: each row written as qword + dword (3 singles)
movhlps xmm0, xmm1
movhlps xmm4, xmm2
movhlps xmm5, xmm3
movq [Result+$00], xmm1
movss DWORD [Result+$08], xmm0
movq [Result+$0C], xmm2
movss DWORD [Result+$14], xmm4
movq [Result+$18], xmm3
movss DWORD [Result+$20], xmm5
end;
3511
{ Outer product M = C * R^T of two 4D vectors, stored row-major:
  row i = C[i] * R. }
function OuterProduct(const C, R: TVector4): TMatrix4; assembler;
asm
movups xmm0, [R] // xmm0 = R
movups xmm1, [C] // xmm1 = C
movaps xmm2, xmm1
movaps xmm3, xmm1
movaps xmm4, xmm1

shufps xmm1, xmm1, $00 // C.X (4x)
shufps xmm2, xmm2, $55 // C.Y (4x)
shufps xmm3, xmm3, $AA // C.Z (4x)
shufps xmm4, xmm4, $FF // C.W (4x)

mulps xmm1, xmm0 // R * C.X
mulps xmm2, xmm0 // R * C.Y
mulps xmm3, xmm0 // R * C.Z
mulps xmm4, xmm0 // R * C.W

// Store as matrix
movups DQWORD [Result + $00], xmm1
movups DQWORD [Result + $10], xmm2
movups DQWORD [Result + $20], xmm3
movups DQWORD [Result + $30], xmm4
end;
3536{$ENDIF}
3537
3538{ TVector2 }
3539
{ Adds scalar B to each component of A. }
class operator TVector2.Add(const A: TVector2; const B: Single): TVector2;
begin
Result.Init(A.X + B, A.Y + B);
end;
3545
{ Adds scalar A to each component of B. }
class operator TVector2.Add(const A: Single; const B: TVector2): TVector2;
begin
Result.Init(A + B.X, A + B.Y);
end;
3551
{ Component-wise vector addition. }
class operator TVector2.Add(const A, B: TVector2): TVector2;
begin
Result.Init(A.X + B.X, A.Y + B.Y);
end;
3557
{ Euclidean distance between Self and AOther. }
function TVector2.Distance(const AOther: TVector2): Single;
var
Delta: TVector2;
begin
Delta := Self - AOther;
Result := Delta.Length;
end;
3562
{ Squared Euclidean distance (avoids the square root). }
function TVector2.DistanceSquared(const AOther: TVector2): Single;
var
Delta: TVector2;
begin
Delta := Self - AOther;
Result := Delta.LengthSquared;
end;
3567
{ Divides each component of A by scalar B (B arrives in XMM1 and is
  replicated before the packed divide). }
class operator TVector2.Divide(const A: TVector2; const B: Single): TVector2; assembler;
asm
{$IF RTLVersion >= 33}
// Rio+: 8-byte records are passed by reference (see unit header)
movlps xmm0, [A]
{$ELSE}
movq xmm0, A
{$ENDIF}
shufps xmm1, xmm1, 0
divps xmm0, xmm1
movq rax, xmm0 // 8-byte record result is returned in RAX
end;
3579
{ Divides scalar A (in XMM0) by each component of B. }
class operator TVector2.Divide(const A: Single; const B: TVector2): TVector2; assembler;
asm
{$IF RTLVersion >= 33}
// Rio+: 8-byte records are passed by reference (see unit header)
movlps xmm1, [B]
{$ELSE}
movq xmm1, B
{$ENDIF}
shufps xmm0, xmm0, 0 // Replicate A
divps xmm0, xmm1
movq rax, xmm0 // 8-byte record result is returned in RAX
end;
3591
{ Component-wise vector division A / B. }
class operator TVector2.Divide(const A, B: TVector2): TVector2; assembler;
asm
{$IF RTLVersion >= 33}
// Rio+: 8-byte records are passed by reference (see unit header)
movlps xmm0, [A]
movlps xmm1, [B]
{$ELSE}
movq xmm0, A
movq xmm1, B
{$ENDIF}
divps xmm0, xmm1
movq rax, xmm0 // 8-byte record result is returned in RAX
end;
3604
{ Dot (inner) product of Self and AOther. }
function TVector2.Dot(const AOther: TVector2): Single;
var
Sum: Single;
begin
Sum := X * AOther.X;
Sum := Sum + (Y * AOther.Y);
Result := Sum;
end;
3609
{ Returns Self when NRef points against I (Dot(NRef, I) < 0), otherwise
  the negation of Self. }
function TVector2.FaceForward(const I, NRef: TVector2): TVector2;
var
D: Single;
begin
D := NRef.Dot(I);
if (D < 0) then
Result := Self
else
Result := -Self;
end;
3617
{ Euclidean length (magnitude) of the vector. }
function TVector2.GetLength: Single;
begin
Result := Sqrt(Sqr(X) + Sqr(Y));
end;
3622
{ Squared length (avoids the square root). }
function TVector2.GetLengthSquared: Single;
begin
Result := Sqr(X) + Sqr(Y);
end;
3627
{ Multiplies each component of A by scalar B. }
class operator TVector2.Multiply(const A: TVector2; const B: Single): TVector2;
begin
Result.Init(A.X * B, A.Y * B);
end;
3633
{ Multiplies scalar A by each component of B. }
class operator TVector2.Multiply(const A: Single; const B: TVector2): TVector2;
begin
Result.Init(A * B.X, A * B.Y);
end;
3639
{ Component-wise (Hadamard) vector multiplication. }
class operator TVector2.Multiply(const A, B: TVector2): TVector2;
begin
Result.Init(A.X * B.X, A.Y * B.Y);
end;
3645
{ Returns the vector scaled to (approximately) unit length. Uses RSQRTPS,
  an approximate reciprocal square root, trading precision for speed. }
function TVector2.NormalizeFast: TVector2; assembler;
asm
movlps xmm0, [Self] // Y X
movaps xmm2, xmm0
mulps xmm0, xmm0 // Y*Y X*X
pshufd xmm1, xmm0, $01 // X*X Y*Y
addps xmm0, xmm1 // (X*X+Y*Y) (2x)
rsqrtps xmm0, xmm0 // (1 / Sqrt(X*X + Y*Y)) (4x), approximate
mulps xmm0, xmm2 // A * (1 / Sqrt(Dot(A, A)))
movq rax, xmm0 // 8-byte record result is returned in RAX
end;
3657
{ Reflects Self around (unit) normal N: Self - 2*Dot(N, Self)*N. }
function TVector2.Reflect(const N: TVector2): TVector2;
var
D: Single;
begin
D := N.Dot(Self);
Result := Self - ((2 * D) * N);
end;
3662
{ Refracts Self against (unit) normal N with refraction index Eta.
  Returns the zero vector on total internal reflection (discriminant < 0). }
function TVector2.Refract(const N: TVector2; const Eta: Single): TVector2;
var
Cosine, Disc: Single;
begin
Cosine := N.Dot(Self);
Disc := 1 - Eta * Eta * (1 - Cosine * Cosine);
if (Disc < 0) then
Result.Init
else
Result := (Eta * Self) - ((Eta * Cosine + Sqrt(Disc)) * N);
end;
3674
{ Scales the vector in place to (approximately) unit length. Uses RSQRTPS,
  an approximate reciprocal square root, trading precision for speed. }
procedure TVector2.SetNormalizedFast; assembler;
asm
movlps xmm0, [Self] // Y X
movaps xmm2, xmm0
mulps xmm0, xmm0 // Y*Y X*X
pshufd xmm1, xmm0, $01 // X*X Y*Y
addps xmm0, xmm1 // (X*X+Y*Y) (2x)
rsqrtps xmm0, xmm0 // (1 / Sqrt(X*X + Y*Y)) (4x), approximate
mulps xmm0, xmm2 // A * (1 / Sqrt(Dot(A, A)))
movlps [Self], xmm0
end;
3686
{ Subtracts scalar B from each component of A. }
class operator TVector2.Subtract(const A: TVector2; const B: Single): TVector2;
begin
Result.Init(A.X - B, A.Y - B);
end;
3692
{ Subtracts each component of B from scalar A. }
class operator TVector2.Subtract(const A: Single; const B: TVector2): TVector2;
begin
Result.Init(A - B.X, A - B.Y);
end;
3698
{ Component-wise vector subtraction. }
class operator TVector2.Subtract(const A, B: TVector2): TVector2;
begin
Result.Init(A.X - B.X, A.Y - B.Y);
end;
3704
3705{ TVector3 }
3706
{ Adds scalar B (in XMM2) to each component of A. The Z component is
  handled separately with scalar instructions (12-byte record). }
class operator TVector3.Add(const A: TVector3; const B: Single): TVector3; assembler;
asm
movq xmm0, [A] // Load 3 floating-point values
movss xmm1, DWORD [A+8]
shufps xmm2, xmm2, 0 // Replicate B
addps xmm0, xmm2 // A + B
addss xmm1, xmm2
movq [Result], xmm0 // Store result
movss DWORD [Result+8], xmm1
end;
3717
{ Adds scalar A (in XMM1) to each component of B. }
class operator TVector3.Add(const A: Single; const B: TVector3): TVector3; assembler;
asm
movq xmm0, [B]
movss xmm2, DWORD [B+8]
shufps xmm1, xmm1, 0 // Replicate A
addps xmm0, xmm1
addss xmm2, xmm1
movq [Result], xmm0
movss DWORD [Result+8], xmm2
end;
3728
{ Component-wise vector addition; Z is handled with scalar instructions. }
class operator TVector3.Add(const A, B: TVector3): TVector3; assembler;
asm
movq xmm0, [A]
movss xmm1, DWORD [A+8]
movq xmm2, [B]
movss xmm3, DWORD [B+8]
addps xmm0, xmm2
addss xmm1, xmm3
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
3740
{ Euclidean distance between Self and AOther; result in XMM0. }
function TVector3.Distance(const AOther: TVector3): Single; assembler;
asm
// Load both 12-byte records as qword + dword (avoids out-of-bounds reads)
movq xmm0, [Self]
movss xmm1, DWORD [Self+8]
movq xmm2, [AOther]
movss xmm3, DWORD [AOther+8]
movlhps xmm0, xmm1
movlhps xmm2, xmm3
subps xmm0, xmm2 // A - B

// (A - B).Length: horizontal sum of squares, then square root
mulps xmm0, xmm0
pshufd xmm1, xmm0, $0E
addps xmm0, xmm1
pshufd xmm1, xmm0, $01
addss xmm0, xmm1
sqrtss xmm0, xmm0
end;
3759
{ Squared Euclidean distance between Self and AOther; result in XMM0. }
function TVector3.DistanceSquared(const AOther: TVector3): Single; assembler;
asm
movq xmm0, [Self]
movss xmm1, DWORD [Self+8]
movq xmm2, [AOther]
movss xmm3, DWORD [AOther+8]
movlhps xmm0, xmm1
movlhps xmm2, xmm3
subps xmm0, xmm2 // A - B

// (A - B).LengthSquared: horizontal sum of squares (no square root)
mulps xmm0, xmm0
pshufd xmm1, xmm0, $0E
addps xmm0, xmm1
pshufd xmm1, xmm0, $01
addss xmm0, xmm1
end;
3777
{ Divides each component of A by scalar B, via multiplication with the
  reciprocal (one divide plus three multiplies instead of three divides). }
class operator TVector3.Divide(const A: TVector3; const B: Single): TVector3;
var
Scale: Single;
begin
Scale := 1 / B;
Result := A * Scale;
end;
3787{class operator TVector3.Divide(const A: TVector3; const B: Single): TVector3; assembler;
3788asm
3789movq xmm0, [A]
3790movss xmm1, DWORD [A+8]
3791shufps xmm2, xmm2, 0
3792divps xmm0, xmm2
3793divss xmm1, xmm2
3794movq [Result], xmm0
3795movss DWORD [Result+8], xmm1
3796end;}
3797
{ Divides scalar A (in XMM1) by each component of B. }
class operator TVector3.Divide(const A: Single; const B: TVector3): TVector3; assembler;
asm
movq xmm0, [B]
movss xmm2, DWORD [B+8]
movss xmm3, xmm1 // Keep scalar copy of A for the Z component
shufps xmm1, xmm1, 0 // Replicate A
divps xmm1, xmm0
divss xmm3, xmm2
movq [Result], xmm1
movss DWORD [Result+8], xmm3
end;
3809
{ Component-wise vector division A / B; Z handled with scalar instructions. }
class operator TVector3.Divide(const A, B: TVector3): TVector3; assembler;
asm
movq xmm0, [A]
movss xmm1, DWORD [A+8]
movq xmm2, [B]
movss xmm3, DWORD [B+8]
divps xmm0, xmm2
divss xmm1, xmm3
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
3821
{ Cross product of Self and AOther: a vector perpendicular to both.
  Computed into locals first, so the result is correct even if the
  destination aliases an operand. }
function TVector3.Cross(const AOther: TVector3): TVector3;
var
CX, CY, CZ: Single;
begin
CX := (Y * AOther.Z) - (AOther.Y * Z);
CY := (Z * AOther.X) - (AOther.Z * X);
CZ := (X * AOther.Y) - (AOther.X * Y);
Result.Init(CX, CY, CZ);
end;
3828
{ Dot (inner) product of Self and AOther. }
function TVector3.Dot(const AOther: TVector3): Single;
var
Sum: Single;
begin
Sum := X * AOther.X;
Sum := Sum + (Y * AOther.Y);
Sum := Sum + (Z * AOther.Z);
Result := Sum;
end;
3833
{ Returns Self when NRef points against I (Dot(NRef, I) < 0), otherwise
  the negation of Self.
  NOTE(review): the 'assembler' directive is a leftover - the body is plain
  Pascal; kept to avoid changing the declaration. }
function TVector3.FaceForward(const I, NRef: TVector3): TVector3; assembler;
var
D: Single;
begin
D := NRef.Dot(I);
if (D < 0) then
Result := Self
else
Result := -Self;
end;
3841
{ Euclidean length (magnitude) of the vector; result in XMM0. }
function TVector3.GetLength: Single; assembler;
asm
movq xmm0, [Self] // 0 0 Y X
movss xmm1, DWORD [Self+8] // 0 0 0 Z
movlhps xmm0, xmm1 // 0 Z Y X
mulps xmm0, xmm0 // 0 Z*Z Y*Y X*X
pshufd xmm1, xmm0, $0E // Y*Y X*X 0 Z*Z
addps xmm0, xmm1 // # # (Y*Y) (X*X+Z*Z)
pshufd xmm1, xmm0, $01 // (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y)
addss xmm0, xmm1 // (X*X + Y*Y + Z*Z)
sqrtss xmm0, xmm0 // Sqrt(X*X + Y*Y + Z*Z)
end;
3854
{ Squared length (avoids the square root). }
function TVector3.GetLengthSquared: Single;
begin
Result := Sqr(X) + Sqr(Y) + Sqr(Z);
end;
3859{function TVector3.GetLengthSquared: Single; assembler;
3860asm
3861movq xmm0, [Self] // 0 0 Y X
3862movss xmm1, DWORD [Self+8] // 0 0 0 Z
3863movlhps xmm0, xmm1 // 0 Z Y Z
3864mulps xmm0, xmm0 // 0 Z*Z Y*Y X*X
3865pshufd xmm1, xmm0, $0E // Y*Y X*X 0 Z*Z
3866addps xmm0, xmm1 // # # (Y*Y) (X*X+Z*Z)
3867pshufd xmm1, xmm0, $01 // (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y)
3868addss xmm0, xmm1 // (X*X + Y*Y + Z*Z)
3869end;}
3870
{ Multiplies each component of A by scalar B (in XMM2). }
class operator TVector3.Multiply(const A: TVector3; const B: Single): TVector3; assembler;
asm
movq xmm0, [A]
movss xmm1, DWORD [A+8]
shufps xmm2, xmm2, 0 // Replicate B
mulps xmm0, xmm2
mulss xmm1, xmm2
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
3881
{ Multiplies scalar A (in XMM1) by each component of B. }
class operator TVector3.Multiply(const A: Single; const B: TVector3): TVector3; assembler;
asm
movq xmm0, [B]
movss xmm2, DWORD [B+8]
shufps xmm1, xmm1, 0 // Replicate A
mulps xmm0, xmm1
mulss xmm2, xmm1
movq [Result], xmm0
movss DWORD [Result+8], xmm2
end;
3892
{ Component-wise (Hadamard) vector multiplication. }
class operator TVector3.Multiply(const A, B: TVector3): TVector3; assembler;
asm
movq xmm0, [A]
movss xmm1, DWORD [A+8]
movq xmm2, [B]
movss xmm3, DWORD [B+8]
mulps xmm0, xmm2
mulss xmm1, xmm3
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
3904
{ Negates each component of the vector. }
class operator TVector3.Negative(const A: TVector3): TVector3;
begin
Result.Init(-A.X, -A.Y, -A.Z);
end;
3911{class operator TVector3.Negative(const A: TVector3): TVector3; assembler;
3912asm
3913movups xmm0, [SSE_MASK_SIGN] // Load mask with 4 sign (upper) bits
3914movq xmm1, [A]
3915movss xmm2, DWORD [A+8]
3916xorps xmm1, xmm0 // Flip sign bit
3917xorps xmm2, xmm0
3918movq [Result], xmm1
3919movss DWORD [Result+8], xmm2
3920end;}
3921
{ Returns the vector scaled to (approximately) unit length. Uses RSQRTPS,
  an approximate reciprocal square root, trading precision for speed. }
function TVector3.NormalizeFast: TVector3; assembler;
asm
movq xmm0, [Self] // 0 0 Y X
movss xmm1, DWORD [Self+8] // 0 0 0 Z
movlhps xmm0, xmm1 // 0 Z Y X
movaps xmm2, xmm0

// Dot(A, A)
mulps xmm0, xmm0 // 0 Z*Z Y*Y X*X
pshufd xmm1, xmm0, $4E // Y*Y X*X 0 Z*Z
addps xmm0, xmm1 // (Y*Y) (X*X+Z*Z) (Y*Y) (X*X+Z*Z)
pshufd xmm1, xmm0, $11 // (X*X+Z*Z) (Y*Y) (X*X+Z*Z) (Y*Y)
addps xmm0, xmm1 // (X*X + Y*Y + Z*Z) (4x)

rsqrtps xmm0, xmm0 // (1 / Sqrt(X*X + Y*Y + Z*Z)) (4x), approximate
mulps xmm0, xmm2 // A * (1 / Sqrt(Dot(A, A)))
movhlps xmm1, xmm0
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
3942
{ Reflects Self (I) around (unit) normal N: I - 2*Dot(N, I)*N. }
function TVector3.Reflect(const N: TVector3): TVector3; assembler;
asm
movq xmm0, [Self]
movss xmm2, DWORD [Self+8]
movq xmm1, [N]
movss xmm3, DWORD [N+8]
movlhps xmm0, xmm2
movlhps xmm1, xmm3
movaps xmm2, xmm0 // Keep a copy of I
movups xmm3, [SSE_TWO]

// Dot(N, I), broadcast to all 4 lanes
mulps xmm0, xmm1
mulps xmm3, xmm1 // N * 2
pshufd xmm1, xmm0, $4E
addps xmm0, xmm1
pshufd xmm1, xmm0, $11
addps xmm0, xmm1

// (2 * Dot(N, I)) * N
mulps xmm0, xmm3

// I - ((2 * Dot(N, I)) * N)
subps xmm2, xmm0
movhlps xmm3, xmm2
movq [Result], xmm2
movss DWORD [Result+8], xmm3
end;
3971
{ Refracts vector I (Self) against (unit) normal N with refraction index
  Eta: (Eta * I) - ((Eta * D + Sqrt(K)) * N), with D = Dot(N, I) and
  K = 1 - Eta*Eta * (1 - D*D). Returns the zero vector when K < 0
  (total internal reflection).
  Fixes over the previous version:
  * The K < 0 path stored XMM6 - which still held D - into Result.Z,
    returning (0, 0, D) instead of the zero vector.
  * N was reloaded with MOVUPS, reading 16 bytes from a 12-byte TVector3,
    which can fault when N sits at the end of a memory page. }
function TVector3.Refract(const N: TVector3; const Eta: Single): TVector3; assembler;
asm
// Preserve non-volatile XMM6/XMM7 (RSP-24/-40 are 16-byte aligned here).
// NOTE(review): stores below RSP without adjusting it; Win64 has no
// guaranteed red zone - TODO confirm this is safe in this codebase.
movdqa [rsp-24], xmm6
movdqa [rsp-40], xmm7

// Load Self (I) and N as 3 singles each
movq xmm0, [Self]
movss xmm2, DWORD [Self+8]
movq xmm1, [N]
movss xmm4, DWORD [N+8]
movlhps xmm0, xmm2
movlhps xmm1, xmm4
movups xmm7, xmm0 // Keep a copy of I
movss xmm2, DWORD [SSE_ONE]

// D := Dot(N, I)
mulps xmm0, xmm1
movss xmm4, xmm2 // 1
pshufd xmm1, xmm0, $4E
movss xmm5, xmm3 // Eta
addps xmm0, xmm1
mulss xmm5, xmm5 // Eta * Eta
pshufd xmm1, xmm0, $11
addss xmm0, xmm1

// K := 1 - Eta * Eta * (1 - D * D)
movss xmm6, xmm0 // D
mulss xmm0, xmm0 // D * D
subss xmm4, xmm0 // 1 - D * D
mulss xmm4, xmm5 // Eta * Eta * (1 - D * D)
xorps xmm5, xmm5 // 0
subss xmm2, xmm4 // K := 1 - Eta * Eta * (1 - D * D)

// if (K < 0) then
comiss xmm2, xmm5

jb @KLessThanZero

// K >= 0
mulss xmm6, xmm3 // Eta * D
shufps xmm3, xmm3, 0 // Replicate Eta (4x)
mulps xmm7, xmm3 // Eta * I
sqrtss xmm2, xmm2 // Sqrt(K)
addss xmm6, xmm2 // Eta * D + Sqrt(K)
shufps xmm6, xmm6, 0 // Replicate Eta * D + Sqrt(K) (4x)
// Reload N as 3 singles (in-bounds; XMM0 is free to clobber here)
movq xmm1, [N]
movss xmm0, DWORD [N+8]
movlhps xmm1, xmm0
mulps xmm6, xmm1 // ((Eta * D + Sqrt(K)) * N)
subps xmm7, xmm6 // (Eta * I) - ((Eta * D + Sqrt(K)) * N)
movhlps xmm0, xmm7
movq [Result], xmm7
movss DWORD [Result+8], xmm0
jmp @Finish

@KLessThanZero:
// K < 0: Result := Vector3(0, 0, 0); XMM5 is all zeros here
movq [Result], xmm5
movss DWORD [Result+8], xmm5

@Finish:
movdqa xmm6, [rsp-24]
movdqa xmm7, [rsp-40]
end;
4034
{ Scales the vector in place to (approximately) unit length. Uses RSQRTPS,
  an approximate reciprocal square root, trading precision for speed. }
procedure TVector3.SetNormalizedFast; assembler;
asm
movq xmm0, [Self] // 0 0 Y X
movss xmm1, DWORD [Self+8] // 0 0 0 Z
movlhps xmm0, xmm1 // 0 Z Y X
movaps xmm2, xmm0

// Dot(A, A)
mulps xmm0, xmm0 // 0 Z*Z Y*Y X*X
pshufd xmm1, xmm0, $4E // Y*Y X*X 0 Z*Z
addps xmm0, xmm1 // (Y*Y) (X*X+Z*Z) (Y*Y) (X*X+Z*Z)
pshufd xmm1, xmm0, $11 // (X*X+Z*Z) (Y*Y) (X*X+Z*Z) (Y*Y)
addps xmm0, xmm1 // (X*X + Y*Y + Z*Z) (4x)

rsqrtps xmm0, xmm0 // (1 / Sqrt(X*X + Y*Y + Z*Z)) (4x), approximate
mulps xmm0, xmm2 // A * (1 / Sqrt(Dot(A, A)))
movhlps xmm1, xmm0
movq [Self], xmm0
movss DWORD [Self+8], xmm1
end;
4055
{ Subtracts scalar B (in XMM2) from each component of A. }
class operator TVector3.Subtract(const A: TVector3; const B: Single): TVector3; assembler;
asm
movq xmm0, [A]
movss xmm1, DWORD [A+8]
shufps xmm2, xmm2, 0 // Replicate B
subps xmm0, xmm2
subss xmm1, xmm2
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
4066
{ Subtracts each component of B from scalar A (in XMM1). }
class operator TVector3.Subtract(const A: Single; const B: TVector3): TVector3; assembler;
asm
movq xmm4, [B]
movss xmm2, DWORD [B+8]
movss xmm3, xmm1 // Keep scalar copy of A for the Z component
shufps xmm1, xmm1, 0 // Replicate A
subps xmm1, xmm4
subss xmm3, xmm2
movq [Result], xmm1
movss DWORD [Result+8], xmm3
end;
4078
{ Component-wise vector subtraction; Z handled with scalar instructions. }
class operator TVector3.Subtract(const A, B: TVector3): TVector3; assembler;
asm
movq xmm0, [A]
movss xmm1, DWORD [A+8]
movq xmm2, [B]
movss xmm3, DWORD [B+8]
subps xmm0, xmm2
subss xmm1, xmm3
movq [Result], xmm0
movss DWORD [Result+8], xmm1
end;
4090
4091{ TVector4 }
4092
{ Adds scalar B (in XMM2) to each component of A. }
class operator TVector4.Add(const A: TVector4; const B: Single): TVector4; assembler;
asm
movups xmm0, [A] // Load 4 floating-point values
shufps xmm2, xmm2, 0 // Replicate B
addps xmm0, xmm2 // A + B
movups [Result], xmm0 // Store result
end;
4100
{ Adds scalar A (in XMM1) to each component of B. }
class operator TVector4.Add(const A: Single; const B: TVector4): TVector4; assembler;
asm
movups xmm0, [B]
shufps xmm1, xmm1, 0 // Replicate A
addps xmm0, xmm1
movups [Result], xmm0
end;
4108
{ Component-wise vector addition. }
class operator TVector4.Add(const A, B: TVector4): TVector4; assembler;
asm
movups xmm0, [A]
movups xmm1, [B]
addps xmm0, xmm1
movups [Result], xmm0
end;
4116
{ Euclidean distance between Self and AOther (all 4 components);
  result in XMM0. }
function TVector4.Distance(const AOther: TVector4): Single; assembler;
asm
movups xmm0, [Self]
movups xmm1, [AOther]
subps xmm0, xmm1 // A - B

// (A - B).Length: horizontal sum of squares, then square root
mulps xmm0, xmm0
pshufd xmm1, xmm0, $0E
addps xmm0, xmm1
pshufd xmm1, xmm0, $01
addss xmm0, xmm1
sqrtss xmm0, xmm0
end;
4131
{ Squared Euclidean distance between Self and AOther; result in XMM0. }
function TVector4.DistanceSquared(const AOther: TVector4): Single; assembler;
asm
movups xmm0, [Self]
movups xmm1, [AOther]
subps xmm0, xmm1 // A - B

// (A - B).LengthSquared: horizontal sum of squares (no square root)
mulps xmm0, xmm0
pshufd xmm1, xmm0, $0E
addps xmm0, xmm1
pshufd xmm1, xmm0, $01
addss xmm0, xmm1
end;
4145
{ Divides each component of A by scalar B (in XMM2). }
class operator TVector4.Divide(const A: TVector4; const B: Single): TVector4; assembler;
asm
movups xmm0, [A]
shufps xmm2, xmm2, 0 // Replicate B
divps xmm0, xmm2
movups [Result], xmm0
end;
4153
{ Divides scalar A (in XMM1) by each component of B. }
class operator TVector4.Divide(const A: Single; const B: TVector4): TVector4; assembler;
asm
movups xmm0, [B]
shufps xmm1, xmm1, 0 // Replicate A
divps xmm1, xmm0
movups [Result], xmm1
end;
4161
{ Component-wise vector division A / B. }
class operator TVector4.Divide(const A, B: TVector4): TVector4; assembler;
asm
movups xmm0, [A]
movups xmm1, [B]
divps xmm0, xmm1
movups [Result], xmm0
end;
4169
{ Dot (inner) product of Self and AOther over all 4 components. }
function TVector4.Dot(const AOther: TVector4): Single;
var
Sum: Single;
begin
Sum := X * AOther.X;
Sum := Sum + (Y * AOther.Y);
Sum := Sum + (Z * AOther.Z);
Sum := Sum + (W * AOther.W);
Result := Sum;
end;
4174
{ Returns Self when Dot(NRef, I) < 0, otherwise -Self. Implemented
  branch-free by conditionally XOR'ing the sign bits. }
function TVector4.FaceForward(const I, NRef: TVector4): TVector4; assembler;
asm
movups xmm0, [Self]
movups xmm1, [I]
movups xmm2, [NRef]
xorps xmm3, xmm3 // 0
movups xmm4, [SSE_MASK_SIGN]

// Dot(NRef, I), broadcast to all 4 lanes
mulps xmm2, xmm1
pshufd xmm1, xmm2, $4E
addps xmm2, xmm1
pshufd xmm1, xmm2, $11
addps xmm2, xmm1

// Dot(NRef, I) >= 0? Yes: $FFFFFFFF, No: $00000000
cmpnltps xmm2, xmm3
andps xmm2, xmm4 // Yes: $80000000, No: $00000000

// Flip sign of Self if (Dot(NRef, I) >= 0)
xorps xmm0, xmm2
movups [Result], xmm0
end;
4198
{ Euclidean length (magnitude) over all 4 components; result in XMM0. }
function TVector4.GetLength: Single; assembler;
asm
movups xmm0, [Self] // W Z Y X
mulps xmm0, xmm0 // W*W Z*Z Y*Y X*X
pshufd xmm1, xmm0, $0E // Y*Y X*X W*W Z*Z
addps xmm0, xmm1 // # # (Y*Y+W*W) (X*X+Z*Z)
pshufd xmm1, xmm0, $01 // (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y+W*W)
addss xmm0, xmm1 // (X*X + Y*Y + Z*Z + W*W)
sqrtss xmm0, xmm0 // Sqrt(X*X + Y*Y + Z*Z + W*W)
end;
4209
{ Squared length over all 4 components (avoids the square root). }
function TVector4.GetLengthSquared: Single;
begin
Result := Sqr(X) + Sqr(Y) + Sqr(Z) + Sqr(W);
end;
4214{function TVector4.GetLengthSquared: Single; assembler;
4215asm
4216movups xmm0, [Self] // W Z Y X
4217mulps xmm0, xmm0 // W*W Z*Z Y*Y X*X
4218pshufd xmm1, xmm0, $0E // Y*Y X*X W*W Z*Z
4219addps xmm0, xmm1 // # # (Y*Y+W*W) (X*X+Z*Z)
4220pshufd xmm1, xmm0, $01 // (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y+W*W)
4221addss xmm0, xmm1 // (X*X + Y*Y + Z*Z + W*W)
4222end;}
4223
{ Multiplies each component of A by scalar B (in XMM2). }
class operator TVector4.Multiply(const A: TVector4; const B: Single): TVector4; assembler;
asm
movups xmm0, [A]
shufps xmm2, xmm2, 0 // Replicate B
mulps xmm0, xmm2
movups [Result], xmm0
end;
4231
{ Multiplies scalar A (in XMM1) by each component of B. }
class operator TVector4.Multiply(const A: Single; const B: TVector4): TVector4; assembler;
asm
movups xmm0, [B]
shufps xmm1, xmm1, 0 // Replicate A
mulps xmm1, xmm0
movups [Result], xmm1
end;
4239
{ Component-wise (Hadamard) vector multiplication. }
class operator TVector4.Multiply(const A, B: TVector4): TVector4; assembler;
asm
movups xmm0, [A]
movups xmm1, [B]
mulps xmm0, xmm1
movups [Result], xmm0
end;
4247
{ Negates each component by XOR'ing the sign bits. }
class operator TVector4.Negative(const A: TVector4): TVector4; assembler;
asm
movups xmm0, [SSE_MASK_SIGN] // Load mask with 4 sign (upper) bits
movups xmm1, [A]
xorps xmm0, xmm1 // Flip sign bit
movups [Result], xmm0
end;
4255
{ Returns the vector scaled to (approximately) unit length. Uses RSQRTPS,
  an approximate reciprocal square root, trading precision for speed. }
function TVector4.NormalizeFast: TVector4;
asm
movups xmm0, [Self] // W Z Y X
movaps xmm2, xmm0

// Dot(A, A)
mulps xmm0, xmm0 // W*W Z*Z Y*Y X*X
pshufd xmm1, xmm0, $4E // Y*Y X*X W*W Z*Z
addps xmm0, xmm1 // (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z)
pshufd xmm1, xmm0, $11 // (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W)
addps xmm0, xmm1 // (X*X + Y*Y + Z*Z + W*W) (4x)

rsqrtps xmm0, xmm0 // (1 / Sqrt(X*X + Y*Y + Z*Z + W*W)) (4x), approximate
mulps xmm0, xmm2 // A * (1 / Sqrt(Dot(A, A)))
movups [Result], xmm0
end;
4272
{ Reflects Self (I) around (unit) normal N: I - 2*Dot(N, I)*N. }
function TVector4.Reflect(const N: TVector4): TVector4; assembler;
asm
movups xmm0, [Self]
movups xmm1, [N]
movaps xmm2, xmm0 // Keep a copy of I
movups xmm3, [SSE_TWO]

// Dot(N, I), broadcast to all 4 lanes
mulps xmm0, xmm1
mulps xmm3, xmm1 // N * 2
pshufd xmm1, xmm0, $4E
addps xmm0, xmm1
pshufd xmm1, xmm0, $11
addps xmm0, xmm1

// (2 * Dot(N, I)) * N
mulps xmm0, xmm3

// I - ((2 * Dot(N, I)) * N)
subps xmm2, xmm0
movups [Result], xmm2
end;
4295
{ Refracts vector I (Self) against (unit) normal N with refraction index
  Eta (in XMM3): (Eta * I) - ((Eta * D + Sqrt(K)) * N), with D = Dot(N, I)
  and K = 1 - Eta*Eta * (1 - D*D). Returns the zero vector when K < 0
  (total internal reflection). }
function TVector4.Refract(const N: TVector4; const Eta: Single): TVector4; assembler;
asm
// Preserve non-volatile XMM6/XMM7 (RSP-24/-40 are 16-byte aligned here).
// NOTE(review): stores below RSP without adjusting it; Win64 has no
// guaranteed red zone - TODO confirm this is safe in this codebase.
movdqa [rsp-24], xmm6
movdqa [rsp-40], xmm7

movups xmm0, [Self]
movups xmm1, [N]
movups xmm7, xmm0 // Keep a copy of I
movss xmm2, DWORD [SSE_ONE]

// D := Dot(N, I)
mulps xmm0, xmm1
movss xmm4, xmm2 // 1
pshufd xmm1, xmm0, $4E
movss xmm5, xmm3 // Eta
addps xmm0, xmm1
mulss xmm5, xmm5 // Eta * Eta
pshufd xmm1, xmm0, $11
addss xmm0, xmm1

// K := 1 - Eta * Eta * (1 - D * D)
movss xmm6, xmm0 // D
mulss xmm0, xmm0 // D * D
subss xmm4, xmm0 // 1 - D * D
mulss xmm4, xmm5 // Eta * Eta * (1 - D * D)
xorps xmm5, xmm5 // 0
subss xmm2, xmm4 // K := 1 - Eta * Eta * (1 - D * D)

// if (K < 0) then
comiss xmm2, xmm5

jb @KLessThanZero

// K >= 0
mulss xmm6, xmm3 // Eta * D
shufps xmm3, xmm3, 0 // Replicate Eta (4x)
mulps xmm7, xmm3 // Eta * I
sqrtss xmm2, xmm2 // Sqrt(K)
addss xmm6, xmm2 // Eta * D + Sqrt(K)
shufps xmm6, xmm6, 0 // Replicate Eta * D + Sqrt(K) (4x)
movups xmm1, [N]
mulps xmm6, xmm1 // ((Eta * D + Sqrt(K)) * N)
subps xmm7, xmm6 // (Eta * I) - ((Eta * D + Sqrt(K)) * N)
movups [Result], xmm7
jmp @Finish

@KLessThanZero:
// K < 0: Result := Vector4(0, 0, 0, 0)
movups [Result], xmm5

@Finish:
movdqa xmm6, [rsp-24]
movdqa xmm7, [rsp-40]
end;
4350
{ Scales the vector in place to (approximately) unit length. Uses RSQRTPS,
  an approximate reciprocal square root, trading precision for speed. }
procedure TVector4.SetNormalizedFast; assembler;
asm
movups xmm0, [Self] // W Z Y X
movaps xmm2, xmm0

// Dot(A, A)
mulps xmm0, xmm0 // W*W Z*Z Y*Y X*X
pshufd xmm1, xmm0, $4E // Y*Y X*X W*W Z*Z
addps xmm0, xmm1 // (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z)
pshufd xmm1, xmm0, $11 // (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W)
addps xmm0, xmm1 // (X*X + Y*Y + Z*Z + W*W) (4x)

rsqrtps xmm0, xmm0 // (1 / Sqrt(X*X + Y*Y + Z*Z + W*W)) (4x), approximate
mulps xmm0, xmm2 // A * (1 / Sqrt(Dot(A, A)))
movups [Self], xmm0
end;
4367
{ Subtracts scalar B (in XMM2) from each component of A. }
class operator TVector4.Subtract(const A: TVector4; const B: Single): TVector4; assembler;
asm
movups xmm0, [A]
shufps xmm2, xmm2, 0 // Replicate B
subps xmm0, xmm2
movups [Result], xmm0
end;
4375
{ Subtracts each component of B from scalar A (in XMM1). }
class operator TVector4.Subtract(const A: Single; const B: TVector4): TVector4; assembler;
asm
movups xmm0, [B]
shufps xmm1, xmm1, 0 // Replicate A
subps xmm1, xmm0
movups [Result], xmm1
end;
4383
{ Component-wise vector subtraction. }
class operator TVector4.Subtract(const A, B: TVector4): TVector4; assembler;
asm
movups xmm0, [A]
movups xmm1, [B]
subps xmm0, xmm1
movups [Result], xmm0
end;
4391
4392{ TQuaternion }
4393
{ Component-wise quaternion addition. }
class operator TQuaternion.Add(const A, B: TQuaternion): TQuaternion;
asm
movups xmm0, [A]
movups xmm1, [B]
addps xmm0, xmm1
movups [Result], xmm0
end;
4401
{ Magnitude (norm) of the quaternion; result in XMM0. }
function TQuaternion.GetLength: Single;
asm
movups xmm0, [Self] // W Z Y X
mulps xmm0, xmm0 // W*W Z*Z Y*Y X*X
pshufd xmm1, xmm0, $0E // Y*Y X*X W*W Z*Z
addps xmm0, xmm1 // # # (Y*Y+W*W) (X*X+Z*Z)
pshufd xmm1, xmm0, $01 // (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y+W*W)
addss xmm0, xmm1 // (X*X + Y*Y + Z*Z + W*W)
sqrtss xmm0, xmm0 // Sqrt(X*X + Y*Y + Z*Z + W*W)
end;
4412
{ Returns the squared length (X² + Y² + Z² + W²) — same as GetLength but
  without the final square root, so it is cheaper for comparisons. }
function TQuaternion.GetLengthSquared: Single;
asm
movups xmm0, [Self] // W Z Y X
mulps xmm0, xmm0 // W*W Z*Z Y*Y X*X
pshufd xmm1, xmm0, $0E // Y*Y X*X W*W Z*Z
addps xmm0, xmm1 // # # (Y*Y+W*W) (X*X+Z*Z)
pshufd xmm1, xmm0, $01 // (X*X+Z*Z) (X*X+Z*Z) (X*X+Z*Z) (Y*Y+W*W)
addss xmm0, xmm1 // (X*X + Y*Y + Z*Z + W*W)
end;
4422
{ Scales all four quaternion components by the scalar B (arrives in XMM2,
  after the hidden Result pointer slot). }
class operator TQuaternion.Multiply(const A: TQuaternion; const B: Single): TQuaternion;
asm
movups xmm0, [A] // Load quaternion
shufps xmm2, xmm2, 0 // Replicate B into all 4 lanes
mulps xmm0, xmm2 // Scale
movups [Result], xmm0
end;
4430
{ Scales all four quaternion components by the scalar A (arrives in XMM1). }
class operator TQuaternion.Multiply(const A: Single; const B: TQuaternion): TQuaternion;
asm
movups xmm0, [B] // Load quaternion
shufps xmm1, xmm1, 0 // Replicate A into all 4 lanes
mulps xmm1, xmm0 // Scale
movups [Result], xmm1
end;
4438
{ Hamilton product of two quaternions (A * B). Note that quaternion
  multiplication is not commutative: A * B <> B * A in general.
  Intermediates are computed into locals first so the result is correct
  even if Result aliases A or B. Evaluation order of each expression is
  identical to the canonical form, so floating-point results are unchanged. }
class operator TQuaternion.Multiply(const A, B: TQuaternion): TQuaternion;
var
  OutX, OutY, OutZ, OutW: Single;
begin
  OutX := (A.W * B.X) + (A.X * B.W) + (A.Y * B.Z) - (A.Z * B.Y);
  OutY := (A.W * B.Y) + (A.Y * B.W) + (A.Z * B.X) - (A.X * B.Z);
  OutZ := (A.W * B.Z) + (A.Z * B.W) + (A.X * B.Y) - (A.Y * B.X);
  OutW := (A.W * B.W) - (A.X * B.X) - (A.Y * B.Y) - (A.Z * B.Z);
  Result.X := OutX;
  Result.Y := OutY;
  Result.Z := OutZ;
  Result.W := OutW;
end;
4446
{ Returns a normalized copy of the quaternion using RSQRTPS, the fast
  *approximate* reciprocal square root (about 12 bits of precision). }
function TQuaternion.NormalizeFast: TQuaternion;
asm
movups xmm0, [Self] // W Z Y X
movaps xmm2, xmm0 // Keep a copy of the original quaternion

// Dot(A, A)
mulps xmm0, xmm0 // W*W Z*Z Y*Y X*X
pshufd xmm1, xmm0, $4E // Y*Y X*X W*W Z*Z
addps xmm0, xmm1 // (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z)
pshufd xmm1, xmm0, $11 // (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W)
addps xmm0, xmm1 // (X*X + Y*Y + Z*Z + W*W) (4x)

rsqrtps xmm0, xmm0 // (1 / Sqrt(X*X + Y*Y + Z*Z + W*W)) (4x)
mulps xmm0, xmm2 // A * (1 / Sqrt(Dot(A, A)))
movups [Result], xmm0
end;
4463
{ In-place variant of NormalizeFast: normalizes this quaternion using the
  approximate RSQRTPS instruction (about 12 bits of precision). }
procedure TQuaternion.SetNormalizedFast;
asm
movups xmm0, [Self] // W Z Y X
movaps xmm2, xmm0 // Keep a copy of the original quaternion

// Dot(A, A)
mulps xmm0, xmm0 // W*W Z*Z Y*Y X*X
pshufd xmm1, xmm0, $4E // Y*Y X*X W*W Z*Z
addps xmm0, xmm1 // (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z)
pshufd xmm1, xmm0, $11 // (X*X+Z*Z) (Y*Y+W*W) (X*X+Z*Z) (Y*Y+W*W)
addps xmm0, xmm1 // (X*X + Y*Y + Z*Z + W*W) (4x)

rsqrtps xmm0, xmm0 // (1 / Sqrt(X*X + Y*Y + Z*Z + W*W)) (4x)
mulps xmm0, xmm2 // A * (1 / Sqrt(Dot(A, A)))
movups [Self], xmm0
end;
4480
4481{ TMatrix 2 }
4482
{ Adds scalar B to every element of the 2x2 matrix A. The whole matrix
  (4 singles) fits in one XMM register. B arrives in XMM2. }
class operator TMatrix2.Add(const A: TMatrix2; const B: Single): TMatrix2; assembler;
asm
movups xmm1, [A] // Load matrix
shufps xmm2, xmm2, 0 // Replicate B
addps xmm1, xmm2 // Add B
movups [Result], xmm1
end;
4490
{ Adds scalar A to every element of the 2x2 matrix B. A arrives in XMM1. }
class operator TMatrix2.Add(const A: Single; const B: TMatrix2): TMatrix2; assembler;
asm
movups xmm0, [B] // Load matrix
shufps xmm1, xmm1, 0 // Replicate A
addps xmm0, xmm1 // Add A
movups [Result], xmm0
end;
4498
{ Element-wise addition of two 2x2 matrices. }
class operator TMatrix2.Add(const A, B: TMatrix2): TMatrix2; assembler;
asm
movups xmm0, [A] // Load A
movups xmm1, [B] // Load B
addps xmm0, xmm1 // Add
movups [Result], xmm0
end;
4506
{ Component-wise (Hadamard) product of Self and AOther — NOT the algebraic
  matrix product; for that, use the '*' operator. }
function TMatrix2.CompMult(const AOther: TMatrix2): TMatrix2; assembler;
asm
movups xmm0, [Self]
movups xmm1, [AOther]

// Component-wise multiplication
mulps xmm0, xmm1

// Store result
movups [Result], xmm0
end;
4518
{ Divides every element of matrix A by scalar B (arrives in XMM2).
  No zero check: B = 0 yields INF/NaN elements. }
class operator TMatrix2.Divide(const A: TMatrix2; const B: Single): TMatrix2; assembler;
asm
movups xmm1, [A] // Load matrix
shufps xmm2, xmm2, 0 // Replicate B
divps xmm1, xmm2 // Divide B
movups [Result], xmm1
end;
4526
{ Divides scalar A (arrives in XMM1) by every element of matrix B.
  Zero elements in B yield INF/NaN. }
class operator TMatrix2.Divide(const A: Single; const B: TMatrix2): TMatrix2; assembler;
asm
movups xmm0, [B] // Load matrix
shufps xmm1, xmm1, 0 // Replicate A
divps xmm1, xmm0 // Divide B
movups [Result], xmm1
end;
4534
{ Scales every element of matrix A by scalar B (arrives in XMM2). }
class operator TMatrix2.Multiply(const A: TMatrix2; const B: Single): TMatrix2; assembler;
asm
movups xmm1, [A] // Load matrix
shufps xmm2, xmm2, 0 // Replicate B
mulps xmm1, xmm2 // Multiply
movups [Result], xmm1
end;
4542
{ Scales every element of matrix B by scalar A (arrives in XMM1). }
class operator TMatrix2.Multiply(const A: Single; const B: TMatrix2): TMatrix2; assembler;
asm
movups xmm0, [B] // Load matrix
shufps xmm1, xmm1, 0 // Replicate A
mulps xmm0, xmm1 // Multiply
movups [Result], xmm0
end;
4550
{ Row-vector * matrix product (A treated as a 1x2 row vector).
  Locals are used so the result is correct even if Result aliases A.
  Term order matches the canonical form, so FP results are unchanged. }
class operator TMatrix2.Multiply(const A: TVector2; const B: TMatrix2): TVector2;
var
  OutX, OutY: Single;
begin
  OutX := (A.X * B.M[0,0]) + (A.Y * B.M[0,1]);
  OutY := (A.X * B.M[1,0]) + (A.Y * B.M[1,1]);
  Result.X := OutX;
  Result.Y := OutY;
end;
4556
{ Matrix * column-vector product (B treated as a 2x1 column vector).
  Locals are used so the result is correct even if Result aliases B.
  Term order matches the canonical form, so FP results are unchanged. }
class operator TMatrix2.Multiply(const A: TMatrix2; const B: TVector2): TVector2;
var
  OutX, OutY: Single;
begin
  OutX := (A.M[0,0] * B.X) + (A.M[1,0] * B.Y);
  OutY := (A.M[0,1] * B.X) + (A.M[1,1] * B.Y);
  Result.X := OutX;
  Result.Y := OutY;
end;
4562
{ Algebraic 2x2 matrix product A * B. Each output element is computed into
  a local first, so the function is safe even if Result aliases A or B.
  Per-element expressions keep the original evaluation order, so the
  floating-point results are bit-identical to the previous implementation. }
class operator TMatrix2.Multiply(const A, B: TMatrix2): TMatrix2;
var
  R00, R01, R10, R11: Single;
begin
  R00 := (A.M[0,0] * B.M[0,0]) + (A.M[1,0] * B.M[0,1]);
  R01 := (A.M[0,1] * B.M[0,0]) + (A.M[1,1] * B.M[0,1]);
  R10 := (A.M[0,0] * B.M[1,0]) + (A.M[1,0] * B.M[1,1]);
  R11 := (A.M[0,1] * B.M[1,0]) + (A.M[1,1] * B.M[1,1]);
  Result.M[0,0] := R00;
  Result.M[0,1] := R01;
  Result.M[1,0] := R10;
  Result.M[1,1] := R11;
end;
4570
{ Unary negation: flips the sign of every element by XOR'ing the IEEE-754
  sign bits with the SSE_MASK_SIGN constant (see top of file). }
class operator TMatrix2.Negative(const A: TMatrix2): TMatrix2; assembler;
asm
movups xmm0, [SSE_MASK_SIGN] // Load mask with 4 sign (upper) bits
movups xmm1, [A] // Load matrix
xorps xmm1, xmm0 // Flip sign bits
movups [Result], xmm1
end;
4578
{ Transposes this 2x2 matrix in place by swapping the two off-diagonal
  elements; the diagonal is unaffected. Behaviorally identical to
  'Self := Transpose' but without building a temporary matrix. }
procedure TMatrix2.SetTransposed;
var
  Tmp: Single;
begin
  Tmp := M[0,1];
  M[0,1] := M[1,0];
  M[1,0] := Tmp;
end;
4583
{ Subtracts scalar B (arrives in XMM2) from every element of matrix A. }
class operator TMatrix2.Subtract(const A: TMatrix2; const B: Single): TMatrix2; assembler;
asm
movups xmm1, [A] // Load matrix
shufps xmm2, xmm2, 0 // Replicate B
subps xmm1, xmm2 // Subtract B
movups [Result], xmm1
end;
4591
{ Subtracts every element of matrix B from scalar A (arrives in XMM1). }
class operator TMatrix2.Subtract(const A: Single; const B: TMatrix2): TMatrix2; assembler;
asm
movups xmm0, [B] // Load matrix
shufps xmm1, xmm1, 0 // Replicate A
subps xmm1, xmm0 // Subtract B
movups [Result], xmm1
end;
4599
{ Element-wise subtraction of two 2x2 matrices: Result := A - B. }
class operator TMatrix2.Subtract(const A, B: TMatrix2): TMatrix2; assembler;
asm
movups xmm0, [A] // Load A
movups xmm1, [B] // Load B
subps xmm0, xmm1 // Subtract
movups [Result], xmm0
end;
4607
{ Returns the transpose of this 2x2 matrix: rows become columns.
  The four assignments are independent, so their order is immaterial. }
function TMatrix2.Transpose: TMatrix2;
begin
  Result.M[0,0] := M[0,0];
  Result.M[1,1] := M[1,1];
  { Swap the off-diagonal pair }
  Result.M[0,1] := M[1,0];
  Result.M[1,0] := M[0,1];
end;
4616
4617{ TMatrix3 }
4618
{ Adds scalar B to every element of the 3x3 matrix A. A TMatrix3 is 9 singles
  (36 bytes), processed as two 16-byte chunks plus one trailing single.
  B arrives in XMM2 (after the hidden Result pointer slot). }
class operator TMatrix3.Add(const A: TMatrix3; const B: Single): TMatrix3; assembler;
asm
movups xmm1, DQWORD [A + $00] // Load 3 rows
shufps xmm2, xmm2, 0 // Replicate B
movups xmm3, DQWORD [A + $10]
movss xmm4, DWORD [A + $20] // Trailing element M[2,2]
addps xmm1, xmm2 // Add B to each row
addps xmm3, xmm2
addss xmm4, xmm2
movups DQWORD [Result + $00], xmm1
movups DQWORD [Result + $10], xmm3
movss DWORD [Result + $20], xmm4
end;
4632
{ Adds scalar A (arrives in XMM1) to every element of the 3x3 matrix B.
  Processed as two 16-byte chunks plus the trailing single at offset $20. }
class operator TMatrix3.Add(const A: Single; const B: TMatrix3): TMatrix3; assembler;
asm
movups xmm0, DQWORD [B + $00] // Load 3 rows
shufps xmm1, xmm1, 0 // Replicate A
movups xmm2, DQWORD [B + $10]
movss xmm3, DWORD [B + $20] // Trailing element M[2,2]
addps xmm0, xmm1 // Add A to each row
addps xmm2, xmm1
addss xmm3, xmm1
movups DQWORD [Result + $00], xmm0
movups DQWORD [Result + $10], xmm2
movss DWORD [Result + $20], xmm3
end;
4646
{ Element-wise addition of two 3x3 matrices. Each 36-byte matrix is handled
  as two 16-byte chunks plus the trailing single at offset $20. }
class operator TMatrix3.Add(const A, B: TMatrix3): TMatrix3; assembler;
asm
movups xmm0, DQWORD [A + $00] // Load 3 rows of A
movups xmm1, DQWORD [A + $10]
movss xmm2, DWORD [A + $20]
movups xmm4, DQWORD [B + $00] // Load 3 rows of B
movups xmm5, DQWORD [B + $10]
movss xmm3, DWORD [B + $20]
addps xmm0, xmm4 // Add rows
addps xmm1, xmm5
addss xmm2, xmm3
movups DQWORD [Result + $00], xmm0
movups DQWORD [Result + $10], xmm1
movss DWORD [Result + $20], xmm2
end;
4662
{ Component-wise (Hadamard) product of Self and AOther — NOT the algebraic
  matrix product; for that, use the '*' operator. }
function TMatrix3.CompMult(const AOther: TMatrix3): TMatrix3; assembler;
asm
movups xmm0, DQWORD[Self + $00] // Self[0]
movups xmm1, DQWORD[Self + $10] // Self[1]
movss xmm2, DWORD[Self + $20] // Self[2]
movups xmm4, DQWORD[AOther + $00] // AOther[0]
movups xmm5, DQWORD[AOther + $10] // AOther[1]
movss xmm3, DWORD[AOther + $20] // AOther[2]

// Component-wise multiplication
mulps xmm0, xmm4
mulps xmm1, xmm5
mulss xmm2, xmm3

// Store result
movups DQWORD [Result + $00], xmm0
movups DQWORD [Result + $10], xmm1
movss DWORD [Result + $20], xmm2
end;
4682
{ Divides scalar A (arrives in XMM1) by every element of the 3x3 matrix B.
  Zero elements in B yield INF/NaN. }
class operator TMatrix3.Divide(const A: Single; const B: TMatrix3): TMatrix3; assembler;
asm
movups xmm4, DQWORD [B + $00] // Load 3 rows
shufps xmm1, xmm1, 0 // Replicate A
movups xmm5, DQWORD [B + $10]
movaps xmm0, xmm1 // Copies of the replicated A for each chunk
movaps xmm2, xmm1
movss xmm3, DWORD [B + $20]
divps xmm1, xmm4 // Divide A by each row
divps xmm0, xmm5
divss xmm2, xmm3
movups DQWORD [Result + $00], xmm1
movups DQWORD [Result + $10], xmm0
movss DWORD [Result + $20], xmm2
end;
4698
{ Divides every element of the 3x3 matrix A by scalar B (arrives in XMM2).
  No zero check: B = 0 yields INF/NaN elements. }
class operator TMatrix3.Divide(const A: TMatrix3; const B: Single): TMatrix3; assembler;
asm
movups xmm1, DQWORD [A + $00] // Load 3 rows
shufps xmm2, xmm2, 0 // Replicate B
movups xmm0, DQWORD [A + $10]
movss xmm3, DWORD [A + $20]
divps xmm1, xmm2 // Divide each row by B
divps xmm0, xmm2
divss xmm3, xmm2
movups DQWORD [Result + $00], xmm1
movups DQWORD [Result + $10], xmm0
movss DWORD [Result + $20], xmm3
end;
4712
{ Scales every element of the 3x3 matrix B by scalar A (arrives in XMM1). }
class operator TMatrix3.Multiply(const A: Single; const B: TMatrix3): TMatrix3; assembler;
asm
movups xmm0, DQWORD [B + $00] // Load 3 rows
shufps xmm1, xmm1, 0 // Replicate A
movups xmm2, DQWORD [B + $10]
movss xmm3, DWORD [B + $20]
mulps xmm0, xmm1 // Multiply each row by A
mulps xmm2, xmm1
mulss xmm3, xmm1
movups DQWORD [Result + $00], xmm0
movups DQWORD [Result + $10], xmm2
movss DWORD [Result + $20], xmm3
end;
4726
{ Scales every element of the 3x3 matrix A by scalar B (arrives in XMM2). }
class operator TMatrix3.Multiply(const A: TMatrix3; const B: Single): TMatrix3; assembler;
asm
movups xmm1, DQWORD [A + $00] // Load 3 rows
shufps xmm2, xmm2, 0 // Replicate B
movups xmm0, DQWORD [A + $10]
movss xmm3, DWORD [A + $20]
mulps xmm1, xmm2 // Multiply each row by B
mulps xmm0, xmm2
mulss xmm3, xmm2
movups DQWORD [Result + $00], xmm1
movups DQWORD [Result + $10], xmm0
movss DWORD [Result + $20], xmm3
end;
4740
4741{$IFDEF FM_COLUMN_MAJOR}
{ (Column-major build) Matrix * vector product. Each 3-element row is loaded
  as QWORD + trailing single (TMatrix3 rows are 12 bytes apart, unaligned).
  NOTE(review): XMM6 is callee-saved and is spilled to [rsp-24]; Win64 has no
  red zone, so memory below RSP is not formally protected — confirm this is
  safe in all call contexts. }
class operator TMatrix3.Multiply(const A: TMatrix3; const B: TVector3): TVector3; assembler;
asm
movdqa [rsp-24], xmm6 // Save callee-saved XMM6

movq xmm0, [B] // Load vector B (X Y)
movss xmm1, DWORD [B+8] // Z
movlhps xmm0, xmm1

movq xmm4, QWORD [A + $00] // Row 0 of A
movss xmm1, DWORD [A + $08]
movlhps xmm4, xmm1

movaps xmm1, xmm0
movaps xmm2, xmm0
shufps xmm0, xmm0, $00 // Broadcast B.X
shufps xmm1, xmm1, $55 // Broadcast B.Y
shufps xmm2, xmm2, $AA // Broadcast B.Z

movq xmm5, QWORD [A + $0C] // Row 1 of A
movss xmm3, DWORD [A + $14]
movlhps xmm5, xmm3

movq xmm6, QWORD [A + $18] // Row 2 of A
movss xmm3, DWORD [A + $20]
movlhps xmm6, xmm3

mulps xmm0, xmm4 // Row0 * B.X
mulps xmm1, xmm5 // Row1 * B.Y
mulps xmm2, xmm6 // Row2 * B.Z
addps xmm0, xmm1 // Sum the scaled rows
addps xmm0, xmm2
movhlps xmm1, xmm0 // Move Z lane down for the scalar store
movq [Result], xmm0 // Store X, Y
movss DWORD [Result+8], xmm1 // Store Z

movdqa xmm6, [rsp-24] // Restore XMM6
end;
4779
{ (Column-major build) Vector * matrix product. Computes the per-row products
  and then transposes three XMM registers to sum the dot products.
  NOTE(review): XMM6 is spilled to [rsp-24]; Win64 has no red zone, so memory
  below RSP is not formally protected — confirm this is safe. }
class operator TMatrix3.Multiply(const A: TVector3; const B: TMatrix3): TVector3; assembler;
asm
movdqa [rsp-24], xmm6 // Save callee-saved XMM6

movq xmm0, [A] // Load vector A (X Y)
movss xmm1, DWORD [A+8] // Z
movlhps xmm0, xmm1

movq xmm4, QWORD [B + $00] // Row 0 of B
movss xmm1, DWORD [B + $08]
movlhps xmm4, xmm1

movaps xmm1, xmm0
movaps xmm2, xmm0

movq xmm5, QWORD [B + $0C] // Row 1 of B
movss xmm6, DWORD [B + $14]
movlhps xmm5, xmm6

movq xmm6, QWORD [B + $18] // Row 2 of B
movss xmm3, DWORD [B + $20]
movlhps xmm6, xmm3

mulps xmm0, xmm4 // A * Row0
mulps xmm1, xmm5 // A * Row1
mulps xmm2, xmm6 // A * Row2
xorps xmm3, xmm3 // Zero (pad lane for the transpose)

{ Transpose xmm0-xmm2 }
movaps xmm4, xmm2
unpcklps xmm2, xmm3
unpckhps xmm4, xmm3

movaps xmm3, xmm0
unpcklps xmm0, xmm1
unpckhps xmm3, xmm1

movaps xmm1, xmm0
unpcklpd xmm0, xmm2
unpckhpd xmm1, xmm2

unpcklpd xmm3, xmm4

addps xmm0, xmm1 // Sum the transposed partial products
addps xmm0, xmm3
movhlps xmm1, xmm0
movq [Result], xmm0 // Store X, Y
movss DWORD [Result+8], xmm1 // Store Z

movdqa xmm6, [rsp-24] // Restore XMM6
end;
4831
{ (Column-major build) 3x3 matrix product A * B, implemented as three
  vector*matrix passes: the rows of A stay resident in XMM4-XMM6 while each
  row of B is broadcast and accumulated.
  NOTE(review): XMM6 is spilled to [rsp-24]; Win64 has no red zone — confirm
  this below-RSP save is safe in all call contexts. }
class operator TMatrix3.Multiply(const A, B: TMatrix3): TMatrix3; assembler;
{ Code below consists of 3 Vector*Matrix calculations }
asm
movdqa [rsp-24], xmm6 // Save callee-saved XMM6

movq xmm0, QWORD [B + $00] // Row 0 of B
movss xmm1, DWORD [B + $08]
movlhps xmm0, xmm1

movq xmm4, QWORD [A + $00] // Rows of A cached in XMM4-XMM6
movss xmm1, DWORD [A + $08]
movlhps xmm4, xmm1

movaps xmm1, xmm0
movaps xmm2, xmm0
shufps xmm0, xmm0, $00 // Broadcast element 0
shufps xmm1, xmm1, $55 // Broadcast element 1
shufps xmm2, xmm2, $AA // Broadcast element 2

movq xmm5, QWORD [A + $0C]
movss xmm3, DWORD [A + $14]
movlhps xmm5, xmm3

movq xmm6, QWORD [A + $18]
movss xmm3, DWORD [A + $20]
movlhps xmm6, xmm3

mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
addps xmm0, xmm1
addps xmm0, xmm2
movhlps xmm1, xmm0
movq QWORD [Result + $00], xmm0 // Result row 0
movss DWORD [Result + $08], xmm1

movq xmm0, QWORD [B + $0C] // Row 1 of B
movss xmm1, DWORD [B + $14]
movlhps xmm0, xmm1

movaps xmm1, xmm0
movaps xmm2, xmm0
shufps xmm0, xmm0, $00
shufps xmm1, xmm1, $55
shufps xmm2, xmm2, $AA
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
addps xmm0, xmm1
addps xmm0, xmm2
movhlps xmm1, xmm0
movq QWORD [Result + $0C], xmm0 // Result row 1
movss DWORD [Result + $14], xmm1

movq xmm0, QWORD [B + $18] // Row 2 of B
movss xmm1, DWORD [B + $20]
movlhps xmm0, xmm1

movaps xmm1, xmm0
movaps xmm2, xmm0
shufps xmm0, xmm0, $00
shufps xmm1, xmm1, $55
shufps xmm2, xmm2, $AA
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
addps xmm0, xmm1
addps xmm0, xmm2
movhlps xmm1, xmm0
movq QWORD [Result + $18], xmm0 // Result row 2
movss DWORD [Result + $20], xmm1

movdqa xmm6, [rsp-24] // Restore XMM6
end;
4906{$ELSE}
{ (Row-major build) Matrix * vector product: multiplies B by each row of A
  and transposes three XMM registers to finish the dot products.
  NOTE(review): XMM6 is spilled to [rsp-24]; Win64 has no red zone, so memory
  below RSP is not formally protected — confirm this is safe. }
class operator TMatrix3.Multiply(const A: TMatrix3; const B: TVector3): TVector3; assembler;
asm
movdqa [rsp-24], xmm6 // Save callee-saved XMM6

movq xmm0, [B] // Load vector
movss xmm1, DWORD [B+8]
movlhps xmm0, xmm1

movq xmm4, QWORD [A + $00] // Load 3 rows
movss xmm1, DWORD [A + $08]
movlhps xmm4, xmm1

movaps xmm1, xmm0
movaps xmm2, xmm0

movq xmm5, QWORD [A + $0C]
movss xmm6, DWORD [A + $14]
movlhps xmm5, xmm6

movq xmm6, QWORD [A + $18]
movss xmm3, DWORD [A + $20]
movlhps xmm6, xmm3

mulps xmm0, xmm4 // ###, (Az * B02), (Ay * B01), (Ax * B00)
mulps xmm1, xmm5 // ###, (Az * B12), (Ay * B11), (Ax * B10)
mulps xmm2, xmm6 // ###, (Az * B22), (Ay * B21), (Ax * B20)
xorps xmm3, xmm3 // 000

{ Transpose xmm0-xmm2 }
movaps xmm4, xmm2
unpcklps xmm2, xmm3 // 000 B21 000 B20
unpckhps xmm4, xmm3 // 000 ### 000 B22

movaps xmm3, xmm0
unpcklps xmm0, xmm1 // B11 B01 B10 B00
unpckhps xmm3, xmm1 // ### ### B12 B02

movaps xmm1, xmm0
unpcklpd xmm0, xmm2 // 000 B20 B10 B00
unpckhpd xmm1, xmm2 // 000 B21 B11 B01

unpcklpd xmm3, xmm4 // 000 B22 B12 B02

addps xmm0, xmm1 // Add rows
addps xmm0, xmm3
movhlps xmm1, xmm0
movq [Result], xmm0 // Store X, Y
movss DWORD [Result+8], xmm1 // Store Z

movdqa xmm6, [rsp-24] // Restore XMM6
end;
4958
{ (Row-major build) Vector * matrix product: broadcasts each component of A
  and accumulates the scaled rows of B.
  NOTE(review): XMM6 is spilled to [rsp-24]; Win64 has no red zone, so memory
  below RSP is not formally protected — confirm this is safe. }
class operator TMatrix3.Multiply(const A: TVector3; const B: TMatrix3): TVector3; assembler;
asm
movdqa [rsp-24], xmm6 // Save callee-saved XMM6

movq xmm0, [A] // Load vector
movss xmm1, DWORD [A+8]
movlhps xmm0, xmm1

movq xmm4, QWORD [B + $00] // Load 3 rows
movss xmm1, DWORD [B + $08]
movlhps xmm4, xmm1

movaps xmm1, xmm0
movaps xmm2, xmm0
shufps xmm0, xmm0, $00 // Bx Bx Bx Bx
shufps xmm1, xmm1, $55 // By By By By
shufps xmm2, xmm2, $AA // Bz Bz Bz Bz

movq xmm5, QWORD [B + $0C]
movss xmm3, DWORD [B + $14]
movlhps xmm5, xmm3

movq xmm6, QWORD [B + $18]
movss xmm3, DWORD [B + $20]
movlhps xmm6, xmm3

mulps xmm0, xmm4 // (A00 * Bx), (A01 * Bx), (A02 * Bx), #
mulps xmm1, xmm5 // (A10 * By), (A11 * By), (A12 * By), #
mulps xmm2, xmm6 // (A20 * Bz), (A21 * Bz), (A22 * Bz), #
addps xmm0, xmm1 // Add rows
addps xmm0, xmm2
movhlps xmm1, xmm0
movq [Result], xmm0 // Store X, Y
movss DWORD [Result+8], xmm1 // Store Z

movdqa xmm6, [rsp-24] // Restore XMM6
end;
4996
{ (Row-major build) 3x3 matrix product A * B, implemented as three
  vector*matrix passes: the rows of B stay resident in XMM4-XMM6 while each
  row of A is broadcast and accumulated.
  NOTE(review): XMM6 is spilled to [rsp-24]; Win64 has no red zone — confirm
  this below-RSP save is safe in all call contexts. }
class operator TMatrix3.Multiply(const A, B: TMatrix3): TMatrix3; assembler;
{ Code below consists of 3 Vector*Matrix calculations }
asm
movdqa [rsp-24], xmm6 // Save callee-saved XMM6

{ A.R[0] * B }
movq xmm0, QWORD [A + $00]
movss xmm1, DWORD [A + $08]
movlhps xmm0, xmm1

movq xmm4, QWORD [B + $00] // Rows of B cached in XMM4-XMM6
movss xmm1, DWORD [B + $08]
movlhps xmm4, xmm1

movaps xmm1, xmm0
movaps xmm2, xmm0
shufps xmm0, xmm0, $00 // Broadcast A[0,0]
shufps xmm1, xmm1, $55 // Broadcast A[0,1]
shufps xmm2, xmm2, $AA // Broadcast A[0,2]

movq xmm5, QWORD [B + $0C]
movss xmm3, DWORD [B + $14]
movlhps xmm5, xmm3

movq xmm6, QWORD [B + $18]
movss xmm3, DWORD [B + $20]
movlhps xmm6, xmm3

mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
addps xmm0, xmm1
addps xmm0, xmm2
movhlps xmm1, xmm0
movq QWORD [Result + $00], xmm0 // Result row 0
movss DWORD [Result + $08], xmm1

{ A.R[1] * B }
movq xmm0, QWORD [A + $0C]
movss xmm1, DWORD [A + $14]
movlhps xmm0, xmm1

movaps xmm1, xmm0
movaps xmm2, xmm0
shufps xmm0, xmm0, $00
shufps xmm1, xmm1, $55
shufps xmm2, xmm2, $AA
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
addps xmm0, xmm1
addps xmm0, xmm2
movhlps xmm1, xmm0
movq QWORD [Result + $0C], xmm0 // Result row 1
movss DWORD [Result + $14], xmm1

{ A.R[2] * B }
movq xmm0, QWORD [A + $18]
movss xmm1, DWORD [A + $20]
movlhps xmm0, xmm1

movaps xmm1, xmm0
movaps xmm2, xmm0
shufps xmm0, xmm0, $00
shufps xmm1, xmm1, $55
shufps xmm2, xmm2, $AA
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
addps xmm0, xmm1
addps xmm0, xmm2
movhlps xmm1, xmm0
movq QWORD [Result + $18], xmm0 // Result row 2
movss DWORD [Result + $20], xmm1

movdqa xmm6, [rsp-24] // Restore XMM6
end;
5074{$ENDIF}
5075
{ Unary negation: flips the sign of every element of the 3x3 matrix by
  XOR'ing the IEEE-754 sign bits with SSE_MASK_SIGN (see top of file).
  FIX: the trailing element used PXOR (integer-domain XOR) while the other
  rows used XORPS; the result is bit-identical, but XORPS keeps the value in
  the floating-point domain (avoiding a potential bypass delay) and matches
  the sibling operators' style. }
class operator TMatrix3.Negative(const A: TMatrix3): TMatrix3; assembler;
asm
movups xmm0, [SSE_MASK_SIGN] // Load mask with 4 sign (upper) bits
movups xmm1, DQWORD [A + $00] // Load 3 rows
movups xmm2, DQWORD [A + $10]
movss xmm3, DWORD [A + $20] // Trailing element M[2,2]
xorps xmm1, xmm0 // Flip sign bits of each element in each row
xorps xmm2, xmm0
xorps xmm3, xmm0 // Was PXOR: use the float-domain XOR like the rows above
movups DQWORD [Result + $00], xmm1
movups DQWORD [Result + $10], xmm2
movss DWORD [Result + $20], xmm3
end;
5089
{ Transposes this 3x3 matrix in place by swapping the three off-diagonal
  pairs; the diagonal (offsets $00, $10, $20) is untouched.
  Element offsets: M[r,c] lives at (r*3 + c) * 4 bytes. }
procedure TMatrix3.SetTransposed; assembler;
asm
movss xmm0, DWORD [Self + $04] // M[0,1]
movss xmm1, DWORD [Self + $08] // M[0,2]

movss xmm2, DWORD [Self + $0C] // M[1,0]
movss xmm3, DWORD [Self + $14] // M[1,2]

movss xmm4, DWORD [Self + $18] // M[2,0]
movss xmm5, DWORD [Self + $1C] // M[2,1]

movss DWORD [Self + $0C], xmm0 // M[1,0] := old M[0,1]
movss DWORD [Self + $18], xmm1 // M[2,0] := old M[0,2]

movss DWORD [Self + $04], xmm2 // M[0,1] := old M[1,0]
movss DWORD [Self + $1C], xmm3 // M[2,1] := old M[1,2]

movss DWORD [Self + $08], xmm4 // M[0,2] := old M[2,0]
movss DWORD [Self + $14], xmm5 // M[1,2] := old M[2,1]
end;
5110
{ Subtracts scalar B (arrives in XMM2) from every element of the 3x3
  matrix A. Processed as two 16-byte chunks plus the trailing single. }
class operator TMatrix3.Subtract(const A: TMatrix3; const B: Single): TMatrix3; assembler;
asm
movups xmm1, DQWORD [A + $00] // Load 3 rows
shufps xmm2, xmm2, 0 // Replicate B
movups xmm0, DQWORD [A + $10]
movss xmm3, DWORD [A + $20]
subps xmm1, xmm2 // Subtract B from each row
subps xmm0, xmm2
subss xmm3, xmm2
movups DQWORD [Result + $00], xmm1
movups DQWORD [Result + $10], xmm0
movss DWORD [Result + $20], xmm3
end;
5124
{ Subtracts every element of the 3x3 matrix B from scalar A (arrives in
  XMM1): Result[i,j] := A - B[i,j].
  BUGFIX: the original loaded B's trailing element into XMM6, which is a
  callee-saved (nonvolatile) register under the Win64 ABI (see the
  calling-convention notes at the top of this file) and was clobbered
  without being preserved. XMM3 is volatile and unused here, so it is used
  instead — no save/restore needed. }
class operator TMatrix3.Subtract(const A: Single; const B: TMatrix3): TMatrix3; assembler;
asm
movups xmm4, DQWORD [B + $00] // Load 3 rows
shufps xmm1, xmm1, 0 // Replicate A
movups xmm5, DQWORD [B + $10]
movaps xmm0, xmm1 // Copies of the replicated A for each chunk
movaps xmm2, xmm1
movss xmm3, DWORD [B + $20] // Trailing element (XMM3 is volatile; was XMM6)
subps xmm1, xmm4 // Subtract each row from A
subps xmm0, xmm5
subss xmm2, xmm3
movups DQWORD [Result + $00], xmm1
movups DQWORD [Result + $10], xmm0
movss DWORD [Result + $20], xmm2
end;
5140
{ Element-wise subtraction of two 3x3 matrices: Result := A - B.
  BUGFIX: the original loaded B's trailing element into XMM6, which is a
  callee-saved (nonvolatile) register under the Win64 ABI (see the
  calling-convention notes at the top of this file) and was clobbered
  without being preserved. XMM3 is volatile and unused here, so it is used
  instead — no save/restore needed. }
class operator TMatrix3.Subtract(const A, B: TMatrix3): TMatrix3; assembler;
asm
movups xmm0, DQWORD [A + $00] // Load 3 rows of A
movups xmm1, DQWORD [A + $10]
movss xmm2, DWORD [A + $20]
movups xmm4, DQWORD [B + $00] // Load 3 rows of B
movups xmm5, DQWORD [B + $10]
movss xmm3, DWORD [B + $20] // Trailing element (XMM3 is volatile; was XMM6)
subps xmm0, xmm4 // Subtract rows
subps xmm1, xmm5
subss xmm2, xmm3
movups DQWORD [Result + $00], xmm0
movups DQWORD [Result + $10], xmm1
movss DWORD [Result + $20], xmm2
end;
5156
{ Returns the transpose of this 3x3 matrix: Result[c,r] := Self[r,c].
  Copies one source row at a time, scattering it into a result column.
  Element offsets: M[r,c] lives at (r*3 + c) * 4 bytes. }
function TMatrix3.Transpose: TMatrix3; assembler;
asm
movss xmm0, DWORD [Self + $00] // Row 0: M[0,0] M[0,1] M[0,2]
movss xmm1, DWORD [Self + $04]
movss xmm2, DWORD [Self + $08]

movss DWORD [Result + $00], xmm0 // -> Column 0 of Result
movss DWORD [Result + $0C], xmm1
movss DWORD [Result + $18], xmm2

movss xmm0, DWORD [Self + $0C] // Row 1: M[1,0] M[1,1] M[1,2]
movss xmm1, DWORD [Self + $10]
movss xmm2, DWORD [Self + $14]

movss DWORD [Result + $04], xmm0 // -> Column 1 of Result
movss DWORD [Result + $10], xmm1
movss DWORD [Result + $1C], xmm2

movss xmm0, DWORD [Self + $18] // Row 2: M[2,0] M[2,1] M[2,2]
movss xmm1, DWORD [Self + $1C]
movss xmm2, DWORD [Self + $20]

movss DWORD [Result + $08], xmm0 // -> Column 2 of Result
movss DWORD [Result + $14], xmm1
movss DWORD [Result + $20], xmm2
end;
5183
5184{ TMatrix 4 }
5185
{ Adds scalar B to every element of the 4x4 matrix A. A TMatrix4 is 16
  singles (64 bytes), processed as four 16-byte rows. B arrives in XMM2. }
class operator TMatrix4.Add(const A: TMatrix4; const B: Single): TMatrix4; assembler;
asm
movups xmm1, DQWORD [A + $00] // Load 4 rows
shufps xmm2, xmm2, 0 // Replicate B
movups xmm0, DQWORD [A + $10]
movups xmm3, DQWORD [A + $20]
movups xmm4, DQWORD [A + $30]
addps xmm1, xmm2 // Add B to each row
addps xmm0, xmm2
addps xmm3, xmm2
addps xmm4, xmm2
movups DQWORD [Result + $00], xmm1
movups DQWORD [Result + $10], xmm0
movups DQWORD [Result + $20], xmm3
movups DQWORD [Result + $30], xmm4
end;
5202
{ Adds scalar A (arrives in XMM1) to every element of the 4x4 matrix B. }
class operator TMatrix4.Add(const A: Single; const B: TMatrix4): TMatrix4; assembler;
asm
movups xmm0, DQWORD [B + $00] // Load 4 rows
shufps xmm1, xmm1, 0 // Replicate A
movups xmm2, DQWORD [B + $10]
movups xmm3, DQWORD [B + $20]
movups xmm4, DQWORD [B + $30]
addps xmm0, xmm1 // Add A to each row
addps xmm2, xmm1
addps xmm3, xmm1
addps xmm4, xmm1
movups DQWORD [Result + $00], xmm0
movups DQWORD [Result + $10], xmm2
movups DQWORD [Result + $20], xmm3
movups DQWORD [Result + $30], xmm4
end;
5219
{ Element-wise addition of two 4x4 matrices. B is loaded two rows at a time
  so the routine only needs the volatile registers XMM0-XMM5. }
class operator TMatrix4.Add(const A, B: TMatrix4): TMatrix4; assembler;
asm
movups xmm0, DQWORD [A + $00] // Load 4 rows of A
movups xmm1, DQWORD [A + $10]
movups xmm2, DQWORD [A + $20]
movups xmm3, DQWORD [A + $30]
movups xmm4, DQWORD [B + $00] // Load 2 rows of B
movups xmm5, DQWORD [B + $10]
addps xmm0, xmm4 // Add rows
addps xmm1, xmm5
movups xmm4, DQWORD [B + $20] // Load 2 rows of B
movups xmm5, DQWORD [B + $30]
addps xmm2, xmm4 // Add rows
addps xmm3, xmm5
movups DQWORD [Result + $00], xmm0
movups DQWORD [Result + $10], xmm1
movups DQWORD [Result + $20], xmm2
movups DQWORD [Result + $30], xmm3
end;
5239
{ Component-wise (Hadamard) product of Self and AOther — NOT the algebraic
  matrix product; for that, use the '*' operator. AOther is loaded two rows
  at a time so only volatile registers XMM0-XMM5 are needed. }
function TMatrix4.CompMult(const AOther: TMatrix4): TMatrix4; assembler;
asm
movups xmm0, DQWORD[Self + $00] // Self[0]
movups xmm1, DQWORD[Self + $10] // Self[1]
movups xmm2, DQWORD[Self + $20] // Self[2]
movups xmm3, DQWORD[Self + $30] // Self[3]
movups xmm4, DQWORD[AOther + $00] // AOther[0]
movups xmm5, DQWORD[AOther + $10] // AOther[1]

// Component-wise multiplication
mulps xmm0, xmm4
mulps xmm1, xmm5
movups xmm4, DQWORD[AOther + $20] // AOther[2]
movups xmm5, DQWORD[AOther + $30] // AOther[3]
mulps xmm2, xmm4
mulps xmm3, xmm5

// Store result
movups DQWORD [Result + $00], xmm0
movups DQWORD [Result + $10], xmm1
movups DQWORD [Result + $20], xmm2
movups DQWORD [Result + $30], xmm3
end;
5263
{ Divides scalar A (arrives in XMM1) by every element of the 4x4 matrix B.
  Zero elements in B yield INF/NaN. B is loaded two rows at a time so only
  volatile registers XMM0-XMM5 are needed. }
class operator TMatrix4.Divide(const A: Single; const B: TMatrix4): TMatrix4; assembler;
asm
movups xmm4, DQWORD [B + $00] // Load 4 rows
shufps xmm1, xmm1, 0 // Replicate A
movups xmm5, DQWORD [B + $10]
movaps xmm0, xmm1 // Copies of the replicated A for each row
movaps xmm2, xmm1
movaps xmm3, xmm1
divps xmm1, xmm4 // Divide A by each row
divps xmm0, xmm5
movups xmm4, DQWORD [B + $20]
movups xmm5, DQWORD [B + $30]
divps xmm2, xmm4
divps xmm3, xmm5
movups DQWORD [Result + $00], xmm1
movups DQWORD [Result + $10], xmm0
movups DQWORD [Result + $20], xmm2
movups DQWORD [Result + $30], xmm3
end;
5283
{ Divides every element of the 4x4 matrix A by scalar B (arrives in XMM2).
  No zero check: B = 0 yields INF/NaN elements. }
class operator TMatrix4.Divide(const A: TMatrix4; const B: Single): TMatrix4; assembler;
asm
movups xmm1, DQWORD [A + $00] // Load 4 rows
shufps xmm2, xmm2, 0 // Replicate B
movups xmm0, DQWORD [A + $10]
movups xmm3, DQWORD [A + $20]
movups xmm4, DQWORD [A + $30]
divps xmm1, xmm2 // Divide each row by B
divps xmm0, xmm2 // NOTE: We could speed it up by multiplying by
divps xmm3, xmm2 // 1/B instead, using the "rcpps" instruction,
divps xmm4, xmm2 // but that instruction is an approximation,
// so we lose accuracy.
movups DQWORD [Result + $00], xmm1
movups DQWORD [Result + $10], xmm0
movups DQWORD [Result + $20], xmm3
movups DQWORD [Result + $30], xmm4
end;
5301
5302function TMatrix4.Inverse: TMatrix4; assembler;
5303type
5304TStack = record
5305case Byte of
53060: (WorkSpace: array [0..7] of TVector4);
53071: (F0, F1, F2, F3, F4, F5, X6, X7: TVector4);
5308end;
5309var
5310Stack: TStack;
5311asm
5312movdqa [Stack.X6], xmm6
5313movdqa [Stack.X7], xmm6
5314
5315movups xmm1, DQWORD[Self + $10] // M[1]
5316movups xmm2, DQWORD[Self + $20] // M[2]
5317movups xmm3, DQWORD[Self + $30] // M[3]
5318
5319// C00 := (A.M[2,2] * A.M[3,3]) - (A.M[3,2] * A.M[2,3]);
5320// C02 := (A.M[1,2] * A.M[3,3]) - (A.M[3,2] * A.M[1,3]);
5321// C03 := (A.M[1,2] * A.M[2,3]) - (A.M[2,2] * A.M[1,3]);
5322// F0 := Vector4(C00, C00, C02, C03);
5323movaps xmm5, xmm2 // M[2]
5324movaps xmm7, xmm2 // M[2]
5325movaps xmm0, xmm3 // M[3]
5326movaps xmm6, xmm3 // M[3]
5327shufps xmm6, xmm2, $AA // M22 M22 M32 M32
5328shufps xmm0, xmm2, $FF // M23 M23 M33 M33
5329shufps xmm7, xmm1, $FF // M13 M13 M23 M23
5330pshufd xmm4, xmm0, $80 // M23 M33 M33 M33
5331shufps xmm5, xmm1, $AA // M12 M12 M22 M22
5332pshufd xmm0, xmm6, $80 // M22 M32 M32 M32
5333mulps xmm5, xmm4 // (M12 * M23) (M12 * M33) (M22 * M33) (M22 * M33)
5334mulps xmm7, xmm0 // (M22 * M13) (M32 * M13) (M32 * M23) (M32 * M23)
5335subps xmm5, xmm7 // C03=(M12*M23)-(M22*M13), C02=(M12*M33)-(M32*M13), C00=(M22*M33)-(M32*M23), C00=(M22*M33)-(M32*M23)
5336movups [Stack.F0], xmm5
5337
5338// C04 := (A.M[2,1] * A.M[3,3]) - (A.M[3,1] * A.M[2,3]);
5339// C06 := (A.M[1,1] * A.M[3,3]) - (A.M[3,1] * A.M[1,3]);
5340// C07 := (A.M[1,1] * A.M[2,3]) - (A.M[2,1] * A.M[1,3]);
5341// F1 := Vector4(C04, C04, C06, C07);
5342movaps xmm5, xmm2 // M[2]
5343movaps xmm7, xmm2 // M[2]
5344movaps xmm0, xmm3 // M[3]
5345movaps xmm6, xmm3 // M[3]
5346shufps xmm6, xmm2, $55 // M21 M21 M31 M31
5347shufps xmm0, xmm2, $FF // M23 M23 M33 M33
5348shufps xmm7, xmm1, $FF // M13 M13 M23 M23
5349pshufd xmm4, xmm0, $80 // M23 M33 M33 M33
5350shufps xmm5, xmm1, $55 // M11 M11 M21 M21
5351pshufd xmm0, xmm6, $80 // M21 M31 M31 M31
5352mulps xmm5, xmm4 // (M11 * M23) (M11 * M33) (M21 * M33) (M21 * M33)
5353mulps xmm7, xmm0 // (M21 * M13) (M31 * M13) (M31 * M23) (M31 * M23)
5354subps xmm5, xmm7 // C07=(M11*M23)-(M21*M13), C06=(M11*M33)-(M31*M13), C04=(M21*M33)-(M31*M23), C04=(M21*M33)-(M31*M23)
5355movups [Stack.F1], xmm5
5356
5357// C08 := (A.M[2,1] * A.M[3,2]) - (A.M[3,1] * A.M[2,2]);
5358// C10 := (A.M[1,1] * A.M[3,2]) - (A.M[3,1] * A.M[1,2]);
5359// C11 := (A.M[1,1] * A.M[2,2]) - (A.M[2,1] * A.M[1,2]);
5360// F2 := Vector4(C08, C08, C10, C11);
5361movaps xmm5, xmm2 // M[2]
5362movaps xmm7, xmm2 // M[2]
5363movaps xmm0, xmm3 // M[3]
5364movaps xmm6, xmm3 // M[3]
5365shufps xmm6, xmm2, $55 // M21 M21 M31 M31
5366shufps xmm0, xmm2, $AA // M22 M22 M32 M32
5367shufps xmm7, xmm1, $AA // M12 M12 M22 M22
5368pshufd xmm4, xmm0, $80 // M22 M32 M32 M32
5369shufps xmm5, xmm1, $55 // M11 M11 M21 M21
5370pshufd xmm0, xmm6, $80 // M21 M31 M31 M31
5371mulps xmm5, xmm4 // (M11 * M22) (M11 * M32) (M21 * M32) (M21 * M32)
5372mulps xmm7, xmm0 // (M21 * M12) (M31 * M12) (M31 * M22) (M32 * M22)
5373subps xmm5, xmm7 // C11=(M11*M22)-(M21*M12), C10=(M11*M32)-(M31*M12), C08=(M21*M32)-(M31*M22), C08=(M21*M32)-(M31*M22)
5374movups [Stack.F2], xmm5
5375
5376// C12 := (A.M[2,0] * A.M[3,3]) - (A.M[3,0] * A.M[2,3]);
5377// C14 := (A.M[1,0] * A.M[3,3]) - (A.M[3,0] * A.M[1,3]);
5378// C15 := (A.M[1,0] * A.M[2,3]) - (A.M[2,0] * A.M[1,3]);
5379// F3 := Vector4(C12, C12, C14, C15);
5380movaps xmm5, xmm2 // M[2]
5381movaps xmm7, xmm2 // M[2]
5382movaps xmm0, xmm3 // M[3]
5383movaps xmm6, xmm3 // M[3]
5384shufps xmm6, xmm2, $00 // M20 M20 M30 M30
5385shufps xmm0, xmm2, $FF // M23 M23 M33 M33
5386shufps xmm7, xmm1, $FF // M13 M13 M23 M23
5387pshufd xmm4, xmm0, $80 // M23 M33 M33 M33
5388shufps xmm5, xmm1, $00 // M10 M10 M20 M20
5389pshufd xmm0, xmm6, $80 // M20 M30 M30 M30
5390mulps xmm5, xmm4 // (M10 * M23) (M10 * M33) (M20 * M33) (M20 * M33)
5391mulps xmm7, xmm0 // (M20 * M13) (M30 * M13) (M30 * M23) (M30 * M23)
5392subps xmm5, xmm7 // C15=(M10*M23)-(M20*M13), C14=(M10*M33)-(M30*M13), C12=(M20*M33)-(M30*M23), C12=(M20*M33)-(M30*M23)
5393movups [Stack.F3], xmm5
5394
5395// C16 := (A.M[2,0] * A.M[3,2]) - (A.M[3,0] * A.M[2,2]);
5396// C18 := (A.M[1,0] * A.M[3,2]) - (A.M[3,0] * A.M[1,2]);
5397// C19 := (A.M[1,0] * A.M[2,2]) - (A.M[2,0] * A.M[1,2]);
5398// F4 := Vector4(C16, C16, C18, C19);
5399movaps xmm5, xmm2 // M[2]
5400movaps xmm7, xmm2 // M[2]
5401movaps xmm0, xmm3 // M[3]
5402movaps xmm6, xmm3 // M[3]
5403shufps xmm6, xmm2, $00 // M20 M20 M30 M30
5404shufps xmm0, xmm2, $AA // M22 M22 M32 M32
5405shufps xmm7, xmm1, $AA // M12 M12 M22 M22
5406pshufd xmm4, xmm0, $80 // M22 M32 M32 M32
5407shufps xmm5, xmm1, $00 // M10 M10 M20 M20
5408pshufd xmm0, xmm6, $80 // M20 M30 M30 M30
5409mulps xmm5, xmm4 // (M10 * M22) (M10 * M32) (M20 * M32) (M20 * M32)
5410mulps xmm7, xmm0 // (M20 * M12) (M30 * M12) (M30 * M22) (M30 * M22)
5411subps xmm5, xmm7 // C19=(M10*M22)-(M20*M12), C18=(M10*M32)-(M30*M12), C16=(M20*M32)-(M30*M22), C16=(M20*M32)-(M30*M22)
5412movups [Stack.F4], xmm5
5413
5414// C20 := (A.M[2,0] * A.M[3,1]) - (A.M[3,0] * A.M[2,1]);
5415// C22 := (A.M[1,0] * A.M[3,1]) - (A.M[3,0] * A.M[1,1]);
5416// C23 := (A.M[1,0] * A.M[2,1]) - (A.M[2,0] * A.M[1,1]);
5417// F5 := Vector4(C20, C20, C22, C23);
5418movaps xmm5, xmm2 // M[2]
5419movaps xmm7, xmm2 // M[2]
5420movaps xmm0, xmm3 // M[3]
5421movaps xmm6, xmm3 // M[3]
5422shufps xmm6, xmm2, $00 // M20 M20 M30 M30
5423shufps xmm0, xmm2, $55 // M21 M21 M31 M31
5424shufps xmm7, xmm1, $55 // M11 M11 M21 M21
5425pshufd xmm4, xmm0, $80 // M21 M31 M31 M31
5426shufps xmm5, xmm1, $00 // M10 M10 M20 M20
5427pshufd xmm0, xmm6, $80 // M20 M30 M30 M30
5428mulps xmm5, xmm4 // (M10 * M21) (M10 * M31) (M20 * M31) (M20 * M31)
5429mulps xmm7, xmm0 // (M20 * M11) (M30 * M11) (M30 * M21) (M30 * M21)
5430subps xmm5, xmm7 // C23=(M10*M21)-(M20*M11), C22=(M10*M31)-(M30*M11), C20=(M20*M31)-(M30*M21), C20=(M20*M31)-(M30*M21)
5431movups [Stack.F5], xmm5
5432
5433// V0 := Vector4(A.M[1,0], A.M[0,0], A.M[0,0], A.M[0,0]);
5434// V1 := Vector4(A.M[1,1], A.M[0,1], A.M[0,1], A.M[0,1]);
5435// V2 := Vector4(A.M[1,2], A.M[0,2], A.M[0,2], A.M[0,2]);
5436// V3 := Vector4(A.M[1,3], A.M[0,3], A.M[0,3], A.M[0,3]);
5437movups xmm0, DQWORD[Self + $00] // M[0]
5438movaps xmm4, xmm1 // M[1]
5439movaps xmm5, xmm1 // M[1]
5440movaps xmm6, xmm1 // M[1]
5441movaps xmm7, xmm1 // M[1]
5442
5443shufps xmm4, xmm0, $00 // M00 M00 M10 M10
5444shufps xmm5, xmm0, $55 // M01 M01 M11 M11
5445shufps xmm6, xmm0, $AA // M02 M02 M12 M12
5446shufps xmm7, xmm0, $FF // M03 M03 M13 M13
5447
5448pshufd xmm4, xmm4, $A8 // V0=M00 M00 M00 M10
5449pshufd xmm5, xmm5, $A8 // V1=M01 M01 M01 M11
5450pshufd xmm6, xmm6, $A8 // V2=M02 M02 M02 M12
5451pshufd xmm7, xmm7, $A8 // V3=M03 M03 M03 M13
5452
5453// I0 := (V1 * F0) - (V2 * F1) + (V3 * F2);
5454// I1 := (V0 * F0) - (V2 * F3) + (V3 * F4);
5455// I2 := (V0 * F1) - (V1 * F3) + (V3 * F5);
5456// I3 := (V0 * F2) - (V1 * F4) + (V2 * F5);
5457movaps xmm0, xmm5 // V1
5458movaps xmm1, xmm6 // V2
5459movaps xmm2, xmm7 // V3
5460mulps xmm0, [Stack.F0] // V1 * F0
5461mulps xmm1, [Stack.F1] // V2 * F1
5462mulps xmm2, [Stack.F2] // V3 * F2
5463subps xmm0, xmm1 // (V1 * F0) - (V2 * F1)
5464movaps xmm1, xmm4 // V0
5465addps xmm0, xmm2 // I0=(V1 * F0) - (V2 * F1) + (V3 * F2)
5466
5467movaps xmm2, xmm6 // V2
5468movaps xmm3, xmm7 // V3
5469mulps xmm1, [Stack.F0] // V0 * F0
5470mulps xmm2, [Stack.F3] // V2 * F3
5471mulps xmm3, [Stack.F4] // V3 * F4
5472subps xmm1, xmm2 // (V0 * F0) - (V2 * F3)
5473movaps xmm2, xmm4 // V0
5474addps xmm1, xmm3 // I1=(V0 * F0) - (V2 * F3) + (V3 * F4)
5475
5476movaps xmm3, xmm5 // V1
5477mulps xmm2, [Stack.F1] // V0 * F1
5478mulps xmm3, [Stack.F3] // V1 * F3
5479mulps xmm7, [Stack.F5] // V3 * F5
5480subps xmm2, xmm3 // (V0 * F1) - (V1 * F3)
5481mulps xmm4, [Stack.F2] // V0 * F2
5482addps xmm2, xmm7 // I2=(V0 * F1) - (V1 * F3) + (V3 * F5)
5483
5484mulps xmm5, [Stack.F4] // V1 * F4
5485mulps xmm6, [Stack.F5] // V2 * F5
5486subps xmm4, xmm5 // (V0 * F2) - (V1 * F4)
5487addps xmm4, xmm6 // I3=(V0 * F2) - (V1 * F4) + (V2 * F5)
5488
5489// SA := Vector4(+1, -1, +1, -1);
5490// SB := Vector4(-1, +1, -1, +1);
5491// Inv := Matrix4(I0 * SA, I1 * SB, I2 * SA, I3 * SB);
5492
5493movups xmm6, [SSE_MASK_PNPN] // SA
5494movups xmm7, [SSE_MASK_NPNP] // SB
5495xorps xmm0, xmm6 // Inv[0] = I0 * SA
5496xorps xmm1, xmm7 // Inv[1] = I1 * SB
5497xorps xmm2, xmm6 // Inv[2] = I2 * SA
5498xorps xmm4, xmm7 // Inv[3] = I3 * SB
5499
5500// Row := Vector4(Inv[0,0], Inv[1,0], Inv[2,0], Inv[3,0]);
5501movaps xmm3, xmm0
5502movaps xmm5, xmm2
5503movaps xmm6, xmm1
5504
5505unpcklps xmm3, xmm1 // Inv[1,1] Inv[0,1] Inv[1,0] Inv[0,0]
5506unpcklps xmm5, xmm4 // Inv[3,1] Inv[2,1] Inv[3,0] Inv[2,0]
5507movups xmm6, DQWORD[Self + $00] // A.C[0]
5508movlhps xmm3, xmm5 // Inv[3,0] Inv[2,0] Inv[1,0] Inv[0,0]
5509
5510// Dot := A.C[0] * Row;
5511mulps xmm3, xmm6 // Dot.W Dot.Z Dot.Y Dot.X
5512
5513// OneOverDeterminant := 1 / ((Dot.X + Dot.Y) + (Dot.Z + Dot.W));
5514pshufd xmm6, xmm3, $4E // Dot.Y Dot.X Dot.W Dot.Z
5515addps xmm3, xmm6 // W+Y Z+X Y+W X+Z
5516pshufd xmm6, xmm3, $11 // X+Z Y+X X+Z Y+W
5517movups xmm5, [SSE_ONE] // 1.0 (4x)
5518addps xmm3, xmm6 // X+Y+Z+W (4x)
5519divps xmm5, xmm3 // OneOverDeterminant (4x)
5520
5521// Result := Inv * OneOverDeterminant;
5522mulps xmm0, xmm5
5523mulps xmm1, xmm5
5524mulps xmm2, xmm5
5525mulps xmm4, xmm5
5526
5527movups DQWORD[Result + $00], xmm0
5528movups DQWORD[Result + $10], xmm1
5529movups DQWORD[Result + $20], xmm2
5530movups DQWORD[Result + $30], xmm4
5531
5532movdqa xmm6, [Stack.X6]
5533movdqa xmm7, [Stack.X7]
5534end;
5535
{ Scalar * matrix: multiplies every element of B by the scalar A.
  Win64: the hidden Result pointer takes the first parameter slot, so the
  Single A arrives in XMM1 and the address of B in the next integer register
  (see the calling-convention note at the top of this file). }
class operator TMatrix4.Multiply(const A: Single; const B: TMatrix4): TMatrix4; assembler;
asm
movups xmm0, DQWORD [B + $00] // Load 4 rows
shufps xmm1, xmm1, 0 // Replicate A into all 4 lanes of xmm1
movups xmm2, DQWORD [B + $10]
movups xmm3, DQWORD [B + $20]
movups xmm4, DQWORD [B + $30]
mulps xmm0, xmm1 // Multiply each row by A
mulps xmm2, xmm1
mulps xmm3, xmm1
mulps xmm4, xmm1
movups DQWORD [Result + $00], xmm0 // Store the 4 scaled rows
movups DQWORD [Result + $10], xmm2
movups DQWORD [Result + $20], xmm3
movups DQWORD [Result + $30], xmm4
end;
5552
{ Matrix * scalar: multiplies every element of A by the scalar B.
  Win64: with the hidden Result pointer first and the address of A second,
  the Single B is the third parameter and so arrives in XMM2 (see the
  calling-convention note at the top of this file). }
class operator TMatrix4.Multiply(const A: TMatrix4; const B: Single): TMatrix4; assembler;
asm
movups xmm1, DQWORD [A + $00] // Load 4 rows
shufps xmm2, xmm2, 0 // Replicate B into all 4 lanes of xmm2
movups xmm0, DQWORD [A + $10]
movups xmm3, DQWORD [A + $20]
movups xmm4, DQWORD [A + $30]
mulps xmm1, xmm2 // Multiply each row by B
mulps xmm0, xmm2
mulps xmm3, xmm2
mulps xmm4, xmm2
movups DQWORD [Result + $00], xmm1 // Store the 4 scaled rows
movups DQWORD [Result + $10], xmm0
movups DQWORD [Result + $20], xmm3
movups DQWORD [Result + $30], xmm4
end;
5569
5570{$IFDEF FM_COLUMN_MAJOR}
{ Matrix * vector (column-major build): Result is the linear combination of
  the four consecutive 16-byte groups of A (its columns in this storage
  order), weighted by B.X, B.Y, B.Z and B.W respectively. }
class operator TMatrix4.Multiply(const A: TMatrix4; const B: TVector4): TVector4; assembler;
asm
movups xmm0, [B] // Load vector B
movups xmm4, DQWORD [A + $00] // A column 0
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, $00 // Broadcast B.X
shufps xmm1, xmm1, $55 // Broadcast B.Y
shufps xmm2, xmm2, $AA // Broadcast B.Z
shufps xmm3, xmm3, $FF // Broadcast B.W
movups xmm5, DQWORD [A + $10] // A column 1
mulps xmm0, xmm4 // B.X * column 0
mulps xmm1, xmm5 // B.Y * column 1
movups xmm4, DQWORD [A + $20] // A column 2
movups xmm5, DQWORD [A + $30] // A column 3
mulps xmm2, xmm4 // B.Z * column 2
mulps xmm3, xmm5 // B.W * column 3
addps xmm0, xmm1 // Sum the 4 weighted columns
addps xmm2, xmm3
addps xmm0, xmm2
movups [Result], xmm0
end;
5594
{ Vector * matrix (column-major build): computes the dot product of A with
  each of the four consecutive 16-byte groups of B. The element-wise
  products are formed first, then a 4x4 register transpose lines up the
  partial products so three vertical adds produce all four dots at once. }
class operator TMatrix4.Multiply(const A: TVector4; const B: TMatrix4): TVector4; assembler;
asm
movups xmm0, [A] // Load vector A
movups xmm4, DQWORD [B + $00] // B column 0
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
movups xmm5, DQWORD [B + $10] // B column 1
mulps xmm0, xmm4 // A * column 0 (element-wise)
mulps xmm1, xmm5 // A * column 1
movups xmm4, DQWORD [B + $20] // B column 2
movups xmm5, DQWORD [B + $30] // B column 3
mulps xmm2, xmm4 // A * column 2
mulps xmm3, xmm5 // A * column 3

{ Transpose xmm0-xmm3 }
movaps xmm4, xmm2
unpcklps xmm2, xmm3
unpckhps xmm4, xmm3

movaps xmm3, xmm0
unpcklps xmm0, xmm1
unpckhps xmm3, xmm1

movaps xmm1, xmm0
unpcklpd xmm0, xmm2
unpckhpd xmm1, xmm2

movaps xmm2, xmm3
unpcklpd xmm2, xmm4
unpckhpd xmm3, xmm4

addps xmm0, xmm1 // Vertical adds complete the 4 dot products
addps xmm2, xmm3
addps xmm0, xmm2
movups [Result], xmm0
end;
5632
{ Matrix * matrix (column-major build): each 16-byte group of the result is
  computed as a linear combination of A's four groups, weighted by the
  corresponding elements of one group of B — i.e. four vector*matrix
  products, one per column of B. }
class operator TMatrix4.Multiply(const A, B: TMatrix4): TMatrix4; assembler;
{ Code below consists of 4 Vector*Matrix calculations }
asm
// Save non-volatile xmm6/xmm7 below RSP. RSP ends in 8 at entry (see file
// header), so rsp-24 and rsp-40 are 16-byte aligned as movdqa requires.
// NOTE(review): Win64 has no red zone; memory below RSP can in principle be
// clobbered by asynchronous stack use (APC/exception dispatch) — confirm.
movdqa [rsp-24], xmm6
movdqa [rsp-40], xmm7

// Result group 0 = combination of A's groups weighted by B[0..3]
movups xmm0, DQWORD [B + $00]
movups xmm4, DQWORD [A + $00]
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, $00 // Broadcast element 0
shufps xmm1, xmm1, $55 // Broadcast element 1
shufps xmm2, xmm2, $AA // Broadcast element 2
shufps xmm3, xmm3, $FF // Broadcast element 3
movups xmm5, DQWORD [A + $10]
movups xmm6, DQWORD [A + $20]
movups xmm7, DQWORD [A + $30]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movups DQWORD [Result + $00], xmm0

// Result group 1 (A's groups stay cached in xmm4-xmm7)
movups xmm0, DQWORD [B + $10]
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, $00
shufps xmm1, xmm1, $55
shufps xmm2, xmm2, $AA
shufps xmm3, xmm3, $FF
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movups DQWORD [Result + $10], xmm0

// Result group 2
movups xmm0, DQWORD [B + $20]
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, $00
shufps xmm1, xmm1, $55
shufps xmm2, xmm2, $AA
shufps xmm3, xmm3, $FF
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movups DQWORD [Result + $20], xmm0

// Result group 3
movups xmm0, DQWORD [B + $30]
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, $00
shufps xmm1, xmm1, $55
shufps xmm2, xmm2, $AA
shufps xmm3, xmm3, $FF
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movups DQWORD [Result + $30], xmm0

// Restore non-volatile registers
movdqa xmm6, [rsp-24]
movdqa xmm7, [rsp-40]
end;
5714{$ELSE}
{ Matrix * vector (row-major build): computes the dot product of vector B
  with each row of A. Element-wise products are formed per row, then a 4x4
  register transpose lines up the partial products so three vertical adds
  finish all four dot products at once. }
class operator TMatrix4.Multiply(const A: TMatrix4; const B: TVector4): TVector4; assembler;
asm
movups xmm0, [B] // Load vector
movups xmm4, DQWORD [A + $00] // Load 4 rows
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
movups xmm5, DQWORD [A + $10]
mulps xmm0, xmm4 // (Ax * B00), (Ay * B01), (Az * B02), (Aw * B03)
mulps xmm1, xmm5 // (Ax * B10), (Ay * B11), (Az * B12), (Aw * B13)
movups xmm4, DQWORD [A + $20]
movups xmm5, DQWORD [A + $30]
mulps xmm2, xmm4 // (Ax * B20), (Ay * B21), (Az * B22), (Aw * B23)
mulps xmm3, xmm5 // (Ax * B30), (Ay * B31), (Az * B32), (Aw * B33)

{ Transpose xmm0-xmm3 }
movaps xmm4, xmm2
unpcklps xmm2, xmm3 // B32 B22 B33 B23
unpckhps xmm4, xmm3 // B30 B20 B31 B21

movaps xmm3, xmm0
unpcklps xmm0, xmm1 // B12 B02 B13 B03
unpckhps xmm3, xmm1 // B10 B00 B11 B01

movaps xmm1, xmm0
unpcklpd xmm0, xmm2 // B33 B23 B13 B03
unpckhpd xmm1, xmm2 // B32 B22 B12 B02

movaps xmm2, xmm3
unpcklpd xmm2, xmm4 // B31 B21 B11 B01
unpckhpd xmm3, xmm4 // B30 B20 B10 B00

addps xmm0, xmm1 // Add rows (vertical adds complete the dot products)
addps xmm2, xmm3
addps xmm0, xmm2
movups [Result], xmm0
end;
5752
{ Vector * matrix (row-major build): Result is the linear combination of B's
  rows weighted by A's components:
  Result = A.X*B.R[0] + A.Y*B.R[1] + A.Z*B.R[2] + A.W*B.R[3]. }
class operator TMatrix4.Multiply(const A: TVector4; const B: TMatrix4): TVector4; assembler;
asm
movups xmm0, [A] // Load vector
movups xmm4, DQWORD [B + $00] // Load 4 rows
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, $00 // Bx Bx Bx Bx
shufps xmm1, xmm1, $55 // By By By By
shufps xmm2, xmm2, $AA // Bz Bz Bz Bz
shufps xmm3, xmm3, $FF // Bw Bw Bw Bw
movups xmm5, DQWORD [B + $10]
mulps xmm0, xmm4 // (A00 * Bx), (A01 * Bx), (A02 * Bx), (A03 * Bx)
mulps xmm1, xmm5 // (A10 * By), (A11 * By), (A12 * By), (A13 * By)
movups xmm4, DQWORD [B + $20]
movups xmm5, DQWORD [B + $30]
mulps xmm2, xmm4 // (A20 * Bz), (A21 * Bz), (A22 * Bz), (A23 * Bz)
mulps xmm3, xmm5 // (A30 * Bw), (A31 * Bw), (A32 * Bw), (A33 * Bw)
addps xmm0, xmm1 // Add rows
addps xmm2, xmm3
addps xmm0, xmm2
movups [Result], xmm0
end;
5776
{ Matrix * matrix (row-major build): each result row is A.R[i] * B, computed
  as a linear combination of B's rows weighted by the elements of A.R[i]. }
class operator TMatrix4.Multiply(const A, B: TMatrix4): TMatrix4; assembler;
{ Code below consists of 4 Vector*Matrix calculations }
asm
// Save non-volatile xmm6/xmm7 below RSP. RSP ends in 8 at entry (see file
// header), so rsp-24 and rsp-40 are 16-byte aligned as movdqa requires.
// NOTE(review): Win64 has no red zone; memory below RSP can in principle be
// clobbered by asynchronous stack use (APC/exception dispatch) — confirm.
movdqa [rsp-24], xmm6
movdqa [rsp-40], xmm7

{ A.R[0] * B }
movups xmm0, DQWORD [A + $00]
movups xmm4, DQWORD [B + $00]
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, $00 // Broadcast A[0,0]
shufps xmm1, xmm1, $55 // Broadcast A[0,1]
shufps xmm2, xmm2, $AA // Broadcast A[0,2]
shufps xmm3, xmm3, $FF // Broadcast A[0,3]
movups xmm5, DQWORD [B + $10]
movups xmm6, DQWORD [B + $20]
movups xmm7, DQWORD [B + $30]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movups DQWORD [Result + $00], xmm0

{ A.R[1] * B } // B's rows stay cached in xmm4-xmm7
movups xmm0, DQWORD [A + $10]
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, $00
shufps xmm1, xmm1, $55
shufps xmm2, xmm2, $AA
shufps xmm3, xmm3, $FF
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movups DQWORD [Result + $10], xmm0

{ A.R[2] * B }
movups xmm0, DQWORD [A + $20]
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, $00
shufps xmm1, xmm1, $55
shufps xmm2, xmm2, $AA
shufps xmm3, xmm3, $FF
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movups DQWORD [Result + $20], xmm0

{ A.R[3] * B }
movups xmm0, DQWORD [A + $30]
movaps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm0
shufps xmm0, xmm0, $00
shufps xmm1, xmm1, $55
shufps xmm2, xmm2, $AA
shufps xmm3, xmm3, $FF
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movups DQWORD [Result + $30], xmm0

// Restore non-volatile registers
movdqa xmm6, [rsp-24]
movdqa xmm7, [rsp-40]
end;
5862{$ENDIF}
5863
{ Unary minus: negates every element of A by XOR'ing each 32-bit float with
  the sign-bit mask (see the SSE_MASK_SIGN note at the top of this file). }
class operator TMatrix4.Negative(const A: TMatrix4): TMatrix4; assembler;
asm
movups xmm0, [SSE_MASK_SIGN] // Load mask with 4 sign (upper) bits
movups xmm1, DQWORD [A + $00] // Load 4 rows
movups xmm2, DQWORD [A + $10]
movups xmm3, DQWORD [A + $20]
movups xmm4, DQWORD [A + $30]
xorps xmm1, xmm0 // Flip sign bits of each element in each row
xorps xmm2, xmm0
xorps xmm3, xmm0
xorps xmm4, xmm0
movups DQWORD [Result + $00], xmm1
movups DQWORD [Result + $10], xmm2
movups DQWORD [Result + $20], xmm3
movups DQWORD [Result + $30], xmm4
end;
5880
{ Replaces this matrix, in place, with the value returned by Inverse. }
procedure TMatrix4.SetInversed;
begin
  Self := Self.Inverse;
end;
5885
{ Replaces this matrix, in place, with the value returned by Transpose. }
procedure TMatrix4.SetTransposed;
begin
  Self := Self.Transpose;
end;
5890
{ Matrix - scalar: subtracts the scalar B from every element of A.
  Win64: with the hidden Result pointer first and the address of A second,
  the Single B is the third parameter and so arrives in XMM2 (see the
  calling-convention note at the top of this file). }
class operator TMatrix4.Subtract(const A: TMatrix4; const B: Single): TMatrix4; assembler;
asm
movups xmm1, DQWORD [A + $00] // Load 4 rows
shufps xmm2, xmm2, 0 // Replicate B into all 4 lanes of xmm2
movups xmm0, DQWORD [A + $10]
movups xmm3, DQWORD [A + $20]
movups xmm4, DQWORD [A + $30]
subps xmm1, xmm2 // Subtract B from each row
subps xmm0, xmm2
subps xmm3, xmm2
subps xmm4, xmm2
movups DQWORD [Result + $00], xmm1
movups DQWORD [Result + $10], xmm0
movups DQWORD [Result + $20], xmm3
movups DQWORD [Result + $30], xmm4
end;
5907
{ Scalar - matrix: Result[i,j] = A - B[i,j] for every element.
  Win64: the hidden Result pointer takes the first parameter slot, so the
  Single A arrives in XMM1 (see the calling-convention note at the top of
  this file). A is replicated and copied to three more registers because
  subps destroys its first operand. }
class operator TMatrix4.Subtract(const A: Single; const B: TMatrix4): TMatrix4; assembler;
asm
movups xmm4, DQWORD [B + $00] // Load 4 rows
shufps xmm1, xmm1, 0 // Replicate A into all 4 lanes
movups xmm5, DQWORD [B + $10]
movaps xmm0, xmm1 // Three extra copies of the replicated A
movaps xmm2, xmm1
movaps xmm3, xmm1
subps xmm1, xmm4 // Subtract each row from A
subps xmm0, xmm5
movups xmm4, DQWORD [B + $20]
movups xmm5, DQWORD [B + $30]
subps xmm2, xmm4
subps xmm3, xmm5
movups DQWORD [Result + $00], xmm1
movups DQWORD [Result + $10], xmm0
movups DQWORD [Result + $20], xmm2
movups DQWORD [Result + $30], xmm3
end;
5927
{ Matrix - matrix: element-wise subtraction, Result = A - B, one SSE subps
  per 16-byte row. B's last two rows are loaded after the first two subs to
  reuse xmm4/xmm5. }
class operator TMatrix4.Subtract(const A, B: TMatrix4): TMatrix4; assembler;
asm
movups xmm0, DQWORD [A + $00] // Load 4 rows of A
movups xmm1, DQWORD [A + $10]
movups xmm2, DQWORD [A + $20]
movups xmm3, DQWORD [A + $30]
movups xmm4, DQWORD [B + $00] // Load 4 rows of B
movups xmm5, DQWORD [B + $10]
subps xmm0, xmm4 // Subtract rows
subps xmm1, xmm5
movups xmm4, DQWORD [B + $20]
movups xmm5, DQWORD [B + $30]
subps xmm2, xmm4
subps xmm3, xmm5
movups DQWORD [Result + $00], xmm0
movups DQWORD [Result + $10], xmm1
movups DQWORD [Result + $20], xmm2
movups DQWORD [Result + $30], xmm3
end;
5947
{ Returns the transpose of this matrix (Result[i,j] = Self[j,i]), using the
  standard SSE 4x4 transpose: unpcklps/unpckhps interleave pairs of rows,
  then unpcklpd/unpckhpd recombine the 64-bit halves into the final rows.
  Self is left unmodified; use SetTransposed to transpose in place. }
function TMatrix4.Transpose: TMatrix4; assembler;
asm
movups xmm0, DQWORD[Self + $00] // A03 A02 A01 A00
movups xmm1, DQWORD[Self + $10] // A13 A12 A11 A10
movups xmm2, DQWORD[Self + $20] // A23 A22 A21 A20
movups xmm3, DQWORD[Self + $30] // A33 A32 A31 A30

movaps xmm4, xmm2
unpcklps xmm2, xmm3 // A31 A21 A30 A20
unpckhps xmm4, xmm3 // A33 A23 A32 A22

movaps xmm3, xmm0
unpcklps xmm0, xmm1 // A11 A01 A10 A00
unpckhps xmm3, xmm1 // A13 A03 A12 A02

movaps xmm1, xmm0
unpcklpd xmm0, xmm2 // A30 A20 A10 A00
unpckhpd xmm1, xmm2 // A31 A21 A11 A01

movaps xmm2, xmm3
unpcklpd xmm2, xmm4 // A32 A22 A12 A02
unpckhpd xmm3, xmm4 // A33 A23 A13 A03

movups DQWORD[Result + $00], xmm0
movups DQWORD[Result + $10], xmm1
movups DQWORD[Result + $20], xmm2
movups DQWORD[Result + $30], xmm3
end;
5976