2
* Copyright (c) 2016, 2021, Intel Corporation. All rights reserved.
3
* Intel Math Library (LIBM) Source Code
5
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7
* This code is free software; you can redistribute it and/or modify it
8
* under the terms of the GNU General Public License version 2 only, as
9
* published by the Free Software Foundation.
11
* This code is distributed in the hope that it will be useful, but WITHOUT
12
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14
* version 2 for more details (a copy is included in the LICENSE file that
15
* accompanied this code).
17
* You should have received a copy of the GNU General Public License version
18
* 2 along with this work; if not, write to the Free Software Foundation,
19
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
* or visit www.oracle.com if you need additional information or have any
27
#include "precompiled.hpp"
28
#include "macroAssembler_x86.hpp"
29
#include "stubGenerator_x86_64.hpp"
31
/******************************************************************************/
32
// ALGORITHM DESCRIPTION - COS()
33
// ---------------------
37
// We perform an initial range reduction from X to r with
41
// so that |r| <= pi/64 + epsilon. We restrict inputs to those
42
// where |N| <= 932560. Beyond this, the range reduction is
43
// insufficiently accurate. For extremely small inputs,
44
// denormalization can occur internally, impacting performance.
45
// This means that the main path is actually only taken for
46
// 2^-252 <= |X| < 90112.
48
// To avoid branches, we perform the range reduction to full
51
// X - N * (P_1 + P_2 + P_3)
53
// where P_1 and P_2 are 32-bit numbers (so multiplication by N
54
// is exact) and P_3 is a 53-bit number. Together, these
55
// approximate pi well enough for all cases in the restricted
58
// The main reduction sequence is:
62
// (computed by adding and subtracting off SHIFTER)
68
// (this r can be used for most of the calculation)
77
// The algorithm uses a table lookup based on B = M * pi / 32
78
// where M = N mod 64. The stored values are:
79
// sigma closest power of 2 to cos(B)
80
// C_hl 53-bit cos(B) - sigma
81
// S_hi + S_lo 2 * 53-bit sin(B)
83
// The computation is organized as follows:
85
// sin(B + r + c) = [sin(B) + sigma * r] +
86
// r * (cos(B) - sigma) +
87
// sin(B) * [cos(r + c) - 1] +
88
// cos(B) * [sin(r + c) - r]
90
// which is approximately:
92
// [S_hi + sigma * r] +
94
// S_lo + S_hi * [(cos(r) - 1) - r * c] +
95
// (C_hl + sigma) * [(sin(r) - r) + c]
97
// and this is what is actually computed. We separate this sum
100
// hi + med + pols + corr
104
// hi = S_hi + sigma r
106
// pols = S_hi * (cos(r) - 1) + (C_hl + sigma) * (sin(r) - r)
107
// corr = S_lo + c * ((C_hl + sigma) - S_hi * r)
111
// The polynomial S_hi * (cos(r) - 1) + (C_hl + sigma) *
112
// (sin(r) - r) can be rearranged freely, since it is quite
113
// small, so we exploit parallelism to the fullest.
123
// sincospols = psc1 + msc3
124
// pols = sincospols *
125
// <S_hi * r^2 | (C_hl + sigma) * r^3>
129
// This is where the "c" component of the range reduction is
130
// taken into account; recall that just "r" is used for most of
134
// -d = S_hi * r - (C_hl + sigma)
135
// corr = -c * -d + S_lo
137
// 5. COMPENSATED SUMMATIONS
139
// The two successive compensated summations add up the high
140
// and medium parts, leaving just the low parts to add up at
144
// res_int = S_hi + rs
145
// k_0 = S_hi - res_int
148
// res_hi = res_int + med
149
// k_1 = res_int - res_hi
154
// We now add up all the small parts:
156
// res_lo = pols(hi) + pols(lo) + corr + k_1 + k_3
158
// Now the overall result is just:
164
// Inputs with |X| < 2^-252 are treated specially as
168
// cos(NaN) = quiet NaN, and raise invalid exception
169
// cos(INF) = NaN and raise invalid exception
172
/******************************************************************************/
176
address StubGenerator::generate_libmCos() {
177
StubCodeMark mark(this, "StubRoutines", "libmCos");
178
address start = __ pc();
180
Label L_2TAG_PACKET_0_0_1, L_2TAG_PACKET_1_0_1, L_2TAG_PACKET_2_0_1, L_2TAG_PACKET_3_0_1;
181
Label L_2TAG_PACKET_4_0_1, L_2TAG_PACKET_5_0_1, L_2TAG_PACKET_6_0_1, L_2TAG_PACKET_7_0_1;
182
Label L_2TAG_PACKET_8_0_1, L_2TAG_PACKET_9_0_1, L_2TAG_PACKET_10_0_1, L_2TAG_PACKET_11_0_1;
183
Label L_2TAG_PACKET_12_0_1, L_2TAG_PACKET_13_0_1, B1_2, B1_4;
185
__ enter(); // required for proper stackwalking of RuntimeStub frame
194
__ movsd(Address(rsp, 8), xmm0);
197
__ movl(rax, Address(rsp, 12));
198
__ movq(xmm1, ExternalAddress(PI32INV), rbx /*rscratch*/); //0x6dc9c883UL, 0x40245f30UL
199
__ andl(rax, 2147418112);
200
__ subl(rax, 808452096);
201
__ cmpl(rax, 281346048);
202
__ jcc(Assembler::above, L_2TAG_PACKET_0_0_1);
203
__ mulsd(xmm1, xmm0);
204
__ movdqu(xmm5, ExternalAddress(ONEHALF), rbx /*rscratch*/); //0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL
205
__ movq(xmm4, ExternalAddress(SIGN_MASK), rbx /*rscratch*/); //0x00000000UL, 0x80000000UL
208
__ addpd(xmm1, xmm5);
209
__ cvttsd2sil(rdx, xmm1);
210
__ cvtsi2sdl(xmm1, rdx);
211
__ movdqu(xmm2, ExternalAddress(P_2), rbx /*rscratch*/); //0x1a600000UL, 0x3d90b461UL, 0x1a600000UL, 0x3d90b461UL
212
__ movq(xmm3, ExternalAddress(P_1), rbx /*rscratch*/); //0x54400000UL, 0x3fb921fbUL
213
__ mulsd(xmm3, xmm1);
214
__ unpcklpd(xmm1, xmm1);
215
__ addq(rdx, 1865232);
216
__ movdqu(xmm4, xmm0);
218
__ movdqu(xmm5, ExternalAddress(SC_4), rbx /*rscratch*/); //0xa556c734UL, 0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL
219
__ lea(rax, ExternalAddress(Ctable));
222
__ mulpd(xmm2, xmm1);
223
__ subsd(xmm0, xmm3);
224
__ mulsd(xmm1, ExternalAddress(P_3), rbx /*rscratch*/); //0x2e037073UL, 0x3b63198aUL
225
__ subsd(xmm4, xmm3);
226
__ movq(xmm7, Address(rax, 8));
227
__ unpcklpd(xmm0, xmm0);
228
__ movdqu(xmm3, xmm4);
229
__ subsd(xmm4, xmm2);
230
__ mulpd(xmm5, xmm0);
231
__ subpd(xmm0, xmm2);
232
__ movdqu(xmm6, ExternalAddress(SC_2), rbx /*rscratch*/); //0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL
233
__ mulsd(xmm7, xmm4);
234
__ subsd(xmm3, xmm4);
235
__ mulpd(xmm5, xmm0);
236
__ mulpd(xmm0, xmm0);
237
__ subsd(xmm3, xmm2);
238
__ movdqu(xmm2, Address(rax, 0));
239
__ subsd(xmm1, xmm3);
240
__ movq(xmm3, Address(rax, 24));
241
__ addsd(xmm2, xmm3);
242
__ subsd(xmm7, xmm2);
243
__ mulsd(xmm2, xmm4);
244
__ mulpd(xmm6, xmm0);
245
__ mulsd(xmm3, xmm4);
246
__ mulpd(xmm2, xmm0);
247
__ mulpd(xmm0, xmm0);
248
__ addpd(xmm5, ExternalAddress(SC_3), rbx /*rscratch*/); //0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL
249
__ mulsd(xmm4, Address(rax, 0));
250
__ addpd(xmm6, ExternalAddress(SC_1), rbx /*rscratch*/); //0x55555555UL, 0xbfc55555UL, 0x00000000UL, 0xbfe00000UL
251
__ mulpd(xmm5, xmm0);
252
__ movdqu(xmm0, xmm3);
253
__ addsd(xmm3, Address(rax, 8));
254
__ mulpd(xmm1, xmm7);
255
__ movdqu(xmm7, xmm4);
256
__ addsd(xmm4, xmm3);
257
__ addpd(xmm6, xmm5);
258
__ movq(xmm5, Address(rax, 8));
259
__ subsd(xmm5, xmm3);
260
__ subsd(xmm3, xmm4);
261
__ addsd(xmm1, Address(rax, 16));
262
__ mulpd(xmm6, xmm2);
263
__ addsd(xmm0, xmm5);
264
__ addsd(xmm3, xmm7);
265
__ addsd(xmm0, xmm1);
266
__ addsd(xmm0, xmm3);
267
__ addsd(xmm0, xmm6);
268
__ unpckhpd(xmm6, xmm6);
269
__ addsd(xmm0, xmm6);
270
__ addsd(xmm0, xmm4);
273
__ bind(L_2TAG_PACKET_0_0_1);
274
__ jcc(Assembler::greater, L_2TAG_PACKET_1_0_1);
275
__ pextrw(rax, xmm0, 3);
277
__ pinsrw(xmm0, rax, 3);
278
__ movq(xmm1, ExternalAddress(ONE), rbx /*rscratch*/); // 0x00000000UL, 0x3ff00000UL
279
__ subsd(xmm1, xmm0);
280
__ movdqu(xmm0, xmm1);
283
__ bind(L_2TAG_PACKET_1_0_1);
284
__ pextrw(rax, xmm0, 3);
287
__ jcc(Assembler::equal, L_2TAG_PACKET_2_0_1);
288
__ pextrw(rcx, xmm0, 3);
293
__ lea(r11, ExternalAddress(PI_INV_TABLE));
296
__ movl(r10, Address(rcx, 20));
297
__ movl(r8, Address(rcx, 24));
300
__ orl(rax, INT_MIN);
306
__ movl(rsi, Address(rcx, 16));
307
__ movl(rdi, Address(rcx, 12));
330
__ movl(r9, Address(rcx, 8));
331
__ movl(rsi, Address(rcx, 4));
354
__ movl(rax, Address(rcx, 0));
363
__ pextrw(rbx, xmm0, 3);
364
__ lea(rdi, ExternalAddress(PI_INV_TABLE));
380
__ jcc(Assembler::less, L_2TAG_PACKET_3_0_1);
385
__ andl(r9, 536870911);
386
__ testl(r9, 268435456);
387
__ jcc(Assembler::notEqual, L_2TAG_PACKET_4_0_1);
393
__ bind(L_2TAG_PACKET_5_0_1);
395
__ bind(L_2TAG_PACKET_6_0_1);
397
__ jcc(Assembler::equal, L_2TAG_PACKET_7_0_1);
399
__ bind(L_2TAG_PACKET_8_0_1);
403
__ jcc(Assembler::lessEqual, L_2TAG_PACKET_9_0_1);
415
__ bind(L_2TAG_PACKET_10_0_1);
416
__ cvtsi2sdq(xmm0, r9);
418
__ cvtsi2sdq(xmm3, r10);
419
__ xorpd(xmm4, xmm4);
425
__ pinsrw(xmm4, rdx, 3);
426
__ movq(xmm2, ExternalAddress(PI_4), rbx /*rscratch*/); //0x40000000UL, 0x3fe921fbUL, 0x18469899UL, 0x3e64442dUL
427
__ movq(xmm6, ExternalAddress(PI_4 + 8), rbx /*rscratch*/); //0x3fe921fbUL, 0x18469899UL, 0x3e64442dUL
428
__ xorpd(xmm5, xmm5);
430
__ pinsrw(xmm5, rdx, 3);
431
__ mulsd(xmm0, xmm4);
434
__ mulsd(xmm3, xmm5);
435
__ movdqu(xmm1, xmm0);
436
__ mulsd(xmm0, xmm2);
438
__ addsd(xmm1, xmm3);
439
__ mulsd(xmm3, xmm2);
442
__ mulsd(xmm6, xmm1);
444
__ addsd(xmm6, xmm3);
445
__ movdqu(xmm2, xmm0);
446
__ addsd(xmm0, xmm6);
447
__ subsd(xmm2, xmm0);
448
__ addsd(xmm6, xmm2);
450
__ bind(L_2TAG_PACKET_11_0_1);
451
__ movq(xmm1, ExternalAddress(PI32INV), rbx /*rscratch*/); //0x6dc9c883UL, 0x40245f30UL
452
__ mulsd(xmm1, xmm0);
453
__ movq(xmm5, ExternalAddress(ONEHALF), rbx /*rscratch*/); //0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL
454
__ movq(xmm4, ExternalAddress(SIGN_MASK), rbx /*rscratch*/); //0x00000000UL, 0x80000000UL
457
__ addpd(xmm1, xmm5);
458
__ cvttsd2siq(rdx, xmm1);
459
__ cvtsi2sdq(xmm1, rdx);
460
__ movq(xmm3, ExternalAddress(P_1), rbx /*rscratch*/); //0x54400000UL, 0x3fb921fbUL
461
__ movdqu(xmm2, ExternalAddress(P_2), rbx /*rscratch*/); //0x1a600000UL, 0x3d90b461UL, 0x1a600000UL, 0x3d90b461UL
462
__ mulsd(xmm3, xmm1);
463
__ unpcklpd(xmm1, xmm1);
465
__ addl(rdx, 1865232);
466
__ movdqu(xmm4, xmm0);
469
__ movdqu(xmm5, ExternalAddress(SC_4), rbx /*rscratch*/); //0xa556c734UL, 0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL
470
__ lea(rax, ExternalAddress(Ctable));
473
__ mulpd(xmm2, xmm1);
474
__ subsd(xmm0, xmm3);
475
__ mulsd(xmm1, ExternalAddress(P_3), rbx /*rscratch*/); //0x2e037073UL, 0x3b63198aUL
476
__ subsd(xmm4, xmm3);
477
__ movq(xmm7, Address(rax, 8));
478
__ unpcklpd(xmm0, xmm0);
479
__ movdqu(xmm3, xmm4);
480
__ subsd(xmm4, xmm2);
481
__ mulpd(xmm5, xmm0);
482
__ subpd(xmm0, xmm2);
483
__ mulsd(xmm7, xmm4);
484
__ subsd(xmm3, xmm4);
485
__ mulpd(xmm5, xmm0);
486
__ mulpd(xmm0, xmm0);
487
__ subsd(xmm3, xmm2);
488
__ movdqu(xmm2, Address(rax, 0));
489
__ subsd(xmm1, xmm3);
490
__ movq(xmm3, Address(rax, 24));
491
__ addsd(xmm2, xmm3);
492
__ subsd(xmm7, xmm2);
493
__ subsd(xmm1, xmm6);
494
__ movdqu(xmm6, ExternalAddress(SC_2), rbx /*rscratch*/); //0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL
495
__ mulsd(xmm2, xmm4);
496
__ mulpd(xmm6, xmm0);
497
__ mulsd(xmm3, xmm4);
498
__ mulpd(xmm2, xmm0);
499
__ mulpd(xmm0, xmm0);
500
__ addpd(xmm5, ExternalAddress(SC_3), rbx /*rscratch*/); //0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL
501
__ mulsd(xmm4, Address(rax, 0));
502
__ addpd(xmm6, ExternalAddress(SC_1), rbx /*rscratch*/); //0x55555555UL, 0xbfc55555UL, 0x00000000UL, 0xbfe00000UL
503
__ mulpd(xmm5, xmm0);
504
__ movdqu(xmm0, xmm3);
505
__ addsd(xmm3, Address(rax, 8));
506
__ mulpd(xmm1, xmm7);
507
__ movdqu(xmm7, xmm4);
508
__ addsd(xmm4, xmm3);
509
__ addpd(xmm6, xmm5);
510
__ movq(xmm5, Address(rax, 8));
511
__ subsd(xmm5, xmm3);
512
__ subsd(xmm3, xmm4);
513
__ addsd(xmm1, Address(rax, 16));
514
__ mulpd(xmm6, xmm2);
515
__ addsd(xmm5, xmm0);
516
__ addsd(xmm3, xmm7);
517
__ addsd(xmm1, xmm5);
518
__ addsd(xmm1, xmm3);
519
__ addsd(xmm1, xmm6);
520
__ unpckhpd(xmm6, xmm6);
521
__ movdqu(xmm0, xmm4);
522
__ addsd(xmm1, xmm6);
523
__ addsd(xmm0, xmm1);
526
__ bind(L_2TAG_PACKET_7_0_1);
532
__ jcc(Assembler::notEqual, L_2TAG_PACKET_8_0_1);
537
__ jcc(Assembler::notEqual, L_2TAG_PACKET_8_0_1);
538
__ xorpd(xmm0, xmm0);
539
__ xorpd(xmm6, xmm6);
540
__ jmp(L_2TAG_PACKET_11_0_1);
542
__ bind(L_2TAG_PACKET_9_0_1);
543
__ jcc(Assembler::equal, L_2TAG_PACKET_10_0_1);
553
__ jmp(L_2TAG_PACKET_10_0_1);
554
__ bind(L_2TAG_PACKET_3_0_1);
560
__ testl(r9, INT_MIN);
561
__ jcc(Assembler::notEqual, L_2TAG_PACKET_12_0_1);
565
__ jmp(L_2TAG_PACKET_6_0_1);
567
__ bind(L_2TAG_PACKET_4_0_1);
569
__ movl(rbx, 536870912);
574
__ addl(rdi, 536870912);
584
__ jmp(L_2TAG_PACKET_5_0_1);
586
__ bind(L_2TAG_PACKET_12_0_1);
588
__ mov64(rbx, 0x100000000);
600
__ addl(rdi, 536870912);
601
__ jmp(L_2TAG_PACKET_6_0_1);
603
__ bind(L_2TAG_PACKET_2_0_1);
604
__ movsd(xmm0, Address(rsp, 8));
605
__ mulsd(xmm0, ExternalAddress(NEG_ZERO), rbx /*rscratch*/); //0x00000000UL, 0x80000000UL
606
__ movq(Address(rsp, 0), xmm0);
608
__ bind(L_2TAG_PACKET_13_0_1);
619
__ leave(); // required for proper stackwalking of RuntimeStub frame