jdk

Форк
0
/
stubGenerator_x86_64_cos.cpp 
625 строк · 17.9 Кб
1
/*
2
 *  Copyright (c) 2016, 2021, Intel Corporation. All rights reserved.
3
 * Intel Math Library (LIBM) Source Code
4
 *
5
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6
 *
7
 * This code is free software; you can redistribute it and/or modify it
8
 * under the terms of the GNU General Public License version 2 only, as
9
 * published by the Free Software Foundation.
10
 *
11
 * This code is distributed in the hope that it will be useful, but WITHOUT
12
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14
 * version 2 for more details (a copy is included in the LICENSE file that
15
 * accompanied this code).
16
 *
17
 * You should have received a copy of the GNU General Public License version
18
 * 2 along with this work; if not, write to the Free Software Foundation,
19
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20
 *
21
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22
 * or visit www.oracle.com if you need additional information or have any
23
 * questions.
24
 *
25
 */
26

27
#include "precompiled.hpp"
28
#include "macroAssembler_x86.hpp"
29
#include "stubGenerator_x86_64.hpp"
30

31
/******************************************************************************/
32
//                     ALGORITHM DESCRIPTION - COS()
33
//                     ---------------------
34
//
35
//     1. RANGE REDUCTION
36
//
37
//     We perform an initial range reduction from X to r with
38
//
39
//          X =~= N * pi/32 + r
40
//
41
//     so that |r| <= pi/64 + epsilon. We restrict inputs to those
42
//     where |N| <= 932560. Beyond this, the range reduction is
43
//     insufficiently accurate. For extremely small inputs,
44
//     denormalization can occur internally, impacting performance.
45
//     This means that the main path is actually only taken for
46
//     2^-252 <= |X| < 90112.
47
//
48
//     To avoid branches, we perform the range reduction to full
49
//     accuracy each time.
50
//
51
//          X - N * (P_1 + P_2 + P_3)
52
//
53
//     where P_1 and P_2 are 32-bit numbers (so multiplication by N
54
//     is exact) and P_3 is a 53-bit number. Together, these
55
//     approximate pi well enough for all cases in the restricted
56
//     range.
57
//
58
//     The main reduction sequence is:
59
//
60
//             y = 32/pi * x
61
//             N = integer(y)
62
//     (computed by adding and subtracting off SHIFTER)
63
//
64
//             m_1 = N * P_1
65
//             m_2 = N * P_2
66
//             r_1 = x - m_1
67
//             r = r_1 - m_2
68
//     (this r can be used for most of the calculation)
69
//
70
//             c_1 = r_1 - r
71
//             m_3 = N * P_3
72
//             c_2 = c_1 - m_2
73
//             c = c_2 - m_3
74
//
75
//     2. MAIN ALGORITHM
76
//
77
//     The algorithm uses a table lookup based on B = M * pi / 32
78
//     where M = N mod 64. The stored values are:
79
//       sigma             closest power of 2 to cos(B)
80
//       C_hl              53-bit cos(B) - sigma
81
//       S_hi + S_lo       2 * 53-bit sin(B)
82
//
83
//     The computation is organized as follows:
84
//
85
//          sin(B + r + c) = [sin(B) + sigma * r] +
86
//                           r * (cos(B) - sigma) +
87
//                           sin(B) * [cos(r + c) - 1] +
88
//                           cos(B) * [sin(r + c) - r]
89
//
90
//     which is approximately:
91
//
92
//          [S_hi + sigma * r] +
93
//          C_hl * r +
94
//          S_lo + S_hi * [(cos(r) - 1) - r * c] +
95
//          (C_hl + sigma) * [(sin(r) - r) + c]
96
//
97
//     and this is what is actually computed. We separate this sum
98
//     into four parts:
99
//
100
//          hi + med + pols + corr
101
//
102
//     where
103
//
104
//          hi       = S_hi + sigma r
105
//          med      = C_hl * r
106
//          pols     = S_hi * (cos(r) - 1) + (C_hl + sigma) * (sin(r) - r)
107
//          corr     = S_lo + c * ((C_hl + sigma) - S_hi * r)
108
//
109
//     3. POLYNOMIAL
110
//
111
//     The polynomial S_hi * (cos(r) - 1) + (C_hl + sigma) *
112
//     (sin(r) - r) can be rearranged freely, since it is quite
113
//     small, so we exploit parallelism to the fullest.
114
//
115
//          psc4       =   SC_4 * r_1
116
//          msc4       =   psc4 * r
117
//          r2         =   r * r
118
//          msc2       =   SC_2 * r2
119
//          r4         =   r2 * r2
120
//          psc3       =   SC_3 + msc4
121
//          psc1       =   SC_1 + msc2
122
//          msc3       =   r4 * psc3
123
//          sincospols =   psc1 + msc3
124
//          pols       =   sincospols *
125
//                         <S_hi * r^2 | (C_hl + sigma) * r^3>
126
//
127
//     4. CORRECTION TERM
128
//
129
//     This is where the "c" component of the range reduction is
130
//     taken into account; recall that just "r" is used for most of
131
//     the calculation.
132
//
133
//          -c   = m_3 - c_2
134
//          -d   = S_hi * r - (C_hl + sigma)
135
//          corr = -c * -d + S_lo
136
//
137
//     5. COMPENSATED SUMMATIONS
138
//
139
//     The two successive compensated summations add up the high
140
//     and medium parts, leaving just the low parts to add up at
141
//     the end.
142
//
143
//          rs        =  sigma * r
144
//          res_int   =  S_hi + rs
145
//          k_0       =  S_hi - res_int
146
//          k_2       =  k_0 + rs
147
//          med       =  C_hl * r
148
//          res_hi    =  res_int + med
149
//          k_1       =  res_int - res_hi
150
//          k_3       =  k_1 + med
151
//
152
//     6. FINAL SUMMATION
153
//
154
//     We now add up all the small parts:
155
//
156
//          res_lo = pols(hi) + pols(lo) + corr + k_1 + k_3
157
//
158
//     Now the overall result is just:
159
//
160
//          res_hi + res_lo
161
//
162
//     7. SMALL ARGUMENTS
163
//
164
//     Inputs with |X| < 2^-252 are treated specially as
165
//     1 - |x|.
166
//
167
// Special cases:
168
//  cos(NaN) = quiet NaN, and raise invalid exception
169
//  cos(INF) = NaN and raise invalid exception
170
//  cos(0) = 1
171
//
172
/******************************************************************************/
173

174
#define __ _masm->
175

176
address StubGenerator::generate_libmCos() {
177
  StubCodeMark mark(this, "StubRoutines", "libmCos");
178
  address start = __ pc();
179

180
  Label L_2TAG_PACKET_0_0_1, L_2TAG_PACKET_1_0_1, L_2TAG_PACKET_2_0_1, L_2TAG_PACKET_3_0_1;
181
  Label L_2TAG_PACKET_4_0_1, L_2TAG_PACKET_5_0_1, L_2TAG_PACKET_6_0_1, L_2TAG_PACKET_7_0_1;
182
  Label L_2TAG_PACKET_8_0_1, L_2TAG_PACKET_9_0_1, L_2TAG_PACKET_10_0_1, L_2TAG_PACKET_11_0_1;
183
  Label L_2TAG_PACKET_12_0_1, L_2TAG_PACKET_13_0_1, B1_2, B1_4;
184

185
  __ enter(); // required for proper stackwalking of RuntimeStub frame
186

187
#ifdef _WIN64
188
  __ push(rsi);
189
  __ push(rdi);
190
#endif
191

192
  __ push(rbx);
193
  __ subq(rsp, 16);
194
  __ movsd(Address(rsp, 8), xmm0);
195

196
  __ bind(B1_2);
197
  __ movl(rax, Address(rsp, 12));
198
  __ movq(xmm1, ExternalAddress(PI32INV), rbx /*rscratch*/);    //0x6dc9c883UL, 0x40245f30UL
199
  __ andl(rax, 2147418112);
200
  __ subl(rax, 808452096);
201
  __ cmpl(rax, 281346048);
202
  __ jcc(Assembler::above, L_2TAG_PACKET_0_0_1);
203
  __ mulsd(xmm1, xmm0);
204
  __ movdqu(xmm5, ExternalAddress(ONEHALF), rbx /*rscratch*/);    //0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL
205
  __ movq(xmm4, ExternalAddress(SIGN_MASK), rbx /*rscratch*/);    //0x00000000UL, 0x80000000UL
206
  __ pand(xmm4, xmm0);
207
  __ por(xmm5, xmm4);
208
  __ addpd(xmm1, xmm5);
209
  __ cvttsd2sil(rdx, xmm1);
210
  __ cvtsi2sdl(xmm1, rdx);
211
  __ movdqu(xmm2, ExternalAddress(P_2), rbx /*rscratch*/);    //0x1a600000UL, 0x3d90b461UL, 0x1a600000UL, 0x3d90b461UL
212
  __ movq(xmm3, ExternalAddress(P_1), rbx /*rscratch*/);    //0x54400000UL, 0x3fb921fbUL
213
  __ mulsd(xmm3, xmm1);
214
  __ unpcklpd(xmm1, xmm1);
215
  __ addq(rdx, 1865232);
216
  __ movdqu(xmm4, xmm0);
217
  __ andq(rdx, 63);
218
  __ movdqu(xmm5, ExternalAddress(SC_4), rbx /*rscratch*/);    //0xa556c734UL, 0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL
219
  __ lea(rax, ExternalAddress(Ctable));
220
  __ shlq(rdx, 5);
221
  __ addq(rax, rdx);
222
  __ mulpd(xmm2, xmm1);
223
  __ subsd(xmm0, xmm3);
224
  __ mulsd(xmm1, ExternalAddress(P_3), rbx /*rscratch*/);    //0x2e037073UL, 0x3b63198aUL
225
  __ subsd(xmm4, xmm3);
226
  __ movq(xmm7, Address(rax, 8));
227
  __ unpcklpd(xmm0, xmm0);
228
  __ movdqu(xmm3, xmm4);
229
  __ subsd(xmm4, xmm2);
230
  __ mulpd(xmm5, xmm0);
231
  __ subpd(xmm0, xmm2);
232
  __ movdqu(xmm6, ExternalAddress(SC_2), rbx /*rscratch*/);    //0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL
233
  __ mulsd(xmm7, xmm4);
234
  __ subsd(xmm3, xmm4);
235
  __ mulpd(xmm5, xmm0);
236
  __ mulpd(xmm0, xmm0);
237
  __ subsd(xmm3, xmm2);
238
  __ movdqu(xmm2, Address(rax, 0));
239
  __ subsd(xmm1, xmm3);
240
  __ movq(xmm3, Address(rax, 24));
241
  __ addsd(xmm2, xmm3);
242
  __ subsd(xmm7, xmm2);
243
  __ mulsd(xmm2, xmm4);
244
  __ mulpd(xmm6, xmm0);
245
  __ mulsd(xmm3, xmm4);
246
  __ mulpd(xmm2, xmm0);
247
  __ mulpd(xmm0, xmm0);
248
  __ addpd(xmm5, ExternalAddress(SC_3), rbx /*rscratch*/);    //0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL
249
  __ mulsd(xmm4, Address(rax, 0));
250
  __ addpd(xmm6, ExternalAddress(SC_1), rbx /*rscratch*/);    //0x55555555UL, 0xbfc55555UL, 0x00000000UL, 0xbfe00000UL
251
  __ mulpd(xmm5, xmm0);
252
  __ movdqu(xmm0, xmm3);
253
  __ addsd(xmm3, Address(rax, 8));
254
  __ mulpd(xmm1, xmm7);
255
  __ movdqu(xmm7, xmm4);
256
  __ addsd(xmm4, xmm3);
257
  __ addpd(xmm6, xmm5);
258
  __ movq(xmm5, Address(rax, 8));
259
  __ subsd(xmm5, xmm3);
260
  __ subsd(xmm3, xmm4);
261
  __ addsd(xmm1, Address(rax, 16));
262
  __ mulpd(xmm6, xmm2);
263
  __ addsd(xmm0, xmm5);
264
  __ addsd(xmm3, xmm7);
265
  __ addsd(xmm0, xmm1);
266
  __ addsd(xmm0, xmm3);
267
  __ addsd(xmm0, xmm6);
268
  __ unpckhpd(xmm6, xmm6);
269
  __ addsd(xmm0, xmm6);
270
  __ addsd(xmm0, xmm4);
271
  __ jmp(B1_4);
272

273
  __ bind(L_2TAG_PACKET_0_0_1);
274
  __ jcc(Assembler::greater, L_2TAG_PACKET_1_0_1);
275
  __ pextrw(rax, xmm0, 3);
276
  __ andl(rax, 32767);
277
  __ pinsrw(xmm0, rax, 3);
278
  __ movq(xmm1, ExternalAddress(ONE), rbx /*rscratch*/); // 0x00000000UL, 0x3ff00000UL
279
  __ subsd(xmm1, xmm0);
280
  __ movdqu(xmm0, xmm1);
281
  __ jmp(B1_4);
282

283
  __ bind(L_2TAG_PACKET_1_0_1);
284
  __ pextrw(rax, xmm0, 3);
285
  __ andl(rax, 32752);
286
  __ cmpl(rax, 32752);
287
  __ jcc(Assembler::equal, L_2TAG_PACKET_2_0_1);
288
  __ pextrw(rcx, xmm0, 3);
289
  __ andl(rcx, 32752);
290
  __ subl(rcx, 16224);
291
  __ shrl(rcx, 7);
292
  __ andl(rcx, 65532);
293
  __ lea(r11, ExternalAddress(PI_INV_TABLE));
294
  __ addq(rcx, r11);
295
  __ movdq(rax, xmm0);
296
  __ movl(r10, Address(rcx, 20));
297
  __ movl(r8, Address(rcx, 24));
298
  __ movl(rdx, rax);
299
  __ shrq(rax, 21);
300
  __ orl(rax, INT_MIN);
301
  __ shrl(rax, 11);
302
  __ movl(r9, r10);
303
  __ imulq(r10, rdx);
304
  __ imulq(r9, rax);
305
  __ imulq(r8, rax);
306
  __ movl(rsi, Address(rcx, 16));
307
  __ movl(rdi, Address(rcx, 12));
308
  __ movl(r11, r10);
309
  __ shrq(r10, 32);
310
  __ addq(r9, r10);
311
  __ addq(r11, r8);
312
  __ movl(r8, r11);
313
  __ shrq(r11, 32);
314
  __ addq(r9, r11);
315
  __ movl(r10, rsi);
316
  __ imulq(rsi, rdx);
317
  __ imulq(r10, rax);
318
  __ movl(r11, rdi);
319
  __ imulq(rdi, rdx);
320
  __ movl(rbx, rsi);
321
  __ shrq(rsi, 32);
322
  __ addq(r9, rbx);
323
  __ movl(rbx, r9);
324
  __ shrq(r9, 32);
325
  __ addq(r10, rsi);
326
  __ addq(r10, r9);
327
  __ shlq(rbx, 32);
328
  __ orq(r8, rbx);
329
  __ imulq(r11, rax);
330
  __ movl(r9, Address(rcx, 8));
331
  __ movl(rsi, Address(rcx, 4));
332
  __ movl(rbx, rdi);
333
  __ shrq(rdi, 32);
334
  __ addq(r10, rbx);
335
  __ movl(rbx, r10);
336
  __ shrq(r10, 32);
337
  __ addq(r11, rdi);
338
  __ addq(r11, r10);
339
  __ movq(rdi, r9);
340
  __ imulq(r9, rdx);
341
  __ imulq(rdi, rax);
342
  __ movl(r10, r9);
343
  __ shrq(r9, 32);
344
  __ addq(r11, r10);
345
  __ movl(r10, r11);
346
  __ shrq(r11, 32);
347
  __ addq(rdi, r9);
348
  __ addq(rdi, r11);
349
  __ movq(r9, rsi);
350
  __ imulq(rsi, rdx);
351
  __ imulq(r9, rax);
352
  __ shlq(r10, 32);
353
  __ orq(r10, rbx);
354
  __ movl(rax, Address(rcx, 0));
355
  __ movl(r11, rsi);
356
  __ shrq(rsi, 32);
357
  __ addq(rdi, r11);
358
  __ movl(r11, rdi);
359
  __ shrq(rdi, 32);
360
  __ addq(r9, rsi);
361
  __ addq(r9, rdi);
362
  __ imulq(rdx, rax);
363
  __ pextrw(rbx, xmm0, 3);
364
  __ lea(rdi, ExternalAddress(PI_INV_TABLE));
365
  __ subq(rcx, rdi);
366
  __ addl(rcx, rcx);
367
  __ addl(rcx, rcx);
368
  __ addl(rcx, rcx);
369
  __ addl(rcx, 19);
370
  __ movl(rsi, 32768);
371
  __ andl(rsi, rbx);
372
  __ shrl(rbx, 4);
373
  __ andl(rbx, 2047);
374
  __ subl(rbx, 1023);
375
  __ subl(rcx, rbx);
376
  __ addq(r9, rdx);
377
  __ movl(rdx, rcx);
378
  __ addl(rdx, 32);
379
  __ cmpl(rcx, 1);
380
  __ jcc(Assembler::less, L_2TAG_PACKET_3_0_1);
381
  __ negl(rcx);
382
  __ addl(rcx, 29);
383
  __ shll(r9);
384
  __ movl(rdi, r9);
385
  __ andl(r9, 536870911);
386
  __ testl(r9, 268435456);
387
  __ jcc(Assembler::notEqual, L_2TAG_PACKET_4_0_1);
388
  __ shrl(r9);
389
  __ movl(rbx, 0);
390
  __ shlq(r9, 32);
391
  __ orq(r9, r11);
392

393
  __ bind(L_2TAG_PACKET_5_0_1);
394

395
  __ bind(L_2TAG_PACKET_6_0_1);
396
  __ cmpq(r9, 0);
397
  __ jcc(Assembler::equal, L_2TAG_PACKET_7_0_1);
398

399
  __ bind(L_2TAG_PACKET_8_0_1);
400
  __ bsrq(r11, r9);
401
  __ movl(rcx, 29);
402
  __ subl(rcx, r11);
403
  __ jcc(Assembler::lessEqual, L_2TAG_PACKET_9_0_1);
404
  __ shlq(r9);
405
  __ movq(rax, r10);
406
  __ shlq(r10);
407
  __ addl(rdx, rcx);
408
  __ negl(rcx);
409
  __ addl(rcx, 64);
410
  __ shrq(rax);
411
  __ shrq(r8);
412
  __ orq(r9, rax);
413
  __ orq(r10, r8);
414

415
  __ bind(L_2TAG_PACKET_10_0_1);
416
  __ cvtsi2sdq(xmm0, r9);
417
  __ shrq(r10, 1);
418
  __ cvtsi2sdq(xmm3, r10);
419
  __ xorpd(xmm4, xmm4);
420
  __ shll(rdx, 4);
421
  __ negl(rdx);
422
  __ addl(rdx, 16368);
423
  __ orl(rdx, rsi);
424
  __ xorl(rdx, rbx);
425
  __ pinsrw(xmm4, rdx, 3);
426
  __ movq(xmm2, ExternalAddress(PI_4),     rbx /*rscratch*/); //0x40000000UL, 0x3fe921fbUL, 0x18469899UL, 0x3e64442dUL
427
  __ movq(xmm6, ExternalAddress(PI_4 + 8), rbx /*rscratch*/); //0x3fe921fbUL, 0x18469899UL, 0x3e64442dUL
428
  __ xorpd(xmm5, xmm5);
429
  __ subl(rdx, 1008);
430
  __ pinsrw(xmm5, rdx, 3);
431
  __ mulsd(xmm0, xmm4);
432
  __ shll(rsi, 16);
433
  __ sarl(rsi, 31);
434
  __ mulsd(xmm3, xmm5);
435
  __ movdqu(xmm1, xmm0);
436
  __ mulsd(xmm0, xmm2);
437
  __ shrl(rdi, 29);
438
  __ addsd(xmm1, xmm3);
439
  __ mulsd(xmm3, xmm2);
440
  __ addl(rdi, rsi);
441
  __ xorl(rdi, rsi);
442
  __ mulsd(xmm6, xmm1);
443
  __ movl(rax, rdi);
444
  __ addsd(xmm6, xmm3);
445
  __ movdqu(xmm2, xmm0);
446
  __ addsd(xmm0, xmm6);
447
  __ subsd(xmm2, xmm0);
448
  __ addsd(xmm6, xmm2);
449

450
  __ bind(L_2TAG_PACKET_11_0_1);
451
  __ movq(xmm1, ExternalAddress(PI32INV), rbx /*rscratch*/);   //0x6dc9c883UL, 0x40245f30UL
452
  __ mulsd(xmm1, xmm0);
453
  __ movq(xmm5, ExternalAddress(ONEHALF), rbx /*rscratch*/);   //0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL
454
  __ movq(xmm4, ExternalAddress(SIGN_MASK), rbx /*rscratch*/); //0x00000000UL, 0x80000000UL
455
  __ pand(xmm4, xmm0);
456
  __ por(xmm5, xmm4);
457
  __ addpd(xmm1, xmm5);
458
  __ cvttsd2siq(rdx, xmm1);
459
  __ cvtsi2sdq(xmm1, rdx);
460
  __ movq(xmm3, ExternalAddress(P_1), rbx /*rscratch*/);    //0x54400000UL, 0x3fb921fbUL
461
  __ movdqu(xmm2, ExternalAddress(P_2), rbx /*rscratch*/);    //0x1a600000UL, 0x3d90b461UL, 0x1a600000UL, 0x3d90b461UL
462
  __ mulsd(xmm3, xmm1);
463
  __ unpcklpd(xmm1, xmm1);
464
  __ shll(rax, 3);
465
  __ addl(rdx, 1865232);
466
  __ movdqu(xmm4, xmm0);
467
  __ addl(rdx, rax);
468
  __ andl(rdx, 63);
469
  __ movdqu(xmm5, ExternalAddress(SC_4), rbx /*rscratch*/);    //0xa556c734UL, 0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL
470
  __ lea(rax, ExternalAddress(Ctable));
471
  __ shll(rdx, 5);
472
  __ addq(rax, rdx);
473
  __ mulpd(xmm2, xmm1);
474
  __ subsd(xmm0, xmm3);
475
  __ mulsd(xmm1, ExternalAddress(P_3), rbx /*rscratch*/);    //0x2e037073UL, 0x3b63198aUL
476
  __ subsd(xmm4, xmm3);
477
  __ movq(xmm7, Address(rax, 8));
478
  __ unpcklpd(xmm0, xmm0);
479
  __ movdqu(xmm3, xmm4);
480
  __ subsd(xmm4, xmm2);
481
  __ mulpd(xmm5, xmm0);
482
  __ subpd(xmm0, xmm2);
483
  __ mulsd(xmm7, xmm4);
484
  __ subsd(xmm3, xmm4);
485
  __ mulpd(xmm5, xmm0);
486
  __ mulpd(xmm0, xmm0);
487
  __ subsd(xmm3, xmm2);
488
  __ movdqu(xmm2, Address(rax, 0));
489
  __ subsd(xmm1, xmm3);
490
  __ movq(xmm3, Address(rax, 24));
491
  __ addsd(xmm2, xmm3);
492
  __ subsd(xmm7, xmm2);
493
  __ subsd(xmm1, xmm6);
494
  __ movdqu(xmm6, ExternalAddress(SC_2), rbx /*rscratch*/);    //0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL
495
  __ mulsd(xmm2, xmm4);
496
  __ mulpd(xmm6, xmm0);
497
  __ mulsd(xmm3, xmm4);
498
  __ mulpd(xmm2, xmm0);
499
  __ mulpd(xmm0, xmm0);
500
  __ addpd(xmm5, ExternalAddress(SC_3), rbx /*rscratch*/);    //0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL
501
  __ mulsd(xmm4, Address(rax, 0));
502
  __ addpd(xmm6, ExternalAddress(SC_1), rbx /*rscratch*/);    //0x55555555UL, 0xbfc55555UL, 0x00000000UL, 0xbfe00000UL
503
  __ mulpd(xmm5, xmm0);
504
  __ movdqu(xmm0, xmm3);
505
  __ addsd(xmm3, Address(rax, 8));
506
  __ mulpd(xmm1, xmm7);
507
  __ movdqu(xmm7, xmm4);
508
  __ addsd(xmm4, xmm3);
509
  __ addpd(xmm6, xmm5);
510
  __ movq(xmm5, Address(rax, 8));
511
  __ subsd(xmm5, xmm3);
512
  __ subsd(xmm3, xmm4);
513
  __ addsd(xmm1, Address(rax, 16));
514
  __ mulpd(xmm6, xmm2);
515
  __ addsd(xmm5, xmm0);
516
  __ addsd(xmm3, xmm7);
517
  __ addsd(xmm1, xmm5);
518
  __ addsd(xmm1, xmm3);
519
  __ addsd(xmm1, xmm6);
520
  __ unpckhpd(xmm6, xmm6);
521
  __ movdqu(xmm0, xmm4);
522
  __ addsd(xmm1, xmm6);
523
  __ addsd(xmm0, xmm1);
524
  __ jmp(B1_4);
525

526
  __ bind(L_2TAG_PACKET_7_0_1);
527
  __ addl(rdx, 64);
528
  __ movq(r9, r10);
529
  __ movq(r10, r8);
530
  __ movl(r8, 0);
531
  __ cmpq(r9, 0);
532
  __ jcc(Assembler::notEqual, L_2TAG_PACKET_8_0_1);
533
  __ addl(rdx, 64);
534
  __ movq(r9, r10);
535
  __ movq(r10, r8);
536
  __ cmpq(r9, 0);
537
  __ jcc(Assembler::notEqual, L_2TAG_PACKET_8_0_1);
538
  __ xorpd(xmm0, xmm0);
539
  __ xorpd(xmm6, xmm6);
540
  __ jmp(L_2TAG_PACKET_11_0_1);
541

542
  __ bind(L_2TAG_PACKET_9_0_1);
543
  __ jcc(Assembler::equal, L_2TAG_PACKET_10_0_1);
544
  __ negl(rcx);
545
  __ shrq(r10);
546
  __ movq(rax, r9);
547
  __ shrq(r9);
548
  __ subl(rdx, rcx);
549
  __ negl(rcx);
550
  __ addl(rcx, 64);
551
  __ shlq(rax);
552
  __ orq(r10, rax);
553
  __ jmp(L_2TAG_PACKET_10_0_1);
554
  __ bind(L_2TAG_PACKET_3_0_1);
555
  __ negl(rcx);
556
  __ shlq(r9, 32);
557
  __ orq(r9, r11);
558
  __ shlq(r9);
559
  __ movq(rdi, r9);
560
  __ testl(r9, INT_MIN);
561
  __ jcc(Assembler::notEqual, L_2TAG_PACKET_12_0_1);
562
  __ shrl(r9);
563
  __ movl(rbx, 0);
564
  __ shrq(rdi, 3);
565
  __ jmp(L_2TAG_PACKET_6_0_1);
566

567
  __ bind(L_2TAG_PACKET_4_0_1);
568
  __ shrl(r9);
569
  __ movl(rbx, 536870912);
570
  __ shrl(rbx);
571
  __ shlq(r9, 32);
572
  __ orq(r9, r11);
573
  __ shlq(rbx, 32);
574
  __ addl(rdi, 536870912);
575
  __ movl(rcx, 0);
576
  __ movl(r11, 0);
577
  __ subq(rcx, r8);
578
  __ sbbq(r11, r10);
579
  __ sbbq(rbx, r9);
580
  __ movq(r8, rcx);
581
  __ movq(r10, r11);
582
  __ movq(r9, rbx);
583
  __ movl(rbx, 32768);
584
  __ jmp(L_2TAG_PACKET_5_0_1);
585

586
  __ bind(L_2TAG_PACKET_12_0_1);
587
  __ shrl(r9);
588
  __ mov64(rbx, 0x100000000);
589
  __ shrq(rbx);
590
  __ movl(rcx, 0);
591
  __ movl(r11, 0);
592
  __ subq(rcx, r8);
593
  __ sbbq(r11, r10);
594
  __ sbbq(rbx, r9);
595
  __ movq(r8, rcx);
596
  __ movq(r10, r11);
597
  __ movq(r9, rbx);
598
  __ movl(rbx, 32768);
599
  __ shrq(rdi, 3);
600
  __ addl(rdi, 536870912);
601
  __ jmp(L_2TAG_PACKET_6_0_1);
602

603
  __ bind(L_2TAG_PACKET_2_0_1);
604
  __ movsd(xmm0, Address(rsp, 8));
605
  __ mulsd(xmm0, ExternalAddress(NEG_ZERO), rbx /*rscratch*/);    //0x00000000UL, 0x80000000UL
606
  __ movq(Address(rsp, 0), xmm0);
607

608
  __ bind(L_2TAG_PACKET_13_0_1);
609

610
  __ bind(B1_4);
611
  __ addq(rsp, 16);
612
  __ pop(rbx);
613

614
#ifdef _WIN64
615
  __ pop(rdi);
616
  __ pop(rsi);
617
#endif
618

619
  __ leave(); // required for proper stackwalking of RuntimeStub frame
620
  __ ret(0);
621

622
  return start;
623
}
624

625
#undef __
626

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.