2
* Copyright (c) 1999, 2024, Oracle and/or its affiliates. All rights reserved.
3
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5
* This code is free software; you can redistribute it and/or modify it
6
* under the terms of the GNU General Public License version 2 only, as
7
* published by the Free Software Foundation.
9
* This code is distributed in the hope that it will be useful, but WITHOUT
10
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12
* version 2 for more details (a copy is included in the LICENSE file that
13
* accompanied this code).
15
* You should have received a copy of the GNU General Public License version
16
* 2 along with this work; if not, write to the Free Software Foundation,
17
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20
* or visit www.oracle.com if you need additional information or have any
25
#include "precompiled.hpp"
26
#include "asm/macroAssembler.hpp"
27
#include "asm/macroAssembler.inline.hpp"
28
#include "compiler/oopMap.hpp"
29
#include "gc/shared/barrierSet.hpp"
30
#include "gc/shared/barrierSetAssembler.hpp"
31
#include "gc/shared/barrierSetNMethod.hpp"
32
#include "interpreter/interpreter.hpp"
33
#include "memory/universe.hpp"
34
#include "nativeInst_x86.hpp"
35
#include "oops/instanceOop.hpp"
36
#include "oops/method.hpp"
37
#include "oops/objArrayKlass.hpp"
38
#include "oops/oop.inline.hpp"
39
#include "prims/methodHandles.hpp"
40
#include "runtime/frame.inline.hpp"
41
#include "runtime/handles.inline.hpp"
42
#include "runtime/javaThread.hpp"
43
#include "runtime/sharedRuntime.hpp"
44
#include "runtime/stubCodeGenerator.hpp"
45
#include "runtime/stubRoutines.hpp"
47
#include "opto/runtime.hpp"
50
// Declaration and definition of StubGenerator (no .hpp file).
51
// For a more detailed description of the stub routine structure
52
// see the comment in stubRoutines.hpp
55
// Shorthand that expands to `((Assembler*)_masm)->`, i.e. a member call on
// _masm after casting it to Assembler*.
#define a__ ((Assembler*)_masm)->
58
// BLOCK_COMMENT is defined twice below: once expanding to nothing and once
// forwarding the string to block_comment().
// NOTE(review): presumably a preprocessor conditional that is not visible here
// selects exactly one of the two definitions -- confirm.
#define BLOCK_COMMENT(str) /* nothing */
60
#define BLOCK_COMMENT(str) __ block_comment(str)
63
// Expands to bind(label) followed by a BLOCK_COMMENT whose text is the label's
// own name with ":" appended (via stringification of `label`).
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
65
// ANDed with the stored MXCSR value before it is compared against the standard
// value (see generate_call_stub and generate_verify_mxcsr).
const int MXCSR_MASK = 0xFFC0; // Mask out any pending exceptions
66
// ANDed with the stored FPU control word before it is compared against the
// standard value (see generate_verify_fpu_cntrl_wrd); 0xFFFF keeps the low 16 bits.
const int FPU_CNTRL_WRD_MASK = 0xFFFF;
68
// The four tables below are each declared ATTRIBUTE_ALIGNED(16) and list four
// 32-bit words. COUNTER_SHUFFLE_MASK and GHASH_BYTE_SWAP_MASK contain the same
// four words; KEY_SHUFFLE_MASK lists those words in the opposite order.
// NOTE(review): the closing `};` of each initializer is not visible here.
ATTRIBUTE_ALIGNED(16) static const uint32_t KEY_SHUFFLE_MASK[] = {
69
0x00010203UL, 0x04050607UL, 0x08090A0BUL, 0x0C0D0E0FUL,
72
ATTRIBUTE_ALIGNED(16) static const uint32_t COUNTER_SHUFFLE_MASK[] = {
73
0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL,
76
ATTRIBUTE_ALIGNED(16) static const uint32_t GHASH_BYTE_SWAP_MASK[] = {
77
0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL,
80
ATTRIBUTE_ALIGNED(16) static const uint32_t GHASH_LONG_SWAP_MASK[] = {
81
0x0B0A0908UL, 0x0F0E0D0CUL, 0x03020100UL, 0x07060504UL,
84
// -------------------------------------------------------------------------------------------------------------------------
85
// Stub Code definitions
87
class StubGenerator: public StubCodeGenerator {
91
// inc_counter_np(counter) is defined twice below: first as the no-op
// `((void)0)`, then as a BLOCK_COMMENT naming the counter followed by a call
// to inc_counter_np_().
// NOTE(review): presumably a preprocessor conditional that is not visible here
// selects one of the two definitions -- confirm.
#define inc_counter_np(counter) ((void)0)
93
// Emits an incrementl on the memory location of `counter`, addressed through
// an ExternalAddress built from the counter's address.
void inc_counter_np_(uint& counter) {
94
__ incrementl(ExternalAddress((address)&counter));
96
#define inc_counter_np(counter) \
97
BLOCK_COMMENT("inc_counter " #counter); \
98
inc_counter_np_(counter);
101
// Bumps (via inc_counter_np) the SharedRuntime array-copy counter matching the
// element type `t`: T_BYTE, T_SHORT, T_INT, T_LONG or T_OBJECT. Any other type
// reaches ShouldNotReachHere().
// NOTE(review): the `switch` header these case labels belong to is not visible here.
void inc_copy_counter_np(BasicType t) {
104
case T_BYTE: inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); return;
105
case T_SHORT: inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); return;
106
case T_INT: inc_counter_np(SharedRuntime::_jint_array_copy_ctr); return;
107
case T_LONG: inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); return;
108
case T_OBJECT: inc_counter_np(SharedRuntime::_oop_array_copy_ctr); return;
109
default: ShouldNotReachHere();
114
//------------------------------------------------------------------------------------------------------------------------
115
// Call stubs are used to call Java from C
117
// [ return_from_Java ] <--- rsp
118
// [ argument word n ]
120
// -N [ argument word 1 ]
121
// -7 [ Possible padding for stack alignment ]
122
// -6 [ Possible padding for stack alignment ]
123
// -5 [ Possible padding for stack alignment ]
124
// -4 [ mxcsr save ] <--- rsp_after_call
128
// 0 [ saved rbp ] <--- rbp
129
// 1 [ return address ]
130
// 2 [ ptr. to call wrapper ]
136
// 8 [ parameter_size ]
140
// Emits the "call_stub" stub used to call Java from C.
//
// return_address (out): set to the pc recorded at the point labelled
//                       "call_stub_return_address" below.
//
// Frame slots used by the stub, in words relative to rbp (taken from the
// Address declarations below):
//   -4 mxcsr_save (same slot as rsp_after_call)
//   -3 saved_rbx, -2 saved_rsi, -1 saved_rdi
//    3 result, 4 result_type, 5 method, 6 entry_point,
//    7 parameters, 8 parameter_size, 9 thread
//
// NOTE(review): the bare integer lines look like line-number residue, and
// several statements appear to be missing between them (labels such as
// skip_ldmx, L, loop and L_skip are used without a visible declaration or
// bind, and no return statement is visible) -- confirm against the upstream file.
address generate_call_stub(address& return_address) {
141
StubCodeMark mark(this, "StubRoutines", "call_stub");
142
address start = __ pc();
144
// stub code parameters / addresses
145
assert(frame::entry_frame_call_wrapper_offset == 2, "adjust this code");
146
bool sse_save = false;
147
const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_catch_exception()!
148
const int locals_count_in_bytes (4*wordSize);
149
const Address mxcsr_save (rbp, -4 * wordSize);
150
const Address saved_rbx (rbp, -3 * wordSize);
151
const Address saved_rsi (rbp, -2 * wordSize);
152
const Address saved_rdi (rbp, -1 * wordSize);
153
const Address result (rbp, 3 * wordSize);
154
const Address result_type (rbp, 4 * wordSize);
155
const Address method (rbp, 5 * wordSize);
156
const Address entry_point (rbp, 6 * wordSize);
157
const Address parameters (rbp, 7 * wordSize);
158
const Address parameter_size(rbp, 8 * wordSize);
159
const Address thread (rbp, 9 * wordSize); // same as in generate_catch_exception()!
// sse_save is true whenever UseSSE is non-zero
160
sse_save = UseSSE > 0;
164
// rcx = parameter_size scaled to bytes + the 4 words reserved for mxcsr/rbx/rsi/rdi
__ movptr(rcx, parameter_size); // parameter counter
165
__ shlptr(rcx, Interpreter::logStackElementSize); // convert parameter count to bytes
166
__ addptr(rcx, locals_count_in_bytes); // reserve space for register saves
168
__ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
170
// save rdi, rsi, & rbx, according to C calling conventions
171
__ movptr(saved_rdi, rdi);
172
__ movptr(saved_rsi, rsi);
173
__ movptr(saved_rbx, rbx);
175
// save and initialize %mxcsr
// The current MXCSR is stored into its frame slot, masked with MXCSR_MASK and
// compared with the standard value; the standard value is loaded only when
// they differ (the equal case branches to skip_ldmx).
178
__ stmxcsr(mxcsr_save);
179
__ movl(rax, mxcsr_save);
180
__ andl(rax, MXCSR_MASK); // Only check control and mask bits
181
ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
182
__ cmp32(rax, mxcsr_std);
183
__ jcc(Assembler::equal, skip_ldmx);
184
__ ldmxcsr(mxcsr_std);
188
// make sure the control word is correct.
189
__ fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_std()));
192
// make sure we have no pending exceptions
// Stops with a message unless the thread's pending_exception field is NULL_WORD.
194
__ movptr(rcx, thread);
195
__ cmpptr(Address(rcx, Thread::pending_exception_offset()), NULL_WORD);
196
__ jcc(Assembler::equal, L);
197
__ stop("StubRoutines::call_stub: entered with pending exception");
202
// pass parameters if any
203
BLOCK_COMMENT("pass parameters if any");
204
Label parameters_done;
205
__ movl(rcx, parameter_size); // parameter counter
// NOTE(review): the instruction that sets the flags tested by the next branch
// is not visible here.
207
__ jcc(Assembler::zero, parameters_done);
209
// parameter passing loop
212
// Copy Java parameters in reverse order (receiver last)
213
// Note that the argument order is inverted in the process
214
// source is rdx[rcx: N-1..0]
215
// dest is rsp[rbx: 0..N-1]
217
__ movptr(rdx, parameters); // parameter pointer
// Load the parameter at rdx[rcx - 1] and store it at expression-stack slot rbx.
223
__ movptr(rax, Address(rdx, rcx, Interpreter::stackElementScale(), -wordSize));
224
__ movptr(Address(rsp, rbx, Interpreter::stackElementScale(),
225
Interpreter::expr_offset_in_bytes(0)), rax); // store parameter
228
__ jcc(Assembler::notZero, loop);
230
// call Java function
231
__ BIND(parameters_done);
232
__ movptr(rbx, method); // get Method*
233
__ movptr(rax, entry_point); // get entry_point
234
__ mov(rsi, rsp); // set sender sp
235
BLOCK_COMMENT("call Java function");
// NOTE(review): the call instruction itself is not visible here.
238
BLOCK_COMMENT("call_stub_return_address:");
// Publish the current pc to the caller through the out-parameter.
239
return_address = __ pc();
245
__ verify_FPU(0, "call_stub_return");
// NOTE(review): the body of this loop and the conditions selecting between the
// verify_FPU call above and the code below are not visible here.
247
for (int i = 1; i < 8; i++) {
251
// UseSSE <= 1 so double result should be left on TOS
252
__ movl(rsi, result_type);
253
__ cmpl(rsi, T_DOUBLE);
254
__ jcc(Assembler::equal, L_skip);
256
// UseSSE == 0 so float result should be left on TOS
257
__ cmpl(rsi, T_FLOAT);
258
__ jcc(Assembler::equal, L_skip);
266
// store result depending on type
267
// (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
// rdi = result pointer from the frame, rsi = result type used for the dispatch below.
268
__ movptr(rdi, result);
269
Label is_long, is_float, is_double, exit;
270
__ movl(rsi, result_type);
271
__ cmpl(rsi, T_LONG);
272
__ jcc(Assembler::equal, is_long);
273
__ cmpl(rsi, T_FLOAT);
274
__ jcc(Assembler::equal, is_float);
275
__ cmpl(rsi, T_DOUBLE);
276
__ jcc(Assembler::equal, is_double);
// default (T_INT) case: store rax as a 32-bit value at the result pointer
279
__ movl(Address(rdi, 0), rax);
282
// check that FPU stack is empty
283
__ verify_FPU(0, "generate_call_stub");
// Reset rsp to the rsp_after_call slot (rbp - 4 words).
286
__ lea(rsp, rsp_after_call);
// Reload MXCSR from the value stored on entry.
290
__ ldmxcsr(mxcsr_save);
293
// restore rdi, rsi and rbx,
294
__ movptr(rbx, saved_rbx);
295
__ movptr(rsi, saved_rsi);
296
__ movptr(rdi, saved_rdi);
// Release the 4 save slots (mxcsr, rbx, rsi, rdi).
297
__ addptr(rsp, 4*wordSize);
303
// handle return types different from T_INT
// long result: rax is stored at word 0 and rdx at word 1 of the result
305
__ movl(Address(rdi, 0 * wordSize), rax);
306
__ movl(Address(rdi, 1 * wordSize), rdx);
// float result: either stored from xmm0 or popped from the FPU stack.
// NOTE(review): the condition choosing between the two stores is not visible
// here (presumably based on UseSSE) -- confirm. Same for the double case below.
310
// interpreter uses xmm0 for return values
312
__ movflt(Address(rdi, 0), xmm0);
314
__ fstp_s(Address(rdi, 0));
319
// interpreter uses xmm0 for return values
321
__ movdbl(Address(rdi, 0), xmm0);
323
__ fstp_d(Address(rdi, 0));
331
//------------------------------------------------------------------------------------------------------------------------
332
// Return point for a Java call if there's an exception thrown in Java code.
333
// The exception is caught and transformed into a pending exception stored in
334
// JavaThread that can be tested from within the VM.
336
// Note: Usually the parameters are removed by the callee. In case of an exception
337
// crossing an activation frame boundary, that is not the case if the callee
338
// is compiled code => need to setup the rsp.
331
//------------------------------------------------------------------------------------------------------------------------
332
// Return point for a Java call if there's an exception thrown in Java code.
333
// The exception is caught and transformed into a pending exception stored in
334
// JavaThread that can be tested from within the VM.
336
// Note: Usually the parameters are removed by the callee. In case of an exception
337
// crossing an activation frame boundary, that is not the case if the callee
338
// is compiled code => need to setup the rsp.
340
// rax: exception oop
342
// Emits the "catch_exception" stub: it records the exception oop held in rax
// as the thread's pending exception (together with __FILE__/__LINE__ of this
// source location) and then jumps to StubRoutines::_call_stub_return_address.
//
// The thread is read from the frame slot at rbp + 9 words, the same slot that
// generate_call_stub() uses. The call stub must have been generated first
// (asserted below).
// NOTE(review): the bare integer lines look like line-number residue and some
// statements appear to be missing (label L is used without a visible
// declaration, and no return statement is visible) -- confirm against upstream.
address generate_catch_exception() {
343
StubCodeMark mark(this, "StubRoutines", "catch_exception");
344
const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_call_stub()!
345
const Address thread (rbp, 9 * wordSize); // same as in generate_call_stub()!
346
address start = __ pc();
348
// get thread directly
349
__ movptr(rcx, thread);
351
// verify that threads correspond
// NOTE(review): the comparison feeding this branch is not visible here.
355
__ jcc(Assembler::equal, L);
356
__ stop("StubRoutines::catch_exception: threads must correspond");
360
// set pending exception
// pending_exception <- rax, exception_file <- address of __FILE__,
// exception_line <- __LINE__
362
__ movptr(Address(rcx, Thread::pending_exception_offset()), rax);
363
__ lea(Address(rcx, Thread::exception_file_offset()),
364
ExternalAddress((address)__FILE__), noreg);
365
__ movl(Address(rcx, Thread::exception_line_offset()), __LINE__ );
366
// complete return to VM
367
assert(StubRoutines::_call_stub_return_address != nullptr, "_call_stub_return_address must have been generated before");
368
__ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
374
//------------------------------------------------------------------------------------------------------------------------
375
// Continuation point for runtime calls returning with a pending exception.
376
// The pending exception check happened in the runtime or native call stub.
377
// The pending exception in Thread is converted into a Java-level exception.
379
// Contract with Java-level exception handlers:
383
// NOTE: At entry of this stub, exception-pc must be on stack !!
385
// Emits the "forward exception" stub.
//
// As set up by the code below, when the stub jumps to the handler:
//   rax (exception_oop) = the thread's pending exception, which is then cleared
//   rdx (exception_pc)  = the return address popped from the stack
//   rbx (handler_addr)  = result of SharedRuntime::exception_handler_for_return_address
// rcx is used to hold the current thread.
// NOTE(review): the bare integer lines look like line-number residue and some
// statements appear to be missing (label L is used without a visible
// declaration or bind, and no return statement is visible) -- confirm against upstream.
address generate_forward_exception() {
386
StubCodeMark mark(this, "StubRoutines", "forward exception");
387
address start = __ pc();
388
const Register thread = rcx;
390
// other registers used in this stub
391
const Register exception_oop = rax;
392
const Register handler_addr = rbx;
393
const Register exception_pc = rdx;
395
// Upon entry, the sp points to the return address returning into Java
396
// (interpreted or compiled) code; i.e., the return address becomes the
399
// Arguments pushed before the runtime call are still on the stack but
400
// the exception handler will reset the stack pointer -> ignore them.
401
// A potential result in registers can be ignored as well.
404
// make sure this code is only executed if there is a pending exception
406
__ get_thread(thread);
407
__ cmpptr(Address(thread, Thread::pending_exception_offset()), NULL_WORD);
408
__ jcc(Assembler::notEqual, L);
409
__ stop("StubRoutines::forward exception: no pending exception (1)");
414
// compute exception handler into rbx,
// The return address is read from the top of the stack without popping it and
// passed, with the thread, to the runtime lookup.
415
__ get_thread(thread);
416
__ movptr(exception_pc, Address(rsp, 0));
417
BLOCK_COMMENT("call exception_handler_for_return_address");
418
__ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, exception_pc);
419
__ mov(handler_addr, rax);
421
// setup rax & rdx, remove return address & clear pending exception
// The thread is fetched again after the runtime call before it is used.
422
__ get_thread(thread);
423
__ pop(exception_pc);
424
__ movptr(exception_oop, Address(thread, Thread::pending_exception_offset()));
425
__ movptr(Address(thread, Thread::pending_exception_offset()), NULL_WORD);
428
// make sure exception is set
430
__ testptr(exception_oop, exception_oop);
431
__ jcc(Assembler::notEqual, L);
432
__ stop("StubRoutines::forward exception: no pending exception (2)");
437
// Verify that there is really a valid exception in RAX.
438
__ verify_oop(exception_oop);
440
// continue at exception handler (return address removed)
442
// rbx: exception handler
444
__ jmp(handler_addr);
449
//----------------------------------------------------------------------------------------------------
450
// Support for void verify_mxcsr()
452
// This routine is used with -Xcheck:jni to verify that native
453
// JNI code does not return to Java code without restoring the
454
// MXCSR register to our expected state.
457
// Emits the "verify_mxcsr" stub.
//
// When CheckJNICalls is set and UseSSE > 0, the emitted code stores MXCSR into
// a temporary stack word, masks it with MXCSR_MASK and compares it with the
// standard value. On a mismatch it warns "MXCSR changed by native JNI code."
// and reloads the standard value. The temporary word is released afterwards.
// NOTE(review): the bare integer lines look like line-number residue and some
// statements appear to be missing (label ok_ret is used without a visible
// declaration or bind, and no return statement is visible) -- confirm against upstream.
address generate_verify_mxcsr() {
458
StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
459
address start = __ pc();
// Temporary slot at the top of the stack, valid once the subptr below has run.
461
const Address mxcsr_save(rsp, 0);
463
if (CheckJNICalls && UseSSE > 0 ) {
465
ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
467
__ subptr(rsp, wordSize); // allocate a temp location
468
__ stmxcsr(mxcsr_save);
469
__ movl(rax, mxcsr_save);
470
__ andl(rax, MXCSR_MASK);
471
__ cmp32(rax, mxcsr_std);
472
__ jcc(Assembler::equal, ok_ret);
474
__ warn("MXCSR changed by native JNI code.");
476
__ ldmxcsr(mxcsr_std);
// release the temp location
479
__ addptr(rsp, wordSize);
489
//---------------------------------------------------------------------------
490
// Support for void verify_fpu_cntrl_wrd()
492
// This routine is used with -Xcheck:jni to verify that native
493
// JNI code does not return to Java code without restoring the
494
// FP control word to our expected state.
496
// Emits the FPU control word check stub; note that it is registered under the
// stub name "verify_spcw".
//
// The emitted code stores the FPU control word into a temporary stack word,
// masks it with FPU_CNTRL_WRD_MASK and compares it with the standard value.
// On a mismatch it warns "Floating point control word changed by native JNI code."
// The temporary word is released afterwards.
// NOTE(review): the bare integer lines look like line-number residue and some
// statements appear to be missing (label ok_ret is used without a visible
// declaration or bind, any enclosing condition is not visible, and no return
// statement is visible) -- confirm against upstream.
address generate_verify_fpu_cntrl_wrd() {
497
StubCodeMark mark(this, "StubRoutines", "verify_spcw");
498
address start = __ pc();
// Temporary slot at the top of the stack, valid once the subptr below has run.
500
const Address fpu_cntrl_wrd_save(rsp, 0);
505
__ subptr(rsp, wordSize); // allocate a temp location
506
__ fnstcw(fpu_cntrl_wrd_save);
507
__ movl(rax, fpu_cntrl_wrd_save);
508
__ andl(rax, FPU_CNTRL_WRD_MASK);
509
ExternalAddress fpu_std(StubRoutines::x86::addr_fpu_cntrl_wrd_std());
510
__ cmp32(rax, fpu_std);
511
__ jcc(Assembler::equal, ok_ret);
513
__ warn("Floating point control word changed by native JNI code.");
// release the temp location
518
__ addptr(rsp, wordSize);
527
//---------------------------------------------------------------------------
528
// Wrapper for slow-case handling of double-to-integer conversion
529
// d2i or f2i fast case failed either because it is nan or because
530
// of under/overflow.
531
// Input: FPU TOS: float value
532
// Output: rax, (rdx): integer (long) result
534
// Emits a wrapper stub around the C conversion routine `fcn`.
//
// t   - selects only the block comment emitted before the call
//       ("SharedRuntime::d2l" for T_LONG).
// fcn - address of the C routine invoked through call_VM_leaf.
//
// The stub is always registered under the name "d2i_wrapper", whatever `t` is.
// The incoming value is popped from the FPU stack into two stack words, later
// reloaded and stored again as the outgoing argument, and the FPU stack is
// emptied before the call.
// NOTE(review): the bare integer lines look like line-number residue and some
// statements appear to be missing (most of the layout enum, the state
// save/restore calls, the `if` matching the `else if` below, and the return
// statement are not visible) -- confirm against upstream.
address generate_d2i_wrapper(BasicType t, address fcn) {
535
StubCodeMark mark(this, "StubRoutines", "d2i_wrapper");
536
address start = __ pc();
538
// Capture info about frame layout
539
enum layout { FPUState_off = 0,
540
rbp_off = FPUStateSizeInWords,
546
saved_argument_off2, // 2nd half of double
// The layout above depends on the FPU state occupying exactly 27 words.
550
assert(FPUStateSizeInWords == 27, "update stack layout");
552
// Save outgoing argument to stack across push_FPU_state()
553
__ subptr(rsp, wordSize * 2);
554
__ fstp_d(Address(rsp, 0));
556
// Save CPU & FPU state
564
// push_FPU_state() resets the FP top of stack
565
// Load original double into FP top of stack
566
__ fld_d(Address(rsp, saved_argument_off * wordSize));
567
// Store double into stack as outgoing argument
568
__ subptr(rsp, wordSize*2);
569
__ fst_d(Address(rsp, 0));
571
// Prepare FPU for doing math in C-land
572
__ empty_FPU_stack();
573
// Call the C code to massage the double. Result in EAX
575
{ BLOCK_COMMENT("SharedRuntime::d2i"); }
576
else if (t == T_LONG)
577
{ BLOCK_COMMENT("SharedRuntime::d2l"); }
// NOTE(review): the second argument 2 presumably is the number of argument
// words (the two halves of the double) -- confirm against call_VM_leaf.
578
__ call_VM_leaf( fcn, 2 );
580
// Restore CPU & FPU state
// Release two stack words.
587
__ addptr(rsp, wordSize * 2);
593
//---------------------------------------------------------------------------------------------------
595
// Emits a constant table registered under `stub_name`: after aligning to
// CodeEntryAlignment, the value `mask` is emitted 16 times with no relocation.
// `start` records the pc of the first emitted word.
// NOTE(review): the closing braces and the return statement are not visible here.
address generate_vector_mask(const char *stub_name, int32_t mask) {
596
__ align(CodeEntryAlignment);
597
StubCodeMark mark(this, "StubRoutines", stub_name);
598
address start = __ pc();
600
for (int i = 0; i < 16; i++) {
601
__ emit_data(mask, relocInfo::none, 0);
607
// Emits a constant table registered under `stub_name`: the four-word pattern
// 0x02020304, 0x01010101, 0x00000000, 0x00000000 repeated four times.
// Read as little-endian bytes each pattern is 4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,
// i.e. the leading-zero count of each 4-bit value 0..15.
// NOTE(review): no alignment call, closing brace or return statement is
// visible here -- confirm against upstream.
address generate_count_leading_zeros_lut(const char *stub_name) {
609
StubCodeMark mark(this, "StubRoutines", stub_name);
610
address start = __ pc();
611
__ emit_data(0x02020304, relocInfo::none, 0);
612
__ emit_data(0x01010101, relocInfo::none, 0);
613
__ emit_data(0x00000000, relocInfo::none, 0);
614
__ emit_data(0x00000000, relocInfo::none, 0);
615
__ emit_data(0x02020304, relocInfo::none, 0);
616
__ emit_data(0x01010101, relocInfo::none, 0);
617
__ emit_data(0x00000000, relocInfo::none, 0);
618
__ emit_data(0x00000000, relocInfo::none, 0);
619
__ emit_data(0x02020304, relocInfo::none, 0);
620
__ emit_data(0x01010101, relocInfo::none, 0);
621
__ emit_data(0x00000000, relocInfo::none, 0);
622
__ emit_data(0x00000000, relocInfo::none, 0);
623
__ emit_data(0x02020304, relocInfo::none, 0);
624
__ emit_data(0x01010101, relocInfo::none, 0);
625
__ emit_data(0x00000000, relocInfo::none, 0);
626
__ emit_data(0x00000000, relocInfo::none, 0);
631
// Emits a constant table registered under `stub_name`: the four-word pattern
// 0x02010100, 0x03020201, 0x03020201, 0x04030302 repeated four times.
// Read as little-endian bytes each pattern is 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,
// i.e. the number of set bits in each 4-bit value 0..15.
// NOTE(review): no alignment call, closing brace or return statement is
// visible here -- confirm against upstream.
address generate_popcount_avx_lut(const char *stub_name) {
633
StubCodeMark mark(this, "StubRoutines", stub_name);
634
address start = __ pc();
635
__ emit_data(0x02010100, relocInfo::none, 0);
636
__ emit_data(0x03020201, relocInfo::none, 0);
637
__ emit_data(0x03020201, relocInfo::none, 0);
638
__ emit_data(0x04030302, relocInfo::none, 0);
639
__ emit_data(0x02010100, relocInfo::none, 0);
640
__ emit_data(0x03020201, relocInfo::none, 0);
641
__ emit_data(0x03020201, relocInfo::none, 0);
642
__ emit_data(0x04030302, relocInfo::none, 0);
643
__ emit_data(0x02010100, relocInfo::none, 0);
644
__ emit_data(0x03020201, relocInfo::none, 0);
645
__ emit_data(0x03020201, relocInfo::none, 0);
646
__ emit_data(0x04030302, relocInfo::none, 0);
647
__ emit_data(0x02010100, relocInfo::none, 0);
648
__ emit_data(0x03020201, relocInfo::none, 0);
649
__ emit_data(0x03020201, relocInfo::none, 0);
650
__ emit_data(0x04030302, relocInfo::none, 0);
655
// Emits a constant table of ascending index sequences, registered under
// `stub_name` and aligned to CodeEntryAlignment. Six consecutive sections are
// emitted, each 16 words long:
//   1. byte indices 0..63
//   2. 16-bit indices 0..31
//   3. 32-bit indices 0..15
//   4. 64-bit indices 0..7 (value word followed by a zero word)
//   5. single-precision floats 0.0f..15.0f
//   6. double-precision values 0.0..7.0 (zero word followed by the high word)
// The byte/16-bit/64-bit readings assume little-endian layout.
// NOTE(review): the closing brace and the return statement are not visible here.
address generate_iota_indices(const char *stub_name) {
656
__ align(CodeEntryAlignment);
657
StubCodeMark mark(this, "StubRoutines", stub_name);
658
address start = __ pc();
660
// Section 1: byte indices 0x00..0x3F, four per word
__ emit_data(0x03020100, relocInfo::none, 0);
661
__ emit_data(0x07060504, relocInfo::none, 0);
662
__ emit_data(0x0B0A0908, relocInfo::none, 0);
663
__ emit_data(0x0F0E0D0C, relocInfo::none, 0);
664
__ emit_data(0x13121110, relocInfo::none, 0);
665
__ emit_data(0x17161514, relocInfo::none, 0);
666
__ emit_data(0x1B1A1918, relocInfo::none, 0);
667
__ emit_data(0x1F1E1D1C, relocInfo::none, 0);
668
__ emit_data(0x23222120, relocInfo::none, 0);
669
__ emit_data(0x27262524, relocInfo::none, 0);
670
__ emit_data(0x2B2A2928, relocInfo::none, 0);
671
__ emit_data(0x2F2E2D2C, relocInfo::none, 0);
672
__ emit_data(0x33323130, relocInfo::none, 0);
673
__ emit_data(0x37363534, relocInfo::none, 0);
674
__ emit_data(0x3B3A3938, relocInfo::none, 0);
675
__ emit_data(0x3F3E3D3C, relocInfo::none, 0);
678
// Section 2: 16-bit indices 0x0000..0x001F, two per word
__ emit_data(0x00010000, relocInfo::none, 0);
679
__ emit_data(0x00030002, relocInfo::none, 0);
680
__ emit_data(0x00050004, relocInfo::none, 0);
681
__ emit_data(0x00070006, relocInfo::none, 0);
682
__ emit_data(0x00090008, relocInfo::none, 0);
683
__ emit_data(0x000B000A, relocInfo::none, 0);
684
__ emit_data(0x000D000C, relocInfo::none, 0);
685
__ emit_data(0x000F000E, relocInfo::none, 0);
686
__ emit_data(0x00110010, relocInfo::none, 0);
687
__ emit_data(0x00130012, relocInfo::none, 0);
688
__ emit_data(0x00150014, relocInfo::none, 0);
689
__ emit_data(0x00170016, relocInfo::none, 0);
690
__ emit_data(0x00190018, relocInfo::none, 0);
691
__ emit_data(0x001B001A, relocInfo::none, 0);
692
__ emit_data(0x001D001C, relocInfo::none, 0);
693
__ emit_data(0x001F001E, relocInfo::none, 0);
696
// Section 3: 32-bit indices 0..15, one per word
__ emit_data(0x00000000, relocInfo::none, 0);
697
__ emit_data(0x00000001, relocInfo::none, 0);
698
__ emit_data(0x00000002, relocInfo::none, 0);
699
__ emit_data(0x00000003, relocInfo::none, 0);
700
__ emit_data(0x00000004, relocInfo::none, 0);
701
__ emit_data(0x00000005, relocInfo::none, 0);
702
__ emit_data(0x00000006, relocInfo::none, 0);
703
__ emit_data(0x00000007, relocInfo::none, 0);
704
__ emit_data(0x00000008, relocInfo::none, 0);
705
__ emit_data(0x00000009, relocInfo::none, 0);
706
__ emit_data(0x0000000A, relocInfo::none, 0);
707
__ emit_data(0x0000000B, relocInfo::none, 0);
708
__ emit_data(0x0000000C, relocInfo::none, 0);
709
__ emit_data(0x0000000D, relocInfo::none, 0);
710
__ emit_data(0x0000000E, relocInfo::none, 0);
711
__ emit_data(0x0000000F, relocInfo::none, 0);
714
// Section 4: 64-bit indices 0..7, each as a value word followed by a zero word
__ emit_data(0x00000000, relocInfo::none, 0);
715
__ emit_data(0x00000000, relocInfo::none, 0);
716
__ emit_data(0x00000001, relocInfo::none, 0);
717
__ emit_data(0x00000000, relocInfo::none, 0);
718
__ emit_data(0x00000002, relocInfo::none, 0);
719
__ emit_data(0x00000000, relocInfo::none, 0);
720
__ emit_data(0x00000003, relocInfo::none, 0);
721
__ emit_data(0x00000000, relocInfo::none, 0);
722
__ emit_data(0x00000004, relocInfo::none, 0);
723
__ emit_data(0x00000000, relocInfo::none, 0);
724
__ emit_data(0x00000005, relocInfo::none, 0);
725
__ emit_data(0x00000000, relocInfo::none, 0);
726
__ emit_data(0x00000006, relocInfo::none, 0);
727
__ emit_data(0x00000000, relocInfo::none, 0);
728
__ emit_data(0x00000007, relocInfo::none, 0);
729
__ emit_data(0x00000000, relocInfo::none, 0);
732
// Section 5: single-precision bit patterns for 0.0f..15.0f
__ emit_data(0x00000000, relocInfo::none, 0); // 0.0f
733
__ emit_data(0x3F800000, relocInfo::none, 0); // 1.0f
734
__ emit_data(0x40000000, relocInfo::none, 0); // 2.0f
735
__ emit_data(0x40400000, relocInfo::none, 0); // 3.0f
736
__ emit_data(0x40800000, relocInfo::none, 0); // 4.0f
737
__ emit_data(0x40A00000, relocInfo::none, 0); // 5.0f
738
__ emit_data(0x40C00000, relocInfo::none, 0); // 6.0f
739
__ emit_data(0x40E00000, relocInfo::none, 0); // 7.0f
740
__ emit_data(0x41000000, relocInfo::none, 0); // 8.0f
741
__ emit_data(0x41100000, relocInfo::none, 0); // 9.0f
742
__ emit_data(0x41200000, relocInfo::none, 0); // 10.0f
743
__ emit_data(0x41300000, relocInfo::none, 0); // 11.0f
744
__ emit_data(0x41400000, relocInfo::none, 0); // 12.0f
745
__ emit_data(0x41500000, relocInfo::none, 0); // 13.0f
746
__ emit_data(0x41600000, relocInfo::none, 0); // 14.0f
747
__ emit_data(0x41700000, relocInfo::none, 0); // 15.0f
750
// Section 6: double-precision bit patterns for 0.0..7.0, zero low word first
// and the high word second
__ emit_data(0x00000000, relocInfo::none, 0); // 0.0d
751
__ emit_data(0x00000000, relocInfo::none, 0);
752
__ emit_data(0x00000000, relocInfo::none, 0); // 1.0d
753
__ emit_data(0x3FF00000, relocInfo::none, 0);
754
__ emit_data(0x00000000, relocInfo::none, 0); // 2.0d
755
__ emit_data(0x40000000, relocInfo::none, 0);
756
__ emit_data(0x00000000, relocInfo::none, 0); // 3.0d
757
__ emit_data(0x40080000, relocInfo::none, 0);
758
__ emit_data(0x00000000, relocInfo::none, 0); // 4.0d
759
__ emit_data(0x40100000, relocInfo::none, 0);
760
__ emit_data(0x00000000, relocInfo::none, 0); // 5.0d
761
__ emit_data(0x40140000, relocInfo::none, 0);
762
__ emit_data(0x00000000, relocInfo::none, 0); // 6.0d
763
__ emit_data(0x40180000, relocInfo::none, 0);
764
__ emit_data(0x00000000, relocInfo::none, 0); // 7.0d
765
__ emit_data(0x401c0000, relocInfo::none, 0);
769
// Emits a constant table registered under `stub_name` and aligned to
// CodeEntryAlignment: the four-word pattern 0x0C040800, 0x0E060A02,
// 0x0D050901, 0x0F070B03 repeated four times.
// Read as little-endian bytes each pattern is
// 0,8,4,C,2,A,6,E,1,9,5,D,3,B,7,F, i.e. the bit-reversed form of each 4-bit
// value 0..15.
// NOTE(review): the closing brace and the return statement are not visible here.
address generate_vector_reverse_bit_lut(const char *stub_name) {
770
__ align(CodeEntryAlignment);
771
StubCodeMark mark(this, "StubRoutines", stub_name);
772
address start = __ pc();
773
__ emit_data(0x0C040800, relocInfo::none, 0);
774
__ emit_data(0x0E060A02, relocInfo::none, 0);
775
__ emit_data(0x0D050901, relocInfo::none, 0);
776
__ emit_data(0x0F070B03, relocInfo::none, 0);
777
__ emit_data(0x0C040800, relocInfo::none, 0);
778
__ emit_data(0x0E060A02, relocInfo::none, 0);
779
__ emit_data(0x0D050901, relocInfo::none, 0);
780
__ emit_data(0x0F070B03, relocInfo::none, 0);
781
__ emit_data(0x0C040800, relocInfo::none, 0);
782
__ emit_data(0x0E060A02, relocInfo::none, 0);
783
__ emit_data(0x0D050901, relocInfo::none, 0);
784
__ emit_data(0x0F070B03, relocInfo::none, 0);
785
__ emit_data(0x0C040800, relocInfo::none, 0);
786
__ emit_data(0x0E060A02, relocInfo::none, 0);
787
__ emit_data(0x0D050901, relocInfo::none, 0);
788
__ emit_data(0x0F070B03, relocInfo::none, 0);
792
// Emits a constant table registered under `stub_name` and aligned to
// CodeEntryAlignment: the four-word pattern 0x04050607, 0x00010203,
// 0x0C0D0E0F, 0x08090A0B repeated four times.
// Read as little-endian bytes each pattern is 7,6,5,4,3,2,1,0,15,14,...,8:
// byte indices in descending order within each 8-byte group.
// NOTE(review): the closing brace and the return statement are not visible here.
address generate_vector_reverse_byte_perm_mask_long(const char *stub_name) {
793
__ align(CodeEntryAlignment);
794
StubCodeMark mark(this, "StubRoutines", stub_name);
795
address start = __ pc();
796
__ emit_data(0x04050607, relocInfo::none, 0);
797
__ emit_data(0x00010203, relocInfo::none, 0);
798
__ emit_data(0x0C0D0E0F, relocInfo::none, 0);
799
__ emit_data(0x08090A0B, relocInfo::none, 0);
800
__ emit_data(0x04050607, relocInfo::none, 0);
801
__ emit_data(0x00010203, relocInfo::none, 0);
802
__ emit_data(0x0C0D0E0F, relocInfo::none, 0);
803
__ emit_data(0x08090A0B, relocInfo::none, 0);
804
__ emit_data(0x04050607, relocInfo::none, 0);
805
__ emit_data(0x00010203, relocInfo::none, 0);
806
__ emit_data(0x0C0D0E0F, relocInfo::none, 0);
807
__ emit_data(0x08090A0B, relocInfo::none, 0);
808
__ emit_data(0x04050607, relocInfo::none, 0);
809
__ emit_data(0x00010203, relocInfo::none, 0);
810
__ emit_data(0x0C0D0E0F, relocInfo::none, 0);
811
__ emit_data(0x08090A0B, relocInfo::none, 0);
815
// Emits a constant table registered under `stub_name` and aligned to
// CodeEntryAlignment: the four-word pattern 0x00010203, 0x04050607,
// 0x08090A0B, 0x0C0D0E0F repeated four times.
// Read as little-endian bytes each pattern is 3,2,1,0,7,6,5,4,11,...:
// byte indices in descending order within each 4-byte group.
// NOTE(review): the closing brace and the return statement are not visible here.
address generate_vector_reverse_byte_perm_mask_int(const char *stub_name) {
816
__ align(CodeEntryAlignment);
817
StubCodeMark mark(this, "StubRoutines", stub_name);
818
address start = __ pc();
819
__ emit_data(0x00010203, relocInfo::none, 0);
820
__ emit_data(0x04050607, relocInfo::none, 0);
821
__ emit_data(0x08090A0B, relocInfo::none, 0);
822
__ emit_data(0x0C0D0E0F, relocInfo::none, 0);
823
__ emit_data(0x00010203, relocInfo::none, 0);
824
__ emit_data(0x04050607, relocInfo::none, 0);
825
__ emit_data(0x08090A0B, relocInfo::none, 0);
826
__ emit_data(0x0C0D0E0F, relocInfo::none, 0);
827
__ emit_data(0x00010203, relocInfo::none, 0);
828
__ emit_data(0x04050607, relocInfo::none, 0);
829
__ emit_data(0x08090A0B, relocInfo::none, 0);
830
__ emit_data(0x0C0D0E0F, relocInfo::none, 0);
831
__ emit_data(0x00010203, relocInfo::none, 0);
832
__ emit_data(0x04050607, relocInfo::none, 0);
833
__ emit_data(0x08090A0B, relocInfo::none, 0);
834
__ emit_data(0x0C0D0E0F, relocInfo::none, 0);
838
// Emits a constant table registered under `stub_name` and aligned to
// CodeEntryAlignment: the four-word pattern 0x02030001, 0x06070405,
// 0x0A0B0809, 0x0E0F0C0D repeated four times.
// Read as little-endian bytes each pattern is 1,0,3,2,5,4,...:
// the two byte indices of every 2-byte group swapped.
// NOTE(review): the closing brace and the return statement are not visible here.
address generate_vector_reverse_byte_perm_mask_short(const char *stub_name) {
839
__ align(CodeEntryAlignment);
840
StubCodeMark mark(this, "StubRoutines", stub_name);
841
address start = __ pc();
842
__ emit_data(0x02030001, relocInfo::none, 0);
843
__ emit_data(0x06070405, relocInfo::none, 0);
844
__ emit_data(0x0A0B0809, relocInfo::none, 0);
845
__ emit_data(0x0E0F0C0D, relocInfo::none, 0);
846
__ emit_data(0x02030001, relocInfo::none, 0);
847
__ emit_data(0x06070405, relocInfo::none, 0);
848
__ emit_data(0x0A0B0809, relocInfo::none, 0);
849
__ emit_data(0x0E0F0C0D, relocInfo::none, 0);
850
__ emit_data(0x02030001, relocInfo::none, 0);
851
__ emit_data(0x06070405, relocInfo::none, 0);
852
__ emit_data(0x0A0B0809, relocInfo::none, 0);
853
__ emit_data(0x0E0F0C0D, relocInfo::none, 0);
854
__ emit_data(0x02030001, relocInfo::none, 0);
855
__ emit_data(0x06070405, relocInfo::none, 0);
856
__ emit_data(0x0A0B0809, relocInfo::none, 0);
857
__ emit_data(0x0E0F0C0D, relocInfo::none, 0);
861
// Emits a constant table registered under `stub_name` and aligned to
// CodeEntryAlignment: four words of 0x70707070 followed by four words of
// 0xF0F0F0F0 (i.e. every byte 0x70 in the first half, 0xF0 in the second).
// NOTE(review): the closing brace and the return statement are not visible here.
address generate_vector_byte_shuffle_mask(const char *stub_name) {
862
__ align(CodeEntryAlignment);
863
StubCodeMark mark(this, "StubRoutines", stub_name);
864
address start = __ pc();
865
__ emit_data(0x70707070, relocInfo::none, 0);
866
__ emit_data(0x70707070, relocInfo::none, 0);
867
__ emit_data(0x70707070, relocInfo::none, 0);
868
__ emit_data(0x70707070, relocInfo::none, 0);
869
__ emit_data(0xF0F0F0F0, relocInfo::none, 0);
870
__ emit_data(0xF0F0F0F0, relocInfo::none, 0);
871
__ emit_data(0xF0F0F0F0, relocInfo::none, 0);
872
__ emit_data(0xF0F0F0F0, relocInfo::none, 0);
876
// Emits a constant table registered under `stub_name` and aligned to
// CodeEntryAlignment: eight repetitions of the word pair (masklo, maskhi),
// with masklo emitted first in each pair.
//
// maskhi - word emitted second in every pair
// masklo - word emitted first in every pair
// NOTE(review): the closing braces and the return statement are not visible here.
address generate_vector_mask_long_double(const char *stub_name, int32_t maskhi, int32_t masklo) {
877
__ align(CodeEntryAlignment);
878
StubCodeMark mark(this, "StubRoutines", stub_name);
879
address start = __ pc();
881
for (int i = 0; i < 8; i++) {
882
__ emit_data(masklo, relocInfo::none, 0);
883
__ emit_data(maskhi, relocInfo::none, 0);
889
//----------------------------------------------------------------------------------------------------
891
// Emits a constant table registered under `stub_name` and aligned to
// CodeEntryAlignment: eight word pairs whose first words are
// 1, 3, 5, 7, 0, 2, 4, 6 and whose second words are all zero
// (read little-endian: the 64-bit values 1,3,5,7,0,2,4,6).
// NOTE(review): the closing brace and the return statement are not visible here.
address generate_vector_byte_perm_mask(const char *stub_name) {
892
__ align(CodeEntryAlignment);
893
StubCodeMark mark(this, "StubRoutines", stub_name);
894
address start = __ pc();
896
__ emit_data(0x00000001, relocInfo::none, 0);
897
__ emit_data(0x00000000, relocInfo::none, 0);
898
__ emit_data(0x00000003, relocInfo::none, 0);
899
__ emit_data(0x00000000, relocInfo::none, 0);
900
__ emit_data(0x00000005, relocInfo::none, 0);
901
__ emit_data(0x00000000, relocInfo::none, 0);
902
__ emit_data(0x00000007, relocInfo::none, 0);
903
__ emit_data(0x00000000, relocInfo::none, 0);
904
__ emit_data(0x00000000, relocInfo::none, 0);
905
__ emit_data(0x00000000, relocInfo::none, 0);
906
__ emit_data(0x00000002, relocInfo::none, 0);
907
__ emit_data(0x00000000, relocInfo::none, 0);
908
__ emit_data(0x00000004, relocInfo::none, 0);
909
__ emit_data(0x00000000, relocInfo::none, 0);
910
__ emit_data(0x00000006, relocInfo::none, 0);
911
__ emit_data(0x00000000, relocInfo::none, 0);
916
// Emits a caller-supplied table of 32-bit words, registered under `stub_name`
// and aligned to CodeEntryAlignment.
//
// len          - vector length; must not be Assembler::AVX_NoVec (asserted)
// val0..val3   - always emitted
// val4..val7   - emitted when len >= Assembler::AVX_256bit (default 0)
// val8..val15  - emitted when len >= Assembler::AVX_512bit (default 0)
// NOTE(review): the closing braces and the return statement are not visible
// here, so the exact nesting of the two `if` blocks cannot be confirmed.
address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
917
int32_t val0, int32_t val1, int32_t val2, int32_t val3,
918
int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
919
int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
920
int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0) {
921
__ align(CodeEntryAlignment);
922
StubCodeMark mark(this, "StubRoutines", stub_name);
923
address start = __ pc();
925
assert(len != Assembler::AVX_NoVec, "vector len must be specified");
926
__ emit_data(val0, relocInfo::none, 0);
927
__ emit_data(val1, relocInfo::none, 0);
928
__ emit_data(val2, relocInfo::none, 0);
929
__ emit_data(val3, relocInfo::none, 0);
930
if (len >= Assembler::AVX_256bit) {
931
__ emit_data(val4, relocInfo::none, 0);
932
__ emit_data(val5, relocInfo::none, 0);
933
__ emit_data(val6, relocInfo::none, 0);
934
__ emit_data(val7, relocInfo::none, 0);
935
if (len >= Assembler::AVX_512bit) {
936
__ emit_data(val8, relocInfo::none, 0);
937
__ emit_data(val9, relocInfo::none, 0);
938
__ emit_data(val10, relocInfo::none, 0);
939
__ emit_data(val11, relocInfo::none, 0);
940
__ emit_data(val12, relocInfo::none, 0);
941
__ emit_data(val13, relocInfo::none, 0);
942
__ emit_data(val14, relocInfo::none, 0);
943
__ emit_data(val15, relocInfo::none, 0);
950
//----------------------------------------------------------------------------------------------------
951
// Non-destructive plausibility checks for oops
953
// Emits the "verify_oop" stub, which performs plausibility checks on the oop
// passed on the stack:
//   - a null oop is accepted immediately;
//   - the value in rdx masked with Universe::verify_oop_mask() must equal
//     Universe::verify_oop_bits();
//   - the klass word loaded from the object must be non-null.
// On success the stub restores rax, rdx and EFLAGS and returns, popping
// 3 words of arguments. On failure it restores the same state, pushes all
// registers and calls MacroAssembler::debug32.
// The emitted code also increments the counter at
// StubRoutines::verify_oop_count_addr() on every invocation.
// NOTE(review): the bare integer lines look like line-number residue and some
// statements appear to be missing (labels exit and error are used without a
// visible declaration or bind, the instruction that loads rdx before the mask
// check is not visible, and no return statement is visible) -- confirm against upstream.
address generate_verify_oop() {
954
StubCodeMark mark(this, "StubRoutines", "verify_oop");
955
address start = __ pc();
957
// Incoming arguments on stack after saving rax,:
960
// [tos + 1]: saved EFLAGS
961
// [tos + 2]: return address
962
// [tos + 3]: char* error message
963
// [tos + 4]: oop object to verify
964
// [tos + 5]: saved rax, - saved by caller and bashed
968
__ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
969
__ push(rdx); // save rdx
970
// make sure object is 'reasonable'
971
__ movptr(rax, Address(rsp, 4 * wordSize)); // get object
972
__ testptr(rax, rax);
973
__ jcc(Assembler::zero, exit); // if obj is null it is ok
975
// Check if the oop is in the right area of memory
976
const int oop_mask = Universe::verify_oop_mask();
977
const int oop_bits = Universe::verify_oop_bits();
979
__ andptr(rdx, oop_mask);
980
__ cmpptr(rdx, oop_bits);
981
__ jcc(Assembler::notZero, error);
983
// make sure klass is 'reasonable', which is not zero.
984
__ movptr(rax, Address(rax, oopDesc::klass_offset_in_bytes())); // get klass
985
__ testptr(rax, rax);
986
__ jcc(Assembler::zero, error); // if klass is null it is broken
988
// return if everything seems ok
990
__ movptr(rax, Address(rsp, 5 * wordSize)); // get saved rax, back
991
__ pop(rdx); // restore rdx
992
__ popf(); // restore EFLAGS
993
__ ret(3 * wordSize); // pop arguments
// Failure path: restore the same state as above, then hand over to the debugger hook.
997
__ movptr(rax, Address(rsp, 5 * wordSize)); // get saved rax, back
998
__ pop(rdx); // get saved rdx back
999
__ popf(); // get saved EFLAGS off stack -- will be ignored
1000
__ pusha(); // push registers (eip = return address & msg are already pushed)
1001
BLOCK_COMMENT("call MacroAssembler::debug");
1002
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
1008
// Copy 64 bytes chunks
1011
// from - source array address
1012
// to_from - destination array address - from
1013
// qword_count - 8-bytes element count, negative
1015
// Emits a forward copy through XMM registers. The destination is always
// addressed as (from + to_from). The main loop moves 64 bytes per pass and
// subtracts 8 from qword_count each time; a second loop then moves the
// remaining qwords one at a time. Requires UseSSE >= 2 (asserted).
void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
1016
assert( UseSSE >= 2, "supported cpu only" );
1017
Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
1019
// Copy 64-byte chunks
1020
// Enter at the loop test so that a count below 8 skips the 64-byte body.
__ jmpb(L_copy_64_bytes);
1021
__ align(OptoLoopAlignment);
1022
__ BIND(L_copy_64_bytes_loop);
1024
if (UseUnalignedLoadStores) {
1026
// one 512-bit load/store pair covers the whole 64-byte chunk
__ evmovdqul(xmm0, Address(from, 0), Assembler::AVX_512bit);
1027
__ evmovdqul(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
1028
} else if (UseAVX == 2) {
1029
// two vmovdqu pairs at offsets 0 and 32
__ vmovdqu(xmm0, Address(from, 0));
1030
__ vmovdqu(Address(from, to_from, Address::times_1, 0), xmm0);
1031
__ vmovdqu(xmm1, Address(from, 32));
1032
__ vmovdqu(Address(from, to_from, Address::times_1, 32), xmm1);
1034
// four movdqu pairs at offsets 0, 16, 32 and 48
__ movdqu(xmm0, Address(from, 0));
1035
__ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
1036
__ movdqu(xmm1, Address(from, 16));
1037
__ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
1038
__ movdqu(xmm2, Address(from, 32));
1039
__ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
1040
__ movdqu(xmm3, Address(from, 48));
1041
__ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
1044
// eight movq pairs (8 bytes each) at offsets 0..56
__ movq(xmm0, Address(from, 0));
1045
__ movq(Address(from, to_from, Address::times_1, 0), xmm0);
1046
__ movq(xmm1, Address(from, 8));
1047
__ movq(Address(from, to_from, Address::times_1, 8), xmm1);
1048
__ movq(xmm2, Address(from, 16));
1049
__ movq(Address(from, to_from, Address::times_1, 16), xmm2);
1050
__ movq(xmm3, Address(from, 24));
1051
__ movq(Address(from, to_from, Address::times_1, 24), xmm3);
1052
__ movq(xmm4, Address(from, 32));
1053
__ movq(Address(from, to_from, Address::times_1, 32), xmm4);
1054
__ movq(xmm5, Address(from, 40));
1055
__ movq(Address(from, to_from, Address::times_1, 40), xmm5);
1056
__ movq(xmm6, Address(from, 48));
1057
__ movq(Address(from, to_from, Address::times_1, 48), xmm6);
1058
__ movq(xmm7, Address(from, 56));
1059
__ movq(Address(from, to_from, Address::times_1, 56), xmm7);
1063
// Loop test: one pass consumes 8 qwords; keep looping while the result is >= 0.
__ BIND(L_copy_64_bytes);
1064
__ subl(qword_count, 8);
1065
__ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
1067
if (UseUnalignedLoadStores && (UseAVX == 2)) {
1068
// clean upper bits of YMM registers
1069
__ vpxor(xmm0, xmm0);
1070
__ vpxor(xmm1, xmm1);
1072
// Undo the final subtraction; a zero remainder means nothing is left to copy.
__ addl(qword_count, 8);
1073
__ jccb(Assembler::zero, L_exit);
1075
// length is too short, just copy qwords
1077
__ BIND(L_copy_8_bytes);
1078
__ movq(xmm0, Address(from, 0));
1079
__ movq(Address(from, to_from, Address::times_1), xmm0);
1081
__ decrement(qword_count);
1082
__ jcc(Assembler::greater, L_copy_8_bytes);
1086
// Generates a forward (low-to-high) element copy stub for arrays of type `t`.
//
//   t                  - element type; T_OBJECT gets a zero-count early-out
//   aligned            - adds ARRAYCOPY_ALIGNED to the barrier decorators and
//                        disables the source-alignment prologue
//   sf                 - element scale factor; shift = Address::times_ptr - sf
//   entry              - if non-null, receives the pc just after the stack
//                        arguments are loaded into registers
//   name               - stub name for the StubCodeMark
//   dest_uninitialized - adds IS_DEST_UNINITIALIZED to the barrier decorators
//
// The emitted code zeroes rax before leaving (return value 0).
address generate_disjoint_copy(BasicType t, bool aligned,
1087
Address::ScaleFactor sf,
1088
address* entry, const char *name,
1089
bool dest_uninitialized = false) {
1090
__ align(CodeEntryAlignment);
1091
StubCodeMark mark(this, "StubRoutines", name);
1092
address start = __ pc();
1094
Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
1095
Label L_copy_2_bytes, L_copy_4_bytes, L_copy_64_bytes;
1097
// (1 << shift) elements make up one pointer-sized word; the code below
// treats (2 << shift) elements as 8 bytes.
int shift = Address::times_ptr - sf;
1099
const Register from = rsi; // source array address
1100
const Register to = rdi; // destination array address
1101
const Register count = rcx; // elements count
1102
const Register to_from = to; // (to - from)
1103
const Register saved_to = rdx; // saved destination array address
1105
__ enter(); // required for proper stackwalking of RuntimeStub frame
1108
// Load the three stack arguments: from, to, count.
__ movptr(from , Address(rsp, 12+ 4));
1109
__ movptr(to , Address(rsp, 12+ 8));
1110
__ movl(count, Address(rsp, 12+ 12));
1112
if (entry != nullptr) {
1113
*entry = __ pc(); // Entry point from conjoint arraycopy stub.
1114
BLOCK_COMMENT("Entry:");
1117
if (t == T_OBJECT) {
1118
// oop copies with nothing to copy branch to L_0_count
__ testl(count, count);
1119
__ jcc(Assembler::zero, L_0_count);
1122
DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1123
if (dest_uninitialized) {
1124
decorators |= IS_DEST_UNINITIALIZED;
1127
decorators |= ARRAYCOPY_ALIGNED;
1130
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1131
bs->arraycopy_prologue(_masm, decorators, t, from, to, count);
1133
// Only non-oop copies that are unaligned (or int copies) register an
// unsafe-access entry.
bool add_entry = (t != T_OBJECT && (!aligned || t == T_INT));
1134
// UnsafeMemoryAccess page error: continue after unsafe access
1135
UnsafeMemoryAccessMark umam(this, add_entry, true);
1136
// From here on the destination is addressed as (from + to_from).
__ subptr(to, from); // to --> to_from
1137
__ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
1138
__ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
1139
if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
1140
// align source address at 4 bytes address boundary
1142
// One byte misalignment happens only for byte arrays
1144
__ jccb(Assembler::zero, L_skip_align1);
1145
__ movb(rax, Address(from, 0));
1146
__ movb(Address(from, to_from, Address::times_1, 0), rax);
1148
__ decrement(count);
1149
__ BIND(L_skip_align1);
1151
// Two bytes misalignment happens only for byte and short (char) arrays
1153
__ jccb(Assembler::zero, L_skip_align2);
1154
__ movw(rax, Address(from, 0));
1155
__ movw(Address(from, to_from, Address::times_1, 0), rax);
1157
// one 2-byte unit has been copied: (1 << (shift-1)) elements
__ subl(count, 1<<(shift-1));
1158
__ BIND(L_skip_align2);
1160
if (!UseXMMForArrayCopy) {
1161
__ mov(rax, count); // save 'count'
1162
__ shrl(count, shift); // elements -> pointer-sized word count
1163
__ addptr(to_from, from);// restore 'to'
1165
__ subptr(to_from, from);// restore 'to_from'
1166
__ mov(count, rax); // restore 'count'
1167
__ jmpb(L_copy_2_bytes); // all dwords were copied
1169
if (!UseUnalignedLoadStores) {
1170
// align to 8 bytes, we know we are 4 byte aligned to start
1171
__ testptr(from, 4);
1172
__ jccb(Assembler::zero, L_copy_64_bytes);
1173
__ movl(rax, Address(from, 0));
1174
__ movl(Address(from, to_from, Address::times_1, 0), rax);
1176
__ subl(count, 1<<shift);
1178
__ BIND(L_copy_64_bytes);
1180
__ shrl(rax, shift+1); // 8 bytes chunk count
1182
// Copy 8-byte chunks through XMM registers, 8 per iteration of the loop
1184
xmm_copy_forward(from, to_from, rax);
1186
// copy trailing dword
1187
__ BIND(L_copy_4_bytes);
1188
__ testl(count, 1<<shift);
1189
__ jccb(Assembler::zero, L_copy_2_bytes);
1190
__ movl(rax, Address(from, 0));
1191
__ movl(Address(from, to_from, Address::times_1, 0), rax);
1192
if (t == T_BYTE || t == T_SHORT) {
1194
__ BIND(L_copy_2_bytes);
1195
// copy trailing word
1196
__ testl(count, 1<<(shift-1));
1197
__ jccb(Assembler::zero, L_copy_byte);
1198
__ movw(rax, Address(from, 0));
1199
__ movw(Address(from, to_from, Address::times_1, 0), rax);
1202
__ BIND(L_copy_byte);
1203
// copy trailing byte
1205
__ jccb(Assembler::zero, L_exit);
1206
__ movb(rax, Address(from, 0));
1207
__ movb(Address(from, to_from, Address::times_1, 0), rax);
1210
// Types without a byte/word tail bind the unused labels here.
__ BIND(L_copy_byte);
1213
__ BIND(L_copy_2_bytes);
1217
// 'count' was consumed by the copy; reload it from the stack for the epilogue.
__ movl(count, Address(rsp, 12+12)); // reread 'count'
1218
bs->arraycopy_epilogue(_masm, decorators, t, from, to, count);
1220
if (t == T_OBJECT) {
1223
inc_copy_counter_np(t);
1226
__ leave(); // required for proper stackwalking of RuntimeStub frame
1228
__ xorptr(rax, rax); // return 0
1234
// Generates a fill stub for element type `t`. The three stack arguments
// (array address, fill value, element count) are loaded into registers and
// the fill code itself is emitted by MacroAssembler::generate_fill, with rax
// and xmm0 passed as its remaining register arguments.
//
//   t       - element type forwarded to MacroAssembler::generate_fill
//   aligned - forwarded unchanged to MacroAssembler::generate_fill
//   name    - stub name for the StubCodeMark
address generate_fill(BasicType t, bool aligned, const char *name) {
1235
__ align(CodeEntryAlignment);
1236
StubCodeMark mark(this, "StubRoutines", name);
1237
address start = __ pc();
1239
BLOCK_COMMENT("Entry:");
1241
const Register to = rdi; // address of the array to fill (first stack argument)
1242
const Register value = rdx; // value
1243
const Register count = rsi; // elements count
1245
__ enter(); // required for proper stackwalking of RuntimeStub frame
1248
// Load the stack arguments; the value and count are read as 32-bit quantities.
__ movptr(to , Address(rsp, 12+ 4));
1249
__ movl(value, Address(rsp, 12+ 8));
1250
__ movl(count, Address(rsp, 12+ 12));
1252
__ generate_fill(t, aligned, to, value, count, rax, xmm0);
1256
__ leave(); // required for proper stackwalking of RuntimeStub frame
1261
// Generates a copy stub for possibly overlapping arrays of type `t`.
// The emitted code first tests for overlap and jumps to `nooverlap_target`
// when dst <= src or dst >= src + count * element_size; otherwise it copies
// from high addresses to low.
//
//   t                  - element type; T_OBJECT gets a zero-count early-out
//   aligned            - adds ARRAYCOPY_ALIGNED to the barrier decorators
//   sf                 - element scale factor; shift = Address::times_ptr - sf
//   nooverlap_target   - address jumped to when the ranges do not overlap
//   entry              - if non-null, receives the pc just after the stack
//                        arguments are loaded into registers
//   name               - stub name for the StubCodeMark
//   dest_uninitialized - adds IS_DEST_UNINITIALIZED to the barrier decorators
//
// The emitted code zeroes rax before leaving (return value 0).
address generate_conjoint_copy(BasicType t, bool aligned,
1262
Address::ScaleFactor sf,
1263
address nooverlap_target,
1264
address* entry, const char *name,
1265
bool dest_uninitialized = false) {
1266
__ align(CodeEntryAlignment);
1267
StubCodeMark mark(this, "StubRoutines", name);
1268
address start = __ pc();
1270
Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
1271
Label L_copy_2_bytes, L_copy_4_bytes, L_copy_8_bytes, L_copy_8_bytes_loop;
1273
// (1 << shift) elements make up one pointer-sized word; the code below
// treats (2 << shift) elements as 8 bytes.
int shift = Address::times_ptr - sf;
1275
// Note: src/end share rax and dst shares rdx with the scratch register
// used by the element moves below.
const Register src = rax; // source array address
1276
const Register dst = rdx; // destination array address
1277
const Register from = rsi; // source array address
1278
const Register to = rdi; // destination array address
1279
const Register count = rcx; // elements count
1280
const Register end = rax; // array end address
1282
__ enter(); // required for proper stackwalking of RuntimeStub frame
1285
__ movptr(src , Address(rsp, 12+ 4)); // from
1286
__ movptr(dst , Address(rsp, 12+ 8)); // to
1287
__ movl2ptr(count, Address(rsp, 12+12)); // count
1289
if (entry != nullptr) {
1290
*entry = __ pc(); // Entry point from generic arraycopy stub.
1291
BLOCK_COMMENT("Entry:");
1294
// nooverlap_target expects arguments in rsi and rdi.
1298
// arrays overlap test: dispatch to disjoint stub if necessary.
1299
RuntimeAddress nooverlap(nooverlap_target);
1300
__ cmpptr(dst, src);
1301
// lea leaves the flags of the preceding cmpptr intact for the jump_cc below
__ lea(end, Address(src, count, sf, 0)); // src + count * elem_size
1302
__ jump_cc(Assembler::belowEqual, nooverlap);
1303
__ cmpptr(dst, end);
1304
__ jump_cc(Assembler::aboveEqual, nooverlap);
1306
if (t == T_OBJECT) {
1307
// oop copies with nothing to copy branch to L_0_count
__ testl(count, count);
1308
__ jcc(Assembler::zero, L_0_count);
1311
DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1312
if (dest_uninitialized) {
1313
decorators |= IS_DEST_UNINITIALIZED;
1316
decorators |= ARRAYCOPY_ALIGNED;
1319
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1320
bs->arraycopy_prologue(_masm, decorators, t, from, to, count);
1323
// Only non-oop copies that are unaligned (or int copies) register an
// unsafe-access entry.
bool add_entry = (t != T_OBJECT && (!aligned || t == T_INT));
1324
// UnsafeMemoryAccess page error: continue after unsafe access
1325
UnsafeMemoryAccessMark umam(this, add_entry, true);
1326
// copy from high to low
1327
__ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
1328
__ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
1329
if (t == T_BYTE || t == T_SHORT) {
1330
// Align the end of destination array at 4 bytes address boundary
1331
__ lea(end, Address(dst, count, sf, 0));
1333
// One byte misalignment happens only for byte arrays
1335
__ jccb(Assembler::zero, L_skip_align1);
1336
__ decrement(count);
1337
__ movb(rdx, Address(from, count, sf, 0));
1338
__ movb(Address(to, count, sf, 0), rdx);
1339
__ BIND(L_skip_align1);
1341
// Two bytes misalignment happens only for byte and short (char) arrays
1343
__ jccb(Assembler::zero, L_skip_align2);
1344
__ subptr(count, 1<<(shift-1));
1345
__ movw(rdx, Address(from, count, sf, 0));
1346
__ movw(Address(to, count, sf, 0), rdx);
1347
__ BIND(L_skip_align2);
1348
// re-check: the alignment moves above may have made the remainder short
__ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
1349
__ jcc(Assembler::below, L_copy_4_bytes);
1352
if (!UseXMMForArrayCopy) {
1354
__ mov(rax, count); // Save 'count'
1355
__ mov(rdx, to); // Save 'to'
1356
// point rsi/rdi at the last 4-byte unit of each array
__ lea(rsi, Address(from, count, sf, -4));
1357
__ lea(rdi, Address(to , count, sf, -4));
1358
__ shrptr(count, shift); // elements -> pointer-sized word count
1361
__ mov(count, rax); // restore 'count'
1362
__ andl(count, (1<<shift)-1); // mask the number of rest elements
1363
__ movptr(from, Address(rsp, 12+4)); // reread 'from'
1364
__ mov(to, rdx); // restore 'to'
1365
__ jmpb(L_copy_2_bytes); // all dword were copied
1367
// Align to 8 bytes the end of array. It is aligned to 4 bytes already.
1369
__ jccb(Assembler::zero, L_copy_8_bytes);
1370
__ subl(count, 1<<shift);
1371
__ movl(rdx, Address(from, count, sf, 0));
1372
__ movl(Address(to, count, sf, 0), rdx);
1373
__ jmpb(L_copy_8_bytes);
1375
__ align(OptoLoopAlignment);
1377
// 8-byte loop: count is pre-decremented at L_copy_8_bytes, so the element
// at index 'count' is the low end of the chunk being moved.
__ BIND(L_copy_8_bytes_loop);
1378
__ movq(xmm0, Address(from, count, sf, 0));
1379
__ movq(Address(to, count, sf, 0), xmm0);
1380
__ BIND(L_copy_8_bytes);
1381
__ subl(count, 2<<shift);
1382
__ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1383
// undo the final subtraction to recover the remaining element count
__ addl(count, 2<<shift);
1385
__ BIND(L_copy_4_bytes);
1386
// copy prefix dword
1387
__ testl(count, 1<<shift);
1388
__ jccb(Assembler::zero, L_copy_2_bytes);
1389
__ movl(rdx, Address(from, count, sf, -4));
1390
__ movl(Address(to, count, sf, -4), rdx);
1392
if (t == T_BYTE || t == T_SHORT) {
1393
__ subl(count, (1<<shift));
1394
__ BIND(L_copy_2_bytes);
1395
// copy prefix word
1396
__ testl(count, 1<<(shift-1));
1397
__ jccb(Assembler::zero, L_copy_byte);
1398
__ movw(rdx, Address(from, count, sf, -2));
1399
__ movw(Address(to, count, sf, -2), rdx);
1401
__ subl(count, 1<<(shift-1));
1402
__ BIND(L_copy_byte);
1405
__ jccb(Assembler::zero, L_exit);
1406
// the last remaining byte is the first byte of each array
__ movb(rdx, Address(from, 0));
1407
__ movb(Address(to, 0), rdx);
1410
// Types without a byte/word prefix bind the unused labels here.
__ BIND(L_copy_byte);
1413
__ BIND(L_copy_2_bytes);
1417
// 'count' was consumed by the copy; reload it from the stack for the epilogue.
__ movl2ptr(count, Address(rsp, 12+12)); // reread count
1418
bs->arraycopy_epilogue(_masm, decorators, t, from, to, count);
1420
if (t == T_OBJECT) {
1423
inc_copy_counter_np(t);
1426
__ leave(); // required for proper stackwalking of RuntimeStub frame
1427
__ xorptr(rax, rax); // return 0
1433
// Generates a forward copy stub for 8-byte elements.
//
//   entry - receives the pc just after the stack arguments are loaded;
//           dereferenced unconditionally, so it must not be null
//   name  - stub name for the StubCodeMark
//
// With UseXMMForArrayCopy the body is produced by xmm_copy_forward; otherwise
// each element is moved with an x87 fild_d/fistp_d pair. The emitted code
// zeroes rax before leaving (return value 0).
address generate_disjoint_long_copy(address* entry, const char *name) {
1434
__ align(CodeEntryAlignment);
1435
StubCodeMark mark(this, "StubRoutines", name);
1436
address start = __ pc();
1438
Label L_copy_8_bytes, L_copy_8_bytes_loop;
1439
const Register from = rax; // source array address
1440
const Register to = rdx; // destination array address
1441
const Register count = rcx; // elements count
1442
const Register to_from = rdx; // (to - from)
1444
__ enter(); // required for proper stackwalking of RuntimeStub frame
1445
__ movptr(from , Address(rsp, 8+0)); // from
1446
__ movptr(to , Address(rsp, 8+4)); // to
1447
__ movl2ptr(count, Address(rsp, 8+8)); // count
1449
*entry = __ pc(); // Entry point from conjoint arraycopy stub.
1450
BLOCK_COMMENT("Entry:");
1453
// UnsafeMemoryAccess page error: continue after unsafe access
1454
UnsafeMemoryAccessMark umam(this, true, true);
1455
// From here on the destination is addressed as (from + to_from).
__ subptr(to, from); // to --> to_from
1456
if (UseXMMForArrayCopy) {
1457
xmm_copy_forward(from, to_from, count);
1459
// x87 fallback: enter at the loop test so a zero count copies nothing
__ jmpb(L_copy_8_bytes);
1460
__ align(OptoLoopAlignment);
1461
__ BIND(L_copy_8_bytes_loop);
1462
__ fild_d(Address(from, 0));
1463
__ fistp_d(Address(from, to_from, Address::times_1));
1465
__ BIND(L_copy_8_bytes);
1466
__ decrement(count);
1467
__ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1470
inc_copy_counter_np(T_LONG);
1471
__ leave(); // required for proper stackwalking of RuntimeStub frame
1473
__ xorptr(rax, rax); // return 0
1478
// Generates a copy stub for possibly overlapping arrays of 8-byte elements.
// The emitted code jumps to `nooverlap_target` when to <= from or
// to >= from + count * 8; otherwise it copies element by element from the
// highest index down to index 0.
//
//   nooverlap_target - address jumped to when the ranges do not overlap
//   entry            - receives the pc just after the stack arguments are
//                      loaded; dereferenced unconditionally, must not be null
//   name             - stub name for the StubCodeMark
//
// The emitted code zeroes rax before leaving (return value 0).
address generate_conjoint_long_copy(address nooverlap_target,
1479
address* entry, const char *name) {
1480
__ align(CodeEntryAlignment);
1481
StubCodeMark mark(this, "StubRoutines", name);
1482
address start = __ pc();
1484
Label L_copy_8_bytes, L_copy_8_bytes_loop;
1485
const Register from = rax; // source array address
1486
const Register to = rdx; // destination array address
1487
const Register count = rcx; // elements count
1488
const Register end_from = rax; // source array end address
1490
__ enter(); // required for proper stackwalking of RuntimeStub frame
1491
__ movptr(from , Address(rsp, 8+0)); // from
1492
__ movptr(to , Address(rsp, 8+4)); // to
1493
__ movl2ptr(count, Address(rsp, 8+8)); // count
1495
*entry = __ pc(); // Entry point from generic arraycopy stub.
1496
BLOCK_COMMENT("Entry:");
1498
// arrays overlap test
1499
__ cmpptr(to, from);
1500
RuntimeAddress nooverlap(nooverlap_target);
1501
__ jump_cc(Assembler::belowEqual, nooverlap);
1502
// end_from aliases 'from' (both rax), so 'from' is overwritten here and
// reloaded from the stack right after the compare.
__ lea(end_from, Address(from, count, Address::times_8, 0));
1503
__ cmpptr(to, end_from);
1504
__ movptr(from, Address(rsp, 8)); // from
1505
__ jump_cc(Assembler::aboveEqual, nooverlap);
1508
// UnsafeMemoryAccess page error: continue after unsafe access
1509
UnsafeMemoryAccessMark umam(this, true, true);
1511
// Enter at the loop test so a zero count copies nothing.
__ jmpb(L_copy_8_bytes);
1513
__ align(OptoLoopAlignment);
1514
__ BIND(L_copy_8_bytes_loop);
1515
if (UseXMMForArrayCopy) {
1516
__ movq(xmm0, Address(from, count, Address::times_8));
1517
__ movq(Address(to, count, Address::times_8), xmm0);
1519
// x87 alternative: 64-bit integer load/store through the FPU stack
__ fild_d(Address(from, count, Address::times_8));
1520
__ fistp_d(Address(to, count, Address::times_8));
1522
// count is decremented before use, so the loop body sees indices count-1 .. 0
__ BIND(L_copy_8_bytes);
1523
__ decrement(count);
1524
__ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1527
inc_copy_counter_np(T_LONG);
1528
__ leave(); // required for proper stackwalking of RuntimeStub frame
1529
__ xorptr(rax, rax); // return 0
1535
// Helper for generating a dynamic type check.
1536
// The sub_klass must be one of {rbx, rdx, rsi}.
1537
// The temp is killed.
1538
// Emits a subtype check of `sub_klass` against the super klass stored at
// `super_klass_addr`.
//
//   sub_klass               - register holding the klass being tested
//   super_check_offset_addr - memory operand holding the super check offset
//   super_klass_addr        - memory operand holding the super klass
//   L_success / L_failure   - branch targets; a null pointer means "fall
//                             through" (the branch goes to L_fallthrough)
//
// Order of tests: pointer equality, the supertype display slot at
// super_check_offset, immediate failure if that offset is not the secondary
// super cache offset, then MacroAssembler::check_klass_subtype_slow_path.
void generate_type_check(Register sub_klass,
1539
Address& super_check_offset_addr,
1540
Address& super_klass_addr,
1542
Label* L_success, Label* L_failure) {
1543
BLOCK_COMMENT("type_check:");
1545
Label L_fallthrough;
1546
#define LOCAL_JCC(assembler_con, label_ptr) \
1547
if (label_ptr != nullptr) __ jcc(assembler_con, *(label_ptr)); \
1548
else __ jcc(assembler_con, L_fallthrough) /*omit semi*/
1550
// The following is a strange variation of the fast path which requires
1551
// one less register, because needed values are on the argument stack.
1552
// __ check_klass_subtype_fast_path(sub_klass, *super_klass*, temp,
1553
// L_success, L_failure, null);
1554
assert_different_registers(sub_klass, temp);
1556
int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1558
// if the pointers are equal, we are done (e.g., String[] elements)
1559
__ cmpptr(sub_klass, super_klass_addr);
1560
LOCAL_JCC(Assembler::equal, L_success);
1562
// check the supertype display:
1563
// temp first holds the offset, then the klass found at sub_klass + offset
__ movl2ptr(temp, super_check_offset_addr);
1564
Address super_check_addr(sub_klass, temp, Address::times_1, 0);
1565
__ movptr(temp, super_check_addr); // load displayed supertype
1566
__ cmpptr(temp, super_klass_addr); // test the super type
1567
LOCAL_JCC(Assembler::equal, L_success);
1569
// if it was a primary super, we can just fail immediately
1570
__ cmpl(super_check_offset_addr, sc_offset);
1571
LOCAL_JCC(Assembler::notEqual, L_failure);
1573
// The repne_scan instruction uses fixed registers, which will get spilled.
1574
// We happen to know this works best when super_klass is in rax.
1575
// temp is reused to hold the super klass for the slow path
Register super_klass = temp;
1576
__ movptr(super_klass, super_klass_addr);
1577
__ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg,
1578
L_success, L_failure);
1580
__ bind(L_fallthrough);
1582
// Annotate which outcome reaches the fall-through point.
if (L_success == nullptr) { BLOCK_COMMENT("L_success:"); }
1583
if (L_failure == nullptr) { BLOCK_COMMENT("L_failure:"); }
1589
// Generate checkcasting array copy stub
1592
// 4(rsp) - source array address
1593
// 8(rsp) - destination array address
1594
// 12(rsp) - element count, can be zero
1595
// 16(rsp) - size_t ckoff (super_check_offset)
1596
// 20(rsp) - oop ckval (super_klass)
1599
// rax == 0 - success
1600
// rax == -1^K - failure, where K is partial transfer count
1602
// Generates the checkcast array copy stub: every non-null element is
// type-checked with generate_type_check before it is stored; null elements
// are stored without a check.
//
//   name               - stub name for the StubCodeMark
//   entry              - if non-null, receives the pc just after from, to
//                        and length are loaded into registers
//   dest_uninitialized - adds IS_DEST_UNINITIALIZED to the barrier decorators
//
// The emitted code sets rax to 0 when all elements are copied; on a failed
// element check it sets rax to the bitwise NOT of the number of elements
// already copied.
address generate_checkcast_copy(const char *name, address* entry, bool dest_uninitialized = false) {
1603
__ align(CodeEntryAlignment);
1604
StubCodeMark mark(this, "StubRoutines", name);
1605
address start = __ pc();
1607
Label L_load_element, L_store_element, L_do_card_marks, L_done;
1610
// rax, rdx, rcx -- loop control (end_from, end_to, count)
1611
// rdi, rsi -- element access (oop, klass)
1613
const Register from = rax; // source array address
1614
const Register to = rdx; // destination array address
1615
const Register length = rcx; // elements count
1616
const Register elem = rdi; // each oop copied
1617
const Register elem_klass = rsi; // each elem._klass (sub_klass)
1618
const Register temp = rbx; // lone remaining temp
1620
__ enter(); // required for proper stackwalking of RuntimeStub frame
1626
// Stack argument slots, relative to rsp as it stands inside the stub body.
Address from_arg(rsp, 16+ 4); // from
1627
Address to_arg(rsp, 16+ 8); // to
1628
Address length_arg(rsp, 16+12); // elements count
1629
Address ckoff_arg(rsp, 16+16); // super_check_offset
1630
Address ckval_arg(rsp, 16+20); // super_klass
1633
__ movptr(from, from_arg);
1634
__ movptr(to, to_arg);
1635
__ movl2ptr(length, length_arg);
1637
if (entry != nullptr) {
1638
*entry = __ pc(); // Entry point from generic arraycopy stub.
1639
BLOCK_COMMENT("Entry:");
1642
//---------------------------------------------------------------
1643
// Assembler stub will be used for this call to arraycopy
1644
// if the two arrays are subtypes of Object[] but the
1645
// destination array type is not equal to or a supertype
1646
// of the source type. Each element must be separately
1649
// Loop-invariant addresses. They are exclusive end pointers.
1650
Address end_from_addr(from, length, Address::times_ptr, 0);
1651
Address end_to_addr(to, length, Address::times_ptr, 0);
1653
Register end_from = from; // re-use
1654
Register end_to = to; // re-use
1655
Register count = length; // re-use
1657
// Loop-variant addresses. They assume post-incremented count < 0.
1658
Address from_element_addr(end_from, count, Address::times_ptr, 0);
1659
Address to_element_addr(end_to, count, Address::times_ptr, 0);
1660
Address elem_klass_addr(elem, oopDesc::klass_offset_in_bytes());
1662
DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
1663
if (dest_uninitialized) {
1664
decorators |= IS_DEST_UNINITIALIZED;
1667
BasicType type = T_OBJECT;
1668
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1669
bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1671
// Copy from low to high addresses, indexed from the end of each array.
1672
__ lea(end_from, end_from_addr);
1673
__ lea(end_to, end_to_addr);
1674
assert(length == count, ""); // else fix next line:
1675
__ negptr(count); // negate and test the length
1676
__ jccb(Assembler::notZero, L_load_element);
1678
// Empty array: Nothing to do.
1679
__ xorptr(rax, rax); // return 0 on (trivial) success
1682
// ======== begin loop ========
1683
// (Loop is rotated; its entry is L_load_element.)
1685
// for (count = -count; count != 0; count++)
1686
// Base pointers src, dst point just past the last element and are indexed
// by the negative count scaled by Address::times_ptr.
1687
__ align(OptoLoopAlignment);
1689
__ BIND(L_store_element);
1690
__ movptr(to_element_addr, elem); // store the oop
1691
__ increment(count); // increment the count toward zero
1692
__ jccb(Assembler::zero, L_do_card_marks);
1694
// ======== loop entry is here ========
1695
__ BIND(L_load_element);
1696
__ movptr(elem, from_element_addr); // load the oop
1697
__ testptr(elem, elem);
1698
// null elements need no type check
__ jccb(Assembler::zero, L_store_element);
1700
// (Could do a trick here: Remember last successful non-null
1701
// element stored and make a quick oop equality check on it.)
1703
__ movptr(elem_klass, elem_klass_addr); // query the object klass
1704
generate_type_check(elem_klass, ckoff_arg, ckval_arg, temp,
1705
&L_store_element, nullptr);
1706
// (On fall-through, we have failed the element type check.)
1707
// ======== end loop ========
1709
// It was a real error; we must depend on the caller to finish the job.
1710
// Register "count" = -1 * number of *remaining* oops, length_arg = *total* oops.
1711
// Emit GC store barriers for the oops we have copied (length_arg + count),
1712
// and report their number to the caller.
1713
assert_different_registers(to, count, rax);
1714
Label L_post_barrier;
1715
__ addl(count, length_arg); // transfers = (length - remaining)
1716
__ movl2ptr(rax, count); // save the value
1717
__ notptr(rax); // report (-1^K) to caller (does not affect flags)
1718
// flags still come from the addl above: non-zero means some oops were copied
__ jccb(Assembler::notZero, L_post_barrier);
1719
__ jmp(L_done); // K == 0, nothing was copied, skip post barrier
1721
// Come here on success only.
1722
__ BIND(L_do_card_marks);
1723
__ xorptr(rax, rax); // return 0 on success
1724
__ movl2ptr(count, length_arg);
1726
__ BIND(L_post_barrier);
1727
__ movptr(to, to_arg); // reload
1728
bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1730
// Common exit point (success or failure).
1735
inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1736
__ leave(); // required for proper stackwalking of RuntimeStub frame
1743
// Generate 'unsafe' array copy stub
1744
// Though just as safe as the other stubs, it takes an unscaled
1745
// size_t argument instead of an element count.
1748
// 4(rsp) - source array address
1749
// 8(rsp) - destination array address
1750
// 12(rsp) - byte count, can be zero
1753
// rax == 0 - success
1754
// rax == -1 - need to call System.arraycopy
1756
// Examines the alignment of the operands and dispatches
1757
// to a long, int, short, or byte copy loop.
1759
// Generates the 'unsafe' copy dispatcher. The emitted code tests the low
// bits collected in `bits` and jumps to the widest copy routine they allow:
// long, int, short, and finally byte. Before each jump the byte count is
// shifted down to an element count and written back to the count stack slot.
//
//   name             - stub name for the StubCodeMark
//   byte_copy_entry  - target when the short-alignment test fails
//   short_copy_entry - target for 2-byte aligned operands
//   int_copy_entry   - target for 4-byte aligned operands
//   long_copy_entry  - target for 8-byte aligned operands
address generate_unsafe_copy(const char *name,
1760
address byte_copy_entry,
1761
address short_copy_entry,
1762
address int_copy_entry,
1763
address long_copy_entry) {
1765
Label L_long_aligned, L_int_aligned, L_short_aligned;
1767
__ align(CodeEntryAlignment);
1768
StubCodeMark mark(this, "StubRoutines", name);
1769
address start = __ pc();
1771
const Register from = rax; // source array address
1772
const Register to = rdx; // destination array address
1773
const Register count = rcx; // elements count
1775
__ enter(); // required for proper stackwalking of RuntimeStub frame
1778
Address from_arg(rsp, 12+ 4); // from
1779
Address to_arg(rsp, 12+ 8); // to
1780
Address count_arg(rsp, 12+12); // byte count
1783
__ movptr(from , from_arg);
1784
__ movptr(to , to_arg);
1785
__ movl2ptr(count, count_arg);
1787
// bump this on entry, not on exit:
1788
inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1790
const Register bits = rsi;
1793
// fold the byte count into the alignment bits
__ orptr(bits, count);
1795
__ testl(bits, BytesPerLong-1);
1796
__ jccb(Assembler::zero, L_long_aligned);
1798
__ testl(bits, BytesPerInt-1);
1799
__ jccb(Assembler::zero, L_int_aligned);
1801
__ testl(bits, BytesPerShort-1);
1802
// not even short-aligned: hand over to the byte copy with the count unchanged
__ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
1804
__ BIND(L_short_aligned);
1805
__ shrptr(count, LogBytesPerShort); // size => short_count
1806
__ movl(count_arg, count); // update 'count'
1807
__ jump(RuntimeAddress(short_copy_entry));
1809
__ BIND(L_int_aligned);
1810
__ shrptr(count, LogBytesPerInt); // size => int_count
1811
__ movl(count_arg, count); // update 'count'
1812
__ jump(RuntimeAddress(int_copy_entry));
1814
__ BIND(L_long_aligned);
1815
__ shrptr(count, LogBytesPerLong); // size => qword_count
1816
__ movl(count_arg, count); // update 'count'
1817
__ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
1819
__ jump(RuntimeAddress(long_copy_entry));
1825
// Perform range checks on the proposed arraycopy.
1826
// Smashes src_pos and dst_pos. (Uses them up for temps.)
1827
// Emits the two arraycopy bounds checks. src_pos and dst_pos are overwritten
// with (pos + length) and compared, unsigned, against the length field of
// the respective array; either check failing branches to L_failed.
void arraycopy_range_checks(Register src,
1833
BLOCK_COMMENT("arraycopy_range_checks:");
1834
const Register src_end = src_pos; // source array end position
1835
const Register dst_end = dst_pos; // destination array end position
1836
__ addl(src_end, length); // src_pos + length
1837
__ addl(dst_end, length); // dst_pos + length
1839
// if (src_pos + length > arrayOop(src)->length() ) FAIL;
1840
__ cmpl(src_end, Address(src, arrayOopDesc::length_offset_in_bytes()));
1841
// 'above' is the unsigned greater-than condition
__ jcc(Assembler::above, L_failed);
1843
// if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
1844
__ cmpl(dst_end, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1845
__ jcc(Assembler::above, L_failed);
1847
BLOCK_COMMENT("arraycopy_range_checks done");
1852
// Generate generic array copy stubs
1857
// 12(rsp) - dst oop
1858
// 16(rsp) - dst_pos
1859
// 20(rsp) - element count
1862
// rax, == 0 - success
1863
// rax, == -1^K - failure, where K is partial transfer count
1865
address generate_generic_copy(const char *name,
1866
address entry_jbyte_arraycopy,
1867
address entry_jshort_arraycopy,
1868
address entry_jint_arraycopy,
1869
address entry_oop_arraycopy,
1870
address entry_jlong_arraycopy,
1871
address entry_checkcast_arraycopy) {
1872
Label L_failed, L_failed_0, L_objArray;
1874
{ int modulus = CodeEntryAlignment;
1875
int target = modulus - 5; // 5 = sizeof jmp(L_failed)
1876
int advance = target - (__ offset() % modulus);
1877
if (advance < 0) advance += modulus;
1878
if (advance > 0) __ nop(advance);
1880
StubCodeMark mark(this, "StubRoutines", name);
1882
// Short-hop target to L_failed. Makes for denser prologue code.
1883
__ BIND(L_failed_0);
1885
assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
1887
__ align(CodeEntryAlignment);
1888
address start = __ pc();
1890
__ enter(); // required for proper stackwalking of RuntimeStub frame
1894
// bump this on entry, not on exit:
1895
inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1898
Address SRC (rsp, 12+ 4);
1899
Address SRC_POS (rsp, 12+ 8);
1900
Address DST (rsp, 12+12);
1901
Address DST_POS (rsp, 12+16);
1902
Address LENGTH (rsp, 12+20);
1904
//-----------------------------------------------------------------------
1905
// Assembler stub will be used for this call to arraycopy
1906
// if the following conditions are met:
1908
// (1) src and dst must not be null.
1909
// (2) src_pos must not be negative.
1910
// (3) dst_pos must not be negative.
1911
// (4) length must not be negative.
1912
// (5) src klass and dst klass should be the same and not null.
1913
// (6) src and dst should be arrays.
1914
// (7) src_pos + length must not exceed length of src.
1915
// (8) dst_pos + length must not exceed length of dst.
1918
const Register src = rax; // source array oop
1919
const Register src_pos = rsi;
1920
const Register dst = rdx; // destination array oop
1921
const Register dst_pos = rdi;
1922
const Register length = rcx; // transfer count
1924
// if (src == null) return -1;
1925
__ movptr(src, SRC); // src oop
1926
__ testptr(src, src);
1927
__ jccb(Assembler::zero, L_failed_0);
1929
// if (src_pos < 0) return -1;
1930
__ movl2ptr(src_pos, SRC_POS); // src_pos
1931
__ testl(src_pos, src_pos);
1932
__ jccb(Assembler::negative, L_failed_0);
1934
// if (dst == nullptr) return -1;
1935
__ movptr(dst, DST); // dst oop
1936
__ testptr(dst, dst);
1937
__ jccb(Assembler::zero, L_failed_0);
1939
// if (dst_pos < 0) return -1;
1940
__ movl2ptr(dst_pos, DST_POS); // dst_pos
1941
__ testl(dst_pos, dst_pos);
1942
__ jccb(Assembler::negative, L_failed_0);
1944
// if (length < 0) return -1;
1945
__ movl2ptr(length, LENGTH); // length
1946
__ testl(length, length);
1947
__ jccb(Assembler::negative, L_failed_0);
1949
// if (src->klass() == nullptr) return -1;
1950
Address src_klass_addr(src, oopDesc::klass_offset_in_bytes());
1951
Address dst_klass_addr(dst, oopDesc::klass_offset_in_bytes());
1952
const Register rcx_src_klass = rcx; // array klass
1953
__ movptr(rcx_src_klass, Address(src, oopDesc::klass_offset_in_bytes()));
1956
// assert(src->klass() != nullptr);
1957
BLOCK_COMMENT("assert klasses not null");
1959
__ testptr(rcx_src_klass, rcx_src_klass);
1960
__ jccb(Assembler::notZero, L2); // it is broken if klass is null
1962
__ stop("broken null klass");
1964
__ cmpptr(dst_klass_addr, NULL_WORD);
1965
__ jccb(Assembler::equal, L1); // this would be broken also
1966
BLOCK_COMMENT("assert done");
1970
// Load layout helper (32-bits)
1972
// |array_tag| | header_size | element_type | |log2_element_size|
1973
// 32 30 24 16 8 2 0
1975
// array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1978
int lh_offset = in_bytes(Klass::layout_helper_offset());
1979
Address src_klass_lh_addr(rcx_src_klass, lh_offset);
1981
// Handle objArrays completely differently...
1982
jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1983
__ cmpl(src_klass_lh_addr, objArray_lh);
1984
__ jcc(Assembler::equal, L_objArray);
1986
// if (src->klass() != dst->klass()) return -1;
1987
__ cmpptr(rcx_src_klass, dst_klass_addr);
1988
__ jccb(Assembler::notEqual, L_failed_0);
1990
const Register rcx_lh = rcx; // layout helper
1991
assert(rcx_lh == rcx_src_klass, "known alias");
1992
__ movl(rcx_lh, src_klass_lh_addr);
1994
// if (!src->is_Array()) return -1;
1995
__ cmpl(rcx_lh, Klass::_lh_neutral_value);
1996
__ jcc(Assembler::greaterEqual, L_failed_0); // signed cmp
1998
// At this point, it is known to be a typeArray (array_tag 0x3).
2001
__ cmpl(rcx_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
2002
__ jcc(Assembler::greaterEqual, L); // signed cmp
2003
__ stop("must be a primitive array");
2008
assert_different_registers(src, src_pos, dst, dst_pos, rcx_lh);
2009
arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
2013
// src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2014
// dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2016
const Register rsi_offset = rsi; // array offset
2017
const Register src_array = src; // src array offset
2018
const Register dst_array = dst; // dst array offset
2019
const Register rdi_elsize = rdi; // log2 element size
2021
__ mov(rsi_offset, rcx_lh);
2022
__ shrptr(rsi_offset, Klass::_lh_header_size_shift);
2023
__ andptr(rsi_offset, Klass::_lh_header_size_mask); // array_offset
2024
__ addptr(src_array, rsi_offset); // src array offset
2025
__ addptr(dst_array, rsi_offset); // dst array offset
2026
__ andptr(rcx_lh, Klass::_lh_log2_element_size_mask); // log2 elsize
2028
// next registers should be set before the jump to corresponding stub
2029
const Register from = src; // source array address
2030
const Register to = dst; // destination array address
2031
const Register count = rcx; // elements count
2032
// some of them should be duplicated on stack
2033
#define FROM Address(rsp, 12+ 4)
2034
#define TO Address(rsp, 12+ 8) // Not used now
2035
#define COUNT Address(rsp, 12+12) // Only for oop arraycopy
2037
BLOCK_COMMENT("scale indexes to element size");
2038
__ movl2ptr(rsi, SRC_POS); // src_pos
2039
__ shlptr(rsi); // src_pos << rcx (log2 elsize)
2040
assert(src_array == from, "");
2041
__ addptr(from, rsi); // from = src_array + SRC_POS << log2 elsize
2042
__ movl2ptr(rdi, DST_POS); // dst_pos
2043
__ shlptr(rdi); // dst_pos << rcx (log2 elsize)
2044
assert(dst_array == to, "");
2045
__ addptr(to, rdi); // to = dst_array + DST_POS << log2 elsize
2046
__ movptr(FROM, from); // src_addr
2047
__ mov(rdi_elsize, rcx_lh); // log2 elsize
2048
__ movl2ptr(count, LENGTH); // elements count
2050
BLOCK_COMMENT("choose copy loop based on element size");
2051
__ cmpl(rdi_elsize, 0);
2053
__ jump_cc(Assembler::equal, RuntimeAddress(entry_jbyte_arraycopy));
2054
__ cmpl(rdi_elsize, LogBytesPerShort);
2055
__ jump_cc(Assembler::equal, RuntimeAddress(entry_jshort_arraycopy));
2056
__ cmpl(rdi_elsize, LogBytesPerInt);
2057
__ jump_cc(Assembler::equal, RuntimeAddress(entry_jint_arraycopy));
2059
__ cmpl(rdi_elsize, LogBytesPerLong);
2060
__ jccb(Assembler::notEqual, L_failed);
2062
__ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
2064
__ jump(RuntimeAddress(entry_jlong_arraycopy));
2067
__ xorptr(rax, rax);
2068
__ notptr(rax); // return -1
2071
__ leave(); // required for proper stackwalking of RuntimeStub frame
2075
__ BIND(L_objArray);
2076
// live at this point: rcx_src_klass, src[_pos], dst[_pos]
2078
Label L_plain_copy, L_checkcast_copy;
2079
// test array classes for subtyping
2080
__ cmpptr(rcx_src_klass, dst_klass_addr); // usual case is exact equality
2081
__ jccb(Assembler::notEqual, L_checkcast_copy);
2083
// Identically typed arrays can be copied without element-wise checks.
2084
assert_different_registers(src, src_pos, dst, dst_pos, rcx_src_klass);
2085
arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
2087
__ BIND(L_plain_copy);
2088
__ movl2ptr(count, LENGTH); // elements count
2089
__ movl2ptr(src_pos, SRC_POS); // reload src_pos
2090
__ lea(from, Address(src, src_pos, Address::times_ptr,
2091
arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
2092
__ movl2ptr(dst_pos, DST_POS); // reload dst_pos
2093
__ lea(to, Address(dst, dst_pos, Address::times_ptr,
2094
arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
2095
__ movptr(FROM, from); // src_addr
2096
__ movptr(TO, to); // dst_addr
2097
__ movl(COUNT, count); // count
2098
__ jump(RuntimeAddress(entry_oop_arraycopy));
2100
__ BIND(L_checkcast_copy);
2101
// live at this point: rcx_src_klass, dst[_pos], src[_pos]
2104
int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2105
int sco_offset = in_bytes(Klass::super_check_offset_offset());
2107
Register rsi_dst_klass = rsi;
2108
Register rdi_temp = rdi;
2109
assert(rsi_dst_klass == src_pos, "expected alias w/ src_pos");
2110
assert(rdi_temp == dst_pos, "expected alias w/ dst_pos");
2111
Address dst_klass_lh_addr(rsi_dst_klass, lh_offset);
2113
// Before looking at dst.length, make sure dst is also an objArray.
2114
__ movptr(rsi_dst_klass, dst_klass_addr);
2115
__ cmpl(dst_klass_lh_addr, objArray_lh);
2116
__ jccb(Assembler::notEqual, L_failed);
2118
// It is safe to examine both src.length and dst.length.
2119
__ movl2ptr(src_pos, SRC_POS); // reload rsi
2120
arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
2121
// (Now src_pos and dst_pos are killed, but not src and dst.)
2123
// We'll need this temp (don't forget to pop it after the type check).
2125
Register rbx_src_klass = rbx;
2127
__ mov(rbx_src_klass, rcx_src_klass); // spill away from rcx
2128
__ movptr(rsi_dst_klass, dst_klass_addr);
2129
Address super_check_offset_addr(rsi_dst_klass, sco_offset);
2130
Label L_fail_array_check;
2131
generate_type_check(rbx_src_klass,
2132
super_check_offset_addr, dst_klass_addr,
2133
rdi_temp, nullptr, &L_fail_array_check);
2134
// (On fall-through, we have passed the array type check.)
2136
__ jmp(L_plain_copy);
2138
__ BIND(L_fail_array_check);
2139
// Reshuffle arguments so we can call checkcast_arraycopy:
2141
// match initial saves for checkcast_arraycopy
2142
// push(rsi); // already done; see above
2143
// push(rdi); // already done; see above
2144
// push(rbx); // already done; see above
2146
// Marshal outgoing arguments now, freeing registers.
2147
Address from_arg(rsp, 16+ 4); // from
2148
Address to_arg(rsp, 16+ 8); // to
2149
Address length_arg(rsp, 16+12); // elements count
2150
Address ckoff_arg(rsp, 16+16); // super_check_offset
2151
Address ckval_arg(rsp, 16+20); // super_klass
2153
Address SRC_POS_arg(rsp, 16+ 8);
2154
Address DST_POS_arg(rsp, 16+16);
2155
Address LENGTH_arg(rsp, 16+20);
2156
// push rbx, changed the incoming offsets (why not just use rbp,??)
2157
// assert(SRC_POS_arg.disp() == SRC_POS.disp() + 4, "");
2159
__ movptr(rbx, Address(rsi_dst_klass, ek_offset));
2160
__ movl2ptr(length, LENGTH_arg); // reload elements count
2161
__ movl2ptr(src_pos, SRC_POS_arg); // reload src_pos
2162
__ movl2ptr(dst_pos, DST_POS_arg); // reload dst_pos
2164
__ movptr(ckval_arg, rbx); // destination element type
2165
__ movl(rbx, Address(rbx, sco_offset));
2166
__ movl(ckoff_arg, rbx); // corresponding class check offset
2168
__ movl(length_arg, length); // outgoing length argument
2170
__ lea(from, Address(src, src_pos, Address::times_ptr,
2171
arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2172
__ movptr(from_arg, from);
2174
__ lea(to, Address(dst, dst_pos, Address::times_ptr,
2175
arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2176
__ movptr(to_arg, to);
2177
__ jump(RuntimeAddress(entry_checkcast_arraycopy));
2183
void generate_arraycopy_stubs() {
2185
address entry_jbyte_arraycopy;
2186
address entry_jshort_arraycopy;
2187
address entry_jint_arraycopy;
2188
address entry_oop_arraycopy;
2189
address entry_jlong_arraycopy;
2190
address entry_checkcast_arraycopy;
2192
StubRoutines::_arrayof_jbyte_disjoint_arraycopy =
2193
generate_disjoint_copy(T_BYTE, true, Address::times_1, &entry,
2194
"arrayof_jbyte_disjoint_arraycopy");
2195
StubRoutines::_arrayof_jbyte_arraycopy =
2196
generate_conjoint_copy(T_BYTE, true, Address::times_1, entry,
2197
nullptr, "arrayof_jbyte_arraycopy");
2198
StubRoutines::_jbyte_disjoint_arraycopy =
2199
generate_disjoint_copy(T_BYTE, false, Address::times_1, &entry,
2200
"jbyte_disjoint_arraycopy");
2201
StubRoutines::_jbyte_arraycopy =
2202
generate_conjoint_copy(T_BYTE, false, Address::times_1, entry,
2203
&entry_jbyte_arraycopy, "jbyte_arraycopy");
2205
StubRoutines::_arrayof_jshort_disjoint_arraycopy =
2206
generate_disjoint_copy(T_SHORT, true, Address::times_2, &entry,
2207
"arrayof_jshort_disjoint_arraycopy");
2208
StubRoutines::_arrayof_jshort_arraycopy =
2209
generate_conjoint_copy(T_SHORT, true, Address::times_2, entry,
2210
nullptr, "arrayof_jshort_arraycopy");
2211
StubRoutines::_jshort_disjoint_arraycopy =
2212
generate_disjoint_copy(T_SHORT, false, Address::times_2, &entry,
2213
"jshort_disjoint_arraycopy");
2214
StubRoutines::_jshort_arraycopy =
2215
generate_conjoint_copy(T_SHORT, false, Address::times_2, entry,
2216
&entry_jshort_arraycopy, "jshort_arraycopy");
2218
// Next arrays are always aligned on 4 bytes at least.
2219
StubRoutines::_jint_disjoint_arraycopy =
2220
generate_disjoint_copy(T_INT, true, Address::times_4, &entry,
2221
"jint_disjoint_arraycopy");
2222
StubRoutines::_jint_arraycopy =
2223
generate_conjoint_copy(T_INT, true, Address::times_4, entry,
2224
&entry_jint_arraycopy, "jint_arraycopy");
2226
StubRoutines::_oop_disjoint_arraycopy =
2227
generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
2228
"oop_disjoint_arraycopy");
2229
StubRoutines::_oop_arraycopy =
2230
generate_conjoint_copy(T_OBJECT, true, Address::times_ptr, entry,
2231
&entry_oop_arraycopy, "oop_arraycopy");
2233
StubRoutines::_oop_disjoint_arraycopy_uninit =
2234
generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
2235
"oop_disjoint_arraycopy_uninit",
2236
/*dest_uninitialized*/true);
2237
StubRoutines::_oop_arraycopy_uninit =
2238
generate_conjoint_copy(T_OBJECT, true, Address::times_ptr, entry,
2239
nullptr, "oop_arraycopy_uninit",
2240
/*dest_uninitialized*/true);
2242
StubRoutines::_jlong_disjoint_arraycopy =
2243
generate_disjoint_long_copy(&entry, "jlong_disjoint_arraycopy");
2244
StubRoutines::_jlong_arraycopy =
2245
generate_conjoint_long_copy(entry, &entry_jlong_arraycopy,
2248
StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2249
StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2250
StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2251
StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2252
StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2253
StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2255
StubRoutines::_arrayof_jint_disjoint_arraycopy = StubRoutines::_jint_disjoint_arraycopy;
2256
StubRoutines::_arrayof_oop_disjoint_arraycopy = StubRoutines::_oop_disjoint_arraycopy;
2257
StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
2258
StubRoutines::_arrayof_jlong_disjoint_arraycopy = StubRoutines::_jlong_disjoint_arraycopy;
2260
StubRoutines::_arrayof_jint_arraycopy = StubRoutines::_jint_arraycopy;
2261
StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy;
2262
StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit;
2263
StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy;
2265
StubRoutines::_checkcast_arraycopy =
2266
generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2267
StubRoutines::_checkcast_arraycopy_uninit =
2268
generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr, /*dest_uninitialized*/true);
2270
StubRoutines::_unsafe_arraycopy =
2271
generate_unsafe_copy("unsafe_arraycopy",
2272
entry_jbyte_arraycopy,
2273
entry_jshort_arraycopy,
2274
entry_jint_arraycopy,
2275
entry_jlong_arraycopy);
2277
StubRoutines::_generic_arraycopy =
2278
generate_generic_copy("generic_arraycopy",
2279
entry_jbyte_arraycopy,
2280
entry_jshort_arraycopy,
2281
entry_jint_arraycopy,
2282
entry_oop_arraycopy,
2283
entry_jlong_arraycopy,
2284
entry_checkcast_arraycopy);
// AES intrinsic stubs

// Size in bytes of one AES block; the CBC stubs advance their position and
// decrement the remaining length by this amount per block.
enum {AESBlockSize = 16};
2290
address key_shuffle_mask_addr() {
2291
return (address)KEY_SHUFFLE_MASK;
2294
address counter_shuffle_mask_addr() {
2295
return (address)COUNTER_SHUFFLE_MASK;
2298
// Utility routine for loading a 128-bit key word in little endian format
2299
// can optionally specify that the shuffle mask is already in an xmmregister
2300
void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = xnoreg) {
2301
__ movdqu(xmmdst, Address(key, offset));
2302
if (xmm_shuf_mask != xnoreg) {
2303
__ pshufb(xmmdst, xmm_shuf_mask);
2305
__ pshufb(xmmdst, ExternalAddress(key_shuffle_mask_addr()));
2309
// aesenc using specified key+offset
2310
// can optionally specify that the shuffle mask is already in an xmmregister
2311
void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask = xnoreg) {
2312
load_key(xmmtmp, key, offset, xmm_shuf_mask);
2313
__ aesenc(xmmdst, xmmtmp);
2316
// aesdec using specified key+offset
2317
// can optionally specify that the shuffle mask is already in an xmmregister
2318
void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask = xnoreg) {
2319
load_key(xmmtmp, key, offset, xmm_shuf_mask);
2320
__ aesdec(xmmdst, xmmtmp);
2323
// Utility routine for increase 128bit counter (iv in CTR mode)
2324
// XMM_128bit, D3, D2, D1, D0
2325
void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
2326
__ pextrd(reg, xmmdst, 0x0);
2327
__ addl(reg, inc_delta);
2328
__ pinsrd(xmmdst, reg, 0x0);
2329
__ jcc(Assembler::carryClear, next_block); // jump if no carry
2331
__ pextrd(reg, xmmdst, 0x01); // Carry-> D1
2333
__ pinsrd(xmmdst, reg, 0x01);
2334
__ jcc(Assembler::carryClear, next_block); // jump if no carry
2336
__ pextrd(reg, xmmdst, 0x02); // Carry-> D2
2338
__ pinsrd(xmmdst, reg, 0x02);
2339
__ jcc(Assembler::carryClear, next_block); // jump if no carry
2341
__ pextrd(reg, xmmdst, 0x03); // Carry -> D3
2343
__ pinsrd(xmmdst, reg, 0x03);
2345
__ BIND(next_block); // next instruction
2352
// c_rarg0 - source byte array address
2353
// c_rarg1 - destination byte array address
2354
// c_rarg2 - K (key) in little endian int array
2356
address generate_aescrypt_encryptBlock() {
2357
assert(UseAES, "need AES instructions and misaligned SSE support");
2358
__ align(CodeEntryAlignment);
2359
StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2361
address start = __ pc();
2363
const Register from = rdx; // source array address
2364
const Register to = rdx; // destination array address
2365
const Register key = rcx; // key array address
2366
const Register keylen = rax;
2367
const Address from_param(rbp, 8+0);
2368
const Address to_param (rbp, 8+4);
2369
const Address key_param (rbp, 8+8);
2371
const XMMRegister xmm_result = xmm0;
2372
const XMMRegister xmm_key_shuf_mask = xmm1;
2373
const XMMRegister xmm_temp1 = xmm2;
2374
const XMMRegister xmm_temp2 = xmm3;
2375
const XMMRegister xmm_temp3 = xmm4;
2376
const XMMRegister xmm_temp4 = xmm5;
2378
__ enter(); // required for proper stackwalking of RuntimeStub frame
2380
__ movptr(from, from_param);
2381
__ movptr(key, key_param);
2383
// keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
2384
__ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2386
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()));
2387
__ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input
2388
__ movptr(to, to_param);
2390
// For encryption, the java expanded key ordering is just what we need
2392
load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
2393
__ pxor(xmm_result, xmm_temp1);
2395
load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
2396
load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
2397
load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
2398
load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
2400
__ aesenc(xmm_result, xmm_temp1);
2401
__ aesenc(xmm_result, xmm_temp2);
2402
__ aesenc(xmm_result, xmm_temp3);
2403
__ aesenc(xmm_result, xmm_temp4);
2405
load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
2406
load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
2407
load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
2408
load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
2410
__ aesenc(xmm_result, xmm_temp1);
2411
__ aesenc(xmm_result, xmm_temp2);
2412
__ aesenc(xmm_result, xmm_temp3);
2413
__ aesenc(xmm_result, xmm_temp4);
2415
load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
2416
load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
2418
__ cmpl(keylen, 44);
2419
__ jccb(Assembler::equal, L_doLast);
2421
__ aesenc(xmm_result, xmm_temp1);
2422
__ aesenc(xmm_result, xmm_temp2);
2424
load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
2425
load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
2427
__ cmpl(keylen, 52);
2428
__ jccb(Assembler::equal, L_doLast);
2430
__ aesenc(xmm_result, xmm_temp1);
2431
__ aesenc(xmm_result, xmm_temp2);
2433
load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
2434
load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
2437
__ aesenc(xmm_result, xmm_temp1);
2438
__ aesenclast(xmm_result, xmm_temp2);
2439
__ movdqu(Address(to, 0), xmm_result); // store the result
2440
__ xorptr(rax, rax); // return 0
2441
__ leave(); // required for proper stackwalking of RuntimeStub frame
2451
// c_rarg0 - source byte array address
2452
// c_rarg1 - destination byte array address
2453
// c_rarg2 - K (key) in little endian int array
2455
address generate_aescrypt_decryptBlock() {
2456
assert(UseAES, "need AES instructions and misaligned SSE support");
2457
__ align(CodeEntryAlignment);
2458
StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2460
address start = __ pc();
2462
const Register from = rdx; // source array address
2463
const Register to = rdx; // destination array address
2464
const Register key = rcx; // key array address
2465
const Register keylen = rax;
2466
const Address from_param(rbp, 8+0);
2467
const Address to_param (rbp, 8+4);
2468
const Address key_param (rbp, 8+8);
2470
const XMMRegister xmm_result = xmm0;
2471
const XMMRegister xmm_key_shuf_mask = xmm1;
2472
const XMMRegister xmm_temp1 = xmm2;
2473
const XMMRegister xmm_temp2 = xmm3;
2474
const XMMRegister xmm_temp3 = xmm4;
2475
const XMMRegister xmm_temp4 = xmm5;
2477
__ enter(); // required for proper stackwalking of RuntimeStub frame
2479
__ movptr(from, from_param);
2480
__ movptr(key, key_param);
2482
// keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
2483
__ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2485
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()));
2486
__ movdqu(xmm_result, Address(from, 0));
2487
__ movptr(to, to_param);
2489
// for decryption java expanded key ordering is rotated one position from what we want
2490
// so we start from 0x10 here and hit 0x00 last
2491
// we don't know if the key is aligned, hence not using load-execute form
2492
load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
2493
load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
2494
load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
2495
load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
2497
__ pxor (xmm_result, xmm_temp1);
2498
__ aesdec(xmm_result, xmm_temp2);
2499
__ aesdec(xmm_result, xmm_temp3);
2500
__ aesdec(xmm_result, xmm_temp4);
2502
load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
2503
load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
2504
load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
2505
load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
2507
__ aesdec(xmm_result, xmm_temp1);
2508
__ aesdec(xmm_result, xmm_temp2);
2509
__ aesdec(xmm_result, xmm_temp3);
2510
__ aesdec(xmm_result, xmm_temp4);
2512
load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
2513
load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
2514
load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
2516
__ cmpl(keylen, 44);
2517
__ jccb(Assembler::equal, L_doLast);
2519
__ aesdec(xmm_result, xmm_temp1);
2520
__ aesdec(xmm_result, xmm_temp2);
2522
load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
2523
load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
2525
__ cmpl(keylen, 52);
2526
__ jccb(Assembler::equal, L_doLast);
2528
__ aesdec(xmm_result, xmm_temp1);
2529
__ aesdec(xmm_result, xmm_temp2);
2531
load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
2532
load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
2535
__ aesdec(xmm_result, xmm_temp1);
2536
__ aesdec(xmm_result, xmm_temp2);
2538
// for decryption the aesdeclast operation is always on key+0x00
2539
__ aesdeclast(xmm_result, xmm_temp3);
2540
__ movdqu(Address(to, 0), xmm_result); // store the result
2541
__ xorptr(rax, rax); // return 0
2542
__ leave(); // required for proper stackwalking of RuntimeStub frame
2548
void handleSOERegisters(bool saving) {
2549
const int saveFrameSizeInBytes = 4 * wordSize;
2550
const Address saved_rbx (rbp, -3 * wordSize);
2551
const Address saved_rsi (rbp, -2 * wordSize);
2552
const Address saved_rdi (rbp, -1 * wordSize);
2555
__ subptr(rsp, saveFrameSizeInBytes);
2556
__ movptr(saved_rsi, rsi);
2557
__ movptr(saved_rdi, rdi);
2558
__ movptr(saved_rbx, rbx);
2561
__ movptr(rsi, saved_rsi);
2562
__ movptr(rdi, saved_rdi);
2563
__ movptr(rbx, saved_rbx);
2570
// c_rarg0 - source byte array address
2571
// c_rarg1 - destination byte array address
2572
// c_rarg2 - K (key) in little endian int array
2573
// c_rarg3 - r vector byte array address
2574
// c_rarg4 - input length
2577
// rax - input length
2579
address generate_cipherBlockChaining_encryptAESCrypt() {
2580
assert(UseAES, "need AES instructions and misaligned SSE support");
2581
__ align(CodeEntryAlignment);
2582
StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2583
address start = __ pc();
2585
Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
2586
const Register from = rsi; // source array address
2587
const Register to = rdx; // destination array address
2588
const Register key = rcx; // key array address
2589
const Register rvec = rdi; // r byte array initialized from initvector array address
2590
// and left with the results of the last encryption block
2591
const Register len_reg = rbx; // src len (must be multiple of blocksize 16)
2592
const Register pos = rax;
2594
// xmm register assignments for the loops below
2595
const XMMRegister xmm_result = xmm0;
2596
const XMMRegister xmm_temp = xmm1;
2597
// first 6 keys preloaded into xmm2-xmm7
2598
const int XMM_REG_NUM_KEY_FIRST = 2;
2599
const int XMM_REG_NUM_KEY_LAST = 7;
2600
const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
2602
__ enter(); // required for proper stackwalking of RuntimeStub frame
2603
handleSOERegisters(true /*saving*/);
2605
// load registers from incoming parameters
2606
const Address from_param(rbp, 8+0);
2607
const Address to_param (rbp, 8+4);
2608
const Address key_param (rbp, 8+8);
2609
const Address rvec_param (rbp, 8+12);
2610
const Address len_param (rbp, 8+16);
2611
__ movptr(from , from_param);
2612
__ movptr(to , to_param);
2613
__ movptr(key , key_param);
2614
__ movptr(rvec , rvec_param);
2615
__ movptr(len_reg , len_param);
2617
const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front
2618
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()));
2619
// load up xmm regs 2 thru 7 with keys 0-5
2620
for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
2621
load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
2625
__ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec
2627
// now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
2628
__ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2630
__ jcc(Assembler::notEqual, L_key_192_256);
2632
// 128 bit code follows here
2634
__ align(OptoLoopAlignment);
2635
__ BIND(L_loopTop_128);
2636
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
2637
__ pxor (xmm_result, xmm_temp); // xor with the current r vector
2639
__ pxor (xmm_result, xmm_key0); // do the aes rounds
2640
for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
2641
__ aesenc(xmm_result, as_XMMRegister(rnum));
2643
for (int key_offset = 0x60; key_offset <= 0x90; key_offset += 0x10) {
2644
aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2646
load_key(xmm_temp, key, 0xa0);
2647
__ aesenclast(xmm_result, xmm_temp);
2649
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
2650
// no need to store r to memory until we exit
2651
__ addptr(pos, AESBlockSize);
2652
__ subptr(len_reg, AESBlockSize);
2653
__ jcc(Assembler::notEqual, L_loopTop_128);
2656
__ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object
2658
handleSOERegisters(false /*restoring*/);
2659
__ movptr(rax, len_param); // return length
2660
__ leave(); // required for proper stackwalking of RuntimeStub frame
2663
__ BIND(L_key_192_256);
2664
// here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
2666
__ jcc(Assembler::notEqual, L_key_256);
2668
// 192-bit code follows here (could be changed to use more xmm registers)
2670
__ align(OptoLoopAlignment);
2671
__ BIND(L_loopTop_192);
2672
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
2673
__ pxor (xmm_result, xmm_temp); // xor with the current r vector
2675
__ pxor (xmm_result, xmm_key0); // do the aes rounds
2676
for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
2677
__ aesenc(xmm_result, as_XMMRegister(rnum));
2679
for (int key_offset = 0x60; key_offset <= 0xb0; key_offset += 0x10) {
2680
aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2682
load_key(xmm_temp, key, 0xc0);
2683
__ aesenclast(xmm_result, xmm_temp);
2685
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
2686
// no need to store r to memory until we exit
2687
__ addptr(pos, AESBlockSize);
2688
__ subptr(len_reg, AESBlockSize);
2689
__ jcc(Assembler::notEqual, L_loopTop_192);
2693
// 256-bit code follows here (could be changed to use more xmm registers)
2695
__ align(OptoLoopAlignment);
2696
__ BIND(L_loopTop_256);
2697
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
2698
__ pxor (xmm_result, xmm_temp); // xor with the current r vector
2700
__ pxor (xmm_result, xmm_key0); // do the aes rounds
2701
for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
2702
__ aesenc(xmm_result, as_XMMRegister(rnum));
2704
for (int key_offset = 0x60; key_offset <= 0xd0; key_offset += 0x10) {
2705
aes_enc_key(xmm_result, xmm_temp, key, key_offset);
2707
load_key(xmm_temp, key, 0xe0);
2708
__ aesenclast(xmm_result, xmm_temp);
2710
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
2711
// no need to store r to memory until we exit
2712
__ addptr(pos, AESBlockSize);
2713
__ subptr(len_reg, AESBlockSize);
2714
__ jcc(Assembler::notEqual, L_loopTop_256);
2721
// CBC AES Decryption.
2722
// In 32-bit stub, because of lack of registers we do not try to parallelize 4 blocks at a time.
2727
// c_rarg0 - source byte array address
2728
// c_rarg1 - destination byte array address
2729
// c_rarg2 - K (key) in little endian int array
2730
// c_rarg3 - r vector byte array address
2731
// c_rarg4 - input length
2734
// rax - input length
2737
address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
2738
assert(UseAES, "need AES instructions and misaligned SSE support");
2739
__ align(CodeEntryAlignment);
2740
StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2741
address start = __ pc();
2743
const Register from = rsi; // source array address
2744
const Register to = rdx; // destination array address
2745
const Register key = rcx; // key array address
2746
const Register rvec = rdi; // r byte array initialized from initvector array address
2747
// and left with the results of the last encryption block
2748
const Register len_reg = rbx; // src len (must be multiple of blocksize 16)
2749
const Register pos = rax;
2751
const int PARALLEL_FACTOR = 4;
2752
const int ROUNDS[3] = { 10, 12, 14 }; //aes rounds for key128, key192, key256
2755
Label L_singleBlock_loopTop[3]; //128, 192, 256
2756
Label L_multiBlock_loopTop[3]; //128, 192, 256
2758
const XMMRegister xmm_prev_block_cipher = xmm0; // holds cipher of previous block
2759
const XMMRegister xmm_key_shuf_mask = xmm1;
2761
const XMMRegister xmm_key_tmp0 = xmm2;
2762
const XMMRegister xmm_key_tmp1 = xmm3;
2764
// registers holding the six results in the parallelized loop
2765
const XMMRegister xmm_result0 = xmm4;
2766
const XMMRegister xmm_result1 = xmm5;
2767
const XMMRegister xmm_result2 = xmm6;
2768
const XMMRegister xmm_result3 = xmm7;
2770
__ enter(); // required for proper stackwalking of RuntimeStub frame
2771
handleSOERegisters(true /*saving*/);
2773
// load registers from incoming parameters
2774
const Address from_param(rbp, 8+0);
2775
const Address to_param (rbp, 8+4);
2776
const Address key_param (rbp, 8+8);
2777
const Address rvec_param (rbp, 8+12);
2778
const Address len_param (rbp, 8+16);
2780
__ movptr(from , from_param);
2781
__ movptr(to , to_param);
2782
__ movptr(key , key_param);
2783
__ movptr(rvec , rvec_param);
2784
__ movptr(len_reg , len_param);
2786
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()));
2787
__ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec
2789
__ xorptr(pos, pos);
2791
// now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
2793
__ movl(rvec, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2795
__ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
2797
__ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
2799
#define DoFour(opc, src_reg) \
2800
__ opc(xmm_result0, src_reg); \
2801
__ opc(xmm_result1, src_reg); \
2802
__ opc(xmm_result2, src_reg); \
2803
__ opc(xmm_result3, src_reg); \
2805
for (int k = 0; k < 3; ++k) {
2806
__ align(OptoLoopAlignment);
2807
__ BIND(L_multiBlock_loopTop[k]);
2808
__ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
2809
__ jcc(Assembler::less, L_singleBlock_loopTop[k]);
2811
__ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
2812
__ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
2813
__ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
2814
__ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
2816
// the java expanded key ordering is rotated one position from what we want
2817
// so we start from 0x10 here and hit 0x00 last
2818
load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask);
2819
DoFour(pxor, xmm_key_tmp0); //xor with first key
2820
// do the aes dec rounds
2821
for (int rnum = 1; rnum <= ROUNDS[k];) {
2822
//load two keys at a time
2823
//k1->0x20, ..., k9->0xa0, k10->0x00
2824
load_key(xmm_key_tmp1, key, (rnum + 1) * 0x10, xmm_key_shuf_mask);
2825
load_key(xmm_key_tmp0, key, ((rnum + 2) % (ROUNDS[k] + 1)) * 0x10, xmm_key_shuf_mask); // hit 0x00 last!
2826
DoFour(aesdec, xmm_key_tmp1);
2828
if (rnum != ROUNDS[k]) {
2829
DoFour(aesdec, xmm_key_tmp0);
2832
DoFour(aesdeclast, xmm_key_tmp0);
2837
// for each result, xor with the r vector of previous cipher block
2838
__ pxor(xmm_result0, xmm_prev_block_cipher);
2839
__ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
2840
__ pxor(xmm_result1, xmm_prev_block_cipher);
2841
__ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
2842
__ pxor(xmm_result2, xmm_prev_block_cipher);
2843
__ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
2844
__ pxor(xmm_result3, xmm_prev_block_cipher);
2845
__ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize)); // this will carry over to next set of blocks
2847
// store 4 results into the next 64 bytes of output
2848
__ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
2849
__ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
2850
__ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
2851
__ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
2853
__ addptr(pos, 4 * AESBlockSize);
2854
__ subptr(len_reg, 4 * AESBlockSize);
2855
__ jmp(L_multiBlock_loopTop[k]);
2857
//singleBlock starts here
2858
__ align(OptoLoopAlignment);
2859
__ BIND(L_singleBlock_loopTop[k]);
2860
__ cmpptr(len_reg, 0); // any blocks left?
2861
__ jcc(Assembler::equal, L_exit);
2862
__ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
2863
__ movdqa(xmm_result1, xmm_result0);
2865
load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask);
2866
__ pxor(xmm_result0, xmm_key_tmp0);
2867
// do the aes dec rounds
2868
for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
2869
// the java expanded key ordering is rotated one position from what we want
2870
load_key(xmm_key_tmp0, key, (rnum + 1) * 0x10, xmm_key_shuf_mask);
2871
__ aesdec(xmm_result0, xmm_key_tmp0);
2873
load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
2874
__ aesdeclast(xmm_result0, xmm_key_tmp0);
2875
__ pxor(xmm_result0, xmm_prev_block_cipher); // xor with the current r vector
2876
__ movdqu(Address(to, pos, Address::times_1, 0), xmm_result0); // store into the next 16 bytes of output
2877
// no need to store r to memory until we exit
2878
__ movdqa(xmm_prev_block_cipher, xmm_result1); // set up next r vector with cipher input from this block
2880
__ addptr(pos, AESBlockSize);
2881
__ subptr(len_reg, AESBlockSize);
2882
__ jmp(L_singleBlock_loopTop[k]);
2886
__ movptr(rvec, rvec_param); // restore this since reused earlier
2887
__ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object
2888
handleSOERegisters(false /*restoring*/);
2889
__ movptr(rax, len_param); // return length
2890
__ leave(); // required for proper stackwalking of RuntimeStub frame
2897
// In 32-bit stub, parallelize 4 blocks at a time
2901
// c_rarg0 - source byte array address
2902
// c_rarg1 - destination byte array address
2903
// c_rarg2 - K (key) in little endian int array
2904
// c_rarg3 - counter vector byte array address
2905
// c_rarg4 - input length
2908
// rax - input length
2910
// Emits the "counterMode_AESCrypt" stub (AES counter mode, 32-bit x86).
// All arguments are read from the caller's frame relative to rbp (see the
// *_param addresses below); rax is loaded from len_param before leave().
// Visible phases of the emitted code:
//   1. pre-loop: while 'used' < 16 and input remains, XOR input bytes with
//      bytes of the saved encrypted counter;
//   2. for each key size (k = 0, 1, 2): a loop handling PARALLEL_FACTOR blocks
//      per iteration, then a loop handling one block per iteration;
//   3. tail: fewer than AESBlockSize bytes are inserted into an XMM register,
//      XORed with the encrypted counter and extracted to the destination.
address generate_counterMode_AESCrypt_Parallel() {
2911
assert(UseAES, "need AES instructions and misaligned SSE support");
2912
__ align(CodeEntryAlignment);
2913
StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
2914
// start records the pc at which the stub's code begins
address start = __ pc();
2915
const Register from = rsi; // source array address
2916
const Register to = rdx; // destination array address
2917
const Register key = rcx; // key array address
2918
const Register counter = rdi; // counter byte array initialized from initvector array address
2919
// and updated with the incremented counter in the end
2920
const Register len_reg = rbx;
2921
const Register pos = rax;
2923
__ enter(); // required for proper stackwalking of RuntimeStub frame
2924
handleSOERegisters(true /*saving*/); // save rbx, rsi, rdi
2926
// load registers from incoming parameters
2927
const Address from_param(rbp, 8+0);
2928
const Address to_param (rbp, 8+4);
2929
const Address key_param (rbp, 8+8);
2930
const Address rvec_param (rbp, 8+12);
2931
const Address len_param (rbp, 8+16);
2932
const Address saved_counter_param(rbp, 8 + 20);
2933
const Address used_addr_param(rbp, 8 + 24);
2935
__ movptr(from , from_param);
2936
__ movptr(to , to_param);
2937
__ movptr(len_reg , len_param);
2939
// Use the partially used encrypted counter from last invocation
2940
Label L_exit_preLoop, L_preLoop_start;
2942
// Use the registers 'counter' and 'key' here in this preloop
2943
// to hold of last 2 params 'used' and 'saved_encCounter_start'
// (used aliases rdi; saved_encCounter_start and used_addr both alias rcx)
2944
Register used = counter;
2945
Register saved_encCounter_start = key;
2946
Register used_addr = saved_encCounter_start;
2948
__ movptr(used_addr, used_addr_param);
2949
__ movptr(used, Address(used_addr, 0));
2950
// overwrites used_addr, which shares the same register
__ movptr(saved_encCounter_start, saved_counter_param);
2952
__ BIND(L_preLoop_start);
2953
// leave the pre-loop once 'used' reaches 16 or no input remains
__ cmpptr(used, 16);
2954
__ jcc(Assembler::aboveEqual, L_exit_preLoop);
2955
__ cmpptr(len_reg, 0);
2956
__ jcc(Assembler::lessEqual, L_exit_preLoop);
2957
// one output byte = saved encrypted counter byte XOR input byte
__ movb(rax, Address(saved_encCounter_start, used));
2958
__ xorb(rax, Address(from, 0));
2959
__ movb(Address(to, 0), rax);
2963
__ subptr(len_reg, 1);
2965
__ jmp(L_preLoop_start);
2967
__ BIND(L_exit_preLoop);
2968
// reload used_addr (its register was reused above) and write 'used' back
__ movptr(used_addr, used_addr_param);
2969
// NOTE(review): identical to the previous instruction; this second load looks redundant.
__ movptr(used_addr, used_addr_param);
2970
__ movl(Address(used_addr, 0), used);
2972
// load the parameters 'key' and 'counter'
2973
__ movptr(key, key_param);
2974
__ movptr(counter, rvec_param);
2976
// xmm register assignments for the loops below
// (xmm_from0..xmm_from3 share xmm1..xmm4 with the shuffle masks, xmm_key and xmm_result0)
2977
const XMMRegister xmm_curr_counter = xmm0;
2978
const XMMRegister xmm_counter_shuf_mask = xmm1; // need to be reloaded
2979
const XMMRegister xmm_key_shuf_mask = xmm2; // need to be reloaded
2980
const XMMRegister xmm_key = xmm3;
2981
const XMMRegister xmm_result0 = xmm4;
2982
const XMMRegister xmm_result1 = xmm5;
2983
const XMMRegister xmm_result2 = xmm6;
2984
const XMMRegister xmm_result3 = xmm7;
2985
const XMMRegister xmm_from0 = xmm1; //reuse XMM register
2986
const XMMRegister xmm_from1 = xmm2;
2987
const XMMRegister xmm_from2 = xmm3;
2988
const XMMRegister xmm_from3 = xmm4;
2990
//for key_128, key_192, key_256
2991
const int rounds[3] = {10, 12, 14};
2992
Label L_singleBlockLoopTop[3];
2993
Label L_multiBlock_loopTop[3];
2994
Label L_key192_top, L_key256_top;
2995
Label L_incCounter[3][4]; // 3: different key length, 4: 4 blocks at a time
2996
Label L_incCounter_single[3]; //for single block, key128, key192, key256
2997
Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
2998
Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
3001
const int PARALLEL_FACTOR = 4; //because of the limited register number
3003
// initialize counter with initial counter
3004
__ movdqu(xmm_curr_counter, Address(counter, 0x00));
3005
__ movdqu(xmm_counter_shuf_mask, ExternalAddress(counter_shuffle_mask_addr()));
3006
__ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled for increase
3008
// key length could be only {11, 13, 15} * 4 = {44, 52, 60}
3009
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()));
3010
__ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3012
// NOTE(review): no compare of rax is visible before the two branches below in
// this text; the flag-setting instructions appear to be missing - confirm
// against the full file.
__ jcc(Assembler::equal, L_key192_top);
3014
__ jcc(Assembler::equal, L_key256_top);
3016
//key128 begins here
3017
__ movptr(pos, 0); // init pos before L_multiBlock_loopTop
3019
#define CTR_DoFour(opc, src_reg) \
3020
__ opc(xmm_result0, src_reg); \
3021
__ opc(xmm_result1, src_reg); \
3022
__ opc(xmm_result2, src_reg); \
3023
__ opc(xmm_result3, src_reg);
3025
// k == 0 : generate code for key_128
3026
// k == 1 : generate code for key_192
3027
// k == 2 : generate code for key_256
3028
for (int k = 0; k < 3; ++k) {
3029
//multi blocks starts here
3030
__ align(OptoLoopAlignment);
3031
__ BIND(L_multiBlock_loopTop[k]);
3032
__ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
3033
__ jcc(Assembler::less, L_singleBlockLoopTop[k]);
3035
// both masks live in xmm1/xmm2, which are overwritten as xmm_from0/xmm_from1 later in the loop
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()));
3036
__ movdqu(xmm_counter_shuf_mask, ExternalAddress(counter_shuffle_mask_addr()));
3038
//load, then increase counters
3039
CTR_DoFour(movdqa, xmm_curr_counter);
3041
inc_counter(rbx, xmm_result1, 0x01, L_incCounter[k][0]);
3042
inc_counter(rbx, xmm_result2, 0x02, L_incCounter[k][1]);
3043
inc_counter(rbx, xmm_result3, 0x03, L_incCounter[k][2]);
3044
inc_counter(rbx, xmm_curr_counter, 0x04, L_incCounter[k][3]);
3047
load_key(xmm_key, key, 0x00, xmm_key_shuf_mask); // load Round 0 key. interleaving for better performance
3049
CTR_DoFour(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
3050
CTR_DoFour(pxor, xmm_key); //PXOR with Round 0 key
3052
// rounds 1 .. rounds[k]-1 use aesenc; the key at offset 0x10 * rounds[k] is applied with aesenclast
for (int i = 1; i < rounds[k]; ++i) {
3053
load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
3054
CTR_DoFour(aesenc, xmm_key);
3056
load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
3057
CTR_DoFour(aesenclast, xmm_key);
3059
// get next PARALLEL_FACTOR blocks into xmm_from registers
3060
__ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
3061
__ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3062
__ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3064
// PXOR with input text
3065
__ pxor(xmm_result0, xmm_from0); //result0 is xmm4
3066
__ pxor(xmm_result1, xmm_from1);
3067
__ pxor(xmm_result2, xmm_from2);
3069
// store PARALLEL_FACTOR results into the next 64 bytes of output
3070
__ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
3071
__ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
3072
__ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
3074
// do it here after xmm_result0 is saved, because xmm_from3 reuse the same register of xmm_result0.
3075
__ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
3076
__ pxor(xmm_result3, xmm_from3);
3077
__ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
3079
__ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text
3080
__ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
3081
__ jmp(L_multiBlock_loopTop[k]);
3083
// singleBlock starts here
3084
__ align(OptoLoopAlignment);
3085
__ BIND(L_singleBlockLoopTop[k]);
3086
__ cmpptr(len_reg, 0);
3087
__ jcc(Assembler::equal, L_exit);
3088
__ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()));
3089
__ movdqu(xmm_counter_shuf_mask, ExternalAddress(counter_shuffle_mask_addr()));
3090
__ movdqa(xmm_result0, xmm_curr_counter);
3091
load_key(xmm_key, key, 0x00, xmm_key_shuf_mask);
3092
__ push(rbx);//rbx is used for increasing counter
3093
inc_counter(rbx, xmm_curr_counter, 0x01, L_incCounter_single[k]);
3095
__ pshufb(xmm_result0, xmm_counter_shuf_mask);
3096
__ pxor(xmm_result0, xmm_key);
3097
for (int i = 1; i < rounds[k]; i++) {
3098
load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
3099
__ aesenc(xmm_result0, xmm_key);
3101
load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
3102
__ aesenclast(xmm_result0, xmm_key);
3103
// fewer than AESBlockSize bytes left: take the tail path
__ cmpptr(len_reg, AESBlockSize);
3104
__ jcc(Assembler::less, L_processTail_insr[k]);
3105
__ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
3106
__ pxor(xmm_result0, xmm_from0);
3107
__ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
3108
__ addptr(pos, AESBlockSize);
3109
__ subptr(len_reg, AESBlockSize);
3110
__ jmp(L_singleBlockLoopTop[k]);
3112
__ BIND(L_processTail_insr[k]); // Process the tail part of the input array
3113
__ addptr(pos, len_reg); // 1. Insert bytes from src array into xmm_from0 register
3114
// the bits of len_reg (8, 4, 2, 1) select which insert steps are emitted below
__ testptr(len_reg, 8);
3115
__ jcc(Assembler::zero, L_processTail_4_insr[k]);
3117
__ pinsrd(xmm_from0, Address(from, pos), 0);
3118
__ pinsrd(xmm_from0, Address(from, pos, Address::times_1, 4), 1);
3119
__ BIND(L_processTail_4_insr[k]);
3120
__ testptr(len_reg, 4);
3121
__ jcc(Assembler::zero, L_processTail_2_insr[k]);
3123
__ pslldq(xmm_from0, 4);
3124
__ pinsrd(xmm_from0, Address(from, pos), 0);
3125
__ BIND(L_processTail_2_insr[k]);
3126
__ testptr(len_reg, 2);
3127
__ jcc(Assembler::zero, L_processTail_1_insr[k]);
3129
__ pslldq(xmm_from0, 2);
3130
__ pinsrw(xmm_from0, Address(from, pos), 0);
3131
__ BIND(L_processTail_1_insr[k]);
3132
__ testptr(len_reg, 1);
3133
__ jcc(Assembler::zero, L_processTail_exit_insr[k]);
3135
__ pslldq(xmm_from0, 1);
3136
__ pinsrb(xmm_from0, Address(from, pos), 0);
3137
__ BIND(L_processTail_exit_insr[k]);
3139
__ movptr(saved_encCounter_start, saved_counter_param);
3140
__ movdqu(Address(saved_encCounter_start, 0), xmm_result0); // 2. Perform pxor of the encrypted counter and plaintext Bytes.
3141
__ pxor(xmm_result0, xmm_from0); // Also the encrypted counter is saved for next invocation.
3143
__ testptr(len_reg, 8);
3144
__ jcc(Assembler::zero, L_processTail_4_extr[k]); // 3. Extract bytes from xmm_result0 into the dest. array
3145
__ pextrd(Address(to, pos), xmm_result0, 0);
3146
__ pextrd(Address(to, pos, Address::times_1, 4), xmm_result0, 1);
3147
__ psrldq(xmm_result0, 8);
3149
__ BIND(L_processTail_4_extr[k]);
3150
__ testptr(len_reg, 4);
3151
__ jcc(Assembler::zero, L_processTail_2_extr[k]);
3152
__ pextrd(Address(to, pos), xmm_result0, 0);
3153
__ psrldq(xmm_result0, 4);
3155
__ BIND(L_processTail_2_extr[k]);
3156
__ testptr(len_reg, 2);
3157
__ jcc(Assembler::zero, L_processTail_1_extr[k]);
3158
__ pextrb(Address(to, pos), xmm_result0, 0);
3159
__ pextrb(Address(to, pos, Address::times_1, 1), xmm_result0, 1);
3160
__ psrldq(xmm_result0, 2);
3162
__ BIND(L_processTail_1_extr[k]);
3163
__ testptr(len_reg, 1);
3164
__ jcc(Assembler::zero, L_processTail_exit_extr[k]);
3165
__ pextrb(Address(to, pos), xmm_result0, 0);
3167
__ BIND(L_processTail_exit_extr[k]);
3168
// store the tail length as the new 'used' value
__ movptr(used_addr, used_addr_param);
3169
__ movl(Address(used_addr, 0), len_reg);
3174
__ movdqu(xmm_counter_shuf_mask, ExternalAddress(counter_shuffle_mask_addr()));
3175
__ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
3176
__ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
3177
handleSOERegisters(false /*restoring*/);
3178
__ movptr(rax, len_param); // return length
3179
__ leave(); // required for proper stackwalking of RuntimeStub frame
3182
__ BIND (L_key192_top);
3183
__ movptr(pos, 0); // init pos before L_multiBlock_loopTop
3184
__ jmp(L_multiBlock_loopTop[1]); //key192
3186
__ BIND (L_key256_top);
3187
__ movptr(pos, 0); // init pos before L_multiBlock_loopTop
3188
__ jmp(L_multiBlock_loopTop[2]); //key256
3193
// ofs and limit are used for multi-block byte array.
3194
// int com.sun.security.provider.MD5.implCompress(byte[] b, int ofs)
3195
// Emits the MD5 compress stub registered under 'name'.
// Three stack words are reserved; the state, ofs and limit arguments are
// copied from the caller's frame (rbp + 8 + 4/8/12) into them via rsi, the
// buf argument is loaded into rbp, and fast_md5 is emitted with
// 'multi_block' forwarded unchanged.
address generate_md5_implCompress(bool multi_block, const char *name) {
3196
__ align(CodeEntryAlignment);
3197
StubCodeMark mark(this, "StubRoutines", name);
3198
// start records the pc at which the stub's code begins
address start = __ pc();
3200
const Register buf_param = rbp;
3201
// rsp-relative slots that hold copies of the incoming arguments
const Address state_param(rsp, 0 * wordSize);
3202
const Address ofs_param (rsp, 1 * wordSize);
3203
const Address limit_param(rsp, 2 * wordSize);
3210
// reserve the three slots declared above
__ subptr(rsp, 3 * wordSize);
3212
__ movptr(rsi, Address(rbp, 8 + 4));
3213
__ movptr(state_param, rsi);
3215
__ movptr(rsi, Address(rbp, 8 + 8));
3216
__ movptr(ofs_param, rsi);
3217
__ movptr(rsi, Address(rbp, 8 + 12));
3218
__ movptr(limit_param, rsi);
3220
__ movptr(buf_param, Address(rbp, 8 + 0)); // do it last because it override rbp
3221
__ fast_md5(buf_param, state_param, ofs_param, limit_param, multi_block);
3223
// release the three reserved slots
__ addptr(rsp, 3 * wordSize);
3233
// Emits the "upper_word_mask" data stub: four 32-bit words, the first three
// zero and the last 0xFFFFFFFF, starting at 'start'.
address generate_upper_word_mask() {
3235
StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
3236
// start records the pc of the first emitted word
address start = __ pc();
3237
__ emit_data(0x00000000, relocInfo::none, 0);
3238
__ emit_data(0x00000000, relocInfo::none, 0);
3239
__ emit_data(0x00000000, relocInfo::none, 0);
3240
__ emit_data(0xFFFFFFFF, relocInfo::none, 0);
3244
// Emits the "shuffle_byte_flip_mask" data stub: four 32-bit words
// 0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203, starting at 'start'.
// NOTE(review): presumably used as a pshufb byte-reversal mask - confirm at the use site.
address generate_shuffle_byte_flip_mask() {
3246
StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
3247
// start records the pc of the first emitted word
address start = __ pc();
3248
__ emit_data(0x0c0d0e0f, relocInfo::none, 0);
3249
__ emit_data(0x08090a0b, relocInfo::none, 0);
3250
__ emit_data(0x04050607, relocInfo::none, 0);
3251
__ emit_data(0x00010203, relocInfo::none, 0);
3255
// ofs and limit are used for multi-block byte array.
3256
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
3257
// Emits the SHA-1 compress stub registered under 'name'.
// The buf, state, ofs and limit arguments are loaded from the caller's frame
// (rbp + 8 + 0/4/8/12) into registers and passed, with xmm0-xmm7, rsp and
// 'multi_block', to fast_sha1. Eight stack words are reserved around the call.
// NOTE(review): the declarations of 'buf' and 'ofs' are not visible in this
// text although both are used below - confirm against the full file.
address generate_sha1_implCompress(bool multi_block, const char *name) {
3258
__ align(CodeEntryAlignment);
3259
StubCodeMark mark(this, "StubRoutines", name);
3260
// start records the pc at which the stub's code begins
address start = __ pc();
3263
Register state = rdx;
3265
Register limit = rdi;
3267
const Address buf_param(rbp, 8 + 0);
3268
const Address state_param(rbp, 8 + 4);
3269
const Address ofs_param(rbp, 8 + 8);
3270
const Address limit_param(rbp, 8 + 12);
3272
// XMM register roles as named by the fast_sha1 argument list
const XMMRegister abcd = xmm0;
3273
const XMMRegister e0 = xmm1;
3274
const XMMRegister e1 = xmm2;
3275
const XMMRegister msg0 = xmm3;
3277
const XMMRegister msg1 = xmm4;
3278
const XMMRegister msg2 = xmm5;
3279
const XMMRegister msg3 = xmm6;
3280
const XMMRegister shuf_mask = xmm7;
3283
// reserve eight stack words; rsp is handed to fast_sha1 below
__ subptr(rsp, 8 * wordSize);
3284
handleSOERegisters(true /*saving*/);
3286
__ movptr(buf, buf_param);
3287
__ movptr(state, state_param);
3289
__ movptr(ofs, ofs_param);
3290
__ movptr(limit, limit_param);
3293
__ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
3294
buf, state, ofs, limit, rsp, multi_block);
3296
handleSOERegisters(false /*restoring*/);
3297
// release the eight reserved words
__ addptr(rsp, 8 * wordSize);
3303
// Emits the "pshuffle_byte_flip_mask" data stub: four 32-bit words
// 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f, starting at 'start'.
// NOTE(review): presumably a pshufb mask that byte-swaps each 32-bit lane - confirm at the use site.
address generate_pshuffle_byte_flip_mask() {
3305
StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
3306
// start records the pc of the first emitted word
address start = __ pc();
3307
__ emit_data(0x00010203, relocInfo::none, 0);
3308
__ emit_data(0x04050607, relocInfo::none, 0);
3309
__ emit_data(0x08090a0b, relocInfo::none, 0);
3310
__ emit_data(0x0c0d0e0f, relocInfo::none, 0);
3314
// ofs and limit are used for multi-block byte array.
3315
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
3316
// Emits the SHA-256 compress stub registered under 'name'.
// The buf, state, ofs and limit arguments are loaded from the caller's frame
// (rbp + 8 + 0/4/8/12) into registers and passed, with xmm0-xmm7, rsp and
// 'multi_block', to fast_sha256. Eight stack words are reserved around the call.
// NOTE(review): the declarations of 'buf' and 'ofs' are not visible in this
// text although both are used below - confirm against the full file.
address generate_sha256_implCompress(bool multi_block, const char *name) {
3317
__ align(CodeEntryAlignment);
3318
StubCodeMark mark(this, "StubRoutines", name);
3319
// start records the pc at which the stub's code begins
address start = __ pc();
3322
Register state = rsi;
3324
Register limit = rcx;
3326
const Address buf_param(rbp, 8 + 0);
3327
const Address state_param(rbp, 8 + 4);
3328
const Address ofs_param(rbp, 8 + 8);
3329
const Address limit_param(rbp, 8 + 12);
3331
// XMM register roles as named by the fast_sha256 argument list
const XMMRegister msg = xmm0;
3332
const XMMRegister state0 = xmm1;
3333
const XMMRegister state1 = xmm2;
3334
const XMMRegister msgtmp0 = xmm3;
3336
const XMMRegister msgtmp1 = xmm4;
3337
const XMMRegister msgtmp2 = xmm5;
3338
const XMMRegister msgtmp3 = xmm6;
3339
const XMMRegister msgtmp4 = xmm7;
3342
// reserve eight stack words; rsp is handed to fast_sha256 below
__ subptr(rsp, 8 * wordSize);
3343
handleSOERegisters(true /*saving*/);
3344
__ movptr(buf, buf_param);
3345
__ movptr(state, state_param);
3347
__ movptr(ofs, ofs_param);
3348
__ movptr(limit, limit_param);
3351
__ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3352
buf, state, ofs, limit, rsp, multi_block);
3354
handleSOERegisters(false);
3355
// release the eight reserved words
__ addptr(rsp, 8 * wordSize);
3361
// byte swap x86 long
3362
// Returns the address of the GHASH_LONG_SWAP_MASK constant.
address ghash_long_swap_mask_addr() {
3363
return (address)GHASH_LONG_SWAP_MASK;
3366
// byte swap x86 byte array
3367
// Returns the address of the GHASH_BYTE_SWAP_MASK constant.
address ghash_byte_swap_mask_addr() {
3368
return (address)GHASH_BYTE_SWAP_MASK;
3371
/* Single and multi-block ghash operations */
3372
// Emits the "ghash_processBlocks" stub.
// state, subkeyH, data and blocks are loaded from the caller's frame
// (rbp + 8 + 0/4/8/12). For each 16-byte data block the emitted loop XORs the
// byte-swapped block into the running state, multiplies by the subkey with
// four pclmulqdq instructions, shifts the 256-bit product left by one bit,
// reduces it in two phases, and finally stores the byte-swapped result back
// to 'state'.
address generate_ghash_processBlocks() {
3373
assert(UseGHASHIntrinsics, "need GHASH intrinsics and CLMUL support");
3374
__ align(CodeEntryAlignment);
3375
Label L_ghash_loop, L_exit;
3376
StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3377
// start records the pc at which the stub's code begins
address start = __ pc();
3379
const Register state = rdi;
3380
const Register subkeyH = rsi;
3381
const Register data = rdx;
3382
const Register blocks = rcx;
3384
const Address state_param(rbp, 8+0);
3385
const Address subkeyH_param(rbp, 8+4);
3386
const Address data_param(rbp, 8+8);
3387
const Address blocks_param(rbp, 8+12);
3389
const XMMRegister xmm_temp0 = xmm0;
3390
const XMMRegister xmm_temp1 = xmm1;
3391
const XMMRegister xmm_temp2 = xmm2;
3392
const XMMRegister xmm_temp3 = xmm3;
3393
const XMMRegister xmm_temp4 = xmm4;
3394
const XMMRegister xmm_temp5 = xmm5;
3395
const XMMRegister xmm_temp6 = xmm6;
3396
const XMMRegister xmm_temp7 = xmm7;
3399
handleSOERegisters(true); // Save registers
3401
__ movptr(state, state_param);
3402
__ movptr(subkeyH, subkeyH_param);
3403
__ movptr(data, data_param);
3404
__ movptr(blocks, blocks_param);
3406
// xmm0 = current state, xmm1 = subkey, both shuffled with the long-swap mask
__ movdqu(xmm_temp0, Address(state, 0));
3407
__ pshufb(xmm_temp0, ExternalAddress(ghash_long_swap_mask_addr()));
3409
__ movdqu(xmm_temp1, Address(subkeyH, 0));
3410
__ pshufb(xmm_temp1, ExternalAddress(ghash_long_swap_mask_addr()));
3412
__ BIND(L_ghash_loop);
3413
// load the next 16 data bytes, shuffle with the byte-swap mask, fold into the state
__ movdqu(xmm_temp2, Address(data, 0));
3414
__ pshufb(xmm_temp2, ExternalAddress(ghash_byte_swap_mask_addr()));
3416
__ pxor(xmm_temp0, xmm_temp2);
3419
// Multiply with the hash key
3421
__ movdqu(xmm_temp3, xmm_temp0);
3422
__ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0
3423
__ movdqu(xmm_temp4, xmm_temp0);
3424
__ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1
3426
__ movdqu(xmm_temp5, xmm_temp0);
3427
__ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0
3428
__ movdqu(xmm_temp6, xmm_temp0);
3429
__ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1
3431
__ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0
3433
__ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5
3434
__ psrldq(xmm_temp4, 8); // shift xmm4 by 64 bits to the right
3435
__ pslldq(xmm_temp5, 8); // shift xmm5 by 64 bits to the left
3436
__ pxor(xmm_temp3, xmm_temp5);
3437
__ pxor(xmm_temp6, xmm_temp4); // Register pair <xmm6:xmm3> holds the result
3438
// of the carry-less multiplication of xmm0 by xmm1
3441
// We shift the result of the multiplication by one bit position
3442
// to the left to cope for the fact that the bits are reversed.
3443
__ movdqu(xmm_temp7, xmm_temp3);
3444
__ movdqu(xmm_temp4, xmm_temp6);
3445
__ pslld (xmm_temp3, 1);
3446
__ pslld(xmm_temp6, 1);
3447
// the bit shifted out of each 32-bit lane is carried into the next higher lane
__ psrld(xmm_temp7, 31);
3448
__ psrld(xmm_temp4, 31);
3449
__ movdqu(xmm_temp5, xmm_temp7);
3450
__ pslldq(xmm_temp4, 4);
3451
__ pslldq(xmm_temp7, 4);
3452
// carry out of the top lane of xmm3 goes into the bottom lane of xmm6
__ psrldq(xmm_temp5, 12);
3453
__ por(xmm_temp3, xmm_temp7);
3454
__ por(xmm_temp6, xmm_temp4);
3455
__ por(xmm_temp6, xmm_temp5);
3458
// First phase of the reduction
3460
// Move xmm3 into xmm4, xmm5, xmm7 in order to perform the shifts
3462
__ movdqu(xmm_temp7, xmm_temp3);
3463
__ movdqu(xmm_temp4, xmm_temp3);
3464
__ movdqu(xmm_temp5, xmm_temp3);
3465
__ pslld(xmm_temp7, 31); // packed left shift << 31
3466
__ pslld(xmm_temp4, 30); // packed left shift << 30
3467
__ pslld(xmm_temp5, 25); // packed left shift << 25
3468
__ pxor(xmm_temp7, xmm_temp4); // xor the shifted versions
3469
__ pxor(xmm_temp7, xmm_temp5);
3470
__ movdqu(xmm_temp4, xmm_temp7);
3471
__ pslldq(xmm_temp7, 12);
3472
__ psrldq(xmm_temp4, 4);
3473
__ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete
3476
// Second phase of the reduction
3478
// Make 3 copies of xmm3 in xmm2, xmm5, xmm7 for doing these
3479
// shift operations.
3480
__ movdqu(xmm_temp2, xmm_temp3);
3481
__ movdqu(xmm_temp7, xmm_temp3);
3482
__ movdqu(xmm_temp5, xmm_temp3);
3483
__ psrld(xmm_temp2, 1); // packed right shift >> 1
3484
__ psrld(xmm_temp7, 2); // packed right shift >> 2
3485
__ psrld(xmm_temp5, 7); // packed right shift >> 7
3486
__ pxor(xmm_temp2, xmm_temp7); // xor the shifted versions
3487
__ pxor(xmm_temp2, xmm_temp5);
3488
__ pxor(xmm_temp2, xmm_temp4);
3489
__ pxor(xmm_temp3, xmm_temp2);
3490
__ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6
3492
// one block done: exit when the block count reaches zero, otherwise
// carry the result into xmm0 and advance to the next 16 bytes
__ decrement(blocks);
3493
__ jcc(Assembler::zero, L_exit);
3494
__ movdqu(xmm_temp0, xmm_temp6);
3495
__ addptr(data, 16);
3496
__ jmp(L_ghash_loop);
3499
// Byte swap 16-byte result
3500
__ pshufb(xmm_temp6, ExternalAddress(ghash_long_swap_mask_addr()));
3501
__ movdqu(Address(state, 0), xmm_temp6); // store the result
3503
handleSOERegisters(false); // restore registers
3514
* rsp(8) - byte* buf
3515
* rsp(12) - int length
3518
* rax - int crc result
3520
// Emits the "updateBytesCRC32" stub.
// crc, buf and len are loaded from the caller's frame (rbp + 8 + 0/4/8) into
// rdx, rsi and rcx, then kernel_crc32 is emitted with rdi and rbx passed as
// 'table' and 'tmp'.
address generate_updateBytesCRC32() {
3521
assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
3523
__ align(CodeEntryAlignment);
3524
StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3526
// start records the pc at which the stub's code begins
address start = __ pc();
3528
const Register crc = rdx; // crc
3529
const Register buf = rsi; // source java byte array address
3530
const Register len = rcx; // length
3531
const Register table = rdi; // crc_table address (reuse register)
3532
const Register tmp = rbx;
3533
// rax is included so that none of the working registers aliases it
assert_different_registers(crc, buf, len, table, tmp, rax);
3535
BLOCK_COMMENT("Entry:");
3536
__ enter(); // required for proper stackwalking of RuntimeStub frame
3541
Address crc_arg(rbp, 8 + 0);
3542
Address buf_arg(rbp, 8 + 4);
3543
Address len_arg(rbp, 8 + 8);
3546
__ movl(crc, crc_arg);
3547
__ movptr(buf, buf_arg);
3548
__ movl(len, len_arg);
3550
__ kernel_crc32(crc, buf, len, table, tmp);
3557
__ leave(); // required for proper stackwalking of RuntimeStub frame
3568
* rsp(8) - byte* buf
3569
* rsp(12) - int length
3570
* rsp(16) - table_start - optional (present only when doing a library_call,
3571
* not used by x86 algorithm)
3574
* rax - int crc result
3576
// Emits the "updateBytesCRC32C" stub.
// crc, buf and len are loaded from rsp-relative slots (after enter() has
// pushed the frame pointer) into rax, rcx and rdx, then crc32c_ipl_alg2_alt2
// is emitted; 'is_pclmulqdq_supported' is forwarded to it unchanged.
address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
3577
assert(UseCRC32CIntrinsics, "need SSE4_2");
3578
__ align(CodeEntryAlignment);
3579
StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3580
// start records the pc at which the stub's code begins
address start = __ pc();
3581
const Register crc = rax; // crc
3582
const Register buf = rcx; // source java byte array address
3583
const Register len = rdx; // length
3584
const Register d = rbx;
3585
const Register g = rsi;
3586
const Register h = rdi;
3587
const Register empty = noreg; // will never be used, in order not
3588
// to change a signature for crc32c_IPL_Alg2_Alt2
3589
// between 64/32 I'm just keeping it here
3590
assert_different_registers(crc, buf, len, d, g, h);
3592
BLOCK_COMMENT("Entry:");
3593
__ enter(); // required for proper stackwalking of RuntimeStub frame
3594
Address crc_arg(rsp, 4 + 4 + 0); // ESP+4 +
3595
// we need to add additional 4 because __ enter
3596
// have just pushed ebp on a stack
3597
Address buf_arg(rsp, 4 + 4 + 4);
3598
Address len_arg(rsp, 4 + 4 + 8);
3600
__ movl(crc, crc_arg);
3601
__ movl(buf, buf_arg);
3602
__ movl(len, len_arg);
3606
__ crc32c_ipl_alg2_alt2(crc, buf, len,
3608
empty, empty, empty,
3610
is_pclmulqdq_supported);
3615
__ leave(); // required for proper stackwalking of RuntimeStub frame
3621
// Emits the "libmExp" stub: an enter()/leave() frame around fast_exp, which
// is given xmm0-xmm7 and rax, rcx, rdx, rbx.
address generate_libmExp() {
3622
StubCodeMark mark(this, "StubRoutines", "libmExp");
3624
// start records the pc at which the stub's code begins
address start = __ pc();
3626
BLOCK_COMMENT("Entry:");
3627
__ enter(); // required for proper stackwalking of RuntimeStub frame
3628
__ fast_exp(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
3629
rax, rcx, rdx, rbx);
3630
__ leave(); // required for proper stackwalking of RuntimeStub frame
3637
// Emits the "libmLog" stub: an enter()/leave() frame around fast_log, which
// is given xmm0-xmm7 and rax, rcx, rdx, rbx.
address generate_libmLog() {
3638
StubCodeMark mark(this, "StubRoutines", "libmLog");
3640
// start records the pc at which the stub's code begins
address start = __ pc();
3642
BLOCK_COMMENT("Entry:");
3643
__ enter(); // required for proper stackwalking of RuntimeStub frame
3644
__ fast_log(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
3645
rax, rcx, rdx, rbx);
3646
__ leave(); // required for proper stackwalking of RuntimeStub frame
3653
// Emits the "libmLog10" stub: an enter()/leave() frame around fast_log10,
// which is given xmm0-xmm7 and rax, rcx, rdx, rbx.
address generate_libmLog10() {
3654
StubCodeMark mark(this, "StubRoutines", "libmLog10");
3656
// start records the pc at which the stub's code begins
address start = __ pc();
3658
BLOCK_COMMENT("Entry:");
3659
__ enter(); // required for proper stackwalking of RuntimeStub frame
3660
__ fast_log10(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
3661
rax, rcx, rdx, rbx);
3662
__ leave(); // required for proper stackwalking of RuntimeStub frame
3669
// Emits the "libmPow" stub: an enter()/leave() frame around fast_pow, which
// is given xmm0-xmm7 and rax, rcx, rdx, rbx.
address generate_libmPow() {
3670
StubCodeMark mark(this, "StubRoutines", "libmPow");
3672
// start records the pc at which the stub's code begins
address start = __ pc();
3674
BLOCK_COMMENT("Entry:");
3675
__ enter(); // required for proper stackwalking of RuntimeStub frame
3676
__ fast_pow(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
3677
rax, rcx, rdx, rbx);
3678
__ leave(); // required for proper stackwalking of RuntimeStub frame
3685
// Emits the "libm_reduce_pi04l" stub by calling libm_reduce_pi04l with
// rax, rcx, rdx, rbx, rsi, rdi, rbp and rsp. No enter()/leave() pair is
// visible around the call in this text.
address generate_libm_reduce_pi04l() {
3686
StubCodeMark mark(this, "StubRoutines", "libm_reduce_pi04l");
3688
// start records the pc at which the stub's code begins
address start = __ pc();
3690
BLOCK_COMMENT("Entry:");
3691
__ libm_reduce_pi04l(rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);
3697
// Emits the "libm_sin_cos_huge" stub by calling libm_sincos_huge with
// xmm0, xmm1 and rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp. No enter()/leave()
// pair is visible around the call in this text.
address generate_libm_sin_cos_huge() {
3698
StubCodeMark mark(this, "StubRoutines", "libm_sin_cos_huge");
3700
// start records the pc at which the stub's code begins
address start = __ pc();
3702
BLOCK_COMMENT("Entry:");
3703
__ libm_sincos_huge(xmm0, xmm1, rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);
3709
// Emits the "libmSin" stub: an enter()/leave() frame around fast_sin.
// NOTE(review): the fast_sin argument list is cut off after xmm7 in this
// text; the remaining arguments are not visible here.
address generate_libmSin() {
3710
StubCodeMark mark(this, "StubRoutines", "libmSin");
3712
// start records the pc at which the stub's code begins
address start = __ pc();
3714
BLOCK_COMMENT("Entry:");
3715
__ enter(); // required for proper stackwalking of RuntimeStub frame
3716
__ fast_sin(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
3718
__ leave(); // required for proper stackwalking of RuntimeStub frame
3725
// Emits the "libmCos" stub: an enter()/leave() frame around fast_cos, which
// is given xmm0-xmm7 and rax, rcx, rdx, rbx.
address generate_libmCos() {
3726
StubCodeMark mark(this, "StubRoutines", "libmCos");
3728
// start records the pc at which the stub's code begins
address start = __ pc();
3730
BLOCK_COMMENT("Entry:");
3731
__ enter(); // required for proper stackwalking of RuntimeStub frame
3732
__ fast_cos(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
3733
rax, rcx, rdx, rbx);
3734
__ leave(); // required for proper stackwalking of RuntimeStub frame
3741
// Emits the "libm_tan_cot_huge" stub by calling libm_tancot_huge with
// xmm0, xmm1 and rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp. No enter()/leave()
// pair is visible around the call in this text.
address generate_libm_tan_cot_huge() {
3742
StubCodeMark mark(this, "StubRoutines", "libm_tan_cot_huge");
3744
// start records the pc at which the stub's code begins
address start = __ pc();
3746
BLOCK_COMMENT("Entry:");
3747
__ libm_tancot_huge(xmm0, xmm1, rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);
3753
// Emits the "libmTan" stub: an enter()/leave() frame around fast_tan, which
// is given xmm0-xmm7 and rax, rcx, rdx, rbx.
address generate_libmTan() {
3754
StubCodeMark mark(this, "StubRoutines", "libmTan");
3756
// start records the pc at which the stub's code begins
address start = __ pc();
3758
BLOCK_COMMENT("Entry:");
3759
__ enter(); // required for proper stackwalking of RuntimeStub frame
3760
__ fast_tan(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
3761
rax, rcx, rdx, rbx);
3762
__ leave(); // required for proper stackwalking of RuntimeStub frame
3769
// Emits the "nmethod_entry_barrier" stub.
// The emitted code pushes a cookie slot, saves rbp, points rbx at a stack
// slot three words above rsp, spills xmm0/xmm1, and calls
// BarrierSetNMethod::nmethod_stub_entry_barrier with rbx as its argument.
// After restoring xmm0/xmm1 it compares rax with 1: on a match it branches
// to deoptimize_label, where rsp is reloaded from the stack and control jumps
// through the slot just below the new rsp; otherwise the cookie slot is popped.
// NOTE(review): the 'if' that opens each 'else if (UseSSE >= 1)' chain and the
// rbx save/restore instructions are not visible in this text - confirm
// against the full file.
address generate_method_entry_barrier() {
3770
__ align(CodeEntryAlignment);
3771
StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
3773
Label deoptimize_label;
3775
// start records the pc at which the stub's code begins
address start = __ pc();
3777
__ push(-1); // cookie, this is used for writing the new rsp when deoptimizing
3779
BLOCK_COMMENT("Entry:");
3780
__ enter(); // save rbp
3782
// save rbx, because we want to use that value.
3783
// We could do without it but then we depend on the number of slots used by pusha
3786
__ lea(rbx, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for rbx - this should be the return address
3790
// xmm0 and xmm1 may be used for passing float/double arguments
3793
// spill xmm0/xmm1 as doubles, one slot of xmm_size bytes each
const int xmm_size = wordSize * 4;
3794
__ subptr(rsp, xmm_size * 2);
3795
__ movdbl(Address(rsp, xmm_size * 1), xmm1);
3796
__ movdbl(Address(rsp, xmm_size * 0), xmm0);
3797
} else if (UseSSE >= 1) {
3798
// spill xmm0/xmm1 as floats
const int xmm_size = wordSize * 2;
3799
__ subptr(rsp, xmm_size * 2);
3800
__ movflt(Address(rsp, xmm_size * 1), xmm1);
3801
__ movflt(Address(rsp, xmm_size * 0), xmm0);
3804
// rbx (the stack-slot address computed above) is the single argument
__ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), rbx);
3807
// reload xmm0/xmm1 from the slots written before the call
const int xmm_size = wordSize * 4;
3808
__ movdbl(xmm0, Address(rsp, xmm_size * 0));
3809
__ movdbl(xmm1, Address(rsp, xmm_size * 1));
3810
__ addptr(rsp, xmm_size * 2);
3811
} else if (UseSSE >= 1) {
3812
const int xmm_size = wordSize * 2;
3813
__ movflt(xmm0, Address(rsp, xmm_size * 0));
3814
__ movflt(xmm1, Address(rsp, xmm_size * 1));
3815
__ addptr(rsp, xmm_size * 2);
3818
__ cmpl(rax, 1); // 1 means deoptimize
3819
__ jcc(Assembler::equal, deoptimize_label);
3826
__ addptr(rsp, 1 * wordSize); // cookie
3829
__ BIND(deoptimize_label);
3836
// this can be taken out, but is good for verification purposes. getting a SIGSEGV
3837
// here while still having a correct stack is valuable
3838
__ testptr(rsp, Address(rsp, 0));
3840
__ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier
3841
__ jmp(Address(rsp, -1 * wordSize)); // jmp target should be callers verified_entry_point
3847
// Information about frame layout at time of blocking runtime call.
3848
// Note that we only have to preserve callee-saved registers since
3849
// the compilers are responsible for supplying a continuation point
3850
// if they expect all registers to be preserved.
3852
thread_off, // last_java_sp
3855
rbp_off, // callee saved register
3865
//------------------------------------------------------------------------------------------------------------------------
3866
// Continuation point for throwing of implicit exceptions that are not handled in
3867
// the current activation. Fabricates an exception oop and initiates normal
3868
// exception dispatching in this frame.
3870
// Previously the compiler (c2) allowed for callee save registers on Java calls.
3871
// This is no longer true after adapter frames were removed but could possibly
3872
// be brought back in the future if the interpreter code was reworked and it
3873
// was deemed worthwhile. The comment below was left to describe what must
3874
// happen here if callee saves were resurrected. As it stands now this stub
3875
// could actually be a vanilla BufferBlob and have no oopMap at all.
3876
// Since it doesn't make much difference we've chosen to leave it the
3877
// way it was in the callee save days and keep the comment.
3879
// If we need to preserve callee-saved values we need a callee-saved oop map and
3880
// therefore have to make these stubs into RuntimeStubs rather than BufferBlobs.
3881
// If the compiler needs all registers to be preserved between the fault
3882
// point and the exception handler then it must assume responsibility for that in
3883
// AbstractCompiler::continuation_for_implicit_null_exception or
3884
// continuation_for_implicit_division_by_zero_exception. All other implicit
3885
// exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
3886
// either at call sites or otherwise assume that stack unwinding will be initiated,
3887
// so caller saved registers were assumed volatile in the compiler.
3888
// Builds a RuntimeStub that calls runtime_entry and then jumps to
// StubRoutines::forward_exception_entry().
//   name          - name given to the CodeBuffer and to the RuntimeStub
//   runtime_entry - address called from the stub
//   arg1, arg2    - optional registers stored into the frame's arg1_off /
//                   arg2_off slots; noreg (the default) means "not supplied",
//                   and arg2 may only be supplied together with arg1 (asserted)
// Returns the entry point of the newly created stub.
address generate_throw_exception(const char* name, address runtime_entry,
3889
Register arg1 = noreg, Register arg2 = noreg) {
3891
int insts_size = 256;
3894
CodeBuffer code(name, insts_size, locs_size);
3895
OopMapSet* oop_maps = new OopMapSet();
3896
MacroAssembler* masm = new MacroAssembler(&code);
3898
address start = __ pc();
3900
// This is an inlined and slightly modified version of call_VM
3901
// which has the ability to fetch the return PC out of
3902
// thread-local storage and also sets up last_Java_sp slightly
3903
// differently than the real call_VM
3904
Register java_thread = rbx;
3905
__ get_thread(java_thread);
3907
__ enter(); // required for proper stackwalking of RuntimeStub frame
3909
// pc and rbp, already pushed
3910
__ subptr(rsp, (framesize-2) * wordSize); // prolog
3912
// Frame is now completed as far as size and linkage.
3914
// Offset from the stub start at which the frame is complete; handed to
// RuntimeStub::new_runtime_stub below.
int frame_complete = __ pc() - start;
3916
// push java thread (becomes first argument of C function)
3917
__ movptr(Address(rsp, thread_off * wordSize), java_thread);
3918
if (arg1 != noreg) {
3919
__ movptr(Address(rsp, arg1_off * wordSize), arg1);
3921
if (arg2 != noreg) {
3922
assert(arg1 != noreg, "missing reg arg");
3923
__ movptr(Address(rsp, arg2_off * wordSize), arg2);
3926
// Set up last_Java_sp and last_Java_fp
3927
__ set_last_Java_frame(java_thread, rsp, rbp, nullptr, noreg);
3930
BLOCK_COMMENT("call runtime_entry");
3931
__ call(RuntimeAddress(runtime_entry));
3933
// The GC map is registered at the code offset just past the call.
OopMap* map = new OopMap(framesize, 0);
3934
oop_maps->add_gc_map(__ pc() - start, map);
3936
// restore the thread (cannot use the pushed argument since arguments
3937
// may be overwritten by C code generated by an optimizing compiler);
3938
// however can use the register value directly if it is callee saved.
3939
__ get_thread(java_thread);
3941
__ reset_last_Java_frame(java_thread, true);
3943
__ leave(); // required for proper stackwalking of RuntimeStub frame
3945
// check for pending exceptions
3948
// A pending exception is expected here: the branch to L skips
// should_not_reach_here() only when the slot is not NULL_WORD.
__ cmpptr(Address(java_thread, Thread::pending_exception_offset()), NULL_WORD);
3949
__ jcc(Assembler::notEqual, L);
3950
__ should_not_reach_here();
3953
__ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3956
RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, framesize, oop_maps, false);
3957
return stub->entry_point();
3961
// Initializes the x87 FPU control-word constants, the standard MXCSR value and
// the two 80-bit subnormal bias constants held in StubRoutines::x86.
void create_control_words() {
3962
// Round to nearest, 53-bit mode, exceptions masked
3963
StubRoutines::x86::_fpu_cntrl_wrd_std = 0x027F;
3964
// Round to zero, 53-bit mode, exceptions masked
// NOTE(review): in 0x0D7F the precision-control field (bits 9:8) is 01b,
// whereas 53-bit precision is encoded as 10b (as in 0x027F above) - confirm
// whether 0x0E7F was intended or the precision setting is irrelevant to the
// users of this control word.
3965
StubRoutines::x86::_fpu_cntrl_wrd_trunc = 0x0D7F;
3966
// Round to nearest, 24-bit mode, exceptions masked
3967
StubRoutines::x86::_fpu_cntrl_wrd_24 = 0x007F;
3968
// Round to nearest, 64-bit mode, exceptions masked, flags specialized
// (0x1FBF is used instead of 0x1F80 when EnableX86ECoreOpts is set.)
3969
StubRoutines::x86::_mxcsr_std = EnableX86ECoreOpts ? 0x1FBF : 0x1F80;
3970
// Note: the following two constants are 80-bit values
3971
// layout is critical for correct loading by FPU.
// Each constant is stored as three words: [0] and [1] hold the 64-bit
// significand (low word first), [2] holds the sign/exponent field.
3972
// Bias for strict fp multiply/divide
3973
StubRoutines::x86::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
3974
StubRoutines::x86::_fpu_subnormal_bias1[1]= 0x80000000;
3975
StubRoutines::x86::_fpu_subnormal_bias1[2]= 0x03ff;
3976
// Un-Bias for strict fp multiply/divide
3977
StubRoutines::x86::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
3978
StubRoutines::x86::_fpu_subnormal_bias2[1]= 0x80000000;
3979
StubRoutines::x86::_fpu_subnormal_bias2[2]= 0x7bff;
3982
// Continuation thaw stub generator; yields nullptr when continuations are
// disabled.
address generate_cont_thaw() {
3983
if (!Continuations::enabled()) return nullptr;
3988
// Continuation return-barrier stub generator; yields nullptr when
// continuations are disabled.
address generate_cont_returnBarrier() {
3989
if (!Continuations::enabled()) return nullptr;
3994
// Continuation return-barrier (exception variant) stub generator; yields
// nullptr when continuations are disabled.
address generate_cont_returnBarrier_exception() {
3995
if (!Continuations::enabled()) return nullptr;
4002
// Emits the common entry sequence of the JFR stubs: loads the current thread
// into rdi, records the last Java frame (rsp, rbp, the_pc) for it, and stores
// the thread pointer in the stack slot at rsp.
//   the_pc - pc recorded as the last Java pc
//   masm   - assembler the code is emitted into
static void jfr_prologue(address the_pc, MacroAssembler* masm) {
4003
Register java_thread = rdi;
4004
__ get_thread(java_thread);
4005
__ set_last_Java_frame(java_thread, rsp, rbp, the_pc, noreg);
4006
// presumably the outgoing thread argument of the runtime call - confirm
__ movptr(Address(rsp, 0), java_thread);
4009
// The handle is dereferenced through a load barrier.
4010
// Emits the common exit sequence of the JFR stubs: reloads the current thread
// into rdi and resets its last Java frame.
static void jfr_epilogue(MacroAssembler* masm) {
4011
Register java_thread = rdi;
4012
__ get_thread(java_thread);
4013
__ reset_last_Java_frame(java_thread, true);
4016
// For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
4017
// It returns a jobject handle to the event writer.
4018
// The handle is dereferenced and the return value is the event writer oop.
4019
// Builds the "jfr_write_checkpoint" RuntimeStub in its own CodeBuffer. The
// stub calls JfrIntrinsicSupport::write_checkpoint between jfr_prologue and
// jfr_epilogue and then resolves the jobject in rax.
static RuntimeStub* generate_jfr_write_checkpoint() {
4022
rbp_off = FPUStateSizeInWords,
4028
saved_argument_off2, // 2nd half of double
4032
int insts_size = 1024;
4034
CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size);
4035
OopMapSet* oop_maps = new OopMapSet();
4036
MacroAssembler* masm = new MacroAssembler(&code);
4037
// Second handle on the same assembler, used for the helper calls below.
MacroAssembler* _masm = masm;
4039
address start = __ pc();
4041
int frame_complete = __ pc() - start;
4042
// pc recorded as the last Java pc and used as the GC map offset below.
address the_pc = __ pc();
4043
jfr_prologue(the_pc, _masm);
4044
__ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
4045
jfr_epilogue(_masm);
4046
// Resolve the returned handle in rax; rdi and rdx are the other two
// registers handed to resolve_global_jobject.
__ resolve_global_jobject(rax, rdi, rdx);
4050
OopMap* map = new OopMap(framesize, 1); // rbp
4051
oop_maps->add_gc_map(the_pc - start, map);
4053
RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
4054
RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete,
4055
(framesize >> (LogBytesPerWord - LogBytesPerInt)),
4060
// For c2: call to return a leased buffer.
4061
// Builds the "jfr_return_lease" RuntimeStub in its own CodeBuffer. The stub
// calls JfrIntrinsicSupport::return_lease between jfr_prologue and
// jfr_epilogue.
static RuntimeStub* generate_jfr_return_lease() {
4064
rbp_off = FPUStateSizeInWords,
4070
saved_argument_off2, // 2nd half of double
4074
int insts_size = 1024;
4076
CodeBuffer code("jfr_return_lease", insts_size, locs_size);
4077
OopMapSet* oop_maps = new OopMapSet();
4078
MacroAssembler* masm = new MacroAssembler(&code);
4079
// Second handle on the same assembler, used for the helper calls below.
MacroAssembler* _masm = masm;
4081
address start = __ pc();
4083
int frame_complete = __ pc() - start;
4084
// pc recorded as the last Java pc and used as the GC map offset below.
address the_pc = __ pc();
4085
jfr_prologue(the_pc, _masm);
4086
__ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
4087
jfr_epilogue(_masm);
4091
OopMap* map = new OopMap(framesize, 1); // rbp
4092
oop_maps->add_gc_map(the_pc - start, map);
4094
RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
4095
RuntimeStub::new_runtime_stub("jfr_return_lease", &code, frame_complete,
4096
(framesize >> (LogBytesPerWord - LogBytesPerInt)),
4101
#endif // INCLUDE_JFR
4103
//---------------------------------------------------------------------------
4106
// Generates the first group of stubs and stores their entry points in
// StubRoutines / StubRoutines::x86: exception forwarding, the call stub,
// FPU/MXCSR verification, d2i/d2l wrappers, StackOverflowError throwers, and -
// depending on flags - the CRC32/CRC32C and libm math stubs.
void generate_initial_stubs() {
4107
// Generates all stubs and initializes the entry points
4109
//------------------------------------------------------------------------------------------------------------------------
4110
// entry points that exist in all platforms
4111
// Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
4112
// the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
4113
StubRoutines::_forward_exception_entry = generate_forward_exception();
4115
StubRoutines::_call_stub_entry =
4116
generate_call_stub(StubRoutines::_call_stub_return_address);
4117
// is referenced by megamorphic call
4118
StubRoutines::_catch_exception_entry = generate_catch_exception();
4120
// platform dependent
4121
create_control_words();
4123
// Initialize table for copy memory (arraycopy) check.
4124
// The table is created only if it does not exist yet.
if (UnsafeMemoryAccess::_table == nullptr) {
4125
UnsafeMemoryAccess::create_table(16 + 4); // 16 for copyMemory; 4 for setMemory
4128
StubRoutines::x86::_verify_mxcsr_entry = generate_verify_mxcsr();
4129
StubRoutines::x86::_verify_fpu_cntrl_wrd_entry = generate_verify_fpu_cntrl_wrd();
4130
// Both wrappers come from generate_d2i_wrapper; they differ in the result
// type and in the SharedRuntime conversion function passed in.
StubRoutines::x86::_d2i_wrapper = generate_d2i_wrapper(T_INT, CAST_FROM_FN_PTR(address, SharedRuntime::d2i));
4131
StubRoutines::x86::_d2l_wrapper = generate_d2i_wrapper(T_LONG, CAST_FROM_FN_PTR(address, SharedRuntime::d2l));
4133
// Build this early so it's available for the interpreter
4134
StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception",
4135
CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
4136
StubRoutines::_throw_delayed_StackOverflowError_entry = generate_throw_exception("delayed StackOverflowError throw_exception",
4137
CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError));
4139
if (UseCRC32Intrinsics) {
4140
// set table address before stub generation which use it
4141
StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
4142
StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4145
if (UseCRC32CIntrinsics) {
4146
// The same CLMUL capability flag drives both the table generation and the
// stub generation.
bool supports_clmul = VM_Version::supports_clmul();
4147
StubRoutines::x86::generate_CRC32C_table(supports_clmul);
4148
StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
4149
StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
4151
// libm stubs need SSE2 plus the UseLibmIntrinsic and InlineIntrinsics flags;
// each one is additionally gated on its intrinsic being available.
if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) {
4152
if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
4153
StubRoutines::_dexp = generate_libmExp();
4155
if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
4156
StubRoutines::_dlog = generate_libmLog();
4158
if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
4159
StubRoutines::_dlog10 = generate_libmLog10();
4161
if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
4162
StubRoutines::_dpow = generate_libmPow();
4164
// The pi04l reduction stub is generated when any of sin, cos or tan is
// available.
if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
4165
vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
4166
vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
4167
StubRoutines::_dlibm_reduce_pi04l = generate_libm_reduce_pi04l();
4169
// The sin/cos "huge" stub is generated when either sin or cos is available.
if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
4170
vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
4171
StubRoutines::_dlibm_sin_cos_huge = generate_libm_sin_cos_huge();
4173
if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
4174
StubRoutines::_dsin = generate_libmSin();
4176
if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
4177
StubRoutines::_dcos = generate_libmCos();
4179
if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
4180
StubRoutines::_dlibm_tan_cot_huge = generate_libm_tan_cot_huge();
4181
StubRoutines::_dtan = generate_libmTan();
4186
// Generates the continuation stubs (thaw, return barrier, return barrier for
// exceptions) and, in JFR builds, the JFR stubs.
void generate_continuation_stubs() {
4187
// Continuation stubs:
4188
StubRoutines::_cont_thaw = generate_cont_thaw();
4189
StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
4190
StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
4192
JFR_ONLY(generate_jfr_stubs();)
4196
// Generates the two JFR RuntimeStubs and publishes, for each, both the stub
// object and its entry point in StubRoutines.
void generate_jfr_stubs() {
4197
StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();
4198
StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();
4199
StubRoutines::_jfr_return_lease_stub = generate_jfr_return_lease();
4200
StubRoutines::_jfr_return_lease = StubRoutines::_jfr_return_lease_stub->entry_point();
4202
#endif // INCLUDE_JFR
4204
// Generates the last group of stubs: the remaining exception throwers, the
// verify_oop subroutine, the arraycopy stubs and - when the barrier set
// supplies a BarrierSetNMethod - the method entry barrier.
void generate_final_stubs() {
4205
// Generates all stubs and initializes the entry points
4207
// These entry points require SharedInfo::stack0 to be set up in non-core builds
4208
// and need to be relocatable, so they each fabricate a RuntimeStub internally.
4209
StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
4210
StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
4211
StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
4213
// support for verify_oop (must happen after universe_init)
4214
StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
4216
// arraycopy stubs used by compilers
4217
generate_arraycopy_stubs();
4219
// The entry barrier stub is skipped when the barrier set has no
// BarrierSetNMethod.
BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
4220
if (bs_nm != nullptr) {
4221
StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
4225
// Generates the stubs that exist only in C2/JVMCI builds: the vector constant
// tables and lookup tables, plus the AES, AES-CTR, MD5, SHA-1, SHA-256 and
// GHASH intrinsic stubs, each gated on its Use*Intrinsics flag.
void generate_compiler_stubs() {
4226
#if COMPILER2_OR_JVMCI
4228
// entry points that are C2/JVMCI specific
4230
// Vector constants; each generator receives the stub name and the 32-bit
// pattern(s) it is built from.
StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF);
4231
StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x80000000);
4232
StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask_long_double("vector_double_sign_mask", 0x7FFFFFFF, 0xFFFFFFFF);
4233
StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask_long_double("vector_double_sign_flip", 0x80000000, 0x00000000);
4234
StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff);
4235
StubRoutines::x86::_vector_int_to_byte_mask = generate_vector_mask("vector_int_to_byte_mask", 0x000000ff);
4236
StubRoutines::x86::_vector_int_to_short_mask = generate_vector_mask("vector_int_to_short_mask", 0x0000ffff);
4237
StubRoutines::x86::_vector_32_bit_mask = generate_vector_custom_i32("vector_32_bit_mask", Assembler::AVX_512bit,
4238
0xFFFFFFFF, 0, 0, 0);
4239
StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
4240
0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
4241
StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_mask("vector_int_shuffle_mask", 0x03020100);
4242
StubRoutines::x86::_vector_byte_shuffle_mask = generate_vector_byte_shuffle_mask("vector_byte_shuffle_mask");
4243
StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_mask("vector_short_shuffle_mask", 0x01000100);
4244
StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask_long_double("vector_long_shuffle_mask", 0x00000001, 0x0);
4245
StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
4246
StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask_long_double("vector_long_sign_mask", 0x80000000, 0x00000000);
4247
StubRoutines::x86::_vector_all_bits_set = generate_vector_mask("vector_all_bits_set", 0xFFFFFFFF);
4248
StubRoutines::x86::_vector_int_mask_cmp_bits = generate_vector_mask("vector_int_mask_cmp_bits", 0x00000001);
4249
StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
4250
StubRoutines::x86::_vector_count_leading_zeros_lut = generate_count_leading_zeros_lut("count_leading_zeros_lut");
4251
StubRoutines::x86::_vector_reverse_bit_lut = generate_vector_reverse_bit_lut("reverse_bit_lut");
4252
StubRoutines::x86::_vector_reverse_byte_perm_mask_long = generate_vector_reverse_byte_perm_mask_long("perm_mask_long");
4253
StubRoutines::x86::_vector_reverse_byte_perm_mask_int = generate_vector_reverse_byte_perm_mask_int("perm_mask_int");
4254
StubRoutines::x86::_vector_reverse_byte_perm_mask_short = generate_vector_reverse_byte_perm_mask_short("perm_mask_short");
4256
// The popcount lookup table is generated only on AVX2 hardware that lacks
// AVX512 VPOPCNTDQ.
if (VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
4257
// lut implementation influenced by counting 1s algorithm from section 5-1 of Hackers' Delight.
4258
StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");
4261
// don't bother generating these AES intrinsic stubs unless global flag is set
4262
if (UseAESIntrinsics) {
4263
StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4264
StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4265
StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4266
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
4269
if (UseAESCTRIntrinsics) {
4270
StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
4273
// For MD5, SHA-1 and SHA-256 the boolean argument distinguishes the
// implCompress stub (false) from the implCompressMB stub (true).
if (UseMD5Intrinsics) {
4274
StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
4275
StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
4277
if (UseSHA1Intrinsics) {
4278
StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
4279
StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
4280
StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
4281
StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
4283
if (UseSHA256Intrinsics) {
4284
StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
4285
StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
4286
StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
4287
StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
4290
// Generate GHASH intrinsics code
4291
if (UseGHASHIntrinsics) {
4292
StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4294
#endif // COMPILER2_OR_JVMCI
4299
// Constructs the generator over `code` and runs the generate_*_stubs routine
// that corresponds to `kind`; fatal() reports a kind that is not handled.
StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
4302
generate_initial_stubs();
4304
case Continuation_stubs:
4305
generate_continuation_stubs();
4307
case Compiler_stubs:
4308
generate_compiler_stubs();
4311
generate_final_stubs();
4314
fatal("unexpected stubs kind: %d", kind);
4318
}; // end class declaration
4320
// Free-function entry point: constructing a StubGenerator for `code` and
// `kind` performs the stub generation as part of its constructor.
void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
4321
StubGenerator g(code, kind);