/*
 * Copyright (c) 1999, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/barrierSetNMethod.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_x86.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#define __ _masm->
#define a__ ((Assembler*)_masm)->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

const int MXCSR_MASK  = 0xFFC0;  // Mask out any pending exceptions
const int FPU_CNTRL_WRD_MASK = 0xFFFF;
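// For reference (Intel SDM MXCSR layout): bits 0-5 are the sticky exception
// flags (IE, DE, ZE, OE, UE, PE), bit 6 is DAZ, bits 7-12 are the exception
// mask bits, bits 13-14 select the rounding mode and bit 15 is FTZ. Masking
// with 0xFFC0 clears just the flag bits, so the MXCSR comparisons below
// ignore pending exceptions and check only the control/mask bits.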

ATTRIBUTE_ALIGNED(16) static const uint32_t KEY_SHUFFLE_MASK[] = {
    0x00010203UL, 0x04050607UL, 0x08090A0BUL, 0x0C0D0E0FUL,
};

ATTRIBUTE_ALIGNED(16) static const uint32_t COUNTER_SHUFFLE_MASK[] = {
    0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL,
};

ATTRIBUTE_ALIGNED(16) static const uint32_t GHASH_BYTE_SWAP_MASK[] = {
    0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL,
};

ATTRIBUTE_ALIGNED(16) static const uint32_t GHASH_LONG_SWAP_MASK[] = {
    0x0B0A0908UL, 0x0F0E0D0CUL, 0x03020100UL, 0x07060504UL,
};
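
// The tables above are PSHUFB control masks: byte i of the result is taken
// from source byte mask[i]. x86 is little-endian, so 0x00010203UL puts the
// bytes 03 02 01 00 first in memory, i.e. KEY_SHUFFLE_MASK swaps the byte
// order within each 32-bit lane. A minimal sketch of how such a mask is
// applied (register choice and exact load overload are illustrative only):
//
//   __ movdqu(xmm1, ExternalAddress((address)KEY_SHUFFLE_MASK));
//   __ pshufb(xmm0, xmm1);   // xmm0 := per-dword byte swap of xmm0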

// -------------------------------------------------------------------------------------------------------------------------
// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(uint& counter) {
    __ incrementl(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif //PRODUCT

  void inc_copy_counter_np(BasicType t) {
#ifndef PRODUCT
    switch (t) {
    case T_BYTE:    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); return;
    case T_SHORT:   inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); return;
    case T_INT:     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); return;
    case T_LONG:    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); return;
    case T_OBJECT:  inc_counter_np(SharedRuntime::_oop_array_copy_ctr); return;
    default:        ShouldNotReachHere();
    }
#endif //PRODUCT
  }

  //------------------------------------------------------------------------------------------------------------------------
  // Call stubs are used to call Java from C
  //
  //    [ return_from_Java     ] <--- rsp
  //    [ argument word n      ]
  //      ...
  // -N [ argument word 1      ]
  // -7 [ Possible padding for stack alignment ]
  // -6 [ Possible padding for stack alignment ]
  // -5 [ Possible padding for stack alignment ]
  // -4 [ mxcsr save           ] <--- rsp_after_call
  // -3 [ saved rbx            ]
  // -2 [ saved rsi            ]
  // -1 [ saved rdi            ]
  //  0 [ saved rbp            ] <--- rbp
  //  1 [ return address       ]
  //  2 [ ptr. to call wrapper ]
  //  3 [ result               ]
  //  4 [ result_type          ]
  //  5 [ method               ]
  //  6 [ entry_point          ]
  //  7 [ parameters           ]
  //  8 [ parameter_size       ]
  //  9 [ thread               ]
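  //
  // This layout mirrors the C-side CallStub function pointer through which
  // the stub is invoked (declared in stubRoutines.hpp; sketch only, the
  // header is authoritative -- offsets refer to the picture above):
  //
  //   typedef void (*CallStub)(address   link,               // +2
  //                            intptr_t* result,             // +3
  //                            int       result_type,        // +4
  //                            Method*   method,             // +5
  //                            address   entry_point,        // +6
  //                            intptr_t* parameters,         // +7
  //                            int       size_of_parameters, // +8
  //                            TRAPS);                       // +9 (thread)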


  address generate_call_stub(address& return_address) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // stub code parameters / addresses
    assert(frame::entry_frame_call_wrapper_offset == 2, "adjust this code");
    bool  sse_save = false;
    const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_catch_exception()!
    const int     locals_count_in_bytes  (4*wordSize);
    const Address mxcsr_save    (rbp, -4 * wordSize);
    const Address saved_rbx     (rbp, -3 * wordSize);
    const Address saved_rsi     (rbp, -2 * wordSize);
    const Address saved_rdi     (rbp, -1 * wordSize);
    const Address result        (rbp,  3 * wordSize);
    const Address result_type   (rbp,  4 * wordSize);
    const Address method        (rbp,  5 * wordSize);
    const Address entry_point   (rbp,  6 * wordSize);
    const Address parameters    (rbp,  7 * wordSize);
    const Address parameter_size(rbp,  8 * wordSize);
    const Address thread        (rbp,  9 * wordSize); // same as in generate_catch_exception()!
    sse_save =  UseSSE > 0;

    // stub code
    __ enter();
    __ movptr(rcx, parameter_size);              // parameter counter
    __ shlptr(rcx, Interpreter::logStackElementSize); // convert parameter count to bytes
    __ addptr(rcx, locals_count_in_bytes);       // reserve space for register saves
    __ subptr(rsp, rcx);
    __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack

    // save rdi, rsi, & rbx according to C calling conventions
    __ movptr(saved_rdi, rdi);
    __ movptr(saved_rsi, rsi);
    __ movptr(saved_rbx, rbx);

    // save and initialize %mxcsr
    if (sse_save) {
      Label skip_ldmx;
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
      ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, skip_ldmx);
      __ ldmxcsr(mxcsr_std);
      __ bind(skip_ldmx);
    }

    // make sure the control word is correct.
    __ fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_std()));

#ifdef ASSERT
    // make sure we have no pending exceptions
    { Label L;
      __ movptr(rcx, thread);
      __ cmpptr(Address(rcx, Thread::pending_exception_offset()), NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // pass parameters if any
    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    __ movl(rcx, parameter_size);  // parameter counter
    __ testl(rcx, rcx);
    __ jcc(Assembler::zero, parameters_done);

    // parameter passing loop

    Label loop;
    // Copy Java parameters in reverse order (receiver last)
    // Note that the argument order is inverted in the process
    // source is rdx[rcx: N-1..0]
    // dest   is rsp[rbx: 0..N-1]

    __ movptr(rdx, parameters);          // parameter pointer
    __ xorptr(rbx, rbx);

    __ BIND(loop);

    // get parameter
    __ movptr(rax, Address(rdx, rcx, Interpreter::stackElementScale(), -wordSize));
    __ movptr(Address(rsp, rbx, Interpreter::stackElementScale(),
                    Interpreter::expr_offset_in_bytes(0)), rax);          // store parameter
    __ increment(rbx);
    __ decrement(rcx);
    __ jcc(Assembler::notZero, loop);

    // call Java function
    __ BIND(parameters_done);
    __ movptr(rbx, method);           // get Method*
    __ movptr(rax, entry_point);      // get entry_point
    __ mov(rsi, rsp);                 // set sender sp
    BLOCK_COMMENT("call Java function");
    __ call(rax);

    BLOCK_COMMENT("call_stub_return_address:");
    return_address = __ pc();

#ifdef COMPILER2
    {
      Label L_skip;
      if (UseSSE >= 2) {
        __ verify_FPU(0, "call_stub_return");
      } else {
        for (int i = 1; i < 8; i++) {
          __ ffree(i);
        }

        // UseSSE <= 1 so double result should be left on TOS
        __ movl(rsi, result_type);
        __ cmpl(rsi, T_DOUBLE);
        __ jcc(Assembler::equal, L_skip);
        if (UseSSE == 0) {
          // UseSSE == 0 so float result should be left on TOS
          __ cmpl(rsi, T_FLOAT);
          __ jcc(Assembler::equal, L_skip);
        }
        __ ffree(0);
      }
      __ BIND(L_skip);
    }
#endif // COMPILER2

    // store result depending on type
    // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    __ movptr(rdi, result);
    Label is_long, is_float, is_double, exit;
    __ movl(rsi, result_type);
    __ cmpl(rsi, T_LONG);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(rsi, T_FLOAT);
    __ jcc(Assembler::equal, is_float);
    __ cmpl(rsi, T_DOUBLE);
    __ jcc(Assembler::equal, is_double);

    // handle T_INT case
    __ movl(Address(rdi, 0), rax);
    __ BIND(exit);

    // check that FPU stack is empty
    __ verify_FPU(0, "generate_call_stub");

    // pop parameters
    __ lea(rsp, rsp_after_call);

    // restore %mxcsr
    if (sse_save) {
      __ ldmxcsr(mxcsr_save);
    }

    // restore rdi, rsi and rbx
    __ movptr(rbx, saved_rbx);
    __ movptr(rsi, saved_rsi);
    __ movptr(rdi, saved_rdi);
    __ addptr(rsp, 4*wordSize);

    // return
    __ pop(rbp);
    __ ret(0);

    // handle return types different from T_INT
    __ BIND(is_long);
    __ movl(Address(rdi, 0 * wordSize), rax);
    __ movl(Address(rdi, 1 * wordSize), rdx);
    __ jmp(exit);

    __ BIND(is_float);
    // interpreter uses xmm0 for return values
    if (UseSSE >= 1) {
      __ movflt(Address(rdi, 0), xmm0);
    } else {
      __ fstp_s(Address(rdi, 0));
    }
    __ jmp(exit);

    __ BIND(is_double);
    // interpreter uses xmm0 for return values
    if (UseSSE >= 2) {
      __ movdbl(Address(rdi, 0), xmm0);
    } else {
      __ fstp_d(Address(rdi, 0));
    }
    __ jmp(exit);

    return start;
  }


  //------------------------------------------------------------------------------------------------------------------------
  // Return point for a Java call if there's an exception thrown in Java code.
  // The exception is caught and transformed into a pending exception stored in
  // JavaThread that can be tested from within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case of an exception
  //       crossing an activation frame boundary, that is not the case if the callee
  //       is compiled code => need to setup the rsp.
  //
  // rax: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_call_stub()!
    const Address thread        (rbp,  9 * wordSize); // same as in generate_call_stub()!
    address start = __ pc();

    // get thread directly
    __ movptr(rcx, thread);
#ifdef ASSERT
    // verify that threads correspond
    { Label L;
      __ get_thread(rbx);
      __ cmpptr(rbx, rcx);
      __ jcc(Assembler::equal, L);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif
    // set pending exception
    __ verify_oop(rax);
    __ movptr(Address(rcx, Thread::pending_exception_offset()), rax);
    __ lea(Address(rcx, Thread::exception_file_offset()),
           ExternalAddress((address)__FILE__), noreg);
    __ movl(Address(rcx, Thread::exception_line_offset()), __LINE__ );
    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != nullptr, "_call_stub_return_address must have been generated before");
    __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));

    return start;
  }


  //------------------------------------------------------------------------------------------------------------------------
  // Continuation point for runtime calls returning with a pending exception.
  // The pending exception check happened in the runtime or native call stub.
  // The pending exception in Thread is converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // rax: exception
  // rdx: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be on stack !!
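  //
  // (For orientation: this stub is typically reached from
  // MacroAssembler::call_VM_base, which after returning from the VM compares
  // Thread::pending_exception_offset() against NULL_WORD and, if an
  // exception is pending, jumps to StubRoutines::forward_exception_entry() --
  // the address returned by this generator.)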

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();
    const Register thread = rcx;

    // other registers used in this stub
    const Register exception_oop = rax;
    const Register handler_addr  = rbx;
    const Register exception_pc  = rdx;

    // Upon entry, the sp points to the return address returning into Java
    // (interpreted or compiled) code; i.e., the return address becomes the
    // throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack but
    // the exception handler will reset the stack pointer -> ignore them.
    // A potential result in registers can be ignored as well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    { Label L;
      __ get_thread(thread);
      __ cmpptr(Address(thread, Thread::pending_exception_offset()), NULL_WORD);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into rbx
    __ get_thread(thread);
    __ movptr(exception_pc, Address(rsp, 0));
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, exception_pc);
    __ mov(handler_addr, rax);

    // setup rax & rdx, remove return address & clear pending exception
    __ get_thread(thread);
    __ pop(exception_pc);
    __ movptr(exception_oop, Address(thread, Thread::pending_exception_offset()));
    __ movptr(Address(thread, Thread::pending_exception_offset()), NULL_WORD);

#ifdef ASSERT
    // make sure exception is set
    { Label L;
      __ testptr(exception_oop, exception_oop);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // Verify that there is really a valid exception in RAX.
    __ verify_oop(exception_oop);

    // continue at exception handler (return address removed)
    // rax: exception
    // rbx: exception handler
    // rdx: throwing pc
    __ jmp(handler_addr);

    return start;
  }

  //----------------------------------------------------------------------------------------------------
  // Support for void verify_mxcsr()
  //
  // This routine is used with -Xcheck:jni to verify that native
  // JNI code does not return to Java code without restoring the
  // MXCSR register to our expected state.


  address generate_verify_mxcsr() {
    StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
    address start = __ pc();

    const Address mxcsr_save(rsp, 0);

    if (CheckJNICalls && UseSSE > 0) {
      Label ok_ret;
      ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
      __ push(rax);
      __ subptr(rsp, wordSize);      // allocate a temp location
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, ok_ret);

      __ warn("MXCSR changed by native JNI code.");

      __ ldmxcsr(mxcsr_std);

      __ bind(ok_ret);
      __ addptr(rsp, wordSize);
      __ pop(rax);
    }

    __ ret(0);

    return start;
  }


  //---------------------------------------------------------------------------
  // Support for void verify_fpu_cntrl_wrd()
  //
  // This routine is used with -Xcheck:jni to verify that native
  // JNI code does not return to Java code without restoring the
  // FP control word to our expected state.
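  //
  // (For reference, the x87 control word layout: bits 0-5 are the exception
  // mask bits, bits 8-9 select precision control and bits 10-11 rounding
  // control. FPU_CNTRL_WRD_MASK is 0xFFFF, so unlike the MXCSR check above
  // the full 16-bit word is compared.)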

  address generate_verify_fpu_cntrl_wrd() {
    StubCodeMark mark(this, "StubRoutines", "verify_spcw");
    address start = __ pc();

    const Address fpu_cntrl_wrd_save(rsp, 0);

    if (CheckJNICalls) {
      Label ok_ret;
      __ push(rax);
      __ subptr(rsp, wordSize);      // allocate a temp location
      __ fnstcw(fpu_cntrl_wrd_save);
      __ movl(rax, fpu_cntrl_wrd_save);
      __ andl(rax, FPU_CNTRL_WRD_MASK);
      ExternalAddress fpu_std(StubRoutines::x86::addr_fpu_cntrl_wrd_std());
      __ cmp32(rax, fpu_std);
      __ jcc(Assembler::equal, ok_ret);

      __ warn("Floating point control word changed by native JNI code.");

      __ fldcw(fpu_std);

      __ bind(ok_ret);
      __ addptr(rsp, wordSize);
      __ pop(rax);
    }

    __ ret(0);

    return start;
  }

  //---------------------------------------------------------------------------
  // Wrapper for slow-case handling of double-to-integer conversion
  // d2i or f2i fast case failed either because it is NaN or because
  // of under/overflow.
  // Input:  FPU TOS: float value
  // Output: rax (rdx): integer (long) result
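  //
  // (The C fallback implements the Java d2i/d2l rules; a sketch of the
  // semantics SharedRuntime::d2i provides, using HotSpot's jint/jdouble:
  //
  //   jint d2i(jdouble x) {
  //     if (x != x)                 return 0;         // NaN maps to zero
  //     if (x >= (jdouble)max_jint) return max_jint;  // clamp on overflow
  //     if (x <= (jdouble)min_jint) return min_jint;  // clamp on underflow
  //     return (jint)x;                               // truncate toward zero
  //   }
  // )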

  address generate_d2i_wrapper(BasicType t, address fcn) {
    StubCodeMark mark(this, "StubRoutines", "d2i_wrapper");
    address start = __ pc();

  // Capture info about frame layout
  enum layout { FPUState_off         = 0,
                rbp_off              = FPUStateSizeInWords,
                rdi_off,
                rsi_off,
                rcx_off,
                rbx_off,
                saved_argument_off,
                saved_argument_off2, // 2nd half of double
                framesize
  };

  assert(FPUStateSizeInWords == 27, "update stack layout");

    // Save outgoing argument to stack across push_FPU_state()
    __ subptr(rsp, wordSize * 2);
    __ fstp_d(Address(rsp, 0));

    // Save CPU & FPU state
    __ push(rbx);
    __ push(rcx);
    __ push(rsi);
    __ push(rdi);
    __ push(rbp);
    __ push_FPU_state();

    // push_FPU_state() resets the FP top of stack
    // Load original double into FP top of stack
    __ fld_d(Address(rsp, saved_argument_off * wordSize));
    // Store double into stack as outgoing argument
    __ subptr(rsp, wordSize*2);
    __ fst_d(Address(rsp, 0));

    // Prepare FPU for doing math in C-land
    __ empty_FPU_stack();
    // Call the C code to massage the double.  Result in EAX
    if (t == T_INT)
      { BLOCK_COMMENT("SharedRuntime::d2i"); }
    else if (t == T_LONG)
      { BLOCK_COMMENT("SharedRuntime::d2l"); }
    __ call_VM_leaf( fcn, 2 );

    // Restore CPU & FPU state
    __ pop_FPU_state();
    __ pop(rbp);
    __ pop(rdi);
    __ pop(rsi);
    __ pop(rcx);
    __ pop(rbx);
    __ addptr(rsp, wordSize * 2);

    __ ret(0);

    return start;
  }
  //---------------------------------------------------------------------------------------------------

  address generate_vector_mask(const char *stub_name, int32_t mask) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    for (int i = 0; i < 16; i++) {
      __ emit_data(mask, relocInfo::none, 0);
    }

    return start;
  }

  address generate_count_leading_zeros_lut(const char *stub_name) {
    __ align64();
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();
    __ emit_data(0x02020304, relocInfo::none, 0);
    __ emit_data(0x01010101, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x02020304, relocInfo::none, 0);
    __ emit_data(0x01010101, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x02020304, relocInfo::none, 0);
    __ emit_data(0x01010101, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x02020304, relocInfo::none, 0);
    __ emit_data(0x01010101, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    return start;
  }


  address generate_popcount_avx_lut(const char *stub_name) {
    __ align64();
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();
    __ emit_data(0x02010100, relocInfo::none, 0);
    __ emit_data(0x03020201, relocInfo::none, 0);
    __ emit_data(0x03020201, relocInfo::none, 0);
    __ emit_data(0x04030302, relocInfo::none, 0);
    __ emit_data(0x02010100, relocInfo::none, 0);
    __ emit_data(0x03020201, relocInfo::none, 0);
    __ emit_data(0x03020201, relocInfo::none, 0);
    __ emit_data(0x04030302, relocInfo::none, 0);
    __ emit_data(0x02010100, relocInfo::none, 0);
    __ emit_data(0x03020201, relocInfo::none, 0);
    __ emit_data(0x03020201, relocInfo::none, 0);
    __ emit_data(0x04030302, relocInfo::none, 0);
    __ emit_data(0x02010100, relocInfo::none, 0);
    __ emit_data(0x03020201, relocInfo::none, 0);
    __ emit_data(0x03020201, relocInfo::none, 0);
    __ emit_data(0x04030302, relocInfo::none, 0);
    return start;
  }


  address generate_iota_indices(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();
    // B
    __ emit_data(0x03020100, relocInfo::none, 0);
    __ emit_data(0x07060504, relocInfo::none, 0);
    __ emit_data(0x0B0A0908, relocInfo::none, 0);
    __ emit_data(0x0F0E0D0C, relocInfo::none, 0);
    __ emit_data(0x13121110, relocInfo::none, 0);
    __ emit_data(0x17161514, relocInfo::none, 0);
    __ emit_data(0x1B1A1918, relocInfo::none, 0);
    __ emit_data(0x1F1E1D1C, relocInfo::none, 0);
    __ emit_data(0x23222120, relocInfo::none, 0);
    __ emit_data(0x27262524, relocInfo::none, 0);
    __ emit_data(0x2B2A2928, relocInfo::none, 0);
    __ emit_data(0x2F2E2D2C, relocInfo::none, 0);
    __ emit_data(0x33323130, relocInfo::none, 0);
    __ emit_data(0x37363534, relocInfo::none, 0);
    __ emit_data(0x3B3A3938, relocInfo::none, 0);
    __ emit_data(0x3F3E3D3C, relocInfo::none, 0);

    // W
    __ emit_data(0x00010000, relocInfo::none, 0);
    __ emit_data(0x00030002, relocInfo::none, 0);
    __ emit_data(0x00050004, relocInfo::none, 0);
    __ emit_data(0x00070006, relocInfo::none, 0);
    __ emit_data(0x00090008, relocInfo::none, 0);
    __ emit_data(0x000B000A, relocInfo::none, 0);
    __ emit_data(0x000D000C, relocInfo::none, 0);
    __ emit_data(0x000F000E, relocInfo::none, 0);
    __ emit_data(0x00110010, relocInfo::none, 0);
    __ emit_data(0x00130012, relocInfo::none, 0);
    __ emit_data(0x00150014, relocInfo::none, 0);
    __ emit_data(0x00170016, relocInfo::none, 0);
    __ emit_data(0x00190018, relocInfo::none, 0);
    __ emit_data(0x001B001A, relocInfo::none, 0);
    __ emit_data(0x001D001C, relocInfo::none, 0);
    __ emit_data(0x001F001E, relocInfo::none, 0);

    // D
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000001, relocInfo::none, 0);
    __ emit_data(0x00000002, relocInfo::none, 0);
    __ emit_data(0x00000003, relocInfo::none, 0);
    __ emit_data(0x00000004, relocInfo::none, 0);
    __ emit_data(0x00000005, relocInfo::none, 0);
    __ emit_data(0x00000006, relocInfo::none, 0);
    __ emit_data(0x00000007, relocInfo::none, 0);
    __ emit_data(0x00000008, relocInfo::none, 0);
    __ emit_data(0x00000009, relocInfo::none, 0);
    __ emit_data(0x0000000A, relocInfo::none, 0);
    __ emit_data(0x0000000B, relocInfo::none, 0);
    __ emit_data(0x0000000C, relocInfo::none, 0);
    __ emit_data(0x0000000D, relocInfo::none, 0);
    __ emit_data(0x0000000E, relocInfo::none, 0);
    __ emit_data(0x0000000F, relocInfo::none, 0);

    // Q
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000001, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000002, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000003, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000004, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000005, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000006, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000007, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);

    // D - FP
    __ emit_data(0x00000000, relocInfo::none, 0); // 0.0f
    __ emit_data(0x3F800000, relocInfo::none, 0); // 1.0f
    __ emit_data(0x40000000, relocInfo::none, 0); // 2.0f
    __ emit_data(0x40400000, relocInfo::none, 0); // 3.0f
    __ emit_data(0x40800000, relocInfo::none, 0); // 4.0f
    __ emit_data(0x40A00000, relocInfo::none, 0); // 5.0f
    __ emit_data(0x40C00000, relocInfo::none, 0); // 6.0f
    __ emit_data(0x40E00000, relocInfo::none, 0); // 7.0f
    __ emit_data(0x41000000, relocInfo::none, 0); // 8.0f
    __ emit_data(0x41100000, relocInfo::none, 0); // 9.0f
    __ emit_data(0x41200000, relocInfo::none, 0); // 10.0f
    __ emit_data(0x41300000, relocInfo::none, 0); // 11.0f
    __ emit_data(0x41400000, relocInfo::none, 0); // 12.0f
    __ emit_data(0x41500000, relocInfo::none, 0); // 13.0f
    __ emit_data(0x41600000, relocInfo::none, 0); // 14.0f
    __ emit_data(0x41700000, relocInfo::none, 0); // 15.0f

    // Q - FP
    __ emit_data(0x00000000, relocInfo::none, 0); // 0.0d
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0); // 1.0d
    __ emit_data(0x3FF00000, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0); // 2.0d
    __ emit_data(0x40000000, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0); // 3.0d
    __ emit_data(0x40080000, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0); // 4.0d
    __ emit_data(0x40100000, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0); // 5.0d
    __ emit_data(0x40140000, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0); // 6.0d
    __ emit_data(0x40180000, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0); // 7.0d
    __ emit_data(0x401c0000, relocInfo::none, 0);
    return start;
  }

  address generate_vector_reverse_bit_lut(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();
    __ emit_data(0x0C040800, relocInfo::none, 0);
    __ emit_data(0x0E060A02, relocInfo::none, 0);
    __ emit_data(0x0D050901, relocInfo::none, 0);
    __ emit_data(0x0F070B03, relocInfo::none, 0);
    __ emit_data(0x0C040800, relocInfo::none, 0);
    __ emit_data(0x0E060A02, relocInfo::none, 0);
    __ emit_data(0x0D050901, relocInfo::none, 0);
    __ emit_data(0x0F070B03, relocInfo::none, 0);
    __ emit_data(0x0C040800, relocInfo::none, 0);
    __ emit_data(0x0E060A02, relocInfo::none, 0);
    __ emit_data(0x0D050901, relocInfo::none, 0);
    __ emit_data(0x0F070B03, relocInfo::none, 0);
    __ emit_data(0x0C040800, relocInfo::none, 0);
    __ emit_data(0x0E060A02, relocInfo::none, 0);
    __ emit_data(0x0D050901, relocInfo::none, 0);
    __ emit_data(0x0F070B03, relocInfo::none, 0);
    return start;
  }

  address generate_vector_reverse_byte_perm_mask_long(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();
    __ emit_data(0x04050607, relocInfo::none, 0);
    __ emit_data(0x00010203, relocInfo::none, 0);
    __ emit_data(0x0C0D0E0F, relocInfo::none, 0);
    __ emit_data(0x08090A0B, relocInfo::none, 0);
    __ emit_data(0x04050607, relocInfo::none, 0);
    __ emit_data(0x00010203, relocInfo::none, 0);
    __ emit_data(0x0C0D0E0F, relocInfo::none, 0);
    __ emit_data(0x08090A0B, relocInfo::none, 0);
    __ emit_data(0x04050607, relocInfo::none, 0);
    __ emit_data(0x00010203, relocInfo::none, 0);
    __ emit_data(0x0C0D0E0F, relocInfo::none, 0);
    __ emit_data(0x08090A0B, relocInfo::none, 0);
    __ emit_data(0x04050607, relocInfo::none, 0);
    __ emit_data(0x00010203, relocInfo::none, 0);
    __ emit_data(0x0C0D0E0F, relocInfo::none, 0);
    __ emit_data(0x08090A0B, relocInfo::none, 0);
    return start;
  }

  address generate_vector_reverse_byte_perm_mask_int(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();
    __ emit_data(0x00010203, relocInfo::none, 0);
    __ emit_data(0x04050607, relocInfo::none, 0);
    __ emit_data(0x08090A0B, relocInfo::none, 0);
    __ emit_data(0x0C0D0E0F, relocInfo::none, 0);
    __ emit_data(0x00010203, relocInfo::none, 0);
    __ emit_data(0x04050607, relocInfo::none, 0);
    __ emit_data(0x08090A0B, relocInfo::none, 0);
    __ emit_data(0x0C0D0E0F, relocInfo::none, 0);
    __ emit_data(0x00010203, relocInfo::none, 0);
    __ emit_data(0x04050607, relocInfo::none, 0);
    __ emit_data(0x08090A0B, relocInfo::none, 0);
    __ emit_data(0x0C0D0E0F, relocInfo::none, 0);
    __ emit_data(0x00010203, relocInfo::none, 0);
    __ emit_data(0x04050607, relocInfo::none, 0);
    __ emit_data(0x08090A0B, relocInfo::none, 0);
    __ emit_data(0x0C0D0E0F, relocInfo::none, 0);
    return start;
  }

  address generate_vector_reverse_byte_perm_mask_short(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();
    __ emit_data(0x02030001, relocInfo::none, 0);
    __ emit_data(0x06070405, relocInfo::none, 0);
    __ emit_data(0x0A0B0809, relocInfo::none, 0);
    __ emit_data(0x0E0F0C0D, relocInfo::none, 0);
    __ emit_data(0x02030001, relocInfo::none, 0);
    __ emit_data(0x06070405, relocInfo::none, 0);
    __ emit_data(0x0A0B0809, relocInfo::none, 0);
    __ emit_data(0x0E0F0C0D, relocInfo::none, 0);
    __ emit_data(0x02030001, relocInfo::none, 0);
    __ emit_data(0x06070405, relocInfo::none, 0);
    __ emit_data(0x0A0B0809, relocInfo::none, 0);
    __ emit_data(0x0E0F0C0D, relocInfo::none, 0);
    __ emit_data(0x02030001, relocInfo::none, 0);
    __ emit_data(0x06070405, relocInfo::none, 0);
    __ emit_data(0x0A0B0809, relocInfo::none, 0);
    __ emit_data(0x0E0F0C0D, relocInfo::none, 0);
    return start;
  }

  address generate_vector_byte_shuffle_mask(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();
    __ emit_data(0x70707070, relocInfo::none, 0);
    __ emit_data(0x70707070, relocInfo::none, 0);
    __ emit_data(0x70707070, relocInfo::none, 0);
    __ emit_data(0x70707070, relocInfo::none, 0);
    __ emit_data(0xF0F0F0F0, relocInfo::none, 0);
    __ emit_data(0xF0F0F0F0, relocInfo::none, 0);
    __ emit_data(0xF0F0F0F0, relocInfo::none, 0);
    __ emit_data(0xF0F0F0F0, relocInfo::none, 0);
    return start;
  }

  address generate_vector_mask_long_double(const char *stub_name, int32_t maskhi, int32_t masklo) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    for (int i = 0; i < 8; i++) {
      __ emit_data(masklo, relocInfo::none, 0);
      __ emit_data(maskhi, relocInfo::none, 0);
    }

    return start;
  }

  //----------------------------------------------------------------------------------------------------

  address generate_vector_byte_perm_mask(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    __ emit_data(0x00000001, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000003, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000005, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000007, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000002, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000004, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000006, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);

    return start;
  }

  address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
                                     int32_t val0, int32_t val1, int32_t val2, int32_t val3,
                                     int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
                                     int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
                                     int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    assert(len != Assembler::AVX_NoVec, "vector len must be specified");
    __ emit_data(val0, relocInfo::none, 0);
    __ emit_data(val1, relocInfo::none, 0);
    __ emit_data(val2, relocInfo::none, 0);
    __ emit_data(val3, relocInfo::none, 0);
    if (len >= Assembler::AVX_256bit) {
      __ emit_data(val4, relocInfo::none, 0);
      __ emit_data(val5, relocInfo::none, 0);
      __ emit_data(val6, relocInfo::none, 0);
      __ emit_data(val7, relocInfo::none, 0);
      if (len >= Assembler::AVX_512bit) {
        __ emit_data(val8, relocInfo::none, 0);
        __ emit_data(val9, relocInfo::none, 0);
        __ emit_data(val10, relocInfo::none, 0);
        __ emit_data(val11, relocInfo::none, 0);
        __ emit_data(val12, relocInfo::none, 0);
        __ emit_data(val13, relocInfo::none, 0);
        __ emit_data(val14, relocInfo::none, 0);
        __ emit_data(val15, relocInfo::none, 0);
      }
    }

    return start;
  }

  //----------------------------------------------------------------------------------------------------
  // Non-destructive plausibility checks for oops
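  //
  // (These checks back MacroAssembler::verify_oop(), which is a no-op unless
  // the VerifyOops flag is set; the caller sets up the stack as documented
  // at the top of the stub below.)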

  address generate_verify_oop() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    // Incoming arguments on stack after saving rax:
    //
    // [tos    ]: saved rdx
    // [tos + 1]: saved EFLAGS
    // [tos + 2]: return address
    // [tos + 3]: char* error message
    // [tos + 4]: oop   object to verify
    // [tos + 5]: saved rax - saved by caller and bashed

    Label exit, error;
    __ pushf();
    __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ push(rdx);                                // save rdx
    // make sure object is 'reasonable'
    __ movptr(rax, Address(rsp, 4 * wordSize));    // get object
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, exit);               // if obj is null it is ok

    // Check if the oop is in the right area of memory
    const int oop_mask = Universe::verify_oop_mask();
    const int oop_bits = Universe::verify_oop_bits();
    __ mov(rdx, rax);
    __ andptr(rdx, oop_mask);
    __ cmpptr(rdx, oop_bits);
    __ jcc(Assembler::notZero, error);

    // make sure klass is 'reasonable', which is not zero.
    __ movptr(rax, Address(rax, oopDesc::klass_offset_in_bytes())); // get klass
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, error);              // if klass is null it is broken

    // return if everything seems ok
    __ bind(exit);
    __ movptr(rax, Address(rsp, 5 * wordSize));  // get saved rax back
    __ pop(rdx);                                 // restore rdx
    __ popf();                                   // restore EFLAGS
    __ ret(3 * wordSize);                        // pop arguments

    // handle errors
    __ bind(error);
    __ movptr(rax, Address(rsp, 5 * wordSize));  // get saved rax back
    __ pop(rdx);                                 // get saved rdx back
    __ popf();                                   // get saved EFLAGS off stack -- will be ignored
    __ pusha();                                  // push registers (eip = return address & msg are already pushed)
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
    __ hlt();
    return start;
  }


  // Copy 64-byte chunks
  //
  // Inputs:
  //   from        - source array address
  //   to_from     - destination array address - from
  //   qword_count - 8-byte element count
  //
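  // (The "to_from = to - from" trick lets a single index register address
  // both arrays: Address(from, to_from, times_1, disp) computes
  // from + (to - from) + disp, i.e. the destination slot, while
  // Address(from, disp) is the matching source slot. For one 8-byte element:
  //
  //   __ movq(xmm0, Address(from, 0));                             // load
  //   __ movq(Address(from, to_from, Address::times_1, 0), xmm0);  // store
  // )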
  void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
    assert( UseSSE >= 2, "supported cpu only" );
    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;

    // Copy 64-byte chunks
    __ jmpb(L_copy_64_bytes);
    __ align(OptoLoopAlignment);
  __ BIND(L_copy_64_bytes_loop);

    if (UseUnalignedLoadStores) {
      if (UseAVX > 2) {
        __ evmovdqul(xmm0, Address(from, 0), Assembler::AVX_512bit);
        __ evmovdqul(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
      } else if (UseAVX == 2) {
        __ vmovdqu(xmm0, Address(from,  0));
        __ vmovdqu(Address(from, to_from, Address::times_1,  0), xmm0);
        __ vmovdqu(xmm1, Address(from, 32));
        __ vmovdqu(Address(from, to_from, Address::times_1, 32), xmm1);
      } else {
        __ movdqu(xmm0, Address(from, 0));
        __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
        __ movdqu(xmm1, Address(from, 16));
        __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
        __ movdqu(xmm2, Address(from, 32));
        __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
        __ movdqu(xmm3, Address(from, 48));
        __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
      }
    } else {
      __ movq(xmm0, Address(from, 0));
      __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
      __ movq(xmm1, Address(from, 8));
      __ movq(Address(from, to_from, Address::times_1, 8), xmm1);
      __ movq(xmm2, Address(from, 16));
      __ movq(Address(from, to_from, Address::times_1, 16), xmm2);
      __ movq(xmm3, Address(from, 24));
      __ movq(Address(from, to_from, Address::times_1, 24), xmm3);
      __ movq(xmm4, Address(from, 32));
      __ movq(Address(from, to_from, Address::times_1, 32), xmm4);
      __ movq(xmm5, Address(from, 40));
      __ movq(Address(from, to_from, Address::times_1, 40), xmm5);
      __ movq(xmm6, Address(from, 48));
      __ movq(Address(from, to_from, Address::times_1, 48), xmm6);
      __ movq(xmm7, Address(from, 56));
      __ movq(Address(from, to_from, Address::times_1, 56), xmm7);
    }

    __ addl(from, 64);
  __ BIND(L_copy_64_bytes);
    __ subl(qword_count, 8);
    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);

    if (UseUnalignedLoadStores && (UseAVX == 2)) {
      // clean upper bits of YMM registers
      __ vpxor(xmm0, xmm0);
      __ vpxor(xmm1, xmm1);
    }
    __ addl(qword_count, 8);
    __ jccb(Assembler::zero, L_exit);
    //
    // length is too short, just copy qwords
    //
  __ BIND(L_copy_8_bytes);
    __ movq(xmm0, Address(from, 0));
    __ movq(Address(from, to_from, Address::times_1), xmm0);
    __ addl(from, 8);
    __ decrement(qword_count);
    __ jcc(Assembler::greater, L_copy_8_bytes);
  __ BIND(L_exit);
  }

  address generate_disjoint_copy(BasicType t, bool aligned,
                                 Address::ScaleFactor sf,
                                 address* entry, const char *name,
                                 bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
    Label L_copy_2_bytes, L_copy_4_bytes, L_copy_64_bytes;

    int shift = Address::times_ptr - sf;
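    // (e.g. on x86_32 times_ptr == times_4, so shift is 2 for byte copies
    // (sf == times_1), 1 for shorts and 0 for ints; "2<<shift" below is
    // therefore always 8 bytes' worth of elements, whatever the type.)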

    const Register from     = rsi;  // source array address
    const Register to       = rdi;  // destination array address
    const Register count    = rcx;  // elements count
    const Register to_from  = to;   // (to - from)
    const Register saved_to = rdx;  // saved destination array address

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    __ movptr(from , Address(rsp, 12+ 4));
    __ movptr(to   , Address(rsp, 12+ 8));
    __ movl(count, Address(rsp, 12+ 12));

    if (entry != nullptr) {
      *entry = __ pc(); // Entry point from conjoint arraycopy stub.
      BLOCK_COMMENT("Entry:");
    }

    if (t == T_OBJECT) {
      __ testl(count, count);
      __ jcc(Assembler::zero, L_0_count);
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, t, from, to, count);
    {
      bool add_entry = (t != T_OBJECT && (!aligned || t == T_INT));
      // UnsafeMemoryAccess page error: continue after unsafe access
      UnsafeMemoryAccessMark umam(this, add_entry, true);
      __ subptr(to, from); // to --> to_from
      __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
      __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
      if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
        // align source address at 4 bytes address boundary
        if (t == T_BYTE) {
          // One byte misalignment happens only for byte arrays
          __ testl(from, 1);
          __ jccb(Assembler::zero, L_skip_align1);
          __ movb(rax, Address(from, 0));
          __ movb(Address(from, to_from, Address::times_1, 0), rax);
          __ increment(from);
          __ decrement(count);
        __ BIND(L_skip_align1);
        }
        // Two bytes misalignment happens only for byte and short (char) arrays
        __ testl(from, 2);
        __ jccb(Assembler::zero, L_skip_align2);
        __ movw(rax, Address(from, 0));
        __ movw(Address(from, to_from, Address::times_1, 0), rax);
        __ addptr(from, 2);
        __ subl(count, 1<<(shift-1));
      __ BIND(L_skip_align2);
      }
      if (!UseXMMForArrayCopy) {
        __ mov(rax, count);      // save 'count'
        __ shrl(count, shift); // dword count
        __ addptr(to_from, from);// restore 'to'
        __ rep_mov();
        __ subptr(to_from, from);// restore 'to_from'
        __ mov(count, rax);      // restore 'count'
        __ jmpb(L_copy_2_bytes); // all dwords were copied
      } else {
        if (!UseUnalignedLoadStores) {
          // align to 8 bytes, we know we are 4 byte aligned to start
          __ testptr(from, 4);
          __ jccb(Assembler::zero, L_copy_64_bytes);
          __ movl(rax, Address(from, 0));
          __ movl(Address(from, to_from, Address::times_1, 0), rax);
          __ addptr(from, 4);
          __ subl(count, 1<<shift);
        }
      __ BIND(L_copy_64_bytes);
        __ mov(rax, count);
        __ shrl(rax, shift+1);  // 8-byte chunk count
        //
        // Copy 8-byte chunks through XMM registers, 8 per iteration of the loop
        //
        xmm_copy_forward(from, to_from, rax);
      }
      // copy trailing dword
    __ BIND(L_copy_4_bytes);
      __ testl(count, 1<<shift);
      __ jccb(Assembler::zero, L_copy_2_bytes);
      __ movl(rax, Address(from, 0));
      __ movl(Address(from, to_from, Address::times_1, 0), rax);
      if (t == T_BYTE || t == T_SHORT) {
        __ addptr(from, 4);
      __ BIND(L_copy_2_bytes);
        // copy trailing word
        __ testl(count, 1<<(shift-1));
        __ jccb(Assembler::zero, L_copy_byte);
        __ movw(rax, Address(from, 0));
        __ movw(Address(from, to_from, Address::times_1, 0), rax);
        if (t == T_BYTE) {
          __ addptr(from, 2);
        __ BIND(L_copy_byte);
          // copy trailing byte
          __ testl(count, 1);
          __ jccb(Assembler::zero, L_exit);
          __ movb(rax, Address(from, 0));
          __ movb(Address(from, to_from, Address::times_1, 0), rax);
        __ BIND(L_exit);
        } else {
        __ BIND(L_copy_byte);
        }
      } else {
      __ BIND(L_copy_2_bytes);
      }
    }

    __ movl(count, Address(rsp, 12+12)); // reread 'count'
    bs->arraycopy_epilogue(_masm, decorators, t, from, to, count);

    if (t == T_OBJECT) {
    __ BIND(L_0_count);
    }
    inc_copy_counter_np(t);
    __ pop(rdi);
    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ vzeroupper();
    __ xorptr(rax, rax); // return 0
    __ ret(0);
    return start;
  }


  address generate_fill(BasicType t, bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    BLOCK_COMMENT("Entry:");

    const Register to       = rdi;  // destination array address
    const Register value    = rdx;  // value
    const Register count    = rsi;  // elements count

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    __ movptr(to   , Address(rsp, 12+ 4));
    __ movl(value, Address(rsp, 12+ 8));
    __ movl(count, Address(rsp, 12+ 12));

    __ generate_fill(t, aligned, to, value, count, rax, xmm0);

    __ pop(rdi);
    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }

  address generate_conjoint_copy(BasicType t, bool aligned,
                                 Address::ScaleFactor sf,
                                 address nooverlap_target,
                                 address* entry, const char *name,
                                 bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
    Label L_copy_2_bytes, L_copy_4_bytes, L_copy_8_bytes, L_copy_8_bytes_loop;

    int shift = Address::times_ptr - sf;

    const Register src   = rax;  // source array address
    const Register dst   = rdx;  // destination array address
    const Register from  = rsi;  // source array address
    const Register to    = rdi;  // destination array address
    const Register count = rcx;  // elements count
    const Register end   = rax;  // array end address

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    __ movptr(src  , Address(rsp, 12+ 4));   // from
    __ movptr(dst  , Address(rsp, 12+ 8));   // to
    __ movl2ptr(count, Address(rsp, 12+12)); // count

    if (entry != nullptr) {
      *entry = __ pc(); // Entry point from generic arraycopy stub.
      BLOCK_COMMENT("Entry:");
    }

    // nooverlap_target expects arguments in rsi and rdi.
    __ mov(from, src);
    __ mov(to  , dst);

    // arrays overlap test: dispatch to disjoint stub if necessary.
    RuntimeAddress nooverlap(nooverlap_target);
    __ cmpptr(dst, src);
    __ lea(end, Address(src, count, sf, 0)); // src + count * elem_size
    __ jump_cc(Assembler::belowEqual, nooverlap);
    __ cmpptr(dst, end);
    __ jump_cc(Assembler::aboveEqual, nooverlap);
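    // (This is the standard memmove disjointness test: a forward copy is
    // safe when dst <= src or dst >= src + count*elem_size; only when dst
    // lies strictly inside the source range do we fall through to the
    // backward, high-to-low copy below.)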
1305

1306
    if (t == T_OBJECT) {
1307
      __ testl(count, count);
1308
      __ jcc(Assembler::zero, L_0_count);
1309
    }
1310

1311
    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1312
    if (dest_uninitialized) {
1313
      decorators |= IS_DEST_UNINITIALIZED;
1314
    }
1315
    if (aligned) {
1316
      decorators |= ARRAYCOPY_ALIGNED;
1317
    }
1318

1319
    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1320
    bs->arraycopy_prologue(_masm, decorators, t, from, to, count);
1321

1322
    {
1323
      bool add_entry = (t != T_OBJECT && (!aligned || t == T_INT));
1324
      // UnsafeMemoryAccess page error: continue after unsafe access
1325
      UnsafeMemoryAccessMark umam(this, add_entry, true);
1326
      // copy from high to low
1327
      __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
1328
      __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
1329
      if (t == T_BYTE || t == T_SHORT) {
1330
        // Align the end of destination array at 4 bytes address boundary
1331
        __ lea(end, Address(dst, count, sf, 0));
1332
        if (t == T_BYTE) {
1333
          // One byte misalignment happens only for byte arrays
1334
          __ testl(end, 1);
1335
          __ jccb(Assembler::zero, L_skip_align1);
1336
          __ decrement(count);
1337
          __ movb(rdx, Address(from, count, sf, 0));
1338
          __ movb(Address(to, count, sf, 0), rdx);
1339
        __ BIND(L_skip_align1);
1340
        }
1341
        // Two bytes misalignment happens only for byte and short (char) arrays
1342
        __ testl(end, 2);
1343
        __ jccb(Assembler::zero, L_skip_align2);
1344
        __ subptr(count, 1<<(shift-1));
1345
        __ movw(rdx, Address(from, count, sf, 0));
1346
        __ movw(Address(to, count, sf, 0), rdx);
1347
      __ BIND(L_skip_align2);
1348
        __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
1349
        __ jcc(Assembler::below, L_copy_4_bytes);
1350
      }
1351

1352
      if (!UseXMMForArrayCopy) {
1353
        __ std();
1354
        __ mov(rax, count); // Save 'count'
1355
        __ mov(rdx, to);    // Save 'to'
1356
        __ lea(rsi, Address(from, count, sf, -4));
1357
        __ lea(rdi, Address(to  , count, sf, -4));
1358
        __ shrptr(count, shift); // bytes count
1359
        __ rep_mov();
1360
        __ cld();
1361
        __ mov(count, rax); // restore 'count'
1362
        __ andl(count, (1<<shift)-1);      // mask the number of rest elements
1363
        __ movptr(from, Address(rsp, 12+4)); // reread 'from'
1364
        __ mov(to, rdx);   // restore 'to'
1365
        __ jmpb(L_copy_2_bytes); // all dword were copied
1366
      } else {
1367
        // Align to 8 bytes the end of array. It is aligned to 4 bytes already.
1368
        __ testptr(end, 4);
1369
        __ jccb(Assembler::zero, L_copy_8_bytes);
1370
        __ subl(count, 1<<shift);
1371
        __ movl(rdx, Address(from, count, sf, 0));
1372
        __ movl(Address(to, count, sf, 0), rdx);
1373
        __ jmpb(L_copy_8_bytes);
1374

1375
        __ align(OptoLoopAlignment);
1376
        // Move 8 bytes
1377
      __ BIND(L_copy_8_bytes_loop);
1378
        __ movq(xmm0, Address(from, count, sf, 0));
1379
        __ movq(Address(to, count, sf, 0), xmm0);
1380
      __ BIND(L_copy_8_bytes);
1381
        __ subl(count, 2<<shift);
1382
        __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
1383
        __ addl(count, 2<<shift);
1384
      }
1385
    __ BIND(L_copy_4_bytes);
1386
      // copy prefix qword
1387
      __ testl(count, 1<<shift);
1388
      __ jccb(Assembler::zero, L_copy_2_bytes);
1389
      __ movl(rdx, Address(from, count, sf, -4));
1390
      __ movl(Address(to, count, sf, -4), rdx);
1391

1392
      if (t == T_BYTE || t == T_SHORT) {
1393
          __ subl(count, (1<<shift));
1394
        __ BIND(L_copy_2_bytes);
1395
          // copy prefix dword
1396
          __ testl(count, 1<<(shift-1));
1397
          __ jccb(Assembler::zero, L_copy_byte);
1398
          __ movw(rdx, Address(from, count, sf, -2));
1399
          __ movw(Address(to, count, sf, -2), rdx);
1400
          if (t == T_BYTE) {
1401
            __ subl(count, 1<<(shift-1));
1402
          __ BIND(L_copy_byte);
1403
            // copy prefix byte
1404
            __ testl(count, 1);
1405
            __ jccb(Assembler::zero, L_exit);
1406
            __ movb(rdx, Address(from, 0));
1407
            __ movb(Address(to, 0), rdx);
1408
          __ BIND(L_exit);
1409
          } else {
1410
          __ BIND(L_copy_byte);
1411
          }
1412
      } else {
1413
      __ BIND(L_copy_2_bytes);
1414
      }
1415
    }
1416

1417
    __ movl2ptr(count, Address(rsp, 12+12)); // reread count
1418
    bs->arraycopy_epilogue(_masm, decorators, t, from, to, count);
1419

1420
    if (t == T_OBJECT) {
1421
    __ BIND(L_0_count);
1422
    }
1423
    inc_copy_counter_np(t);
1424
    __ pop(rdi);
1425
    __ pop(rsi);
1426
    __ leave(); // required for proper stackwalking of RuntimeStub frame
1427
    __ xorptr(rax, rax); // return 0
1428
    __ ret(0);
1429
    return start;
1430
  }
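
  // Illustrative sketch (editorial, not part of the generated stub): the
  // overlap dispatch above is the classic memmove test. In C terms, with
  // size = count * elem_size:
  //
  //   if (dst <= src || dst >= src + size) goto disjoint_stub; // forward copy is safe
  //   // otherwise dst lies inside [src, src + size), so the loops above
  //   // copy from high addresses to low to keep the source intact.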


  address generate_disjoint_long_copy(address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_8_bytes, L_copy_8_bytes_loop;
    const Register from       = rax;  // source array address
    const Register to         = rdx;  // destination array address
    const Register count      = rcx;  // elements count
    const Register to_from    = rdx;  // (to - from)

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ movptr(from , Address(rsp, 8+0));       // from
    __ movptr(to   , Address(rsp, 8+4));       // to
    __ movl2ptr(count, Address(rsp, 8+8));     // count

    *entry = __ pc(); // Entry point from conjoint arraycopy stub.
    BLOCK_COMMENT("Entry:");

    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      UnsafeMemoryAccessMark umam(this, true, true);
      __ subptr(to, from); // to --> to_from
      if (UseXMMForArrayCopy) {
        xmm_copy_forward(from, to_from, count);
      } else {
        __ jmpb(L_copy_8_bytes);
        __ align(OptoLoopAlignment);
      __ BIND(L_copy_8_bytes_loop);
        __ fild_d(Address(from, 0));
        __ fistp_d(Address(from, to_from, Address::times_1));
        __ addptr(from, 8);
      __ BIND(L_copy_8_bytes);
        __ decrement(count);
        __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
      }
    }
    inc_copy_counter_np(T_LONG);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ vzeroupper();
    __ xorptr(rax, rax); // return 0
    __ ret(0);
    return start;
  }
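
  // Editorial sketch (not part of the stub): the FPU path copies one jlong
  // per iteration and keeps "to_from = to - from" so only a single pointer
  // needs incrementing; fild_d/fistp_d provides the atomic 64-bit move.
  // Roughly, in C:
  //
  //   ptrdiff_t to_from = (char*)to - (char*)from;
  //   while (count-- > 0) {
  //     *(long long*)((char*)from + to_from) = *(long long*)from;
  //     from = (char*)from + 8;
  //   }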

  address generate_conjoint_long_copy(address nooverlap_target,
                                      address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_8_bytes, L_copy_8_bytes_loop;
    const Register from       = rax;  // source array address
    const Register to         = rdx;  // destination array address
    const Register count      = rcx;  // elements count
    const Register end_from   = rax;  // source array end address

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ movptr(from , Address(rsp, 8+0));       // from
    __ movptr(to   , Address(rsp, 8+4));       // to
    __ movl2ptr(count, Address(rsp, 8+8));     // count

    *entry = __ pc(); // Entry point from generic arraycopy stub.
    BLOCK_COMMENT("Entry:");

    // arrays overlap test
    __ cmpptr(to, from);
    RuntimeAddress nooverlap(nooverlap_target);
    __ jump_cc(Assembler::belowEqual, nooverlap);
    __ lea(end_from, Address(from, count, Address::times_8, 0));
    __ cmpptr(to, end_from);
    __ movptr(from, Address(rsp, 8));  // from
    __ jump_cc(Assembler::aboveEqual, nooverlap);

    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      UnsafeMemoryAccessMark umam(this, true, true);

      __ jmpb(L_copy_8_bytes);

      __ align(OptoLoopAlignment);
    __ BIND(L_copy_8_bytes_loop);
      if (UseXMMForArrayCopy) {
        __ movq(xmm0, Address(from, count, Address::times_8));
        __ movq(Address(to, count, Address::times_8), xmm0);
      } else {
        __ fild_d(Address(from, count, Address::times_8));
        __ fistp_d(Address(to, count, Address::times_8));
      }
    __ BIND(L_copy_8_bytes);
      __ decrement(count);
      __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);

    }
    inc_copy_counter_np(T_LONG);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ xorptr(rax, rax); // return 0
    __ ret(0);
    return start;
  }


  // Helper for generating a dynamic type check.
  // The sub_klass must be one of {rbx, rdx, rsi}.
  // The temp is killed.
  void generate_type_check(Register sub_klass,
                           Address& super_check_offset_addr,
                           Address& super_klass_addr,
                           Register temp,
                           Label* L_success, Label* L_failure) {
    BLOCK_COMMENT("type_check:");

    Label L_fallthrough;
#define LOCAL_JCC(assembler_con, label_ptr)                             \
    if (label_ptr != nullptr)  __ jcc(assembler_con, *(label_ptr));        \
    else                    __ jcc(assembler_con, L_fallthrough) /*omit semi*/

    // The following is a strange variation of the fast path which requires
    // one less register, because needed values are on the argument stack.
    // __ check_klass_subtype_fast_path(sub_klass, *super_klass*, temp,
    //                                  L_success, L_failure, null);
    assert_different_registers(sub_klass, temp);

    int sc_offset = in_bytes(Klass::secondary_super_cache_offset());

    // if the pointers are equal, we are done (e.g., String[] elements)
    __ cmpptr(sub_klass, super_klass_addr);
    LOCAL_JCC(Assembler::equal, L_success);

    // check the supertype display:
    __ movl2ptr(temp, super_check_offset_addr);
    Address super_check_addr(sub_klass, temp, Address::times_1, 0);
    __ movptr(temp, super_check_addr); // load displayed supertype
    __ cmpptr(temp, super_klass_addr); // test the super type
    LOCAL_JCC(Assembler::equal, L_success);

    // if it was a primary super, we can just fail immediately
    __ cmpl(super_check_offset_addr, sc_offset);
    LOCAL_JCC(Assembler::notEqual, L_failure);

    // The repne_scan instruction uses fixed registers, which will get spilled.
    // We happen to know this works best when super_klass is in rax.
    Register super_klass = temp;
    __ movptr(super_klass, super_klass_addr);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg,
                                     L_success, L_failure);

    __ bind(L_fallthrough);

    if (L_success == nullptr) { BLOCK_COMMENT("L_success:"); }
    if (L_failure == nullptr) { BLOCK_COMMENT("L_failure:"); }

#undef LOCAL_JCC
  }
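
  // Editorial sketch (not part of the helper): the sequence above mirrors
  // the usual HotSpot subtype check. In C-like terms (names illustrative):
  //
  //   if (sub_klass == super_klass) goto success;            // exact match
  //   if (*(Klass**)((address)sub_klass + super_check_offset)
  //       == super_klass) goto success;                      // display hit
  //   if (super_check_offset != secondary_super_cache_offset)
  //     goto failure;                                        // a primary-super miss is final
  //   /* else scan the secondary supers list (the slow path above) */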

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    4(rsp)   - source array address
  //    8(rsp)   - destination array address
  //   12(rsp)   - element count, can be zero
  //   16(rsp)   - size_t ckoff (super_check_offset)
  //   20(rsp)   - oop ckval (super_klass)
  //
  //  Output:
  //    rax, ==  0  -  success
  //    rax, == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address* entry, bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_load_element, L_store_element, L_do_card_marks, L_done;

    // register use:
    //  rax, rdx, rcx -- loop control (end_from, end_to, count)
    //  rdi, rsi      -- element access (oop, klass)
    //  rbx,           -- temp
    const Register from       = rax;    // source array address
    const Register to         = rdx;    // destination array address
    const Register length     = rcx;    // elements count
    const Register elem       = rdi;    // each oop copied
    const Register elem_klass = rsi;    // each elem._klass (sub_klass)
    const Register temp       = rbx;    // lone remaining temp

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ push(rsi);
    __ push(rdi);
    __ push(rbx);

    Address   from_arg(rsp, 16+ 4);     // from
    Address     to_arg(rsp, 16+ 8);     // to
    Address length_arg(rsp, 16+12);     // elements count
    Address  ckoff_arg(rsp, 16+16);     // super_check_offset
    Address  ckval_arg(rsp, 16+20);     // super_klass

    // Load up:
    __ movptr(from,     from_arg);
    __ movptr(to,         to_arg);
    __ movl2ptr(length, length_arg);

    if (entry != nullptr) {
      *entry = __ pc(); // Entry point from generic arraycopy stub.
      BLOCK_COMMENT("Entry:");
    }

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    // Loop-invariant addresses.  They are exclusive end pointers.
    Address end_from_addr(from, length, Address::times_ptr, 0);
    Address   end_to_addr(to,   length, Address::times_ptr, 0);

    Register end_from = from;           // re-use
    Register end_to   = to;             // re-use
    Register count    = length;         // re-use

    // Loop-variant addresses.  They assume post-incremented count < 0.
    Address from_element_addr(end_from, count, Address::times_ptr, 0);
    Address   to_element_addr(end_to,   count, Address::times_ptr, 0);
    Address elem_klass_addr(elem, oopDesc::klass_offset_in_bytes());

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }

    BasicType type = T_OBJECT;
    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

    // Copy from low to high addresses, indexed from the end of each array.
    __ lea(end_from, end_from_addr);
    __ lea(end_to,   end_to_addr);
    assert(length == count, "");        // else fix next line:
    __ negptr(count);                   // negate and test the length
    __ jccb(Assembler::notZero, L_load_element);

    // Empty array:  Nothing to do.
    __ xorptr(rax, rax);                  // return 0 on (trivial) success
    __ jmp(L_done);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (count = -count; count != 0; count++)
    // Base pointers src, dst are biased by wordSize*count, to the last element.
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ movptr(to_element_addr, elem);     // store the oop
    __ increment(count);                // increment the count toward zero
    __ jccb(Assembler::zero, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ movptr(elem, from_element_addr);   // load the oop
    __ testptr(elem, elem);
    __ jccb(Assembler::zero, L_store_element);

    // (Could do a trick here:  Remember last successful non-null
    // element stored and make a quick oop equality check on it.)

    __ movptr(elem_klass, elem_klass_addr); // query the object klass
    generate_type_check(elem_klass, ckoff_arg, ckval_arg, temp,
                        &L_store_element, nullptr);
    // (On fall-through, we have failed the element type check.)
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register "count" = -1 * number of *remaining* oops, length_arg = *total* oops.
    // Emit GC store barriers for the oops we have copied (length_arg + count),
    // and report their number to the caller.
    assert_different_registers(to, count, rax);
    Label L_post_barrier;
    __ addl(count, length_arg);         // transfers = (length - remaining)
    __ movl2ptr(rax, count);            // save the value
    __ notptr(rax);                     // report (-1^K) to caller (does not affect flags)
    __ jccb(Assembler::notZero, L_post_barrier);
    __ jmp(L_done); // K == 0, nothing was copied, skip post barrier

    // Come here on success only.
    __ BIND(L_do_card_marks);
    __ xorptr(rax, rax);                // return 0 on success
    __ movl2ptr(count, length_arg);

    __ BIND(L_post_barrier);
    __ movptr(to, to_arg);              // reload
    bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);

    // Common exit point (success or failure).
    __ BIND(L_done);
    __ pop(rbx);
    __ pop(rdi);
    __ pop(rsi);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
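
  // Editorial note on the -1^K convention above: on a failing element check
  // the stub returns the bitwise complement of the number of elements already
  // stored, so the caller recovers it with a single NOT. For example K == 3
  // yields rax == ~3 == -4 (0xFFFFFFFC) and K = ~rax; a failure before any
  // element was stored returns ~0 == -1, while 0 is reserved for complete
  // success.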

  //
  //  Generate 'unsafe' array copy stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t argument instead of an element count.
  //
  //  Input:
  //    4(rsp)   - source array address
  //    8(rsp)   - destination array address
  //   12(rsp)   - byte count, can be zero
  //
  //  Output:
  //    rax, ==  0  -  success
  //    rax, == -1  -  need to call System.arraycopy
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
  //
  address generate_unsafe_copy(const char *name,
                               address byte_copy_entry,
                               address short_copy_entry,
                               address int_copy_entry,
                               address long_copy_entry) {

    Label L_long_aligned, L_int_aligned, L_short_aligned;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    const Register from       = rax;  // source array address
    const Register to         = rdx;  // destination array address
    const Register count      = rcx;  // elements count

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    Address  from_arg(rsp, 12+ 4);      // from
    Address    to_arg(rsp, 12+ 8);      // to
    Address count_arg(rsp, 12+12);      // byte count

    // Load up:
    __ movptr(from ,  from_arg);
    __ movptr(to   ,    to_arg);
    __ movl2ptr(count, count_arg);

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);

    const Register bits = rsi;
    __ mov(bits, from);
    __ orptr(bits, to);
    __ orptr(bits, count);

    __ testl(bits, BytesPerLong-1);
    __ jccb(Assembler::zero, L_long_aligned);

    __ testl(bits, BytesPerInt-1);
    __ jccb(Assembler::zero, L_int_aligned);

    __ testl(bits, BytesPerShort-1);
    __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));

    __ BIND(L_short_aligned);
    __ shrptr(count, LogBytesPerShort); // size => short_count
    __ movl(count_arg, count);          // update 'count'
    __ jump(RuntimeAddress(short_copy_entry));

    __ BIND(L_int_aligned);
    __ shrptr(count, LogBytesPerInt); // size => int_count
    __ movl(count_arg, count);          // update 'count'
    __ jump(RuntimeAddress(int_copy_entry));

    __ BIND(L_long_aligned);
    __ shrptr(count, LogBytesPerLong); // size => qword_count
    __ movl(count_arg, count);          // update 'count'
    __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
    __ pop(rsi);
    __ jump(RuntimeAddress(long_copy_entry));

    return start;
  }
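
  // Editorial sketch (not part of the stub): the dispatch ORs both pointers
  // and the byte count together so a single mask test proves their common
  // alignment. Roughly, in C (helper names illustrative):
  //
  //   uintptr_t bits = (uintptr_t)from | (uintptr_t)to | (uintptr_t)count;
  //   if      ((bits & (BytesPerLong  - 1)) == 0) copy_longs (from, to, count >> 3);
  //   else if ((bits & (BytesPerInt   - 1)) == 0) copy_ints  (from, to, count >> 2);
  //   else if ((bits & (BytesPerShort - 1)) == 0) copy_shorts(from, to, count >> 1);
  //   else                                        copy_bytes (from, to, count);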


  // Perform range checks on the proposed arraycopy.
  // Smashes src_pos and dst_pos.  (Uses them up for temps.)
  void arraycopy_range_checks(Register src,
                              Register src_pos,
                              Register dst,
                              Register dst_pos,
                              Address& length,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");
    const Register src_end = src_pos;   // source array end position
    const Register dst_end = dst_pos;   // destination array end position
    __ addl(src_end, length); // src_pos + length
    __ addl(dst_end, length); // dst_pos + length

    //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
    __ cmpl(src_end, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ jcc(Assembler::above, L_failed);

    //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
    __ cmpl(dst_end, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ jcc(Assembler::above, L_failed);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }
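
  // Editorial note: src_pos, dst_pos and length have already been verified
  // non-negative by the caller, so one unsigned compare per array suffices;
  // a sum exceeding INT_MAX cannot wrap a 32-bit unsigned add and is still
  // correctly rejected by the 'above' test. In C terms:
  //
  //   if ((uint32_t)(src_pos + length) > (uint32_t)src->length) goto L_failed;
  //   if ((uint32_t)(dst_pos + length) > (uint32_t)dst->length) goto L_failed;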


  //
  //  Generate generic array copy stubs
  //
  //  Input:
  //     4(rsp)    -  src oop
  //     8(rsp)    -  src_pos
  //    12(rsp)    -  dst oop
  //    16(rsp)    -  dst_pos
  //    20(rsp)    -  element count
  //
  //  Output:
  //    rax, ==  0  -  success
  //    rax, == -1^K - failure, where K is partial transfer count
  //
  address generate_generic_copy(const char *name,
                                address entry_jbyte_arraycopy,
                                address entry_jshort_arraycopy,
                                address entry_jint_arraycopy,
                                address entry_oop_arraycopy,
                                address entry_jlong_arraycopy,
                                address entry_checkcast_arraycopy) {
    Label L_failed, L_failed_0, L_objArray;

    { int modulus = CodeEntryAlignment;
      int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
      int advance = target - (__ offset() % modulus);
      if (advance < 0)  advance += modulus;
      if (advance > 0)  __ nop(advance);
    }
    StubCodeMark mark(this, "StubRoutines", name);

    // Short-hop target to L_failed.  Makes for denser prologue code.
    __ BIND(L_failed_0);
    __ jmp(L_failed);
    assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");

    __ align(CodeEntryAlignment);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

    // Input values
    Address SRC     (rsp, 12+ 4);
    Address SRC_POS (rsp, 12+ 8);
    Address DST     (rsp, 12+12);
    Address DST_POS (rsp, 12+16);
    Address LENGTH  (rsp, 12+20);

    //-----------------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the following conditions are met:
    //
    // (1) src and dst must not be null.
    // (2) src_pos must not be negative.
    // (3) dst_pos must not be negative.
    // (4) length  must not be negative.
    // (5) src klass and dst klass should be the same and not null.
    // (6) src and dst should be arrays.
    // (7) src_pos + length must not exceed length of src.
    // (8) dst_pos + length must not exceed length of dst.
    //

    const Register src     = rax;       // source array oop
    const Register src_pos = rsi;
    const Register dst     = rdx;       // destination array oop
    const Register dst_pos = rdi;
    const Register length  = rcx;       // transfer count

    //  if (src == nullptr) return -1;
    __ movptr(src, SRC);      // src oop
    __ testptr(src, src);
    __ jccb(Assembler::zero, L_failed_0);

    //  if (src_pos < 0) return -1;
    __ movl2ptr(src_pos, SRC_POS);  // src_pos
    __ testl(src_pos, src_pos);
    __ jccb(Assembler::negative, L_failed_0);

    //  if (dst == nullptr) return -1;
    __ movptr(dst, DST);      // dst oop
    __ testptr(dst, dst);
    __ jccb(Assembler::zero, L_failed_0);

    //  if (dst_pos < 0) return -1;
    __ movl2ptr(dst_pos, DST_POS);  // dst_pos
    __ testl(dst_pos, dst_pos);
    __ jccb(Assembler::negative, L_failed_0);

    //  if (length < 0) return -1;
    __ movl2ptr(length, LENGTH);   // length
    __ testl(length, length);
    __ jccb(Assembler::negative, L_failed_0);

    //  if (src->klass() == nullptr) return -1;
    Address src_klass_addr(src, oopDesc::klass_offset_in_bytes());
    Address dst_klass_addr(dst, oopDesc::klass_offset_in_bytes());
    const Register rcx_src_klass = rcx;    // array klass
    __ movptr(rcx_src_klass, Address(src, oopDesc::klass_offset_in_bytes()));

#ifdef ASSERT
    //  assert(src->klass() != nullptr);
    BLOCK_COMMENT("assert klasses not null");
    { Label L1, L2;
      __ testptr(rcx_src_klass, rcx_src_klass);
      __ jccb(Assembler::notZero, L2);   // it is broken if klass is null
      __ bind(L1);
      __ stop("broken null klass");
      __ bind(L2);
      __ cmpptr(dst_klass_addr, NULL_WORD);
      __ jccb(Assembler::equal, L1);      // this would be broken also
      BLOCK_COMMENT("assert done");
    }
#endif //ASSERT

    // Load layout helper (32-bits)
    //
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    // 32        30    24            16              8     2                 0
    //
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //
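    // Editorial example (assumed values, for illustration only): the packing
    // is lh = (array_tag << 30) | (header_size << 16)
    //       | (element_type << 8) | log2_element_size,
    // so a typeArray of jints with a 16-byte header would decode to
    // header_size == 16 and log2_element_size == 2 (the exact header size
    // depends on the platform's object header layout).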

    int lh_offset = in_bytes(Klass::layout_helper_offset());
    Address src_klass_lh_addr(rcx_src_klass, lh_offset);

    // Handle objArrays completely differently...
    jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
    __ cmpl(src_klass_lh_addr, objArray_lh);
    __ jcc(Assembler::equal, L_objArray);

    //  if (src->klass() != dst->klass()) return -1;
    __ cmpptr(rcx_src_klass, dst_klass_addr);
    __ jccb(Assembler::notEqual, L_failed_0);

    const Register rcx_lh = rcx;  // layout helper
    assert(rcx_lh == rcx_src_klass, "known alias");
    __ movl(rcx_lh, src_klass_lh_addr);

    //  if (!src->is_Array()) return -1;
    __ cmpl(rcx_lh, Klass::_lh_neutral_value);
    __ jcc(Assembler::greaterEqual, L_failed_0); // signed cmp

    // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
    { Label L;
      __ cmpl(rcx_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
      __ jcc(Assembler::greaterEqual, L); // signed cmp
      __ stop("must be a primitive array");
      __ bind(L);
    }
#endif

    assert_different_registers(src, src_pos, dst, dst_pos, rcx_lh);
    arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);

    // TypeArrayKlass
    //
    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
    //
    const Register rsi_offset = rsi; // array offset
    const Register src_array  = src; // src array offset
    const Register dst_array  = dst; // dst array offset
    const Register rdi_elsize = rdi; // log2 element size

    __ mov(rsi_offset, rcx_lh);
    __ shrptr(rsi_offset, Klass::_lh_header_size_shift);
    __ andptr(rsi_offset, Klass::_lh_header_size_mask);   // array_offset
    __ addptr(src_array, rsi_offset);  // src array offset
    __ addptr(dst_array, rsi_offset);  // dst array offset
    __ andptr(rcx_lh, Klass::_lh_log2_element_size_mask); // log2 elsize

    // next registers should be set before the jump to corresponding stub
    const Register from       = src; // source array address
    const Register to         = dst; // destination array address
    const Register count      = rcx; // elements count
    // some of them should be duplicated on stack
#define FROM   Address(rsp, 12+ 4)
#define TO     Address(rsp, 12+ 8)   // Not used now
#define COUNT  Address(rsp, 12+12)   // Only for oop arraycopy

    BLOCK_COMMENT("scale indexes to element size");
    __ movl2ptr(rsi, SRC_POS);  // src_pos
    __ shlptr(rsi);             // src_pos << rcx (log2 elsize)
    assert(src_array == from, "");
    __ addptr(from, rsi);       // from = src_array + SRC_POS << log2 elsize
    __ movl2ptr(rdi, DST_POS);  // dst_pos
    __ shlptr(rdi);             // dst_pos << rcx (log2 elsize)
    assert(dst_array == to, "");
    __ addptr(to,  rdi);        // to   = dst_array + DST_POS << log2 elsize
    __ movptr(FROM, from);      // src_addr
    __ mov(rdi_elsize, rcx_lh); // log2 elsize
    __ movl2ptr(count, LENGTH); // elements count

    BLOCK_COMMENT("choose copy loop based on element size");
    __ cmpl(rdi_elsize, 0);

    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jbyte_arraycopy));
    __ cmpl(rdi_elsize, LogBytesPerShort);
    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jshort_arraycopy));
    __ cmpl(rdi_elsize, LogBytesPerInt);
    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jint_arraycopy));
#ifdef ASSERT
    __ cmpl(rdi_elsize, LogBytesPerLong);
    __ jccb(Assembler::notEqual, L_failed);
#endif
    __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
    __ pop(rsi);
    __ jump(RuntimeAddress(entry_jlong_arraycopy));

  __ BIND(L_failed);
    __ xorptr(rax, rax);
    __ notptr(rax); // return -1
    __ pop(rdi);
    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    // ObjArrayKlass
  __ BIND(L_objArray);
    // live at this point:  rcx_src_klass, src[_pos], dst[_pos]

    Label L_plain_copy, L_checkcast_copy;
    //  test array classes for subtyping
    __ cmpptr(rcx_src_klass, dst_klass_addr); // usual case is exact equality
    __ jccb(Assembler::notEqual, L_checkcast_copy);

    // Identically typed arrays can be copied without element-wise checks.
    assert_different_registers(src, src_pos, dst, dst_pos, rcx_src_klass);
    arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);

  __ BIND(L_plain_copy);
    __ movl2ptr(count, LENGTH); // elements count
    __ movl2ptr(src_pos, SRC_POS);  // reload src_pos
    __ lea(from, Address(src, src_pos, Address::times_ptr,
                 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
    __ movl2ptr(dst_pos, DST_POS);  // reload dst_pos
    __ lea(to,   Address(dst, dst_pos, Address::times_ptr,
                 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
    __ movptr(FROM,  from);   // src_addr
    __ movptr(TO,    to);     // dst_addr
    __ movl(COUNT, count);  // count
    __ jump(RuntimeAddress(entry_oop_arraycopy));

  __ BIND(L_checkcast_copy);
    // live at this point:  rcx_src_klass, dst[_pos], src[_pos]
    {
      // Handy offsets:
      int  ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
      int sco_offset = in_bytes(Klass::super_check_offset_offset());

      Register rsi_dst_klass = rsi;
      Register rdi_temp      = rdi;
      assert(rsi_dst_klass == src_pos, "expected alias w/ src_pos");
      assert(rdi_temp      == dst_pos, "expected alias w/ dst_pos");
      Address dst_klass_lh_addr(rsi_dst_klass, lh_offset);

      // Before looking at dst.length, make sure dst is also an objArray.
      __ movptr(rsi_dst_klass, dst_klass_addr);
      __ cmpl(dst_klass_lh_addr, objArray_lh);
      __ jccb(Assembler::notEqual, L_failed);

      // It is safe to examine both src.length and dst.length.
      __ movl2ptr(src_pos, SRC_POS);        // reload rsi
      arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
      // (Now src_pos and dst_pos are killed, but not src and dst.)

      // We'll need this temp (don't forget to pop it after the type check).
      __ push(rbx);
      Register rbx_src_klass = rbx;

      __ mov(rbx_src_klass, rcx_src_klass); // spill away from rcx
      __ movptr(rsi_dst_klass, dst_klass_addr);
      Address super_check_offset_addr(rsi_dst_klass, sco_offset);
      Label L_fail_array_check;
      generate_type_check(rbx_src_klass,
                          super_check_offset_addr, dst_klass_addr,
                          rdi_temp, nullptr, &L_fail_array_check);
      // (On fall-through, we have passed the array type check.)
      __ pop(rbx);
      __ jmp(L_plain_copy);

      __ BIND(L_fail_array_check);
      // Reshuffle arguments so we can call checkcast_arraycopy:

      // match initial saves for checkcast_arraycopy
      // push(rsi);    // already done; see above
      // push(rdi);    // already done; see above
      // push(rbx);    // already done; see above

      // Marshal outgoing arguments now, freeing registers.
      Address   from_arg(rsp, 16+ 4);   // from
      Address     to_arg(rsp, 16+ 8);   // to
      Address length_arg(rsp, 16+12);   // elements count
      Address  ckoff_arg(rsp, 16+16);   // super_check_offset
      Address  ckval_arg(rsp, 16+20);   // super_klass

      Address SRC_POS_arg(rsp, 16+ 8);
      Address DST_POS_arg(rsp, 16+16);
      Address  LENGTH_arg(rsp, 16+20);
      // pushing rbx changed the incoming offsets (why not just use rbp,??)
      // assert(SRC_POS_arg.disp() == SRC_POS.disp() + 4, "");

      __ movptr(rbx, Address(rsi_dst_klass, ek_offset));
      __ movl2ptr(length, LENGTH_arg);    // reload elements count
      __ movl2ptr(src_pos, SRC_POS_arg);  // reload src_pos
      __ movl2ptr(dst_pos, DST_POS_arg);  // reload dst_pos

      __ movptr(ckval_arg, rbx);          // destination element type
      __ movl(rbx, Address(rbx, sco_offset));
      __ movl(ckoff_arg, rbx);          // corresponding class check offset

      __ movl(length_arg, length);      // outgoing length argument

      __ lea(from, Address(src, src_pos, Address::times_ptr,
                            arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
      __ movptr(from_arg, from);

      __ lea(to, Address(dst, dst_pos, Address::times_ptr,
                          arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
      __ movptr(to_arg, to);
      __ jump(RuntimeAddress(entry_checkcast_arraycopy));
    }

    return start;
  }
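
  // Editorial sketch (not part of the stub): the prologue above implements
  // conditions (1)-(8) as a fall-through filter. Schematically, in C
  // (helper names illustrative):
  //
  //   if (src == NULL || dst == NULL)               return -1;
  //   if (src_pos < 0 || dst_pos < 0 || length < 0) return -1;
  //   if (klass(src) != klass(dst) ||
  //       !is_array(klass(src)))                    return -1;  // primitive path
  //   range_checks(...);                            // conditions (7) and (8)
  //   tail_call(size_specific_copy_stub);           // byte/short/int/long
  //   // objArray pairs instead take the subtype-checked path above.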

  void generate_arraycopy_stubs() {
    address entry;
    address entry_jbyte_arraycopy;
    address entry_jshort_arraycopy;
    address entry_jint_arraycopy;
    address entry_oop_arraycopy;
    address entry_jlong_arraycopy;
    address entry_checkcast_arraycopy;

    StubRoutines::_arrayof_jbyte_disjoint_arraycopy =
        generate_disjoint_copy(T_BYTE,  true, Address::times_1, &entry,
                               "arrayof_jbyte_disjoint_arraycopy");
    StubRoutines::_arrayof_jbyte_arraycopy =
        generate_conjoint_copy(T_BYTE,  true, Address::times_1,  entry,
                               nullptr, "arrayof_jbyte_arraycopy");
    StubRoutines::_jbyte_disjoint_arraycopy =
        generate_disjoint_copy(T_BYTE, false, Address::times_1, &entry,
                               "jbyte_disjoint_arraycopy");
    StubRoutines::_jbyte_arraycopy =
        generate_conjoint_copy(T_BYTE, false, Address::times_1,  entry,
                               &entry_jbyte_arraycopy, "jbyte_arraycopy");

    StubRoutines::_arrayof_jshort_disjoint_arraycopy =
        generate_disjoint_copy(T_SHORT,  true, Address::times_2, &entry,
                               "arrayof_jshort_disjoint_arraycopy");
    StubRoutines::_arrayof_jshort_arraycopy =
        generate_conjoint_copy(T_SHORT,  true, Address::times_2,  entry,
                               nullptr, "arrayof_jshort_arraycopy");
    StubRoutines::_jshort_disjoint_arraycopy =
        generate_disjoint_copy(T_SHORT, false, Address::times_2, &entry,
                               "jshort_disjoint_arraycopy");
    StubRoutines::_jshort_arraycopy =
        generate_conjoint_copy(T_SHORT, false, Address::times_2,  entry,
                               &entry_jshort_arraycopy, "jshort_arraycopy");

    // Next arrays are always aligned on 4 bytes at least.
    StubRoutines::_jint_disjoint_arraycopy =
        generate_disjoint_copy(T_INT, true, Address::times_4, &entry,
                               "jint_disjoint_arraycopy");
    StubRoutines::_jint_arraycopy =
        generate_conjoint_copy(T_INT, true, Address::times_4,  entry,
                               &entry_jint_arraycopy, "jint_arraycopy");

    StubRoutines::_oop_disjoint_arraycopy =
        generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
                               "oop_disjoint_arraycopy");
    StubRoutines::_oop_arraycopy =
        generate_conjoint_copy(T_OBJECT, true, Address::times_ptr,  entry,
                               &entry_oop_arraycopy, "oop_arraycopy");

    StubRoutines::_oop_disjoint_arraycopy_uninit =
        generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
                               "oop_disjoint_arraycopy_uninit",
                               /*dest_uninitialized*/true);
    StubRoutines::_oop_arraycopy_uninit =
        generate_conjoint_copy(T_OBJECT, true, Address::times_ptr,  entry,
                               nullptr, "oop_arraycopy_uninit",
                               /*dest_uninitialized*/true);

    StubRoutines::_jlong_disjoint_arraycopy =
        generate_disjoint_long_copy(&entry, "jlong_disjoint_arraycopy");
    StubRoutines::_jlong_arraycopy =
        generate_conjoint_long_copy(entry, &entry_jlong_arraycopy,
                                    "jlong_arraycopy");

    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");

    StubRoutines::_arrayof_jint_disjoint_arraycopy       = StubRoutines::_jint_disjoint_arraycopy;
    StubRoutines::_arrayof_oop_disjoint_arraycopy        = StubRoutines::_oop_disjoint_arraycopy;
    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
    StubRoutines::_arrayof_jlong_disjoint_arraycopy      = StubRoutines::_jlong_disjoint_arraycopy;

    StubRoutines::_arrayof_jint_arraycopy       = StubRoutines::_jint_arraycopy;
    StubRoutines::_arrayof_oop_arraycopy        = StubRoutines::_oop_arraycopy;
    StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit;
    StubRoutines::_arrayof_jlong_arraycopy      = StubRoutines::_jlong_arraycopy;

    StubRoutines::_checkcast_arraycopy =
        generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
    StubRoutines::_checkcast_arraycopy_uninit =
        generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr, /*dest_uninitialized*/true);

    StubRoutines::_unsafe_arraycopy =
        generate_unsafe_copy("unsafe_arraycopy",
                               entry_jbyte_arraycopy,
                               entry_jshort_arraycopy,
                               entry_jint_arraycopy,
                               entry_jlong_arraycopy);

    StubRoutines::_generic_arraycopy =
        generate_generic_copy("generic_arraycopy",
                               entry_jbyte_arraycopy,
                               entry_jshort_arraycopy,
                               entry_jint_arraycopy,
                               entry_oop_arraycopy,
                               entry_jlong_arraycopy,
                               entry_checkcast_arraycopy);
  }

  // AES intrinsic stubs
  enum {AESBlockSize = 16};

  address key_shuffle_mask_addr() {
    return (address)KEY_SHUFFLE_MASK;
  }

  address counter_shuffle_mask_addr() {
    return (address)COUNTER_SHUFFLE_MASK;
  }

  // Utility routine for loading a 128-bit key word in little endian format;
  // the shuffle mask may optionally be supplied already loaded in an xmm register.
  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = xnoreg) {
    __ movdqu(xmmdst, Address(key, offset));
    if (xmm_shuf_mask != xnoreg) {
      __ pshufb(xmmdst, xmm_shuf_mask);
    } else {
      __ pshufb(xmmdst, ExternalAddress(key_shuffle_mask_addr()));
    }
  }

  // aesenc using the specified key+offset;
  // the shuffle mask may optionally be supplied already loaded in an xmm register.
  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask = xnoreg) {
    load_key(xmmtmp, key, offset, xmm_shuf_mask);
    __ aesenc(xmmdst, xmmtmp);
  }

  // aesdec using the specified key+offset;
  // the shuffle mask may optionally be supplied already loaded in an xmm register.
  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask = xnoreg) {
    load_key(xmmtmp, key, offset, xmm_shuf_mask);
    __ aesdec(xmmdst, xmmtmp);
  }

  // Utility routine for incrementing the 128-bit counter (the iv in CTR mode),
  // laid out as XMM_128bit: D3, D2, D1, D0
  void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
    __ pextrd(reg, xmmdst, 0x0);
    __ addl(reg, inc_delta);
    __ pinsrd(xmmdst, reg, 0x0);
    __ jcc(Assembler::carryClear, next_block); // jump if no carry

    __ pextrd(reg, xmmdst, 0x01); // Carry-> D1
    __ addl(reg, 0x01);
    __ pinsrd(xmmdst, reg, 0x01);
    __ jcc(Assembler::carryClear, next_block); // jump if no carry

    __ pextrd(reg, xmmdst, 0x02); // Carry-> D2
    __ addl(reg, 0x01);
    __ pinsrd(xmmdst, reg, 0x02);
    __ jcc(Assembler::carryClear, next_block); // jump if no carry

    __ pextrd(reg, xmmdst, 0x03); // Carry -> D3
    __ addl(reg, 0x01);
    __ pinsrd(xmmdst, reg, 0x03);

    __ BIND(next_block);          // next instruction
  }
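
  // Editorial sketch (not part of the routine): this is multi-precision
  // addition over four 32-bit limbs, least significant limb first. In C:
  //
  //   uint32_t d[4];              // d[0] == D0 above
  //   uint32_t old = d[0];
  //   d[0] += inc_delta;
  //   if (d[0] < old && ++d[1] == 0 && ++d[2] == 0) ++d[3];  // ripple the carry
  //
  // The short-circuiting && matches the three "jump if no carry" exits.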


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_encryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
    Label L_doLast;
    address start = __ pc();

    const Register from        = rdx;      // source array address
    const Register to          = rdx;      // destination array address
    const Register key         = rcx;      // key array address
    const Register keylen      = rax;
    const Address  from_param(rbp, 8+0);
    const Address  to_param  (rbp, 8+4);
    const Address  key_param (rbp, 8+8);

    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_key_shuf_mask = xmm1;
    const XMMRegister xmm_temp1  = xmm2;
    const XMMRegister xmm_temp2  = xmm3;
    const XMMRegister xmm_temp3  = xmm4;
    const XMMRegister xmm_temp4  = xmm5;

    __ enter();   // required for proper stackwalking of RuntimeStub frame

    __ movptr(from, from_param);
    __ movptr(key, key_param);

    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
    __ movptr(to, to_param);

    // For encryption, the java expanded key ordering is just what we need

    load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
    __ pxor(xmm_result, xmm_temp1);

    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);
    __ aesenc(xmm_result, xmm_temp3);
    __ aesenc(xmm_result, xmm_temp4);

    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);
    __ aesenc(xmm_result, xmm_temp3);
    __ aesenc(xmm_result, xmm_temp4);

    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);

    __ cmpl(keylen, 44);
    __ jccb(Assembler::equal, L_doLast);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);

    __ cmpl(keylen, 52);
    __ jccb(Assembler::equal, L_doLast);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
    __ aesenc(xmm_result, xmm_temp1);
    __ aesenclast(xmm_result, xmm_temp2);
    __ movdqu(Address(to, 0), xmm_result);        // store the result
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
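
  // Editorial note: keylen above counts 32-bit words of the expanded key,
  // i.e. 4 * (rounds + 1), so 44/52/60 selects AES-128/192/256 with
  // 10/12/14 rounds. The two keylen branches unroll exactly that: 9, 11 or
  // 13 aesenc steps followed by one aesenclast on the final round key.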


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;
    address start = __ pc();

    const Register from        = rdx;      // source array address
    const Register to          = rdx;      // destination array address
    const Register key         = rcx;      // key array address
    const Register keylen      = rax;
    const Address  from_param(rbp, 8+0);
    const Address  to_param  (rbp, 8+4);
    const Address  key_param (rbp, 8+8);

    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_key_shuf_mask = xmm1;
    const XMMRegister xmm_temp1  = xmm2;
    const XMMRegister xmm_temp2  = xmm3;
    const XMMRegister xmm_temp3  = xmm4;
    const XMMRegister xmm_temp4  = xmm5;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ movptr(from, from_param);
    __ movptr(key, key_param);

    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));
    __ movptr(to, to_param);

    // for decryption java expanded key ordering is rotated one position from what we want
    // so we start from 0x10 here and hit 0x00 last
    // we don't know if the key is aligned, hence not using load-execute form
    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);

    __ pxor  (xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);
    __ aesdec(xmm_result, xmm_temp3);
    __ aesdec(xmm_result, xmm_temp4);

    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);

    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);
    __ aesdec(xmm_result, xmm_temp3);
    __ aesdec(xmm_result, xmm_temp4);

    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);

    __ cmpl(keylen, 44);
    __ jccb(Assembler::equal, L_doLast);

    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);

    __ cmpl(keylen, 52);
    __ jccb(Assembler::equal, L_doLast);

    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);

    // for decryption the aesdeclast operation is always on key+0x00
    __ aesdeclast(xmm_result, xmm_temp3);
    __ movdqu(Address(to, 0), xmm_result);  // store the result
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  void handleSOERegisters(bool saving) {
    const int saveFrameSizeInBytes = 4 * wordSize;
    const Address saved_rbx     (rbp, -3 * wordSize);
    const Address saved_rsi     (rbp, -2 * wordSize);
    const Address saved_rdi     (rbp, -1 * wordSize);

    if (saving) {
      __ subptr(rsp, saveFrameSizeInBytes);
      __ movptr(saved_rsi, rsi);
      __ movptr(saved_rdi, rdi);
      __ movptr(saved_rbx, rbx);
    } else {
      // restoring
      __ movptr(rsi, saved_rsi);
      __ movptr(rdi, saved_rdi);
      __ movptr(rbx, saved_rbx);
    }
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   rax       - input length
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
    address start = __ pc();

    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
    const Register from        = rsi;      // source array address
    const Register to          = rdx;      // destination array address
    const Register key         = rcx;      // key array address
    const Register rvec        = rdi;      // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
    const Register pos         = rax;

    // xmm register assignments for the loops below
    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_temp   = xmm1;
    // first 6 keys preloaded into xmm2-xmm7
    const int XMM_REG_NUM_KEY_FIRST = 2;
    const int XMM_REG_NUM_KEY_LAST  = 7;
    const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    handleSOERegisters(true /*saving*/);

    // load registers from incoming parameters
    const Address  from_param(rbp, 8+0);
    const Address  to_param  (rbp, 8+4);
    const Address  key_param (rbp, 8+8);
    const Address  rvec_param (rbp, 8+12);
    const Address  len_param  (rbp, 8+16);
    __ movptr(from , from_param);
    __ movptr(to   , to_param);
    __ movptr(key  , key_param);
    __ movptr(rvec , rvec_param);
    __ movptr(len_reg , len_param);

    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()));
    // load up xmm regs 2 thru 7 with keys 0-5
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }

    __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 44);
    __ jcc(Assembler::notEqual, L_key_192_256);

    // 128 bit code follows here
    __ movl(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_128);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    for (int key_offset = 0x60; key_offset <= 0x90; key_offset += 0x10) {
      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
    }
    load_key(xmm_temp, key, 0xa0);
    __ aesenclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_128);

    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object

    handleSOERegisters(false /*restoring*/);
    __ movptr(rax, len_param); // return length
    __ leave();                                  // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    __ BIND(L_key_192_256);
    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be changed to use more xmm registers)
    __ movl(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_192);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    for (int key_offset = 0x60; key_offset <= 0xb0; key_offset += 0x10) {
      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
    }
    load_key(xmm_temp, key, 0xc0);
    __ aesenclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_192);
    __ jmp(L_exit);

    __ BIND(L_key_256);
    // 256-bit code follows here (could be changed to use more xmm registers)
    __ movl(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_256);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    for (int key_offset = 0x60; key_offset <= 0xd0; key_offset += 0x10) {
      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
    }
    load_key(xmm_temp, key, 0xe0);
    __ aesenclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_256);
    __ jmp(L_exit);

    return start;
  }
2719


  // CBC AES Decryption.
  // In the 32-bit stub the shortage of registers limits us to parallelizing 4 blocks at a time.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   rax       - input length
  //
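  // A sketch of why decryption parallelizes while encryption cannot:
  //   P[i] = D_K(C[i]) ^ C[i-1]
  // Each plaintext block depends only on ciphertext that is already in
  // memory, so four blocks can be pushed through AESDEC in lockstep and
  // each result xored with its predecessor's ciphertext afterwards.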

  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
    address start = __ pc();

    const Register from        = rsi;      // source array address
    const Register to          = rdx;      // destination array address
    const Register key         = rcx;      // key array address
    const Register rvec        = rdi;      // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
    const Register pos         = rax;

    const int PARALLEL_FACTOR = 4;
    const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256

    Label L_exit;
    Label L_singleBlock_loopTop[3]; // 128, 192, 256
    Label L_multiBlock_loopTop[3]; // 128, 192, 256

    const XMMRegister xmm_prev_block_cipher = xmm0; // holds cipher of previous block
    const XMMRegister xmm_key_shuf_mask = xmm1;

    const XMMRegister xmm_key_tmp0 = xmm2;
    const XMMRegister xmm_key_tmp1 = xmm3;

    // registers holding the four results in the parallelized loop
    const XMMRegister xmm_result0 = xmm4;
    const XMMRegister xmm_result1 = xmm5;
    const XMMRegister xmm_result2 = xmm6;
    const XMMRegister xmm_result3 = xmm7;

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    handleSOERegisters(true /*saving*/);

    // load registers from incoming parameters
    const Address  from_param(rbp, 8+0);
    const Address  to_param  (rbp, 8+4);
    const Address  key_param (rbp, 8+8);
    const Address  rvec_param (rbp, 8+12);
    const Address  len_param  (rbp, 8+16);

    __ movptr(from , from_param);
    __ movptr(to   , to_param);
    __ movptr(key  , key_param);
    __ movptr(rvec , rvec_param);
    __ movptr(len_reg , len_param);

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()));
    __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec

    __ xorptr(pos, pos);

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array: 52=192, 60=256)
    // rvec is reused
    __ movl(rvec, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rvec, 52);
    __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
    __ cmpl(rvec, 60);
    __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);

#define DoFour(opc, src_reg)           \
  __ opc(xmm_result0, src_reg);         \
  __ opc(xmm_result1, src_reg);         \
  __ opc(xmm_result2, src_reg);         \
  __ opc(xmm_result3, src_reg);

    for (int k = 0; k < 3; ++k) {
      __ align(OptoLoopAlignment);
      __ BIND(L_multiBlock_loopTop[k]);
      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
      __ jcc(Assembler::less, L_singleBlock_loopTop[k]);

      __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmm_result registers
      __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
      __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
      __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));

      // the java expanded key ordering is rotated one position from what we want
      // so we start from 0x10 here and hit 0x00 last
      load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask);
      DoFour(pxor, xmm_key_tmp0); // xor with first key
      // do the aes dec rounds
      for (int rnum = 1; rnum <= ROUNDS[k];) {
        // load two keys at a time
        // k1->0x20, ..., k9->0xa0, k10->0x00
        load_key(xmm_key_tmp1, key, (rnum + 1) * 0x10, xmm_key_shuf_mask);
        load_key(xmm_key_tmp0, key, ((rnum + 2) % (ROUNDS[k] + 1)) * 0x10, xmm_key_shuf_mask); // hit 0x00 last!
        DoFour(aesdec, xmm_key_tmp1);
        rnum++;
        if (rnum != ROUNDS[k]) {
          DoFour(aesdec, xmm_key_tmp0);
        } else {
          DoFour(aesdeclast, xmm_key_tmp0);
        }
        rnum++;
      }

      // for each result, xor with the r vector of previous cipher block
      __ pxor(xmm_result0, xmm_prev_block_cipher);
      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
      __ pxor(xmm_result1, xmm_prev_block_cipher);
      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
      __ pxor(xmm_result2, xmm_prev_block_cipher);
      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
      __ pxor(xmm_result3, xmm_prev_block_cipher);
      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize)); // this will carry over to next set of blocks

      // store 4 results into the next 64 bytes of output
      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
      __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
      __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
      __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);

      __ addptr(pos, 4 * AESBlockSize);
      __ subptr(len_reg, 4 * AESBlockSize);
      __ jmp(L_multiBlock_loopTop[k]);

      // singleBlock starts here
      __ align(OptoLoopAlignment);
      __ BIND(L_singleBlock_loopTop[k]);
      __ cmpptr(len_reg, 0); // any blocks left?
      __ jcc(Assembler::equal, L_exit);
      __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
      __ movdqa(xmm_result1, xmm_result0);

      load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask);
      __ pxor(xmm_result0, xmm_key_tmp0);
      // do the aes dec rounds
      for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
        // the java expanded key ordering is rotated one position from what we want
        load_key(xmm_key_tmp0, key, (rnum + 1) * 0x10, xmm_key_shuf_mask);
        __ aesdec(xmm_result0, xmm_key_tmp0);
      }
      load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
      __ aesdeclast(xmm_result0, xmm_key_tmp0);
      __ pxor(xmm_result0, xmm_prev_block_cipher); // xor with the current r vector
      __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result0); // store into the next 16 bytes of output
      // no need to store r to memory until we exit
      __ movdqa(xmm_prev_block_cipher, xmm_result1); // set up next r vector with cipher input from this block

      __ addptr(pos, AESBlockSize);
      __ subptr(len_reg, AESBlockSize);
      __ jmp(L_singleBlock_loopTop[k]);
    } // for 128/192/256

    __ BIND(L_exit);
    __ movptr(rvec, rvec_param);                        // restore this since reused earlier
    __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object
    handleSOERegisters(false /*restoring*/);
    __ movptr(rax, len_param);                          // return length
    __ leave();                                         // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  // CTR AES crypt.
  // In the 32-bit stub we parallelize 4 blocks at a time.
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - counter vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   rax       - input length
  //
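  // CTR mode sketch: the cipher only ever runs in the encrypt direction over
  // an incrementing counter block,
  //   out[i] = in[i] ^ E_K(counter + i)
  // so blocks are independent (hence the 4-way loop below) and the same stub
  // serves both encryption and decryption. Keystream bytes left over from a
  // partial block are kept in saved_encCounter and consumed first through the
  // 'used' index.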
  address generate_counterMode_AESCrypt_Parallel() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
    address start = __ pc();
    const Register from        = rsi;      // source array address
    const Register to          = rdx;      // destination array address
    const Register key         = rcx;      // key array address
    const Register counter     = rdi;      // counter byte array initialized from initvector array address
                                           // and updated with the incremented counter in the end
    const Register len_reg     = rbx;
    const Register pos         = rax;

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    handleSOERegisters(true /*saving*/); // save rbx, rsi, rdi

    // load registers from incoming parameters
    const Address  from_param(rbp, 8+0);
    const Address  to_param  (rbp, 8+4);
    const Address  key_param (rbp, 8+8);
    const Address  rvec_param (rbp, 8+12);
    const Address  len_param  (rbp, 8+16);
    const Address  saved_counter_param(rbp, 8 + 20);
    const Address  used_addr_param(rbp, 8 + 24);

    __ movptr(from , from_param);
    __ movptr(to   , to_param);
    __ movptr(len_reg , len_param);

    // Use the partially consumed encrypted counter from the last invocation
    Label L_exit_preLoop, L_preLoop_start;

    // Use the registers 'counter' and 'key' in this preloop
    // to hold the last 2 params 'used' and 'saved_encCounter_start'
    Register used = counter;
    Register saved_encCounter_start = key;
    Register used_addr = saved_encCounter_start;

    __ movptr(used_addr, used_addr_param);
    __ movptr(used, Address(used_addr, 0));
    __ movptr(saved_encCounter_start, saved_counter_param);

    __ BIND(L_preLoop_start);
    __ cmpptr(used, 16);
    __ jcc(Assembler::aboveEqual, L_exit_preLoop);
    __ cmpptr(len_reg, 0);
    __ jcc(Assembler::lessEqual, L_exit_preLoop);
    __ movb(rax, Address(saved_encCounter_start, used));
    __ xorb(rax, Address(from, 0));
    __ movb(Address(to, 0), rax);
    __ addptr(from, 1);
    __ addptr(to, 1);
    __ addptr(used, 1);
    __ subptr(len_reg, 1);

    __ jmp(L_preLoop_start);

    __ BIND(L_exit_preLoop);
    __ movptr(used_addr, used_addr_param);
    __ movl(Address(used_addr, 0), used);

    // load the parameters 'key' and 'counter'
    __ movptr(key, key_param);
    __ movptr(counter, rvec_param);

    // xmm register assignments for the loops below
    const XMMRegister xmm_curr_counter      = xmm0;
    const XMMRegister xmm_counter_shuf_mask = xmm1;  // needs to be reloaded
    const XMMRegister xmm_key_shuf_mask     = xmm2;  // needs to be reloaded
    const XMMRegister xmm_key               = xmm3;
    const XMMRegister xmm_result0           = xmm4;
    const XMMRegister xmm_result1           = xmm5;
    const XMMRegister xmm_result2           = xmm6;
    const XMMRegister xmm_result3           = xmm7;
    const XMMRegister xmm_from0             = xmm1;   // reuse XMM registers
    const XMMRegister xmm_from1             = xmm2;
    const XMMRegister xmm_from2             = xmm3;
    const XMMRegister xmm_from3             = xmm4;

    // for key_128, key_192, key_256
    const int rounds[3] = {10, 12, 14};
    Label L_singleBlockLoopTop[3];
    Label L_multiBlock_loopTop[3];
    Label L_key192_top, L_key256_top;
    Label L_incCounter[3][4]; // 3: different key lengths,  4: 4 blocks at a time
    Label L_incCounter_single[3]; // for single block, key128, key192, key256
    Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
    Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];

    Label L_exit;
    const int PARALLEL_FACTOR = 4;  // limited by the number of available XMM registers

    // initialize counter with initial counter
    __ movdqu(xmm_curr_counter, Address(counter, 0x00));
    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(counter_shuffle_mask_addr()));
    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); // counter is byte-swapped so it can be incremented

    // key length can only be {11, 13, 15} * 4 = {44, 52, 60}
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()));
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 52);
    __ jcc(Assembler::equal, L_key192_top);
    __ cmpl(rax, 60);
    __ jcc(Assembler::equal, L_key256_top);

    // key128 begins here
    __ movptr(pos, 0); // init pos before L_multiBlock_loopTop

#define CTR_DoFour(opc, src_reg)               \
    __ opc(xmm_result0, src_reg);              \
    __ opc(xmm_result1, src_reg);              \
    __ opc(xmm_result2, src_reg);              \
    __ opc(xmm_result3, src_reg);

    // k == 0 :  generate code for key_128
    // k == 1 :  generate code for key_192
    // k == 2 :  generate code for key_256
    for (int k = 0; k < 3; ++k) {
      // multi block starts here
      __ align(OptoLoopAlignment);
      __ BIND(L_multiBlock_loopTop[k]);
      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
      __ jcc(Assembler::less, L_singleBlockLoopTop[k]);

      __ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()));
      __ movdqu(xmm_counter_shuf_mask, ExternalAddress(counter_shuffle_mask_addr()));

      // load, then increment counters
      CTR_DoFour(movdqa, xmm_curr_counter);
      __ push(rbx);
      inc_counter(rbx, xmm_result1, 0x01, L_incCounter[k][0]);
      inc_counter(rbx, xmm_result2, 0x02, L_incCounter[k][1]);
      inc_counter(rbx, xmm_result3, 0x03, L_incCounter[k][2]);
      inc_counter(rbx, xmm_curr_counter, 0x04, L_incCounter[k][3]);
      __ pop(rbx);

      load_key(xmm_key, key, 0x00, xmm_key_shuf_mask); // load Round 0 key. interleaving for better performance

      CTR_DoFour(pshufb, xmm_counter_shuf_mask); // after incrementing, shuffle counters back for PXOR
      CTR_DoFour(pxor, xmm_key);   // PXOR with Round 0 key

      for (int i = 1; i < rounds[k]; ++i) {
        load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
        CTR_DoFour(aesenc, xmm_key);
      }
      load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
      CTR_DoFour(aesenclast, xmm_key);

      // get next PARALLEL_FACTOR blocks into xmm_from registers
      __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
      __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
      __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));

      // PXOR with input text
      __ pxor(xmm_result0, xmm_from0); // result0 is xmm4
      __ pxor(xmm_result1, xmm_from1);
      __ pxor(xmm_result2, xmm_from2);

      // store PARALLEL_FACTOR results into the next 64 bytes of output
      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
      __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
      __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);

      // do this here, after xmm_result0 is saved, because xmm_from3 reuses the same register as xmm_result0.
      __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
      __ pxor(xmm_result3, xmm_from3);
      __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);

      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance the position in the text
      __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
      __ jmp(L_multiBlock_loopTop[k]);

      // singleBlock starts here
      __ align(OptoLoopAlignment);
      __ BIND(L_singleBlockLoopTop[k]);
      __ cmpptr(len_reg, 0);
      __ jcc(Assembler::equal, L_exit);
      __ movdqu(xmm_key_shuf_mask, ExternalAddress(key_shuffle_mask_addr()));
      __ movdqu(xmm_counter_shuf_mask, ExternalAddress(counter_shuffle_mask_addr()));
      __ movdqa(xmm_result0, xmm_curr_counter);
      load_key(xmm_key, key, 0x00, xmm_key_shuf_mask);
      __ push(rbx); // rbx is used for incrementing the counter
      inc_counter(rbx, xmm_curr_counter, 0x01, L_incCounter_single[k]);
      __ pop(rbx);
      __ pshufb(xmm_result0, xmm_counter_shuf_mask);
      __ pxor(xmm_result0, xmm_key);
      for (int i = 1; i < rounds[k]; i++) {
        load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
        __ aesenc(xmm_result0, xmm_key);
      }
      load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
      __ aesenclast(xmm_result0, xmm_key);
      __ cmpptr(len_reg, AESBlockSize);
      __ jcc(Assembler::less, L_processTail_insr[k]);
        __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
        __ pxor(xmm_result0, xmm_from0);
        __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
        __ addptr(pos, AESBlockSize);
        __ subptr(len_reg, AESBlockSize);
        __ jmp(L_singleBlockLoopTop[k]);

      __ BIND(L_processTail_insr[k]);                                               // Process the tail part of the input array
        __ addptr(pos, len_reg);                                                    // 1. Insert bytes from src array into xmm_from0 register
        __ testptr(len_reg, 8);
        __ jcc(Assembler::zero, L_processTail_4_insr[k]);
          __ subptr(pos, 8);
          __ pinsrd(xmm_from0, Address(from, pos), 0);
          __ pinsrd(xmm_from0, Address(from, pos, Address::times_1, 4), 1);
        __ BIND(L_processTail_4_insr[k]);
        __ testptr(len_reg, 4);
        __ jcc(Assembler::zero, L_processTail_2_insr[k]);
          __ subptr(pos, 4);
          __ pslldq(xmm_from0, 4);
          __ pinsrd(xmm_from0, Address(from, pos), 0);
        __ BIND(L_processTail_2_insr[k]);
        __ testptr(len_reg, 2);
        __ jcc(Assembler::zero, L_processTail_1_insr[k]);
          __ subptr(pos, 2);
          __ pslldq(xmm_from0, 2);
          __ pinsrw(xmm_from0, Address(from, pos), 0);
        __ BIND(L_processTail_1_insr[k]);
        __ testptr(len_reg, 1);
        __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
          __ subptr(pos, 1);
          __ pslldq(xmm_from0, 1);
          __ pinsrb(xmm_from0, Address(from, pos), 0);
        __ BIND(L_processTail_exit_insr[k]);
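        // Example: for a 3-byte tail the 8- and 4-byte steps are skipped, the
        // 2-byte step inserts the last two tail bytes, and the 1-byte step
        // shifts them up and inserts the first byte, leaving the tail in the
        // low 3 bytes of xmm_from0 for the pxor below.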

        __ movptr(saved_encCounter_start, saved_counter_param);
        __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);               // 2. Perform pxor of the encrypted counter and plaintext bytes.
        __ pxor(xmm_result0, xmm_from0);                                          //    The encrypted counter is also saved for the next invocation.

        __ testptr(len_reg, 8);
        __ jcc(Assembler::zero, L_processTail_4_extr[k]);                         // 3. Extract bytes from xmm_result0 into the dest. array
          __ pextrd(Address(to, pos), xmm_result0, 0);
          __ pextrd(Address(to, pos, Address::times_1, 4), xmm_result0, 1);
          __ psrldq(xmm_result0, 8);
          __ addptr(pos, 8);
        __ BIND(L_processTail_4_extr[k]);
        __ testptr(len_reg, 4);
        __ jcc(Assembler::zero, L_processTail_2_extr[k]);
          __ pextrd(Address(to, pos), xmm_result0, 0);
          __ psrldq(xmm_result0, 4);
          __ addptr(pos, 4);
        __ BIND(L_processTail_2_extr[k]);
        __ testptr(len_reg, 2);
        __ jcc(Assembler::zero, L_processTail_1_extr[k]);
          __ pextrb(Address(to, pos), xmm_result0, 0);
          __ pextrb(Address(to, pos, Address::times_1, 1), xmm_result0, 1);
          __ psrldq(xmm_result0, 2);
          __ addptr(pos, 2);
        __ BIND(L_processTail_1_extr[k]);
        __ testptr(len_reg, 1);
        __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
          __ pextrb(Address(to, pos), xmm_result0, 0);

        __ BIND(L_processTail_exit_extr[k]);
        __ movptr(used_addr, used_addr_param);
        __ movl(Address(used_addr, 0), len_reg);
        __ jmp(L_exit);
    }

    __ BIND(L_exit);
    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(counter_shuffle_mask_addr()));
    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); // counter is shuffled back to byte order
    __ movdqu(Address(counter, 0), xmm_curr_counter); // save counter back
    handleSOERegisters(false /*restoring*/);
    __ movptr(rax, len_param); // return length
    __ leave();                // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    __ BIND(L_key192_top);
    __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
    __ jmp(L_multiBlock_loopTop[1]); // key192

    __ BIND(L_key256_top);
    __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
    __ jmp(L_multiBlock_loopTop[2]); // key256

    return start;
  }

  // ofs and limit are used for the multi-block byte array.
  // int com.sun.security.provider.MD5.implCompress(byte[] b, int ofs)
  address generate_md5_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    const Register buf_param = rbp;
    const Address state_param(rsp, 0 * wordSize);
    const Address ofs_param  (rsp, 1 * wordSize);
    const Address limit_param(rsp, 2 * wordSize);

    __ enter();
    __ push(rbx);
    __ push(rdi);
    __ push(rsi);
    __ push(rbp);
    __ subptr(rsp, 3 * wordSize);

    __ movptr(rsi, Address(rbp, 8 + 4));
    __ movptr(state_param, rsi);
    if (multi_block) {
      __ movptr(rsi, Address(rbp, 8 + 8));
      __ movptr(ofs_param, rsi);
      __ movptr(rsi, Address(rbp, 8 + 12));
      __ movptr(limit_param, rsi);
    }
    __ movptr(buf_param, Address(rbp, 8 + 0)); // do this last because it overwrites rbp
    __ fast_md5(buf_param, state_param, ofs_param, limit_param, multi_block);

    __ addptr(rsp, 3 * wordSize);
    __ pop(rbp);
    __ pop(rsi);
    __ pop(rdi);
    __ pop(rbx);
    __ leave();
    __ ret(0);
    return start;
  }

  address generate_upper_word_mask() {
    __ align64();
    StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
    address start = __ pc();
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0x00000000, relocInfo::none, 0);
    __ emit_data(0xFFFFFFFF, relocInfo::none, 0);
    return start;
  }

  address generate_shuffle_byte_flip_mask() {
    __ align64();
    StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
    address start = __ pc();
    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
    __ emit_data(0x08090a0b, relocInfo::none, 0);
    __ emit_data(0x04050607, relocInfo::none, 0);
    __ emit_data(0x00010203, relocInfo::none, 0);
    return start;
  }
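  // Used with pshufb, the mask above reverses all 16 bytes of a register
  // (byte 15 moves to position 0 and so on), turning the little-endian
  // message words into the big-endian order SHA-1 expects: e.g. the dword
  // 0x44332211 in the lowest position comes out as 0x11223344 in the highest.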

  // ofs and limit are used for the multi-block byte array.
  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
  address generate_sha1_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = rax;
    Register state = rdx;
    Register ofs   = rcx;
    Register limit = rdi;

    const Address  buf_param(rbp, 8 + 0);
    const Address  state_param(rbp, 8 + 4);
    const Address  ofs_param(rbp, 8 + 8);
    const Address  limit_param(rbp, 8 + 12);

    const XMMRegister abcd = xmm0;
    const XMMRegister e0 = xmm1;
    const XMMRegister e1 = xmm2;
    const XMMRegister msg0 = xmm3;

    const XMMRegister msg1 = xmm4;
    const XMMRegister msg2 = xmm5;
    const XMMRegister msg3 = xmm6;
    const XMMRegister shuf_mask = xmm7;

    __ enter();
    __ subptr(rsp, 8 * wordSize);
    handleSOERegisters(true /*saving*/);

    __ movptr(buf, buf_param);
    __ movptr(state, state_param);
    if (multi_block) {
      __ movptr(ofs, ofs_param);
      __ movptr(limit, limit_param);
    }

    __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
      buf, state, ofs, limit, rsp, multi_block);

    handleSOERegisters(false /*restoring*/);
    __ addptr(rsp, 8 * wordSize);
    __ leave();
    __ ret(0);
    return start;
  }

  address generate_pshuffle_byte_flip_mask() {
    __ align64();
    StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
    address start = __ pc();
    __ emit_data(0x00010203, relocInfo::none, 0);
    __ emit_data(0x04050607, relocInfo::none, 0);
    __ emit_data(0x08090a0b, relocInfo::none, 0);
    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
    return start;
  }
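  // With pshufb this mask reverses the bytes within each 32-bit word
  // (dword i selects source bytes 4i+3..4i), giving SHA-256 its big-endian
  // message words without reordering the words themselves.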

  // ofs and limit are used for the multi-block byte array.
  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
  address generate_sha256_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf = rbx;
    Register state = rsi;
    Register ofs = rdx;
    Register limit = rcx;

    const Address  buf_param(rbp, 8 + 0);
    const Address  state_param(rbp, 8 + 4);
    const Address  ofs_param(rbp, 8 + 8);
    const Address  limit_param(rbp, 8 + 12);

    const XMMRegister msg = xmm0;
    const XMMRegister state0 = xmm1;
    const XMMRegister state1 = xmm2;
    const XMMRegister msgtmp0 = xmm3;

    const XMMRegister msgtmp1 = xmm4;
    const XMMRegister msgtmp2 = xmm5;
    const XMMRegister msgtmp3 = xmm6;
    const XMMRegister msgtmp4 = xmm7;

    __ enter();
    __ subptr(rsp, 8 * wordSize);
    handleSOERegisters(true /*saving*/);
    __ movptr(buf, buf_param);
    __ movptr(state, state_param);
    if (multi_block) {
      __ movptr(ofs, ofs_param);
      __ movptr(limit, limit_param);
    }

    __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
      buf, state, ofs, limit, rsp, multi_block);

    handleSOERegisters(false /*restoring*/);
    __ addptr(rsp, 8 * wordSize);
    __ leave();
    __ ret(0);
    return start;
  }

  // byte swap x86 long
  address ghash_long_swap_mask_addr() {
    return (address)GHASH_LONG_SWAP_MASK;
  }

  // byte swap x86 byte array
  address ghash_byte_swap_mask_addr() {
    return (address)GHASH_BYTE_SWAP_MASK;
  }

  /* Single and multi-block ghash operations */
  address generate_ghash_processBlocks() {
    assert(UseGHASHIntrinsics, "need GHASH intrinsics and CLMUL support");
    __ align(CodeEntryAlignment);
    Label L_ghash_loop, L_exit;
    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    address start = __ pc();

    const Register state        = rdi;
    const Register subkeyH      = rsi;
    const Register data         = rdx;
    const Register blocks       = rcx;

    const Address  state_param(rbp, 8+0);
    const Address  subkeyH_param(rbp, 8+4);
    const Address  data_param(rbp, 8+8);
    const Address  blocks_param(rbp, 8+12);

    const XMMRegister xmm_temp0 = xmm0;
    const XMMRegister xmm_temp1 = xmm1;
    const XMMRegister xmm_temp2 = xmm2;
    const XMMRegister xmm_temp3 = xmm3;
    const XMMRegister xmm_temp4 = xmm4;
    const XMMRegister xmm_temp5 = xmm5;
    const XMMRegister xmm_temp6 = xmm6;
    const XMMRegister xmm_temp7 = xmm7;

    __ enter();
    handleSOERegisters(true);  // Save registers

    __ movptr(state, state_param);
    __ movptr(subkeyH, subkeyH_param);
    __ movptr(data, data_param);
    __ movptr(blocks, blocks_param);

    __ movdqu(xmm_temp0, Address(state, 0));
    __ pshufb(xmm_temp0, ExternalAddress(ghash_long_swap_mask_addr()));

    __ movdqu(xmm_temp1, Address(subkeyH, 0));
    __ pshufb(xmm_temp1, ExternalAddress(ghash_long_swap_mask_addr()));

    __ BIND(L_ghash_loop);
    __ movdqu(xmm_temp2, Address(data, 0));
    __ pshufb(xmm_temp2, ExternalAddress(ghash_byte_swap_mask_addr()));

    __ pxor(xmm_temp0, xmm_temp2);

    //
    // Multiply with the hash key
    //
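    // The four PCLMULQDQ instructions below build the 128x128 -> 255-bit
    // carry-less product from 64-bit halves:
    //   (a1:a0) * (b1:b0) = (a1*b1 << 128) ^ ((a1*b0 ^ a0*b1) << 64) ^ a0*b0
    // xmm3 collects the low and xmm6 the high 128 bits; the middle term is
    // split between them by the pslldq/psrldq pair that follows.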
    __ movdqu(xmm_temp3, xmm_temp0);
    __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
    __ movdqu(xmm_temp4, xmm_temp0);
    __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
    __ movdqu(xmm_temp5, xmm_temp0);
    __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
    __ movdqu(xmm_temp6, xmm_temp0);
    __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1

    __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0

    __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
    __ psrldq(xmm_temp4, 8);    // shift xmm4 right by 64 bits
    __ pslldq(xmm_temp5, 8);    // shift xmm5 left by 64 bits
    __ pxor(xmm_temp3, xmm_temp5);
    __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
                                        // of the carry-less multiplication of
                                        // xmm0 by xmm1.

    // We shift the result of the multiplication by one bit position
    // to the left to compensate for the fact that the bits are reversed.
    __ movdqu(xmm_temp7, xmm_temp3);
    __ movdqu(xmm_temp4, xmm_temp6);
    __ pslld (xmm_temp3, 1);
    __ pslld(xmm_temp6, 1);
    __ psrld(xmm_temp7, 31);
    __ psrld(xmm_temp4, 31);
    __ movdqu(xmm_temp5, xmm_temp7);
    __ pslldq(xmm_temp4, 4);
    __ pslldq(xmm_temp7, 4);
    __ psrldq(xmm_temp5, 12);
    __ por(xmm_temp3, xmm_temp7);
    __ por(xmm_temp6, xmm_temp4);
    __ por(xmm_temp6, xmm_temp5);

    //
    // First phase of the reduction
    //
    // Move xmm3 into xmm4, xmm5, xmm7 in order to perform the shifts
    // independently.
    __ movdqu(xmm_temp7, xmm_temp3);
    __ movdqu(xmm_temp4, xmm_temp3);
    __ movdqu(xmm_temp5, xmm_temp3);
    __ pslld(xmm_temp7, 31);    // packed shift left by 31
    __ pslld(xmm_temp4, 30);    // packed shift left by 30
    __ pslld(xmm_temp5, 25);    // packed shift left by 25
    __ pxor(xmm_temp7, xmm_temp4);      // xor the shifted versions
    __ pxor(xmm_temp7, xmm_temp5);
    __ movdqu(xmm_temp4, xmm_temp7);
    __ pslldq(xmm_temp7, 12);
    __ psrldq(xmm_temp4, 4);
    __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete

    //
    // Second phase of the reduction
    //
    // Make 3 copies of xmm3 in xmm2, xmm5, xmm7 for doing these
    // shift operations.
    __ movdqu(xmm_temp2, xmm_temp3);
    __ movdqu(xmm_temp7, xmm_temp3);
    __ movdqu(xmm_temp5, xmm_temp3);
    __ psrld(xmm_temp2, 1);     // packed shift right by 1
    __ psrld(xmm_temp7, 2);     // packed shift right by 2
    __ psrld(xmm_temp5, 7);     // packed shift right by 7
    __ pxor(xmm_temp2, xmm_temp7);      // xor the shifted versions
    __ pxor(xmm_temp2, xmm_temp5);
    __ pxor(xmm_temp2, xmm_temp4);
    __ pxor(xmm_temp3, xmm_temp2);
    __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6

    __ decrement(blocks);
    __ jcc(Assembler::zero, L_exit);
    __ movdqu(xmm_temp0, xmm_temp6);
    __ addptr(data, 16);
    __ jmp(L_ghash_loop);

    __ BIND(L_exit);
    // Byte swap 16-byte result
    __ pshufb(xmm_temp6, ExternalAddress(ghash_long_swap_mask_addr()));
    __ movdqu(Address(state, 0), xmm_temp6);   // store the result

    handleSOERegisters(false);  // restore registers
    __ leave();
    __ ret(0);
    return start;
  }
3508

3509
  /**
3510
   *  Arguments:
3511
   *
3512
   * Inputs:
3513
   *   rsp(4)   - int crc
3514
   *   rsp(8)   - byte* buf
3515
   *   rsp(12)  - int length
3516
   *
3517
   * Output:
3518
   *       rax   - int crc result
3519
   */
3520
  address generate_updateBytesCRC32() {
3521
    assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
3522

3523
    __ align(CodeEntryAlignment);
3524
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3525

3526
    address start = __ pc();
3527

3528
    const Register crc   = rdx;  // crc
3529
    const Register buf   = rsi;  // source java byte array address
3530
    const Register len   = rcx;  // length
3531
    const Register table = rdi;  // crc_table address (reuse register)
3532
    const Register tmp   = rbx;
3533
    assert_different_registers(crc, buf, len, table, tmp, rax);
3534

3535
    BLOCK_COMMENT("Entry:");
3536
    __ enter(); // required for proper stackwalking of RuntimeStub frame
3537
    __ push(rsi);
3538
    __ push(rdi);
3539
    __ push(rbx);
3540

3541
    Address crc_arg(rbp, 8 + 0);
3542
    Address buf_arg(rbp, 8 + 4);
3543
    Address len_arg(rbp, 8 + 8);
3544

3545
    // Load up:
3546
    __ movl(crc,   crc_arg);
3547
    __ movptr(buf, buf_arg);
3548
    __ movl(len,   len_arg);
3549

3550
    __ kernel_crc32(crc, buf, len, table, tmp);
3551

3552
    __ movl(rax, crc);
3553
    __ pop(rbx);
3554
    __ pop(rdi);
3555
    __ pop(rsi);
3556
    __ vzeroupper();
3557
    __ leave(); // required for proper stackwalking of RuntimeStub frame
3558
    __ ret(0);
3559

3560
    return start;
3561
  }
3562

3563
  /**
3564
  *  Arguments:
3565
  *
3566
  * Inputs:
3567
  *   rsp(4)   - int crc
3568
  *   rsp(8)   - byte* buf
3569
  *   rsp(12)  - int length
3570
  *   rsp(16)  - table_start - optional (present only when doing a library_calll,
3571
  *              not used by x86 algorithm)
3572
  *
3573
  * Output:
3574
  *       rax  - int crc result
3575
  */
3576
  address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
3577
    assert(UseCRC32CIntrinsics, "need SSE4_2");
3578
    __ align(CodeEntryAlignment);
3579
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3580
    address start = __ pc();
3581
    const Register crc = rax;  // crc
3582
    const Register buf = rcx;  // source java byte array address
3583
    const Register len = rdx;  // length
3584
    const Register d = rbx;
3585
    const Register g = rsi;
3586
    const Register h = rdi;
3587
    const Register empty = noreg; // will never be used, in order not
3588
                                  // to change a signature for crc32c_IPL_Alg2_Alt2
3589
                                  // between 64/32 I'm just keeping it here
3590
    assert_different_registers(crc, buf, len, d, g, h);
3591

3592
    BLOCK_COMMENT("Entry:");
3593
    __ enter(); // required for proper stackwalking of RuntimeStub frame
3594
    Address crc_arg(rsp, 4 + 4 + 0); // ESP+4 +
3595
                                     // we need to add additional 4 because __ enter
3596
                                     // have just pushed ebp on a stack
3597
    Address buf_arg(rsp, 4 + 4 + 4);
3598
    Address len_arg(rsp, 4 + 4 + 8);
3599
      // Load up:
3600
      __ movl(crc, crc_arg);
3601
      __ movl(buf, buf_arg);
3602
      __ movl(len, len_arg);
3603
      __ push(d);
3604
      __ push(g);
3605
      __ push(h);
3606
      __ crc32c_ipl_alg2_alt2(crc, buf, len,
3607
                              d, g, h,
3608
                              empty, empty, empty,
3609
                              xmm0, xmm1, xmm2,
3610
                              is_pclmulqdq_supported);
3611
      __ pop(h);
3612
      __ pop(g);
3613
      __ pop(d);
3614
    __ vzeroupper();
3615
    __ leave(); // required for proper stackwalking of RuntimeStub frame
3616
    __ ret(0);
3617

3618
    return start;
3619
  }

  address generate_libmExp() {
    StubCodeMark mark(this, "StubRoutines", "libmExp");

    address start = __ pc();

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ fast_exp(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
                rax, rcx, rdx, rbx);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  address generate_libmLog() {
    StubCodeMark mark(this, "StubRoutines", "libmLog");

    address start = __ pc();

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ fast_log(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
                rax, rcx, rdx, rbx);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  address generate_libmLog10() {
    StubCodeMark mark(this, "StubRoutines", "libmLog10");

    address start = __ pc();

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ fast_log10(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
                  rax, rcx, rdx, rbx);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  address generate_libmPow() {
    StubCodeMark mark(this, "StubRoutines", "libmPow");

    address start = __ pc();

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ fast_pow(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
                rax, rcx, rdx, rbx);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  address generate_libm_reduce_pi04l() {
    StubCodeMark mark(this, "StubRoutines", "libm_reduce_pi04l");

    address start = __ pc();

    BLOCK_COMMENT("Entry:");
    __ libm_reduce_pi04l(rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);

    return start;
  }

  address generate_libm_sin_cos_huge() {
    StubCodeMark mark(this, "StubRoutines", "libm_sin_cos_huge");

    address start = __ pc();

    BLOCK_COMMENT("Entry:");
    __ libm_sincos_huge(xmm0, xmm1, rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);

    return start;
  }

  address generate_libmSin() {
    StubCodeMark mark(this, "StubRoutines", "libmSin");

    address start = __ pc();

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ fast_sin(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
                rax, rbx, rdx);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  address generate_libmCos() {
    StubCodeMark mark(this, "StubRoutines", "libmCos");

    address start = __ pc();

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ fast_cos(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
                rax, rcx, rdx, rbx);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  address generate_libm_tan_cot_huge() {
    StubCodeMark mark(this, "StubRoutines", "libm_tan_cot_huge");

    address start = __ pc();

    BLOCK_COMMENT("Entry:");
    __ libm_tancot_huge(xmm0, xmm1, rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);

    return start;
  }

  address generate_libmTan() {
    StubCodeMark mark(this, "StubRoutines", "libmTan");

    address start = __ pc();

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ fast_tan(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7,
                rax, rcx, rdx, rbx);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }

  address generate_method_entry_barrier() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");

    Label deoptimize_label;

    address start = __ pc();

    __ push(-1); // cookie, this is used for writing the new rsp when deoptimizing

    BLOCK_COMMENT("Entry:");
    __ enter(); // save rbp

    // save rbx, because we want to use that value.
    // We could do without it, but then we would depend on the number of slots used by pusha
    __ push(rbx);

    __ lea(rbx, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for rbx - this should be the return address
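    // Stack at this point, from rsp upwards: saved rbx, saved rbp, the -1
    // cookie, then the return address into the nmethod. So rbx now points at
    // the return-address slot, which the barrier runtime call below takes as
    // its address* argument to identify the nmethod.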

    __ pusha();

    // xmm0 and xmm1 may be used for passing float/double arguments

    if (UseSSE >= 2) {
      const int xmm_size = wordSize * 4;
      __ subptr(rsp, xmm_size * 2);
      __ movdbl(Address(rsp, xmm_size * 1), xmm1);
      __ movdbl(Address(rsp, xmm_size * 0), xmm0);
    } else if (UseSSE >= 1) {
      const int xmm_size = wordSize * 2;
      __ subptr(rsp, xmm_size * 2);
      __ movflt(Address(rsp, xmm_size * 1), xmm1);
      __ movflt(Address(rsp, xmm_size * 0), xmm0);
    }

    __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), rbx);

    if (UseSSE >= 2) {
      const int xmm_size = wordSize * 4;
      __ movdbl(xmm0, Address(rsp, xmm_size * 0));
      __ movdbl(xmm1, Address(rsp, xmm_size * 1));
      __ addptr(rsp, xmm_size * 2);
    } else if (UseSSE >= 1) {
      const int xmm_size = wordSize * 2;
      __ movflt(xmm0, Address(rsp, xmm_size * 0));
      __ movflt(xmm1, Address(rsp, xmm_size * 1));
      __ addptr(rsp, xmm_size * 2);
    }

    __ cmpl(rax, 1); // 1 means deoptimize
    __ jcc(Assembler::equal, deoptimize_label);

    __ popa();
    __ pop(rbx);

    __ leave();

    __ addptr(rsp, 1 * wordSize); // cookie
    __ ret(0);

    __ BIND(deoptimize_label);

    __ popa();
    __ pop(rbx);

    __ leave();

    // This could be taken out, but it is good for verification purposes:
    // getting a SIGSEGV here while still having a correct stack is valuable.
    __ testptr(rsp, Address(rsp, 0));

    __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier
    __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be the caller's verified_entry_point

    return start;
  }

 public:
  // Information about frame layout at time of blocking runtime call.
  // Note that we only have to preserve callee-saved registers since
  // the compilers are responsible for supplying a continuation point
  // if they expect all registers to be preserved.
  enum layout {
    thread_off,    // last_java_sp
    arg1_off,
    arg2_off,
    rbp_off,       // callee saved register
    ret_pc,
    framesize
  };
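  // Resulting C frame built by generate_throw_exception below (low to high):
  //   [thread_off] thread, the first C argument and last_java_sp
  //   [arg1_off]   optional arg1
  //   [arg2_off]   optional arg2
  //   [rbp_off]    saved rbp
  //   [ret_pc]     return address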

 private:

#undef  __
#define __ masm->

  //------------------------------------------------------------------------------------------------------------------------
  // Continuation point for throwing of implicit exceptions that are not handled in
  // the current activation. Fabricates an exception oop and initiates normal
  // exception dispatching in this frame.
  //
  // Previously the compiler (c2) allowed for callee save registers on Java calls.
  // This is no longer true after adapter frames were removed but could possibly
  // be brought back in the future if the interpreter code was reworked and it
  // was deemed worthwhile. The comment below was left to describe what must
  // happen here if callee saves were resurrected. As it stands now this stub
  // could actually be a vanilla BufferBlob and have no oopMap at all.
  // Since it doesn't make much difference we've chosen to leave it the
  // way it was in the callee save days and keep the comment.

  // If we need to preserve callee-saved values we need a callee-saved oop map and
  // therefore have to make these stubs into RuntimeStubs rather than BufferBlobs.
  // If the compiler needs all registers to be preserved between the fault
  // point and the exception handler then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception. All other implicit
  // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
  // either at call sites or otherwise assume that stack unwinding will be initiated,
  // so caller saved registers were assumed volatile in the compiler.
  address generate_throw_exception(const char* name, address runtime_entry,
                                   Register arg1 = noreg, Register arg2 = noreg) {

    int insts_size = 256;
    int locs_size  = 32;

    CodeBuffer code(name, insts_size, locs_size);
    OopMapSet* oop_maps  = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);

    address start = __ pc();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of
    // thread-local storage and also sets up last_Java_sp slightly
    // differently than the real call_VM
    Register java_thread = rbx;
    __ get_thread(java_thread);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // pc and rbp, already pushed
    __ subptr(rsp, (framesize-2) * wordSize); // prolog

    // Frame is now completed as far as size and linkage.

    int frame_complete = __ pc() - start;

    // push java thread (becomes first argument of C function)
    __ movptr(Address(rsp, thread_off * wordSize), java_thread);
    if (arg1 != noreg) {
      __ movptr(Address(rsp, arg1_off * wordSize), arg1);
    }
    if (arg2 != noreg) {
      assert(arg1 != noreg, "missing reg arg");
      __ movptr(Address(rsp, arg2_off * wordSize), arg2);
    }

    // Set up last_Java_sp and last_Java_fp
    __ set_last_Java_frame(java_thread, rsp, rbp, nullptr, noreg);

    // Call runtime
    BLOCK_COMMENT("call runtime_entry");
    __ call(RuntimeAddress(runtime_entry));
    // Generate oop map
    OopMap* map = new OopMap(framesize, 0);
    oop_maps->add_gc_map(__ pc() - start, map);

    // restore the thread (cannot use the pushed argument since arguments
    // may be overwritten by C code generated by an optimizing compiler);
    // however, we can use the register value directly if it is callee saved.
    __ get_thread(java_thread);

    __ reset_last_Java_frame(java_thread, true);

    __ leave(); // required for proper stackwalking of RuntimeStub frame

    // check for pending exceptions
#ifdef ASSERT
    Label L;
    __ cmpptr(Address(java_thread, Thread::pending_exception_offset()), NULL_WORD);
    __ jcc(Assembler::notEqual, L);
    __ should_not_reach_here();
    __ bind(L);
#endif /* ASSERT */
    __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));


    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, framesize, oop_maps, false);
    return stub->entry_point();
  }


  void create_control_words() {
    // Round to nearest, 53-bit mode, exceptions masked
    StubRoutines::x86::_fpu_cntrl_wrd_std   = 0x027F;
    // Round to zero, 53-bit mode, exceptions masked
    StubRoutines::x86::_fpu_cntrl_wrd_trunc = 0x0D7F;
    // Round to nearest, 24-bit mode, exceptions masked
    StubRoutines::x86::_fpu_cntrl_wrd_24    = 0x007F;
    // MXCSR: round to nearest, exceptions masked; status flags pre-set when EnableX86ECoreOpts
    StubRoutines::x86::_mxcsr_std           = EnableX86ECoreOpts ? 0x1FBF : 0x1F80;
    // Note: the following two constants are 80-bit values
    //       layout is critical for correct loading by FPU.
    // Bias for strict fp multiply/divide
    StubRoutines::x86::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
    StubRoutines::x86::_fpu_subnormal_bias1[1]= 0x80000000;
    StubRoutines::x86::_fpu_subnormal_bias1[2]= 0x03ff;
    // Un-Bias for strict fp multiply/divide
    StubRoutines::x86::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
    StubRoutines::x86::_fpu_subnormal_bias2[1]= 0x80000000;
    StubRoutines::x86::_fpu_subnormal_bias2[2]= 0x7bff;
  }
3981

3982
  address generate_cont_thaw() {
3983
    if (!Continuations::enabled()) return nullptr;
3984
    Unimplemented();
3985
    return nullptr;
3986
  }
3987

3988
  address generate_cont_returnBarrier() {
3989
    if (!Continuations::enabled()) return nullptr;
3990
    Unimplemented();
3991
    return nullptr;
3992
  }
3993

3994
  address generate_cont_returnBarrier_exception() {
3995
    if (!Continuations::enabled()) return nullptr;
3996
    Unimplemented();
3997
    return nullptr;
3998
  }
3999

4000
#if INCLUDE_JFR
4001

4002
  static void jfr_prologue(address the_pc, MacroAssembler* masm) {
4003
    Register java_thread = rdi;
4004
    __ get_thread(java_thread);
4005
    __ set_last_Java_frame(java_thread, rsp, rbp, the_pc, noreg);
4006
    __ movptr(Address(rsp, 0), java_thread);
4007
  }
4008

4009
  // The handle is dereferenced through a load barrier.
4010
  static void jfr_epilogue(MacroAssembler* masm) {
4011
    Register java_thread = rdi;
4012
    __ get_thread(java_thread);
4013
    __ reset_last_Java_frame(java_thread, true);
4014
  }
4015

4016
  // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
4017
  // It returns a jobject handle to the event writer.
4018
  // The handle is dereferenced and the return value is the event writer oop.
4019
  static RuntimeStub* generate_jfr_write_checkpoint() {
4020
    enum layout {
4021
      FPUState_off         = 0,
4022
      rbp_off              = FPUStateSizeInWords,
4023
      rdi_off,
4024
      rsi_off,
4025
      rcx_off,
4026
      rbx_off,
4027
      saved_argument_off,
4028
      saved_argument_off2, // 2nd half of double
4029
      framesize
4030
    };
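    // The layout above counts 32-bit stack slots (a double takes two); the
    // RuntimeStub below wants its frame size in machine words. On 32-bit x86
    // LogBytesPerWord == LogBytesPerInt, so the conversion shift is zero.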

    int insts_size = 1024;
    int locs_size = 64;
    CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size);
    OopMapSet* oop_maps = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);
    MacroAssembler* _masm = masm;

    address start = __ pc();
    __ enter();
    int frame_complete = __ pc() - start;
    address the_pc = __ pc();
    jfr_prologue(the_pc, _masm);
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
    jfr_epilogue(_masm);
    __ resolve_global_jobject(rax, rdi, rdx);
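    // resolve_global_jobject converts the jobject handle returned in rax into
    // the event writer oop, applying the GC's load barrier; rdi (holding the
    // current thread) and rdx are used as temporaries.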
    __ leave();
    __ ret(0);

    OopMap* map = new OopMap(framesize, 1); // rbp
    oop_maps->add_gc_map(the_pc - start, map);

    RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
      RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete,
                                    (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                    oop_maps, false);
    return stub;
  }

  // For c2: call to return a leased buffer.
  static RuntimeStub* generate_jfr_return_lease() {
    enum layout {
      FPUState_off = 0,
      rbp_off = FPUStateSizeInWords,
      rdi_off,
      rsi_off,
      rcx_off,
      rbx_off,
      saved_argument_off,
      saved_argument_off2, // 2nd half of double
      framesize
    };

    int insts_size = 1024;
    int locs_size = 64;
    CodeBuffer code("jfr_return_lease", insts_size, locs_size);
    OopMapSet* oop_maps = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);
    MacroAssembler* _masm = masm;

    address start = __ pc();
    __ enter();
    int frame_complete = __ pc() - start;
    address the_pc = __ pc();
    jfr_prologue(the_pc, _masm);
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
    jfr_epilogue(_masm);
    __ leave();
    __ ret(0);

    OopMap* map = new OopMap(framesize, 1); // rbp
    oop_maps->add_gc_map(the_pc - start, map);

    RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
      RuntimeStub::new_runtime_stub("jfr_return_lease", &code, frame_complete,
                                    (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                    oop_maps, false);
    return stub;
  }

#endif // INCLUDE_JFR

  //---------------------------------------------------------------------------
  // Initialization

  void generate_initial_stubs() {
    // Generates all stubs and initializes the entry points

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that exist in all platforms
    // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
    //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
    StubRoutines::_forward_exception_entry      = generate_forward_exception();

    StubRoutines::_call_stub_entry              =
      generate_call_stub(StubRoutines::_call_stub_return_address);
    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry        = generate_catch_exception();

    // platform dependent
    create_control_words();

    // Initialize table for copy memory (arraycopy) check.
    if (UnsafeMemoryAccess::_table == nullptr) {
      UnsafeMemoryAccess::create_table(16 + 4); // 16 for copyMemory; 4 for setMemory
    }

    StubRoutines::x86::_verify_mxcsr_entry         = generate_verify_mxcsr();
    StubRoutines::x86::_verify_fpu_cntrl_wrd_entry = generate_verify_fpu_cntrl_wrd();
    StubRoutines::x86::_d2i_wrapper                = generate_d2i_wrapper(T_INT,  CAST_FROM_FN_PTR(address, SharedRuntime::d2i));
    StubRoutines::x86::_d2l_wrapper                = generate_d2i_wrapper(T_LONG, CAST_FROM_FN_PTR(address, SharedRuntime::d2l));

    // Build these early so they are available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",
                                                                                      CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry  = generate_throw_exception("delayed StackOverflowError throw_exception",
                                                                                      CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError));

    if (UseCRC32Intrinsics) {
      // set the table address before generating the stub that uses it
      StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      bool supports_clmul = VM_Version::supports_clmul();
      StubRoutines::x86::generate_CRC32C_table(supports_clmul);
      StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
    }
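    // The CRC32C table contents depend on whether carry-less multiplication
    // (CLMUL/PCLMULQDQ) is available, so the table is generated to match the
    // stub variant emitted above.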
    if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) {
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
        StubRoutines::_dexp = generate_libmExp();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
        StubRoutines::_dlog = generate_libmLog();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
        StubRoutines::_dlog10 = generate_libmLog10();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
        StubRoutines::_dpow = generate_libmPow();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
          vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
          vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
        StubRoutines::_dlibm_reduce_pi04l = generate_libm_reduce_pi04l();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
          vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
        StubRoutines::_dlibm_sin_cos_huge = generate_libm_sin_cos_huge();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
        StubRoutines::_dsin = generate_libmSin();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
        StubRoutines::_dcos = generate_libmCos();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
        StubRoutines::_dlibm_tan_cot_huge = generate_libm_tan_cot_huge();
        StubRoutines::_dtan = generate_libmTan();
      }
    }
  }

  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw          = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();

    JFR_ONLY(generate_jfr_stubs();)
  }

#if INCLUDE_JFR
  void generate_jfr_stubs() {
    StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();
    StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();
    StubRoutines::_jfr_return_lease_stub = generate_jfr_return_lease();
    StubRoutines::_jfr_return_lease = StubRoutines::_jfr_return_lease_stub->entry_point();
  }
#endif // INCLUDE_JFR

  void generate_final_stubs() {
    // Generates all stubs and initializes the entry points

    // These entry points require SharedInfo::stack0 to be set up in non-core builds
    // and need to be relocatable, so they each fabricate a RuntimeStub internally.
    StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
    StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
    StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));

    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != nullptr) {
      StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
    }
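    // The method entry barrier stub is only needed when the selected GC
    // provides a BarrierSetNMethod; nmethod entry barriers call into this
    // shared stub on their slow path.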
  }

  void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI

    // entry points that are C2/JVMCI specific

    StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF);
    StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x80000000);
    StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask_long_double("vector_double_sign_mask", 0x7FFFFFFF, 0xFFFFFFFF);
    StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask_long_double("vector_double_sign_flip", 0x80000000, 0x00000000);
    StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff);
    StubRoutines::x86::_vector_int_to_byte_mask = generate_vector_mask("vector_int_to_byte_mask", 0x000000ff);
    StubRoutines::x86::_vector_int_to_short_mask = generate_vector_mask("vector_int_to_short_mask", 0x0000ffff);
    StubRoutines::x86::_vector_32_bit_mask = generate_vector_custom_i32("vector_32_bit_mask", Assembler::AVX_512bit,
                                                                        0xFFFFFFFF, 0, 0, 0);
    StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
                                                                        0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
    StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_mask("vector_int_shuffle_mask", 0x03020100);
    StubRoutines::x86::_vector_byte_shuffle_mask = generate_vector_byte_shuffle_mask("vector_byte_shuffle_mask");
    StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_mask("vector_short_shuffle_mask", 0x01000100);
    StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask_long_double("vector_long_shuffle_mask", 0x00000001, 0x0);
    StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
    StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask_long_double("vector_long_sign_mask", 0x80000000, 0x00000000);
    StubRoutines::x86::_vector_all_bits_set = generate_vector_mask("vector_all_bits_set", 0xFFFFFFFF);
    StubRoutines::x86::_vector_int_mask_cmp_bits = generate_vector_mask("vector_int_mask_cmp_bits", 0x00000001);
    StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
    StubRoutines::x86::_vector_count_leading_zeros_lut = generate_count_leading_zeros_lut("count_leading_zeros_lut");
    StubRoutines::x86::_vector_reverse_bit_lut = generate_vector_reverse_bit_lut("reverse_bit_lut");
    StubRoutines::x86::_vector_reverse_byte_perm_mask_long = generate_vector_reverse_byte_perm_mask_long("perm_mask_long");
    StubRoutines::x86::_vector_reverse_byte_perm_mask_int = generate_vector_reverse_byte_perm_mask_int("perm_mask_int");
    StubRoutines::x86::_vector_reverse_byte_perm_mask_short = generate_vector_reverse_byte_perm_mask_short("perm_mask_short");

    if (VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
      // LUT implementation influenced by the counting-1s algorithm from section 5-1 of Hacker's Delight.
      StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");
    }
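
    // Illustrative scalar analogue of the nibble-LUT popcount technique the
    // vector stub applies with byte shuffles (a sketch for exposition only,
    // not VM code):
    //
    //   static const uint8_t NIBBLE_POPCNT[16] =
    //     { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 };
    //   int popcount8(uint8_t b) {
    //     return NIBBLE_POPCNT[b & 0x0F] + NIBBLE_POPCNT[b >> 4];
    //   }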

    // don't bother generating these AES intrinsic stubs unless the global flag is set
    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
    }

    if (UseAESCTRIntrinsics) {
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
      StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
      StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
    }

    // Generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }
#endif // COMPILER2_OR_JVMCI
  }


 public:
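  // Stub generation runs in phases during VM startup; each StubGenerator
  // instance emits the stubs of the single phase selected by 'kind'.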
  StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
    switch(kind) {
    case Initial_stubs:
      generate_initial_stubs();
      break;
    case Continuation_stubs:
      generate_continuation_stubs();
      break;
    case Compiler_stubs:
      generate_compiler_stubs();
      break;
    case Final_stubs:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected stubs kind: %d", kind);
      break;
    }
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
  StubGenerator g(code, kind);
}
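
// A minimal sketch (an illustrative assumption, not code from this file) of how
// VM startup might drive this entry point; the helper name and buffer size are
// hypothetical:
//
//   static void generate_stubs_phase(StubCodeGenerator::StubsKind kind) {
//     BufferBlob* blob = BufferBlob::create("StubRoutines", 128 * K); // size: a guess
//     CodeBuffer buffer(blob);
//     StubGenerator_generate(&buffer, kind);
//   }
//
//   // e.g. generate_stubs_phase(StubCodeGenerator::Initial_stubs);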