2
* Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
3
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5
* This code is free software; you can redistribute it and/or modify it
6
* under the terms of the GNU General Public License version 2 only, as
7
* published by the Free Software Foundation.
9
* This code is distributed in the hope that it will be useful, but WITHOUT
10
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12
* version 2 for more details (a copy is included in the LICENSE file that
13
* accompanied this code).
15
* You should have received a copy of the GNU General Public License version
16
* 2 along with this work; if not, write to the Free Software Foundation,
17
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20
* or visit www.oracle.com if you need additional information or have any
25
#include "precompiled.hpp"
26
#include "asm/assembler.hpp"
27
#include "asm/assembler.inline.hpp"
28
#include "code/compiledIC.hpp"
29
#include "compiler/compiler_globals.hpp"
30
#include "compiler/disassembler.hpp"
32
#include "gc/shared/barrierSet.hpp"
33
#include "gc/shared/barrierSetAssembler.hpp"
34
#include "gc/shared/collectedHeap.inline.hpp"
35
#include "gc/shared/tlab_globals.hpp"
36
#include "interpreter/bytecodeHistogram.hpp"
37
#include "interpreter/interpreter.hpp"
39
#include "memory/resourceArea.hpp"
40
#include "memory/universe.hpp"
41
#include "oops/accessDecorators.hpp"
42
#include "oops/compressedKlass.inline.hpp"
43
#include "oops/compressedOops.inline.hpp"
44
#include "oops/klass.inline.hpp"
45
#include "prims/methodHandles.hpp"
46
#include "runtime/continuation.hpp"
47
#include "runtime/interfaceSupport.inline.hpp"
48
#include "runtime/javaThread.hpp"
49
#include "runtime/jniHandles.hpp"
50
#include "runtime/objectMonitor.hpp"
51
#include "runtime/os.hpp"
52
#include "runtime/safepoint.hpp"
53
#include "runtime/safepointMechanism.hpp"
54
#include "runtime/sharedRuntime.hpp"
55
#include "runtime/stubRoutines.hpp"
56
#include "utilities/checkedCast.hpp"
57
#include "utilities/macros.hpp"
60
#define BLOCK_COMMENT(str) /* nothing */
61
#define STOP(error) stop(error)
63
#define BLOCK_COMMENT(str) block_comment(str)
64
#define STOP(error) block_comment(error); stop(error)
67
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
70
bool AbstractAssembler::pd_check_instruction_mark() { return true; }
73
static const Assembler::Condition reverse[] = {
74
Assembler::noOverflow /* overflow = 0x0 */ ,
75
Assembler::overflow /* noOverflow = 0x1 */ ,
76
Assembler::aboveEqual /* carrySet = 0x2, below = 0x2 */ ,
77
Assembler::below /* aboveEqual = 0x3, carryClear = 0x3 */ ,
78
Assembler::notZero /* zero = 0x4, equal = 0x4 */ ,
79
Assembler::zero /* notZero = 0x5, notEqual = 0x5 */ ,
80
Assembler::above /* belowEqual = 0x6 */ ,
81
Assembler::belowEqual /* above = 0x7 */ ,
82
Assembler::positive /* negative = 0x8 */ ,
83
Assembler::negative /* positive = 0x9 */ ,
84
Assembler::noParity /* parity = 0xa */ ,
85
Assembler::parity /* noParity = 0xb */ ,
86
Assembler::greaterEqual /* less = 0xc */ ,
87
Assembler::less /* greaterEqual = 0xd */ ,
88
Assembler::greater /* lessEqual = 0xe */ ,
89
Assembler::lessEqual /* greater = 0xf, */
94
// Implementation of MacroAssembler
96
// First all the versions that have distinct versions depending on 32/64 bit
97
// Unless the difference is trivial (1 line or so).
103
Address MacroAssembler::as_Address(AddressLiteral adr) {
104
return Address(adr.target(), adr.rspec());
107
Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) {
108
assert(rscratch == noreg, "");
109
return Address::make_array(adr);
112
void MacroAssembler::call_VM_leaf_base(address entry_point,
113
int number_of_arguments) {
114
call(RuntimeAddress(entry_point));
115
increment(rsp, number_of_arguments * wordSize);
118
void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
119
cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
123
void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
124
cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
127
void MacroAssembler::cmpoop(Address src1, jobject obj) {
128
cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
131
void MacroAssembler::cmpoop(Register src1, jobject obj, Register rscratch) {
132
assert(rscratch == noreg, "redundant");
133
cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
136
void MacroAssembler::extend_sign(Register hi, Register lo) {
137
// According to Intel Doc. AP-526, "Integer Divide", p.18.
138
if (VM_Version::is_P6() && hi == rdx && lo == rax) {
146
void MacroAssembler::jC2(Register tmp, Label& L) {
147
// set parity bit if FPU flag C2 is set (via rax)
149
fwait(); fnstsw_ax();
153
jcc(Assembler::parity, L);
156
void MacroAssembler::jnC2(Register tmp, Label& L) {
157
// set parity bit if FPU flag C2 is set (via rax)
159
fwait(); fnstsw_ax();
163
jcc(Assembler::noParity, L);
166
// 32bit can do a case table jump in one instruction but we no longer allow the base
167
// to be installed in the Address class
168
void MacroAssembler::jump(ArrayAddress entry, Register rscratch) {
169
assert(rscratch == noreg, "not needed");
170
jmp(as_Address(entry, noreg));
173
// Note: y_lo will be destroyed
174
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
175
// Long compare for Java (semantics as described in JVM spec.)
176
Label high, low, done;
179
jcc(Assembler::less, low);
180
jcc(Assembler::greater, high);
181
// x_hi is the return register
184
jcc(Assembler::below, low);
185
jcc(Assembler::equal, done);
199
void MacroAssembler::lea(Register dst, AddressLiteral src) {
200
mov_literal32(dst, (int32_t)src.target(), src.rspec());
203
void MacroAssembler::lea(Address dst, AddressLiteral adr, Register rscratch) {
204
assert(rscratch == noreg, "not needed");
206
// leal(dst, as_Address(adr));
207
// see note in movl as to why we must use a move
208
mov_literal32(dst, (int32_t)adr.target(), adr.rspec());
211
void MacroAssembler::leave() {
216
void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
217
// Multiplication of two Java long values stored on the stack
218
// as illustrated below. Result is in rdx:rax.
220
// rsp ---> [ ?? ] \ \
221
// .... | y_rsp_offset |
222
// [ y_lo ] / (in bytes) | x_rsp_offset
223
// [ y_hi ] | (in bytes)
229
// Basic idea: lo(result) = lo(x_lo * y_lo)
230
// hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
231
Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
232
Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
234
// load x_hi, y_hi and check if quick
235
// multiplication is possible
239
orl(rbx, rcx); // rbx, = 0 <=> x_hi = 0 and y_hi = 0
240
jcc(Assembler::zero, quick); // if rbx, = 0 do quick multiply
241
// do full multiplication
243
mull(y_lo); // x_hi * y_lo
244
movl(rbx, rax); // save lo(x_hi * y_lo) in rbx,
247
mull(rcx); // x_lo * y_hi
248
addl(rbx, rax); // add lo(x_lo * y_hi) to rbx,
250
bind(quick); // note: rbx, = 0 if quick multiply!
252
mull(y_lo); // x_lo * y_lo
253
addl(rdx, rbx); // correct hi(x_lo * y_lo)
256
void MacroAssembler::lneg(Register hi, Register lo) {
262
void MacroAssembler::lshl(Register hi, Register lo) {
263
// Java shift left long support (semantics as described in JVM spec., p.305)
264
// (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
265
// shift value is in rcx !
266
assert(hi != rcx, "must not use rcx");
267
assert(lo != rcx, "must not use rcx");
268
const Register s = rcx; // shift count
269
const int n = BitsPerWord;
271
andl(s, 0x3f); // s := s & 0x3f (s < 0x40)
272
cmpl(s, n); // if (s < n)
273
jcc(Assembler::less, L); // else (s >= n)
274
movl(hi, lo); // x := x << n
276
// Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
277
bind(L); // s (mod n) < n
278
shldl(hi, lo); // x := x << s
283
void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
284
// Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
285
// (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
286
assert(hi != rcx, "must not use rcx");
287
assert(lo != rcx, "must not use rcx");
288
const Register s = rcx; // shift count
289
const int n = BitsPerWord;
291
andl(s, 0x3f); // s := s & 0x3f (s < 0x40)
292
cmpl(s, n); // if (s < n)
293
jcc(Assembler::less, L); // else (s >= n)
294
movl(lo, hi); // x := x >> n
295
if (sign_extension) sarl(hi, 31);
297
// Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
298
bind(L); // s (mod n) < n
299
shrdl(lo, hi); // x := x >> s
300
if (sign_extension) sarl(hi);
304
void MacroAssembler::movoop(Register dst, jobject obj) {
305
mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
308
void MacroAssembler::movoop(Address dst, jobject obj, Register rscratch) {
309
assert(rscratch == noreg, "redundant");
310
mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
313
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
314
mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
317
void MacroAssembler::mov_metadata(Address dst, Metadata* obj, Register rscratch) {
318
assert(rscratch == noreg, "redundant");
319
mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
322
void MacroAssembler::movptr(Register dst, AddressLiteral src) {
324
mov_literal32(dst, (intptr_t)src.target(), src.rspec());
326
movl(dst, as_Address(src));
330
void MacroAssembler::movptr(ArrayAddress dst, Register src, Register rscratch) {
331
assert(rscratch == noreg, "redundant");
332
movl(as_Address(dst, noreg), src);
335
void MacroAssembler::movptr(Register dst, ArrayAddress src) {
336
movl(dst, as_Address(src, noreg));
339
void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) {
340
assert(rscratch == noreg, "redundant");
344
void MacroAssembler::pushoop(jobject obj, Register rscratch) {
345
assert(rscratch == noreg, "redundant");
346
push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
349
void MacroAssembler::pushklass(Metadata* obj, Register rscratch) {
350
assert(rscratch == noreg, "redundant");
351
push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
354
void MacroAssembler::pushptr(AddressLiteral src, Register rscratch) {
355
assert(rscratch == noreg, "redundant");
357
push_literal32((int32_t)src.target(), src.rspec());
359
pushl(as_Address(src));
363
static void pass_arg0(MacroAssembler* masm, Register arg) {
367
static void pass_arg1(MacroAssembler* masm, Register arg) {
371
static void pass_arg2(MacroAssembler* masm, Register arg) {
375
static void pass_arg3(MacroAssembler* masm, Register arg) {
380
extern "C" void findpc(intptr_t x);
383
void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
384
// In order to get locks to work, we need to fake a in_VM state
385
JavaThread* thread = JavaThread::current();
386
JavaThreadState saved_state = thread->thread_state();
387
thread->set_thread_state(_thread_in_vm);
388
if (ShowMessageBoxOnError) {
389
JavaThread* thread = JavaThread::current();
390
JavaThreadState saved_state = thread->thread_state();
391
thread->set_thread_state(_thread_in_vm);
392
if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
394
BytecodeCounter::print();
396
// To see where a verify_oop failed, get $ebx+40/X for this frame.
397
// This is the value of eip which points to where verify_oop will return.
398
if (os::message_box(msg, "Execution stopped, print registers?")) {
399
print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
403
fatal("DEBUG MESSAGE: %s", msg);
406
void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
408
DebuggingContext debugging{};
409
tty->print_cr("eip = 0x%08x", eip);
411
if ((WizardMode || Verbose) && PrintMiscellaneous) {
417
#define PRINT_REG(rax) \
418
{ tty->print("%s = ", #rax); os::print_location(tty, rax); }
428
// Print some words near top of staack.
429
int* dump_sp = (int*) rsp;
430
for (int col1 = 0; col1 < 8; col1++) {
431
tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
432
os::print_location(tty, *dump_sp++);
434
for (int row = 0; row < 16; row++) {
435
tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
436
for (int col = 0; col < 8; col++) {
437
tty->print(" 0x%08x", *dump_sp++);
441
// Print some instructions around pc:
442
Disassembler::decode((address)eip-64, (address)eip);
443
tty->print_cr("--------");
444
Disassembler::decode((address)eip, (address)eip+32);
447
void MacroAssembler::stop(const char* msg) {
448
// push address of message
449
ExternalAddress message((address)msg);
450
pushptr(message.addr(), noreg);
451
{ Label L; call(L, relocInfo::none); bind(L); } // push eip
452
pusha(); // push registers
453
call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
457
void MacroAssembler::warn(const char* msg) {
460
// push address of message
461
ExternalAddress message((address)msg);
462
pushptr(message.addr(), noreg);
464
call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
465
addl(rsp, wordSize); // discard argument
469
void MacroAssembler::print_state() {
470
{ Label L; call(L, relocInfo::none); bind(L); } // push eip
471
pusha(); // push registers
474
call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
485
Address MacroAssembler::as_Address(AddressLiteral adr) {
486
// amd64 always does this as a pc-rel
487
// we can be absolute or disp based on the instruction type
488
// jmp/call are displacements others are absolute
489
assert(!adr.is_lval(), "must be rval");
490
assert(reachable(adr), "must be");
491
return Address(checked_cast<int32_t>(adr.target() - pc()), adr.target(), adr.reloc());
495
Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) {
496
AddressLiteral base = adr.base();
498
Address index = adr.index();
499
assert(index._disp == 0, "must not have disp"); // maybe it can?
500
Address array(rscratch, index._index, index._scale, index._disp);
504
void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
508
// Windows always allocates space for it's register args
509
assert(num_args <= 4, "only register arguments supported");
510
subq(rsp, frame::arg_reg_save_area_bytes);
513
// Align stack if necessary
515
jcc(Assembler::zero, L);
518
call(RuntimeAddress(entry_point));
523
call(RuntimeAddress(entry_point));
528
// restore stack pointer
529
addq(rsp, frame::arg_reg_save_area_bytes);
534
void MacroAssembler::cmp64(Register src1, AddressLiteral src2, Register rscratch) {
535
assert(!src2.is_lval(), "should use cmpptr");
536
assert(rscratch != noreg || always_reachable(src2), "missing");
538
if (reachable(src2)) {
539
cmpq(src1, as_Address(src2));
542
Assembler::cmpq(src1, Address(rscratch, 0));
546
int MacroAssembler::corrected_idivq(Register reg) {
547
// Full implementation of Java ldiv and lrem; checks for special
548
// case as described in JVM spec., p.243 & p.271. The function
549
// returns the (pc) offset of the idivl instruction - may be needed
550
// for implicit exceptions.
552
// normal case special case
554
// input : rax: dividend min_long
555
// reg: divisor (may not be eax/edx) -1
557
// output: rax: quotient (= rax idiv reg) min_long
558
// rdx: remainder (= rax irem reg) 0
559
assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
560
static const int64_t min_long = 0x8000000000000000;
561
Label normal_case, special_case;
563
// check for special case
564
cmp64(rax, ExternalAddress((address) &min_long), rdx /*rscratch*/);
565
jcc(Assembler::notEqual, normal_case);
566
xorl(rdx, rdx); // prepare rdx for possible special case (where
569
jcc(Assembler::equal, special_case);
571
// handle normal case
574
int idivq_offset = offset();
577
// normal and special case exit
583
void MacroAssembler::decrementq(Register reg, int value) {
584
if (value == min_jint) { subq(reg, value); return; }
585
if (value < 0) { incrementq(reg, -value); return; }
586
if (value == 0) { ; return; }
587
if (value == 1 && UseIncDec) { decq(reg) ; return; }
588
/* else */ { subq(reg, value) ; return; }
591
void MacroAssembler::decrementq(Address dst, int value) {
592
if (value == min_jint) { subq(dst, value); return; }
593
if (value < 0) { incrementq(dst, -value); return; }
594
if (value == 0) { ; return; }
595
if (value == 1 && UseIncDec) { decq(dst) ; return; }
596
/* else */ { subq(dst, value) ; return; }
599
void MacroAssembler::incrementq(AddressLiteral dst, Register rscratch) {
600
assert(rscratch != noreg || always_reachable(dst), "missing");
602
if (reachable(dst)) {
603
incrementq(as_Address(dst));
606
incrementq(Address(rscratch, 0));
610
void MacroAssembler::incrementq(Register reg, int value) {
611
if (value == min_jint) { addq(reg, value); return; }
612
if (value < 0) { decrementq(reg, -value); return; }
613
if (value == 0) { ; return; }
614
if (value == 1 && UseIncDec) { incq(reg) ; return; }
615
/* else */ { addq(reg, value) ; return; }
618
void MacroAssembler::incrementq(Address dst, int value) {
619
if (value == min_jint) { addq(dst, value); return; }
620
if (value < 0) { decrementq(dst, -value); return; }
621
if (value == 0) { ; return; }
622
if (value == 1 && UseIncDec) { incq(dst) ; return; }
623
/* else */ { addq(dst, value) ; return; }
626
// 32bit can do a case table jump in one instruction but we no longer allow the base
627
// to be installed in the Address class
628
void MacroAssembler::jump(ArrayAddress entry, Register rscratch) {
629
lea(rscratch, entry.base());
630
Address dispatch = entry.index();
631
assert(dispatch._base == noreg, "must be");
632
dispatch._base = rscratch;
636
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
637
ShouldNotReachHere(); // 64bit doesn't use two regs
641
void MacroAssembler::lea(Register dst, AddressLiteral src) {
642
mov_literal64(dst, (intptr_t)src.target(), src.rspec());
645
void MacroAssembler::lea(Address dst, AddressLiteral adr, Register rscratch) {
647
movptr(dst, rscratch);
650
void MacroAssembler::leave() {
651
// %%% is this really better? Why not on 32bit too?
652
emit_int8((unsigned char)0xC9); // LEAVE
655
void MacroAssembler::lneg(Register hi, Register lo) {
656
ShouldNotReachHere(); // 64bit doesn't use two regs
660
void MacroAssembler::movoop(Register dst, jobject obj) {
661
mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
664
void MacroAssembler::movoop(Address dst, jobject obj, Register rscratch) {
665
mov_literal64(rscratch, (intptr_t)obj, oop_Relocation::spec_for_immediate());
669
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
670
mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
673
void MacroAssembler::mov_metadata(Address dst, Metadata* obj, Register rscratch) {
674
mov_literal64(rscratch, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
678
void MacroAssembler::movptr(Register dst, AddressLiteral src) {
680
mov_literal64(dst, (intptr_t)src.target(), src.rspec());
682
if (reachable(src)) {
683
movq(dst, as_Address(src));
686
movq(dst, Address(dst, 0));
691
void MacroAssembler::movptr(ArrayAddress dst, Register src, Register rscratch) {
692
movq(as_Address(dst, rscratch), src);
695
void MacroAssembler::movptr(Register dst, ArrayAddress src) {
696
movq(dst, as_Address(src, dst /*rscratch*/));
699
// src should NEVER be a real pointer. Use AddressLiteral for true pointers
700
void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) {
701
if (is_simm32(src)) {
702
movptr(dst, checked_cast<int32_t>(src));
704
mov64(rscratch, src);
709
void MacroAssembler::pushoop(jobject obj, Register rscratch) {
710
movoop(rscratch, obj);
714
void MacroAssembler::pushklass(Metadata* obj, Register rscratch) {
715
mov_metadata(rscratch, obj);
719
void MacroAssembler::pushptr(AddressLiteral src, Register rscratch) {
724
pushq(Address(rscratch, 0));
728
void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
729
reset_last_Java_frame(r15_thread, clear_fp);
732
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
733
Register last_java_fp,
734
address last_java_pc,
736
set_last_Java_frame(r15_thread, last_java_sp, last_java_fp, last_java_pc, rscratch);
739
static void pass_arg0(MacroAssembler* masm, Register arg) {
740
if (c_rarg0 != arg ) {
741
masm->mov(c_rarg0, arg);
745
static void pass_arg1(MacroAssembler* masm, Register arg) {
746
if (c_rarg1 != arg ) {
747
masm->mov(c_rarg1, arg);
751
static void pass_arg2(MacroAssembler* masm, Register arg) {
752
if (c_rarg2 != arg ) {
753
masm->mov(c_rarg2, arg);
757
static void pass_arg3(MacroAssembler* masm, Register arg) {
758
if (c_rarg3 != arg ) {
759
masm->mov(c_rarg3, arg);
763
void MacroAssembler::stop(const char* msg) {
764
if (ShowMessageBoxOnError) {
766
pusha(); // get regs on stack
767
lea(c_rarg1, InternalAddress(rip));
768
movq(c_rarg2, rsp); // pass pointer to regs array
770
lea(c_rarg0, ExternalAddress((address) msg));
771
andq(rsp, -16); // align stack as required by ABI
772
call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
776
void MacroAssembler::warn(const char* msg) {
779
andq(rsp, -16); // align stack as required by push_CPU_state and call
780
push_CPU_state(); // keeps alignment at 16 bytes
782
lea(c_rarg0, ExternalAddress((address) msg));
783
call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
790
void MacroAssembler::print_state() {
792
pusha(); // get regs on stack
795
andq(rsp, -16); // align stack as required by push_CPU_state and call
796
push_CPU_state(); // keeps alignment at 16 bytes
798
lea(c_rarg0, InternalAddress(rip));
799
lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
800
call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
809
extern "C" void findpc(intptr_t x);
812
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
813
// In order to get locks to work, we need to fake a in_VM state
814
if (ShowMessageBoxOnError) {
815
JavaThread* thread = JavaThread::current();
816
JavaThreadState saved_state = thread->thread_state();
817
thread->set_thread_state(_thread_in_vm);
819
if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
821
BytecodeCounter::print();
824
// To see where a verify_oop failed, get $ebx+40/X for this frame.
825
// XXX correct this offset for amd64
826
// This is the value of eip which points to where verify_oop will return.
827
if (os::message_box(msg, "Execution stopped, print registers?")) {
828
print_state64(pc, regs);
832
fatal("DEBUG MESSAGE: %s", msg);
835
void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
837
DebuggingContext debugging{};
838
tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
844
#define PRINT_REG(rax, value) \
845
{ tty->print("%s = ", #rax); os::print_location(tty, value); }
846
PRINT_REG(rax, regs[15]);
847
PRINT_REG(rbx, regs[12]);
848
PRINT_REG(rcx, regs[14]);
849
PRINT_REG(rdx, regs[13]);
850
PRINT_REG(rdi, regs[8]);
851
PRINT_REG(rsi, regs[9]);
852
PRINT_REG(rbp, regs[10]);
853
// rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
854
PRINT_REG(rsp, (intptr_t)(®s[16]));
855
PRINT_REG(r8 , regs[7]);
856
PRINT_REG(r9 , regs[6]);
857
PRINT_REG(r10, regs[5]);
858
PRINT_REG(r11, regs[4]);
859
PRINT_REG(r12, regs[3]);
860
PRINT_REG(r13, regs[2]);
861
PRINT_REG(r14, regs[1]);
862
PRINT_REG(r15, regs[0]);
864
// Print some words near the top of the stack.
865
int64_t* rsp = ®s[16];
866
int64_t* dump_sp = rsp;
867
for (int col1 = 0; col1 < 8; col1++) {
868
tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
869
os::print_location(tty, *dump_sp++);
871
for (int row = 0; row < 25; row++) {
872
tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
873
for (int col = 0; col < 4; col++) {
874
tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
878
// Print some instructions around pc:
879
Disassembler::decode((address)pc-64, (address)pc);
880
tty->print_cr("--------");
881
Disassembler::decode((address)pc, (address)pc+32);
884
// The java_calling_convention describes stack locations as ideal slots on
885
// a frame with no abi restrictions. Since we must observe abi restrictions
886
// (like the placement of the register window) the slots must be biased by
887
// the following value.
888
static int reg2offset_in(VMReg r) {
889
// Account for saved rbp and return address
890
// This should really be in_preserve_stack_slots
891
return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
894
static int reg2offset_out(VMReg r) {
895
return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
899
void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
901
// The calling conventions assures us that each VMregpair is either
902
// all really one physical register or adjacent stack slots.
904
if (src.is_single_phys_reg() ) {
905
if (dst.is_single_phys_reg()) {
906
if (dst.first() != src.first()) {
907
mov(dst.first()->as_Register(), src.first()->as_Register());
910
assert(dst.is_single_reg(), "not a stack pair: (%s, %s), (%s, %s)",
911
src.first()->name(), src.second()->name(), dst.first()->name(), dst.second()->name());
912
movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
914
} else if (dst.is_single_phys_reg()) {
915
assert(src.is_single_reg(), "not a stack pair");
916
movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
918
assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
919
movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
920
movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
925
void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
927
// The calling conventions assures us that each VMregpair is either
928
// all really one physical register or adjacent stack slots.
930
if (src.is_single_phys_reg() ) {
931
if (dst.is_single_phys_reg()) {
932
// In theory these overlap but the ordering is such that this is likely a nop
933
if ( src.first() != dst.first()) {
934
movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
937
assert(dst.is_single_reg(), "not a stack pair");
938
movdbl(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
940
} else if (dst.is_single_phys_reg()) {
941
assert(src.is_single_reg(), "not a stack pair");
942
movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
944
assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
945
movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
946
movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
951
// A float arg may have to do float reg int reg conversion
952
void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
953
assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
955
// The calling conventions assures us that each VMregpair is either
956
// all really one physical register or adjacent stack slots.
958
if (src.first()->is_stack()) {
959
if (dst.first()->is_stack()) {
960
movl(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
961
movptr(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
964
assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
965
movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
967
} else if (dst.first()->is_stack()) {
969
assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
970
movflt(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
973
// In theory these overlap but the ordering is such that this is likely a nop
974
if ( src.first() != dst.first()) {
975
movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
980
// On 64 bit we will store integer like items to the stack as
981
// 64 bits items (x86_32/64 abi) even though java would only store
982
// 32bits for a parameter. On 32bit it will simply be 32 bits
983
// So this routine will do 32->32 on 32bit and 32->64 on 64bit
984
void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
985
if (src.first()->is_stack()) {
986
if (dst.first()->is_stack()) {
988
movslq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
989
movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
992
movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
994
} else if (dst.first()->is_stack()) {
996
// Do we really have to sign extend???
997
// __ movslq(src.first()->as_Register(), src.first()->as_Register());
998
movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
1000
// Do we really have to sign extend???
1001
// __ movslq(dst.first()->as_Register(), src.first()->as_Register());
1002
if (dst.first() != src.first()) {
1003
movq(dst.first()->as_Register(), src.first()->as_Register());
1008
void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
1009
if (src.first()->is_stack()) {
1010
if (dst.first()->is_stack()) {
1012
movq(rax, Address(rbp, reg2offset_in(src.first())));
1013
movq(Address(rsp, reg2offset_out(dst.first())), rax);
1016
movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1018
} else if (dst.first()->is_stack()) {
1020
movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1022
if (dst.first() != src.first()) {
1023
movq(dst.first()->as_Register(), src.first()->as_Register());
1028
// An oop arg. Must pass a handle not the oop itself
1029
void MacroAssembler::object_move(OopMap* map,
1030
int oop_handle_offset,
1031
int framesize_in_slots,
1035
int* receiver_offset) {
1037
// must pass a handle. First figure out the location we use as a handle
1039
Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
1041
// See if oop is null if it is we need no handle
1043
if (src.first()->is_stack()) {
1045
// Oop is already on the stack as an argument
1046
int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
1047
map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
1049
*receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
1052
cmpptr(Address(rbp, reg2offset_in(src.first())), NULL_WORD);
1053
lea(rHandle, Address(rbp, reg2offset_in(src.first())));
1054
// conditionally move a null
1055
cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
1058
// Oop is in a register we must store it to the space we reserve
1059
// on the stack for oop_handles and pass a handle if oop is non-null
1061
const Register rOop = src.first()->as_Register();
1063
if (rOop == j_rarg0)
1065
else if (rOop == j_rarg1)
1067
else if (rOop == j_rarg2)
1069
else if (rOop == j_rarg3)
1071
else if (rOop == j_rarg4)
1074
assert(rOop == j_rarg5, "wrong register");
1078
oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
1079
int offset = oop_slot*VMRegImpl::stack_slot_size;
1081
map->set_oop(VMRegImpl::stack2reg(oop_slot));
1082
// Store oop in handle area, may be null
1083
movptr(Address(rsp, offset), rOop);
1085
*receiver_offset = offset;
1088
cmpptr(rOop, NULL_WORD);
1089
lea(rHandle, Address(rsp, offset));
1090
// conditionally move a null from the handle area where it was just stored
1091
cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
1094
// If arg is on the stack then place it otherwise it is already in correct reg.
1095
if (dst.first()->is_stack()) {
1096
movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
1102
// Now versions that are common to 32/64 bit
1104
void MacroAssembler::addptr(Register dst, int32_t imm32) {
1105
LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
1108
void MacroAssembler::addptr(Register dst, Register src) {
1109
LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1112
void MacroAssembler::addptr(Address dst, Register src) {
1113
LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1116
void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1117
assert(rscratch != noreg || always_reachable(src), "missing");
1119
if (reachable(src)) {
1120
Assembler::addsd(dst, as_Address(src));
1123
Assembler::addsd(dst, Address(rscratch, 0));
1127
void MacroAssembler::addss(XMMRegister dst, AddressLiteral src, Register rscratch) {
1128
assert(rscratch != noreg || always_reachable(src), "missing");
1130
if (reachable(src)) {
1131
addss(dst, as_Address(src));
1134
addss(dst, Address(rscratch, 0));
1138
void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1139
assert(rscratch != noreg || always_reachable(src), "missing");
1141
if (reachable(src)) {
1142
Assembler::addpd(dst, as_Address(src));
1145
Assembler::addpd(dst, Address(rscratch, 0));
1149
// See 8273459. Function for ensuring 64-byte alignment, intended for stubs only.
1150
// Stub code is generated once and never copied.
1151
// NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
1152
void MacroAssembler::align64() {
1153
align(64, (uint)(uintptr_t)pc());
1156
void MacroAssembler::align32() {
1157
align(32, (uint)(uintptr_t)pc());
1160
void MacroAssembler::align(uint modulus) {
1161
// 8273459: Ensure alignment is possible with current segment alignment
1162
assert(modulus <= (uintx)CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
1163
align(modulus, offset());
1166
void MacroAssembler::align(uint modulus, uint target) {
1167
if (target % modulus != 0) {
1168
nop(modulus - (target % modulus));
1172
void MacroAssembler::push_f(XMMRegister r) {
1173
subptr(rsp, wordSize);
1174
movflt(Address(rsp, 0), r);
1177
void MacroAssembler::pop_f(XMMRegister r) {
1178
movflt(r, Address(rsp, 0));
1179
addptr(rsp, wordSize);
1182
void MacroAssembler::push_d(XMMRegister r) {
1183
subptr(rsp, 2 * wordSize);
1184
movdbl(Address(rsp, 0), r);
1187
void MacroAssembler::pop_d(XMMRegister r) {
1188
movdbl(r, Address(rsp, 0));
1189
addptr(rsp, 2 * Interpreter::stackElementSize);
1192
void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1193
// Used in sign-masking with aligned address.
1194
assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1195
assert(rscratch != noreg || always_reachable(src), "missing");
1197
if (reachable(src)) {
1198
Assembler::andpd(dst, as_Address(src));
1201
Assembler::andpd(dst, Address(rscratch, 0));
1205
void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register rscratch) {
1206
// Used in sign-masking with aligned address.
1207
assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1208
assert(rscratch != noreg || always_reachable(src), "missing");
1210
if (reachable(src)) {
1211
Assembler::andps(dst, as_Address(src));
1214
Assembler::andps(dst, Address(rscratch, 0));
1218
void MacroAssembler::andptr(Register dst, int32_t imm32) {
1219
LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1223
void MacroAssembler::andq(Register dst, AddressLiteral src, Register rscratch) {
1224
assert(rscratch != noreg || always_reachable(src), "missing");
1226
if (reachable(src)) {
1227
andq(dst, as_Address(src));
1230
andq(dst, Address(rscratch, 0));
1235
void MacroAssembler::atomic_incl(Address counter_addr) {
1237
incrementl(counter_addr);
1240
void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register rscratch) {
1241
assert(rscratch != noreg || always_reachable(counter_addr), "missing");
1243
if (reachable(counter_addr)) {
1244
atomic_incl(as_Address(counter_addr));
1246
lea(rscratch, counter_addr);
1247
atomic_incl(Address(rscratch, 0));
1252
void MacroAssembler::atomic_incq(Address counter_addr) {
1254
incrementq(counter_addr);
1257
void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register rscratch) {
1258
assert(rscratch != noreg || always_reachable(counter_addr), "missing");
1260
if (reachable(counter_addr)) {
1261
atomic_incq(as_Address(counter_addr));
1263
lea(rscratch, counter_addr);
1264
atomic_incq(Address(rscratch, 0));
1269
// Writes to stack successive pages until offset reached to check for
1270
// stack overflow + shadow pages. This clobbers tmp.
1271
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1273
// Bang stack for total size given plus shadow page size.
1274
// Bang one page at a time because large size can bang beyond yellow and
1278
movl(Address(tmp, (-(int)os::vm_page_size())), size );
1279
subptr(tmp, (int)os::vm_page_size());
1280
subl(size, (int)os::vm_page_size());
1281
jcc(Assembler::greater, loop);
1283
// Bang down shadow pages too.
1284
// At this point, (tmp-0) is the last address touched, so don't
1285
// touch it again. (It was touched as (tmp-pagesize) but then tmp
1286
// was post-decremented.) Skip this address by starting at i=1, and
1287
// touch a few more pages below. N.B. It is important to touch all
1288
// the way down including all pages in the shadow zone.
1289
for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()); i++) {
1290
// this could be any sized move but this is can be a debugging crumb
1291
// so the bigger the better.
1292
movptr(Address(tmp, (-i*(int)os::vm_page_size())), size );
1296
void MacroAssembler::reserved_stack_check() {
1297
// testing if reserved zone needs to be enabled
1298
Label no_reserved_zone_enabling;
1299
Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1300
NOT_LP64(get_thread(rsi);)
1302
cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1303
jcc(Assembler::below, no_reserved_zone_enabling);
1305
call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1306
jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1307
should_not_reach_here();
1309
bind(no_reserved_zone_enabling);
1312
void MacroAssembler::c2bool(Register x) {
1313
// implements x == 0 ? 0 : 1
1314
// note: must only look at least-significant byte of x
1315
// since C-style booleans are stored in one byte
1318
setb(Assembler::notZero, x);
1321
// Wouldn't need if AddressLiteral version had new name
1322
void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
1323
Assembler::call(L, rtype);
1326
void MacroAssembler::call(Register entry) {
1327
Assembler::call(entry);
1330
void MacroAssembler::call(AddressLiteral entry, Register rscratch) {
1331
assert(rscratch != noreg || always_reachable(entry), "missing");
1333
if (reachable(entry)) {
1334
Assembler::call_literal(entry.target(), entry.rspec());
1336
lea(rscratch, entry);
1337
Assembler::call(rscratch);
1341
void MacroAssembler::ic_call(address entry, jint method_index) {
1342
RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
1344
// Needs full 64-bit immediate for later patching.
1345
mov64(rax, (int64_t)Universe::non_oop_word());
1347
movptr(rax, (intptr_t)Universe::non_oop_word());
1349
call(AddressLiteral(entry, rh));
1352
int MacroAssembler::ic_check_size() {
1353
return LP64_ONLY(14) NOT_LP64(12);
1356
int MacroAssembler::ic_check(int end_alignment) {
1357
Register receiver = LP64_ONLY(j_rarg0) NOT_LP64(rcx);
1358
Register data = rax;
1359
Register temp = LP64_ONLY(rscratch1) NOT_LP64(rbx);
1361
// The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
1362
// before the inline cache check, so we don't have to execute any nop instructions when dispatching
1363
// through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
1364
// before the inline cache check here, and not after
1365
align(end_alignment, offset() + ic_check_size());
1367
int uep_offset = offset();
1369
if (UseCompressedClassPointers) {
1370
movl(temp, Address(receiver, oopDesc::klass_offset_in_bytes()));
1371
cmpl(temp, Address(data, CompiledICData::speculated_klass_offset()));
1373
movptr(temp, Address(receiver, oopDesc::klass_offset_in_bytes()));
1374
cmpptr(temp, Address(data, CompiledICData::speculated_klass_offset()));
1377
// if inline cache check fails, then jump to runtime routine
1378
jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1379
assert((offset() % end_alignment) == 0, "Misaligned verified entry point");
1384
void MacroAssembler::emit_static_call_stub() {
1385
// Static stub relocation also tags the Method* in the code-stream.
1386
mov_metadata(rbx, (Metadata*) nullptr); // Method is zapped till fixup time.
1387
// This is recognized as unresolved by relocs/nativeinst/ic code.
1388
jump(RuntimeAddress(pc()));
1391
// Implementation of call_VM versions
1393
void MacroAssembler::call_VM(Register oop_result,
1394
address entry_point,
1395
bool check_exceptions) {
1397
call(C, relocInfo::none);
1401
call_VM_helper(oop_result, entry_point, 0, check_exceptions);
1407
void MacroAssembler::call_VM(Register oop_result,
1408
address entry_point,
1410
bool check_exceptions) {
1412
call(C, relocInfo::none);
1416
pass_arg1(this, arg_1);
1417
call_VM_helper(oop_result, entry_point, 1, check_exceptions);
1423
void MacroAssembler::call_VM(Register oop_result,
1424
address entry_point,
1427
bool check_exceptions) {
1429
call(C, relocInfo::none);
1434
LP64_ONLY(assert_different_registers(arg_1, c_rarg2));
1436
pass_arg2(this, arg_2);
1437
pass_arg1(this, arg_1);
1438
call_VM_helper(oop_result, entry_point, 2, check_exceptions);
1444
void MacroAssembler::call_VM(Register oop_result,
1445
address entry_point,
1449
bool check_exceptions) {
1451
call(C, relocInfo::none);
1456
LP64_ONLY(assert_different_registers(arg_1, c_rarg2, c_rarg3));
1457
LP64_ONLY(assert_different_registers(arg_2, c_rarg3));
1458
pass_arg3(this, arg_3);
1459
pass_arg2(this, arg_2);
1460
pass_arg1(this, arg_1);
1461
call_VM_helper(oop_result, entry_point, 3, check_exceptions);
1467
void MacroAssembler::call_VM(Register oop_result,
1468
Register last_java_sp,
1469
address entry_point,
1470
int number_of_arguments,
1471
bool check_exceptions) {
1472
Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1473
call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1476
void MacroAssembler::call_VM(Register oop_result,
1477
Register last_java_sp,
1478
address entry_point,
1480
bool check_exceptions) {
1481
pass_arg1(this, arg_1);
1482
call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1485
void MacroAssembler::call_VM(Register oop_result,
1486
Register last_java_sp,
1487
address entry_point,
1490
bool check_exceptions) {
1492
LP64_ONLY(assert_different_registers(arg_1, c_rarg2));
1493
pass_arg2(this, arg_2);
1494
pass_arg1(this, arg_1);
1495
call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1498
void MacroAssembler::call_VM(Register oop_result,
1499
Register last_java_sp,
1500
address entry_point,
1504
bool check_exceptions) {
1505
LP64_ONLY(assert_different_registers(arg_1, c_rarg2, c_rarg3));
1506
LP64_ONLY(assert_different_registers(arg_2, c_rarg3));
1507
pass_arg3(this, arg_3);
1508
pass_arg2(this, arg_2);
1509
pass_arg1(this, arg_1);
1510
call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1513
void MacroAssembler::super_call_VM(Register oop_result,
1514
Register last_java_sp,
1515
address entry_point,
1516
int number_of_arguments,
1517
bool check_exceptions) {
1518
Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1519
MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1522
void MacroAssembler::super_call_VM(Register oop_result,
1523
Register last_java_sp,
1524
address entry_point,
1526
bool check_exceptions) {
1527
pass_arg1(this, arg_1);
1528
super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1531
void MacroAssembler::super_call_VM(Register oop_result,
1532
Register last_java_sp,
1533
address entry_point,
1536
bool check_exceptions) {
1538
LP64_ONLY(assert_different_registers(arg_1, c_rarg2));
1539
pass_arg2(this, arg_2);
1540
pass_arg1(this, arg_1);
1541
super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1544
void MacroAssembler::super_call_VM(Register oop_result,
1545
Register last_java_sp,
1546
address entry_point,
1550
bool check_exceptions) {
1551
LP64_ONLY(assert_different_registers(arg_1, c_rarg2, c_rarg3));
1552
LP64_ONLY(assert_different_registers(arg_2, c_rarg3));
1553
pass_arg3(this, arg_3);
1554
pass_arg2(this, arg_2);
1555
pass_arg1(this, arg_1);
1556
super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1559
void MacroAssembler::call_VM_base(Register oop_result,
1560
Register java_thread,
1561
Register last_java_sp,
1562
address entry_point,
1563
int number_of_arguments,
1564
bool check_exceptions) {
1565
// determine java_thread register
1566
if (!java_thread->is_valid()) {
1568
java_thread = r15_thread;
1571
get_thread(java_thread);
1574
// determine last_java_sp register
1575
if (!last_java_sp->is_valid()) {
1578
// debugging support
1579
assert(number_of_arguments >= 0 , "cannot have negative number of arguments");
1580
LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
1582
// TraceBytecodes does not use r12 but saves it over the call, so don't verify
1583
// r12 is the heapbase.
1584
LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
1587
assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result");
1588
assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
1590
// push java thread (becomes first argument of C function)
1592
NOT_LP64(push(java_thread); number_of_arguments++);
1593
LP64_ONLY(mov(c_rarg0, r15_thread));
1595
// set last Java frame before call
1596
assert(last_java_sp != rbp, "can't use ebp/rbp");
1598
// Only interpreter should have to set fp
1599
set_last_Java_frame(java_thread, last_java_sp, rbp, nullptr, rscratch1);
1601
// do the call, remove parameters
1602
MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
1604
// restore the thread (cannot use the pushed argument since arguments
1605
// may be overwritten by C code generated by an optimizing compiler);
1606
// however can use the register value directly if it is callee saved.
1607
if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
1608
// rdi & rsi (also r15) are callee saved -> nothing to do
1610
guarantee(java_thread != rax, "change this code");
1614
cmpptr(java_thread, rax);
1615
jcc(Assembler::equal, L);
1616
STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
1622
get_thread(java_thread);
1624
// reset last Java frame
1625
// Only interpreter should have to clear fp
1626
reset_last_Java_frame(java_thread, true);
1628
// C++ interp handles this in the interpreter
1629
check_and_handle_popframe(java_thread);
1630
check_and_handle_earlyret(java_thread);
1632
if (check_exceptions) {
1633
// check for pending exceptions (java_thread is set upon return)
1634
cmpptr(Address(java_thread, Thread::pending_exception_offset()), NULL_WORD);
1636
jump_cc(Assembler::notEqual,
1637
RuntimeAddress(StubRoutines::forward_exception_entry()));
1639
// This used to conditionally jump to forward_exception however it is
1640
// possible if we relocate that the branch will not reach. So we must jump
1641
// around so we can always reach
1644
jcc(Assembler::equal, ok);
1645
jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1650
// get oop result if there is one and reset the value in the thread
1651
if (oop_result->is_valid()) {
1652
get_vm_result(oop_result, java_thread);
1656
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
1658
// Calculate the value for last_Java_sp
1659
// somewhat subtle. call_VM does an intermediate call
1660
// which places a return address on the stack just under the
1661
// stack pointer as the user finished with it. This allows
1662
// use to retrieve last_Java_pc from last_Java_sp[-1].
1663
// On 32bit we then have to push additional args on the stack to accomplish
1664
// the actual requested call. On 64bit call_VM only can use register args
1665
// so the only extra space is the return address that call_VM created.
1666
// This hopefully explains the calculations here.
1669
// We've pushed one address, correct last_Java_sp
1670
lea(rax, Address(rsp, wordSize));
1672
lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
1675
call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
1679
// Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
1680
void MacroAssembler::call_VM_leaf0(address entry_point) {
1681
MacroAssembler::call_VM_leaf_base(entry_point, 0);
1684
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1685
call_VM_leaf_base(entry_point, number_of_arguments);
1688
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1689
pass_arg0(this, arg_0);
1690
call_VM_leaf(entry_point, 1);
1693
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1695
LP64_ONLY(assert_different_registers(arg_0, c_rarg1));
1696
pass_arg1(this, arg_1);
1697
pass_arg0(this, arg_0);
1698
call_VM_leaf(entry_point, 2);
1701
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1702
LP64_ONLY(assert_different_registers(arg_0, c_rarg1, c_rarg2));
1703
LP64_ONLY(assert_different_registers(arg_1, c_rarg2));
1704
pass_arg2(this, arg_2);
1705
pass_arg1(this, arg_1);
1706
pass_arg0(this, arg_0);
1707
call_VM_leaf(entry_point, 3);
1710
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1711
LP64_ONLY(assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3));
1712
LP64_ONLY(assert_different_registers(arg_1, c_rarg2, c_rarg3));
1713
LP64_ONLY(assert_different_registers(arg_2, c_rarg3));
1714
pass_arg3(this, arg_3);
1715
pass_arg2(this, arg_2);
1716
pass_arg1(this, arg_1);
1717
pass_arg0(this, arg_0);
1718
call_VM_leaf(entry_point, 3);
1721
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1722
pass_arg0(this, arg_0);
1723
MacroAssembler::call_VM_leaf_base(entry_point, 1);
1726
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1727
LP64_ONLY(assert_different_registers(arg_0, c_rarg1));
1728
pass_arg1(this, arg_1);
1729
pass_arg0(this, arg_0);
1730
MacroAssembler::call_VM_leaf_base(entry_point, 2);
1733
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1734
LP64_ONLY(assert_different_registers(arg_0, c_rarg1, c_rarg2));
1735
LP64_ONLY(assert_different_registers(arg_1, c_rarg2));
1736
pass_arg2(this, arg_2);
1737
pass_arg1(this, arg_1);
1738
pass_arg0(this, arg_0);
1739
MacroAssembler::call_VM_leaf_base(entry_point, 3);
1742
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1743
LP64_ONLY(assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3));
1744
LP64_ONLY(assert_different_registers(arg_1, c_rarg2, c_rarg3));
1745
LP64_ONLY(assert_different_registers(arg_2, c_rarg3));
1746
pass_arg3(this, arg_3);
1747
pass_arg2(this, arg_2);
1748
pass_arg1(this, arg_1);
1749
pass_arg0(this, arg_0);
1750
MacroAssembler::call_VM_leaf_base(entry_point, 4);
1753
void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
1754
movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
1755
movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
1756
verify_oop_msg(oop_result, "broken oop in call_VM_base");
1759
void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
1760
movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
1761
movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
1764
void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
1767
void MacroAssembler::check_and_handle_popframe(Register java_thread) {
1770
void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm, Register rscratch) {
1771
assert(rscratch != noreg || always_reachable(src1), "missing");
1773
if (reachable(src1)) {
1774
cmpl(as_Address(src1), imm);
1776
lea(rscratch, src1);
1777
cmpl(Address(rscratch, 0), imm);
1781
void MacroAssembler::cmp32(Register src1, AddressLiteral src2, Register rscratch) {
1782
assert(!src2.is_lval(), "use cmpptr");
1783
assert(rscratch != noreg || always_reachable(src2), "missing");
1785
if (reachable(src2)) {
1786
cmpl(src1, as_Address(src2));
1788
lea(rscratch, src2);
1789
cmpl(src1, Address(rscratch, 0));
1793
void MacroAssembler::cmp32(Register src1, int32_t imm) {
1794
Assembler::cmpl(src1, imm);
1797
void MacroAssembler::cmp32(Register src1, Address src2) {
1798
Assembler::cmpl(src1, src2);
1801
void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1802
ucomisd(opr1, opr2);
1805
if (unordered_is_less) {
1807
jcc(Assembler::parity, L);
1808
jcc(Assembler::below , L);
1810
jcc(Assembler::equal , L);
1812
} else { // unordered is greater
1814
jcc(Assembler::parity, L);
1815
jcc(Assembler::above , L);
1817
jcc(Assembler::equal , L);
1823
void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1824
ucomiss(opr1, opr2);
1827
if (unordered_is_less) {
1829
jcc(Assembler::parity, L);
1830
jcc(Assembler::below , L);
1832
jcc(Assembler::equal , L);
1834
} else { // unordered is greater
1836
jcc(Assembler::parity, L);
1837
jcc(Assembler::above , L);
1839
jcc(Assembler::equal , L);
1846
void MacroAssembler::cmp8(AddressLiteral src1, int imm, Register rscratch) {
1847
assert(rscratch != noreg || always_reachable(src1), "missing");
1849
if (reachable(src1)) {
1850
cmpb(as_Address(src1), imm);
1852
lea(rscratch, src1);
1853
cmpb(Address(rscratch, 0), imm);
1857
void MacroAssembler::cmpptr(Register src1, AddressLiteral src2, Register rscratch) {
1859
assert(rscratch != noreg || always_reachable(src2), "missing");
1861
if (src2.is_lval()) {
1862
movptr(rscratch, src2);
1863
Assembler::cmpq(src1, rscratch);
1864
} else if (reachable(src2)) {
1865
cmpq(src1, as_Address(src2));
1867
lea(rscratch, src2);
1868
Assembler::cmpq(src1, Address(rscratch, 0));
1871
assert(rscratch == noreg, "not needed");
1872
if (src2.is_lval()) {
1873
cmp_literal32(src1, (int32_t)src2.target(), src2.rspec());
1875
cmpl(src1, as_Address(src2));
1880
void MacroAssembler::cmpptr(Address src1, AddressLiteral src2, Register rscratch) {
1881
assert(src2.is_lval(), "not a mem-mem compare");
1883
// moves src2's literal address
1884
movptr(rscratch, src2);
1885
Assembler::cmpq(src1, rscratch);
1887
assert(rscratch == noreg, "not needed");
1888
cmp_literal32(src1, (int32_t)src2.target(), src2.rspec());
1892
void MacroAssembler::cmpoop(Register src1, Register src2) {
1896
void MacroAssembler::cmpoop(Register src1, Address src2) {
1901
void MacroAssembler::cmpoop(Register src1, jobject src2, Register rscratch) {
1902
movoop(rscratch, src2);
1903
cmpptr(src1, rscratch);
1907
void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch) {
  assert(rscratch != noreg || always_reachable(adr), "missing");

  if (reachable(adr)) {
    lock();
    cmpxchgptr(reg, as_Address(adr));
  } else {
    lea(rscratch, adr);
    lock();
    cmpxchgptr(reg, Address(rscratch, 0));
  }
}

void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
  LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
}

void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::comisd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::comisd(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::comiss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::comiss(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr, Register rscratch) {
  assert(rscratch != noreg || always_reachable(counter_addr), "missing");

  Condition negated_cond = negate_condition(cond);
  Label L;
  jcc(negated_cond, L);
  pushf(); // Preserve flags
  atomic_incl(counter_addr, rscratch);
  popf();
  bind(L);
}

int MacroAssembler::corrected_idivl(Register reg) {
  // Full implementation of Java idiv and irem; checks for
  // special case as described in JVM spec., p.243 & p.271.
  // The function returns the (pc) offset of the idivl
  // instruction - may be needed for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax,: dividend                         min_int
  //         reg: divisor   (may not be rax,/rdx)   -1
  //
  // output: rax,: quotient  (= rax, idiv reg)      min_int
  //         rdx: remainder (= rax, irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
  const int min_int = 0x80000000;
  Label normal_case, special_case;

  // check for special case
  cmpl(rax, min_int);
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
  cmpl(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdql();
  int idivl_offset = offset();
  idivl(reg);

  // normal and special case exit
  bind(special_case);

  return idivl_offset;
}

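// Illustrative sketch (register choice and variable names hypothetical): a generator
// emitting Java idiv/irem can record the returned offset as the implicit-exception
// point for division by zero.
//
//   // rax = dividend, rcx = divisor (must not be rax or rdx)
//   int div_offset = __ corrected_idivl(rcx);
//   // quotient is now in rax, remainder in rdx;
//   // div_offset marks the idivl instruction for the implicit exception table.
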
void MacroAssembler::decrementl(Register reg, int value) {
  if (value == min_jint) {subl(reg, value) ; return; }
  if (value <  0) { incrementl(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decl(reg) ; return; }
  /* else */      { subl(reg, value)       ; return; }
}

void MacroAssembler::decrementl(Address dst, int value) {
  if (value == min_jint) {subl(dst, value) ; return; }
  if (value <  0) { incrementl(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decl(dst) ; return; }
  /* else */      { subl(dst, value)       ; return; }
}

void MacroAssembler::division_with_shift(Register reg, int shift_value) {
  assert(shift_value > 0, "illegal shift value");
  Label _is_positive;
  testl(reg, reg);
  jcc(Assembler::positive, _is_positive);
  int offset = (1 << shift_value) - 1 ;

  if (offset == 1) {
    incrementl(reg);
  } else {
    addl(reg, offset);
  }

  bind(_is_positive);
  sarl(reg, shift_value);
}

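// Worked example (illustrative): dividing by 4 (shift_value == 2) must round toward
// zero, but an arithmetic shift alone rounds toward negative infinity:
//   -7 >> 2        == -2   (wrong for Java division)
//   (-7 + 3) >> 2  == -1   (correct: offset = (1 << 2) - 1 = 3, added only when the
//                           dividend is negative by the code above)
//    7 >> 2        ==  1   (positive dividends need no adjustment)
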
void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::divsd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::divsd(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::divss(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::divss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::divss(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::enter() {
  push(rbp);
  mov(rbp, rsp);
}

void MacroAssembler::post_call_nop() {
  if (!Continuations::enabled()) {
    return;
  }
  InstructionMark im(this);
  relocate(post_call_nop_Relocation::spec());
  InlineSkippedInstructionsCounter skipCounter(this);
  emit_int8((uint8_t)0x0f);
  emit_int8((uint8_t)0x1f);
  emit_int8((uint8_t)0x84);
  emit_int8((uint8_t)0x00);
  emit_int32(0x00);
}

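// Illustrative note: the bytes emitted above form the long NOP encoding
// "0F 1F 84 00 <imm32>" (nopl 0x0(%rax,%rax,1)). The trailing 32-bit immediate makes
// the sequence 8 bytes of side-effect-free space after a call site, which the
// continuations runtime can later patch with its own bookkeeping data.
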
// A 5 byte nop that is safe for patching (see patch_verified_entry)
void MacroAssembler::fat_nop() {
  if (UseAddressNop) {
    addr_nop_5();
  } else {
    emit_int8((uint8_t)0x26); // es:
    emit_int8((uint8_t)0x2e); // cs:
    emit_int8((uint8_t)0x64); // fs:
    emit_int8((uint8_t)0x65); // gs:
    emit_int8((uint8_t)0x90);
  }
}

void MacroAssembler::fcmp(Register tmp) {
  fcmp(tmp, 1, true, true);
}

void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
  assert(!pop_right || pop_left, "usage error");
  if (VM_Version::supports_cmov()) {
    assert(tmp == noreg, "unneeded temp");
    if (pop_left) {
      fucomip(index);
    } else {
      fucomi(index);
    }
    if (pop_right) {
      fpop();
    }
  } else {
    assert(tmp != noreg, "need temp");
    if (pop_left) {
      if (pop_right) {
        fcompp();
      } else {
        fcomp(index);
      }
    } else {
      fcom(index);
    }
    // convert FPU condition into eflags condition via rax,
    save_rax(tmp);
    fwait(); fnstsw_ax();
    sahf();
    restore_rax(tmp);
  }
  // condition codes set as follows:
  //
  // CF (corresponds to C0) if x < y
  // PF (corresponds to C2) if unordered
  // ZF (corresponds to C3) if x = y
}

void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
  fcmp2int(dst, unordered_is_less, 1, true, true);
}

void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
  fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);
    jcc(Assembler::below , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    incrementl(dst);
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);
    jcc(Assembler::above , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    decrementl(dst);
  }
  bind(L);
}

void MacroAssembler::fld_d(AddressLiteral src) {
  fld_d(as_Address(src));
}

void MacroAssembler::fld_s(AddressLiteral src) {
  fld_s(as_Address(src));
}

void MacroAssembler::fldcw(AddressLiteral src) {
  fldcw(as_Address(src));
}

void MacroAssembler::fpop() {
  ffree();
  fincstp();
}

void MacroAssembler::fremr(Register tmp) {
  save_rax(tmp);
  { Label L;
    bind(L);
    fprem();
    fwait(); fnstsw_ax();
    sahf();
    jcc(Assembler::parity, L);
  }
  restore_rax(tmp);
  // Result is in ST0.
  // Note: fxch & fpop to get rid of ST1
  // (otherwise FPU stack could overflow eventually)
  fxch(1);
  fpop();
}

void MacroAssembler::empty_FPU_stack() {
  if (VM_Version::supports_mmx()) {
    emms();
  } else {
    for (int i = 8; i-- > 0; ) ffree(i);
  }
}

void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");
  if (reachable(src)) {
    Assembler::mulpd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::mulpd(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::load_float(Address src) {
2215
void MacroAssembler::store_float(Address dst) {
2227
void MacroAssembler::load_double(Address src) {
2239
void MacroAssembler::store_double(Address dst) {
2251
// dst = c = a * b + c
void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
  Assembler::vfmadd231sd(c, a, b);
  if (dst != c) {
    movdbl(dst, c);
  }
}

// dst = c = a * b + c
void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
  Assembler::vfmadd231ss(c, a, b);
  if (dst != c) {
    movflt(dst, c);
  }
}

// dst = c = a * b + c
void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
  Assembler::vfmadd231pd(c, a, b, vector_len);
  if (dst != c) {
    vmovdqu(dst, c);
  }
}

// dst = c = a * b + c
void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
  Assembler::vfmadd231ps(c, a, b, vector_len);
  if (dst != c) {
    vmovdqu(dst, c);
  }
}

// dst = c = a * b + c
void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
  Assembler::vfmadd231pd(c, a, b, vector_len);
  if (dst != c) {
    vmovdqu(dst, c);
  }
}

// dst = c = a * b + c
void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
  Assembler::vfmadd231ps(c, a, b, vector_len);
  if (dst != c) {
    vmovdqu(dst, c);
  }
}

void MacroAssembler::incrementl(AddressLiteral dst, Register rscratch) {
  assert(rscratch != noreg || always_reachable(dst), "missing");

  if (reachable(dst)) {
    incrementl(as_Address(dst));
  } else {
    lea(rscratch, dst);
    incrementl(Address(rscratch, 0));
  }
}

void MacroAssembler::incrementl(ArrayAddress dst, Register rscratch) {
  incrementl(as_Address(dst, rscratch));
}

void MacroAssembler::incrementl(Register reg, int value) {
  if (value == min_jint) {addl(reg, value) ; return; }
  if (value <  0) { decrementl(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { incl(reg) ; return; }
  /* else */      { addl(reg, value)       ; return; }
}

void MacroAssembler::incrementl(Address dst, int value) {
  if (value == min_jint) {addl(dst, value) ; return; }
  if (value <  0) { decrementl(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { incl(dst) ; return; }
  /* else */      { addl(dst, value)       ; return; }
}

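// Illustrative note on the immediate selection above (register hypothetical):
//   incrementl(rcx, 1)   -> "incl %ecx" when UseIncDec, otherwise "addl $1, %ecx"
//   incrementl(rcx, -5)  -> forwards to decrementl(rcx, 5), i.e. "subl $5, %ecx"
//   incrementl(rcx, 0)   -> emits nothing
// (AT&T syntax in this comment is for illustration only.)
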
void MacroAssembler::jump(AddressLiteral dst, Register rscratch) {
  assert(rscratch != noreg || always_reachable(dst), "missing");

  if (reachable(dst)) {
    jmp_literal(dst.target(), dst.rspec());
  } else {
    lea(rscratch, dst);
    jmp(rscratch);
  }
}

void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst, Register rscratch) {
  assert(rscratch != noreg || always_reachable(dst), "missing");

  if (reachable(dst)) {
    InstructionMark im(this);
    relocate(dst.reloc());
    const int short_size = 2;
    const int long_size = 6;
    int offs = (intptr_t)dst.target() - ((intptr_t)pc());
    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
      emit_int8(0x70 | cc);
      emit_int8((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
      emit_int8(0x0F);
      emit_int8((unsigned char)(0x80 | cc));
      emit_int32(offs - long_size);
    }
  } else {
#ifdef ASSERT
    warning("reversing conditional branch");
#endif /* ASSERT */
    Label skip;
    jccb(reverse[cc], skip);
    lea(rscratch, dst);
    Assembler::jmp(rscratch);
    bind(skip);
  }
}

void MacroAssembler::ldmxcsr(AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::ldmxcsr(as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::ldmxcsr(Address(rscratch, 0));
  }
}

int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    off = offset();
    movsbl(dst, src); // movsxb
  } else {
    off = load_unsigned_byte(dst, src);
    shll(dst, 24);
    sarl(dst, 24);
  }
  return off;
}

// Note: load_signed_short used to be called load_signed_word.
// Although the 'w' in x86 opcodes refers to the term "word" in the assembler
// manual, which means 16 bits, that usage is found nowhere in HotSpot code.
// The term "word" in HotSpot means a 32- or 64-bit machine word.
int MacroAssembler::load_signed_short(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    // This is dubious to me since it seems safe to do a signed 16 => 64 bit
    // version but this is what 64bit has always done. This seems to imply
    // that users are only using 32bits worth.
    off = offset();
    movswl(dst, src); // movsxw
  } else {
    off = load_unsigned_short(dst, src);
    shll(dst, 16);
    sarl(dst, 16);
  }
  return off;
}

int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off;
  if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
    off = offset();
    movzbl(dst, src); // movzxb
  } else {
    xorl(dst, dst);
    off = offset();
    movb(dst, src);
  }
  return off;
}

// Note: load_unsigned_short used to be called load_unsigned_word.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
    off = offset();
    movzwl(dst, src); // movzxw
  } else {
    xorl(dst, dst);
    off = offset();
    movw(dst, src);
  }
  return off;
}

void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case  8:
    assert(dst2 != noreg, "second dest register required");
    movl(dst,  src);
    movl(dst2, src.plus_disp(BytesPerInt));
    break;
#else
  case  8:  movq(dst, src); break;
#endif
  case  4:  movl(dst, src); break;
  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
  default:  ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case  8:
    assert(src2 != noreg, "second source register required");
    movl(dst,                        src);
    movl(dst.plus_disp(BytesPerInt), src2);
    break;
#else
  case  8:  movq(dst, src); break;
#endif
  case  4:  movl(dst, src); break;
  case  2:  movw(dst, src); break;
  case  1:  movb(dst, src); break;
  default:  ShouldNotReachHere();
  }
}

void MacroAssembler::mov32(AddressLiteral dst, Register src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(dst), "missing");

  if (reachable(dst)) {
    movl(as_Address(dst), src);
  } else {
    lea(rscratch, dst);
    movl(Address(rscratch, 0), src);
  }
}

void MacroAssembler::mov32(Register dst, AddressLiteral src) {
  if (reachable(src)) {
    movl(dst, as_Address(src));
  } else {
    lea(dst, src);
    movl(dst, Address(dst, 0));
  }
}

// C++ bool manipulation
2505
void MacroAssembler::movbool(Register dst, Address src) {
2506
if(sizeof(bool) == 1)
2508
else if(sizeof(bool) == 2)
2510
else if(sizeof(bool) == 4)
2514
ShouldNotReachHere();
2517
void MacroAssembler::movbool(Address dst, bool boolconst) {
2518
if(sizeof(bool) == 1)
2519
movb(dst, (int) boolconst);
2520
else if(sizeof(bool) == 2)
2521
movw(dst, (int) boolconst);
2522
else if(sizeof(bool) == 4)
2523
movl(dst, (int) boolconst);
2526
ShouldNotReachHere();
2529
void MacroAssembler::movbool(Address dst, Register src) {
2530
if(sizeof(bool) == 1)
2532
else if(sizeof(bool) == 2)
2534
else if(sizeof(bool) == 4)
2538
ShouldNotReachHere();
2541
void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src, Register rscratch) {
2542
assert(rscratch != noreg || always_reachable(src), "missing");
2544
if (reachable(src)) {
2545
movdl(dst, as_Address(src));
2548
movdl(dst, Address(rscratch, 0));
2552
void MacroAssembler::movq(XMMRegister dst, AddressLiteral src, Register rscratch) {
2553
assert(rscratch != noreg || always_reachable(src), "missing");
2555
if (reachable(src)) {
2556
movq(dst, as_Address(src));
2559
movq(dst, Address(rscratch, 0));
2563
void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src, Register rscratch) {
2564
assert(rscratch != noreg || always_reachable(src), "missing");
2566
if (reachable(src)) {
2567
if (UseXmmLoadAndClearUpper) {
2568
movsd (dst, as_Address(src));
2570
movlpd(dst, as_Address(src));
2574
if (UseXmmLoadAndClearUpper) {
2575
movsd (dst, Address(rscratch, 0));
2577
movlpd(dst, Address(rscratch, 0));
2582
void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src, Register rscratch) {
2583
assert(rscratch != noreg || always_reachable(src), "missing");
2585
if (reachable(src)) {
2586
movss(dst, as_Address(src));
2589
movss(dst, Address(rscratch, 0));
2593
void MacroAssembler::movptr(Register dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}

void MacroAssembler::movptr(Register dst, Address src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Register dst, intptr_t src) {
#ifdef _LP64
  if (is_uimm32(src)) {
    movl(dst, checked_cast<uint32_t>(src));
  } else if (is_simm32(src)) {
    movq(dst, checked_cast<int32_t>(src));
  } else {
    mov64(dst, src);
  }
#else
  movl(dst, src);
#endif
}

void MacroAssembler::movptr(Address dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}

void MacroAssembler::movptr(Address dst, int32_t src) {
  LP64_ONLY(movslq(dst, src)) NOT_LP64(movl(dst, src));
}

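// Illustrative note on the immediate forms chosen above (64-bit build, values and
// register hypothetical):
//   movptr(rax, 0x7f)            -> movl  (immediate fits unsigned 32 bits, shortest encoding)
//   movptr(rax, -1)              -> movq with a sign-extended 32-bit immediate
//   movptr(rax, 0x123456789abL)  -> mov64 with a full 8-byte immediate
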
void MacroAssembler::movdqu(Address dst, XMMRegister src) {
2625
assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2626
Assembler::movdqu(dst, src);
2629
void MacroAssembler::movdqu(XMMRegister dst, Address src) {
2630
assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2631
Assembler::movdqu(dst, src);
2634
void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
2635
assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2636
Assembler::movdqu(dst, src);
2639
void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register rscratch) {
2640
assert(rscratch != noreg || always_reachable(src), "missing");
2642
if (reachable(src)) {
2643
movdqu(dst, as_Address(src));
2646
movdqu(dst, Address(rscratch, 0));
2650
void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
2651
assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2652
Assembler::vmovdqu(dst, src);
2655
void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
2656
assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2657
Assembler::vmovdqu(dst, src);
2660
void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
2661
assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2662
Assembler::vmovdqu(dst, src);
2665
void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register rscratch) {
2666
assert(rscratch != noreg || always_reachable(src), "missing");
2668
if (reachable(src)) {
2669
vmovdqu(dst, as_Address(src));
2673
vmovdqu(dst, Address(rscratch, 0));
2677
void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (vector_len == AVX_512bit) {
    evmovdquq(dst, src, AVX_512bit, rscratch);
  } else if (vector_len == AVX_256bit) {
    vmovdqu(dst, src, rscratch);
  } else {
    movdqu(dst, src, rscratch);
  }
}

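// Illustrative note (operands hypothetical): the vector_len overload above simply
// forwards to the move of matching width, e.g.
//   vmovdqu(xmm1, lit, Assembler::AVX_128bit, r10)  ->  movdqu(xmm1, lit, r10)
//   vmovdqu(xmm1, lit, Assembler::AVX_256bit, r10)  ->  vmovdqu(xmm1, lit, r10)
//   vmovdqu(xmm1, lit, Assembler::AVX_512bit, r10)  ->  evmovdquq(xmm1, lit, AVX_512bit, r10)
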
void MacroAssembler::kmov(KRegister dst, Address src) {
2690
if (VM_Version::supports_avx512bw()) {
2693
assert(VM_Version::supports_evex(), "");
2698
void MacroAssembler::kmov(Address dst, KRegister src) {
2699
if (VM_Version::supports_avx512bw()) {
2702
assert(VM_Version::supports_evex(), "");
2707
void MacroAssembler::kmov(KRegister dst, KRegister src) {
2708
if (VM_Version::supports_avx512bw()) {
2711
assert(VM_Version::supports_evex(), "");
2716
void MacroAssembler::kmov(Register dst, KRegister src) {
2717
if (VM_Version::supports_avx512bw()) {
2720
assert(VM_Version::supports_evex(), "");
2725
void MacroAssembler::kmov(KRegister dst, Register src) {
2726
if (VM_Version::supports_avx512bw()) {
2729
assert(VM_Version::supports_evex(), "");
2734
void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register rscratch) {
2735
assert(rscratch != noreg || always_reachable(src), "missing");
2737
if (reachable(src)) {
2738
kmovql(dst, as_Address(src));
2741
kmovql(dst, Address(rscratch, 0));
2745
void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register rscratch) {
2746
assert(rscratch != noreg || always_reachable(src), "missing");
2748
if (reachable(src)) {
2749
kmovwl(dst, as_Address(src));
2752
kmovwl(dst, Address(rscratch, 0));
2756
void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2757
int vector_len, Register rscratch) {
2758
assert(rscratch != noreg || always_reachable(src), "missing");
2760
if (reachable(src)) {
2761
Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
2764
Assembler::evmovdqub(dst, mask, Address(rscratch, 0), merge, vector_len);
2768
void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2769
int vector_len, Register rscratch) {
2770
assert(rscratch != noreg || always_reachable(src), "missing");
2772
if (reachable(src)) {
2773
Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
2776
Assembler::evmovdquw(dst, mask, Address(rscratch, 0), merge, vector_len);
2780
void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
2781
assert(rscratch != noreg || always_reachable(src), "missing");
2783
if (reachable(src)) {
2784
Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
2787
Assembler::evmovdqul(dst, mask, Address(rscratch, 0), merge, vector_len);
2791
void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
2792
assert(rscratch != noreg || always_reachable(src), "missing");
2794
if (reachable(src)) {
2795
Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
2798
Assembler::evmovdquq(dst, mask, Address(rscratch, 0), merge, vector_len);
2802
void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2803
assert(rscratch != noreg || always_reachable(src), "missing");
2805
if (reachable(src)) {
2806
Assembler::evmovdquq(dst, as_Address(src), vector_len);
2809
Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
2813
void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src, Register rscratch) {
2814
assert(rscratch != noreg || always_reachable(src), "missing");
2816
if (reachable(src)) {
2817
Assembler::movdqa(dst, as_Address(src));
2820
Assembler::movdqa(dst, Address(rscratch, 0));
2824
void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2825
assert(rscratch != noreg || always_reachable(src), "missing");
2827
if (reachable(src)) {
2828
Assembler::movsd(dst, as_Address(src));
2831
Assembler::movsd(dst, Address(rscratch, 0));
2835
void MacroAssembler::movss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2836
assert(rscratch != noreg || always_reachable(src), "missing");
2838
if (reachable(src)) {
2839
Assembler::movss(dst, as_Address(src));
2842
Assembler::movss(dst, Address(rscratch, 0));
2846
void MacroAssembler::movddup(XMMRegister dst, AddressLiteral src, Register rscratch) {
2847
assert(rscratch != noreg || always_reachable(src), "missing");
2849
if (reachable(src)) {
2850
Assembler::movddup(dst, as_Address(src));
2853
Assembler::movddup(dst, Address(rscratch, 0));
2857
void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2858
assert(rscratch != noreg || always_reachable(src), "missing");
2860
if (reachable(src)) {
2861
Assembler::vmovddup(dst, as_Address(src), vector_len);
2864
Assembler::vmovddup(dst, Address(rscratch, 0), vector_len);
2868
void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2869
assert(rscratch != noreg || always_reachable(src), "missing");
2871
if (reachable(src)) {
2872
Assembler::mulsd(dst, as_Address(src));
2875
Assembler::mulsd(dst, Address(rscratch, 0));
2879
void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2880
assert(rscratch != noreg || always_reachable(src), "missing");
2882
if (reachable(src)) {
2883
Assembler::mulss(dst, as_Address(src));
2886
Assembler::mulss(dst, Address(rscratch, 0));
2890
void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS null exception if reg is null by
    // accessing M[reg] w/o changing any (non-CC) registers
    // NOTE: cmpl is plenty here to provoke a segv
    cmpptr(rax, Address(reg, 0));
    // Note: should probably use testl(rax, Address(reg, 0));
    //       may be shorter code (however, this version of
    //       testl needs to be implemented first)
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS null exception if reg is null
  }
}

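// Illustrative sketch (register and offsets hypothetical): whether any code is emitted
// depends on needs_explicit_null_check(offset).
//   __ null_check(robj);                 // offset -1: always emits the explicit probe above
//   __ null_check(robj, small_offset);   // no code; the later load at small_offset traps
//                                        // if robj is null
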
void MacroAssembler::os_breakpoint() {
2906
// instead of directly emitting a breakpoint, call os:breakpoint for better debugability
2907
// (e.g., MSVC can't call ps() otherwise)
2908
call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
2911
void MacroAssembler::unimplemented(const char* what) {
2912
const char* buf = nullptr;
2916
ss.print("unimplemented: %s", what);
2917
buf = code_string(ss.as_string());
2923
#define XSTATE_BV 0x200
2926
void MacroAssembler::pop_CPU_state() {
2931
void MacroAssembler::pop_FPU_state() {
2933
frstor(Address(rsp, 0));
2935
fxrstor(Address(rsp, 0));
2937
addptr(rsp, FPUStateSizeInWords * wordSize);
2940
void MacroAssembler::pop_IU_state() {
2942
LP64_ONLY(addq(rsp, 8));
2946
// Save Integer and Float state
2947
// Warning: Stack must be 16 byte aligned (64bit)
2948
void MacroAssembler::push_CPU_state() {
2953
void MacroAssembler::push_FPU_state() {
2954
subptr(rsp, FPUStateSizeInWords * wordSize);
2956
fnsave(Address(rsp, 0));
2959
fxsave(Address(rsp, 0));
2963
void MacroAssembler::push_IU_state() {
2964
// Push flags first because pusha kills them
2966
// Make sure rsp stays 16-byte aligned
2967
LP64_ONLY(subq(rsp, 8));
2971
void MacroAssembler::push_cont_fastpath() {
2972
if (!Continuations::enabled()) return;
2975
Register rthread = rax;
2976
Register rrealsp = rbx;
2980
get_thread(rthread);
2982
// The code below wants the original RSP.
2983
// Move it back after the pushes above.
2984
movptr(rrealsp, rsp);
2985
addptr(rrealsp, 2*wordSize);
2987
Register rthread = r15_thread;
2988
Register rrealsp = rsp;
2992
cmpptr(rrealsp, Address(rthread, JavaThread::cont_fastpath_offset()));
2993
jccb(Assembler::belowEqual, done);
2994
movptr(Address(rthread, JavaThread::cont_fastpath_offset()), rrealsp);
3003
void MacroAssembler::pop_cont_fastpath() {
3004
if (!Continuations::enabled()) return;
3007
Register rthread = rax;
3008
Register rrealsp = rbx;
3012
get_thread(rthread);
3014
// The code below wants the original RSP.
3015
// Move it back after the pushes above.
3016
movptr(rrealsp, rsp);
3017
addptr(rrealsp, 2*wordSize);
3019
Register rthread = r15_thread;
3020
Register rrealsp = rsp;
3024
cmpptr(rrealsp, Address(rthread, JavaThread::cont_fastpath_offset()));
3025
jccb(Assembler::below, done);
3026
movptr(Address(rthread, JavaThread::cont_fastpath_offset()), 0);
3035
void MacroAssembler::inc_held_monitor_count() {
3037
Register thread = rax;
3040
incrementl(Address(thread, JavaThread::held_monitor_count_offset()));
3043
incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
3047
void MacroAssembler::dec_held_monitor_count() {
3049
Register thread = rax;
3052
decrementl(Address(thread, JavaThread::held_monitor_count_offset()));
3055
decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
3060
void MacroAssembler::stop_if_in_cont(Register cont, const char* name) {
3063
movptr(cont, Address(r15_thread, JavaThread::cont_entry_offset()));
3065
jcc(Assembler::zero, no_cont);
3074
void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) { // determine java_thread register
3075
if (!java_thread->is_valid()) {
3077
get_thread(java_thread);
3079
// we must set sp to zero to clear frame
3080
movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3081
// must clear fp, so that compiled frames are not confused; it is
3082
// possible that we need it only for debugging
3084
movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3086
// Always clear the pc because it could have been set by make_walkable()
3087
movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3091
void MacroAssembler::restore_rax(Register tmp) {
3092
if (tmp == noreg) pop(rax);
3093
else if (tmp != rax) mov(rax, tmp);
3096
void MacroAssembler::round_to(Register reg, int modulus) {
3097
addptr(reg, modulus - 1);
3098
andptr(reg, -modulus);
3101
void MacroAssembler::save_rax(Register tmp) {
3102
if (tmp == noreg) push(rax);
3103
else if (tmp != rax) mov(tmp, rax);
3106
void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod) {
  if (at_return) {
    // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
    // we may safely use rsp instead to perform the stack watermark check.
    cmpptr(in_nmethod ? rsp : rbp, Address(thread_reg, JavaThread::polling_word_offset()));
    jcc(Assembler::above, slow_path);
    return;
  }
  testb(Address(thread_reg, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
  jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
}

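// Illustrative sketch (label name hypothetical): a compiled-method return sequence
// might poll like this, branching to its safepoint handling code when a safepoint or
// handshake is pending.
//
//   Label L_safepoint;
//   __ safepoint_poll(L_safepoint, r15_thread, true /* at_return */, true /* in_nmethod */);
//   __ ret(0);
//   __ bind(L_safepoint);
//   // ... jump to the poll-return handler ...
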
// When entering C land, the rbp, & rsp of the last Java frame have to be recorded
3121
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3122
// has to be reset to 0. This is required to allow proper stack traversal.
3123
void MacroAssembler::set_last_Java_frame(Register java_thread,
3124
Register last_java_sp,
3125
Register last_java_fp,
3126
address last_java_pc,
3127
Register rscratch) {
3129
// determine java_thread register
3130
if (!java_thread->is_valid()) {
3132
get_thread(java_thread);
3134
// determine last_java_sp register
3135
if (!last_java_sp->is_valid()) {
3138
// last_java_fp is optional
3139
if (last_java_fp->is_valid()) {
3140
movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3142
// last_java_pc is optional
3143
if (last_java_pc != nullptr) {
3144
Address java_pc(java_thread,
3145
JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
3146
lea(java_pc, InternalAddress(last_java_pc), rscratch);
3148
movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3151
void MacroAssembler::shlptr(Register dst, int imm8) {
3152
LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3155
void MacroAssembler::shrptr(Register dst, int imm8) {
3156
LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3159
void MacroAssembler::sign_extend_byte(Register reg) {
3160
if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3161
movsbl(reg, reg); // movsxb
3168
void MacroAssembler::sign_extend_short(Register reg) {
3169
if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3170
movswl(reg, reg); // movsxw
3177
void MacroAssembler::testl(Address dst, int32_t imm32) {
3178
if (imm32 >= 0 && is8bit(imm32)) {
3181
Assembler::testl(dst, imm32);
3185
void MacroAssembler::testl(Register dst, int32_t imm32) {
3186
if (imm32 >= 0 && is8bit(imm32) && dst->has_byte_register()) {
3189
Assembler::testl(dst, imm32);
3193
void MacroAssembler::testl(Register dst, AddressLiteral src) {
3194
assert(always_reachable(src), "Address should be reachable");
3195
testl(dst, as_Address(src));
3200
void MacroAssembler::testq(Address dst, int32_t imm32) {
3204
Assembler::testq(dst, imm32);
3208
void MacroAssembler::testq(Register dst, int32_t imm32) {
3212
Assembler::testq(dst, imm32);
3218
void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3219
assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3220
Assembler::pcmpeqb(dst, src);
3223
void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3224
assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3225
Assembler::pcmpeqw(dst, src);
3228
void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3229
assert((dst->encoding() < 16),"XMM register should be 0-15");
3230
Assembler::pcmpestri(dst, src, imm8);
3233
void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3234
assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3235
Assembler::pcmpestri(dst, src, imm8);
3238
void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3239
assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3240
Assembler::pmovzxbw(dst, src);
3243
void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3244
assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3245
Assembler::pmovzxbw(dst, src);
3248
void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3249
assert((src->encoding() < 16),"XMM register should be 0-15");
3250
Assembler::pmovmskb(dst, src);
3253
void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3254
assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3255
Assembler::ptest(dst, src);
3258
void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src, Register rscratch) {
3259
assert(rscratch != noreg || always_reachable(src), "missing");
3261
if (reachable(src)) {
3262
Assembler::sqrtss(dst, as_Address(src));
3265
Assembler::sqrtss(dst, Address(rscratch, 0));
3269
void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
3270
assert(rscratch != noreg || always_reachable(src), "missing");
3272
if (reachable(src)) {
3273
Assembler::subsd(dst, as_Address(src));
3276
Assembler::subsd(dst, Address(rscratch, 0));
3280
void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register rscratch) {
3281
assert(rscratch != noreg || always_reachable(src), "missing");
3283
if (reachable(src)) {
3284
Assembler::roundsd(dst, as_Address(src), rmode);
3287
Assembler::roundsd(dst, Address(rscratch, 0), rmode);
3291
void MacroAssembler::subss(XMMRegister dst, AddressLiteral src, Register rscratch) {
3292
assert(rscratch != noreg || always_reachable(src), "missing");
3294
if (reachable(src)) {
3295
Assembler::subss(dst, as_Address(src));
3298
Assembler::subss(dst, Address(rscratch, 0));
3302
void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src, Register rscratch) {
3303
assert(rscratch != noreg || always_reachable(src), "missing");
3305
if (reachable(src)) {
3306
Assembler::ucomisd(dst, as_Address(src));
3309
Assembler::ucomisd(dst, Address(rscratch, 0));
3313
void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src, Register rscratch) {
3314
assert(rscratch != noreg || always_reachable(src), "missing");
3316
if (reachable(src)) {
3317
Assembler::ucomiss(dst, as_Address(src));
3320
Assembler::ucomiss(dst, Address(rscratch, 0));
3324
void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
3325
assert(rscratch != noreg || always_reachable(src), "missing");
3327
// Used in sign-bit flipping with aligned address.
3328
assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3329
if (reachable(src)) {
3330
Assembler::xorpd(dst, as_Address(src));
3333
Assembler::xorpd(dst, Address(rscratch, 0));
3337
void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
3338
if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3339
Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3342
Assembler::xorpd(dst, src);
3346
void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3347
if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3348
Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3350
Assembler::xorps(dst, src);
3354
void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register rscratch) {
3355
assert(rscratch != noreg || always_reachable(src), "missing");
3357
// Used in sign-bit flipping with aligned address.
3358
assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3359
if (reachable(src)) {
3360
Assembler::xorps(dst, as_Address(src));
3363
Assembler::xorps(dst, Address(rscratch, 0));
3367
void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src, Register rscratch) {
3368
assert(rscratch != noreg || always_reachable(src), "missing");
3370
// Used in sign-bit flipping with aligned address.
3371
bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3372
assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3373
if (reachable(src)) {
3374
Assembler::pshufb(dst, as_Address(src));
3377
Assembler::pshufb(dst, Address(rscratch, 0));
3381
// AVX 3-operands instructions
3383
void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3384
assert(rscratch != noreg || always_reachable(src), "missing");
3386
if (reachable(src)) {
3387
vaddsd(dst, nds, as_Address(src));
3390
vaddsd(dst, nds, Address(rscratch, 0));
3394
void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3395
assert(rscratch != noreg || always_reachable(src), "missing");
3397
if (reachable(src)) {
3398
vaddss(dst, nds, as_Address(src));
3401
vaddss(dst, nds, Address(rscratch, 0));
3405
void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3406
assert(UseAVX > 0, "requires some form of AVX");
3407
assert(rscratch != noreg || always_reachable(src), "missing");
3409
if (reachable(src)) {
3410
Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
3413
Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
3417
void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3418
assert(UseAVX > 0, "requires some form of AVX");
3419
assert(rscratch != noreg || always_reachable(src), "missing");
3421
if (reachable(src)) {
3422
Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
3425
Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
3429
void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) {
3430
assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3431
assert(rscratch != noreg || always_reachable(negate_field), "missing");
3433
vandps(dst, nds, negate_field, vector_len, rscratch);
3436
void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) {
3437
assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3438
assert(rscratch != noreg || always_reachable(negate_field), "missing");
3440
vandpd(dst, nds, negate_field, vector_len, rscratch);
3443
void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3444
assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3445
Assembler::vpaddb(dst, nds, src, vector_len);
3448
void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3449
assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3450
Assembler::vpaddb(dst, nds, src, vector_len);
3453
void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3454
assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3455
Assembler::vpaddw(dst, nds, src, vector_len);
3458
void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3459
assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3460
Assembler::vpaddw(dst, nds, src, vector_len);
3463
void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3464
assert(rscratch != noreg || always_reachable(src), "missing");
3466
if (reachable(src)) {
3467
Assembler::vpand(dst, nds, as_Address(src), vector_len);
3470
Assembler::vpand(dst, nds, Address(rscratch, 0), vector_len);
3474
void MacroAssembler::vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3475
assert(rscratch != noreg || always_reachable(src), "missing");
3477
if (reachable(src)) {
3478
Assembler::vpbroadcastd(dst, as_Address(src), vector_len);
3481
Assembler::vpbroadcastd(dst, Address(rscratch, 0), vector_len);
3485
void MacroAssembler::vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3486
assert(rscratch != noreg || always_reachable(src), "missing");
3488
if (reachable(src)) {
3489
Assembler::vpbroadcastq(dst, as_Address(src), vector_len);
3492
Assembler::vpbroadcastq(dst, Address(rscratch, 0), vector_len);
3496
void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3497
assert(rscratch != noreg || always_reachable(src), "missing");
3499
if (reachable(src)) {
3500
Assembler::vbroadcastsd(dst, as_Address(src), vector_len);
3503
Assembler::vbroadcastsd(dst, Address(rscratch, 0), vector_len);
3507
void MacroAssembler::vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3508
assert(rscratch != noreg || always_reachable(src), "missing");
3510
if (reachable(src)) {
3511
Assembler::vbroadcastss(dst, as_Address(src), vector_len);
3514
Assembler::vbroadcastss(dst, Address(rscratch, 0), vector_len);
3518
// Vector float blend
// vblendvps(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg)
void MacroAssembler::vblendvps(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) {
  // WARN: Allow dst == (src1|src2), mask == scratch
  bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1;
  bool scratch_available = scratch != xnoreg && scratch != src1 && scratch != src2 && scratch != dst;
  bool dst_available = dst != mask && (dst != src1 || dst != src2);
  if (blend_emulation && scratch_available && dst_available) {
    if (compute_mask) {
      vpsrad(scratch, mask, 32, vector_len);
      mask = scratch;
    }
    if (dst == src1) {
      vpandn(dst,     mask, src1, vector_len); // if mask == 0, src1
      vpand (scratch, mask, src2, vector_len); // if mask == 1, src2
    } else {
      vpand (dst,     mask, src2, vector_len); // if mask == 1, src2
      vpandn(scratch, mask, src1, vector_len); // if mask == 0, src1
    }
    vpor(dst, dst, scratch, vector_len);
  } else {
    Assembler::vblendvps(dst, src1, src2, mask, vector_len);
  }
}

// vblendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg)
void MacroAssembler::vblendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) {
  // WARN: Allow dst == (src1|src2), mask == scratch
  bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1;
  bool scratch_available = scratch != xnoreg && scratch != src1 && scratch != src2 && scratch != dst && (!compute_mask || scratch != mask);
  bool dst_available = dst != mask && (dst != src1 || dst != src2);
  if (blend_emulation && scratch_available && dst_available) {
    if (compute_mask) {
      vpxor(scratch, scratch, scratch, vector_len);
      vpcmpgtq(scratch, scratch, mask, vector_len);
      mask = scratch;
    }
    if (dst == src1) {
      vpandn(dst,     mask, src1, vector_len); // if mask == 0, src1
      vpand (scratch, mask, src2, vector_len); // if mask == 1, src2
    } else {
      vpand (dst,     mask, src2, vector_len); // if mask == 1, src2
      vpandn(scratch, mask, src1, vector_len); // if mask == 0, src1
    }
    vpor(dst, dst, scratch, vector_len);
  } else {
    Assembler::vblendvpd(dst, src1, src2, mask, vector_len);
  }
}

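// Illustrative note: the emulated blend above computes, per element,
//   dst = (mask & src2) | (~mask & src1)
// which matches the VBLENDVPS/VBLENDVPD semantics once 'mask' holds an all-ones or
// all-zeros lane; when compute_mask is set, the lane's sign bit is first broadcast
// across the lane to form that mask.
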
void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3569
assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3570
Assembler::vpcmpeqb(dst, nds, src, vector_len);
3573
void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister src1, Address src2, int vector_len) {
3574
assert(((dst->encoding() < 16 && src1->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3575
Assembler::vpcmpeqb(dst, src1, src2, vector_len);
3578
void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3579
assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3580
Assembler::vpcmpeqw(dst, nds, src, vector_len);
3583
void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3584
assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3585
Assembler::vpcmpeqw(dst, nds, src, vector_len);
3588
void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3589
assert(rscratch != noreg || always_reachable(src), "missing");
3591
if (reachable(src)) {
3592
Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3595
Assembler::evpcmpeqd(kdst, mask, nds, Address(rscratch, 0), vector_len);
3599
void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3600
int comparison, bool is_signed, int vector_len, Register rscratch) {
3601
assert(rscratch != noreg || always_reachable(src), "missing");
3603
if (reachable(src)) {
3604
Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3607
Assembler::evpcmpd(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3611
void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3612
int comparison, bool is_signed, int vector_len, Register rscratch) {
3613
assert(rscratch != noreg || always_reachable(src), "missing");
3615
if (reachable(src)) {
3616
Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3619
Assembler::evpcmpq(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3623
void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3624
int comparison, bool is_signed, int vector_len, Register rscratch) {
3625
assert(rscratch != noreg || always_reachable(src), "missing");
3627
if (reachable(src)) {
3628
Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3631
Assembler::evpcmpb(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3635
void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3636
int comparison, bool is_signed, int vector_len, Register rscratch) {
3637
assert(rscratch != noreg || always_reachable(src), "missing");
3639
if (reachable(src)) {
3640
Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3643
Assembler::evpcmpw(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3647
void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3648
if (width == Assembler::Q) {
3649
Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3651
Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3655
void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len) {
3656
int eq_cond_enc = 0x29;
3657
int gt_cond_enc = 0x37;
3658
if (width != Assembler::Q) {
3659
eq_cond_enc = 0x74 + width;
3660
gt_cond_enc = 0x64 + width;
3664
vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3667
vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3668
vallones(xtmp, vector_len);
3669
vpxor(dst, xtmp, dst, vector_len);
3672
vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3673
vallones(xtmp, vector_len);
3674
vpxor(dst, xtmp, dst, vector_len);
3677
vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3678
vallones(xtmp, vector_len);
3679
vpxor(dst, xtmp, dst, vector_len);
3682
vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3685
vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3688
assert(false, "Should not reach here");
3692
void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3693
assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3694
Assembler::vpmovzxbw(dst, src, vector_len);
3697
void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) {
3698
assert((src->encoding() < 16),"XMM register should be 0-15");
3699
Assembler::vpmovmskb(dst, src, vector_len);
3702
void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3703
assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3704
Assembler::vpmullw(dst, nds, src, vector_len);
3707
void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3708
assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3709
Assembler::vpmullw(dst, nds, src, vector_len);
3712
void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3713
assert((UseAVX > 0), "AVX support is needed");
3714
assert(rscratch != noreg || always_reachable(src), "missing");
3716
if (reachable(src)) {
3717
Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
3720
Assembler::vpmulld(dst, nds, Address(rscratch, 0), vector_len);
3724
void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3725
assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3726
Assembler::vpsubb(dst, nds, src, vector_len);
3729
void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3730
assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3731
Assembler::vpsubb(dst, nds, src, vector_len);
3734
void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3735
assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3736
Assembler::vpsubw(dst, nds, src, vector_len);
3739
void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3740
assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3741
Assembler::vpsubw(dst, nds, src, vector_len);
3744
void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3745
assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3746
Assembler::vpsraw(dst, nds, shift, vector_len);
3749
void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3750
assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3751
Assembler::vpsraw(dst, nds, shift, vector_len);
3754
void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3755
assert(UseAVX > 2,"");
3756
if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3759
Assembler::evpsraq(dst, nds, shift, vector_len);
3762
void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3763
assert(UseAVX > 2,"");
3764
if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3767
Assembler::evpsraq(dst, nds, shift, vector_len);
3770
void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3771
assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3772
Assembler::vpsrlw(dst, nds, shift, vector_len);
3775
void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3776
assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3777
Assembler::vpsrlw(dst, nds, shift, vector_len);
3780
void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3781
assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3782
Assembler::vpsllw(dst, nds, shift, vector_len);
3785
void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3786
assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3787
Assembler::vpsllw(dst, nds, shift, vector_len);
3790
void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3791
assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3792
Assembler::vptest(dst, src);
3795
void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3796
assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3797
Assembler::punpcklbw(dst, src);
3800
void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3801
assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3802
Assembler::pshufd(dst, src, mode);
3805
void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3806
assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3807
Assembler::pshuflw(dst, src, mode);
3810
void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3811
assert(rscratch != noreg || always_reachable(src), "missing");
3813
if (reachable(src)) {
3814
vandpd(dst, nds, as_Address(src), vector_len);
3817
vandpd(dst, nds, Address(rscratch, 0), vector_len);
3821
void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3822
assert(rscratch != noreg || always_reachable(src), "missing");
3824
if (reachable(src)) {
3825
vandps(dst, nds, as_Address(src), vector_len);
3828
vandps(dst, nds, Address(rscratch, 0), vector_len);
3832
void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3833
bool merge, int vector_len, Register rscratch) {
3834
assert(rscratch != noreg || always_reachable(src), "missing");
3836
if (reachable(src)) {
3837
Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3840
Assembler::evpord(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
3844
void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3845
assert(rscratch != noreg || always_reachable(src), "missing");
3847
if (reachable(src)) {
3848
vdivsd(dst, nds, as_Address(src));
3851
vdivsd(dst, nds, Address(rscratch, 0));
3855
void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3856
assert(rscratch != noreg || always_reachable(src), "missing");
3858
if (reachable(src)) {
3859
vdivss(dst, nds, as_Address(src));
3862
vdivss(dst, nds, Address(rscratch, 0));
3866
void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3867
assert(rscratch != noreg || always_reachable(src), "missing");
3869
if (reachable(src)) {
3870
vmulsd(dst, nds, as_Address(src));
3873
vmulsd(dst, nds, Address(rscratch, 0));
3877
void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3878
assert(rscratch != noreg || always_reachable(src), "missing");
3880
if (reachable(src)) {
3881
vmulss(dst, nds, as_Address(src));
3884
vmulss(dst, nds, Address(rscratch, 0));
3888
void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3889
assert(rscratch != noreg || always_reachable(src), "missing");
3891
if (reachable(src)) {
3892
vsubsd(dst, nds, as_Address(src));
3895
vsubsd(dst, nds, Address(rscratch, 0));
3899
void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3900
assert(rscratch != noreg || always_reachable(src), "missing");
3902
if (reachable(src)) {
3903
vsubss(dst, nds, as_Address(src));
3906
vsubss(dst, nds, Address(rscratch, 0));
3910
void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
  assert(rscratch != noreg || always_reachable(src), "missing");

  vxorps(dst, nds, src, Assembler::AVX_128bit, rscratch);
}

void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
  assert(rscratch != noreg || always_reachable(src), "missing");

  vxorpd(dst, nds, src, Assembler::AVX_128bit, rscratch);
}
void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    vxorpd(dst, nds, as_Address(src), vector_len);
  } else {
    lea(rscratch, src);
    vxorpd(dst, nds, Address(rscratch, 0), vector_len);
  }
}

void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    vxorps(dst, nds, as_Address(src), vector_len);
  } else {
    lea(rscratch, src);
    vxorps(dst, nds, Address(rscratch, 0), vector_len);
  }
}
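// 256-bit integer vpxor requires AVX2; on AVX1-only hardware the wrapper below
// falls back to the equivalent floating-point xor (vxorpd).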
void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (UseAVX > 1 || (vector_len < 1)) {
    if (reachable(src)) {
      Assembler::vpxor(dst, nds, as_Address(src), vector_len);
    } else {
      lea(rscratch, src);
      Assembler::vpxor(dst, nds, Address(rscratch, 0), vector_len);
    }
  } else {
    MacroAssembler::vxorpd(dst, nds, src, vector_len, rscratch);
  }
}

void MacroAssembler::vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::vpermd(dst, nds, as_Address(src), vector_len);
  } else {
    lea(rscratch, src);
    Assembler::vpermd(dst, nds, Address(rscratch, 0), vector_len);
  }
}
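// JNI handles carry a small type tag in their low bits (JNIHandles::tag_mask);
// clear_jobject_tag() strips that tag to recover the raw handle address.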
void MacroAssembler::clear_jobject_tag(Register possibly_non_local) {
  const int32_t inverted_mask = ~static_cast<int32_t>(JNIHandles::tag_mask);
  STATIC_ASSERT(inverted_mask == -4); // otherwise check this code
  // The inverted mask is sign-extended
  andptr(possibly_non_local, inverted_mask);
}
void MacroAssembler::resolve_jobject(Register value,
                                     Register thread,
                                     Register tmp) {
  assert_different_registers(value, thread, tmp);
  Label done, tagged, weak_tagged;
  testptr(value, value);
  jcc(Assembler::zero, done);           // Use null as-is.
  testptr(value, JNIHandles::tag_mask); // Test for tag.
  jcc(Assembler::notZero, tagged);

  // Resolve local handle
  access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp, thread);
  verify_oop(value);
  jmp(done);

  bind(tagged);
  testptr(value, JNIHandles::TypeTag::weak_global); // Test for weak tag.
  jcc(Assembler::notZero, weak_tagged);

  // Resolve global handle
  access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp, thread);
  verify_oop(value);
  jmp(done);

  bind(weak_tagged);
  // Resolve jweak.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
                 value, Address(value, -JNIHandles::TypeTag::weak_global), tmp, thread);
  verify_oop(value);

  bind(done);
}
void MacroAssembler::resolve_global_jobject(Register value,
                                            Register thread,
                                            Register tmp) {
  assert_different_registers(value, thread, tmp);
  Label done;

  testptr(value, value);
  jcc(Assembler::zero, done);           // Use null as-is.

#ifdef ASSERT
  {
    Label valid_global_tag;
    testptr(value, JNIHandles::TypeTag::global); // Test for global tag.
    jcc(Assembler::notZero, valid_global_tag);
    stop("non global jobject using resolve_global_jobject");
    bind(valid_global_tag);
  }
#endif

  // Resolve global handle
  access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp, thread);
  verify_oop(value);

  bind(done);
}
void MacroAssembler::subptr(Register dst, int32_t imm32) {
  LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
}

// Force generation of a 4 byte immediate value even if it fits into 8bit
void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
  LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
}

void MacroAssembler::subptr(Register dst, Register src) {
  LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
}
// C++ bool manipulation
void MacroAssembler::testbool(Register dst) {
  if(sizeof(bool) == 1)
    testb(dst, 0xff);
  else if(sizeof(bool) == 2) {
    // testw implementation needed for two byte bools
    ShouldNotReachHere();
  } else if(sizeof(bool) == 4)
    testl(dst, dst);
  else
    // unsupported
    ShouldNotReachHere();
}

void MacroAssembler::testptr(Register dst, Register src) {
  LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
}
// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
void MacroAssembler::tlab_allocate(Register thread, Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
}
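// The register sets below describe the caller-saved (call-clobbered) registers
// that push/pop_call_clobbered_registers_except() spill around runtime calls.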
RegSet MacroAssembler::call_clobbered_gp_registers() {
  RegSet regs;
#ifdef _LP64
  regs += RegSet::of(rax, rcx, rdx);
#ifndef _WINDOWS
  regs += RegSet::of(rsi, rdi);
#endif
  regs += RegSet::range(r8, r11);
#else
  regs += RegSet::of(rax, rcx, rdx);
#endif
#ifdef _LP64
  if (UseAPX) {
    regs += RegSet::range(r16, as_Register(Register::number_of_registers - 1));
  }
#endif
  return regs;
}
XMMRegSet MacroAssembler::call_clobbered_xmm_registers() {
  int num_xmm_registers = XMMRegister::available_xmm_registers();
#if defined(_WINDOWS) && defined(_LP64)
  XMMRegSet result = XMMRegSet::range(xmm0, xmm5);
  if (num_xmm_registers > 16) {
    result += XMMRegSet::range(xmm16, as_XMMRegister(num_xmm_registers - 1));
  }
  return result;
#else
  return XMMRegSet::range(xmm0, as_XMMRegister(num_xmm_registers - 1));
#endif
}
#ifndef _LP64
static int FPUSaveAreaSize = align_up(108, StackAlignmentInBytes); // 108 bytes needed for FPU state by fsave/frstor
#endif // !_LP64

static bool use_x87_registers() { return UseSSE < 2; }
static bool use_xmm_registers() { return UseSSE >= 1; }

// C1 only ever uses the first double/float of the XMM register.
static int xmm_save_size() { return UseSSE >= 2 ? sizeof(double) : sizeof(float); }

static void save_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
  if (UseSSE == 1) {
    masm->movflt(Address(rsp, offset), reg);
  } else {
    masm->movdbl(Address(rsp, offset), reg);
  }
}

static void restore_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
  if (UseSSE == 1) {
    masm->movflt(reg, Address(rsp, offset));
  } else {
    masm->movdbl(reg, Address(rsp, offset));
  }
}

static int register_section_sizes(RegSet gp_registers, XMMRegSet xmm_registers,
                                  bool save_fpu, int& gp_area_size,
                                  int& fp_area_size, int& xmm_area_size) {
  gp_area_size = align_up(gp_registers.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size,
                          StackAlignmentInBytes);
#ifdef _LP64
  fp_area_size = 0;
#else
  fp_area_size = (save_fpu && use_x87_registers()) ? FPUSaveAreaSize : 0;
#endif
  xmm_area_size = (save_fpu && use_xmm_registers()) ? xmm_registers.size() * xmm_save_size() : 0;

  return gp_area_size + fp_area_size + xmm_area_size;
}
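// Stack layout used by the push/pop pair below (offsets from rsp after the
// initial subptr): [GP register area][x87 area, 32-bit only][XMM area], with
// the section sizes supplied by register_section_sizes() above.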
void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude, bool save_fpu) {
  block_comment("push_call_clobbered_registers start");
  // Regular registers
  RegSet gp_registers_to_push = call_clobbered_gp_registers() - exclude;

  int gp_area_size;
  int fp_area_size;
  int xmm_area_size;
  int total_save_size = register_section_sizes(gp_registers_to_push, call_clobbered_xmm_registers(), save_fpu,
                                               gp_area_size, fp_area_size, xmm_area_size);
  subptr(rsp, total_save_size);

  push_set(gp_registers_to_push, 0);

#ifndef _LP64
  if (save_fpu && use_x87_registers()) {
    fnsave(Address(rsp, gp_area_size));
    fwait();
  }
#endif // !_LP64
  if (save_fpu && use_xmm_registers()) {
    push_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size);
  }

  block_comment("push_call_clobbered_registers end");
}
void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu) {
  block_comment("pop_call_clobbered_registers start");

  RegSet gp_registers_to_pop = call_clobbered_gp_registers() - exclude;

  int gp_area_size;
  int fp_area_size;
  int xmm_area_size;
  int total_save_size = register_section_sizes(gp_registers_to_pop, call_clobbered_xmm_registers(), restore_fpu,
                                               gp_area_size, fp_area_size, xmm_area_size);

  if (restore_fpu && use_xmm_registers()) {
    pop_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size);
  }
#ifndef _LP64
  if (restore_fpu && use_x87_registers()) {
    frstor(Address(rsp, gp_area_size));
  }
#endif // !_LP64

  pop_set(gp_registers_to_pop, 0);

  addptr(rsp, total_save_size);

  block_comment("pop_call_clobbered_registers end");
}
void MacroAssembler::push_set(XMMRegSet set, int offset) {
  assert(is_aligned(set.size() * xmm_save_size(), StackAlignmentInBytes), "must be");
  int spill_offset = offset;

  for (RegSetIterator<XMMRegister> it = set.begin(); *it != xnoreg; ++it) {
    save_xmm_register(this, spill_offset, *it);
    spill_offset += xmm_save_size();
  }
}

void MacroAssembler::pop_set(XMMRegSet set, int offset) {
  int restore_size = set.size() * xmm_save_size();
  assert(is_aligned(restore_size, StackAlignmentInBytes), "must be");

  int restore_offset = offset + restore_size - xmm_save_size();

  for (ReverseRegSetIterator<XMMRegister> it = set.rbegin(); *it != xnoreg; ++it) {
    restore_xmm_register(this, restore_offset, *it);
    restore_offset -= xmm_save_size();
  }
}
void MacroAssembler::push_set(RegSet set, int offset) {
  int spill_offset;
  if (offset == -1) {
    int register_push_size = set.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size;
    int aligned_size = align_up(register_push_size, StackAlignmentInBytes);
    subptr(rsp, aligned_size);
    spill_offset = 0;
  } else {
    spill_offset = offset;
  }

  for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
    movptr(Address(rsp, spill_offset), *it);
    spill_offset += Register::max_slots_per_register * VMRegImpl::stack_slot_size;
  }
}

void MacroAssembler::pop_set(RegSet set, int offset) {

  int gp_reg_size = Register::max_slots_per_register * VMRegImpl::stack_slot_size;
  int restore_size = set.size() * gp_reg_size;
  int aligned_size = align_up(restore_size, StackAlignmentInBytes);

  int restore_offset;
  if (offset == -1) {
    restore_offset = restore_size - gp_reg_size;
  } else {
    restore_offset = offset + restore_size - gp_reg_size;
  }
  for (ReverseRegSetIterator<Register> it = set.rbegin(); *it != noreg; ++it) {
    movptr(*it, Address(rsp, restore_offset));
    restore_offset -= gp_reg_size;
  }

  if (offset == -1) {
    addptr(rsp, aligned_size);
  }
}
// Preserves the contents of address, destroys the contents length_in_bytes and temp.
void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
  assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
  assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
  Label done;

  testptr(length_in_bytes, length_in_bytes);
  jcc(Assembler::zero, done);

  // initialize topmost word, divide index by 2, check if odd and test if zero
  // note: for the remaining code to work, index must be a multiple of BytesPerWord
#ifdef ASSERT
  {
    Label L;
    testptr(length_in_bytes, BytesPerWord - 1);
    jcc(Assembler::zero, L);
    stop("length must be a multiple of BytesPerWord");
    bind(L);
  }
#endif
  Register index = length_in_bytes;
  xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
  if (UseIncDec) {
    shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
  } else {
    shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
    shrptr(index, 1);
  }
#ifndef _LP64
  // index could have not been a multiple of 8 (i.e., bit 2 was set)
  {
    Label even;
    // note: if index was a multiple of 8, then it cannot
    //       be 0 now otherwise it must have been 0 before
    //       => if it is even, we don't need to check for 0 again
    jcc(Assembler::carryClear, even);
    // clear topmost word (no jump would be needed if conditional assignment worked here)
    movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
    // index could be 0 now, must check again
    jcc(Assembler::zero, done);
    bind(even);
  }
#endif // !_LP64
  // initialize remaining object fields: index is a multiple of 2 now
  {
    Label loop;
    bind(loop);
    movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
    NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
    decrement(index);
    jcc(Assembler::notZero, loop);
  }

  bind(done);
}
// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");

  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = in_bytes(itableMethodEntry::method_offset());
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // Could store the aligned, prescaled offset in the klass.
  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
  }

  // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset()));
    cmpptr(intf_klass, method_result);

    if (peel) {
      jccb(Assembler::equal, found_method);
    } else {
      jccb(Assembler::notEqual, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    testptr(method_result, method_result);
    jcc(Assembler::zero, L_no_such_interface);
    addptr(scan_temp, scan_step);
  }

  bind(found_method);

  if (return_method) {
    // Got a hit.
    movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset()));
    movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
  }
}
// Look up the method for a megamorphic invokeinterface call in a single pass over itable:
// - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
// - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
// The target method is determined by <holder_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
                                                  Register holder_klass,
                                                  Register resolved_klass,
                                                  Register method_result,
                                                  Register scan_temp,
                                                  Register temp_reg2,
                                                  Register receiver,
                                                  int itable_index,
                                                  Label& L_no_such_interface) {
  assert_different_registers(recv_klass, method_result, holder_klass, resolved_klass, scan_temp, temp_reg2, receiver);
  Register temp_itbl_klass = method_result;
  Register temp_reg = (temp_reg2 == noreg ? recv_klass : temp_reg2); // reuse recv_klass register on 32-bit x86 impl

  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = in_bytes(itableMethodEntry::method_offset());
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  int ioffset = in_bytes(itableOffsetEntry::interface_offset());
  int ooffset = in_bytes(itableOffsetEntry::offset_offset());
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
  assert(vte_size == wordSize, "adjust times_vte_scale");

  Label L_loop_scan_resolved_entry, L_resolved_found, L_holder_found;

  // temp_itbl_klass = recv_klass.itable[0]
  // scan_temp = &recv_klass.itable[0] + step
  movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
  movptr(temp_itbl_klass, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset));
  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset + scan_step));
  xorptr(temp_reg, temp_reg);

  // Initial checks:
  // - if (holder_klass != resolved_klass), go to "scan for resolved"
  // - if (itable[0] == 0), no such interface
  // - if (itable[0] == holder_klass), shortcut to "holder found"
  cmpptr(holder_klass, resolved_klass);
  jccb(Assembler::notEqual, L_loop_scan_resolved_entry);
  testptr(temp_itbl_klass, temp_itbl_klass);
  jccb(Assembler::zero, L_no_such_interface);
  cmpptr(holder_klass, temp_itbl_klass);
  jccb(Assembler::equal, L_holder_found);

  // Loop: Look for holder_klass record in itable
  //   do {
  //     tmp = itable[index];
  //     index += step;
  //     if (tmp == holder_klass) {
  //       goto L_holder_found; // Found!
  //     }
  //   } while (tmp != 0);
  //   goto L_no_such_interface // Not found.
  Label L_scan_holder;
  bind(L_scan_holder);
  movptr(temp_itbl_klass, Address(scan_temp, 0));
  addptr(scan_temp, scan_step);
  cmpptr(holder_klass, temp_itbl_klass);
  jccb(Assembler::equal, L_holder_found);
  testptr(temp_itbl_klass, temp_itbl_klass);
  jccb(Assembler::notZero, L_scan_holder);

  jmpb(L_no_such_interface);

  // Loop: Look for resolved_class record in itable
  //   do {
  //     tmp = itable[index];
  //     index += step;
  //     if (tmp == holder_klass) {
  //        // Also check if we have met a holder klass
  //        holder_tmp = itable[index-step-ioffset];
  //     }
  //     if (tmp == resolved_klass) {
  //        goto L_resolved_found;  // Found!
  //     }
  //   } while (tmp != 0);
  //   goto L_no_such_interface // Not found.
  //
  Label L_loop_scan_resolved;
  bind(L_loop_scan_resolved);
  movptr(temp_itbl_klass, Address(scan_temp, 0));
  addptr(scan_temp, scan_step);
  bind(L_loop_scan_resolved_entry);
  cmpptr(holder_klass, temp_itbl_klass);
  cmovl(Assembler::equal, temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));
  cmpptr(resolved_klass, temp_itbl_klass);
  jccb(Assembler::equal, L_resolved_found);
  testptr(temp_itbl_klass, temp_itbl_klass);
  jccb(Assembler::notZero, L_loop_scan_resolved);

  jmpb(L_no_such_interface);

  Label L_ready;

  // See if we already have a holder klass. If not, go and scan for it.
  bind(L_resolved_found);
  testptr(temp_reg, temp_reg);
  jccb(Assembler::zero, L_scan_holder);
  jmpb(L_ready);

  bind(L_holder_found);
  movl(temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));

  // Finally, temp_reg contains holder_klass vtable offset
  bind(L_ready);
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  if (temp_reg2 == noreg) { // recv_klass register is clobbered for 32-bit x86 impl
    load_klass(scan_temp, receiver, noreg);
    movptr(method_result, Address(scan_temp, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
  } else {
    movptr(method_result, Address(recv_klass, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
  }
}
// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const ByteSize base = Klass::vtable_start_offset();
  assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
  Address vtable_entry_addr(recv_klass,
                            vtable_index, Address::times_ptr,
                            base + vtableEntry::method_offset());
  movptr(method_result, vtable_entry_addr);
}
void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, nullptr);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, nullptr);
  bind(L_failure);
}
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one null in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
  // range of a jccb.  If this routine grows larger, reconsider at
  // least some of these.
#define local_jcc(assembler_cond, label)                                \
  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
  else                             jcc( assembler_cond, label) /*omit semi*/

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            jmp(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmpptr(sub_klass, super_klass);
  local_jcc(Assembler::equal, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // Positive movl does right thing on LP64.
    movl(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
  cmpptr(super_klass, super_check_addr); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    local_jcc(Assembler::equal, *L_success);
    cmpl(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_slow_path);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef local_jcc
#undef final_jmp
}
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one null in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)

  // Get super_klass value into rax (even if it was in rdi or rcx).
  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
  if (super_klass != rax) {
    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
    mov(rax, super_klass);
  }
  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }

#ifndef PRODUCT
  uint* pst_counter = &SharedRuntime::_partial_subtype_ctr;
  ExternalAddress pst_counter_addr((address) pst_counter);
  NOT_LP64(  incrementl(pst_counter_addr) );
  LP64_ONLY( lea(rcx, pst_counter_addr) );
  LP64_ONLY( incrementl(Address(rcx, 0)) );
#endif //PRODUCT

  // We will consult the secondary-super array.
  movptr(rdi, secondary_supers_addr);
  // Load the array length.  (Positive movl does right thing on LP64.)
  movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  addptr(rdi, Array<Klass*>::base_offset_in_bytes());

  // Scan RCX words at [RDI] for an occurrence of RAX.
  // Set NZ/Z based on last compare.
  // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
  // not change flags (only scas instruction which is repeated sets flags).
  // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.

  testptr(rax,rax); // Set Z = 0
  repne_scan();

  // Unspill the temp. registers:
  if (pushed_rdi)  pop(rdi);
  if (pushed_rcx)  pop(rcx);
  if (pushed_rax)  pop(rax);

  if (set_cond_codes) {
    // Special hack for the AD files:  rdi is guaranteed non-zero.
    assert(!pushed_rdi, "rdi must be left non-null");
    // Also, the condition codes are properly set Z/NZ on succeed/failure.
  }

  if (L_failure == &L_fallthrough)
        jccb(Assembler::notEqual, *L_failure);
  else  jcc(Assembler::notEqual, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  movptr(super_cache_addr, super_klass);

  if (L_success != &L_fallthrough) {
    jmp(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}
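// The no-POPCNT fallback below uses Kernighan's method: each iteration of
// 'scratch1 &= (scratch1 - 1)' clears the lowest set bit, so the loop runs
// once per set bit of the input.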
// population_count variant for running without the POPCNT
// instruction, which was introduced with SSE4.2 in 2008.
void MacroAssembler::population_count(Register dst, Register src,
                                      Register scratch1, Register scratch2) {
  assert_different_registers(src, scratch1, scratch2);
  if (UsePopCountInstruction) {
    Assembler::popcntq(dst, src);
  } else {
    assert_different_registers(src, scratch1, scratch2);
    assert_different_registers(dst, scratch1, scratch2);
    Label loop, done;

    mov(scratch1, src);
    // dst = 0;
    // while(scratch1 != 0) {
    //   dst++;
    //   scratch1 &= (scratch1 - 1);
    // }
    xorl(dst, dst);
    testq(scratch1, scratch1);
    jccb(Assembler::equal, done);
    {
      bind(loop);
      incq(dst);
      movq(scratch2, scratch1);
      decq(scratch2);
      andq(scratch1, scratch2);
      jccb(Assembler::notEqual, loop);
    }
    bind(done);
  }
}
// Ensure that the inline code and the stub are using the same registers.
#define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS                      \
do {                                                                 \
  assert(r_super_klass  == rax, "mismatch");                         \
  assert(r_array_base   == rbx, "mismatch");                         \
  assert(r_array_length == rcx, "mismatch");                         \
  assert(r_array_index  == rdx, "mismatch");                         \
  assert(r_sub_klass    == rsi || r_sub_klass == noreg, "mismatch"); \
  assert(r_bitmap       == r11 || r_bitmap    == noreg, "mismatch"); \
  assert(result         == rdi || result      == noreg, "mismatch"); \
} while(0)
void MacroAssembler::lookup_secondary_supers_table(Register r_sub_klass,
                                                   Register r_super_klass,
                                                   Register temp1,
                                                   Register temp2,
                                                   Register temp3,
                                                   Register temp4,
                                                   Register result,
                                                   u1 super_klass_slot) {
  assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);

  Label L_fallthrough, L_success, L_failure;

  BLOCK_COMMENT("lookup_secondary_supers_table {");

  const Register
    r_array_index  = temp1,
    r_array_length = temp2,
    r_array_base   = temp3,
    r_bitmap       = temp4;

  LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;

  xorq(result, result); // = 0

  movq(r_bitmap, Address(r_sub_klass, Klass::bitmap_offset()));
  movq(r_array_index, r_bitmap);

  // First check the bitmap to see if super_klass might be present. If
  // the bit is zero, we are certain that super_klass is not one of
  // the secondary supers.
  u1 bit = super_klass_slot;
  {
    // NB: If the count in a x86 shift instruction is 0, the flags are
    // not affected, so we do a testq instead.
    int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit;
    if (shift_count != 0) {
      salq(r_array_index, shift_count);
    } else {
      testq(r_array_index, r_array_index);
    }
  }
  // We test the MSB of r_array_index, i.e. its sign bit
  jcc(Assembler::positive, L_failure);

  // Get the first array index that can contain super_klass into r_array_index.
  if (bit != 0) {
    population_count(r_array_index, r_array_index, temp2, temp3);
  } else {
    movl(r_array_index, 1);
  }
  // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.

  // We will consult the secondary-super array.
  movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));

  // We're asserting that the first word in an Array<Klass*> is the
  // length, and the second word is the first word of the data. If
  // that ever changes, r_array_base will have to be adjusted here.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
  assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");

  cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
  jccb(Assembler::equal, L_success);

  // Is there another entry to check? Consult the bitmap.
  btq(r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
  jccb(Assembler::carryClear, L_failure);

  // Linear probe. Rotate the bitmap so that the next bit to test is
  // in Bit 1.
  if (bit != 0) {
    rorq(r_bitmap, bit);
  }

  // Calls into the stub generated by lookup_secondary_supers_table_slow_path.
  // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
  // Kills: r_array_length.
  // Returns: result.
  call(RuntimeAddress(StubRoutines::lookup_secondary_supers_table_slow_path_stub()));
  // Result (0/1) is in rdi
  jmpb(L_fallthrough);

  bind(L_failure);
  incq(result); // 0 => 1

  bind(L_success);

  bind(L_fallthrough);
  BLOCK_COMMENT("} lookup_secondary_supers_table");

  if (VerifySecondarySupers) {
    verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
                                  temp1, temp2, temp3);
  }
}
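// Software analogue of 'repne scasq': scans the 8-byte slots at addr[count..limit)
// for 'value', branching to L_success on a match and to L_failure (or falling
// through) when the scan is exhausted.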
void MacroAssembler::repne_scanq(Register addr, Register value, Register count, Register limit,
                                 Label* L_success, Label* L_failure) {
  Label L_loop, L_fallthrough;
  {
    int label_nulls = 0;
    if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
    if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
    assert(label_nulls <= 1, "at most one null in the batch");
  }
  bind(L_loop);
  cmpq(value, Address(addr, count, Address::times_8));
  jcc(Assembler::equal, *L_success);
  addl(count, 1);
  cmpl(count, limit);
  jcc(Assembler::less, L_loop);

  if (&L_fallthrough != L_failure) {
    jmp(*L_failure);
  }
  bind(L_fallthrough);
}
// Called by code generated by check_klass_subtype_slow_path
// above. This is called when there is a collision in the hashed
// lookup in the secondary supers array.
void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
                                                             Register r_array_base,
                                                             Register r_array_index,
                                                             Register r_bitmap,
                                                             Register temp1,
                                                             Register temp2,
                                                             Label* L_success,
                                                             Label* L_failure) {
  assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, temp2);

  const Register
    r_array_length = temp1,
    r_sub_klass    = noreg,
    result         = noreg;

  LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
  if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one null in the batch");

  // Load the array length.
  movl(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
  // And adjust the array base to point to the data.
  // NB! Effectively increments current slot index by 1.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
  addptr(r_array_base, Array<Klass*>::base_offset_in_bytes());

  // Linear probe
  Label L_huge;

  // The bitmap is full to bursting.
  // Implicit invariant: BITMAP_FULL implies (length > 0)
  assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), "");
  cmpq(r_bitmap, (int32_t)-1); // sign-extends immediate to 64-bit value
  jcc(Assembler::equal, L_huge);

  // NB! Our caller has checked bits 0 and 1 in the bitmap. The
  // current slot (at secondary_supers[r_array_index]) has not yet
  // been inspected, and r_array_index may be out of bounds if we
  // wrapped around the end of the array.

  { // This is conventional linear probing, but instead of terminating
    // when a null entry is found in the table, we maintain a bitmap
    // in which a 0 indicates missing entries.
    // The check above guarantees there are 0s in the bitmap, so the loop
    // eventually terminates.

    xorl(temp2, temp2);  // = 0;

    Label L_again;
    bind(L_again);

    // Check for array wraparound.
    cmpl(r_array_index, r_array_length);
    cmovl(Assembler::greaterEqual, r_array_index, temp2);

    cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
    jcc(Assembler::equal, *L_success);

    // If the next bit in bitmap is zero, we're done.
    btq(r_bitmap, 2); // look-ahead check (Bit 2); Bits 0 and 1 are tested by now
    jcc(Assembler::carryClear, *L_failure);

    rorq(r_bitmap, 1); // Bits 1/2 => 0/1
    addl(r_array_index, 1);

    jmp(L_again);
  }

  { // Degenerate case: more than 64 secondary supers.
    // FIXME: We could do something smarter here, maybe a vectorized
    // comparison or a binary search, but is that worth any added
    // complexity?
    bind(L_huge);
    xorl(r_array_index, r_array_index); // = 0
    repne_scanq(r_array_base, r_super_klass, r_array_index, r_array_length,
                L_success,
                (&L_fallthrough != L_failure ? L_failure : nullptr));

    bind(L_fallthrough);
  }
}
struct VerifyHelperArguments {
  Klass* _super;
  Klass* _sub;
  intptr_t _linear_result;
  intptr_t _table_result;
};

static void verify_secondary_supers_table_helper(const char* msg, VerifyHelperArguments* args) {
  Klass::on_secondary_supers_verification_failure(args->_super,
                                                  args->_sub,
                                                  args->_linear_result,
                                                  args->_table_result,
                                                  msg);
}
// Make sure that the hashed lookup and a linear scan agree.
void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
                                                   Register r_super_klass,
                                                   Register result,
                                                   Register temp1,
                                                   Register temp2,
                                                   Register temp3) {
  const Register
    r_array_index  = temp1,
    r_array_length = temp2,
    r_array_base   = temp3,
    r_bitmap       = noreg;

  LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;

  BLOCK_COMMENT("verify_secondary_supers_table {");

  Label L_success, L_failure, L_check, L_done;

  movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
  movl(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
  // And adjust the array base to point to the data.
  addptr(r_array_base, Array<Klass*>::base_offset_in_bytes());

  testl(r_array_length, r_array_length); // array_length == 0?
  jcc(Assembler::zero, L_failure);

  movl(r_array_index, 0);
  repne_scanq(r_array_base, r_super_klass, r_array_index, r_array_length, &L_success);
  // fall through to L_failure

  const Register linear_result = r_array_index; // reuse temp1

  bind(L_failure); // not present
  movl(linear_result, 1);
  jmp(L_check);

  bind(L_success); // present
  movl(linear_result, 0);

  bind(L_check);
  cmpl(linear_result, result);
  jcc(Assembler::equal, L_done);

  { // To avoid calling convention issues, build a record on the stack
    // and pass the pointer to that instead.
    push(result);
    push(linear_result);
    push(r_sub_klass);
    push(r_super_klass);
    movptr(c_rarg1, rsp);
    movptr(c_rarg0, (uintptr_t) "mismatch");
    call(RuntimeAddress(CAST_FROM_FN_PTR(address, verify_secondary_supers_table_helper)));
    should_not_reach_here();
  }
  bind(L_done);

  BLOCK_COMMENT("} verify_secondary_supers_table");
}

#undef LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS
void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
  assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");

  Label L_fallthrough;
  if (L_fast_path == nullptr) {
    L_fast_path = &L_fallthrough;
  } else if (L_slow_path == nullptr) {
    L_slow_path = &L_fallthrough;
  }

  // Fast path check: class is fully initialized
  cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
  jcc(Assembler::equal, *L_fast_path);

  // Fast path check: current thread is initializer thread
  cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
  if (L_slow_path == &L_fallthrough) {
    jcc(Assembler::equal, *L_fast_path);
    bind(*L_slow_path);
  } else if (L_fast_path == &L_fallthrough) {
    jcc(Assembler::notEqual, *L_slow_path);
    bind(*L_fast_path);
  } else {
    Unimplemented();
  }
}
void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
  if (VM_Version::supports_cmov()) {
    cmovl(cc, dst, src);
  } else {
    Label L;
    jccb(negate_condition(cc), L);
    movl(dst, src);
    bind(L);
  }
}

void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
  if (VM_Version::supports_cmov()) {
    cmovl(cc, dst, src);
  } else {
    Label L;
    jccb(negate_condition(cc), L);
    movl(dst, src);
    bind(L);
  }
}
void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
  if (!VerifyOops) return;

  BLOCK_COMMENT("verify_oop {");
#ifdef _LP64
  push(rscratch1);
#endif
  push(rax);                          // save rax
  push(reg);                          // pass register argument

  // Pass register number to verify_oop_subroutine
  const char* b = nullptr;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
    b = code_string(ss.as_string());
  }
  AddressLiteral buffer((address) b, external_word_Relocation::spec_for_immediate());
  pushptr(buffer.addr(), rscratch1);

  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (oop, message) and restores rax, r10
  BLOCK_COMMENT("} verify_oop");
}
void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
  if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
    // Only pcmpeq has dependency breaking treatment (i.e the execution can begin without
    // waiting for the previous result on dst), not vpcmpeqd, so just use vpternlog
    vpternlogd(dst, 0xFF, dst, dst, vector_len);
  } else if (VM_Version::supports_avx()) {
    vpcmpeqd(dst, dst, dst, vector_len);
  } else {
    assert(VM_Version::supports_sse2(), "");
    pcmpeqd(dst, dst);
  }
}
Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  Register             scale_reg    = noreg;
  Address::ScaleFactor scale_factor = Address::no_scale;
  if (arg_slot.is_constant()) {
    offset += arg_slot.as_constant() * stackElementSize;
  } else {
    scale_reg    = arg_slot.as_register();
    scale_factor = Address::times(stackElementSize);
  }
  offset += wordSize;           // return PC is on stack
  return Address(rsp, scale_reg, scale_factor, offset);
}
void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
  if (!VerifyOops) return;

#ifdef _LP64
  push(rscratch1);
#endif
  push(rax);                          // save rax,
  // addr may contain rsp so we will have to adjust it based on the push
  // we just did (and on 64 bit we do two pushes)
  // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
  // stores rax into addr which is backwards of what was intended.
  if (addr.uses(rsp)) {
    lea(rax, addr);
    pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
  } else {
    pushptr(addr);
  }

  // Pass register number to verify_oop_subroutine
  const char* b = nullptr;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
    b = code_string(ss.as_string());
  }
  AddressLiteral buffer((address) b, external_word_Relocation::spec_for_immediate());
  pushptr(buffer.addr(), rscratch1);

  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (addr, message) and restores rax, r10.
}
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;
    Register t1 = rsi;
    Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);

    push(t1);
    NOT_LP64(push(thread_reg));
    NOT_LP64(get_thread(thread_reg));

    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
    jcc(Assembler::aboveEqual, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    jcc(Assembler::aboveEqual, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    NOT_LP64(pop(thread_reg));
    pop(t1);
  }
#endif
}
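// Debug-only helper classes: they decode the raw x87 control/status/tag words
// and the saved general-purpose register dump, and are used by
// print_CPU_state() and _verify_FPU() further below.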
class ControlWord {
 public:
  int32_t _value;

  int  rounding_control() const        { return  (_value >> 10) & 3      ; }
  int  precision_control() const       { return  (_value >>  8) & 3      ; }
  bool precision() const               { return ((_value >>  5) & 1) != 0; }
  bool underflow() const               { return ((_value >>  4) & 1) != 0; }
  bool overflow() const                { return ((_value >>  3) & 1) != 0; }
  bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
  bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
  bool invalid() const                 { return ((_value >>  0) & 1) != 0; }

  void print() const {
    // rounding control
    const char* rc;
    switch (rounding_control()) {
      case 0: rc = "round near"; break;
      case 1: rc = "round down"; break;
      case 2: rc = "round up  "; break;
      case 3: rc = "chop      "; break;
      default:
        rc = nullptr; // silence compiler warnings
        fatal("Unknown rounding control: %d", rounding_control());
    };
    // precision control
    const char* pc;
    switch (precision_control()) {
      case 0: pc = "24 bits "; break;
      case 1: pc = "reserved"; break;
      case 2: pc = "53 bits "; break;
      case 3: pc = "64 bits "; break;
      default:
        pc = nullptr; // silence compiler warnings
        fatal("Unknown precision control: %d", precision_control());
    };
    // flags
    char f[9];
    f[0] = ' ';
    f[1] = ' ';
    f[2] = (precision   ()) ? 'P' : 'p';
    f[3] = (underflow   ()) ? 'U' : 'u';
    f[4] = (overflow    ()) ? 'O' : 'o';
    f[5] = (zero_divide ()) ? 'Z' : 'z';
    f[6] = (denormalized()) ? 'D' : 'd';
    f[7] = (invalid     ()) ? 'I' : 'i';
    f[8] = '\x0';
    // output
    printf("%04x masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
  }
};

class StatusWord {
 public:
  int32_t _value;

  bool busy() const                    { return ((_value >> 15) & 1) != 0; }
  bool C3() const                      { return ((_value >> 14) & 1) != 0; }
  bool C2() const                      { return ((_value >> 10) & 1) != 0; }
  bool C1() const                      { return ((_value >>  9) & 1) != 0; }
  bool C0() const                      { return ((_value >>  8) & 1) != 0; }
  int  top() const                     { return  (_value >> 11) & 7      ; }
  bool error_status() const            { return ((_value >>  7) & 1) != 0; }
  bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
  bool precision() const               { return ((_value >>  5) & 1) != 0; }
  bool underflow() const               { return ((_value >>  4) & 1) != 0; }
  bool overflow() const                { return ((_value >>  3) & 1) != 0; }
  bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
  bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
  bool invalid() const                 { return ((_value >>  0) & 1) != 0; }

  void print() const {
    // condition codes
    char c[5];
    c[0] = (C3()) ? '3' : '-';
    c[1] = (C2()) ? '2' : '-';
    c[2] = (C1()) ? '1' : '-';
    c[3] = (C0()) ? '0' : '-';
    c[4] = '\x0';
    // flags
    char f[9];
    f[0] = (error_status()) ? 'E' : '-';
    f[1] = (stack_fault ()) ? 'S' : '-';
    f[2] = (precision   ()) ? 'P' : '-';
    f[3] = (underflow   ()) ? 'U' : '-';
    f[4] = (overflow    ()) ? 'O' : '-';
    f[5] = (zero_divide ()) ? 'Z' : '-';
    f[6] = (denormalized()) ? 'D' : '-';
    f[7] = (invalid     ()) ? 'I' : '-';
    f[8] = '\x0';
    // output
    printf("%04x flags = %s, cc = %s, top = %d", _value & 0xFFFF, f, c, top());
  }
};

class TagWord {
 public:
  int32_t _value;

  int tag_at(int i) const              { return (_value >> (i*2)) & 3; }

  void print() const {
    printf("%04x", _value & 0xFFFF);
  }
};

class FPU_Register {
 public:
  int32_t _m0;
  int32_t _m1;
  int16_t _ex;

  bool is_indefinite() const           {
    return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
  }

  void print() const {
    char  sign = (_ex < 0) ? '-' : '+';
    const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
    printf("%c%04hx.%08x%08x %s", sign, _ex, _m1, _m0, kind);
  }
};

class FPU_State {
 public:
  enum {
    register_size       = 10,
    number_of_registers =  8,
    register_mask       =  7
  };

  ControlWord  _control_word;
  StatusWord   _status_word;
  TagWord      _tag_word;
  int32_t      _error_offset;
  int32_t      _error_selector;
  int32_t      _data_offset;
  int32_t      _data_selector;
  int8_t       _register[register_size * number_of_registers];

  int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
  FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }

  const char* tag_as_string(int tag) const {
    switch (tag) {
      case 0: return "valid";
      case 1: return "zero";
      case 2: return "special";
      case 3: return "empty";
    }
    ShouldNotReachHere();
    return nullptr;
  }

  void print() const {
    // print computation registers
    { int t = _status_word.top();
      for (int i = 0; i < number_of_registers; i++) {
        int j = (i - t) & register_mask;
        printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
        st(j)->print();
        printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
      }
    }
    printf("\n");
    // print control registers
    printf("ctrl = "); _control_word.print(); printf("\n");
    printf("stat = "); _status_word .print(); printf("\n");
    printf("tags = "); _tag_word    .print(); printf("\n");
  }
};
class Flag_Register {
 public:
  int32_t _value;

  bool overflow() const                { return ((_value >> 11) & 1) != 0; }
  bool direction() const               { return ((_value >> 10) & 1) != 0; }
  bool sign() const                    { return ((_value >>  7) & 1) != 0; }
  bool zero() const                    { return ((_value >>  6) & 1) != 0; }
  bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
  bool parity() const                  { return ((_value >>  2) & 1) != 0; }
  bool carry() const                   { return ((_value >>  0) & 1) != 0; }

  void print() const {
    // flags
    char f[8];
    f[0] = (overflow       ()) ? 'O' : '-';
    f[1] = (direction      ()) ? 'D' : '-';
    f[2] = (sign           ()) ? 'S' : '-';
    f[3] = (zero           ()) ? 'Z' : '-';
    f[4] = (auxiliary_carry()) ? 'A' : '-';
    f[5] = (parity         ()) ? 'P' : '-';
    f[6] = (carry          ()) ? 'C' : '-';
    f[7] = '\x0';
    // output
    printf("%08x flags = %s", _value, f);
  }
};

class IU_Register {
 public:
  int32_t _value;

  void print() const {
    printf("%08x %11d", _value, _value);
  }
};

class IU_State {
 public:
  Flag_Register _eflags;
  IU_Register   _rdi;
  IU_Register   _rsi;
  IU_Register   _rbp;
  IU_Register   _rsp;
  IU_Register   _rbx;
  IU_Register   _rdx;
  IU_Register   _rcx;
  IU_Register   _rax;

  void print() const {
    // computation registers
    printf("rax, = "); _rax.print(); printf("\n");
    printf("rbx, = "); _rbx.print(); printf("\n");
    printf("rcx = "); _rcx.print(); printf("\n");
    printf("rdx = "); _rdx.print(); printf("\n");
    printf("rdi = "); _rdi.print(); printf("\n");
    printf("rsi = "); _rsi.print(); printf("\n");
    printf("rbp, = "); _rbp.print(); printf("\n");
    printf("rsp = "); _rsp.print(); printf("\n");

    // control registers
    printf("flgs = "); _eflags.print(); printf("\n");
  }
};

class CPU_State {
 public:
  FPU_State _fpu_state;
  IU_State  _iu_state;

  void print() const {
    printf("--------------------------------------------------\n");
    _iu_state .print();
    printf("\n");
    _fpu_state.print();
    printf("--------------------------------------------------\n");
  }
};
static void _print_CPU_state(CPU_State* state) {
  state->print();
}

void MacroAssembler::print_CPU_state() {
  push_CPU_state();
  push(rsp);                // pass CPU state
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
  addptr(rsp, wordSize);    // discard argument
  pop_CPU_state();
}
static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
  static int counter = 0;
  FPU_State* fs = &state->_fpu_state;
  counter++;
  // For leaf calls, only verify that the top few elements remain empty.
  // We only need 1 empty at the top for C2 code.
  if( stack_depth < 0 ) {
    if( fs->tag_for_st(7) != 3 ) {
      printf("FPR7 not empty\n");
      state->print();
      assert(false, "error");
      return false;
    }
    return true;                // All other stack states do not matter
  }

  assert((fs->_control_word._value & 0xffff) == StubRoutines::x86::fpu_cntrl_wrd_std(),
         "bad FPU control word");

  // compute stack depth
  int i = 0;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
  int d = i;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
  // verify findings
  if (i != FPU_State::number_of_registers) {
    // stack not contiguous
    printf("%s: stack not contiguous at ST%d\n", s, i);
    state->print();
    assert(false, "error");
    return false;
  }
  // check if computed stack depth corresponds to expected stack depth
  if (stack_depth < 0) {
    // expected stack depth is -stack_depth or less
    if (d > -stack_depth) {
      // too many elements on the stack
      printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
      state->print();
      assert(false, "error");
      return false;
    }
  } else {
    // expected stack depth is stack_depth
    if (d != stack_depth) {
      // wrong stack depth
      printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
      state->print();
      assert(false, "error");
      return false;
    }
  }
  // everything is cool
  return true;
}
void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
  if (!VerifyFPU) return;
  push_CPU_state();
  push(rsp);                // pass CPU state
  ExternalAddress msg((address) s);
  // pass message string s
  pushptr(msg.addr(), noreg);
  push(stack_depth);        // pass stack depth
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
  addptr(rsp, 3 * wordSize);   // discard arguments
  // check for error
  {
    Label L;
    testl(rax, rax);
    jcc(Assembler::notZero, L);
    int3();                  // break if error condition
    bind(L);
  }
  pop_CPU_state();
}
void MacroAssembler::restore_cpu_control_state_after_jni(Register rscratch) {
  // Either restore the MXCSR register after returning from the JNI Call
  // or verify that it wasn't changed (with -Xcheck:jni flag).
  if (VM_Version::supports_sse()) {
    if (RestoreMXCSROnJNICalls) {
      ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), rscratch);
    } else if (CheckJNICalls) {
      call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
    }
  }
  // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
  vzeroupper();

#ifndef _LP64
  // Either restore the x87 floating pointer control word after returning
  // from the JNI call or verify that it wasn't changed.
  if (CheckJNICalls) {
    call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
  }
#endif // !_LP64
}
// ((OopHandle)result).resolve();
void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
  assert_different_registers(result, tmp);

  // Only 64 bit platforms support GCs that require a tmp register
  // Only IN_HEAP loads require a thread_tmp register
  // OopHandle::resolve is an indirection like jobject.
  access_load_at(T_OBJECT, IN_NATIVE,
                 result, Address(result, 0), tmp, /*tmp_thread*/noreg);
}
// ((WeakHandle)result).resolve();
void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
  assert_different_registers(rresult, rtmp);
  Label resolved;

  // A null weak handle resolves to null.
  cmpptr(rresult, 0);
  jcc(Assembler::equal, resolved);

  // Only 64 bit platforms support GCs that require a tmp register
  // Only IN_HEAP loads require a thread_tmp register
  // WeakHandle::resolve is an indirection like jweak.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
                 rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
  bind(resolved);
}
void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
  // get mirror
  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  load_method_holder(mirror, method);
  movptr(mirror, Address(mirror, mirror_offset));
  resolve_oop_handle(mirror, tmp);
}

void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
  load_method_holder(rresult, rmethod);
  movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
}

void MacroAssembler::load_method_holder(Register holder, Register method) {
  movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
  movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
  movptr(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
}
void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
  assert_different_registers(src, tmp);
  assert_different_registers(dst, tmp);
#ifdef _LP64
  if (UseCompressedClassPointers) {
    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_klass_not_null(dst, tmp);
  } else
#endif
    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
}

void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
  assert_different_registers(src, tmp);
  assert_different_registers(dst, tmp);
#ifdef _LP64
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src, tmp);
    movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
  } else
#endif
    movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
}
void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
                                    Register tmp1, Register thread_tmp) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators, type);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}

void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register val,
                                     Register tmp1, Register tmp2, Register tmp3) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators, type);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
  } else {
    bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
  }
}
void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
                                   Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}

// Doesn't do verification, generates fixed size code
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                                            Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
}

void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
                                    Register tmp2, Register tmp3, DecoratorSet decorators) {
  access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
}

// Used for storing nulls.
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
}

#ifdef _LP64
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
  }
}
#ifdef ASSERT
void MacroAssembler::verify_heapbase(const char* msg) {
  assert (UseCompressedOops, "should be compressed");
  assert (Universe::heap() != nullptr, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    ExternalAddress src2(CompressedOops::ptrs_base_addr());
    const bool is_src2_reachable = reachable(src2);
    if (!is_src2_reachable) {
      push(rscratch1);  // cmpptr trashes rscratch1
    }
    cmpptr(r12_heapbase, src2, rscratch1);
    jcc(Assembler::equal, ok);
    STOP(msg);
    bind(ok);
    if (!is_src2_reachable) {
      pop(rscratch1);
    }
  }
}
#endif
// Algorithm must match oop.inline.hpp encode_heap_oop.
5777
void MacroAssembler::encode_heap_oop(Register r) {
5779
verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
5781
verify_oop_msg(r, "broken oop in encode_heap_oop");
5782
if (CompressedOops::base() == nullptr) {
5783
if (CompressedOops::shift() != 0) {
5784
assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5785
shrq(r, LogMinObjAlignmentInBytes);
5790
cmovq(Assembler::equal, r, r12_heapbase);
5791
subq(r, r12_heapbase);
5792
shrq(r, LogMinObjAlignmentInBytes);
5795
void MacroAssembler::encode_heap_oop_not_null(Register r) {
5797
verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
5798
if (CheckCompressedOops) {
5801
jcc(Assembler::notEqual, ok);
5802
STOP("null oop passed to encode_heap_oop_not_null");
5806
verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
5807
if (CompressedOops::base() != nullptr) {
5808
subq(r, r12_heapbase);
5810
if (CompressedOops::shift() != 0) {
5811
assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5812
shrq(r, LogMinObjAlignmentInBytes);
5816
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
5818
verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
5819
if (CheckCompressedOops) {
5822
jcc(Assembler::notEqual, ok);
5823
STOP("null oop passed to encode_heap_oop_not_null2");
5827
verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
5831
if (CompressedOops::base() != nullptr) {
5832
subq(dst, r12_heapbase);
5834
if (CompressedOops::shift() != 0) {
5835
assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5836
shrq(dst, LogMinObjAlignmentInBytes);
5840
void MacroAssembler::decode_heap_oop(Register r) {
5842
verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
5844
if (CompressedOops::base() == nullptr) {
5845
if (CompressedOops::shift() != 0) {
5846
assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5847
shlq(r, LogMinObjAlignmentInBytes);
5851
shlq(r, LogMinObjAlignmentInBytes);
5852
jccb(Assembler::equal, done);
5853
addq(r, r12_heapbase);
5856
verify_oop_msg(r, "broken oop in decode_heap_oop");
5859
void MacroAssembler::decode_heap_oop_not_null(Register r) {
5860
// Note: it will change flags
5861
assert (UseCompressedOops, "should only be used for compressed headers");
5862
assert (Universe::heap() != nullptr, "java heap should be initialized");
5863
// Cannot assert, unverified entry point counts instructions (see .ad file)
5864
// vtableStubs also counts instructions in pd_code_size_limit.
5865
// Also do not verify_oop as this is called by verify_oop.
5866
if (CompressedOops::shift() != 0) {
5867
assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5868
shlq(r, LogMinObjAlignmentInBytes);
5869
if (CompressedOops::base() != nullptr) {
5870
addq(r, r12_heapbase);
5873
assert (CompressedOops::base() == nullptr, "sanity");
5877
void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
5878
// Note: it will change flags
5879
assert (UseCompressedOops, "should only be used for compressed headers");
5880
assert (Universe::heap() != nullptr, "java heap should be initialized");
5881
// Cannot assert, unverified entry point counts instructions (see .ad file)
5882
// vtableStubs also counts instructions in pd_code_size_limit.
5883
// Also do not verify_oop as this is called by verify_oop.
5884
if (CompressedOops::shift() != 0) {
5885
assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5886
if (LogMinObjAlignmentInBytes == Address::times_8) {
5887
leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
5892
shlq(dst, LogMinObjAlignmentInBytes);
5893
if (CompressedOops::base() != nullptr) {
5894
addq(dst, r12_heapbase);
5898
assert (CompressedOops::base() == nullptr, "sanity");
5905
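// Illustrative reference (not compiled into the VM): the arithmetic the
// compressed-oop encode/decode routines above implement, assuming the usual
// heap-based narrow oop encoding with an optional base and shift:
//
//   narrowOop encode(oop o)       { return o == nullptr ? 0 : (narrowOop)(((uintptr_t)o - base) >> shift); }
//   oop       decode(narrowOop n) { return n == 0 ? nullptr : (oop)(base + ((uintptr_t)n << shift)); }
//
// When base == 0 the add/subtract disappears, and when shift == 0 the shifts
// disappear as well, which is why the emitters above special-case those
// configurations.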
void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
5906
assert_different_registers(r, tmp);
5907
if (CompressedKlassPointers::base() != nullptr) {
5908
mov64(tmp, (int64_t)CompressedKlassPointers::base());
5911
if (CompressedKlassPointers::shift() != 0) {
5912
assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5913
shrq(r, LogKlassAlignmentInBytes);
5917
void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
5918
assert_different_registers(src, dst);
5919
if (CompressedKlassPointers::base() != nullptr) {
5920
mov64(dst, -(int64_t)CompressedKlassPointers::base());
5925
if (CompressedKlassPointers::shift() != 0) {
5926
assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5927
shrq(dst, LogKlassAlignmentInBytes);
5931
void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
5932
assert_different_registers(r, tmp);
5933
// Note: it will change flags
5934
assert(UseCompressedClassPointers, "should only be used for compressed headers");
5935
// Cannot assert, unverified entry point counts instructions (see .ad file)
5936
// vtableStubs also counts instructions in pd_code_size_limit.
5937
// Also do not verify_oop as this is called by verify_oop.
5938
if (CompressedKlassPointers::shift() != 0) {
5939
assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5940
shlq(r, LogKlassAlignmentInBytes);
5942
if (CompressedKlassPointers::base() != nullptr) {
5943
mov64(tmp, (int64_t)CompressedKlassPointers::base());
5948
void MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
5949
assert_different_registers(src, dst);
5950
// Note: it will change flags
5951
assert (UseCompressedClassPointers, "should only be used for compressed headers");
5952
// Cannot assert, unverified entry point counts instructions (see .ad file)
5953
// vtableStubs also counts instructions in pd_code_size_limit.
5954
// Also do not verify_oop as this is called by verify_oop.
5956
if (CompressedKlassPointers::base() == nullptr &&
5957
CompressedKlassPointers::shift() == 0) {
5958
// The best case scenario is that there is no base or shift. Then it is already
5959
// a pointer that needs nothing but a register rename.
5962
if (CompressedKlassPointers::base() != nullptr) {
5963
mov64(dst, (int64_t)CompressedKlassPointers::base());
5967
if (CompressedKlassPointers::shift() != 0) {
5968
assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5969
assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
5970
leaq(dst, Address(dst, src, Address::times_8, 0));
5977
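// Illustrative reference (not compiled into the VM): compressed class pointers
// follow the same pattern as narrow oops, but the inputs here are never null,
// so the emitters above can skip the null check:
//
//   narrowKlass encode(Klass* k)       { return (narrowKlass)(((uintptr_t)k - klass_base) >> klass_shift); }
//   Klass*      decode(narrowKlass nk) { return (Klass*)(klass_base + ((uintptr_t)nk << klass_shift)); }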
void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5978
assert (UseCompressedOops, "should only be used for compressed headers");
5979
assert (Universe::heap() != nullptr, "java heap should be initialized");
5980
assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5981
int oop_index = oop_recorder()->find_index(obj);
5982
RelocationHolder rspec = oop_Relocation::spec(oop_index);
5983
mov_narrow_oop(dst, oop_index, rspec);
5986
void MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5987
assert (UseCompressedOops, "should only be used for compressed headers");
5988
assert (Universe::heap() != nullptr, "java heap should be initialized");
5989
assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5990
int oop_index = oop_recorder()->find_index(obj);
5991
RelocationHolder rspec = oop_Relocation::spec(oop_index);
5992
mov_narrow_oop(dst, oop_index, rspec);
5995
void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5996
assert (UseCompressedClassPointers, "should only be used for compressed headers");
5997
assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5998
int klass_index = oop_recorder()->find_index(k);
5999
RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6000
mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
6003
void MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
6004
assert (UseCompressedClassPointers, "should only be used for compressed headers");
6005
assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
6006
int klass_index = oop_recorder()->find_index(k);
6007
RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6008
mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
6011
void MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
6012
assert (UseCompressedOops, "should only be used for compressed headers");
6013
assert (Universe::heap() != nullptr, "java heap should be initialized");
6014
assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
6015
int oop_index = oop_recorder()->find_index(obj);
6016
RelocationHolder rspec = oop_Relocation::spec(oop_index);
6017
Assembler::cmp_narrow_oop(dst, oop_index, rspec);
6020
void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
6021
assert (UseCompressedOops, "should only be used for compressed headers");
6022
assert (Universe::heap() != nullptr, "java heap should be initialized");
6023
assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
6024
int oop_index = oop_recorder()->find_index(obj);
6025
RelocationHolder rspec = oop_Relocation::spec(oop_index);
6026
Assembler::cmp_narrow_oop(dst, oop_index, rspec);
6029
void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
6030
assert (UseCompressedClassPointers, "should only be used for compressed headers");
6031
assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
6032
int klass_index = oop_recorder()->find_index(k);
6033
RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6034
Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
6037
void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
6038
assert (UseCompressedClassPointers, "should only be used for compressed headers");
6039
assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
6040
int klass_index = oop_recorder()->find_index(k);
6041
RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6042
Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
6045
void MacroAssembler::reinit_heapbase() {
6046
if (UseCompressedOops) {
6047
if (Universe::heap() != nullptr) {
6048
if (CompressedOops::base() == nullptr) {
6049
MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
6051
mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
6054
movptr(r12_heapbase, ExternalAddress(CompressedOops::ptrs_base_addr()));
6061
#if COMPILER2_OR_JVMCI
6063
// clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers
6064
void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
6065
// cnt - number of qwords (8-byte words).
6066
// base - start address, qword aligned.
6067
Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
6068
bool use64byteVector = (MaxVectorSize == 64) && (VM_Version::avx3_threshold() == 0);
6069
if (use64byteVector) {
6070
vpxor(xtmp, xtmp, xtmp, AVX_512bit);
6071
} else if (MaxVectorSize >= 32) {
6072
vpxor(xtmp, xtmp, xtmp, AVX_256bit);
6076
jmp(L_zero_64_bytes);
6079
if (MaxVectorSize >= 32) {
6080
fill64(base, 0, xtmp, use64byteVector);
6082
movdqu(Address(base, 0), xtmp);
6083
movdqu(Address(base, 16), xtmp);
6084
movdqu(Address(base, 32), xtmp);
6085
movdqu(Address(base, 48), xtmp);
6089
BIND(L_zero_64_bytes);
6091
jccb(Assembler::greaterEqual, L_loop);
6093
// Copy trailing 64 bytes
6094
if (use64byteVector) {
6096
jccb(Assembler::equal, L_end);
6097
fill64_masked(3, base, 0, xtmp, mask, cnt, rtmp, true);
6101
jccb(Assembler::less, L_tail);
6102
if (MaxVectorSize >= 32) {
6103
vmovdqu(Address(base, 0), xtmp);
6105
movdqu(Address(base, 0), xtmp);
6106
movdqu(Address(base, 16), xtmp);
6114
jccb(Assembler::lessEqual, L_end);
6115
if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
6116
fill32_masked(3, base, 0, xtmp, mask, cnt, rtmp);
6121
movq(Address(base, 0), xtmp);
6124
jccb(Assembler::greaterEqual, L_sloop);
6129
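// Illustrative reference (not compiled into the VM): xmm_clear_mem above is a
// vectorized version of the following scalar loop, assuming 'base' is qword
// aligned and 'cnt' counts 8-byte words:
//
//   void clear_qwords(uint64_t* base, size_t cnt) {
//     for (size_t i = 0; i < cnt; i++) {
//       base[i] = 0;
//     }
//   }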
// Clearing constant sized memory using YMM/ZMM registers.
6130
void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
6131
assert(UseAVX > 2 && VM_Version::supports_avx512vl(), "");
6132
bool use64byteVector = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
6134
int vector64_count = (cnt & (~0x7)) >> 3;
6136
const int fill64_per_loop = 4;
6137
const int max_unrolled_fill64 = 8;
6139
// 64 byte initialization loop.
6140
vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
6142
if (vector64_count > max_unrolled_fill64) {
6144
Register index = rtmp;
6146
start64 = vector64_count - (vector64_count % fill64_per_loop);
6150
for (int i = 0; i < fill64_per_loop; i++) {
6151
fill64(Address(base, index, Address::times_1, i * 64), xtmp, use64byteVector);
6153
addl(index, fill64_per_loop * 64);
6154
cmpl(index, start64 * 64);
6155
jccb(Assembler::less, LOOP);
6157
for (int i = start64; i < vector64_count; i++) {
6158
fill64(base, i * 64, xtmp, use64byteVector);
6161
// Clear remaining 64 byte tail.
6162
int disp = vector64_count * 64;
6166
movq(Address(base, disp), xtmp);
6169
evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_128bit);
6174
evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_256bit);
6177
evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
6180
if (use64byteVector) {
6183
evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
6185
evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
6186
movq(Address(base, disp + 32), xtmp);
6190
if (use64byteVector) {
6193
evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
6195
evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
6196
evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, false, Assembler::AVX_128bit);
6200
if (use64byteVector) {
6203
evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
6205
evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
6208
evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, true, Assembler::AVX_256bit);
6212
fatal("Unexpected length : %d\n",cnt);
6218
void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp,
6219
bool is_large, KRegister mask) {
6220
// cnt - number of qwords (8-byte words).
6221
// base - start address, qword aligned.
6222
// is_large - if optimizers know cnt is larger than InitArrayShortSize
6223
assert(base==rdi, "base register must be edi for rep stos");
6224
assert(tmp==rax, "tmp register must be eax for rep stos");
6225
assert(cnt==rcx, "cnt register must be ecx for rep stos");
6226
assert(InitArrayShortSize % BytesPerLong == 0,
6227
"InitArrayShortSize should be the multiple of BytesPerLong");
6230
if (!is_large || !UseXMMForObjInit) {
6236
cmpptr(cnt, InitArrayShortSize/BytesPerLong);
6237
jccb(Assembler::greater, LONG);
6239
NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
6242
jccb(Assembler::negative, DONE); // Zero length
6244
// Use individual pointer-sized stores for small counts:
6246
movptr(Address(base, cnt, Address::times_ptr), tmp);
6248
jccb(Assembler::greaterEqual, LOOP);
6254
// Use longer rep-prefixed ops for non-small counts:
6256
shlptr(cnt, 3); // convert to number of bytes
6258
} else if (UseXMMForObjInit) {
6259
xmm_clear_mem(base, cnt, tmp, xtmp, mask);
6261
NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
6268
#endif //COMPILER2_OR_JVMCI
6271
void MacroAssembler::generate_fill(BasicType t, bool aligned,
6272
Register to, Register value, Register count,
6273
Register rtmp, XMMRegister xtmp) {
6274
ShortBranchVerifier sbv(this);
6275
assert_different_registers(to, value, count, rtmp);
6277
Label L_fill_2_bytes, L_fill_4_bytes;
6279
#if defined(COMPILER2) && defined(_LP64)
6280
if(MaxVectorSize >=32 &&
6281
VM_Version::supports_avx512vlbw() &&
6282
VM_Version::supports_bmi2()) {
6283
generate_fill_avx3(t, to, value, count, rtmp, xtmp);
6299
default: ShouldNotReachHere();
6309
andl(value, 0xffff);
6311
if (t == T_BYTE || t == T_SHORT) {
6317
cmpptr(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
6318
jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
6319
if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
6320
Label L_skip_align2;
6321
// align source address at 4 bytes address boundary
6323
Label L_skip_align1;
6324
// One byte misalignment happens only for byte arrays
6326
jccb(Assembler::zero, L_skip_align1);
6327
movb(Address(to, 0), value);
6330
BIND(L_skip_align1);
6332
// Two bytes misalignment happens only for byte and short (char) arrays
6334
jccb(Assembler::zero, L_skip_align2);
6335
movw(Address(to, 0), value);
6337
subptr(count, 1<<(shift-1));
6338
BIND(L_skip_align2);
6341
Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
6342
// Fill 32-byte chunks
6343
subptr(count, 8 << shift);
6344
jcc(Assembler::less, L_check_fill_8_bytes);
6347
BIND(L_fill_32_bytes_loop);
6349
for (int i = 0; i < 32; i += 4) {
6350
movl(Address(to, i), value);
6354
subptr(count, 8 << shift);
6355
jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
6356
BIND(L_check_fill_8_bytes);
6357
addptr(count, 8 << shift);
6358
jccb(Assembler::zero, L_exit);
6359
jmpb(L_fill_8_bytes);
6362
// length is too short, just fill qwords
6364
BIND(L_fill_8_bytes_loop);
6365
movl(Address(to, 0), value);
6366
movl(Address(to, 4), value);
6368
BIND(L_fill_8_bytes);
6369
subptr(count, 1 << (shift + 1));
6370
jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
6371
// fall through to fill 4 bytes
6373
Label L_fill_32_bytes;
6374
if (!UseUnalignedLoadStores) {
6375
// align to 8 bytes, we know we are 4 byte aligned to start
6377
jccb(Assembler::zero, L_fill_32_bytes);
6378
movl(Address(to, 0), value);
6380
subptr(count, 1<<shift);
6382
BIND(L_fill_32_bytes);
6384
assert( UseSSE >= 2, "supported cpu only" );
6385
Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
6387
if (UseAVX >= 2 && UseUnalignedLoadStores) {
6388
Label L_check_fill_32_bytes;
6390
// Fill 64-byte chunks
6391
Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
6393
// If number of bytes to fill < VM_Version::avx3_threshold(), perform fill using AVX2
6394
cmpptr(count, VM_Version::avx3_threshold());
6395
jccb(Assembler::below, L_check_fill_64_bytes_avx2);
6397
vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
6399
subptr(count, 16 << shift);
6400
jccb(Assembler::less, L_check_fill_32_bytes);
6403
BIND(L_fill_64_bytes_loop_avx3);
6404
evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
6406
subptr(count, 16 << shift);
6407
jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
6408
jmpb(L_check_fill_32_bytes);
6410
BIND(L_check_fill_64_bytes_avx2);
6412
// Fill 64-byte chunks
6413
Label L_fill_64_bytes_loop;
6414
vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
6416
subptr(count, 16 << shift);
6417
jcc(Assembler::less, L_check_fill_32_bytes);
6420
BIND(L_fill_64_bytes_loop);
6421
vmovdqu(Address(to, 0), xtmp);
6422
vmovdqu(Address(to, 32), xtmp);
6424
subptr(count, 16 << shift);
6425
jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
6427
BIND(L_check_fill_32_bytes);
6428
addptr(count, 8 << shift);
6429
jccb(Assembler::less, L_check_fill_8_bytes);
6430
vmovdqu(Address(to, 0), xtmp);
6432
subptr(count, 8 << shift);
6434
BIND(L_check_fill_8_bytes);
6435
// clean upper bits of YMM registers
6437
pshufd(xtmp, xtmp, 0);
6439
// Fill 32-byte chunks
6440
pshufd(xtmp, xtmp, 0);
6442
subptr(count, 8 << shift);
6443
jcc(Assembler::less, L_check_fill_8_bytes);
6446
BIND(L_fill_32_bytes_loop);
6448
if (UseUnalignedLoadStores) {
6449
movdqu(Address(to, 0), xtmp);
6450
movdqu(Address(to, 16), xtmp);
6452
movq(Address(to, 0), xtmp);
6453
movq(Address(to, 8), xtmp);
6454
movq(Address(to, 16), xtmp);
6455
movq(Address(to, 24), xtmp);
6459
subptr(count, 8 << shift);
6460
jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
6462
BIND(L_check_fill_8_bytes);
6464
addptr(count, 8 << shift);
6465
jccb(Assembler::zero, L_exit);
6466
jmpb(L_fill_8_bytes);
6469
// length is too short, just fill qwords
6471
BIND(L_fill_8_bytes_loop);
6472
movq(Address(to, 0), xtmp);
6474
BIND(L_fill_8_bytes);
6475
subptr(count, 1 << (shift + 1));
6476
jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
6479
// fill trailing 4 bytes
6480
BIND(L_fill_4_bytes);
6481
testl(count, 1<<shift);
6482
jccb(Assembler::zero, L_fill_2_bytes);
6483
movl(Address(to, 0), value);
6484
if (t == T_BYTE || t == T_SHORT) {
6487
BIND(L_fill_2_bytes);
6488
// fill trailing 2 bytes
6489
testl(count, 1<<(shift-1));
6490
jccb(Assembler::zero, L_fill_byte);
6491
movw(Address(to, 0), value);
6495
// fill trailing byte
6497
jccb(Assembler::zero, L_exit);
6498
movb(Address(to, 0), value);
6503
BIND(L_fill_2_bytes);
6508
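// Illustrative reference (not compiled into the VM): generate_fill above emits
// code equivalent to the following scalar loop, after first replicating a
// T_BYTE or T_SHORT value across a 32-bit word (e.g. for bytes:
// v |= v << 8; v |= v << 16;):
//
//   void fill(jbyte* to, jbyte value, size_t count) {
//     for (size_t i = 0; i < count; i++) {
//       to[i] = value;
//     }
//   }
//
// The vector paths simply store that replicated word 8, 32, or 64 bytes at a time.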
void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) {
6512
evpbroadcastb(dst, src, vector_len);
6516
evpbroadcastw(dst, src, vector_len);
6520
evpbroadcastd(dst, src, vector_len);
6524
evpbroadcastq(dst, src, vector_len);
6527
fatal("Unhandled type : %s", type2name(type));
6532
// encode char[] to byte[] in ISO_8859_1 or ASCII
6533
//@IntrinsicCandidate
6534
//private static int implEncodeISOArray(byte[] sa, int sp,
6535
//byte[] da, int dp, int len) {
6537
// for (; i < len; i++) {
6538
// char c = StringUTF16.getChar(sa, sp++);
6539
// if (c > '\u00FF')
6541
// da[dp++] = (byte)c;
6546
//@IntrinsicCandidate
6547
//private static int implEncodeAsciiArray(char[] sa, int sp,
6548
// byte[] da, int dp, int len) {
6550
// for (; i < len; i++) {
6551
// char c = sa[sp++];
6552
// if (c >= '\u0080')
6554
// da[dp++] = (byte)c;
6558
void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
6559
XMMRegister tmp1Reg, XMMRegister tmp2Reg,
6560
XMMRegister tmp3Reg, XMMRegister tmp4Reg,
6561
Register tmp5, Register result, bool ascii) {
6568
ShortBranchVerifier sbv(this);
6569
assert_different_registers(src, dst, len, tmp5, result);
6570
Label L_done, L_copy_1_char, L_copy_1_char_exit;
6572
int mask = ascii ? 0xff80ff80 : 0xff00ff00;
6573
int short_mask = ascii ? 0xff80 : 0xff00;
6576
xorl(result, result);
6577
// check for zero length
6579
jcc(Assembler::zero, L_done);
6584
lea(src, Address(src, len, Address::times_2)); // char[]
6585
lea(dst, Address(dst, len, Address::times_1)); // byte[]
6588
if (UseSSE42Intrinsics || UseAVX >= 2) {
6589
Label L_copy_8_chars, L_copy_8_chars_exit;
6590
Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
6593
Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
6594
movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector
6595
movdl(tmp1Reg, tmp5);
6596
vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
6597
jmp(L_chars_32_check);
6599
bind(L_copy_32_chars);
6600
vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
6601
vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
6602
vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
6603
vptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector
6604
jccb(Assembler::notZero, L_copy_32_chars_exit);
6605
vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
6606
vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
6607
vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
6609
bind(L_chars_32_check);
6611
jcc(Assembler::lessEqual, L_copy_32_chars);
6613
bind(L_copy_32_chars_exit);
6615
jccb(Assembler::greater, L_copy_16_chars_exit);
6617
} else if (UseSSE42Intrinsics) {
6618
movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector
6619
movdl(tmp1Reg, tmp5);
6620
pshufd(tmp1Reg, tmp1Reg, 0);
6621
jmpb(L_chars_16_check);
6624
bind(L_copy_16_chars);
6626
vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
6627
vptest(tmp2Reg, tmp1Reg);
6628
jcc(Assembler::notZero, L_copy_16_chars_exit);
6629
vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
6630
vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
6633
movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
6634
movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
6635
vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
6637
movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
6638
por(tmp2Reg, tmp3Reg);
6639
movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
6640
por(tmp2Reg, tmp4Reg);
6642
ptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector
6643
jccb(Assembler::notZero, L_copy_16_chars_exit);
6644
packuswb(tmp3Reg, tmp4Reg);
6646
movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
6648
bind(L_chars_16_check);
6650
jcc(Assembler::lessEqual, L_copy_16_chars);
6652
bind(L_copy_16_chars_exit);
6654
// clean upper bits of YMM registers
6655
vpxor(tmp2Reg, tmp2Reg);
6656
vpxor(tmp3Reg, tmp3Reg);
6657
vpxor(tmp4Reg, tmp4Reg);
6658
movdl(tmp1Reg, tmp5);
6659
pshufd(tmp1Reg, tmp1Reg, 0);
6662
jccb(Assembler::greater, L_copy_8_chars_exit);
6664
bind(L_copy_8_chars);
6665
movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
6666
ptest(tmp3Reg, tmp1Reg);
6667
jccb(Assembler::notZero, L_copy_8_chars_exit);
6668
packuswb(tmp3Reg, tmp1Reg);
6669
movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
6671
jccb(Assembler::lessEqual, L_copy_8_chars);
6673
bind(L_copy_8_chars_exit);
6675
jccb(Assembler::zero, L_done);
6678
bind(L_copy_1_char);
6679
load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
6680
testl(tmp5, short_mask); // check if Unicode or non-ASCII char
6681
jccb(Assembler::notZero, L_copy_1_char_exit);
6682
movb(Address(dst, len, Address::times_1, 0), tmp5);
6684
jccb(Assembler::less, L_copy_1_char);
6686
bind(L_copy_1_char_exit);
6687
addptr(result, len); // len is negative count of not processed elements
6694
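// Illustrative reference (not compiled into the VM): encode_iso_array above
// compresses UTF-16 chars to bytes and returns how many leading chars were
// encodable, roughly:
//
//   int encode(const jchar* src, jbyte* dst, int len, jchar limit /* 0x100 for ISO, 0x80 for ASCII */) {
//     int i = 0;
//     for (; i < len; i++) {
//       if (src[i] >= limit) break;   // first char that does not fit in a byte
//       dst[i] = (jbyte)src[i];
//     }
//     return i;
//   }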
/**
 * Helper for multiply_to_len().
 */
void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
  addq(dest_lo, src1);
  adcq(dest_hi, 0);
  addq(dest_lo, src2);
  adcq(dest_hi, 0);
}
6704
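// Illustrative reference (not compiled into the VM): add2_with_carry above
// adds two 64-bit values into a 128-bit accumulator held in dest_hi:dest_lo,
// i.e. roughly:
//
//   unsigned __int128 acc = ((unsigned __int128)dest_hi << 64) | dest_lo;
//   acc += src1;
//   acc += src2;
//   // dest_hi:dest_lo = acc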
* Multiply 64 bit by 64 bit first loop.
6706
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
6707
Register y, Register y_idx, Register z,
6708
Register carry, Register product,
6709
Register idx, Register kdx) {
6711
// jlong carry, x[], y[], z[];
6712
// for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
6713
// huge_128 product = y[idx] * x[xstart] + carry;
6714
// z[kdx] = (jlong)product;
6715
// carry = (jlong)(product >>> 64);
6717
// z[xstart] = carry;
6720
Label L_first_loop, L_first_loop_exit;
6721
Label L_one_x, L_one_y, L_multiply;
6724
jcc(Assembler::negative, L_one_x);
6726
movq(x_xstart, Address(x, xstart, Address::times_4, 0));
6727
rorq(x_xstart, 32); // convert big-endian to little-endian
6731
jcc(Assembler::negative, L_first_loop_exit);
6733
jcc(Assembler::negative, L_one_y);
6734
movq(y_idx, Address(y, idx, Address::times_4, 0));
6735
rorq(y_idx, 32); // convert big-endian to little-endian
6737
movq(product, x_xstart);
6738
mulq(y_idx); // product(rax) * y_idx -> rdx:rax
6739
addq(product, carry);
6742
movl(Address(z, kdx, Address::times_4, 4), product);
6744
movl(Address(z, kdx, Address::times_4, 0), product);
6749
movl(y_idx, Address(y, 0));
6753
movl(x_xstart, Address(x, 0));
6756
bind(L_first_loop_exit);
6760
* Multiply 64 bit by 64 bit and add 128 bit.
6762
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
6763
Register yz_idx, Register idx,
6764
Register carry, Register product, int offset) {
6765
// huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
6766
// z[kdx] = (jlong)product;
6768
movq(yz_idx, Address(y, idx, Address::times_4, offset));
6769
rorq(yz_idx, 32); // convert big-endian to little-endian
6770
movq(product, x_xstart);
6771
mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
6772
movq(yz_idx, Address(z, idx, Address::times_4, offset));
6773
rorq(yz_idx, 32); // convert big-endian to little-endian
6775
add2_with_carry(rdx, product, carry, yz_idx);
6777
movl(Address(z, idx, Address::times_4, offset+4), product);
6779
movl(Address(z, idx, Address::times_4, offset), product);
6784
* Multiply 128 bit by 128 bit. Unrolled inner loop.
6786
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
6787
Register yz_idx, Register idx, Register jdx,
6788
Register carry, Register product,
6790
// jlong carry, x[], y[], z[];
6791
// int kdx = ystart+1;
6792
// for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
6793
// huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
6794
// z[kdx+idx+1] = (jlong)product;
6795
// jlong carry2 = (jlong)(product >>> 64);
6796
// product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
6797
// z[kdx+idx] = (jlong)product;
6798
// carry = (jlong)(product >>> 64);
6802
// product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
6803
// z[kdx+idx] = (jlong)product;
6804
// carry = (jlong)(product >>> 64);
6808
Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
6811
andl(jdx, 0xFFFFFFFC);
6816
jcc(Assembler::negative, L_third_loop_exit);
6819
multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
6822
multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
6826
bind (L_third_loop_exit);
6829
jcc(Assembler::zero, L_post_third_loop_done);
6833
jcc(Assembler::negative, L_check_1);
6835
multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
6842
jcc(Assembler::negative, L_post_third_loop_done);
6844
movl(yz_idx, Address(y, idx, Address::times_4, 0));
6845
movq(product, x_xstart);
6846
mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
6847
movl(yz_idx, Address(z, idx, Address::times_4, 0));
6849
add2_with_carry(rdx, product, yz_idx, carry);
6851
movl(Address(z, idx, Address::times_4, 0), product);
6856
movq(carry, product);
6858
bind(L_post_third_loop_done);
6862
* Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
6865
void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
6866
Register carry, Register carry2,
6867
Register idx, Register jdx,
6868
Register yz_idx1, Register yz_idx2,
6869
Register tmp, Register tmp3, Register tmp4) {
6870
assert(UseBMI2Instructions, "should be used only when BMI2 is available");
6872
// jlong carry, x[], y[], z[];
6873
// int kdx = ystart+1;
6874
// for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
6875
// huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
6876
// jlong carry2 = (jlong)(tmp3 >>> 64);
6877
// huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2;
6878
// carry = (jlong)(tmp4 >>> 64);
6879
// z[kdx+idx+1] = (jlong)tmp3;
6880
// z[kdx+idx] = (jlong)tmp4;
6884
// yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
6885
// z[kdx+idx] = (jlong)yz_idx1;
6886
// carry = (jlong)(yz_idx1 >>> 64);
6890
Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
6893
andl(jdx, 0xFFFFFFFC);
6898
jcc(Assembler::negative, L_third_loop_exit);
6901
movq(yz_idx1, Address(y, idx, Address::times_4, 8));
6902
rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
6903
movq(yz_idx2, Address(y, idx, Address::times_4, 0));
6904
rorxq(yz_idx2, yz_idx2, 32);
6906
mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
6907
mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp
6909
movq(yz_idx1, Address(z, idx, Address::times_4, 8));
6910
rorxq(yz_idx1, yz_idx1, 32);
6911
movq(yz_idx2, Address(z, idx, Address::times_4, 0));
6912
rorxq(yz_idx2, yz_idx2, 32);
6914
if (VM_Version::supports_adx()) {
6916
adoxq(tmp3, yz_idx1);
6919
adoxq(tmp4, yz_idx2);
6921
movl(carry, 0); // does not affect flags
6922
adcxq(carry2, carry);
6923
adoxq(carry2, carry);
6925
add2_with_carry(tmp4, tmp3, carry, yz_idx1);
6926
add2_with_carry(carry2, tmp4, tmp, yz_idx2);
6928
movq(carry, carry2);
6930
movl(Address(z, idx, Address::times_4, 12), tmp3);
6932
movl(Address(z, idx, Address::times_4, 8), tmp3);
6934
movl(Address(z, idx, Address::times_4, 4), tmp4);
6936
movl(Address(z, idx, Address::times_4, 0), tmp4);
6940
bind (L_third_loop_exit);
6943
jcc(Assembler::zero, L_post_third_loop_done);
6947
jcc(Assembler::negative, L_check_1);
6949
movq(yz_idx1, Address(y, idx, Address::times_4, 0));
6950
rorxq(yz_idx1, yz_idx1, 32);
6951
mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
6952
movq(yz_idx2, Address(z, idx, Address::times_4, 0));
6953
rorxq(yz_idx2, yz_idx2, 32);
6955
add2_with_carry(tmp4, tmp3, carry, yz_idx2);
6957
movl(Address(z, idx, Address::times_4, 4), tmp3);
6959
movl(Address(z, idx, Address::times_4, 0), tmp3);
6966
jcc(Assembler::negative, L_post_third_loop_done);
6967
movl(tmp4, Address(y, idx, Address::times_4, 0));
6968
mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3
6969
movl(tmp4, Address(z, idx, Address::times_4, 0));
6971
add2_with_carry(carry2, tmp3, tmp4, carry);
6973
movl(Address(z, idx, Address::times_4, 0), tmp3);
6980
bind(L_post_third_loop_done);
6984
* Code for BigInteger::multiplyToLen() intrinsic.
6999
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register tmp0,
7000
Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
7001
ShortBranchVerifier sbv(this);
7002
assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
7013
const Register idx = tmp1;
7014
const Register kdx = tmp2;
7015
const Register xstart = tmp3;
7017
const Register y_idx = tmp4;
7018
const Register carry = tmp5;
7019
const Register product = xlen;
7020
const Register x_xstart = tmp0;
7024
// final static long LONG_MASK = 0xffffffffL;
7025
// int xstart = xlen - 1;
7026
// int ystart = ylen - 1;
7028
// for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7029
// long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
7030
// z[kdx] = (int)product;
7031
// carry = product >>> 32;
7033
// z[xstart] = (int)carry;
7036
movl(idx, ylen); // idx = ylen;
7037
lea(kdx, Address(xlen, ylen)); // kdx = xlen+ylen;
7038
xorq(carry, carry); // carry = 0;
7044
jcc(Assembler::negative, L_done);
7046
multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
7048
Label L_second_loop;
7050
jcc(Assembler::zero, L_second_loop);
7054
jcc(Assembler::zero, L_carry);
7056
movl(Address(z, kdx, Address::times_4, 0), carry);
7061
movl(Address(z, kdx, Address::times_4, 0), carry);
7063
// Second and third (nested) loops.
7065
// for (int i = xstart-1; i >= 0; i--) { // Second loop
7067
// for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
7068
// long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
7069
// (z[k] & LONG_MASK) + carry;
7070
// z[k] = (int)product;
7071
// carry = product >>> 32;
7073
// z[i] = (int)carry;
7076
// i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
7078
const Register jdx = tmp1;
7080
bind(L_second_loop);
7081
xorl(carry, carry); // carry = 0;
7082
movl(jdx, ylen); // j = ystart+1
7084
subl(xstart, 1); // i = xstart-1;
7085
jcc(Assembler::negative, L_done);
7090
lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
7091
subl(xstart, 1); // i = xstart-1;
7092
jcc(Assembler::negative, L_last_x);
7094
if (UseBMI2Instructions) {
7095
movq(rdx, Address(x, xstart, Address::times_4, 0));
7096
rorxq(rdx, rdx, 32); // convert big-endian to little-endian
7098
movq(x_xstart, Address(x, xstart, Address::times_4, 0));
7099
rorq(x_xstart, 32); // convert big-endian to little-endian
7102
Label L_third_loop_prologue;
7103
bind(L_third_loop_prologue);
7110
if (UseBMI2Instructions) {
7111
multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
7112
} else { // !UseBMI2Instructions
7113
multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
7123
movl(Address(z, tmp3, Address::times_4, 0), carry);
7125
jccb(Assembler::negative, L_done);
7128
movl(Address(z, tmp3, Address::times_4, 0), carry);
7131
// Next infrequent code is moved outside loops.
7133
if (UseBMI2Instructions) {
7134
movl(rdx, Address(x, 0));
7136
movl(x_xstart, Address(x, 0));
7138
jmp(L_third_loop_prologue);
7152
void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
7153
Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
7154
assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
7155
Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
7156
Label VECTOR8_TAIL, VECTOR4_TAIL;
7157
Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
7158
Label SAME_TILL_END, DONE;
7159
Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
7161
//scale is in rcx in both Win64 and Unix
7162
ShortBranchVerifier sbv(this);
7165
xorq(result, result);
7167
if ((AVX3Threshold == 0) && (UseAVX > 2) &&
7168
VM_Version::supports_avx512vlbw()) {
7169
Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
7172
jcc(Assembler::less, VECTOR32_TAIL);
7175
andq(tmp1, 0x3F); // tail count
7176
andq(length, ~(0x3F)); //vector count
7178
bind(VECTOR64_LOOP);
7179
// AVX512 code to compare 64 byte vectors.
7180
evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
7181
evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
7183
jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL); // mismatch
7186
jccb(Assembler::notZero, VECTOR64_LOOP);
7188
//bind(VECTOR64_TAIL);
7190
jcc(Assembler::zero, SAME_TILL_END);
7192
//bind(VECTOR64_TAIL);
7193
// AVX512 code to compare up to 63 byte vectors.
7194
mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
7195
shlxq(tmp2, tmp2, tmp1);
7199
evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
7200
evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
7203
jcc(Assembler::below, SAME_TILL_END); // not mismatch
7205
bind(VECTOR64_NOT_EQUAL);
7212
bind(VECTOR32_TAIL);
7216
jcc(Assembler::equal, VECTOR8_LOOP);
7217
jcc(Assembler::less, VECTOR4_TAIL);
7220
Label VECTOR16_TAIL, VECTOR32_LOOP;
7223
jcc(Assembler::equal, VECTOR16_LOOP);
7224
jcc(Assembler::less, VECTOR8_LOOP);
7227
jccb(Assembler::less, VECTOR16_TAIL);
7230
bind(VECTOR32_LOOP);
7231
vmovdqu(rymm0, Address(obja, result));
7232
vmovdqu(rymm1, Address(objb, result));
7233
vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
7234
vptest(rymm2, rymm2);
7235
jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
7238
jcc(Assembler::greaterEqual, VECTOR32_LOOP);
7240
jcc(Assembler::equal, SAME_TILL_END);
7241
//falling through if less than 32 bytes left //close the branch here.
7243
bind(VECTOR16_TAIL);
7245
jccb(Assembler::less, VECTOR8_TAIL);
7246
bind(VECTOR16_LOOP);
7247
movdqu(rymm0, Address(obja, result));
7248
movdqu(rymm1, Address(objb, result));
7249
vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
7250
ptest(rymm2, rymm2);
7251
jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
7254
jcc(Assembler::equal, SAME_TILL_END);
7255
//falling through if less than 16 bytes left
7256
} else {//regular intrinsics
7259
jccb(Assembler::less, VECTOR8_TAIL);
7262
bind(VECTOR16_LOOP);
7263
movdqu(rymm0, Address(obja, result));
7264
movdqu(rymm1, Address(objb, result));
7266
ptest(rymm0, rymm0);
7267
jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
7270
jccb(Assembler::greaterEqual, VECTOR16_LOOP);
7272
jcc(Assembler::equal, SAME_TILL_END);
7273
//falling through if less than 16 bytes left
7278
jccb(Assembler::less, VECTOR4_TAIL);
7280
movq(tmp1, Address(obja, result));
7281
movq(tmp2, Address(objb, result));
7284
jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
7287
jcc(Assembler::equal, SAME_TILL_END);
7288
//falling through if less than 8 bytes left
7292
jccb(Assembler::less, BYTES_TAIL);
7294
movl(tmp1, Address(obja, result));
7295
xorl(tmp1, Address(objb, result));
7297
jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
7300
jcc(Assembler::equal, SAME_TILL_END);
7301
//falling through if less than 4 bytes left
7305
load_unsigned_byte(tmp1, Address(obja, result));
7306
load_unsigned_byte(tmp2, Address(objb, result));
7309
jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
7311
jcc(Assembler::zero, SAME_TILL_END);
7313
load_unsigned_byte(tmp1, Address(obja, result));
7314
load_unsigned_byte(tmp2, Address(objb, result));
7317
jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
7319
jcc(Assembler::zero, SAME_TILL_END);
7321
load_unsigned_byte(tmp1, Address(obja, result));
7322
load_unsigned_byte(tmp2, Address(objb, result));
7325
jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
7329
bind(VECTOR32_NOT_EQUAL);
7330
vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
7331
vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
7332
vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
7333
vpmovmskb(tmp1, rymm0);
7340
bind(VECTOR16_NOT_EQUAL);
7342
vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
7343
vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
7346
pcmpeqb(rymm2, rymm2);
7348
pcmpeqb(rymm0, rymm1);
7351
pmovmskb(tmp1, rymm0);
7357
bind(VECTOR8_NOT_EQUAL);
7358
bind(VECTOR4_NOT_EQUAL);
7362
bind(BYTES_NOT_EQUAL);
7366
bind(SAME_TILL_END);
7372
//Helper functions for square_to_len()
7375
* Store the squares of x[], right shifted one bit (divided by 2) into z[]
7376
* Preserves x and z and modifies rest of the registers.
7378
void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7379
// Perform square and right shift by 1
7380
// Handle odd xlen case first, then for even xlen do the following
7382
// for (int j=0, i=0; j < xlen; j+=2, i+=4) {
7383
// huge_128 product = x[j:j+1] * x[j:j+1];
7384
// z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
7385
// z[i+2:i+3] = (jlong)(product >>> 1);
7386
// carry = (jlong)product;
7389
xorq(tmp5, tmp5); // carry
7390
xorq(rdxReg, rdxReg);
7391
xorl(tmp1, tmp1); // index for x
7392
xorl(tmp4, tmp4); // index for z
7394
Label L_first_loop, L_first_loop_exit;
7397
jccb(Assembler::zero, L_first_loop); //jump if xlen is even
7399
// Square and right shift by 1 the odd element using 32 bit multiply
7400
movl(raxReg, Address(x, tmp1, Address::times_4, 0));
7401
imulq(raxReg, raxReg);
7404
movq(Address(z, tmp4, Address::times_4, 0), raxReg);
7408
// Square and right shift by 1 the rest using 64 bit multiply
7411
jccb(Assembler::equal, L_first_loop_exit);
7414
movq(raxReg, Address(x, tmp1, Address::times_4, 0));
7415
rorq(raxReg, 32); // convert big-endian to little-endian
7416
mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax
7418
// Right shift by 1 and save carry
7419
shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
7424
// Store result in z
7425
movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
7426
movq(Address(z, tmp4, Address::times_4, 8), raxReg);
7428
// Update indices for x and z
7433
bind(L_first_loop_exit);
7438
/**
 * Perform the following multiply add operation using BMI2 instructions
 * carry:sum = sum + op1*op2 + carry
 * op2 should be in rdx
 * op2 is preserved, all other registers are modified
 */
void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
  // assert op2 is rdx
  mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1
  addq(sum, carry);
  adcq(tmp2, 0);
  addq(sum, op1);
  adcq(tmp2, 0);
  movq(carry, tmp2);
}
7454
* Perform the following multiply add operation:
7455
* carry:sum = sum + op1*op2 + carry
7456
* Preserves op1, op2 and modifies rest of registers
7458
void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
7459
// rdx:rax = op1 * op2
7463
// rdx:rax = sum + carry + rdx:rax
7469
// carry:sum = rdx:sum
7470
movq(carry, rdxReg);
7474
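// Illustrative reference (not compiled into the VM): both multiply_add_64
// variants above compute the 128-bit value
//
//   unsigned __int128 t = (unsigned __int128)op1 * op2 + sum + carry;
//   sum   = (uint64_t)t;          // low 64 bits
//   carry = (uint64_t)(t >> 64);  // high 64 bits
//
// using either mulxq (BMI2) or the legacy rdx:rax mulq form.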
* Add 64 bit long carry into z[] with carry propagation.
7475
* Preserves z and carry register values and modifies rest of registers.
7478
void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
7479
Label L_fourth_loop, L_fourth_loop_exit;
7483
addq(Address(z, zlen, Address::times_4, 0), carry);
7485
bind(L_fourth_loop);
7486
jccb(Assembler::carryClear, L_fourth_loop_exit);
7488
jccb(Assembler::negative, L_fourth_loop_exit);
7489
addq(Address(z, zlen, Address::times_4, 0), tmp1);
7491
bind(L_fourth_loop_exit);
7495
* Shift z[] left by 1 bit.
7496
* Preserves x, len, z and zlen registers and modifies rest of the registers.
7499
void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
7501
Label L_fifth_loop, L_fifth_loop_exit;
7504
// Perform primitiveLeftShift(z, zlen, 1)
7506
const Register prev_carry = tmp1;
7507
const Register new_carry = tmp4;
7508
const Register value = tmp2;
7509
const Register zidx = tmp3;
7514
// for (zidx = zlen-2; zidx >=0; zidx -= 2) {
7515
// (carry:value) = (z[i] << 1) | carry ;
7520
xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
7523
decl(zidx); // Use decl to preserve carry flag
7525
jccb(Assembler::negative, L_fifth_loop_exit);
7527
if (UseBMI2Instructions) {
7528
movq(value, Address(z, zidx, Address::times_4, 0));
7530
rorxq(value, value, 32);
7531
movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
7535
xorl(new_carry, new_carry);
7537
// Shift z[i] by 1, or in previous carry and save new carry
7538
movq(value, Address(z, zidx, Address::times_4, 0));
7542
orq(value, prev_carry);
7544
movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
7546
// Set previous carry = new carry
7547
movl(prev_carry, new_carry);
7551
bind(L_fifth_loop_exit);
7556
* Code for BigInteger::squareToLen() intrinsic
7569
void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7571
Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
7579
// Store the squares, right shifted one bit (i.e., divided by 2).
7580
square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
7582
// Add in off-diagonal sums.
7584
// Second, third (nested) and fourth loops.
7586
// for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
7588
// long op2 = x[xidx:xidx+1];
7589
// for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
7591
// long op1 = x[j:j+1];
7592
// long sum = z[k:k+1];
7593
// carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
7596
// add_one_64(z, k, carry, tmp_regs);
7599
const Register carry = tmp5;
7600
const Register sum = tmp3;
7601
const Register op1 = tmp4;
7602
Register op2 = tmp2;
7607
bind(L_second_loop);
7614
jccb(Assembler::lessEqual, L_second_loop_exit);
7616
// Multiply an array by one 64 bit long.
7617
if (UseBMI2Instructions) {
7619
movq(op2, Address(x, len, Address::times_4, 0));
7620
rorxq(op2, op2, 32);
7623
movq(op2, Address(x, len, Address::times_4, 0));
7629
jccb(Assembler::negative, L_third_loop_exit);
7631
jccb(Assembler::negative, L_last_x);
7633
movq(op1, Address(x, len, Address::times_4, 0));
7638
movq(sum, Address(z, zlen, Address::times_4, 0));
7640
// Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
7641
if (UseBMI2Instructions) {
7642
multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
7645
multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7648
movq(Address(z, zlen, Address::times_4, 0), sum);
7651
bind(L_third_loop_exit);
7654
// Add 64 bit long carry into z with carry propagation.
7655
// Uses offsetted zlen.
7656
add_one_64(z, zlen, carry, tmp1);
7662
// Next infrequent code is moved outside loops.
7664
movl(op1, Address(x, 0));
7667
bind(L_second_loop_exit);
7674
// Shift z left 1 bit.
7675
lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
7677
// z[zlen-1] |= x[len-1] & 1;
7678
movl(tmp3, Address(x, len, Address::times_4, -4));
7680
orl(Address(z, zlen, Address::times_4, -4), tmp3);
7690
* Helper function for mul_add()
7691
* Multiply the in[] by int k and add to out[] starting at offset offs using
7692
* 128 bit by 32 bit multiply and return the carry in tmp5.
7693
* Only quad int aligned length of in[] is operated on in this function.
7694
* k is in rdxReg for BMI2Instructions, for others it is in tmp2.
7695
* This function preserves out, in and k registers.
7696
 * len and offset point to the appropriate index in "in" & "out" respectively
7697
* tmp5 has the carry.
7698
* other registers are temporary and are modified.
7701
void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
7702
Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
7703
Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7705
Label L_first_loop, L_first_loop_exit;
7712
jccb(Assembler::negative, L_first_loop_exit);
7717
Register op2 = tmp2;
7718
const Register sum = tmp3;
7719
const Register op1 = tmp4;
7720
const Register carry = tmp5;
7722
if (UseBMI2Instructions) {
7726
movq(op1, Address(in, len, Address::times_4, 8));
7728
movq(sum, Address(out, offset, Address::times_4, 8));
7730
if (UseBMI2Instructions) {
7731
multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7734
multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7736
// Store back in big endian from little endian
7738
movq(Address(out, offset, Address::times_4, 8), sum);
7740
movq(op1, Address(in, len, Address::times_4, 0));
7742
movq(sum, Address(out, offset, Address::times_4, 0));
7744
if (UseBMI2Instructions) {
7745
multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7748
multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7750
// Store back in big endian from little endian
7752
movq(Address(out, offset, Address::times_4, 0), sum);
7755
bind(L_first_loop_exit);
7759
* Code for BigInteger::mulAdd() intrinsic
7763
* r11: offs (out.length - offset)
7771
* Multiply the in[] by word k and add to out[], return the carry in rax
7773
void MacroAssembler::mul_add(Register out, Register in, Register offs,
7774
Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
7775
Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7777
Label L_carry, L_last_in, L_done;
7780
// for (int j=len-1; j >= 0; j--) {
7781
// long product = (in[j] & LONG_MASK) * kLong +
7782
// (out[offs] & LONG_MASK) + carry;
7783
// out[offs--] = (int)product;
7784
// carry = product >>> 32;
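  // Illustrative reference (not compiled into the VM): the same loop in plain C,
  // with 'in' and 'out' holding 32-bit digits in BigInteger's most-significant-first order:
  //
  //   uint64_t mul_add(uint32_t* out, const uint32_t* in, int offs, int len, uint32_t k) {
  //     uint64_t carry = 0;
  //     for (int j = len - 1; j >= 0; j--) {
  //       uint64_t product = (uint64_t)in[j] * k + out[offs] + carry;
  //       out[offs--] = (uint32_t)product;
  //       carry = product >> 32;
  //     }
  //     return carry;
  //   }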
7793
Register op2 = tmp2;
7794
const Register sum = tmp3;
7795
const Register op1 = tmp4;
7796
const Register carry = tmp5;
7798
if (UseBMI2Instructions) {
7810
//Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
7811
//The carry is in tmp5
7812
mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
7814
//Multiply the trailing in[] entry using 64 bit by 32 bit, if any
7816
jccb(Assembler::negative, L_carry);
7818
jccb(Assembler::negative, L_last_in);
7820
movq(op1, Address(in, len, Address::times_4, 0));
7824
movq(sum, Address(out, offs, Address::times_4, 0));
7827
if (UseBMI2Instructions) {
7828
multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7831
multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7834
// Store back in big endian from little endian
7836
movq(Address(out, offs, Address::times_4, 0), sum);
7839
jccb(Assembler::zero, L_carry);
7841
//Multiply the last in[] entry, if any
7843
movl(op1, Address(in, 0));
7844
movl(sum, Address(out, offs, Address::times_4, -4));
7847
mull(op1); //tmp4 * eax -> edx:eax
7852
movl(carry, rdxReg);
7854
movl(Address(out, offs, Address::times_4, -4), sum);
7857
//return tmp5/carry as carry in rax
7870
/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  xorl(val, crc);
  andl(val, 0xFF);
  shrl(crc, 8); // unsigned shift
  xorl(crc, Address(table, val, Address::times_4, 0));
}
7889
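// Illustrative reference (not compiled into the VM): update_byte_crc32 above
// is the classic table-driven, byte-at-a-time CRC-32 step:
//
//   uint32_t update_byte_crc32(uint32_t crc, uint8_t val, const uint32_t* table) {
//     return table[(val ^ crc) & 0xFF] ^ (crc >> 8);
//   }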
* Fold 128-bit data chunk
7891
void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7893
vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7894
vpclmulldq(xcrc, xK, xcrc); // [63:0]
7895
vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
7899
pclmulhdq(xtmp, xK); // [123:64]
7900
pclmulldq(xcrc, xK); // [63:0]
7902
movdqu(xtmp, Address(buf, offset));
7907
void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7909
vpclmulhdq(xtmp, xK, xcrc);
7910
vpclmulldq(xcrc, xK, xcrc);
7915
pclmulhdq(xtmp, xK);
7916
pclmulldq(xcrc, xK);
7923
* 8-bit folds to compute 32-bit CRC
7926
* timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
7928
void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
7931
movdl(xtmp, Address(table, tmp, Address::times_4, 0));
7932
psrldq(xcrc, 1); // unsigned shift one byte
7938
* timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
7940
void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
7944
xorl(crc, Address(table, tmp, Address::times_4, 0));
7948
* @param crc register containing existing CRC (32-bit)
7949
* @param buf register pointing to input byte buffer (byte*)
7950
* @param len register containing number of bytes
7951
* @param table register that will contain address of CRC table
7952
* @param tmp scratch register
7954
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
7955
assert_different_registers(crc, buf, len, table, tmp, rax);
7957
Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7958
Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7960
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
7961
// context for the registers used, where all instructions below are using 128-bit mode
7962
// On EVEX without VL and BW, these instructions will all be AVX.
7963
lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
7966
jcc(Assembler::less, L_tail);
7968
// Align buffer to 16 bytes
7971
jccb(Assembler::zero, L_aligned);
7977
movsbl(rax, Address(buf, 0)); // load byte with sign extension
7978
update_byte_crc32(crc, rax, table);
7981
jccb(Assembler::less, L_align_loop);
7984
movl(tmp, len); // save
7986
jcc(Assembler::zero, L_tail_restore);
7988
// Fold crc into first bytes of vector
7989
movdqa(xmm1, Address(buf, 0));
7992
if (VM_Version::supports_sse4_1()) {
7993
pinsrd(xmm1, crc, 0);
7995
pinsrw(xmm1, crc, 0);
7997
pinsrw(xmm1, crc, 1);
8000
subl(len, 4); // len > 0
8001
jcc(Assembler::less, L_fold_tail);
8003
movdqa(xmm2, Address(buf, 0));
8004
movdqa(xmm3, Address(buf, 16));
8005
movdqa(xmm4, Address(buf, 32));
8008
jcc(Assembler::lessEqual, L_fold_512b);
8010
// Fold total 512 bits of polynomial on each iteration,
8011
// 128 bits per each of 4 parallel streams.
8012
movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32), rscratch1);
8015
BIND(L_fold_512b_loop);
8016
fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
8017
fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
8018
fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
8019
fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
8022
jcc(Assembler::greater, L_fold_512b_loop);
8024
// Fold 512 bits to 128 bits.
8026
movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1);
8027
fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
8028
fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
8029
fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
8031
// Fold the rest of the 128-bit data chunks
8034
jccb(Assembler::lessEqual, L_fold_128b);
8035
movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1);
8037
BIND(L_fold_tail_loop);
8038
fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
8041
jccb(Assembler::greater, L_fold_tail_loop);
8043
// Fold 128 bits in xmm1 down into 32 bits in crc register.
8045
movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()), rscratch1);
8047
vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
8048
vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
8049
vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
8052
pclmulqdq(xmm2, xmm1, 0x1);
8055
pclmulqdq(xmm0, xmm3, 0x1);
8062
// 8 8-bit folds to compute 32-bit CRC.
8063
for (int j = 0; j < 4; j++) {
8064
fold_8bit_crc32(xmm0, table, xmm1, rax);
8066
movdl(crc, xmm0); // mov 32 bits to general register
8067
for (int j = 0; j < 4; j++) {
8068
fold_8bit_crc32(crc, table, rax);
8071
BIND(L_tail_restore);
8072
movl(len, tmp); // restore
8075
jccb(Assembler::zero, L_exit);
8077
// Fold the rest of bytes
8080
movsbl(rax, Address(buf, 0)); // load byte with sign extension
8081
update_byte_crc32(crc, rax, table);
8084
jccb(Assembler::greater, L_tail_loop);
8091
// Helper function for AVX 512 CRC32
8092
// Fold 512-bit data chunks
8093
void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
8094
Register pos, int offset) {
8095
evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);
8096
evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
8097
evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
8098
evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
8099
evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
8102
// Helper function for AVX 512 CRC32
8103
// Compute CRC32 for < 256B buffers
8104
void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
8105
Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
8106
Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
8108
Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
8109
Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
8110
Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;
8112
// check if there is enough buffer to be able to fold 16B at a time
8114
jcc(Assembler::less, L_less_than_32);
8116
// if there is, load the constants
8117
movdqu(xmm10, Address(table, 1 * 16)); //rk1 and rk2 in xmm10
8118
movdl(xmm0, crc); // get the initial crc value
8119
movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
8122
// update the buffer pointer
8124
// update the counter; subtract 32 instead of 16 to save one instruction in the loop
8126
jmp(L_16B_reduction_loop);
8128
bind(L_less_than_32);
8129
// move the initial crc to the return value; this is necessary for zero-length buffers.
8132
jcc(Assembler::equal, L_cleanup);
8134
movdl(xmm0, crc); //get the initial crc value
8137
jcc(Assembler::equal, L_exact_16_left);
8138
jcc(Assembler::less, L_less_than_16_left);
8140
movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
8141
pxor(xmm7, xmm0); //xor the initial crc value
8144
movdqu(xmm10, Address(table, 1 * 16)); // rk1 and rk2 in xmm10
8145
jmp(L_get_last_two_xmms);
8147
bind(L_less_than_16_left);
8148
// use stack space to load data of less than 16 bytes; zero out the 16B in memory first.
8151
movdqu(Address(tmp1, 0 * 16), xmm1);
8154
jcc(Assembler::less, L_only_less_than_4);
8156
// back up the counter value
8159
jcc(Assembler::less, L_less_than_8_left);
8162
movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
8163
movq(Address(tmp1, 0 * 16), rax);
8168
bind(L_less_than_8_left);
8170
jcc(Assembler::less, L_less_than_4_left);
8173
movl(rax, Address(buf, pos, Address::times_1, 0));
8174
movl(Address(tmp1, 0 * 16), rax);
8179
bind(L_less_than_4_left);
8181
jcc(Assembler::less, L_less_than_2_left);
8184
movw(rax, Address(buf, pos, Address::times_1, 0));
8185
movl(Address(tmp1, 0 * 16), rax);
8190
bind(L_less_than_2_left);
8192
jcc(Assembler::less, L_zero_left);
8195
movb(rax, Address(buf, pos, Address::times_1, 0));
8196
movb(Address(tmp1, 0 * 16), rax);
8199
movdqu(xmm7, Address(rsp, 0));
8200
pxor(xmm7, xmm0); //xor the initial crc value
8202
lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
8203
movdqu(xmm0, Address(rax, tmp2));
8207
bind(L_exact_16_left);
8208
movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
8209
pxor(xmm7, xmm0); //xor the initial crc value
8212
bind(L_only_less_than_4);
8214
jcc(Assembler::less, L_only_less_than_3);
8217
movb(rax, Address(buf, pos, Address::times_1, 0));
8218
movb(Address(tmp1, 0), rax);
8220
movb(rax, Address(buf, pos, Address::times_1, 1));
8221
movb(Address(tmp1, 1), rax);
8223
movb(rax, Address(buf, pos, Address::times_1, 2));
8224
movb(Address(tmp1, 2), rax);
8226
movdqu(xmm7, Address(rsp, 0));
8227
pxor(xmm7, xmm0); //xor the initial crc value
8231
bind(L_only_less_than_3);
8233
jcc(Assembler::less, L_only_less_than_2);
8236
movb(rax, Address(buf, pos, Address::times_1, 0));
8237
movb(Address(tmp1, 0), rax);
8239
movb(rax, Address(buf, pos, Address::times_1, 1));
8240
movb(Address(tmp1, 1), rax);
8242
movdqu(xmm7, Address(rsp, 0));
8243
pxor(xmm7, xmm0); //xor the initial crc value
8248
bind(L_only_less_than_2);
8250
movb(rax, Address(buf, pos, Address::times_1, 0));
8251
movb(Address(tmp1, 0), rax);
8253
movdqu(xmm7, Address(rsp, 0));
8254
pxor(xmm7, xmm0); //xor the initial crc value
8260
* Compute CRC32 using AVX512 instructions
8261
* param crc register containing existing CRC (32-bit)
8262
* param buf register pointing to input byte buffer (byte*)
8263
* param len register containing number of bytes
8264
* param table address of crc or crc32c table
8265
* param tmp1 scratch register
8266
* param tmp2 scratch register
8267
* return rax result register
8269
* This routine is identical for crc32c with the exception of the precomputed constant
8270
* table which will be passed as the table argument. The calculation steps are
8271
* the same for both variants.
8273
void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
8274
assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
8276
Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
8277
Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
8278
Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
8279
Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
8280
Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
8282
const Register pos = r12;
8284
subptr(rsp, 16 * 2 + 8);
8286
// For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
// context for the registers used, since all instructions below operate in 128-bit mode.
// On EVEX without VL and BW, these instructions will all be AVX.
8291
// check if smaller than 256B
8293
jcc(Assembler::less, L_less_than_256);
8295
// load the initial crc value
8298
// receive the initial 64B data, xor the initial crc value
8299
evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
8300
evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
8301
evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
8302
evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
8306
jcc(Assembler::less, L_fold_128_B_loop);
8308
evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
8309
evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
8310
evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
8313
bind(L_fold_256_B_loop);
8315
fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
8316
fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
8317
fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
8318
fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
8321
jcc(Assembler::greaterEqual, L_fold_256_B_loop);
8323
// Fold 256 into 128
8325
evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
8326
evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
8327
vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
8329
evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
8330
evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
8331
vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
8333
evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
8334
evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
8337
jmp(L_fold_128_B_register);
8339
// At this point in the code there are 128 * x + y (0 <= y < 128) bytes of buffer. The fold_128_B_loop
// below will fold 128B at a time until 128 + y bytes of buffer remain.

// fold 128B at a time. This section of the code folds 8 xmm registers in parallel
8343
bind(L_fold_128_B_loop);
8345
fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
8346
fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
8349
jcc(Assembler::greaterEqual, L_fold_128_B_loop);
8353
// at this point, the buffer pointer is pointing at the last y bytes of the buffer, where 0 <= y < 128
// the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
8355
bind(L_fold_128_B_register);
8356
evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
8357
evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
8358
evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
8359
evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
8360
// save last that has no multiplicand
8361
vextracti64x2(xmm7, xmm4, 3);
8363
evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
8364
evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
8365
// Needed later in reduction loop
8366
movdqu(xmm10, Address(table, 1 * 16));
8367
vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
8368
vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
8370
// Swap 1,0,3,2 - 01 00 11 10
8371
evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
8372
evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
8373
vextracti128(xmm5, xmm8, 1);
8374
evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
8376
// instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
8377
// instead of a cmp instruction, we use the negative flag with the jl instruction
8378
addl(len, 128 - 16);
8379
jcc(Assembler::less, L_final_reduction_for_128);
8381
bind(L_16B_reduction_loop);
8382
vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
8383
vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8384
vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
8385
movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
8386
vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8389
jcc(Assembler::greaterEqual, L_16B_reduction_loop);
8391
bind(L_final_reduction_for_128);
8393
jcc(Assembler::equal, L_128_done);
8395
bind(L_get_last_two_xmms);
8398
movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
8401
// get rid of the extra data that was loaded before
8402
// load the shift constant
8403
lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
8404
movdqu(xmm0, Address(rax, len));
8407
vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8408
//Change mask to 512
8409
vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
8410
vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
8412
blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
8413
vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
8414
vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8415
vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
8416
vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
8419
// compute crc of a 128-bit value
8420
movdqu(xmm10, Address(table, 3 * 16));
8424
vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
8425
vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
8426
vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8430
vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
8431
vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8432
vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8435
bind(L_less_than_256);
8436
kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
8440
vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
8443
movdqu(xmm10, Address(table, 4 * 16));
8445
pclmulqdq(xmm7, xmm10, 0x0);
8447
vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
8449
pclmulqdq(xmm7, xmm10, 0x10);
8452
pextrd(crc, xmm7, 2);
8455
addptr(rsp, 16 * 2 + 8);
8459
// S. Gueron / Information Processing Letters 112 (2012) 184
8460
// Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
8461
// Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
8462
// Output: the 64-bit carry-less product of B * CONST
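// A plain-C sketch of that table-driven recombination, assuming TABLEExt is laid out as
// consecutive 256-entry tables of 64-bit precomputed products (illustrative names only):
//
//   static uint64_t clmul_by_const_ref(uint32_t B, const uint64_t* TABLEExt, uint32_t n) {
//     const uint64_t* t = TABLEExt + n * 256;
//     uint64_t Q1 = t[ B        & 0xFF];
//     uint64_t Q2 = t[(B >>  8) & 0xFF];
//     uint64_t Q3 = t[(B >> 16) & 0xFF];
//     uint64_t Q4 = t[(B >> 24) & 0xFF];
//     return Q1 ^ (Q2 << 8) ^ (Q3 << 16) ^ (Q4 << 24);
//   }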
8463
void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
8464
Register tmp1, Register tmp2, Register tmp3) {
8465
lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
8467
addq(tmp3, n * 256 * 8);
8469
// Q1 = TABLEExt[n][B & 0xFF];
8471
andl(tmp1, 0x000000FF);
8474
movq(tmp1, Address(tmp1, 0));
8476
// Q2 = TABLEExt[n][B >> 8 & 0xFF];
8479
andl(tmp2, 0x000000FF);
8482
movq(tmp2, Address(tmp2, 0));
8487
// Q3 = TABLEExt[n][B >> 16 & 0xFF];
8490
andl(tmp2, 0x000000FF);
8493
movq(tmp2, Address(tmp2, 0));
8498
// Q4 = TABLEExt[n][B >> 24 & 0xFF];
8500
andl(in, 0x000000FF);
8503
movq(in, Address(in, 0));
8507
// return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
8510
void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
8512
uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
8513
XMMRegister w_xtmp2,
8515
Register n_tmp2, Register n_tmp3) {
8516
if (is_pclmulqdq_supported) {
8517
movdl(w_xtmp1, in_out); // modified blindly
8519
movl(tmp1, const_or_pre_comp_const_index);
8520
movdl(w_xtmp2, tmp1);
8521
pclmulqdq(w_xtmp1, w_xtmp2, 0);
8523
movdq(in_out, w_xtmp1);
8525
crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
8529
// Recombination Alternative 2: No bit-reflections
8530
// T1 = (CRC_A * U1) << 1
8531
// T2 = (CRC_B * U2) << 1
8534
// T1 = T1 & 0xFFFFFFFF
8535
// T2 = T2 & 0xFFFFFFFF
8540
// CRC = C1 ^ C2 ^ CRC_C
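// A rough scalar sketch of this recombination; clmul_u32() stands for a 32x32 -> 64-bit
// carry-less multiply and crc32_u32() for the hardware CRC32 step. How C1 and C2 are
// derived from T1 and T2 is an assumption here (the usual Intel recombination scheme),
// not a statement of this exact routine:
//
//   static uint32_t rec_alt2_ref(uint32_t crc_a, uint32_t crc_b, uint32_t crc_c,
//                                uint32_t u1, uint32_t u2) {
//     uint64_t t1 = clmul_u32(crc_a, u1) << 1;
//     uint64_t t2 = clmul_u32(crc_b, u2) << 1;
//     uint32_t c1 = (uint32_t)(t1 >> 32) ^ crc32_u32(0, (uint32_t) t1);
//     uint32_t c2 = (uint32_t)(t2 >> 32) ^ crc32_u32(0, (uint32_t) t2);
//     return c1 ^ c2 ^ crc_c;
//   }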
8541
void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
8542
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8543
Register tmp1, Register tmp2,
8545
crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8546
crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8551
crc32(tmp2, tmp1, 4);
8552
xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
8557
crc32(tmp2, tmp1, 4);
8563
// Set N to predefined value
8564
// Subtract it from the length of the buffer
8565
// execute in a loop:
8566
// CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
8568
// CRC_A = CRC32(CRC_A, A[i])
8569
// CRC_B = CRC32(CRC_B, B[i])
8570
// CRC_C = CRC32(CRC_C, C[i])
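// In scalar form the chunk loop is just three independent CRC streams advanced in
// lock-step, which is what lets the three crc32 instructions below overlap in the
// pipeline (sketch only; crc32_u64() stands for the 64-bit hardware instruction):
//
//   for (size_t i = 0; i < n; i++) {
//     crc_a = crc32_u64(crc_a, A[i]);
//     crc_b = crc32_u64(crc_b, B[i]);
//     crc_c = crc32_u64(crc_c, C[i]);
//   }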
8573
void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
8574
Register in_out1, Register in_out2, Register in_out3,
8575
Register tmp1, Register tmp2, Register tmp3,
8576
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8577
Register tmp4, Register tmp5,
8579
Label L_processPartitions;
8580
Label L_processPartition;
8583
bind(L_processPartitions);
8584
cmpl(in_out1, 3 * size);
8585
jcc(Assembler::less, L_exit);
8588
movq(tmp3, in_out2);
8591
bind(L_processPartition);
8592
crc32(in_out3, Address(in_out2, 0), 8);
8593
crc32(tmp1, Address(in_out2, size), 8);
8594
crc32(tmp2, Address(in_out2, size * 2), 8);
8596
cmpq(in_out2, tmp3);
8597
jcc(Assembler::less, L_processPartition);
8598
crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
8599
w_xtmp1, w_xtmp2, w_xtmp3,
8602
addq(in_out2, 2 * size);
8603
subl(in_out1, 3 * size);
8604
jmp(L_processPartitions);
8609
void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
8610
Register tmp1, Register tmp2, Register tmp3,
8611
XMMRegister xtmp1, XMMRegister xtmp2) {
8612
lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
8614
addl(tmp3, n * 256 * 8);
8616
// Q1 = TABLEExt[n][B & 0xFF];
8618
andl(tmp1, 0x000000FF);
8621
movq(xtmp1, Address(tmp1, 0));
8623
// Q2 = TABLEExt[n][B >> 8 & 0xFF];
8626
andl(tmp2, 0x000000FF);
8629
movq(xtmp2, Address(tmp2, 0));
8634
// Q3 = TABLEExt[n][B >> 16 & 0xFF];
8637
andl(tmp2, 0x000000FF);
8640
movq(xtmp2, Address(tmp2, 0));
8645
// Q4 = TABLEExt[n][B >> 24 & 0xFF];
8647
andl(in_out, 0x000000FF);
8650
movq(xtmp2, Address(in_out, 0));
8653
pxor(xtmp1, xtmp2); // Result in CXMM
8654
// return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
8657
void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
8659
uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
8660
XMMRegister w_xtmp2,
8662
Register n_tmp2, Register n_tmp3) {
8663
if (is_pclmulqdq_supported) {
8664
movdl(w_xtmp1, in_out);
8666
movl(tmp1, const_or_pre_comp_const_index);
8667
movdl(w_xtmp2, tmp1);
8668
pclmulqdq(w_xtmp1, w_xtmp2, 0);
8669
// Keep result in XMM since GPR is 32 bit in length
8671
crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
8675
void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
8676
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8677
Register tmp1, Register tmp2,
8679
crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8680
crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8683
movdl(tmp1, w_xtmp1);
8685
movdl(in_out, w_xtmp1);
8688
crc32(tmp2, tmp1, 4);
8692
movdl(tmp1, w_xtmp2);
8694
movdl(in1, w_xtmp2);
8697
crc32(tmp2, tmp1, 4);
8703
void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
8704
Register in_out1, Register in_out2, Register in_out3,
8705
Register tmp1, Register tmp2, Register tmp3,
8706
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8707
Register tmp4, Register tmp5,
8709
Label L_processPartitions;
8710
Label L_processPartition;
8713
bind(L_processPartitions);
8714
cmpl(in_out1, 3 * size);
8715
jcc(Assembler::less, L_exit);
8718
movl(tmp3, in_out2);
8721
bind(L_processPartition);
8722
crc32(in_out3, Address(in_out2, 0), 4);
8723
crc32(tmp1, Address(in_out2, size), 4);
8724
crc32(tmp2, Address(in_out2, size*2), 4);
8725
crc32(in_out3, Address(in_out2, 0+4), 4);
8726
crc32(tmp1, Address(in_out2, size+4), 4);
8727
crc32(tmp2, Address(in_out2, size*2+4), 4);
8729
cmpl(in_out2, tmp3);
8730
jcc(Assembler::less, L_processPartition);
8739
crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
8740
w_xtmp1, w_xtmp2, w_xtmp3,
8748
addl(in_out2, 2 * size);
8749
subl(in_out1, 3 * size);
8750
jmp(L_processPartitions);
8757
// Algorithm 2: Pipelined usage of the CRC32 instruction.
8758
// Input: A buffer I of L bytes.
8759
// Output: the CRC32C value of the buffer.
8761
// Write L = 24N + r, with N = floor (L/24).
8762
// r = L mod 24 (0 <= r < 24).
8763
// Consider I as the concatenation of A|B|C|R, where A, B and C each consist of
// N quadwords, and R consists of r bytes.
8765
// A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
8766
// B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
8767
// C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
8768
// if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
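// For example, L = 100 gives N = floor(100 / 24) = 4 and r = 100 - 24 * 4 = 4: A, B and C
// are each 4 quadwords (32 bytes), and the trailing 4 bytes R are folded in byte by byte
// after the three streams are merged.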
8769
void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
8770
Register tmp1, Register tmp2, Register tmp3,
8771
Register tmp4, Register tmp5, Register tmp6,
8772
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8773
bool is_pclmulqdq_supported) {
8774
uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
8776
Label L_byteByByteProlog;
8780
if (is_pclmulqdq_supported ) {
8781
const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
8782
const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
8784
const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
8785
const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
8787
const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
8788
const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
8789
assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
8791
const_or_pre_comp_const_index[0] = 1;
8792
const_or_pre_comp_const_index[1] = 0;
8794
const_or_pre_comp_const_index[2] = 3;
8795
const_or_pre_comp_const_index[3] = 2;
8797
const_or_pre_comp_const_index[4] = 5;
8798
const_or_pre_comp_const_index[5] = 4;
8800
crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
8803
w_xtmp1, w_xtmp2, w_xtmp3,
8806
crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
8809
w_xtmp1, w_xtmp2, w_xtmp3,
8812
crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
8815
w_xtmp1, w_xtmp2, w_xtmp3,
8819
andl(tmp1, 0x00000007);
8825
jccb(Assembler::greaterEqual, L_byteByByteProlog);
8828
crc32(in_out, Address(in1, 0), 8);
8831
jcc(Assembler::less, L_wordByWord);
8833
BIND(L_byteByByteProlog);
8834
andl(in2, 0x00000007);
8838
jccb(Assembler::greater, L_exit);
8840
crc32(in_out, Address(in1, 0), 1);
8844
jcc(Assembler::lessEqual, L_byteByByte);
8849
void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
8850
Register tmp1, Register tmp2, Register tmp3,
8851
Register tmp4, Register tmp5, Register tmp6,
8852
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8853
bool is_pclmulqdq_supported) {
8854
uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
8856
Label L_byteByByteProlog;
8860
if (is_pclmulqdq_supported) {
8861
const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
8862
const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
8864
const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
8865
const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
8867
const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
8868
const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
8870
const_or_pre_comp_const_index[0] = 1;
8871
const_or_pre_comp_const_index[1] = 0;
8873
const_or_pre_comp_const_index[2] = 3;
8874
const_or_pre_comp_const_index[3] = 2;
8876
const_or_pre_comp_const_index[4] = 5;
8877
const_or_pre_comp_const_index[5] = 4;
8879
crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
8882
w_xtmp1, w_xtmp2, w_xtmp3,
8885
crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
8888
w_xtmp1, w_xtmp2, w_xtmp3,
8891
crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
8894
w_xtmp1, w_xtmp2, w_xtmp3,
8898
andl(tmp1, 0x00000007);
8905
jcc(Assembler::greaterEqual, L_byteByByteProlog);
8906
crc32(in_out, Address(in1,0), 4);
8910
BIND(L_byteByByteProlog);
8911
andl(in2, 0x00000007);
8916
jccb(Assembler::greater, L_exit);
8917
movb(tmp1, Address(in1, 0));
8918
crc32(in_out, tmp1, 1);
8929
// Compress char[] array to byte[].
8930
// Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
8931
// Return the array length if every element in the array can be encoded;
// otherwise, return the index of the first non-latin1 (> 0xff) character.
8933
// @IntrinsicCandidate
8934
// public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
8935
// for (int i = 0; i < len; i++) {
8936
// char c = src[srcOff];
8938
// return i; // return index of non-latin1 char
8940
// dst[dstOff] = (byte)c;
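//
// The vector paths below apply the same test in bulk: a chunk is stored compressed only if
// every char in it is <= 0xFF. A scalar sketch of that per-chunk check (illustrative, not
// the intrinsic's actual helper):
//
//   static bool chunk_is_latin1(const jchar* src, int n) {
//     for (int i = 0; i < n; i++) {
//       if (src[i] > 0xFF) return false;   // any char with bits set in 0xFF00 aborts the chunk
//     }
//     return true;
//   }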
8946
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
8947
XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8948
XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8949
Register tmp5, Register result, KRegister mask1, KRegister mask2) {
8950
Label copy_chars_loop, done, reset_sp, copy_tail;
8958
// rsi holds start addr of source char[] to be compressed
8959
// rdi holds start addr of destination byte[]
8962
assert(len != result, "");
8964
// save length for return
8967
if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
8968
VM_Version::supports_avx512vlbw() &&
8969
VM_Version::supports_bmi2()) {
8971
Label copy_32_loop, copy_loop_tail, below_threshold, reset_for_copy_tail;
8974
Label post_alignment;
8976
// if the length of the string is less than 32, handle it the old-fashioned way
8978
jcc(Assembler::zero, below_threshold);
8980
// First check whether a character is compressible ( <= 0xFF).
8981
// Create mask to test for Unicode chars inside zmm vector
8983
evpbroadcastw(tmp2Reg, tmp5, Assembler::AVX_512bit);
8986
jccb(Assembler::zero, post_alignment);
8989
andl(tmp5, (32 - 1));
8991
andl(tmp5, (32 - 1));
8993
// bail out when there is nothing to be done
8994
testl(tmp5, 0xFFFFFFFF);
8995
jccb(Assembler::zero, post_alignment);
8997
// ~(~0 << len), where len is the # of remaining elements to process
8998
movl(len, 0xFFFFFFFF);
8999
shlxl(len, len, tmp5);
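// e.g. tmp5 == 5 leaves 0xFFFFFFE0 in len here; per the ~(~0 << len) formula above,
// complementing it yields the 5-element tail mask 0x0000001F used as the predicate below.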
9004
evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
9005
evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
9006
ktestd(mask1, mask2);
9007
jcc(Assembler::carryClear, copy_tail);
9009
evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
9016
bind(post_alignment);
9020
andl(tmp5, (32 - 1)); // tail count (in chars)
9021
andl(len, ~(32 - 1)); // vector count (in chars)
9022
jccb(Assembler::zero, copy_loop_tail);
9024
lea(src, Address(src, len, Address::times_2));
9025
lea(dst, Address(dst, len, Address::times_1));
9029
evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
9030
evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
9031
kortestdl(mask1, mask1);
9032
jccb(Assembler::carryClear, reset_for_copy_tail);
9034
// All elements in the current processed chunk are valid candidates for
// compression. Write the truncated byte elements to memory.
9036
evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
9038
jccb(Assembler::notZero, copy_32_loop);
9040
bind(copy_loop_tail);
9041
// bail out when there is nothing to be done
9042
testl(tmp5, 0xFFFFFFFF);
9043
jcc(Assembler::zero, done);
9047
// ~(~0 << len), where len is the # of remaining elements to process
9048
movl(tmp5, 0xFFFFFFFF);
9049
shlxl(tmp5, tmp5, len);
9052
kmovdl(mask2, tmp5);
9054
evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
9055
evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
9056
ktestd(mask1, mask2);
9057
jcc(Assembler::carryClear, copy_tail);
9059
evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
9062
bind(reset_for_copy_tail);
9063
lea(src, Address(src, tmp5, Address::times_2));
9064
lea(dst, Address(dst, tmp5, Address::times_1));
9066
jmp(copy_chars_loop);
9068
bind(below_threshold);
9071
if (UseSSE42Intrinsics) {
9072
Label copy_32_loop, copy_16, copy_tail_sse, reset_for_copy_tail;
9074
// vectored compression
9075
testl(len, 0xfffffff8);
9076
jcc(Assembler::zero, copy_tail);
9078
movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors
9079
movdl(tmp1Reg, tmp5);
9080
pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg
9082
andl(len, 0xfffffff0);
9083
jccb(Assembler::zero, copy_16);
9085
// compress 16 chars per iter
9086
pxor(tmp4Reg, tmp4Reg);
9088
lea(src, Address(src, len, Address::times_2));
9089
lea(dst, Address(dst, len, Address::times_1));
9093
movdqu(tmp2Reg, Address(src, len, Address::times_2)); // load 1st 8 characters
9094
por(tmp4Reg, tmp2Reg);
9095
movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
9096
por(tmp4Reg, tmp3Reg);
9097
ptest(tmp4Reg, tmp1Reg); // check for Unicode chars in next vector
9098
jccb(Assembler::notZero, reset_for_copy_tail);
9099
packuswb(tmp2Reg, tmp3Reg); // only ASCII chars; compress each to 1 byte
9100
movdqu(Address(dst, len, Address::times_1), tmp2Reg);
9102
jccb(Assembler::notZero, copy_32_loop);
9104
// compress next vector of 8 chars (if any)
9107
testl(result, 0x00000008); // check if there's a block of 8 chars to compress
9108
jccb(Assembler::zero, copy_tail_sse);
9110
pxor(tmp3Reg, tmp3Reg);
9112
movdqu(tmp2Reg, Address(src, 0));
9113
ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
9114
jccb(Assembler::notZero, reset_for_copy_tail);
9115
packuswb(tmp2Reg, tmp3Reg); // only LATIN1 chars; compress each to 1 byte
9116
movq(Address(dst, 0), tmp2Reg);
9119
jmpb(copy_tail_sse);
9121
bind(reset_for_copy_tail);
9123
andl(tmp5, 0x0000000f);
9124
lea(src, Address(src, tmp5, Address::times_2));
9125
lea(dst, Address(dst, tmp5, Address::times_1));
9127
jmpb(copy_chars_loop);
9129
bind(copy_tail_sse);
9131
andl(len, 0x00000007); // tail count (in chars)
9133
// compress 1 char per iter
9136
jccb(Assembler::zero, done);
9137
lea(src, Address(src, len, Address::times_2));
9138
lea(dst, Address(dst, len, Address::times_1));
9141
bind(copy_chars_loop);
9142
load_unsigned_short(tmp5, Address(src, len, Address::times_2));
9143
testl(tmp5, 0xff00); // check if Unicode char
9144
jccb(Assembler::notZero, reset_sp);
9145
movb(Address(dst, len, Address::times_1), tmp5); // ASCII char; compress to 1 byte
9147
jccb(Assembler::notZero, copy_chars_loop);
9149
// add len then return (len will be zero if compress succeeded, otherwise negative)
9156
// Inflate byte[] array to char[].
9157
// ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
9158
// @IntrinsicCandidate
9159
// private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
9160
// for (int i = 0; i < len; i++) {
9161
// dst[dstOff++] = (char)(src[srcOff++] & 0xff);
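//
// The vector loops below perform the same widening in bulk with zero-extending loads
// (pmovzxbw and its wider forms), 32, 16 or 8 elements per iteration; only the tail is
// handled by the byte-at-a-time loop at the end. One vector step is equivalent to:
//
//   for (int j = 0; j < k; j++) {      // k = 32, 16 or 8 depending on the path taken
//     dst[j] = (jchar)(src[j] & 0xff);
//   }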
9164
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
9165
XMMRegister tmp1, Register tmp2, KRegister mask) {
9166
Label copy_chars_loop, done, below_threshold, avx3_threshold;
9172
// rsi holds start addr of source byte[] to be inflated
9173
// rdi holds start addr of destination char[]
9175
assert_different_registers(src, dst, len, tmp2);
9177
if ((UseAVX > 2) && // AVX512
9178
VM_Version::supports_avx512vlbw() &&
9179
VM_Version::supports_bmi2()) {
9181
Label copy_32_loop, copy_tail;
9182
Register tmp3_aliased = len;
9184
// if the length of the string is less than 16, handle it the old-fashioned way
9186
jcc(Assembler::zero, below_threshold);
9188
testl(len, -1 * AVX3Threshold);
9189
jcc(Assembler::zero, avx3_threshold);
9191
// In order to use only one arithmetic operation for the main loop we use
9192
// this pre-calculation
9193
andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
9194
andl(len, -32); // vector count
9195
jccb(Assembler::zero, copy_tail);
9197
lea(src, Address(src, len, Address::times_1));
9198
lea(dst, Address(dst, len, Address::times_2));
9202
// inflate 32 chars per iter
9204
vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
9205
evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
9207
jcc(Assembler::notZero, copy_32_loop);
9210
// bail out when there is nothing to be done
9211
testl(tmp2, -1); // we don't destroy the contents of tmp2 here
9212
jcc(Assembler::zero, done);
9214
// ~(~0 << length), where length is the # of remaining elements to process
9215
movl(tmp3_aliased, -1);
9216
shlxl(tmp3_aliased, tmp3_aliased, tmp2);
9218
kmovdl(mask, tmp3_aliased);
9219
evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
9220
evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);
9223
bind(avx3_threshold);
9225
if (UseSSE42Intrinsics) {
9226
Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
9229
andl(tmp2, (16 - 1));
9231
jccb(Assembler::zero, copy_new_tail);
9233
andl(tmp2, 0x00000007); // tail count (in chars)
9234
andl(len, 0xfffffff8); // vector count (in chars)
9235
jccb(Assembler::zero, copy_tail);
9238
// vectored inflation
9239
lea(src, Address(src, len, Address::times_1));
9240
lea(dst, Address(dst, len, Address::times_2));
9245
vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
9246
vmovdqu(Address(dst, len, Address::times_2), tmp1);
9248
jcc(Assembler::notZero, copy_16_loop);
9250
bind(below_threshold);
9251
bind(copy_new_tail);
9253
andl(tmp2, 0x00000007);
9254
andl(len, 0xFFFFFFF8);
9255
jccb(Assembler::zero, copy_tail);
9257
pmovzxbw(tmp1, Address(src, 0));
9258
movdqu(Address(dst, 0), tmp1);
9262
jmp(copy_tail, true);
9265
// inflate 8 chars per iter
9267
pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words
9268
movdqu(Address(dst, len, Address::times_2), tmp1);
9270
jcc(Assembler::notZero, copy_8_loop);
9276
jccb(Assembler::less, copy_bytes);
9278
movdl(tmp1, Address(src, 0)); // load 4 byte chars
9279
pmovzxbw(tmp1, tmp1);
9280
movq(Address(dst, 0), tmp1);
9287
bind(below_threshold);
9291
jccb(Assembler::zero, done);
9292
lea(src, Address(src, len, Address::times_1));
9293
lea(dst, Address(dst, len, Address::times_2));
9296
// inflate 1 char per iter
9297
bind(copy_chars_loop);
9298
load_unsigned_byte(tmp2, Address(src, len, Address::times_1)); // load byte char
9299
movw(Address(dst, len, Address::times_2), tmp2); // inflate byte char to word
9301
jcc(Assembler::notZero, copy_chars_loop);
9307
void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
9311
evmovdqub(dst, kmask, src, merge, vector_len);
9315
evmovdquw(dst, kmask, src, merge, vector_len);
9319
evmovdqul(dst, kmask, src, merge, vector_len);
9323
evmovdquq(dst, kmask, src, merge, vector_len);
9326
fatal("Unexpected type argument %s", type2name(type));
9331
void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
9335
evmovdqub(dst, kmask, src, merge, vector_len);
9339
evmovdquw(dst, kmask, src, merge, vector_len);
9343
evmovdqul(dst, kmask, src, merge, vector_len);
9347
evmovdquq(dst, kmask, src, merge, vector_len);
9350
fatal("Unexpected type argument %s", type2name(type));
9355
void MacroAssembler::knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp, Register rtmp) {
9361
kandbl(dst, ktmp, dst);
9367
kandbl(dst, ktmp, dst);
9382
fatal("Unexpected vector length %d", masklen);
9387
void MacroAssembler::kand(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9391
kandbl(dst, src1, src2);
9395
kandwl(dst, src1, src2);
9399
kanddl(dst, src1, src2);
9403
kandql(dst, src1, src2);
9406
fatal("Unexpected type argument %s", type2name(type));
9411
void MacroAssembler::kor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9415
korbl(dst, src1, src2);
9419
korwl(dst, src1, src2);
9423
kordl(dst, src1, src2);
9427
korql(dst, src1, src2);
9430
fatal("Unexpected type argument %s", type2name(type));
9435
void MacroAssembler::kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9439
kxorbl(dst, src1, src2);
9443
kxorwl(dst, src1, src2);
9447
kxordl(dst, src1, src2);
9451
kxorql(dst, src1, src2);
9454
fatal("Unexpected type argument %s", type2name(type));
9459
void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9463
evpermb(dst, mask, nds, src, merge, vector_len); break;
9466
evpermw(dst, mask, nds, src, merge, vector_len); break;
9469
evpermd(dst, mask, nds, src, merge, vector_len); break;
9472
evpermq(dst, mask, nds, src, merge, vector_len); break;
9474
fatal("Unexpected type argument %s", type2name(type)); break;
9478
void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9482
evpermb(dst, mask, nds, src, merge, vector_len); break;
9485
evpermw(dst, mask, nds, src, merge, vector_len); break;
9488
evpermd(dst, mask, nds, src, merge, vector_len); break;
9491
evpermq(dst, mask, nds, src, merge, vector_len); break;
9493
fatal("Unexpected type argument %s", type2name(type)); break;
9497
void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9500
evpminsb(dst, mask, nds, src, merge, vector_len); break;
9502
evpminsw(dst, mask, nds, src, merge, vector_len); break;
9504
evpminsd(dst, mask, nds, src, merge, vector_len); break;
9506
evpminsq(dst, mask, nds, src, merge, vector_len); break;
9508
fatal("Unexpected type argument %s", type2name(type)); break;
9512
void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9515
evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
9517
evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
9519
evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
9521
evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
9523
fatal("Unexpected type argument %s", type2name(type)); break;
9527
void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9530
evpminsb(dst, mask, nds, src, merge, vector_len); break;
9532
evpminsw(dst, mask, nds, src, merge, vector_len); break;
9534
evpminsd(dst, mask, nds, src, merge, vector_len); break;
9536
evpminsq(dst, mask, nds, src, merge, vector_len); break;
9538
fatal("Unexpected type argument %s", type2name(type)); break;
9542
void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9545
evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
9547
evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
9549
evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
9551
evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
9553
fatal("Unexpected type argument %s", type2name(type)); break;
9557
void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9560
evpxord(dst, mask, nds, src, merge, vector_len); break;
9562
evpxorq(dst, mask, nds, src, merge, vector_len); break;
9564
fatal("Unexpected type argument %s", type2name(type)); break;
9568
void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9571
evpxord(dst, mask, nds, src, merge, vector_len); break;
9573
evpxorq(dst, mask, nds, src, merge, vector_len); break;
9575
fatal("Unexpected type argument %s", type2name(type)); break;
9579
void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9582
Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
9584
evporq(dst, mask, nds, src, merge, vector_len); break;
9586
fatal("Unexpected type argument %s", type2name(type)); break;
9590
void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9593
Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
9595
evporq(dst, mask, nds, src, merge, vector_len); break;
9597
fatal("Unexpected type argument %s", type2name(type)); break;
9601
void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9604
evpandd(dst, mask, nds, src, merge, vector_len); break;
9606
evpandq(dst, mask, nds, src, merge, vector_len); break;
9608
fatal("Unexpected type argument %s", type2name(type)); break;
9612
void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9615
evpandd(dst, mask, nds, src, merge, vector_len); break;
9617
evpandq(dst, mask, nds, src, merge, vector_len); break;
9619
fatal("Unexpected type argument %s", type2name(type)); break;
9623
void MacroAssembler::kortest(uint masklen, KRegister src1, KRegister src2) {
9626
kortestbl(src1, src2);
9629
kortestwl(src1, src2);
9632
kortestdl(src1, src2);
9635
kortestql(src1, src2);
9638
fatal("Unexpected mask length %d", masklen);
9644
void MacroAssembler::ktest(uint masklen, KRegister src1, KRegister src2) {
9647
ktestbl(src1, src2);
9650
ktestwl(src1, src2);
9653
ktestdl(src1, src2);
9656
ktestql(src1, src2);
9659
fatal("Unexpected mask length %d", masklen);
9664
void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
9667
evprold(dst, mask, src, shift, merge, vlen_enc); break;
9669
evprolq(dst, mask, src, shift, merge, vlen_enc); break;
9671
fatal("Unexpected type argument %s", type2name(type)); break;
9676
void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
9679
evprord(dst, mask, src, shift, merge, vlen_enc); break;
9681
evprorq(dst, mask, src, shift, merge, vlen_enc); break;
9683
fatal("Unexpected type argument %s", type2name(type)); break;
9687
void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
9690
evprolvd(dst, mask, src1, src2, merge, vlen_enc); break;
9692
evprolvq(dst, mask, src1, src2, merge, vlen_enc); break;
9694
fatal("Unexpected type argument %s", type2name(type)); break;
9698
void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
9701
evprorvd(dst, mask, src1, src2, merge, vlen_enc); break;
9703
evprorvq(dst, mask, src1, src2, merge, vlen_enc); break;
9705
fatal("Unexpected type argument %s", type2name(type)); break;
9709
void MacroAssembler::evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9710
assert(rscratch != noreg || always_reachable(src), "missing");
9712
if (reachable(src)) {
9713
evpandq(dst, nds, as_Address(src), vector_len);
9716
evpandq(dst, nds, Address(rscratch, 0), vector_len);
9720
void MacroAssembler::evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
9721
assert(rscratch != noreg || always_reachable(src), "missing");
9723
if (reachable(src)) {
9724
Assembler::evpaddq(dst, mask, nds, as_Address(src), merge, vector_len);
9727
Assembler::evpaddq(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
9731
void MacroAssembler::evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9732
assert(rscratch != noreg || always_reachable(src), "missing");
9734
if (reachable(src)) {
9735
evporq(dst, nds, as_Address(src), vector_len);
9738
evporq(dst, nds, Address(rscratch, 0), vector_len);
9742
void MacroAssembler::vpshufb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9743
assert(rscratch != noreg || always_reachable(src), "missing");
9745
if (reachable(src)) {
9746
vpshufb(dst, nds, as_Address(src), vector_len);
9749
vpshufb(dst, nds, Address(rscratch, 0), vector_len);
9753
void MacroAssembler::vpor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9754
assert(rscratch != noreg || always_reachable(src), "missing");
9756
if (reachable(src)) {
9757
Assembler::vpor(dst, nds, as_Address(src), vector_len);
9760
Assembler::vpor(dst, nds, Address(rscratch, 0), vector_len);
9764
void MacroAssembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch) {
9765
assert(rscratch != noreg || always_reachable(src3), "missing");
9767
if (reachable(src3)) {
9768
vpternlogq(dst, imm8, src2, as_Address(src3), vector_len);
9770
lea(rscratch, src3);
9771
vpternlogq(dst, imm8, src2, Address(rscratch, 0), vector_len);
9775
#if COMPILER2_OR_JVMCI
9777
void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
9778
Register length, Register temp, int vec_enc) {
9779
// Computing mask for predicated vector store.
9781
bzhiq(temp, temp, length);
9783
evmovdqu(bt, mask, dst, xmm, true, vec_enc);
9786
// Set memory operation for length "less than" 64 bytes.
9787
void MacroAssembler::fill64_masked(uint shift, Register dst, int disp,
9788
XMMRegister xmm, KRegister mask, Register length,
9789
Register temp, bool use64byteVector) {
9790
assert(MaxVectorSize >= 32, "vector length should be >= 32");
9791
const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
9792
if (!use64byteVector) {
9793
fill32(dst, disp, xmm);
9794
subptr(length, 32 >> shift);
9795
fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp);
9797
assert(MaxVectorSize == 64, "vector length != 64");
9798
fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit);
9803
void MacroAssembler::fill32_masked(uint shift, Register dst, int disp,
9804
XMMRegister xmm, KRegister mask, Register length,
9806
assert(MaxVectorSize >= 32, "vector length should be >= 32");
9807
const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
9808
fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit);
9812
void MacroAssembler::fill32(Address dst, XMMRegister xmm) {
9813
assert(MaxVectorSize >= 32, "vector length should be >= 32");
9817
void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) {
9818
fill32(Address(dst, disp), xmm);
9821
void MacroAssembler::fill64(Address dst, XMMRegister xmm, bool use64byteVector) {
9822
assert(MaxVectorSize >= 32, "vector length should be >= 32");
9823
if (!use64byteVector) {
9825
fill32(dst.plus_disp(32), xmm);
9827
evmovdquq(dst, xmm, Assembler::AVX_512bit);
9831
void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
9832
fill64(Address(dst, disp), xmm, use64byteVector);
9836
void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
9837
Register count, Register rtmp, XMMRegister xtmp) {
9840
Label L_fill_64_bytes;
9841
Label L_fill_96_bytes;
9842
Label L_fill_128_bytes;
9843
Label L_fill_128_bytes_loop;
9844
Label L_fill_128_loop_header;
9845
Label L_fill_128_bytes_loop_header;
9846
Label L_fill_128_bytes_loop_pre_header;
9847
Label L_fill_zmm_sequence;
9850
int avx3threshold = VM_Version::avx3_threshold();
9852
case T_BYTE: shift = 0;
9854
case T_SHORT: shift = 1;
9856
case T_INT: shift = 2;
9858
/* Uncomment when LONG fill stubs are supported.
9859
case T_LONG: shift = 3;
9863
fatal("Unhandled type: %s\n", type2name(type));
9866
if ((avx3threshold != 0) || (MaxVectorSize == 32)) {
9868
if (MaxVectorSize == 64) {
9869
cmpq(count, avx3threshold >> shift);
9870
jcc(Assembler::greater, L_fill_zmm_sequence);
9873
evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);
9877
cmpq(count, 32 >> shift);
9878
jccb(Assembler::greater, L_fill_64_bytes);
9879
fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
9882
bind(L_fill_64_bytes);
9883
cmpq(count, 64 >> shift);
9884
jccb(Assembler::greater, L_fill_96_bytes);
9885
fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
9888
bind(L_fill_96_bytes);
9889
cmpq(count, 96 >> shift);
9890
jccb(Assembler::greater, L_fill_128_bytes);
9891
fill64(to, 0, xtmp);
9892
subq(count, 64 >> shift);
9893
fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
9896
bind(L_fill_128_bytes);
9897
cmpq(count, 128 >> shift);
9898
jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
9899
fill64(to, 0, xtmp);
9900
fill32(to, 64, xtmp);
9901
subq(count, 96 >> shift);
9902
fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
9905
bind(L_fill_128_bytes_loop_pre_header);
9909
jccb(Assembler::zero, L_fill_128_bytes_loop_header);
9913
bzhiq(r8, r8, rtmp);
9915
evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_256bit);
9921
cmpq(count, 128 >> shift);
9922
jcc(Assembler::less, L_fill_start);
9924
bind(L_fill_128_bytes_loop_header);
9925
subq(count, 128 >> shift);
9928
bind(L_fill_128_bytes_loop);
9929
fill64(to, 0, xtmp);
9930
fill64(to, 64, xtmp);
9932
subq(count, 128 >> shift);
9933
jccb(Assembler::greaterEqual, L_fill_128_bytes_loop);
9935
addq(count, 128 >> shift);
9936
jcc(Assembler::zero, L_exit);
9940
if (MaxVectorSize == 64) {
9941
// Sequence using 64 byte ZMM register.
9942
Label L_fill_128_bytes_zmm;
9943
Label L_fill_192_bytes_zmm;
9944
Label L_fill_192_bytes_loop_zmm;
9945
Label L_fill_192_bytes_loop_header_zmm;
9946
Label L_fill_192_bytes_loop_pre_header_zmm;
9947
Label L_fill_start_zmm_sequence;
9949
bind(L_fill_zmm_sequence);
9950
evpbroadcast(type, xtmp, value, Assembler::AVX_512bit);
9952
bind(L_fill_start_zmm_sequence);
9953
cmpq(count, 64 >> shift);
9954
jccb(Assembler::greater, L_fill_128_bytes_zmm);
9955
fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
9958
bind(L_fill_128_bytes_zmm);
9959
cmpq(count, 128 >> shift);
9960
jccb(Assembler::greater, L_fill_192_bytes_zmm);
9961
fill64(to, 0, xtmp, true);
9962
subq(count, 64 >> shift);
9963
fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
9966
bind(L_fill_192_bytes_zmm);
9967
cmpq(count, 192 >> shift);
9968
jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
9969
fill64(to, 0, xtmp, true);
9970
fill64(to, 64, xtmp, true);
9971
subq(count, 128 >> shift);
9972
fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
9975
bind(L_fill_192_bytes_loop_pre_header_zmm);
9979
jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm);
9983
bzhiq(r8, r8, rtmp);
9985
evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_512bit);
9991
cmpq(count, 192 >> shift);
9992
jcc(Assembler::less, L_fill_start_zmm_sequence);
9994
bind(L_fill_192_bytes_loop_header_zmm);
9995
subq(count, 192 >> shift);
9998
bind(L_fill_192_bytes_loop_zmm);
9999
fill64(to, 0, xtmp, true);
10000
fill64(to, 64, xtmp, true);
10001
fill64(to, 128, xtmp, true);
10003
subq(count, 192 >> shift);
10004
jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm);
10006
addq(count, 192 >> shift);
10007
jcc(Assembler::zero, L_exit);
10008
jmp(L_fill_start_zmm_sequence);
10013
#endif //COMPILER2_OR_JVMCI
10017
void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
10019
cvttss2sil(dst, src);
10020
// Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
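// cvttss2si returns the "integer indefinite" value 0x80000000 for NaN and for values
// outside the int range, whereas the JLS requires (int) Float.NaN == 0 and saturation
// to Integer.MIN_VALUE / Integer.MAX_VALUE, so that sentinel sends us to the fixup stub.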
10021
cmpl(dst, 0x80000000); // float_sign_flip
10022
jccb(Assembler::notEqual, done);
10024
movflt(Address(rsp, 0), src);
10025
call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
10030
void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
10032
cvttsd2sil(dst, src);
10033
// Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
10034
cmpl(dst, 0x80000000); // float_sign_flip
10035
jccb(Assembler::notEqual, done);
10037
movdbl(Address(rsp, 0), src);
10038
call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
10043
void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
10045
cvttss2siq(dst, src);
10046
cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
10047
jccb(Assembler::notEqual, done);
10049
movflt(Address(rsp, 0), src);
10050
call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
10055
void MacroAssembler::round_float(Register dst, XMMRegister src, Register rtmp, Register rcx) {
10056
// The following code is a line-by-line assembly translation of the rounding algorithm.
// Please refer to the java.lang.Math.round(float) algorithm for details.
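// A C-level sketch of that algorithm, mirroring the JDK implementation this code
// translates (variable names are illustrative):
//
//   int32_t bits = float_to_raw_bits(src);
//   int32_t biased_exp = (bits & FloatConsts_EXP_BIT_MASK) >> (FloatConsts_SIGNIFICAND_WIDTH - 1);
//   int32_t shift = (FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS) - biased_exp;
//   if ((shift & MINUS_32) == 0) {                 // 0 <= shift < 32: round by bit twiddling
//     int32_t r = (bits & FloatConsts_SIGNIF_BIT_MASK) | (FloatConsts_SIGNIF_BIT_MASK + 1);
//     if (bits < 0) r = -r;
//     dst = ((r >> shift) + 1) >> 1;               // add half an ulp at the round position, truncate
//   } else {
//     dst = (int32_t) src;                         // out of range: plain f2i (the special case below)
//   }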
10058
const int32_t FloatConsts_EXP_BIT_MASK = 0x7F800000;
10059
const int32_t FloatConsts_SIGNIFICAND_WIDTH = 24;
10060
const int32_t FloatConsts_EXP_BIAS = 127;
10061
const int32_t FloatConsts_SIGNIF_BIT_MASK = 0x007FFFFF;
10062
const int32_t MINUS_32 = 0xFFFFFFE0;
10063
Label L_special_case, L_block1, L_exit;
10064
movl(rtmp, FloatConsts_EXP_BIT_MASK);
10067
sarl(dst, FloatConsts_SIGNIFICAND_WIDTH - 1);
10068
movl(rtmp, FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS);
10071
movl(dst, MINUS_32);
10073
jccb(Assembler::notEqual, L_special_case);
10075
andl(dst, FloatConsts_SIGNIF_BIT_MASK);
10076
orl(dst, FloatConsts_SIGNIF_BIT_MASK + 1);
10079
jccb(Assembler::greaterEqual, L_block1);
10086
bind(L_special_case);
10087
convert_f2i(dst, src);
10091
void MacroAssembler::round_double(Register dst, XMMRegister src, Register rtmp, Register rcx) {
10092
// The following code is a line-by-line assembly translation of the rounding algorithm.
// Please refer to the java.lang.Math.round(double) algorithm for details.
10094
const int64_t DoubleConsts_EXP_BIT_MASK = 0x7FF0000000000000L;
10095
const int64_t DoubleConsts_SIGNIFICAND_WIDTH = 53;
10096
const int64_t DoubleConsts_EXP_BIAS = 1023;
10097
const int64_t DoubleConsts_SIGNIF_BIT_MASK = 0x000FFFFFFFFFFFFFL;
10098
const int64_t MINUS_64 = 0xFFFFFFFFFFFFFFC0L;
10099
Label L_special_case, L_block1, L_exit;
10100
mov64(rtmp, DoubleConsts_EXP_BIT_MASK);
10103
sarq(dst, DoubleConsts_SIGNIFICAND_WIDTH - 1);
10104
mov64(rtmp, DoubleConsts_SIGNIFICAND_WIDTH - 2 + DoubleConsts_EXP_BIAS);
10107
mov64(dst, MINUS_64);
10109
jccb(Assembler::notEqual, L_special_case);
10111
mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK);
10113
mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK + 1);
10117
jccb(Assembler::greaterEqual, L_block1);
10124
bind(L_special_case);
10125
convert_d2l(dst, src);
10129
void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
10131
cvttsd2siq(dst, src);
10132
cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
10133
jccb(Assembler::notEqual, done);
10135
movdbl(Address(rsp, 0), src);
10136
call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
10141
void MacroAssembler::cache_wb(Address line)
10143
// 64-bit CPUs always support clflush
10144
assert(VM_Version::supports_clflush(), "clflush should be available");
10145
bool optimized = VM_Version::supports_clflushopt();
10146
bool no_evict = VM_Version::supports_clwb();
10148
// prefer clwb (writeback without evict) otherwise
10149
// prefer clflushopt (potentially parallel writeback with evict)
10150
// otherwise fallback on clflush (serial writeback with evict)
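// A sketch of that preference order (mnemonics only, not the exact control flow below):
//
//   if (no_evict)        clwb(line);        // CLWB: write back, keep the line cached
//   else if (optimized)  clflushopt(line);  // CLFLUSHOPT: weakly ordered flush
//   else                 clflush(line);     // CLFLUSH: strongly ordered flush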
10159
// no need for fence when using CLFLUSH
10164
void MacroAssembler::cache_wbsync(bool is_pre)
10166
assert(VM_Version::supports_clflush(), "clflush should be available");
10167
bool optimized = VM_Version::supports_clflushopt();
10168
bool no_evict = VM_Version::supports_clwb();
10170
// pick the correct implementation
10172
if (!is_pre && (optimized || no_evict)) {
10173
// need an sfence for a post flush when using clflushopt or clwb,
// otherwise no need for any synchronization
10182
Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
  switch (cond) {
    // Note some conditions are synonyms for others
    case Assembler::zero:         return Assembler::notZero;
    case Assembler::notZero:      return Assembler::zero;
    case Assembler::less:         return Assembler::greaterEqual;
    case Assembler::lessEqual:    return Assembler::greater;
    case Assembler::greater:      return Assembler::lessEqual;
    case Assembler::greaterEqual: return Assembler::less;
    case Assembler::below:        return Assembler::aboveEqual;
    case Assembler::belowEqual:   return Assembler::above;
    case Assembler::above:        return Assembler::belowEqual;
    case Assembler::aboveEqual:   return Assembler::below;
    case Assembler::overflow:     return Assembler::noOverflow;
    case Assembler::noOverflow:   return Assembler::overflow;
    case Assembler::negative:     return Assembler::positive;
    case Assembler::positive:     return Assembler::negative;
    case Assembler::parity:       return Assembler::noParity;
    case Assembler::noParity:     return Assembler::parity;
  }
  ShouldNotReachHere(); return Assembler::overflow;
}

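// Code emitted inside a SkipIfEqual scope is guarded by a runtime test of a
// bool flag: the constructor emits a compare of the byte at flag_addr against
// value and a branch over the guarded code when they are equal; the destructor
// binds the branch target.  Typical (hypothetical) usage:
//
//   {
//     SkipIfEqual skip(this, &SomeBoolFlag, false, rscratch1);
//     ... code here is executed only when SomeBoolFlag is true ...
//   }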
SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value, Register rscratch) {
  _masm = masm;
  _masm->cmp8(ExternalAddress((address)flag_addr), value, rscratch);
  _masm->jcc(Assembler::equal, _label);
}

SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}

// 32-bit Windows has its own fast-path implementation
#if !defined(WIN32) || defined(_LP64)

// This is simply a call to Thread::current()
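// Thread::current() returns the current thread in rax, which is why rax is
// treated specially below; the surrounding pushes/pops preserve the argument
// registers that the C call may clobber.  (On x86_64 the current JavaThread is
// normally cached in r15_thread, so this helper is rarely needed there.)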
void MacroAssembler::get_thread(Register thread) {
  if (thread != rax) {
  LP64_ONLY(push(rdi);)
  LP64_ONLY(push(rsi);)
  MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
  LP64_ONLY(pop(rsi);)
  LP64_ONLY(pop(rdi);)
  if (thread != rax) {

#endif // !WIN32 || _LP64

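// Debug helper: verifies that sp (optionally offset by bias bytes) is aligned
// to 2 * wordSize, i.e. 16 bytes on x86_64.  On failure the generated code
// emits msg as a block comment and stops.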
void MacroAssembler::check_stack_alignment(Register sp, const char* msg, unsigned bias, Register tmp) {
  Label L_stack_ok;
  testptr(sp, 2 * wordSize - 1);
  // lea(tmp, Address(rsp, bias));
  testptr(tmp, 2 * wordSize - 1);
  jcc(Assembler::equal, L_stack_ok);
  block_comment(msg);

// Implements lightweight-locking.
//
// obj: the object to be locked
// thread: the thread which attempts to lock obj
// tmp: a temporary register
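//
// Note on the lock-stack (background for the reader, see lockStack.hpp): each
// JavaThread carries a small array of oops; lock_stack_top_offset() is the byte
// offset of the current top within the thread, so pushing an entry stores the
// oop at [thread + top] and bumps top by oopSize.  A recursive acquire is
// represented by the same oop appearing in consecutive slots.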
void MacroAssembler::lightweight_lock(Register obj, Register reg_rax, Register thread, Register tmp, Label& slow) {
  assert(reg_rax == rax, "");
  assert_different_registers(obj, reg_rax, thread, tmp);

  Label push;
  const Register top = tmp;

  // Preload the markWord. It is important that this is the first
  // instruction emitted as it is part of C1's null check semantics.
  movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));

  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

  // Check if the lock-stack is full.
  cmpl(top, LockStack::end_offset());
  jcc(Assembler::greaterEqual, slow);

  // Check for recursion.
  cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
  jcc(Assembler::equal, push);

  // Check header for monitor (0b10).
  testptr(reg_rax, markWord::monitor_value);
  jcc(Assembler::notZero, slow);

  // Try to lock. Transition lock bits 0b01 => 0b00
  movptr(tmp, reg_rax);
  andptr(tmp, ~(int32_t)markWord::unlocked_value);
  orptr(reg_rax, markWord::unlocked_value);
  lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
  jcc(Assembler::notEqual, slow);

  // Restore top, CAS clobbers register.
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

  bind(push);
  // After successful lock, push object on lock-stack.
  movptr(Address(thread, top), obj);
  incrementl(top, oopSize);
  movl(Address(thread, JavaThread::lock_stack_top_offset()), top);
}

// Implements lightweight-unlocking.
//
// obj: the object to be unlocked
// thread: the thread
// tmp: a temporary register
//
// x86_32 Note: reg_rax and thread may alias each other due to limited register
//              availability.
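//
// The unlock path mirrors lightweight_lock(): the oop must be the current top
// of the thread's lock-stack; it is popped, and if the slot below still holds
// the same oop the exit was a recursive one and nothing more needs to be done.
// Otherwise the mark word is CAS-ed back from 0b00 to 0b01, falling back to the
// slow path (and restoring the lock-stack) if a monitor has been inflated in
// the meantime.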
void MacroAssembler::lightweight_unlock(Register obj, Register reg_rax, Register thread, Register tmp, Label& slow) {
  assert(reg_rax == rax, "");
  assert_different_registers(obj, reg_rax, tmp);
  LP64_ONLY(assert_different_registers(obj, reg_rax, thread, tmp);)

  Label unlocked, push_and_slow;
  const Register top = tmp;

  // Check if obj is top of lock-stack.
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
  cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
  jcc(Assembler::notEqual, slow);

  DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
  subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

  // Check if recursive.
  cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
  jcc(Assembler::equal, unlocked);

  // Not recursive. Check header for monitor (0b10).
  movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));
  testptr(reg_rax, markWord::monitor_value);
  jcc(Assembler::notZero, push_and_slow);

  // Check header not unlocked (0b01).
  Label not_unlocked;
  testptr(reg_rax, markWord::unlocked_value);
  jcc(Assembler::zero, not_unlocked);
  stop("lightweight_unlock already unlocked");
  bind(not_unlocked);

  // Try to unlock. Transition lock bits 0b00 => 0b01
  movptr(tmp, reg_rax);
  orptr(tmp, markWord::unlocked_value);
  lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
  jcc(Assembler::equal, unlocked);

  bind(push_and_slow);
  // Restore lock-stack and handle the unlock in runtime.
  if (thread == reg_rax) {
    // On x86_32 we may lose the thread.
    get_thread(thread);
  }
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
  movptr(Address(thread, top), obj);
  addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
  jmp(slow);

  bind(unlocked);
}

// Saves legacy GPRs state on stack.
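// The frame laid out below reserves 16 slots, one per general purpose register
// in encoding order (rax = slot 15 down to r15 = slot 0); slot 11, which would
// correspond to rsp, is intentionally left unused.  restore_legacy_gprs() reads
// the registers back from the same offsets.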
void MacroAssembler::save_legacy_gprs() {
  subq(rsp, 16 * wordSize);
  movq(Address(rsp, 15 * wordSize), rax);
  movq(Address(rsp, 14 * wordSize), rcx);
  movq(Address(rsp, 13 * wordSize), rdx);
  movq(Address(rsp, 12 * wordSize), rbx);
  movq(Address(rsp, 10 * wordSize), rbp);
  movq(Address(rsp, 9 * wordSize), rsi);
  movq(Address(rsp, 8 * wordSize), rdi);
  movq(Address(rsp, 7 * wordSize), r8);
  movq(Address(rsp, 6 * wordSize), r9);
  movq(Address(rsp, 5 * wordSize), r10);
  movq(Address(rsp, 4 * wordSize), r11);
  movq(Address(rsp, 3 * wordSize), r12);
  movq(Address(rsp, 2 * wordSize), r13);
  movq(Address(rsp, wordSize), r14);
  movq(Address(rsp, 0), r15);
}

// Restores legacy GPRs state from stack.
void MacroAssembler::restore_legacy_gprs() {
  movq(r15, Address(rsp, 0));
  movq(r14, Address(rsp, wordSize));
  movq(r13, Address(rsp, 2 * wordSize));
  movq(r12, Address(rsp, 3 * wordSize));
  movq(r11, Address(rsp, 4 * wordSize));
  movq(r10, Address(rsp, 5 * wordSize));
  movq(r9, Address(rsp, 6 * wordSize));
  movq(r8, Address(rsp, 7 * wordSize));
  movq(rdi, Address(rsp, 8 * wordSize));
  movq(rsi, Address(rsp, 9 * wordSize));
  movq(rbp, Address(rsp, 10 * wordSize));
  movq(rbx, Address(rsp, 12 * wordSize));
  movq(rdx, Address(rsp, 13 * wordSize));
  movq(rcx, Address(rsp, 14 * wordSize));
  movq(rax, Address(rsp, 15 * wordSize));
  addq(rsp, 16 * wordSize);
}