jdk

Форк
0
/
macroAssembler_x86.cpp 
10428 строк · 331.7 Кб
1
/*
2
 * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
3
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4
 *
5
 * This code is free software; you can redistribute it and/or modify it
6
 * under the terms of the GNU General Public License version 2 only, as
7
 * published by the Free Software Foundation.
8
 *
9
 * This code is distributed in the hope that it will be useful, but WITHOUT
10
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12
 * version 2 for more details (a copy is included in the LICENSE file that
13
 * accompanied this code).
14
 *
15
 * You should have received a copy of the GNU General Public License version
16
 * 2 along with this work; if not, write to the Free Software Foundation,
17
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18
 *
19
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20
 * or visit www.oracle.com if you need additional information or have any
21
 * questions.
22
 *
23
 */
24

25
#include "precompiled.hpp"
26
#include "asm/assembler.hpp"
27
#include "asm/assembler.inline.hpp"
28
#include "code/compiledIC.hpp"
29
#include "compiler/compiler_globals.hpp"
30
#include "compiler/disassembler.hpp"
31
#include "crc32c.h"
32
#include "gc/shared/barrierSet.hpp"
33
#include "gc/shared/barrierSetAssembler.hpp"
34
#include "gc/shared/collectedHeap.inline.hpp"
35
#include "gc/shared/tlab_globals.hpp"
36
#include "interpreter/bytecodeHistogram.hpp"
37
#include "interpreter/interpreter.hpp"
38
#include "jvm.h"
39
#include "memory/resourceArea.hpp"
40
#include "memory/universe.hpp"
41
#include "oops/accessDecorators.hpp"
42
#include "oops/compressedKlass.inline.hpp"
43
#include "oops/compressedOops.inline.hpp"
44
#include "oops/klass.inline.hpp"
45
#include "prims/methodHandles.hpp"
46
#include "runtime/continuation.hpp"
47
#include "runtime/interfaceSupport.inline.hpp"
48
#include "runtime/javaThread.hpp"
49
#include "runtime/jniHandles.hpp"
50
#include "runtime/objectMonitor.hpp"
51
#include "runtime/os.hpp"
52
#include "runtime/safepoint.hpp"
53
#include "runtime/safepointMechanism.hpp"
54
#include "runtime/sharedRuntime.hpp"
55
#include "runtime/stubRoutines.hpp"
56
#include "utilities/checkedCast.hpp"
57
#include "utilities/macros.hpp"
58

59
#ifdef PRODUCT
60
#define BLOCK_COMMENT(str) /* nothing */
61
#define STOP(error) stop(error)
62
#else
63
#define BLOCK_COMMENT(str) block_comment(str)
64
#define STOP(error) block_comment(error); stop(error)
65
#endif
66

67
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
68

69
#ifdef ASSERT
70
bool AbstractAssembler::pd_check_instruction_mark() { return true; }
71
#endif
72

73
static const Assembler::Condition reverse[] = {
74
    Assembler::noOverflow     /* overflow      = 0x0 */ ,
75
    Assembler::overflow       /* noOverflow    = 0x1 */ ,
76
    Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
77
    Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
78
    Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
79
    Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
80
    Assembler::above          /* belowEqual    = 0x6 */ ,
81
    Assembler::belowEqual     /* above         = 0x7 */ ,
82
    Assembler::positive       /* negative      = 0x8 */ ,
83
    Assembler::negative       /* positive      = 0x9 */ ,
84
    Assembler::noParity       /* parity        = 0xa */ ,
85
    Assembler::parity         /* noParity      = 0xb */ ,
86
    Assembler::greaterEqual   /* less          = 0xc */ ,
87
    Assembler::less           /* greaterEqual  = 0xd */ ,
88
    Assembler::greater        /* lessEqual     = 0xe */ ,
89
    Assembler::lessEqual      /* greater       = 0xf, */
90

91
};
92

93

94
// Implementation of MacroAssembler
95

96
// First all the versions that have distinct versions depending on 32/64 bit
97
// Unless the difference is trivial (1 line or so).
98

99
#ifndef _LP64
100

101
// 32bit versions
102

103
Address MacroAssembler::as_Address(AddressLiteral adr) {
104
  return Address(adr.target(), adr.rspec());
105
}
106

107
Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) {
108
  assert(rscratch == noreg, "");
109
  return Address::make_array(adr);
110
}
111

112
void MacroAssembler::call_VM_leaf_base(address entry_point,
113
                                       int number_of_arguments) {
114
  call(RuntimeAddress(entry_point));
115
  increment(rsp, number_of_arguments * wordSize);
116
}
117

118
void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
119
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
120
}
121

122

123
void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
124
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
125
}
126

127
void MacroAssembler::cmpoop(Address src1, jobject obj) {
128
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
129
}
130

131
void MacroAssembler::cmpoop(Register src1, jobject obj, Register rscratch) {
132
  assert(rscratch == noreg, "redundant");
133
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
134
}
135

136
void MacroAssembler::extend_sign(Register hi, Register lo) {
137
  // According to Intel Doc. AP-526, "Integer Divide", p.18.
138
  if (VM_Version::is_P6() && hi == rdx && lo == rax) {
139
    cdql();
140
  } else {
141
    movl(hi, lo);
142
    sarl(hi, 31);
143
  }
144
}
145

146
void MacroAssembler::jC2(Register tmp, Label& L) {
147
  // set parity bit if FPU flag C2 is set (via rax)
148
  save_rax(tmp);
149
  fwait(); fnstsw_ax();
150
  sahf();
151
  restore_rax(tmp);
152
  // branch
153
  jcc(Assembler::parity, L);
154
}
155

156
void MacroAssembler::jnC2(Register tmp, Label& L) {
157
  // set parity bit if FPU flag C2 is set (via rax)
158
  save_rax(tmp);
159
  fwait(); fnstsw_ax();
160
  sahf();
161
  restore_rax(tmp);
162
  // branch
163
  jcc(Assembler::noParity, L);
164
}
165

166
// 32bit can do a case table jump in one instruction but we no longer allow the base
167
// to be installed in the Address class
168
void MacroAssembler::jump(ArrayAddress entry, Register rscratch) {
169
  assert(rscratch == noreg, "not needed");
170
  jmp(as_Address(entry, noreg));
171
}
172

173
// Note: y_lo will be destroyed
174
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
175
  // Long compare for Java (semantics as described in JVM spec.)
176
  Label high, low, done;
177

178
  cmpl(x_hi, y_hi);
179
  jcc(Assembler::less, low);
180
  jcc(Assembler::greater, high);
181
  // x_hi is the return register
182
  xorl(x_hi, x_hi);
183
  cmpl(x_lo, y_lo);
184
  jcc(Assembler::below, low);
185
  jcc(Assembler::equal, done);
186

187
  bind(high);
188
  xorl(x_hi, x_hi);
189
  increment(x_hi);
190
  jmp(done);
191

192
  bind(low);
193
  xorl(x_hi, x_hi);
194
  decrementl(x_hi);
195

196
  bind(done);
197
}
198

199
void MacroAssembler::lea(Register dst, AddressLiteral src) {
200
  mov_literal32(dst, (int32_t)src.target(), src.rspec());
201
}
202

203
void MacroAssembler::lea(Address dst, AddressLiteral adr, Register rscratch) {
204
  assert(rscratch == noreg, "not needed");
205

206
  // leal(dst, as_Address(adr));
207
  // see note in movl as to why we must use a move
208
  mov_literal32(dst, (int32_t)adr.target(), adr.rspec());
209
}
210

211
void MacroAssembler::leave() {
212
  mov(rsp, rbp);
213
  pop(rbp);
214
}
215

216
void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
217
  // Multiplication of two Java long values stored on the stack
218
  // as illustrated below. Result is in rdx:rax.
219
  //
220
  // rsp ---> [  ??  ] \               \
221
  //            ....    | y_rsp_offset  |
222
  //          [ y_lo ] /  (in bytes)    | x_rsp_offset
223
  //          [ y_hi ]                  | (in bytes)
224
  //            ....                    |
225
  //          [ x_lo ]                 /
226
  //          [ x_hi ]
227
  //            ....
228
  //
229
  // Basic idea: lo(result) = lo(x_lo * y_lo)
230
  //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
231
  Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
232
  Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
233
  Label quick;
234
  // load x_hi, y_hi and check if quick
235
  // multiplication is possible
236
  movl(rbx, x_hi);
237
  movl(rcx, y_hi);
238
  movl(rax, rbx);
239
  orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
240
  jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
241
  // do full multiplication
242
  // 1st step
243
  mull(y_lo);                                    // x_hi * y_lo
244
  movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
245
  // 2nd step
246
  movl(rax, x_lo);
247
  mull(rcx);                                     // x_lo * y_hi
248
  addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
249
  // 3rd step
250
  bind(quick);                                   // note: rbx, = 0 if quick multiply!
251
  movl(rax, x_lo);
252
  mull(y_lo);                                    // x_lo * y_lo
253
  addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
254
}
255

256
void MacroAssembler::lneg(Register hi, Register lo) {
257
  negl(lo);
258
  adcl(hi, 0);
259
  negl(hi);
260
}
261

262
void MacroAssembler::lshl(Register hi, Register lo) {
263
  // Java shift left long support (semantics as described in JVM spec., p.305)
264
  // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
265
  // shift value is in rcx !
266
  assert(hi != rcx, "must not use rcx");
267
  assert(lo != rcx, "must not use rcx");
268
  const Register s = rcx;                        // shift count
269
  const int      n = BitsPerWord;
270
  Label L;
271
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
272
  cmpl(s, n);                                    // if (s < n)
273
  jcc(Assembler::less, L);                       // else (s >= n)
274
  movl(hi, lo);                                  // x := x << n
275
  xorl(lo, lo);
276
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
277
  bind(L);                                       // s (mod n) < n
278
  shldl(hi, lo);                                 // x := x << s
279
  shll(lo);
280
}
281

282

283
void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
284
  // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
285
  // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
286
  assert(hi != rcx, "must not use rcx");
287
  assert(lo != rcx, "must not use rcx");
288
  const Register s = rcx;                        // shift count
289
  const int      n = BitsPerWord;
290
  Label L;
291
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
292
  cmpl(s, n);                                    // if (s < n)
293
  jcc(Assembler::less, L);                       // else (s >= n)
294
  movl(lo, hi);                                  // x := x >> n
295
  if (sign_extension) sarl(hi, 31);
296
  else                xorl(hi, hi);
297
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
298
  bind(L);                                       // s (mod n) < n
299
  shrdl(lo, hi);                                 // x := x >> s
300
  if (sign_extension) sarl(hi);
301
  else                shrl(hi);
302
}
303

304
void MacroAssembler::movoop(Register dst, jobject obj) {
305
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
306
}
307

308
void MacroAssembler::movoop(Address dst, jobject obj, Register rscratch) {
309
  assert(rscratch == noreg, "redundant");
310
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
311
}
312

313
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
314
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
315
}
316

317
void MacroAssembler::mov_metadata(Address dst, Metadata* obj, Register rscratch) {
318
  assert(rscratch == noreg, "redundant");
319
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
320
}
321

322
void MacroAssembler::movptr(Register dst, AddressLiteral src) {
323
  if (src.is_lval()) {
324
    mov_literal32(dst, (intptr_t)src.target(), src.rspec());
325
  } else {
326
    movl(dst, as_Address(src));
327
  }
328
}
329

330
void MacroAssembler::movptr(ArrayAddress dst, Register src, Register rscratch) {
331
  assert(rscratch == noreg, "redundant");
332
  movl(as_Address(dst, noreg), src);
333
}
334

335
void MacroAssembler::movptr(Register dst, ArrayAddress src) {
336
  movl(dst, as_Address(src, noreg));
337
}
338

339
void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) {
340
  assert(rscratch == noreg, "redundant");
341
  movl(dst, src);
342
}
343

344
void MacroAssembler::pushoop(jobject obj, Register rscratch) {
345
  assert(rscratch == noreg, "redundant");
346
  push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
347
}
348

349
void MacroAssembler::pushklass(Metadata* obj, Register rscratch) {
350
  assert(rscratch == noreg, "redundant");
351
  push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
352
}
353

354
void MacroAssembler::pushptr(AddressLiteral src, Register rscratch) {
355
  assert(rscratch == noreg, "redundant");
356
  if (src.is_lval()) {
357
    push_literal32((int32_t)src.target(), src.rspec());
358
  } else {
359
    pushl(as_Address(src));
360
  }
361
}
362

363
static void pass_arg0(MacroAssembler* masm, Register arg) {
364
  masm->push(arg);
365
}
366

367
static void pass_arg1(MacroAssembler* masm, Register arg) {
368
  masm->push(arg);
369
}
370

371
static void pass_arg2(MacroAssembler* masm, Register arg) {
372
  masm->push(arg);
373
}
374

375
static void pass_arg3(MacroAssembler* masm, Register arg) {
376
  masm->push(arg);
377
}
378

379
#ifndef PRODUCT
380
extern "C" void findpc(intptr_t x);
381
#endif
382

383
void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
384
  // In order to get locks to work, we need to fake a in_VM state
385
  JavaThread* thread = JavaThread::current();
386
  JavaThreadState saved_state = thread->thread_state();
387
  thread->set_thread_state(_thread_in_vm);
388
  if (ShowMessageBoxOnError) {
389
    JavaThread* thread = JavaThread::current();
390
    JavaThreadState saved_state = thread->thread_state();
391
    thread->set_thread_state(_thread_in_vm);
392
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
393
      ttyLocker ttyl;
394
      BytecodeCounter::print();
395
    }
396
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
397
    // This is the value of eip which points to where verify_oop will return.
398
    if (os::message_box(msg, "Execution stopped, print registers?")) {
399
      print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
400
      BREAKPOINT;
401
    }
402
  }
403
  fatal("DEBUG MESSAGE: %s", msg);
404
}
405

406
void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
407
  ttyLocker ttyl;
408
  DebuggingContext debugging{};
409
  tty->print_cr("eip = 0x%08x", eip);
410
#ifndef PRODUCT
411
  if ((WizardMode || Verbose) && PrintMiscellaneous) {
412
    tty->cr();
413
    findpc(eip);
414
    tty->cr();
415
  }
416
#endif
417
#define PRINT_REG(rax) \
418
  { tty->print("%s = ", #rax); os::print_location(tty, rax); }
419
  PRINT_REG(rax);
420
  PRINT_REG(rbx);
421
  PRINT_REG(rcx);
422
  PRINT_REG(rdx);
423
  PRINT_REG(rdi);
424
  PRINT_REG(rsi);
425
  PRINT_REG(rbp);
426
  PRINT_REG(rsp);
427
#undef PRINT_REG
428
  // Print some words near top of staack.
429
  int* dump_sp = (int*) rsp;
430
  for (int col1 = 0; col1 < 8; col1++) {
431
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
432
    os::print_location(tty, *dump_sp++);
433
  }
434
  for (int row = 0; row < 16; row++) {
435
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
436
    for (int col = 0; col < 8; col++) {
437
      tty->print(" 0x%08x", *dump_sp++);
438
    }
439
    tty->cr();
440
  }
441
  // Print some instructions around pc:
442
  Disassembler::decode((address)eip-64, (address)eip);
443
  tty->print_cr("--------");
444
  Disassembler::decode((address)eip, (address)eip+32);
445
}
446

447
void MacroAssembler::stop(const char* msg) {
448
  // push address of message
449
  ExternalAddress message((address)msg);
450
  pushptr(message.addr(), noreg);
451
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
452
  pusha();                                            // push registers
453
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
454
  hlt();
455
}
456

457
void MacroAssembler::warn(const char* msg) {
458
  push_CPU_state();
459

460
  // push address of message
461
  ExternalAddress message((address)msg);
462
  pushptr(message.addr(), noreg);
463

464
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
465
  addl(rsp, wordSize);       // discard argument
466
  pop_CPU_state();
467
}
468

469
void MacroAssembler::print_state() {
470
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
471
  pusha();                                            // push registers
472

473
  push_CPU_state();
474
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
475
  pop_CPU_state();
476

477
  popa();
478
  addl(rsp, wordSize);
479
}
480

481
#else // _LP64
482

483
// 64 bit versions
484

485
Address MacroAssembler::as_Address(AddressLiteral adr) {
486
  // amd64 always does this as a pc-rel
487
  // we can be absolute or disp based on the instruction type
488
  // jmp/call are displacements others are absolute
489
  assert(!adr.is_lval(), "must be rval");
490
  assert(reachable(adr), "must be");
491
  return Address(checked_cast<int32_t>(adr.target() - pc()), adr.target(), adr.reloc());
492

493
}
494

495
Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) {
496
  AddressLiteral base = adr.base();
497
  lea(rscratch, base);
498
  Address index = adr.index();
499
  assert(index._disp == 0, "must not have disp"); // maybe it can?
500
  Address array(rscratch, index._index, index._scale, index._disp);
501
  return array;
502
}
503

504
void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
505
  Label L, E;
506

507
#ifdef _WIN64
508
  // Windows always allocates space for it's register args
509
  assert(num_args <= 4, "only register arguments supported");
510
  subq(rsp,  frame::arg_reg_save_area_bytes);
511
#endif
512

513
  // Align stack if necessary
514
  testl(rsp, 15);
515
  jcc(Assembler::zero, L);
516

517
  subq(rsp, 8);
518
  call(RuntimeAddress(entry_point));
519
  addq(rsp, 8);
520
  jmp(E);
521

522
  bind(L);
523
  call(RuntimeAddress(entry_point));
524

525
  bind(E);
526

527
#ifdef _WIN64
528
  // restore stack pointer
529
  addq(rsp, frame::arg_reg_save_area_bytes);
530
#endif
531

532
}
533

534
void MacroAssembler::cmp64(Register src1, AddressLiteral src2, Register rscratch) {
535
  assert(!src2.is_lval(), "should use cmpptr");
536
  assert(rscratch != noreg || always_reachable(src2), "missing");
537

538
  if (reachable(src2)) {
539
    cmpq(src1, as_Address(src2));
540
  } else {
541
    lea(rscratch, src2);
542
    Assembler::cmpq(src1, Address(rscratch, 0));
543
  }
544
}
545

546
int MacroAssembler::corrected_idivq(Register reg) {
547
  // Full implementation of Java ldiv and lrem; checks for special
548
  // case as described in JVM spec., p.243 & p.271.  The function
549
  // returns the (pc) offset of the idivl instruction - may be needed
550
  // for implicit exceptions.
551
  //
552
  //         normal case                           special case
553
  //
554
  // input : rax: dividend                         min_long
555
  //         reg: divisor   (may not be eax/edx)   -1
556
  //
557
  // output: rax: quotient  (= rax idiv reg)       min_long
558
  //         rdx: remainder (= rax irem reg)       0
559
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
560
  static const int64_t min_long = 0x8000000000000000;
561
  Label normal_case, special_case;
562

563
  // check for special case
564
  cmp64(rax, ExternalAddress((address) &min_long), rdx /*rscratch*/);
565
  jcc(Assembler::notEqual, normal_case);
566
  xorl(rdx, rdx); // prepare rdx for possible special case (where
567
                  // remainder = 0)
568
  cmpq(reg, -1);
569
  jcc(Assembler::equal, special_case);
570

571
  // handle normal case
572
  bind(normal_case);
573
  cdqq();
574
  int idivq_offset = offset();
575
  idivq(reg);
576

577
  // normal and special case exit
578
  bind(special_case);
579

580
  return idivq_offset;
581
}
582

583
void MacroAssembler::decrementq(Register reg, int value) {
584
  if (value == min_jint) { subq(reg, value); return; }
585
  if (value <  0) { incrementq(reg, -value); return; }
586
  if (value == 0) {                        ; return; }
587
  if (value == 1 && UseIncDec) { decq(reg) ; return; }
588
  /* else */      { subq(reg, value)       ; return; }
589
}
590

591
void MacroAssembler::decrementq(Address dst, int value) {
592
  if (value == min_jint) { subq(dst, value); return; }
593
  if (value <  0) { incrementq(dst, -value); return; }
594
  if (value == 0) {                        ; return; }
595
  if (value == 1 && UseIncDec) { decq(dst) ; return; }
596
  /* else */      { subq(dst, value)       ; return; }
597
}
598

599
void MacroAssembler::incrementq(AddressLiteral dst, Register rscratch) {
600
  assert(rscratch != noreg || always_reachable(dst), "missing");
601

602
  if (reachable(dst)) {
603
    incrementq(as_Address(dst));
604
  } else {
605
    lea(rscratch, dst);
606
    incrementq(Address(rscratch, 0));
607
  }
608
}
609

610
void MacroAssembler::incrementq(Register reg, int value) {
611
  if (value == min_jint) { addq(reg, value); return; }
612
  if (value <  0) { decrementq(reg, -value); return; }
613
  if (value == 0) {                        ; return; }
614
  if (value == 1 && UseIncDec) { incq(reg) ; return; }
615
  /* else */      { addq(reg, value)       ; return; }
616
}
617

618
void MacroAssembler::incrementq(Address dst, int value) {
619
  if (value == min_jint) { addq(dst, value); return; }
620
  if (value <  0) { decrementq(dst, -value); return; }
621
  if (value == 0) {                        ; return; }
622
  if (value == 1 && UseIncDec) { incq(dst) ; return; }
623
  /* else */      { addq(dst, value)       ; return; }
624
}
625

626
// 32bit can do a case table jump in one instruction but we no longer allow the base
627
// to be installed in the Address class
628
void MacroAssembler::jump(ArrayAddress entry, Register rscratch) {
629
  lea(rscratch, entry.base());
630
  Address dispatch = entry.index();
631
  assert(dispatch._base == noreg, "must be");
632
  dispatch._base = rscratch;
633
  jmp(dispatch);
634
}
635

636
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
637
  ShouldNotReachHere(); // 64bit doesn't use two regs
638
  cmpq(x_lo, y_lo);
639
}
640

641
void MacroAssembler::lea(Register dst, AddressLiteral src) {
642
  mov_literal64(dst, (intptr_t)src.target(), src.rspec());
643
}
644

645
void MacroAssembler::lea(Address dst, AddressLiteral adr, Register rscratch) {
646
  lea(rscratch, adr);
647
  movptr(dst, rscratch);
648
}
649

650
void MacroAssembler::leave() {
651
  // %%% is this really better? Why not on 32bit too?
652
  emit_int8((unsigned char)0xC9); // LEAVE
653
}
654

655
void MacroAssembler::lneg(Register hi, Register lo) {
656
  ShouldNotReachHere(); // 64bit doesn't use two regs
657
  negq(lo);
658
}
659

660
void MacroAssembler::movoop(Register dst, jobject obj) {
661
  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
662
}
663

664
void MacroAssembler::movoop(Address dst, jobject obj, Register rscratch) {
665
  mov_literal64(rscratch, (intptr_t)obj, oop_Relocation::spec_for_immediate());
666
  movq(dst, rscratch);
667
}
668

669
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
670
  mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
671
}
672

673
void MacroAssembler::mov_metadata(Address dst, Metadata* obj, Register rscratch) {
674
  mov_literal64(rscratch, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
675
  movq(dst, rscratch);
676
}
677

678
void MacroAssembler::movptr(Register dst, AddressLiteral src) {
679
  if (src.is_lval()) {
680
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
681
  } else {
682
    if (reachable(src)) {
683
      movq(dst, as_Address(src));
684
    } else {
685
      lea(dst, src);
686
      movq(dst, Address(dst, 0));
687
    }
688
  }
689
}
690

691
void MacroAssembler::movptr(ArrayAddress dst, Register src, Register rscratch) {
692
  movq(as_Address(dst, rscratch), src);
693
}
694

695
void MacroAssembler::movptr(Register dst, ArrayAddress src) {
696
  movq(dst, as_Address(src, dst /*rscratch*/));
697
}
698

699
// src should NEVER be a real pointer. Use AddressLiteral for true pointers
700
void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) {
701
  if (is_simm32(src)) {
702
    movptr(dst, checked_cast<int32_t>(src));
703
  } else {
704
    mov64(rscratch, src);
705
    movq(dst, rscratch);
706
  }
707
}
708

709
void MacroAssembler::pushoop(jobject obj, Register rscratch) {
710
  movoop(rscratch, obj);
711
  push(rscratch);
712
}
713

714
void MacroAssembler::pushklass(Metadata* obj, Register rscratch) {
715
  mov_metadata(rscratch, obj);
716
  push(rscratch);
717
}
718

719
void MacroAssembler::pushptr(AddressLiteral src, Register rscratch) {
720
  lea(rscratch, src);
721
  if (src.is_lval()) {
722
    push(rscratch);
723
  } else {
724
    pushq(Address(rscratch, 0));
725
  }
726
}
727

728
void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
729
  reset_last_Java_frame(r15_thread, clear_fp);
730
}
731

732
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
733
                                         Register last_java_fp,
734
                                         address  last_java_pc,
735
                                         Register rscratch) {
736
  set_last_Java_frame(r15_thread, last_java_sp, last_java_fp, last_java_pc, rscratch);
737
}
738

739
static void pass_arg0(MacroAssembler* masm, Register arg) {
740
  if (c_rarg0 != arg ) {
741
    masm->mov(c_rarg0, arg);
742
  }
743
}
744

745
static void pass_arg1(MacroAssembler* masm, Register arg) {
746
  if (c_rarg1 != arg ) {
747
    masm->mov(c_rarg1, arg);
748
  }
749
}
750

751
static void pass_arg2(MacroAssembler* masm, Register arg) {
752
  if (c_rarg2 != arg ) {
753
    masm->mov(c_rarg2, arg);
754
  }
755
}
756

757
static void pass_arg3(MacroAssembler* masm, Register arg) {
758
  if (c_rarg3 != arg ) {
759
    masm->mov(c_rarg3, arg);
760
  }
761
}
762

763
void MacroAssembler::stop(const char* msg) {
764
  if (ShowMessageBoxOnError) {
765
    address rip = pc();
766
    pusha(); // get regs on stack
767
    lea(c_rarg1, InternalAddress(rip));
768
    movq(c_rarg2, rsp); // pass pointer to regs array
769
  }
770
  lea(c_rarg0, ExternalAddress((address) msg));
771
  andq(rsp, -16); // align stack as required by ABI
772
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
773
  hlt();
774
}
775

776
void MacroAssembler::warn(const char* msg) {
777
  push(rbp);
778
  movq(rbp, rsp);
779
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
780
  push_CPU_state();   // keeps alignment at 16 bytes
781

782
  lea(c_rarg0, ExternalAddress((address) msg));
783
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
784

785
  pop_CPU_state();
786
  mov(rsp, rbp);
787
  pop(rbp);
788
}
789

790
void MacroAssembler::print_state() {
791
  address rip = pc();
792
  pusha();            // get regs on stack
793
  push(rbp);
794
  movq(rbp, rsp);
795
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
796
  push_CPU_state();   // keeps alignment at 16 bytes
797

798
  lea(c_rarg0, InternalAddress(rip));
799
  lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
800
  call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
801

802
  pop_CPU_state();
803
  mov(rsp, rbp);
804
  pop(rbp);
805
  popa();
806
}
807

808
#ifndef PRODUCT
809
extern "C" void findpc(intptr_t x);
810
#endif
811

812
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
813
  // In order to get locks to work, we need to fake a in_VM state
814
  if (ShowMessageBoxOnError) {
815
    JavaThread* thread = JavaThread::current();
816
    JavaThreadState saved_state = thread->thread_state();
817
    thread->set_thread_state(_thread_in_vm);
818
#ifndef PRODUCT
819
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
820
      ttyLocker ttyl;
821
      BytecodeCounter::print();
822
    }
823
#endif
824
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
825
    // XXX correct this offset for amd64
826
    // This is the value of eip which points to where verify_oop will return.
827
    if (os::message_box(msg, "Execution stopped, print registers?")) {
828
      print_state64(pc, regs);
829
      BREAKPOINT;
830
    }
831
  }
832
  fatal("DEBUG MESSAGE: %s", msg);
833
}
834

835
void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
836
  ttyLocker ttyl;
837
  DebuggingContext debugging{};
838
  tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
839
#ifndef PRODUCT
840
  tty->cr();
841
  findpc(pc);
842
  tty->cr();
843
#endif
844
#define PRINT_REG(rax, value) \
845
  { tty->print("%s = ", #rax); os::print_location(tty, value); }
846
  PRINT_REG(rax, regs[15]);
847
  PRINT_REG(rbx, regs[12]);
848
  PRINT_REG(rcx, regs[14]);
849
  PRINT_REG(rdx, regs[13]);
850
  PRINT_REG(rdi, regs[8]);
851
  PRINT_REG(rsi, regs[9]);
852
  PRINT_REG(rbp, regs[10]);
853
  // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
854
  PRINT_REG(rsp, (intptr_t)(&regs[16]));
855
  PRINT_REG(r8 , regs[7]);
856
  PRINT_REG(r9 , regs[6]);
857
  PRINT_REG(r10, regs[5]);
858
  PRINT_REG(r11, regs[4]);
859
  PRINT_REG(r12, regs[3]);
860
  PRINT_REG(r13, regs[2]);
861
  PRINT_REG(r14, regs[1]);
862
  PRINT_REG(r15, regs[0]);
863
#undef PRINT_REG
864
  // Print some words near the top of the stack.
865
  int64_t* rsp = &regs[16];
866
  int64_t* dump_sp = rsp;
867
  for (int col1 = 0; col1 < 8; col1++) {
868
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
869
    os::print_location(tty, *dump_sp++);
870
  }
871
  for (int row = 0; row < 25; row++) {
872
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
873
    for (int col = 0; col < 4; col++) {
874
      tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
875
    }
876
    tty->cr();
877
  }
878
  // Print some instructions around pc:
879
  Disassembler::decode((address)pc-64, (address)pc);
880
  tty->print_cr("--------");
881
  Disassembler::decode((address)pc, (address)pc+32);
882
}
883

884
// The java_calling_convention describes stack locations as ideal slots on
885
// a frame with no abi restrictions. Since we must observe abi restrictions
886
// (like the placement of the register window) the slots must be biased by
887
// the following value.
888
static int reg2offset_in(VMReg r) {
889
  // Account for saved rbp and return address
890
  // This should really be in_preserve_stack_slots
891
  return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
892
}
893

894
static int reg2offset_out(VMReg r) {
895
  return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
896
}
897

898
// A long move
899
void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
900

901
  // The calling conventions assures us that each VMregpair is either
902
  // all really one physical register or adjacent stack slots.
903

904
  if (src.is_single_phys_reg() ) {
905
    if (dst.is_single_phys_reg()) {
906
      if (dst.first() != src.first()) {
907
        mov(dst.first()->as_Register(), src.first()->as_Register());
908
      }
909
    } else {
910
      assert(dst.is_single_reg(), "not a stack pair: (%s, %s), (%s, %s)",
911
             src.first()->name(), src.second()->name(), dst.first()->name(), dst.second()->name());
912
      movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
913
    }
914
  } else if (dst.is_single_phys_reg()) {
915
    assert(src.is_single_reg(),  "not a stack pair");
916
    movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
917
  } else {
918
    assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
919
    movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
920
    movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
921
  }
922
}
923

924
// A double move
925
void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
926

927
  // The calling conventions assures us that each VMregpair is either
928
  // all really one physical register or adjacent stack slots.
929

930
  if (src.is_single_phys_reg() ) {
931
    if (dst.is_single_phys_reg()) {
932
      // In theory these overlap but the ordering is such that this is likely a nop
933
      if ( src.first() != dst.first()) {
934
        movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
935
      }
936
    } else {
937
      assert(dst.is_single_reg(), "not a stack pair");
938
      movdbl(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
939
    }
940
  } else if (dst.is_single_phys_reg()) {
941
    assert(src.is_single_reg(),  "not a stack pair");
942
    movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
943
  } else {
944
    assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
945
    movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
946
    movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
947
  }
948
}
949

950

951
// A float arg may have to do float reg int reg conversion
952
void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
953
  assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
954

955
  // The calling conventions assures us that each VMregpair is either
956
  // all really one physical register or adjacent stack slots.
957

958
  if (src.first()->is_stack()) {
959
    if (dst.first()->is_stack()) {
960
      movl(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
961
      movptr(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
962
    } else {
963
      // stack to reg
964
      assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
965
      movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
966
    }
967
  } else if (dst.first()->is_stack()) {
968
    // reg to stack
969
    assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
970
    movflt(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
971
  } else {
972
    // reg to reg
973
    // In theory these overlap but the ordering is such that this is likely a nop
974
    if ( src.first() != dst.first()) {
975
      movdbl(dst.first()->as_XMMRegister(),  src.first()->as_XMMRegister());
976
    }
977
  }
978
}
979

980
// On 64 bit we will store integer like items to the stack as
981
// 64 bits items (x86_32/64 abi) even though java would only store
982
// 32bits for a parameter. On 32bit it will simply be 32 bits
983
// So this routine will do 32->32 on 32bit and 32->64 on 64bit
984
void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
985
  if (src.first()->is_stack()) {
986
    if (dst.first()->is_stack()) {
987
      // stack to stack
988
      movslq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
989
      movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
990
    } else {
991
      // stack to reg
992
      movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
993
    }
994
  } else if (dst.first()->is_stack()) {
995
    // reg to stack
996
    // Do we really have to sign extend???
997
    // __ movslq(src.first()->as_Register(), src.first()->as_Register());
998
    movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
999
  } else {
1000
    // Do we really have to sign extend???
1001
    // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
1002
    if (dst.first() != src.first()) {
1003
      movq(dst.first()->as_Register(), src.first()->as_Register());
1004
    }
1005
  }
1006
}
1007

1008
void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
1009
  if (src.first()->is_stack()) {
1010
    if (dst.first()->is_stack()) {
1011
      // stack to stack
1012
      movq(rax, Address(rbp, reg2offset_in(src.first())));
1013
      movq(Address(rsp, reg2offset_out(dst.first())), rax);
1014
    } else {
1015
      // stack to reg
1016
      movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
1017
    }
1018
  } else if (dst.first()->is_stack()) {
1019
    // reg to stack
1020
    movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
1021
  } else {
1022
    if (dst.first() != src.first()) {
1023
      movq(dst.first()->as_Register(), src.first()->as_Register());
1024
    }
1025
  }
1026
}
1027

1028
// An oop arg. Must pass a handle not the oop itself
1029
void MacroAssembler::object_move(OopMap* map,
1030
                        int oop_handle_offset,
1031
                        int framesize_in_slots,
1032
                        VMRegPair src,
1033
                        VMRegPair dst,
1034
                        bool is_receiver,
1035
                        int* receiver_offset) {
1036

1037
  // must pass a handle. First figure out the location we use as a handle
1038

1039
  Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();
1040

1041
  // See if oop is null if it is we need no handle
1042

1043
  if (src.first()->is_stack()) {
1044

1045
    // Oop is already on the stack as an argument
1046
    int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
1047
    map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
1048
    if (is_receiver) {
1049
      *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
1050
    }
1051

1052
    cmpptr(Address(rbp, reg2offset_in(src.first())), NULL_WORD);
1053
    lea(rHandle, Address(rbp, reg2offset_in(src.first())));
1054
    // conditionally move a null
1055
    cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
1056
  } else {
1057

1058
    // Oop is in a register we must store it to the space we reserve
1059
    // on the stack for oop_handles and pass a handle if oop is non-null
1060

1061
    const Register rOop = src.first()->as_Register();
1062
    int oop_slot;
1063
    if (rOop == j_rarg0)
1064
      oop_slot = 0;
1065
    else if (rOop == j_rarg1)
1066
      oop_slot = 1;
1067
    else if (rOop == j_rarg2)
1068
      oop_slot = 2;
1069
    else if (rOop == j_rarg3)
1070
      oop_slot = 3;
1071
    else if (rOop == j_rarg4)
1072
      oop_slot = 4;
1073
    else {
1074
      assert(rOop == j_rarg5, "wrong register");
1075
      oop_slot = 5;
1076
    }
1077

1078
    oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
1079
    int offset = oop_slot*VMRegImpl::stack_slot_size;
1080

1081
    map->set_oop(VMRegImpl::stack2reg(oop_slot));
1082
    // Store oop in handle area, may be null
1083
    movptr(Address(rsp, offset), rOop);
1084
    if (is_receiver) {
1085
      *receiver_offset = offset;
1086
    }
1087

1088
    cmpptr(rOop, NULL_WORD);
1089
    lea(rHandle, Address(rsp, offset));
1090
    // conditionally move a null from the handle area where it was just stored
1091
    cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
1092
  }
1093

1094
  // If arg is on the stack then place it otherwise it is already in correct reg.
1095
  if (dst.first()->is_stack()) {
1096
    movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
1097
  }
1098
}
1099

1100
#endif // _LP64
1101

1102
// Now versions that are common to 32/64 bit
1103

1104
void MacroAssembler::addptr(Register dst, int32_t imm32) {
1105
  LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
1106
}
1107

1108
void MacroAssembler::addptr(Register dst, Register src) {
1109
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1110
}
1111

1112
void MacroAssembler::addptr(Address dst, Register src) {
1113
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
1114
}
1115

1116
void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1117
  assert(rscratch != noreg || always_reachable(src), "missing");
1118

1119
  if (reachable(src)) {
1120
    Assembler::addsd(dst, as_Address(src));
1121
  } else {
1122
    lea(rscratch, src);
1123
    Assembler::addsd(dst, Address(rscratch, 0));
1124
  }
1125
}
1126

1127
void MacroAssembler::addss(XMMRegister dst, AddressLiteral src, Register rscratch) {
1128
  assert(rscratch != noreg || always_reachable(src), "missing");
1129

1130
  if (reachable(src)) {
1131
    addss(dst, as_Address(src));
1132
  } else {
1133
    lea(rscratch, src);
1134
    addss(dst, Address(rscratch, 0));
1135
  }
1136
}
1137

1138
void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1139
  assert(rscratch != noreg || always_reachable(src), "missing");
1140

1141
  if (reachable(src)) {
1142
    Assembler::addpd(dst, as_Address(src));
1143
  } else {
1144
    lea(rscratch, src);
1145
    Assembler::addpd(dst, Address(rscratch, 0));
1146
  }
1147
}
1148

1149
// See 8273459.  Function for ensuring 64-byte alignment, intended for stubs only.
1150
// Stub code is generated once and never copied.
1151
// NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
1152
void MacroAssembler::align64() {
1153
  align(64, (uint)(uintptr_t)pc());
1154
}
1155

1156
void MacroAssembler::align32() {
1157
  align(32, (uint)(uintptr_t)pc());
1158
}
1159

1160
void MacroAssembler::align(uint modulus) {
1161
  // 8273459: Ensure alignment is possible with current segment alignment
1162
  assert(modulus <= (uintx)CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
1163
  align(modulus, offset());
1164
}
1165

1166
void MacroAssembler::align(uint modulus, uint target) {
1167
  if (target % modulus != 0) {
1168
    nop(modulus - (target % modulus));
1169
  }
1170
}
1171

1172
void MacroAssembler::push_f(XMMRegister r) {
1173
  subptr(rsp, wordSize);
1174
  movflt(Address(rsp, 0), r);
1175
}
1176

1177
void MacroAssembler::pop_f(XMMRegister r) {
1178
  movflt(r, Address(rsp, 0));
1179
  addptr(rsp, wordSize);
1180
}
1181

1182
void MacroAssembler::push_d(XMMRegister r) {
1183
  subptr(rsp, 2 * wordSize);
1184
  movdbl(Address(rsp, 0), r);
1185
}
1186

1187
void MacroAssembler::pop_d(XMMRegister r) {
1188
  movdbl(r, Address(rsp, 0));
1189
  addptr(rsp, 2 * Interpreter::stackElementSize);
1190
}
1191

1192
void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1193
  // Used in sign-masking with aligned address.
1194
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1195
  assert(rscratch != noreg || always_reachable(src), "missing");
1196

1197
  if (reachable(src)) {
1198
    Assembler::andpd(dst, as_Address(src));
1199
  } else {
1200
    lea(rscratch, src);
1201
    Assembler::andpd(dst, Address(rscratch, 0));
1202
  }
1203
}
1204

1205
void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register rscratch) {
1206
  // Used in sign-masking with aligned address.
1207
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1208
  assert(rscratch != noreg || always_reachable(src), "missing");
1209

1210
  if (reachable(src)) {
1211
    Assembler::andps(dst, as_Address(src));
1212
  } else {
1213
    lea(rscratch, src);
1214
    Assembler::andps(dst, Address(rscratch, 0));
1215
  }
1216
}
1217

1218
void MacroAssembler::andptr(Register dst, int32_t imm32) {
1219
  LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1220
}
1221

1222
#ifdef _LP64
1223
void MacroAssembler::andq(Register dst, AddressLiteral src, Register rscratch) {
1224
  assert(rscratch != noreg || always_reachable(src), "missing");
1225

1226
  if (reachable(src)) {
1227
    andq(dst, as_Address(src));
1228
  } else {
1229
    lea(rscratch, src);
1230
    andq(dst, Address(rscratch, 0));
1231
  }
1232
}
1233
#endif
1234

1235
void MacroAssembler::atomic_incl(Address counter_addr) {
1236
  lock();
1237
  incrementl(counter_addr);
1238
}
1239

1240
void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register rscratch) {
1241
  assert(rscratch != noreg || always_reachable(counter_addr), "missing");
1242

1243
  if (reachable(counter_addr)) {
1244
    atomic_incl(as_Address(counter_addr));
1245
  } else {
1246
    lea(rscratch, counter_addr);
1247
    atomic_incl(Address(rscratch, 0));
1248
  }
1249
}
1250

1251
#ifdef _LP64
1252
void MacroAssembler::atomic_incq(Address counter_addr) {
1253
  lock();
1254
  incrementq(counter_addr);
1255
}
1256

1257
void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register rscratch) {
1258
  assert(rscratch != noreg || always_reachable(counter_addr), "missing");
1259

1260
  if (reachable(counter_addr)) {
1261
    atomic_incq(as_Address(counter_addr));
1262
  } else {
1263
    lea(rscratch, counter_addr);
1264
    atomic_incq(Address(rscratch, 0));
1265
  }
1266
}
1267
#endif
1268

1269
// Writes to stack successive pages until offset reached to check for
1270
// stack overflow + shadow pages.  This clobbers tmp.
1271
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1272
  movptr(tmp, rsp);
1273
  // Bang stack for total size given plus shadow page size.
1274
  // Bang one page at a time because large size can bang beyond yellow and
1275
  // red zones.
1276
  Label loop;
1277
  bind(loop);
1278
  movl(Address(tmp, (-(int)os::vm_page_size())), size );
1279
  subptr(tmp, (int)os::vm_page_size());
1280
  subl(size, (int)os::vm_page_size());
1281
  jcc(Assembler::greater, loop);
1282

1283
  // Bang down shadow pages too.
1284
  // At this point, (tmp-0) is the last address touched, so don't
1285
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1286
  // was post-decremented.)  Skip this address by starting at i=1, and
1287
  // touch a few more pages below.  N.B.  It is important to touch all
1288
  // the way down including all pages in the shadow zone.
1289
  for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()); i++) {
1290
    // this could be any sized move but this is can be a debugging crumb
1291
    // so the bigger the better.
1292
    movptr(Address(tmp, (-i*(int)os::vm_page_size())), size );
1293
  }
1294
}
1295

1296
void MacroAssembler::reserved_stack_check() {
1297
  // testing if reserved zone needs to be enabled
1298
  Label no_reserved_zone_enabling;
1299
  Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1300
  NOT_LP64(get_thread(rsi);)
1301

1302
  cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1303
  jcc(Assembler::below, no_reserved_zone_enabling);
1304

1305
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1306
  jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1307
  should_not_reach_here();
1308

1309
  bind(no_reserved_zone_enabling);
1310
}
1311

1312
void MacroAssembler::c2bool(Register x) {
1313
  // implements x == 0 ? 0 : 1
1314
  // note: must only look at least-significant byte of x
1315
  //       since C-style booleans are stored in one byte
1316
  //       only! (was bug)
1317
  andl(x, 0xFF);
1318
  setb(Assembler::notZero, x);
1319
}
1320

1321
// Wouldn't need if AddressLiteral version had new name
1322
void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
1323
  Assembler::call(L, rtype);
1324
}
1325

1326
void MacroAssembler::call(Register entry) {
1327
  Assembler::call(entry);
1328
}
1329

1330
void MacroAssembler::call(AddressLiteral entry, Register rscratch) {
1331
  assert(rscratch != noreg || always_reachable(entry), "missing");
1332

1333
  if (reachable(entry)) {
1334
    Assembler::call_literal(entry.target(), entry.rspec());
1335
  } else {
1336
    lea(rscratch, entry);
1337
    Assembler::call(rscratch);
1338
  }
1339
}
1340

1341
void MacroAssembler::ic_call(address entry, jint method_index) {
1342
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
1343
#ifdef _LP64
1344
  // Needs full 64-bit immediate for later patching.
1345
  mov64(rax, (int64_t)Universe::non_oop_word());
1346
#else
1347
  movptr(rax, (intptr_t)Universe::non_oop_word());
1348
#endif
1349
  call(AddressLiteral(entry, rh));
1350
}
1351

1352
int MacroAssembler::ic_check_size() {
1353
  return LP64_ONLY(14) NOT_LP64(12);
1354
}
1355

1356
int MacroAssembler::ic_check(int end_alignment) {
1357
  Register receiver = LP64_ONLY(j_rarg0) NOT_LP64(rcx);
1358
  Register data = rax;
1359
  Register temp = LP64_ONLY(rscratch1) NOT_LP64(rbx);
1360

1361
  // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
1362
  // before the inline cache check, so we don't have to execute any nop instructions when dispatching
1363
  // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
1364
  // before the inline cache check here, and not after
1365
  align(end_alignment, offset() + ic_check_size());
1366

1367
  int uep_offset = offset();
1368

1369
  if (UseCompressedClassPointers) {
1370
    movl(temp, Address(receiver, oopDesc::klass_offset_in_bytes()));
1371
    cmpl(temp, Address(data, CompiledICData::speculated_klass_offset()));
1372
  } else {
1373
    movptr(temp, Address(receiver, oopDesc::klass_offset_in_bytes()));
1374
    cmpptr(temp, Address(data, CompiledICData::speculated_klass_offset()));
1375
  }
1376

1377
  // if inline cache check fails, then jump to runtime routine
1378
  jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1379
  assert((offset() % end_alignment) == 0, "Misaligned verified entry point");
1380

1381
  return uep_offset;
1382
}
1383

1384
void MacroAssembler::emit_static_call_stub() {
1385
  // Static stub relocation also tags the Method* in the code-stream.
1386
  mov_metadata(rbx, (Metadata*) nullptr);  // Method is zapped till fixup time.
1387
  // This is recognized as unresolved by relocs/nativeinst/ic code.
1388
  jump(RuntimeAddress(pc()));
1389
}
1390

1391
// Implementation of call_VM versions
1392

1393
void MacroAssembler::call_VM(Register oop_result,
1394
                             address entry_point,
1395
                             bool check_exceptions) {
1396
  Label C, E;
1397
  call(C, relocInfo::none);
1398
  jmp(E);
1399

1400
  bind(C);
1401
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
1402
  ret(0);
1403

1404
  bind(E);
1405
}
1406

1407
void MacroAssembler::call_VM(Register oop_result,
1408
                             address entry_point,
1409
                             Register arg_1,
1410
                             bool check_exceptions) {
1411
  Label C, E;
1412
  call(C, relocInfo::none);
1413
  jmp(E);
1414

1415
  bind(C);
1416
  pass_arg1(this, arg_1);
1417
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
1418
  ret(0);
1419

1420
  bind(E);
1421
}
1422

1423
void MacroAssembler::call_VM(Register oop_result,
1424
                             address entry_point,
1425
                             Register arg_1,
1426
                             Register arg_2,
1427
                             bool check_exceptions) {
1428
  Label C, E;
1429
  call(C, relocInfo::none);
1430
  jmp(E);
1431

1432
  bind(C);
1433

1434
  LP64_ONLY(assert_different_registers(arg_1, c_rarg2));
1435

1436
  pass_arg2(this, arg_2);
1437
  pass_arg1(this, arg_1);
1438
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
1439
  ret(0);
1440

1441
  bind(E);
1442
}
1443

1444
void MacroAssembler::call_VM(Register oop_result,
1445
                             address entry_point,
1446
                             Register arg_1,
1447
                             Register arg_2,
1448
                             Register arg_3,
1449
                             bool check_exceptions) {
1450
  Label C, E;
1451
  call(C, relocInfo::none);
1452
  jmp(E);
1453

1454
  bind(C);
1455

1456
  LP64_ONLY(assert_different_registers(arg_1, c_rarg2, c_rarg3));
1457
  LP64_ONLY(assert_different_registers(arg_2, c_rarg3));
1458
  pass_arg3(this, arg_3);
1459
  pass_arg2(this, arg_2);
1460
  pass_arg1(this, arg_1);
1461
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
1462
  ret(0);
1463

1464
  bind(E);
1465
}
1466

1467
void MacroAssembler::call_VM(Register oop_result,
1468
                             Register last_java_sp,
1469
                             address entry_point,
1470
                             int number_of_arguments,
1471
                             bool check_exceptions) {
1472
  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1473
  call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1474
}
1475

1476
void MacroAssembler::call_VM(Register oop_result,
1477
                             Register last_java_sp,
1478
                             address entry_point,
1479
                             Register arg_1,
1480
                             bool check_exceptions) {
1481
  pass_arg1(this, arg_1);
1482
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1483
}
1484

1485
void MacroAssembler::call_VM(Register oop_result,
1486
                             Register last_java_sp,
1487
                             address entry_point,
1488
                             Register arg_1,
1489
                             Register arg_2,
1490
                             bool check_exceptions) {
1491

1492
  LP64_ONLY(assert_different_registers(arg_1, c_rarg2));
1493
  pass_arg2(this, arg_2);
1494
  pass_arg1(this, arg_1);
1495
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1496
}
1497

1498
void MacroAssembler::call_VM(Register oop_result,
1499
                             Register last_java_sp,
1500
                             address entry_point,
1501
                             Register arg_1,
1502
                             Register arg_2,
1503
                             Register arg_3,
1504
                             bool check_exceptions) {
1505
  LP64_ONLY(assert_different_registers(arg_1, c_rarg2, c_rarg3));
1506
  LP64_ONLY(assert_different_registers(arg_2, c_rarg3));
1507
  pass_arg3(this, arg_3);
1508
  pass_arg2(this, arg_2);
1509
  pass_arg1(this, arg_1);
1510
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1511
}
1512

1513
void MacroAssembler::super_call_VM(Register oop_result,
1514
                                   Register last_java_sp,
1515
                                   address entry_point,
1516
                                   int number_of_arguments,
1517
                                   bool check_exceptions) {
1518
  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1519
  MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1520
}
1521

1522
void MacroAssembler::super_call_VM(Register oop_result,
1523
                                   Register last_java_sp,
1524
                                   address entry_point,
1525
                                   Register arg_1,
1526
                                   bool check_exceptions) {
1527
  pass_arg1(this, arg_1);
1528
  super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1529
}
1530

1531
void MacroAssembler::super_call_VM(Register oop_result,
1532
                                   Register last_java_sp,
1533
                                   address entry_point,
1534
                                   Register arg_1,
1535
                                   Register arg_2,
1536
                                   bool check_exceptions) {
1537

1538
  LP64_ONLY(assert_different_registers(arg_1, c_rarg2));
1539
  pass_arg2(this, arg_2);
1540
  pass_arg1(this, arg_1);
1541
  super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1542
}
1543

1544
void MacroAssembler::super_call_VM(Register oop_result,
1545
                                   Register last_java_sp,
1546
                                   address entry_point,
1547
                                   Register arg_1,
1548
                                   Register arg_2,
1549
                                   Register arg_3,
1550
                                   bool check_exceptions) {
1551
  LP64_ONLY(assert_different_registers(arg_1, c_rarg2, c_rarg3));
1552
  LP64_ONLY(assert_different_registers(arg_2, c_rarg3));
1553
  pass_arg3(this, arg_3);
1554
  pass_arg2(this, arg_2);
1555
  pass_arg1(this, arg_1);
1556
  super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1557
}
1558

1559
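// call_VM_base is the common bottleneck for the call_VM variants above: it
// derives the thread register, records the last Java frame (sp, fp, pc),
// calls the VM entry point via call_VM_leaf_base, restores the thread
// register if needed, forwards a pending exception when requested, and
// finally fetches the oop result the runtime left in JavaThread::vm_result.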
void MacroAssembler::call_VM_base(Register oop_result,
1560
                                  Register java_thread,
1561
                                  Register last_java_sp,
1562
                                  address  entry_point,
1563
                                  int      number_of_arguments,
1564
                                  bool     check_exceptions) {
1565
  // determine java_thread register
1566
  if (!java_thread->is_valid()) {
1567
#ifdef _LP64
1568
    java_thread = r15_thread;
1569
#else
1570
    java_thread = rdi;
1571
    get_thread(java_thread);
1572
#endif // LP64
1573
  }
1574
  // determine last_java_sp register
1575
  if (!last_java_sp->is_valid()) {
1576
    last_java_sp = rsp;
1577
  }
1578
  // debugging support
1579
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
1580
  LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
1581
#ifdef ASSERT
1582
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
1583
  // r12 is the heapbase.
1584
  LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
1585
#endif // ASSERT
1586

1587
  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
1588
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
1589

1590
  // push java thread (becomes first argument of C function)
1591

1592
  NOT_LP64(push(java_thread); number_of_arguments++);
1593
  LP64_ONLY(mov(c_rarg0, r15_thread));
1594

1595
  // set last Java frame before call
1596
  assert(last_java_sp != rbp, "can't use ebp/rbp");
1597

1598
  // Only interpreter should have to set fp
1599
  set_last_Java_frame(java_thread, last_java_sp, rbp, nullptr, rscratch1);
1600

1601
  // do the call, remove parameters
1602
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
1603

1604
  // restore the thread (cannot use the pushed argument since arguments
1605
  // may be overwritten by C code generated by an optimizing compiler);
1606
  // however, we can use the register value directly if it is callee saved.
1607
  if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
1608
    // rdi & rsi (also r15) are callee saved -> nothing to do
1609
#ifdef ASSERT
1610
    guarantee(java_thread != rax, "change this code");
1611
    push(rax);
1612
    { Label L;
1613
      get_thread(rax);
1614
      cmpptr(java_thread, rax);
1615
      jcc(Assembler::equal, L);
1616
      STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
1617
      bind(L);
1618
    }
1619
    pop(rax);
1620
#endif
1621
  } else {
1622
    get_thread(java_thread);
1623
  }
1624
  // reset last Java frame
1625
  // Only interpreter should have to clear fp
1626
  reset_last_Java_frame(java_thread, true);
1627

1628
   // C++ interp handles this in the interpreter
1629
  check_and_handle_popframe(java_thread);
1630
  check_and_handle_earlyret(java_thread);
1631

1632
  if (check_exceptions) {
1633
    // check for pending exceptions (java_thread is set upon return)
1634
    cmpptr(Address(java_thread, Thread::pending_exception_offset()), NULL_WORD);
1635
#ifndef _LP64
1636
    jump_cc(Assembler::notEqual,
1637
            RuntimeAddress(StubRoutines::forward_exception_entry()));
1638
#else
1639
    // This used to be a conditional jump to forward_exception; however, if the
    // code is relocated the branch might no longer reach the stub. So we jump
    // around an unconditional far jump that can always reach it.
1642

1643
    Label ok;
1644
    jcc(Assembler::equal, ok);
1645
    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1646
    bind(ok);
1647
#endif // LP64
1648
  }
1649

1650
  // get oop result if there is one and reset the value in the thread
1651
  if (oop_result->is_valid()) {
1652
    get_vm_result(oop_result, java_thread);
1653
  }
1654
}
1655

1656
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
1657

1658
  // Calculate the value for last_Java_sp
1659
  // somewhat subtle. call_VM does an intermediate call which places a return
  // address on the stack just under the stack pointer as the caller left it.
  // This allows us to retrieve last_Java_pc from last_Java_sp[-1].
  // On 32bit we then have to push additional args on the stack to accomplish
  // the actual requested call. On 64bit call_VM can only use register args,
  // so the only extra space is the return address that call_VM created.
  // This hopefully explains the calculations here.
1667

1668
#ifdef _LP64
1669
  // We've pushed one address, correct last_Java_sp
1670
  lea(rax, Address(rsp, wordSize));
1671
#else
1672
  lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
1673
#endif // LP64
1674

1675
  call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
1676

1677
}
1678

1679
// Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
1680
void MacroAssembler::call_VM_leaf0(address entry_point) {
1681
  MacroAssembler::call_VM_leaf_base(entry_point, 0);
1682
}
1683

1684
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1685
  call_VM_leaf_base(entry_point, number_of_arguments);
1686
}
1687

1688
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1689
  pass_arg0(this, arg_0);
1690
  call_VM_leaf(entry_point, 1);
1691
}
1692

1693
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1694

1695
  LP64_ONLY(assert_different_registers(arg_0, c_rarg1));
1696
  pass_arg1(this, arg_1);
1697
  pass_arg0(this, arg_0);
1698
  call_VM_leaf(entry_point, 2);
1699
}
1700

1701
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1702
  LP64_ONLY(assert_different_registers(arg_0, c_rarg1, c_rarg2));
1703
  LP64_ONLY(assert_different_registers(arg_1, c_rarg2));
1704
  pass_arg2(this, arg_2);
1705
  pass_arg1(this, arg_1);
1706
  pass_arg0(this, arg_0);
1707
  call_VM_leaf(entry_point, 3);
1708
}
1709

1710
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1711
  LP64_ONLY(assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3));
1712
  LP64_ONLY(assert_different_registers(arg_1, c_rarg2, c_rarg3));
1713
  LP64_ONLY(assert_different_registers(arg_2, c_rarg3));
1714
  pass_arg3(this, arg_3);
1715
  pass_arg2(this, arg_2);
1716
  pass_arg1(this, arg_1);
1717
  pass_arg0(this, arg_0);
1718
  call_VM_leaf(entry_point, 3);
1719
}
1720

1721
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1722
  pass_arg0(this, arg_0);
1723
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
1724
}
1725

1726
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1727
  LP64_ONLY(assert_different_registers(arg_0, c_rarg1));
1728
  pass_arg1(this, arg_1);
1729
  pass_arg0(this, arg_0);
1730
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
1731
}
1732

1733
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1734
  LP64_ONLY(assert_different_registers(arg_0, c_rarg1, c_rarg2));
1735
  LP64_ONLY(assert_different_registers(arg_1, c_rarg2));
1736
  pass_arg2(this, arg_2);
1737
  pass_arg1(this, arg_1);
1738
  pass_arg0(this, arg_0);
1739
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
1740
}
1741

1742
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1743
  LP64_ONLY(assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3));
1744
  LP64_ONLY(assert_different_registers(arg_1, c_rarg2, c_rarg3));
1745
  LP64_ONLY(assert_different_registers(arg_2, c_rarg3));
1746
  pass_arg3(this, arg_3);
1747
  pass_arg2(this, arg_2);
1748
  pass_arg1(this, arg_1);
1749
  pass_arg0(this, arg_0);
1750
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
1751
}
1752

1753
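// Fetch the oop that the runtime call stored in JavaThread::vm_result and
// clear the field so the value is not picked up again by a later call.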
void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
1754
  movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
1755
  movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
1756
  verify_oop_msg(oop_result, "broken oop in call_VM_base");
1757
}
1758

1759
void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
1760
  movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
1761
  movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
1762
}
1763

1764
void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
1765
}
1766

1767
void MacroAssembler::check_and_handle_popframe(Register java_thread) {
1768
}
1769

1770
void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm, Register rscratch) {
1771
  assert(rscratch != noreg || always_reachable(src1), "missing");
1772

1773
  if (reachable(src1)) {
1774
    cmpl(as_Address(src1), imm);
1775
  } else {
1776
    lea(rscratch, src1);
1777
    cmpl(Address(rscratch, 0), imm);
1778
  }
1779
}
1780

1781
void MacroAssembler::cmp32(Register src1, AddressLiteral src2, Register rscratch) {
1782
  assert(!src2.is_lval(), "use cmpptr");
1783
  assert(rscratch != noreg || always_reachable(src2), "missing");
1784

1785
  if (reachable(src2)) {
1786
    cmpl(src1, as_Address(src2));
1787
  } else {
1788
    lea(rscratch, src2);
1789
    cmpl(src1, Address(rscratch, 0));
1790
  }
1791
}
1792

1793
void MacroAssembler::cmp32(Register src1, int32_t imm) {
1794
  Assembler::cmpl(src1, imm);
1795
}
1796

1797
void MacroAssembler::cmp32(Register src1, Address src2) {
1798
  Assembler::cmpl(src1, src2);
1799
}
1800

1801
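// Materialize a three-way double compare into dst (-1, 0, +1).  ucomisd sets
// PF when either operand is NaN (unordered); unordered_is_less selects
// whether an unordered result counts as less (-1) or greater (+1), i.e. the
// dcmpl vs. dcmpg distinction.  cmpss2int below is the float analogue.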
void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1802
  ucomisd(opr1, opr2);
1803

1804
  Label L;
1805
  if (unordered_is_less) {
1806
    movl(dst, -1);
1807
    jcc(Assembler::parity, L);
1808
    jcc(Assembler::below , L);
1809
    movl(dst, 0);
1810
    jcc(Assembler::equal , L);
1811
    increment(dst);
1812
  } else { // unordered is greater
1813
    movl(dst, 1);
1814
    jcc(Assembler::parity, L);
1815
    jcc(Assembler::above , L);
1816
    movl(dst, 0);
1817
    jcc(Assembler::equal , L);
1818
    decrementl(dst);
1819
  }
1820
  bind(L);
1821
}
1822

1823
void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1824
  ucomiss(opr1, opr2);
1825

1826
  Label L;
1827
  if (unordered_is_less) {
1828
    movl(dst, -1);
1829
    jcc(Assembler::parity, L);
1830
    jcc(Assembler::below , L);
1831
    movl(dst, 0);
1832
    jcc(Assembler::equal , L);
1833
    increment(dst);
1834
  } else { // unordered is greater
1835
    movl(dst, 1);
1836
    jcc(Assembler::parity, L);
1837
    jcc(Assembler::above , L);
1838
    movl(dst, 0);
1839
    jcc(Assembler::equal , L);
1840
    decrementl(dst);
1841
  }
1842
  bind(L);
1843
}
1844

1845

1846
void MacroAssembler::cmp8(AddressLiteral src1, int imm, Register rscratch) {
1847
  assert(rscratch != noreg || always_reachable(src1), "missing");
1848

1849
  if (reachable(src1)) {
1850
    cmpb(as_Address(src1), imm);
1851
  } else {
1852
    lea(rscratch, src1);
1853
    cmpb(Address(rscratch, 0), imm);
1854
  }
1855
}
1856

1857
void MacroAssembler::cmpptr(Register src1, AddressLiteral src2, Register rscratch) {
1858
#ifdef _LP64
1859
  assert(rscratch != noreg || always_reachable(src2), "missing");
1860

1861
  if (src2.is_lval()) {
1862
    movptr(rscratch, src2);
1863
    Assembler::cmpq(src1, rscratch);
1864
  } else if (reachable(src2)) {
1865
    cmpq(src1, as_Address(src2));
1866
  } else {
1867
    lea(rscratch, src2);
1868
    Assembler::cmpq(src1, Address(rscratch, 0));
1869
  }
1870
#else
1871
  assert(rscratch == noreg, "not needed");
1872
  if (src2.is_lval()) {
1873
    cmp_literal32(src1, (int32_t)src2.target(), src2.rspec());
1874
  } else {
1875
    cmpl(src1, as_Address(src2));
1876
  }
1877
#endif // _LP64
1878
}
1879

1880
void MacroAssembler::cmpptr(Address src1, AddressLiteral src2, Register rscratch) {
1881
  assert(src2.is_lval(), "not a mem-mem compare");
1882
#ifdef _LP64
1883
  // moves src2's literal address
1884
  movptr(rscratch, src2);
1885
  Assembler::cmpq(src1, rscratch);
1886
#else
1887
  assert(rscratch == noreg, "not needed");
1888
  cmp_literal32(src1, (int32_t)src2.target(), src2.rspec());
1889
#endif // _LP64
1890
}
1891

1892
void MacroAssembler::cmpoop(Register src1, Register src2) {
1893
  cmpptr(src1, src2);
1894
}
1895

1896
void MacroAssembler::cmpoop(Register src1, Address src2) {
1897
  cmpptr(src1, src2);
1898
}
1899

1900
#ifdef _LP64
1901
void MacroAssembler::cmpoop(Register src1, jobject src2, Register rscratch) {
1902
  movoop(rscratch, src2);
1903
  cmpptr(src1, rscratch);
1904
}
1905
#endif
1906

1907
void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch) {
1908
  assert(rscratch != noreg || always_reachable(adr), "missing");
1909

1910
  if (reachable(adr)) {
1911
    lock();
1912
    cmpxchgptr(reg, as_Address(adr));
1913
  } else {
1914
    lea(rscratch, adr);
1915
    lock();
1916
    cmpxchgptr(reg, Address(rscratch, 0));
1917
  }
1918
}
1919

1920
void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
1921
  LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
1922
}
1923

1924
void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src, Register rscratch) {
1925
  assert(rscratch != noreg || always_reachable(src), "missing");
1926

1927
  if (reachable(src)) {
1928
    Assembler::comisd(dst, as_Address(src));
1929
  } else {
1930
    lea(rscratch, src);
1931
    Assembler::comisd(dst, Address(rscratch, 0));
1932
  }
1933
}
1934

1935
void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src, Register rscratch) {
1936
  assert(rscratch != noreg || always_reachable(src), "missing");
1937

1938
  if (reachable(src)) {
1939
    Assembler::comiss(dst, as_Address(src));
1940
  } else {
1941
    lea(rscratch, src);
1942
    Assembler::comiss(dst, Address(rscratch, 0));
1943
  }
1944
}
1945

1946

1947
void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr, Register rscratch) {
1948
  assert(rscratch != noreg || always_reachable(counter_addr), "missing");
1949

1950
  Condition negated_cond = negate_condition(cond);
1951
  Label L;
1952
  jcc(negated_cond, L);
1953
  pushf(); // Preserve flags
1954
  atomic_incl(counter_addr, rscratch);
1955
  popf();
1956
  bind(L);
1957
}
1958

1959
int MacroAssembler::corrected_idivl(Register reg) {
1960
  // Full implementation of Java idiv and irem; checks for
1961
  // special case as described in JVM spec., p.243 & p.271.
1962
  // The function returns the (pc) offset of the idivl
1963
  // instruction - may be needed for implicit exceptions.
1964
  //
1965
  //         normal case                           special case
1966
  //
1967
  // input : rax: dividend                        min_int
  //         reg: divisor  (may not be rax/rdx)   -1
  //
  // output: rax: quotient  (= rax idiv reg)      min_int
  //         rdx: remainder (= rax irem reg)      0
1972
  assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
1973
  const int min_int = 0x80000000;
1974
  Label normal_case, special_case;
1975

1976
  // check for special case
1977
  cmpl(rax, min_int);
1978
  jcc(Assembler::notEqual, normal_case);
1979
  xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
1980
  cmpl(reg, -1);
1981
  jcc(Assembler::equal, special_case);
1982

1983
  // handle normal case
1984
  bind(normal_case);
1985
  cdql();
1986
  int idivl_offset = offset();
1987
  idivl(reg);
1988

1989
  // normal and special case exit
1990
  bind(special_case);
1991

1992
  return idivl_offset;
1993
}
1994

1995

1996

1997
void MacroAssembler::decrementl(Register reg, int value) {
1998
  if (value == min_jint) {subl(reg, value) ; return; }
1999
  if (value <  0) { incrementl(reg, -value); return; }
2000
  if (value == 0) {                        ; return; }
2001
  if (value == 1 && UseIncDec) { decl(reg) ; return; }
2002
  /* else */      { subl(reg, value)       ; return; }
2003
}
2004

2005
void MacroAssembler::decrementl(Address dst, int value) {
2006
  if (value == min_jint) {subl(dst, value) ; return; }
2007
  if (value <  0) { incrementl(dst, -value); return; }
2008
  if (value == 0) {                        ; return; }
2009
  if (value == 1 && UseIncDec) { decl(dst) ; return; }
2010
  /* else */      { subl(dst, value)       ; return; }
2011
}
2012

2013
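// Signed division by 2^shift_value via an arithmetic shift.  For a negative
// dividend the plain shift would round toward negative infinity, so
// (2^shift - 1) is added first to make the result round toward zero, as Java
// integer division requires.  E.g. -7 / 4: (-7 + 3) >> 2 == -1, whereas a
// plain -7 >> 2 == -2.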
void MacroAssembler::division_with_shift (Register reg, int shift_value) {
2014
  assert(shift_value > 0, "illegal shift value");
2015
  Label _is_positive;
2016
  testl (reg, reg);
2017
  jcc (Assembler::positive, _is_positive);
2018
  int offset = (1 << shift_value) - 1 ;
2019

2020
  if (offset == 1) {
2021
    incrementl(reg);
2022
  } else {
2023
    addl(reg, offset);
2024
  }
2025

2026
  bind (_is_positive);
2027
  sarl(reg, shift_value);
2028
}
2029

2030
void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2031
  assert(rscratch != noreg || always_reachable(src), "missing");
2032

2033
  if (reachable(src)) {
2034
    Assembler::divsd(dst, as_Address(src));
2035
  } else {
2036
    lea(rscratch, src);
2037
    Assembler::divsd(dst, Address(rscratch, 0));
2038
  }
2039
}
2040

2041
void MacroAssembler::divss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2042
  assert(rscratch != noreg || always_reachable(src), "missing");
2043

2044
  if (reachable(src)) {
2045
    Assembler::divss(dst, as_Address(src));
2046
  } else {
2047
    lea(rscratch, src);
2048
    Assembler::divss(dst, Address(rscratch, 0));
2049
  }
2050
}
2051

2052
void MacroAssembler::enter() {
2053
  push(rbp);
2054
  mov(rbp, rsp);
2055
}
2056

2057
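// Emit an 8-byte nop (0F 1F 84 00 + 32-bit zero displacement) tagged with a
// post_call_nop relocation.  When continuations are enabled this marks the
// instruction following a call so the runtime can locate post-call sites;
// otherwise nothing is emitted.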
void MacroAssembler::post_call_nop() {
2058
  if (!Continuations::enabled()) {
2059
    return;
2060
  }
2061
  InstructionMark im(this);
2062
  relocate(post_call_nop_Relocation::spec());
2063
  InlineSkippedInstructionsCounter skipCounter(this);
2064
  emit_int8((uint8_t)0x0f);
2065
  emit_int8((uint8_t)0x1f);
2066
  emit_int8((uint8_t)0x84);
2067
  emit_int8((uint8_t)0x00);
2068
  emit_int32(0x00);
2069
}
2070

2071
// A 5 byte nop that is safe for patching (see patch_verified_entry)
2072
void MacroAssembler::fat_nop() {
2073
  if (UseAddressNop) {
2074
    addr_nop_5();
2075
  } else {
2076
    emit_int8((uint8_t)0x26); // es:
2077
    emit_int8((uint8_t)0x2e); // cs:
2078
    emit_int8((uint8_t)0x64); // fs:
2079
    emit_int8((uint8_t)0x65); // gs:
2080
    emit_int8((uint8_t)0x90);
2081
  }
2082
}
2083

2084
#ifndef _LP64
2085
void MacroAssembler::fcmp(Register tmp) {
2086
  fcmp(tmp, 1, true, true);
2087
}
2088

2089
void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2090
  assert(!pop_right || pop_left, "usage error");
2091
  if (VM_Version::supports_cmov()) {
2092
    assert(tmp == noreg, "unneeded temp");
2093
    if (pop_left) {
2094
      fucomip(index);
2095
    } else {
2096
      fucomi(index);
2097
    }
2098
    if (pop_right) {
2099
      fpop();
2100
    }
2101
  } else {
2102
    assert(tmp != noreg, "need temp");
2103
    if (pop_left) {
2104
      if (pop_right) {
2105
        fcompp();
2106
      } else {
2107
        fcomp(index);
2108
      }
2109
    } else {
2110
      fcom(index);
2111
    }
2112
    // convert FPU condition into eflags condition via rax,
2113
    save_rax(tmp);
2114
    fwait(); fnstsw_ax();
2115
    sahf();
2116
    restore_rax(tmp);
2117
  }
2118
  // condition codes set as follows:
2119
  //
2120
  // CF (corresponds to C0) if x < y
2121
  // PF (corresponds to C2) if unordered
2122
  // ZF (corresponds to C3) if x = y
2123
}
2124

2125
void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2126
  fcmp2int(dst, unordered_is_less, 1, true, true);
2127
}
2128

2129
void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2130
  fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2131
  Label L;
2132
  if (unordered_is_less) {
2133
    movl(dst, -1);
2134
    jcc(Assembler::parity, L);
2135
    jcc(Assembler::below , L);
2136
    movl(dst, 0);
2137
    jcc(Assembler::equal , L);
2138
    increment(dst);
2139
  } else { // unordered is greater
2140
    movl(dst, 1);
2141
    jcc(Assembler::parity, L);
2142
    jcc(Assembler::above , L);
2143
    movl(dst, 0);
2144
    jcc(Assembler::equal , L);
2145
    decrementl(dst);
2146
  }
2147
  bind(L);
2148
}
2149

2150
void MacroAssembler::fld_d(AddressLiteral src) {
2151
  fld_d(as_Address(src));
2152
}
2153

2154
void MacroAssembler::fld_s(AddressLiteral src) {
2155
  fld_s(as_Address(src));
2156
}
2157

2158
void MacroAssembler::fldcw(AddressLiteral src) {
2159
  fldcw(as_Address(src));
2160
}
2161

2162
void MacroAssembler::fpop() {
2163
  ffree();
2164
  fincstp();
2165
}
2166

2167
void MacroAssembler::fremr(Register tmp) {
2168
  save_rax(tmp);
2169
  { Label L;
2170
    bind(L);
2171
    fprem();
2172
    fwait(); fnstsw_ax();
2173
    sahf();
2174
    jcc(Assembler::parity, L);
2175
  }
2176
  restore_rax(tmp);
2177
  // Result is in ST0.
2178
  // Note: fxch & fpop to get rid of ST1
2179
  // (otherwise FPU stack could overflow eventually)
2180
  fxch(1);
2181
  fpop();
2182
}
2183

2184
void MacroAssembler::empty_FPU_stack() {
2185
  if (VM_Version::supports_mmx()) {
2186
    emms();
2187
  } else {
2188
    for (int i = 8; i-- > 0; ) ffree(i);
2189
  }
2190
}
2191
#endif // !LP64
2192

2193
void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2194
  assert(rscratch != noreg || always_reachable(src), "missing");
2195
  if (reachable(src)) {
2196
    Assembler::mulpd(dst, as_Address(src));
2197
  } else {
2198
    lea(rscratch, src);
2199
    Assembler::mulpd(dst, Address(rscratch, 0));
2200
  }
2201
}
2202

2203
void MacroAssembler::load_float(Address src) {
2204
#ifdef _LP64
2205
  movflt(xmm0, src);
2206
#else
2207
  if (UseSSE >= 1) {
2208
    movflt(xmm0, src);
2209
  } else {
2210
    fld_s(src);
2211
  }
2212
#endif // LP64
2213
}
2214

2215
void MacroAssembler::store_float(Address dst) {
2216
#ifdef _LP64
2217
  movflt(dst, xmm0);
2218
#else
2219
  if (UseSSE >= 1) {
2220
    movflt(dst, xmm0);
2221
  } else {
2222
    fstp_s(dst);
2223
  }
2224
#endif // LP64
2225
}
2226

2227
void MacroAssembler::load_double(Address src) {
2228
#ifdef _LP64
2229
  movdbl(xmm0, src);
2230
#else
2231
  if (UseSSE >= 2) {
2232
    movdbl(xmm0, src);
2233
  } else {
2234
    fld_d(src);
2235
  }
2236
#endif // LP64
2237
}
2238

2239
void MacroAssembler::store_double(Address dst) {
2240
#ifdef _LP64
2241
  movdbl(dst, xmm0);
2242
#else
2243
  if (UseSSE >= 2) {
2244
    movdbl(dst, xmm0);
2245
  } else {
2246
    fstp_d(dst);
2247
  }
2248
#endif // LP64
2249
}
2250

2251
// dst = c = a * b + c
2252
void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2253
  Assembler::vfmadd231sd(c, a, b);
2254
  if (dst != c) {
2255
    movdbl(dst, c);
2256
  }
2257
}
2258

2259
// dst = c = a * b + c
2260
void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2261
  Assembler::vfmadd231ss(c, a, b);
2262
  if (dst != c) {
2263
    movflt(dst, c);
2264
  }
2265
}
2266

2267
// dst = c = a * b + c
2268
void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2269
  Assembler::vfmadd231pd(c, a, b, vector_len);
2270
  if (dst != c) {
2271
    vmovdqu(dst, c);
2272
  }
2273
}
2274

2275
// dst = c = a * b + c
2276
void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2277
  Assembler::vfmadd231ps(c, a, b, vector_len);
2278
  if (dst != c) {
2279
    vmovdqu(dst, c);
2280
  }
2281
}
2282

2283
// dst = c = a * b + c
2284
void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2285
  Assembler::vfmadd231pd(c, a, b, vector_len);
2286
  if (dst != c) {
2287
    vmovdqu(dst, c);
2288
  }
2289
}
2290

2291
// dst = c = a * b + c
2292
void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2293
  Assembler::vfmadd231ps(c, a, b, vector_len);
2294
  if (dst != c) {
2295
    vmovdqu(dst, c);
2296
  }
2297
}
2298

2299
void MacroAssembler::incrementl(AddressLiteral dst, Register rscratch) {
2300
  assert(rscratch != noreg || always_reachable(dst), "missing");
2301

2302
  if (reachable(dst)) {
2303
    incrementl(as_Address(dst));
2304
  } else {
2305
    lea(rscratch, dst);
2306
    incrementl(Address(rscratch, 0));
2307
  }
2308
}
2309

2310
void MacroAssembler::incrementl(ArrayAddress dst, Register rscratch) {
2311
  incrementl(as_Address(dst, rscratch));
2312
}
2313

2314
void MacroAssembler::incrementl(Register reg, int value) {
2315
  if (value == min_jint) {addl(reg, value) ; return; }
2316
  if (value <  0) { decrementl(reg, -value); return; }
2317
  if (value == 0) {                        ; return; }
2318
  if (value == 1 && UseIncDec) { incl(reg) ; return; }
2319
  /* else */      { addl(reg, value)       ; return; }
2320
}
2321

2322
void MacroAssembler::incrementl(Address dst, int value) {
2323
  if (value == min_jint) {addl(dst, value) ; return; }
2324
  if (value <  0) { decrementl(dst, -value); return; }
2325
  if (value == 0) {                        ; return; }
2326
  if (value == 1 && UseIncDec) { incl(dst) ; return; }
2327
  /* else */      { addl(dst, value)       ; return; }
2328
}
2329

2330
void MacroAssembler::jump(AddressLiteral dst, Register rscratch) {
2331
  assert(rscratch != noreg || always_reachable(dst), "missing");
2332

2333
  if (reachable(dst)) {
2334
    jmp_literal(dst.target(), dst.rspec());
2335
  } else {
2336
    lea(rscratch, dst);
2337
    jmp(rscratch);
2338
  }
2339
}
2340

2341
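// Conditional jump to an AddressLiteral.  If the target is reachable the jcc
// is emitted directly (using an 8-bit displacement when it fits and no
// relocation is required); otherwise the condition is reversed to branch
// around an indirect jmp through rscratch.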
void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst, Register rscratch) {
2342
  assert(rscratch != noreg || always_reachable(dst), "missing");
2343

2344
  if (reachable(dst)) {
2345
    InstructionMark im(this);
2346
    relocate(dst.reloc());
2347
    const int short_size = 2;
2348
    const int long_size = 6;
2349
    int offs = (intptr_t)dst.target() - ((intptr_t)pc());
2350
    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
2351
      // 0111 tttn #8-bit disp
2352
      emit_int8(0x70 | cc);
2353
      emit_int8((offs - short_size) & 0xFF);
2354
    } else {
2355
      // 0000 1111 1000 tttn #32-bit disp
2356
      emit_int8(0x0F);
2357
      emit_int8((unsigned char)(0x80 | cc));
2358
      emit_int32(offs - long_size);
2359
    }
2360
  } else {
2361
#ifdef ASSERT
2362
    warning("reversing conditional branch");
2363
#endif /* ASSERT */
2364
    Label skip;
2365
    jccb(reverse[cc], skip);
2366
    lea(rscratch, dst);
2367
    Assembler::jmp(rscratch);
2368
    bind(skip);
2369
  }
2370
}
2371

2372
void MacroAssembler::ldmxcsr(AddressLiteral src, Register rscratch) {
2373
  assert(rscratch != noreg || always_reachable(src), "missing");
2374

2375
  if (reachable(src)) {
2376
    Assembler::ldmxcsr(as_Address(src));
2377
  } else {
2378
    lea(rscratch, src);
2379
    Assembler::ldmxcsr(Address(rscratch, 0));
2380
  }
2381
}
2382

2383
int MacroAssembler::load_signed_byte(Register dst, Address src) {
2384
  int off;
2385
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2386
    off = offset();
2387
    movsbl(dst, src); // movsxb
2388
  } else {
2389
    off = load_unsigned_byte(dst, src);
2390
    shll(dst, 24);
2391
    sarl(dst, 24);
2392
  }
2393
  return off;
2394
}
2395

2396
// Note: load_signed_short used to be called load_signed_word.
2397
// Although the 'w' in x86 opcodes refers to the term "word" in the assembler
2398
// manual, which means 16 bits, that usage is found nowhere in HotSpot code.
2399
// The term "word" in HotSpot means a 32- or 64-bit machine word.
2400
int MacroAssembler::load_signed_short(Register dst, Address src) {
2401
  int off;
2402
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2403
    // This is dubious to me since it seems safe to do a signed 16 => 64 bit
2404
    // version but this is what 64bit has always done. This seems to imply
2405
    // that users are only using 32bits worth.
2406
    off = offset();
2407
    movswl(dst, src); // movsxw
2408
  } else {
2409
    off = load_unsigned_short(dst, src);
2410
    shll(dst, 16);
2411
    sarl(dst, 16);
2412
  }
2413
  return off;
2414
}
2415

2416
int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2417
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
2418
  // and "3.9 Partial Register Penalties", p. 22.
2419
  int off;
2420
  if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
2421
    off = offset();
2422
    movzbl(dst, src); // movzxb
2423
  } else {
2424
    xorl(dst, dst);
2425
    off = offset();
2426
    movb(dst, src);
2427
  }
2428
  return off;
2429
}
2430

2431
// Note: load_unsigned_short used to be called load_unsigned_word.
2432
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2433
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
2434
  // and "3.9 Partial Register Penalties", p. 22.
2435
  int off;
2436
  if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
2437
    off = offset();
2438
    movzwl(dst, src); // movzxw
2439
  } else {
2440
    xorl(dst, dst);
2441
    off = offset();
2442
    movw(dst, src);
2443
  }
2444
  return off;
2445
}
2446

2447
void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
2448
  switch (size_in_bytes) {
2449
#ifndef _LP64
2450
  case  8:
2451
    assert(dst2 != noreg, "second dest register required");
2452
    movl(dst,  src);
2453
    movl(dst2, src.plus_disp(BytesPerInt));
2454
    break;
2455
#else
2456
  case  8:  movq(dst, src); break;
2457
#endif
2458
  case  4:  movl(dst, src); break;
2459
  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2460
  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2461
  default:  ShouldNotReachHere();
2462
  }
2463
}
2464

2465
void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
2466
  switch (size_in_bytes) {
2467
#ifndef _LP64
2468
  case  8:
2469
    assert(src2 != noreg, "second source register required");
2470
    movl(dst,                        src);
2471
    movl(dst.plus_disp(BytesPerInt), src2);
2472
    break;
2473
#else
2474
  case  8:  movq(dst, src); break;
2475
#endif
2476
  case  4:  movl(dst, src); break;
2477
  case  2:  movw(dst, src); break;
2478
  case  1:  movb(dst, src); break;
2479
  default:  ShouldNotReachHere();
2480
  }
2481
}
2482

2483
void MacroAssembler::mov32(AddressLiteral dst, Register src, Register rscratch) {
2484
  assert(rscratch != noreg || always_reachable(dst), "missing");
2485

2486
  if (reachable(dst)) {
2487
    movl(as_Address(dst), src);
2488
  } else {
2489
    lea(rscratch, dst);
2490
    movl(Address(rscratch, 0), src);
2491
  }
2492
}
2493

2494
void MacroAssembler::mov32(Register dst, AddressLiteral src) {
2495
  if (reachable(src)) {
2496
    movl(dst, as_Address(src));
2497
  } else {
2498
    lea(dst, src);
2499
    movl(dst, Address(dst, 0));
2500
  }
2501
}
2502

2503
// C++ bool manipulation
2504

2505
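// sizeof(bool) is implementation defined, so these helpers pick the load and
// store width at compile time rather than hard-coding a byte access.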
void MacroAssembler::movbool(Register dst, Address src) {
2506
  if(sizeof(bool) == 1)
2507
    movb(dst, src);
2508
  else if(sizeof(bool) == 2)
2509
    movw(dst, src);
2510
  else if(sizeof(bool) == 4)
2511
    movl(dst, src);
2512
  else
2513
    // unsupported
2514
    ShouldNotReachHere();
2515
}
2516

2517
void MacroAssembler::movbool(Address dst, bool boolconst) {
2518
  if(sizeof(bool) == 1)
2519
    movb(dst, (int) boolconst);
2520
  else if(sizeof(bool) == 2)
2521
    movw(dst, (int) boolconst);
2522
  else if(sizeof(bool) == 4)
2523
    movl(dst, (int) boolconst);
2524
  else
2525
    // unsupported
2526
    ShouldNotReachHere();
2527
}
2528

2529
void MacroAssembler::movbool(Address dst, Register src) {
2530
  if(sizeof(bool) == 1)
2531
    movb(dst, src);
2532
  else if(sizeof(bool) == 2)
2533
    movw(dst, src);
2534
  else if(sizeof(bool) == 4)
2535
    movl(dst, src);
2536
  else
2537
    // unsupported
2538
    ShouldNotReachHere();
2539
}
2540

2541
void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src, Register rscratch) {
2542
  assert(rscratch != noreg || always_reachable(src), "missing");
2543

2544
  if (reachable(src)) {
2545
    movdl(dst, as_Address(src));
2546
  } else {
2547
    lea(rscratch, src);
2548
    movdl(dst, Address(rscratch, 0));
2549
  }
2550
}
2551

2552
void MacroAssembler::movq(XMMRegister dst, AddressLiteral src, Register rscratch) {
2553
  assert(rscratch != noreg || always_reachable(src), "missing");
2554

2555
  if (reachable(src)) {
2556
    movq(dst, as_Address(src));
2557
  } else {
2558
    lea(rscratch, src);
2559
    movq(dst, Address(rscratch, 0));
2560
  }
2561
}
2562

2563
void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src, Register rscratch) {
2564
  assert(rscratch != noreg || always_reachable(src), "missing");
2565

2566
  if (reachable(src)) {
2567
    if (UseXmmLoadAndClearUpper) {
2568
      movsd (dst, as_Address(src));
2569
    } else {
2570
      movlpd(dst, as_Address(src));
2571
    }
2572
  } else {
2573
    lea(rscratch, src);
2574
    if (UseXmmLoadAndClearUpper) {
2575
      movsd (dst, Address(rscratch, 0));
2576
    } else {
2577
      movlpd(dst, Address(rscratch, 0));
2578
    }
2579
  }
2580
}
2581

2582
void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src, Register rscratch) {
2583
  assert(rscratch != noreg || always_reachable(src), "missing");
2584

2585
  if (reachable(src)) {
2586
    movss(dst, as_Address(src));
2587
  } else {
2588
    lea(rscratch, src);
2589
    movss(dst, Address(rscratch, 0));
2590
  }
2591
}
2592

2593
void MacroAssembler::movptr(Register dst, Register src) {
2594
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2595
}
2596

2597
void MacroAssembler::movptr(Register dst, Address src) {
2598
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2599
}
2600

2601
// src should NEVER be a real pointer. Use AddressLiteral for true pointers
2602
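// Picks the shortest encoding for the constant: movl for values that fit an
// unsigned 32-bit immediate (the hardware zero-extends), a sign-extended
// 32-bit movq for simm32 values, and a full mov64 otherwise.  32-bit always
// uses movl.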
void MacroAssembler::movptr(Register dst, intptr_t src) {
2603
#ifdef _LP64
2604
  if (is_uimm32(src)) {
2605
    movl(dst, checked_cast<uint32_t>(src));
2606
  } else if (is_simm32(src)) {
2607
    movq(dst, checked_cast<int32_t>(src));
2608
  } else {
2609
    mov64(dst, src);
2610
  }
2611
#else
2612
  movl(dst, src);
2613
#endif
2614
}
2615

2616
void MacroAssembler::movptr(Address dst, Register src) {
2617
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2618
}
2619

2620
void MacroAssembler::movptr(Address dst, int32_t src) {
2621
  LP64_ONLY(movslq(dst, src)) NOT_LP64(movl(dst, src));
2622
}
2623

2624
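// The movdqu/vmovdqu wrappers below assert that xmm16-xmm31 are only used
// when AVX512VL is available, since the legacy/VEX encodings can address
// only xmm0-xmm15.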
void MacroAssembler::movdqu(Address dst, XMMRegister src) {
2625
  assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2626
  Assembler::movdqu(dst, src);
2627
}
2628

2629
void MacroAssembler::movdqu(XMMRegister dst, Address src) {
2630
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2631
  Assembler::movdqu(dst, src);
2632
}
2633

2634
void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
2635
  assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2636
  Assembler::movdqu(dst, src);
2637
}
2638

2639
void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register rscratch) {
2640
  assert(rscratch != noreg || always_reachable(src), "missing");
2641

2642
  if (reachable(src)) {
2643
    movdqu(dst, as_Address(src));
2644
  } else {
2645
    lea(rscratch, src);
2646
    movdqu(dst, Address(rscratch, 0));
2647
  }
2648
}
2649

2650
void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
2651
  assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2652
  Assembler::vmovdqu(dst, src);
2653
}
2654

2655
void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
2656
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2657
  Assembler::vmovdqu(dst, src);
2658
}
2659

2660
void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
2661
  assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2662
  Assembler::vmovdqu(dst, src);
2663
}
2664

2665
void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register rscratch) {
2666
  assert(rscratch != noreg || always_reachable(src), "missing");
2667

2668
  if (reachable(src)) {
2669
    vmovdqu(dst, as_Address(src));
2670
  }
2671
  else {
2672
    lea(rscratch, src);
2673
    vmovdqu(dst, Address(rscratch, 0));
2674
  }
2675
}
2676

2677
void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2678
  assert(rscratch != noreg || always_reachable(src), "missing");
2679

2680
  if (vector_len == AVX_512bit) {
2681
    evmovdquq(dst, src, AVX_512bit, rscratch);
2682
  } else if (vector_len == AVX_256bit) {
2683
    vmovdqu(dst, src, rscratch);
2684
  } else {
2685
    movdqu(dst, src, rscratch);
2686
  }
2687
}
2688

2689
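// kmov width depends on the CPU: with AVX512BW the opmask registers are
// 64 bits wide and kmovql is used; on plain EVEX targets only 16-bit mask
// moves (kmovwl) are guaranteed.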
void MacroAssembler::kmov(KRegister dst, Address src) {
2690
  if (VM_Version::supports_avx512bw()) {
2691
    kmovql(dst, src);
2692
  } else {
2693
    assert(VM_Version::supports_evex(), "");
2694
    kmovwl(dst, src);
2695
  }
2696
}
2697

2698
void MacroAssembler::kmov(Address dst, KRegister src) {
2699
  if (VM_Version::supports_avx512bw()) {
2700
    kmovql(dst, src);
2701
  } else {
2702
    assert(VM_Version::supports_evex(), "");
2703
    kmovwl(dst, src);
2704
  }
2705
}
2706

2707
void MacroAssembler::kmov(KRegister dst, KRegister src) {
2708
  if (VM_Version::supports_avx512bw()) {
2709
    kmovql(dst, src);
2710
  } else {
2711
    assert(VM_Version::supports_evex(), "");
2712
    kmovwl(dst, src);
2713
  }
2714
}
2715

2716
void MacroAssembler::kmov(Register dst, KRegister src) {
2717
  if (VM_Version::supports_avx512bw()) {
2718
    kmovql(dst, src);
2719
  } else {
2720
    assert(VM_Version::supports_evex(), "");
2721
    kmovwl(dst, src);
2722
  }
2723
}
2724

2725
void MacroAssembler::kmov(KRegister dst, Register src) {
2726
  if (VM_Version::supports_avx512bw()) {
2727
    kmovql(dst, src);
2728
  } else {
2729
    assert(VM_Version::supports_evex(), "");
2730
    kmovwl(dst, src);
2731
  }
2732
}
2733

2734
void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register rscratch) {
2735
  assert(rscratch != noreg || always_reachable(src), "missing");
2736

2737
  if (reachable(src)) {
2738
    kmovql(dst, as_Address(src));
2739
  } else {
2740
    lea(rscratch, src);
2741
    kmovql(dst, Address(rscratch, 0));
2742
  }
2743
}
2744

2745
void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register rscratch) {
2746
  assert(rscratch != noreg || always_reachable(src), "missing");
2747

2748
  if (reachable(src)) {
2749
    kmovwl(dst, as_Address(src));
2750
  } else {
2751
    lea(rscratch, src);
2752
    kmovwl(dst, Address(rscratch, 0));
2753
  }
2754
}
2755

2756
void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2757
                               int vector_len, Register rscratch) {
2758
  assert(rscratch != noreg || always_reachable(src), "missing");
2759

2760
  if (reachable(src)) {
2761
    Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
2762
  } else {
2763
    lea(rscratch, src);
2764
    Assembler::evmovdqub(dst, mask, Address(rscratch, 0), merge, vector_len);
2765
  }
2766
}
2767

2768
void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2769
                               int vector_len, Register rscratch) {
2770
  assert(rscratch != noreg || always_reachable(src), "missing");
2771

2772
  if (reachable(src)) {
2773
    Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
2774
  } else {
2775
    lea(rscratch, src);
2776
    Assembler::evmovdquw(dst, mask, Address(rscratch, 0), merge, vector_len);
2777
  }
2778
}
2779

2780
void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
2781
  assert(rscratch != noreg || always_reachable(src), "missing");
2782

2783
  if (reachable(src)) {
2784
    Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
2785
  } else {
2786
    lea(rscratch, src);
2787
    Assembler::evmovdqul(dst, mask, Address(rscratch, 0), merge, vector_len);
2788
  }
2789
}
2790

2791
void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
2792
  assert(rscratch != noreg || always_reachable(src), "missing");
2793

2794
  if (reachable(src)) {
2795
    Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
2796
  } else {
2797
    lea(rscratch, src);
2798
    Assembler::evmovdquq(dst, mask, Address(rscratch, 0), merge, vector_len);
2799
  }
2800
}
2801

2802
void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2803
  assert(rscratch != noreg || always_reachable(src), "missing");
2804

2805
  if (reachable(src)) {
2806
    Assembler::evmovdquq(dst, as_Address(src), vector_len);
2807
  } else {
2808
    lea(rscratch, src);
2809
    Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
2810
  }
2811
}
2812

2813
void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src, Register rscratch) {
2814
  assert(rscratch != noreg || always_reachable(src), "missing");
2815

2816
  if (reachable(src)) {
2817
    Assembler::movdqa(dst, as_Address(src));
2818
  } else {
2819
    lea(rscratch, src);
2820
    Assembler::movdqa(dst, Address(rscratch, 0));
2821
  }
2822
}
2823

2824
void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2825
  assert(rscratch != noreg || always_reachable(src), "missing");
2826

2827
  if (reachable(src)) {
2828
    Assembler::movsd(dst, as_Address(src));
2829
  } else {
2830
    lea(rscratch, src);
2831
    Assembler::movsd(dst, Address(rscratch, 0));
2832
  }
2833
}
2834

2835
void MacroAssembler::movss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2836
  assert(rscratch != noreg || always_reachable(src), "missing");
2837

2838
  if (reachable(src)) {
2839
    Assembler::movss(dst, as_Address(src));
2840
  } else {
2841
    lea(rscratch, src);
2842
    Assembler::movss(dst, Address(rscratch, 0));
2843
  }
2844
}
2845

2846
void MacroAssembler::movddup(XMMRegister dst, AddressLiteral src, Register rscratch) {
2847
  assert(rscratch != noreg || always_reachable(src), "missing");
2848

2849
  if (reachable(src)) {
2850
    Assembler::movddup(dst, as_Address(src));
2851
  } else {
2852
    lea(rscratch, src);
2853
    Assembler::movddup(dst, Address(rscratch, 0));
2854
  }
2855
}
2856

2857
void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2858
  assert(rscratch != noreg || always_reachable(src), "missing");
2859

2860
  if (reachable(src)) {
2861
    Assembler::vmovddup(dst, as_Address(src), vector_len);
2862
  } else {
2863
    lea(rscratch, src);
2864
    Assembler::vmovddup(dst, Address(rscratch, 0), vector_len);
2865
  }
2866
}
2867

2868
void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
2869
  assert(rscratch != noreg || always_reachable(src), "missing");
2870

2871
  if (reachable(src)) {
2872
    Assembler::mulsd(dst, as_Address(src));
2873
  } else {
2874
    lea(rscratch, src);
2875
    Assembler::mulsd(dst, Address(rscratch, 0));
2876
  }
2877
}
2878

2879
void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src, Register rscratch) {
2880
  assert(rscratch != noreg || always_reachable(src), "missing");
2881

2882
  if (reachable(src)) {
2883
    Assembler::mulss(dst, as_Address(src));
2884
  } else {
2885
    lea(rscratch, src);
2886
    Assembler::mulss(dst, Address(rscratch, 0));
2887
  }
2888
}
2889

2890
void MacroAssembler::null_check(Register reg, int offset) {
2891
  if (needs_explicit_null_check(offset)) {
2892
    // provoke OS null exception if reg is null by
2893
    // accessing M[reg] w/o changing any (non-CC) registers
2894
    // NOTE: cmpl is plenty here to provoke a segv
2895
    cmpptr(rax, Address(reg, 0));
2896
    // Note: should probably use testl(rax, Address(reg, 0));
2897
    //       may be shorter code (however, this version of
2898
    //       testl needs to be implemented first)
2899
  } else {
2900
    // nothing to do, (later) access of M[reg + offset]
2901
    // will provoke OS null exception if reg is null
2902
  }
2903
}
2904

2905
void MacroAssembler::os_breakpoint() {
2906
  // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
2907
  // (e.g., MSVC can't call ps() otherwise)
2908
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
2909
}
2910

2911
void MacroAssembler::unimplemented(const char* what) {
2912
  const char* buf = nullptr;
2913
  {
2914
    ResourceMark rm;
2915
    stringStream ss;
2916
    ss.print("unimplemented: %s", what);
2917
    buf = code_string(ss.as_string());
2918
  }
2919
  stop(buf);
2920
}
2921

2922
#ifdef _LP64
2923
#define XSTATE_BV 0x200
2924
#endif
2925

2926
void MacroAssembler::pop_CPU_state() {
2927
  pop_FPU_state();
2928
  pop_IU_state();
2929
}
2930

2931
void MacroAssembler::pop_FPU_state() {
2932
#ifndef _LP64
2933
  frstor(Address(rsp, 0));
2934
#else
2935
  fxrstor(Address(rsp, 0));
2936
#endif
2937
  addptr(rsp, FPUStateSizeInWords * wordSize);
2938
}
2939

2940
void MacroAssembler::pop_IU_state() {
2941
  popa();
2942
  LP64_ONLY(addq(rsp, 8));
2943
  popf();
2944
}
2945

2946
// Save Integer and Float state
2947
// Warning: Stack must be 16 byte aligned (64bit)
2948
void MacroAssembler::push_CPU_state() {
2949
  push_IU_state();
2950
  push_FPU_state();
2951
}
2952

2953
void MacroAssembler::push_FPU_state() {
2954
  subptr(rsp, FPUStateSizeInWords * wordSize);
2955
#ifndef _LP64
2956
  fnsave(Address(rsp, 0));
2957
  fwait();
2958
#else
2959
  fxsave(Address(rsp, 0));
2960
#endif // LP64
2961
}
2962

2963
void MacroAssembler::push_IU_state() {
2964
  // Push flags first because pusha kills them
2965
  pushf();
2966
  // Make sure rsp stays 16-byte aligned
2967
  LP64_ONLY(subq(rsp, 8));
2968
  pusha();
2969
}
2970

2971
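// push_cont_fastpath / pop_cont_fastpath maintain JavaThread::cont_fastpath
// (only when continuations are enabled): the push variant records the current
// sp if it is above the stored value, and the pop variant clears the field
// once the current sp is at or above it.  On 32-bit the thread and a copy of
// the original sp first have to be materialized in scratch registers.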
void MacroAssembler::push_cont_fastpath() {
2972
  if (!Continuations::enabled()) return;
2973

2974
#ifndef _LP64
2975
  Register rthread = rax;
2976
  Register rrealsp = rbx;
2977
  push(rthread);
2978
  push(rrealsp);
2979

2980
  get_thread(rthread);
2981

2982
  // The code below wants the original RSP.
2983
  // Move it back after the pushes above.
2984
  movptr(rrealsp, rsp);
2985
  addptr(rrealsp, 2*wordSize);
2986
#else
2987
  Register rthread = r15_thread;
2988
  Register rrealsp = rsp;
2989
#endif
2990

2991
  Label done;
2992
  cmpptr(rrealsp, Address(rthread, JavaThread::cont_fastpath_offset()));
2993
  jccb(Assembler::belowEqual, done);
2994
  movptr(Address(rthread, JavaThread::cont_fastpath_offset()), rrealsp);
2995
  bind(done);
2996

2997
#ifndef _LP64
2998
  pop(rrealsp);
2999
  pop(rthread);
3000
#endif
3001
}
3002

3003
void MacroAssembler::pop_cont_fastpath() {
3004
  if (!Continuations::enabled()) return;
3005

3006
#ifndef _LP64
3007
  Register rthread = rax;
3008
  Register rrealsp = rbx;
3009
  push(rthread);
3010
  push(rrealsp);
3011

3012
  get_thread(rthread);
3013

3014
  // The code below wants the original RSP.
3015
  // Move it back after the pushes above.
3016
  movptr(rrealsp, rsp);
3017
  addptr(rrealsp, 2*wordSize);
3018
#else
3019
  Register rthread = r15_thread;
3020
  Register rrealsp = rsp;
3021
#endif
3022

3023
  Label done;
3024
  cmpptr(rrealsp, Address(rthread, JavaThread::cont_fastpath_offset()));
3025
  jccb(Assembler::below, done);
3026
  movptr(Address(rthread, JavaThread::cont_fastpath_offset()), 0);
3027
  bind(done);
3028

3029
#ifndef _LP64
3030
  pop(rrealsp);
3031
  pop(rthread);
3032
#endif
3033
}
3034

3035
void MacroAssembler::inc_held_monitor_count() {
3036
#ifndef _LP64
3037
  Register thread = rax;
3038
  push(thread);
3039
  get_thread(thread);
3040
  incrementl(Address(thread, JavaThread::held_monitor_count_offset()));
3041
  pop(thread);
3042
#else // LP64
3043
  incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
3044
#endif
3045
}
3046

3047
void MacroAssembler::dec_held_monitor_count() {
3048
#ifndef _LP64
3049
  Register thread = rax;
3050
  push(thread);
3051
  get_thread(thread);
3052
  decrementl(Address(thread, JavaThread::held_monitor_count_offset()));
3053
  pop(thread);
3054
#else // LP64
3055
  decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
3056
#endif
3057
}
3058

3059
#ifdef ASSERT
3060
void MacroAssembler::stop_if_in_cont(Register cont, const char* name) {
3061
#ifdef _LP64
3062
  Label no_cont;
3063
  movptr(cont, Address(r15_thread, JavaThread::cont_entry_offset()));
3064
  testl(cont, cont);
3065
  jcc(Assembler::zero, no_cont);
3066
  stop(name);
3067
  bind(no_cont);
3068
#else
3069
  Unimplemented();
3070
#endif
3071
}
3072
#endif
3073

3074
void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) { // determine java_thread register
3075
  if (!java_thread->is_valid()) {
3076
    java_thread = rdi;
3077
    get_thread(java_thread);
3078
  }
3079
  // we must set sp to zero to clear frame
3080
  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3081
  // must clear fp, so that compiled frames are not confused; it is
3082
  // possible that we need it only for debugging
3083
  if (clear_fp) {
3084
    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3085
  }
3086
  // Always clear the pc because it could have been set by make_walkable()
3087
  movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3088
  vzeroupper();
3089
}
3090

3091
void MacroAssembler::restore_rax(Register tmp) {
3092
  if (tmp == noreg) pop(rax);
3093
  else if (tmp != rax) mov(rax, tmp);
3094
}
3095

3096
void MacroAssembler::round_to(Register reg, int modulus) {
3097
  addptr(reg, modulus - 1);
3098
  andptr(reg, -modulus);
3099
}
3100

3101
void MacroAssembler::save_rax(Register tmp) {
3102
  if (tmp == noreg) push(rax);
3103
  else if (tmp != rax) mov(tmp, rax);
3104
}
3105

3106
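// Safepoint/handshake poll.  At a return the stack pointer (or rbp when not
// in an nmethod) is compared against the thread's polling word, which also
// serves as the stack watermark; otherwise only the poll bit in the polling
// word is tested.  Either way, a hit branches to slow_path.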
void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod) {
3107
  if (at_return) {
3108
    // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
3109
    // we may safely use rsp instead to perform the stack watermark check.
3110
    cmpptr(in_nmethod ? rsp : rbp, Address(thread_reg, JavaThread::polling_word_offset()));
3111
    jcc(Assembler::above, slow_path);
3112
    return;
3113
  }
3114
  testb(Address(thread_reg, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit());
3115
  jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
3116
}
3117

3118
// Calls to C land
3119
//
3120
// When entering C land, the rbp, & rsp of the last Java frame have to be recorded
3121
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3122
// has to be reset to 0. This is required to allow proper stack traversal.
3123
void MacroAssembler::set_last_Java_frame(Register java_thread,
3124
                                         Register last_java_sp,
3125
                                         Register last_java_fp,
3126
                                         address  last_java_pc,
3127
                                         Register rscratch) {
3128
  vzeroupper();
3129
  // determine java_thread register
3130
  if (!java_thread->is_valid()) {
3131
    java_thread = rdi;
3132
    get_thread(java_thread);
3133
  }
3134
  // determine last_java_sp register
3135
  if (!last_java_sp->is_valid()) {
3136
    last_java_sp = rsp;
3137
  }
3138
  // last_java_fp is optional
3139
  if (last_java_fp->is_valid()) {
3140
    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3141
  }
3142
  // last_java_pc is optional
3143
  if (last_java_pc != nullptr) {
3144
    Address java_pc(java_thread,
3145
                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
3146
    lea(java_pc, InternalAddress(last_java_pc), rscratch);
3147
  }
3148
  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3149
}
3150

3151
void MacroAssembler::shlptr(Register dst, int imm8) {
3152
  LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3153
}
3154

3155
void MacroAssembler::shrptr(Register dst, int imm8) {
3156
  LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3157
}
3158

3159
void MacroAssembler::sign_extend_byte(Register reg) {
3160
  if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3161
    movsbl(reg, reg); // movsxb
3162
  } else {
3163
    shll(reg, 24);
3164
    sarl(reg, 24);
3165
  }
3166
}
3167

3168
void MacroAssembler::sign_extend_short(Register reg) {
3169
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3170
    movswl(reg, reg); // movsxw
3171
  } else {
3172
    shll(reg, 16);
3173
    sarl(reg, 16);
3174
  }
3175
}
3176

3177
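// For small non-negative immediates the 32-bit test can be narrowed to a
// byte-sized testb: with a mask in [0, 127] only the low byte participates,
// so ZF and SF come out the same while the encoding is shorter.  The register
// form additionally requires a byte-addressable register.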
void MacroAssembler::testl(Address dst, int32_t imm32) {
3178
  if (imm32 >= 0 && is8bit(imm32)) {
3179
    testb(dst, imm32);
3180
  } else {
3181
    Assembler::testl(dst, imm32);
3182
  }
3183
}
3184

3185
void MacroAssembler::testl(Register dst, int32_t imm32) {
3186
  if (imm32 >= 0 && is8bit(imm32) && dst->has_byte_register()) {
3187
    testb(dst, imm32);
3188
  } else {
3189
    Assembler::testl(dst, imm32);
3190
  }
3191
}
3192

3193
void MacroAssembler::testl(Register dst, AddressLiteral src) {
3194
  assert(always_reachable(src), "Address should be reachable");
3195
  testl(dst, as_Address(src));
3196
}
3197

3198
#ifdef _LP64
3199

3200
void MacroAssembler::testq(Address dst, int32_t imm32) {
3201
  if (imm32 >= 0) {
3202
    testl(dst, imm32);
3203
  } else {
3204
    Assembler::testq(dst, imm32);
3205
  }
3206
}
3207

3208
void MacroAssembler::testq(Register dst, int32_t imm32) {
3209
  if (imm32 >= 0) {
3210
    testl(dst, imm32);
3211
  } else {
3212
    Assembler::testq(dst, imm32);
3213
  }
3214
}
3215

3216
#endif
3217

3218
void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3219
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3220
  Assembler::pcmpeqb(dst, src);
3221
}
3222

3223
void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3224
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3225
  Assembler::pcmpeqw(dst, src);
3226
}
3227

3228
void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3229
  assert((dst->encoding() < 16),"XMM register should be 0-15");
3230
  Assembler::pcmpestri(dst, src, imm8);
3231
}
3232

3233
void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3234
  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3235
  Assembler::pcmpestri(dst, src, imm8);
3236
}
3237

3238
void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3239
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3240
  Assembler::pmovzxbw(dst, src);
3241
}
3242

3243
void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3244
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3245
  Assembler::pmovzxbw(dst, src);
3246
}
3247

3248
void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3249
  assert((src->encoding() < 16),"XMM register should be 0-15");
3250
  Assembler::pmovmskb(dst, src);
3251
}
3252

3253
void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3254
  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3255
  Assembler::ptest(dst, src);
3256
}
3257

3258
void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src, Register rscratch) {
3259
  assert(rscratch != noreg || always_reachable(src), "missing");
3260

3261
  if (reachable(src)) {
3262
    Assembler::sqrtss(dst, as_Address(src));
3263
  } else {
3264
    lea(rscratch, src);
3265
    Assembler::sqrtss(dst, Address(rscratch, 0));
3266
  }
3267
}
3268

3269
void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
3270
  assert(rscratch != noreg || always_reachable(src), "missing");
3271

3272
  if (reachable(src)) {
3273
    Assembler::subsd(dst, as_Address(src));
3274
  } else {
3275
    lea(rscratch, src);
3276
    Assembler::subsd(dst, Address(rscratch, 0));
3277
  }
3278
}
3279

3280
void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register rscratch) {
3281
  assert(rscratch != noreg || always_reachable(src), "missing");
3282

3283
  if (reachable(src)) {
3284
    Assembler::roundsd(dst, as_Address(src), rmode);
3285
  } else {
3286
    lea(rscratch, src);
3287
    Assembler::roundsd(dst, Address(rscratch, 0), rmode);
3288
  }
3289
}
3290

3291
void MacroAssembler::subss(XMMRegister dst, AddressLiteral src, Register rscratch) {
3292
  assert(rscratch != noreg || always_reachable(src), "missing");
3293

3294
  if (reachable(src)) {
3295
    Assembler::subss(dst, as_Address(src));
3296
  } else {
3297
    lea(rscratch, src);
3298
    Assembler::subss(dst, Address(rscratch, 0));
3299
  }
3300
}
3301

3302
void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src, Register rscratch) {
3303
  assert(rscratch != noreg || always_reachable(src), "missing");
3304

3305
  if (reachable(src)) {
3306
    Assembler::ucomisd(dst, as_Address(src));
3307
  } else {
3308
    lea(rscratch, src);
3309
    Assembler::ucomisd(dst, Address(rscratch, 0));
3310
  }
3311
}
3312

3313
void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src, Register rscratch) {
3314
  assert(rscratch != noreg || always_reachable(src), "missing");
3315

3316
  if (reachable(src)) {
3317
    Assembler::ucomiss(dst, as_Address(src));
3318
  } else {
3319
    lea(rscratch, src);
3320
    Assembler::ucomiss(dst, Address(rscratch, 0));
3321
  }
3322
}
3323

3324
void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
3325
  assert(rscratch != noreg || always_reachable(src), "missing");
3326

3327
  // Used in sign-bit flipping with aligned address.
3328
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3329
  if (reachable(src)) {
3330
    Assembler::xorpd(dst, as_Address(src));
3331
  } else {
3332
    lea(rscratch, src);
3333
    Assembler::xorpd(dst, Address(rscratch, 0));
3334
  }
3335
}
3336

3337
void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
3338
  if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3339
    Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3340
  }
3341
  else {
3342
    Assembler::xorpd(dst, src);
3343
  }
3344
}
3345

3346
void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3347
  if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3348
    Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3349
  } else {
3350
    Assembler::xorps(dst, src);
3351
  }
3352
}
3353

3354
void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register rscratch) {
3355
  assert(rscratch != noreg || always_reachable(src), "missing");
3356

3357
  // Used in sign-bit flipping with aligned address.
3358
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3359
  if (reachable(src)) {
3360
    Assembler::xorps(dst, as_Address(src));
3361
  } else {
3362
    lea(rscratch, src);
3363
    Assembler::xorps(dst, Address(rscratch, 0));
3364
  }
3365
}
3366

3367
void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src, Register rscratch) {
3368
  assert(rscratch != noreg || always_reachable(src), "missing");
3369

3370
  // Used in sign-bit flipping with aligned address.
3371
  bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3372
  assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3373
  if (reachable(src)) {
3374
    Assembler::pshufb(dst, as_Address(src));
3375
  } else {
3376
    lea(rscratch, src);
3377
    Assembler::pshufb(dst, Address(rscratch, 0));
3378
  }
3379
}
3380

3381
// AVX 3-operands instructions
3382

3383
void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3384
  assert(rscratch != noreg || always_reachable(src), "missing");
3385

3386
  if (reachable(src)) {
3387
    vaddsd(dst, nds, as_Address(src));
3388
  } else {
3389
    lea(rscratch, src);
3390
    vaddsd(dst, nds, Address(rscratch, 0));
3391
  }
3392
}
3393

3394
void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3395
  assert(rscratch != noreg || always_reachable(src), "missing");
3396

3397
  if (reachable(src)) {
3398
    vaddss(dst, nds, as_Address(src));
3399
  } else {
3400
    lea(rscratch, src);
3401
    vaddss(dst, nds, Address(rscratch, 0));
3402
  }
3403
}
3404

3405
void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3406
  assert(UseAVX > 0, "requires some form of AVX");
3407
  assert(rscratch != noreg || always_reachable(src), "missing");
3408

3409
  if (reachable(src)) {
3410
    Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
3411
  } else {
3412
    lea(rscratch, src);
3413
    Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
3414
  }
3415
}
3416

3417
void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3418
  assert(UseAVX > 0, "requires some form of AVX");
3419
  assert(rscratch != noreg || always_reachable(src), "missing");
3420

3421
  if (reachable(src)) {
3422
    Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
3423
  } else {
3424
    lea(rscratch, src);
3425
    Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
3426
  }
3427
}
3428

3429
void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) {
3430
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3431
  assert(rscratch != noreg || always_reachable(negate_field), "missing");
3432

3433
  vandps(dst, nds, negate_field, vector_len, rscratch);
3434
}
3435

3436
void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) {
3437
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3438
  assert(rscratch != noreg || always_reachable(negate_field), "missing");
3439

3440
  vandpd(dst, nds, negate_field, vector_len, rscratch);
3441
}
3442

3443
void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3444
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3445
  Assembler::vpaddb(dst, nds, src, vector_len);
3446
}
3447

3448
void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3449
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3450
  Assembler::vpaddb(dst, nds, src, vector_len);
3451
}
3452

3453
void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3454
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3455
  Assembler::vpaddw(dst, nds, src, vector_len);
3456
}
3457

3458
void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3459
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3460
  Assembler::vpaddw(dst, nds, src, vector_len);
3461
}
3462

3463
void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3464
  assert(rscratch != noreg || always_reachable(src), "missing");
3465

3466
  if (reachable(src)) {
3467
    Assembler::vpand(dst, nds, as_Address(src), vector_len);
3468
  } else {
3469
    lea(rscratch, src);
3470
    Assembler::vpand(dst, nds, Address(rscratch, 0), vector_len);
3471
  }
3472
}
3473

3474
void MacroAssembler::vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3475
  assert(rscratch != noreg || always_reachable(src), "missing");
3476

3477
  if (reachable(src)) {
3478
    Assembler::vpbroadcastd(dst, as_Address(src), vector_len);
3479
  } else {
3480
    lea(rscratch, src);
3481
    Assembler::vpbroadcastd(dst, Address(rscratch, 0), vector_len);
3482
  }
3483
}
3484

3485
void MacroAssembler::vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3486
  assert(rscratch != noreg || always_reachable(src), "missing");
3487

3488
  if (reachable(src)) {
3489
    Assembler::vpbroadcastq(dst, as_Address(src), vector_len);
3490
  } else {
3491
    lea(rscratch, src);
3492
    Assembler::vpbroadcastq(dst, Address(rscratch, 0), vector_len);
3493
  }
3494
}
3495

3496
void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3497
  assert(rscratch != noreg || always_reachable(src), "missing");
3498

3499
  if (reachable(src)) {
3500
    Assembler::vbroadcastsd(dst, as_Address(src), vector_len);
3501
  } else {
3502
    lea(rscratch, src);
3503
    Assembler::vbroadcastsd(dst, Address(rscratch, 0), vector_len);
3504
  }
3505
}
3506

3507
void MacroAssembler::vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3508
  assert(rscratch != noreg || always_reachable(src), "missing");
3509

3510
  if (reachable(src)) {
3511
    Assembler::vbroadcastss(dst, as_Address(src), vector_len);
3512
  } else {
3513
    lea(rscratch, src);
3514
    Assembler::vbroadcastss(dst, Address(rscratch, 0), vector_len);
3515
  }
3516
}
3517

3518
// Vector float blend
3519
// vblendvps(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg)
3520
void MacroAssembler::vblendvps(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) {
3521
  // WARN: Allow dst == (src1|src2), mask == scratch
3522
  bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1;
3523
  bool scratch_available = scratch != xnoreg && scratch != src1 && scratch != src2 && scratch != dst;
3524
  bool dst_available = dst != mask && (dst != src1 || dst != src2);
3525
  if (blend_emulation && scratch_available && dst_available) {
3526
    if (compute_mask) {
3527
      vpsrad(scratch, mask, 32, vector_len);
3528
      mask = scratch;
3529
    }
3530
    if (dst == src1) {
3531
      vpandn(dst,     mask, src1, vector_len); // if mask == 0, src1
3532
      vpand (scratch, mask, src2, vector_len); // if mask == 1, src2
3533
    } else {
3534
      vpand (dst,     mask, src2, vector_len); // if mask == 1, src2
3535
      vpandn(scratch, mask, src1, vector_len); // if mask == 0, src1
3536
    }
3537
    vpor(dst, dst, scratch, vector_len);
3538
  } else {
3539
    Assembler::vblendvps(dst, src1, src2, mask, vector_len);
3540
  }
3541
}
3542

3543
// vblendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg)
3544
void MacroAssembler::vblendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) {
3545
  // WARN: Allow dst == (src1|src2), mask == scratch
3546
  bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1;
3547
  bool scratch_available = scratch != xnoreg && scratch != src1 && scratch != src2 && scratch != dst && (!compute_mask || scratch != mask);
3548
  bool dst_available = dst != mask && (dst != src1 || dst != src2);
3549
  if (blend_emulation && scratch_available && dst_available) {
3550
    if (compute_mask) {
3551
      vpxor(scratch, scratch, scratch, vector_len);
3552
      vpcmpgtq(scratch, scratch, mask, vector_len);
3553
      mask = scratch;
3554
    }
3555
    if (dst == src1) {
3556
      vpandn(dst,     mask, src1, vector_len); // if mask == 0, src
3557
      vpand (scratch, mask, src2, vector_len); // if mask == 1, src2
3558
    } else {
3559
      vpand (dst,     mask, src2, vector_len); // if mask == 1, src2
3560
      vpandn(scratch, mask, src1, vector_len); // if mask == 0, src
3561
    }
3562
    vpor(dst, dst, scratch, vector_len);
3563
  } else {
3564
    Assembler::vblendvpd(dst, src1, src2, mask, vector_len);
3565
  }
3566
}
3567

3568
void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3569
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3570
  Assembler::vpcmpeqb(dst, nds, src, vector_len);
3571
}
3572

3573
void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister src1, Address src2, int vector_len) {
3574
  assert(((dst->encoding() < 16 && src1->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3575
  Assembler::vpcmpeqb(dst, src1, src2, vector_len);
3576
}
3577

3578
void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3579
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3580
  Assembler::vpcmpeqw(dst, nds, src, vector_len);
3581
}
3582

3583
void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3584
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3585
  Assembler::vpcmpeqw(dst, nds, src, vector_len);
3586
}
3587

3588
void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3589
  assert(rscratch != noreg || always_reachable(src), "missing");
3590

3591
  if (reachable(src)) {
3592
    Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3593
  } else {
3594
    lea(rscratch, src);
3595
    Assembler::evpcmpeqd(kdst, mask, nds, Address(rscratch, 0), vector_len);
3596
  }
3597
}
3598

3599
void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3600
                             int comparison, bool is_signed, int vector_len, Register rscratch) {
3601
  assert(rscratch != noreg || always_reachable(src), "missing");
3602

3603
  if (reachable(src)) {
3604
    Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3605
  } else {
3606
    lea(rscratch, src);
3607
    Assembler::evpcmpd(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3608
  }
3609
}
3610

3611
void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3612
                             int comparison, bool is_signed, int vector_len, Register rscratch) {
3613
  assert(rscratch != noreg || always_reachable(src), "missing");
3614

3615
  if (reachable(src)) {
3616
    Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3617
  } else {
3618
    lea(rscratch, src);
3619
    Assembler::evpcmpq(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3620
  }
3621
}
3622

3623
void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3624
                             int comparison, bool is_signed, int vector_len, Register rscratch) {
3625
  assert(rscratch != noreg || always_reachable(src), "missing");
3626

3627
  if (reachable(src)) {
3628
    Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3629
  } else {
3630
    lea(rscratch, src);
3631
    Assembler::evpcmpb(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3632
  }
3633
}
3634

3635
void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3636
                             int comparison, bool is_signed, int vector_len, Register rscratch) {
3637
  assert(rscratch != noreg || always_reachable(src), "missing");
3638

3639
  if (reachable(src)) {
3640
    Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len);
3641
  } else {
3642
    lea(rscratch, src);
3643
    Assembler::evpcmpw(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len);
3644
  }
3645
}
3646

3647
void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3648
  if (width == Assembler::Q) {
3649
    Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3650
  } else {
3651
    Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3652
  }
3653
}
3654

3655
void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len) {
3656
  int eq_cond_enc = 0x29;
3657
  int gt_cond_enc = 0x37;
3658
  if (width != Assembler::Q) {
3659
    eq_cond_enc = 0x74 + width;
3660
    gt_cond_enc = 0x64 + width;
3661
  }
3662
  switch (cond) {
3663
  case eq:
3664
    vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3665
    break;
3666
  case neq:
3667
    vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3668
    vallones(xtmp, vector_len);
3669
    vpxor(dst, xtmp, dst, vector_len);
3670
    break;
3671
  case le:
3672
    vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3673
    vallones(xtmp, vector_len);
3674
    vpxor(dst, xtmp, dst, vector_len);
3675
    break;
3676
  case nlt:
3677
    vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3678
    vallones(xtmp, vector_len);
3679
    vpxor(dst, xtmp, dst, vector_len);
3680
    break;
3681
  case lt:
3682
    vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3683
    break;
3684
  case nle:
3685
    vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3686
    break;
3687
  default:
3688
    assert(false, "Should not reach here");
3689
  }
3690
}
3691

3692
void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3693
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3694
  Assembler::vpmovzxbw(dst, src, vector_len);
3695
}
3696

3697
void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) {
3698
  assert((src->encoding() < 16),"XMM register should be 0-15");
3699
  Assembler::vpmovmskb(dst, src, vector_len);
3700
}
3701

3702
void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3703
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3704
  Assembler::vpmullw(dst, nds, src, vector_len);
3705
}
3706

3707
void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3708
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3709
  Assembler::vpmullw(dst, nds, src, vector_len);
3710
}
3711

3712
void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3713
  assert((UseAVX > 0), "AVX support is needed");
3714
  assert(rscratch != noreg || always_reachable(src), "missing");
3715

3716
  if (reachable(src)) {
3717
    Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
3718
  } else {
3719
    lea(rscratch, src);
3720
    Assembler::vpmulld(dst, nds, Address(rscratch, 0), vector_len);
3721
  }
3722
}
3723

3724
void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3725
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3726
  Assembler::vpsubb(dst, nds, src, vector_len);
3727
}
3728

3729
void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3730
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3731
  Assembler::vpsubb(dst, nds, src, vector_len);
3732
}
3733

3734
void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3735
  assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3736
  Assembler::vpsubw(dst, nds, src, vector_len);
3737
}
3738

3739
void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3740
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3741
  Assembler::vpsubw(dst, nds, src, vector_len);
3742
}
3743

3744
void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3745
  assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3746
  Assembler::vpsraw(dst, nds, shift, vector_len);
3747
}
3748

3749
void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3750
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3751
  Assembler::vpsraw(dst, nds, shift, vector_len);
3752
}
3753

3754
void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3755
  assert(UseAVX > 2,"");
3756
  if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3757
     vector_len = 2;
3758
  }
3759
  Assembler::evpsraq(dst, nds, shift, vector_len);
3760
}
3761

3762
void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3763
  assert(UseAVX > 2,"");
3764
  if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3765
     vector_len = 2;
3766
  }
3767
  Assembler::evpsraq(dst, nds, shift, vector_len);
3768
}
3769

3770
void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3771
  assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3772
  Assembler::vpsrlw(dst, nds, shift, vector_len);
3773
}
3774

3775
void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3776
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3777
  Assembler::vpsrlw(dst, nds, shift, vector_len);
3778
}
3779

3780
void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3781
  assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3782
  Assembler::vpsllw(dst, nds, shift, vector_len);
3783
}
3784

3785
void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3786
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3787
  Assembler::vpsllw(dst, nds, shift, vector_len);
3788
}
3789

3790
void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3791
  assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3792
  Assembler::vptest(dst, src);
3793
}
3794

3795
void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3796
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3797
  Assembler::punpcklbw(dst, src);
3798
}
3799

3800
void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3801
  assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3802
  Assembler::pshufd(dst, src, mode);
3803
}
3804

3805
void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3806
  assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3807
  Assembler::pshuflw(dst, src, mode);
3808
}
3809

3810
void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3811
  assert(rscratch != noreg || always_reachable(src), "missing");
3812

3813
  if (reachable(src)) {
3814
    vandpd(dst, nds, as_Address(src), vector_len);
3815
  } else {
3816
    lea(rscratch, src);
3817
    vandpd(dst, nds, Address(rscratch, 0), vector_len);
3818
  }
3819
}
3820

3821
void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3822
  assert(rscratch != noreg || always_reachable(src), "missing");
3823

3824
  if (reachable(src)) {
3825
    vandps(dst, nds, as_Address(src), vector_len);
3826
  } else {
3827
    lea(rscratch, src);
3828
    vandps(dst, nds, Address(rscratch, 0), vector_len);
3829
  }
3830
}
3831

3832
void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3833
                            bool merge, int vector_len, Register rscratch) {
3834
  assert(rscratch != noreg || always_reachable(src), "missing");
3835

3836
  if (reachable(src)) {
3837
    Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3838
  } else {
3839
    lea(rscratch, src);
3840
    Assembler::evpord(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
3841
  }
3842
}
3843

3844
void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3845
  assert(rscratch != noreg || always_reachable(src), "missing");
3846

3847
  if (reachable(src)) {
3848
    vdivsd(dst, nds, as_Address(src));
3849
  } else {
3850
    lea(rscratch, src);
3851
    vdivsd(dst, nds, Address(rscratch, 0));
3852
  }
3853
}
3854

3855
void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3856
  assert(rscratch != noreg || always_reachable(src), "missing");
3857

3858
  if (reachable(src)) {
3859
    vdivss(dst, nds, as_Address(src));
3860
  } else {
3861
    lea(rscratch, src);
3862
    vdivss(dst, nds, Address(rscratch, 0));
3863
  }
3864
}
3865

3866
void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3867
  assert(rscratch != noreg || always_reachable(src), "missing");
3868

3869
  if (reachable(src)) {
3870
    vmulsd(dst, nds, as_Address(src));
3871
  } else {
3872
    lea(rscratch, src);
3873
    vmulsd(dst, nds, Address(rscratch, 0));
3874
  }
3875
}
3876

3877
void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3878
  assert(rscratch != noreg || always_reachable(src), "missing");
3879

3880
  if (reachable(src)) {
3881
    vmulss(dst, nds, as_Address(src));
3882
  } else {
3883
    lea(rscratch, src);
3884
    vmulss(dst, nds, Address(rscratch, 0));
3885
  }
3886
}
3887

3888
void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3889
  assert(rscratch != noreg || always_reachable(src), "missing");
3890

3891
  if (reachable(src)) {
3892
    vsubsd(dst, nds, as_Address(src));
3893
  } else {
3894
    lea(rscratch, src);
3895
    vsubsd(dst, nds, Address(rscratch, 0));
3896
  }
3897
}
3898

3899
void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3900
  assert(rscratch != noreg || always_reachable(src), "missing");
3901

3902
  if (reachable(src)) {
3903
    vsubss(dst, nds, as_Address(src));
3904
  } else {
3905
    lea(rscratch, src);
3906
    vsubss(dst, nds, Address(rscratch, 0));
3907
  }
3908
}
3909

3910
void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3911
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3912
  assert(rscratch != noreg || always_reachable(src), "missing");
3913

3914
  vxorps(dst, nds, src, Assembler::AVX_128bit, rscratch);
3915
}
3916

3917
void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) {
3918
  assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3919
  assert(rscratch != noreg || always_reachable(src), "missing");
3920

3921
  vxorpd(dst, nds, src, Assembler::AVX_128bit, rscratch);
3922
}
3923

3924
void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3925
  assert(rscratch != noreg || always_reachable(src), "missing");
3926

3927
  if (reachable(src)) {
3928
    vxorpd(dst, nds, as_Address(src), vector_len);
3929
  } else {
3930
    lea(rscratch, src);
3931
    vxorpd(dst, nds, Address(rscratch, 0), vector_len);
3932
  }
3933
}
3934

3935
void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3936
  assert(rscratch != noreg || always_reachable(src), "missing");
3937

3938
  if (reachable(src)) {
3939
    vxorps(dst, nds, as_Address(src), vector_len);
3940
  } else {
3941
    lea(rscratch, src);
3942
    vxorps(dst, nds, Address(rscratch, 0), vector_len);
3943
  }
3944
}
3945

3946
void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3947
  assert(rscratch != noreg || always_reachable(src), "missing");
3948

3949
  if (UseAVX > 1 || (vector_len < 1)) {
3950
    if (reachable(src)) {
3951
      Assembler::vpxor(dst, nds, as_Address(src), vector_len);
3952
    } else {
3953
      lea(rscratch, src);
3954
      Assembler::vpxor(dst, nds, Address(rscratch, 0), vector_len);
3955
    }
3956
  } else {
3957
    MacroAssembler::vxorpd(dst, nds, src, vector_len, rscratch);
3958
  }
3959
}
3960

3961
void MacroAssembler::vpermd(XMMRegister dst,  XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3962
  assert(rscratch != noreg || always_reachable(src), "missing");
3963

3964
  if (reachable(src)) {
3965
    Assembler::vpermd(dst, nds, as_Address(src), vector_len);
3966
  } else {
3967
    lea(rscratch, src);
3968
    Assembler::vpermd(dst, nds, Address(rscratch, 0), vector_len);
3969
  }
3970
}
3971

3972
void MacroAssembler::clear_jobject_tag(Register possibly_non_local) {
3973
  const int32_t inverted_mask = ~static_cast<int32_t>(JNIHandles::tag_mask);
3974
  STATIC_ASSERT(inverted_mask == -4); // otherwise check this code
3975
  // The inverted mask is sign-extended
3976
  andptr(possibly_non_local, inverted_mask);
3977
}
3978

3979
void MacroAssembler::resolve_jobject(Register value,
3980
                                     Register thread,
3981
                                     Register tmp) {
3982
  assert_different_registers(value, thread, tmp);
3983
  Label done, tagged, weak_tagged;
3984
  testptr(value, value);
3985
  jcc(Assembler::zero, done);           // Use null as-is.
3986
  testptr(value, JNIHandles::tag_mask); // Test for tag.
3987
  jcc(Assembler::notZero, tagged);
3988

3989
  // Resolve local handle
3990
  access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp, thread);
3991
  verify_oop(value);
3992
  jmp(done);
3993

3994
  bind(tagged);
3995
  testptr(value, JNIHandles::TypeTag::weak_global); // Test for weak tag.
3996
  jcc(Assembler::notZero, weak_tagged);
3997

3998
  // Resolve global handle
3999
  access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp, thread);
4000
  verify_oop(value);
4001
  jmp(done);
4002

4003
  bind(weak_tagged);
4004
  // Resolve jweak.
4005
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4006
                 value, Address(value, -JNIHandles::TypeTag::weak_global), tmp, thread);
4007
  verify_oop(value);
4008

4009
  bind(done);
4010
}
4011

4012
void MacroAssembler::resolve_global_jobject(Register value,
4013
                                            Register thread,
4014
                                            Register tmp) {
4015
  assert_different_registers(value, thread, tmp);
4016
  Label done;
4017

4018
  testptr(value, value);
4019
  jcc(Assembler::zero, done);           // Use null as-is.
4020

4021
#ifdef ASSERT
4022
  {
4023
    Label valid_global_tag;
4024
    testptr(value, JNIHandles::TypeTag::global); // Test for global tag.
4025
    jcc(Assembler::notZero, valid_global_tag);
4026
    stop("non global jobject using resolve_global_jobject");
4027
    bind(valid_global_tag);
4028
  }
4029
#endif
4030

4031
  // Resolve global handle
4032
  access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp, thread);
4033
  verify_oop(value);
4034

4035
  bind(done);
4036
}
4037

4038
void MacroAssembler::subptr(Register dst, int32_t imm32) {
4039
  LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
4040
}
4041

4042
// Force generation of a 4 byte immediate value even if it fits into 8bit
4043
void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
4044
  LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
4045
}
4046

4047
void MacroAssembler::subptr(Register dst, Register src) {
4048
  LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
4049
}
4050

4051
// C++ bool manipulation
4052
void MacroAssembler::testbool(Register dst) {
4053
  if(sizeof(bool) == 1)
4054
    testb(dst, 0xff);
4055
  else if(sizeof(bool) == 2) {
4056
    // testw implementation needed for two byte bools
4057
    ShouldNotReachHere();
4058
  } else if(sizeof(bool) == 4)
4059
    testl(dst, dst);
4060
  else
4061
    // unsupported
4062
    ShouldNotReachHere();
4063
}
4064

4065
void MacroAssembler::testptr(Register dst, Register src) {
4066
  LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
4067
}
4068

4069
// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4070
void MacroAssembler::tlab_allocate(Register thread, Register obj,
4071
                                   Register var_size_in_bytes,
4072
                                   int con_size_in_bytes,
4073
                                   Register t1,
4074
                                   Register t2,
4075
                                   Label& slow_case) {
4076
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4077
  bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4078
}
4079

4080
RegSet MacroAssembler::call_clobbered_gp_registers() {
4081
  RegSet regs;
4082
#ifdef _LP64
4083
  regs += RegSet::of(rax, rcx, rdx);
4084
#ifndef WINDOWS
4085
  regs += RegSet::of(rsi, rdi);
4086
#endif
4087
  regs += RegSet::range(r8, r11);
4088
#else
4089
  regs += RegSet::of(rax, rcx, rdx);
4090
#endif
4091
#ifdef _LP64
4092
  if (UseAPX) {
4093
    regs += RegSet::range(r16, as_Register(Register::number_of_registers - 1));
4094
  }
4095
#endif
4096
  return regs;
4097
}
4098

4099
XMMRegSet MacroAssembler::call_clobbered_xmm_registers() {
4100
  int num_xmm_registers = XMMRegister::available_xmm_registers();
4101
#if defined(WINDOWS) && defined(_LP64)
4102
  XMMRegSet result = XMMRegSet::range(xmm0, xmm5);
4103
  if (num_xmm_registers > 16) {
4104
     result += XMMRegSet::range(xmm16, as_XMMRegister(num_xmm_registers - 1));
4105
  }
4106
  return result;
4107
#else
4108
  return XMMRegSet::range(xmm0, as_XMMRegister(num_xmm_registers - 1));
4109
#endif
4110
}
4111

4112
static int FPUSaveAreaSize = align_up(108, StackAlignmentInBytes); // 108 bytes needed for FPU state by fsave/frstor
4113

4114
#ifndef _LP64
4115
static bool use_x87_registers() { return UseSSE < 2; }
4116
#endif
4117
static bool use_xmm_registers() { return UseSSE >= 1; }
4118

4119
// C1 only ever uses the first double/float of the XMM register.
4120
static int xmm_save_size() { return UseSSE >= 2 ? sizeof(double) : sizeof(float); }
4121

4122
static void save_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
4123
  if (UseSSE == 1) {
4124
    masm->movflt(Address(rsp, offset), reg);
4125
  } else {
4126
    masm->movdbl(Address(rsp, offset), reg);
4127
  }
4128
}
4129

4130
static void restore_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) {
4131
  if (UseSSE == 1) {
4132
    masm->movflt(reg, Address(rsp, offset));
4133
  } else {
4134
    masm->movdbl(reg, Address(rsp, offset));
4135
  }
4136
}
4137

4138
static int register_section_sizes(RegSet gp_registers, XMMRegSet xmm_registers,
4139
                                  bool save_fpu, int& gp_area_size,
4140
                                  int& fp_area_size, int& xmm_area_size) {
4141

4142
  gp_area_size = align_up(gp_registers.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size,
4143
                         StackAlignmentInBytes);
4144
#ifdef _LP64
4145
  fp_area_size = 0;
4146
#else
4147
  fp_area_size = (save_fpu && use_x87_registers()) ? FPUSaveAreaSize : 0;
4148
#endif
4149
  xmm_area_size = (save_fpu && use_xmm_registers()) ? xmm_registers.size() * xmm_save_size() : 0;
4150

4151
  return gp_area_size + fp_area_size + xmm_area_size;
4152
}
4153

4154
void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude, bool save_fpu) {
4155
  block_comment("push_call_clobbered_registers start");
4156
  // Regular registers
4157
  RegSet gp_registers_to_push = call_clobbered_gp_registers() - exclude;
4158

4159
  int gp_area_size;
4160
  int fp_area_size;
4161
  int xmm_area_size;
4162
  int total_save_size = register_section_sizes(gp_registers_to_push, call_clobbered_xmm_registers(), save_fpu,
4163
                                               gp_area_size, fp_area_size, xmm_area_size);
4164
  subptr(rsp, total_save_size);
4165

4166
  push_set(gp_registers_to_push, 0);
4167

4168
#ifndef _LP64
4169
  if (save_fpu && use_x87_registers()) {
4170
    fnsave(Address(rsp, gp_area_size));
4171
    fwait();
4172
  }
4173
#endif
4174
  if (save_fpu && use_xmm_registers()) {
4175
    push_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size);
4176
  }
4177

4178
  block_comment("push_call_clobbered_registers end");
4179
}
4180

4181
void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu) {
4182
  block_comment("pop_call_clobbered_registers start");
4183

4184
  RegSet gp_registers_to_pop = call_clobbered_gp_registers() - exclude;
4185

4186
  int gp_area_size;
4187
  int fp_area_size;
4188
  int xmm_area_size;
4189
  int total_save_size = register_section_sizes(gp_registers_to_pop, call_clobbered_xmm_registers(), restore_fpu,
4190
                                               gp_area_size, fp_area_size, xmm_area_size);
4191

4192
  if (restore_fpu && use_xmm_registers()) {
4193
    pop_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size);
4194
  }
4195
#ifndef _LP64
4196
  if (restore_fpu && use_x87_registers()) {
4197
    frstor(Address(rsp, gp_area_size));
4198
  }
4199
#endif
4200

4201
  pop_set(gp_registers_to_pop, 0);
4202

4203
  addptr(rsp, total_save_size);
4204

4205
  vzeroupper();
4206

4207
  block_comment("pop_call_clobbered_registers end");
4208
}
4209

4210
void MacroAssembler::push_set(XMMRegSet set, int offset) {
4211
  assert(is_aligned(set.size() * xmm_save_size(), StackAlignmentInBytes), "must be");
4212
  int spill_offset = offset;
4213

4214
  for (RegSetIterator<XMMRegister> it = set.begin(); *it != xnoreg; ++it) {
4215
    save_xmm_register(this, spill_offset, *it);
4216
    spill_offset += xmm_save_size();
4217
  }
4218
}
4219

4220
void MacroAssembler::pop_set(XMMRegSet set, int offset) {
4221
  int restore_size = set.size() * xmm_save_size();
4222
  assert(is_aligned(restore_size, StackAlignmentInBytes), "must be");
4223

4224
  int restore_offset = offset + restore_size - xmm_save_size();
4225

4226
  for (ReverseRegSetIterator<XMMRegister> it = set.rbegin(); *it != xnoreg; ++it) {
4227
    restore_xmm_register(this, restore_offset, *it);
4228
    restore_offset -= xmm_save_size();
4229
  }
4230
}
4231

4232
void MacroAssembler::push_set(RegSet set, int offset) {
4233
  int spill_offset;
4234
  if (offset == -1) {
4235
    int register_push_size = set.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size;
4236
    int aligned_size = align_up(register_push_size, StackAlignmentInBytes);
4237
    subptr(rsp, aligned_size);
4238
    spill_offset = 0;
4239
  } else {
4240
    spill_offset = offset;
4241
  }
4242

4243
  for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
4244
    movptr(Address(rsp, spill_offset), *it);
4245
    spill_offset += Register::max_slots_per_register * VMRegImpl::stack_slot_size;
4246
  }
4247
}
4248

4249
void MacroAssembler::pop_set(RegSet set, int offset) {
4250

4251
  int gp_reg_size = Register::max_slots_per_register * VMRegImpl::stack_slot_size;
4252
  int restore_size = set.size() * gp_reg_size;
4253
  int aligned_size = align_up(restore_size, StackAlignmentInBytes);
4254

4255
  int restore_offset;
4256
  if (offset == -1) {
4257
    restore_offset = restore_size - gp_reg_size;
4258
  } else {
4259
    restore_offset = offset + restore_size - gp_reg_size;
4260
  }
4261
  for (ReverseRegSetIterator<Register> it = set.rbegin(); *it != noreg; ++it) {
4262
    movptr(*it, Address(rsp, restore_offset));
4263
    restore_offset -= gp_reg_size;
4264
  }
4265

4266
  if (offset == -1) {
4267
    addptr(rsp, aligned_size);
4268
  }
4269
}
4270

4271
// Preserves the contents of address, destroys the contents length_in_bytes and temp.
4272
void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
4273
  assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
4274
  assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
4275
  Label done;
4276

4277
  testptr(length_in_bytes, length_in_bytes);
4278
  jcc(Assembler::zero, done);
4279

4280
  // initialize topmost word, divide index by 2, check if odd and test if zero
4281
  // note: for the remaining code to work, index must be a multiple of BytesPerWord
4282
#ifdef ASSERT
4283
  {
4284
    Label L;
4285
    testptr(length_in_bytes, BytesPerWord - 1);
4286
    jcc(Assembler::zero, L);
4287
    stop("length must be a multiple of BytesPerWord");
4288
    bind(L);
4289
  }
4290
#endif
4291
  Register index = length_in_bytes;
4292
  xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
4293
  if (UseIncDec) {
4294
    shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
4295
  } else {
4296
    shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
4297
    shrptr(index, 1);
4298
  }
4299
#ifndef _LP64
4300
  // index could have not been a multiple of 8 (i.e., bit 2 was set)
4301
  {
4302
    Label even;
4303
    // note: if index was a multiple of 8, then it cannot
4304
    //       be 0 now otherwise it must have been 0 before
4305
    //       => if it is even, we don't need to check for 0 again
4306
    jcc(Assembler::carryClear, even);
4307
    // clear topmost word (no jump would be needed if conditional assignment worked here)
4308
    movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
4309
    // index could be 0 now, must check again
4310
    jcc(Assembler::zero, done);
4311
    bind(even);
4312
  }
4313
#endif // !_LP64
4314
  // initialize remaining object fields: index is a multiple of 2 now
4315
  {
4316
    Label loop;
4317
    bind(loop);
4318
    movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
4319
    NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
4320
    decrement(index);
4321
    jcc(Assembler::notZero, loop);
4322
  }
4323

4324
  bind(done);
4325
}
4326

4327
// Look up the method for a megamorphic invokeinterface call.
4328
// The target method is determined by <intf_klass, itable_index>.
4329
// The receiver klass is in recv_klass.
4330
// On success, the result will be in method_result, and execution falls through.
4331
// On failure, execution transfers to the given label.
4332
void MacroAssembler::lookup_interface_method(Register recv_klass,
4333
                                             Register intf_klass,
4334
                                             RegisterOrConstant itable_index,
4335
                                             Register method_result,
4336
                                             Register scan_temp,
4337
                                             Label& L_no_such_interface,
4338
                                             bool return_method) {
4339
  assert_different_registers(recv_klass, intf_klass, scan_temp);
4340
  assert_different_registers(method_result, intf_klass, scan_temp);
4341
  assert(recv_klass != method_result || !return_method,
4342
         "recv_klass can be destroyed when method isn't needed");
4343

4344
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
4345
         "caller must use same register for non-constant itable index as for method");
4346

4347
  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
4348
  int vtable_base = in_bytes(Klass::vtable_start_offset());
4349
  int itentry_off = in_bytes(itableMethodEntry::method_offset());
4350
  int scan_step   = itableOffsetEntry::size() * wordSize;
4351
  int vte_size    = vtableEntry::size_in_bytes();
4352
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
4353
  assert(vte_size == wordSize, "else adjust times_vte_scale");
4354

4355
  movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
4356

4357
  // Could store the aligned, prescaled offset in the klass.
4358
  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
4359

4360
  if (return_method) {
4361
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
4362
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
4363
    lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
4364
  }
4365

4366
  // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
4367
  //   if (scan->interface() == intf) {
4368
  //     result = (klass + scan->offset() + itable_index);
4369
  //   }
4370
  // }
4371
  Label search, found_method;
4372

4373
  for (int peel = 1; peel >= 0; peel--) {
4374
    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset()));
4375
    cmpptr(intf_klass, method_result);
4376

4377
    if (peel) {
4378
      jccb(Assembler::equal, found_method);
4379
    } else {
4380
      jccb(Assembler::notEqual, search);
4381
      // (invert the test to fall through to found_method...)
4382
    }
4383

4384
    if (!peel)  break;
4385

4386
    bind(search);
4387

4388
    // Check that the previous entry is non-null.  A null entry means that
4389
    // the receiver class doesn't implement the interface, and wasn't the
4390
    // same as when the caller was compiled.
4391
    testptr(method_result, method_result);
4392
    jcc(Assembler::zero, L_no_such_interface);
4393
    addptr(scan_temp, scan_step);
4394
  }
4395

4396
  bind(found_method);
4397

4398
  if (return_method) {
4399
    // Got a hit.
4400
    movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset()));
4401
    movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
4402
  }
4403
}
4404

4405
// Look up the method for a megamorphic invokeinterface call in a single pass over itable:
4406
// - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
4407
// - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
4408
// The target method is determined by <holder_klass, itable_index>.
4409
// The receiver klass is in recv_klass.
4410
// On success, the result will be in method_result, and execution falls through.
4411
// On failure, execution transfers to the given label.
4412
void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
4413
                                                  Register holder_klass,
4414
                                                  Register resolved_klass,
4415
                                                  Register method_result,
4416
                                                  Register scan_temp,
4417
                                                  Register temp_reg2,
4418
                                                  Register receiver,
4419
                                                  int itable_index,
4420
                                                  Label& L_no_such_interface) {
4421
  assert_different_registers(recv_klass, method_result, holder_klass, resolved_klass, scan_temp, temp_reg2, receiver);
4422
  Register temp_itbl_klass = method_result;
4423
  Register temp_reg = (temp_reg2 == noreg ? recv_klass : temp_reg2); // reuse recv_klass register on 32-bit x86 impl
4424

4425
  int vtable_base = in_bytes(Klass::vtable_start_offset());
4426
  int itentry_off = in_bytes(itableMethodEntry::method_offset());
4427
  int scan_step = itableOffsetEntry::size() * wordSize;
4428
  int vte_size = vtableEntry::size_in_bytes();
4429
  int ioffset = in_bytes(itableOffsetEntry::interface_offset());
4430
  int ooffset = in_bytes(itableOffsetEntry::offset_offset());
4431
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
4432
  assert(vte_size == wordSize, "adjust times_vte_scale");

  Label L_loop_scan_resolved_entry, L_resolved_found, L_holder_found;

  // temp_itbl_klass = recv_klass.itable[0]
  // scan_temp = &recv_klass.itable[0] + step
  movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
  movptr(temp_itbl_klass, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset));
  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset + scan_step));
  xorptr(temp_reg, temp_reg);

  // Initial checks:
  //   - if (holder_klass != resolved_klass), go to "scan for resolved"
  //   - if (itable[0] == 0), no such interface
  //   - if (itable[0] == holder_klass), shortcut to "holder found"
  cmpptr(holder_klass, resolved_klass);
  jccb(Assembler::notEqual, L_loop_scan_resolved_entry);
  testptr(temp_itbl_klass, temp_itbl_klass);
  jccb(Assembler::zero, L_no_such_interface);
  cmpptr(holder_klass, temp_itbl_klass);
  jccb(Assembler::equal, L_holder_found);

  // Loop: Look for holder_klass record in itable
  //   do {
  //     tmp = itable[index];
  //     index += step;
  //     if (tmp == holder_klass) {
  //       goto L_holder_found; // Found!
  //     }
  //   } while (tmp != 0);
  //   goto L_no_such_interface // Not found.
  Label L_scan_holder;
  bind(L_scan_holder);
    movptr(temp_itbl_klass, Address(scan_temp, 0));
    addptr(scan_temp, scan_step);
    cmpptr(holder_klass, temp_itbl_klass);
    jccb(Assembler::equal, L_holder_found);
    testptr(temp_itbl_klass, temp_itbl_klass);
    jccb(Assembler::notZero, L_scan_holder);

  jmpb(L_no_such_interface);

  // Loop: Look for resolved_class record in itable
  //   do {
  //     tmp = itable[index];
  //     index += step;
  //     if (tmp == holder_klass) {
  //        // Also check if we have met a holder klass
  //        holder_tmp = itable[index-step-ioffset];
  //     }
  //     if (tmp == resolved_klass) {
  //        goto L_resolved_found;  // Found!
  //     }
  //   } while (tmp != 0);
  //   goto L_no_such_interface // Not found.
  //
  Label L_loop_scan_resolved;
  bind(L_loop_scan_resolved);
    movptr(temp_itbl_klass, Address(scan_temp, 0));
    addptr(scan_temp, scan_step);
    bind(L_loop_scan_resolved_entry);
    cmpptr(holder_klass, temp_itbl_klass);
    cmovl(Assembler::equal, temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));
    cmpptr(resolved_klass, temp_itbl_klass);
    jccb(Assembler::equal, L_resolved_found);
    testptr(temp_itbl_klass, temp_itbl_klass);
    jccb(Assembler::notZero, L_loop_scan_resolved);

  jmpb(L_no_such_interface);

  Label L_ready;

  // See if we already have a holder klass. If not, go and scan for it.
  bind(L_resolved_found);
  testptr(temp_reg, temp_reg);
  jccb(Assembler::zero, L_scan_holder);
  jmpb(L_ready);

  bind(L_holder_found);
  movl(temp_reg, Address(scan_temp, ooffset - ioffset - scan_step));

  // Finally, temp_reg contains holder_klass vtable offset
  bind(L_ready);
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  if (temp_reg2 == noreg) { // recv_klass register is clobbered for 32-bit x86 impl
    load_klass(scan_temp, receiver, noreg);
    movptr(method_result, Address(scan_temp, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
  } else {
    movptr(method_result, Address(recv_klass, temp_reg, Address::times_1, itable_index * wordSize + itentry_off));
  }
}
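
// Editor's note: a hedged C-style sketch of what the generated scan above does.
// Names here are illustrative only, not HotSpot API. temp_reg ends up holding
// holder_klass' itable offset within recv_klass, and the Method* is then loaded
// at that offset plus the method-entry displacement:
//
//   // Scan the itable offset entries for resolved_klass, latching the offset
//   // of holder_klass if it is passed on the way; if the holder was not seen,
//   // rescan for holder_klass. A null interface entry means "no such interface".
//   intptr_t holder_offset = /* found by one of the two scan loops above */;
//   method_result = *(Method**)((address)recv_klass + holder_offset
//                               + itable_index * wordSize + itentry_off);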


// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const ByteSize base = Klass::vtable_start_offset();
  assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
  Address vtable_entry_addr(recv_klass,
                            vtable_index, Address::times_ptr,
                            base + vtableEntry::method_offset());
  movptr(method_result, vtable_entry_addr);
}
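
// Editor's note: this emits a single load; in pointer arithmetic terms it is
// roughly (sketch only, relying on the one-word vtableEntry asserted above):
//
//   method_result = *(Method**)((address)recv_klass
//                     + in_bytes(Klass::vtable_start_offset())
//                     + vtable_index * wordSize
//                     + in_bytes(vtableEntry::method_offset()));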


void MacroAssembler::check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, nullptr);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, nullptr);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                        RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one null in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
  // range of a jccb.  If this routine grows larger, reconsider at
  // least some of these.
#define local_jcc(assembler_cond, label)                                \
  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
  else                             jcc( assembler_cond, label) /*omit semi*/

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            jmp(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmpptr(sub_klass, super_klass);
  local_jcc(Assembler::equal, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // Positive movl does right thing on LP64.
    movl(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
  cmpptr(super_klass, super_check_addr); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    local_jcc(Assembler::equal, *L_success);
    cmpl(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_slow_path);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef local_jcc
#undef final_jmp
}
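
// Editor's note: a hedged pseudo-C summary of the fast path above; names are
// illustrative only, not HotSpot API:
//
//   if (sub_klass == super_klass)                          return success;
//   int sco = super_klass->super_check_offset();
//   if (*(Klass**)((address)sub_klass + sco) == super_klass) {
//     return success;   // hit in the primary display or the secondary cache
//   }
//   // A miss in the primary display is a definite failure; a miss in the
//   // secondary-super cache still needs the slow path (array search).
//   return (sco == in_bytes(Klass::secondary_super_cache_offset()))
//          ? slow_path : failure;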


void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one null in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)

  // Get super_klass value into rax (even if it was in rdi or rcx).
  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
  if (super_klass != rax) {
    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
    mov(rax, super_klass);
  }
  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }

#ifndef PRODUCT
  uint* pst_counter = &SharedRuntime::_partial_subtype_ctr;
  ExternalAddress pst_counter_addr((address) pst_counter);
  NOT_LP64(  incrementl(pst_counter_addr) );
  LP64_ONLY( lea(rcx, pst_counter_addr) );
  LP64_ONLY( incrementl(Address(rcx, 0)) );
#endif //PRODUCT

  // We will consult the secondary-super array.
  movptr(rdi, secondary_supers_addr);
  // Load the array length.  (Positive movl does right thing on LP64.)
  movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  addptr(rdi, Array<Klass*>::base_offset_in_bytes());

  // Scan RCX words at [RDI] for an occurrence of RAX.
  // Set NZ/Z based on last compare.
  // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
  // not change flags (only scas instruction which is repeated sets flags).
  // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.

    testptr(rax,rax); // Set Z = 0
    repne_scan();

  // Unspill the temp. registers:
  if (pushed_rdi)  pop(rdi);
  if (pushed_rcx)  pop(rcx);
  if (pushed_rax)  pop(rax);

  if (set_cond_codes) {
    // Special hack for the AD files:  rdi is guaranteed non-zero.
    assert(!pushed_rdi, "rdi must be left non-null");
    // Also, the condition codes are properly set Z/NZ on succeed/failure.
  }

  if (L_failure == &L_fallthrough)
        jccb(Assembler::notEqual, *L_failure);
  else  jcc(Assembler::notEqual, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  movptr(super_cache_addr, super_klass);

  if (L_success != &L_fallthrough) {
    jmp(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}
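
// Editor's note: the repne_scan sequence above is, in effect, the following
// loop, sketched in C for readability (illustrative only, not the emitted
// instruction sequence):
//
//   Array<Klass*>* ss = sub_klass->secondary_supers();
//   for (int i = 0; i < ss->length(); i++) {
//     if (ss->at(i) == super_klass) {
//       // remember the hit so the next fast-path check succeeds immediately
//       sub_klass->set_secondary_super_cache(super_klass);
//       return success;
//     }
//   }
//   return failure;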

#ifdef _LP64

// population_count variant for running without the POPCNT
// instruction, which was introduced with SSE4.2 in 2008.
void MacroAssembler::population_count(Register dst, Register src,
                                      Register scratch1, Register scratch2) {
  assert_different_registers(src, scratch1, scratch2);
  if (UsePopCountInstruction) {
    Assembler::popcntq(dst, src);
  } else {
    assert_different_registers(src, scratch1, scratch2);
    assert_different_registers(dst, scratch1, scratch2);
    Label loop, done;

    mov(scratch1, src);
    // dst = 0;
    // while(scratch1 != 0) {
    //   dst++;
    //   scratch1 &= (scratch1 - 1);
    // }
    xorl(dst, dst);
    testq(scratch1, scratch1);
    jccb(Assembler::equal, done);
    {
      bind(loop);
      incq(dst);
      movq(scratch2, scratch1);
      decq(scratch2);
      andq(scratch1, scratch2);
      jccb(Assembler::notEqual, loop);
    }
    bind(done);
  }
}
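
// Editor's note: the fallback branch above is Kernighan's bit-clearing loop.
// A minimal C sketch of the same computation:
//
//   static inline int popcount64(uint64_t x) {
//     int n = 0;
//     while (x != 0) {
//       n++;
//       x &= x - 1;   // clear the lowest set bit
//     }
//     return n;
//   }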

// Ensure that the inline code and the stub are using the same registers.
#define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS                      \
do {                                                                 \
  assert(r_super_klass  == rax, "mismatch");                         \
  assert(r_array_base   == rbx, "mismatch");                         \
  assert(r_array_length == rcx, "mismatch");                         \
  assert(r_array_index  == rdx, "mismatch");                         \
  assert(r_sub_klass    == rsi || r_sub_klass == noreg, "mismatch"); \
  assert(r_bitmap       == r11 || r_bitmap    == noreg, "mismatch"); \
  assert(result         == rdi || result      == noreg, "mismatch"); \
} while(0)

void MacroAssembler::lookup_secondary_supers_table(Register r_sub_klass,
                                                   Register r_super_klass,
                                                   Register temp1,
                                                   Register temp2,
                                                   Register temp3,
                                                   Register temp4,
                                                   Register result,
                                                   u1 super_klass_slot) {
  assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);

  Label L_fallthrough, L_success, L_failure;

  BLOCK_COMMENT("lookup_secondary_supers_table {");

  const Register
    r_array_index  = temp1,
    r_array_length = temp2,
    r_array_base   = temp3,
    r_bitmap       = temp4;

  LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;

  xorq(result, result); // = 0

  movq(r_bitmap, Address(r_sub_klass, Klass::bitmap_offset()));
  movq(r_array_index, r_bitmap);

  // First check the bitmap to see if super_klass might be present. If
  // the bit is zero, we are certain that super_klass is not one of
  // the secondary supers.
  u1 bit = super_klass_slot;
  {
    // NB: If the count in an x86 shift instruction is 0, the flags are
    // not affected, so we do a testq instead.
    int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit;
    if (shift_count != 0) {
      salq(r_array_index, shift_count);
    } else {
      testq(r_array_index, r_array_index);
    }
  }
  // We test the MSB of r_array_index, i.e. its sign bit
  jcc(Assembler::positive, L_failure);

  // Get the first array index that can contain super_klass into r_array_index.
  if (bit != 0) {
    population_count(r_array_index, r_array_index, temp2, temp3);
  } else {
    movl(r_array_index, 1);
  }
  // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.

  // We will consult the secondary-super array.
  movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));

  // We're asserting that the first word in an Array<Klass*> is the
  // length, and the second word is the first word of the data. If
  // that ever changes, r_array_base will have to be adjusted here.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
  assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");

  cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
  jccb(Assembler::equal, L_success);

  // Is there another entry to check? Consult the bitmap.
  btq(r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
  jccb(Assembler::carryClear, L_failure);

  // Linear probe. Rotate the bitmap so that the next bit to test is
  // in Bit 1.
  if (bit != 0) {
    rorq(r_bitmap, bit);
  }

  // Calls into the stub generated by lookup_secondary_supers_table_slow_path.
  // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
  // Kills: r_array_length.
  // Returns: result.
  call(RuntimeAddress(StubRoutines::lookup_secondary_supers_table_slow_path_stub()));
  // Result (0/1) is in rdi
  jmpb(L_fallthrough);

  bind(L_failure);
  incq(result); // 0 => 1

  bind(L_success);
  // result = 0;

  bind(L_fallthrough);
  BLOCK_COMMENT("} lookup_secondary_supers_table");

  if (VerifySecondarySupers) {
    verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
                                  temp1, temp2, temp3);
  }
}
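
// Editor's note: a hedged C-style sketch of the hashed lookup above, with
// illustrative names only (the bitmap/array layout is as asserted in the code;
// result convention: 0 = super found, 1 = not found):
//
//   uint64_t bitmap = sub_klass->bitmap();
//   if (((bitmap >> slot) & 1) == 0) return 1;             // definitely absent
//   // Candidate index = number of set bits at positions <= slot; the popcount
//   // is "off by one", which is compensated by r_array_base being one word low.
//   int idx = popcount64(bitmap << (63 - slot));
//   if (secondary_supers[idx - 1] == super_klass) return 0; // hit on first probe
//   // otherwise fall into the linear-probe stub (slow path), result in rdi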

void MacroAssembler::repne_scanq(Register addr, Register value, Register count, Register limit,
                                 Label* L_success, Label* L_failure) {
  Label L_loop, L_fallthrough;
  {
    int label_nulls = 0;
    if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
    if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
    assert(label_nulls <= 1, "at most one null in the batch");
  }
  bind(L_loop);
  cmpq(value, Address(addr, count, Address::times_8));
  jcc(Assembler::equal, *L_success);
  addl(count, 1);
  cmpl(count, limit);
  jcc(Assembler::less, L_loop);

  if (&L_fallthrough != L_failure) {
    jmp(*L_failure);
  }
  bind(L_fallthrough);
}

// Called by code generated by check_klass_subtype_slow_path
// above. This is called when there is a collision in the hashed
// lookup in the secondary supers array.
void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
                                                             Register r_array_base,
                                                             Register r_array_index,
                                                             Register r_bitmap,
                                                             Register temp1,
                                                             Register temp2,
                                                             Label* L_success,
                                                             Label* L_failure) {
  assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, temp2);

  const Register
    r_array_length = temp1,
    r_sub_klass    = noreg,
    result         = noreg;

  LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == nullptr)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == nullptr)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one null in the batch");

  // Load the array length.
  movl(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
  // And adjust the array base to point to the data.
  // NB! Effectively increments current slot index by 1.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
  addptr(r_array_base, Array<Klass*>::base_offset_in_bytes());

  // Linear probe
  Label L_huge;

  // The bitmap is full to bursting.
  // Implicit invariant: BITMAP_FULL implies (length > 0)
  assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), "");
  cmpq(r_bitmap, (int32_t)-1); // sign-extends immediate to 64-bit value
  jcc(Assembler::equal, L_huge);

  // NB! Our caller has checked bits 0 and 1 in the bitmap. The
  // current slot (at secondary_supers[r_array_index]) has not yet
  // been inspected, and r_array_index may be out of bounds if we
  // wrapped around the end of the array.

  { // This is conventional linear probing, but instead of terminating
    // when a null entry is found in the table, we maintain a bitmap
    // in which a 0 indicates missing entries.
    // The check above guarantees there are 0s in the bitmap, so the loop
    // eventually terminates.

    xorl(temp2, temp2); // = 0;

    Label L_again;
    bind(L_again);

    // Check for array wraparound.
    cmpl(r_array_index, r_array_length);
    cmovl(Assembler::greaterEqual, r_array_index, temp2);

    cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8));
    jcc(Assembler::equal, *L_success);

    // If the next bit in bitmap is zero, we're done.
    btq(r_bitmap, 2); // look-ahead check (Bit 2); Bits 0 and 1 are tested by now
    jcc(Assembler::carryClear, *L_failure);

    rorq(r_bitmap, 1); // Bits 1/2 => 0/1
    addl(r_array_index, 1);

    jmp(L_again);
  }

  { // Degenerate case: more than 64 secondary supers.
    // FIXME: We could do something smarter here, maybe a vectorized
    // comparison or a binary search, but is that worth any added
    // complexity?
    bind(L_huge);
    xorl(r_array_index, r_array_index); // = 0
    repne_scanq(r_array_base, r_super_klass, r_array_index, r_array_length,
                L_success,
                (&L_fallthrough != L_failure ? L_failure : nullptr));

    bind(L_fallthrough);
  }
}
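
// Editor's note: pseudo-C outline of the probe loop above (illustrative only):
//
//   for (;;) {
//     if (index >= length) index = 0;                    // wrap around the end
//     if (secondary_supers[index] == super_klass) return success;
//     if (((bitmap >> 2) & 1) == 0) return failure;      // next slot is empty
//     bitmap = rotate_right(bitmap, 1);
//     index++;
//   }
//   // Tables with more than 64 secondary supers skip the bitmap entirely and
//   // fall back to the plain linear scan at L_huge.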

struct VerifyHelperArguments {
  Klass* _super;
  Klass* _sub;
  intptr_t _linear_result;
  intptr_t _table_result;
};

static void verify_secondary_supers_table_helper(const char* msg, VerifyHelperArguments* args) {
  Klass::on_secondary_supers_verification_failure(args->_super,
                                                  args->_sub,
                                                  args->_linear_result,
                                                  args->_table_result,
                                                  msg);
}

// Make sure that the hashed lookup and a linear scan agree.
void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
                                                   Register r_super_klass,
                                                   Register result,
                                                   Register temp1,
                                                   Register temp2,
                                                   Register temp3) {
  const Register
      r_array_index  = temp1,
      r_array_length = temp2,
      r_array_base   = temp3,
      r_bitmap       = noreg;

  LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS;

  BLOCK_COMMENT("verify_secondary_supers_table {");

  Label L_success, L_failure, L_check, L_done;

  movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
  movl(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
  // And adjust the array base to point to the data.
  addptr(r_array_base, Array<Klass*>::base_offset_in_bytes());

  testl(r_array_length, r_array_length); // array_length == 0?
  jcc(Assembler::zero, L_failure);

  movl(r_array_index, 0);
  repne_scanq(r_array_base, r_super_klass, r_array_index, r_array_length, &L_success);
  // fall through to L_failure

  const Register linear_result = r_array_index; // reuse temp1

  bind(L_failure); // not present
  movl(linear_result, 1);
  jmp(L_check);

  bind(L_success); // present
  movl(linear_result, 0);

  bind(L_check);
  cmpl(linear_result, result);
  jcc(Assembler::equal, L_done);

  { // To avoid calling convention issues, build a record on the stack
    // and pass the pointer to that instead.
    push(result);
    push(linear_result);
    push(r_sub_klass);
    push(r_super_klass);
    movptr(c_rarg1, rsp);
    movptr(c_rarg0, (uintptr_t) "mismatch");
    call(RuntimeAddress(CAST_FROM_FN_PTR(address, verify_secondary_supers_table_helper)));
    should_not_reach_here();
  }
  bind(L_done);

  BLOCK_COMMENT("} verify_secondary_supers_table");
}
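
// Editor's note: the four pushes above build a VerifyHelperArguments record in
// place on the stack. Because x86 pushes grow downward, the value pushed last
// (r_super_klass) ends up at the lowest address, so rsp lines up with the first
// struct field when it is passed as c_rarg1:
//
//   struct VerifyHelperArguments {     // rsp points here
//     Klass*   _super;                 // <- push(r_super_klass), pushed last
//     Klass*   _sub;                   // <- push(r_sub_klass)
//     intptr_t _linear_result;         // <- push(linear_result)
//     intptr_t _table_result;          // <- push(result), pushed first
//   };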

#undef LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS

#endif // LP64

void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
  assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");

  Label L_fallthrough;
  if (L_fast_path == nullptr) {
    L_fast_path = &L_fallthrough;
  } else if (L_slow_path == nullptr) {
    L_slow_path = &L_fallthrough;
  }

  // Fast path check: class is fully initialized
  cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
  jcc(Assembler::equal, *L_fast_path);

  // Fast path check: current thread is initializer thread
  cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
  if (L_slow_path == &L_fallthrough) {
    jcc(Assembler::equal, *L_fast_path);
    bind(*L_slow_path);
  } else if (L_fast_path == &L_fallthrough) {
    jcc(Assembler::notEqual, *L_slow_path);
    bind(*L_fast_path);
  } else {
    Unimplemented();
  }
}
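
// Editor's note: the barrier above corresponds to roughly this check, written
// out in pseudo-C (illustrative only):
//
//   if (klass->init_state() == InstanceKlass::fully_initialized) goto fast_path;
//   if (klass->init_thread() == current_thread)                  goto fast_path;
//   goto slow_path;   // class is still being initialized by another thread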

void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
  if (VM_Version::supports_cmov()) {
    cmovl(cc, dst, src);
  } else {
    Label L;
    jccb(negate_condition(cc), L);
    movl(dst, src);
    bind(L);
  }
}

void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
  if (VM_Version::supports_cmov()) {
    cmovl(cc, dst, src);
  } else {
    Label L;
    jccb(negate_condition(cc), L);
    movl(dst, src);
    bind(L);
  }
}

void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
  if (!VerifyOops) return;

  BLOCK_COMMENT("verify_oop {");
#ifdef _LP64
  push(rscratch1);
#endif
  push(rax);                          // save rax
  push(reg);                          // pass register argument

  // Pass register number to verify_oop_subroutine
  const char* b = nullptr;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
    b = code_string(ss.as_string());
  }
  AddressLiteral buffer((address) b, external_word_Relocation::spec_for_immediate());
  pushptr(buffer.addr(), rscratch1);

  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (oop, message) and restores rax, r10
  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
  if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
    // Only pcmpeq has dependency-breaking treatment (i.e. the execution can begin without
    // waiting for the previous result on dst), not vpcmpeqd, so just use vpternlog
    vpternlogd(dst, 0xFF, dst, dst, vector_len);
  } else if (VM_Version::supports_avx()) {
    vpcmpeqd(dst, dst, dst, vector_len);
  } else {
    assert(VM_Version::supports_sse2(), "");
    pcmpeqd(dst, dst);
  }
}

Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  Register             scale_reg    = noreg;
  Address::ScaleFactor scale_factor = Address::no_scale;
  if (arg_slot.is_constant()) {
    offset += arg_slot.as_constant() * stackElementSize;
  } else {
    scale_reg    = arg_slot.as_register();
    scale_factor = Address::times(stackElementSize);
  }
  offset += wordSize;           // return PC is on stack
  return Address(rsp, scale_reg, scale_factor, offset);
}
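
// Editor's note: worked example of the offset arithmetic above (sketch only).
// For a constant slot, e.g. argument_address(RegisterOrConstant(2)) with
// extra_slot_offset == 0, the returned address is
//   Address(rsp, Interpreter::expr_offset_in_bytes(0)
//                + 2 * Interpreter::stackElementSize
//                + wordSize /* skip the return PC on top of the stack */);
// for a register slot the same per-slot scaling is applied through the SIB
// scale factor instead of being folded into the displacement.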

void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
  if (!VerifyOops) return;

#ifdef _LP64
  push(rscratch1);
#endif
  push(rax); // save rax,
  // addr may contain rsp so we will have to adjust it based on the push
  // we just did (and on 64 bit we do two pushes)
  // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
  // stores rax into addr which is backwards of what was intended.
  if (addr.uses(rsp)) {
    lea(rax, addr);
    pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
  } else {
    pushptr(addr);
  }

  // Pass register number to verify_oop_subroutine
  const char* b = nullptr;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
    b = code_string(ss.as_string());
  }
  AddressLiteral buffer((address) b, external_word_Relocation::spec_for_immediate());
  pushptr(buffer.addr(), rscratch1);

  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (addr, message) and restores rax, r10.
}

void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;
    Register t1 = rsi;
    Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);

    push(t1);
    NOT_LP64(push(thread_reg));
    NOT_LP64(get_thread(thread_reg));

    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
    jcc(Assembler::aboveEqual, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    jcc(Assembler::aboveEqual, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    NOT_LP64(pop(thread_reg));
    pop(t1);
  }
#endif
}

class ControlWord {
 public:
  int32_t _value;

  int  rounding_control() const        { return  (_value >> 10) & 3      ; }
  int  precision_control() const       { return  (_value >>  8) & 3      ; }
  bool precision() const               { return ((_value >>  5) & 1) != 0; }
  bool underflow() const               { return ((_value >>  4) & 1) != 0; }
  bool overflow() const                { return ((_value >>  3) & 1) != 0; }
  bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
  bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
  bool invalid() const                 { return ((_value >>  0) & 1) != 0; }

  void print() const {
    // rounding control
    const char* rc;
    switch (rounding_control()) {
      case 0: rc = "round near"; break;
      case 1: rc = "round down"; break;
      case 2: rc = "round up  "; break;
      case 3: rc = "chop      "; break;
      default:
        rc = nullptr; // silence compiler warnings
        fatal("Unknown rounding control: %d", rounding_control());
    };
    // precision control
    const char* pc;
    switch (precision_control()) {
      case 0: pc = "24 bits "; break;
      case 1: pc = "reserved"; break;
      case 2: pc = "53 bits "; break;
      case 3: pc = "64 bits "; break;
      default:
        pc = nullptr; // silence compiler warnings
        fatal("Unknown precision control: %d", precision_control());
    };
    // flags
    char f[9];
    f[0] = ' ';
    f[1] = ' ';
    f[2] = (precision   ()) ? 'P' : 'p';
    f[3] = (underflow   ()) ? 'U' : 'u';
    f[4] = (overflow    ()) ? 'O' : 'o';
    f[5] = (zero_divide ()) ? 'Z' : 'z';
    f[6] = (denormalized()) ? 'D' : 'd';
    f[7] = (invalid     ()) ? 'I' : 'i';
    f[8] = '\x0';
    // output
    printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
  }

};

class StatusWord {
 public:
  int32_t _value;

  bool busy() const                    { return ((_value >> 15) & 1) != 0; }
  bool C3() const                      { return ((_value >> 14) & 1) != 0; }
  bool C2() const                      { return ((_value >> 10) & 1) != 0; }
  bool C1() const                      { return ((_value >>  9) & 1) != 0; }
  bool C0() const                      { return ((_value >>  8) & 1) != 0; }
  int  top() const                     { return  (_value >> 11) & 7      ; }
  bool error_status() const            { return ((_value >>  7) & 1) != 0; }
  bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
  bool precision() const               { return ((_value >>  5) & 1) != 0; }
  bool underflow() const               { return ((_value >>  4) & 1) != 0; }
  bool overflow() const                { return ((_value >>  3) & 1) != 0; }
  bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
  bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
  bool invalid() const                 { return ((_value >>  0) & 1) != 0; }

  void print() const {
    // condition codes
    char c[5];
    c[0] = (C3()) ? '3' : '-';
    c[1] = (C2()) ? '2' : '-';
    c[2] = (C1()) ? '1' : '-';
    c[3] = (C0()) ? '0' : '-';
    c[4] = '\x0';
    // flags
    char f[9];
    f[0] = (error_status()) ? 'E' : '-';
    f[1] = (stack_fault ()) ? 'S' : '-';
    f[2] = (precision   ()) ? 'P' : '-';
    f[3] = (underflow   ()) ? 'U' : '-';
    f[4] = (overflow    ()) ? 'O' : '-';
    f[5] = (zero_divide ()) ? 'Z' : '-';
    f[6] = (denormalized()) ? 'D' : '-';
    f[7] = (invalid     ()) ? 'I' : '-';
    f[8] = '\x0';
    // output
    printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
  }

};

class TagWord {
 public:
  int32_t _value;

  int tag_at(int i) const              { return (_value >> (i*2)) & 3; }

  void print() const {
    printf("%04x", _value & 0xFFFF);
  }

};

class FPU_Register {
 public:
  int32_t _m0;
  int32_t _m1;
  int16_t _ex;

  bool is_indefinite() const           {
    return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
  }

  void print() const {
    char  sign = (_ex < 0) ? '-' : '+';
    const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
    printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
  };

};

class FPU_State {
 public:
  enum {
    register_size       = 10,
    number_of_registers =  8,
    register_mask       =  7
  };

  ControlWord  _control_word;
  StatusWord   _status_word;
  TagWord      _tag_word;
  int32_t      _error_offset;
  int32_t      _error_selector;
  int32_t      _data_offset;
  int32_t      _data_selector;
  int8_t       _register[register_size * number_of_registers];

  int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
  FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }

  const char* tag_as_string(int tag) const {
    switch (tag) {
      case 0: return "valid";
      case 1: return "zero";
      case 2: return "special";
      case 3: return "empty";
    }
    ShouldNotReachHere();
    return nullptr;
  }

  void print() const {
    // print computation registers
    { int t = _status_word.top();
      for (int i = 0; i < number_of_registers; i++) {
        int j = (i - t) & register_mask;
        printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
        st(j)->print();
        printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
      }
    }
    printf("\n");
    // print control registers
    printf("ctrl = "); _control_word.print(); printf("\n");
    printf("stat = "); _status_word .print(); printf("\n");
    printf("tags = "); _tag_word    .print(); printf("\n");
  }

};

class Flag_Register {
 public:
  int32_t _value;

  bool overflow() const                { return ((_value >> 11) & 1) != 0; }
  bool direction() const               { return ((_value >> 10) & 1) != 0; }
  bool sign() const                    { return ((_value >>  7) & 1) != 0; }
  bool zero() const                    { return ((_value >>  6) & 1) != 0; }
  bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
  bool parity() const                  { return ((_value >>  2) & 1) != 0; }
  bool carry() const                   { return ((_value >>  0) & 1) != 0; }

  void print() const {
    // flags
    char f[8];
    f[0] = (overflow       ()) ? 'O' : '-';
    f[1] = (direction      ()) ? 'D' : '-';
    f[2] = (sign           ()) ? 'S' : '-';
    f[3] = (zero           ()) ? 'Z' : '-';
    f[4] = (auxiliary_carry()) ? 'A' : '-';
    f[5] = (parity         ()) ? 'P' : '-';
    f[6] = (carry          ()) ? 'C' : '-';
    f[7] = '\x0';
    // output
    printf("%08x  flags = %s", _value, f);
  }

};

class IU_Register {
 public:
  int32_t _value;

  void print() const {
    printf("%08x  %11d", _value, _value);
  }

};

class IU_State {
 public:
  Flag_Register _eflags;
  IU_Register   _rdi;
  IU_Register   _rsi;
  IU_Register   _rbp;
  IU_Register   _rsp;
  IU_Register   _rbx;
  IU_Register   _rdx;
  IU_Register   _rcx;
  IU_Register   _rax;

  void print() const {
    // computation registers
    printf("rax,  = "); _rax.print(); printf("\n");
    printf("rbx,  = "); _rbx.print(); printf("\n");
    printf("rcx  = "); _rcx.print(); printf("\n");
    printf("rdx  = "); _rdx.print(); printf("\n");
    printf("rdi  = "); _rdi.print(); printf("\n");
    printf("rsi  = "); _rsi.print(); printf("\n");
    printf("rbp,  = "); _rbp.print(); printf("\n");
    printf("rsp  = "); _rsp.print(); printf("\n");
    printf("\n");
    // control registers
    printf("flgs = "); _eflags.print(); printf("\n");
  }
};

class CPU_State {
 public:
  FPU_State _fpu_state;
  IU_State  _iu_state;

  void print() const {
    printf("--------------------------------------------------\n");
    _iu_state .print();
    printf("\n");
    _fpu_state.print();
    printf("--------------------------------------------------\n");
  }

};


static void _print_CPU_state(CPU_State* state) {
  state->print();
};


void MacroAssembler::print_CPU_state() {
  push_CPU_state();
  push(rsp);                // pass CPU state
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
  addptr(rsp, wordSize);       // discard argument
  pop_CPU_state();
}


#ifndef _LP64
static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
  static int counter = 0;
  FPU_State* fs = &state->_fpu_state;
  counter++;
  // For leaf calls, only verify that the top few elements remain empty.
  // We only need 1 empty at the top for C2 code.
  if( stack_depth < 0 ) {
    if( fs->tag_for_st(7) != 3 ) {
      printf("FPR7 not empty\n");
      state->print();
      assert(false, "error");
      return false;
    }
    return true;                // All other stack states do not matter
  }

  assert((fs->_control_word._value & 0xffff) == StubRoutines::x86::fpu_cntrl_wrd_std(),
         "bad FPU control word");

  // compute stack depth
  int i = 0;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
  int d = i;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
  // verify findings
  if (i != FPU_State::number_of_registers) {
    // stack not contiguous
    printf("%s: stack not contiguous at ST%d\n", s, i);
    state->print();
    assert(false, "error");
    return false;
  }
  // check if computed stack depth corresponds to expected stack depth
  if (stack_depth < 0) {
    // expected stack depth is -stack_depth or less
    if (d > -stack_depth) {
      // too many elements on the stack
      printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
      state->print();
      assert(false, "error");
      return false;
    }
  } else {
    // expected stack depth is stack_depth
    if (d != stack_depth) {
      // wrong stack depth
      printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
      state->print();
      assert(false, "error");
      return false;
    }
  }
  // everything is cool
  return true;
}

void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
  if (!VerifyFPU) return;
  push_CPU_state();
  push(rsp);                // pass CPU state
  ExternalAddress msg((address) s);
  // pass message string s
  pushptr(msg.addr(), noreg);
  push(stack_depth);        // pass stack depth
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
  addptr(rsp, 3 * wordSize);   // discard arguments
  // check for error
  { Label L;
    testl(rax, rax);
    jcc(Assembler::notZero, L);
    int3();                  // break if error condition
    bind(L);
  }
  pop_CPU_state();
}
#endif // _LP64

void MacroAssembler::restore_cpu_control_state_after_jni(Register rscratch) {
  // Either restore the MXCSR register after returning from the JNI Call
  // or verify that it wasn't changed (with -Xcheck:jni flag).
  if (VM_Version::supports_sse()) {
    if (RestoreMXCSROnJNICalls) {
      ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), rscratch);
    } else if (CheckJNICalls) {
      call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
    }
  }
  // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
  vzeroupper();

#ifndef _LP64
  // Either restore the x87 floating-point control word after returning
  // from the JNI call or verify that it wasn't changed.
  if (CheckJNICalls) {
    call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
  }
#endif // _LP64
}

// ((OopHandle)result).resolve();
void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
  assert_different_registers(result, tmp);

  // Only 64 bit platforms support GCs that require a tmp register
  // Only IN_HEAP loads require a thread_tmp register
  // OopHandle::resolve is an indirection like jobject.
  access_load_at(T_OBJECT, IN_NATIVE,
                 result, Address(result, 0), tmp, /*tmp_thread*/noreg);
}

// ((WeakHandle)result).resolve();
void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
  assert_different_registers(rresult, rtmp);
  Label resolved;

  // A null weak handle resolves to null.
  cmpptr(rresult, 0);
  jcc(Assembler::equal, resolved);

  // Only 64 bit platforms support GCs that require a tmp register
  // Only IN_HEAP loads require a thread_tmp register
  // WeakHandle::resolve is an indirection like jweak.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
                 rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
  bind(resolved);
}

void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
  // get mirror
  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  load_method_holder(mirror, method);
  movptr(mirror, Address(mirror, mirror_offset));
  resolve_oop_handle(mirror, tmp);
}

void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
  load_method_holder(rresult, rmethod);
  movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
}

void MacroAssembler::load_method_holder(Register holder, Register method) {
  movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
  movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
  movptr(holder, Address(holder, ConstantPool::pool_holder_offset()));          // InstanceKlass*
}
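
// Editor's note: the three dependent loads above follow the metadata chain
//   Method* -> ConstMethod* -> ConstantPool* -> InstanceKlass* (pool holder),
// i.e. in C++ terms roughly
//   holder = method->constMethod()->constants()->pool_holder();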

void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
  assert_different_registers(src, tmp);
  assert_different_registers(dst, tmp);
#ifdef _LP64
  if (UseCompressedClassPointers) {
    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_klass_not_null(dst, tmp);
  } else
#endif
    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
}

void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
  assert_different_registers(src, tmp);
  assert_different_registers(dst, tmp);
#ifdef _LP64
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src, tmp);
    movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
  } else
#endif
    movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
}

void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
                                    Register tmp1, Register thread_tmp) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators, type);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}

void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register val,
                                     Register tmp1, Register tmp2, Register tmp3) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators, type);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
  } else {
    bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
  }
}

void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
                                   Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}

// Doesn't do verification, generates fixed size code
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                                            Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
}

void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
                                    Register tmp2, Register tmp3, DecoratorSet decorators) {
  access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
}

// Used for storing nulls.
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
}

#ifdef _LP64
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
  }
}

#ifdef ASSERT
void MacroAssembler::verify_heapbase(const char* msg) {
  assert (UseCompressedOops, "should be compressed");
  assert (Universe::heap() != nullptr, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    ExternalAddress src2(CompressedOops::ptrs_base_addr());
    const bool is_src2_reachable = reachable(src2);
    if (!is_src2_reachable) {
      push(rscratch1);  // cmpptr trashes rscratch1
    }
    cmpptr(r12_heapbase, src2, rscratch1);
    jcc(Assembler::equal, ok);
    STOP(msg);
    bind(ok);
    if (!is_src2_reachable) {
      pop(rscratch1);
    }
  }
}
#endif

// Algorithm must match oop.inline.hpp encode_heap_oop.
void MacroAssembler::encode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop_msg(r, "broken oop in encode_heap_oop");
  if (CompressedOops::base() == nullptr) {
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      shrq(r, LogMinObjAlignmentInBytes);
    }
    return;
  }
  testq(r, r);
  cmovq(Assembler::equal, r, r12_heapbase);
  subq(r, r12_heapbase);
  shrq(r, LogMinObjAlignmentInBytes);
}
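
// Editor's note: the encoding above computes, in effect,
//   narrow_oop = (oop == nullptr) ? 0 : (uint32_t)((oop - heap_base) >> shift)
// with a null heap base collapsing to a plain right shift; the cmovq keeps the
// null case branch-free by substituting the heap base before the subtraction.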

void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(r, r);
    jcc(Assembler::notEqual, ok);
    STOP("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
  if (CompressedOops::base() != nullptr) {
    subq(r, r12_heapbase);
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shrq(r, LogMinObjAlignmentInBytes);
  }
}

void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(src, src);
    jcc(Assembler::notEqual, ok);
    STOP("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
  if (dst != src) {
    movq(dst, src);
  }
  if (CompressedOops::base() != nullptr) {
    subq(dst, r12_heapbase);
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shrq(dst, LogMinObjAlignmentInBytes);
  }
}

void  MacroAssembler::decode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (CompressedOops::base() == nullptr) {
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      shlq(r, LogMinObjAlignmentInBytes);
    }
  } else {
    Label done;
    shlq(r, LogMinObjAlignmentInBytes);
    jccb(Assembler::equal, done);
    addq(r, r12_heapbase);
    bind(done);
  }
  verify_oop_msg(r, "broken oop in decode_heap_oop");
}

void  MacroAssembler::decode_heap_oop_not_null(Register r) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != nullptr, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shlq(r, LogMinObjAlignmentInBytes);
    if (CompressedOops::base() != nullptr) {
      addq(r, r12_heapbase);
    }
  } else {
    assert (CompressedOops::base() == nullptr, "sanity");
  }
}

void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != nullptr, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    if (LogMinObjAlignmentInBytes == Address::times_8) {
      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
    } else {
      if (dst != src) {
        movq(dst, src);
      }
      shlq(dst, LogMinObjAlignmentInBytes);
      if (CompressedOops::base() != nullptr) {
        addq(dst, r12_heapbase);
      }
    }
  } else {
    assert (CompressedOops::base() == nullptr, "sanity");
    if (dst != src) {
      movq(dst, src);
    }
  }
}
5904

5905
void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
5906
  assert_different_registers(r, tmp);
5907
  if (CompressedKlassPointers::base() != nullptr) {
5908
    mov64(tmp, (int64_t)CompressedKlassPointers::base());
5909
    subq(r, tmp);
5910
  }
5911
  if (CompressedKlassPointers::shift() != 0) {
5912
    assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5913
    shrq(r, LogKlassAlignmentInBytes);
5914
  }
5915
}
5916

5917
void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
5918
  assert_different_registers(src, dst);
5919
  if (CompressedKlassPointers::base() != nullptr) {
5920
    mov64(dst, -(int64_t)CompressedKlassPointers::base());
5921
    addq(dst, src);
5922
  } else {
5923
    movptr(dst, src);
5924
  }
5925
  if (CompressedKlassPointers::shift() != 0) {
5926
    assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5927
    shrq(dst, LogKlassAlignmentInBytes);
5928
  }
5929
}
5930

5931
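// Decode a non-null narrow klass held in r, in place:
//   klass = CompressedKlassPointers::base() + (narrow_klass << shift)
// tmp is clobbered when a non-null encoding base has to be materialized.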
void  MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
5932
  assert_different_registers(r, tmp);
5933
  // Note: it will change flags
5934
  assert(UseCompressedClassPointers, "should only be used for compressed headers");
5935
  // Cannot assert, unverified entry point counts instructions (see .ad file)
5936
  // vtableStubs also counts instructions in pd_code_size_limit.
5937
  // Also do not verify_oop as this is called by verify_oop.
5938
  if (CompressedKlassPointers::shift() != 0) {
5939
    assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5940
    shlq(r, LogKlassAlignmentInBytes);
5941
  }
5942
  if (CompressedKlassPointers::base() != nullptr) {
5943
    mov64(tmp, (int64_t)CompressedKlassPointers::base());
5944
    addq(r, tmp);
5945
  }
5946
}
5947

5948
void  MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
5949
  assert_different_registers(src, dst);
5950
  // Note: it will change flags
5951
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
5952
  // Cannot assert, unverified entry point counts instructions (see .ad file)
5953
  // vtableStubs also counts instructions in pd_code_size_limit.
5954
  // Also do not verify_oop as this is called by verify_oop.
5955

5956
  if (CompressedKlassPointers::base() == nullptr &&
5957
      CompressedKlassPointers::shift() == 0) {
5958
    // The best case scenario is that there is no base or shift. Then it is already
5959
    // a pointer that needs nothing but a register rename.
5960
    movl(dst, src);
5961
  } else {
5962
    if (CompressedKlassPointers::base() != nullptr) {
5963
      mov64(dst, (int64_t)CompressedKlassPointers::base());
5964
    } else {
5965
      xorq(dst, dst);
5966
    }
5967
    if (CompressedKlassPointers::shift() != 0) {
5968
      assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5969
      assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
5970
      leaq(dst, Address(dst, src, Address::times_8, 0));
5971
    } else {
5972
      addq(dst, src);
5973
    }
5974
  }
5975
}
5976

5977
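// The set_narrow_* / cmp_narrow_* helpers below emit the 32-bit narrow oop or
// narrow klass as an immediate and record relocation info (oop_Relocation /
// metadata_Relocation) so the embedded constant can be patched later.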
void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5978
  assert (UseCompressedOops, "should only be used for compressed headers");
5979
  assert (Universe::heap() != nullptr, "java heap should be initialized");
5980
  assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5981
  int oop_index = oop_recorder()->find_index(obj);
5982
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
5983
  mov_narrow_oop(dst, oop_index, rspec);
5984
}
5985

5986
void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5987
  assert (UseCompressedOops, "should only be used for compressed headers");
5988
  assert (Universe::heap() != nullptr, "java heap should be initialized");
5989
  assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5990
  int oop_index = oop_recorder()->find_index(obj);
5991
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
5992
  mov_narrow_oop(dst, oop_index, rspec);
5993
}
5994

5995
void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5996
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
5997
  assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5998
  int klass_index = oop_recorder()->find_index(k);
5999
  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6000
  mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
6001
}
6002

6003
void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
6004
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
6005
  assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
6006
  int klass_index = oop_recorder()->find_index(k);
6007
  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6008
  mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
6009
}
6010

6011
void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
6012
  assert (UseCompressedOops, "should only be used for compressed headers");
6013
  assert (Universe::heap() != nullptr, "java heap should be initialized");
6014
  assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
6015
  int oop_index = oop_recorder()->find_index(obj);
6016
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
6017
  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
6018
}
6019

6020
void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
6021
  assert (UseCompressedOops, "should only be used for compressed headers");
6022
  assert (Universe::heap() != nullptr, "java heap should be initialized");
6023
  assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
6024
  int oop_index = oop_recorder()->find_index(obj);
6025
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
6026
  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
6027
}
6028

6029
void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
6030
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
6031
  assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
6032
  int klass_index = oop_recorder()->find_index(k);
6033
  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6034
  Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
6035
}
6036

6037
void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
6038
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
6039
  assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
6040
  int klass_index = oop_recorder()->find_index(k);
6041
  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6042
  Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
6043
}
6044

6045
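// Reload r12_heapbase: zero when the compressed-oops base is null (unscaled or
// zero-based modes), the encoding base otherwise. Before the Java heap exists the
// value is loaded from the CompressedOops::ptrs_base_addr() global instead.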
void MacroAssembler::reinit_heapbase() {
  if (UseCompressedOops) {
    if (Universe::heap() != nullptr) {
      if (CompressedOops::base() == nullptr) {
        MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
      } else {
        mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
      }
    } else {
      movptr(r12_heapbase, ExternalAddress(CompressedOops::ptrs_base_addr()));
    }
  }
}
6058

#endif // _LP64

#if COMPILER2_OR_JVMCI

// clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers
6064
void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
6065
  // cnt - number of qwords (8-byte words).
6066
  // base - start address, qword aligned.
6067
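  // Rough sketch of the control flow below:
  //   while (cnt >= 8) { store 64 zero bytes at base; base += 64; cnt -= 8; }
  //   the 0..7 remaining qwords are then cleared either with one masked store
  //   (64-byte-vector path) or with a 32-byte store followed by 8-byte stores.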
  Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
6068
  bool use64byteVector = (MaxVectorSize == 64) && (VM_Version::avx3_threshold() == 0);
6069
  if (use64byteVector) {
6070
    vpxor(xtmp, xtmp, xtmp, AVX_512bit);
6071
  } else if (MaxVectorSize >= 32) {
6072
    vpxor(xtmp, xtmp, xtmp, AVX_256bit);
6073
  } else {
6074
    pxor(xtmp, xtmp);
6075
  }
6076
  jmp(L_zero_64_bytes);
6077

6078
  BIND(L_loop);
6079
  if (MaxVectorSize >= 32) {
6080
    fill64(base, 0, xtmp, use64byteVector);
6081
  } else {
6082
    movdqu(Address(base,  0), xtmp);
6083
    movdqu(Address(base, 16), xtmp);
6084
    movdqu(Address(base, 32), xtmp);
6085
    movdqu(Address(base, 48), xtmp);
6086
  }
6087
  addptr(base, 64);
6088

6089
  BIND(L_zero_64_bytes);
6090
  subptr(cnt, 8);
6091
  jccb(Assembler::greaterEqual, L_loop);
6092

6093
  // Copy trailing 64 bytes
6094
  if (use64byteVector) {
6095
    addptr(cnt, 8);
6096
    jccb(Assembler::equal, L_end);
6097
    fill64_masked(3, base, 0, xtmp, mask, cnt, rtmp, true);
6098
    jmp(L_end);
6099
  } else {
6100
    addptr(cnt, 4);
6101
    jccb(Assembler::less, L_tail);
6102
    if (MaxVectorSize >= 32) {
6103
      vmovdqu(Address(base, 0), xtmp);
6104
    } else {
6105
      movdqu(Address(base,  0), xtmp);
6106
      movdqu(Address(base, 16), xtmp);
6107
    }
6108
  }
6109
  addptr(base, 32);
6110
  subptr(cnt, 4);
6111

6112
  BIND(L_tail);
6113
  addptr(cnt, 4);
6114
  jccb(Assembler::lessEqual, L_end);
6115
  if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) {
6116
    fill32_masked(3, base, 0, xtmp, mask, cnt, rtmp);
6117
  } else {
6118
    decrement(cnt);
6119

6120
    BIND(L_sloop);
6121
    movq(Address(base, 0), xtmp);
6122
    addptr(base, 8);
6123
    decrement(cnt);
6124
    jccb(Assembler::greaterEqual, L_sloop);
6125
  }
6126
  BIND(L_end);
6127
}
6128

6129
// Clearing constant sized memory using YMM/ZMM registers.
6130
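// cnt is a compile-time constant number of qwords. Full 64-byte chunks are cleared
// with fill64 (unrolled, or via a small loop when there are more than 8 of them);
// the remaining 1..7 qwords are cleared with a plain or masked store sized to the tail.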
void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
6131
  assert(UseAVX > 2 && VM_Version::supports_avx512vl(), "");
6132
  bool use64byteVector = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0);
6133

6134
  int vector64_count = (cnt & (~0x7)) >> 3;
6135
  cnt = cnt & 0x7;
6136
  const int fill64_per_loop = 4;
6137
  const int max_unrolled_fill64 = 8;
6138

6139
  // 64 byte initialization loop.
6140
  vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
6141
  int start64 = 0;
6142
  if (vector64_count > max_unrolled_fill64) {
6143
    Label LOOP;
6144
    Register index = rtmp;
6145

6146
    start64 = vector64_count - (vector64_count % fill64_per_loop);
6147

6148
    movl(index, 0);
6149
    BIND(LOOP);
6150
    for (int i = 0; i < fill64_per_loop; i++) {
6151
      fill64(Address(base, index, Address::times_1, i * 64), xtmp, use64byteVector);
6152
    }
6153
    addl(index, fill64_per_loop * 64);
6154
    cmpl(index, start64 * 64);
6155
    jccb(Assembler::less, LOOP);
6156
  }
6157
  for (int i = start64; i < vector64_count; i++) {
6158
    fill64(base, i * 64, xtmp, use64byteVector);
6159
  }
6160

6161
  // Clear remaining 64 byte tail.
6162
  int disp = vector64_count * 64;
6163
  if (cnt) {
6164
    switch (cnt) {
6165
      case 1:
6166
        movq(Address(base, disp), xtmp);
6167
        break;
6168
      case 2:
6169
        evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_128bit);
6170
        break;
6171
      case 3:
6172
        movl(rtmp, 0x7);
6173
        kmovwl(mask, rtmp);
6174
        evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_256bit);
6175
        break;
6176
      case 4:
6177
        evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
6178
        break;
6179
      case 5:
6180
        if (use64byteVector) {
6181
          movl(rtmp, 0x1F);
6182
          kmovwl(mask, rtmp);
6183
          evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
6184
        } else {
6185
          evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
6186
          movq(Address(base, disp + 32), xtmp);
6187
        }
6188
        break;
6189
      case 6:
6190
        if (use64byteVector) {
6191
          movl(rtmp, 0x3F);
6192
          kmovwl(mask, rtmp);
6193
          evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
6194
        } else {
6195
          evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
6196
          evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, false, Assembler::AVX_128bit);
6197
        }
6198
        break;
6199
      case 7:
6200
        if (use64byteVector) {
6201
          movl(rtmp, 0x7F);
6202
          kmovwl(mask, rtmp);
6203
          evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit);
6204
        } else {
6205
          evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit);
6206
          movl(rtmp, 0x7);
6207
          kmovwl(mask, rtmp);
6208
          evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, true, Assembler::AVX_256bit);
6209
        }
6210
        break;
6211
      default:
6212
        fatal("Unexpected length : %d\n",cnt);
6213
        break;
6214
    }
6215
  }
6216
}
6217

6218
void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp,
6219
                               bool is_large, KRegister mask) {
6220
  // cnt      - number of qwords (8-byte words).
6221
  // base     - start address, qword aligned.
6222
  // is_large - if optimizers know cnt is larger than InitArrayShortSize
6223
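  // Counts up to InitArrayShortSize are cleared with a simple store loop; larger
  // blocks use rep stosb, XMM/YMM stores (xmm_clear_mem) or rep stos, depending
  // on UseFastStosb and UseXMMForObjInit.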
  assert(base==rdi, "base register must be edi for rep stos");
6224
  assert(tmp==rax,   "tmp register must be eax for rep stos");
6225
  assert(cnt==rcx,   "cnt register must be ecx for rep stos");
6226
  assert(InitArrayShortSize % BytesPerLong == 0,
6227
    "InitArrayShortSize should be the multiple of BytesPerLong");
6228

6229
  Label DONE;
6230
  if (!is_large || !UseXMMForObjInit) {
6231
    xorptr(tmp, tmp);
6232
  }
6233

6234
  if (!is_large) {
6235
    Label LOOP, LONG;
6236
    cmpptr(cnt, InitArrayShortSize/BytesPerLong);
6237
    jccb(Assembler::greater, LONG);
6238

6239
    NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
6240

6241
    decrement(cnt);
6242
    jccb(Assembler::negative, DONE); // Zero length
6243

6244
    // Use individual pointer-sized stores for small counts:
6245
    BIND(LOOP);
6246
    movptr(Address(base, cnt, Address::times_ptr), tmp);
6247
    decrement(cnt);
6248
    jccb(Assembler::greaterEqual, LOOP);
6249
    jmpb(DONE);
6250

6251
    BIND(LONG);
6252
  }
6253

6254
  // Use longer rep-prefixed ops for non-small counts:
6255
  if (UseFastStosb) {
6256
    shlptr(cnt, 3); // convert to number of bytes
6257
    rep_stosb();
6258
  } else if (UseXMMForObjInit) {
6259
    xmm_clear_mem(base, cnt, tmp, xtmp, mask);
6260
  } else {
6261
    NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
6262
    rep_stos();
6263
  }
6264

6265
  BIND(DONE);
6266
}
6267

6268
#endif //COMPILER2_OR_JVMCI
6269

6270

6271
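// Fill 'count' elements of type t at 'to' with 'value'. The value is first
// replicated into a 32-bit pattern (for byte/short fills) and then stored in
// 64/32-byte chunks using AVX-512/AVX2/SSE where available, with scalar stores
// used to reach alignment and to finish the tail.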
void MacroAssembler::generate_fill(BasicType t, bool aligned,
6272
                                   Register to, Register value, Register count,
6273
                                   Register rtmp, XMMRegister xtmp) {
6274
  ShortBranchVerifier sbv(this);
6275
  assert_different_registers(to, value, count, rtmp);
6276
  Label L_exit;
6277
  Label L_fill_2_bytes, L_fill_4_bytes;
6278

6279
#if defined(COMPILER2) && defined(_LP64)
6280
  if (MaxVectorSize >= 32 &&
6281
     VM_Version::supports_avx512vlbw() &&
6282
     VM_Version::supports_bmi2()) {
6283
    generate_fill_avx3(t, to, value, count, rtmp, xtmp);
6284
    return;
6285
  }
6286
#endif
6287

6288
  int shift = -1;
6289
  switch (t) {
6290
    case T_BYTE:
6291
      shift = 2;
6292
      break;
6293
    case T_SHORT:
6294
      shift = 1;
6295
      break;
6296
    case T_INT:
6297
      shift = 0;
6298
      break;
6299
    default: ShouldNotReachHere();
6300
  }
6301

6302
  if (t == T_BYTE) {
6303
    andl(value, 0xff);
6304
    movl(rtmp, value);
6305
    shll(rtmp, 8);
6306
    orl(value, rtmp);
6307
  }
6308
  if (t == T_SHORT) {
6309
    andl(value, 0xffff);
6310
  }
6311
  if (t == T_BYTE || t == T_SHORT) {
6312
    movl(rtmp, value);
6313
    shll(rtmp, 16);
6314
    orl(value, rtmp);
6315
  }
6316

6317
  cmpptr(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
6318
  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
6319
  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
6320
    Label L_skip_align2;
6321
    // align source address at 4 bytes address boundary
6322
    if (t == T_BYTE) {
6323
      Label L_skip_align1;
6324
      // One byte misalignment happens only for byte arrays
6325
      testptr(to, 1);
6326
      jccb(Assembler::zero, L_skip_align1);
6327
      movb(Address(to, 0), value);
6328
      increment(to);
6329
      decrement(count);
6330
      BIND(L_skip_align1);
6331
    }
6332
    // Two bytes misalignment happens only for byte and short (char) arrays
6333
    testptr(to, 2);
6334
    jccb(Assembler::zero, L_skip_align2);
6335
    movw(Address(to, 0), value);
6336
    addptr(to, 2);
6337
    subptr(count, 1<<(shift-1));
6338
    BIND(L_skip_align2);
6339
  }
6340
  if (UseSSE < 2) {
6341
    Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
6342
    // Fill 32-byte chunks
6343
    subptr(count, 8 << shift);
6344
    jcc(Assembler::less, L_check_fill_8_bytes);
6345
    align(16);
6346

6347
    BIND(L_fill_32_bytes_loop);
6348

6349
    for (int i = 0; i < 32; i += 4) {
6350
      movl(Address(to, i), value);
6351
    }
6352

6353
    addptr(to, 32);
6354
    subptr(count, 8 << shift);
6355
    jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
6356
    BIND(L_check_fill_8_bytes);
6357
    addptr(count, 8 << shift);
6358
    jccb(Assembler::zero, L_exit);
6359
    jmpb(L_fill_8_bytes);
6360

6361
    //
6362
    // length is too short, just fill qwords
6363
    //
6364
    BIND(L_fill_8_bytes_loop);
6365
    movl(Address(to, 0), value);
6366
    movl(Address(to, 4), value);
6367
    addptr(to, 8);
6368
    BIND(L_fill_8_bytes);
6369
    subptr(count, 1 << (shift + 1));
6370
    jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
6371
    // fall through to fill 4 bytes
6372
  } else {
6373
    Label L_fill_32_bytes;
6374
    if (!UseUnalignedLoadStores) {
6375
      // align to 8 bytes, we know we are 4 byte aligned to start
6376
      testptr(to, 4);
6377
      jccb(Assembler::zero, L_fill_32_bytes);
6378
      movl(Address(to, 0), value);
6379
      addptr(to, 4);
6380
      subptr(count, 1<<shift);
6381
    }
6382
    BIND(L_fill_32_bytes);
6383
    {
6384
      assert( UseSSE >= 2, "supported cpu only" );
6385
      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
6386
      movdl(xtmp, value);
6387
      if (UseAVX >= 2 && UseUnalignedLoadStores) {
6388
        Label L_check_fill_32_bytes;
6389
        if (UseAVX > 2) {
6390
          // Fill 64-byte chunks
6391
          Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
6392

6393
          // If number of bytes to fill < VM_Version::avx3_threshold(), perform fill using AVX2
6394
          cmpptr(count, VM_Version::avx3_threshold());
6395
          jccb(Assembler::below, L_check_fill_64_bytes_avx2);
6396

6397
          vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
6398

6399
          subptr(count, 16 << shift);
6400
          jccb(Assembler::less, L_check_fill_32_bytes);
6401
          align(16);
6402

6403
          BIND(L_fill_64_bytes_loop_avx3);
6404
          evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
6405
          addptr(to, 64);
6406
          subptr(count, 16 << shift);
6407
          jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
6408
          jmpb(L_check_fill_32_bytes);
6409

6410
          BIND(L_check_fill_64_bytes_avx2);
6411
        }
6412
        // Fill 64-byte chunks
6413
        Label L_fill_64_bytes_loop;
6414
        vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
6415

6416
        subptr(count, 16 << shift);
6417
        jcc(Assembler::less, L_check_fill_32_bytes);
6418
        align(16);
6419

6420
        BIND(L_fill_64_bytes_loop);
6421
        vmovdqu(Address(to, 0), xtmp);
6422
        vmovdqu(Address(to, 32), xtmp);
6423
        addptr(to, 64);
6424
        subptr(count, 16 << shift);
6425
        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
6426

6427
        BIND(L_check_fill_32_bytes);
6428
        addptr(count, 8 << shift);
6429
        jccb(Assembler::less, L_check_fill_8_bytes);
6430
        vmovdqu(Address(to, 0), xtmp);
6431
        addptr(to, 32);
6432
        subptr(count, 8 << shift);
6433

6434
        BIND(L_check_fill_8_bytes);
6435
        // clean upper bits of YMM registers
6436
        movdl(xtmp, value);
6437
        pshufd(xtmp, xtmp, 0);
6438
      } else {
6439
        // Fill 32-byte chunks
6440
        pshufd(xtmp, xtmp, 0);
6441

6442
        subptr(count, 8 << shift);
6443
        jcc(Assembler::less, L_check_fill_8_bytes);
6444
        align(16);
6445

6446
        BIND(L_fill_32_bytes_loop);
6447

6448
        if (UseUnalignedLoadStores) {
6449
          movdqu(Address(to, 0), xtmp);
6450
          movdqu(Address(to, 16), xtmp);
6451
        } else {
6452
          movq(Address(to, 0), xtmp);
6453
          movq(Address(to, 8), xtmp);
6454
          movq(Address(to, 16), xtmp);
6455
          movq(Address(to, 24), xtmp);
6456
        }
6457

6458
        addptr(to, 32);
6459
        subptr(count, 8 << shift);
6460
        jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
6461

6462
        BIND(L_check_fill_8_bytes);
6463
      }
6464
      addptr(count, 8 << shift);
6465
      jccb(Assembler::zero, L_exit);
6466
      jmpb(L_fill_8_bytes);
6467

6468
      //
6469
      // length is too short, just fill qwords
6470
      //
6471
      BIND(L_fill_8_bytes_loop);
6472
      movq(Address(to, 0), xtmp);
6473
      addptr(to, 8);
6474
      BIND(L_fill_8_bytes);
6475
      subptr(count, 1 << (shift + 1));
6476
      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
6477
    }
6478
  }
6479
  // fill trailing 4 bytes
6480
  BIND(L_fill_4_bytes);
6481
  testl(count, 1<<shift);
6482
  jccb(Assembler::zero, L_fill_2_bytes);
6483
  movl(Address(to, 0), value);
6484
  if (t == T_BYTE || t == T_SHORT) {
6485
    Label L_fill_byte;
6486
    addptr(to, 4);
6487
    BIND(L_fill_2_bytes);
6488
    // fill trailing 2 bytes
6489
    testl(count, 1<<(shift-1));
6490
    jccb(Assembler::zero, L_fill_byte);
6491
    movw(Address(to, 0), value);
6492
    if (t == T_BYTE) {
6493
      addptr(to, 2);
6494
      BIND(L_fill_byte);
6495
      // fill trailing byte
6496
      testl(count, 1);
6497
      jccb(Assembler::zero, L_exit);
6498
      movb(Address(to, 0), value);
6499
    } else {
6500
      BIND(L_fill_byte);
6501
    }
6502
  } else {
6503
    BIND(L_fill_2_bytes);
6504
  }
6505
  BIND(L_exit);
6506
}
6507

6508
void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) {
6509
  switch(type) {
6510
    case T_BYTE:
6511
    case T_BOOLEAN:
6512
      evpbroadcastb(dst, src, vector_len);
6513
      break;
6514
    case T_SHORT:
6515
    case T_CHAR:
6516
      evpbroadcastw(dst, src, vector_len);
6517
      break;
6518
    case T_INT:
6519
    case T_FLOAT:
6520
      evpbroadcastd(dst, src, vector_len);
6521
      break;
6522
    case T_LONG:
6523
    case T_DOUBLE:
6524
      evpbroadcastq(dst, src, vector_len);
6525
      break;
6526
    default:
6527
      fatal("Unhandled type : %s", type2name(type));
6528
      break;
6529
  }
6530
}
6531

6532
// encode char[] to byte[] in ISO_8859_1 or ASCII
6533
   //@IntrinsicCandidate
6534
   //private static int implEncodeISOArray(byte[] sa, int sp,
6535
   //byte[] da, int dp, int len) {
6536
   //  int i = 0;
6537
   //  for (; i < len; i++) {
6538
   //    char c = StringUTF16.getChar(sa, sp++);
6539
   //    if (c > '\u00FF')
6540
   //      break;
6541
   //    da[dp++] = (byte)c;
6542
   //  }
6543
   //  return i;
6544
   //}
6545
   //
6546
   //@IntrinsicCandidate
6547
   //private static int implEncodeAsciiArray(char[] sa, int sp,
6548
   //    byte[] da, int dp, int len) {
6549
   //  int i = 0;
6550
   //  for (; i < len; i++) {
6551
   //    char c = sa[sp++];
6552
   //    if (c >= '\u0080')
6553
   //      break;
6554
   //    da[dp++] = (byte)c;
6555
   //  }
6556
   //  return i;
6557
   //}
6558
void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
6559
  XMMRegister tmp1Reg, XMMRegister tmp2Reg,
6560
  XMMRegister tmp3Reg, XMMRegister tmp4Reg,
6561
  Register tmp5, Register result, bool ascii) {
6562

6563
  // rsi: src
6564
  // rdi: dst
6565
  // rdx: len
6566
  // rcx: tmp5
6567
  // rax: result
6568
  ShortBranchVerifier sbv(this);
6569
  assert_different_registers(src, dst, len, tmp5, result);
6570
  Label L_done, L_copy_1_char, L_copy_1_char_exit;
6571

6572
  int mask = ascii ? 0xff80ff80 : 0xff00ff00;
6573
  int short_mask = ascii ? 0xff80 : 0xff00;
6574

6575
  // set result
6576
  xorl(result, result);
6577
  // check for zero length
6578
  testl(len, len);
6579
  jcc(Assembler::zero, L_done);
6580

6581
  movl(result, len);
6582

6583
  // Setup pointers
6584
  lea(src, Address(src, len, Address::times_2)); // char[]
6585
  lea(dst, Address(dst, len, Address::times_1)); // byte[]
6586
  negptr(len);
6587
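  // src/dst now point one element past the end of their arrays and len is negative;
  // the loops below address elements as (base + len*scale) and count len up to zero.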

6588
  if (UseSSE42Intrinsics || UseAVX >= 2) {
6589
    Label L_copy_8_chars, L_copy_8_chars_exit;
6590
    Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
6591

6592
    if (UseAVX >= 2) {
6593
      Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
6594
      movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
6595
      movdl(tmp1Reg, tmp5);
6596
      vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
6597
      jmp(L_chars_32_check);
6598

6599
      bind(L_copy_32_chars);
6600
      vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
6601
      vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
6602
      vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
6603
      vptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
6604
      jccb(Assembler::notZero, L_copy_32_chars_exit);
6605
      vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
6606
      vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
6607
      vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
6608

6609
      bind(L_chars_32_check);
6610
      addptr(len, 32);
6611
      jcc(Assembler::lessEqual, L_copy_32_chars);
6612

6613
      bind(L_copy_32_chars_exit);
6614
      subptr(len, 16);
6615
      jccb(Assembler::greater, L_copy_16_chars_exit);
6616

6617
    } else if (UseSSE42Intrinsics) {
6618
      movl(tmp5, mask);   // create mask to test for Unicode or non-ASCII chars in vector
6619
      movdl(tmp1Reg, tmp5);
6620
      pshufd(tmp1Reg, tmp1Reg, 0);
6621
      jmpb(L_chars_16_check);
6622
    }
6623

6624
    bind(L_copy_16_chars);
6625
    if (UseAVX >= 2) {
6626
      vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
6627
      vptest(tmp2Reg, tmp1Reg);
6628
      jcc(Assembler::notZero, L_copy_16_chars_exit);
6629
      vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
6630
      vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
6631
    } else {
6632
      if (UseAVX > 0) {
6633
        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
6634
        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
6635
        vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
6636
      } else {
6637
        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
6638
        por(tmp2Reg, tmp3Reg);
6639
        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
6640
        por(tmp2Reg, tmp4Reg);
6641
      }
6642
      ptest(tmp2Reg, tmp1Reg);       // check for Unicode or non-ASCII chars in vector
6643
      jccb(Assembler::notZero, L_copy_16_chars_exit);
6644
      packuswb(tmp3Reg, tmp4Reg);
6645
    }
6646
    movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
6647

6648
    bind(L_chars_16_check);
6649
    addptr(len, 16);
6650
    jcc(Assembler::lessEqual, L_copy_16_chars);
6651

6652
    bind(L_copy_16_chars_exit);
6653
    if (UseAVX >= 2) {
6654
      // clean upper bits of YMM registers
6655
      vpxor(tmp2Reg, tmp2Reg);
6656
      vpxor(tmp3Reg, tmp3Reg);
6657
      vpxor(tmp4Reg, tmp4Reg);
6658
      movdl(tmp1Reg, tmp5);
6659
      pshufd(tmp1Reg, tmp1Reg, 0);
6660
    }
6661
    subptr(len, 8);
6662
    jccb(Assembler::greater, L_copy_8_chars_exit);
6663

6664
    bind(L_copy_8_chars);
6665
    movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
6666
    ptest(tmp3Reg, tmp1Reg);
6667
    jccb(Assembler::notZero, L_copy_8_chars_exit);
6668
    packuswb(tmp3Reg, tmp1Reg);
6669
    movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
6670
    addptr(len, 8);
6671
    jccb(Assembler::lessEqual, L_copy_8_chars);
6672

6673
    bind(L_copy_8_chars_exit);
6674
    subptr(len, 8);
6675
    jccb(Assembler::zero, L_done);
6676
  }
6677

6678
  bind(L_copy_1_char);
6679
  load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
6680
  testl(tmp5, short_mask);      // check if Unicode or non-ASCII char
6681
  jccb(Assembler::notZero, L_copy_1_char_exit);
6682
  movb(Address(dst, len, Address::times_1, 0), tmp5);
6683
  addptr(len, 1);
6684
  jccb(Assembler::less, L_copy_1_char);
6685

6686
  bind(L_copy_1_char_exit);
6687
  addptr(result, len); // len is negative count of not processed elements
6688

6689
  bind(L_done);
6690
}
6691

6692
#ifdef _LP64
6693
/**
6694
 * Helper for multiply_to_len().
6695
 */
6696
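// Computes dest_hi:dest_lo += src1 + src2, i.e. a 128-bit accumulate with carry
// propagation into the high word.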
void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
  addq(dest_lo, src1);
  adcq(dest_hi, 0);
  addq(dest_lo, src2);
  adcq(dest_hi, 0);
}
6702

6703
/**
6704
 * Multiply 64 bit by 64 bit first loop.
6705
 */
6706
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
6707
                                           Register y, Register y_idx, Register z,
6708
                                           Register carry, Register product,
6709
                                           Register idx, Register kdx) {
6710
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //
6719

6720
  Label L_first_loop, L_first_loop_exit;
6721
  Label L_one_x, L_one_y, L_multiply;
6722

6723
  decrementl(xstart);
6724
  jcc(Assembler::negative, L_one_x);
6725

6726
  movq(x_xstart, Address(x, xstart, Address::times_4,  0));
6727
  rorq(x_xstart, 32); // convert big-endian to little-endian
6728

6729
  bind(L_first_loop);
6730
  decrementl(idx);
6731
  jcc(Assembler::negative, L_first_loop_exit);
6732
  decrementl(idx);
6733
  jcc(Assembler::negative, L_one_y);
6734
  movq(y_idx, Address(y, idx, Address::times_4,  0));
6735
  rorq(y_idx, 32); // convert big-endian to little-endian
6736
  bind(L_multiply);
6737
  movq(product, x_xstart);
6738
  mulq(y_idx); // product(rax) * y_idx -> rdx:rax
6739
  addq(product, carry);
6740
  adcq(rdx, 0);
6741
  subl(kdx, 2);
6742
  movl(Address(z, kdx, Address::times_4,  4), product);
6743
  shrq(product, 32);
6744
  movl(Address(z, kdx, Address::times_4,  0), product);
6745
  movq(carry, rdx);
6746
  jmp(L_first_loop);
6747

6748
  bind(L_one_y);
6749
  movl(y_idx, Address(y,  0));
6750
  jmp(L_multiply);
6751

6752
  bind(L_one_x);
6753
  movl(x_xstart, Address(x,  0));
6754
  jmp(L_first_loop);
6755

6756
  bind(L_first_loop_exit);
6757
}
6758

6759
/**
6760
 * Multiply 64 bit by 64 bit and add 128 bit.
6761
 */
6762
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
6763
                                            Register yz_idx, Register idx,
6764
                                            Register carry, Register product, int offset) {
6765
  //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
6766
  //     z[kdx] = (jlong)product;
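  //     carry  = (jlong)(product >>> 64);   // left in rdx for the caller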
6767

6768
  movq(yz_idx, Address(y, idx, Address::times_4,  offset));
6769
  rorq(yz_idx, 32); // convert big-endian to little-endian
6770
  movq(product, x_xstart);
6771
  mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
6772
  movq(yz_idx, Address(z, idx, Address::times_4,  offset));
6773
  rorq(yz_idx, 32); // convert big-endian to little-endian
6774

6775
  add2_with_carry(rdx, product, carry, yz_idx);
6776

6777
  movl(Address(z, idx, Address::times_4,  offset+4), product);
6778
  shrq(product, 32);
6779
  movl(Address(z, idx, Address::times_4,  offset), product);
6780

6781
}
6782

6783
/**
6784
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
6785
 */
6786
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
6787
                                             Register yz_idx, Register idx, Register jdx,
6788
                                             Register carry, Register product,
6789
                                             Register carry2) {
6790
  //   jlong carry, x[], y[], z[];
6791
  //   int kdx = ystart+1;
6792
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
6793
  //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
6794
  //     z[kdx+idx+1] = (jlong)product;
6795
  //     jlong carry2  = (jlong)(product >>> 64);
6796
  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
6797
  //     z[kdx+idx] = (jlong)product;
6798
  //     carry  = (jlong)(product >>> 64);
6799
  //   }
6800
  //   idx += 2;
6801
  //   if (idx > 0) {
6802
  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
6803
  //     z[kdx+idx] = (jlong)product;
6804
  //     carry  = (jlong)(product >>> 64);
6805
  //   }
6806
  //
6807

6808
  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
6809

6810
  movl(jdx, idx);
6811
  andl(jdx, 0xFFFFFFFC);
6812
  shrl(jdx, 2);
6813

6814
  bind(L_third_loop);
6815
  subl(jdx, 1);
6816
  jcc(Assembler::negative, L_third_loop_exit);
6817
  subl(idx, 4);
6818

6819
  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
6820
  movq(carry2, rdx);
6821

6822
  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
6823
  movq(carry, rdx);
6824
  jmp(L_third_loop);
6825

6826
  bind (L_third_loop_exit);
6827

6828
  andl (idx, 0x3);
6829
  jcc(Assembler::zero, L_post_third_loop_done);
6830

6831
  Label L_check_1;
6832
  subl(idx, 2);
6833
  jcc(Assembler::negative, L_check_1);
6834

6835
  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
6836
  movq(carry, rdx);
6837

6838
  bind (L_check_1);
6839
  addl (idx, 0x2);
6840
  andl (idx, 0x1);
6841
  subl(idx, 1);
6842
  jcc(Assembler::negative, L_post_third_loop_done);
6843

6844
  movl(yz_idx, Address(y, idx, Address::times_4,  0));
6845
  movq(product, x_xstart);
6846
  mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
6847
  movl(yz_idx, Address(z, idx, Address::times_4,  0));
6848

6849
  add2_with_carry(rdx, product, yz_idx, carry);
6850

6851
  movl(Address(z, idx, Address::times_4,  0), product);
6852
  shrq(product, 32);
6853

6854
  shlq(rdx, 32);
6855
  orq(product, rdx);
6856
  movq(carry, product);
6857

6858
  bind(L_post_third_loop_done);
6859
}
6860

6861
/**
6862
 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
6863
 *
6864
 */
6865
void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
6866
                                                  Register carry, Register carry2,
6867
                                                  Register idx, Register jdx,
6868
                                                  Register yz_idx1, Register yz_idx2,
6869
                                                  Register tmp, Register tmp3, Register tmp4) {
6870
  assert(UseBMI2Instructions, "should be used only when BMI2 is available");
6871

6872
  //   jlong carry, x[], y[], z[];
6873
  //   int kdx = ystart+1;
6874
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
6875
  //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
6876
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
6877
  //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
6878
  //     carry  = (jlong)(tmp4 >>> 64);
6879
  //     z[kdx+idx+1] = (jlong)tmp3;
6880
  //     z[kdx+idx] = (jlong)tmp4;
6881
  //   }
6882
  //   idx += 2;
6883
  //   if (idx > 0) {
6884
  //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
6885
  //     z[kdx+idx] = (jlong)yz_idx1;
6886
  //     carry  = (jlong)(yz_idx1 >>> 64);
6887
  //   }
6888
  //
6889

6890
  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
6891

6892
  movl(jdx, idx);
6893
  andl(jdx, 0xFFFFFFFC);
6894
  shrl(jdx, 2);
6895

6896
  bind(L_third_loop);
6897
  subl(jdx, 1);
6898
  jcc(Assembler::negative, L_third_loop_exit);
6899
  subl(idx, 4);
6900

6901
  movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
6902
  rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
6903
  movq(yz_idx2, Address(y, idx, Address::times_4,  0));
6904
  rorxq(yz_idx2, yz_idx2, 32);
6905

6906
  mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
6907
  mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
6908

6909
  movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
6910
  rorxq(yz_idx1, yz_idx1, 32);
6911
  movq(yz_idx2, Address(z, idx, Address::times_4,  0));
6912
  rorxq(yz_idx2, yz_idx2, 32);
6913

6914
  if (VM_Version::supports_adx()) {
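    // With ADX, adcx consumes/produces only CF and adox only OF, so the two
    // accumulations below proceed as independent carry chains.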
6915
    adcxq(tmp3, carry);
6916
    adoxq(tmp3, yz_idx1);
6917

6918
    adcxq(tmp4, tmp);
6919
    adoxq(tmp4, yz_idx2);
6920

6921
    movl(carry, 0); // does not affect flags
6922
    adcxq(carry2, carry);
6923
    adoxq(carry2, carry);
6924
  } else {
6925
    add2_with_carry(tmp4, tmp3, carry, yz_idx1);
6926
    add2_with_carry(carry2, tmp4, tmp, yz_idx2);
6927
  }
6928
  movq(carry, carry2);
6929

6930
  movl(Address(z, idx, Address::times_4, 12), tmp3);
6931
  shrq(tmp3, 32);
6932
  movl(Address(z, idx, Address::times_4,  8), tmp3);
6933

6934
  movl(Address(z, idx, Address::times_4,  4), tmp4);
6935
  shrq(tmp4, 32);
6936
  movl(Address(z, idx, Address::times_4,  0), tmp4);
6937

6938
  jmp(L_third_loop);
6939

6940
  bind (L_third_loop_exit);
6941

6942
  andl (idx, 0x3);
6943
  jcc(Assembler::zero, L_post_third_loop_done);
6944

6945
  Label L_check_1;
6946
  subl(idx, 2);
6947
  jcc(Assembler::negative, L_check_1);
6948

6949
  movq(yz_idx1, Address(y, idx, Address::times_4,  0));
6950
  rorxq(yz_idx1, yz_idx1, 32);
6951
  mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
6952
  movq(yz_idx2, Address(z, idx, Address::times_4,  0));
6953
  rorxq(yz_idx2, yz_idx2, 32);
6954

6955
  add2_with_carry(tmp4, tmp3, carry, yz_idx2);
6956

6957
  movl(Address(z, idx, Address::times_4,  4), tmp3);
6958
  shrq(tmp3, 32);
6959
  movl(Address(z, idx, Address::times_4,  0), tmp3);
6960
  movq(carry, tmp4);
6961

6962
  bind (L_check_1);
6963
  addl (idx, 0x2);
6964
  andl (idx, 0x1);
6965
  subl(idx, 1);
6966
  jcc(Assembler::negative, L_post_third_loop_done);
6967
  movl(tmp4, Address(y, idx, Address::times_4,  0));
6968
  mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
6969
  movl(tmp4, Address(z, idx, Address::times_4,  0));
6970

6971
  add2_with_carry(carry2, tmp3, tmp4, carry);
6972

6973
  movl(Address(z, idx, Address::times_4,  0), tmp3);
6974
  shrq(tmp3, 32);
6975

6976
  shlq(carry2, 32);
6977
  orq(tmp3, carry2);
6978
  movq(carry, tmp3);
6979

6980
  bind(L_post_third_loop_done);
6981
}
6982

6983
/**
6984
 * Code for BigInteger::multiplyToLen() intrinsic.
6985
 *
6986
 * rdi: x
6987
 * rax: xlen
6988
 * rsi: y
6989
 * rcx: ylen
6990
 * r8:  z
6991
 * r11: tmp0
6992
 * r12: tmp1
6993
 * r13: tmp2
6994
 * r14: tmp3
6995
 * r15: tmp4
6996
 * rbx: tmp5
6997
 *
6998
 */
6999
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register tmp0,
7000
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
7001
  ShortBranchVerifier sbv(this);
7002
  assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
7003

7004
  push(tmp0);
7005
  push(tmp1);
7006
  push(tmp2);
7007
  push(tmp3);
7008
  push(tmp4);
7009
  push(tmp5);
7010

7011
  push(xlen);
7012

7013
  const Register idx = tmp1;
7014
  const Register kdx = tmp2;
7015
  const Register xstart = tmp3;
7016

7017
  const Register y_idx = tmp4;
7018
  const Register carry = tmp5;
7019
  const Register product  = xlen;
7020
  const Register x_xstart = tmp0;
7021

7022
  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //
7035

7036
  movl(idx, ylen);               // idx = ylen;
7037
  lea(kdx, Address(xlen, ylen)); // kdx = xlen+ylen;
7038
  xorq(carry, carry);            // carry = 0;
7039

7040
  Label L_done;
7041

7042
  movl(xstart, xlen);
7043
  decrementl(xstart);
7044
  jcc(Assembler::negative, L_done);
7045

7046
  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
7047

7048
  Label L_second_loop;
7049
  testl(kdx, kdx);
7050
  jcc(Assembler::zero, L_second_loop);
7051

7052
  Label L_carry;
7053
  subl(kdx, 1);
7054
  jcc(Assembler::zero, L_carry);
7055

7056
  movl(Address(z, kdx, Address::times_4,  0), carry);
7057
  shrq(carry, 32);
7058
  subl(kdx, 1);
7059

7060
  bind(L_carry);
7061
  movl(Address(z, kdx, Address::times_4,  0), carry);
7062

7063
  // Second and third (nested) loops.
7064
  //
7065
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
7066
  //   carry = 0;
7067
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
7068
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
7069
  //                    (z[k] & LONG_MASK) + carry;
7070
  //     z[k] = (int)product;
7071
  //     carry = product >>> 32;
7072
  //   }
7073
  //   z[i] = (int)carry;
7074
  // }
7075
  //
7076
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
7077

7078
  const Register jdx = tmp1;
7079

7080
  bind(L_second_loop);
7081
  xorl(carry, carry);    // carry = 0;
7082
  movl(jdx, ylen);       // j = ystart+1
7083

7084
  subl(xstart, 1);       // i = xstart-1;
7085
  jcc(Assembler::negative, L_done);
7086

7087
  push (z);
7088

7089
  Label L_last_x;
7090
  lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
7091
  subl(xstart, 1);       // i = xstart-1;
7092
  jcc(Assembler::negative, L_last_x);
7093

7094
  if (UseBMI2Instructions) {
7095
    movq(rdx,  Address(x, xstart, Address::times_4,  0));
7096
    rorxq(rdx, rdx, 32); // convert big-endian to little-endian
7097
  } else {
7098
    movq(x_xstart, Address(x, xstart, Address::times_4,  0));
7099
    rorq(x_xstart, 32);  // convert big-endian to little-endian
7100
  }
7101

7102
  Label L_third_loop_prologue;
7103
  bind(L_third_loop_prologue);
7104

7105
  push (x);
7106
  push (xstart);
7107
  push (ylen);
7108

7109

7110
  if (UseBMI2Instructions) {
7111
    multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
7112
  } else { // !UseBMI2Instructions
7113
    multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
7114
  }
7115

7116
  pop(ylen);
7117
  pop(xlen);
7118
  pop(x);
7119
  pop(z);
7120

7121
  movl(tmp3, xlen);
7122
  addl(tmp3, 1);
7123
  movl(Address(z, tmp3, Address::times_4,  0), carry);
7124
  subl(tmp3, 1);
7125
  jccb(Assembler::negative, L_done);
7126

7127
  shrq(carry, 32);
7128
  movl(Address(z, tmp3, Address::times_4,  0), carry);
7129
  jmp(L_second_loop);
7130

7131
  // Next infrequent code is moved outside loops.
7132
  bind(L_last_x);
7133
  if (UseBMI2Instructions) {
7134
    movl(rdx, Address(x,  0));
7135
  } else {
7136
    movl(x_xstart, Address(x,  0));
7137
  }
7138
  jmp(L_third_loop_prologue);
7139

7140
  bind(L_done);
7141

7142
  pop(xlen);
7143

7144
  pop(tmp5);
7145
  pop(tmp4);
7146
  pop(tmp3);
7147
  pop(tmp2);
7148
  pop(tmp1);
7149
  pop(tmp0);
7150
}
7151

7152
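// Compare 'length' elements (scaled by log2_array_indxscale to a byte count) of
// obja and objb. On return, result holds the element index of the first mismatch,
// or -1 if the ranges are equal. AVX-512/AVX2/SSE paths compare 64/32/16-byte
// chunks before falling back to 8-, 4- and 1-byte compares for the tail.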
void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
7153
  Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
7154
  assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
7155
  Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
7156
  Label VECTOR8_TAIL, VECTOR4_TAIL;
7157
  Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
7158
  Label SAME_TILL_END, DONE;
7159
  Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
7160

7161
  //scale is in rcx in both Win64 and Unix
7162
  ShortBranchVerifier sbv(this);
7163

7164
  shlq(length);
7165
  xorq(result, result);
7166

7167
  if ((AVX3Threshold == 0) && (UseAVX > 2) &&
7168
      VM_Version::supports_avx512vlbw()) {
7169
    Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
7170

7171
    cmpq(length, 64);
7172
    jcc(Assembler::less, VECTOR32_TAIL);
7173

7174
    movq(tmp1, length);
7175
    andq(tmp1, 0x3F);      // tail count
7176
    andq(length, ~(0x3F)); //vector count
7177

7178
    bind(VECTOR64_LOOP);
7179
    // AVX512 code to compare 64 byte vectors.
7180
    evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
7181
    evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
7182
    kortestql(k7, k7);
7183
    jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
7184
    addq(result, 64);
7185
    subq(length, 64);
7186
    jccb(Assembler::notZero, VECTOR64_LOOP);
7187

7188
    //bind(VECTOR64_TAIL);
7189
    testq(tmp1, tmp1);
7190
    jcc(Assembler::zero, SAME_TILL_END);
7191

7192
    //bind(VECTOR64_TAIL);
7193
    // AVX512 code to compare up to 63 byte vectors.
7194
    mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
7195
    shlxq(tmp2, tmp2, tmp1);
7196
    notq(tmp2);
7197
    kmovql(k3, tmp2);
7198

7199
    evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
7200
    evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
7201

7202
    ktestql(k7, k3);
7203
    jcc(Assembler::below, SAME_TILL_END);     // not mismatch
7204

7205
    bind(VECTOR64_NOT_EQUAL);
7206
    kmovql(tmp1, k7);
7207
    notq(tmp1);
7208
    tzcntq(tmp1, tmp1);
7209
    addq(result, tmp1);
7210
    shrq(result);
7211
    jmp(DONE);
7212
    bind(VECTOR32_TAIL);
7213
  }
7214

7215
  cmpq(length, 8);
7216
  jcc(Assembler::equal, VECTOR8_LOOP);
7217
  jcc(Assembler::less, VECTOR4_TAIL);
7218

7219
  if (UseAVX >= 2) {
7220
    Label VECTOR16_TAIL, VECTOR32_LOOP;
7221

7222
    cmpq(length, 16);
7223
    jcc(Assembler::equal, VECTOR16_LOOP);
7224
    jcc(Assembler::less, VECTOR8_LOOP);
7225

7226
    cmpq(length, 32);
7227
    jccb(Assembler::less, VECTOR16_TAIL);
7228

7229
    subq(length, 32);
7230
    bind(VECTOR32_LOOP);
7231
    vmovdqu(rymm0, Address(obja, result));
7232
    vmovdqu(rymm1, Address(objb, result));
7233
    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
7234
    vptest(rymm2, rymm2);
7235
    jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
7236
    addq(result, 32);
7237
    subq(length, 32);
7238
    jcc(Assembler::greaterEqual, VECTOR32_LOOP);
7239
    addq(length, 32);
7240
    jcc(Assembler::equal, SAME_TILL_END);
7241
    //falling through if less than 32 bytes left //close the branch here.
7242

7243
    bind(VECTOR16_TAIL);
7244
    cmpq(length, 16);
7245
    jccb(Assembler::less, VECTOR8_TAIL);
7246
    bind(VECTOR16_LOOP);
7247
    movdqu(rymm0, Address(obja, result));
7248
    movdqu(rymm1, Address(objb, result));
7249
    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
7250
    ptest(rymm2, rymm2);
7251
    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
7252
    addq(result, 16);
7253
    subq(length, 16);
7254
    jcc(Assembler::equal, SAME_TILL_END);
7255
    //falling through if less than 16 bytes left
7256
  } else {//regular intrinsics
7257

7258
    cmpq(length, 16);
7259
    jccb(Assembler::less, VECTOR8_TAIL);
7260

7261
    subq(length, 16);
7262
    bind(VECTOR16_LOOP);
7263
    movdqu(rymm0, Address(obja, result));
7264
    movdqu(rymm1, Address(objb, result));
7265
    pxor(rymm0, rymm1);
7266
    ptest(rymm0, rymm0);
7267
    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
7268
    addq(result, 16);
7269
    subq(length, 16);
7270
    jccb(Assembler::greaterEqual, VECTOR16_LOOP);
7271
    addq(length, 16);
7272
    jcc(Assembler::equal, SAME_TILL_END);
7273
    //falling through if less than 16 bytes left
7274
  }
7275

7276
  bind(VECTOR8_TAIL);
7277
  cmpq(length, 8);
7278
  jccb(Assembler::less, VECTOR4_TAIL);
7279
  bind(VECTOR8_LOOP);
7280
  movq(tmp1, Address(obja, result));
7281
  movq(tmp2, Address(objb, result));
7282
  xorq(tmp1, tmp2);
7283
  testq(tmp1, tmp1);
7284
  jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
7285
  addq(result, 8);
7286
  subq(length, 8);
7287
  jcc(Assembler::equal, SAME_TILL_END);
7288
  //falling through if less than 8 bytes left
7289

7290
  bind(VECTOR4_TAIL);
7291
  cmpq(length, 4);
7292
  jccb(Assembler::less, BYTES_TAIL);
7293
  bind(VECTOR4_LOOP);
7294
  movl(tmp1, Address(obja, result));
7295
  xorl(tmp1, Address(objb, result));
7296
  testl(tmp1, tmp1);
7297
  jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
7298
  addq(result, 4);
7299
  subq(length, 4);
7300
  jcc(Assembler::equal, SAME_TILL_END);
7301
  //falling through if less than 4 bytes left
7302

7303
  bind(BYTES_TAIL);
7304
  bind(BYTES_LOOP);
7305
  load_unsigned_byte(tmp1, Address(obja, result));
7306
  load_unsigned_byte(tmp2, Address(objb, result));
7307
  xorl(tmp1, tmp2);
7308
  testl(tmp1, tmp1);
7309
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
7310
  decq(length);
7311
  jcc(Assembler::zero, SAME_TILL_END);
7312
  incq(result);
7313
  load_unsigned_byte(tmp1, Address(obja, result));
7314
  load_unsigned_byte(tmp2, Address(objb, result));
7315
  xorl(tmp1, tmp2);
7316
  testl(tmp1, tmp1);
7317
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
7318
  decq(length);
7319
  jcc(Assembler::zero, SAME_TILL_END);
7320
  incq(result);
7321
  load_unsigned_byte(tmp1, Address(obja, result));
7322
  load_unsigned_byte(tmp2, Address(objb, result));
7323
  xorl(tmp1, tmp2);
7324
  testl(tmp1, tmp1);
7325
  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
7326
  jmp(SAME_TILL_END);
7327

7328
  if (UseAVX >= 2) {
7329
    bind(VECTOR32_NOT_EQUAL);
7330
    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
7331
    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
7332
    vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
7333
    vpmovmskb(tmp1, rymm0);
7334
    bsfq(tmp1, tmp1);
7335
    addq(result, tmp1);
7336
    shrq(result);
7337
    jmp(DONE);
7338
  }
7339

7340
  bind(VECTOR16_NOT_EQUAL);
7341
  if (UseAVX >= 2) {
7342
    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
7343
    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
7344
    pxor(rymm0, rymm2);
7345
  } else {
7346
    pcmpeqb(rymm2, rymm2);
7347
    pxor(rymm0, rymm1);
7348
    pcmpeqb(rymm0, rymm1);
7349
    pxor(rymm0, rymm2);
7350
  }
7351
  pmovmskb(tmp1, rymm0);
7352
  bsfq(tmp1, tmp1);
7353
  addq(result, tmp1);
7354
  shrq(result);
7355
  jmpb(DONE);
7356

7357
  bind(VECTOR8_NOT_EQUAL);
7358
  bind(VECTOR4_NOT_EQUAL);
7359
  bsfq(tmp1, tmp1);
7360
  shrq(tmp1, 3);
7361
  addq(result, tmp1);
7362
  bind(BYTES_NOT_EQUAL);
7363
  shrq(result);
7364
  jmpb(DONE);
7365

7366
  bind(SAME_TILL_END);
7367
  mov64(result, -1);
7368

7369
  bind(DONE);
7370
}
7371

7372
//Helper functions for square_to_len()
7373

7374
/**
7375
 * Store the squares of x[], right shifted one bit (divided by 2) into z[]
7376
 * Preserves x and z and modifies rest of the registers.
7377
 */
7378
void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7379
  // Perform square and right shift by 1
7380
  // Handle odd xlen case first, then for even xlen do the following
7381
  // jlong carry = 0;
7382
  // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
7383
  //     huge_128 product = x[j:j+1] * x[j:j+1];
7384
  //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
7385
  //     z[i+2:i+3] = (jlong)(product >>> 1);
7386
  //     carry = (jlong)product;
7387
  // }
7388

7389
  xorq(tmp5, tmp5);     // carry
7390
  xorq(rdxReg, rdxReg);
7391
  xorl(tmp1, tmp1);     // index for x
7392
  xorl(tmp4, tmp4);     // index for z
7393

7394
  Label L_first_loop, L_first_loop_exit;
7395

7396
  testl(xlen, 1);
7397
  jccb(Assembler::zero, L_first_loop); //jump if xlen is even
7398

7399
  // Square and right shift by 1 the odd element using 32 bit multiply
7400
  movl(raxReg, Address(x, tmp1, Address::times_4, 0));
7401
  imulq(raxReg, raxReg);
7402
  shrq(raxReg, 1);
7403
  adcq(tmp5, 0);
7404
  movq(Address(z, tmp4, Address::times_4, 0), raxReg);
7405
  incrementl(tmp1);
7406
  addl(tmp4, 2);
7407

7408
  // Square and  right shift by 1 the rest using 64 bit multiply
7409
  bind(L_first_loop);
7410
  cmpptr(tmp1, xlen);
7411
  jccb(Assembler::equal, L_first_loop_exit);
7412

7413
  // Square
7414
  movq(raxReg, Address(x, tmp1, Address::times_4,  0));
7415
  rorq(raxReg, 32);    // convert big-endian to little-endian
7416
  mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
7417

7418
  // Right shift by 1 and save carry
7419
  shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
7420
  rcrq(rdxReg, 1);
7421
  rcrq(raxReg, 1);
7422
  adcq(tmp5, 0);
7423

7424
  // Store result in z
7425
  movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
7426
  movq(Address(z, tmp4, Address::times_4, 8), raxReg);
7427

7428
  // Update indices for x and z
7429
  addl(tmp1, 2);
7430
  addl(tmp4, 4);
7431
  jmp(L_first_loop);
7432

7433
  bind(L_first_loop_exit);
7434
}
7435

7436

7437
/**
7438
 * Perform the following multiply add operation using BMI2 instructions
7439
 * carry:sum = sum + op1*op2 + carry
7440
 * op2 should be in rdx
7441
 * op2 is preserved, all other registers are modified
7442
 */
7443
void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
7444
  // assert op2 is rdx
7445
  mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
7446
  addq(sum, carry);
7447
  adcq(tmp2, 0);
7448
  addq(sum, op1);
7449
  adcq(tmp2, 0);
7450
  movq(carry, tmp2);
7451
}
7452

7453
/**
7454
 * Perform the following multiply add operation:
7455
 * carry:sum = sum + op1*op2 + carry
7456
 * Preserves op1, op2 and modifies rest of registers
7457
 */
7458
void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
7459
  // rdx:rax = op1 * op2
7460
  movq(raxReg, op2);
7461
  mulq(op1);
7462

7463
  //  rdx:rax = sum + carry + rdx:rax
7464
  addq(sum, carry);
7465
  adcq(rdxReg, 0);
7466
  addq(sum, raxReg);
7467
  adcq(rdxReg, 0);
7468

7469
  // carry:sum = rdx:sum
7470
  movq(carry, rdxReg);
7471
}
7472

7473
/**
7474
 * Add 64 bit long carry into z[] with carry propagation.
7475
 * Preserves z and carry register values and modifies rest of registers.
7476
 *
7477
 */
7478
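// The carry is added into the 64-bit word at z[zlen-2 .. zlen-1]; while that add
// overflows, a 1 is propagated into the next more significant 64-bit word.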
void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
7479
  Label L_fourth_loop, L_fourth_loop_exit;
7480

7481
  movl(tmp1, 1);
7482
  subl(zlen, 2);
7483
  addq(Address(z, zlen, Address::times_4, 0), carry);
7484

7485
  bind(L_fourth_loop);
7486
  jccb(Assembler::carryClear, L_fourth_loop_exit);
7487
  subl(zlen, 2);
7488
  jccb(Assembler::negative, L_fourth_loop_exit);
7489
  addq(Address(z, zlen, Address::times_4, 0), tmp1);
7490
  jmp(L_fourth_loop);
7491
  bind(L_fourth_loop_exit);
7492
}
7493

7494
/**
7495
 * Shift z[] left by 1 bit.
7496
 * Preserves x, len, z and zlen registers and modifies rest of the registers.
7497
 *
7498
 */
7499
void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
7500

7501
  Label L_fifth_loop, L_fifth_loop_exit;
7502

7503
  // Fifth loop
7504
  // Perform primitiveLeftShift(z, zlen, 1)
7505

7506
  const Register prev_carry = tmp1;
7507
  const Register new_carry = tmp4;
7508
  const Register value = tmp2;
7509
  const Register zidx = tmp3;
7510

7511
  // int zidx, carry;
7512
  // long value;
7513
  // carry = 0;
7514
  // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
7515
  //    (carry:value)  = (z[i] << 1) | carry ;
7516
  //    z[i] = value;
7517
  // }
7518

7519
  movl(zidx, zlen);
7520
  xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
7521

7522
  bind(L_fifth_loop);
7523
  decl(zidx);  // Use decl to preserve carry flag
7524
  decl(zidx);
7525
  jccb(Assembler::negative, L_fifth_loop_exit);
7526

7527
  if (UseBMI2Instructions) {
7528
     movq(value, Address(z, zidx, Address::times_4, 0));
7529
     rclq(value, 1);
7530
     rorxq(value, value, 32);
7531
     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
7532
  }
7533
  else {
7534
    // clear new_carry
7535
    xorl(new_carry, new_carry);
7536

7537
    // Shift z[i] by 1, or in previous carry and save new carry
7538
    movq(value, Address(z, zidx, Address::times_4, 0));
7539
    shlq(value, 1);
7540
    adcl(new_carry, 0);
7541

7542
    orq(value, prev_carry);
7543
    rorq(value, 0x20);
7544
    movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
7545

7546
    // Set previous carry = new carry
7547
    movl(prev_carry, new_carry);
7548
  }
7549
  jmp(L_fifth_loop);
7550

7551
  bind(L_fifth_loop_exit);
7552
}
7553

7554

7555
/**
7556
 * Code for BigInteger::squareToLen() intrinsic
7557
 *
7558
 * rdi: x
7559
 * rsi: len
7560
 * r8:  z
7561
 * rcx: zlen
7562
 * r12: tmp1
7563
 * r13: tmp2
7564
 * r14: tmp3
7565
 * r15: tmp4
7566
 * rbx: tmp5
7567
 *
7568
 */
7569
void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7570

7571
  Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
7572
  push(tmp1);
7573
  push(tmp2);
7574
  push(tmp3);
7575
  push(tmp4);
7576
  push(tmp5);
7577

7578
  // First loop
7579
  // Store the squares, right shifted one bit (i.e., divided by 2).
7580
  square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
7581

7582
  // Add in off-diagonal sums.
7583
  //
7584
  // Second, third (nested) and fourth loops.
7585
  // zlen +=2;
7586
  // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
7587
  //    carry = 0;
7588
  //    long op2 = x[xidx:xidx+1];
7589
  //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
7590
  //       k -= 2;
7591
  //       long op1 = x[j:j+1];
7592
  //       long sum = z[k:k+1];
7593
  //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
7594
  //       z[k:k+1] = sum;
7595
  //    }
7596
  //    add_one_64(z, k, carry, tmp_regs);
7597
  // }
7598

7599
  const Register carry = tmp5;
7600
  const Register sum = tmp3;
7601
  const Register op1 = tmp4;
7602
  Register op2 = tmp2;
7603

7604
  push(zlen);
7605
  push(len);
7606
  addl(zlen,2);
7607
  bind(L_second_loop);
7608
  xorq(carry, carry);
7609
  subl(zlen, 4);
7610
  subl(len, 2);
7611
  push(zlen);
7612
  push(len);
7613
  cmpl(len, 0);
7614
  jccb(Assembler::lessEqual, L_second_loop_exit);
7615

7616
  // Multiply an array by one 64 bit long.
7617
  if (UseBMI2Instructions) {
7618
    op2 = rdxReg;
7619
    movq(op2, Address(x, len, Address::times_4,  0));
7620
    rorxq(op2, op2, 32);
7621
  }
7622
  else {
7623
    movq(op2, Address(x, len, Address::times_4,  0));
7624
    rorq(op2, 32);
7625
  }
7626

7627
  bind(L_third_loop);
7628
  decrementl(len);
7629
  jccb(Assembler::negative, L_third_loop_exit);
7630
  decrementl(len);
7631
  jccb(Assembler::negative, L_last_x);
7632

7633
  movq(op1, Address(x, len, Address::times_4,  0));
7634
  rorq(op1, 32);
7635

7636
  bind(L_multiply);
7637
  subl(zlen, 2);
7638
  movq(sum, Address(z, zlen, Address::times_4,  0));
7639

7640
  // Multiply 64 bit by 64 bit; add the lower 64 bits of the product into sum and keep the upper 64 bits as the carry.
7641
  if (UseBMI2Instructions) {
7642
    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
7643
  }
7644
  else {
7645
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7646
  }
7647

7648
  movq(Address(z, zlen, Address::times_4, 0), sum);
7649

7650
  jmp(L_third_loop);
7651
  bind(L_third_loop_exit);
7652

7653
  // Fourth loop
7654
  // Add 64 bit long carry into z with carry propagation.
7655
  // Uses the already-adjusted zlen.
7656
  add_one_64(z, zlen, carry, tmp1);
7657

7658
  pop(len);
7659
  pop(zlen);
7660
  jmp(L_second_loop);
7661

7662
  // Infrequently executed code is moved outside the loops.
7663
  bind(L_last_x);
7664
  movl(op1, Address(x, 0));
7665
  jmp(L_multiply);
7666

7667
  bind(L_second_loop_exit);
7668
  pop(len);
7669
  pop(zlen);
7670
  pop(len);
7671
  pop(zlen);
7672

7673
  // Fifth loop
7674
  // Shift z left 1 bit.
7675
  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
7676

7677
  // z[zlen-1] |= x[len-1] & 1;
7678
  movl(tmp3, Address(x, len, Address::times_4, -4));
7679
  andl(tmp3, 1);
7680
  orl(Address(z, zlen, Address::times_4,  -4), tmp3);
7681

7682
  pop(tmp5);
7683
  pop(tmp4);
7684
  pop(tmp3);
7685
  pop(tmp2);
7686
  pop(tmp1);
7687
}
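
// For reference, the identity square_to_len relies on (informal, with B = 2^32
// the int radix):
//
//   x^2 = sum_i x[i]^2 * B^(2i)  +  2 * sum_{i<j} x[i]*x[j] * B^(i+j)
//
// The first loop stores the diagonal squares right-shifted by one bit, the
// second/third/fourth loops accumulate the off-diagonal products x[i]*x[j],
// and the final left shift by one both doubles that off-diagonal sum and
// restores the halved diagonal squares; the single low bit lost to the right
// shift (the low bit of x[len-1]^2, i.e. x[len-1] & 1) is OR-ed back in at the
// very end.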
7688

7689
/**
7690
 * Helper function for mul_add()
7691
 * Multiply the in[] by int k and add to out[] starting at offset offs using
7692
 * 128 bit by 32 bit multiply and return the carry in tmp5.
7693
 * Only the quad-int-aligned portion of in[] is processed in this function.
7694
 * k is in rdxReg when BMI2 instructions are used, otherwise in tmp2.
7695
 * This function preserves out, in and k registers.
7696
 * len and offset point to the appropriate indices in "in" and "out" respectively.
7697
 * tmp5 has the carry.
7698
 * other registers are temporary and are modified.
7699
 *
7700
 */
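// Informally, each iteration of the loop below consumes four ints (two 64-bit
// chunks) from the tails of in[] and out[]:
//   carry:out[offset .. offset+3] += in[len .. len+3] * k
// handled as two 64-bit multiply-accumulate steps, least significant pair first.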
7701
void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
7702
  Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
7703
  Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7704

7705
  Label L_first_loop, L_first_loop_exit;
7706

7707
  movl(tmp1, len);
7708
  shrl(tmp1, 2);
7709

7710
  bind(L_first_loop);
7711
  subl(tmp1, 1);
7712
  jccb(Assembler::negative, L_first_loop_exit);
7713

7714
  subl(len, 4);
7715
  subl(offset, 4);
7716

7717
  Register op2 = tmp2;
7718
  const Register sum = tmp3;
7719
  const Register op1 = tmp4;
7720
  const Register carry = tmp5;
7721

7722
  if (UseBMI2Instructions) {
7723
    op2 = rdxReg;
7724
  }
7725

7726
  movq(op1, Address(in, len, Address::times_4,  8));
7727
  rorq(op1, 32);
7728
  movq(sum, Address(out, offset, Address::times_4,  8));
7729
  rorq(sum, 32);
7730
  if (UseBMI2Instructions) {
7731
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7732
  }
7733
  else {
7734
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7735
  }
7736
  // Store back in big endian from little endian
7737
  rorq(sum, 0x20);
7738
  movq(Address(out, offset, Address::times_4,  8), sum);
7739

7740
  movq(op1, Address(in, len, Address::times_4,  0));
7741
  rorq(op1, 32);
7742
  movq(sum, Address(out, offset, Address::times_4,  0));
7743
  rorq(sum, 32);
7744
  if (UseBMI2Instructions) {
7745
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7746
  }
7747
  else {
7748
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7749
  }
7750
  // Store back in big endian from little endian
7751
  rorq(sum, 0x20);
7752
  movq(Address(out, offset, Address::times_4,  0), sum);
7753

7754
  jmp(L_first_loop);
7755
  bind(L_first_loop_exit);
7756
}
7757

7758
/**
7759
 * Code for BigInteger::mulAdd() intrinsic
7760
 *
7761
 * rdi: out
7762
 * rsi: in
7763
 * r11: offs (out.length - offset)
7764
 * rcx: len
7765
 * r8:  k
7766
 * r12: tmp1
7767
 * r13: tmp2
7768
 * r14: tmp3
7769
 * r15: tmp4
7770
 * rbx: tmp5
7771
 * Multiply the in[] by word k and add to out[], return the carry in rax
7772
 */
7773
void MacroAssembler::mul_add(Register out, Register in, Register offs,
7774
   Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
7775
   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7776

7777
  Label L_carry, L_last_in, L_done;
7778

7779
// carry = 0;
7780
// for (int j=len-1; j >= 0; j--) {
7781
//    long product = (in[j] & LONG_MASK) * kLong +
7782
//                   (out[offs] & LONG_MASK) + carry;
7783
//    out[offs--] = (int)product;
7784
//    carry = product >>> 32;
7785
// }
7786
//
7787
  push(tmp1);
7788
  push(tmp2);
7789
  push(tmp3);
7790
  push(tmp4);
7791
  push(tmp5);
7792

7793
  Register op2 = tmp2;
7794
  const Register sum = tmp3;
7795
  const Register op1 = tmp4;
7796
  const Register carry =  tmp5;
7797

7798
  if (UseBMI2Instructions) {
7799
    op2 = rdxReg;
7800
    movl(op2, k);
7801
  }
7802
  else {
7803
    movl(op2, k);
7804
  }
7805

7806
  xorq(carry, carry);
7807

7808
  //First loop
7809

7810
  //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
7811
  //The carry is in tmp5
7812
  mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
7813

7814
  //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
7815
  decrementl(len);
7816
  jccb(Assembler::negative, L_carry);
7817
  decrementl(len);
7818
  jccb(Assembler::negative, L_last_in);
7819

7820
  movq(op1, Address(in, len, Address::times_4,  0));
7821
  rorq(op1, 32);
7822

7823
  subl(offs, 2);
7824
  movq(sum, Address(out, offs, Address::times_4,  0));
7825
  rorq(sum, 32);
7826

7827
  if (UseBMI2Instructions) {
7828
    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
7829
  }
7830
  else {
7831
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
7832
  }
7833

7834
  // Store back in big endian from little endian
7835
  rorq(sum, 0x20);
7836
  movq(Address(out, offs, Address::times_4,  0), sum);
7837

7838
  testl(len, len);
7839
  jccb(Assembler::zero, L_carry);
7840

7841
  //Multiply the last in[] entry, if any
7842
  bind(L_last_in);
7843
  movl(op1, Address(in, 0));
7844
  movl(sum, Address(out, offs, Address::times_4,  -4));
7845

7846
  movl(raxReg, k);
7847
  mull(op1); //tmp4 * eax -> edx:eax
7848
  addl(sum, carry);
7849
  adcl(rdxReg, 0);
7850
  addl(sum, raxReg);
7851
  adcl(rdxReg, 0);
7852
  movl(carry, rdxReg);
7853

7854
  movl(Address(out, offs, Address::times_4,  -4), sum);
7855

7856
  bind(L_carry);
7857
  //return tmp5/carry as carry in rax
7858
  movl(rax, carry);
7859

7860
  bind(L_done);
7861
  pop(tmp5);
7862
  pop(tmp4);
7863
  pop(tmp3);
7864
  pop(tmp2);
7865
  pop(tmp1);
7866
}
7867
#endif
7868

7869
/**
7870
 * Emits code to update CRC-32 with a byte value according to constants in table
7871
 *
7872
 * @param [in,out]crc   Register containing the crc.
7873
 * @param [in]val       Register containing the byte to fold into the CRC.
7874
 * @param [in]table     Register containing the table of crc constants.
7875
 *
7876
 * uint32_t crc;
7877
 * val = crc_table[(val ^ crc) & 0xFF];
7878
 * crc = val ^ (crc >> 8);
7879
 *
7880
 */
7881
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
7882
  xorl(val, crc);
7883
  andl(val, 0xFF);
7884
  shrl(crc, 8); // unsigned shift
7885
  xorl(crc, Address(table, val, Address::times_4, 0));
7886
}
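
// Equivalent C for a single byte of a table-driven CRC-32 update (informal
// reference only; the helper name is made up and 'table' is the 256-entry
// lookup table used above):
//
//   static inline uint32_t update_byte_crc32_ref(uint32_t crc, uint8_t val,
//                                                const uint32_t* table) {
//     return table[(crc ^ val) & 0xFF] ^ (crc >> 8);
//   }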
7887

7888
/**
7889
 * Fold 128-bit data chunk
7890
 */
7891
void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7892
  if (UseAVX > 0) {
7893
    vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7894
    vpclmulldq(xcrc, xK, xcrc); // [63:0]
7895
    vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
7896
    pxor(xcrc, xtmp);
7897
  } else {
7898
    movdqa(xtmp, xcrc);
7899
    pclmulhdq(xtmp, xK);   // [123:64]
7900
    pclmulldq(xcrc, xK);   // [63:0]
7901
    pxor(xcrc, xtmp);
7902
    movdqu(xtmp, Address(buf, offset));
7903
    pxor(xcrc, xtmp);
7904
  }
7905
}
7906

7907
void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7908
  if (UseAVX > 0) {
7909
    vpclmulhdq(xtmp, xK, xcrc);
7910
    vpclmulldq(xcrc, xK, xcrc);
7911
    pxor(xcrc, xbuf);
7912
    pxor(xcrc, xtmp);
7913
  } else {
7914
    movdqa(xtmp, xcrc);
7915
    pclmulhdq(xtmp, xK);
7916
    pclmulldq(xcrc, xK);
7917
    pxor(xcrc, xbuf);
7918
    pxor(xcrc, xtmp);
7919
  }
7920
}
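
// Background on the folds above (informal): for the raw polynomial remainder
// (ignoring the pre/post complement), CRC is linear over GF(2), so
//
//   crc(A || B) = crc( (A * x^(8*len(B)) mod P(x)) xor B )
//
// Multiplying the 128-bit accumulator forward by the precomputed powers of x
// held in xK (one 64-bit constant per half) with carry-less multiplies and
// xoring in the next chunk therefore preserves congruence modulo P(x).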
7921

7922
/**
7923
 * 8-bit folds to compute 32-bit CRC
7924
 *
7925
 * uint64_t xcrc;
7926
 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
7927
 */
7928
void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
7929
  movdl(tmp, xcrc);
7930
  andl(tmp, 0xFF);
7931
  movdl(xtmp, Address(table, tmp, Address::times_4, 0));
7932
  psrldq(xcrc, 1); // unsigned shift one byte
7933
  pxor(xcrc, xtmp);
7934
}
7935

7936
/**
7937
 * uint32_t crc;
7938
 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
7939
 */
7940
void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
7941
  movl(tmp, crc);
7942
  andl(tmp, 0xFF);
7943
  shrl(crc, 8);
7944
  xorl(crc, Address(table, tmp, Address::times_4, 0));
7945
}
7946

7947
/**
7948
 * @param crc   register containing existing CRC (32-bit)
7949
 * @param buf   register pointing to input byte buffer (byte*)
7950
 * @param len   register containing number of bytes
7951
 * @param table register that will contain address of CRC table
7952
 * @param tmp   scratch register
7953
 */
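// Overall flow (informal): byte-at-a-time updates until buf is 16-byte aligned,
// then 512 bits folded per iteration across four 128-bit streams, the four
// streams reduced back to a single 128-bit value, any remaining 16-byte chunks
// folded in one at a time, a carry-less reduction plus eight 8-bit table folds
// to collapse the final 128 bits to 32, and a byte-at-a-time loop for the tail.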
7954
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
7955
  assert_different_registers(crc, buf, len, table, tmp, rax);
7956

7957
  Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7958
  Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7959

7960
  // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
7961
  // context for the registers used, where all instructions below are using 128-bit mode
7962
  // On EVEX without VL and BW, these instructions will all be AVX.
7963
  lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
7964
  notl(crc); // ~crc
7965
  cmpl(len, 16);
7966
  jcc(Assembler::less, L_tail);
7967

7968
  // Align buffer to 16 bytes
7969
  movl(tmp, buf);
7970
  andl(tmp, 0xF);
7971
  jccb(Assembler::zero, L_aligned);
7972
  subl(tmp,  16);
7973
  addl(len, tmp);
7974

7975
  align(4);
7976
  BIND(L_align_loop);
7977
  movsbl(rax, Address(buf, 0)); // load byte with sign extension
7978
  update_byte_crc32(crc, rax, table);
7979
  increment(buf);
7980
  incrementl(tmp);
7981
  jccb(Assembler::less, L_align_loop);
7982

7983
  BIND(L_aligned);
7984
  movl(tmp, len); // save
7985
  shrl(len, 4);
7986
  jcc(Assembler::zero, L_tail_restore);
7987

7988
  // Fold crc into first bytes of vector
7989
  movdqa(xmm1, Address(buf, 0));
7990
  movdl(rax, xmm1);
7991
  xorl(crc, rax);
7992
  if (VM_Version::supports_sse4_1()) {
7993
    pinsrd(xmm1, crc, 0);
7994
  } else {
7995
    pinsrw(xmm1, crc, 0);
7996
    shrl(crc, 16);
7997
    pinsrw(xmm1, crc, 1);
7998
  }
7999
  addptr(buf, 16);
8000
  subl(len, 4); // len > 0
8001
  jcc(Assembler::less, L_fold_tail);
8002

8003
  movdqa(xmm2, Address(buf,  0));
8004
  movdqa(xmm3, Address(buf, 16));
8005
  movdqa(xmm4, Address(buf, 32));
8006
  addptr(buf, 48);
8007
  subl(len, 3);
8008
  jcc(Assembler::lessEqual, L_fold_512b);
8009

8010
  // Fold total 512 bits of polynomial on each iteration,
8011
  // 128 bits per each of 4 parallel streams.
8012
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32), rscratch1);
8013

8014
  align32();
8015
  BIND(L_fold_512b_loop);
8016
  fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
8017
  fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
8018
  fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
8019
  fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
8020
  addptr(buf, 64);
8021
  subl(len, 4);
8022
  jcc(Assembler::greater, L_fold_512b_loop);
8023

8024
  // Fold 512 bits to 128 bits.
8025
  BIND(L_fold_512b);
8026
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1);
8027
  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
8028
  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
8029
  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
8030

8031
  // Fold the rest of 128 bits data chunks
8032
  BIND(L_fold_tail);
8033
  addl(len, 3);
8034
  jccb(Assembler::lessEqual, L_fold_128b);
8035
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1);
8036

8037
  BIND(L_fold_tail_loop);
8038
  fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
8039
  addptr(buf, 16);
8040
  decrementl(len);
8041
  jccb(Assembler::greater, L_fold_tail_loop);
8042

8043
  // Fold 128 bits in xmm1 down into 32 bits in crc register.
8044
  BIND(L_fold_128b);
8045
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()), rscratch1);
8046
  if (UseAVX > 0) {
8047
    vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
8048
    vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
8049
    vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
8050
  } else {
8051
    movdqa(xmm2, xmm0);
8052
    pclmulqdq(xmm2, xmm1, 0x1);
8053
    movdqa(xmm3, xmm0);
8054
    pand(xmm3, xmm2);
8055
    pclmulqdq(xmm0, xmm3, 0x1);
8056
  }
8057
  psrldq(xmm1, 8);
8058
  psrldq(xmm2, 4);
8059
  pxor(xmm0, xmm1);
8060
  pxor(xmm0, xmm2);
8061

8062
  // 8 8-bit folds to compute 32-bit CRC.
8063
  for (int j = 0; j < 4; j++) {
8064
    fold_8bit_crc32(xmm0, table, xmm1, rax);
8065
  }
8066
  movdl(crc, xmm0); // mov 32 bits to general register
8067
  for (int j = 0; j < 4; j++) {
8068
    fold_8bit_crc32(crc, table, rax);
8069
  }
8070

8071
  BIND(L_tail_restore);
8072
  movl(len, tmp); // restore
8073
  BIND(L_tail);
8074
  andl(len, 0xf);
8075
  jccb(Assembler::zero, L_exit);
8076

8077
  // Fold the rest of bytes
8078
  align(4);
8079
  BIND(L_tail_loop);
8080
  movsbl(rax, Address(buf, 0)); // load byte with sign extension
8081
  update_byte_crc32(crc, rax, table);
8082
  increment(buf);
8083
  decrementl(len);
8084
  jccb(Assembler::greater, L_tail_loop);
8085

8086
  BIND(L_exit);
8087
  notl(crc); // ~c
8088
}
8089

8090
#ifdef _LP64
8091
// Helper function for AVX 512 CRC32
8092
// Fold 512-bit data chunks
8093
void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
8094
                                             Register pos, int offset) {
8095
  evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);
8096
  evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
8097
  evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
8098
  evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
8099
  evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
8100
}
8101

8102
// Helper function for AVX 512 CRC32
8103
// Compute CRC32 for < 256B buffers
8104
void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
8105
                                              Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
8106
                                              Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
8107

8108
  Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
8109
  Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
8110
  Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;
8111

8112
  // check if there is enough buffer to be able to fold 16B at a time
8113
  cmpl(len, 32);
8114
  jcc(Assembler::less, L_less_than_32);
8115

8116
  // if there is, load the constants
8117
  movdqu(xmm10, Address(table, 1 * 16));    //rk1 and rk2 in xmm10
8118
  movdl(xmm0, crc);                        // get the initial crc value
8119
  movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
8120
  pxor(xmm7, xmm0);
8121

8122
  // update the buffer pointer
8123
  addl(pos, 16);
8124
  // update the counter; subtract 32 instead of 16 to save one instruction in the loop
8125
  subl(len, 32);
8126
  jmp(L_16B_reduction_loop);
8127

8128
  bind(L_less_than_32);
8129
  // mov initial crc to the return value. This is necessary for zero-length buffers.
8130
  movl(rax, crc);
8131
  testl(len, len);
8132
  jcc(Assembler::equal, L_cleanup);
8133

8134
  movdl(xmm0, crc);                        //get the initial crc value
8135

8136
  cmpl(len, 16);
8137
  jcc(Assembler::equal, L_exact_16_left);
8138
  jcc(Assembler::less, L_less_than_16_left);
8139

8140
  movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
8141
  pxor(xmm7, xmm0);                       //xor the initial crc value
8142
  addl(pos, 16);
8143
  subl(len, 16);
8144
  movdqu(xmm10, Address(table, 1 * 16));    // rk1 and rk2 in xmm10
8145
  jmp(L_get_last_two_xmms);
8146

8147
  bind(L_less_than_16_left);
8148
  // use stack space to load data shorter than 16 bytes; zero out the 16B in memory first.
8149
  pxor(xmm1, xmm1);
8150
  movptr(tmp1, rsp);
8151
  movdqu(Address(tmp1, 0 * 16), xmm1);
8152

8153
  cmpl(len, 4);
8154
  jcc(Assembler::less, L_only_less_than_4);
8155

8156
  //backup the counter value
8157
  movl(tmp2, len);
8158
  cmpl(len, 8);
8159
  jcc(Assembler::less, L_less_than_8_left);
8160

8161
  //load 8 Bytes
8162
  movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
8163
  movq(Address(tmp1, 0 * 16), rax);
8164
  addptr(tmp1, 8);
8165
  subl(len, 8);
8166
  addl(pos, 8);
8167

8168
  bind(L_less_than_8_left);
8169
  cmpl(len, 4);
8170
  jcc(Assembler::less, L_less_than_4_left);
8171

8172
  //load 4 Bytes
8173
  movl(rax, Address(buf, pos, Address::times_1, 0));
8174
  movl(Address(tmp1, 0 * 16), rax);
8175
  addptr(tmp1, 4);
8176
  subl(len, 4);
8177
  addl(pos, 4);
8178

8179
  bind(L_less_than_4_left);
8180
  cmpl(len, 2);
8181
  jcc(Assembler::less, L_less_than_2_left);
8182

8183
  // load 2 Bytes
8184
  movw(rax, Address(buf, pos, Address::times_1, 0));
8185
  movl(Address(tmp1, 0 * 16), rax);
8186
  addptr(tmp1, 2);
8187
  subl(len, 2);
8188
  addl(pos, 2);
8189

8190
  bind(L_less_than_2_left);
8191
  cmpl(len, 1);
8192
  jcc(Assembler::less, L_zero_left);
8193

8194
  // load 1 Byte
8195
  movb(rax, Address(buf, pos, Address::times_1, 0));
8196
  movb(Address(tmp1, 0 * 16), rax);
8197

8198
  bind(L_zero_left);
8199
  movdqu(xmm7, Address(rsp, 0));
8200
  pxor(xmm7, xmm0);                       //xor the initial crc value
8201

8202
  lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
8203
  movdqu(xmm0, Address(rax, tmp2));
8204
  pshufb(xmm7, xmm0);
8205
  jmp(L_128_done);
8206

8207
  bind(L_exact_16_left);
8208
  movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
8209
  pxor(xmm7, xmm0);                       //xor the initial crc value
8210
  jmp(L_128_done);
8211

8212
  bind(L_only_less_than_4);
8213
  cmpl(len, 3);
8214
  jcc(Assembler::less, L_only_less_than_3);
8215

8216
  // load 3 Bytes
8217
  movb(rax, Address(buf, pos, Address::times_1, 0));
8218
  movb(Address(tmp1, 0), rax);
8219

8220
  movb(rax, Address(buf, pos, Address::times_1, 1));
8221
  movb(Address(tmp1, 1), rax);
8222

8223
  movb(rax, Address(buf, pos, Address::times_1, 2));
8224
  movb(Address(tmp1, 2), rax);
8225

8226
  movdqu(xmm7, Address(rsp, 0));
8227
  pxor(xmm7, xmm0);                     //xor the initial crc value
8228

8229
  pslldq(xmm7, 0x5);
8230
  jmp(L_barrett);
8231
  bind(L_only_less_than_3);
8232
  cmpl(len, 2);
8233
  jcc(Assembler::less, L_only_less_than_2);
8234

8235
  // load 2 Bytes
8236
  movb(rax, Address(buf, pos, Address::times_1, 0));
8237
  movb(Address(tmp1, 0), rax);
8238

8239
  movb(rax, Address(buf, pos, Address::times_1, 1));
8240
  movb(Address(tmp1, 1), rax);
8241

8242
  movdqu(xmm7, Address(rsp, 0));
8243
  pxor(xmm7, xmm0);                     //xor the initial crc value
8244

8245
  pslldq(xmm7, 0x6);
8246
  jmp(L_barrett);
8247

8248
  bind(L_only_less_than_2);
8249
  //load 1 Byte
8250
  movb(rax, Address(buf, pos, Address::times_1, 0));
8251
  movb(Address(tmp1, 0), rax);
8252

8253
  movdqu(xmm7, Address(rsp, 0));
8254
  pxor(xmm7, xmm0);                     //xor the initial crc value
8255

8256
  pslldq(xmm7, 0x7);
8257
}
8258

8259
/**
8260
* Compute CRC32 using AVX512 instructions
8261
* param crc   register containing existing CRC (32-bit)
8262
* param buf   register pointing to input byte buffer (byte*)
8263
* param len   register containing number of bytes
8264
* param table address of crc or crc32c table
8265
* param tmp1  scratch register
8266
* param tmp2  scratch register
8267
* return rax  result register
8268
*
8269
* This routine is identical for crc32c with the exception of the precomputed constant
8270
* table which will be passed as the table argument.  The calculation steps are
8271
* the same for both variants.
8272
*/
8273
void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
8274
  assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
8275

8276
  Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
8277
  Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
8278
  Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
8279
  Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
8280
  Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
8281

8282
  const Register pos = r12;
8283
  push(r12);
8284
  subptr(rsp, 16 * 2 + 8);
8285

8286
  // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
8287
  // context for the registers used, where all instructions below are using 128-bit mode
8288
  // On EVEX without VL and BW, these instructions will all be AVX.
8289
  movl(pos, 0);
8290

8291
  // check if smaller than 256B
8292
  cmpl(len, 256);
8293
  jcc(Assembler::less, L_less_than_256);
8294

8295
  // load the initial crc value
8296
  movdl(xmm10, crc);
8297

8298
  // receive the initial 64B data, xor the initial crc value
8299
  evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
8300
  evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
8301
  evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
8302
  evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
8303

8304
  subl(len, 256);
8305
  cmpl(len, 256);
8306
  jcc(Assembler::less, L_fold_128_B_loop);
8307

8308
  evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
8309
  evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
8310
  evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
8311
  subl(len, 256);
8312

8313
  bind(L_fold_256_B_loop);
8314
  addl(pos, 256);
8315
  fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
8316
  fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
8317
  fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
8318
  fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
8319

8320
  subl(len, 256);
8321
  jcc(Assembler::greaterEqual, L_fold_256_B_loop);
8322

8323
  // Fold 256 into 128
8324
  addl(pos, 256);
8325
  evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
8326
  evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
8327
  vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
8328

8329
  evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
8330
  evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
8331
  vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
8332

8333
  evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
8334
  evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
8335

8336
  addl(len, 128);
8337
  jmp(L_fold_128_B_register);
8338

8339
  // At this point in the code there are 128 * x + y (0 <= y < 128) bytes of buffer. The fold_128_B_loop
8340
  // loop will fold 128B at a time until 128 + y bytes of buffer remain
8341

8342
  // Fold 128B at a time. This section of the code folds 8 xmm registers in parallel.
8343
  bind(L_fold_128_B_loop);
8344
  addl(pos, 128);
8345
  fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
8346
  fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
8347

8348
  subl(len, 128);
8349
  jcc(Assembler::greaterEqual, L_fold_128_B_loop);
8350

8351
  addl(pos, 128);
8352

8353
  // at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
8354
  // the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
8355
  bind(L_fold_128_B_register);
8356
  evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
8357
  evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
8358
  evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
8359
  evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
8360
  // save last that has no multiplicand
8361
  vextracti64x2(xmm7, xmm4, 3);
8362

8363
  evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
8364
  evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
8365
  // Needed later in reduction loop
8366
  movdqu(xmm10, Address(table, 1 * 16));
8367
  vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
8368
  vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
8369

8370
  // Swap 1,0,3,2 - 01 00 11 10
8371
  evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
8372
  evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
8373
  vextracti128(xmm5, xmm8, 1);
8374
  evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
8375

8376
  // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
8377
  // instead of a cmp instruction, we use the negative flag with the jl instruction
8378
  addl(len, 128 - 16);
8379
  jcc(Assembler::less, L_final_reduction_for_128);
8380

8381
  bind(L_16B_reduction_loop);
8382
  vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
8383
  vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8384
  vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
8385
  movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
8386
  vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8387
  addl(pos, 16);
8388
  subl(len, 16);
8389
  jcc(Assembler::greaterEqual, L_16B_reduction_loop);
8390

8391
  bind(L_final_reduction_for_128);
8392
  addl(len, 16);
8393
  jcc(Assembler::equal, L_128_done);
8394

8395
  bind(L_get_last_two_xmms);
8396
  movdqu(xmm2, xmm7);
8397
  addl(pos, len);
8398
  movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
8399
  subl(pos, len);
8400

8401
  // get rid of the extra data that was loaded before
8402
  // load the shift constant
8403
  lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
8404
  movdqu(xmm0, Address(rax, len));
8405
  addl(rax, len);
8406

8407
  vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8408
  //Change mask to 512
8409
  vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
8410
  vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
8411

8412
  blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
8413
  vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
8414
  vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8415
  vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
8416
  vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
8417

8418
  bind(L_128_done);
8419
  // compute crc of a 128-bit value
8420
  movdqu(xmm10, Address(table, 3 * 16));
8421
  movdqu(xmm0, xmm7);
8422

8423
  // 64b fold
8424
  vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
8425
  vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
8426
  vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8427

8428
  // 32b fold
8429
  movdqu(xmm0, xmm7);
8430
  vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
8431
  vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
8432
  vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
8433
  jmp(L_barrett);
8434

8435
  bind(L_less_than_256);
8436
  kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
8437

8438
  //barrett reduction
8439
  bind(L_barrett);
8440
  vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
8441
  movdqu(xmm1, xmm7);
8442
  movdqu(xmm2, xmm7);
8443
  movdqu(xmm10, Address(table, 4 * 16));
8444

8445
  pclmulqdq(xmm7, xmm10, 0x0);
8446
  pxor(xmm7, xmm2);
8447
  vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
8448
  movdqu(xmm2, xmm7);
8449
  pclmulqdq(xmm7, xmm10, 0x10);
8450
  pxor(xmm7, xmm2);
8451
  pxor(xmm7, xmm1);
8452
  pextrd(crc, xmm7, 2);
8453

8454
  bind(L_cleanup);
8455
  addptr(rsp, 16 * 2 + 8);
8456
  pop(r12);
8457
}
8458

8459
// S. Gueron / Information Processing Letters 112 (2012) 184
8460
// Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
8461
// Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
8462
// Output: the 64-bit carry-less product of B * CONST
8463
void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
8464
                                     Register tmp1, Register tmp2, Register tmp3) {
8465
  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
8466
  if (n > 0) {
8467
    addq(tmp3, n * 256 * 8);
8468
  }
8469
  //    Q1 = TABLEExt[n][B & 0xFF];
8470
  movl(tmp1, in);
8471
  andl(tmp1, 0x000000FF);
8472
  shll(tmp1, 3);
8473
  addq(tmp1, tmp3);
8474
  movq(tmp1, Address(tmp1, 0));
8475

8476
  //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
8477
  movl(tmp2, in);
8478
  shrl(tmp2, 8);
8479
  andl(tmp2, 0x000000FF);
8480
  shll(tmp2, 3);
8481
  addq(tmp2, tmp3);
8482
  movq(tmp2, Address(tmp2, 0));
8483

8484
  shlq(tmp2, 8);
8485
  xorq(tmp1, tmp2);
8486

8487
  //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
8488
  movl(tmp2, in);
8489
  shrl(tmp2, 16);
8490
  andl(tmp2, 0x000000FF);
8491
  shll(tmp2, 3);
8492
  addq(tmp2, tmp3);
8493
  movq(tmp2, Address(tmp2, 0));
8494

8495
  shlq(tmp2, 16);
8496
  xorq(tmp1, tmp2);
8497

8498
  //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
8499
  shrl(in, 24);
8500
  andl(in, 0x000000FF);
8501
  shll(in, 3);
8502
  addq(in, tmp3);
8503
  movq(in, Address(in, 0));
8504

8505
  shlq(in, 24);
8506
  xorq(in, tmp1);
8507
  //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
8508
}
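
// Rough C equivalent of the routine above (informal; the helper name is made
// up, and 'table' points at the 256-entry slice selected by n, each entry
// holding the 64-bit carry-less product of one byte value with the chunk
// constant):
//
//   static inline uint64_t crc32c_clmul_via_table(uint32_t b, const uint64_t* table) {
//     uint64_t q1 = table[ b        & 0xFF];
//     uint64_t q2 = table[(b >>  8) & 0xFF] << 8;
//     uint64_t q3 = table[(b >> 16) & 0xFF] << 16;
//     uint64_t q4 = table[(b >> 24) & 0xFF] << 24;
//     return q1 ^ q2 ^ q3 ^ q4;
//   }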
8509

8510
void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
8511
                                      Register in_out,
8512
                                      uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
8513
                                      XMMRegister w_xtmp2,
8514
                                      Register tmp1,
8515
                                      Register n_tmp2, Register n_tmp3) {
8516
  if (is_pclmulqdq_supported) {
8517
    movdl(w_xtmp1, in_out); // modified blindly
8518

8519
    movl(tmp1, const_or_pre_comp_const_index);
8520
    movdl(w_xtmp2, tmp1);
8521
    pclmulqdq(w_xtmp1, w_xtmp2, 0);
8522

8523
    movdq(in_out, w_xtmp1);
8524
  } else {
8525
    crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
8526
  }
8527
}
8528

8529
// Recombination Alternative 2: No bit-reflections
8530
// T1 = (CRC_A * U1) << 1
8531
// T2 = (CRC_B * U2) << 1
8532
// C1 = T1 >> 32
8533
// C2 = T2 >> 32
8534
// T1 = T1 & 0xFFFFFFFF
8535
// T2 = T2 & 0xFFFFFFFF
8536
// T1 = CRC32(0, T1)
8537
// T2 = CRC32(0, T2)
8538
// C1 = C1 ^ T1
8539
// C2 = C2 ^ T2
8540
// CRC = C1 ^ C2 ^ CRC_C
8541
void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
8542
                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8543
                                     Register tmp1, Register tmp2,
8544
                                     Register n_tmp3) {
8545
  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8546
  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8547
  shlq(in_out, 1);
8548
  movl(tmp1, in_out);
8549
  shrq(in_out, 32);
8550
  xorl(tmp2, tmp2);
8551
  crc32(tmp2, tmp1, 4);
8552
  xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
8553
  shlq(in1, 1);
8554
  movl(tmp1, in1);
8555
  shrq(in1, 32);
8556
  xorl(tmp2, tmp2);
8557
  crc32(tmp2, tmp1, 4);
8558
  xorl(in1, tmp2);
8559
  xorl(in_out, in1);
8560
  xorl(in_out, in2);
8561
}
8562

8563
// Set N to predefined value
8564
// Subtract from a length of a buffer
8565
// execute in a loop:
8566
// CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
8567
// for i = 1 to N do
8568
//  CRC_A = CRC32(CRC_A, A[i])
8569
//  CRC_B = CRC32(CRC_B, B[i])
8570
//  CRC_C = CRC32(CRC_C, C[i])
8571
// end for
8572
// Recombine
8573
void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
8574
                                       Register in_out1, Register in_out2, Register in_out3,
8575
                                       Register tmp1, Register tmp2, Register tmp3,
8576
                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8577
                                       Register tmp4, Register tmp5,
8578
                                       Register n_tmp6) {
8579
  Label L_processPartitions;
8580
  Label L_processPartition;
8581
  Label L_exit;
8582

8583
  bind(L_processPartitions);
8584
  cmpl(in_out1, 3 * size);
8585
  jcc(Assembler::less, L_exit);
8586
    xorl(tmp1, tmp1);
8587
    xorl(tmp2, tmp2);
8588
    movq(tmp3, in_out2);
8589
    addq(tmp3, size);
8590

8591
    bind(L_processPartition);
8592
      crc32(in_out3, Address(in_out2, 0), 8);
8593
      crc32(tmp1, Address(in_out2, size), 8);
8594
      crc32(tmp2, Address(in_out2, size * 2), 8);
8595
      addq(in_out2, 8);
8596
      cmpq(in_out2, tmp3);
8597
      jcc(Assembler::less, L_processPartition);
8598
    crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
8599
            w_xtmp1, w_xtmp2, w_xtmp3,
8600
            tmp4, tmp5,
8601
            n_tmp6);
8602
    addq(in_out2, 2 * size);
8603
    subl(in_out1, 3 * size);
8604
    jmp(L_processPartitions);
8605

8606
  bind(L_exit);
8607
}
8608
#else
8609
void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
8610
                                     Register tmp1, Register tmp2, Register tmp3,
8611
                                     XMMRegister xtmp1, XMMRegister xtmp2) {
8612
  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
8613
  if (n > 0) {
8614
    addl(tmp3, n * 256 * 8);
8615
  }
8616
  //    Q1 = TABLEExt[n][B & 0xFF];
8617
  movl(tmp1, in_out);
8618
  andl(tmp1, 0x000000FF);
8619
  shll(tmp1, 3);
8620
  addl(tmp1, tmp3);
8621
  movq(xtmp1, Address(tmp1, 0));
8622

8623
  //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
8624
  movl(tmp2, in_out);
8625
  shrl(tmp2, 8);
8626
  andl(tmp2, 0x000000FF);
8627
  shll(tmp2, 3);
8628
  addl(tmp2, tmp3);
8629
  movq(xtmp2, Address(tmp2, 0));
8630

8631
  psllq(xtmp2, 8);
8632
  pxor(xtmp1, xtmp2);
8633

8634
  //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
8635
  movl(tmp2, in_out);
8636
  shrl(tmp2, 16);
8637
  andl(tmp2, 0x000000FF);
8638
  shll(tmp2, 3);
8639
  addl(tmp2, tmp3);
8640
  movq(xtmp2, Address(tmp2, 0));
8641

8642
  psllq(xtmp2, 16);
8643
  pxor(xtmp1, xtmp2);
8644

8645
  //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
8646
  shrl(in_out, 24);
8647
  andl(in_out, 0x000000FF);
8648
  shll(in_out, 3);
8649
  addl(in_out, tmp3);
8650
  movq(xtmp2, Address(in_out, 0));
8651

8652
  psllq(xtmp2, 24);
8653
  pxor(xtmp1, xtmp2); // Result in CXMM
8654
  //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
8655
}
8656

8657
void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
8658
                                      Register in_out,
8659
                                      uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
8660
                                      XMMRegister w_xtmp2,
8661
                                      Register tmp1,
8662
                                      Register n_tmp2, Register n_tmp3) {
8663
  if (is_pclmulqdq_supported) {
8664
    movdl(w_xtmp1, in_out);
8665

8666
    movl(tmp1, const_or_pre_comp_const_index);
8667
    movdl(w_xtmp2, tmp1);
8668
    pclmulqdq(w_xtmp1, w_xtmp2, 0);
8669
    // Keep result in XMM since GPR is 32 bit in length
8670
  } else {
8671
    crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
8672
  }
8673
}
8674

8675
void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
8676
                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8677
                                     Register tmp1, Register tmp2,
8678
                                     Register n_tmp3) {
8679
  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8680
  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8681

8682
  psllq(w_xtmp1, 1);
8683
  movdl(tmp1, w_xtmp1);
8684
  psrlq(w_xtmp1, 32);
8685
  movdl(in_out, w_xtmp1);
8686

8687
  xorl(tmp2, tmp2);
8688
  crc32(tmp2, tmp1, 4);
8689
  xorl(in_out, tmp2);
8690

8691
  psllq(w_xtmp2, 1);
8692
  movdl(tmp1, w_xtmp2);
8693
  psrlq(w_xtmp2, 32);
8694
  movdl(in1, w_xtmp2);
8695

8696
  xorl(tmp2, tmp2);
8697
  crc32(tmp2, tmp1, 4);
8698
  xorl(in1, tmp2);
8699
  xorl(in_out, in1);
8700
  xorl(in_out, in2);
8701
}
8702

8703
void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
8704
                                       Register in_out1, Register in_out2, Register in_out3,
8705
                                       Register tmp1, Register tmp2, Register tmp3,
8706
                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8707
                                       Register tmp4, Register tmp5,
8708
                                       Register n_tmp6) {
8709
  Label L_processPartitions;
8710
  Label L_processPartition;
8711
  Label L_exit;
8712

8713
  bind(L_processPartitions);
8714
  cmpl(in_out1, 3 * size);
8715
  jcc(Assembler::less, L_exit);
8716
    xorl(tmp1, tmp1);
8717
    xorl(tmp2, tmp2);
8718
    movl(tmp3, in_out2);
8719
    addl(tmp3, size);
8720

8721
    bind(L_processPartition);
8722
      crc32(in_out3, Address(in_out2, 0), 4);
8723
      crc32(tmp1, Address(in_out2, size), 4);
8724
      crc32(tmp2, Address(in_out2, size*2), 4);
8725
      crc32(in_out3, Address(in_out2, 0+4), 4);
8726
      crc32(tmp1, Address(in_out2, size+4), 4);
8727
      crc32(tmp2, Address(in_out2, size*2+4), 4);
8728
      addl(in_out2, 8);
8729
      cmpl(in_out2, tmp3);
8730
      jcc(Assembler::less, L_processPartition);
8731

8732
        push(tmp3);
8733
        push(in_out1);
8734
        push(in_out2);
8735
        tmp4 = tmp3;
8736
        tmp5 = in_out1;
8737
        n_tmp6 = in_out2;
8738

8739
      crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
8740
            w_xtmp1, w_xtmp2, w_xtmp3,
8741
            tmp4, tmp5,
8742
            n_tmp6);
8743

8744
        pop(in_out2);
8745
        pop(in_out1);
8746
        pop(tmp3);
8747

8748
    addl(in_out2, 2 * size);
8749
    subl(in_out1, 3 * size);
8750
    jmp(L_processPartitions);
8751

8752
  bind(L_exit);
8753
}
8754
#endif //LP64
8755

8756
#ifdef _LP64
8757
// Algorithm 2: Pipelined usage of the CRC32 instruction.
8758
// Input: A buffer I of L bytes.
8759
// Output: the CRC32C value of the buffer.
8760
// Notations:
8761
// Write L = 24N + r, with N = floor (L/24).
8762
// r = L mod 24 (0 <= r < 24).
8763
// Consider I as the concatenation of A|B|C|R, where A, B, C each consist of
8764
// N quadwords, and R consists of r bytes.
8765
// A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
8766
// B[j] = I [8N + 8j+7 : 8N + 8j], j= 0, 1, ..., N-1
8767
// C[j] = I [16N + 8j+7 : 16N + 8j], j= 0, 1, ..., N-1
8768
// if r > 0, R[j] = I [24N + j], j = 0, 1, ..., r-1
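// e.g. L = 100 gives N = 4, r = 4: A = I[0..31], B = I[32..63], C = I[64..95],
// and R = I[96..99].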
8769
void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
8770
                                          Register tmp1, Register tmp2, Register tmp3,
8771
                                          Register tmp4, Register tmp5, Register tmp6,
8772
                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8773
                                          bool is_pclmulqdq_supported) {
8774
  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
8775
  Label L_wordByWord;
8776
  Label L_byteByByteProlog;
8777
  Label L_byteByByte;
8778
  Label L_exit;
8779

8780
  if (is_pclmulqdq_supported ) {
8781
    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
8782
    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
8783

8784
    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
8785
    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
8786

8787
    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
8788
    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
8789
    assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
8790
  } else {
8791
    const_or_pre_comp_const_index[0] = 1;
8792
    const_or_pre_comp_const_index[1] = 0;
8793

8794
    const_or_pre_comp_const_index[2] = 3;
8795
    const_or_pre_comp_const_index[3] = 2;
8796

8797
    const_or_pre_comp_const_index[4] = 5;
8798
    const_or_pre_comp_const_index[5] = 4;
8799
   }
8800
  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
8801
                    in2, in1, in_out,
8802
                    tmp1, tmp2, tmp3,
8803
                    w_xtmp1, w_xtmp2, w_xtmp3,
8804
                    tmp4, tmp5,
8805
                    tmp6);
8806
  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
8807
                    in2, in1, in_out,
8808
                    tmp1, tmp2, tmp3,
8809
                    w_xtmp1, w_xtmp2, w_xtmp3,
8810
                    tmp4, tmp5,
8811
                    tmp6);
8812
  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
8813
                    in2, in1, in_out,
8814
                    tmp1, tmp2, tmp3,
8815
                    w_xtmp1, w_xtmp2, w_xtmp3,
8816
                    tmp4, tmp5,
8817
                    tmp6);
8818
  movl(tmp1, in2);
8819
  andl(tmp1, 0x00000007);
8820
  negl(tmp1);
8821
  addl(tmp1, in2);
8822
  addq(tmp1, in1);
8823

8824
  cmpq(in1, tmp1);
8825
  jccb(Assembler::greaterEqual, L_byteByByteProlog);
8826
  align(16);
8827
  BIND(L_wordByWord);
8828
    crc32(in_out, Address(in1, 0), 8);
8829
    addq(in1, 8);
8830
    cmpq(in1, tmp1);
8831
    jcc(Assembler::less, L_wordByWord);
8832

8833
  BIND(L_byteByByteProlog);
8834
  andl(in2, 0x00000007);
8835
  movl(tmp2, 1);
8836

8837
  cmpl(tmp2, in2);
8838
  jccb(Assembler::greater, L_exit);
8839
  BIND(L_byteByByte);
8840
    crc32(in_out, Address(in1, 0), 1);
8841
    incq(in1);
8842
    incl(tmp2);
8843
    cmpl(tmp2, in2);
8844
    jcc(Assembler::lessEqual, L_byteByByte);
8845

8846
  BIND(L_exit);
8847
}
8848
#else
8849
void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
8850
                                          Register tmp1, Register  tmp2, Register tmp3,
8851
                                          Register tmp4, Register  tmp5, Register tmp6,
8852
                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8853
                                          bool is_pclmulqdq_supported) {
8854
  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
8855
  Label L_wordByWord;
8856
  Label L_byteByByteProlog;
8857
  Label L_byteByByte;
8858
  Label L_exit;
8859

8860
  if (is_pclmulqdq_supported) {
8861
    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
8862
    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
8863

8864
    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
8865
    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
8866

8867
    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
8868
    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
8869
  } else {
8870
    const_or_pre_comp_const_index[0] = 1;
8871
    const_or_pre_comp_const_index[1] = 0;
8872

8873
    const_or_pre_comp_const_index[2] = 3;
8874
    const_or_pre_comp_const_index[3] = 2;
8875

8876
    const_or_pre_comp_const_index[4] = 5;
8877
    const_or_pre_comp_const_index[5] = 4;
8878
  }
8879
  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
8880
                    in2, in1, in_out,
8881
                    tmp1, tmp2, tmp3,
8882
                    w_xtmp1, w_xtmp2, w_xtmp3,
8883
                    tmp4, tmp5,
8884
                    tmp6);
8885
  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
8886
                    in2, in1, in_out,
8887
                    tmp1, tmp2, tmp3,
8888
                    w_xtmp1, w_xtmp2, w_xtmp3,
8889
                    tmp4, tmp5,
8890
                    tmp6);
8891
  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
8892
                    in2, in1, in_out,
8893
                    tmp1, tmp2, tmp3,
8894
                    w_xtmp1, w_xtmp2, w_xtmp3,
8895
                    tmp4, tmp5,
8896
                    tmp6);
8897
  movl(tmp1, in2);
8898
  andl(tmp1, 0x00000007);
8899
  negl(tmp1);
8900
  addl(tmp1, in2);
8901
  addl(tmp1, in1);
8902

8903
  BIND(L_wordByWord);
8904
  cmpl(in1, tmp1);
8905
  jcc(Assembler::greaterEqual, L_byteByByteProlog);
8906
    crc32(in_out, Address(in1,0), 4);
8907
    addl(in1, 4);
8908
    jmp(L_wordByWord);
8909

8910
  BIND(L_byteByByteProlog);
8911
  andl(in2, 0x00000007);
8912
  movl(tmp2, 1);
8913

8914
  BIND(L_byteByByte);
8915
  cmpl(tmp2, in2);
8916
  jccb(Assembler::greater, L_exit);
8917
    movb(tmp1, Address(in1, 0));
8918
    crc32(in_out, tmp1, 1);
8919
    incl(in1);
8920
    incl(tmp2);
8921
    jmp(L_byteByByte);
8922

8923
  BIND(L_exit);
8924
}
8925
#endif // LP64
8926
#undef BIND
8927
#undef BLOCK_COMMENT
8928

8929
// Compress char[] array to byte[].
8930
// Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
8931
// Return the array length if every element in array can be encoded,
8932
// otherwise, the index of first non-latin1 (> 0xff) character.
8933
//   @IntrinsicCandidate
8934
//   public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
8935
//     for (int i = 0; i < len; i++) {
8936
//       char c = src[srcOff];
8937
//       if (c > 0xff) {
8938
//           return i;  // return index of non-latin1 char
8939
//       }
8940
//       dst[dstOff] = (byte)c;
8941
//       srcOff++;
8942
//       dstOff++;
8943
//     }
8944
//     return len;
8945
//   }
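// Vectorized strategy (informal): compare a whole vector of chars against 0xFF
// at once; if any element exceeds 0xFF, fall back to the scalar tail loop to
// locate and return the exact index, otherwise pack the low bytes and store them.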
8946
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
8947
  XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8948
  XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8949
  Register tmp5, Register result, KRegister mask1, KRegister mask2) {
8950
  Label copy_chars_loop, done, reset_sp, copy_tail;
8951

8952
  // rsi: src
8953
  // rdi: dst
8954
  // rdx: len
8955
  // rcx: tmp5
8956
  // rax: result
8957

8958
  // rsi holds start addr of source char[] to be compressed
8959
  // rdi holds start addr of destination byte[]
8960
  // rdx holds length
8961

8962
  assert(len != result, "");
8963

8964
  // save length for return
8965
  movl(result, len);
8966

8967
  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
8968
    VM_Version::supports_avx512vlbw() &&
8969
    VM_Version::supports_bmi2()) {
8970

8971
    Label copy_32_loop, copy_loop_tail, below_threshold, reset_for_copy_tail;
8972

8973
    // alignment
8974
    Label post_alignment;
8975

8976
    // if length of the string is less than 32, handle it the old fashioned way
8977
    testl(len, -32);
8978
    jcc(Assembler::zero, below_threshold);
8979

8980
    // First check whether a character is compressible ( <= 0xFF).
8981
    // Create mask to test for Unicode chars inside zmm vector
8982
    movl(tmp5, 0x00FF);
8983
    evpbroadcastw(tmp2Reg, tmp5, Assembler::AVX_512bit);
8984

8985
    testl(len, -64);
8986
    jccb(Assembler::zero, post_alignment);
8987

8988
    movl(tmp5, dst);
8989
    andl(tmp5, (32 - 1));
8990
    negl(tmp5);
8991
    andl(tmp5, (32 - 1));
8992

8993
    // bail out when there is nothing to be done
8994
    testl(tmp5, 0xFFFFFFFF);
8995
    jccb(Assembler::zero, post_alignment);
8996

8997
    // ~(~0 << len), where len is the # of remaining elements to process
8998
    movl(len, 0xFFFFFFFF);
8999
    shlxl(len, len, tmp5);
9000
    notl(len);
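    // e.g. tmp5 == 5 gives ~(~0 << 5) == 0x1F, selecting the low 5 word lanes
    // of the masked 512-bit load/compare below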
9001
    kmovdl(mask2, len);
9002
    movl(len, result);
9003

9004
    evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
9005
    evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
9006
    ktestd(mask1, mask2);
9007
    jcc(Assembler::carryClear, copy_tail);
9008

9009
    evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
9010

9011
    addptr(src, tmp5);
9012
    addptr(src, tmp5);
9013
    addptr(dst, tmp5);
9014
    subl(len, tmp5);
9015

9016
    bind(post_alignment);
9017
    // end of alignment
9018

9019
    movl(tmp5, len);
9020
    andl(tmp5, (32 - 1));    // tail count (in chars)
9021
    andl(len, ~(32 - 1));    // vector count (in chars)
9022
    jccb(Assembler::zero, copy_loop_tail);
9023

9024
    lea(src, Address(src, len, Address::times_2));
9025
    lea(dst, Address(dst, len, Address::times_1));
9026
    negptr(len);
9027

9028
    bind(copy_32_loop);
9029
    evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
9030
    evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
9031
    kortestdl(mask1, mask1);
9032
    jccb(Assembler::carryClear, reset_for_copy_tail);
9033

9034
    // All elements in the current chunk are valid candidates for
9035
    // compression. Write the truncated byte elements to memory.
9036
    evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
9037
    addptr(len, 32);
9038
    jccb(Assembler::notZero, copy_32_loop);
9039

9040
    bind(copy_loop_tail);
9041
    // bail out when there is nothing to be done
9042
    testl(tmp5, 0xFFFFFFFF);
9043
    jcc(Assembler::zero, done);
9044

9045
    movl(len, tmp5);
9046

9047
    // ~(~0 << len), where len is the # of remaining elements to process
9048
    movl(tmp5, 0xFFFFFFFF);
9049
    shlxl(tmp5, tmp5, len);
9050
    notl(tmp5);
9051

9052
    kmovdl(mask2, tmp5);
9053

9054
    evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
9055
    evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
9056
    ktestd(mask1, mask2);
9057
    jcc(Assembler::carryClear, copy_tail);
9058

9059
    evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
9060
    jmp(done);
9061

9062
    bind(reset_for_copy_tail);
9063
    lea(src, Address(src, tmp5, Address::times_2));
9064
    lea(dst, Address(dst, tmp5, Address::times_1));
9065
    subptr(len, tmp5);
9066
    jmp(copy_chars_loop);
9067

9068
    bind(below_threshold);
9069
  }
9070

9071
  if (UseSSE42Intrinsics) {
9072
    Label copy_32_loop, copy_16, copy_tail_sse, reset_for_copy_tail;
9073

9074
    // vectored compression
9075
    testl(len, 0xfffffff8);
9076
    jcc(Assembler::zero, copy_tail);
9077

9078
    movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
9079
    movdl(tmp1Reg, tmp5);
9080
    pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
9081

9082
    andl(len, 0xfffffff0);
9083
    jccb(Assembler::zero, copy_16);
9084

9085
    // compress 16 chars per iter
9086
    pxor(tmp4Reg, tmp4Reg);
9087

9088
    lea(src, Address(src, len, Address::times_2));
9089
    lea(dst, Address(dst, len, Address::times_1));
9090
    negptr(len);
9091

9092
    bind(copy_32_loop);
9093
    movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
9094
    por(tmp4Reg, tmp2Reg);
9095
    movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
9096
    por(tmp4Reg, tmp3Reg);
9097
    ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
9098
    jccb(Assembler::notZero, reset_for_copy_tail);
9099
    packuswb(tmp2Reg, tmp3Reg);    // only ASCII chars; compress each to 1 byte
9100
    movdqu(Address(dst, len, Address::times_1), tmp2Reg);
9101
    addptr(len, 16);
9102
    jccb(Assembler::notZero, copy_32_loop);
9103

9104
    // compress next vector of 8 chars (if any)
9105
    bind(copy_16);
9106
    // len = 0
9107
    testl(result, 0x00000008);     // check if there's a block of 8 chars to compress
9108
    jccb(Assembler::zero, copy_tail_sse);
9109

9110
    pxor(tmp3Reg, tmp3Reg);
9111

9112
    movdqu(tmp2Reg, Address(src, 0));
9113
    ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
9114
    jccb(Assembler::notZero, reset_for_copy_tail);
9115
    packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
9116
    movq(Address(dst, 0), tmp2Reg);
9117
    addptr(src, 16);
9118
    addptr(dst, 8);
9119
    jmpb(copy_tail_sse);
9120

9121
    bind(reset_for_copy_tail);
9122
    movl(tmp5, result);
9123
    andl(tmp5, 0x0000000f);
9124
    lea(src, Address(src, tmp5, Address::times_2));
9125
    lea(dst, Address(dst, tmp5, Address::times_1));
9126
    subptr(len, tmp5);
9127
    jmpb(copy_chars_loop);
9128

9129
    bind(copy_tail_sse);
9130
    movl(len, result);
9131
    andl(len, 0x00000007);    // tail count (in chars)
9132
  }
9133
  // compress 1 char per iter
9134
  bind(copy_tail);
9135
  testl(len, len);
9136
  jccb(Assembler::zero, done);
9137
  lea(src, Address(src, len, Address::times_2));
9138
  lea(dst, Address(dst, len, Address::times_1));
9139
  negptr(len);
9140

9141
  bind(copy_chars_loop);
9142
  load_unsigned_short(tmp5, Address(src, len, Address::times_2));
9143
  testl(tmp5, 0xff00);      // check if Unicode char
9144
  jccb(Assembler::notZero, reset_sp);
9145
  movb(Address(dst, len, Address::times_1), tmp5);  // ASCII char; compress to 1 byte
9146
  increment(len);
9147
  jccb(Assembler::notZero, copy_chars_loop);
9148

9149
  // add len then return (len will be zero if compress succeeded, otherwise negative)
9150
  bind(reset_sp);
9151
  addl(result, len);
9152

9153
  bind(done);
9154
}
9155

9156
// Inflate byte[] array to char[].
9157
//   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
9158
//   @IntrinsicCandidate
9159
//   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
9160
//     for (int i = 0; i < len; i++) {
9161
//       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
9162
//     }
9163
//   }
9164
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
9165
  XMMRegister tmp1, Register tmp2, KRegister mask) {
9166
  Label copy_chars_loop, done, below_threshold, avx3_threshold;
9167
  // rsi: src
9168
  // rdi: dst
9169
  // rdx: len
9170
  // rcx: tmp2
9171

9172
  // rsi holds start addr of source byte[] to be inflated
9173
  // rdi holds start addr of destination char[]
9174
  // rdx holds length
9175
  assert_different_registers(src, dst, len, tmp2);
9176
  movl(tmp2, len);
9177
  if ((UseAVX > 2) && // AVX512
9178
    VM_Version::supports_avx512vlbw() &&
9179
    VM_Version::supports_bmi2()) {
9180

9181
    Label copy_32_loop, copy_tail;
9182
    Register tmp3_aliased = len;
9183

9184
    // if the length of the string is less than 16, handle it the old-fashioned way
9185
    testl(len, -16);
9186
    jcc(Assembler::zero, below_threshold);
9187

9188
    testl(len, -1 * AVX3Threshold);
9189
    jcc(Assembler::zero, avx3_threshold);
9190

9191
    // In order to use only one arithmetic operation for the main loop we use
9192
    // this pre-calculation
9193
    andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
9194
    andl(len, -32);     // vector count
9195
    jccb(Assembler::zero, copy_tail);
9196

9197
    lea(src, Address(src, len, Address::times_1));
9198
    lea(dst, Address(dst, len, Address::times_2));
9199
    negptr(len);
9200

9201

9202
    // inflate 32 chars per iter
9203
    bind(copy_32_loop);
9204
    vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
9205
    evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
9206
    addptr(len, 32);
9207
    jcc(Assembler::notZero, copy_32_loop);
9208

9209
    bind(copy_tail);
9210
    // bail out when there is nothing to be done
9211
    testl(tmp2, -1); // we don't destroy the contents of tmp2 here
9212
    jcc(Assembler::zero, done);
9213

9214
    // ~(~0 << length), where length is the # of remaining elements to process
9215
    movl(tmp3_aliased, -1);
9216
    shlxl(tmp3_aliased, tmp3_aliased, tmp2);
9217
    notl(tmp3_aliased);
9218
    kmovdl(mask, tmp3_aliased);
9219
    evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
9220
    evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);
9221

9222
    jmp(done);
9223
    bind(avx3_threshold);
9224
  }
9225
  if (UseSSE42Intrinsics) {
9226
    Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
9227

9228
    if (UseAVX > 1) {
9229
      andl(tmp2, (16 - 1));
9230
      andl(len, -16);
9231
      jccb(Assembler::zero, copy_new_tail);
9232
    } else {
9233
      andl(tmp2, 0x00000007);   // tail count (in chars)
9234
      andl(len, 0xfffffff8);    // vector count (in chars)
9235
      jccb(Assembler::zero, copy_tail);
9236
    }
9237

9238
    // vectored inflation
9239
    lea(src, Address(src, len, Address::times_1));
9240
    lea(dst, Address(dst, len, Address::times_2));
9241
    negptr(len);
9242

9243
    if (UseAVX > 1) {
9244
      bind(copy_16_loop);
9245
      vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
9246
      vmovdqu(Address(dst, len, Address::times_2), tmp1);
9247
      addptr(len, 16);
9248
      jcc(Assembler::notZero, copy_16_loop);
9249

9250
      bind(below_threshold);
9251
      bind(copy_new_tail);
9252
      movl(len, tmp2);
9253
      andl(tmp2, 0x00000007);
9254
      andl(len, 0xFFFFFFF8);
9255
      jccb(Assembler::zero, copy_tail);
9256

9257
      pmovzxbw(tmp1, Address(src, 0));
9258
      movdqu(Address(dst, 0), tmp1);
9259
      addptr(src, 8);
9260
      addptr(dst, 2 * 8);
9261

9262
      jmp(copy_tail, true);
9263
    }
9264

9265
    // inflate 8 chars per iter
9266
    bind(copy_8_loop);
9267
    pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
9268
    movdqu(Address(dst, len, Address::times_2), tmp1);
9269
    addptr(len, 8);
9270
    jcc(Assembler::notZero, copy_8_loop);
9271

9272
    bind(copy_tail);
9273
    movl(len, tmp2);
9274

9275
    cmpl(len, 4);
9276
    jccb(Assembler::less, copy_bytes);
9277

9278
    movdl(tmp1, Address(src, 0));  // load 4 byte chars
9279
    pmovzxbw(tmp1, tmp1);
9280
    movq(Address(dst, 0), tmp1);
9281
    subptr(len, 4);
9282
    addptr(src, 4);
9283
    addptr(dst, 8);
9284

9285
    bind(copy_bytes);
9286
  } else {
9287
    bind(below_threshold);
9288
  }
9289

9290
  testl(len, len);
9291
  jccb(Assembler::zero, done);
9292
  lea(src, Address(src, len, Address::times_1));
9293
  lea(dst, Address(dst, len, Address::times_2));
9294
  negptr(len);
9295

9296
  // inflate 1 char per iter
9297
  bind(copy_chars_loop);
9298
  load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
9299
  movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
9300
  increment(len);
9301
  jcc(Assembler::notZero, copy_chars_loop);
9302

9303
  bind(done);
9304
}
9305

9306

9307
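// Masked EVEX load/store dispatch helpers: pick the element-width-specific
// evmovdqu variant (byte/word/dword/qword) from the BasicType.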
void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
9308
  switch(type) {
9309
    case T_BYTE:
9310
    case T_BOOLEAN:
9311
      evmovdqub(dst, kmask, src, merge, vector_len);
9312
      break;
9313
    case T_CHAR:
9314
    case T_SHORT:
9315
      evmovdquw(dst, kmask, src, merge, vector_len);
9316
      break;
9317
    case T_INT:
9318
    case T_FLOAT:
9319
      evmovdqul(dst, kmask, src, merge, vector_len);
9320
      break;
9321
    case T_LONG:
9322
    case T_DOUBLE:
9323
      evmovdquq(dst, kmask, src, merge, vector_len);
9324
      break;
9325
    default:
9326
      fatal("Unexpected type argument %s", type2name(type));
9327
      break;
9328
  }
9329
}
9330

9331
void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
9332
  switch(type) {
9333
    case T_BYTE:
9334
    case T_BOOLEAN:
9335
      evmovdqub(dst, kmask, src, merge, vector_len);
9336
      break;
9337
    case T_CHAR:
9338
    case T_SHORT:
9339
      evmovdquw(dst, kmask, src, merge, vector_len);
9340
      break;
9341
    case T_INT:
9342
    case T_FLOAT:
9343
      evmovdqul(dst, kmask, src, merge, vector_len);
9344
      break;
9345
    case T_LONG:
9346
    case T_DOUBLE:
9347
      evmovdquq(dst, kmask, src, merge, vector_len);
9348
      break;
9349
    default:
9350
      fatal("Unexpected type argument %s", type2name(type));
9351
      break;
9352
  }
9353
}
9354

9355
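// Bitwise NOT of an opmask register. For mask lengths below 8, the unused high
// bits are cleared by ANDing the result with (2^masklen - 1).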
void MacroAssembler::knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp, Register rtmp) {
9356
  switch(masklen) {
9357
    case 2:
9358
       knotbl(dst, src);
9359
       movl(rtmp, 3);
9360
       kmovbl(ktmp, rtmp);
9361
       kandbl(dst, ktmp, dst);
9362
       break;
9363
    case 4:
9364
       knotbl(dst, src);
9365
       movl(rtmp, 15);
9366
       kmovbl(ktmp, rtmp);
9367
       kandbl(dst, ktmp, dst);
9368
       break;
9369
    case 8:
9370
       knotbl(dst, src);
9371
       break;
9372
    case 16:
9373
       knotwl(dst, src);
9374
       break;
9375
    case 32:
9376
       knotdl(dst, src);
9377
       break;
9378
    case 64:
9379
       knotql(dst, src);
9380
       break;
9381
    default:
9382
      fatal("Unexpected vector length %d", masklen);
9383
      break;
9384
  }
9385
}
9386

9387
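// Type-dispatched opmask logical operations: select the byte/word/dword/qword
// form of kand/kor/kxor according to the element type.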
void MacroAssembler::kand(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9388
  switch(type) {
9389
    case T_BOOLEAN:
9390
    case T_BYTE:
9391
       kandbl(dst, src1, src2);
9392
       break;
9393
    case T_CHAR:
9394
    case T_SHORT:
9395
       kandwl(dst, src1, src2);
9396
       break;
9397
    case T_INT:
9398
    case T_FLOAT:
9399
       kanddl(dst, src1, src2);
9400
       break;
9401
    case T_LONG:
9402
    case T_DOUBLE:
9403
       kandql(dst, src1, src2);
9404
       break;
9405
    default:
9406
      fatal("Unexpected type argument %s", type2name(type));
9407
      break;
9408
  }
9409
}
9410

9411
void MacroAssembler::kor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9412
  switch(type) {
9413
    case T_BOOLEAN:
9414
    case T_BYTE:
9415
       korbl(dst, src1, src2);
9416
       break;
9417
    case T_CHAR:
9418
    case T_SHORT:
9419
       korwl(dst, src1, src2);
9420
       break;
9421
    case T_INT:
9422
    case T_FLOAT:
9423
       kordl(dst, src1, src2);
9424
       break;
9425
    case T_LONG:
9426
    case T_DOUBLE:
9427
       korql(dst, src1, src2);
9428
       break;
9429
    default:
9430
      fatal("Unexpected type argument %s", type2name(type));
9431
      break;
9432
  }
9433
}
9434

9435
void MacroAssembler::kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2) {
9436
  switch(type) {
9437
    case T_BOOLEAN:
9438
    case T_BYTE:
9439
       kxorbl(dst, src1, src2);
9440
       break;
9441
    case T_CHAR:
9442
    case T_SHORT:
9443
       kxorwl(dst, src1, src2);
9444
       break;
9445
    case T_INT:
9446
    case T_FLOAT:
9447
       kxordl(dst, src1, src2);
9448
       break;
9449
    case T_LONG:
9450
    case T_DOUBLE:
9451
       kxorql(dst, src1, src2);
9452
       break;
9453
    default:
9454
      fatal("Unexpected type argument %s", type2name(type));
9455
      break;
9456
  }
9457
}
9458

9459
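// Type-dispatched, merge-masked AVX-512 helpers (permute, signed min/max,
// xor/or/and), each forwarding to the element-width-specific instruction.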
void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9460
  switch(type) {
9461
    case T_BOOLEAN:
9462
    case T_BYTE:
9463
      evpermb(dst, mask, nds, src, merge, vector_len); break;
9464
    case T_CHAR:
9465
    case T_SHORT:
9466
      evpermw(dst, mask, nds, src, merge, vector_len); break;
9467
    case T_INT:
9468
    case T_FLOAT:
9469
      evpermd(dst, mask, nds, src, merge, vector_len); break;
9470
    case T_LONG:
9471
    case T_DOUBLE:
9472
      evpermq(dst, mask, nds, src, merge, vector_len); break;
9473
    default:
9474
      fatal("Unexpected type argument %s", type2name(type)); break;
9475
  }
9476
}
9477

9478
void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9479
  switch(type) {
9480
    case T_BOOLEAN:
9481
    case T_BYTE:
9482
      evpermb(dst, mask, nds, src, merge, vector_len); break;
9483
    case T_CHAR:
9484
    case T_SHORT:
9485
      evpermw(dst, mask, nds, src, merge, vector_len); break;
9486
    case T_INT:
9487
    case T_FLOAT:
9488
      evpermd(dst, mask, nds, src, merge, vector_len); break;
9489
    case T_LONG:
9490
    case T_DOUBLE:
9491
      evpermq(dst, mask, nds, src, merge, vector_len); break;
9492
    default:
9493
      fatal("Unexpected type argument %s", type2name(type)); break;
9494
  }
9495
}
9496

9497
void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9498
  switch(type) {
9499
    case T_BYTE:
9500
      evpminsb(dst, mask, nds, src, merge, vector_len); break;
9501
    case T_SHORT:
9502
      evpminsw(dst, mask, nds, src, merge, vector_len); break;
9503
    case T_INT:
9504
      evpminsd(dst, mask, nds, src, merge, vector_len); break;
9505
    case T_LONG:
9506
      evpminsq(dst, mask, nds, src, merge, vector_len); break;
9507
    default:
9508
      fatal("Unexpected type argument %s", type2name(type)); break;
9509
  }
9510
}
9511

9512
void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9513
  switch(type) {
9514
    case T_BYTE:
9515
      evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
9516
    case T_SHORT:
9517
      evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
9518
    case T_INT:
9519
      evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
9520
    case T_LONG:
9521
      evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
9522
    default:
9523
      fatal("Unexpected type argument %s", type2name(type)); break;
9524
  }
9525
}
9526

9527
void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9528
  switch(type) {
9529
    case T_BYTE:
9530
      evpminsb(dst, mask, nds, src, merge, vector_len); break;
9531
    case T_SHORT:
9532
      evpminsw(dst, mask, nds, src, merge, vector_len); break;
9533
    case T_INT:
9534
      evpminsd(dst, mask, nds, src, merge, vector_len); break;
9535
    case T_LONG:
9536
      evpminsq(dst, mask, nds, src, merge, vector_len); break;
9537
    default:
9538
      fatal("Unexpected type argument %s", type2name(type)); break;
9539
  }
9540
}
9541

9542
void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9543
  switch(type) {
9544
    case T_BYTE:
9545
      evpmaxsb(dst, mask, nds, src, merge, vector_len); break;
9546
    case T_SHORT:
9547
      evpmaxsw(dst, mask, nds, src, merge, vector_len); break;
9548
    case T_INT:
9549
      evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
9550
    case T_LONG:
9551
      evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
9552
    default:
9553
      fatal("Unexpected type argument %s", type2name(type)); break;
9554
  }
9555
}
9556

9557
void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9558
  switch(type) {
9559
    case T_INT:
9560
      evpxord(dst, mask, nds, src, merge, vector_len); break;
9561
    case T_LONG:
9562
      evpxorq(dst, mask, nds, src, merge, vector_len); break;
9563
    default:
9564
      fatal("Unexpected type argument %s", type2name(type)); break;
9565
  }
9566
}
9567

9568
void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9569
  switch(type) {
9570
    case T_INT:
9571
      evpxord(dst, mask, nds, src, merge, vector_len); break;
9572
    case T_LONG:
9573
      evpxorq(dst, mask, nds, src, merge, vector_len); break;
9574
    default:
9575
      fatal("Unexpected type argument %s", type2name(type)); break;
9576
  }
9577
}
9578

9579
void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9580
  switch(type) {
9581
    case T_INT:
9582
      Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
9583
    case T_LONG:
9584
      evporq(dst, mask, nds, src, merge, vector_len); break;
9585
    default:
9586
      fatal("Unexpected type argument %s", type2name(type)); break;
9587
  }
9588
}
9589

9590
void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9591
  switch(type) {
9592
    case T_INT:
9593
      Assembler::evpord(dst, mask, nds, src, merge, vector_len); break;
9594
    case T_LONG:
9595
      evporq(dst, mask, nds, src, merge, vector_len); break;
9596
    default:
9597
      fatal("Unexpected type argument %s", type2name(type)); break;
9598
  }
9599
}
9600

9601
void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
9602
  switch(type) {
9603
    case T_INT:
9604
      evpandd(dst, mask, nds, src, merge, vector_len); break;
9605
    case T_LONG:
9606
      evpandq(dst, mask, nds, src, merge, vector_len); break;
9607
    default:
9608
      fatal("Unexpected type argument %s", type2name(type)); break;
9609
  }
9610
}
9611

9612
void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
9613
  switch(type) {
9614
    case T_INT:
9615
      evpandd(dst, mask, nds, src, merge, vector_len); break;
9616
    case T_LONG:
9617
      evpandq(dst, mask, nds, src, merge, vector_len); break;
9618
    default:
9619
      fatal("Unexpected type argument %s", type2name(type)); break;
9620
  }
9621
}
9622

9623
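// Opmask test helpers: dispatch kortest/ktest on the mask length (8/16/32/64 bits).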
void MacroAssembler::kortest(uint masklen, KRegister src1, KRegister src2) {
9624
  switch(masklen) {
9625
    case 8:
9626
       kortestbl(src1, src2);
9627
       break;
9628
    case 16:
9629
       kortestwl(src1, src2);
9630
       break;
9631
    case 32:
9632
       kortestdl(src1, src2);
9633
       break;
9634
    case 64:
9635
       kortestql(src1, src2);
9636
       break;
9637
    default:
9638
      fatal("Unexpected mask length %d", masklen);
9639
      break;
9640
  }
9641
}
9642

9643

9644
void MacroAssembler::ktest(uint masklen, KRegister src1, KRegister src2) {
9645
  switch(masklen)  {
9646
    case 8:
9647
       ktestbl(src1, src2);
9648
       break;
9649
    case 16:
9650
       ktestwl(src1, src2);
9651
       break;
9652
    case 32:
9653
       ktestdl(src1, src2);
9654
       break;
9655
    case 64:
9656
       ktestql(src1, src2);
9657
       break;
9658
    default:
9659
      fatal("Unexpected mask length %d", masklen);
9660
      break;
9661
  }
9662
}
9663

9664
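// Vector rotate helpers: immediate-count and variable-count rotate left/right,
// dispatched on 32-bit vs. 64-bit element type.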
void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
9665
  switch(type) {
9666
    case T_INT:
9667
      evprold(dst, mask, src, shift, merge, vlen_enc); break;
9668
    case T_LONG:
9669
      evprolq(dst, mask, src, shift, merge, vlen_enc); break;
9670
    default:
9671
      fatal("Unexpected type argument %s", type2name(type)); break;
9672
      break;
9673
  }
9674
}
9675

9676
void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) {
9677
  switch(type) {
9678
    case T_INT:
9679
      evprord(dst, mask, src, shift, merge, vlen_enc); break;
9680
    case T_LONG:
9681
      evprorq(dst, mask, src, shift, merge, vlen_enc); break;
9682
    default:
9683
      fatal("Unexpected type argument %s", type2name(type)); break;
9684
  }
9685
}
9686

9687
void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
9688
  switch(type) {
9689
    case T_INT:
9690
      evprolvd(dst, mask, src1, src2, merge, vlen_enc); break;
9691
    case T_LONG:
9692
      evprolvq(dst, mask, src1, src2, merge, vlen_enc); break;
9693
    default:
9694
      fatal("Unexpected type argument %s", type2name(type)); break;
9695
  }
9696
}
9697

9698
void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
9699
  switch(type) {
9700
    case T_INT:
9701
      evprorvd(dst, mask, src1, src2, merge, vlen_enc); break;
9702
    case T_LONG:
9703
      evprorvq(dst, mask, src1, src2, merge, vlen_enc); break;
9704
    default:
9705
      fatal("Unexpected type argument %s", type2name(type)); break;
9706
  }
9707
}
9708

9709
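// AddressLiteral variants: use the literal directly when it is RIP-reachable,
// otherwise materialize its address into rscratch first.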
void MacroAssembler::evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9710
  assert(rscratch != noreg || always_reachable(src), "missing");
9711

9712
  if (reachable(src)) {
9713
    evpandq(dst, nds, as_Address(src), vector_len);
9714
  } else {
9715
    lea(rscratch, src);
9716
    evpandq(dst, nds, Address(rscratch, 0), vector_len);
9717
  }
9718
}
9719

9720
void MacroAssembler::evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
9721
  assert(rscratch != noreg || always_reachable(src), "missing");
9722

9723
  if (reachable(src)) {
9724
    Assembler::evpaddq(dst, mask, nds, as_Address(src), merge, vector_len);
9725
  } else {
9726
    lea(rscratch, src);
9727
    Assembler::evpaddq(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
9728
  }
9729
}
9730

9731
void MacroAssembler::evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9732
  assert(rscratch != noreg || always_reachable(src), "missing");
9733

9734
  if (reachable(src)) {
9735
    evporq(dst, nds, as_Address(src), vector_len);
9736
  } else {
9737
    lea(rscratch, src);
9738
    evporq(dst, nds, Address(rscratch, 0), vector_len);
9739
  }
9740
}
9741

9742
void MacroAssembler::vpshufb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9743
  assert(rscratch != noreg || always_reachable(src), "missing");
9744

9745
  if (reachable(src)) {
9746
    vpshufb(dst, nds, as_Address(src), vector_len);
9747
  } else {
9748
    lea(rscratch, src);
9749
    vpshufb(dst, nds, Address(rscratch, 0), vector_len);
9750
  }
9751
}
9752

9753
void MacroAssembler::vpor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
9754
  assert(rscratch != noreg || always_reachable(src), "missing");
9755

9756
  if (reachable(src)) {
9757
    Assembler::vpor(dst, nds, as_Address(src), vector_len);
9758
  } else {
9759
    lea(rscratch, src);
9760
    Assembler::vpor(dst, nds, Address(rscratch, 0), vector_len);
9761
  }
9762
}
9763

9764
void MacroAssembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch) {
9765
  assert(rscratch != noreg || always_reachable(src3), "missing");
9766

9767
  if (reachable(src3)) {
9768
    vpternlogq(dst, imm8, src2, as_Address(src3), vector_len);
9769
  } else {
9770
    lea(rscratch, src3);
9771
    vpternlogq(dst, imm8, src2, Address(rscratch, 0), vector_len);
9772
  }
9773
}
9774

9775
#if COMPILER2_OR_JVMCI
9776

9777
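// Store the trailing 'length' elements with a predicated vector store: the mask
// has its low 'length' bits set (bzhi over an all-ones value).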
void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask,
9778
                                 Register length, Register temp, int vec_enc) {
9779
  // Computing mask for predicated vector store.
9780
  movptr(temp, -1);
9781
  bzhiq(temp, temp, length);
9782
  kmov(mask, temp);
9783
  evmovdqu(bt, mask, dst, xmm, true, vec_enc);
9784
}
9785

9786
// Set memory operation for length "less than" 64 bytes.
9787
void MacroAssembler::fill64_masked(uint shift, Register dst, int disp,
9788
                                       XMMRegister xmm, KRegister mask, Register length,
9789
                                       Register temp, bool use64byteVector) {
9790
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
9791
  const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
9792
  if (!use64byteVector) {
9793
    fill32(dst, disp, xmm);
9794
    subptr(length, 32 >> shift);
9795
    fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp);
9796
  } else {
9797
    assert(MaxVectorSize == 64, "vector length != 64");
9798
    fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit);
9799
  }
9800
}
9801

9802

9803
void MacroAssembler::fill32_masked(uint shift, Register dst, int disp,
9804
                                       XMMRegister xmm, KRegister mask, Register length,
9805
                                       Register temp) {
9806
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
9807
  const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
9808
  fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit);
9809
}
9810

9811

9812
void MacroAssembler::fill32(Address dst, XMMRegister xmm) {
9813
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
9814
  vmovdqu(dst, xmm);
9815
}
9816

9817
void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) {
9818
  fill32(Address(dst, disp), xmm);
9819
}
9820

9821
void MacroAssembler::fill64(Address dst, XMMRegister xmm, bool use64byteVector) {
9822
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
9823
  if (!use64byteVector) {
9824
    fill32(dst, xmm);
9825
    fill32(dst.plus_disp(32), xmm);
9826
  } else {
9827
    evmovdquq(dst, xmm, Assembler::AVX_512bit);
9828
  }
9829
}
9830

9831
void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
9832
  fill64(Address(dst, disp), xmm, use64byteVector);
9833
}
9834

9835
#ifdef _LP64
9836
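// AVX-512 fill: small counts use masked 32/64-byte stores; larger runs are first
// aligned to a 32- (or 64-) byte boundary and then filled in an unrolled
// 128- (or 192-) byte loop. The ZMM sequence handles counts above the AVX3
// threshold, or all counts when the threshold is zero.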
void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
9837
                                        Register count, Register rtmp, XMMRegister xtmp) {
9838
  Label L_exit;
9839
  Label L_fill_start;
9840
  Label L_fill_64_bytes;
9841
  Label L_fill_96_bytes;
9842
  Label L_fill_128_bytes;
9843
  Label L_fill_128_bytes_loop;
9844
  Label L_fill_128_loop_header;
9845
  Label L_fill_128_bytes_loop_header;
9846
  Label L_fill_128_bytes_loop_pre_header;
9847
  Label L_fill_zmm_sequence;
9848

9849
  int shift = -1;
9850
  int avx3threshold = VM_Version::avx3_threshold();
9851
  switch(type) {
9852
    case T_BYTE:  shift = 0;
9853
      break;
9854
    case T_SHORT: shift = 1;
9855
      break;
9856
    case T_INT:   shift = 2;
9857
      break;
9858
    /* Uncomment when LONG fill stubs are supported.
9859
    case T_LONG:  shift = 3;
9860
      break;
9861
    */
9862
    default:
9863
      fatal("Unhandled type: %s\n", type2name(type));
9864
  }
9865

9866
  if ((avx3threshold != 0)  || (MaxVectorSize == 32)) {
9867

9868
    if (MaxVectorSize == 64) {
9869
      cmpq(count, avx3threshold >> shift);
9870
      jcc(Assembler::greater, L_fill_zmm_sequence);
9871
    }
9872

9873
    evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);
9874

9875
    bind(L_fill_start);
9876

9877
    cmpq(count, 32 >> shift);
9878
    jccb(Assembler::greater, L_fill_64_bytes);
9879
    fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
9880
    jmp(L_exit);
9881

9882
    bind(L_fill_64_bytes);
9883
    cmpq(count, 64 >> shift);
9884
    jccb(Assembler::greater, L_fill_96_bytes);
9885
    fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
9886
    jmp(L_exit);
9887

9888
    bind(L_fill_96_bytes);
9889
    cmpq(count, 96 >> shift);
9890
    jccb(Assembler::greater, L_fill_128_bytes);
9891
    fill64(to, 0, xtmp);
9892
    subq(count, 64 >> shift);
9893
    fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
9894
    jmp(L_exit);
9895

9896
    bind(L_fill_128_bytes);
9897
    cmpq(count, 128 >> shift);
9898
    jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
9899
    fill64(to, 0, xtmp);
9900
    fill32(to, 64, xtmp);
9901
    subq(count, 96 >> shift);
9902
    fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
9903
    jmp(L_exit);
9904

9905
    bind(L_fill_128_bytes_loop_pre_header);
9906
    {
9907
      mov(rtmp, to);
9908
      andq(rtmp, 31);
9909
      jccb(Assembler::zero, L_fill_128_bytes_loop_header);
9910
      negq(rtmp);
9911
      addq(rtmp, 32);
9912
      mov64(r8, -1L);
9913
      bzhiq(r8, r8, rtmp);
9914
      kmovql(k2, r8);
9915
      evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_256bit);
9916
      addq(to, rtmp);
9917
      shrq(rtmp, shift);
9918
      subq(count, rtmp);
9919
    }
9920

9921
    cmpq(count, 128 >> shift);
9922
    jcc(Assembler::less, L_fill_start);
9923

9924
    bind(L_fill_128_bytes_loop_header);
9925
    subq(count, 128 >> shift);
9926

9927
    align32();
9928
    bind(L_fill_128_bytes_loop);
9929
      fill64(to, 0, xtmp);
9930
      fill64(to, 64, xtmp);
9931
      addq(to, 128);
9932
      subq(count, 128 >> shift);
9933
      jccb(Assembler::greaterEqual, L_fill_128_bytes_loop);
9934

9935
    addq(count, 128 >> shift);
9936
    jcc(Assembler::zero, L_exit);
9937
    jmp(L_fill_start);
9938
  }
9939

9940
  if (MaxVectorSize == 64) {
9941
    // Sequence using 64 byte ZMM register.
9942
    Label L_fill_128_bytes_zmm;
9943
    Label L_fill_192_bytes_zmm;
9944
    Label L_fill_192_bytes_loop_zmm;
9945
    Label L_fill_192_bytes_loop_header_zmm;
9946
    Label L_fill_192_bytes_loop_pre_header_zmm;
9947
    Label L_fill_start_zmm_sequence;
9948

9949
    bind(L_fill_zmm_sequence);
9950
    evpbroadcast(type, xtmp, value, Assembler::AVX_512bit);
9951

9952
    bind(L_fill_start_zmm_sequence);
9953
    cmpq(count, 64 >> shift);
9954
    jccb(Assembler::greater, L_fill_128_bytes_zmm);
9955
    fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
9956
    jmp(L_exit);
9957

9958
    bind(L_fill_128_bytes_zmm);
9959
    cmpq(count, 128 >> shift);
9960
    jccb(Assembler::greater, L_fill_192_bytes_zmm);
9961
    fill64(to, 0, xtmp, true);
9962
    subq(count, 64 >> shift);
9963
    fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
9964
    jmp(L_exit);
9965

9966
    bind(L_fill_192_bytes_zmm);
9967
    cmpq(count, 192 >> shift);
9968
    jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
9969
    fill64(to, 0, xtmp, true);
9970
    fill64(to, 64, xtmp, true);
9971
    subq(count, 128 >> shift);
9972
    fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
9973
    jmp(L_exit);
9974

9975
    bind(L_fill_192_bytes_loop_pre_header_zmm);
9976
    {
9977
      movq(rtmp, to);
9978
      andq(rtmp, 63);
9979
      jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm);
9980
      negq(rtmp);
9981
      addq(rtmp, 64);
9982
      mov64(r8, -1L);
9983
      bzhiq(r8, r8, rtmp);
9984
      kmovql(k2, r8);
9985
      evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_512bit);
9986
      addq(to, rtmp);
9987
      shrq(rtmp, shift);
9988
      subq(count, rtmp);
9989
    }
9990

9991
    cmpq(count, 192 >> shift);
9992
    jcc(Assembler::less, L_fill_start_zmm_sequence);
9993

9994
    bind(L_fill_192_bytes_loop_header_zmm);
9995
    subq(count, 192 >> shift);
9996

9997
    align32();
9998
    bind(L_fill_192_bytes_loop_zmm);
9999
      fill64(to, 0, xtmp, true);
10000
      fill64(to, 64, xtmp, true);
10001
      fill64(to, 128, xtmp, true);
10002
      addq(to, 192);
10003
      subq(count, 192 >> shift);
10004
      jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm);
10005

10006
    addq(count, 192 >> shift);
10007
    jcc(Assembler::zero, L_exit);
10008
    jmp(L_fill_start_zmm_sequence);
10009
  }
10010
  bind(L_exit);
10011
}
10012
#endif
10013
#endif //COMPILER2_OR_JVMCI
10014

10015

10016
#ifdef _LP64
10017
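// Float/double -> int/long conversions with Java semantics: cvtt* leaves the
// indicator value (0x80000000 / 0x8000000000000000) for NaN and out-of-range
// inputs, in which case the matching fixup stub is called.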
void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
10018
  Label done;
10019
  cvttss2sil(dst, src);
10020
  // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
10021
  cmpl(dst, 0x80000000); // float_sign_flip
10022
  jccb(Assembler::notEqual, done);
10023
  subptr(rsp, 8);
10024
  movflt(Address(rsp, 0), src);
10025
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
10026
  pop(dst);
10027
  bind(done);
10028
}
10029

10030
void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
10031
  Label done;
10032
  cvttsd2sil(dst, src);
10033
  // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
10034
  cmpl(dst, 0x80000000); // float_sign_flip
10035
  jccb(Assembler::notEqual, done);
10036
  subptr(rsp, 8);
10037
  movdbl(Address(rsp, 0), src);
10038
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
10039
  pop(dst);
10040
  bind(done);
10041
}
10042

10043
void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
10044
  Label done;
10045
  cvttss2siq(dst, src);
10046
  cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
10047
  jccb(Assembler::notEqual, done);
10048
  subptr(rsp, 8);
10049
  movflt(Address(rsp, 0), src);
10050
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
10051
  pop(dst);
10052
  bind(done);
10053
}
10054

10055
void MacroAssembler::round_float(Register dst, XMMRegister src, Register rtmp, Register rcx) {
10056
  // The following code is a line-by-line assembly translation of the rounding algorithm.
10057
  // Please refer to java.lang.Math.round(float) algorithm for details.
10058
  const int32_t FloatConsts_EXP_BIT_MASK = 0x7F800000;
10059
  const int32_t FloatConsts_SIGNIFICAND_WIDTH = 24;
10060
  const int32_t FloatConsts_EXP_BIAS = 127;
10061
  const int32_t FloatConsts_SIGNIF_BIT_MASK = 0x007FFFFF;
10062
  const int32_t MINUS_32 = 0xFFFFFFE0;
10063
  Label L_special_case, L_block1, L_exit;
10064
  movl(rtmp, FloatConsts_EXP_BIT_MASK);
10065
  movdl(dst, src);
10066
  andl(dst, rtmp);
10067
  sarl(dst, FloatConsts_SIGNIFICAND_WIDTH - 1);
10068
  movl(rtmp, FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS);
10069
  subl(rtmp, dst);
10070
  movl(rcx, rtmp);
10071
  movl(dst, MINUS_32);
10072
  testl(rtmp, dst);
10073
  jccb(Assembler::notEqual, L_special_case);
10074
  movdl(dst, src);
10075
  andl(dst, FloatConsts_SIGNIF_BIT_MASK);
10076
  orl(dst, FloatConsts_SIGNIF_BIT_MASK + 1);
10077
  movdl(rtmp, src);
10078
  testl(rtmp, rtmp);
10079
  jccb(Assembler::greaterEqual, L_block1);
10080
  negl(dst);
10081
  bind(L_block1);
10082
  sarl(dst);
10083
  addl(dst, 0x1);
10084
  sarl(dst, 0x1);
10085
  jmp(L_exit);
10086
  bind(L_special_case);
10087
  convert_f2i(dst, src);
10088
  bind(L_exit);
10089
}
10090

10091
void MacroAssembler::round_double(Register dst, XMMRegister src, Register rtmp, Register rcx) {
10092
  // The following code is a line-by-line assembly translation of the rounding algorithm.
10093
  // Please refer to java.lang.Math.round(double) algorithm for details.
10094
  const int64_t DoubleConsts_EXP_BIT_MASK = 0x7FF0000000000000L;
10095
  const int64_t DoubleConsts_SIGNIFICAND_WIDTH = 53;
10096
  const int64_t DoubleConsts_EXP_BIAS = 1023;
10097
  const int64_t DoubleConsts_SIGNIF_BIT_MASK = 0x000FFFFFFFFFFFFFL;
10098
  const int64_t MINUS_64 = 0xFFFFFFFFFFFFFFC0L;
10099
  Label L_special_case, L_block1, L_exit;
10100
  mov64(rtmp, DoubleConsts_EXP_BIT_MASK);
10101
  movq(dst, src);
10102
  andq(dst, rtmp);
10103
  sarq(dst, DoubleConsts_SIGNIFICAND_WIDTH - 1);
10104
  mov64(rtmp, DoubleConsts_SIGNIFICAND_WIDTH - 2 + DoubleConsts_EXP_BIAS);
10105
  subq(rtmp, dst);
10106
  movq(rcx, rtmp);
10107
  mov64(dst, MINUS_64);
10108
  testq(rtmp, dst);
10109
  jccb(Assembler::notEqual, L_special_case);
10110
  movq(dst, src);
10111
  mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK);
10112
  andq(dst, rtmp);
10113
  mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK + 1);
10114
  orq(dst, rtmp);
10115
  movq(rtmp, src);
10116
  testq(rtmp, rtmp);
10117
  jccb(Assembler::greaterEqual, L_block1);
10118
  negq(dst);
10119
  bind(L_block1);
10120
  sarq(dst);
10121
  addq(dst, 0x1);
10122
  sarq(dst, 0x1);
10123
  jmp(L_exit);
10124
  bind(L_special_case);
10125
  convert_d2l(dst, src);
10126
  bind(L_exit);
10127
}
10128

10129
void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
10130
  Label done;
10131
  cvttsd2siq(dst, src);
10132
  cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
10133
  jccb(Assembler::notEqual, done);
10134
  subptr(rsp, 8);
10135
  movdbl(Address(rsp, 0), src);
10136
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
10137
  pop(dst);
10138
  bind(done);
10139
}
10140

10141
void MacroAssembler::cache_wb(Address line)
10142
{
10143
  // 64 bit cpus always support clflush
10144
  assert(VM_Version::supports_clflush(), "clflush should be available");
10145
  bool optimized = VM_Version::supports_clflushopt();
10146
  bool no_evict = VM_Version::supports_clwb();
10147

10148
  // prefer clwb (writeback without evict) otherwise
10149
  // prefer clflushopt (potentially parallel writeback with evict)
10150
  // otherwise fall back to clflush (serial writeback with evict)
10151

10152
  if (optimized) {
10153
    if (no_evict) {
10154
      clwb(line);
10155
    } else {
10156
      clflushopt(line);
10157
    }
10158
  } else {
10159
    // no need for fence when using CLFLUSH
10160
    clflush(line);
10161
  }
10162
}
10163

10164
void MacroAssembler::cache_wbsync(bool is_pre)
10165
{
10166
  assert(VM_Version::supports_clflush(), "clflush should be available");
10167
  bool optimized = VM_Version::supports_clflushopt();
10168
  bool no_evict = VM_Version::supports_clwb();
10169

10170
  // pick the correct implementation
10171

10172
  if (!is_pre && (optimized || no_evict)) {
10173
    // need an sfence for post flush when using clflushopt or clwb
10174
    // otherwise no need for any synchronization
10175

10176
    sfence();
10177
  }
10178
}
10179

10180
#endif // _LP64
10181

10182
Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
10183
  switch (cond) {
10184
    // Note some conditions are synonyms for others
10185
    case Assembler::zero:         return Assembler::notZero;
10186
    case Assembler::notZero:      return Assembler::zero;
10187
    case Assembler::less:         return Assembler::greaterEqual;
10188
    case Assembler::lessEqual:    return Assembler::greater;
10189
    case Assembler::greater:      return Assembler::lessEqual;
10190
    case Assembler::greaterEqual: return Assembler::less;
10191
    case Assembler::below:        return Assembler::aboveEqual;
10192
    case Assembler::belowEqual:   return Assembler::above;
10193
    case Assembler::above:        return Assembler::belowEqual;
10194
    case Assembler::aboveEqual:   return Assembler::below;
10195
    case Assembler::overflow:     return Assembler::noOverflow;
10196
    case Assembler::noOverflow:   return Assembler::overflow;
10197
    case Assembler::negative:     return Assembler::positive;
10198
    case Assembler::positive:     return Assembler::negative;
10199
    case Assembler::parity:       return Assembler::noParity;
10200
    case Assembler::noParity:     return Assembler::parity;
10201
  }
10202
  ShouldNotReachHere(); return Assembler::overflow;
10203
}
10204

10205
SkipIfEqual::SkipIfEqual(
10206
    MacroAssembler* masm, const bool* flag_addr, bool value, Register rscratch) {
10207
  _masm = masm;
10208
  _masm->cmp8(ExternalAddress((address)flag_addr), value, rscratch);
10209
  _masm->jcc(Assembler::equal, _label);
10210
}
10211

10212
SkipIfEqual::~SkipIfEqual() {
10213
  _masm->bind(_label);
10214
}
10215

10216
// 32-bit Windows has its own fast-path implementation
10217
// of get_thread
10218
#if !defined(WIN32) || defined(_LP64)
10219

10220
// This is simply a call to Thread::current()
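// All caller-saved registers that might hold live values are preserved around the call.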
10221
void MacroAssembler::get_thread(Register thread) {
10222
  if (thread != rax) {
10223
    push(rax);
10224
  }
10225
  LP64_ONLY(push(rdi);)
10226
  LP64_ONLY(push(rsi);)
10227
  push(rdx);
10228
  push(rcx);
10229
#ifdef _LP64
10230
  push(r8);
10231
  push(r9);
10232
  push(r10);
10233
  push(r11);
10234
#endif
10235

10236
  MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
10237

10238
#ifdef _LP64
10239
  pop(r11);
10240
  pop(r10);
10241
  pop(r9);
10242
  pop(r8);
10243
#endif
10244
  pop(rcx);
10245
  pop(rdx);
10246
  LP64_ONLY(pop(rsi);)
10247
  LP64_ONLY(pop(rdi);)
10248
  if (thread != rax) {
10249
    mov(thread, rax);
10250
    pop(rax);
10251
  }
10252
}
10253

10254

10255
#endif // !WIN32 || _LP64
10256

10257
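// Verify that sp (optionally offset by 'bias') is aligned to 2 * wordSize; stop with 'msg' if not.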
void MacroAssembler::check_stack_alignment(Register sp, const char* msg, unsigned bias, Register tmp) {
10258
  Label L_stack_ok;
10259
  if (bias == 0) {
10260
    testptr(sp, 2 * wordSize - 1);
10261
  } else {
10262
    // lea(tmp, Address(rsp, bias));
10263
    mov(tmp, sp);
10264
    addptr(tmp, bias);
10265
    testptr(tmp, 2 * wordSize - 1);
10266
  }
10267
  jcc(Assembler::equal, L_stack_ok);
10268
  block_comment(msg);
10269
  stop(msg);
10270
  bind(L_stack_ok);
10271
}
10272

10273
// Implements lightweight-locking.
10274
//
10275
// obj: the object to be locked
10276
// reg_rax: rax
10277
// thread: the thread which attempts to lock obj
10278
// tmp: a temporary register
10279
void MacroAssembler::lightweight_lock(Register obj, Register reg_rax, Register thread, Register tmp, Label& slow) {
10280
  assert(reg_rax == rax, "");
10281
  assert_different_registers(obj, reg_rax, thread, tmp);
10282

10283
  Label push;
10284
  const Register top = tmp;
10285

10286
  // Preload the markWord. It is important that this is the first
10287
  // instruction emitted as it is part of C1's null check semantics.
10288
  movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));
10289

10290
  // Load top.
10291
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
10292

10293
  // Check if the lock-stack is full.
10294
  cmpl(top, LockStack::end_offset());
10295
  jcc(Assembler::greaterEqual, slow);
10296

10297
  // Check for recursion.
10298
  cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
10299
  jcc(Assembler::equal, push);
10300

10301
  // Check header for monitor (0b10).
10302
  testptr(reg_rax, markWord::monitor_value);
10303
  jcc(Assembler::notZero, slow);
10304

10305
  // Try to lock. Transition lock bits 0b01 => 0b00
10306
  movptr(tmp, reg_rax);
10307
  andptr(tmp, ~(int32_t)markWord::unlocked_value);
10308
  orptr(reg_rax, markWord::unlocked_value);
10309
  lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
10310
  jcc(Assembler::notEqual, slow);
10311

10312
  // Restore top, CAS clobbers register.
10313
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
10314

10315
  bind(push);
10316
  // After successful lock, push object on lock-stack.
10317
  movptr(Address(thread, top), obj);
10318
  incrementl(top, oopSize);
10319
  movl(Address(thread, JavaThread::lock_stack_top_offset()), top);
10320
}
10321

10322
// Implements lightweight-unlocking.
10323
//
10324
// obj: the object to be unlocked
10325
// reg_rax: rax
10326
// thread: the thread
10327
// tmp: a temporary register
10328
//
10329
// x86_32 Note: reg_rax and thread may alias each other due to limited register
10330
//              availability.
10331
void MacroAssembler::lightweight_unlock(Register obj, Register reg_rax, Register thread, Register tmp, Label& slow) {
10332
  assert(reg_rax == rax, "");
10333
  assert_different_registers(obj, reg_rax, tmp);
10334
  LP64_ONLY(assert_different_registers(obj, reg_rax, thread, tmp);)
10335

10336
  Label unlocked, push_and_slow;
10337
  const Register top = tmp;
10338

10339
  // Check if obj is top of lock-stack.
10340
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
10341
  cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
10342
  jcc(Assembler::notEqual, slow);
10343

10344
  // Pop lock-stack.
10345
  DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
10346
  subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
10347

10348
  // Check if recursive.
10349
  cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
10350
  jcc(Assembler::equal, unlocked);
10351

10352
  // Not recursive. Check header for monitor (0b10).
10353
  movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));
10354
  testptr(reg_rax, markWord::monitor_value);
10355
  jcc(Assembler::notZero, push_and_slow);
10356

10357
#ifdef ASSERT
10358
  // Check header not unlocked (0b01).
10359
  Label not_unlocked;
10360
  testptr(reg_rax, markWord::unlocked_value);
10361
  jcc(Assembler::zero, not_unlocked);
10362
  stop("lightweight_unlock already unlocked");
10363
  bind(not_unlocked);
10364
#endif
10365

10366
  // Try to unlock. Transition lock bits 0b00 => 0b01
10367
  movptr(tmp, reg_rax);
10368
  orptr(tmp, markWord::unlocked_value);
10369
  lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
10370
  jcc(Assembler::equal, unlocked);
10371

10372
  bind(push_and_slow);
10373
  // Restore lock-stack and handle the unlock in runtime.
10374
  if (thread == reg_rax) {
10375
    // On x86_32 we may lose the thread.
10376
    get_thread(thread);
10377
  }
10378
#ifdef ASSERT
10379
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
10380
  movptr(Address(thread, top), obj);
10381
#endif
10382
  addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
10383
  jmp(slow);
10384

10385
  bind(unlocked);
10386
}
10387

10388
#ifdef _LP64
10389
// Saves the legacy GPR state on the stack.
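// The layout mirrors a 16-slot register block; slot 11 (where rsp would fall) is
// intentionally left unused since rsp itself is not saved.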
10390
void MacroAssembler::save_legacy_gprs() {
10391
  subq(rsp, 16 * wordSize);
10392
  movq(Address(rsp, 15 * wordSize), rax);
10393
  movq(Address(rsp, 14 * wordSize), rcx);
10394
  movq(Address(rsp, 13 * wordSize), rdx);
10395
  movq(Address(rsp, 12 * wordSize), rbx);
10396
  movq(Address(rsp, 10 * wordSize), rbp);
10397
  movq(Address(rsp, 9 * wordSize), rsi);
10398
  movq(Address(rsp, 8 * wordSize), rdi);
10399
  movq(Address(rsp, 7 * wordSize), r8);
10400
  movq(Address(rsp, 6 * wordSize), r9);
10401
  movq(Address(rsp, 5 * wordSize), r10);
10402
  movq(Address(rsp, 4 * wordSize), r11);
10403
  movq(Address(rsp, 3 * wordSize), r12);
10404
  movq(Address(rsp, 2 * wordSize), r13);
10405
  movq(Address(rsp, wordSize), r14);
10406
  movq(Address(rsp, 0), r15);
10407
}
10408

10409
// Restores the legacy GPR state from the stack.
10410
void MacroAssembler::restore_legacy_gprs() {
10411
  movq(r15, Address(rsp, 0));
10412
  movq(r14, Address(rsp, wordSize));
10413
  movq(r13, Address(rsp, 2 * wordSize));
10414
  movq(r12, Address(rsp, 3 * wordSize));
10415
  movq(r11, Address(rsp, 4 * wordSize));
10416
  movq(r10, Address(rsp, 5 * wordSize));
10417
  movq(r9,  Address(rsp, 6 * wordSize));
10418
  movq(r8,  Address(rsp, 7 * wordSize));
10419
  movq(rdi, Address(rsp, 8 * wordSize));
10420
  movq(rsi, Address(rsp, 9 * wordSize));
10421
  movq(rbp, Address(rsp, 10 * wordSize));
10422
  movq(rbx, Address(rsp, 12 * wordSize));
10423
  movq(rdx, Address(rsp, 13 * wordSize));
10424
  movq(rcx, Address(rsp, 14 * wordSize));
10425
  movq(rax, Address(rsp, 15 * wordSize));
10426
  addq(rsp, 16 * wordSize);
10427
}
10428
#endif
10429
