/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/matcher.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
                                  Register tmp2Reg, Register tmp3Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  // Load markWord from object into displaced_header.
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    ldrw(tmp, Address(tmp, Klass::access_flags_offset()));
    tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
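    // For a stack-locked object the markWord holds the address of a
    // BasicLock in this thread's own frame, so disp_hdr now holds the
    // distance between that lock record and sp. For a recursive lock that
    // distance lies within the current page and the markWord lock bits are
    // 0b00; masking with ~(page_size-1) | lock_mask therefore yields zero
    // exactly in the recursive case.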
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If the test below passes we can continue at cont, and hence store 0 as
    // the displaced header in the box, which indicates a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from null to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::enter.
  mov(tmp, (address)markWord::unused_mark().value());
  str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rthread);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock; this is true if we see
    // the stack address of the basicLock in the markWord of the object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);
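  // Before releasing the monitor, check whether any thread is queued on the
  // EntryList or cxq. If either list is non-empty a successor may have to be
  // woken, which is runtime work; only a completely uncontended monitor can
  // be released on this fast path.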
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
  cmp(rscratch1, zr); // Sets flags for result
  cbnz(rscratch1, cont);
  // need a release store here
  lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
  stlr(zr, tmp); // set unowned

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register t1,
                                              Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST be branched to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST be branched to with flag == NE
  Label slow_path;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(t1, obj);
    ldrw(t1, Address(t1, Klass::access_flags_offset()));
    tstw(t1, JVM_ACC_IS_VALUE_BASED_CLASS);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST be branched to with flag == EQ
    Label push;

    const Register t2_top = t2;
    const Register t3_t = t3;
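    // The lock-stack is a small per-thread array of oops embedded in
    // JavaThread; lock_stack_top_offset() holds the byte offset of the first
    // free slot, so the top value loaded below serves both as a fullness
    // check and as an addressing offset relative to rthread.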

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
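    // Build the expected markWord (current mark with lock-bits forced to
    // 0b01, i.e. unlocked) and the new markWord (the same value with the
    // lock-bits cleared to 0b00), so the CAS below succeeds only if the
    // object was actually unlocked.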
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    // mark contains the tagged ObjectMonitor*.
    const Register t1_tagged_monitor = t1_mark;
    const uintptr_t monitor_tag = markWord::monitor_value;
    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_tagged_monitor, (in_bytes(ObjectMonitor::owner_offset()) - monitor_tag)));

    // CAS owner (null => current thread).
    cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, locked);

    // Check if recursive.
    cmp(t3_owner, rthread);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(Address(t1_tagged_monitor, in_bytes(ObjectMonitor::recursions_offset()) - monitor_tag), 1);
  }

  bind(locked);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register t1, Register t2,
                                                Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_monitor;
  // Finish fast unlock successfully. MUST be branched to with flag == EQ
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST be branched to with flag == NE
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Lightweight unlock

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_monitor);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }


  { // Handle inflated monitor.
    bind(inflated_load_monitor);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    // mark contains the tagged ObjectMonitor*.
    const Register t1_monitor = t1_mark;
    const uintptr_t monitor_tag = markWord::monitor_value;

    // Untag the monitor.
    sub(t1_monitor, t1_mark, monitor_tag);

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flag == EQ
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    Label release;
    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Check if the entry lists are empty.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
    ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
    orr(rscratch1, rscratch1, t3_t);
    cmp(rscratch1, zr);
    br(Assembler::EQ, release);

    // The owner may be anonymous and we removed the last obj entry in
    // the lock-stack. This loses the information about the owner.
    // Write the thread to the owner field so the runtime knows the owner.
    str(rthread, Address(t2_owner_addr));
    b(slow_path);

    bind(release);
    // Set owner to null.
    // Release to satisfy the JMM
    stlr(zr, t2_owner_addr);
  }

  bind(unlocked);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

// Search for str1 in str2 and return index or -1
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
                                       Register cnt2, Register cnt1,
                                       Register tmp1, Register tmp2,
                                       Register tmp3, Register tmp4,
                                       Register tmp5, Register tmp6,
                                       int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  bool isL = ae == StrIntrinsicNode::LL;

  bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
  bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
  int str1_chr_shift = str1_isL ? 0:1;
  int str2_chr_shift = str2_isL ? 0:1;
  int str1_chr_size = str1_isL ? 1:2;
  int str2_chr_size = str2_isL ? 1:2;
  chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
  chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
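    // Take the Boyer-Moore path only when the pattern is shorter than 256
    // chars and the source is at least 4x as long: the subs below sets LT
    // when cnt1 < 256; ccmp then compares cnt1 against cnt2/4 if LT holds,
    // or forces nzcv = 0b0000 (which satisfies GE) if it does not. So GE
    // here means "pattern too long, or source too short, for Boyer-Moore".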
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

// The Boyer Moore algorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few java-specific optimizations.
//
// #define ASIZE 256
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = m;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef PATTERN_STRING_IS_LATIN1
//          bc[c] = m - i;
//          #else
//          if (c < ASIZE) bc[c] = m - i;
//          #endif
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[i+j];
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          // c < 256 for Latin1 string, so, no need for branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c< 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m
//          #endif
//       }
//    }
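//
// In the generated code below, the 256-byte 'bad character' table lives on
// the stack and is pre-filled with the pattern length (broadcast into v0
// above) using 32-byte vector stores; entries for characters that actually
// occur in the pattern are then overwritten with their distance from the
// end of the pattern.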

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code to have a single outer pre-loop load.
    const int firstStep = isL ? 7 : 3;

    const int ASIZE = 256;
    const int STORED_BYTES = 32; // amount of bytes stored per instruction
    sub(sp, sp, ASIZE);
    mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
    mov(ch1, sp);
    BIND(BM_INIT_LOOP);
      stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
      subs(tmp5, tmp5, 1);
      br(GT, BM_INIT_LOOP);

      sub(cnt1tmp, cnt1, 1);
      mov(tmp5, str2);
      add(str2end, str2, result_tmp, LSL, str2_chr_shift);
      sub(ch2, cnt1, 1);
      mov(tmp3, str1);
    BIND(BCLOOP);
      (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
      if (!str1_isL) {
        subs(zr, ch1, ASIZE);
        br(HS, BCSKIP);
      }
      strb(ch2, Address(sp, ch1));
    BIND(BCSKIP);
      subs(ch2, ch2, 1);
      br(GT, BCLOOP);

      add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
      if (str1_isL == str2_isL) {
        // load last 8 bytes (8 LL / 4 UU symbols)
        ldr(tmp6, Address(tmp6, -wordSize));
      } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // convert Latin1 to UTF. We'll have to wait until load completed, but
        // it's still faster than per-character loads+checks
        lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
        ubfx(ch1, tmp6, 8, 8); // str1[N-2]
        ubfx(ch2, tmp6, 16, 8); // str1[N-3]
        andr(tmp6, tmp6, 0xFF); // str1[N-4]
        orr(ch2, ch1, ch2, LSL, 16);
        orr(tmp6, tmp6, tmp3, LSL, 48);
        orr(tmp6, tmp6, ch2, LSL, 16);
      }
    BIND(BMLOOPSTR2);
      (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
      sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
      if (str1_isL == str2_isL) {
        // re-init tmp3. It's for free because it's executed in parallel with
        // load above. Alternative is to initialize it before loop, but it'll
        // affect performance on in-order systems with 2 or more ld/st pipelines
        lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
      }
      if (!isL) { // UU/UL case
        lsl(ch2, cnt1tmp, 1); // offset in bytes
      }
      cmp(tmp3, skipch);
      br(NE, BMSKIP);
      ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
      mov(ch1, tmp6);
      if (isL) {
        b(BMLOOPSTR1_AFTER_LOAD);
      } else {
        sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
        b(BMLOOPSTR1_CMP);
      }
    BIND(BMLOOPSTR1);
      (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
      (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
    BIND(BMLOOPSTR1_AFTER_LOAD);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMLOOPSTR1_LASTCMP);
    BIND(BMLOOPSTR1_CMP);
      cmp(ch1, ch2);
      br(EQ, BMLOOPSTR1);
    BIND(BMSKIP);
      if (!isL) {
        // if we've met a UTF symbol while searching a Latin1 pattern, then we
        // can skip cnt1 symbols
        if (str1_isL != str2_isL) {
          mov(result_tmp, cnt1);
        } else {
          mov(result_tmp, 1);
        }
        subs(zr, skipch, ASIZE);
        br(HS, BMADV);
      }
      ldrb(result_tmp, Address(sp, skipch)); // load skip distance
    BIND(BMADV);
      sub(cnt1tmp, cnt1, 1);
      add(str2, str2, result_tmp, LSL, str2_chr_shift);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      add(sp, sp, ASIZE);
      b(NOMATCH);
    BIND(BMLOOPSTR1_LASTCMP);
      cmp(ch1, ch2);
      br(NE, BMSKIP);
    BIND(BMMATCH);
      sub(result, str2, tmp5);
      if (!str2_isL) lsr(result, result, 1);
      add(sp, sp, ASIZE);
      b(DONE);

    BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
    br(LT, LINEAR_MEDIUM);
    mov(result, zr);
    RuntimeAddress stub = nullptr;
    if (isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
      assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
    } else if (str1_isL) {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
    } else {
      stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
      assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
    }
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;

        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
        br(LT, DOSHORT);
      BIND(LINEAR_MEDIUM);
        (this->*str1_load_1chr)(first, Address(str1));
        lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
        sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(FIRST_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        adds(cnt1tmp, cnt1_neg, str1_chr_size);
        add(cnt2tmp, cnt2_neg, str2_chr_size);
        br(GE, MATCH);

      BIND(STR1_NEXT);
        (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, str1_chr_size);
        add(cnt2tmp, cnt2tmp, str2_chr_size);
        br(LT, STR1_NEXT);
        b(MATCH);

      BIND(DOSHORT);
      if (str1_isL == str2_isL) {
        cmp(cnt1, (u1)2);
        br(LT, DO1);
        br(GT, DO3);
      }
    }

    if (icnt1 == 4) {
      Label CH1_LOOP;

        (this->*load_4chr)(ch1, str1);
        sub(result_tmp, cnt2, 4);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);

      BIND(CH1_LOOP);
        (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
      Label CH1_LOOP;

      BIND(DO2);
        (this->*load_2chr)(ch1, str1);
        if (icnt1 == 2) {
          sub(result_tmp, cnt2, 2);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(CH1_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        (this->*load_2chr)(first, str1);
        (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
        if (icnt1 == 3) {
          sub(result_tmp, cnt2, 3);
        }
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
      BIND(FIRST_LOOP);
        (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);
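  // Classic SWAR zero-detection in the loop below: XORing each 16-bit lane
  // with the replicated char turns a match into a zero lane, and
  // (x - 0x0001) & ~x & 0x8000, computed per lane via tmp3 and the 0x7fff
  // mask, is non-zero iff some lane of x is zero.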

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
                                                Register ch, Register result,
                                                FloatRegister ztmp1,
                                                FloatRegister ztmp2,
                                                PRegister tmp_pg,
                                                PRegister tmp_pdn, bool isL)
{
  // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
  assert(tmp_pg->is_governing(),
         "this register has to be a governing predicate register");

  Label LOOP, MATCH, DONE, NOMATCH;
  Register vec_len = rscratch1;
  Register idx = rscratch2;

  SIMD_RegVariant T = isL ? B : H;

  cbz(cnt1, NOMATCH);

  // Assign the particular char throughout the vector.
  sve_dup(ztmp2, T, ch);
  if (isL) {
    sve_cntb(vec_len);
  } else {
    sve_cnth(vec_len);
  }
  mov(idx, 0);

  // Generate a predicate to control the reading of input string.
  sve_whilelt(tmp_pg, T, idx, cnt1);

  BIND(LOOP);
    // Read a vector of 8- or 16-bit data depending on the string type. Note
    // that inactive elements indicated by the predicate register won't cause
    // a data read from memory to the destination vector.
    if (isL) {
      sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
    } else {
      sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
    }
    add(idx, idx, vec_len);

    // Perform the comparison. An element of the destination predicate is set
    // to active if the particular char is matched.
    sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);

    // Branch if the particular char is found.
    br(NE, MATCH);

    sve_whilelt(tmp_pg, T, idx, cnt1);
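    // whilelt sets the N flag from the first predicate element, i.e.
    // N == 1 while idx < cnt1, so MI loops back while input remains.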

    // Loop back if the particular char not found.
    br(MI, LOOP);

  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);

  BIND(MATCH);
    // Undo the index increment.
    sub(idx, idx, vec_len);

    // Crop the vector to find its location.
    sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
    add(result, idx, -1);
    sve_incp(result, T, tmp_pdn);
  BIND(DONE);
}

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
                                            Register ch, Register result,
                                            Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cbz(cnt1, NOMATCH);

  cmp(cnt1, (u1)8);
  br(LT, DO1_SHORT);

  orr(ch, ch, ch, LSL, 8);
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 8);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1));
  sub(cnt1_neg, zr, cnt1);

  mov(tmp3, 0x0101010101010101);
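  // Same SWAR zero-detection as in string_indexof_char above, here with
  // 8-bit lanes (0x01/0x7f/0x80 per byte instead of per halfword).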

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1));
    sub(cnt1_neg, zr, cnt1);
  BIND(DO1_LOOP);
    ldrb(ch1, Address(str1, cnt1_neg));
    cmp(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 1);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    add(result, result_tmp, cnt1_neg);
  BIND(DONE);
}

// Compare strings.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
    PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
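  // From here on, result holds cnt1 - cnt2; if the strings are equal over
  // their common prefix, this length difference is the final answer.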

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      ldr(tmp1, Address(str1));
      ldrs(vtmp, Address(str2));
      cmp(cnt2, stub_threshold);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFF);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFF);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFF);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
    RuntimeAddress stub = nullptr;
    switch(ae) {
      case StrIntrinsicNode::LL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
        break;
      case StrIntrinsicNode::UU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
        break;
      case StrIntrinsicNode::LU:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
        break;
      case StrIntrinsicNode::UL:
        stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
        break;
      default:
        ShouldNotReachHere();
    }
    assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
    address call = trampoline_call(stub);
    if (call == nullptr) {
      DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
      ciEnv::current()->record_failure("CodeCache is full");
      return;
    }
    b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // Arrange code to do most branches while loading, and load the next
  // characters while comparing the previous ones.
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}

void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
                                     FloatRegister src2, Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  FloatRegister zn = src1, zm = src2;
  bool needs_negation = false;
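  // NEON register-register compares only exist for GT/GE/HI/HS/EQ, so the
  // remaining conditions are derived by swapping the operands (LT/LE/LO/LS)
  // or by negating an EQ result (NE).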
  switch (cond) {
    case LT: cond = GT; zn = src2; zm = src1; break;
    case LE: cond = GE; zn = src2; zm = src1; break;
    case LO: cond = HI; zn = src2; zm = src1; break;
    case LS: cond = HS; zn = src2; zm = src1; break;
    case NE: cond = EQ; needs_negation = true; break;
    default:
      break;
  }

  if (is_floating_point_type(bt)) {
    fcm(cond, dst, size, zn, zm);
  } else {
    cm(cond, dst, size, zn, zm);
  }

  if (needs_negation) {
    notr(dst, isQ ? T16B : T8B, dst);
  }
}

void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
                                          Condition cond, bool isQ) {
  SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
  if (bt == T_FLOAT || bt == T_DOUBLE) {
    if (cond == Assembler::NE) {
      fcm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      fcm(cond, dst, size, src);
    }
  } else {
    if (cond == Assembler::NE) {
      cm(Assembler::EQ, dst, size, src);
      notr(dst, isQ ? T16B : T8B, dst);
    } else {
      cm(cond, dst, size, src);
    }
  }
}

// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D
}

// Pack the lowest-numbered bit of each mask element in src into a long value
// in dst, at most the first 64 lane elements.
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
                                         FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
  assert_different_registers(dst, rscratch1);
  assert_different_registers(vtmp1, vtmp2);

  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
  // Expected:  dst = 0x658D

  // Convert the mask into vector with sequential bytes.
  // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
  sve_cpy(vtmp1, size, src, 1, false);
  if (bt != T_BYTE) {
    sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
  }

  if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
    // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
    // is to compress each significant bit of the byte in a cross-lane way. Due
    // to the lack of a cross-lane bit-compress instruction, we use BEXT
    // (bit-compress in each lane) with the biggest lane size (T = D) then
    // concatenate the results.

    // The second source input of BEXT, initialized with 0x01 in each byte.
    // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
    sve_dup(vtmp2, B, 1);

    // BEXT vtmp1.D, vtmp1.D, vtmp2.D
    // vtmp1 = 0x0001010000010001 | 0x0100000001010001
    // vtmp2 = 0x0101010101010101 | 0x0101010101010101
    //         ---------------------------------------
    // vtmp1 = 0x0000000000000065 | 0x000000000000008D
    sve_bext(vtmp1, D, vtmp1, vtmp2);

    // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the
    // result to dst.
    // vtmp1 = 0x0000000000000000 | 0x000000000000658D
    // dst   = 0x658D
    if (lane_cnt <= 8) {
      // No need to concatenate.
      umov(dst, vtmp1, B, 0);
    } else if (lane_cnt <= 16) {
      ins(vtmp1, B, vtmp1, 1, 8);
      umov(dst, vtmp1, H, 0);
    } else {
      // As the lane count is 64 at most, the final expected value must be in
      // the lowest 64 bits after narrowing vtmp1 from D to B.
      sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
      umov(dst, vtmp1, D, 0);
    }
  } else if (UseSVE > 0) {
    // Compress the lowest 8 bytes.
    fmovd(dst, vtmp1);
    bytemask_compress(dst);
    if (lane_cnt <= 8) return;

    // Repeat on higher bytes and join the results.
    // Compress 8 bytes in each iteration.
    for (int idx = 1; idx < (lane_cnt / 8); idx++) {
      sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
      bytemask_compress(rscratch1);
      orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
    }
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Unpack the mask, a long value in src, into predicate register dst based on the
// corresponding data type. Note that dst can support at most 64 lanes.
// Below example gives the expected dst predicate register in different types, with
// a valid src(0x658D) on a 1024-bit vector size machine.
// BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
// INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
// LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
//
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
// has 24 significant bits would be an invalid input if dst predicate register refers to
// a LONG type 1024-bit vector, which has at most 16 lanes.
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
                                           FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
         lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
  // Expected:  dst = 0b01100101 10001101

  // Put the long value from the general purpose register into the first lane
  // of the vector.
  // vtmp1 = 0x0000000000000000 | 0x000000000000658D
  sve_dup(vtmp1, B, 0);
  mov(vtmp1, D, 0, src);

  // As sve_cmp generates the mask value with the minimum unit in byte, we
  // should transform the value in the first lane, which is a mask in bits
  // now, into a mask in bytes. This can be done with SVE2's BDEP instruction.

  // The first source input of the BDEP instruction. Deposit each byte into
  // every 8 bytes.
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  if (lane_cnt <= 8) {
    // Nothing to do here, as only one byte exists.
  } else if (lane_cnt <= 16) {
    ins(vtmp1, B, vtmp1, 8, 1);
    mov(vtmp1, B, 1, zr);
  } else {
    sve_vector_extend(vtmp1, D, vtmp1, B);
  }

  // The second source input of the BDEP instruction, initialized with 0x01
  // for each byte.
  // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
  sve_dup(vtmp2, B, 1);

  // BDEP vtmp1.D, vtmp1.D, vtmp2.D
  // vtmp1 = 0x0000000000000065 | 0x000000000000008D
  // vtmp2 = 0x0101010101010101 | 0x0101010101010101
  //         ---------------------------------------
  // vtmp1 = 0x0001010000010001 | 0x0100000001010001
  sve_bdep(vtmp1, D, vtmp1, vtmp2);

  if (bt != T_BYTE) {
    sve_vector_extend(vtmp1, size, vtmp1, B);
  }
  // Generate mask according to the given vector, in which the elements have been
  // extended to expected type.
  // dst = 0b01100101 10001101
  sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
}
1545

1546
// Clobbers: rflags
1547
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1548
                                    FloatRegister zn, FloatRegister zm, Condition cond) {
1549
  assert(pg->is_governing(), "This register has to be a governing predicate register");
1550
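  // Swap the operands for LE/LT/LO/LS so that only the GE/GT/HI/HS forms (plus
  // EQ/NE) need to be handled below: a LE b is equivalent to b GE a, and so on.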
  FloatRegister z1 = zn, z2 = zm;
  switch (cond) {
    case LE: z1 = zm; z2 = zn; cond = GE; break;
    case LT: z1 = zm; z2 = zn; cond = GT; break;
    case LO: z1 = zm; z2 = zn; cond = HI; break;
    case LS: z1 = zm; z2 = zn; cond = HS; break;
    default:
      break;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (is_floating_point_type(bt)) {
    sve_fcm(cond, pd, size, pg, z1, z2);
  } else {
    assert(is_integral_type(bt), "unsupported element type");
    sve_cmp(cond, pd, size, pg, z1, z2);
  }
}

// Get index of the last mask lane that is set
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
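  // Reverse the predicate, count the lanes in front of the (now first) active
  // lane, and subtract that count from the last lane index. E.g. for T_BYTE
  // with 16 lanes and the last active lane at index 11: after sve_rev the
  // first active lane is at index 4, sve_brkb activates the 4 lanes before it,
  // sve_cntp yields 4, and dst = 15 - 4 = 11.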
  SIMD_RegVariant size = elemType_to_regVariant(bt);
  sve_rev(ptmp, size, src);
  sve_brkb(ptmp, ptrue, ptmp, false);
  sve_cntp(dst, size, ptrue, ptmp);
  movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
  subw(dst, rscratch1, dst);
}

// Extend integer vector src to dst with the same lane count
// but larger element size, e.g. 4B -> 4I
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
                                           FloatRegister src, BasicType src_bt, bool is_unsigned) {
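  // Each _xshll below emits USHLL/SSHLL with a zero shift, i.e. the UXTL/SXTL
  // aliases, which zero- or sign-extend every element to twice its size
  // depending on is_unsigned.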
  if (src_bt == T_BYTE) {
    if (dst_bt == T_SHORT) {
      // 4B/8B to 4S/8S
      _xshll(is_unsigned, dst, T8H, src, T8B, 0);
    } else {
      // 4B to 4I
      assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
      _xshll(is_unsigned, dst, T8H, src, T8B, 0);
      _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
    }
  } else if (src_bt == T_SHORT) {
    // 4S to 4I
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
    _xshll(is_unsigned, dst, T4S, src, T4H, 0);
  } else if (src_bt == T_INT) {
    // 2I to 2L
    assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
    _xshll(is_unsigned, dst, T2D, src, T2S, 0);
  } else {
    ShouldNotReachHere();
  }
}

// Narrow integer vector src down to dst with the same lane count
// but smaller element size, e.g. 4I -> 4B
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
                                           FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
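  // XTN keeps the low half of each source element, so narrowing by a factor of
  // four (e.g. 4I -> 4B) is done in two steps through the intermediate size.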
  if (src_bt == T_SHORT) {
    // 4S/8S to 4B/8B
    assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE, "unsupported");
    xtn(dst, T8B, src, T8H);
  } else if (src_bt == T_INT) {
    // 4I to 4B/4S
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
    xtn(dst, T4H, src, T4S);
    if (dst_bt == T_BYTE) {
      xtn(dst, T8B, dst, T8H);
    }
  } else if (src_bt == T_LONG) {
    // 2L to 2I
    assert(src_vlen_in_bytes == 16, "unsupported");
    assert(dst_bt == T_INT, "unsupported");
    xtn(dst, T2S, src, T2D);
  } else {
    ShouldNotReachHere();
  }
}

void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          bool is_unsigned) {
  assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");

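  // Each _sve_xunpk (UUNPKLO/SUNPKLO) widens the low-half elements to twice
  // their size, so extending by a factor of 2^n takes n unpack steps.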
  if (src_size == B) {
    switch (dst_size) {
    case H:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      break;
    case S:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
      break;
    case D:
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == H) {
    if (dst_size == S) {
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
    } else { // D
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
      _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
    }
  } else if (src_size == S) {
    _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
  }
}

// Vector narrow from src to dst with specified element sizes.
// The high part of the dst vector will be filled with zero.
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
                                          FloatRegister src, SIMD_RegVariant src_size,
                                          FloatRegister tmp) {
  assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
  assert_different_registers(src, tmp);
  sve_dup(tmp, src_size, 0);
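  // Each sve_uzp1 concatenates the even-numbered elements of its two sources
  // at half the element size; with tmp zeroed, the low half of dst receives
  // the narrowed elements and the high half is filled with zero.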
  if (src_size == D) {
    switch (dst_size) {
    case S:
      sve_uzp1(dst, S, src, tmp);
      break;
    case H:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      break;
    case B:
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, S, src, tmp);
      sve_uzp1(dst, H, dst, tmp);
      sve_uzp1(dst, B, dst, tmp);
      break;
    default:
      ShouldNotReachHere();
    }
  } else if (src_size == S) {
    if (dst_size == H) {
      sve_uzp1(dst, H, src, tmp);
    } else { // B
      assert_different_registers(dst, tmp);
      sve_uzp1(dst, H, src, tmp);
      sve_uzp1(dst, B, dst, tmp);
    }
  } else if (src_size == H) {
    sve_uzp1(dst, B, src, tmp);
  }
}

// Extend src predicate to dst predicate with the same lane count but larger
// element size, e.g. 64Byte -> 512Long
void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
                                             uint dst_element_length_in_bytes,
                                             uint src_element_length_in_bytes) {
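  // Each sve_punpklo doubles the per-lane size of the low-half predicate
  // elements, so a size ratio of 2/4/8 needs one/two/three unpack steps.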
  if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
  } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
  } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
    sve_punpklo(dst, src);
    sve_punpklo(dst, dst);
    sve_punpklo(dst, dst);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Narrow src predicate to dst predicate with the same lane count but
// smaller element size, e.g. 512Long -> 64Byte
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
                                             uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
  // The insignificant bits in src predicate are expected to be zero.
  // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
  // passed as the second argument. An example narrowing operation with a given mask would be:
  // 128Long -> 64Int on a 128-bit machine, i.e. 2L -> 2I
  // Mask (for 2 Longs) : TF
  // Predicate register for the above mask (16 bits) : 00000001 00000000
  // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
  // Which translates to a mask for 2 integers as : TF (lower half is considered while upper half is 0)
  assert_different_registers(src, ptmp);
  assert_different_registers(dst, ptmp);
  sve_pfalse(ptmp);
  if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
    sve_uzp1(dst, B, src, ptmp);
  } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
    sve_uzp1(dst, H, src, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
    sve_uzp1(dst, S, src, ptmp);
    sve_uzp1(dst, H, dst, ptmp);
    sve_uzp1(dst, B, dst, ptmp);
  } else {
    assert(false, "unsupported");
    ShouldNotReachHere();
  }
}

// Vector reduction add for integral type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_add_integral {");
    switch(bt) {
      case T_BYTE:
        addv(vtmp, isQ ? T16B : T8B, vsrc);
        smov(dst, vtmp, B, 0);
        addw(dst, dst, isrc, ext::sxtb);
        break;
      case T_SHORT:
        addv(vtmp, isQ ? T8H : T4H, vsrc);
        smov(dst, vtmp, H, 0);
        addw(dst, dst, isrc, ext::sxth);
        break;
      case T_INT:
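        // ADDV has no 2S arrangement, so the two-lane case uses a pairwise add.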
        isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
        umov(dst, vtmp, S, 0);
        addw(dst, dst, isrc);
        break;
      case T_LONG:
        assert(isQ, "unsupported");
        addpd(vtmp, vsrc);
        umov(dst, vtmp, D, 0);
        add(dst, dst, isrc);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_add_integral");
}

// Vector reduction multiply for integral type with ASIMD instructions.
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
// Clobbers: rscratch1
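// ASIMD has no across-lanes multiply reduction instruction, so the vector is
// folded by repeatedly multiplying the high half into the low half until a
// single lane remains, which is then combined with isrc.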
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
                                                 Register isrc, FloatRegister vsrc,
                                                 unsigned vector_length_in_bytes,
                                                 FloatRegister vtmp1, FloatRegister vtmp2) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_integral {");
    switch(bt) {
      case T_BYTE:
        if (isQ) {
          // Iteratively multiply the lower and upper halves of the vector.
          // vtmp1 = vsrc[8:15]
          ins(vtmp1, D, vsrc, 0, 1);
          // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
          mulv(vtmp1, T8B, vtmp1, vsrc);
          // vtmp2 = vtmp1[4:7]
          ins(vtmp2, S, vtmp1, 0, 1);
          // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
          mulv(vtmp1, T8B, vtmp2, vtmp1);
        } else {
          ins(vtmp1, S, vsrc, 0, 1);
          mulv(vtmp1, T8B, vtmp1, vsrc);
        }
        // vtmp2 = vtmp1[2:3]
        ins(vtmp2, H, vtmp1, 0, 1);
        // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
        mulv(vtmp2, T8B, vtmp2, vtmp1);
        // dst = vtmp2[0] * isrc * vtmp2[1]
        umov(rscratch1, vtmp2, B, 0);
        mulw(dst, rscratch1, isrc);
        sxtb(dst, dst);
        umov(rscratch1, vtmp2, B, 1);
        mulw(dst, rscratch1, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        if (isQ) {
          ins(vtmp2, D, vsrc, 0, 1);
          mulv(vtmp2, T4H, vtmp2, vsrc);
          ins(vtmp1, S, vtmp2, 0, 1);
          mulv(vtmp1, T4H, vtmp1, vtmp2);
        } else {
          ins(vtmp1, S, vsrc, 0, 1);
          mulv(vtmp1, T4H, vtmp1, vsrc);
        }
        umov(rscratch1, vtmp1, H, 0);
        mulw(dst, rscratch1, isrc);
        sxth(dst, dst);
        umov(rscratch1, vtmp1, H, 1);
        mulw(dst, rscratch1, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          ins(vtmp1, D, vsrc, 0, 1);
          mulv(vtmp1, T2S, vtmp1, vsrc);
        } else {
          vtmp1 = vsrc;
        }
        umov(rscratch1, vtmp1, S, 0);
        mul(dst, rscratch1, isrc);
        umov(rscratch1, vtmp1, S, 1);
        mul(dst, rscratch1, dst);
        break;
      case T_LONG:
        umov(rscratch1, vsrc, D, 0);
        mul(dst, isrc, rscratch1);
        umov(rscratch1, vsrc, D, 1);
        mul(dst, dst, rscratch1);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_mul_integral");
}

// Vector reduction multiply for floating-point type with ASIMD instructions.
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
                                           FloatRegister fsrc, FloatRegister vsrc,
                                           unsigned vector_length_in_bytes,
                                           FloatRegister vtmp) {
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_mul_fp {");
    switch(bt) {
      case T_FLOAT:
        fmuls(dst, fsrc, vsrc);
        ins(vtmp, S, vsrc, 0, 1);
        fmuls(dst, dst, vtmp);
        if (isQ) {
          ins(vtmp, S, vsrc, 0, 2);
          fmuls(dst, dst, vtmp);
          ins(vtmp, S, vsrc, 0, 3);
          fmuls(dst, dst, vtmp);
        }
        break;
      case T_DOUBLE:
        assert(isQ, "unsupported");
        fmuld(dst, fsrc, vsrc);
        ins(vtmp, D, vsrc, 0, 1);
        fmuld(dst, dst, vtmp);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_mul_fp");
}

// Helper to select logical instruction
void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
                                                   Register Rn, Register Rm,
                                                   enum shift_kind kind, unsigned shift) {
  switch(opc) {
    case Op_AndReductionV:
      is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_OrReductionV:
      is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
      break;
    case Op_XorReductionV:
      is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Vector reduction logical operations And, Or, Xor
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
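    // Move both halves of the vector into general purpose registers and fold
    // them together, then keep halving the width with shifted logical ops
    // until the result matches the element size.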
    umov(rscratch1, vsrc, isQ ? D : S, 0);
    umov(dst, vsrc, isQ ? D : S, 1);
    neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
    switch(bt) {
      case T_BYTE:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        break;
      case T_LONG:
        assert(isQ, "unsupported");
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_logical");
}

// Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and expected to be fnoreg for the T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min = opc == Op_MinReductionV;

  BLOCK_COMMENT("neon_reduce_minmax_integral {");
    if (bt == T_LONG) {
      assert(vtmp == fnoreg, "should be");
      assert(isQ, "should be");
      umov(rscratch1, vsrc, D, 0);
      cmp(isrc, rscratch1);
      csel(dst, isrc, rscratch1, is_min ? LT : GT);
      umov(rscratch1, vsrc, D, 1);
      cmp(dst, rscratch1);
      csel(dst, dst, rscratch1, is_min ? LT : GT);
    } else {
      SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
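      // SMINV/SMAXV have no 2S arrangement, so the two-lane case uses a
      // pairwise min/max instead.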
      if (size == T2S) {
        is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
      } else {
        is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
      }
      if (bt == T_INT) {
        umov(dst, vtmp, S, 0);
      } else {
        smov(dst, vtmp, elemType_to_regVariant(bt), 0);
      }
      cmpw(dst, isrc);
      cselw(dst, dst, isrc, is_min ? LT : GT);
    }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}

// Vector reduction for integral type with SVE instructions.
// Supported operations are Add, And, Or, Xor, Max, Min.
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
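  // Sub-word results are extracted with smov (sign-extending) so that the
  // following 32-bit scalar op sees a properly sign-extended value; T_INT and
  // T_LONG results are extracted with umov.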
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      sve_uaddv(tmp, size, pg, src2);
      if (bt == T_BYTE) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV: {
      sve_smaxv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::GT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::GT);
      }
      break;
    }
    case Op_MinReductionV: {
      sve_sminv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::LT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::LT);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}

// Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
// to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all true if "lane_cnt" equals the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  switch(lane_cnt) {
  case 1: /* VL1 */
  case 2: /* VL2 */
  case 3: /* VL3 */
  case 4: /* VL4 */
  case 5: /* VL5 */
  case 6: /* VL6 */
  case 7: /* VL7 */
  case 8: /* VL8 */
    sve_ptrue(dst, size, lane_cnt);
    return;
  case 16:
    sve_ptrue(dst, size, /* VL16 */ 0b01001);
    return;
  case 32:
    sve_ptrue(dst, size, /* VL32 */ 0b01010);
    return;
  case 64:
    sve_ptrue(dst, size, /* VL64 */ 0b01011);
    return;
  case 128:
    sve_ptrue(dst, size, /* VL128 */ 0b01100);
    return;
  case 256:
    sve_ptrue(dst, size, /* VL256 */ 0b01101);
    return;
  default:
    break;
  }

  // Special patterns for "ptrue".
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases.
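    // whilelt activates lane i while (0 + i) < lane_cnt, i.e. exactly the
    // first lane_cnt lanes.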
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}

// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2);
  assert_different_registers(mask, pgtmp);

  // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
  //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
  // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_dup(vtmp2, H, 0);

  // Extend the lowest half to type INT.
  // dst = 00004444 00003333 00002222 00001111
  sve_uunpklo(dst, S, src);
  // pgtmp = 00000001 00000000 00000001 00000001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remaining lanes with zero.
  // dst = 00000000 00004444 00002222 00001111
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 0000 0000 0000 0000 0000 4444 2222 1111
  sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of the lowest half.
  // rscratch1 = 3
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat for the highest half.
  // pgtmp = 00000001 00000000 00000000 00000001
  sve_punpkhi(pgtmp, mask);
  // vtmp1 = 00008888 00007777 00006666 00005555
  sve_uunpkhi(vtmp1, S, src);
  // vtmp1 = 00000000 00000000 00008888 00005555
  sve_compact(vtmp1, S, vtmp1, pgtmp);
  // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  sve_uzp1(vtmp1, H, vtmp1, vtmp2);

  // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Left-shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
  neg(rscratch1, rscratch1);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, H, rscratch1, 1);
  // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
  sve_tbl(vtmp1, H, vtmp1, vtmp2);

  // Combine the compressed high part (after shifting) with the compressed low part.
  // dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_orr(dst, dst, vtmp1);
}

// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          FloatRegister vtmp3, FloatRegister vtmp4,
                                          PRegister ptmp, PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
  assert_different_registers(mask, ptmp, pgtmp);
  // Example input:   src   = 88 77 66 55 44 33 22 11
  //                  mask  = 01 00 00 01 01 00 01 01
  // Expected result: dst   = 00 00 00 88 55 44 22 11

  sve_dup(vtmp4, B, 0);
  // Extend the lowest half to type SHORT.
  // vtmp1 = 0044 0033 0022 0011
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 0001 0000 0001 0001
  sve_punpklo(ptmp, mask);
  // Count the active elements of the lowest half.
  // rscratch2 = 3
  sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remaining lanes with zero.
  // dst = 0000 0044 0022 0011
  sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
  // Narrow the result back to type BYTE.
  // dst = 00 00 00 00 00 44 22 11
  sve_uzp1(dst, B, dst, vtmp4);

  // Repeat for the highest half.
  // ptmp = 0001 0000 0000 0001
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 0000 0000 0088 0055
  sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);

  sve_dup(vtmp4, B, 0);
  // vtmp1 = 00 00 00 00 00 00 88 55
  sve_uzp1(vtmp1, B, vtmp1, vtmp4);

  // Compressed low:   dst   = 00 00 00 00 00 44 22 11
  // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
  // Left-shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
  // TRUE_CNT is the number of active elements in the compressed low part.
  neg(rscratch2, rscratch2);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, B, rscratch2, 1);
  // vtmp1 = 00 00 00 88 55 00 00 00
  sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high part (after shifting) with the compressed low part.
  // dst = 00 00 00 88 55 44 22 11
  sve_orr(dst, dst, vtmp1);
}

void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  if (bt == T_BYTE) {
    rbit(dst, size, src);
  } else {
    neon_reverse_bytes(dst, src, bt, isQ);
    rbit(dst, size, dst);
  }
}

void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  switch (bt) {
    case T_BYTE:
      if (dst != src) {
        orr(dst, size, src, src);
      }
      break;
    case T_SHORT:
      rev16(dst, size, src);
      break;
    case T_INT:
      rev32(dst, size, src);
      break;
    case T_LONG:
      rev64(dst, size, src);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}

// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
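    // NEON umov/smov can only address the lower 128 bits, so rotate the wanted
    // element down to lane 0 with EXT; "idx << size" is the byte offset, since
    // "size" encodes log2 of the element size in bytes.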
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}

// java.lang.Math::round intrinsics

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
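  // Math.round is floor(x + 0.5), i.e. ties round toward positive infinity,
  // while fcvtas rounds ties away from zero; the two only disagree for
  // negative halfway inputs. Both candidates are computed below, and the
  // floor(src + 0.5) result is selected for lanes where src is negative with
  // a magnitude small enough (< 2^23 for float, < 2^52 for double) that it
  // may still have a fractional part; all other lanes keep the fcvtas result.
  // Negating src first lets a single unsigned integer compare on the bit
  // patterns classify the lanes.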
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

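  // Same selection as in vector_round_neon: lanes where src is negative with a
  // small enough magnitude take the floor(src + 0.5) path; all other lanes
  // keep the ties-away frinta result.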
  switch (T) {
    case S:
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}

void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
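  // BSL computes dst = (one & dst) | (src & ~dst): lanes where |src| > 0 take
  // 1.0's magnitude bits combined with src's sign bit, giving +-1.0, while
  // +-0.0 and NaN lanes pass src through unchanged.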
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}

void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
    assert_different_registers(dst, src, zero, one, vtmp);
    assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

    sve_orr(vtmp, src, src);
    sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
    switch (T) {
    case S:
      sve_and(vtmp, T, min_jint); // Extract the sign bit of the float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
    }
    sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                       // Result in dst
}

bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != nullptr) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}
