/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
#include "precompiled.hpp"
26
#include "asm/assembler.hpp"
27
#include "asm/assembler.inline.hpp"
28
#include "opto/c2_MacroAssembler.hpp"
29
#include "opto/compile.hpp"
30
#include "opto/intrinsicnode.hpp"
31
#include "opto/matcher.hpp"
32
#include "opto/output.hpp"
33
#include "opto/subnode.hpp"
34
#include "runtime/stubRoutines.hpp"
35
#include "utilities/globalDefinitions.hpp"
38
#define BLOCK_COMMENT(str) /* nothing */
39
#define STOP(error) stop(error)
41
#define BLOCK_COMMENT(str) block_comment(str)
42
#define STOP(error) block_comment(error); stop(error)
45
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
47
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// C2 fast path for monitorenter. The outcome is reported in the condition
// flags: EQ means the lock was acquired, NE means the slow path must be taken.
void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
                                  Register tmp2Reg, Register tmp3Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  // Load markWord from object into displaced_header.
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    ldrw(tmp, Address(tmp, Klass::access_flags_offset()));
    tstw(tmp, JVM_ACC_IS_VALUE_BASED_CLASS);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If condition is true we are cont and hence we can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread or a stack address.

  // Try to CAS m->owner from null to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::enter.
  mov(tmp, (address)markWord::unused_mark().value());
  str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rthread);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

// C2 fast path for monitorexit. As with fast_lock, EQ in the condition flags
// means success and NE means the slow path must be taken.
void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock; this is true if we see
    // the stack address of the basicLock in the markWord of the object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(disp_hdr, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, disp_hdr); // Will be 0 if both are 0.
  cmp(rscratch1, zr); // Sets flags for result
  cbnz(rscratch1, cont);
  // need a release store here
  lea(tmp, Address(tmp, ObjectMonitor::owner_offset()));
  stlr(zr, tmp); // set unowned

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register t1,
                                              Register t2, Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST branch to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(t1, obj);
    ldrw(t1, Address(t1, Klass::access_flags_offset()));
    tstw(t1, JVM_ACC_IS_VALUE_BASED_CLASS);
    br(Assembler::NE, slow_path);
  }

  const Register t1_mark = t1;

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
    Label push;

    const Register t2_top = t2;
    const Register t3_t = t3;

    // Check if lock-stack is full.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
    br(Assembler::GT, slow_path);

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, push);

    // Relaxed normal load to check for monitor. Optimization for monitor case.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Not inflated
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");

    // Try to lock. Transition lock-bits 0b01 => 0b00
    orr(t1_mark, t1_mark, markWord::unlocked_value);
    eor(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
    br(Assembler::NE, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    str(obj, Address(rthread, t2_top));
    addw(t2_top, t2_top, oopSize);
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    // mark contains the tagged ObjectMonitor*.
    const Register t1_tagged_monitor = t1_mark;
    const uintptr_t monitor_tag = markWord::monitor_value;
    const Register t2_owner_addr = t2;
    const Register t3_owner = t3;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_tagged_monitor, (in_bytes(ObjectMonitor::owner_offset()) - monitor_tag)));

    // CAS owner (null => current thread).
    cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true,
            /*release*/ false, /*weak*/ false, t3_owner);
    br(Assembler::EQ, locked);

    // Check if recursive.
    cmp(t3_owner, rthread);
    br(Assembler::NE, slow_path);

    // Recursive.
    increment(Address(t1_tagged_monitor, in_bytes(ObjectMonitor::recursions_offset()) - monitor_tag), 1);
  }

  bind(locked);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

#ifdef ASSERT
  // Check that locked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register t1, Register t2,
                                                Register t3) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert_different_registers(obj, t1, t2, t3);

  // Handle inflated monitor.
  Label inflated, inflated_load_monitor;
  // Finish fast unlock successfully. MUST branch to with flag == EQ
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  const Register t1_mark = t1;
  const Register t2_top = t2;
  const Register t3_t = t3;

  { // Lightweight unlock

    // Check if obj is top of lock-stack.
    ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    subw(t2_top, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    // Top of lock stack was not obj. Must be monitor.
    br(Assembler::NE, inflated_load_monitor);

    // Pop lock-stack.
    DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
    strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));

    // Check if recursive.
    subw(t3_t, t2_top, oopSize);
    ldr(t3_t, Address(rthread, t3_t));
    cmp(obj, t3_t);
    br(Assembler::EQ, unlocked);

    // Not recursive.
    // Load Mark.
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check header for monitor (0b10).
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);

    // Try to unlock. Transition lock bits 0b00 => 0b01
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
    orr(t3_t, t1_mark, markWord::unlocked_value);
    cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
            /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
    br(Assembler::EQ, unlocked);

    // Compare and exchange failed.
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
    addw(t2_top, t2_top, oopSize);
    str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
    b(slow_path);
  }

  { // Handle inflated monitor.
    bind(inflated_load_monitor);
    ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
#ifdef ASSERT
    tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    Label check_done;
    subw(t2_top, t2_top, oopSize);
    cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
    br(Assembler::LT, check_done);
    ldr(t3_t, Address(rthread, t2_top));
    cmp(obj, t3_t);
    br(Assembler::NE, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    // mark contains the tagged ObjectMonitor*.
    const Register t1_monitor = t1_mark;
    const uintptr_t monitor_tag = markWord::monitor_value;

    // Untag the monitor.
    sub(t1_monitor, t1_mark, monitor_tag);

    const Register t2_recursions = t2;
    Label not_recursive;

    // Check if recursive.
    ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    cbz(t2_recursions, not_recursive);

    // Recursive unlock.
    sub(t2_recursions, t2_recursions, 1u);
    str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
    // Set flag == EQ
    cmp(t2_recursions, t2_recursions);
    b(unlocked);

    bind(not_recursive);

    Label release;
    const Register t2_owner_addr = t2;

    // Compute owner address.
    lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));

    // Check if the entry lists are empty.
    ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
    ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
    orr(rscratch1, rscratch1, t3_t);
    cmp(rscratch1, zr);
    br(Assembler::EQ, release);

    // The owner may be anonymous and we removed the last obj entry in
    // the lock-stack. This loses the information about the owner.
    // Write the thread to the owner field so the runtime knows the owner.
    str(rthread, Address(t2_owner_addr));
    b(slow_path);

    bind(release);
    // Set owner to null.
    // Release to satisfy the JMM
    stlr(zr, t2_owner_addr);
  }

  bind(unlocked);
  decrement(Address(rthread, JavaThread::held_monitor_count_offset()));

#ifdef ASSERT
  // Check that unlocked label is reached with Flags == EQ.
  Label flag_correct;
  br(Assembler::EQ, flag_correct);
  stop("Fast Unlock Flag != EQ");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with Flags == NE.
  br(Assembler::NE, flag_correct);
  stop("Fast Unlock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
}

// Search for str1 in str2 and return index or -1
484
// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
485
void C2_MacroAssembler::string_indexof(Register str2, Register str1,
486
Register cnt2, Register cnt1,
487
Register tmp1, Register tmp2,
488
Register tmp3, Register tmp4,
489
Register tmp5, Register tmp6,
490
int icnt1, Register result, int ae) {
491
// NOTE: tmp5, tmp6 can be zr depending on specific method version
492
Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
494
Register ch1 = rscratch1;
495
Register ch2 = rscratch2;
496
Register cnt1tmp = tmp1;
497
Register cnt2tmp = tmp2;
498
Register cnt1_neg = cnt1;
499
Register cnt2_neg = cnt2;
500
Register result_tmp = tmp4;
502
bool isL = ae == StrIntrinsicNode::LL;
504
bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
505
bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
506
int str1_chr_shift = str1_isL ? 0:1;
507
int str2_chr_shift = str2_isL ? 0:1;
508
int str1_chr_size = str1_isL ? 1:2;
509
int str2_chr_size = str2_isL ? 1:2;
510
chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
511
(chr_insn)&MacroAssembler::ldrh;
512
chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
513
(chr_insn)&MacroAssembler::ldrh;
514
chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
515
chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
517
// Note, inline_string_indexOf() generates checks:
518
// if (substr.count > string.count) return -1;
519
// if (substr.count == 0) return 0;
521
// We have two strings, a source string in str2, cnt2 and a pattern string
522
// in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
524
// For larger pattern and source we use a simplified Boyer Moore algorithm.
525
// With a small pattern and source we use linear scan.
528
sub(result_tmp, cnt2, cnt1);
529
cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
530
br(LT, LINEARSEARCH);
531
dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
534
ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
538
// The Boyer Moore algorithm is based on the description here:-
540
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
542
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
543
// and the 'Good Suffix' rule.
545
// These rules are essentially heuristics for how far we can shift the
546
// pattern along the search string.
548
// The implementation here uses the 'Bad Character' rule only because of the
549
// complexity of initialisation for the 'Good Suffix' rule.
551
// This is also known as the Boyer-Moore-Horspool algorithm:-
553
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
555
// This particular implementation has few java-specific optimizations.
559
// int bm(unsigned char *x, int m, unsigned char *y, int n) {
562
// unsigned char bc[ASIZE];
564
// /* Preprocessing */
565
// for (i = 0; i < ASIZE; ++i)
567
// for (i = 0; i < m - 1; ) {
570
// // c < 256 for Latin1 string, so, no need for branch
571
// #ifdef PATTERN_STRING_IS_LATIN1
574
// if (c < ASIZE) bc[c] = m - i;
580
// while (j <= n - m) {
583
// for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
584
// if (i < 0) return j;
585
// // c < 256 for Latin1 string, so, no need for branch
586
// #ifdef SOURCE_STRING_IS_LATIN1
587
// // LL case: (c< 256) always true. Remove branch
590
// #ifndef PATTERN_STRING_IS_UTF
591
// // UU case: need if (c<ASIZE) check. Skip 1 character if not.
597
// #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
598
// // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
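//
// In other words, bc[c] is the usual Horspool shift for character c: the
// distance from the rightmost occurrence of c among the first m-1 pattern
// characters to the end of the pattern, and m if c does not occur there.
// For example, for the pattern "abcab" this gives bc['a'] = 1, bc['b'] = 3,
// bc['c'] = 2 and 5 for every other character, so a window whose last source
// character is 'c' may be shifted along by 2.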
608
Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
609
BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
610
Register cnt1end = tmp2;
611
Register str2end = cnt2;
612
Register skipch = tmp2;
614
// str1 length is >=8, so, we can read at least 1 register for cases when
615
// UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
616
// UL case. We'll re-read last character in inner pre-loop code to have
617
// single outer pre-loop load
618
const int firstStep = isL ? 7 : 3;
620
const int ASIZE = 256;
621
const int STORED_BYTES = 32; // amount of bytes stored per instruction
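// The ASIZE-byte bad-character table lives on the stack. v0 already holds the
// pattern length replicated into all 16 byte lanes (see the dup above), so
// each stpq below writes 32 table bytes and ASIZE/STORED_BYTES iterations
// initialize the whole table with the default shift value cnt1.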
623
mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
626
stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
628
br(GT, BM_INIT_LOOP);
630
sub(cnt1tmp, cnt1, 1);
632
add(str2end, str2, result_tmp, LSL, str2_chr_shift);
636
(this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
638
subs(zr, ch1, ASIZE);
641
strb(ch2, Address(sp, ch1));
646
add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
647
if (str1_isL == str2_isL) {
648
// load last 8 bytes (8LL/4UU symbols)
649
ldr(tmp6, Address(tmp6, -wordSize));
651
ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
652
// convert Latin1 to UTF. We'll have to wait until load completed, but
653
// it's still faster than per-character loads+checks
654
lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
655
ubfx(ch1, tmp6, 8, 8); // str1[N-2]
656
ubfx(ch2, tmp6, 16, 8); // str1[N-3]
657
andr(tmp6, tmp6, 0xFF); // str1[N-4]
658
orr(ch2, ch1, ch2, LSL, 16);
659
orr(tmp6, tmp6, tmp3, LSL, 48);
660
orr(tmp6, tmp6, ch2, LSL, 16);
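// tmp6 now holds the last four pattern characters zero-extended to 16 bits
// each: the Latin1->UTF conversion mentioned above, done with shifts and ORs
// instead of per-character loads.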
663
(this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
664
sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
665
if (str1_isL == str2_isL) {
666
// re-init tmp3. It's for free because it's executed in parallel with
667
// load above. Alternative is to initialize it before loop, but it'll
668
// affect performance on in-order systems with 2 or more ld/st pipelines
669
lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
671
if (!isL) { // UU/UL case
672
lsl(ch2, cnt1tmp, 1); // offset in bytes
676
ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
679
b(BMLOOPSTR1_AFTER_LOAD);
681
sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
685
(this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
686
(this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
687
BIND(BMLOOPSTR1_AFTER_LOAD);
688
subs(cnt1tmp, cnt1tmp, 1);
689
br(LT, BMLOOPSTR1_LASTCMP);
690
BIND(BMLOOPSTR1_CMP);
695
// if we've met UTF symbol while searching Latin1 pattern, then we can
// skip cnt1 symbols
697
if (str1_isL != str2_isL) {
698
mov(result_tmp, cnt1);
702
subs(zr, skipch, ASIZE);
705
ldrb(result_tmp, Address(sp, skipch)); // load skip distance
707
sub(cnt1tmp, cnt1, 1);
708
add(str2, str2, result_tmp, LSL, str2_chr_shift);
713
BIND(BMLOOPSTR1_LASTCMP);
717
sub(result, str2, tmp5);
718
if (!str2_isL) lsr(result, result, 1);
723
cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
724
br(LT, LINEAR_MEDIUM);
726
RuntimeAddress stub = nullptr;
728
stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
729
assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
730
} else if (str1_isL) {
731
stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
732
assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
734
stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
735
assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
737
address call = trampoline_call(stub);
738
if (call == nullptr) {
739
DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
740
ciEnv::current()->record_failure("CodeCache is full");
750
Register str2tmp = tmp2;
751
Register first = tmp3;
755
Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
757
cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
760
(this->*str1_load_1chr)(first, Address(str1));
761
lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
762
sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
763
lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
764
sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
767
(this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
771
adds(cnt2_neg, cnt2_neg, str2_chr_size);
776
adds(cnt1tmp, cnt1_neg, str1_chr_size);
777
add(cnt2tmp, cnt2_neg, str2_chr_size);
781
(this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
782
(this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
785
adds(cnt1tmp, cnt1tmp, str1_chr_size);
786
add(cnt2tmp, cnt2tmp, str2_chr_size);
791
if (str1_isL == str2_isL) {
801
(this->*load_4chr)(ch1, str1);
802
sub(result_tmp, cnt2, 4);
803
lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
804
sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
807
(this->*load_4chr)(ch2, Address(str2, cnt2_neg));
810
adds(cnt2_neg, cnt2_neg, str2_chr_size);
815
if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
819
(this->*load_2chr)(ch1, str1);
821
sub(result_tmp, cnt2, 2);
823
lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
824
sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
826
(this->*load_2chr)(ch2, Address(str2, cnt2_neg));
829
adds(cnt2_neg, cnt2_neg, str2_chr_size);
834
if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
835
Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
838
(this->*load_2chr)(first, str1);
839
(this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
841
sub(result_tmp, cnt2, 3);
843
lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
844
sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
846
(this->*load_2chr)(ch2, Address(str2, cnt2_neg));
850
adds(cnt2_neg, cnt2_neg, str2_chr_size);
855
add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
856
(this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
862
if (icnt1 == -1 || icnt1 == 1) {
863
Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
866
(this->*str1_load_1chr)(ch1, str1);
870
sub(result_tmp, cnt2, 8/str2_chr_size);
871
sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
872
mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
873
lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
876
orr(ch1, ch1, ch1, LSL, 8);
878
orr(ch1, ch1, ch1, LSL, 16);
879
orr(ch1, ch1, ch1, LSL, 32);
881
ldr(ch2, Address(str2, cnt2_neg));
eor(ch2, ch1, ch2);
sub(tmp1, ch2, tmp3);
884
orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
885
bics(tmp1, tmp1, tmp2);
887
adds(cnt2_neg, cnt2_neg, 8);
890
cmp(cnt2_neg, (u1)8);
898
add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
902
mov(result_tmp, cnt2);
903
lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
904
sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
906
(this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
909
adds(cnt2_neg, cnt2_neg, str2_chr_size);
917
add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
921
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
922
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
924
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
925
Register ch, Register result,
926
Register tmp1, Register tmp2, Register tmp3)
928
Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
929
Register cnt1_neg = cnt1;
930
Register ch1 = rscratch1;
931
Register result_tmp = rscratch2;
938
orr(ch, ch, ch, LSL, 16);
939
orr(ch, ch, ch, LSL, 32);
942
mov(result_tmp, cnt1);
943
lea(str1, Address(str1, cnt1, Address::uxtw(1)));
944
sub(cnt1_neg, zr, cnt1, LSL, 1);
946
mov(tmp3, 0x0001000100010001);
949
ldr(ch1, Address(str1, cnt1_neg));
eor(ch1, ch, ch1);
sub(tmp1, ch1, tmp3);
952
orr(tmp2, ch1, 0x7fff7fff7fff7fff);
953
bics(tmp1, tmp1, tmp2);
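// SWAR zero-lane test: tmp1 = (ch1 - 0x0001000100010001) & ~ch1 &
// 0x8000800080008000 is non-zero iff some 16-bit lane of ch1 is zero. Since
// ch1 is the loaded text XOR-ed with the broadcast search character, a zero
// lane marks a match.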
955
adds(cnt1_neg, cnt1_neg, 8);
958
cmp(cnt1_neg, (u1)8);
966
add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
970
mov(result_tmp, cnt1);
971
lea(str1, Address(str1, cnt1, Address::uxtw(1)));
972
sub(cnt1_neg, zr, cnt1, LSL, 1);
974
ldrh(ch1, Address(str1, cnt1_neg));
977
adds(cnt1_neg, cnt1_neg, 2);
983
add(result, result_tmp, cnt1_neg, ASR, 1);
987
void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
988
Register ch, Register result,
992
PRegister tmp_pdn, bool isL)
994
// Note that `tmp_pdn` should *NOT* be used as governing predicate register.
995
assert(tmp_pg->is_governing(),
996
"this register has to be a governing predicate register");
998
Label LOOP, MATCH, DONE, NOMATCH;
999
Register vec_len = rscratch1;
1000
Register idx = rscratch2;
1002
SIMD_RegVariant T = (isL == true) ? B : H;
1006
// Assign the particular char throughout the vector.
1007
sve_dup(ztmp2, T, ch);
1015
// Generate a predicate to control the reading of input string.
1016
sve_whilelt(tmp_pg, T, idx, cnt1);
1019
// Read a vector of 8- or 16-bit data depending on the string type. Note
1020
// that inactive elements indicated by the predicate register won't cause
1021
// a data read from memory to the destination vector.
1023
sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1025
sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1027
add(idx, idx, vec_len);
1029
// Perform the comparison. An element of the destination predicate is set
1030
// to active if the particular char is matched.
1031
sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1033
// Branch if the particular char is found.
1036
sve_whilelt(tmp_pg, T, idx, cnt1);
1038
// Loop back if the particular char is not found.
1046
// Undo the index increment.
1047
sub(idx, idx, vec_len);
1049
// Crop the vector to find its location.
1050
sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1051
add(result, idx, -1);
1052
sve_incp(result, T, tmp_pdn);
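// After the BRKA above, tmp_pdn has active lanes only up to and including the
// first match, so incrementing result (which started at idx - 1) by the
// active-lane count yields the element index of the first matching character.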
1056
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1057
Register ch, Register result,
1058
Register tmp1, Register tmp2, Register tmp3)
1060
Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1061
Register cnt1_neg = cnt1;
1062
Register ch1 = rscratch1;
1063
Register result_tmp = rscratch2;
1070
orr(ch, ch, ch, LSL, 8);
1071
orr(ch, ch, ch, LSL, 16);
1072
orr(ch, ch, ch, LSL, 32);
1075
mov(result_tmp, cnt1);
1076
lea(str1, Address(str1, cnt1));
1077
sub(cnt1_neg, zr, cnt1);
1079
mov(tmp3, 0x0101010101010101);
1082
ldr(ch1, Address(str1, cnt1_neg));
eor(ch1, ch, ch1);
sub(tmp1, ch1, tmp3);
1085
orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1086
bics(tmp1, tmp1, tmp2);
1088
adds(cnt1_neg, cnt1_neg, 8);
1091
cmp(cnt1_neg, (u1)8);
1099
add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1103
mov(result_tmp, cnt1);
1104
lea(str1, Address(str1, cnt1));
1105
sub(cnt1_neg, zr, cnt1);
1107
ldrb(ch1, Address(str1, cnt1_neg));
1110
adds(cnt1_neg, cnt1_neg, 1);
1116
add(result, result_tmp, cnt1_neg);
1121
void C2_MacroAssembler::string_compare(Register str1, Register str2,
1122
Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1123
FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1124
PRegister pgtmp1, PRegister pgtmp2, int ae) {
1125
Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1126
DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1127
SHORT_LOOP_START, TAIL_CHECK;
1129
bool isLL = ae == StrIntrinsicNode::LL;
1130
bool isLU = ae == StrIntrinsicNode::LU;
1131
bool isUL = ae == StrIntrinsicNode::UL;
1133
// The stub threshold for LL strings is: 72 (64 + 8) chars
1134
// UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1135
// LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1136
const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1138
bool str1_isL = isLL || isLU;
1139
bool str2_isL = isLL || isUL;
1141
int str1_chr_shift = str1_isL ? 0 : 1;
1142
int str2_chr_shift = str2_isL ? 0 : 1;
1143
int str1_chr_size = str1_isL ? 1 : 2;
1144
int str2_chr_size = str2_isL ? 1 : 2;
1145
int minCharsInWord = isLL ? wordSize : wordSize/2;
1147
FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1148
chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1149
(chr_insn)&MacroAssembler::ldrh;
1150
chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1151
(chr_insn)&MacroAssembler::ldrh;
1152
uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1153
(uxt_insn)&MacroAssembler::uxthw;
1155
BLOCK_COMMENT("string_compare {");
1157
// Bizarrely, the counts are passed in bytes, regardless of whether they
// are L or U strings; however, the result is always in characters.
1159
if (!str1_isL) asrw(cnt1, cnt1, 1);
1160
if (!str2_isL) asrw(cnt2, cnt2, 1);
1162
// Compute the minimum of the string lengths and save the difference.
1163
subsw(result, cnt1, cnt2);
1164
cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
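// From here on, "result" holds cnt1 - cnt2. If the shared prefix of the two
// strings compares equal, that length difference is the final answer;
// otherwise it is overwritten below with the difference of the first
// mismatching characters.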
1166
// A very short string
1167
cmpw(cnt2, minCharsInWord);
1168
br(Assembler::LE, SHORT_STRING);
1170
// Compare longwords
1171
// load first parts of strings and finish initialization while loading
1173
if (str1_isL == str2_isL) { // LL or UU
1174
ldr(tmp1, Address(str1));
1176
br(Assembler::EQ, DONE);
1177
ldr(tmp2, Address(str2));
1178
cmp(cnt2, stub_threshold);
1180
subsw(cnt2, cnt2, minCharsInWord);
1182
lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1183
lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1184
sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1186
ldrs(vtmp, Address(str1));
1187
ldr(tmp2, Address(str2));
1188
cmp(cnt2, stub_threshold);
1190
subw(cnt2, cnt2, 4);
1191
eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1192
lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1193
lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1194
zip1(vtmp, T8B, vtmp, vtmpZ);
1195
sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1196
sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1200
ldr(tmp1, Address(str1));
1201
ldrs(vtmp, Address(str2));
1202
cmp(cnt2, stub_threshold);
1204
subw(cnt2, cnt2, 4);
1205
lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1206
eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1207
lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1208
sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1209
zip1(vtmp, T8B, vtmp, vtmpZ);
1210
sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1214
adds(cnt2, cnt2, isUL ? 4 : 8);
1216
eor(rscratch2, tmp1, tmp2);
1217
cbnz(rscratch2, DIFF);
1220
if (str1_isL == str2_isL) {
1221
ldr(tmp1, Address(str1, cnt2));
1222
ldr(tmp2, Address(str2, cnt2));
1223
adds(cnt2, cnt2, 8);
1225
ldrs(vtmp, Address(str1, cnt1));
1226
ldr(tmp2, Address(str2, cnt2));
1228
zip1(vtmp, T8B, vtmp, vtmpZ);
1230
adds(cnt2, cnt2, 8);
1232
ldrs(vtmp, Address(str2, cnt2));
1233
ldr(tmp1, Address(str1, cnt1));
1234
zip1(vtmp, T8B, vtmp, vtmpZ);
1237
adds(cnt2, cnt2, 4);
1241
eor(rscratch2, tmp1, tmp2);
1242
cbz(rscratch2, NEXT_WORD);
1245
eor(rscratch2, tmp1, tmp2);
1246
cbnz(rscratch2, DIFF);
1247
// Last longword. In the case where length == 4 we compare the
1248
// same longword twice, but that's still faster than another
1249
// conditional branch.
1250
if (str1_isL == str2_isL) {
1251
ldr(tmp1, Address(str1));
1252
ldr(tmp2, Address(str2));
1254
ldrs(vtmp, Address(str1));
1255
ldr(tmp2, Address(str2));
1256
zip1(vtmp, T8B, vtmp, vtmpZ);
1259
ldrs(vtmp, Address(str2));
1260
ldr(tmp1, Address(str1));
1261
zip1(vtmp, T8B, vtmp, vtmpZ);
1265
eor(rscratch2, tmp1, tmp2);
1266
cbz(rscratch2, DONE);
1268
// Find the first different characters in the longwords and
1269
// compute their difference.
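// (rev reverses the byte order so that clz on the little-endian XOR result
// counts up to the first differing byte; andr rounds that bit position down
// to a character boundary, and the lsrv shifts move the differing character
// of each string into the low bits before the final subtraction.)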
1271
rev(rscratch2, rscratch2);
1272
clz(rscratch2, rscratch2);
1273
andr(rscratch2, rscratch2, isLL ? -8 : -16);
1274
lsrv(tmp1, tmp1, rscratch2);
1275
(this->*ext_chr)(tmp1, tmp1);
1276
lsrv(tmp2, tmp2, rscratch2);
1277
(this->*ext_chr)(tmp2, tmp2);
1278
subw(result, tmp1, tmp2);
1283
RuntimeAddress stub = nullptr;
1285
case StrIntrinsicNode::LL:
1286
stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1288
case StrIntrinsicNode::UU:
1289
stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1291
case StrIntrinsicNode::LU:
1292
stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1294
case StrIntrinsicNode::UL:
1295
stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1298
ShouldNotReachHere();
1300
assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1301
address call = trampoline_call(stub);
1302
if (call == nullptr) {
1303
DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1304
ciEnv::current()->record_failure("CodeCache is full");
1310
// Is the minimum length zero?
1312
// arrange code to do most branches while loading and loading next characters
1313
// while comparing previous
1314
(this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1315
subs(cnt2, cnt2, 1);
1316
br(EQ, SHORT_LAST_INIT);
1317
(this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1318
b(SHORT_LOOP_START);
1320
subs(cnt2, cnt2, 1);
1322
bind(SHORT_LOOP_START);
1323
(this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1324
(this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1326
br(NE, SHORT_LOOP_TAIL);
1327
subs(cnt2, cnt2, 1);
1328
br(EQ, SHORT_LAST2);
1329
(this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1330
(this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1331
cmp(tmp2, rscratch1);
1333
sub(result, tmp2, rscratch1);
1335
bind(SHORT_LOOP_TAIL);
1336
sub(result, tmp1, cnt1);
1339
cmp(tmp2, rscratch1);
1341
sub(result, tmp2, rscratch1);
1344
bind(SHORT_LAST_INIT);
1345
(this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1349
sub(result, tmp1, cnt1);
1353
BLOCK_COMMENT("} string_compare");
1356
void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1357
FloatRegister src2, Condition cond, bool isQ) {
1358
SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1359
FloatRegister zn = src1, zm = src2;
1360
bool needs_negation = false;
1362
case LT: cond = GT; zn = src2; zm = src1; break;
1363
case LE: cond = GE; zn = src2; zm = src1; break;
1364
case LO: cond = HI; zn = src2; zm = src1; break;
1365
case LS: cond = HS; zn = src2; zm = src1; break;
1366
case NE: cond = EQ; needs_negation = true; break;
1371
if (is_floating_point_type(bt)) {
1372
fcm(cond, dst, size, zn, zm);
1374
cm(cond, dst, size, zn, zm);
1377
if (needs_negation) {
1378
notr(dst, isQ ? T16B : T8B, dst);
1382
void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1383
Condition cond, bool isQ) {
1384
SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1385
if (bt == T_FLOAT || bt == T_DOUBLE) {
1386
if (cond == Assembler::NE) {
1387
fcm(Assembler::EQ, dst, size, src);
1388
notr(dst, isQ ? T16B : T8B, dst);
1390
fcm(cond, dst, size, src);
1393
if (cond == Assembler::NE) {
1394
cm(Assembler::EQ, dst, size, src);
1395
notr(dst, isQ ? T16B : T8B, dst);
1397
cm(cond, dst, size, src);
1402
// Compress the least significant bit of each byte to the rightmost and clear
1403
// the higher garbage bits.
1404
void C2_MacroAssembler::bytemask_compress(Register dst) {
1405
// Example input, dst = 0x01 00 00 00 01 01 00 01
1406
// The "??" bytes are garbage.
1407
orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1408
orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1409
orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1410
andr(dst, dst, 0xff); // dst = 0x8D
1413
// Pack the lowest-numbered bit of each mask element in src into a long value
1414
// in dst, at most the first 64 lane elements.
1415
// Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
1416
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1417
FloatRegister vtmp1, FloatRegister vtmp2) {
1418
assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1419
assert_different_registers(dst, rscratch1);
1420
assert_different_registers(vtmp1, vtmp2);
1422
Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1423
// Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1424
// Expected: dst = 0x658D
1426
// Convert the mask into vector with sequential bytes.
1427
// vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1428
sve_cpy(vtmp1, size, src, 1, false);
1430
sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1433
if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1434
// Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1435
// is to compress each significant bit of the byte in a cross-lane way. Due
1436
// to the lack of a cross-lane bit-compress instruction, we use BEXT
1437
// (bit-compress in each lane) with the biggest lane size (T = D) then
1438
// concatenate the results.
1440
// The second source input of BEXT, initialized with 0x01 in each byte.
1441
// vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1442
sve_dup(vtmp2, B, 1);
1444
// BEXT vtmp1.D, vtmp1.D, vtmp2.D
1445
// vtmp1 = 0x0001010000010001 | 0x0100000001010001
1446
// vtmp2 = 0x0101010101010101 | 0x0101010101010101
1447
// ---------------------------------------
1448
// vtmp1 = 0x0000000000000065 | 0x000000000000008D
1449
sve_bext(vtmp1, D, vtmp1, vtmp2);
1451
// Concatenate the lowest significant 8 bits in each 8 bytes, and extract the result to dst.
1453
// vtmp1 = 0x0000000000000000 | 0x000000000000658D
1455
if (lane_cnt <= 8) {
1456
// No need to concatenate.
1457
umov(dst, vtmp1, B, 0);
1458
} else if (lane_cnt <= 16) {
1459
ins(vtmp1, B, vtmp1, 1, 8);
1460
umov(dst, vtmp1, H, 0);
1462
// As the lane count is 64 at most, the final expected value must be in
1463
// the lowest 64 bits after narrowing vtmp1 from D to B.
1464
sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1465
umov(dst, vtmp1, D, 0);
1467
} else if (UseSVE > 0) {
1468
// Compress the lowest 8 bytes.
1470
bytemask_compress(dst);
1471
if (lane_cnt <= 8) return;
1473
// Repeat on higher bytes and join the results.
1474
// Compress 8 bytes in each iteration.
1475
for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1476
sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1477
bytemask_compress(rscratch1);
1478
orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1481
assert(false, "unsupported");
1482
ShouldNotReachHere();
1486
// Unpack the mask, a long value in src, into predicate register dst based on the
1487
// corresponding data type. Note that dst can support at most 64 lanes.
1488
// Below example gives the expected dst predicate register in different types, with
1489
// a valid src(0x658D) on a 1024-bit vector size machine.
1490
// BYTE: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1491
// SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1492
// INT: dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1493
// LONG: dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1495
// The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D which
1496
// has 24 significant bits would be an invalid input if dst predicate register refers to
1497
// a LONG type 1024-bit vector, which has at most 16 lanes.
1498
void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1499
FloatRegister vtmp1, FloatRegister vtmp2) {
1500
assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1501
lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1502
Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1503
// Example: src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1504
// Expected: dst = 0b01101001 10001101
1506
// Put long value from general purpose register into the first lane of vector.
1507
// vtmp1 = 0x0000000000000000 | 0x000000000000658D
1508
sve_dup(vtmp1, B, 0);
1509
mov(vtmp1, D, 0, src);
1511
// sve_cmp produces its mask with byte granularity, so the bit mask that now
// sits in the first lane has to be expanded into a byte mask; SVE2's BDEP
// instruction is used for that.
1515
// The first source input of the BDEP instruction: place one mask byte in each 8-byte lane.
1516
// vtmp1 = 0x0000000000000065 | 0x000000000000008D
1517
if (lane_cnt <= 8) {
1518
// Nothing to do, as only one byte exists.
1519
} else if (lane_cnt <= 16) {
1520
ins(vtmp1, B, vtmp1, 8, 1);
1521
mov(vtmp1, B, 1, zr);
1523
sve_vector_extend(vtmp1, D, vtmp1, B);
1526
// The second source input of BDEP instruction, initialized with 0x01 for each byte.
1527
// vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1528
sve_dup(vtmp2, B, 1);
1530
// BDEP vtmp1.D, vtmp1.D, vtmp2.D
1531
// vtmp1 = 0x0000000000000065 | 0x000000000000008D
1532
// vtmp2 = 0x0101010101010101 | 0x0101010101010101
1533
// ---------------------------------------
1534
// vtmp1 = 0x0001010000010001 | 0x0100000001010001
1535
sve_bdep(vtmp1, D, vtmp1, vtmp2);
1538
sve_vector_extend(vtmp1, size, vtmp1, B);
1540
// Generate mask according to the given vector, in which the elements have been
1541
// extended to expected type.
1542
// dst = 0b01101001 10001101
1543
sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1547
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1548
FloatRegister zn, FloatRegister zm, Condition cond) {
1549
assert(pg->is_governing(), "This register has to be a governing predicate register");
1550
FloatRegister z1 = zn, z2 = zm;
1552
case LE: z1 = zm; z2 = zn; cond = GE; break;
1553
case LT: z1 = zm; z2 = zn; cond = GT; break;
1554
case LO: z1 = zm; z2 = zn; cond = HI; break;
1555
case LS: z1 = zm; z2 = zn; cond = HS; break;
1560
SIMD_RegVariant size = elemType_to_regVariant(bt);
1561
if (is_floating_point_type(bt)) {
1562
sve_fcm(cond, pd, size, pg, z1, z2);
1564
assert(is_integral_type(bt), "unsupported element type");
1565
sve_cmp(cond, pd, size, pg, z1, z2);
1569
// Get index of the last mask lane that is set
1570
void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1571
SIMD_RegVariant size = elemType_to_regVariant(bt);
1572
sve_rev(ptmp, size, src);
1573
sve_brkb(ptmp, ptrue, ptmp, false);
1574
sve_cntp(dst, size, ptrue, ptmp);
1575
movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1576
subw(dst, rscratch1, dst);
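// sve_rev reverses the predicate, sve_brkb keeps the lanes strictly before the
// first true lane of the reversed mask, and sve_cntp counts them. That count
// is the distance of the last set lane from the end, so subtracting it from
// (lane count - 1) gives the index of the last set lane.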
1579
// Extend integer vector src to dst with the same lane count
1580
// but larger element size, e.g. 4B -> 4I
1581
void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1582
FloatRegister src, BasicType src_bt, bool is_unsigned) {
1583
if (src_bt == T_BYTE) {
1584
if (dst_bt == T_SHORT) {
1586
_xshll(is_unsigned, dst, T8H, src, T8B, 0);
1589
assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1590
_xshll(is_unsigned, dst, T8H, src, T8B, 0);
1591
_xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1593
} else if (src_bt == T_SHORT) {
1595
assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1596
_xshll(is_unsigned, dst, T4S, src, T4H, 0);
1597
} else if (src_bt == T_INT) {
1599
assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1600
_xshll(is_unsigned, dst, T2D, src, T2S, 0);
1602
ShouldNotReachHere();
1606
// Narrow integer vector src down to dst with the same lane count
1607
// but smaller element size, e.g. 4I -> 4B
1608
void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1609
FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1610
if (src_bt == T_SHORT) {
1612
assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1613
assert(dst_bt == T_BYTE, "unsupported");
1614
xtn(dst, T8B, src, T8H);
1615
} else if (src_bt == T_INT) {
1617
assert(src_vlen_in_bytes == 16, "unsupported");
1618
assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1619
xtn(dst, T4H, src, T4S);
1620
if (dst_bt == T_BYTE) {
1621
xtn(dst, T8B, dst, T8H);
1623
} else if (src_bt == T_LONG) {
1625
assert(src_vlen_in_bytes == 16, "unsupported");
1626
assert(dst_bt == T_INT, "unsupported");
1627
xtn(dst, T2S, src, T2D);
1629
ShouldNotReachHere();
1633
void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1634
FloatRegister src, SIMD_RegVariant src_size,
1636
assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1638
if (src_size == B) {
1641
_sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1644
_sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1645
_sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1648
_sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1649
_sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1650
_sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1653
ShouldNotReachHere();
1655
} else if (src_size == H) {
1656
if (dst_size == S) {
1657
_sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1659
_sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1660
_sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1662
} else if (src_size == S) {
1663
_sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
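// Each unpack-low step doubles the element size of the low-half lanes, so
// chaining the steps above covers every supported B/H/S -> H/S/D widening
// while preserving lane order.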
1667
// Vector narrow from src to dst with specified element sizes.
1668
// High part of dst vector will be filled with zero.
1669
void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1670
FloatRegister src, SIMD_RegVariant src_size,
1671
FloatRegister tmp) {
1672
assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1673
assert_different_registers(src, tmp);
1674
sve_dup(tmp, src_size, 0);
1675
if (src_size == D) {
1678
sve_uzp1(dst, S, src, tmp);
1681
assert_different_registers(dst, tmp);
1682
sve_uzp1(dst, S, src, tmp);
1683
sve_uzp1(dst, H, dst, tmp);
1686
assert_different_registers(dst, tmp);
1687
sve_uzp1(dst, S, src, tmp);
1688
sve_uzp1(dst, H, dst, tmp);
1689
sve_uzp1(dst, B, dst, tmp);
1692
ShouldNotReachHere();
1694
} else if (src_size == S) {
1695
if (dst_size == H) {
1696
sve_uzp1(dst, H, src, tmp);
1698
assert_different_registers(dst, tmp);
1699
sve_uzp1(dst, H, src, tmp);
1700
sve_uzp1(dst, B, dst, tmp);
1702
} else if (src_size == H) {
1703
sve_uzp1(dst, B, src, tmp);
1707
// Extend src predicate to dst predicate with the same lane count but larger
1708
// element size, e.g. 64Byte -> 512Long
1709
void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1710
uint dst_element_length_in_bytes,
1711
uint src_element_length_in_bytes) {
1712
if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1713
sve_punpklo(dst, src);
1714
} else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1715
sve_punpklo(dst, src);
1716
sve_punpklo(dst, dst);
1717
} else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1718
sve_punpklo(dst, src);
1719
sve_punpklo(dst, dst);
1720
sve_punpklo(dst, dst);
1722
assert(false, "unsupported");
1723
ShouldNotReachHere();
1727
// Narrow src predicate to dst predicate with the same lane count but
1728
// smaller element size, e.g. 512Long -> 64Byte
1729
void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1730
uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1731
// The insignificant bits in src predicate are expected to be zero.
1732
// To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1733
// passed as the second argument. An example narrowing operation with a given mask would be -
1734
// 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1735
// Mask (for 2 Longs) : TF
1736
// Predicate register for the above mask (16 bits) : 00000001 00000000
1737
// After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1738
// Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1739
assert_different_registers(src, ptmp);
1740
assert_different_registers(dst, ptmp);
1742
if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1743
sve_uzp1(dst, B, src, ptmp);
1744
} else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1745
sve_uzp1(dst, H, src, ptmp);
1746
sve_uzp1(dst, B, dst, ptmp);
1747
} else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1748
sve_uzp1(dst, S, src, ptmp);
1749
sve_uzp1(dst, H, dst, ptmp);
1750
sve_uzp1(dst, B, dst, ptmp);
1752
assert(false, "unsupported");
1753
ShouldNotReachHere();
1757
// Vector reduction add for integral type with ASIMD instructions.
1758
void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1759
Register isrc, FloatRegister vsrc,
1760
unsigned vector_length_in_bytes,
1761
FloatRegister vtmp) {
1762
assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1763
assert_different_registers(dst, isrc);
1764
bool isQ = vector_length_in_bytes == 16;
1766
BLOCK_COMMENT("neon_reduce_add_integral {");
1769
addv(vtmp, isQ ? T16B : T8B, vsrc);
1770
smov(dst, vtmp, B, 0);
1771
addw(dst, dst, isrc, ext::sxtb);
1774
addv(vtmp, isQ ? T8H : T4H, vsrc);
1775
smov(dst, vtmp, H, 0);
1776
addw(dst, dst, isrc, ext::sxth);
1779
isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
1780
umov(dst, vtmp, S, 0);
1781
addw(dst, dst, isrc);
1784
assert(isQ, "unsupported");
1786
umov(dst, vtmp, D, 0);
1787
add(dst, dst, isrc);
1790
assert(false, "unsupported");
1791
ShouldNotReachHere();
1793
BLOCK_COMMENT("} neon_reduce_add_integral");
1796
// Vector reduction multiply for integral type with ASIMD instructions.
1797
// Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
1798
// Clobbers: rscratch1
1799
void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
1800
Register isrc, FloatRegister vsrc,
1801
unsigned vector_length_in_bytes,
1802
FloatRegister vtmp1, FloatRegister vtmp2) {
1803
assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1804
bool isQ = vector_length_in_bytes == 16;
1806
BLOCK_COMMENT("neon_reduce_mul_integral {");
1810
// Multiply the lower half and higher half of vector iteratively.
1811
// vtmp1 = vsrc[8:15]
1812
ins(vtmp1, D, vsrc, 0, 1);
1813
// vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
1814
mulv(vtmp1, T8B, vtmp1, vsrc);
1815
// vtmp2 = vtmp1[4:7]
1816
ins(vtmp2, S, vtmp1, 0, 1);
1817
// vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
1818
mulv(vtmp1, T8B, vtmp2, vtmp1);
1820
ins(vtmp1, S, vsrc, 0, 1);
1821
mulv(vtmp1, T8B, vtmp1, vsrc);
1823
// vtmp2 = vtmp1[2:3]
1824
ins(vtmp2, H, vtmp1, 0, 1);
1825
// vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
1826
mulv(vtmp2, T8B, vtmp2, vtmp1);
1827
// dst = vtmp2[0] * isrc * vtmp2[1]
1828
umov(rscratch1, vtmp2, B, 0);
1829
mulw(dst, rscratch1, isrc);
1831
umov(rscratch1, vtmp2, B, 1);
1832
mulw(dst, rscratch1, dst);
1837
ins(vtmp2, D, vsrc, 0, 1);
1838
mulv(vtmp2, T4H, vtmp2, vsrc);
1839
ins(vtmp1, S, vtmp2, 0, 1);
1840
mulv(vtmp1, T4H, vtmp1, vtmp2);
1842
ins(vtmp1, S, vsrc, 0, 1);
1843
mulv(vtmp1, T4H, vtmp1, vsrc);
1845
umov(rscratch1, vtmp1, H, 0);
1846
mulw(dst, rscratch1, isrc);
1848
umov(rscratch1, vtmp1, H, 1);
1849
mulw(dst, rscratch1, dst);
1854
ins(vtmp1, D, vsrc, 0, 1);
1855
mulv(vtmp1, T2S, vtmp1, vsrc);
1859
umov(rscratch1, vtmp1, S, 0);
1860
mul(dst, rscratch1, isrc);
1861
umov(rscratch1, vtmp1, S, 1);
1862
mul(dst, rscratch1, dst);
1865
umov(rscratch1, vsrc, D, 0);
1866
mul(dst, isrc, rscratch1);
1867
umov(rscratch1, vsrc, D, 1);
1868
mul(dst, dst, rscratch1);
1871
assert(false, "unsupported");
1872
ShouldNotReachHere();
1874
BLOCK_COMMENT("} neon_reduce_mul_integral");
1877
// Vector reduction multiply for floating-point type with ASIMD instructions.
1878
void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
1879
FloatRegister fsrc, FloatRegister vsrc,
1880
unsigned vector_length_in_bytes,
1881
FloatRegister vtmp) {
1882
assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1883
bool isQ = vector_length_in_bytes == 16;
1885
BLOCK_COMMENT("neon_reduce_mul_fp {");
1888
fmuls(dst, fsrc, vsrc);
1889
ins(vtmp, S, vsrc, 0, 1);
1890
fmuls(dst, dst, vtmp);
1892
ins(vtmp, S, vsrc, 0, 2);
1893
fmuls(dst, dst, vtmp);
1894
ins(vtmp, S, vsrc, 0, 3);
1895
fmuls(dst, dst, vtmp);
1899
assert(isQ, "unsupported");
1900
fmuld(dst, fsrc, vsrc);
1901
ins(vtmp, D, vsrc, 0, 1);
1902
fmuld(dst, dst, vtmp);
1905
assert(false, "unsupported");
1906
ShouldNotReachHere();
1908
BLOCK_COMMENT("} neon_reduce_mul_fp");
1911
// Helper to select logical instruction
1912
void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
1913
Register Rn, Register Rm,
1914
enum shift_kind kind, unsigned shift) {
1916
case Op_AndReductionV:
1917
is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
1919
case Op_OrReductionV:
1920
is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
1922
case Op_XorReductionV:
1923
is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
1926
assert(false, "unsupported");
1927
ShouldNotReachHere();

// Vector reduction logical operations And, Or, Xor
// Clobbers: rscratch1
void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
                                            Register isrc, FloatRegister vsrc,
                                            unsigned vector_length_in_bytes) {
  assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
         "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;

  BLOCK_COMMENT("neon_reduce_logical {");
    umov(rscratch1, vsrc, isQ ? D : S, 0);
    umov(dst, vsrc, isQ ? D : S, 1);
    neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
    switch(bt) {
      case T_BYTE:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        sxtb(dst, dst);
        break;
      case T_SHORT:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        sxth(dst, dst);
        break;
      case T_INT:
        if (isQ) {
          neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
        }
        neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
        break;
      case T_LONG:
        assert(isQ, "unsupported");
        neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
        break;
      default:
        assert(false, "unsupported");
        ShouldNotReachHere();
    }
  BLOCK_COMMENT("} neon_reduce_logical");
}

// Vector reduction min/max for integral type with ASIMD instructions.
// Note: vtmp is not used and expected to be fnoreg for T_LONG case.
// Clobbers: rscratch1, rflags
void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
                                                    Register isrc, FloatRegister vsrc,
                                                    unsigned vector_length_in_bytes,
                                                    FloatRegister vtmp) {
  assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
  assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
  assert_different_registers(dst, isrc);
  bool isQ = vector_length_in_bytes == 16;
  bool is_min = opc == Op_MinReductionV;

  BLOCK_COMMENT("neon_reduce_minmax_integral {");
    if (bt == T_LONG) {
      assert(vtmp == fnoreg, "should be");
      assert(isQ, "should be");
      umov(rscratch1, vsrc, D, 0);
      cmp(isrc, rscratch1);
      csel(dst, isrc, rscratch1, is_min ? LT : GT);
      umov(rscratch1, vsrc, D, 1);
      cmp(dst, rscratch1);
      csel(dst, dst, rscratch1, is_min ? LT : GT);
    } else {
      SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
      if (size == T2S) {
        is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
      } else {
        is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
      }
      if (bt == T_INT) {
        umov(dst, vtmp, S, 0);
      } else {
        smov(dst, vtmp, elemType_to_regVariant(bt), 0);
      }
      cmpw(dst, isrc);
      cselw(dst, dst, isrc, is_min ? LT : GT);
    }
  BLOCK_COMMENT("} neon_reduce_minmax_integral");
}

// Vector reduction for integral type with SVE instruction.
// Supported operations are Add, And, Or, Xor, Max, Min.
// rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
                                            FloatRegister src2, PRegister pg, FloatRegister tmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  assert(pg->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(src1, dst);
  // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  switch (opc) {
    case Op_AddReductionVI: {
      sve_uaddv(tmp, size, pg, src2);
      if (bt == T_BYTE) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxtb);
      } else if (bt == T_SHORT) {
        smov(dst, tmp, size, 0);
        addw(dst, src1, dst, ext::sxth);
      } else {
        umov(dst, tmp, size, 0);
        addw(dst, dst, src1);
      }
      break;
    }
    case Op_AddReductionVL: {
      sve_uaddv(tmp, size, pg, src2);
      umov(dst, tmp, size, 0);
      add(dst, dst, src1);
      break;
    }
    case Op_AndReductionV: {
      sve_andv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        andr(dst, dst, src1);
      } else {
        andw(dst, dst, src1);
      }
      break;
    }
    case Op_OrReductionV: {
      sve_orv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        orr(dst, dst, src1);
      } else {
        orrw(dst, dst, src1);
      }
      break;
    }
    case Op_XorReductionV: {
      sve_eorv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        eor(dst, dst, src1);
      } else {
        eorw(dst, dst, src1);
      }
      break;
    }
    case Op_MaxReductionV: {
      sve_smaxv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::GT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::GT);
      }
      break;
    }
    case Op_MinReductionV: {
      sve_sminv(tmp, size, pg, src2);
      if (bt == T_INT || bt == T_LONG) {
        umov(dst, tmp, size, 0);
      } else {
        smov(dst, tmp, size, 0);
      }
      if (bt == T_LONG) {
        cmp(dst, src1);
        csel(dst, dst, src1, Assembler::LT);
      } else {
        cmpw(dst, src1);
        cselw(dst, dst, src1, Assembler::LT);
      }
      break;
    }
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }

  // Keep the sub-word result properly sign-extended after the bitwise reductions.
  if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
    if (bt == T_BYTE) {
      sxtb(dst, dst);
    } else if (bt == T_SHORT) {
      sxth(dst, dst);
    }
  }
}
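
// Worked example (illustrative, Op_AddReductionVI over T_BYTE): sve_uaddv sums every active byte
// lane into the low 64 bits of tmp; smov(dst, tmp, B, 0) picks that sum up as a sign-extended
// byte, and addw(dst, src1, dst, ext::sxtb) folds in the scalar input src1.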

// Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
// to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
// max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
  uint32_t max_vector_length = Matcher::max_vector_size(bt);
  assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");

  // Set all elements to false if the input "lane_cnt" is zero.
  if (lane_cnt == 0) {
    sve_pfalse(dst);
    return;
  }

  SIMD_RegVariant size = elemType_to_regVariant(bt);
  assert(size != Q, "invalid size");

  // Set all true if "lane_cnt" equals to the max lane count.
  if (lane_cnt == max_vector_length) {
    sve_ptrue(dst, size, /* ALL */ 0b11111);
    return;
  }

  // Fixed numbers for "ptrue".
  switch(lane_cnt) {
    case 1: /* VL1 */
    case 2: /* VL2 */
    case 3: /* VL3 */
    case 4: /* VL4 */
    case 5: /* VL5 */
    case 6: /* VL6 */
    case 7: /* VL7 */
    case 8: /* VL8 */
      sve_ptrue(dst, size, lane_cnt);
      return;
    case 16:
      sve_ptrue(dst, size, /* VL16 */ 0b01001);
      return;
    case 32:
      sve_ptrue(dst, size, /* VL32 */ 0b01010);
      return;
    case 64:
      sve_ptrue(dst, size, /* VL64 */ 0b01011);
      return;
    case 128:
      sve_ptrue(dst, size, /* VL128 */ 0b01100);
      return;
    case 256:
      sve_ptrue(dst, size, /* VL256 */ 0b01101);
      return;
    default:
      break;
  }

  // Special patterns for "ptrue".
  if (lane_cnt == round_down_power_of_2(max_vector_length)) {
    sve_ptrue(dst, size, /* POW2 */ 0b00000);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
    sve_ptrue(dst, size, /* MUL4 */ 0b11101);
  } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
    sve_ptrue(dst, size, /* MUL3 */ 0b11110);
  } else {
    // Encode to "whileltw" for the remaining cases.
    mov(rscratch1, lane_cnt);
    sve_whileltw(dst, size, zr, rscratch1);
  }
}
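
// Examples (assuming bt == T_BYTE on a 256-bit vector, i.e. max_vector_length == 32):
//   lane_cnt == 32 -> ptrue ALL,   lane_cnt == 16 -> ptrue VL16,
//   lane_cnt == 30 -> ptrue MUL3 (32 - 32 % 3),   lane_cnt == 17 -> whileltw with rscratch1 == 17.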

// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
// Any remaining elements of dst will be filled with zero.
// Clobbers: rscratch1
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
                                           FloatRegister vtmp1, FloatRegister vtmp2,
                                           PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2);
  assert_different_registers(mask, pgtmp);

  // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
  //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
  // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_dup(vtmp2, H, 0);

  // Extend lowest half to type INT.
  // dst = 00004444 00003333 00002222 00001111
  sve_uunpklo(dst, S, src);
  // pgtmp = 00000001 00000000 00000001 00000001
  sve_punpklo(pgtmp, mask);
  // Pack the active elements in size of type INT to the right,
  // and fill the remainings with zero.
  // dst = 00000000 00004444 00002222 00001111
  sve_compact(dst, S, dst, pgtmp);
  // Narrow the result back to type SHORT.
  // dst = 0000 0000 0000 0000 0000 4444 2222 1111
  sve_uzp1(dst, H, dst, vtmp2);
  // Count the active elements of lowest half.
  sve_cntp(rscratch1, S, ptrue, pgtmp);

  // Repeat to the highest half.
  // pgtmp = 00000001 00000000 00000000 00000001
  sve_punpkhi(pgtmp, mask);
  // vtmp1 = 00008888 00007777 00006666 00005555
  sve_uunpkhi(vtmp1, S, src);
  // vtmp1 = 00000000 00000000 00008888 00005555
  sve_compact(vtmp1, S, vtmp1, pgtmp);
  // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  sve_uzp1(vtmp1, H, vtmp1, vtmp2);

  // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
  // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
  // Left shift(cross lane) compressed high with TRUE_CNT lanes,
  // TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch1, rscratch1);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, H, rscratch1, 1);
  // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
  sve_tbl(vtmp1, H, vtmp1, vtmp2);

  // Combine the compressed high(after shifted) with the compressed low.
  // dst = 0000 0000 0000 8888 5555 4444 2222 1111
  sve_orr(dst, dst, vtmp1);
}
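
// Design note: SVE COMPACT only accepts word and doubleword element sizes, so the halfword input
// is widened with UUNPKLO/UUNPKHI, compacted as words, narrowed back with UZP1, and the two
// compacted halves are then stitched together with the TBL/ORR shift above.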

// Clobbers: rscratch1, rscratch2
// Preserves: src, mask
void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
                                          FloatRegister vtmp1, FloatRegister vtmp2,
                                          FloatRegister vtmp3, FloatRegister vtmp4,
                                          PRegister ptmp, PRegister pgtmp) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
  assert_different_registers(mask, ptmp, pgtmp);
  // Example input:   src   = 88 77 66 55 44 33 22 11
  //                  mask  = 01 00 00 01 01 00 01 01
  // Expected result: dst   = 00 00 00 88 55 44 22 11

  sve_dup(vtmp4, B, 0);
  // Extend lowest half to type SHORT.
  // vtmp1 = 0044 0033 0022 0011
  sve_uunpklo(vtmp1, H, src);
  // ptmp = 0001 0000 0001 0001
  sve_punpklo(ptmp, mask);
  // Count the active elements of lowest half.
  sve_cntp(rscratch2, H, ptrue, ptmp);
  // Pack the active elements in size of type SHORT to the right,
  // and fill the remainings with zero.
  // dst = 0000 0044 0022 0011
  sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
  // Narrow the result back to type BYTE.
  // dst = 00 00 00 00 00 44 22 11
  sve_uzp1(dst, B, dst, vtmp4);

  // Repeat to the highest half.
  // ptmp = 0001 0000 0000 0001
  sve_punpkhi(ptmp, mask);
  // vtmp2 = 0088 0077 0066 0055
  sve_uunpkhi(vtmp2, H, src);
  // vtmp1 = 0000 0000 0088 0055
  sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);

  sve_dup(vtmp4, B, 0);
  // vtmp1 = 00 00 00 00 00 00 88 55
  sve_uzp1(vtmp1, B, vtmp1, vtmp4);

  // Compressed low:   dst   = 00 00 00 00 00 44 22 11
  // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
  // Left shift(cross lane) compressed high with TRUE_CNT lanes,
  // TRUE_CNT is the number of active elements in the compressed low.
  neg(rscratch2, rscratch2);
  // vtmp2 = {4 3 2 1 0 -1 -2 -3}
  sve_index(vtmp2, B, rscratch2, 1);
  // vtmp1 = 00 00 00 88 55 00 00 00
  sve_tbl(vtmp1, B, vtmp1, vtmp2);
  // Combine the compressed high(after shifted) with the compressed low.
  // dst = 00 00 00 88 55 44 22 11
  sve_orr(dst, dst, vtmp1);
}
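
// The cross-lane "left shift" above relies on SVE TBL semantics: sve_index fills vtmp2 with
// {-TRUE_CNT, -TRUE_CNT+1, ...}, and TBL returns zero for any out-of-range index. The low
// TRUE_CNT lanes therefore stay zero while lane n (n >= TRUE_CNT) receives element n - TRUE_CNT
// of the compressed high half; ORR then merges it with the compressed low half.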

void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  if (bt == T_BYTE) {
    rbit(dst, size, src);
  } else {
    neon_reverse_bytes(dst, src, bt, isQ);
    rbit(dst, size, dst);
  }
}
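
// RBIT reverses the bits within each byte, which is already the full answer for T_BYTE. For wider
// element types the bytes of each element are reversed first (neon_reverse_bytes) so that the
// combined effect is a bit reversal across the whole element.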

void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
  SIMD_Arrangement size = isQ ? T16B : T8B;
  switch (bt) {
    case T_BYTE:
      if (dst != src) {
        orr(dst, size, src, src);
      }
      break;
    case T_SHORT:
      rev16(dst, size, src);
      break;
    case T_INT:
      rev32(dst, size, src);
      break;
    case T_LONG:
      rev64(dst, size, src);
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
}
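
// For T_BYTE a byte reversal is the identity, so only a register copy (the ORR above) is needed;
// REV16/REV32/REV64 handle the 2-, 4- and 8-byte element sizes.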

// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
                                             int idx, FloatRegister vtmp) {
  assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
  Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
  if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, src, size, idx);
    } else {
      smov(dst, src, size, idx);
    }
  } else {
    sve_orr(vtmp, src, src);
    sve_ext(vtmp, vtmp, idx << size);
    if (bt == T_INT || bt == T_LONG) {
      umov(dst, vtmp, size, 0);
    } else {
      smov(dst, vtmp, size, 0);
    }
  }
}
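
// Note: an element in the low 128 bits can be read directly with the NEON umov/smov forms. For a
// higher index the element is first shifted down to lane 0 with sve_ext; "idx << size" converts
// the element index into the byte offset EXT expects (size encodes B/H/S/D as 0/1/2/3). umov is
// enough for T_INT/T_LONG, while the sub-word types use smov to sign-extend.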

// java.lang.Math::round intrinsics

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                          FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
  assert_different_registers(tmp1, tmp2, tmp3, src, dst);
  switch (T) {
    case T2S:
    case T4S:
      fmovs(tmp1, T, 0.5f);
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case T2D:
      fmovd(tmp1, T, 0.5);
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
  }
  fadd(tmp1, T, tmp1, src);
  fcvtms(tmp1, T, tmp1);
  // tmp1 = floor(src + 0.5, ties to even)

  fcvtas(dst, T, src);
  // dst = round(src), ties to away

  fneg(tmp3, T, src);
  dup(tmp2, T, rscratch1);
  cm(HS, tmp3, T, tmp3, tmp2);
  // tmp3 is now a set of flags

  bif(dst, T16B, tmp1, tmp3);
  // result in dst
}
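
// Sketch of the idea: two candidate results are produced, floor(src + 0.5) in tmp1 and
// round-ties-away (fcvtas) in dst. The CMHS test on the negated source then selects per lane:
// positive lanes, NaNs, and negative lanes of magnitude >= 2^23 (float) / 2^52 (double) keep the
// fcvtas result, while the remaining negative lanes take floor(src + 0.5) via BIF. Beyond those
// thresholds every representable value is already integral, so no correction is needed there.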

// Clobbers: rscratch1, rflags
void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
                                         FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
  assert_different_registers(tmp1, tmp2, src, dst);

  switch (T) {
    case S:
      mov(rscratch1, jint_cast(0x1.0p23f));
      break;
    case D:
      mov(rscratch1, julong_cast(0x1.0p52));
      break;
    default:
      assert(T == S || T == D, "invalid register variant");
  }

  sve_frinta(dst, T, ptrue, src);
  // dst = round(src), ties to away

  Label none;

  sve_fneg(tmp1, T, ptrue, src);
  sve_dup(tmp2, T, rscratch1);
  sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
  br(EQ, none);
  {
    sve_cpy(tmp1, T, pgtmp, 0.5);
    sve_fadd(tmp1, T, pgtmp, src);
    sve_frintm(dst, T, pgtmp, tmp1);
    // dst = floor(src + 0.5, ties to even)
  }
  bind(none);

  sve_fcvtzs(dst, T, ptrue, dst, T);
  // result in dst
}
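
// Same selection as the NEON variant: frinta provides the ties-away rounding for every lane; the
// negative lanes whose magnitude does not exceed 2^23 / 2^52 are then recomputed under the
// predicate pgtmp as floor(src + 0.5), before fcvtzs converts the result to an integral vector.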

void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                           FloatRegister one, SIMD_Arrangement T) {
  assert_different_registers(dst, src, zero, one);
  assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");

  facgt(dst, T, src, zero);
  ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
  bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
}
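
// How the selection works: facgt writes all-ones into every lane where |src| > 0.0 (zero and NaN
// lanes stay 0), ushr #1 clears the sign bit of that mask, and bsl then assembles the result from
// the magnitude bits of "one" (expected to hold +1.0) and the sign bit of src -- giving +/-1.0 for
// ordinary values while passing +/-0.0 and NaN through unchanged.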

void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
                                          FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
  assert_different_registers(dst, src, zero, one, vtmp);
  assert(pgtmp->is_governing(), "This register has to be a governing predicate register");

  sve_orr(vtmp, src, src);
  sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
  switch (T) {
    case S:
      sve_and(vtmp, T, min_jint);       // Extract the sign bit of float value in every lane of src
      sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
                                        // on the sign of the float value
      break;
    case D:
      sve_and(vtmp, T, min_jlong);
      sve_orr(vtmp, T, jlong_cast(1.0));
      break;
    default:
      assert(false, "unsupported");
      ShouldNotReachHere();
  }
  sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
                                     // Result in dst
}

bool C2_MacroAssembler::in_scratch_emit_size() {
  if (ciEnv::current()->task() != nullptr) {
    PhaseOutput* phase_output = Compile::current()->output();
    if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
      return true;
    }
  }
  return MacroAssembler::in_scratch_emit_size();
}
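
// During code size estimation C2 emits nodes into a scratch buffer; the PhaseOutput query above
// detects that pass when a compilation task is active, and otherwise falls back to the generic
// MacroAssembler flag.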