/*
 * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "utilities/globalDefinitions.hpp"
#include "runtime/globals.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"
#include "stubGenerator_x86_64.hpp"

#define __ _masm->
ATTRIBUTE_ALIGNED(64) static const juint ADLER32_ASCALE_TABLE[] = {
    0x00000000UL, 0x00000001UL, 0x00000002UL, 0x00000003UL,
    0x00000004UL, 0x00000005UL, 0x00000006UL, 0x00000007UL,
    0x00000008UL, 0x00000009UL, 0x0000000AUL, 0x0000000BUL,
    0x0000000CUL, 0x0000000DUL, 0x0000000EUL, 0x0000000FUL
};

ATTRIBUTE_ALIGNED(32) static const juint ADLER32_SHUF0_TABLE[] = {
    0xFFFFFF00UL, 0xFFFFFF01UL, 0xFFFFFF02UL, 0xFFFFFF03UL,
    0xFFFFFF04UL, 0xFFFFFF05UL, 0xFFFFFF06UL, 0xFFFFFF07UL
};

ATTRIBUTE_ALIGNED(32) static const juint ADLER32_SHUF1_TABLE[] = {
    0xFFFFFF08UL, 0xFFFFFF09UL, 0xFFFFFF0AUL, 0xFFFFFF0BUL,
    0xFFFFFF0CUL, 0xFFFFFF0DUL, 0xFFFFFF0EUL, 0xFFFFFF0FUL
};
/*
 *  Arguments:
 *    c_rarg0 - int   adler
 *    c_rarg1 - byte* buff
 *    c_rarg2 - int   len
 *
 *  Output:
 *    rax     - int adler result
 */
address StubGenerator::generate_updateBytesAdler32() {
  assert(UseAdler32Intrinsics, "");

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
  address start = __ pc();

  // Choose an appropriate LIMIT for the inner loop based on the granularity
  // of intermediate results. For int, a LIMIT of 5552 ensures the intermediate
  // results do not overflow Integer.MAX_VALUE before the modulo operations.
  const int LIMIT = 5552;
  const int BASE = 65521;
  const int CHUNKSIZE = 16;
  const int CHUNKSIZE_M1 = CHUNKSIZE - 1;
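  // For reference, a minimal sketch of the scalar Adler-32 recurrence this stub
  // vectorizes (illustrative only; BASE and LIMIT as defined above):
  //   a = adler & 0xFFFF;  b = adler >> 16;
  //   for (int i = 0; i < len; i++) {
  //     a += buff[i] & 0xFF;   // a: running sum of byte values
  //     b += a;                // b: running sum of the a values
  //   }                        // a %= BASE and b %= BASE at least every LIMIT bytes
  //   return (b << 16) | a;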
  const Register init_d = c_rarg0;
  const Register data = r9;
  const Register size = r10;
  const Register s = r11;
  const Register a_d = r12; //r12d
  const Register b_d = r8; //r8d
  const Register end = r13;

  assert_different_registers(c_rarg0, c_rarg1, c_rarg2, data, size);
  assert_different_registers(init_d, data, size, s, a_d, b_d, end, rax);

  const XMMRegister yshuf0 = xmm6;
  const XMMRegister yshuf1 = xmm7;
  const XMMRegister ya = xmm0;
  const XMMRegister yb = xmm1;
  const XMMRegister ydata0 = xmm2;
  const XMMRegister ydata1 = xmm3;
  const XMMRegister ysa = xmm4;
  const XMMRegister ydata = ysa;
  const XMMRegister ytmp0 = ydata0;
  const XMMRegister ytmp1 = ydata1;
  const XMMRegister ytmp2 = xmm5;
  const XMMRegister xa = xmm0;
  const XMMRegister xb = xmm1;
  const XMMRegister xtmp0 = xmm2;
  const XMMRegister xtmp1 = xmm3;
  const XMMRegister xsa = xmm4;
  const XMMRegister xtmp2 = xmm5;
  const XMMRegister xtmp3 = xmm8;
  const XMMRegister xtmp4 = xmm9;
  const XMMRegister xtmp5 = xmm10;

  Label SLOOP1, SLOOP1A_AVX2, SLOOP1A_AVX3, AVX3_REDUCE, SKIP_LOOP_1A;
  Label SKIP_LOOP_1A_AVX3, FINISH, LT64, DO_FINAL, FINAL_LOOP, ZERO_SIZE, END;
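  // Rough control flow: the vector loops (SLOOP1A_AVX3 / SLOOP1A_AVX2) fold bytes
  // into the a/b lane accumulators, AVX3_REDUCE and SKIP_LOOP_1A collapse the lanes
  // back to scalars, the sums are reduced modulo BASE, and any remaining tail bytes
  // are handled one at a time in FINAL_LOOP.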
  __ enter(); // required for proper stackwalking of RuntimeStub frame

  __ vmovdqu(yshuf0, ExternalAddress((address)ADLER32_SHUF0_TABLE), r14 /*rscratch*/);
  __ vmovdqu(yshuf1, ExternalAddress((address)ADLER32_SHUF1_TABLE), r14 /*rscratch*/);

  __ movptr(data, c_rarg1); //data
  __ movl(size, c_rarg2); //length

  __ movl(b_d, init_d); //adler

  __ andl(init_d, 0xFFFF);
  __ jcc(Assembler::below, LT64);
  __ movdl(xa, init_d); //vmovd - 32bit

  __ vpxor(yb, yb, yb, VM_Version::supports_avx512vl() ? Assembler::AVX_512bit : Assembler::AVX_256bit);

  __ cmovl(Assembler::above, s, size); // s = min(size, LIMIT)
  __ lea(end, Address(s, data, Address::times_1, -CHUNKSIZE_M1));
  __ cmpptr(data, end);
  __ jcc(Assembler::aboveEqual, SKIP_LOOP_1A);
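  // Each pass below folds s = min(size, LIMIT) bytes into the vector accumulators
  // before the modulo-BASE reduction, so the per-lane dword sums cannot overflow.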
  if (VM_Version::supports_avx512vl()) {
    // AVX2 performs better for smaller inputs because of its leaner post-loop reduction sequence.
    __ cmpl(s, MAX2(128, VM_Version::avx3_threshold()));
    __ jcc(Assembler::belowEqual, SLOOP1A_AVX2);
    __ lea(end, Address(s, data, Address::times_1, -(2*CHUNKSIZE - 1)));

    // Some notes on the vectorized main loop algorithm.
    // Additions are performed in slices of 16 bytes in the main loop.
    // Input size: 64 bytes (a0 - a63).
    // Iteration0 : ya = [a0 - a15]
    //              yb = [a0 - a15]
    // Iteration1 : ya = [a0 - a15] + [a16 - a31]
    //              yb = 2 x [a0 - a15] + [a16 - a31]
    // Iteration2 : ya = [a0 - a15] + [a16 - a31] + [a32 - a47]
    //              yb = 3 x [a0 - a15] + 2 x [a16 - a31] + [a32 - a47]
    // Iteration3 : ya = [a0 - a15] + [a16 - a31] + [a32 - a47] + [a48 - a63]
    //              yb = 4 x [a0 - a15] + 3 x [a16 - a31] + 2 x [a32 - a47] + [a48 - a63]
    // Before performing the reduction we must scale the intermediate result appropriately.
    // Since the additions were performed in chunks of 16 bytes, to match the scalar
    // implementation the 0th lane element must be counted 16 times, the 1st element
    // 15 times, and so on. Thus we first multiply yb by 16 and then subtract an
    // appropriately scaled ya value:
    // yb = 16 x yb - [0, 1, 2, ... 15] x ya
    //    = 64 x [a0 - a15] + 48 x [a16 - a31] + 32 x [a32 - a47] + 16 x [a48 - a63] - [0, 1, 2, ... 15] x ya
    //    = 64 x a0 + 63 x a1 + 62 x a2 ...... + a63
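    // Worked out per lane (illustrative), for m chunks of 16 bytes (n = 16*m):
    //   lane i of ya = sum over k of a_(16k+i)
    //   lane i of yb = sum over k of (m - k) x a_(16k+i)
    // so lane i of (16 x yb - i x ya) = sum over k of (n - (16k + i)) x a_(16k+i),
    // and adding up all 16 lanes gives 64 x a0 + 63 x a1 + ... + 1 x a63 when n = 64.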
    __ bind(SLOOP1A_AVX3);
    __ evpmovzxbd(ydata0, Address(data, 0), Assembler::AVX_512bit);
    __ evpmovzxbd(ydata1, Address(data, CHUNKSIZE), Assembler::AVX_512bit);
    __ vpaddd(ya, ya, ydata0, Assembler::AVX_512bit);
    __ vpaddd(yb, yb, ya, Assembler::AVX_512bit);
    __ vpaddd(ya, ya, ydata1, Assembler::AVX_512bit);
    __ vpaddd(yb, yb, ya, Assembler::AVX_512bit);
    __ addptr(data, 2*CHUNKSIZE);
    __ cmpptr(data, end);
    __ jcc(Assembler::below, SLOOP1A_AVX3);

    __ addptr(end, CHUNKSIZE);
    __ cmpptr(data, end);
    __ jcc(Assembler::aboveEqual, AVX3_REDUCE);

    __ evpmovzxbd(ydata0, Address(data, 0), Assembler::AVX_512bit);
    __ vpaddd(ya, ya, ydata0, Assembler::AVX_512bit);
    __ vpaddd(yb, yb, ya, Assembler::AVX_512bit);
    __ addptr(data, CHUNKSIZE);

    __ bind(AVX3_REDUCE);
    __ vpslld(yb, yb, 4, Assembler::AVX_512bit); // b is scaled by 16 (avx512)
    __ vpmulld(ysa, ya, ExternalAddress((address)ADLER32_ASCALE_TABLE), Assembler::AVX_512bit, r14 /*rscratch*/);
    // compute horizontal sums of ya, yb, ysa
    __ vextracti64x4(xtmp0, ya, 1);
    __ vextracti64x4(xtmp1, yb, 1);
    __ vextracti64x4(xtmp2, ysa, 1);
    __ vpaddd(xtmp0, xtmp0, ya, Assembler::AVX_256bit);
    __ vpaddd(xtmp1, xtmp1, yb, Assembler::AVX_256bit);
    __ vpaddd(xtmp2, xtmp2, ysa, Assembler::AVX_256bit);
    __ vextracti128(xa, xtmp0, 1);
    __ vextracti128(xb, xtmp1, 1);
    __ vextracti128(xsa, xtmp2, 1);
    __ vpaddd(xa, xa, xtmp0, Assembler::AVX_128bit);
    __ vpaddd(xb, xb, xtmp1, Assembler::AVX_128bit);
    __ vpaddd(xsa, xsa, xtmp2, Assembler::AVX_128bit);
    __ vphaddd(xa, xa, xa, Assembler::AVX_128bit);
    __ vphaddd(xb, xb, xb, Assembler::AVX_128bit);
    __ vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);
    __ vphaddd(xa, xa, xa, Assembler::AVX_128bit);
    __ vphaddd(xb, xb, xb, Assembler::AVX_128bit);
    __ vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);

    __ vpsubd(xb, xb, xsa, Assembler::AVX_128bit);
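    // At this point (see the scaling notes above) xa holds the updated a-sum and xb
    // holds the position-weighted sum that feeds b for the bytes folded by the vector
    // loop; the b value saved in b_d at entry is folded back in by the scalar code
    // that follows.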
    __ addptr(end, CHUNKSIZE_M1);
    __ testl(s, CHUNKSIZE_M1);
    __ jcc(Assembler::notEqual, DO_FINAL);
    __ jmp(SKIP_LOOP_1A_AVX3);
  }
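  // AVX2 inner loop: each iteration loads 16 bytes, zero-extends them through the
  // shuffle tables into two groups of eight dword lanes, and updates the 8-lane
  // ya/yb accumulators in the same pattern as the AVX-512 loop above.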
  __ bind(SLOOP1A_AVX2);
  __ vbroadcastf128(ydata, Address(data, 0), Assembler::AVX_256bit);
  __ addptr(data, CHUNKSIZE);
  __ vpshufb(ydata0, ydata, yshuf0, Assembler::AVX_256bit);
  __ vpaddd(ya, ya, ydata0, Assembler::AVX_256bit);
  __ vpaddd(yb, yb, ya, Assembler::AVX_256bit);
  __ vpshufb(ydata1, ydata, yshuf1, Assembler::AVX_256bit);
  __ vpaddd(ya, ya, ydata1, Assembler::AVX_256bit);
  __ vpaddd(yb, yb, ya, Assembler::AVX_256bit);
  __ cmpptr(data, end);
  __ jcc(Assembler::below, SLOOP1A_AVX2);

  __ bind(SKIP_LOOP_1A);
  __ vpslld(yb, yb, 3, Assembler::AVX_256bit); // b is scaled by 8 (avx)
  __ vpmulld(ysa, ya, ExternalAddress((address)ADLER32_ASCALE_TABLE), Assembler::AVX_256bit, r14 /*rscratch*/);
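  // Same scaling idea as the AVX-512 reduction above, but the 256-bit accumulators
  // have 8 dword lanes, so b is multiplied by 8 and ya is scaled by the first eight
  // entries (0..7) of ADLER32_ASCALE_TABLE before the subtraction.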
  // compute horizontal sums of ya, yb, ysa
  __ vextracti128(xtmp0, ya, 1);
  __ vextracti128(xtmp1, yb, 1);
  __ vextracti128(xtmp2, ysa, 1);
  __ vpaddd(xa, xa, xtmp0, Assembler::AVX_128bit);
  __ vpaddd(xb, xb, xtmp1, Assembler::AVX_128bit);
  __ vpaddd(xsa, xsa, xtmp2, Assembler::AVX_128bit);
  __ vphaddd(xa, xa, xa, Assembler::AVX_128bit);
  __ vphaddd(xb, xb, xb, Assembler::AVX_128bit);
  __ vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);
  __ vphaddd(xa, xa, xa, Assembler::AVX_128bit);
  __ vphaddd(xb, xb, xb, Assembler::AVX_128bit);
  __ vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);

  __ vpsubd(xb, xb, xsa, Assembler::AVX_128bit);

  __ addptr(end, CHUNKSIZE_M1);
  __ testl(s, CHUNKSIZE_M1);
  __ jcc(Assembler::notEqual, DO_FINAL);

  __ bind(SKIP_LOOP_1A_AVX3);
  // either we're done, or we just did LIMIT

  __ divl(rcx); // divide edx:eax by ecx, quot->eax, rem->edx

  __ divl(rcx); // divide edx:eax by ecx, quot->eax, rem->edx

  __ testl(size, size);
  __ jcc(Assembler::zero, FINISH);

  __ movl(a_d, init_d);
  __ lea(end, Address(data, size, Address::times_1));
  __ testl(size, size);
  __ jcc(Assembler::notZero, FINAL_LOOP);

  __ movzbl(rax, Address(data, 0)); // movzx eax, byte[data]

  __ cmpptr(data, end);
  __ jcc(Assembler::below, FINAL_LOOP);

  __ divl(rcx); // div ecx -- divide edx:eax by ecx, quot->eax, rem->edx

  __ divl(rcx); // divide edx:eax by ecx, quot->eax, rem->edx