/*
 * Copyright (c) 2021, 2023, Intel Corporation. All rights reserved.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "utilities/globalDefinitions.hpp"
#include "runtime/globals.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"
#include "stubGenerator_x86_64.hpp"

#define __ _masm->
ATTRIBUTE_ALIGNED(64) static const juint ADLER32_ASCALE_TABLE[] = {
    0x00000000UL, 0x00000001UL, 0x00000002UL, 0x00000003UL,
    0x00000004UL, 0x00000005UL, 0x00000006UL, 0x00000007UL,
    0x00000008UL, 0x00000009UL, 0x0000000AUL, 0x0000000BUL,
    0x0000000CUL, 0x0000000DUL, 0x0000000EUL, 0x0000000FUL
};

ATTRIBUTE_ALIGNED(32) static const juint ADLER32_SHUF0_TABLE[] = {
    0xFFFFFF00UL, 0xFFFFFF01UL, 0xFFFFFF02UL, 0xFFFFFF03UL,
    0xFFFFFF04UL, 0xFFFFFF05UL, 0xFFFFFF06UL, 0xFFFFFF07UL
};

ATTRIBUTE_ALIGNED(32) static const juint ADLER32_SHUF1_TABLE[] = {
    0xFFFFFF08UL, 0xFFFFFF09UL, 0xFFFFFF0AUL, 0xFFFFFF0BUL,
    0xFFFFFF0CUL, 0xFFFFFF0DUL, 0xFFFFFF0EUL, 0xFFFFFF0FUL
};
/*
 *  Arguments:
 *    c_rarg0 - int   adler
 *    c_rarg1 - byte* buff
 *    c_rarg2 - int   len
 *
 *  Output:
 *    rax     - int adler result
 */
address StubGenerator::generate_updateBytesAdler32() {
  assert(UseAdler32Intrinsics, "");

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
  address start = __ pc();

  // Choose an appropriate LIMIT for the inner loop based on the granularity
  // of intermediate results. For int, a LIMIT of 5552 ensures the intermediate
  // results do not overflow Integer.MAX_VALUE before the modulo operations.
  const int LIMIT = 5552;
  const int BASE = 65521;
  const int CHUNKSIZE = 16;
  const int CHUNKSIZE_M1 = CHUNKSIZE - 1;
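  // For reference, a minimal sketch of the scalar Adler-32 recurrence this stub
  // vectorizes (illustrative only; BASE and LIMIT as defined above):
  //   a = adler & 0xFFFF;  b = adler >> 16;
  //   for (int i = 0; i < len; i++) {
  //     a += buff[i] & 0xFF;   // a: running sum of byte values
  //     b += a;                // b: running sum of the a values
  //   }                        // a %= BASE and b %= BASE at least every LIMIT bytes
  //   return (b << 16) | a;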
  const Register init_d = c_rarg0;
  const Register data = r9;
  const Register size = r10;
  const Register s = r11;
  const Register a_d = r12; //r12d
  const Register b_d = r8; //r8d
  const Register end = r13;

  assert_different_registers(c_rarg0, c_rarg1, c_rarg2, data, size);
  assert_different_registers(init_d, data, size, s, a_d, b_d, end, rax);

  const XMMRegister yshuf0 = xmm6;
  const XMMRegister yshuf1 = xmm7;
  const XMMRegister ya = xmm0;
  const XMMRegister yb = xmm1;
  const XMMRegister ydata0 = xmm2;
  const XMMRegister ydata1 = xmm3;
  const XMMRegister ysa = xmm4;
  const XMMRegister ydata = ysa;
  const XMMRegister ytmp0 = ydata0;
  const XMMRegister ytmp1 = ydata1;
  const XMMRegister ytmp2 = xmm5;
  const XMMRegister xa = xmm0;
  const XMMRegister xb = xmm1;
  const XMMRegister xtmp0 = xmm2;
  const XMMRegister xtmp1 = xmm3;
  const XMMRegister xsa = xmm4;
  const XMMRegister xtmp2 = xmm5;
  const XMMRegister xtmp3 = xmm8;
  const XMMRegister xtmp4 = xmm9;
  const XMMRegister xtmp5 = xmm10;

  Label SLOOP1, SLOOP1A_AVX2, SLOOP1A_AVX3, AVX3_REDUCE, SKIP_LOOP_1A;
  Label SKIP_LOOP_1A_AVX3, FINISH, LT64, DO_FINAL, FINAL_LOOP, ZERO_SIZE, END;
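  // Rough control flow: the vector loops (SLOOP1A_AVX3 / SLOOP1A_AVX2) fold bytes
  // into the a/b lane accumulators, AVX3_REDUCE and SKIP_LOOP_1A collapse the lanes
  // back to scalars, the sums are reduced modulo BASE, and any remaining tail bytes
  // are handled one at a time in FINAL_LOOP.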
  __ enter(); // required for proper stackwalking of RuntimeStub frame

  __ vmovdqu(yshuf0, ExternalAddress((address)ADLER32_SHUF0_TABLE), r14 /*rscratch*/);
  __ vmovdqu(yshuf1, ExternalAddress((address)ADLER32_SHUF1_TABLE), r14 /*rscratch*/);

  __ movptr(data, c_rarg1); //data
  __ movl(size, c_rarg2); //length

  __ movl(b_d, init_d); //adler

  __ andl(init_d, 0xFFFF);
  __ jcc(Assembler::below, LT64);
  __ movdl(xa, init_d); //vmovd - 32bit

  __ vpxor(yb, yb, yb, VM_Version::supports_avx512vl() ? Assembler::AVX_512bit : Assembler::AVX_256bit);

  __ cmovl(Assembler::above, s, size); // s = min(size, LIMIT)
  __ lea(end, Address(s, data, Address::times_1, -CHUNKSIZE_M1));
  __ cmpptr(data, end);
  __ jcc(Assembler::aboveEqual, SKIP_LOOP_1A);
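  // Each pass below folds s = min(size, LIMIT) bytes into the vector accumulators
  // before the modulo-BASE reduction, so the per-lane dword sums cannot overflow.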
  if (VM_Version::supports_avx512vl()) {
    // AVX2 performs better for smaller inputs because of its leaner post-loop reduction sequence.
    __ cmpl(s, MAX2(128, VM_Version::avx3_threshold()));
    __ jcc(Assembler::belowEqual, SLOOP1A_AVX2);
    __ lea(end, Address(s, data, Address::times_1, -(2*CHUNKSIZE - 1)));

    // Some notes on the vectorized main loop algorithm.
    // Additions are performed in slices of 16 bytes in the main loop.
    // Input size: 64 bytes (a0 - a63).
    // Iteration0 : ya = [a0 - a15]
    //              yb = [a0 - a15]
    // Iteration1 : ya = [a0 - a15] + [a16 - a31]
    //              yb = 2 x [a0 - a15] + [a16 - a31]
    // Iteration2 : ya = [a0 - a15] + [a16 - a31] + [a32 - a47]
    //              yb = 3 x [a0 - a15] + 2 x [a16 - a31] + [a32 - a47]
    // Iteration3 : ya = [a0 - a15] + [a16 - a31] + [a32 - a47] + [a48 - a63]
    //              yb = 4 x [a0 - a15] + 3 x [a16 - a31] + 2 x [a32 - a47] + [a48 - a63]
    // Before performing the reduction we must scale the intermediate result appropriately.
    // Since the additions were performed in chunks of 16 bytes, to match the scalar
    // implementation the 0th lane element must be counted 16 times, the 1st element
    // 15 times, and so on. Thus we first multiply yb by 16 and then subtract an
    // appropriately scaled ya value:
    // yb = 16 x yb - [0, 1, 2, ... 15] x ya
    //    = 64 x [a0 - a15] + 48 x [a16 - a31] + 32 x [a32 - a47] + 16 x [a48 - a63] - [0, 1, 2, ... 15] x ya
    //    = 64 x a0 + 63 x a1 + 62 x a2 ...... + a63
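    // Worked out per lane (illustrative), for m chunks of 16 bytes (n = 16*m):
    //   lane i of ya = sum over k of a_(16k+i)
    //   lane i of yb = sum over k of (m - k) x a_(16k+i)
    // so lane i of (16 x yb - i x ya) = sum over k of (n - (16k + i)) x a_(16k+i),
    // and adding up all 16 lanes gives 64 x a0 + 63 x a1 + ... + 1 x a63 when n = 64.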
    __ bind(SLOOP1A_AVX3);
    __ evpmovzxbd(ydata0, Address(data, 0), Assembler::AVX_512bit);
    __ evpmovzxbd(ydata1, Address(data, CHUNKSIZE), Assembler::AVX_512bit);
    __ vpaddd(ya, ya, ydata0, Assembler::AVX_512bit);
    __ vpaddd(yb, yb, ya, Assembler::AVX_512bit);
    __ vpaddd(ya, ya, ydata1, Assembler::AVX_512bit);
    __ vpaddd(yb, yb, ya, Assembler::AVX_512bit);
    __ addptr(data, 2*CHUNKSIZE);
    __ cmpptr(data, end);
    __ jcc(Assembler::below, SLOOP1A_AVX3);

    __ addptr(end, CHUNKSIZE);
    __ cmpptr(data, end);
    __ jcc(Assembler::aboveEqual, AVX3_REDUCE);

    __ evpmovzxbd(ydata0, Address(data, 0), Assembler::AVX_512bit);
    __ vpaddd(ya, ya, ydata0, Assembler::AVX_512bit);
    __ vpaddd(yb, yb, ya, Assembler::AVX_512bit);
    __ addptr(data, CHUNKSIZE);

    __ bind(AVX3_REDUCE);
    __ vpslld(yb, yb, 4, Assembler::AVX_512bit); // b is scaled by 16 (avx512)
    __ vpmulld(ysa, ya, ExternalAddress((address)ADLER32_ASCALE_TABLE), Assembler::AVX_512bit, r14 /*rscratch*/);
    // compute horizontal sums of ya, yb, ysa
    __ vextracti64x4(xtmp0, ya, 1);
    __ vextracti64x4(xtmp1, yb, 1);
    __ vextracti64x4(xtmp2, ysa, 1);
    __ vpaddd(xtmp0, xtmp0, ya, Assembler::AVX_256bit);
    __ vpaddd(xtmp1, xtmp1, yb, Assembler::AVX_256bit);
    __ vpaddd(xtmp2, xtmp2, ysa, Assembler::AVX_256bit);
    __ vextracti128(xa, xtmp0, 1);
    __ vextracti128(xb, xtmp1, 1);
    __ vextracti128(xsa, xtmp2, 1);
    __ vpaddd(xa, xa, xtmp0, Assembler::AVX_128bit);
    __ vpaddd(xb, xb, xtmp1, Assembler::AVX_128bit);
    __ vpaddd(xsa, xsa, xtmp2, Assembler::AVX_128bit);
    __ vphaddd(xa, xa, xa, Assembler::AVX_128bit);
    __ vphaddd(xb, xb, xb, Assembler::AVX_128bit);
    __ vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);
    __ vphaddd(xa, xa, xa, Assembler::AVX_128bit);
    __ vphaddd(xb, xb, xb, Assembler::AVX_128bit);
    __ vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);

    __ vpsubd(xb, xb, xsa, Assembler::AVX_128bit);
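    // At this point (see the scaling notes above) xa holds the updated a-sum and xb
    // holds the position-weighted sum that feeds b for the bytes folded by the vector
    // loop; the b value saved in b_d at entry is folded back in by the scalar code
    // that follows.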
    __ addptr(end, CHUNKSIZE_M1);
    __ testl(s, CHUNKSIZE_M1);
    __ jcc(Assembler::notEqual, DO_FINAL);
    __ jmp(SKIP_LOOP_1A_AVX3);
  }
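  // AVX2 inner loop: each iteration loads 16 bytes, zero-extends them through the
  // shuffle tables into two groups of eight dword lanes, and updates the 8-lane
  // ya/yb accumulators in the same pattern as the AVX-512 loop above.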
  __ bind(SLOOP1A_AVX2);
  __ vbroadcastf128(ydata, Address(data, 0), Assembler::AVX_256bit);
  __ addptr(data, CHUNKSIZE);
  __ vpshufb(ydata0, ydata, yshuf0, Assembler::AVX_256bit);
  __ vpaddd(ya, ya, ydata0, Assembler::AVX_256bit);
  __ vpaddd(yb, yb, ya, Assembler::AVX_256bit);
  __ vpshufb(ydata1, ydata, yshuf1, Assembler::AVX_256bit);
  __ vpaddd(ya, ya, ydata1, Assembler::AVX_256bit);
  __ vpaddd(yb, yb, ya, Assembler::AVX_256bit);
  __ cmpptr(data, end);
  __ jcc(Assembler::below, SLOOP1A_AVX2);

  __ bind(SKIP_LOOP_1A);
  __ vpslld(yb, yb, 3, Assembler::AVX_256bit); // b is scaled by 8 (avx)
  __ vpmulld(ysa, ya, ExternalAddress((address)ADLER32_ASCALE_TABLE), Assembler::AVX_256bit, r14 /*rscratch*/);
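  // Same scaling idea as the AVX-512 reduction above, but the 256-bit accumulators
  // have 8 dword lanes, so b is multiplied by 8 and ya is scaled by the first eight
  // entries (0..7) of ADLER32_ASCALE_TABLE before the subtraction.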
  // compute horizontal sums of ya, yb, ysa
  __ vextracti128(xtmp0, ya, 1);
  __ vextracti128(xtmp1, yb, 1);
  __ vextracti128(xtmp2, ysa, 1);
  __ vpaddd(xa, xa, xtmp0, Assembler::AVX_128bit);
  __ vpaddd(xb, xb, xtmp1, Assembler::AVX_128bit);
  __ vpaddd(xsa, xsa, xtmp2, Assembler::AVX_128bit);
  __ vphaddd(xa, xa, xa, Assembler::AVX_128bit);
  __ vphaddd(xb, xb, xb, Assembler::AVX_128bit);
  __ vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);
  __ vphaddd(xa, xa, xa, Assembler::AVX_128bit);
  __ vphaddd(xb, xb, xb, Assembler::AVX_128bit);
  __ vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);

  __ vpsubd(xb, xb, xsa, Assembler::AVX_128bit);

  __ addptr(end, CHUNKSIZE_M1);
  __ testl(s, CHUNKSIZE_M1);
  __ jcc(Assembler::notEqual, DO_FINAL);

  __ bind(SKIP_LOOP_1A_AVX3);
  // either we're done, or we just did LIMIT

  __ divl(rcx); // divide edx:eax by ecx, quot->eax, rem->edx

  __ divl(rcx); // divide edx:eax by ecx, quot->eax, rem->edx

  __ testl(size, size);
  __ jcc(Assembler::zero, FINISH);

  __ movl(a_d, init_d);
  __ lea(end, Address(data, size, Address::times_1));
  __ testl(size, size);
  __ jcc(Assembler::notZero, FINAL_LOOP);

  __ movzbl(rax, Address(data, 0)); // movzx eax, byte[data]

  __ cmpptr(data, end);
  __ jcc(Assembler::below, FINAL_LOOP);

  __ divl(rcx); // div ecx -- divide edx:eax by ecx, quot->eax, rem->edx

  __ divl(rcx); // divide edx:eax by ecx, quot->eax, rem->edx