1
/*
2
 * Copyright (c) 2022, 2024, Intel Corporation. All rights reserved.
3
 *
4
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5
 *
6
 * This code is free software; you can redistribute it and/or modify it
7
 * under the terms of the GNU General Public License version 2 only, as
8
 * published by the Free Software Foundation.
9
 *
10
 * This code is distributed in the hope that it will be useful, but WITHOUT
11
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13
 * version 2 for more details (a copy is included in the LICENSE file that
14
 * accompanied this code).
15
 *
16
 * You should have received a copy of the GNU General Public License version
17
 * 2 along with this work; if not, write to the Free Software Foundation,
18
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19
 *
20
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21
 * or visit www.oracle.com if you need additional information or have any
22
 * questions.
23
 *
24
 */
25

26
#include "precompiled.hpp"
27
#include "macroAssembler_x86.hpp"
28
#include "stubGenerator_x86_64.hpp"
29

30
#define __ _masm->
31

32
// References:
33
//  - (Normative) RFC7539 - ChaCha20 and Poly1305 for IETF Protocols
34
//  - M. Goll and S. Gueron, "Vectorization of Poly1305 Message Authentication Code"
35
//  - "The design of Poly1305" https://loup-vaillant.fr/tutorials/poly1305-design
36

37
// Explanation for the 'well known' modular arithmetic optimization, reduction by the pseudo-Mersenne prime 2^130-5:
38
//
39
// Reduction by 2^130-5 can be expressed as follows:
40
//    ( ax2^130 + b ) mod 2^130-5     //i.e. number split along the 130-bit boundary
41
//                                 = ( ax2^130 - 5xa + 5xa + b ) mod 2^130-5
42
//                                 = ( ax(2^130 - 5) + 5xa + b ) mod 2^130-5 // i.e. adding multiples of modulus is a noop
43
//                                 = ( 5xa + b ) mod 2^130-5
44
// QED: shows mathematically the well known algorithm of 'split the number down the middle, multiply upper and add'
45
// This is particularly useful to understand when combining with 'odd-sized' limbs that might cause misalignment
46
//
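// Illustrative sketch (not part of the stub): the same 'split at the modulus boundary,
// multiply the upper part by 5 and add' identity, shown on a toy pseudo-Mersenne
// modulus 2^13 - 5 so it fits in plain 64-bit arithmetic. The guard macro
// POLY1305_STUB_NOTES is hypothetical and never defined.
#ifdef POLY1305_STUB_NOTES
static uint64_t toy_reduce_2e13m5(uint64_t x) {
  const uint64_t p = (1ull << 13) - 5;     // toy analogue of 2^130 - 5
  uint64_t a = x >> 13;                    // x = a*2^13 + b
  uint64_t b = x & ((1ull << 13) - 1);
  uint64_t r = 5 * a + b;                  // a*2^13 + b == 5*a + b (mod 2^13 - 5)
  while (r >= p) r -= p;                   // small final correction
  return r;                                // == x % p
}
#endif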
47

48
// Pseudocode for this file (in general):
49
//    * used for poly1305_multiply_scalar
50
//    x used for poly1305_multiply8_avx512
51
//    lower-case variables are scalar numbers in 3x44-bit limbs (in gprs)
52
//    upper-case variables are 8-element vector numbers in 3x44-bit limbs (in zmm registers)
53
//    [ ] used to denote vector numbers (with their elements)
54

55
// Constant Pool:
56
ATTRIBUTE_ALIGNED(64) static const uint64_t POLY1305_PAD_MSG[] = {
57
  0x0000010000000000, 0x0000010000000000,
58
  0x0000010000000000, 0x0000010000000000,
59
  0x0000010000000000, 0x0000010000000000,
60
  0x0000010000000000, 0x0000010000000000,
61
};
62
static address poly1305_pad_msg() {
63
  return (address)POLY1305_PAD_MSG;
64
}
65

66
ATTRIBUTE_ALIGNED(64) static const uint64_t POLY1305_MASK42[] = {
67
  0x000003ffffffffff, 0x000003ffffffffff,
68
  0x000003ffffffffff, 0x000003ffffffffff,
69
  0x000003ffffffffff, 0x000003ffffffffff,
70
  0x000003ffffffffff, 0x000003ffffffffff
71
};
72
static address poly1305_mask42() {
73
  return (address)POLY1305_MASK42;
74
}
75

76
ATTRIBUTE_ALIGNED(64) static const uint64_t POLY1305_MASK44[] = {
77
  0x00000fffffffffff, 0x00000fffffffffff,
78
  0x00000fffffffffff, 0x00000fffffffffff,
79
  0x00000fffffffffff, 0x00000fffffffffff,
80
  0x00000fffffffffff, 0x00000fffffffffff,
81
};
82
static address poly1305_mask44() {
83
  return (address)POLY1305_MASK44;
84
}
85

86
// Compute product for 8 16-byte message blocks,
87
// i.e. For each block, compute [a2 a1 a0] = [a2 a1 a0] x [r2 r1 r0]
88
//
89
// Each block/number is represented by 3 44-bit limb digits, start with multiplication
90
//
91
//      a2       a1       a0
92
// x    r2       r1       r0
93
// ----------------------------------
94
//     a2xr0    a1xr0    a0xr0
95
// +   a1xr1    a0xr1  5xa2xr1'     (r1' = r1<<2)
96
// +   a0xr2  5xa2xr2' 5xa1xr2'     (r2' = r2<<2)
97
// ----------------------------------
98
//        p2       p1       p0
99
//
100
// Then, propagate the carry (bits after bit 44) from lower limbs into higher limbs.
101
// Then, modular reduction from upper limb wrapped to lower limbs
102
//
103
// Math Note 1: 'carry propagation' from p2 to p0 involves multiplication by 5 (i.e. slightly modified modular reduction from above):
104
//    ( p2x2^88 ) mod 2^130-5
105
//                             = ( p2'x2^88 + p2''x2^130) mod 2^130-5 // Split on 130-bit boundary
106
//                             = ( p2'x2^88 + p2''x2^130 - 5xp2'' + 5xp2'') mod 2^130-5
107
//                             = ( p2'x2^88 + p2''x(2^130 - 5) + 5xp2'') mod 2^130-5 // i.e. adding multiples of modulus is a noop
108
//                             = ( p2'x2^88 + 5xp2'') mod 2^130-5
109
//
110
// Math Note 2: R1P = 4*5*R1 and R2P = 4*5*R2; This precomputation allows simultaneous reduction and multiplication.
111
// This is not the standard 'multiply-upper-by-5', here is why the factor is 4*5 instead of 5.
112
// For example, partial product (a2xr2):
113
//    (a2x2^88)x(r2x2^88) mod 2^130-5
114
//                                    = (a2xr2 x 2^176) mod 2^130-5
115
//                                    = (a2xr2 x 2^46x2^130) mod 2^130-5
116
//                                    = (a2xr2x2^46 x 2^130- 5xa2xr2x2^46 + 5xa2xr2x2^46) mod 2^130-5
117
//                                    = (a2xr2x2^46 x (2^130- 5) + 5xa2xr2x2^46) mod 2^130-5 // i.e. adding multiples of modulus is a noop
118
//                                    = (5xa2xr2x2^46) mod 2^130-5
119
//                                    = (a2x5xr2x2^2 x 2^44) mod 2^130-5 // Align to limb boundary
120
//                                    = (a2x[5xr2x4] x 2^44) mod 2^130-5
121
//                                    = (a2xR2P x 2^44) mod 2^130-5 // i.e. R2P = 4*5*R2
122
//
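// Illustrative sketch (not part of the stub): the R1P/R2P precompute from Math Note 2
// written as scalar C++; it mirrors the vpsllq/vpaddq/vpsllq sequence used below to
// build R1P and R2P. The guard macro POLY1305_STUB_NOTES is hypothetical.
#ifdef POLY1305_STUB_NOTES
static inline uint64_t poly1305_precompute_rp(uint64_t r_limb) {
  return (r_limb * 5) << 2;   // 4*5*r; fits easily in 64 bits for a 44-bit limb
}
#endif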
123
void StubGenerator::poly1305_multiply8_avx512(
124
  const XMMRegister A0, const XMMRegister A1, const XMMRegister A2,
125
  const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P,
126
  const XMMRegister P0L, const XMMRegister P0H, const XMMRegister P1L, const XMMRegister P1H, const XMMRegister P2L, const XMMRegister P2H,
127
  const XMMRegister TMP, const Register rscratch)
128
{
129

130
  // Reset partial sums
131
  __ evpxorq(P0L, P0L, P0L, Assembler::AVX_512bit);
132
  __ evpxorq(P0H, P0H, P0H, Assembler::AVX_512bit);
133
  __ evpxorq(P1L, P1L, P1L, Assembler::AVX_512bit);
134
  __ evpxorq(P1H, P1H, P1H, Assembler::AVX_512bit);
135
  __ evpxorq(P2L, P2L, P2L, Assembler::AVX_512bit);
136
  __ evpxorq(P2H, P2H, P2H, Assembler::AVX_512bit);
137

138
  // Calculate partial products
139
  // p0 = a2xr1'
140
  // p1 = a2xr2'
141
  // p2 = a2xr0
142
  __ evpmadd52luq(P0L, A2, R1P, Assembler::AVX_512bit);
143
  __ evpmadd52huq(P0H, A2, R1P, Assembler::AVX_512bit);
144
  __ evpmadd52luq(P1L, A2, R2P, Assembler::AVX_512bit);
145
  __ evpmadd52huq(P1H, A2, R2P, Assembler::AVX_512bit);
146
  __ evpmadd52luq(P2L, A2, R0, Assembler::AVX_512bit);
147
  __ evpmadd52huq(P2H, A2, R0, Assembler::AVX_512bit);
148

149
  // p0 += a0xr0
150
  // p1 += a0xr1
151
  // p2 += a0xr2
152
  __ evpmadd52luq(P1L, A0, R1, Assembler::AVX_512bit);
153
  __ evpmadd52huq(P1H, A0, R1, Assembler::AVX_512bit);
154
  __ evpmadd52luq(P2L, A0, R2, Assembler::AVX_512bit);
155
  __ evpmadd52huq(P2H, A0, R2, Assembler::AVX_512bit);
156
  __ evpmadd52luq(P0L, A0, R0, Assembler::AVX_512bit);
157
  __ evpmadd52huq(P0H, A0, R0, Assembler::AVX_512bit);
158

159
  // p0 += a1xr2'
160
  // p1 += a1xr0
161
  // p2 += a1xr1
162
  __ evpmadd52luq(P0L, A1, R2P, Assembler::AVX_512bit);
163
  __ evpmadd52huq(P0H, A1, R2P, Assembler::AVX_512bit);
164
  __ evpmadd52luq(P1L, A1, R0, Assembler::AVX_512bit);
165
  __ evpmadd52huq(P1H, A1, R0, Assembler::AVX_512bit);
166
  __ evpmadd52luq(P2L, A1, R1, Assembler::AVX_512bit);
167
  __ evpmadd52huq(P2H, A1, R1, Assembler::AVX_512bit);
168

169
  // Carry propagation:
170
  // (Not quite aligned)                         | More mathematically correct:
171
  //         P2L   P1L   P0L                     |                 P2Lx2^88 + P1Lx2^44 + P0Lx2^0
172
  // + P2H   P1H   P0H                           |   + P2Hx2^140 + P1Hx2^96 + P0Hx2^52
173
  // ---------------------------                 |   -----------------------------------------------
174
  // = P2H    A2    A1    A0                     |   = P2Hx2^130 + A2x2^88 +   A1x2^44 +  A0x2^0
175
  //
176
  __ vpsrlq(TMP, P0L, 44, Assembler::AVX_512bit);
177
  __ evpandq(A0, P0L, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits
178

179
  __ vpsllq(P0H, P0H, 8, Assembler::AVX_512bit);
180
  __ vpaddq(P0H, P0H, TMP, Assembler::AVX_512bit);
181
  __ vpaddq(P1L, P1L, P0H, Assembler::AVX_512bit);
182
  __ evpandq(A1, P1L, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits
183

184
  __ vpsrlq(TMP, P1L, 44, Assembler::AVX_512bit);
185
  __ vpsllq(P1H, P1H, 8, Assembler::AVX_512bit);
186
  __ vpaddq(P1H, P1H, TMP, Assembler::AVX_512bit);
187
  __ vpaddq(P2L, P2L, P1H, Assembler::AVX_512bit);
188
  __ evpandq(A2, P2L, ExternalAddress(poly1305_mask42()), Assembler::AVX_512bit, rscratch); // Clear top 22 bits
189

190
  __ vpsrlq(TMP, P2L, 42, Assembler::AVX_512bit);
191
  __ vpsllq(P2H, P2H, 10, Assembler::AVX_512bit);
192
  __ vpaddq(P2H, P2H, TMP, Assembler::AVX_512bit);
193

194
  // Reduction: p2->a0->a1
195
  // Multiply by 5 the highest bits (p2 is above 130 bits)
196
  __ vpaddq(A0, A0, P2H, Assembler::AVX_512bit);
197
  __ vpsllq(P2H, P2H, 2, Assembler::AVX_512bit);
198
  __ vpaddq(A0, A0, P2H, Assembler::AVX_512bit);
199
  __ vpsrlq(TMP, A0, 44, Assembler::AVX_512bit);
200
  __ evpandq(A0, A0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch);
201
  __ vpaddq(A1, A1, TMP, Assembler::AVX_512bit);
202
}
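// Illustrative sketch (not part of the stub): a scalar model of the AVX-512 IFMA
// primitives used above. vpmadd52luq accumulates the low 52 bits of the 52x52-bit
// product, vpmadd52huq accumulates bits 52..103. Uses the GCC/Clang unsigned
// __int128 extension; the guard macro POLY1305_STUB_NOTES is hypothetical.
#ifdef POLY1305_STUB_NOTES
static inline uint64_t madd52lo(uint64_t acc, uint64_t a, uint64_t b) {
  const uint64_t mask52 = (1ull << 52) - 1;
  unsigned __int128 p = (unsigned __int128)(a & mask52) * (b & mask52);
  return acc + ((uint64_t)p & mask52);
}
static inline uint64_t madd52hi(uint64_t acc, uint64_t a, uint64_t b) {
  const uint64_t mask52 = (1ull << 52) - 1;
  unsigned __int128 p = (unsigned __int128)(a & mask52) * (b & mask52);
  return acc + (uint64_t)(p >> 52);
}
#endif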
203

204
// Compute product for a single 16-byte message block
205
// - Assumes that r = [r1 r0] is only 128 bits (not 130)
206
// - Input [a2 a1 a0]; when only128 is set, input is 128 bits (i.e. a2==0)
207
// - Output [a2 a1 a0] is at least 130 bits (i.e. a2 is used regardless of only128)
208
//
209
// Note 1: a2 here is only two bits so anything above is subject to reduction.
210
// Note 2: Constant c1 = 5xr1/4 = r1 + (r1 >> 2) (valid since the clamped r1 has its bottom two bits zero); it lets the reduced a1xr1 and a2xr1 terms be formed with fewer operations
211
//
212
// Flow of the code below is as follows:
213
//
214
//          a2        a1        a0
215
//        x           r1        r0
216
//   -----------------------------
217
//       a2xr0     a1xr0     a0xr0
218
//   +             a0xr1
219
//   +           5xa2xr1   5xa1xr1
220
//   -----------------------------
221
//     [0|L2L] [L1H|L1L] [L0H|L0L]
222
//
223
//   Registers:  t2:t1     t0:a0
224
//
225
// Completing the multiply and adding (with carry) 3x128-bit limbs into
226
// 192-bits again (3x64-bits):
227
// a0 = L0L
228
// a1 = L0H + L1L
229
// t2 = L1H + L2L
230
void StubGenerator::poly1305_multiply_scalar(
231
  const Register a0, const Register a1, const Register a2,
232
  const Register r0, const Register r1, const Register c1, bool only128,
233
  const Register t0, const Register t1, const Register t2,
234
  const Register mulql, const Register mulqh)
235
{
236
  // mulq instruction requires/clobbers rax, rdx (mulql, mulqh)
237

238
  // t2:t1 = (a0 * r1)
239
  __ movq(rax, r1);
240
  __ mulq(a0);
241
  __ movq(t1, rax);
242
  __ movq(t2, rdx);
243

244
  // t0:a0 = (a0 * r0)
245
  __ movq(rax, r0);
246
  __ mulq(a0);
247
  __ movq(a0, rax); // a0 not used in other operations
248
  __ movq(t0, rdx);
249

250
  // t2:t1 += (a1 * r0)
251
  __ movq(rax, r0);
252
  __ mulq(a1);
253
  __ addq(t1, rax);
254
  __ adcq(t2, rdx);
255

256
  // t0:a0 += (a1 * r1x5)
257
  __ movq(rax, c1);
258
  __ mulq(a1);
259
  __ addq(a0, rax);
260
  __ adcq(t0, rdx);
261

262
  // Note: a2 is clamped to 2-bits,
263
  //       r1/r0 is clamped to 60-bits,
264
  //       their product is less than 2^64.
265

266
  if (only128) { // Accumulator only 128 bits, i.e. a2 == 0
267
    // just move and add t0-t1 to a1
268
    __ movq(a1, t0);
269
    __ addq(a1, t1);
270
    __ adcq(t2, 0);
271
  } else {
272
    // t2:t1 += (a2 * r1x5)
273
    __ movq(a1, a2); // use a1 for a2
274
    __ imulq(a1, c1);
275
    __ addq(t1, a1);
276
    __ adcq(t2, 0);
277

278
    __ movq(a1, t0); // t0:a0 => a1:a0
279

280
    // t2:a1 += (a2 * r0):t1
281
    __ imulq(a2, r0);
282
    __ addq(a1, t1);
283
    __ adcq(t2, a2);
284
  }
285

286
  // At this point, 3 64-bit limbs are in t2:a1:a0
287
  // t2 can span more than 2 bits, so a final partial reduction step is needed.
288
  //
289
  // Partial reduction (just to fit into 130 bits)
290
  //    a2 = t2 & 3
291
  //    k = (t2 & ~3) + (t2 >> 2)
292
  //         Y    x4  +  Y    x1
293
  //    a2:a1:a0 += k
294
  //
295
  // Result will be in a2:a1:a0
296
  __ movq(t0, t2);
297
  __ movl(a2, t2); // DWORD
298
  __ andq(t0, ~3);
299
  __ shrq(t2, 2);
300
  __ addq(t0, t2);
301
  __ andl(a2, 3); // DWORD
302

303
  // a2:a1:a0 += k (kept in t0)
304
  __ addq(a0, t0);
305
  __ adcq(a1, 0);
306
  __ adcl(a2, 0); // DWORD
307
}
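// Illustrative sketch (not part of the stub): the final partial reduction above
// ('a2 = t2 & 3; k = (t2 & ~3) + (t2 >> 2); a2:a1:a0 += k') as scalar C++ with
// explicit carries. The guard macro POLY1305_STUB_NOTES is hypothetical.
#ifdef POLY1305_STUB_NOTES
static inline void poly1305_partial_reduce(uint64_t& a0, uint64_t& a1, uint64_t& a2) {
  uint64_t t2 = a2;
  uint64_t k  = (t2 & ~3ull) + (t2 >> 2);  // k = 5*(t2 >> 2): bits at and above 2^130, times 5
  a2 = t2 & 3;                             // keep accumulator bits 129..128
  uint64_t prev = a0;
  a0 += k;
  uint64_t carry = (a0 < prev) ? 1 : 0;    // addq/adcq/adcl carry chain
  prev = a1;
  a1 += carry;
  a2 += (a1 < prev) ? 1 : 0;
}
#endif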
308

309
// Convert array of 128-bit numbers in quadwords (in D0:D1) into 128-bit numbers across 44-bit limbs (in L0:L1:L2)
310
// Optionally pad all the numbers (i.e. add 2^128)
311
//
312
//         +-------------------------+-------------------------+
313
//  D0:D1  | h0 h1 g0 g1 f0 f1 e0 e1 | d0 d1 c0 c1 b0 b1 a0 a1 |
314
//         +-------------------------+-------------------------+
315
//         +-------------------------+
316
//  L2     | h2 d2 g2 c2 f2 b2 e2 a2 |
317
//         +-------------------------+
318
//         +-------------------------+
319
//  L1     | h1 d1 g1 c1 f1 b1 e1 a1 |
320
//         +-------------------------+
321
//         +-------------------------+
322
//  L0     | h0 d0 g0 c0 f0 b0 e0 a0 |
323
//         +-------------------------+
324
//
325
void StubGenerator::poly1305_limbs_avx512(
326
    const XMMRegister D0, const XMMRegister D1,
327
    const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG,
328
    const XMMRegister TMP, const Register rscratch)
329
{
330
  // Interleave blocks of data
331
  __ evpunpckhqdq(TMP, D0, D1, Assembler::AVX_512bit);
332
  __ evpunpcklqdq(L0, D0, D1, Assembler::AVX_512bit);
333

334
  // Highest 42-bit limbs of new blocks
335
  __ vpsrlq(L2, TMP, 24, Assembler::AVX_512bit);
336
  if (padMSG) {
337
    __ evporq(L2, L2, ExternalAddress(poly1305_pad_msg()), Assembler::AVX_512bit, rscratch); // Add 2^128 to all 8 final qwords of the message
338
  }
339

340
  // Middle 44-bit limbs of new blocks
341
  __ vpsrlq(L1, L0, 44, Assembler::AVX_512bit);
342
  __ vpsllq(TMP, TMP, 20, Assembler::AVX_512bit);
343
  __ vpternlogq(L1, 0xA8, TMP, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // (A OR B AND C)
344

345
  // Lowest 44-bit limbs of new blocks
346
  __ evpandq(L0, L0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch);
347
}
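// Illustrative sketch (not part of the stub): what poly1305_limbs_avx512 does for a
// single 16-byte block, in scalar C++: split the 128-bit value (lo, hi) into
// 44/44/42-bit limbs and optionally add the 2^128 pad bit (POLY1305_PAD_MSG is 2^40,
// i.e. 2^128 relative to the third limb's 2^88 weight). The guard macro
// POLY1305_STUB_NOTES is hypothetical.
#ifdef POLY1305_STUB_NOTES
static inline void poly1305_block_to_limbs(uint64_t lo, uint64_t hi, bool pad,
                                           uint64_t& l0, uint64_t& l1, uint64_t& l2) {
  const uint64_t mask44 = (1ull << 44) - 1;
  l0 = lo & mask44;                          // bits 0..43
  l1 = ((lo >> 44) | (hi << 20)) & mask44;   // bits 44..87
  l2 = hi >> 24;                             // bits 88..127
  if (pad) l2 |= (1ull << 40);               // add 2^128
}
#endif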
348

349
/**
350
 * Copy 5x26-bit (unreduced) limbs stored at Register limbs into  a2:a1:a0 (3x64-bit limbs)
351
 *
352
 * a2 is optional. When a2 == noreg, the limbs are expected to fit into 128 bits (i.e. a1:a0, such as the clamped R)
353
 */
354
void StubGenerator::poly1305_limbs(
355
    const Register limbs, const Register a0, const Register a1, const Register a2,
356
    const Register t0, const Register t1)
357
{
358
  __ movq(a0, Address(limbs, 0));
359
  __ movq(t0, Address(limbs, 8));
360
  __ shlq(t0, 26);
361
  __ addq(a0, t0);
362
  __ movq(t0, Address(limbs, 16));
363
  __ movq(t1, Address(limbs, 24));
364
  __ movq(a1, t0);
365
  __ shlq(t0, 52);
366
  __ shrq(a1, 12);
367
  __ shlq(t1, 14);
368
  __ addq(a0, t0);
369
  __ adcq(a1, t1);
370
  __ movq(t0, Address(limbs, 32));
371
  if (a2 != noreg) {
372
    __ movq(a2, t0);
373
    __ shrq(a2, 24);
374
  }
375
  __ shlq(t0, 40);
376
  __ addq(a1, t0);
377
  if (a2 != noreg) {
378
    __ adcq(a2, 0);
379

380
    // One round of reduction
381
    // Take bits above 130 in a2, multiply by 5 and add to a2:a1:a0
382
    __ movq(t0, a2);
383
    __ andq(t0, ~3);
384
    __ andq(a2, 3);
385
    __ movq(t1, t0);
386
    __ shrq(t1, 2);
387
    __ addq(t0, t1);
388

389
    __ addq(a0, t0);
390
    __ adcq(a1, 0);
391
    __ adcq(a2, 0);
392
  }
393
}
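/**
 * Illustrative sketch (not part of the stub): the same 5x26-bit -> 3x64-bit repacking
 * as scalar C++, assuming each input quadword holds one (possibly slightly unreduced)
 * 26-bit limb. Uses the GCC/Clang unsigned __int128 extension; the guard macro
 * POLY1305_STUB_NOTES is hypothetical.
 */
#ifdef POLY1305_STUB_NOTES
static inline void poly1305_limbs_to_u64(const uint64_t limbs[5],
                                         uint64_t& a0, uint64_t& a1, uint64_t& a2) {
  // value = l0 + l1*2^26 + l2*2^52 + l3*2^78 + l4*2^104
  unsigned __int128 lo = (unsigned __int128)limbs[0]
                       + ((unsigned __int128)limbs[1] << 26)
                       + ((unsigned __int128)limbs[2] << 52)
                       + ((unsigned __int128)limbs[3] << 78);
  unsigned __int128 hi = ((unsigned __int128)limbs[4] << 40)   // 2^104 = 2^64 * 2^40
                       + (lo >> 64);
  a0 = (uint64_t)lo;
  a1 = (uint64_t)hi;
  a2 = (uint64_t)(hi >> 64);   // a few bits at most; the stub reduces once more
}
#endif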
394

395
/**
396
 * Break 3x64-bit a2:a1:a0 limbs into 5x26-bit limbs and store out into 5 quadwords at address `limbs`
397
 */
398
void StubGenerator::poly1305_limbs_out(
399
    const Register a0, const Register a1, const Register a2,
400
    const Register limbs,
401
    const Register t0, const Register t1)
402
{
403
  // Extra round of reduction
404
  // Take bits above 130 in a2, multiply by 5 and add to a2:a1:a0
405
  __ movq(t0, a2);
406
  __ andq(t0, ~3);
407
  __ andq(a2, 3);
408
  __ movq(t1, t0);
409
  __ shrq(t1, 2);
410
  __ addq(t0, t1);
411

412
  __ addq(a0, t0);
413
  __ adcq(a1, 0);
414
  __ adcq(a2, 0);
415

416
  // Chop a2:a1:a0 into 26-bit limbs
417
  __ movl(t0, a0);
418
  __ andl(t0, 0x3ffffff);
419
  __ movq(Address(limbs, 0), t0);
420

421
  __ shrq(a0, 26);
422
  __ movl(t0, a0);
423
  __ andl(t0, 0x3ffffff);
424
  __ movq(Address(limbs, 8), t0);
425

426
  __ shrq(a0, 26); // 12 bits left in a0, concatenate 14 from a1
427
  __ movl(t0, a1);
428
  __ shll(t0, 12);
429
  __ addl(t0, a0);
430
  __ andl(t0, 0x3ffffff);
431
  __ movq(Address(limbs, 16), t0);
432

433
  __ shrq(a1, 14); // already used up 14 bits
434
  __ shlq(a2, 50); // a2 contains 2 bits when reduced, but $Element.limbs don't have to be fully reduced
435
  __ addq(a1, a2); // put remaining bits into a1
436

437
  __ movl(t0, a1);
438
  __ andl(t0, 0x3ffffff);
439
  __ movq(Address(limbs, 24), t0);
440

441
  __ shrq(a1, 26);
442
  __ movl(t0, a1);
443
  // andl(t0, 0x3ffffff); doesn't have to be fully reduced, leave remaining bit(s)
444
  __ movq(Address(limbs, 32), t0);
445
}
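/**
 * Illustrative sketch (not part of the stub): the reverse split done by
 * poly1305_limbs_out as scalar C++ (extra reduction round omitted). The guard macro
 * POLY1305_STUB_NOTES is hypothetical.
 */
#ifdef POLY1305_STUB_NOTES
static inline void poly1305_u64_to_limbs(uint64_t a0, uint64_t a1, uint64_t a2,
                                         uint64_t limbs[5]) {
  const uint64_t mask26 = (1ull << 26) - 1;
  limbs[0] = a0 & mask26;                          // bits 0..25
  limbs[1] = (a0 >> 26) & mask26;                  // bits 26..51
  limbs[2] = ((a0 >> 52) | (a1 << 12)) & mask26;   // bits 52..77
  limbs[3] = (a1 >> 14) & mask26;                  // bits 78..103
  limbs[4] = (a1 >> 40) | (a2 << 24);              // bits 104.. (may stay unreduced)
}
#endif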
446

447
// This function consumes as many whole 16*16-byte blocks as available in input
448
// After execution, input and length will point at remaining (unprocessed) data
449
// and [a2 a1 a0] will contain the current accumulator value
450
//
451
// Math Note:
452
//    Main loop in this function multiplies each message block by r^16, with some glue code before and after.
453
//    Proof (for brevity, split into 4 'rows' instead of 16):
454
//
455
//     hash = ((((m1*r + m2)*r + m3)*r ... + mn)*r
//          = m1*r^n + m2*r^(n-1) + ... + mn_1*r^2 + mn*r  // Horner's rule
//
//          = m1*r^n     + m5*r^(n-4) + m9*r^(n-8)   ...    // split into 4 groups for brevity, same applies to 16 blocks
//          + m2*r^(n-1) + m6*r^(n-5) + m10*r^(n-9)  ...
//          + m3*r^(n-2) + m7*r^(n-6) + m11*r^(n-10) ...
//          + m4*r^(n-3) + m8*r^(n-7) + m12*r^(n-11) ...
//
//          = r^4 * (m1*r^(n-4) + m5*r^(n-8) + m9 *r^(n-12) ... + mn_3)   // factor out r^4..r; same applies to 16 but r^16..r factors
//          + r^3 * (m2*r^(n-4) + m6*r^(n-8) + m10*r^(n-12) ... + mn_2)
//          + r^2 * (m3*r^(n-4) + m7*r^(n-8) + m11*r^(n-12) ... + mn_1)
//          + r^1 * (m4*r^(n-4) + m8*r^(n-8) + m12*r^(n-12) ... + mn_0)   // Note last column: last message of each group has no extra multiplier
//
//          = (((m1*r^4 + m5)*r^4 + m9 )*r^4 ... + mn_3) * r^4   // reverse Horner's rule, for each group
//          + (((m2*r^4 + m6)*r^4 + m10)*r^4 ... + mn_2) * r^3   // each column is multiplied by r^4, except last
//          + (((m3*r^4 + m7)*r^4 + m11)*r^4 ... + mn_1) * r^2
//          + (((m4*r^4 + m8)*r^4 + m12)*r^4 ... + mn_0) * r^1
//          // (A small scalar check of this regrouping is sketched just before the function definition below.)
472
//
473
// Also see M. Goll and S. Gueron, "Vectorization of Poly1305 Message Authentication Code"
474
//
475
// Pseudocode:
476
//  * used for poly1305_multiply_scalar
477
//  x used for poly1305_multiply8_avx512
478
//  lower-case variables are scalar numbers in 3x44-bit limbs (in gprs)
479
//  upper-case variables are 8- and 16-element vector numbers in 3x44-bit limbs (in zmm registers)
480
//
481
//    CL = a       // [0 0 0 0 0 0 0 a]
482
//    AL = poly1305_limbs_avx512(input)
483
//    AH = poly1305_limbs_avx512(input+8)
484
//    AL = AL + CL
485
//    input+=16, length-=16
486
//
487
//    a = r
488
//    a = a*r
489
//  r^2 = a
490
//    a = a*r
491
//  r^3 = a
492
//    a = a*r
493
//  r^4 = a
494
//
495
//    T  = r^4 || r^3 || r^2 || r
496
//    B  = limbs(T)           // [r^4  0  r^3  0  r^2  0  r^1  0 ]
497
//    CL = B >> 1             // [ 0  r^4  0  r^3  0  r^2  0  r^1]
498
//    R  = r^4 || r^4 || ..   // [r^4 r^4 r^4 r^4 r^4 r^4 r^4 r^4]
499
//    B  = BxR                // [r^8  0  r^7  0  r^6  0  r^5  0 ]
500
//    B  = B | CL             // [r^8 r^4 r^7 r^3 r^6 r^2 r^5 r^1]
501
//    CL = B
502
//    R  = r^8 || r^8 || ..   // [r^8 r^8 r^8 r^8 r^8 r^8 r^8 r^8]
503
//    B  = B x R              // [r^16 r^12 r^15 r^11 r^14 r^10 r^13 r^9]
504
//    CH = B
505
//    R = r^16 || r^16 || ..  // [r^16 r^16 r^16 r^16 r^16 r^16 r^16 r^16]
506
//
507
// for (;length>=16; input+=16, length-=16)
508
//     BL = poly1305_limbs_avx512(input)
509
//     BH = poly1305_limbs_avx512(input+8)
510
//     AL = AL x R
511
//     AH = AH x R
512
//     AL = AL + BL
513
//     AH = AH + BH
514
//
515
//  AL = AL x CL
516
//  AH = AH x CH
517
//  A = AL + AH // 16->8 blocks
518
//  T = A >> 4  // 8 ->4 blocks
519
//  A = A + T
520
//  T = A >> 2  // 4 ->2 blocks
521
//  A = A + T
522
//  T = A >> 1  // 2 ->1 blocks
523
//  A = A + T
524
//  a = A
525
//
526
// Register Map:
527
// GPRs:
528
//   input        = rdi
529
//   length       = rbx
530
//   accumulator  = rcx
531
//   R   = r8
532
//   a0  = rsi
533
//   a1  = r9
534
//   a2  = r10
535
//   r0  = r11
536
//   r1  = r12
537
//   c1  = r8;
538
//   t0  = r13
539
//   t1  = r14
540
//   t2  = r15
541
//   stack(rsp, rbp)
542
//   mulq(rax, rdx) in poly1305_multiply_scalar
543
//
544
// ZMMs:
545
//   D: xmm0-1
546
//   TMP: xmm2
547
//   T: xmm3-8
548
//   A: xmm9-14
549
//   B: xmm15-20
550
//   C: xmm21-26
551
//   R: xmm27-31
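// Illustrative sketch (not part of the stub): a scalar check of the k-way Horner
// regrouping used above, on a toy prime so plain uint64_t arithmetic suffices.
// All names are hypothetical and the guard macro POLY1305_STUB_NOTES is never defined.
#ifdef POLY1305_STUB_NOTES
static bool poly1305_check_4way_split(const uint64_t* m, int n) { // n > 0, multiple of 4, m[i] < p
  const uint64_t p = 65521;                 // toy prime standing in for 2^130 - 5
  const uint64_t r = 12345 % p;
  // Plain Horner: h = (((m1*r + m2)*r + m3)*r ... + mn)*r
  uint64_t h = 0;
  for (int i = 0; i < n; i++) h = (h + m[i]) % p * r % p;
  // 4-way split: four interleaved Horner sums in r^4, weighted by r^4..r^1
  uint64_t r2 = r * r % p, r3 = r2 * r % p, r4 = r2 * r2 % p;
  uint64_t s[4] = {0, 0, 0, 0};
  for (int i = 0; i < n; i += 4) {
    for (int j = 0; j < 4; j++) s[j] = (s[j] * r4 + m[i + j]) % p;
  }
  const uint64_t w[4] = { r4, r3, r2, r };
  uint64_t h4 = 0;
  for (int j = 0; j < 4; j++) h4 = (h4 + s[j] * w[j]) % p;
  return h == h4;                           // both groupings give the same hash
}
#endif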
552
void StubGenerator::poly1305_process_blocks_avx512(
553
    const Register input, const Register length,
554
    const Register a0, const Register a1, const Register a2,
555
    const Register r0, const Register r1, const Register c1)
556
{
557
  Label L_process256Loop, L_process256LoopDone;
558
  const Register t0 = r13;
559
  const Register t1 = r14;
560
  const Register t2 = r15;
561
  const Register mulql = rax;
562
  const Register mulqh = rdx;
563

564
  const XMMRegister D0 = xmm0;
565
  const XMMRegister D1 = xmm1;
566
  const XMMRegister TMP = xmm2;
567

568
  const XMMRegister T0 = xmm3;
569
  const XMMRegister T1 = xmm4;
570
  const XMMRegister T2 = xmm5;
571
  const XMMRegister T3 = xmm6;
572
  const XMMRegister T4 = xmm7;
573
  const XMMRegister T5 = xmm8;
574

575
  const XMMRegister A0 = xmm9;
576
  const XMMRegister A1 = xmm10;
577
  const XMMRegister A2 = xmm11;
578
  const XMMRegister A3 = xmm12;
579
  const XMMRegister A4 = xmm13;
580
  const XMMRegister A5 = xmm14;
581

582
  const XMMRegister B0 = xmm15;
583
  const XMMRegister B1 = xmm16;
584
  const XMMRegister B2 = xmm17;
585
  const XMMRegister B3 = xmm18;
586
  const XMMRegister B4 = xmm19;
587
  const XMMRegister B5 = xmm20;
588

589
  const XMMRegister C0 = xmm21;
590
  const XMMRegister C1 = xmm22;
591
  const XMMRegister C2 = xmm23;
592
  const XMMRegister C3 = xmm24;
593
  const XMMRegister C4 = xmm25;
594
  const XMMRegister C5 = xmm26;
595

596
  const XMMRegister R0 = xmm27;
597
  const XMMRegister R1 = xmm28;
598
  const XMMRegister R2 = xmm29;
599
  const XMMRegister R1P = xmm30;
600
  const XMMRegister R2P = xmm31;
601

602
  // Spread accumulator into 44-bit limbs in quadwords C0,C1,C2
603
  __ movq(t0, a0);
604
  __ andq(t0, ExternalAddress(poly1305_mask44()), t1 /*rscratch*/); // First limb (Acc[43:0])
605
  __ movq(C0, t0);
606

607
  __ movq(t0, a1);
608
  __ shrdq(a0, t0, 44);
609
  __ andq(a0, ExternalAddress(poly1305_mask44()), t1 /*rscratch*/); // Second limb (Acc[87:44])
610
  __ movq(C1, a0);
611

612
  __ shrdq(a1, a2, 24);
613
  __ andq(a1, ExternalAddress(poly1305_mask42()), t1 /*rscratch*/); // Third limb (Acc[129:88])
614
  __ movq(C2, a1);
615

616
  // To add accumulator, we must unroll first loop iteration
617

618
  // Load first block of data (128 bytes) and pad
619
  // A0 to have bits 0-43 of all 8 blocks in 8 qwords
620
  // A1 to have bits 87-44 of all 8 blocks in 8 qwords
621
  // A2 to have bits 127-88 of all 8 blocks in 8 qwords
622
  __ evmovdquq(D0, Address(input, 0), Assembler::AVX_512bit);
623
  __ evmovdquq(D1, Address(input, 64), Assembler::AVX_512bit);
624
  poly1305_limbs_avx512(D0, D1, A0, A1, A2, true, TMP, t1 /*rscratch*/);
625

626
  // Add accumulator to the first message block
627
  __ vpaddq(A0, A0, C0, Assembler::AVX_512bit);
628
  __ vpaddq(A1, A1, C1, Assembler::AVX_512bit);
629
  __ vpaddq(A2, A2, C2, Assembler::AVX_512bit);
630

631
  // Load next blocks of data (128 bytes)  and pad
632
  // A3 to have bits 0-43 of all 8 blocks in 8 qwords
633
  // A4 to have bits 87-44 of all 8 blocks in 8 qwords
634
  // A5 to have bits 127-88 of all 8 blocks in 8 qwords
635
  __ evmovdquq(D0, Address(input, 64*2), Assembler::AVX_512bit);
636
  __ evmovdquq(D1, Address(input, 64*3), Assembler::AVX_512bit);
637
  poly1305_limbs_avx512(D0, D1, A3, A4, A5, true, TMP, t1 /*rscratch*/);
638

639
  __ subl(length, 16*16);
640
  __ lea(input, Address(input,16*16));
641

642
  // Compute the powers of R^1..R^4 and form 44-bit limbs of each
643
  // T0 to have bits 0-127 in 4 quadword pairs
644
  // T1 to have bits 128-129 in alternating 8 qwords
645
  __ vpxorq(T1, T1, T1, Assembler::AVX_512bit);
646
  __ movq(T2, r0);
647
  __ vpinsrq(T2, T2, r1, 1);
648
  __ vinserti32x4(T0, T0, T2, 3);
649

650
  // Calculate R^2
651
  __ movq(a0, r0);
652
  __ movq(a1, r1);
653
  // "Clever": a2 not set because poly1305_multiply_scalar has a flag to indicate 128-bit accumulator
654
  poly1305_multiply_scalar(a0, a1, a2,
655
                           r0, r1, c1, true,
656
                           t0, t1, t2, mulql, mulqh);
657

658
  __ movq(T2, a0);
659
  __ vpinsrq(T2, T2, a1, 1);
660
  __ vinserti32x4(T0, T0, T2, 2);
661
  __ movq(T2, a2);
662
  __ vinserti32x4(T1, T1, T2, 2);
663

664
  // Calculate R^3
665
  poly1305_multiply_scalar(a0, a1, a2,
666
                           r0, r1, c1, false,
667
                           t0, t1, t2, mulql, mulqh);
668

669
  __ movq(T2, a0);
670
  __ vpinsrq(T2, T2, a1, 1);
671
  __ vinserti32x4(T0, T0, T2, 1);
672
  __ movq(T2, a2);
673
  __ vinserti32x4(T1, T1, T2, 1);
674

675
  // Calculate R^4
676
  poly1305_multiply_scalar(a0, a1, a2,
677
                           r0, r1, c1, false,
678
                           t0, t1, t2, mulql, mulqh);
679

680
  __ movq(T2, a0);
681
  __ vpinsrq(T2, T2, a1, 1);
682
  __ vinserti32x4(T0, T0, T2, 0);
683
  __ movq(T2, a2);
684
  __ vinserti32x4(T1, T1, T2, 0);
685

686
  // Interleave the powers of R^1..R^4 to form 44-bit limbs (half-empty)
687
  // B0 to have bits 0-43 of all 4 blocks in alternating 8 qwords
688
  // B1 to have bits 87-44 of all 4 blocks in alternating 8 qwords
689
  // B2 to have bits 127-88 of all 4 blocks in alternating 8 qwords
690
  __ vpxorq(T2, T2, T2, Assembler::AVX_512bit);
691
  poly1305_limbs_avx512(T0, T2, B0, B1, B2, false, TMP, t1 /*rscratch*/);
692

693
  // T1 contains the 2 highest bits of the powers of R
694
  __ vpsllq(T1, T1, 40, Assembler::AVX_512bit);
695
  __ evporq(B2, B2, T1, Assembler::AVX_512bit);
696

697
  // Broadcast 44-bit limbs of R^4 into R0,R1,R2
698
  __ mov(t0, a0);
699
  __ andq(t0, ExternalAddress(poly1305_mask44()), t1 /*rscratch*/); // First limb (R^4[43:0])
700
  __ evpbroadcastq(R0, t0, Assembler::AVX_512bit);
701

702
  __ movq(t0, a1);
703
  __ shrdq(a0, t0, 44);
704
  __ andq(a0, ExternalAddress(poly1305_mask44()), t1 /*rscratch*/); // Second limb (R^4[87:44])
705
  __ evpbroadcastq(R1, a0, Assembler::AVX_512bit);
706

707
  __ shrdq(a1, a2, 24);
708
  __ andq(a1, ExternalAddress(poly1305_mask42()), t1 /*rscratch*/); // Third limb (R^4[129:88])
709
  __ evpbroadcastq(R2, a1, Assembler::AVX_512bit);
710

711
  // Generate 4*5*R^4 into {R2P,R1P}
712
  // Used as multiplier in poly1305_multiply8_avx512 so can
713
  // ignore bottom limb and carry propagation
714
  __ vpsllq(R1P, R1, 2, Assembler::AVX_512bit);    // 4*R^4
715
  __ vpsllq(R2P, R2, 2, Assembler::AVX_512bit);
716
  __ vpaddq(R1P, R1P, R1, Assembler::AVX_512bit);  // 5*R^4
717
  __ vpaddq(R2P, R2P, R2, Assembler::AVX_512bit);
718
  __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit);   // 4*5*R^4
719
  __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit);
720

721
  // Move R^4..R^1 one element over
722
  __ vpslldq(C0, B0, 8, Assembler::AVX_512bit);
723
  __ vpslldq(C1, B1, 8, Assembler::AVX_512bit);
724
  __ vpslldq(C2, B2, 8, Assembler::AVX_512bit);
725

726
  // Calculate R^8-R^5
727
  poly1305_multiply8_avx512(B0, B1, B2,             // ACC=R^4..R^1
728
                            R0, R1, R2, R1P, R2P,   // R^4..R^4, 4*5*R^4
729
                            T0, T1, T2, T3, T4, T5, TMP, t1 /*rscratch*/);
730

731
  // Interleave powers of R: R^8 R^4 R^7 R^3 R^6 R^2 R^5 R
732
  __ evporq(B0, B0, C0, Assembler::AVX_512bit);
733
  __ evporq(B1, B1, C1, Assembler::AVX_512bit);
734
  __ evporq(B2, B2, C2, Assembler::AVX_512bit);
735

736
  // Store R^8-R for later use
737
  __ evmovdquq(C0, B0, Assembler::AVX_512bit);
738
  __ evmovdquq(C1, B1, Assembler::AVX_512bit);
739
  __ evmovdquq(C2, B2, Assembler::AVX_512bit);
740

741
  // Broadcast R^8
742
  __ vpbroadcastq(R0, B0, Assembler::AVX_512bit);
743
  __ vpbroadcastq(R1, B1, Assembler::AVX_512bit);
744
  __ vpbroadcastq(R2, B2, Assembler::AVX_512bit);
745

746
  // Generate 4*5*R^8
747
  __ vpsllq(R1P, R1, 2, Assembler::AVX_512bit);
748
  __ vpsllq(R2P, R2, 2, Assembler::AVX_512bit);
749
  __ vpaddq(R1P, R1P, R1, Assembler::AVX_512bit);    // 5*R^8
750
  __ vpaddq(R2P, R2P, R2, Assembler::AVX_512bit);
751
  __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit);     // 4*5*R^8
752
  __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit);
753

754
  // Calculate R^16-R^9
755
  poly1305_multiply8_avx512(B0, B1, B2,            // ACC=R^8..R^1
756
                            R0, R1, R2, R1P, R2P,  // R^8..R^8, 4*5*R^8
757
                            T0, T1, T2, T3, T4, T5, TMP, t1 /*rscratch*/);
758

759
  // Store R^16-R^9 for later use
760
  __ evmovdquq(C3, B0, Assembler::AVX_512bit);
761
  __ evmovdquq(C4, B1, Assembler::AVX_512bit);
762
  __ evmovdquq(C5, B2, Assembler::AVX_512bit);
763

764
  // Broadcast R^16
765
  __ vpbroadcastq(R0, B0, Assembler::AVX_512bit);
766
  __ vpbroadcastq(R1, B1, Assembler::AVX_512bit);
767
  __ vpbroadcastq(R2, B2, Assembler::AVX_512bit);
768

769
  // Generate 4*5*R^16
770
  __ vpsllq(R1P, R1, 2, Assembler::AVX_512bit);
771
  __ vpsllq(R2P, R2, 2, Assembler::AVX_512bit);
772
  __ vpaddq(R1P, R1P, R1, Assembler::AVX_512bit);  // 5*R^16
773
  __ vpaddq(R2P, R2P, R2, Assembler::AVX_512bit);
774
  __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit);   // 4*5*R^16
775
  __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit);
776

777
  // VECTOR LOOP: process 16 * 16-byte message block at a time
778
  __ bind(L_process256Loop);
779
  __ cmpl(length, 16*16);
780
  __ jcc(Assembler::less, L_process256LoopDone);
781

782
  // Load and interleave next block of data (128 bytes)
783
  __ evmovdquq(D0, Address(input, 0), Assembler::AVX_512bit);
784
  __ evmovdquq(D1, Address(input, 64), Assembler::AVX_512bit);
785
  poly1305_limbs_avx512(D0, D1, B0, B1, B2, true, TMP, t1 /*rscratch*/);
786

787
  // Load and interleave next block of data (128 bytes)
788
  __ evmovdquq(D0, Address(input, 64*2), Assembler::AVX_512bit);
789
  __ evmovdquq(D1, Address(input, 64*3), Assembler::AVX_512bit);
790
  poly1305_limbs_avx512(D0, D1, B3, B4, B5, true, TMP, t1 /*rscratch*/);
791

792
  poly1305_multiply8_avx512(A0, A1, A2,            // MSG/ACC 16 blocks
793
                            R0, R1, R2, R1P, R2P,  // R^16..R^16, 4*5*R^16
794
                            T0, T1, T2, T3, T4, T5, TMP, t1 /*rscratch*/);
795
  poly1305_multiply8_avx512(A3, A4, A5,            // MSG/ACC 16 blocks
796
                            R0, R1, R2, R1P, R2P,  // R^16..R^16, 4*5*R^16
797
                            T0, T1, T2, T3, T4, T5, TMP, t1 /*rscratch*/);
798

799
  __ vpaddq(A0, A0, B0, Assembler::AVX_512bit); // Add lowest 44-bit limbs from new blocks to accumulator
  __ vpaddq(A1, A1, B1, Assembler::AVX_512bit); // Add middle 44-bit limbs from new blocks to accumulator
  __ vpaddq(A2, A2, B2, Assembler::AVX_512bit); // Add highest 42-bit limbs from new blocks to accumulator
  __ vpaddq(A3, A3, B3, Assembler::AVX_512bit); // Add lowest 44-bit limbs from new blocks to accumulator
  __ vpaddq(A4, A4, B4, Assembler::AVX_512bit); // Add middle 44-bit limbs from new blocks to accumulator
  __ vpaddq(A5, A5, B5, Assembler::AVX_512bit); // Add highest 42-bit limbs from new blocks to accumulator
805

806
  __ subl(length, 16*16);
807
  __ lea(input, Address(input,16*16));
808
  __ jmp(L_process256Loop);
809

810
  __ bind(L_process256LoopDone);
811

812
  // Tail processing: Need to multiply ACC by R^16..R^1 and add it all up into a single scalar value
813
  // Generate 4*5*[R^16..R^9] (ignore lowest limb)
814
  // Use D0 ~ R1P, D1 ~ R2P for higher powers
815
  __ vpsllq(R1P, C4, 2, Assembler::AVX_512bit);
816
  __ vpsllq(R2P, C5, 2, Assembler::AVX_512bit);
817
  __ vpaddq(R1P, R1P, C4, Assembler::AVX_512bit);    // 5*[R^16..R^9]
818
  __ vpaddq(R2P, R2P, C5, Assembler::AVX_512bit);
819
  __ vpsllq(D0, R1P, 2, Assembler::AVX_512bit);      // 4*5*[R^16..R^9]
820
  __ vpsllq(D1, R2P, 2, Assembler::AVX_512bit);
821

822
  // Generate 4*5*[R^8..R^1] (ignore lowest limb)
823
  __ vpsllq(R1P, C1, 2, Assembler::AVX_512bit);
824
  __ vpsllq(R2P, C2, 2, Assembler::AVX_512bit);
825
  __ vpaddq(R1P, R1P, C1, Assembler::AVX_512bit);    // 5*[R^8..R^1]
826
  __ vpaddq(R2P, R2P, C2, Assembler::AVX_512bit);
827
  __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit);     // 4*5*[R^8..R^1]
828
  __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit);
829

830
  poly1305_multiply8_avx512(A0, A1, A2,            // MSG/ACC 16 blocks
831
                            C3, C4, C5, D0, D1,    // R^16-R^9, R1P, R2P
832
                            T0, T1, T2, T3, T4, T5, TMP, t1 /*rscratch*/);
833
  poly1305_multiply8_avx512(A3, A4, A5,            // MSG/ACC 16 blocks
834
                            C0, C1, C2, R1P, R2P,  // R^8-R, R1P, R2P
835
                            T0, T1, T2, T3, T4, T5, TMP, t1 /*rscratch*/);
836

837
  // Add all blocks (horizontally)
838
  // 16->8 blocks
839
  __ vpaddq(A0, A0, A3, Assembler::AVX_512bit);
840
  __ vpaddq(A1, A1, A4, Assembler::AVX_512bit);
841
  __ vpaddq(A2, A2, A5, Assembler::AVX_512bit);
842

843
  // 8 -> 4 blocks
844
  __ vextracti64x4(T0, A0, 1);
845
  __ vextracti64x4(T1, A1, 1);
846
  __ vextracti64x4(T2, A2, 1);
847
  __ vpaddq(A0, A0, T0, Assembler::AVX_256bit);
848
  __ vpaddq(A1, A1, T1, Assembler::AVX_256bit);
849
  __ vpaddq(A2, A2, T2, Assembler::AVX_256bit);
850

851
  // 4 -> 2 blocks
852
  __ vextracti32x4(T0, A0, 1);
853
  __ vextracti32x4(T1, A1, 1);
854
  __ vextracti32x4(T2, A2, 1);
855
  __ vpaddq(A0, A0, T0, Assembler::AVX_128bit);
856
  __ vpaddq(A1, A1, T1, Assembler::AVX_128bit);
857
  __ vpaddq(A2, A2, T2, Assembler::AVX_128bit);
858

859
  // 2 -> 1 blocks
860
  __ vpsrldq(T0, A0, 8, Assembler::AVX_128bit);
861
  __ vpsrldq(T1, A1, 8, Assembler::AVX_128bit);
862
  __ vpsrldq(T2, A2, 8, Assembler::AVX_128bit);
863

864
  // Finish folding and clear second qword
865
  __ mov64(t0, 0xfd);
866
  __ kmovql(k1, t0);
867
  __ evpaddq(A0, k1, A0, T0, false, Assembler::AVX_512bit);
868
  __ evpaddq(A1, k1, A1, T1, false, Assembler::AVX_512bit);
869
  __ evpaddq(A2, k1, A2, T2, false, Assembler::AVX_512bit);
870

871
  // Carry propagation
872
  __ vpsrlq(D0, A0, 44, Assembler::AVX_512bit);
873
  __ evpandq(A0, A0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, t1 /*rscratch*/); // Clear top 20 bits
874
  __ vpaddq(A1, A1, D0, Assembler::AVX_512bit);
875
  __ vpsrlq(D0, A1, 44, Assembler::AVX_512bit);
876
  __ evpandq(A1, A1, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, t1 /*rscratch*/); // Clear top 20 bits
877
  __ vpaddq(A2, A2, D0, Assembler::AVX_512bit);
878
  __ vpsrlq(D0, A2, 42, Assembler::AVX_512bit);
879
  __ evpandq(A2, A2, ExternalAddress(poly1305_mask42()), Assembler::AVX_512bit, t1 /*rscratch*/); // Clear top 22 bits
880
  __ vpsllq(D1, D0, 2, Assembler::AVX_512bit);
881
  __ vpaddq(D0, D0, D1, Assembler::AVX_512bit);
882
  __ vpaddq(A0, A0, D0, Assembler::AVX_512bit);
883

884
  // Put together A (accumulator)
885
  __ movq(a0, A0);
886

887
  __ movq(t0, A1);
888
  __ movq(t1, t0);
889
  __ shlq(t1, 44);
890
  __ shrq(t0, 20);
891

892
  __ movq(a2, A2);
893
  __ movq(a1, a2);
894
  __ shlq(a1, 24);
895
  __ shrq(a2, 40);
896

897
  __ addq(a0, t1);
898
  __ adcq(a1, t0);
899
  __ adcq(a2, 0);
900

901
  // Cleanup
902
  // Zero out zmm0-zmm31.
903
  __ vzeroall();
904
  for (XMMRegister rxmm = xmm16; rxmm->is_valid(); rxmm = rxmm->successor()) {
905
    __ vpxorq(rxmm, rxmm, rxmm, Assembler::AVX_512bit);
906
  }
907
}
908

909
// This function consumes as many whole 16-byte blocks as available in input
910
// After execution, input and length will point at remaining (unprocessed) data
911
// and accumulator will point to the current accumulator value
912
address StubGenerator::generate_poly1305_processBlocks() {
913
  __ align(CodeEntryAlignment);
914
  StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
915
  address start = __ pc();
916
  __ enter();
917

918
  // Save all 'SOE' registers
919
  __ push(rbx);
920
  #ifdef _WIN64
921
  __ push(rsi);
922
  __ push(rdi);
923
  #endif
924
  __ push(r12);
925
  __ push(r13);
926
  __ push(r14);
927
  __ push(r15);
928

929
  // Register Map
930
  const Register input        = rdi; // msg
931
  const Register length       = rbx; // msg length in bytes
932
  const Register accumulator  = rcx;
933
  const Register R            = r8;
934

935
  const Register a0 = rsi;  // [in/out] accumulator bits 63..0
936
  const Register a1 = r9;   // [in/out] accumulator bits 127..64
937
  const Register a2 = r10;  // [in/out] accumulator bits 191..128
938
  const Register r0 = r11;  // R constant bits 63..0
939
  const Register r1 = r12;  // R constant bits 127..64
940
  const Register c1 = r8;   // 5*R (upper limb only)
941
  const Register t0 = r13;
942
  const Register t1 = r14;
943
  const Register t2 = r15;
944
  const Register mulql = rax;
945
  const Register mulqh = rdx;
946

947
  // Normalize input
948
  // pseudo-signature: void poly1305_processBlocks(byte[] input, int length, int[5] accumulator, int[5] R)
949
  // input, a, r pointers point at first array element
950
  // java headers bypassed in LibraryCallKit::inline_poly1305_processBlocks
951
  #ifdef _WIN64
952
  // c_rarg0 - rcx
953
  // c_rarg1 - rdx
954
  // c_rarg2 - r8
955
  // c_rarg3 - r9
956
  __ mov(input, c_rarg0);
957
  __ mov(length, c_rarg1);
958
  __ mov(accumulator, c_rarg2);
959
  __ mov(R, c_rarg3);
960
  #else
961
  // c_rarg0 - rdi
962
  // c_rarg1 - rsi
963
  // c_rarg2 - rdx
964
  // c_rarg3 - rcx
965
  // don't clobber R, args copied out-of-order
966
  __ mov(length, c_rarg1);
967
  __ mov(R, c_rarg3);
968
  __ mov(accumulator, c_rarg2);
969
  #endif
970

971
  Label L_process16Loop, L_process16LoopDone;
972

973
  // Load R into r1:r0
974
  poly1305_limbs(R, r0, r1, noreg, t0, t1);
975

976
  // Compute 5*R (Upper limb only)
977
  __ movq(c1, r1);
978
  __ shrq(c1, 2);
979
  __ addq(c1, r1); // c1 = r1 + (r1 >> 2)
980

981
  // Load accumulator into a2:a1:a0
982
  poly1305_limbs(accumulator, a0, a1, a2, t0, t1);
983

984
  // VECTOR LOOP: Minimum of 256 bytes to run vectorized code
985
  __ cmpl(length, 16*16);
986
  __ jcc(Assembler::less, L_process16Loop);
987

988
  if (UseAVX > 2) {
989
    poly1305_process_blocks_avx512(input, length,
990
                                  a0, a1, a2,
991
                                  r0, r1, c1);
992
  } else {
993
    poly1305_process_blocks_avx2(input, length,
994
                                  a0, a1, a2,
995
                                  r0, r1, c1);
996
  }
997

998

999
  // SCALAR LOOP: process one 16-byte message block at a time
1000
  __ bind(L_process16Loop);
1001
  __ cmpl(length, 16);
1002
  __ jcc(Assembler::less, L_process16LoopDone);
1003

1004
  __ addq(a0, Address(input,0));
1005
  __ adcq(a1, Address(input,8));
1006
  __ adcq(a2,1);
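  // The immediate 1 above is the 2^128 pad bit for a full 16-byte block; adcq also
  // folds in the carry propagated from the two 64-bit additions of the message words.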
1007
  poly1305_multiply_scalar(a0, a1, a2,
1008
                           r0, r1, c1, false,
1009
                           t0, t1, t2, mulql, mulqh);
1010

1011
  __ subl(length, 16);
1012
  __ lea(input, Address(input,16));
1013
  __ jmp(L_process16Loop);
1014
  __ bind(L_process16LoopDone);
1015

1016
  // Write output
1017
  poly1305_limbs_out(a0, a1, a2, accumulator, t0, t1);
1018

1019
  __ pop(r15);
1020
  __ pop(r14);
1021
  __ pop(r13);
1022
  __ pop(r12);
1023
  #ifdef _WIN64
1024
  __ pop(rdi);
1025
  __ pop(rsi);
1026
  #endif
1027
  __ pop(rbx);
1028

1029
  __ leave();
1030
  __ ret(0);
1031
  return start;
1032
}
1033

1034
/*
1035
  The AVX2 implementation below is directly based on the AVX2 Poly1305 hash computation as
1036
  implemented in Intel(R) Multi-Buffer Crypto for IPsec Library.
1037
  (url: https://github.com/intel/intel-ipsec-mb/blob/main/lib/avx2_t3/poly_fma_avx2.asm)
1038

1039
  Additional references:
1040
  [1] Goll M, Gueron S., "Vectorization of Poly1305 message authentication code",
1041
      12th International Conference on Information Technology-New Generations,
1042
      2015 Apr 13 (pp. 145-150). IEEE.
1043
  [2] Bhattacharyya S, Sarkar P., "Improved SIMD implementation of Poly1305",
1044
      IET Information Security. 2020 Sep;14(5):521-30.
1045
  Note: a compact summary of the Goll-Gueron AVX2 algorithm developed in [1] is presented in [2].
1046
  [3] Wikipedia, "Parallel evaluation of Horner's method",
1047
      (url: https://en.wikipedia.org/wiki/Horner%27s_method)
1048
 ----------------------------------------------------------
1049

1050
  Poly1305 AVX2 algorithm:
1051
  Let the 32-byte one-time key be partitioned into two equal parts R and K.
1052
  Let R be the 16-byte secret key used for polynomial evaluation.
1053
  Let K be the 16-byte secret key that is added to the final hash (modulo 2^128).
1054
  Let Z_P be the prime field over which the polynomial is evaluated. Let P = 2^130 - 5 be the prime.
1055
  Let M be the message which can be represented as a concatenation (||) of 'l' 16-byte blocks M[i].
1056
  i.e., M = M[0] || M[1] || ... || M[i] || ... || M[l-2] || M[l-1]
1057
  To create the coefficients C[i] for polynomial evaluation over Z_P, each 16-byte (i.e., 128-bit)
1058
  message block M[i] is concatenated with bits '10' to make a 130-bit block.
1059
  The last block (<= 16-byte length) is concatenated with 1 followed by 0s to make a 130-bit block.
1060
  Therefore, we define
1061
  C[i]   = M[i] || '10' for 0 <= i <= l-2 ;
1062
  C[l-1] = M[l-1] || '10...0'
1063
  such that, length(C[i]) = 130 bits, for i ∈ [0, l).
1064

1065
  Let * indicate scalar multiplication (i.e., w = u * v);
1066
  Let × indicate scalar multiplication followed by reduction modulo P (i.e., z = u × v = {(u * v) mod P})
1067

1068
  POLY1305_MAC = (POLY1305_EVAL_POLYNOMIAL(C, R, P) + K) mod 2^128; where,
1069

1070
  POLY1305_EVAL_POLYNOMIAL(C, R, P) = {C[0] * R^l + C[1] * R^(l-1) + ... + C[l-2] * R^2 + C[l-1] * R} mod P
1071
    = R × {C[0] × R^(l-1) + C[1] × R^(l-2) + ... + C[l-2] × R + C[l-1]}
1072
    = R × Polynomial(R; C[0], C[1], ... ,C[l-2], C[l-1])
1073
  Where,
1074
  Polynomial(R; C[0], C[1], ... ,C[l-2], C[l-1]) = Σ{C[i] × R^(l-i-1)} for i ∈ [0, l)
1075
  ----------------------------------------------------------
1076

1077
  Parallel evaluation of POLY1305_EVAL_POLYNOMIAL(C, R, P):
1078
  Let the number of message blocks l = 4*l' + ρ where ρ = l mod 4.
1079
  Using k-way parallel Horner's evaluation [3], for k = 4, we define SUM below:
1080

1081
  SUM = R^4 × Polynomial(R^4; C[0], C[4], C[8]  ... , C[4l'-4]) +
1082
        R^3 × Polynomial(R^4; C[1], C[5], C[9]  ... , C[4l'-3]) +
1083
        R^2 × Polynomial(R^4; C[2], C[6], C[10] ... , C[4l'-2]) +
1084
        R^1 × Polynomial(R^4; C[3], C[7], C[11] ... , C[4l'-1])
1085

1086
  Then,
1087
  POLY1305_EVAL_POLYNOMIAL(C, R, P) = SUM if ρ = 0 (i.e., l is multiple of 4)
1088
                        = R × Polynomial(R; SUM + C[l-ρ], C[l-ρ+1], ... , C[l-1]) if ρ > 0
1089
  ----------------------------------------------------------
1090

1091
  Goll-Gueron[1] 4-way SIMD Algorithm[2] for POLY1305_EVAL_POLYNOMIAL(C, R, P):
1092

1093
  Define mathematical vectors (not same as SIMD vector lanes) as below:
1094
  R4321   = [R^4, R^3, R^2, R^1];
1095
  R4444   = [R^4, R^4, R^4, R^4];
1096
  COEF[i] = [C[4i], C[4i+1], C[4i+2], C[4i+3]] for i ∈ [0, l'). For example, COEF[0] and COEF[1] shown below.
1097
  COEF[0] = [C0, C1, C2, C3]
1098
  COEF[1] = [C4, C5, C6, C7]
1099
  T       = [T0, T1, T2, T3] be a temporary vector
1100
  ACC     = [acc, 0, 0, 0]; acc has hash from previous computations (if any), otherwise 0.
1101
  ⊗ indicates component-wise vector multiplication followed by modulo reduction
1102
  ⊕ indicates component-wise vector addition, + indicates scalar addition
1103

1104
  POLY1305_EVAL_POLYNOMIAL(C, R, P) {
1105
    T ← ACC; # load accumulator
1106
    T ← T ⊕ COEF[0]; # add accumulator to the first 4 blocks
1107
    Compute R4321, R4444;
1108
    # SIMD loop
1109
    l' = floor(l/4); # operate on 4 blocks at a time
1110
    for (i = 1 to l'-1):
1111
      T ← (R4444 ⊗ T) ⊕ COEF[i];
1112
    T ← R4321 ⊗ T;
1113
    SUM ← T0 + T1 + T2 + T3;
1114

1115
    # Scalar tail processing
1116
    if (ρ > 0):
1117
      SUM ← R × Polynomial(R; SUM + C[l-ρ], C[l-ρ+1], ... , C[l-1]);
1118
    return SUM;
1119
  }
1120

1121
  Notes:
1122
  (1) Each 130-bit block is represented using three 44-bit limbs (most significant limb is only 42-bit).
1123
      (The Goll-Gueron implementation[1] uses five 26-bit limbs instead).
1124
  (2) Each component of the mathematical vectors is a 130-bit value. The above mathematical vectors are not to be confused with SIMD vector lanes.
1125
  (3) Each AVX2 YMM register can store four 44-bit limbs in quadwords. Since each 130-bit message block is represented using 3 limbs,
1126
      to store all the limbs of 4 different 130-bit message blocks, we need 3 YMM registers in total.
1127
  (4) In the AVX2 implementation, multiplication followed by modulo reduction and addition are performed for 4 blocks at a time.
1128

1129
*/
1130

1131
void StubGenerator::poly1305_process_blocks_avx2(
1132
    const Register input, const Register length,
1133
    const Register a0, const Register a1, const Register a2,
1134
    const Register r0, const Register r1, const Register c1)
1135
{
1136
  Label L_process256Loop, L_process256LoopDone;
1137
  const Register t0 = r13;
1138
  const Register t1 = r14;
1139
  const Register t2 = r15;
1140
  const Register mulql = rax;
1141
  const Register mulqh = rdx;
1142

1143
  const XMMRegister YMM_ACC0 = xmm0;
1144
  const XMMRegister YMM_ACC1 = xmm1;
1145
  const XMMRegister YMM_ACC2 = xmm2;
1146

1147
  const XMMRegister YTMP1 = xmm3;
1148
  const XMMRegister YTMP2 = xmm4;
1149
  const XMMRegister YTMP3 = xmm5;
1150
  const XMMRegister YTMP4 = xmm6;
1151
  const XMMRegister YTMP5 = xmm7;
1152
  const XMMRegister YTMP6 = xmm8;
1153
  const XMMRegister YTMP7 = xmm9;
1154
  const XMMRegister YTMP8 = xmm10;
1155
  const XMMRegister YTMP9 = xmm11;
1156
  const XMMRegister YTMP10 = xmm12;
1157
  const XMMRegister YTMP11 = xmm13;
1158
  const XMMRegister YTMP12 = xmm14;
1159
  const XMMRegister YTMP13 = xmm15;
1160

1161
  const XMMRegister YMM_R0 = YTMP11;
1162
  const XMMRegister YMM_R1 = YTMP12;
1163
  const XMMRegister YMM_R2 = YTMP13;
1164

1165
  // XWORD aliases of YMM registers (for convenience)
1166
  const XMMRegister XTMP1 = YTMP1;
1167
  const XMMRegister XTMP2 = YTMP2;
1168
  const XMMRegister XTMP3 = YTMP3;
1169

1170
  // Setup stack frame
1171
  // Save rbp and rsp
1172
  __ push(rbp);
1173
  __ movq(rbp, rsp);
1174
  // Align stack and reserve space
1175
  __ andq(rsp, -32);
1176
  __ subptr(rsp, 32*8);
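  // Stack layout used below (8 x 32-byte slots, matching the 32*8 bytes reserved above):
  //   [rsp + 0   .. rsp + 95 ]  _r4_r1_save : interleaved limbs of R^4..R^1 (R4321)
  //   [rsp + 96  .. rsp + 191]  _r4_save    : broadcast limbs of R^4 (R4444)
  //   [rsp + 192 .. rsp + 255]  _r4p_save   : 4*5*R^4, upper two limbs only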
1177

1178
  /* Compute the following steps of POLY1305_EVAL_POLYNOMIAL algorithm
1179
    T ← ACC
1180
    T ← T ⊕ COEF[0];
1181
  */
1182

1183
  // Spread accumulator into 44-bit limbs in quadwords
1184
  // Accumulator limbs to be stored in YTMP1,YTMP2,YTMP3
1185
  // First limb (Acc[43:0])
1186
  __ movq(t0, a0);
1187
  __ andq(t0, ExternalAddress(poly1305_mask44()), t1 /*rscratch*/);
1188
  __ movq(XTMP1, t0);
1189
  // Second limb (Acc[87:44])
1190
  __ movq(t0, a1);
1191
  __ shrdq(a0, t0, 44);
1192
  __ andq(a0, ExternalAddress(poly1305_mask44()), t1 /*rscratch*/);
1193
  __ movq(XTMP2, a0);
1194
  // Third limb (Acc[129:88])
1195
  __ shrdq(a1, a2, 24);
1196
  __ andq(a1, ExternalAddress(poly1305_mask42()), t1 /*rscratch*/);
1197
  __ movq(XTMP3, a1);
1198
  // --- end of spread accumulator
1199

1200
  // To add accumulator, we must unroll first loop iteration
1201
  // Load first four 16-byte message blocks of data (64 bytes)
1202
  __ vmovdqu(YTMP4, Address(input, 0));
1203
  __ vmovdqu(YTMP5, Address(input, 32));
1204

1205
  // Interleave the input message data to form 44-bit limbs
1206
  // YMM_ACC0 to have bits 0-43 of all 4 blocks in 4 qwords
1207
  // YMM_ACC1 to have bits 87-44 of all 4 blocks in 4 qwords
1208
  // YMM_ACC2 to have bits 127-88 of all 4 blocks in 4 qwords
1209
  // Interleave blocks of data
1210
  __ vpunpckhqdq(YMM_ACC2, YTMP4, YTMP5, Assembler::AVX_256bit);
1211
  __ vpunpcklqdq(YMM_ACC0, YTMP4, YTMP5, Assembler::AVX_256bit);
1212

1213
  // Middle 44-bit limbs of new blocks
1214
  __ vpsrlq(YMM_ACC1, YMM_ACC0, 44, Assembler::AVX_256bit);
1215
  __ vpsllq(YTMP4, YMM_ACC2, 20, Assembler::AVX_256bit);
1216
  __ vpor(YMM_ACC1, YMM_ACC1, YTMP4, Assembler::AVX_256bit);
1217
  __ vpand(YMM_ACC1, YMM_ACC1, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, t1);
1218

1219
  // Lowest 44-bit limbs of new blocks
1220
  __ vpand(YMM_ACC0, YMM_ACC0, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, t1);
1221

1222
  // Highest 42-bit limbs of new blocks; pad the msg with 2^128
1223
  __ vpsrlq(YMM_ACC2, YMM_ACC2, 24, Assembler::AVX_256bit);
1224

1225
  // Add 2^128 to all 4 final qwords for the message
1226
  __ vpor(YMM_ACC2, YMM_ACC2, ExternalAddress(poly1305_pad_msg()), Assembler::AVX_256bit, t1);
1227
  // --- end of input interleaving and message padding
1228

1229
  // Add accumulator to the first message block
1230
  // Accumulator limbs in YTMP1,YTMP2,YTMP3
1231
  __ vpaddq(YMM_ACC0, YMM_ACC0, YTMP1, Assembler::AVX_256bit);
1232
  __ vpaddq(YMM_ACC1, YMM_ACC1, YTMP2, Assembler::AVX_256bit);
1233
  __ vpaddq(YMM_ACC2, YMM_ACC2, YTMP3, Assembler::AVX_256bit);
1234

1235
  /* Compute the following steps of POLY1305_EVAL_POLYNOMIAL algorithm
1236
    Compute R4321, R4444;
1237
    R4321   = [R^4, R^3, R^2, R^1];
1238
    R4444   = [R^4, R^4, R^4, R^4];
1239
  */
1240

1241
  // Compute the powers of R^1..R^4 and form 44-bit limbs of each
1242
  // YTMP5 to have bits 0-127 for R^1 and R^2
1243
  // YTMP6 to have bits 128-129 for R^1 and R^2
1244
  __ movq(XTMP1, r0);
1245
  __ vpinsrq(XTMP1, XTMP1, r1, 1);
1246
  __ vinserti128(YTMP5, YTMP5, XTMP1, 1);
1247
  // clear registers
1248
  __ vpxor(YTMP10, YTMP10, YTMP10, Assembler::AVX_256bit);
1249
  __ vpxor(YTMP6, YTMP6, YTMP6, Assembler::AVX_256bit);
1250

1251
  // Calculate R^2
1252
  // a ← R
1253
  __ movq(a0, r0);
1254
  __ movq(a1, r1);
1255
  // a ← a * R = R^2
1256
  poly1305_multiply_scalar(a0, a1, a2,
1257
                           r0, r1, c1, true,
1258
                           t0, t1, t2, mulql, mulqh);
1259
  // Store R^2 in YTMP5, YTMP6
1260
  __ movq(XTMP1, a0);
1261
  __ vpinsrq(XTMP1, XTMP1, a1, 1);
1262
  __ vinserti128(YTMP5, YTMP5, XTMP1, 0);
1263
  __ movq(XTMP1, a2);
1264
  __ vinserti128(YTMP6, YTMP6, XTMP1, 0);
1265

1266
  // Calculate R^3
1267
  // a ← a * R = R^3
1268
  poly1305_multiply_scalar(a0, a1, a2,
1269
                           r0, r1, c1, false,
1270
                           t0, t1, t2, mulql, mulqh);
1271
  // Store R^3 in YTMP7, YTMP2
1272
  __ movq(XTMP1, a0);
1273
  __ vpinsrq(XTMP1, XTMP1, a1, 1);
1274
  __ vinserti128(YTMP7, YTMP7, XTMP1, 1);
1275
  __ movq(XTMP1, a2);
1276
  __ vinserti128(YTMP2, YTMP2, XTMP1, 1);
1277

1278
  // Calculate R^4
1279
  // a ← a * R = R^4
1280
  poly1305_multiply_scalar(a0, a1, a2,
1281
                           r0, r1, c1, false,
1282
                           t0, t1, t2, mulql, mulqh);
1283
  // Store R^4 in YTMP7, YTMP2
1284
  __ movq(XTMP1, a0);
1285
  __ vpinsrq(XTMP1, XTMP1, a1, 1);
1286
  __ vinserti128(YTMP7, YTMP7, XTMP1, 0);
1287
  __ movq(XTMP1, a2);
1288
  __ vinserti128(YTMP2, YTMP2, XTMP1, 0);
1289

1290
  // Interleave the powers of R^1..R^4 to form 44-bit limbs (half-empty)
1291
  __ vpunpckhqdq(YMM_R2, YTMP5, YTMP10, Assembler::AVX_256bit);
1292
  __ vpunpcklqdq(YMM_R0, YTMP5, YTMP10, Assembler::AVX_256bit);
1293
  __ vpunpckhqdq(YTMP3, YTMP7, YTMP10, Assembler::AVX_256bit);
1294
  __ vpunpcklqdq(YTMP4, YTMP7, YTMP10, Assembler::AVX_256bit);
1295

1296
  __ vpslldq(YMM_R2, YMM_R2, 8, Assembler::AVX_256bit);
1297
  __ vpslldq(YTMP6, YTMP6, 8, Assembler::AVX_256bit);
1298
  __ vpslldq(YMM_R0, YMM_R0, 8, Assembler::AVX_256bit);
1299
  __ vpor(YMM_R2, YMM_R2, YTMP3, Assembler::AVX_256bit);
1300
  __ vpor(YMM_R0, YMM_R0, YTMP4, Assembler::AVX_256bit);
1301
  __ vpor(YTMP6, YTMP6, YTMP2, Assembler::AVX_256bit);
1302
  // Move 2 MSbits to top 24 bits, to be OR'ed later
1303
  __ vpsllq(YTMP6, YTMP6, 40, Assembler::AVX_256bit);
1304

1305
  __ vpsrlq(YMM_R1, YMM_R0, 44, Assembler::AVX_256bit);
1306
  __ vpsllq(YTMP5, YMM_R2, 20, Assembler::AVX_256bit);
1307
  __ vpor(YMM_R1, YMM_R1, YTMP5, Assembler::AVX_256bit);
1308
  __ vpand(YMM_R1, YMM_R1, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, t1);
1309

1310
  __ vpand(YMM_R0, YMM_R0, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, t1);
1311
  __ vpsrlq(YMM_R2, YMM_R2, 24, Assembler::AVX_256bit);
1312

1313
  __ vpor(YMM_R2, YMM_R2, YTMP6, Assembler::AVX_256bit);
1314
  // YMM_R0, YMM_R1, YMM_R2 have the limbs of R^1, R^2, R^3, R^4
1315

1316
  // Store R^4-R on stack for later use
1317
  int _r4_r1_save = 0;
1318
  __ vmovdqu(Address(rsp, _r4_r1_save + 0), YMM_R0);
1319
  __ vmovdqu(Address(rsp, _r4_r1_save + 32), YMM_R1);
1320
  __ vmovdqu(Address(rsp, _r4_r1_save + 32*2), YMM_R2);
1321

1322
  // Broadcast 44-bit limbs of R^4
1323
  __ mov(t0, a0);
1324
  __ andq(t0, ExternalAddress(poly1305_mask44()), t1 /*rscratch*/); // First limb (R^4[43:0])
1325
  __ movq(YMM_R0, t0);
1326
  __ vpermq(YMM_R0, YMM_R0, 0x0, Assembler::AVX_256bit);
1327

1328
  __ movq(t0, a1);
1329
  __ shrdq(a0, t0, 44);
1330
  __ andq(a0, ExternalAddress(poly1305_mask44()), t1 /*rscratch*/); // Second limb (R^4[87:44])
1331
  __ movq(YMM_R1, a0);
1332
  __ vpermq(YMM_R1, YMM_R1, 0x0, Assembler::AVX_256bit);
1333

1334
  __ shrdq(a1, a2, 24);
1335
  __ andq(a1, ExternalAddress(poly1305_mask42()), t1 /*rscratch*/); // Third limb (R^4[129:88])
1336
  __ movq(YMM_R2, a1);
1337
  __ vpermq(YMM_R2, YMM_R2, 0x0, Assembler::AVX_256bit);
1338
  // YMM_R0, YMM_R1, YMM_R2 have the limbs of R^4, R^4, R^4, R^4
1339

1340
  // Generate 4*5*R^4
1341
  // 4*R^4
1342
  __ vpsllq(YTMP1, YMM_R1, 2, Assembler::AVX_256bit);
1343
  __ vpsllq(YTMP2, YMM_R2, 2, Assembler::AVX_256bit);
1344
  // 5*R^4
1345
  __ vpaddq(YTMP1, YTMP1, YMM_R1, Assembler::AVX_256bit);
1346
  __ vpaddq(YTMP2, YTMP2, YMM_R2, Assembler::AVX_256bit);
1347
  // 4*5*R^4
1348
  __ vpsllq(YTMP1, YTMP1, 2, Assembler::AVX_256bit);
1349
  __ vpsllq(YTMP2, YTMP2, 2, Assembler::AVX_256bit);
1350

1351
  // Store broadcast R^4 and 4*5*R^4 on the stack for later use
1352
  int _r4_save = 32*3;
1353
  int _r4p_save = 32*6;
1354
  __ vmovdqu(Address(rsp, _r4_save + 0), YMM_R0);
1355
  __ vmovdqu(Address(rsp, _r4_save + 32), YMM_R1);
1356
  __ vmovdqu(Address(rsp, _r4_save + 32*2), YMM_R2);
1357
  __ vmovdqu(Address(rsp, _r4p_save), YTMP1);
1358
  __ vmovdqu(Address(rsp, _r4p_save + 32), YTMP2);
1359

1360
  // Get the number of multiples of 4 message blocks (64 bytes) for vectorization
1361
  __ movq(t0, length);
1362
  __ andq(t0, 0xffffffc0); // 0xffffffffffffffc0 after sign extension
1363

1364
  // VECTOR LOOP: process 4 * 16-byte message blocks at a time
1365
  __ bind(L_process256Loop);
1366
  __ cmpl(t0, 16*4); //64 bytes (4 blocks at a time)
1367
  __ jcc(Assembler::belowEqual, L_process256LoopDone);
1368

1369
  /*
1370
    Compute the following steps of POLY1305_EVAL_POLYNOMIAL algorithm
1371
    l' = floor(l/4)
1372
    for (i = 1 to l'-1):
1373
      T ← (R4444 ⊗ T) ⊕ COEF[i];
1374
  */
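  // Illustrative sketch (not compiled) of the 4-lane recurrence assumed here; the accumulator
  // T = [t0 t1 t2 t3] holds four running sums, all arithmetic is mod 2^130-5, and the lane
  // ordering is illustrative:
  //   for (i = 1; i < l_prime; i++)               // l_prime = floor(l/4)
  //     for (lane = 0; lane < 4; lane++)
  //       t[lane] = t[lane] * r4 + block[4*i + lane];   // r4 = R^4 broadcast to every lane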

  // Perform multiply and reduce while loading the next 4 blocks and adding them in an interleaved manner
  // The logic to advance the SIMD loop counter (i.e. length -= 64) is inside the function below.
  // The function below also includes the logic to load the next 4 blocks of data for efficient port utilization.
  poly1305_msg_mul_reduce_vec4_avx2(YMM_ACC0, YMM_ACC1, YMM_ACC2,
                          Address(rsp, _r4_save + 0), Address(rsp, _r4_save + 32), Address(rsp, _r4_save + 32*2),
                          Address(rsp, _r4p_save), Address(rsp, _r4p_save + 32),
                          YTMP1, YTMP2, YTMP3, YTMP4, YTMP5, YTMP6,
                          YTMP7, YTMP8, YTMP9, YTMP10, YTMP11, YTMP12,
                          input, t0, t1 /*rscratch*/);
  __ jmp(L_process256Loop);
  // end of vector loop
  __ bind(L_process256LoopDone);

  /*
    Compute the following steps of POLY1305_EVAL_POLYNOMIAL algorithm
    T ← R4321 ⊗ T;
  */

  // Need to multiply by R^4, R^3, R^2, R
  // Read R^4..R from the stack
  __ vmovdqu(YMM_R0, Address(rsp, _r4_r1_save + 0));
  __ vmovdqu(YMM_R1, Address(rsp, _r4_r1_save + 32));
  __ vmovdqu(YMM_R2, Address(rsp, _r4_r1_save + 32*2));

  // Generate 4*5*[R^4..R^1] (ignore lowest limb)
  // YTMP1 to have bits 87-44 of all 1-4th powers of R' in 4 qwords
  // YTMP2 to have bits 129-88 of all 1-4th powers of R' in 4 qwords
  __ vpsllq(YTMP10, YMM_R1, 2, Assembler::AVX_256bit);
  __ vpaddq(YTMP1, YMM_R1, YTMP10, Assembler::AVX_256bit);  // R1' (R1*5)
  __ vpsllq(YTMP10, YMM_R2, 2, Assembler::AVX_256bit);
  __ vpaddq(YTMP2, YMM_R2, YTMP10, Assembler::AVX_256bit);  // R2' (R2*5)

  // 4*5*R
  __ vpsllq(YTMP1, YTMP1, 2, Assembler::AVX_256bit);
  __ vpsllq(YTMP2, YTMP2, 2, Assembler::AVX_256bit);

  poly1305_mul_reduce_vec4_avx2(YMM_ACC0, YMM_ACC1, YMM_ACC2,
                          YMM_R0, YMM_R1, YMM_R2, YTMP1, YTMP2,
                          YTMP3, YTMP4, YTMP5, YTMP6,
                          YTMP7, YTMP8, YTMP9, t1);
  /*
    Compute the following steps of POLY1305_EVAL_POLYNOMIAL algorithm
    SUM ← T0 + T1 + T2 + T3;
  */
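  // Illustrative sketch (not compiled) of the horizontal reduction performed below, applied to
  // each of the three limb registers:
  //   [t0 t1 t2 t3] -> [t0+t2, t1+t3]   // extract upper 128 bits, add to lower 128 bits
  //   [s0 s1]       -> s0+s1            // shift upper qword down, add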

  // 4 -> 2 blocks
  __ vextracti128(YTMP1, YMM_ACC0, 1);
  __ vextracti128(YTMP2, YMM_ACC1, 1);
  __ vextracti128(YTMP3, YMM_ACC2, 1);

  __ vpaddq(YMM_ACC0, YMM_ACC0, YTMP1, Assembler::AVX_128bit);
  __ vpaddq(YMM_ACC1, YMM_ACC1, YTMP2, Assembler::AVX_128bit);
  __ vpaddq(YMM_ACC2, YMM_ACC2, YTMP3, Assembler::AVX_128bit);
  // 2 -> 1 blocks
  __ vpsrldq(YTMP1, YMM_ACC0, 8, Assembler::AVX_128bit);
  __ vpsrldq(YTMP2, YMM_ACC1, 8, Assembler::AVX_128bit);
  __ vpsrldq(YTMP3, YMM_ACC2, 8, Assembler::AVX_128bit);

  // Finish folding
  __ vpaddq(YMM_ACC0, YMM_ACC0, YTMP1, Assembler::AVX_128bit);
  __ vpaddq(YMM_ACC1, YMM_ACC1, YTMP2, Assembler::AVX_128bit);
  __ vpaddq(YMM_ACC2, YMM_ACC2, YTMP3, Assembler::AVX_128bit);

  // Keep only the lowest 64-bit lane of each limb (movq zero-extends the destination)
  __ movq(YMM_ACC0, YMM_ACC0);
  __ movq(YMM_ACC1, YMM_ACC1);
  __ movq(YMM_ACC2, YMM_ACC2);

  __ lea(input, Address(input,16*4));
  __ andq(length, 63); // remaining bytes < 64
  // carry propagation
  __ vpsrlq(YTMP1, YMM_ACC0, 44, Assembler::AVX_128bit);
  __ vpand(YMM_ACC0, YMM_ACC0, ExternalAddress(poly1305_mask44()), Assembler::AVX_128bit, t1); // Clear top 20 bits
  __ vpaddq(YMM_ACC1, YMM_ACC1, YTMP1, Assembler::AVX_128bit);
  __ vpsrlq(YTMP1, YMM_ACC1, 44, Assembler::AVX_128bit);
  __ vpand(YMM_ACC1, YMM_ACC1, ExternalAddress(poly1305_mask44()), Assembler::AVX_128bit, t1); // Clear top 20 bits
  __ vpaddq(YMM_ACC2, YMM_ACC2, YTMP1, Assembler::AVX_128bit);
  __ vpsrlq(YTMP1, YMM_ACC2, 42, Assembler::AVX_128bit);
  __ vpand(YMM_ACC2, YMM_ACC2, ExternalAddress(poly1305_mask42()), Assembler::AVX_128bit, t1); // Clear top 22 bits
  __ vpsllq(YTMP2, YTMP1, 2, Assembler::AVX_128bit);
  __ vpaddq(YTMP1, YTMP1, YTMP2, Assembler::AVX_128bit);
  __ vpaddq(YMM_ACC0, YMM_ACC0, YTMP1, Assembler::AVX_128bit);
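
  // Illustrative scalar sketch (not compiled) of the carry propagation just performed:
  //   c = l0 >> 44;  l0 &= MASK44;  l1 += c;
  //   c = l1 >> 44;  l1 &= MASK44;  l2 += c;
  //   c = l2 >> 42;  l2 &= MASK42;  l0 += c * 5;   // bits above 2^130 wrap back as 5*c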

  // Put together A
  __ movq(a0, YMM_ACC0);
  __ movq(t0, YMM_ACC1);
  __ movq(t1, t0);
  __ shlq(t1, 44);
  __ orq(a0, t1);
  __ shrq(t0, 20);
  __ movq(a2, YMM_ACC2);
  __ movq(a1, a2);
  __ shlq(a1, 24);
  __ orq(a1, t0);
  __ shrq(a2, 40);
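
  // Illustrative sketch (not compiled) of the recombination above, from 44/44/42-bit limbs
  // (l0, l1, l2) back to the a2:a1:a0 representation used by the scalar code:
  //   a0 =  l0        | (l1 << 44);
  //   a1 = (l1 >> 20) | (l2 << 24);
  //   a2 =  l2 >> 40;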

  // cleanup
  __ vzeroall(); // clears all ymm registers (ymm0 through ymm15)

  // SAFE DATA (clear powers of R)
  __ vmovdqu(Address(rsp, _r4_r1_save + 0), YTMP1);
  __ vmovdqu(Address(rsp, _r4_r1_save + 32), YTMP1);
  __ vmovdqu(Address(rsp, _r4_r1_save + 32*2), YTMP1);
  __ vmovdqu(Address(rsp, _r4_save + 0), YTMP1);
  __ vmovdqu(Address(rsp, _r4_save + 32), YTMP1);
  __ vmovdqu(Address(rsp, _r4_save + 32*2), YTMP1);
  __ vmovdqu(Address(rsp, _r4p_save), YTMP1);
  __ vmovdqu(Address(rsp, _r4p_save + 32), YTMP1);

  // Restore rsp and rbp; tear down the stack frame
  __ movq(rsp, rbp);
  __ pop(rbp);
}

// Compute component-wise product for 4 16-byte message blocks,
// i.e. for each block, compute [a2 a1 a0] = [a2 a1 a0] x [r2 r1 r0]
//
// Each block/number is represented by 3 44-bit limb digits; start with the multiplication
//
//      a2       a1       a0
// x    r2       r1       r0
// ----------------------------------
//     a2xr0    a1xr0    a0xr0
// +   a1xr1    a0xr1  5xa2xr1'     (r1' = r1<<2)
// +   a0xr2  5xa2xr2' 5xa1xr2'     (r2' = r2<<2)
// ----------------------------------
//        p2       p1       p0
//
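// Illustrative per-lane sketch (not compiled) of the products accumulated below; each term is
// accumulated in low/high 52-bit halves by vpmadd52luq/vpmadd52huq, with r1p = 4*5*r1 and
// r2p = 4*5*r2 folding the wrap at 2^130:
//   p0 = a0*r0 + a1*r2p + a2*r1p;
//   p1 = a0*r1 + a1*r0  + a2*r2p;
//   p2 = a0*r2 + a1*r1  + a2*r0;
// followed by the 44/44/42-bit carry propagation implemented in the two passes below.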
void StubGenerator::poly1305_mul_reduce_vec4_avx2(
  const XMMRegister A0, const XMMRegister A1, const XMMRegister A2,
  const XMMRegister R0, const XMMRegister R1, const XMMRegister R2,
  const XMMRegister R1P, const XMMRegister R2P,
  const XMMRegister P0L, const XMMRegister P0H,
  const XMMRegister P1L, const XMMRegister P1H,
  const XMMRegister P2L, const XMMRegister P2H,
  const XMMRegister YTMP1, const Register rscratch)
{
  // Reset accumulator
  __ vpxor(P0L, P0L, P0L, Assembler::AVX_256bit);
  __ vpxor(P0H, P0H, P0H, Assembler::AVX_256bit);
  __ vpxor(P1L, P1L, P1L, Assembler::AVX_256bit);
  __ vpxor(P1H, P1H, P1H, Assembler::AVX_256bit);
  __ vpxor(P2L, P2L, P2L, Assembler::AVX_256bit);
  __ vpxor(P2H, P2H, P2H, Assembler::AVX_256bit);

  // Calculate partial products
  // p0 = a2xr1'
  // p1 = a2xr2'
  // p0 += a0xr0
  __ vpmadd52luq(P0L, A2, R1P, Assembler::AVX_256bit);
  __ vpmadd52huq(P0H, A2, R1P, Assembler::AVX_256bit);

  __ vpmadd52luq(P1L, A2, R2P, Assembler::AVX_256bit);
  __ vpmadd52huq(P1H, A2, R2P, Assembler::AVX_256bit);

  __ vpmadd52luq(P0L, A0, R0, Assembler::AVX_256bit);
  __ vpmadd52huq(P0H, A0, R0, Assembler::AVX_256bit);

  // p2 = a2xr0
  // p1 += a0xr1
  // p0 += a1xr2'
  // p2 += a0xr2
  __ vpmadd52luq(P2L, A2, R0, Assembler::AVX_256bit);
  __ vpmadd52huq(P2H, A2, R0, Assembler::AVX_256bit);

  __ vpmadd52luq(P1L, A0, R1, Assembler::AVX_256bit);
  __ vpmadd52huq(P1H, A0, R1, Assembler::AVX_256bit);

  __ vpmadd52luq(P0L, A1, R2P, Assembler::AVX_256bit);
  __ vpmadd52huq(P0H, A1, R2P, Assembler::AVX_256bit);

  __ vpmadd52luq(P2L, A0, R2, Assembler::AVX_256bit);
  __ vpmadd52huq(P2H, A0, R2, Assembler::AVX_256bit);

  // Carry propagation (first pass)
  __ vpsrlq(YTMP1, P0L, 44, Assembler::AVX_256bit);
  __ vpsllq(P0H, P0H, 8, Assembler::AVX_256bit);
  __ vpmadd52luq(P1L, A1, R0, Assembler::AVX_256bit);
  __ vpmadd52huq(P1H, A1, R0, Assembler::AVX_256bit);
  // Carry propagation (first pass) - continue
  __ vpand(A0, P0L, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, rscratch); // Clear top 20 bits
  __ vpaddq(P0H, P0H, YTMP1, Assembler::AVX_256bit);
  __ vpmadd52luq(P2L, A1, R1, Assembler::AVX_256bit);
  __ vpmadd52huq(P2H, A1, R1, Assembler::AVX_256bit);

  // Carry propagation (first pass) - continue 2
  __ vpaddq(P1L, P1L, P0H, Assembler::AVX_256bit);
  __ vpsllq(P1H, P1H, 8, Assembler::AVX_256bit);
  __ vpsrlq(YTMP1, P1L, 44, Assembler::AVX_256bit);
  __ vpand(A1, P1L, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, rscratch); // Clear top 20 bits

  __ vpaddq(P2L, P2L, P1H, Assembler::AVX_256bit);
  __ vpaddq(P2L, P2L, YTMP1, Assembler::AVX_256bit);
  __ vpand(A2, P2L, ExternalAddress(poly1305_mask42()), Assembler::AVX_256bit, rscratch); // Clear top 22 bits
  __ vpsrlq(YTMP1, P2L, 42, Assembler::AVX_256bit);
  __ vpsllq(P2H, P2H, 10, Assembler::AVX_256bit);
  __ vpaddq(P2H, P2H, YTMP1, Assembler::AVX_256bit);

  // Carry propagation (second pass)
  // Multiply by 5 the highest bits (above 130 bits)
  __ vpaddq(A0, A0, P2H, Assembler::AVX_256bit);
  __ vpsllq(P2H, P2H, 2, Assembler::AVX_256bit);
  __ vpaddq(A0, A0, P2H, Assembler::AVX_256bit);

  __ vpsrlq(YTMP1, A0, 44, Assembler::AVX_256bit);
  __ vpand(A0, A0, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, rscratch); // Clear top 20 bits
  __ vpaddq(A1, A1, YTMP1, Assembler::AVX_256bit);
}

// Compute component-wise product for 4 16-byte message blocks and add the next 4 blocks,
// i.e. for each block, compute [a2 a1 a0] = [a2 a1 a0] x [r2 r1 r0],
// followed by [a2 a1 a0] += [n2 n1 n0], where n contains the next 4 blocks of the message.
//
// Each block/number is represented by 3 44-bit limb digits; start with the multiplication
//
//      a2       a1       a0
// x    r2       r1       r0
// ----------------------------------
//     a2xr0    a1xr0    a0xr0
// +   a1xr1    a0xr1  5xa2xr1'     (r1' = r1<<2)
// +   a0xr2  5xa2xr2' 5xa1xr2'     (r2' = r2<<2)
// ----------------------------------
//        p2       p1       p0
//
void StubGenerator::poly1305_msg_mul_reduce_vec4_avx2(
  const XMMRegister A0, const XMMRegister A1, const XMMRegister A2,
  const Address R0, const Address R1, const Address R2,
  const Address R1P, const Address R2P,
  const XMMRegister P0L, const XMMRegister P0H,
  const XMMRegister P1L, const XMMRegister P1H,
  const XMMRegister P2L, const XMMRegister P2H,
  const XMMRegister YTMP1, const XMMRegister YTMP2,
  const XMMRegister YTMP3, const XMMRegister YTMP4,
  const XMMRegister YTMP5, const XMMRegister YTMP6,
  const Register input, const Register length, const Register rscratch)
{
  // Reset accumulator
  __ vpxor(P0L, P0L, P0L, Assembler::AVX_256bit);
  __ vpxor(P0H, P0H, P0H, Assembler::AVX_256bit);
  __ vpxor(P1L, P1L, P1L, Assembler::AVX_256bit);
  __ vpxor(P1H, P1H, P1H, Assembler::AVX_256bit);
  __ vpxor(P2L, P2L, P2L, Assembler::AVX_256bit);
  __ vpxor(P2H, P2H, P2H, Assembler::AVX_256bit);

  // Calculate partial products
  // p0 = a2xr1'
  // p1 = a2xr2'
  // p2 = a2xr0
      __ vpmadd52luq(P0L, A2, R1P, Assembler::AVX_256bit);
      __ vpmadd52huq(P0H, A2, R1P, Assembler::AVX_256bit);
  // Interleave input loading with hash computation
  __ lea(input, Address(input,16*4));
  __ subl(length, 16*4);
      __ vpmadd52luq(P1L, A2, R2P, Assembler::AVX_256bit);
      __ vpmadd52huq(P1H, A2, R2P, Assembler::AVX_256bit);
  // Load next 4 blocks of data (64 bytes)
  __ vmovdqu(YTMP1, Address(input, 0));
  __ vmovdqu(YTMP2, Address(input, 32));
  // interleave new blocks of data
  __ vpunpckhqdq(YTMP3, YTMP1, YTMP2, Assembler::AVX_256bit);
  __ vpunpcklqdq(YTMP1, YTMP1, YTMP2, Assembler::AVX_256bit);
      __ vpmadd52luq(P0L, A0, R0, Assembler::AVX_256bit);
      __ vpmadd52huq(P0H, A0, R0, Assembler::AVX_256bit);
  // Highest 42-bit limbs of new blocks
  __ vpsrlq(YTMP6, YTMP3, 24, Assembler::AVX_256bit);
  __ vpor(YTMP6, YTMP6, ExternalAddress(poly1305_pad_msg()), Assembler::AVX_256bit, rscratch);
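
  // Illustrative sketch (not compiled) of the limb extraction interleaved above and below, for
  // one 16-byte block loaded as two little-endian 64-bit words (m0, m1):
  //   n0 =   m0                      & MASK44;   // bits  0..43
  //   n1 = ((m0 >> 44) | (m1 << 20)) & MASK44;   // bits 44..87
  //   n2 =  (m1 >> 24) | (1ULL << 40);           // bits 88..127 plus the 2^128 pad bit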

  // Middle 44-bit limbs of new blocks
  __ vpsrlq(YTMP2, YTMP1, 44, Assembler::AVX_256bit);
  __ vpsllq(YTMP4, YTMP3, 20, Assembler::AVX_256bit);
      // p2 = a2xr0
      __ vpmadd52luq(P2L, A2, R0, Assembler::AVX_256bit);
      __ vpmadd52huq(P2H, A2, R0, Assembler::AVX_256bit);
  __ vpor(YTMP2, YTMP2, YTMP4, Assembler::AVX_256bit);
  __ vpand(YTMP2, YTMP2, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, rscratch);
  // Lowest 44-bit limbs of new blocks
  __ vpand(YTMP1, YTMP1, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, rscratch);

      __ vpmadd52luq(P1L, A0, R1, Assembler::AVX_256bit);
      __ vpmadd52huq(P1H, A0, R1, Assembler::AVX_256bit);
      __ vpmadd52luq(P0L, A1, R2P, Assembler::AVX_256bit);
      __ vpmadd52huq(P0H, A1, R2P, Assembler::AVX_256bit);
      __ vpmadd52luq(P2L, A0, R2, Assembler::AVX_256bit);
      __ vpmadd52huq(P2H, A0, R2, Assembler::AVX_256bit);

  // Carry propagation (first pass)
  __ vpsrlq(YTMP5, P0L, 44, Assembler::AVX_256bit);
  __ vpsllq(P0H, P0H, 8, Assembler::AVX_256bit);
      __ vpmadd52luq(P1L, A1, R0, Assembler::AVX_256bit);
      __ vpmadd52huq(P1H, A1, R0, Assembler::AVX_256bit);
  // Carry propagation (first pass) - continue
  __ vpand(A0, P0L, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, rscratch); // Clear top 20 bits
  __ vpaddq(P0H, P0H, YTMP5, Assembler::AVX_256bit);
      __ vpmadd52luq(P2L, A1, R1, Assembler::AVX_256bit);
      __ vpmadd52huq(P2H, A1, R1, Assembler::AVX_256bit);

  // Carry propagation (first pass) - continue 2
  __ vpaddq(P1L, P1L, P0H, Assembler::AVX_256bit);
  __ vpsllq(P1H, P1H, 8, Assembler::AVX_256bit);
  __ vpsrlq(YTMP5, P1L, 44, Assembler::AVX_256bit);
  __ vpand(A1, P1L, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, rscratch); // Clear top 20 bits

  __ vpaddq(P2L, P2L, P1H, Assembler::AVX_256bit);
  __ vpaddq(P2L, P2L, YTMP5, Assembler::AVX_256bit);
  __ vpand(A2, P2L, ExternalAddress(poly1305_mask42()), Assembler::AVX_256bit, rscratch); // Clear top 22 bits
  __ vpaddq(A2, A2, YTMP6, Assembler::AVX_256bit); // Add highest bits from new blocks to accumulator
  __ vpsrlq(YTMP5, P2L, 42, Assembler::AVX_256bit);
  __ vpsllq(P2H, P2H, 10, Assembler::AVX_256bit);
  __ vpaddq(P2H, P2H, YTMP5, Assembler::AVX_256bit);

  // Carry propagation (second pass)
  // Multiply by 5 the highest bits (above 130 bits)
  __ vpaddq(A0, A0, P2H, Assembler::AVX_256bit);
  __ vpsllq(P2H, P2H, 2, Assembler::AVX_256bit);
  __ vpaddq(A0, A0, P2H, Assembler::AVX_256bit);

  __ vpsrlq(YTMP5, A0, 44, Assembler::AVX_256bit);
  __ vpand(A0, A0, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, rscratch); // Clear top 20 bits
  __ vpaddq(A0, A0, YTMP1, Assembler::AVX_256bit); // Add lowest 44-bit limbs of new blocks to accumulator
  __ vpaddq(A1, A1, YTMP2, Assembler::AVX_256bit); // Add middle 44-bit limbs of new blocks to accumulator
  __ vpaddq(A1, A1, YTMP5, Assembler::AVX_256bit);
}