/*
 * Copyright (c) 2022, 2024, Intel Corporation. All rights reserved.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
#include "precompiled.hpp"
#include "macroAssembler_x86.hpp"
#include "stubGenerator_x86_64.hpp"
// References:
// - (Normative) RFC7539 - ChaCha20 and Poly1305 for IETF Protocols
// - M. Goll and S. Gueron, "Vectorization of Poly1305 Message Authentication Code"
// - "The design of Poly1305" https://loup-vaillant.fr/tutorials/poly1305-design
//
// Explanation for the 'well known' modular arithmetic optimization, reduction by the pseudo-Mersenne prime 2^130-5:
//
// Reduction by 2^130-5 can be expressed as follows:
//    ( ax2^130 + b ) mod 2^130-5                 // i.e. the number split along the 130-bit boundary
//  = ( ax2^130 - 5xa + 5xa + b ) mod 2^130-5
//  = ( ax(2^130 - 5) + 5xa + b ) mod 2^130-5     // i.e. adding multiples of the modulus is a no-op
//  = ( 5xa + b ) mod 2^130-5
//
// QED: shows mathematically the well-known algorithm of 'split the number down the middle, multiply the upper part by 5 and add'
// This is particularly useful to understand when combining with 'odd-sized' limbs that might cause misalignment
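//
// The same split-and-fold identity can be sanity-checked with ordinary integers. A minimal
// sketch (illustrative only, not used by the stub; the helper name is hypothetical), shown on
// the smaller pseudo-Mersenne prime 2^61-5 so that plain 64/128-bit C arithmetic suffices:
//
//   #include <stdint.h>
//   static uint64_t mod_2_61_minus_5(unsigned __int128 x) {
//     const uint64_t mask61 = ((uint64_t)1 << 61) - 1;
//     const uint64_t p      = mask61 - 4;                // 2^61 - 5
//     while (x >> 61) {
//       // split at the 61-bit boundary, multiply the upper part by 5, add it back in
//       x = (unsigned __int128)5 * (x >> 61) + (x & mask61);
//     }
//     uint64_t r = (uint64_t)x;
//     return (r >= p) ? r - p : r;                       // at most one final subtraction
//   }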
//
// Pseudocode for this file (in general):
//    * used for poly1305_multiply_scalar
//    x used for poly1305_multiply8_avx512
//    lower-case variables are scalar numbers in 3x44-bit limbs (in gprs)
//    upper-case variables are 8-element vector numbers in 3x44-bit limbs (in zmm registers)
//    [ ] used to denote vector numbers (with their elements)
ATTRIBUTE_ALIGNED(64) static const uint64_t POLY1305_PAD_MSG[] = {
  0x0000010000000000, 0x0000010000000000,
  0x0000010000000000, 0x0000010000000000,
  0x0000010000000000, 0x0000010000000000,
  0x0000010000000000, 0x0000010000000000,
};
static address poly1305_pad_msg() {
  return (address)POLY1305_PAD_MSG;
}

ATTRIBUTE_ALIGNED(64) static const uint64_t POLY1305_MASK42[] = {
  0x000003ffffffffff, 0x000003ffffffffff,
  0x000003ffffffffff, 0x000003ffffffffff,
  0x000003ffffffffff, 0x000003ffffffffff,
  0x000003ffffffffff, 0x000003ffffffffff
};
static address poly1305_mask42() {
  return (address)POLY1305_MASK42;
}

ATTRIBUTE_ALIGNED(64) static const uint64_t POLY1305_MASK44[] = {
  0x00000fffffffffff, 0x00000fffffffffff,
  0x00000fffffffffff, 0x00000fffffffffff,
  0x00000fffffffffff, 0x00000fffffffffff,
  0x00000fffffffffff, 0x00000fffffffffff,
};
static address poly1305_mask44() {
  return (address)POLY1305_MASK44;
}
// Compute the product for 8 16-byte message blocks,
// i.e. for each block, compute [a2 a1 a0] = [a2 a1 a0] x [r2 r1 r0]
//
// Each block/number is represented by 3 44-bit limb digits; start with the multiplication:
//
//        a2       a1       a0
//  x     r2       r1       r0
// ----------------------------------
//     a2xr0    a1xr0    a0xr0
// +   a1xr1    a0xr1  5xa2xr1'     (r1' = r1<<2)
// +   a0xr2  5xa2xr2' 5xa1xr2'     (r2' = r2<<2)
// ----------------------------------
//        p2       p1       p0
//
// Then, propagate the carry (bits after bit 44) from lower limbs into higher limbs.
// Then, apply the modular reduction that wraps the upper limb back into the lower limbs.
//
// Math Note 1: 'carry propagation' from p2 to p0 involves multiplication by 5 (i.e. a slightly modified modular reduction from above):
//    ( p2x2^88 ) mod 2^130-5
//  = ( p2'x2^88 + p2''x2^130 ) mod 2^130-5                         // split on the 130-bit boundary
//  = ( p2'x2^88 + p2''x2^130 - 5xp2'' + 5xp2'' ) mod 2^130-5
//  = ( p2'x2^88 + p2''x(2^130 - 5) + 5xp2'' ) mod 2^130-5          // i.e. adding multiples of the modulus is a no-op
//  = ( p2'x2^88 + 5xp2'' ) mod 2^130-5
//
// Math Note 2: R1P = 4*5*R1 and R2P = 4*5*R2; this precomputation allows simultaneous reduction and multiplication.
// This is not the standard 'multiply-upper-by-5'; here is why the factor is 4*5 instead of 5.
// For example, partial product (a2xr2):
//   (a2x2^88)x(r2x2^88) mod 2^130-5
// = (a2xr2 x 2^176) mod 2^130-5
// = (a2xr2 x 2^46x2^130) mod 2^130-5
// = (a2xr2x2^46 x 2^130 - 5xa2xr2x2^46 + 5xa2xr2x2^46) mod 2^130-5
// = (a2xr2x2^46 x (2^130 - 5) + 5xa2xr2x2^46) mod 2^130-5          // i.e. adding multiples of the modulus is a no-op
// = (5xa2xr2x2^46) mod 2^130-5
// = (a2x5xr2x2^2 x 2^44) mod 2^130-5                               // align to the limb boundary
// = (a2x[5xr2x4] x 2^44) mod 2^130-5
// = (a2xR2P x 2^44) mod 2^130-5                                    // i.e. R2P = 4*5*R2
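//
// A scalar shadow of the limb arithmetic below may help when reading the vector code. This is
// an illustrative sketch only (one block instead of eight, full 128-bit products instead of the
// split 52-bit IFMA halves); the helper name is hypothetical:
//
//   #include <stdint.h>
//   typedef unsigned __int128 u128;
//   // a[] = 44/44/42-bit limbs of the block, r[] = limbs of R, r1p = 4*5*r[1], r2p = 4*5*r[2]
//   static void mul_mod_p_44(uint64_t a[3], const uint64_t r[3], uint64_t r1p, uint64_t r2p) {
//     const uint64_t m44 = ((uint64_t)1 << 44) - 1, m42 = ((uint64_t)1 << 42) - 1;
//     u128 p0 = (u128)a[0]*r[0] + (u128)a[1]*r2p  + (u128)a[2]*r1p;   // 2^0  column
//     u128 p1 = (u128)a[0]*r[1] + (u128)a[1]*r[0] + (u128)a[2]*r2p;   // 2^44 column
//     u128 p2 = (u128)a[0]*r[2] + (u128)a[1]*r[1] + (u128)a[2]*r[0];  // 2^88 column
//     p1 += (uint64_t)(p0 >> 44);  a[0] = (uint64_t)p0 & m44;         // carry propagation
//     p2 += (uint64_t)(p1 >> 44);  a[1] = (uint64_t)p1 & m44;
//     uint64_t hi = (uint64_t)(p2 >> 42);  a[2] = (uint64_t)p2 & m42;
//     a[0] += hi * 5;                                                 // wrap bits >= 2^130 (x5)
//     a[1] += a[0] >> 44;  a[0] &= m44;                               // reduction: p2->a0->a1
//   }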
void StubGenerator::poly1305_multiply8_avx512(
  const XMMRegister A0, const XMMRegister A1, const XMMRegister A2,
  const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P,
  const XMMRegister P0L, const XMMRegister P0H, const XMMRegister P1L, const XMMRegister P1H, const XMMRegister P2L, const XMMRegister P2H,
  const XMMRegister TMP, const Register rscratch)
{
  // Reset partial sums
  __ evpxorq(P0L, P0L, P0L, Assembler::AVX_512bit);
  __ evpxorq(P0H, P0H, P0H, Assembler::AVX_512bit);
  __ evpxorq(P1L, P1L, P1L, Assembler::AVX_512bit);
  __ evpxorq(P1H, P1H, P1H, Assembler::AVX_512bit);
  __ evpxorq(P2L, P2L, P2L, Assembler::AVX_512bit);
  __ evpxorq(P2H, P2H, P2H, Assembler::AVX_512bit);

  // Calculate partial products
  __ evpmadd52luq(P0L, A2, R1P, Assembler::AVX_512bit);
  __ evpmadd52huq(P0H, A2, R1P, Assembler::AVX_512bit);
  __ evpmadd52luq(P1L, A2, R2P, Assembler::AVX_512bit);
  __ evpmadd52huq(P1H, A2, R2P, Assembler::AVX_512bit);
  __ evpmadd52luq(P2L, A2, R0, Assembler::AVX_512bit);
  __ evpmadd52huq(P2H, A2, R0, Assembler::AVX_512bit);

  __ evpmadd52luq(P1L, A0, R1, Assembler::AVX_512bit);
  __ evpmadd52huq(P1H, A0, R1, Assembler::AVX_512bit);
  __ evpmadd52luq(P2L, A0, R2, Assembler::AVX_512bit);
  __ evpmadd52huq(P2H, A0, R2, Assembler::AVX_512bit);
  __ evpmadd52luq(P0L, A0, R0, Assembler::AVX_512bit);
  __ evpmadd52huq(P0H, A0, R0, Assembler::AVX_512bit);

  __ evpmadd52luq(P0L, A1, R2P, Assembler::AVX_512bit);
  __ evpmadd52huq(P0H, A1, R2P, Assembler::AVX_512bit);
  __ evpmadd52luq(P1L, A1, R0, Assembler::AVX_512bit);
  __ evpmadd52huq(P1H, A1, R0, Assembler::AVX_512bit);
  __ evpmadd52luq(P2L, A1, R1, Assembler::AVX_512bit);
  __ evpmadd52huq(P2H, A1, R1, Assembler::AVX_512bit);

  // Carry propagation:
  // (Not quite aligned)                         | More mathematically correct:
  //         P2L   P1L   P0L                     |                 P2Lx2^88 + P1Lx2^44 + P0Lx2^0
  // + P2H   P1H   P0H                           |   P2Hx2^140 + P1Hx2^96  + P0Hx2^52
  // ---------------------------                 | -----------------------------------------------
  // = P2H    A2    A1    A0                     | = P2Hx2^130 + A2x2^88   + A1x2^44  + A0x2^0
  __ vpsrlq(TMP, P0L, 44, Assembler::AVX_512bit);
  __ evpandq(A0, P0L, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits

  __ vpsllq(P0H, P0H, 8, Assembler::AVX_512bit);
  __ vpaddq(P0H, P0H, TMP, Assembler::AVX_512bit);
  __ vpaddq(P1L, P1L, P0H, Assembler::AVX_512bit);
  __ evpandq(A1, P1L, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits

  __ vpsrlq(TMP, P1L, 44, Assembler::AVX_512bit);
  __ vpsllq(P1H, P1H, 8, Assembler::AVX_512bit);
  __ vpaddq(P1H, P1H, TMP, Assembler::AVX_512bit);
  __ vpaddq(P2L, P2L, P1H, Assembler::AVX_512bit);
  __ evpandq(A2, P2L, ExternalAddress(poly1305_mask42()), Assembler::AVX_512bit, rscratch); // Clear top 22 bits

  __ vpsrlq(TMP, P2L, 42, Assembler::AVX_512bit);
  __ vpsllq(P2H, P2H, 10, Assembler::AVX_512bit);
  __ vpaddq(P2H, P2H, TMP, Assembler::AVX_512bit);

  // Reduction: p2->a0->a1
  // Multiply by 5 the highest bits (p2 is above 130 bits)
  __ vpaddq(A0, A0, P2H, Assembler::AVX_512bit);
  __ vpsllq(P2H, P2H, 2, Assembler::AVX_512bit);
  __ vpaddq(A0, A0, P2H, Assembler::AVX_512bit);
  __ vpsrlq(TMP, A0, 44, Assembler::AVX_512bit);
  __ evpandq(A0, A0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch);
  __ vpaddq(A1, A1, TMP, Assembler::AVX_512bit);
}

// Compute the product for a single 16-byte message block
// - Assumes that r = [r1 r0] is only 128 bits (not 130)
// - Input [a2 a1 a0]; when only128 is set, input is 128 bits (i.e. a2==0)
// - Output [a2 a1 a0] is at least 130 bits (i.e. a2 is used regardless of only128)
//
// Note 1: a2 here is only two bits, so anything above is subject to reduction.
// Note 2: Constant c1 = 5xr1/4 = r1 + (r1 >> 2); this is exact because the clamped r1 has its
//         low two bits clear, and it folds the modular reduction of the r1 partial products
//         into the multiply, using fewer operations.
//
// Flow of the code below is as follows:
//
//             a2        a1        a0
//     x                 r1        r0
//   -----------------------------
//          a2xr0     a1xr0     a0xr0
//   +                a0xr1   a1xr1x5'     (r1x5' = c1; the a1xr1 and a2xr1 products are pre-reduced)
//   +              a2xr1x5'
//   -----------------------------
//       [0|L2L] [L1H|L1L] [L0H|L0L]
//
//   Registers:  t2:t1     t0:a0
//
//   Completing the multiply and adding (with carry) 3x128-bit limbs into
//   192-bits again (3x64-bits):
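//
// A scalar shadow of this multiply, written with plain C integers, may help (illustrative only;
// it mirrors the mulq/add/adc chain below, and the function name is hypothetical):
//
//   #include <stdint.h>
//   typedef unsigned __int128 u128;
//   static void mul_mod_p_64(uint64_t& a0, uint64_t& a1, uint64_t& a2,
//                            uint64_t r0, uint64_t r1, uint64_t c1 /* = r1 + (r1 >> 2) */) {
//     u128 col0 = (u128)a0*r0 + (u128)a1*c1;                          // 2^0   column (t0:a0)
//     u128 col1 = (u128)a0*r1 + (u128)a1*r0 + (u128)a2*c1
//               + (uint64_t)(col0 >> 64);                             // 2^64  column (t2:t1)
//     uint64_t t2 = a2*r0 + (uint64_t)(col1 >> 64);                   // 2^128 column, fits a qword
//     uint64_t k  = (t2 & ~(uint64_t)3) + (t2 >> 2);                  // k = 5 * (bits at/above 2^130)
//     a0 = (uint64_t)col0;  a1 = (uint64_t)col1;  a2 = t2 & 3;
//     u128 t = (u128)a0 + k;                a0 = (uint64_t)t;         // a2:a1:a0 += k, with carry
//     t = (u128)a1 + (uint64_t)(t >> 64);   a1 = (uint64_t)t;
//     a2 += (uint64_t)(t >> 64);
//   }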
void StubGenerator::poly1305_multiply_scalar(
  const Register a0, const Register a1, const Register a2,
  const Register r0, const Register r1, const Register c1, bool only128,
  const Register t0, const Register t1, const Register t2,
  const Register mulql, const Register mulqh)
{
  // mulq instruction requires/clobbers rax, rdx (mulql, mulqh)

  __ movq(a0, rax); // a0 not used in other operations

  // t2:t1 += (a1 * r0)

  // t0:a0 += (a1 * r1x5)

  // Note: a2 is clamped to 2 bits,
  //       r1/r0 is clamped to 60 bits,
  //       their product is less than 2^64.
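  //
  // For reference, the clamping mentioned above is the standard RFC7539 clamp applied to R
  // before this stub runs; on the two 64-bit little-endian halves of R it looks like this
  // (illustrative sketch only, not performed here):
  //
  //   r0 &= 0x0ffffffc0fffffffULL;   // clears top 4 bits of key bytes 3 and 7, low 2 bits of byte 4
  //   r1 &= 0x0ffffffc0ffffffcULL;   // clears top 4 bits of bytes 11 and 15, low 2 bits of bytes 8 and 12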

  if (only128) { // Accumulator only 128 bits, i.e. a2 == 0
  // just move and add t0-t1 to a1

  // t2:t1 += (a2 * r1x5)
  __ movq(a1, a2); // use a1 for a2

  __ movq(a1, t0); // t0:a0 => a1:a0

  // t2:a1 += (a2 * r0):t1

  // At this point, 3 64-bit limbs are in t2:a1:a0
  // t2 can span over more than 2 bits so a final partial reduction step is needed.
  //
  // Partial reduction (just to fit into 130 bits)
  //    k = (t2 & ~3) + (t2 >> 2)
  // Result will be in a2:a1:a0
  __ movl(a2, t2); // DWORD

  __ andl(a2, 3); // DWORD

  // a2:a1:a0 += k (kept in t0)

  __ adcl(a2, 0); // DWORD

// Convert array of 128-bit numbers in quadwords (in D0:D1) into 128-bit numbers across 44-bit limbs (in L0:L1:L2)
// Optionally pad all the numbers (i.e. add 2^128)
//
//         +-------------------------+-------------------------+
//  D0:D1  | h0 h1 g0 g1 f0 f1 e0 e1 | d0 d1 c0 c1 b0 b1 a0 a1 |
//         +-------------------------+-------------------------+
//         +-------------------------+
//  L2     | h2 d2 g2 c2 f2 b2 e2 a2 |
//         +-------------------------+
//         +-------------------------+
//  L1     | h1 d1 g1 c1 f1 b1 e1 a1 |
//         +-------------------------+
//         +-------------------------+
//  L0     | h0 d0 g0 c0 f0 b0 e0 a0 |
//         +-------------------------+
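//
// A scalar shadow of this limb split (one 128-bit little-endian block (lo, hi) into
// 44/44/42-bit limbs, plus the optional 2^128 pad); illustrative sketch only, name hypothetical:
//
//   #include <stdint.h>
//   static void split_limbs(uint64_t lo, uint64_t hi, bool pad, uint64_t l[3]) {
//     const uint64_t m44 = ((uint64_t)1 << 44) - 1;
//     l[0] =   lo                      & m44;                        // bits   0..43
//     l[1] = ((lo >> 44) | (hi << 20)) & m44;                        // bits  44..87
//     l[2] =  (hi >> 24) | (pad ? ((uint64_t)1 << 40) : 0);          // bits 88..127 (+2^128 pad)
//   }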
void StubGenerator::poly1305_limbs_avx512(
  const XMMRegister D0, const XMMRegister D1,
  const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG,
  const XMMRegister TMP, const Register rscratch)
{
  // Interleave blocks of data
  __ evpunpckhqdq(TMP, D0, D1, Assembler::AVX_512bit);
  __ evpunpcklqdq(L0, D0, D1, Assembler::AVX_512bit);

  // Highest 42-bit limbs of new blocks
  __ vpsrlq(L2, TMP, 24, Assembler::AVX_512bit);

  __ evporq(L2, L2, ExternalAddress(poly1305_pad_msg()), Assembler::AVX_512bit, rscratch); // Add 2^128 to all 8 final qwords of the message

  // Middle 44-bit limbs of new blocks
  __ vpsrlq(L1, L0, 44, Assembler::AVX_512bit);
  __ vpsllq(TMP, TMP, 20, Assembler::AVX_512bit);
  __ vpternlogq(L1, 0xA8, TMP, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // (A OR B AND C)

  // Lowest 44-bit limbs of new blocks
  __ evpandq(L0, L0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch);
}

/*
 * Copy 5x26-bit (unreduced) limbs stored at Register limbs into a2:a1:a0 (3x64-bit limbs)
 *
 * a2 is optional. When only128 is set, limbs are expected to fit into 128-bits (i.e. a1:a0 such as clamped R)
 */
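//
// Equivalent plain-C conversion, for reference (illustrative only; this sketch assumes each of
// the five 26-bit limbs is already below 2^26, whereas the stub also carries slightly
// unreduced limbs):
//
//   #include <stdint.h>
//   static void limbs_5x26_to_3x64(const uint64_t l[5], uint64_t& a0, uint64_t& a1, uint64_t& a2) {
//     unsigned __int128 lo = (unsigned __int128)l[0]
//                          + ((unsigned __int128)l[1] << 26)
//                          + ((unsigned __int128)l[2] << 52)
//                          + ((unsigned __int128)l[3] << 78)
//                          + ((unsigned __int128)(l[4] & ((1ULL << 24) - 1)) << 104);
//     a0 = (uint64_t)lo;
//     a1 = (uint64_t)(lo >> 64);
//     a2 = l[4] >> 24;              // bits 128..129 of the 130-bit value
//   }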
void StubGenerator::poly1305_limbs(
  const Register limbs, const Register a0, const Register a1, const Register a2,
  const Register t0, const Register t1)
{
  __ movq(a0, Address(limbs, 0));
  __ movq(t0, Address(limbs, 8));

  __ movq(t0, Address(limbs, 16));
  __ movq(t1, Address(limbs, 24));

  __ movq(t0, Address(limbs, 32));

  // One round of reduction
  // Take bits above 130 in a2, multiply by 5 and add to a2:a1:a0

/*
 * Break 3x64-bit a2:a1:a0 limbs into 5x26-bit limbs and store out into 5 quadwords at address `limbs`
 */
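//
// Equivalent plain-C conversion, for reference (illustrative only; it assumes the extra round
// of reduction described below has already folded everything above bit 129 back in, so a2
// holds only a couple of bits plus whatever remains unreduced):
//
//   #include <stdint.h>
//   static void limbs_3x64_to_5x26(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t l[5]) {
//     const uint64_t m26 = (1ULL << 26) - 1;
//     l[0] =   a0        & m26;                  // bits   0..25
//     l[1] =  (a0 >> 26) & m26;                  // bits  26..51
//     l[2] = ((a0 >> 52) | (a1 << 12)) & m26;    // bits  52..77 (12 bits of a0, 14 of a1)
//     l[3] =  (a1 >> 14) & m26;                  // bits  78..103
//     l[4] =  (a1 >> 40) | (a2 << 24);           // bits 104..129+, not fully reduced
//   }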
void StubGenerator::poly1305_limbs_out(
  const Register a0, const Register a1, const Register a2,
  const Register limbs,
  const Register t0, const Register t1)
{
  // Extra round of reduction
  // Take bits above 130 in a2, multiply by 5 and add to a2:a1:a0

  // Chop a2:a1:a0 into 26-bit limbs
  __ andl(t0, 0x3ffffff);
  __ movq(Address(limbs, 0), t0);

  __ andl(t0, 0x3ffffff);
  __ movq(Address(limbs, 8), t0);

  __ shrq(a0, 26); // 12 bits left in a0, concatenate 14 from a1

  __ andl(t0, 0x3ffffff);
  __ movq(Address(limbs, 16), t0);

  __ shrq(a1, 14); // already used up 14 bits
  __ shlq(a2, 50); // a2 contains 2 bits when reduced, but $Element.limbs don't have to be fully reduced
  __ addq(a1, a2); // put remaining bits into a1

  __ andl(t0, 0x3ffffff);
  __ movq(Address(limbs, 24), t0);

  //andl(t0, 0x3ffffff); doesn't have to be fully reduced, leave remaining bit(s)
  __ movq(Address(limbs, 32), t0);
}

// This function consumes as many whole 16*16-byte blocks as are available in input
// After execution, input and length will point at remaining (unprocessed) data
// and [a2 a1 a0] will contain the current accumulator value
//
// Main loop in this function multiplies each message block by r^16, plus some glue before and after.
// Proof (for brevity, split into 4 'rows' instead of 16):
//
//   hash = ((((m1*r + m2)*r + m3)*r + ... + mn)*r
//        = m1*r^n + m2*r^(n-1) + ... + mn_1*r^2 + mn*r               // Horner's rule
//
//        = m1*r^n     + m5*r^(n-4)  + m9*r^(n-8)   ...               // split into 4 groups for brevity, same applies to 16 blocks
//        + m2*r^(n-1) + m6*r^(n-5)  + m10*r^(n-9)  ...
//        + m3*r^(n-2) + m7*r^(n-6)  + m11*r^(n-10) ...
//        + m4*r^(n-3) + m8*r^(n-7)  + m12*r^(n-11) ...
//
//        = r^4 * (m1*r^(n-4) + m5*r^(n-8) + m9 *r^(n-12) ... + mn_3) // factor out r^4..r; same applies to 16, but with r^16..r factors
//        + r^3 * (m2*r^(n-4) + m6*r^(n-8) + m10*r^(n-12) ... + mn_2)
//        + r^2 * (m3*r^(n-4) + m7*r^(n-8) + m11*r^(n-12) ... + mn_1)
//        + r^1 * (m4*r^(n-4) + m8*r^(n-8) + m12*r^(n-12) ... + mn_0) // Note: the last element of each group has no extra multiplier
//
//        = (((m1*r^4 + m5)*r^4 + m9 )*r^4 ... + mn_3) * r^4          // reverse Horner's rule, for each group
//        + (((m2*r^4 + m6)*r^4 + m10)*r^4 ... + mn_2) * r^3          // each group is multiplied by r^4 at every step, except the last
//        + (((m3*r^4 + m7)*r^4 + m11)*r^4 ... + mn_1) * r^2
//        + (((m4*r^4 + m8)*r^4 + m12)*r^4 ... + mn_0) * r^1
//
// Also see M. Goll and S. Gueron, "Vectorization of Poly1305 Message Authentication Code"
//
//   * used for poly1305_multiply_scalar
//   x used for poly1305_multiply8_avx512
//   lower-case variables are scalar numbers in 3x44-bit limbs (in gprs)
//   upper-case variables are 8&16-element vector numbers in 3x44-bit limbs (in zmm registers)
//
//   CL = a                  // [0 0 0 0 0 0 0 a]
//   AL = poly1305_limbs_avx512(input)
//   AH = poly1305_limbs_avx512(input+8)
//   input+=16, length-=16
//
//   T  = r^4 || r^3 || r^2 || r
//   B  = limbs(T)           // [r^4  0  r^3  0  r^2  0  r^1  0 ]
//   CL = B >> 1             // [ 0  r^4  0  r^3  0  r^2  0  r^1]
//   R  = r^4 || r^4 || ..   // [r^4 r^4 r^4 r^4 r^4 r^4 r^4 r^4]
//   B  = BxR                // [r^8  0  r^7  0  r^6  0  r^5  0 ]
//   B  = B | CL             // [r^8 r^4 r^7 r^3 r^6 r^2 r^5 r^1]
//
//   R  = r^8 || r^8 || ..   // [r^8 r^8 r^8 r^8 r^8 r^8 r^8 r^8]
//   B  = B x R              // [r^16 r^12 r^15 r^11 r^14 r^10 r^13 r^9]
//
//   R  = r^16 || r^16 || .. // [r^16 r^16 r^16 r^16 r^16 r^16 r^16 r^16]
//
//   for (;length>=16; input+=16, length-=16)
//      BL = poly1305_limbs_avx512(input)
//      BH = poly1305_limbs_avx512(input+8)
//
//   A = AL + AH             // 16->8 blocks
//   T = A >> 4              // 8 ->4 blocks
//
//   T = A >> 2              // 4 ->2 blocks
//
//   T = A >> 1              // 2 ->1 blocks
//
//   mulq(rax, rdx) in poly1305_multiply_scalar
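//
// The regrouping proof above can be cross-checked with a plain modular reference, using a
// small prime so ordinary 64-bit arithmetic suffices. Illustrative sketch only (K lanes; the
// stub uses K = 16), names hypothetical, message words assumed already reduced mod p:
//
//   #include <stdint.h>
//   // Returns m1*r^n + m2*r^(n-1) + ... + mn*r mod p (with m1 = m[0]), computed with K
//   // interleaved Horner chains in r^K exactly as described above.
//   static uint64_t horner_k_way(const uint64_t* m, int n, int K, uint64_t r, uint64_t p) {
//     // assumes n % K == 0, K <= 16, and p small enough that p*p fits in 64 bits
//     uint64_t rK = 1;
//     for (int i = 0; i < K; i++) rK = (rK * r) % p;                  // r^K
//     uint64_t lane[16] = {0};
//     for (int i = 0; i < n; i += K)
//       for (int j = 0; j < K; j++)
//         lane[j] = (lane[j] * rK + m[i + j]) % p;                    // per-lane Horner in r^K
//     uint64_t sum = 0, pw = r;                                       // lane j gets weight r^(K-j)
//     for (int j = K - 1; j >= 0; j--, pw = (pw * r) % p)
//       sum = (sum + lane[j] * pw) % p;
//     return sum;
//   }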
void StubGenerator::poly1305_process_blocks_avx512(
  const Register input, const Register length,
  const Register a0, const Register a1, const Register a2,
  const Register r0, const Register r1, const Register c1)
{
  Label L_process256Loop, L_process256LoopDone;
  const Register t0 = r13;
  const Register t1 = r14;
  const Register t2 = r15;
  const Register mulql = rax;
  const Register mulqh = rdx;

  const XMMRegister D0 = xmm0;
  const XMMRegister D1 = xmm1;
  const XMMRegister TMP = xmm2;

  const XMMRegister T0 = xmm3;
  const XMMRegister T1 = xmm4;
  const XMMRegister T2 = xmm5;
  const XMMRegister T3 = xmm6;
  const XMMRegister T4 = xmm7;
  const XMMRegister T5 = xmm8;

  const XMMRegister A0 = xmm9;
  const XMMRegister A1 = xmm10;
  const XMMRegister A2 = xmm11;
  const XMMRegister A3 = xmm12;
  const XMMRegister A4 = xmm13;
  const XMMRegister A5 = xmm14;

  const XMMRegister B0 = xmm15;
  const XMMRegister B1 = xmm16;
  const XMMRegister B2 = xmm17;
  const XMMRegister B3 = xmm18;
  const XMMRegister B4 = xmm19;
  const XMMRegister B5 = xmm20;

  const XMMRegister C0 = xmm21;
  const XMMRegister C1 = xmm22;
  const XMMRegister C2 = xmm23;
  const XMMRegister C3 = xmm24;
  const XMMRegister C4 = xmm25;
  const XMMRegister C5 = xmm26;

  const XMMRegister R0 = xmm27;
  const XMMRegister R1 = xmm28;
  const XMMRegister R2 = xmm29;
  const XMMRegister R1P = xmm30;
  const XMMRegister R2P = xmm31;

  // Spread accumulator into 44-bit limbs in quadwords C0,C1,C2
  __ andq(t0, ExternalAddress(poly1305_mask44()), t1 /*rscratch*/); // First limb (Acc[43:0])

  __ shrdq(a0, t0, 44);
  __ andq(a0, ExternalAddress(poly1305_mask44()), t1 /*rscratch*/); // Second limb (Acc[87:44])

  __ shrdq(a1, a2, 24);
  __ andq(a1, ExternalAddress(poly1305_mask42()), t1 /*rscratch*/); // Third limb (Acc[129:88])

  // To add accumulator, we must unroll first loop iteration

  // Load first block of data (128 bytes) and pad
  // A0 to have bits 0-43 of all 8 blocks in 8 qwords
  // A1 to have bits 87-44 of all 8 blocks in 8 qwords
  // A2 to have bits 127-88 of all 8 blocks in 8 qwords
  __ evmovdquq(D0, Address(input, 0), Assembler::AVX_512bit);
  __ evmovdquq(D1, Address(input, 64), Assembler::AVX_512bit);
  poly1305_limbs_avx512(D0, D1, A0, A1, A2, true, TMP, t1 /*rscratch*/);

  // Add accumulator to the first message block
  __ vpaddq(A0, A0, C0, Assembler::AVX_512bit);
  __ vpaddq(A1, A1, C1, Assembler::AVX_512bit);
  __ vpaddq(A2, A2, C2, Assembler::AVX_512bit);

  // Load next blocks of data (128 bytes) and pad
  // A3 to have bits 0-43 of all 8 blocks in 8 qwords
  // A4 to have bits 87-44 of all 8 blocks in 8 qwords
  // A5 to have bits 127-88 of all 8 blocks in 8 qwords
  __ evmovdquq(D0, Address(input, 64*2), Assembler::AVX_512bit);
  __ evmovdquq(D1, Address(input, 64*3), Assembler::AVX_512bit);
  poly1305_limbs_avx512(D0, D1, A3, A4, A5, true, TMP, t1 /*rscratch*/);

  __ subl(length, 16*16);
  __ lea(input, Address(input,16*16));

  // Compute the powers of R^1..R^4 and form 44-bit limbs of each
  // T0 to have bits 0-127 in 4 quadword pairs
  // T1 to have bits 128-129 in alternating 8 qwords
  __ vpxorq(T1, T1, T1, Assembler::AVX_512bit);

  __ vpinsrq(T2, T2, r1, 1);
  __ vinserti32x4(T0, T0, T2, 3);

  // "Clever": a2 not set because poly1305_multiply_scalar has a flag to indicate 128-bit accumulator
  poly1305_multiply_scalar(a0, a1, a2,
                           r0, r1, c1, true,
                           t0, t1, t2, mulql, mulqh);

  __ vpinsrq(T2, T2, a1, 1);
  __ vinserti32x4(T0, T0, T2, 2);

  __ vinserti32x4(T1, T1, T2, 2);

  poly1305_multiply_scalar(a0, a1, a2,
                           r0, r1, c1, false,
                           t0, t1, t2, mulql, mulqh);

  __ vpinsrq(T2, T2, a1, 1);
  __ vinserti32x4(T0, T0, T2, 1);

  __ vinserti32x4(T1, T1, T2, 1);

  poly1305_multiply_scalar(a0, a1, a2,
                           r0, r1, c1, false,
                           t0, t1, t2, mulql, mulqh);

  __ vpinsrq(T2, T2, a1, 1);
  __ vinserti32x4(T0, T0, T2, 0);

  __ vinserti32x4(T1, T1, T2, 0);

  // Interleave the powers of R^1..R^4 to form 44-bit limbs (half-empty)
  // B0 to have bits 0-43 of all 4 blocks in alternating 8 qwords
  // B1 to have bits 87-44 of all 4 blocks in alternating 8 qwords
  // B2 to have bits 127-88 of all 4 blocks in alternating 8 qwords
  __ vpxorq(T2, T2, T2, Assembler::AVX_512bit);
  poly1305_limbs_avx512(T0, T2, B0, B1, B2, false, TMP, t1 /*rscratch*/);

  // T1 contains the 2 highest bits of the powers of R
  __ vpsllq(T1, T1, 40, Assembler::AVX_512bit);
  __ evporq(B2, B2, T1, Assembler::AVX_512bit);

  // Broadcast 44-bit limbs of R^4 into R0,R1,R2
  __ andq(t0, ExternalAddress(poly1305_mask44()), t1 /*rscratch*/); // First limb (R^4[43:0])
  __ evpbroadcastq(R0, t0, Assembler::AVX_512bit);

  __ shrdq(a0, t0, 44);
  __ andq(a0, ExternalAddress(poly1305_mask44()), t1 /*rscratch*/); // Second limb (R^4[87:44])
  __ evpbroadcastq(R1, a0, Assembler::AVX_512bit);

  __ shrdq(a1, a2, 24);
  __ andq(a1, ExternalAddress(poly1305_mask42()), t1 /*rscratch*/); // Third limb (R^4[129:88])
  __ evpbroadcastq(R2, a1, Assembler::AVX_512bit);

  // Generate 4*5*R^4 into {R2P,R1P}
  // Used as multiplier in poly1305_multiply8_avx512 so can
  // ignore bottom limb and carry propagation
  __ vpsllq(R1P, R1, 2, Assembler::AVX_512bit);    // 4*R^4
  __ vpsllq(R2P, R2, 2, Assembler::AVX_512bit);
  __ vpaddq(R1P, R1P, R1, Assembler::AVX_512bit);  // 5*R^4
  __ vpaddq(R2P, R2P, R2, Assembler::AVX_512bit);
  __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit);   // 4*5*R^4
  __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit);

  // Move R^4..R^1 one element over
  __ vpslldq(C0, B0, 8, Assembler::AVX_512bit);
  __ vpslldq(C1, B1, 8, Assembler::AVX_512bit);
  __ vpslldq(C2, B2, 8, Assembler::AVX_512bit);

  poly1305_multiply8_avx512(B0, B1, B2,             // ACC=R^4..R^1
                            R0, R1, R2, R1P, R2P,   // R^4..R^4, 4*5*R^4
                            T0, T1, T2, T3, T4, T5, TMP, t1 /*rscratch*/);

  // Interleave powers of R: R^8 R^4 R^7 R^3 R^6 R^2 R^5 R
  __ evporq(B0, B0, C0, Assembler::AVX_512bit);
  __ evporq(B1, B1, C1, Assembler::AVX_512bit);
  __ evporq(B2, B2, C2, Assembler::AVX_512bit);

  // Store R^8-R for later use
  __ evmovdquq(C0, B0, Assembler::AVX_512bit);
  __ evmovdquq(C1, B1, Assembler::AVX_512bit);
  __ evmovdquq(C2, B2, Assembler::AVX_512bit);

  __ vpbroadcastq(R0, B0, Assembler::AVX_512bit);
  __ vpbroadcastq(R1, B1, Assembler::AVX_512bit);
  __ vpbroadcastq(R2, B2, Assembler::AVX_512bit);

  __ vpsllq(R1P, R1, 2, Assembler::AVX_512bit);
  __ vpsllq(R2P, R2, 2, Assembler::AVX_512bit);
  __ vpaddq(R1P, R1P, R1, Assembler::AVX_512bit);  // 5*R^8
  __ vpaddq(R2P, R2P, R2, Assembler::AVX_512bit);
  __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit);   // 4*5*R^8
  __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit);

  // Calculate R^16-R^9
  poly1305_multiply8_avx512(B0, B1, B2,             // ACC=R^8..R^1
                            R0, R1, R2, R1P, R2P,   // R^8..R^8, 4*5*R^8
                            T0, T1, T2, T3, T4, T5, TMP, t1 /*rscratch*/);

  // Store R^16-R^9 for later use
  __ evmovdquq(C3, B0, Assembler::AVX_512bit);
  __ evmovdquq(C4, B1, Assembler::AVX_512bit);
  __ evmovdquq(C5, B2, Assembler::AVX_512bit);

  __ vpbroadcastq(R0, B0, Assembler::AVX_512bit);
  __ vpbroadcastq(R1, B1, Assembler::AVX_512bit);
  __ vpbroadcastq(R2, B2, Assembler::AVX_512bit);

  __ vpsllq(R1P, R1, 2, Assembler::AVX_512bit);
  __ vpsllq(R2P, R2, 2, Assembler::AVX_512bit);
  __ vpaddq(R1P, R1P, R1, Assembler::AVX_512bit);  // 5*R^16
  __ vpaddq(R2P, R2P, R2, Assembler::AVX_512bit);
  __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit);   // 4*5*R^16
  __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit);

  // VECTOR LOOP: process 16 * 16-byte message blocks at a time
  __ bind(L_process256Loop);
  __ cmpl(length, 16*16);
  __ jcc(Assembler::less, L_process256LoopDone);

  // Load and interleave next block of data (128 bytes)
  __ evmovdquq(D0, Address(input, 0), Assembler::AVX_512bit);
  __ evmovdquq(D1, Address(input, 64), Assembler::AVX_512bit);
  poly1305_limbs_avx512(D0, D1, B0, B1, B2, true, TMP, t1 /*rscratch*/);

  // Load and interleave next block of data (128 bytes)
  __ evmovdquq(D0, Address(input, 64*2), Assembler::AVX_512bit);
  __ evmovdquq(D1, Address(input, 64*3), Assembler::AVX_512bit);
  poly1305_limbs_avx512(D0, D1, B3, B4, B5, true, TMP, t1 /*rscratch*/);

  poly1305_multiply8_avx512(A0, A1, A2,             // MSG/ACC 16 blocks
                            R0, R1, R2, R1P, R2P,   // R^16..R^16, 4*5*R^16
                            T0, T1, T2, T3, T4, T5, TMP, t1 /*rscratch*/);
  poly1305_multiply8_avx512(A3, A4, A5,             // MSG/ACC 16 blocks
                            R0, R1, R2, R1P, R2P,   // R^16..R^16, 4*5*R^16
                            T0, T1, T2, T3, T4, T5, TMP, t1 /*rscratch*/);

  __ vpaddq(A0, A0, B0, Assembler::AVX_512bit); // Add low 44-bit limbs from new blocks to accumulator
  __ vpaddq(A1, A1, B1, Assembler::AVX_512bit); // Add middle 44-bit limbs from new blocks to accumulator
  __ vpaddq(A2, A2, B2, Assembler::AVX_512bit); // Add highest bits from new blocks to accumulator
  __ vpaddq(A3, A3, B3, Assembler::AVX_512bit); // Add low 44-bit limbs from new blocks to accumulator
  __ vpaddq(A4, A4, B4, Assembler::AVX_512bit); // Add middle 44-bit limbs from new blocks to accumulator
  __ vpaddq(A5, A5, B5, Assembler::AVX_512bit); // Add highest bits from new blocks to accumulator

  __ subl(length, 16*16);
  __ lea(input, Address(input,16*16));
  __ jmp(L_process256Loop);

  __ bind(L_process256LoopDone);

  // Tail processing: Need to multiply ACC by R^16..R^1 and add it all up into a single scalar value
  // Generate 4*5*[R^16..R^9] (ignore lowest limb)
  // Use D0 ~ R1P, D1 ~ R2P for higher powers
  __ vpsllq(R1P, C4, 2, Assembler::AVX_512bit);
  __ vpsllq(R2P, C5, 2, Assembler::AVX_512bit);
  __ vpaddq(R1P, R1P, C4, Assembler::AVX_512bit);  // 5*[R^16..R^9]
  __ vpaddq(R2P, R2P, C5, Assembler::AVX_512bit);
  __ vpsllq(D0, R1P, 2, Assembler::AVX_512bit);    // 4*5*[R^16..R^9]
  __ vpsllq(D1, R2P, 2, Assembler::AVX_512bit);

  // Generate 4*5*[R^8..R^1] (ignore lowest limb)
  __ vpsllq(R1P, C1, 2, Assembler::AVX_512bit);
  __ vpsllq(R2P, C2, 2, Assembler::AVX_512bit);
  __ vpaddq(R1P, R1P, C1, Assembler::AVX_512bit);  // 5*[R^8..R^1]
  __ vpaddq(R2P, R2P, C2, Assembler::AVX_512bit);
  __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit);   // 4*5*[R^8..R^1]
  __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit);

  poly1305_multiply8_avx512(A0, A1, A2,             // MSG/ACC 16 blocks
                            C3, C4, C5, D0, D1,     // R^16-R^9, R1P, R2P
                            T0, T1, T2, T3, T4, T5, TMP, t1 /*rscratch*/);
  poly1305_multiply8_avx512(A3, A4, A5,             // MSG/ACC 16 blocks
                            C0, C1, C2, R1P, R2P,   // R^8-R, R1P, R2P
                            T0, T1, T2, T3, T4, T5, TMP, t1 /*rscratch*/);

  // Add all blocks (horizontally)
  __ vpaddq(A0, A0, A3, Assembler::AVX_512bit);
  __ vpaddq(A1, A1, A4, Assembler::AVX_512bit);
  __ vpaddq(A2, A2, A5, Assembler::AVX_512bit);

  __ vextracti64x4(T0, A0, 1);
  __ vextracti64x4(T1, A1, 1);
  __ vextracti64x4(T2, A2, 1);
  __ vpaddq(A0, A0, T0, Assembler::AVX_256bit);
  __ vpaddq(A1, A1, T1, Assembler::AVX_256bit);
  __ vpaddq(A2, A2, T2, Assembler::AVX_256bit);

  __ vextracti32x4(T0, A0, 1);
  __ vextracti32x4(T1, A1, 1);
  __ vextracti32x4(T2, A2, 1);
  __ vpaddq(A0, A0, T0, Assembler::AVX_128bit);
  __ vpaddq(A1, A1, T1, Assembler::AVX_128bit);
  __ vpaddq(A2, A2, T2, Assembler::AVX_128bit);

  __ vpsrldq(T0, A0, 8, Assembler::AVX_128bit);
  __ vpsrldq(T1, A1, 8, Assembler::AVX_128bit);
  __ vpsrldq(T2, A2, 8, Assembler::AVX_128bit);

  // Finish folding and clear second qword
  __ evpaddq(A0, k1, A0, T0, false, Assembler::AVX_512bit);
  __ evpaddq(A1, k1, A1, T1, false, Assembler::AVX_512bit);
  __ evpaddq(A2, k1, A2, T2, false, Assembler::AVX_512bit);

  __ vpsrlq(D0, A0, 44, Assembler::AVX_512bit);
  __ evpandq(A0, A0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, t1 /*rscratch*/); // Clear top 20 bits
  __ vpaddq(A1, A1, D0, Assembler::AVX_512bit);
  __ vpsrlq(D0, A1, 44, Assembler::AVX_512bit);
  __ evpandq(A1, A1, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, t1 /*rscratch*/); // Clear top 20 bits
  __ vpaddq(A2, A2, D0, Assembler::AVX_512bit);
  __ vpsrlq(D0, A2, 42, Assembler::AVX_512bit);
  __ evpandq(A2, A2, ExternalAddress(poly1305_mask42()), Assembler::AVX_512bit, t1 /*rscratch*/); // Clear top 22 bits
  __ vpsllq(D1, D0, 2, Assembler::AVX_512bit);
  __ vpaddq(D0, D0, D1, Assembler::AVX_512bit);
  __ vpaddq(A0, A0, D0, Assembler::AVX_512bit);

  // Put together A (accumulator)

  // Zero out zmm0-zmm31.
  for (XMMRegister rxmm = xmm16; rxmm->is_valid(); rxmm = rxmm->successor()) {
    __ vpxorq(rxmm, rxmm, rxmm, Assembler::AVX_512bit);
  }
}

// This function consumes as many whole 16-byte blocks as available in input
// After execution, input and length will point at remaining (unprocessed) data
// and accumulator will point to the current accumulator value
address StubGenerator::generate_poly1305_processBlocks() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
  address start = __ pc();

  // Save all 'SOE' registers

  const Register input = rdi;        // msg
  const Register length = rbx;       // msg length in bytes
  const Register accumulator = rcx;
  const Register R = r8;

  const Register a0 = rsi;  // [in/out] accumulator bits 63..0
  const Register a1 = r9;   // [in/out] accumulator bits 127..64
  const Register a2 = r10;  // [in/out] accumulator bits 191..128
  const Register r0 = r11;  // R constant bits 63..0
  const Register r1 = r12;  // R constant bits 127..64
  const Register c1 = r8;   // 5*R (upper limb only)
  const Register t0 = r13;
  const Register t1 = r14;
  const Register t2 = r15;
  const Register mulql = rax;
  const Register mulqh = rdx;

  // pseudo-signature: void poly1305_processBlocks(byte[] input, int length, int[5] accumulator, int[5] R)
  // input, a, r pointers point at first array element
  // java headers bypassed in LibraryCallKit::inline_poly1305_processBlocks

  __ mov(input, c_rarg0);
  __ mov(length, c_rarg1);
  __ mov(accumulator, c_rarg2);

  // don't clobber R, args copied out-of-order
  __ mov(length, c_rarg1);
  __ mov(accumulator, c_rarg2);

  Label L_process16Loop, L_process16LoopDone;

  poly1305_limbs(R, r0, r1, noreg, t0, t1);

  // Compute 5*R (Upper limb only)
  __ addq(c1, r1); // c1 = r1 + (r1 >> 2)

  // Load accumulator into a2:a1:a0
  poly1305_limbs(accumulator, a0, a1, a2, t0, t1);

  // VECTOR LOOP: Minimum of 256 bytes to run vectorized code
  __ cmpl(length, 16*16);
  __ jcc(Assembler::less, L_process16Loop);

  poly1305_process_blocks_avx512(input, length,
                                 a0, a1, a2,
                                 r0, r1, c1);

  poly1305_process_blocks_avx2(input, length,
                               a0, a1, a2,
                               r0, r1, c1);

  // SCALAR LOOP: process one 16-byte message block at a time
  __ bind(L_process16Loop);
  __ cmpl(length, 16);
  __ jcc(Assembler::less, L_process16LoopDone);

  __ addq(a0, Address(input,0));
  __ adcq(a1, Address(input,8));

  poly1305_multiply_scalar(a0, a1, a2,
                           r0, r1, c1, false,
                           t0, t1, t2, mulql, mulqh);

  __ subl(length, 16);
  __ lea(input, Address(input,16));
  __ jmp(L_process16Loop);
  __ bind(L_process16LoopDone);

  poly1305_limbs_out(a0, a1, a2, accumulator, t0, t1);
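
  // For reference, the scalar path above (limbs in, per-block accumulate-and-multiply,
  // limbs out) corresponds to the following plain-C model of whole-block processing.
  // Illustrative sketch only: it assumes a pre-clamped r and little-endian loads, the
  // function name is hypothetical, and nothing below is emitted by this stub.
  //
  //   #include <stdint.h>
  //   #include <string.h>
  //   typedef unsigned __int128 u128;
  //   static void poly1305_blocks_ref(const uint8_t* msg, size_t len,
  //                                   uint64_t a[3] /* a2:a1:a0 */, const uint64_t r[2]) {
  //     const uint64_t r0 = r[0], r1 = r[1], c1 = r1 + (r1 >> 2);
  //     for (; len >= 16; msg += 16, len -= 16) {
  //       uint64_t b0, b1;
  //       memcpy(&b0, msg, 8);  memcpy(&b1, msg + 8, 8);
  //       u128 t = (u128)a[0] + b0;                  a[0] = (uint64_t)t;   // acc += block
  //       t = (u128)a[1] + b1 + (uint64_t)(t >> 64); a[1] = (uint64_t)t;
  //       a[2] += (uint64_t)(t >> 64) + 1;                                 // +1 == the 2^128 pad
  //       u128 col0 = (u128)a[0]*r0 + (u128)a[1]*c1;                       // acc *= r (mod 2^130-5)
  //       u128 col1 = (u128)a[0]*r1 + (u128)a[1]*r0 + (u128)a[2]*c1 + (uint64_t)(col0 >> 64);
  //       uint64_t t2 = a[2]*r0 + (uint64_t)(col1 >> 64);
  //       uint64_t k  = (t2 & ~(uint64_t)3) + (t2 >> 2);                   // partial reduction
  //       a[0] = (uint64_t)col0;  a[1] = (uint64_t)col1;  a[2] = t2 & 3;
  //       t = (u128)a[0] + k;                   a[0] = (uint64_t)t;
  //       t = (u128)a[1] + (uint64_t)(t >> 64); a[1] = (uint64_t)t;
  //       a[2] += (uint64_t)(t >> 64);
  //     }
  //   }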

/*
  The AVX2 implementation below is directly based on the AVX2 Poly1305 hash computation as
  implemented in Intel(R) Multi-Buffer Crypto for IPsec Library.
  (url: https://github.com/intel/intel-ipsec-mb/blob/main/lib/avx2_t3/poly_fma_avx2.asm)

  Additional references:
  [1] Goll M, Gueron S., "Vectorization of Poly1305 message authentication code",
      12th International Conference on Information Technology-New Generations,
      2015 Apr 13 (pp. 145-150). IEEE.
  [2] Bhattacharyya S, Sarkar P., "Improved SIMD implementation of Poly1305",
      IET Information Security. 2020 Sep;14(5):521-30.
      Note: a compact summary of the Goll-Gueron AVX2 algorithm developed in [1] is presented in [2].
  [3] Wikipedia, "Parallel evaluation of Horner's method",
      (url: https://en.wikipedia.org/wiki/Horner%27s_method)
  ----------------------------------------------------------

  Poly1305 AVX2 algorithm:
  Let the 32-byte one-time key be partitioned into two equal parts R and K.
  Let R be the 16-byte secret key used for polynomial evaluation.
  Let K be the other 16-byte secret key (added to the final hash, see POLY1305_MAC below).
  Let Z_P be the prime field over which the polynomial is evaluated. Let P = 2^130 - 5 be the prime.
  Let M be the message which can be represented as a concatenation (||) of 'l' 16-byte blocks M[i].
  i.e., M = M[0] || M[1] || ... || M[i] || ... || M[l-2] || M[l-1]

  To create the coefficients C[i] for polynomial evaluation over Z_P, each 16-byte (i.e., 128-bit)
  message block M[i] is concatenated with bits '10' to make a 130-bit block.
  The last block (<= 16-byte length) is concatenated with 1 followed by 0s to make a 130-bit block.
  Therefore, we define
    C[i]   = M[i] || '10'     for 0 <= i <= l-2 ;
    C[l-1] = M[l-1] || '10...0'
  such that, length(C[i]) = 130 bits, for i ∈ [0, l).

  Let * indicate scalar multiplication (i.e., w = u * v);
  Let × indicate scalar multiplication followed by reduction modulo P (i.e., z = u × v = {(u * v) mod P})

  POLY1305_MAC = (POLY1305_EVAL_POLYNOMIAL(C, R, P) + K) mod 2^128; where,

  POLY1305_EVAL_POLYNOMIAL(C, R, P) = {C[0] * R^l + C[1] * R^(l-1) + ... + C[l-2] * R^2 + C[l-1] * R} mod P
                                    = R × {C[0] × R^(l-1) + C[1] × R^(l-2) + ... + C[l-2] × R + C[l-1]}
                                    = R × Polynomial(R; C[0], C[1], ... ,C[l-2], C[l-1])
  where,
    Polynomial(R; C[0], C[1], ... ,C[l-2], C[l-1]) = Σ{C[i] × R^(l-i-1)} for i ∈ [0, l)
  ----------------------------------------------------------

  Parallel evaluation of POLY1305_EVAL_POLYNOMIAL(C, R, P):
  Let the number of message blocks l = 4*l' + ρ where ρ = l mod 4.
  Using k-way parallel Horner's evaluation [3], for k = 4, we define SUM below:

    SUM = R^4 × Polynomial(R^4; C[0], C[4], C[8]  ... , C[4l'-4]) +
          R^3 × Polynomial(R^4; C[1], C[5], C[9]  ... , C[4l'-3]) +
          R^2 × Polynomial(R^4; C[2], C[6], C[10] ... , C[4l'-2]) +
          R^1 × Polynomial(R^4; C[3], C[7], C[11] ... , C[4l'-1])

    POLY1305_EVAL_POLYNOMIAL(C, R, P) = SUM                                               if ρ = 0 (i.e., l is multiple of 4)
                                      = R × Polynomial(R; SUM + C[l-ρ], C[l-ρ+1], ... , C[l-1])  if ρ > 0
  ----------------------------------------------------------

  Goll-Gueron[1] 4-way SIMD Algorithm[2] for POLY1305_EVAL_POLYNOMIAL(C, R, P):

  Define mathematical vectors (not same as SIMD vector lanes) as below:
    R4321   = [R^4, R^3, R^2, R^1];
    R4444   = [R^4, R^4, R^4, R^4];
    COEF[i] = [C[4i], C[4i+1], C[4i+2], C[4i+3]] for i ∈ [0, l'). For example, COEF[0] and COEF[1] shown below.
    COEF[0] = [C0, C1, C2, C3]
    COEF[1] = [C4, C5, C6, C7]
    Let T = [T0, T1, T2, T3] be a temporary vector
    ACC = [acc, 0, 0, 0]; acc has hash from previous computations (if any), otherwise 0.
    ⊗ indicates component-wise vector multiplication followed by modulo reduction
    ⊕ indicates component-wise vector addition, + indicates scalar addition

  POLY1305_EVAL_POLYNOMIAL(C, R, P) {
    T ← ACC;                        # load accumulator
    T ← T ⊕ COEF[0];                # add accumulator to the first 4 blocks
    Compute R4321, R4444;
    l' = floor(l/4);                # operate on 4 blocks at a time
    for (i = 1 to l'-1):
      T ← (R4444 ⊗ T) ⊕ COEF[i];
    T ← R4321 ⊗ T;                  # multiply each lane by its power of R
    SUM ← T0 + T1 + T2 + T3;
    # Scalar tail processing
    SUM ← R × Polynomial(R; SUM + C[l-ρ], C[l-ρ+1], ... , C[l-1]);
  }

  (1) Each 130-bit block is represented using three 44-bit limbs (most significant limb is only 42-bit).
      (The Goll-Gueron implementation[1] uses five 26-bit limbs instead).
  (2) Each component of the mathematical vectors is a 130-bit value. The above mathematical vectors are not to be confused with SIMD vector lanes.
  (3) Each AVX2 YMM register can store four 44-bit limbs in quadwords. Since each 130-bit message block is represented using 3 limbs,
      to store all the limbs of 4 different 130-bit message blocks, we need 3 YMM registers in total.
  (4) In the AVX2 implementation, multiplication followed by modulo reduction and addition are performed for 4 blocks at a time.
*/
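//
// The data flow of the 4-way evaluation above can be cross-checked with a plain modular
// reference over a small prime (the real code works in Z_P with P = 2^130-5 and 44-bit limbs).
// Illustrative sketch only, names hypothetical; it assumes ρ = 0 and c[i] already reduced mod p:
//
//   #include <stdint.h>
//   static uint64_t eval_poly_4way(const uint64_t* c, int l, uint64_t acc,
//                                  uint64_t r, uint64_t p) {
//     // assumes l % 4 == 0, l >= 4, and p small enough that p*p fits in 64 bits
//     uint64_t r2 = (r * r) % p, r3 = (r2 * r) % p, r4 = (r3 * r) % p;
//     uint64_t T[4] = { (acc + c[0]) % p, c[1], c[2], c[3] };        // T <- ACC (+) COEF[0]
//     for (int i = 1; i < l / 4; i++)                                // T <- (R4444 (x) T) (+) COEF[i]
//       for (int j = 0; j < 4; j++)
//         T[j] = (T[j] * r4 + c[4 * i + j]) % p;
//     uint64_t R4321[4] = { r4, r3, r2, r };
//     uint64_t sum = 0;                                              // SUM <- Σ R4321[j] (x) T[j]
//     for (int j = 0; j < 4; j++) sum = (sum + T[j] * R4321[j]) % p;
//     return sum;   // equals Σ c[i] * r^(l-i) mod p, with acc weighted by r^l
//   }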
void StubGenerator::poly1305_process_blocks_avx2(
  const Register input, const Register length,
  const Register a0, const Register a1, const Register a2,
  const Register r0, const Register r1, const Register c1)
{
  Label L_process256Loop, L_process256LoopDone;
  const Register t0 = r13;
  const Register t1 = r14;
  const Register t2 = r15;
  const Register mulql = rax;
  const Register mulqh = rdx;

  const XMMRegister YMM_ACC0 = xmm0;
  const XMMRegister YMM_ACC1 = xmm1;
  const XMMRegister YMM_ACC2 = xmm2;

  const XMMRegister YTMP1 = xmm3;
  const XMMRegister YTMP2 = xmm4;
  const XMMRegister YTMP3 = xmm5;
  const XMMRegister YTMP4 = xmm6;
  const XMMRegister YTMP5 = xmm7;
  const XMMRegister YTMP6 = xmm8;
  const XMMRegister YTMP7 = xmm9;
  const XMMRegister YTMP8 = xmm10;
  const XMMRegister YTMP9 = xmm11;
  const XMMRegister YTMP10 = xmm12;
  const XMMRegister YTMP11 = xmm13;
  const XMMRegister YTMP12 = xmm14;
  const XMMRegister YTMP13 = xmm15;

  const XMMRegister YMM_R0 = YTMP11;
  const XMMRegister YMM_R1 = YTMP12;
  const XMMRegister YMM_R2 = YTMP13;

  // XWORD aliases of YMM registers (for convenience)
  const XMMRegister XTMP1 = YTMP1;
  const XMMRegister XTMP2 = YTMP2;
  const XMMRegister XTMP3 = YTMP3;

  // Setup stack frame

  // Align stack and reserve space
  __ subptr(rsp, 32*8);

  /* Compute the following steps of the POLY1305_EVAL_POLYNOMIAL algorithm:
     T ← ACC             # load accumulator
     T ← T ⊕ COEF[0]     # add accumulator to the first 4 blocks
  */

  // Spread accumulator into 44-bit limbs in quadwords
  // Accumulator limbs to be stored in YTMP1,YTMP2,YTMP3
  // First limb (Acc[43:0])
  __ andq(t0, ExternalAddress(poly1305_mask44()), t1 /*rscratch*/);

  // Second limb (Acc[87:44])
  __ shrdq(a0, t0, 44);
  __ andq(a0, ExternalAddress(poly1305_mask44()), t1 /*rscratch*/);

  // Third limb (Acc[129:88])
  __ shrdq(a1, a2, 24);
  __ andq(a1, ExternalAddress(poly1305_mask42()), t1 /*rscratch*/);

  // --- end of spread accumulator

  // To add accumulator, we must unroll first loop iteration
  // Load first four 16-byte message blocks of data (64 bytes)
  __ vmovdqu(YTMP4, Address(input, 0));
  __ vmovdqu(YTMP5, Address(input, 32));

  // Interleave the input message data to form 44-bit limbs
  // YMM_ACC0 to have bits 0-43 of all 4 blocks in 4 qwords
  // YMM_ACC1 to have bits 87-44 of all 4 blocks in 4 qwords
  // YMM_ACC2 to have bits 127-88 of all 4 blocks in 4 qwords
  // Interleave blocks of data
  __ vpunpckhqdq(YMM_ACC2, YTMP4, YTMP5, Assembler::AVX_256bit);
  __ vpunpcklqdq(YMM_ACC0, YTMP4, YTMP5, Assembler::AVX_256bit);

  // Middle 44-bit limbs of new blocks
  __ vpsrlq(YMM_ACC1, YMM_ACC0, 44, Assembler::AVX_256bit);
  __ vpsllq(YTMP4, YMM_ACC2, 20, Assembler::AVX_256bit);
  __ vpor(YMM_ACC1, YMM_ACC1, YTMP4, Assembler::AVX_256bit);
  __ vpand(YMM_ACC1, YMM_ACC1, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, t1);

  // Lowest 44-bit limbs of new blocks
  __ vpand(YMM_ACC0, YMM_ACC0, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, t1);

  // Highest 42-bit limbs of new blocks; pad the msg with 2^128
  __ vpsrlq(YMM_ACC2, YMM_ACC2, 24, Assembler::AVX_256bit);

  // Add 2^128 to all 4 final qwords for the message
  __ vpor(YMM_ACC2, YMM_ACC2, ExternalAddress(poly1305_pad_msg()), Assembler::AVX_256bit, t1);
  // --- end of input interleaving and message padding

  // Add accumulator to the first message block
  // Accumulator limbs in YTMP1,YTMP2,YTMP3
  __ vpaddq(YMM_ACC0, YMM_ACC0, YTMP1, Assembler::AVX_256bit);
  __ vpaddq(YMM_ACC1, YMM_ACC1, YTMP2, Assembler::AVX_256bit);
  __ vpaddq(YMM_ACC2, YMM_ACC2, YTMP3, Assembler::AVX_256bit);

  /* Compute the following steps of the POLY1305_EVAL_POLYNOMIAL algorithm:
     Compute R4321, R4444;
     R4321 = [R^4, R^3, R^2, R^1];
     R4444 = [R^4, R^4, R^4, R^4];
  */

  // Compute the powers of R^1..R^4 and form 44-bit limbs of each
  // YTMP5 to have bits 0-127 for R^1 and R^2
  // YTMP6 to have bits 128-129 for R^1 and R^2
  __ vpinsrq(XTMP1, XTMP1, r1, 1);
  __ vinserti128(YTMP5, YTMP5, XTMP1, 1);

  __ vpxor(YTMP10, YTMP10, YTMP10, Assembler::AVX_256bit);
  __ vpxor(YTMP6, YTMP6, YTMP6, Assembler::AVX_256bit);

  poly1305_multiply_scalar(a0, a1, a2,
                           r0, r1, c1, true,
                           t0, t1, t2, mulql, mulqh);

  // Store R^2 in YTMP5, YTMP6
  __ vpinsrq(XTMP1, XTMP1, a1, 1);
  __ vinserti128(YTMP5, YTMP5, XTMP1, 0);

  __ vinserti128(YTMP6, YTMP6, XTMP1, 0);

  poly1305_multiply_scalar(a0, a1, a2,
                           r0, r1, c1, false,
                           t0, t1, t2, mulql, mulqh);

  // Store R^3 in YTMP7, YTMP2
  __ vpinsrq(XTMP1, XTMP1, a1, 1);
  __ vinserti128(YTMP7, YTMP7, XTMP1, 1);

  __ vinserti128(YTMP2, YTMP2, XTMP1, 1);

  poly1305_multiply_scalar(a0, a1, a2,
                           r0, r1, c1, false,
                           t0, t1, t2, mulql, mulqh);

  // Store R^4 in YTMP7, YTMP2
  __ vpinsrq(XTMP1, XTMP1, a1, 1);
  __ vinserti128(YTMP7, YTMP7, XTMP1, 0);

  __ vinserti128(YTMP2, YTMP2, XTMP1, 0);

  // Interleave the powers of R^1..R^4 to form 44-bit limbs (half-empty)
  __ vpunpckhqdq(YMM_R2, YTMP5, YTMP10, Assembler::AVX_256bit);
  __ vpunpcklqdq(YMM_R0, YTMP5, YTMP10, Assembler::AVX_256bit);
  __ vpunpckhqdq(YTMP3, YTMP7, YTMP10, Assembler::AVX_256bit);
  __ vpunpcklqdq(YTMP4, YTMP7, YTMP10, Assembler::AVX_256bit);

  __ vpslldq(YMM_R2, YMM_R2, 8, Assembler::AVX_256bit);
  __ vpslldq(YTMP6, YTMP6, 8, Assembler::AVX_256bit);
  __ vpslldq(YMM_R0, YMM_R0, 8, Assembler::AVX_256bit);
  __ vpor(YMM_R2, YMM_R2, YTMP3, Assembler::AVX_256bit);
  __ vpor(YMM_R0, YMM_R0, YTMP4, Assembler::AVX_256bit);
  __ vpor(YTMP6, YTMP6, YTMP2, Assembler::AVX_256bit);
  // Move 2 MSbits to top 24 bits, to be OR'ed later
  __ vpsllq(YTMP6, YTMP6, 40, Assembler::AVX_256bit);

  __ vpsrlq(YMM_R1, YMM_R0, 44, Assembler::AVX_256bit);
  __ vpsllq(YTMP5, YMM_R2, 20, Assembler::AVX_256bit);
  __ vpor(YMM_R1, YMM_R1, YTMP5, Assembler::AVX_256bit);
  __ vpand(YMM_R1, YMM_R1, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, t1);

  __ vpand(YMM_R0, YMM_R0, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, t1);
  __ vpsrlq(YMM_R2, YMM_R2, 24, Assembler::AVX_256bit);

  __ vpor(YMM_R2, YMM_R2, YTMP6, Assembler::AVX_256bit);
  // YMM_R0, YMM_R1, YMM_R2 have the limbs of R^1, R^2, R^3, R^4

  // Store R^4-R on stack for later use
  int _r4_r1_save = 0;
  __ vmovdqu(Address(rsp, _r4_r1_save + 0), YMM_R0);
  __ vmovdqu(Address(rsp, _r4_r1_save + 32), YMM_R1);
  __ vmovdqu(Address(rsp, _r4_r1_save + 32*2), YMM_R2);

  // Broadcast 44-bit limbs of R^4
  __ andq(t0, ExternalAddress(poly1305_mask44()), t1 /*rscratch*/); // First limb (R^4[43:0])
  __ movq(YMM_R0, t0);
  __ vpermq(YMM_R0, YMM_R0, 0x0, Assembler::AVX_256bit);

  __ shrdq(a0, t0, 44);
  __ andq(a0, ExternalAddress(poly1305_mask44()), t1 /*rscratch*/); // Second limb (R^4[87:44])
  __ movq(YMM_R1, a0);
  __ vpermq(YMM_R1, YMM_R1, 0x0, Assembler::AVX_256bit);

  __ shrdq(a1, a2, 24);
  __ andq(a1, ExternalAddress(poly1305_mask42()), t1 /*rscratch*/); // Third limb (R^4[129:88])
  __ movq(YMM_R2, a1);
  __ vpermq(YMM_R2, YMM_R2, 0x0, Assembler::AVX_256bit);
  // YMM_R0, YMM_R1, YMM_R2 have the limbs of R^4, R^4, R^4, R^4

  __ vpsllq(YTMP1, YMM_R1, 2, Assembler::AVX_256bit);
  __ vpsllq(YTMP2, YMM_R2, 2, Assembler::AVX_256bit);

  __ vpaddq(YTMP1, YTMP1, YMM_R1, Assembler::AVX_256bit);
  __ vpaddq(YTMP2, YTMP2, YMM_R2, Assembler::AVX_256bit);

  __ vpsllq(YTMP1, YTMP1, 2, Assembler::AVX_256bit);
  __ vpsllq(YTMP2, YTMP2, 2, Assembler::AVX_256bit);

  // Store broadcasted R^4 and 4*5*R^4 on stack for later use
  int _r4_save = 32*3;
  int _r4p_save = 32*6;
  __ vmovdqu(Address(rsp, _r4_save + 0), YMM_R0);
  __ vmovdqu(Address(rsp, _r4_save + 32), YMM_R1);
  __ vmovdqu(Address(rsp, _r4_save + 32*2), YMM_R2);
  __ vmovdqu(Address(rsp, _r4p_save), YTMP1);
  __ vmovdqu(Address(rsp, _r4p_save + 32), YTMP2);

  // Get the number of multiples of 4 message blocks (64 bytes) for vectorization
  __ movq(t0, length);
  __ andq(t0, 0xffffffc0); // 0xffffffffffffffc0 after sign extension

  // VECTOR LOOP: process 4 * 16-byte message blocks at a time
  __ bind(L_process256Loop);
  __ cmpl(t0, 16*4); // 64 bytes (4 blocks at a time)
  __ jcc(Assembler::belowEqual, L_process256LoopDone);

  /* Compute the following steps of the POLY1305_EVAL_POLYNOMIAL algorithm:
     for (i = 1 to l'-1):
       T ← (R4444 ⊗ T) ⊕ COEF[i];
  */

  // Perform multiply and reduce while loading the next block and adding it in interleaved manner
  // The logic to advance the SIMD loop counter (i.e. length -= 64) is inside the function below.
  // The function below also includes the logic to load the next 4 blocks of data for efficient port utilization.
  poly1305_msg_mul_reduce_vec4_avx2(YMM_ACC0, YMM_ACC1, YMM_ACC2,
                                    Address(rsp, _r4_save + 0), Address(rsp, _r4_save + 32), Address(rsp, _r4_save + 32*2),
                                    Address(rsp, _r4p_save), Address(rsp, _r4p_save + 32),
                                    YTMP1, YTMP2, YTMP3, YTMP4, YTMP5, YTMP6,
                                    YTMP7, YTMP8, YTMP9, YTMP10, YTMP11, YTMP12,
                                    input, t0, t1 /*rscratch*/);
  __ jmp(L_process256Loop);
  // end of vector loop

  __ bind(L_process256LoopDone);

  /* Compute the following steps of the POLY1305_EVAL_POLYNOMIAL algorithm:
     T ← R4321 ⊗ T
  */

  // Need to multiply by R^4, R^3, R^2, R
  __ vmovdqu(YMM_R0, Address(rsp, _r4_r1_save + 0));
  __ vmovdqu(YMM_R1, Address(rsp, _r4_r1_save + 32));
  __ vmovdqu(YMM_R2, Address(rsp, _r4_r1_save + 32*2));

  // Generate 4*5*[R^4..R^1] (ignore lowest limb)
  // YTMP1 to have bits 87-44 of all 1-4th powers of R' in 4 qwords
  // YTMP2 to have bits 129-88 of all 1-4th powers of R' in 4 qwords
  __ vpsllq(YTMP10, YMM_R1, 2, Assembler::AVX_256bit);
  __ vpaddq(YTMP1, YMM_R1, YTMP10, Assembler::AVX_256bit); // R1' (R1*5)
  __ vpsllq(YTMP10, YMM_R2, 2, Assembler::AVX_256bit);
  __ vpaddq(YTMP2, YMM_R2, YTMP10, Assembler::AVX_256bit); // R2' (R2*5)

  __ vpsllq(YTMP1, YTMP1, 2, Assembler::AVX_256bit);
  __ vpsllq(YTMP2, YTMP2, 2, Assembler::AVX_256bit);

  poly1305_mul_reduce_vec4_avx2(YMM_ACC0, YMM_ACC1, YMM_ACC2,
                                YMM_R0, YMM_R1, YMM_R2, YTMP1, YTMP2,
                                YTMP3, YTMP4, YTMP5, YTMP6,
                                YTMP7, YTMP8, YTMP9, t1);

  /* Compute the following steps of the POLY1305_EVAL_POLYNOMIAL algorithm:
     SUM ← T0 + T1 + T2 + T3
  */

  __ vextracti128(YTMP1, YMM_ACC0, 1);
  __ vextracti128(YTMP2, YMM_ACC1, 1);
  __ vextracti128(YTMP3, YMM_ACC2, 1);

  __ vpaddq(YMM_ACC0, YMM_ACC0, YTMP1, Assembler::AVX_128bit);
  __ vpaddq(YMM_ACC1, YMM_ACC1, YTMP2, Assembler::AVX_128bit);
  __ vpaddq(YMM_ACC2, YMM_ACC2, YTMP3, Assembler::AVX_128bit);

  __ vpsrldq(YTMP1, YMM_ACC0, 8, Assembler::AVX_128bit);
  __ vpsrldq(YTMP2, YMM_ACC1, 8, Assembler::AVX_128bit);
  __ vpsrldq(YTMP3, YMM_ACC2, 8, Assembler::AVX_128bit);

  __ vpaddq(YMM_ACC0, YMM_ACC0, YTMP1, Assembler::AVX_128bit);
  __ vpaddq(YMM_ACC1, YMM_ACC1, YTMP2, Assembler::AVX_128bit);
  __ vpaddq(YMM_ACC2, YMM_ACC2, YTMP3, Assembler::AVX_128bit);

  __ movq(YMM_ACC0, YMM_ACC0);
  __ movq(YMM_ACC1, YMM_ACC1);
  __ movq(YMM_ACC2, YMM_ACC2);

  __ lea(input, Address(input,16*4));
  __ andq(length, 63); // remaining bytes < 64

  // carry propagation
  __ vpsrlq(YTMP1, YMM_ACC0, 44, Assembler::AVX_128bit);
  __ vpand(YMM_ACC0, YMM_ACC0, ExternalAddress(poly1305_mask44()), Assembler::AVX_128bit, t1); // Clear top 20 bits
  __ vpaddq(YMM_ACC1, YMM_ACC1, YTMP1, Assembler::AVX_128bit);
  __ vpsrlq(YTMP1, YMM_ACC1, 44, Assembler::AVX_128bit);
  __ vpand(YMM_ACC1, YMM_ACC1, ExternalAddress(poly1305_mask44()), Assembler::AVX_128bit, t1); // Clear top 20 bits
  __ vpaddq(YMM_ACC2, YMM_ACC2, YTMP1, Assembler::AVX_128bit);
  __ vpsrlq(YTMP1, YMM_ACC2, 42, Assembler::AVX_128bit);
  __ vpand(YMM_ACC2, YMM_ACC2, ExternalAddress(poly1305_mask42()), Assembler::AVX_128bit, t1); // Clear top 22 bits
  __ vpsllq(YTMP2, YTMP1, 2, Assembler::AVX_128bit);
  __ vpaddq(YTMP1, YTMP1, YTMP2, Assembler::AVX_128bit);
  __ vpaddq(YMM_ACC0, YMM_ACC0, YTMP1, Assembler::AVX_128bit);

  __ movq(a0, YMM_ACC0);
  __ movq(t0, YMM_ACC1);

  __ movq(a2, YMM_ACC2);

  __ vzeroall(); // clears all ymm registers (ymm0 through ymm15)

  // SAFE DATA (clear powers of R)
  __ vmovdqu(Address(rsp, _r4_r1_save + 0), YTMP1);
  __ vmovdqu(Address(rsp, _r4_r1_save + 32), YTMP1);
  __ vmovdqu(Address(rsp, _r4_r1_save + 32*2), YTMP1);
  __ vmovdqu(Address(rsp, _r4_save + 0), YTMP1);
  __ vmovdqu(Address(rsp, _r4_save + 32), YTMP1);
  __ vmovdqu(Address(rsp, _r4_save + 32*2), YTMP1);
  __ vmovdqu(Address(rsp, _r4p_save), YTMP1);
  __ vmovdqu(Address(rsp, _r4p_save + 32), YTMP1);

  // Save rbp and rsp; clear stack frame

// Compute the component-wise product for 4 16-byte message blocks,
// i.e. for each block, compute [a2 a1 a0] = [a2 a1 a0] x [r2 r1 r0]
//
// Each block/number is represented by 3 44-bit limb digits; start with the multiplication:
//
//        a2       a1       a0
//  x     r2       r1       r0
// ----------------------------------
//     a2xr0    a1xr0    a0xr0
// +   a1xr1    a0xr1  5xa2xr1'     (r1' = r1<<2)
// +   a0xr2  5xa2xr2' 5xa1xr2'     (r2' = r2<<2)
// ----------------------------------
//        p2       p1       p0
void StubGenerator::poly1305_mul_reduce_vec4_avx2(
  const XMMRegister A0, const XMMRegister A1, const XMMRegister A2,
  const XMMRegister R0, const XMMRegister R1, const XMMRegister R2,
  const XMMRegister R1P, const XMMRegister R2P,
  const XMMRegister P0L, const XMMRegister P0H,
  const XMMRegister P1L, const XMMRegister P1H,
  const XMMRegister P2L, const XMMRegister P2H,
  const XMMRegister YTMP1, const Register rscratch)
{
  // Reset accumulator
  __ vpxor(P0L, P0L, P0L, Assembler::AVX_256bit);
  __ vpxor(P0H, P0H, P0H, Assembler::AVX_256bit);
  __ vpxor(P1L, P1L, P1L, Assembler::AVX_256bit);
  __ vpxor(P1H, P1H, P1H, Assembler::AVX_256bit);
  __ vpxor(P2L, P2L, P2L, Assembler::AVX_256bit);
  __ vpxor(P2H, P2H, P2H, Assembler::AVX_256bit);

  // Calculate partial products
  __ vpmadd52luq(P0L, A2, R1P, Assembler::AVX_256bit);
  __ vpmadd52huq(P0H, A2, R1P, Assembler::AVX_256bit);

  __ vpmadd52luq(P1L, A2, R2P, Assembler::AVX_256bit);
  __ vpmadd52huq(P1H, A2, R2P, Assembler::AVX_256bit);

  __ vpmadd52luq(P0L, A0, R0, Assembler::AVX_256bit);
  __ vpmadd52huq(P0H, A0, R0, Assembler::AVX_256bit);

  __ vpmadd52luq(P2L, A2, R0, Assembler::AVX_256bit);
  __ vpmadd52huq(P2H, A2, R0, Assembler::AVX_256bit);

  __ vpmadd52luq(P1L, A0, R1, Assembler::AVX_256bit);
  __ vpmadd52huq(P1H, A0, R1, Assembler::AVX_256bit);

  __ vpmadd52luq(P0L, A1, R2P, Assembler::AVX_256bit);
  __ vpmadd52huq(P0H, A1, R2P, Assembler::AVX_256bit);

  __ vpmadd52luq(P2L, A0, R2, Assembler::AVX_256bit);
  __ vpmadd52huq(P2H, A0, R2, Assembler::AVX_256bit);

  // Carry propagation (first pass)
  __ vpsrlq(YTMP1, P0L, 44, Assembler::AVX_256bit);
  __ vpsllq(P0H, P0H, 8, Assembler::AVX_256bit);
  __ vpmadd52luq(P1L, A1, R0, Assembler::AVX_256bit);
  __ vpmadd52huq(P1H, A1, R0, Assembler::AVX_256bit);
  // Carry propagation (first pass) - continue
  __ vpand(A0, P0L, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, rscratch); // Clear top 20 bits
  __ vpaddq(P0H, P0H, YTMP1, Assembler::AVX_256bit);
  __ vpmadd52luq(P2L, A1, R1, Assembler::AVX_256bit);
  __ vpmadd52huq(P2H, A1, R1, Assembler::AVX_256bit);

  // Carry propagation (first pass) - continue 2
  __ vpaddq(P1L, P1L, P0H, Assembler::AVX_256bit);
  __ vpsllq(P1H, P1H, 8, Assembler::AVX_256bit);
  __ vpsrlq(YTMP1, P1L, 44, Assembler::AVX_256bit);
  __ vpand(A1, P1L, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, rscratch); // Clear top 20 bits

  __ vpaddq(P2L, P2L, P1H, Assembler::AVX_256bit);
  __ vpaddq(P2L, P2L, YTMP1, Assembler::AVX_256bit);
  __ vpand(A2, P2L, ExternalAddress(poly1305_mask42()), Assembler::AVX_256bit, rscratch); // Clear top 22 bits
  __ vpsrlq(YTMP1, P2L, 42, Assembler::AVX_256bit);
  __ vpsllq(P2H, P2H, 10, Assembler::AVX_256bit);
  __ vpaddq(P2H, P2H, YTMP1, Assembler::AVX_256bit);

  // Carry propagation (second pass)
  // Multiply by 5 the highest bits (above 130 bits)
  __ vpaddq(A0, A0, P2H, Assembler::AVX_256bit);
  __ vpsllq(P2H, P2H, 2, Assembler::AVX_256bit);
  __ vpaddq(A0, A0, P2H, Assembler::AVX_256bit);

  __ vpsrlq(YTMP1, A0, 44, Assembler::AVX_256bit);
  __ vpand(A0, A0, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, rscratch); // Clear top 20 bits
  __ vpaddq(A1, A1, YTMP1, Assembler::AVX_256bit);
}

// Compute the component-wise product for 4 16-byte message blocks and add the next 4 blocks,
// i.e. for each block, compute [a2 a1 a0] = [a2 a1 a0] x [r2 r1 r0],
// followed by [a2 a1 a0] += [n2 n1 n0], where n contains the next 4 blocks of the message.
//
// Each block/number is represented by 3 44-bit limb digits; start with the multiplication:
//
//        a2       a1       a0
//  x     r2       r1       r0
// ----------------------------------
//     a2xr0    a1xr0    a0xr0
// +   a1xr1    a0xr1  5xa2xr1'     (r1' = r1<<2)
// +   a0xr2  5xa2xr2' 5xa1xr2'     (r2' = r2<<2)
// ----------------------------------
//        p2       p1       p0
void StubGenerator::poly1305_msg_mul_reduce_vec4_avx2(
  const XMMRegister A0, const XMMRegister A1, const XMMRegister A2,
  const Address R0, const Address R1, const Address R2,
  const Address R1P, const Address R2P,
  const XMMRegister P0L, const XMMRegister P0H,
  const XMMRegister P1L, const XMMRegister P1H,
  const XMMRegister P2L, const XMMRegister P2H,
  const XMMRegister YTMP1, const XMMRegister YTMP2,
  const XMMRegister YTMP3, const XMMRegister YTMP4,
  const XMMRegister YTMP5, const XMMRegister YTMP6,
  const Register input, const Register length, const Register rscratch)
{
  // Reset accumulator
  __ vpxor(P0L, P0L, P0L, Assembler::AVX_256bit);
  __ vpxor(P0H, P0H, P0H, Assembler::AVX_256bit);
  __ vpxor(P1L, P1L, P1L, Assembler::AVX_256bit);
  __ vpxor(P1H, P1H, P1H, Assembler::AVX_256bit);
  __ vpxor(P2L, P2L, P2L, Assembler::AVX_256bit);
  __ vpxor(P2H, P2H, P2H, Assembler::AVX_256bit);

  // Calculate partial products
  __ vpmadd52luq(P0L, A2, R1P, Assembler::AVX_256bit);
  __ vpmadd52huq(P0H, A2, R1P, Assembler::AVX_256bit);
  // Interleave input loading with hash computation
  __ lea(input, Address(input,16*4));
  __ subl(length, 16*4);
  __ vpmadd52luq(P1L, A2, R2P, Assembler::AVX_256bit);
  __ vpmadd52huq(P1H, A2, R2P, Assembler::AVX_256bit);
  // Load next block of data (64 bytes)
  __ vmovdqu(YTMP1, Address(input, 0));
  __ vmovdqu(YTMP2, Address(input, 32));
  // Interleave new blocks of data
  __ vpunpckhqdq(YTMP3, YTMP1, YTMP2, Assembler::AVX_256bit);
  __ vpunpcklqdq(YTMP1, YTMP1, YTMP2, Assembler::AVX_256bit);
  __ vpmadd52luq(P0L, A0, R0, Assembler::AVX_256bit);
  __ vpmadd52huq(P0H, A0, R0, Assembler::AVX_256bit);
  // Highest 42-bit limbs of new blocks
  __ vpsrlq(YTMP6, YTMP3, 24, Assembler::AVX_256bit);
  __ vpor(YTMP6, YTMP6, ExternalAddress(poly1305_pad_msg()), Assembler::AVX_256bit, rscratch);

  // Middle 44-bit limbs of new blocks
  __ vpsrlq(YTMP2, YTMP1, 44, Assembler::AVX_256bit);
  __ vpsllq(YTMP4, YTMP3, 20, Assembler::AVX_256bit);

  __ vpmadd52luq(P2L, A2, R0, Assembler::AVX_256bit);
  __ vpmadd52huq(P2H, A2, R0, Assembler::AVX_256bit);
  __ vpor(YTMP2, YTMP2, YTMP4, Assembler::AVX_256bit);
  __ vpand(YTMP2, YTMP2, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, rscratch);
  // Lowest 44-bit limbs of new blocks
  __ vpand(YTMP1, YTMP1, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, rscratch);

  __ vpmadd52luq(P1L, A0, R1, Assembler::AVX_256bit);
  __ vpmadd52huq(P1H, A0, R1, Assembler::AVX_256bit);
  __ vpmadd52luq(P0L, A1, R2P, Assembler::AVX_256bit);
  __ vpmadd52huq(P0H, A1, R2P, Assembler::AVX_256bit);
  __ vpmadd52luq(P2L, A0, R2, Assembler::AVX_256bit);
  __ vpmadd52huq(P2H, A0, R2, Assembler::AVX_256bit);

  // Carry propagation (first pass)
  __ vpsrlq(YTMP5, P0L, 44, Assembler::AVX_256bit);
  __ vpsllq(P0H, P0H, 8, Assembler::AVX_256bit);
  __ vpmadd52luq(P1L, A1, R0, Assembler::AVX_256bit);
  __ vpmadd52huq(P1H, A1, R0, Assembler::AVX_256bit);
  // Carry propagation (first pass) - continue
  __ vpand(A0, P0L, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, rscratch); // Clear top 20 bits
  __ vpaddq(P0H, P0H, YTMP5, Assembler::AVX_256bit);
  __ vpmadd52luq(P2L, A1, R1, Assembler::AVX_256bit);
  __ vpmadd52huq(P2H, A1, R1, Assembler::AVX_256bit);

  // Carry propagation (first pass) - continue 2
  __ vpaddq(P1L, P1L, P0H, Assembler::AVX_256bit);
  __ vpsllq(P1H, P1H, 8, Assembler::AVX_256bit);
  __ vpsrlq(YTMP5, P1L, 44, Assembler::AVX_256bit);
  __ vpand(A1, P1L, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, rscratch); // Clear top 20 bits

  __ vpaddq(P2L, P2L, P1H, Assembler::AVX_256bit);
  __ vpaddq(P2L, P2L, YTMP5, Assembler::AVX_256bit);
  __ vpand(A2, P2L, ExternalAddress(poly1305_mask42()), Assembler::AVX_256bit, rscratch); // Clear top 22 bits
  __ vpaddq(A2, A2, YTMP6, Assembler::AVX_256bit); // Add highest bits from new blocks to accumulator
  __ vpsrlq(YTMP5, P2L, 42, Assembler::AVX_256bit);
  __ vpsllq(P2H, P2H, 10, Assembler::AVX_256bit);
  __ vpaddq(P2H, P2H, YTMP5, Assembler::AVX_256bit);

  // Carry propagation (second pass)
  // Multiply by 5 the highest bits (above 130 bits)
  __ vpaddq(A0, A0, P2H, Assembler::AVX_256bit);
  __ vpsllq(P2H, P2H, 2, Assembler::AVX_256bit);
  __ vpaddq(A0, A0, P2H, Assembler::AVX_256bit);

  __ vpsrlq(YTMP5, A0, 44, Assembler::AVX_256bit);
  __ vpand(A0, A0, ExternalAddress(poly1305_mask44()), Assembler::AVX_256bit, rscratch); // Clear top 20 bits
  __ vpaddq(A0, A0, YTMP1, Assembler::AVX_256bit); // Add low 44-bit limbs from new blocks to accumulator
  __ vpaddq(A1, A1, YTMP2, Assembler::AVX_256bit); // Add middle 44-bit limbs from new blocks to accumulator
  __ vpaddq(A1, A1, YTMP5, Assembler::AVX_256bit);