/*
 * Copyright (c) 2024, Intel Corporation. All rights reserved.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
#include "precompiled.hpp"
#include "macroAssembler_x86.hpp"
#include "stubGenerator_x86_64.hpp"

#define __ _masm->
ATTRIBUTE_ALIGNED(64) uint64_t MODULUS_P256[] = {
  0x000fffffffffffffULL, 0x00000fffffffffffULL,
  0x0000000000000000ULL, 0x0000001000000000ULL,
  0x0000ffffffff0000ULL, 0x0000000000000000ULL,
  0x0000000000000000ULL, 0x0000000000000000ULL
};

static address modulus_p256() {
  return (address)MODULUS_P256;
}

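// The five non-zero entries above are the P-256 prime in radix-2^52 form, worked out here
// for reference:
//   p = 2^256 - 2^224 + 2^192 + 2^96 - 1
//     = m1 + m2*2^52 + m3*2^104 + m4*2^156 + m5*2^208
// with m1 = 2^52-1 = 0x000fffffffffffff, m2 = 2^44-1 = 0x00000fffffffffff, m3 = 0,
// m4 = 2^36 = 0x0000001000000000 and m5 = (2^32-1)*2^16 = 0x0000ffffffff0000; the
// remaining three lanes only pad the 512-bit vector with zero.
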
ATTRIBUTE_ALIGNED(64) uint64_t P256_MASK52[] = {
  0x000fffffffffffffULL, 0x000fffffffffffffULL,
  0x000fffffffffffffULL, 0x000fffffffffffffULL,
  0xffffffffffffffffULL, 0xffffffffffffffffULL,
  0xffffffffffffffffULL, 0xffffffffffffffffULL,
};

static address p256_mask52() {
  return (address)P256_MASK52;
}

ATTRIBUTE_ALIGNED(64) uint64_t SHIFT1R[] = {
  0x0000000000000001ULL, 0x0000000000000002ULL,
  0x0000000000000003ULL, 0x0000000000000004ULL,
  0x0000000000000005ULL, 0x0000000000000006ULL,
  0x0000000000000007ULL, 0x0000000000000000ULL,
};

static address shift_1R() {
  return (address)SHIFT1R;
}

ATTRIBUTE_ALIGNED(64) uint64_t SHIFT1L[] = {
  0x0000000000000007ULL, 0x0000000000000000ULL,
  0x0000000000000001ULL, 0x0000000000000002ULL,
  0x0000000000000003ULL, 0x0000000000000004ULL,
  0x0000000000000005ULL, 0x0000000000000006ULL,
};

static address shift_1L() {
  return (address)SHIFT1L;
}

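// SHIFT1R and SHIFT1L are index vectors for vpermq (dst[j] = src[idx[j]]). Worked out on a
// generic 8-element vector src = |s7|s6|s5|s4|s3|s2|s1|s0| (highest element on the left):
//   permute by SHIFT1R = {1,2,3,4,5,6,7,0}  ->  |s0|s7|s6|s5|s4|s3|s2|s1|  (one element down)
//   permute by SHIFT1L = {7,0,1,2,3,4,5,6}  ->  |s6|s5|s4|s3|s2|s1|s0|s7|  (one element up)
// In this file the wrapped-around element is always either masked off, known to be zero, or
// never stored, so the permutes act as whole-element shifts of the five-limb accumulators.
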
/*
 * Unrolled Word-by-Word Montgomery Multiplication
 * r = a * b * 2^-260 (mod P)
 *
 * Reference [1]: Shay Gueron and Vlad Krasnov
 *    "Fast Prime Field Elliptic Curve Cryptography with 256 Bit Primes"
 *    See Figure 5, "Algorithm 2: Word-by-Word Montgomery Multiplication for a Montgomery
 *    Friendly modulus p". Note: Step 6 is skipped; numAdds is used instead, to reuse the
 *    existing overflow logic.
 *
 *                                      +--+--+--+--+--+--+--+--+
 * M = load(*modulus_p256)              | 0| 0| 0|m5|m4|m3|m2|m1|
 *                                      +--+--+--+--+--+--+--+--+
 * A = load(*aLimbs)                    | 0| 0| 0|a5|a4|a3|a2|a1|
 *                                      +--+--+--+--+--+--+--+--+
 * Acc1 = 0                             | 0| 0| 0| 0| 0| 0| 0| 0|
 *                                      +--+--+--+--+--+--+--+--+
 * ---- for i = 0 to 4
 *                                      +--+--+--+--+--+--+--+--+
 * Acc2 = 0                             | 0| 0| 0| 0| 0| 0| 0| 0|
 *                                      +--+--+--+--+--+--+--+--+
 * B = replicate(bLimbs[i])             |bi|bi|bi|bi|bi|bi|bi|bi|
 *                                      +--+--+--+--+--+--+--+--+
 *                                      +--+--+--+--+--+--+--+--+
 *                                Acc1+=| 0| 0| 0|c5|c4|c3|c2|c1|
 *                                     *| 0| 0| 0|a5|a4|a3|a2|a1|
 * Acc1 += A * B                        |bi|bi|bi|bi|bi|bi|bi|bi|
 *                                      +--+--+--+--+--+--+--+--+
 *                                Acc2+=| 0| 0| 0| 0| 0| 0| 0| 0|
 *                                    *h| 0| 0| 0|a5|a4|a3|a2|a1|
 * Acc2 += A *h B                       |bi|bi|bi|bi|bi|bi|bi|bi|
 *                                      +--+--+--+--+--+--+--+--+
 * N = replicate(Acc1[0])               |n0|n0|n0|n0|n0|n0|n0|n0|
 *                                      +--+--+--+--+--+--+--+--+
 *                                      +--+--+--+--+--+--+--+--+
 *                                Acc1+=| 0| 0| 0|c5|c4|c3|c2|c1|
 *                                     *| 0| 0| 0|m5|m4|m3|m2|m1|
 * Acc1 += M * N                        |n0|n0|n0|n0|n0|n0|n0|n0|  Note: 52 low bits of Acc1[0] == 0 due to Montgomery!
 *                                      +--+--+--+--+--+--+--+--+
 *                                Acc2+=| 0| 0| 0|d5|d4|d3|d2|d1|
 *                                    *h| 0| 0| 0|m5|m4|m3|m2|m1|
 * Acc2 += M *h N                       |n0|n0|n0|n0|n0|n0|n0|n0|
 *                                      +--+--+--+--+--+--+--+--+
 *
 * // Combine high/low partial sums Acc1 + Acc2 (all but the last iteration)
 *                                      +--+--+--+--+--+--+--+--+
 * carry = Acc1[0] >> 52                | 0| 0| 0| 0| 0| 0| 0|c1|
 *                                      +--+--+--+--+--+--+--+--+
 * Acc2[0] += carry
 *                                      +--+--+--+--+--+--+--+--+
 * Acc1 = Acc1 shift one q element >>   | 0| 0| 0| 0|c5|c4|c3|c2|
 *                                      +--+--+--+--+--+--+--+--+
 * Acc1 = Acc1 + Acc2
 * ---- done
 *
 * // Last Carry round: Combine high/low partial sums Acc1<high_bits> + Acc1 + Acc2
 * carry = Acc1 >> 52
 * Acc1 = Acc1 shift one q element >>
 * Acc1 = mask52(Acc1)
 * Acc2 = Acc2 + carry
 * Acc1 = Acc1 + Acc2
 */
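// The generated code below interleaves low/high halves of the 52x52-bit products in two
// accumulators, which can be hard to follow. The scalar sketch below mirrors the same
// word-by-word flow for reference only; it is kept under "#if 0", assumes a compiler with
// unsigned __int128, and uses hypothetical names that do not exist elsewhere in HotSpot.
#if 0
static void montgomery_multiply_p256_ref(const uint64_t a[5], const uint64_t b[5], uint64_t r[5]) {
  const uint64_t mask52 = 0x000fffffffffffffULL;
  // Same five 52-bit limbs as MODULUS_P256 above
  const uint64_t m[5] = { 0x000fffffffffffffULL, 0x00000fffffffffffULL, 0x0000000000000000ULL,
                          0x0000001000000000ULL, 0x0000ffffffff0000ULL };
  uint64_t acc1[6] = {0}; // low  52 bits of the products, limb-aligned (c1..c5 above)
  uint64_t acc2[6] = {0}; // high 52 bits of the products, aligned one limb up (d1..d5 above)

  for (int i = 0; i < 5; i++) {
    for (int j = 0; j < 6; j++) acc2[j] = 0;

    // Acc1 += A * B, Acc2 += A *h B
    for (int j = 0; j < 5; j++) {
      unsigned __int128 p = (unsigned __int128)a[j] * b[i];
      acc1[j] += (uint64_t)p & mask52;
      acc2[j] += (uint64_t)(p >> 52);
    }

    // N = Acc1[0] mod 2^52; the modulus is "Montgomery friendly", so no extra multiply by m'
    uint64_t n = acc1[0] & mask52;

    // Acc1 += M * N, Acc2 += M *h N; the low 52 bits of Acc1[0] are now zero
    for (int j = 0; j < 5; j++) {
      unsigned __int128 p = (unsigned __int128)m[j] * n;
      acc1[j] += (uint64_t)p & mask52;
      acc2[j] += (uint64_t)(p >> 52);
    }

    if (i == 4) break; // the final combine is folded into the last carry round below

    // Combine: carry out of limb 0, divide by 2^52 (drop limb 0), fold in the high half
    acc2[0] += acc1[0] >> 52;
    for (int j = 0; j < 5; j++) acc1[j] = acc1[j + 1] + acc2[j];
    acc1[5] = 0;
  }

  // Last carry round: r[j] = low52(Acc1[j+1]) + (Acc1[j] >> 52) + Acc2[j]; the result stays
  // in redundant (not fully carry-propagated) form, matching the stub
  for (int j = 0; j < 5; j++) {
    r[j] = (acc1[j + 1] & mask52) + (acc1[j] >> 52) + acc2[j];
  }
}
#endif
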
void montgomeryMultiply(const Register aLimbs, const Register bLimbs, const Register rLimbs, const Register tmp, MacroAssembler* _masm) {
  // (assumed) scratch alias used only to materialize the k-mask constants; any free register works
  Register t0 = tmp;
  Register rscratch = tmp;

  XMMRegister A = xmm0;
  XMMRegister B = xmm1;
  XMMRegister T = xmm2;

  XMMRegister Acc1  = xmm10;
  XMMRegister Acc2  = xmm11;
  XMMRegister N     = xmm12;
  XMMRegister carry = xmm13;

  // Constants
  XMMRegister modulus = xmm20;
  XMMRegister shift1L = xmm21;
  XMMRegister shift1R = xmm22;
  XMMRegister mask52  = xmm23;
  KRegister limb0    = k1;
  KRegister allLimbs = k2;

  // limb0 selects only the lowest limb (0b00001); allLimbs selects all five limbs (0b11111)
  __ mov64(t0, 0x1);
  __ kmovql(limb0, t0);
  __ mov64(t0, 0x1f);
  __ kmovql(allLimbs, t0);

  __ evmovdquq(shift1L, allLimbs, ExternalAddress(shift_1L()), false, Assembler::AVX_512bit, rscratch);
  __ evmovdquq(shift1R, allLimbs, ExternalAddress(shift_1R()), false, Assembler::AVX_512bit, rscratch);
  __ evmovdquq(mask52, allLimbs, ExternalAddress(p256_mask52()), false, Assembler::AVX_512bit, rscratch);

  // M = load(*modulus_p256)
  __ evmovdquq(modulus, allLimbs, ExternalAddress(modulus_p256()), false, Assembler::AVX_512bit, rscratch);

  // A = load(*aLimbs); a masked evmovdquq() can be slow, so instead load the upper four
  // limbs as a full 256-bit vector, shift them up one element, and combine with the
  // 64-bit lowest limb.
  __ evmovdquq(A, Address(aLimbs, 8), Assembler::AVX_256bit);
  __ evpermq(A, allLimbs, shift1L, A, false, Assembler::AVX_512bit);
  __ movq(T, Address(aLimbs, 0));
  __ evporq(A, A, T, Assembler::AVX_512bit);
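  // With the limb names of the diagram above (leftmost cell = highest element):
  //   256-bit load of aLimbs[1..4]            A = | 0| 0| 0| 0|a5|a4|a3|a2|
  //   permute by SHIFT1L (one element up)     A = | 0| 0| 0|a5|a4|a3|a2| 0|
  //   OR in the 64-bit load of aLimbs[0]      A = | 0| 0| 0|a5|a4|a3|a2|a1|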

  // Acc1 = 0
  __ vpxorq(Acc1, Acc1, Acc1, Assembler::AVX_512bit);

  for (int i = 0; i < 5; i++) {
    // Acc2 = 0
    __ vpxorq(Acc2, Acc2, Acc2, Assembler::AVX_512bit);

    // B = replicate(bLimbs[i])
    __ vpbroadcastq(B, Address(bLimbs, i*8), Assembler::AVX_512bit);

    // Acc1 += A * B (low 52 bits of each product)
    __ evpmadd52luq(Acc1, A, B, Assembler::AVX_512bit);

    // Acc2 += A *h B (high 52 bits of each product)
    __ evpmadd52huq(Acc2, A, B, Assembler::AVX_512bit);

    // N = replicate(Acc1[0])
    __ vpbroadcastq(N, Acc1, Assembler::AVX_512bit);

    // Acc1 += M * N
    __ evpmadd52luq(Acc1, modulus, N, Assembler::AVX_512bit);

    // Acc2 += M *h N
    __ evpmadd52huq(Acc2, modulus, N, Assembler::AVX_512bit);

    // The partial sums are combined on all but the last iteration; the last iteration
    // leaves Acc1/Acc2 for the final carry round below to combine.
    if (i == 4) break;

    // Combine high/low partial sums Acc1 + Acc2

    // carry = Acc1[0] >> 52
    __ evpsrlq(carry, limb0, Acc1, 52, true, Assembler::AVX_512bit);

    // Acc2[0] += carry
    __ evpaddq(Acc2, limb0, carry, Acc2, true, Assembler::AVX_512bit);

    // Acc1 = Acc1 shift one q element >>
    __ evpermq(Acc1, allLimbs, shift1R, Acc1, false, Assembler::AVX_512bit);

    // Acc1 = Acc1 + Acc2
    __ vpaddq(Acc1, Acc1, Acc2, Assembler::AVX_512bit);
  }

  // Last Carry round: Combine high/low partial sums Acc1<high_bits> + Acc1 + Acc2

  // carry = Acc1 >> 52
  __ evpsrlq(carry, allLimbs, Acc1, 52, true, Assembler::AVX_512bit);

  // Acc1 = Acc1 shift one q element >>
  __ evpermq(Acc1, allLimbs, shift1R, Acc1, false, Assembler::AVX_512bit);

  // Acc1 = mask52(Acc1)
  __ evpandq(Acc1, Acc1, mask52, Assembler::AVX_512bit); // Clear top 12 bits

  // Acc2 = Acc2 + carry
  __ evpaddq(Acc2, allLimbs, carry, Acc2, true, Assembler::AVX_512bit);

  // Acc1 = Acc1 + Acc2
  __ vpaddq(Acc1, Acc1, Acc2, Assembler::AVX_512bit);

  // Output to rLimbs (1 + 4 limbs)
  __ movq(Address(rLimbs, 0), Acc1);
  __ evpermq(Acc1, k0, shift1R, Acc1, true, Assembler::AVX_512bit);
  __ evmovdquq(Address(rLimbs, 8), k0, Acc1, true, Assembler::AVX_256bit);
}

address StubGenerator::generate_intpoly_montgomeryMult_P256() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "intpoly_montgomeryMult_P256");
  address start = __ pc();

  // Register Map
  const Register aLimbs = c_rarg0; // rdi | rcx
  const Register bLimbs = c_rarg1; // rsi | rdx
  const Register rLimbs = c_rarg2; // rdx | r8
  const Register tmp    = r9;

  montgomeryMultiply(aLimbs, bLimbs, rLimbs, tmp, _masm);

  __ ret(0);
  return start;
}

// Conditional assignment: a = (set) ? b : a, performed limb by limb. The copy must be:
// - constant time (i.e. no branches)
// - no side channels (i.e. all memory must always be accessed, and in the same order)
void assign_avx(XMMRegister A, Address aAddr, XMMRegister B, Address bAddr, KRegister select, int vector_len, MacroAssembler* _masm) {
  __ evmovdquq(A, aAddr, vector_len);
  __ evmovdquq(B, bAddr, vector_len);
  __ evmovdquq(A, select, B, true, vector_len); // A = select ? B : A, per 64-bit lane
  __ evmovdquq(aAddr, A, vector_len);
}

void assign_scalar(Address aAddr, Address bAddr, Register select, Register tmp, MacroAssembler* _masm) {
  // long dummyLimbs = maskValue & (a[i] ^ b[i]);
  // a[i] = dummyLimbs ^ a[i];
  __ movq(tmp, aAddr);
  __ xorq(tmp, bAddr);
  __ andq(tmp, select);
  __ xorq(tmp, aAddr);
  __ movq(aAddr, tmp);
}

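// What the two helpers above compute, as a plain scalar sketch (illustrative only; 'set'
// is assumed to arrive as 0 or -1, since the stub uses it directly both as a k-register
// mask and as a 64-bit AND mask):
//
//   uint64_t mask = set;                      // 0x0000000000000000 or 0xffffffffffffffff
//   for (int i = 0; i < limbs; i++) {
//     uint64_t dummy = mask & (a[i] ^ b[i]);  // 0, or a[i] ^ b[i]
//     a[i] = dummy ^ a[i];                    // a[i] unchanged, or replaced by b[i]
//   }
//
// Every limb of both a and b is read, and a is written, on every call regardless of 'set',
// so the memory access pattern does not depend on the secret selector.
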
address StubGenerator::generate_intpoly_assign() {
  // Limb counts seen in practice, and how they decompose into vector-friendly chunks:
  //   MontgomeryIntPolynP256:   5 = 4 + 1
  //   IntegerPolynomial1305:    5 = 4 + 1
  //   IntegerPolynomial25519:  10 = 8 + 2
  //   IntegerPolynomialP256:   10 = 8 + 2
  //   Curve25519OrderField:    10 = 8 + 2
  //   P256OrderField:          10 = 8 + 2
  //   IntegerPolynomialP384:   14 = 8 + 4 + 2
  //   P384OrderField:          14 = 8 + 4 + 2
  //   IntegerPolynomial448:    16 = 8 + 8
  //   Curve448OrderField:      16 = 8 + 8
  //   IntegerPolynomialP521:   19 = 8 + 8 + 2 + 1
  //   P521OrderField:          19 = 8 + 8 + 2 + 1
  // Special cases: 5, 10, 14, 16, 19

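  // Worked example for the 14-limb case handled below: 14 limbs = 2 + 4 + 8, copied as one
  // 128-bit (XMM) chunk at byte offset 0, one 256-bit (YMM) chunk at offset 16 (= 2*8 bytes
  // already copied) and one 512-bit (ZMM) chunk at offset 48 (= (2+4)*8).
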
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "intpoly_assign");
  address start = __ pc();

  // Register Map
  const Register set    = c_rarg0;
  const Register aLimbs = c_rarg1;
  const Register bLimbs = c_rarg2;
  const Register length = c_rarg3;
  // (assumed) caller-saved scratch register, distinct from the argument registers on both ABIs
  const Register tmp    = rax;

  XMMRegister A = xmm0;
  XMMRegister B = xmm1;

  KRegister select = k1;
  Label L_Length5, L_Length10, L_Length14, L_Length16, L_Length19, L_DefaultLoop, L_Done;

  __ kmovql(select, set);

  // NOTE! Crypto code cannot branch on user input. However, it is allowed to branch on the
  // number of limbs: the limb count is a constant in each IntegerPolynomial (i.e. this
  // branch leaks only the number of limbs, which is not a secret).
  __ cmpl(length, 5);
  __ jcc(Assembler::equal, L_Length5);
  __ cmpl(length, 10);
  __ jcc(Assembler::equal, L_Length10);
  __ cmpl(length, 14);
  __ jcc(Assembler::equal, L_Length14);
  __ cmpl(length, 16);
  __ jcc(Assembler::equal, L_Length16);
  __ cmpl(length, 19);
  __ jcc(Assembler::equal, L_Length19);

  // Default copy loop (UNLIKELY)
  __ cmpl(length, 0);
  __ jcc(Assembler::lessEqual, L_Done);
  __ bind(L_DefaultLoop);
  assign_scalar(Address(aLimbs, 0), Address(bLimbs, 0), set, tmp, _masm);
  __ lea(aLimbs, Address(aLimbs, 8));
  __ lea(bLimbs, Address(bLimbs, 8));
  __ subl(length, 1);
  __ jcc(Assembler::greater, L_DefaultLoop);
  __ jmp(L_Done);

  __ bind(L_Length5); // 1 + 4
  assign_scalar(Address(aLimbs, 0), Address(bLimbs, 0), set, tmp, _masm);
  assign_avx(A, Address(aLimbs, 8), B, Address(bLimbs, 8), select, Assembler::AVX_256bit, _masm);
  __ jmp(L_Done);

  __ bind(L_Length10); // 2 + 8
  assign_avx(A, Address(aLimbs, 0), B, Address(bLimbs, 0), select, Assembler::AVX_128bit, _masm);
  assign_avx(A, Address(aLimbs, 16), B, Address(bLimbs, 16), select, Assembler::AVX_512bit, _masm);
  __ jmp(L_Done);

  __ bind(L_Length14); // 2 + 4 + 8
  assign_avx(A, Address(aLimbs, 0), B, Address(bLimbs, 0), select, Assembler::AVX_128bit, _masm);
  assign_avx(A, Address(aLimbs, 16), B, Address(bLimbs, 16), select, Assembler::AVX_256bit, _masm);
  assign_avx(A, Address(aLimbs, 48), B, Address(bLimbs, 48), select, Assembler::AVX_512bit, _masm);
  __ jmp(L_Done);

  __ bind(L_Length16); // 8 + 8
  assign_avx(A, Address(aLimbs, 0), B, Address(bLimbs, 0), select, Assembler::AVX_512bit, _masm);
  assign_avx(A, Address(aLimbs, 64), B, Address(bLimbs, 64), select, Assembler::AVX_512bit, _masm);
  __ jmp(L_Done);

  __ bind(L_Length19); // 1 + 2 + 8 + 8
  assign_scalar(Address(aLimbs, 0), Address(bLimbs, 0), set, tmp, _masm);
  assign_avx(A, Address(aLimbs, 8), B, Address(bLimbs, 8), select, Assembler::AVX_128bit, _masm);
  assign_avx(A, Address(aLimbs, 24), B, Address(bLimbs, 24), select, Assembler::AVX_512bit, _masm);
  assign_avx(A, Address(aLimbs, 88), B, Address(bLimbs, 88), select, Assembler::AVX_512bit, _masm);