/*
 * Copyright (c) 2024, Intel Corporation. All rights reserved.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "macroAssembler_x86.hpp"
#include "stubGenerator_x86_64.hpp"

#define __ _masm->

ATTRIBUTE_ALIGNED(64) uint64_t MODULUS_P256[] = {
  0x000fffffffffffffULL, 0x00000fffffffffffULL,
  0x0000000000000000ULL, 0x0000001000000000ULL,
  0x0000ffffffff0000ULL, 0x0000000000000000ULL,
  0x0000000000000000ULL, 0x0000000000000000ULL
};
static address modulus_p256() {
  return (address)MODULUS_P256;
}
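
// MODULUS_P256 holds the NIST P-256 prime
//   p = 2^256 - 2^224 + 2^192 + 2^96 - 1
// in redundant base-2^52 form: five 52-bit limbs, least-significant first, zero-padded to
// eight qwords so the whole table fits a single 512-bit ZMM load. Because the low limb is
// 2^52 - 1, p == -1 (mod 2^52) and therefore -p^-1 == 1 (mod 2^52); this is what makes the
// modulus "Montgomery friendly": the reduction multiplier N in the loop below is simply a
// broadcast of Acc1[0], with no extra multiplication by a precomputed constant.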

ATTRIBUTE_ALIGNED(64) uint64_t P256_MASK52[] = {
  0x000fffffffffffffULL, 0x000fffffffffffffULL,
  0x000fffffffffffffULL, 0x000fffffffffffffULL,
  0xffffffffffffffffULL, 0xffffffffffffffffULL,
  0xffffffffffffffffULL, 0xffffffffffffffffULL,
};
static address p256_mask52() {
  return (address)P256_MASK52;
}

ATTRIBUTE_ALIGNED(64) uint64_t SHIFT1R[] = {
  0x0000000000000001ULL, 0x0000000000000002ULL,
  0x0000000000000003ULL, 0x0000000000000004ULL,
  0x0000000000000005ULL, 0x0000000000000006ULL,
  0x0000000000000007ULL, 0x0000000000000000ULL,
};
static address shift_1R() {
  return (address)SHIFT1R;
}

ATTRIBUTE_ALIGNED(64) uint64_t SHIFT1L[] = {
  0x0000000000000007ULL, 0x0000000000000000ULL,
  0x0000000000000001ULL, 0x0000000000000002ULL,
  0x0000000000000003ULL, 0x0000000000000004ULL,
  0x0000000000000005ULL, 0x0000000000000006ULL,
};
static address shift_1L() {
  return (address)SHIFT1L;
}
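
// SHIFT1R and SHIFT1L are lane-index tables for evpermq (dst[j] = src[idx[j]]).
// Used under the 5-lane mask (k2, 0x1f) with zeroing, SHIFT1R = {1,2,3,4,5,6,7,0}
// implements the "shift one q element >>" step of the pseudocode below, roughly
//   for (j = 0; j < 5; j++) acc[j] = acc[j + 1];   // upper lanes cleared
// while SHIFT1L = {7,0,1,...,6} shifts every qword one lane up; it is used when loading
// aLimbs, to make room in lane 0 for the separately-loaded low limb.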

/**
 * Unrolled Word-by-Word Montgomery Multiplication
 * r = a * b * 2^-260 (mod P)
 *
 * Reference [1]: Shay Gueron and Vlad Krasnov
 *    "Fast Prime Field Elliptic Curve Cryptography with 256 Bit Primes"
 *    See Figure 5. "Algorithm 2: Word-by-Word Montgomery Multiplication for a Montgomery
 *    Friendly modulus p". Note: Step 6 is skipped; instead, numAdds is used to reuse the
 *    existing overflow logic.
 *
 * Pseudocode:
 *
 *                                                     +--+--+--+--+--+--+--+--+
 *   M = load(*modulus_p256)                           | 0| 0| 0|m5|m4|m3|m2|m1|
 *                                                     +--+--+--+--+--+--+--+--+
 *   A = load(*aLimbs)                                 | 0| 0| 0|a5|a4|a3|a2|a1|
 *                                                     +--+--+--+--+--+--+--+--+
 *   Acc1 = 0                                          | 0| 0| 0| 0| 0| 0| 0| 0|
 *                                                     +--+--+--+--+--+--+--+--+
 *      ---- for i = 0 to 4
 *                                                     +--+--+--+--+--+--+--+--+
 *          Acc2 = 0                                   | 0| 0| 0| 0| 0| 0| 0| 0|
 *                                                     +--+--+--+--+--+--+--+--+
 *          B = replicate(bLimbs[i])                   |bi|bi|bi|bi|bi|bi|bi|bi|
 *                                                     +--+--+--+--+--+--+--+--+
 *                                                     +--+--+--+--+--+--+--+--+
 *                                               Acc1+=| 0| 0| 0|c5|c4|c3|c2|c1|
 *                                                    *| 0| 0| 0|a5|a4|a3|a2|a1|
 *          Acc1 += A *  B                             |bi|bi|bi|bi|bi|bi|bi|bi|
 *                                                     +--+--+--+--+--+--+--+--+
 *                                               Acc2+=| 0| 0| 0| 0| 0| 0| 0| 0|
 *                                                   *h| 0| 0| 0|a5|a4|a3|a2|a1|
 *          Acc2 += A *h B                             |bi|bi|bi|bi|bi|bi|bi|bi|
 *                                                     +--+--+--+--+--+--+--+--+
 *          N = replicate(Acc1[0])                     |n0|n0|n0|n0|n0|n0|n0|n0|
 *                                                     +--+--+--+--+--+--+--+--+
 *                                                     +--+--+--+--+--+--+--+--+
 *                                               Acc1+=| 0| 0| 0|c5|c4|c3|c2|c1|
 *                                                    *| 0| 0| 0|m5|m4|m3|m2|m1|
 *          Acc1 += M *  N                             |n0|n0|n0|n0|n0|n0|n0|n0| Note: 52 low bits of Acc1[0] == 0 due to Montgomery!
 *                                                     +--+--+--+--+--+--+--+--+
 *                                               Acc2+=| 0| 0| 0|d5|d4|d3|d2|d1|
 *                                                   *h| 0| 0| 0|m5|m4|m3|m2|m1|
 *          Acc2 += M *h N                             |n0|n0|n0|n0|n0|n0|n0|n0|
 *                                                     +--+--+--+--+--+--+--+--+
 *          if (i == 4) break;
 *          // Combine high/low partial sums Acc1 + Acc2
 *                                                     +--+--+--+--+--+--+--+--+
 *          carry = Acc1[0] >> 52                      | 0| 0| 0| 0| 0| 0| 0|c1|
 *                                                     +--+--+--+--+--+--+--+--+
 *          Acc2[0] += carry
 *                                                     +--+--+--+--+--+--+--+--+
 *          Acc1 = Acc1 shift one q element>>          | 0| 0| 0| 0|c5|c4|c3|c2|
 *                                                     +--+--+--+--+--+--+--+--+
 *          Acc1 = Acc1 + Acc2
 *      ---- done
 *   // Last Carry round: Combine high/low partial sums Acc1<high_bits> + Acc1 + Acc2
 *   carry = Acc1 >> 52
 *   Acc1 = Acc1 shift one q element >>
 *   Acc1 = mask52(Acc1)
 *   Acc2 += carry
 *   Acc1 = Acc1 + Acc2
 *   output to rLimbs
 */
void montgomeryMultiply(const Register aLimbs, const Register bLimbs, const Register rLimbs, const Register tmp, MacroAssembler* _masm) {
  Register t0 = tmp;
  Register rscratch = tmp;

  // Inputs
  XMMRegister A = xmm0;
  XMMRegister B = xmm1;
  XMMRegister T = xmm2;

  // Intermediates
  XMMRegister Acc1 = xmm10;
  XMMRegister Acc2 = xmm11;
  XMMRegister N    = xmm12;
  XMMRegister carry = xmm13;

  // Constants
  XMMRegister modulus = xmm20;
  XMMRegister shift1L = xmm21;
  XMMRegister shift1R = xmm22;
  XMMRegister mask52  = xmm23;
  KRegister limb0    = k1;
  KRegister allLimbs = k2;

  __ mov64(t0, 0x1);
  __ kmovql(limb0, t0);
  __ mov64(t0, 0x1f);
  __ kmovql(allLimbs, t0);
  __ evmovdquq(shift1L, allLimbs, ExternalAddress(shift_1L()), false, Assembler::AVX_512bit, rscratch);
  __ evmovdquq(shift1R, allLimbs, ExternalAddress(shift_1R()), false, Assembler::AVX_512bit, rscratch);
  __ evmovdquq(mask52, allLimbs, ExternalAddress(p256_mask52()), false, Assembler::AVX_512bit, rscratch);

  // M = load(*modulus_p256)
  __ evmovdquq(modulus, allLimbs, ExternalAddress(modulus_p256()), false, Assembler::AVX_512bit, rscratch);

  // A = load(*aLimbs); masked evmovdquq() can be slow. Instead load the full 256 bits and combine with the low 64-bit limb
  __ evmovdquq(A, Address(aLimbs, 8), Assembler::AVX_256bit);
  __ evpermq(A, allLimbs, shift1L, A, false, Assembler::AVX_512bit);
  __ movq(T, Address(aLimbs, 0));
  __ evporq(A, A, T, Assembler::AVX_512bit);

  // Acc1 = 0
  __ vpxorq(Acc1, Acc1, Acc1, Assembler::AVX_512bit);
  for (int i = 0; i < 5; i++) {
      // Acc2 = 0
      __ vpxorq(Acc2, Acc2, Acc2, Assembler::AVX_512bit);

      // B = replicate(bLimbs[i])
      __ vpbroadcastq(B, Address(bLimbs, i*8), Assembler::AVX_512bit);

      // Acc1 += A * B
      __ evpmadd52luq(Acc1, A, B, Assembler::AVX_512bit);

      // Acc2 += A *h B
      __ evpmadd52huq(Acc2, A, B, Assembler::AVX_512bit);

      // N = replicate(Acc1[0])
      __ vpbroadcastq(N, Acc1, Assembler::AVX_512bit);

      // Acc1 += M *  N
      __ evpmadd52luq(Acc1, modulus, N, Assembler::AVX_512bit);

      // Acc2 += M *h N
      __ evpmadd52huq(Acc2, modulus, N, Assembler::AVX_512bit);

      if (i == 4) break;

      // Combine high/low partial sums Acc1 + Acc2

      // carry = Acc1[0] >> 52
      __ evpsrlq(carry, limb0, Acc1, 52, true, Assembler::AVX_512bit);

      // Acc2[0] += carry
      __ evpaddq(Acc2, limb0, carry, Acc2, true, Assembler::AVX_512bit);

      // Acc1 = Acc1 shift one q element >>
      __ evpermq(Acc1, allLimbs, shift1R, Acc1, false, Assembler::AVX_512bit);

      // Acc1 = Acc1 + Acc2
      __ vpaddq(Acc1, Acc1, Acc2, Assembler::AVX_512bit);
  }

  // Last Carry round: Combine high/low partial sums Acc1<high_bits> + Acc1 + Acc2
  // carry = Acc1 >> 52
  __ evpsrlq(carry, allLimbs, Acc1, 52, true, Assembler::AVX_512bit);

  // Acc1 = Acc1 shift one q element >>
  __ evpermq(Acc1, allLimbs, shift1R, Acc1, false, Assembler::AVX_512bit);

  // Acc1 = mask52(Acc1)
  __ evpandq(Acc1, Acc1, mask52, Assembler::AVX_512bit); // Clear top 12 bits

  // Acc2 += carry
  __ evpaddq(Acc2, allLimbs, carry, Acc2, true, Assembler::AVX_512bit);

  // Acc1 = Acc1 + Acc2
  __ vpaddq(Acc1, Acc1, Acc2, Assembler::AVX_512bit);

  // output to rLimbs (1 + 4 limbs)
  __ movq(Address(rLimbs, 0), Acc1);
  __ evpermq(Acc1, k0, shift1R, Acc1, true, Assembler::AVX_512bit);
  __ evmovdquq(Address(rLimbs, 8), k0, Acc1, true, Assembler::AVX_256bit);
}
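
// For reference, a minimal scalar sketch of the same word-by-word Montgomery loop, assuming
// the 5 x 52-bit redundant limb layout described above and fully reduced inputs. This is an
// illustrative model of the stub, not code used by it; the helper name and the use of
// unsigned __int128 are our own choices for the sketch.
//
//   static void montmul_p256_ref(const uint64_t a[5], const uint64_t b[5], uint64_t r[5]) {
//     const uint64_t MASK52 = (1ULL << 52) - 1;
//     const uint64_t* p = MODULUS_P256;                  // low five qwords are the limbs of p
//     unsigned __int128 acc[6] = {0, 0, 0, 0, 0, 0};
//     for (int i = 0; i < 5; i++) {
//       for (int j = 0; j < 5; j++) acc[j] += (unsigned __int128)a[j] * b[i];
//       uint64_t n = (uint64_t)acc[0] & MASK52;          // -p^-1 == 1 (mod 2^52), so n is the low limb
//       for (int j = 0; j < 5; j++) acc[j] += (unsigned __int128)p[j] * n;
//       acc[1] += acc[0] >> 52;                          // low 52 bits of acc[0] are now zero
//       for (int j = 0; j < 5; j++) acc[j] = acc[j + 1]; // divide by 2^52 (shift one limb down)
//       acc[5] = 0;
//     }
//     unsigned __int128 c = 0;                           // final carry propagation; the result
//     for (int j = 0; j < 4; j++) {                      // stays in redundant form, so the top
//       c += acc[j];                                     // limb keeps any excess above 52 bits
//       r[j] = (uint64_t)c & MASK52;
//       c >>= 52;
//     }
//     r[4] = (uint64_t)(c + acc[4]);
//   }
//
// The vector code computes the same thing, but splits each 104-bit product into its low and
// high 52-bit halves (Acc1/Acc2 via vpmadd52luq/vpmadd52huq) and carries between the two
// accumulators instead of relying on 128-bit scalar arithmetic.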

address StubGenerator::generate_intpoly_montgomeryMult_P256() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "intpoly_montgomeryMult_P256");
  address start = __ pc();
  __ enter();

  // Register Map
  const Register aLimbs  = c_rarg0; // rdi | rcx
  const Register bLimbs  = c_rarg1; // rsi | rdx
  const Register rLimbs  = c_rarg2; // rdx | r8
  const Register tmp     = r9;

  montgomeryMultiply(aLimbs, bLimbs, rLimbs, tmp, _masm);

  __ leave();
  __ ret(0);
  return start;
}

// A = B if select
// Must be:
//  - constant time (i.e. no branches)
//  - no side channels (i.e. all memory must always be accessed, and in the same order)
void assign_avx(XMMRegister A, Address aAddr, XMMRegister B, Address bAddr, KRegister select, int vector_len, MacroAssembler* _masm) {
  __ evmovdquq(A, aAddr, vector_len);
  __ evmovdquq(B, bAddr, vector_len);
  __ evmovdquq(A, select, B, true, vector_len);
  __ evmovdquq(aAddr, A, vector_len);
}

void assign_scalar(Address aAddr, Address bAddr, Register select, Register tmp, MacroAssembler* _masm) {
  // Original java:
  // long dummyLimbs = maskValue & (a[i] ^ b[i]);
  // a[i] = dummyLimbs ^ a[i];

  __ movq(tmp, aAddr);
  __ xorq(tmp, bAddr);
  __ andq(tmp, select);
  __ xorq(aAddr, tmp);
}
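
// Worked identity for the masked-select trick above: with mask = all ones (select set),
//   (mask & (a ^ b)) ^ a == (a ^ b) ^ a == b,
// and with mask = 0,
//   (0 & (a ^ b)) ^ a == a,
// so a is overwritten with b exactly when select is true, without any data-dependent branch
// or data-dependent memory access.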

address StubGenerator::generate_intpoly_assign() {
  // KNOWN Lengths:
  //   MontgomeryIntPolynP256:  5 = 4 + 1
  //   IntegerPolynomial1305:   5 = 4 + 1
  //   IntegerPolynomial25519: 10 = 8 + 2
  //   IntegerPolynomialP256:  10 = 8 + 2
  //   Curve25519OrderField:   10 = 8 + 2
  //   P256OrderField:         10 = 8 + 2
  //   IntegerPolynomialP384:  14 = 8 + 4 + 2
  //   P384OrderField:         14 = 8 + 4 + 2
  //   IntegerPolynomial448:   16 = 8 + 8
  //   Curve448OrderField:     16 = 8 + 8
  //   IntegerPolynomialP521:  19 = 8 + 8 + 2 + 1
  //   P521OrderField:         19 = 8 + 8 + 2 + 1
  // Special Cases 5, 10, 14, 16, 19

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "intpoly_assign");
  address start = __ pc();
  __ enter();

  // Inputs
  const Register set     = c_rarg0;
  const Register aLimbs  = c_rarg1;
  const Register bLimbs  = c_rarg2;
  const Register length  = c_rarg3;
  XMMRegister A = xmm0;
  XMMRegister B = xmm1;

  Register tmp = r9;
  KRegister select = k1;
  Label L_Length5, L_Length10, L_Length14, L_Length16, L_Length19, L_DefaultLoop, L_Done;

  __ negq(set);
  __ kmovql(select, set);
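  // set is 0 or 1; negating it yields an all-zeros or all-ones 64-bit mask, which serves
  // both as the XOR mask in assign_scalar and, via kmovql, as the lane mask for the
  // masked vector moves in assign_avx.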

  // NOTE! Crypto code cannot branch on user input. However, branching on the number of limbs is
  // allowed; the number of limbs is a constant in each IntegerPolynomial (i.e. this side-channel
  // branch leaks the number of limbs, which is not a secret).
  __ cmpl(length, 5);
  __ jcc(Assembler::equal, L_Length5);
  __ cmpl(length, 10);
  __ jcc(Assembler::equal, L_Length10);
  __ cmpl(length, 14);
  __ jcc(Assembler::equal, L_Length14);
  __ cmpl(length, 16);
  __ jcc(Assembler::equal, L_Length16);
  __ cmpl(length, 19);
  __ jcc(Assembler::equal, L_Length19);

  // Default copy loop (UNLIKELY)
  __ cmpl(length, 0);
  __ jcc(Assembler::lessEqual, L_Done);
  __ bind(L_DefaultLoop);
  assign_scalar(Address(aLimbs, 0), Address(bLimbs, 0), set, tmp, _masm);
  __ subl(length, 1);
  __ lea(aLimbs, Address(aLimbs, 8));
  __ lea(bLimbs, Address(bLimbs, 8));
  __ cmpl(length, 0);
  __ jcc(Assembler::greater, L_DefaultLoop);
  __ jmp(L_Done);

  __ bind(L_Length5); // 1 + 4
  assign_scalar(Address(aLimbs, 0), Address(bLimbs, 0), set, tmp, _masm);
  assign_avx(A, Address(aLimbs, 8), B, Address(bLimbs, 8), select, Assembler::AVX_256bit, _masm);
  __ jmp(L_Done);

  __ bind(L_Length10); // 2 + 8
  assign_avx(A, Address(aLimbs, 0),  B, Address(bLimbs, 0),  select, Assembler::AVX_128bit, _masm);
  assign_avx(A, Address(aLimbs, 16), B, Address(bLimbs, 16), select, Assembler::AVX_512bit, _masm);
  __ jmp(L_Done);

  __ bind(L_Length14); // 2 + 4 + 8
  assign_avx(A, Address(aLimbs, 0),  B, Address(bLimbs, 0),  select, Assembler::AVX_128bit, _masm);
  assign_avx(A, Address(aLimbs, 16), B, Address(bLimbs, 16), select, Assembler::AVX_256bit, _masm);
  assign_avx(A, Address(aLimbs, 48), B, Address(bLimbs, 48), select, Assembler::AVX_512bit, _masm);
  __ jmp(L_Done);

  __ bind(L_Length16); // 8 + 8
  assign_avx(A, Address(aLimbs, 0),  B, Address(bLimbs, 0),  select, Assembler::AVX_512bit, _masm);
  assign_avx(A, Address(aLimbs, 64), B, Address(bLimbs, 64), select, Assembler::AVX_512bit, _masm);
  __ jmp(L_Done);

  __ bind(L_Length19); // 1 + 2 + 8 + 8
  assign_scalar(Address(aLimbs, 0), Address(bLimbs, 0), set, tmp, _masm);
  assign_avx(A, Address(aLimbs, 8),  B, Address(bLimbs, 8),  select, Assembler::AVX_128bit, _masm);
  assign_avx(A, Address(aLimbs, 24), B, Address(bLimbs, 24), select, Assembler::AVX_512bit, _masm);
  assign_avx(A, Address(aLimbs, 88), B, Address(bLimbs, 88), select, Assembler::AVX_512bit, _masm);

  __ bind(L_Done);
  __ leave();
  __ ret(0);
  return start;
}