jdk

Форк
0
/
macroAssembler_aarch64_aes.cpp 
691 строка · 24.2 Кб
1
/*
2
 * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
3
 * Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
4
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5
 *
6
 * This code is free software; you can redistribute it and/or modify it
7
 * under the terms of the GNU General Public License version 2 only, as
8
 * published by the Free Software Foundation.
9
 *
10
 * This code is distributed in the hope that it will be useful, but WITHOUT
11
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13
 * version 2 for more details (a copy is included in the LICENSE file that
14
 * accompanied this code).
15
 *
16
 * You should have received a copy of the GNU General Public License version
17
 * 2 along with this work; if not, write to the Free Software Foundation,
18
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19
 *
20
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21
 * or visit www.oracle.com if you need additional information or have any
22
 * questions.
23
 *
24
 */
25

26
#include "precompiled.hpp"
27

28
#include "asm/assembler.hpp"
29
#include "asm/assembler.inline.hpp"
30
#include "macroAssembler_aarch64.hpp"
31
#include "memory/resourceArea.hpp"
32
#include "runtime/stubRoutines.hpp"
33

34
// AES-ECB decrypt a single 16-byte block.
//
// from   - source address of the 16-byte ciphertext block
// to     - destination address for the 16-byte plaintext block
// key    - address of the expanded key; advanced while the round keys
//          are consumed and restored to its original value on exit
// keylen - expanded key length in 32-bit words (44/52/60 for
//          AES-128/192/256)
//
// Clobbers v0..v5.
void MacroAssembler::aesecb_decrypt(Register from, Register to, Register key, Register keylen) {
  Label L_doLast;

  ld1(v0, T16B, from); // get 16 bytes of input

  // First round key, applied last by the final eor below.  Each round
  // key is byte-swapped per 32-bit word (rev32) after loading,
  // presumably to match the byte order of the Java-side key schedule
  // -- TODO confirm against the AESCrypt intrinsic's key layout.
  ld1(v5, T16B, post(key, 16));
  rev32(v5, T16B, v5);

  // Rounds 1..4: AESD (AddRoundKey + inverse SubBytes/ShiftRows)
  // followed by AESIMC (inverse MixColumns), pairs kept adjacent.
  ld1(v1, v2, v3, v4, T16B, post(key, 64));
  rev32(v1, T16B, v1);
  rev32(v2, T16B, v2);
  rev32(v3, T16B, v3);
  rev32(v4, T16B, v4);
  aesd(v0, v1);
  aesimc(v0, v0);
  aesd(v0, v2);
  aesimc(v0, v0);
  aesd(v0, v3);
  aesimc(v0, v0);
  aesd(v0, v4);
  aesimc(v0, v0);

  // Rounds 5..8.
  ld1(v1, v2, v3, v4, T16B, post(key, 64));
  rev32(v1, T16B, v1);
  rev32(v2, T16B, v2);
  rev32(v3, T16B, v3);
  rev32(v4, T16B, v4);
  aesd(v0, v1);
  aesimc(v0, v0);
  aesd(v0, v2);
  aesimc(v0, v0);
  aesd(v0, v3);
  aesimc(v0, v0);
  aesd(v0, v4);
  aesimc(v0, v0);

  ld1(v1, v2, T16B, post(key, 32));
  rev32(v1, T16B, v1);
  rev32(v2, T16B, v2);

  // AES-128 (keylen == 44): v1/v2 already hold the last two round keys.
  cmpw(keylen, 44);
  br(Assembler::EQ, L_doLast);

  aesd(v0, v1);
  aesimc(v0, v0);
  aesd(v0, v2);
  aesimc(v0, v0);

  ld1(v1, v2, T16B, post(key, 32));
  rev32(v1, T16B, v1);
  rev32(v2, T16B, v2);

  // AES-192 (keylen == 52): v1/v2 now hold the last two round keys.
  cmpw(keylen, 52);
  br(Assembler::EQ, L_doLast);

  aesd(v0, v1);
  aesimc(v0, v0);
  aesd(v0, v2);
  aesimc(v0, v0);

  // AES-256: load the final two round keys.
  ld1(v1, v2, T16B, post(key, 32));
  rev32(v1, T16B, v1);
  rev32(v2, T16B, v2);

  bind(L_doLast);

  // Last two rounds; note there is no AESIMC after the final AESD.
  aesd(v0, v1);
  aesimc(v0, v0);
  aesd(v0, v2);

  // Final AddRoundKey with the first round key saved in v5.
  eor(v0, T16B, v0, v5);

  st1(v0, T16B, to);

  // Preserve the address of the start of the key
  sub(key, key, keylen, LSL, exact_log2(sizeof (jint)));
}
111

112
// Load expanded key into v17..v31
//
// key    - address of the expanded key; advanced while loading and
//          restored to its original value on exit
// keylen - expanded key length in 32-bit words (44/52/60 for
//          AES-128/192/256)
//
// The final eleven round keys always land in v21..v31; the two extra
// keys of AES-192 go to v19..v20 and the four extra keys of AES-256 to
// v17..v20, so shorter keys simply skip the leading loads.  Every key
// word is byte-swapped (rev32) after loading, matching aesecb_decrypt
// above.
void MacroAssembler::aesenc_loadkeys(Register key, Register keylen) {
  Label L_loadkeys_44, L_loadkeys_52;
  cmpw(keylen, 52);
  br(Assembler::LO, L_loadkeys_44);  // AES-128: 11 round keys
  br(Assembler::EQ, L_loadkeys_52);  // AES-192: 13 round keys

  // AES-256 only: first two round keys.
  ld1(v17, v18,  T16B,  post(key, 32));
  rev32(v17,  T16B, v17);
  rev32(v18,  T16B, v18);
  bind(L_loadkeys_52);
  // AES-192 and AES-256: next two round keys.
  ld1(v19, v20,  T16B,  post(key, 32));
  rev32(v19,  T16B, v19);
  rev32(v20,  T16B, v20);
  bind(L_loadkeys_44);
  // All key sizes: the remaining eleven round keys.
  ld1(v21, v22, v23, v24,  T16B,  post(key, 64));
  rev32(v21,  T16B, v21);
  rev32(v22,  T16B, v22);
  rev32(v23,  T16B, v23);
  rev32(v24,  T16B, v24);
  ld1(v25, v26, v27, v28,  T16B,  post(key, 64));
  rev32(v25,  T16B, v25);
  rev32(v26,  T16B, v26);
  rev32(v27,  T16B, v27);
  rev32(v28,  T16B, v28);
  ld1(v29, v30, v31,  T16B, post(key, 48));
  rev32(v29,  T16B, v29);
  rev32(v30,  T16B, v30);
  rev32(v31,  T16B, v31);

  // Preserve the address of the start of the key
  sub(key, key, keylen, LSL, exact_log2(sizeof (jint)));
}
145

146
// NeoverseTM N1Software Optimization Guide:
147
// Adjacent AESE/AESMC instruction pairs and adjacent AESD/AESIMC
148
// instruction pairs will exhibit the performance characteristics
149
// described in Section 4.6.
150
void MacroAssembler::aes_round(FloatRegister input, FloatRegister subkey) {
151
  aese(input, subkey); aesmc(input, input);
152
}
153

154
// KernelGenerator
//
// The abstract base class of an unrolled function generator.
// Subclasses override generate(), length(), and next() to generate
// unrolled and interleaved functions.
//
// The core idea is that a subclass defines a method which generates
// the base case of a function and a method to generate a clone of it,
// shifted to a different set of registers. KernelGenerator will then
// generate several interleaved copies of the function, with each one
// using a different set of registers.
//
// The subclass must implement three methods: length(), which is the
// number of instruction bundles in the intrinsic, generate(int n)
// which emits the nth instruction bundle in the intrinsic, and next()
// which takes an instance of the generator and returns a version of it,
// shifted to a new set of registers.
class KernelGenerator: public MacroAssembler {
protected:
  const int _unrolls;  // number of interleaved copies to emit
public:
  KernelGenerator(Assembler *as, int unrolls)
    : MacroAssembler(as->code()), _unrolls(unrolls) { }
  // Emit the index'th instruction bundle of this copy of the kernel.
  virtual void generate(int index) = 0;
  // Number of instruction bundles in the kernel.
  virtual int length() = 0;
  // Return a clone of this generator, shifted to a fresh register set.
  virtual KernelGenerator *next() = 0;
  int unrolls() { return _unrolls; }
  // Emit all bundles of all copies, interleaved bundle-by-bundle.
  void unroll();
};
184

185
void KernelGenerator::unroll() {
186
  ResourceMark rm;
187
  KernelGenerator **generators
188
    = NEW_RESOURCE_ARRAY(KernelGenerator *, unrolls());
189

190
  generators[0] = this;
191
  for (int i = 1; i < unrolls(); i++) {
192
    generators[i] = generators[i-1]->next();
193
  }
194

195
  for (int j = 0; j < length(); j++) {
196
    for (int i = 0; i < unrolls(); i++) {
197
      generators[i]->generate(j);
198
    }
199
  }
200
}
201

202
// An unrolled and interleaved generator for AES encryption.
//
// Encrypts _data in place with the 15 consecutive round-key registers
// starting at _subkeys.  The key-length dispatch (case 1) and the two
// labels are emitted only by the first clone (_once); clones created
// by next() share those labels, so a branch taken in the first clone
// skips the corresponding rounds of every clone, because the bundles
// are interleaved in index order by KernelGenerator::unroll().
class AESKernelGenerator: public KernelGenerator {
  Register _from, _to;           // block source/destination, or noreg
  const Register _keylen;        // expanded key length in 32-bit words
  FloatRegister _data;           // the block being encrypted, in place
  const FloatRegister _subkeys;  // first of 15 round-key registers
  bool _once;                    // first clone: emits dispatch and labels
  Label _rounds_44, _rounds_52;  // entry points for AES-128 / AES-192

public:
  AESKernelGenerator(Assembler *as, int unrolls,
                     Register from, Register to, Register keylen, FloatRegister data,
                     FloatRegister subkeys, bool once = true)
    : KernelGenerator(as, unrolls),
      _from(from), _to(to), _keylen(keylen), _data(data),
      _subkeys(subkeys), _once(once) {
  }

  virtual void generate(int index) {
    switch (index) {
    case  0:
      if (_from != noreg) {
        ld1(_data, T16B, _from); // get 16 bytes of input
      }
      break;
    case  1:
      if (_once) {
        // AES-128 (keylen 44) skips the first four rounds, AES-192
        // (keylen 52) skips the first two; AES-256 falls through.
        cmpw(_keylen, 52);
        br(Assembler::LO, _rounds_44);
        br(Assembler::EQ, _rounds_52);
      }
      break;
    case  2:  aes_round(_data, as_FloatRegister(_subkeys->encoding() + 0));  break;
    case  3:  aes_round(_data, as_FloatRegister(_subkeys->encoding() + 1));  break;
    case  4:
      if (_once)  bind(_rounds_52);
      break;
    case  5:  aes_round(_data, as_FloatRegister(_subkeys->encoding() + 2));  break;
    case  6:  aes_round(_data, as_FloatRegister(_subkeys->encoding() + 3));  break;
    case  7:
      if (_once)  bind(_rounds_44);
      break;
    case  8:  aes_round(_data, as_FloatRegister(_subkeys->encoding() +  4));  break;
    case  9:  aes_round(_data, as_FloatRegister(_subkeys->encoding() +  5));  break;
    case 10:  aes_round(_data, as_FloatRegister(_subkeys->encoding() +  6));  break;
    case 11:  aes_round(_data, as_FloatRegister(_subkeys->encoding() +  7));  break;
    case 12:  aes_round(_data, as_FloatRegister(_subkeys->encoding() +  8));  break;
    case 13:  aes_round(_data, as_FloatRegister(_subkeys->encoding() +  9));  break;
    case 14:  aes_round(_data, as_FloatRegister(_subkeys->encoding() + 10));  break;
    case 15:  aes_round(_data, as_FloatRegister(_subkeys->encoding() + 11));  break;
    case 16:  aes_round(_data, as_FloatRegister(_subkeys->encoding() + 12));  break;
    // Last round has no MixColumns: bare AESE then final AddRoundKey.
    case 17:  aese(_data, as_FloatRegister(_subkeys->encoding() + 13));  break;
    case 18:  eor(_data, T16B, _data, as_FloatRegister(_subkeys->encoding() + 14));  break;
    case 19:
      if (_to != noreg) {
        st1(_data, T16B, _to);
      }
      break;
    default: ShouldNotReachHere();
    }
  }

  // Clone shifted to the next data register; same keys, shared labels,
  // and _once false so the dispatch/labels are not emitted again.
  virtual KernelGenerator *next() {
    return new AESKernelGenerator(this, _unrolls,
                                  _from, _to, _keylen,
                                  _data->successor(), _subkeys, /*once*/false);
  }

  virtual int length() { return 20; }
};
272

273
// Uses expanded key in v17..v31
274
// Returns encrypted values in inputs.
275
// If to != noreg, store value at to; likewise from
276
// Preserves key, keylen
277
// Increments from, to
278
// Input data in v0, v1, ...
279
// unrolls controls the number of times to unroll the generated function
280
void MacroAssembler::aesecb_encrypt(Register from, Register to, Register keylen,
281
                                    FloatRegister data, int unrolls) {
282
  AESKernelGenerator(this, unrolls, from, to, keylen, data, v17) .unroll();
283
}
284

285
// ghash_multiply and ghash_reduce are the non-unrolled versions of
// the GHASH function generators.

// Karatsuba 128x128 -> 256-bit carry-less multiply of a (the subkey)
// by b (the state).  On return the register pair <result_hi:result_lo>
// holds the 256-bit product.  tmp1..tmp3 are clobbered; a, b and
// a1_xor_a0 are preserved.
void MacroAssembler::ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
                                     FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
                                     FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3) {
  // Karatsuba multiplication performs a 128*128 -> 256-bit
  // multiplication in three 128-bit multiplications and a few
  // additions.
  //
  // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
  // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
  //
  // Inputs:
  //
  // A0 in a.d[0]     (subkey)
  // A1 in a.d[1]
  // (A1+A0) in a1_xor_a0.d[0]
  //
  // B0 in b.d[0]     (state)
  // B1 in b.d[1]

  ext(tmp1, T16B, b, b, 0x08);        // swap the 64-bit halves of B
  pmull2(result_hi, T1Q, b, a, T2D);  // A1*B1
  eor(tmp1, T16B, tmp1, b);           // (B1+B0)
  pmull(result_lo,  T1Q, b, a, T1D);  // A0*B0
  pmull(tmp2, T1Q, tmp1, a1_xor_a0, T1D); // (A1+A0)(B1+B0)

  ext(tmp1, T16B, result_lo, result_hi, 0x08);
  eor(tmp3, T16B, result_hi, result_lo); // A1*B1+A0*B0
  eor(tmp2, T16B, tmp2, tmp1);
  eor(tmp2, T16B, tmp2, tmp3);           // middle 128 bits of the product

  // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
  ins(result_hi, D, tmp2, 0, 1);
  ins(result_lo, D, tmp2, 1, 0);
}
321

322
// Reduce the 256-bit product <hi:lo> modulo the GHASH field
// polynomial, leaving the 128-bit result in `result`.  p holds the
// field polynomial and vzr must be zero.  hi, lo and t1 are clobbered;
// p and vzr are preserved.
void MacroAssembler::ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
                  FloatRegister p, FloatRegister vzr, FloatRegister t1) {
  const FloatRegister t0 = result;  // result doubles as a temporary

  // The GCM field polynomial f is z^128 + p(z), where p =
  // z^7+z^2+z+1.
  //
  //    z^128 === -p(z)  (mod (z^128 + p(z)))
  //
  // so, given that the product we're reducing is
  //    a == lo + hi * z^128
  // substituting,
  //      === lo - hi * p(z)  (mod (z^128 + p(z)))
  //
  // we reduce by multiplying hi by p(z) and subtracting the result
  // from (i.e. XORing it with) lo.  Because p has no nonzero high
  // bits we can do this with two 64-bit multiplications, lo*p and
  // hi*p.

  pmull2(t0, T1Q, hi, p, T2D);  // hi.d[1] * p
  ext(t1, T16B, t0, vzr, 8);    // t1 = t0 >> 64
  eor(hi, T16B, hi, t1);
  ext(t1, T16B, vzr, t0, 8);    // t1 = t0 << 64
  eor(lo, T16B, lo, t1);
  pmull(t0, T1Q, hi, p, T1D);   // hi.d[0] * p
  eor(result, T16B, lo, t0);
}
349

350
// Unrolled/interleaved generator for the GHASH Karatsuba multiply.
// The instruction sequence matches ghash_multiply() above, split into
// one bundle per instruction.  _result_lo, _result_hi, _b and the
// temps advance by register_stride per clone (see next()); _a, _vzr,
// _a1_xor_a0 and _p are shared by all clones.
class GHASHMultiplyGenerator: public KernelGenerator {
  FloatRegister _result_lo, _result_hi, _b,
    _a, _vzr, _a1_xor_a0, _p,
    _tmp1, _tmp2, _tmp3;

public:
  GHASHMultiplyGenerator(Assembler *as, int unrolls,
                         /* offsetted registers */
                         FloatRegister result_lo, FloatRegister result_hi,
                         FloatRegister b,
                         /* non-offsetted (shared) registers */
                         FloatRegister a, FloatRegister a1_xor_a0, FloatRegister p, FloatRegister vzr,
                         /* offsetted (temp) registers */
                         FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3)
    : KernelGenerator(as, unrolls),
      _result_lo(result_lo), _result_hi(result_hi), _b(b),
      _a(a), _vzr(vzr), _a1_xor_a0(a1_xor_a0), _p(p),
      _tmp1(tmp1), _tmp2(tmp2), _tmp3(tmp3) { }

  // Distance between the register sets used by successive clones.
  int register_stride = 7;

  virtual void generate(int index) {
    // Karatsuba multiplication performs a 128*128 -> 256-bit
    // multiplication in three 128-bit multiplications and a few
    // additions.
    //
    // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
    // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
    //
    // Inputs:
    //
    // A0 in a.d[0]     (subkey)
    // A1 in a.d[1]
    // (A1+A0) in a1_xor_a0.d[0]
    //
    // B0 in b.d[0]     (state)
    // B1 in b.d[1]

    switch (index) {
      case  0:  ext(_tmp1, T16B, _b, _b, 0x08);  break;
      case  1:  pmull2(_result_hi, T1Q, _b, _a, T2D);  // A1*B1
        break;
      case  2:  eor(_tmp1, T16B, _tmp1, _b);           // (B1+B0)
        break;
      case  3:  pmull(_result_lo,  T1Q, _b, _a, T1D);  // A0*B0
        break;
      case  4:  pmull(_tmp2, T1Q, _tmp1, _a1_xor_a0, T1D); // (A1+A0)(B1+B0)
        break;

      case  5:  ext(_tmp1, T16B, _result_lo, _result_hi, 0x08);  break;
      case  6:  eor(_tmp3, T16B, _result_hi, _result_lo); // A1*B1+A0*B0
        break;
      case  7:  eor(_tmp2, T16B, _tmp2, _tmp1);  break;
      case  8:  eor(_tmp2, T16B, _tmp2, _tmp3);  break;

        // Register pair <_result_hi:_result_lo> holds the _result of carry-less multiplication
      case  9:  ins(_result_hi, D, _tmp2, 0, 1);  break;
      case 10:  ins(_result_lo, D, _tmp2, 1, 0);  break;
      default: ShouldNotReachHere();
    }
  }

  // Copy-construct, then shift every per-clone register by the stride;
  // the shared registers (_a, _vzr, _a1_xor_a0, _p) are left alone.
  virtual KernelGenerator* next() {
    GHASHMultiplyGenerator* result = new GHASHMultiplyGenerator(*this);
    result->_result_lo = as_FloatRegister(result->_result_lo->encoding() + register_stride);
    result->_result_hi = as_FloatRegister(result->_result_hi->encoding() + register_stride);
    result->_b         = as_FloatRegister(result->_b        ->encoding() + register_stride);
    result->_tmp1      = as_FloatRegister(result->_tmp1     ->encoding() + register_stride);
    result->_tmp2      = as_FloatRegister(result->_tmp2     ->encoding() + register_stride);
    result->_tmp3      = as_FloatRegister(result->_tmp3     ->encoding() + register_stride);
    return result;
  }

  virtual int length() { return 11; }
};
425

426
// Reduce the 128-bit product in hi:lo by the GCM field polynomial.
427
// The FloatRegister argument called data is optional: if it is a
428
// valid register, we interleave LD1 instructions with the
429
// reduction. This is to reduce latency next time around the loop.
430
class GHASHReduceGenerator: public KernelGenerator {
431
  FloatRegister _result, _lo, _hi, _p, _vzr, _data, _t1;
432
  int _once;
433
public:
434
  GHASHReduceGenerator(Assembler *as, int unrolls,
435
                       /* offsetted registers */
436
                       FloatRegister result, FloatRegister lo, FloatRegister hi,
437
                       /* non-offsetted (shared) registers */
438
                       FloatRegister p, FloatRegister vzr, FloatRegister data,
439
                       /* offsetted (temp) registers */
440
                       FloatRegister t1)
441
    : KernelGenerator(as, unrolls),
442
      _result(result), _lo(lo), _hi(hi),
443
      _p(p), _vzr(vzr), _data(data), _t1(t1), _once(true) { }
444

445
  int register_stride = 7;
446

447
  virtual void generate(int index) {
448
    const FloatRegister t0 = _result;
449

450
    switch (index) {
451
      // The GCM field polynomial f is z^128 + p(z), where p =
452
      // z^7+z^2+z+1.
453
      //
454
      //    z^128 === -p(z)  (mod (z^128 + p(z)))
455
      //
456
      // so, given that the product we're reducing is
457
      //    a == lo + hi * z^128
458
      // substituting,
459
      //      === lo - hi * p(z)  (mod (z^128 + p(z)))
460
      //
461
      // we reduce by multiplying hi by p(z) and subtracting the _result
462
      // from (i.e. XORing it with) lo.  Because p has no nonzero high
463
      // bits we can do this with two 64-bit multiplications, lo*p and
464
      // hi*p.
465

466
      case  0:  pmull2(t0, T1Q, _hi, _p, T2D);  break;
467
      case  1:  ext(_t1, T16B, t0, _vzr, 8);  break;
468
      case  2:  eor(_hi, T16B, _hi, _t1);  break;
469
      case  3:  ext(_t1, T16B, _vzr, t0, 8);  break;
470
      case  4:  eor(_lo, T16B, _lo, _t1);  break;
471
      case  5:  pmull(t0, T1Q, _hi, _p, T1D);  break;
472
      case  6:  eor(_result, T16B, _lo, t0);  break;
473
      default: ShouldNotReachHere();
474
    }
475

476
    // Sprinkle load instructions into the generated instructions
477
    if (_data->is_valid() && _once) {
478
      assert(length() >= unrolls(), "not enough room for inteleaved loads");
479
      if (index < unrolls()) {
480
        ld1(as_FloatRegister(_data->encoding() + index*register_stride), T16B, post(r2, 0x10));
481
      }
482
    }
483
  }
484

485
  virtual KernelGenerator *next() {
486
    GHASHReduceGenerator *result = new GHASHReduceGenerator(*this);
487
    result->_result = as_FloatRegister(result->_result->encoding() + register_stride);
488
    result->_hi     = as_FloatRegister(result->_hi    ->encoding() + register_stride);
489
    result->_lo     = as_FloatRegister(result->_lo    ->encoding() + register_stride);
490
    result->_t1     = as_FloatRegister(result->_t1    ->encoding() + register_stride);
491
    result->_once = false;
492
    return result;
493
  }
494

495
 int length() { return 7; }
496
};
497

498
// Perform a GHASH multiply/reduce on a single FloatRegister.
//
// Computes result = (a * b) reduced by the field polynomial p: a
// Karatsuba carry-less multiply into <result_hi:result_lo> followed
// by the polynomial reduction.  vzr must be zero and a1_xor_a0 must
// hold the xor of a's 64-bit halves (see ghash_multiply above).
// result_lo, result_hi, b, t1, t2 and t3 are clobbered.
void MacroAssembler::ghash_modmul(FloatRegister result,
                                  FloatRegister result_lo, FloatRegister result_hi, FloatRegister b,
                                  FloatRegister a, FloatRegister vzr, FloatRegister a1_xor_a0, FloatRegister p,
                                  FloatRegister t1, FloatRegister t2, FloatRegister t3) {
  ghash_multiply(result_lo, result_hi, a, b, a1_xor_a0, t1, t2, t3);
  ghash_reduce(result, result_lo, result_hi, p, vzr, t1);
}
506

507
// Interleaved GHASH processing.
508
//
509
// Clobbers all vector registers.
510
//
511
void MacroAssembler::ghash_processBlocks_wide(address field_polynomial, Register state,
512
                                              Register subkeyH,
513
                                              Register data, Register blocks, int unrolls) {
514
  int register_stride = 7;
515

516
  // Bafflingly, GCM uses little-endian for the byte order, but
517
  // big-endian for the bit order.  For example, the polynomial 1 is
518
  // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
519
  //
520
  // So, we must either reverse the bytes in each word and do
521
  // everything big-endian or reverse the bits in each byte and do
522
  // it little-endian.  On AArch64 it's more idiomatic to reverse
523
  // the bits in each byte (we have an instruction, RBIT, to do
524
  // that) and keep the data in little-endian bit order through the
525
  // calculation, bit-reversing the inputs and outputs.
526

527
  assert(unrolls * register_stride < 32, "out of registers");
528

529
  FloatRegister a1_xor_a0 = v28;
530
  FloatRegister Hprime = v29;
531
  FloatRegister vzr = v30;
532
  FloatRegister p = v31;
533
  eor(vzr, T16B, vzr, vzr); // zero register
534

535
  ldrq(p, field_polynomial);    // The field polynomial
536

537
  ldrq(v0, Address(state));
538
  ldrq(Hprime, Address(subkeyH));
539

540
  rev64(v0, T16B, v0);          // Bit-reverse words in state and subkeyH
541
  rbit(v0, T16B, v0);
542
  rev64(Hprime, T16B, Hprime);
543
  rbit(Hprime, T16B, Hprime);
544

545
  // Powers of H -> Hprime
546

547
  Label already_calculated, done;
548
  {
549
    // The first time around we'll have to calculate H**2, H**3, etc.
550
    // Look at the largest power of H in the subkeyH array to see if
551
    // it's already been calculated.
552
    ldp(rscratch1, rscratch2, Address(subkeyH, 16 * (unrolls - 1)));
553
    orr(rscratch1, rscratch1, rscratch2);
554
    cbnz(rscratch1, already_calculated);
555

556
    orr(v6, T16B, Hprime, Hprime);  // Start with H in v6 and Hprime
557
    for (int i = 1; i < unrolls; i++) {
558
      ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
559
      eor(a1_xor_a0, T16B, a1_xor_a0, Hprime);    // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
560
      ghash_modmul(/*result*/v6, /*result_lo*/v5, /*result_hi*/v4, /*b*/v6,
561
                   Hprime, vzr, a1_xor_a0, p,
562
                   /*temps*/v1, v3, v2);
563
      rev64(v1, T16B, v6);
564
      rbit(v1, T16B, v1);
565
      strq(v1, Address(subkeyH, 16 * i));
566
    }
567
    b(done);
568
  }
569
  {
570
    bind(already_calculated);
571

572
    // Load the largest power of H we need into v6.
573
    ldrq(v6, Address(subkeyH, 16 * (unrolls - 1)));
574
    rev64(v6, T16B, v6);
575
    rbit(v6, T16B, v6);
576
  }
577
  bind(done);
578

579
  orr(Hprime, T16B, v6, v6);     // Move H ** unrolls into Hprime
580

581
  // Hprime contains (H ** 1, H ** 2, ... H ** unrolls)
582
  // v0 contains the initial state. Clear the others.
583
  for (int i = 1; i < unrolls; i++) {
584
    int ofs = register_stride * i;
585
    FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + ofs);
586
    eor(v0_ofs, T16B, v0_ofs, v0_ofs); // zero each state register
587
  }
588

589
  ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
590
  eor(a1_xor_a0, T16B, a1_xor_a0, Hprime);    // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
591

592
  // Load #unrolls blocks of data
593
  for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) {
594
    FloatRegister v2_ofs = as_FloatRegister(v2->encoding() + ofs);
595
    ld1(v2_ofs, T16B, post(data, 0x10));
596
  }
597

598
  // Register assignments, replicated across 4 clones, v0 ... v23
599
  //
600
  // v0: input / output: current state, result of multiply/reduce
601
  // v1: temp
602
  // v2: input: one block of data (the ciphertext)
603
  //     also used as a temp once the data has been consumed
604
  // v3: temp
605
  // v4: output: high part of product
606
  // v5: output: low part ...
607
  // v6: unused
608
  //
609
  // Not replicated:
610
  //
611
  // v28: High part of H xor low part of H'
612
  // v29: H' (hash subkey)
613
  // v30: zero
614
  // v31: Reduction polynomial of the Galois field
615

616
  // Inner loop.
617
  // Do the whole load/add/multiply/reduce over all our data except
618
  // the last few rows.
619
  {
620
    Label L_ghash_loop;
621
    bind(L_ghash_loop);
622

623
    // Prefetching doesn't help here. In fact, on Neoverse N1 it's worse.
624
    // prfm(Address(data, 128), PLDL1KEEP);
625

626
    // Xor data into current state
627
    for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) {
628
      FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + ofs);
629
      FloatRegister v2_ofs = as_FloatRegister(v2->encoding() + ofs);
630
      rbit(v2_ofs, T16B, v2_ofs);
631
      eor(v2_ofs, T16B, v0_ofs, v2_ofs);   // bit-swapped data ^ bit-swapped state
632
    }
633

634
    // Generate fully-unrolled multiply-reduce in two stages.
635

636
    GHASHMultiplyGenerator(this, unrolls,
637
                           /*result_lo*/v5, /*result_hi*/v4, /*data*/v2,
638
                           Hprime, a1_xor_a0, p, vzr,
639
                           /*temps*/v1, v3, /* reuse b*/v2) .unroll();
640

641
    // NB: GHASHReduceGenerator also loads the next #unrolls blocks of
642
    // data into v0, v0+ofs, the current state.
643
    GHASHReduceGenerator (this, unrolls,
644
                          /*result*/v0, /*lo*/v5, /*hi*/v4, p, vzr,
645
                          /*data*/v2, /*temp*/v3) .unroll();
646

647
    sub(blocks, blocks, unrolls);
648
    cmp(blocks, (unsigned char)(unrolls * 2));
649
    br(GE, L_ghash_loop);
650
  }
651

652
  // Merge the #unrolls states.  Note that the data for the next
653
  // iteration has already been loaded into v4, v4+ofs, etc...
654

655
  // First, we multiply/reduce each clone by the appropriate power of H.
656
  for (int i = 0; i < unrolls; i++) {
657
    int ofs = register_stride * i;
658
    FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + ofs);
659
    FloatRegister v1_ofs = as_FloatRegister(v1->encoding() + ofs);
660
    FloatRegister v2_ofs = as_FloatRegister(v2->encoding() + ofs);
661
    FloatRegister v3_ofs = as_FloatRegister(v3->encoding() + ofs);
662
    FloatRegister v4_ofs = as_FloatRegister(v4->encoding() + ofs);
663
    FloatRegister v5_ofs = as_FloatRegister(v5->encoding() + ofs);
664

665
    ldrq(Hprime, Address(subkeyH, 16 * (unrolls - i - 1)));
666

667
    rbit(v2_ofs, T16B, v2_ofs);
668
    eor(v2_ofs, T16B, v0_ofs, v2_ofs);   // bit-swapped data ^ bit-swapped state
669

670
    rev64(Hprime, T16B, Hprime);
671
    rbit(Hprime, T16B, Hprime);
672
    ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
673
    eor(a1_xor_a0, T16B, a1_xor_a0, Hprime);    // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
674
    ghash_modmul(/*result*/v0_ofs, /*result_lo*/v5_ofs, /*result_hi*/v4_ofs, /*b*/v2_ofs,
675
                 Hprime, vzr, a1_xor_a0, p,
676
                 /*temps*/v1_ofs, v3_ofs, /* reuse b*/v2_ofs);
677
  }
678

679
  // Then we sum the results.
680
  for (int i = 1; i < unrolls; i++) {
681
    FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + register_stride * i);
682
    eor(v0, T16B, v0, v0_ofs);
683
  }
684

685
  sub(blocks, blocks, (unsigned char)unrolls);
686

687
  // And finally bit-reverse the state back to big endian.
688
  rev64(v0, T16B, v0);
689
  rbit(v0, T16B, v0);
690
  st1(v0, T16B, state);
691
}
692

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.