26
#include "precompiled.hpp"
28
#include "asm/assembler.hpp"
29
#include "asm/assembler.inline.hpp"
30
#include "macroAssembler_aarch64.hpp"
31
#include "memory/resourceArea.hpp"
32
#include "runtime/stubRoutines.hpp"
34
// Emits the instruction sequence for an AES block decryption.
// The steps visible here stream key material from `key` using
// post-incrementing ld1 loads, conditionally branch to L_doLast twice,
// XOR v0 with v5, and finally adjust `key` by keylen * sizeof(jint).
// NOTE(review): the flag-setting compares, the per-round instructions and the
// L_doLast declaration/binding are not visible in this excerpt -- confirm
// against the full file before relying on this summary.
void MacroAssembler::aesecb_decrypt(Register from, Register to, Register key, Register keylen) {
39
// First 16 bytes at `key` go into v5; `key` advances by 16.
ld1(v5, T16B, post(key, 16));
42
// Next 64 bytes into v1..v4; `key` advances by 64.
ld1(v1, v2, v3, v4, T16B, post(key, 64));
56
// v1..v4 are reloaded with the following 64 bytes.
ld1(v1, v2, v3, v4, T16B, post(key, 64));
70
// From here on key material is fetched 32 bytes (two registers) at a time.
ld1(v1, v2, T16B, post(key, 32));
75
// Branch to L_doLast when the flags indicate EQ (flag-setting instruction
// not visible here).
br(Assembler::EQ, L_doLast);
82
ld1(v1, v2, T16B, post(key, 32));
87
// Second conditional branch to L_doLast on EQ.
br(Assembler::EQ, L_doLast);
94
ld1(v1, v2, T16B, post(key, 32));
104
// v0 ^= v5, where v5 holds the 16 bytes loaded first.
eor(v0, T16B, v0, v5);
109
// Subtract keylen << log2(sizeof(jint)) from `key`; presumably this rewinds
// `key` past the bytes consumed by the post-incrementing loads -- confirm.
sub(key, key, keylen, LSL, exact_log2(sizeof (jint)));
113
// Loads AES key material from `key` into v17..v31 and applies rev32 to every
// loaded register. Each ld1 post-increments `key` by the number of bytes it
// loads; the final sub adjusts `key` by keylen * sizeof(jint).
// Two conditional branches at the top can divert to L_loadkeys_44 or
// L_loadkeys_52 instead of falling through.
// NOTE(review): the flag-setting compare and the bind sites of both labels are
// not visible in this excerpt; the label names suggest entry points for
// shorter key schedules -- confirm against the full file.
void MacroAssembler::aesenc_loadkeys(Register key, Register keylen) {
114
Label L_loadkeys_44, L_loadkeys_52;
116
// LO -> L_loadkeys_44, EQ -> L_loadkeys_52, otherwise fall through.
br(Assembler::LO, L_loadkeys_44);
117
br(Assembler::EQ, L_loadkeys_52);
119
// 32 bytes -> v17, v18.
ld1(v17, v18, T16B, post(key, 32));
120
rev32(v17, T16B, v17);
121
rev32(v18, T16B, v18);
123
// 32 bytes -> v19, v20.
ld1(v19, v20, T16B, post(key, 32));
124
rev32(v19, T16B, v19);
125
rev32(v20, T16B, v20);
127
// 64 bytes -> v21..v24.
ld1(v21, v22, v23, v24, T16B, post(key, 64));
128
rev32(v21, T16B, v21);
129
rev32(v22, T16B, v22);
130
rev32(v23, T16B, v23);
131
rev32(v24, T16B, v24);
132
// 64 bytes -> v25..v28.
ld1(v25, v26, v27, v28, T16B, post(key, 64));
133
rev32(v25, T16B, v25);
134
rev32(v26, T16B, v26);
135
rev32(v27, T16B, v27);
136
rev32(v28, T16B, v28);
137
// 48 bytes -> v29..v31.
ld1(v29, v30, v31, T16B, post(key, 48));
138
rev32(v29, T16B, v29);
139
rev32(v30, T16B, v30);
140
rev32(v31, T16B, v31);
143
// Subtract keylen << log2(sizeof(jint)) from `key`; presumably undoes the
// post-increments above so `key` is unchanged on exit -- confirm.
sub(key, key, keylen, LSL, exact_log2(sizeof (jint)));
150
// Emits one AES encryption round on `input`, in place: AESE with `subkey`
// followed by AESMC on the result.
void MacroAssembler::aes_round(FloatRegister input, FloatRegister subkey) {
151
aese(input, subkey); aesmc(input, input);
172
// Abstract base for code generators whose output is produced as a fixed number
// of indexed steps. KernelGenerator::unroll() uses generate(), length() and
// next() to interleave the steps of several generator instances.
// NOTE(review): the declarations of _unrolls and unroll() are not visible in
// this excerpt.
class KernelGenerator: public MacroAssembler {
176
// Builds the MacroAssembler base from as->code() and records the unroll count.
KernelGenerator(Assembler *as, int unrolls)
177
: MacroAssembler(as->code()), _unrolls(unrolls) { }
178
// Emit step number `index` of this kernel.
virtual void generate(int index) = 0;
179
// Number of steps; unroll() calls generate() with indexes 0 .. length()-1.
virtual int length() = 0;
180
// Return the generator to use for the next unrolled copy.
virtual KernelGenerator *next() = 0;
181
// Number of copies that unroll() interleaves.
int unrolls() { return _unrolls; }
185
// Emits unrolls() interleaved copies of this kernel: step j of every copy is
// generated before step j+1 of any copy.
void KernelGenerator::unroll() {
187
// Resource-allocated table holding one generator per unrolled copy.
KernelGenerator **generators
188
= NEW_RESOURCE_ARRAY(KernelGenerator *, unrolls());
190
// Copy 0 is this object; every further copy is derived from its predecessor
// via next().
generators[0] = this;
191
for (int i = 1; i < unrolls(); i++) {
192
generators[i] = generators[i-1]->next();
195
// Outer loop walks the steps, inner loop walks the copies. length() is taken
// from this generator only.
for (int j = 0; j < length(); j++) {
196
for (int i = 0; i < unrolls(); i++) {
197
generators[i]->generate(j);
203
// KernelGenerator that emits AES encryption of the block held in _data as a
// sequence of indexed steps. Round keys are taken from consecutive vector
// registers starting at _subkeys (offsets +0 .. +14 are used below).
// NOTE(review): the switch header, several case labels and the declarations of
// _from, _to, _data and _once are not visible in this excerpt.
class AESKernelGenerator: public KernelGenerator {
205
const Register _keylen;
207
// First register of the round-key block; step k uses _subkeys + offset.
const FloatRegister _subkeys;
209
// Branch targets used by generate(); bound only when _once is true.
Label _rounds_44, _rounds_52;
212
// `once` defaults to true; next() passes false for the derived copies.
AESKernelGenerator(Assembler *as, int unrolls,
213
Register from, Register to, Register keylen, FloatRegister data,
214
FloatRegister subkeys, bool once = true)
215
: KernelGenerator(as, unrolls),
216
_from(from), _to(to), _keylen(keylen), _data(data),
217
_subkeys(subkeys), _once(once) {
220
// Emits step `index` of the AES kernel.
virtual void generate(int index) {
223
// The input block is loaded from memory only when a source register was given.
if (_from != noreg) {
224
ld1(_data, T16B, _from);
230
// LO / EQ branch to _rounds_44 / _rounds_52 (the flag-setting instruction is
// not visible here). The labels are bound further down, so these branches
// skip some of the rounds in between.
br(Assembler::LO, _rounds_44);
231
br(Assembler::EQ, _rounds_52);
234
case 2: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 0)); break;
235
case 3: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 1)); break;
237
// Only the generator created with once == true binds the label.
if (_once) bind(_rounds_52);
239
case 5: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 2)); break;
240
case 6: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 3)); break;
242
if (_once) bind(_rounds_44);
244
case 8: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 4)); break;
245
case 9: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 5)); break;
246
case 10: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 6)); break;
247
case 11: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 7)); break;
248
case 12: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 8)); break;
249
case 13: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 9)); break;
250
case 14: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 10)); break;
251
case 15: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 11)); break;
252
case 16: aes_round(_data, as_FloatRegister(_subkeys->encoding() + 12)); break;
253
// Last round: AESE only (no AESMC), then XOR with the final round key.
case 17: aese(_data, as_FloatRegister(_subkeys->encoding() + 13)); break;
254
case 18: eor(_data, T16B, _data, as_FloatRegister(_subkeys->encoding() + 14)); break;
257
// Store the block in _data to the address in _to (any enclosing condition is
// not visible here).
st1(_data, T16B, _to);
260
default: ShouldNotReachHere();
264
// Generator for the next unrolled copy: works on the register following
// _data, shares the same round-key registers, and is created with
// once == false so it does not bind the labels.
virtual KernelGenerator *next() {
265
return new AESKernelGenerator(this, _unrolls,
267
_data->successor(), _subkeys, false);
270
// generate() is called with indexes 0 .. 19.
virtual int length() { return 20; }
280
// Emits `unrolls` interleaved AES encryptions by building an
// AESKernelGenerator and calling unroll() on it. The first block lives in
// `data`; round keys are read from the register block starting at v17, the
// first register that aesenc_loadkeys fills.
void MacroAssembler::aesecb_encrypt(Register from, Register to, Register keylen,
281
FloatRegister data, int unrolls) {
282
AESKernelGenerator(this, unrolls, from, to, keylen, data, v17) .unroll();
287
// Emits a carry-less multiplication of `a` by `b` built from three
// PMULL/PMULL2 instructions plus XORs (Karatsuba-style); the product is left
// in result_lo and result_hi.
//   a1_xor_a0      - multiplied with (b rotated by 8 bytes) ^ b below;
//                    presumably the XOR of the two 64-bit halves of `a`,
//                    precomputed by the caller -- confirm.
//   tmp1,tmp2,tmp3 - scratch registers, overwritten.
void MacroAssembler::ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
288
FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
289
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3) {
306
// tmp1 = b rotated by 8 bytes, i.e. its 64-bit halves swapped.
ext(tmp1, T16B, b, b, 0x08);
307
// result_hi = product of the upper 64-bit halves of b and a.
pmull2(result_hi, T1Q, b, a, T2D);
308
// tmp1 ^= b, so each half of tmp1 holds (upper half of b) ^ (lower half of b).
eor(tmp1, T16B, tmp1, b);
309
// result_lo = product of the lower 64-bit halves of b and a.
pmull(result_lo, T1Q, b, a, T1D);
310
// tmp2 = product of the lower halves of tmp1 and a1_xor_a0 (the middle term).
pmull(tmp2, T1Q, tmp1, a1_xor_a0, T1D);
312
// Fold both outer products into the middle term.
ext(tmp1, T16B, result_lo, result_hi, 0x08);
313
eor(tmp3, T16B, result_hi, result_lo);
314
eor(tmp2, T16B, tmp2, tmp1);
315
eor(tmp2, T16B, tmp2, tmp3);
318
// Copy the two 64-bit lanes of tmp2 into result_hi and result_lo, one lane each.
ins(result_hi, D, tmp2, 0, 1);
319
ins(result_lo, D, tmp2, 1, 0);
322
// Emits the reduction of the double-width value hi:lo into `result`, using two
// carry-less multiplications by `p`.
//   lo, hi - input halves; both are overwritten.
//   p      - reduction constant (the caller in this file loads it from the
//            field-polynomial address).
//   vzr    - used as the second operand of the two ext instructions; the
//            caller in this file zeroes it first.
//   t1     - scratch register, overwritten.
void MacroAssembler::ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
323
FloatRegister p, FloatRegister vzr, FloatRegister t1) {
324
// t0 is an alias of `result`; no extra register is used.
const FloatRegister t0 = result;
341
// t0 = product of the upper 64-bit halves of hi and p.
pmull2(t0, T1Q, hi, p, T2D);
342
// Split t0 across the hi/lo boundary: one 8-byte shifted copy is XORed into
// hi, the other into lo.
ext(t1, T16B, t0, vzr, 8);
343
eor(hi, T16B, hi, t1);
344
ext(t1, T16B, vzr, t0, 8);
345
eor(lo, T16B, lo, t1);
346
// t0 = product of the lower 64-bit halves of the updated hi and p.
pmull(t0, T1Q, hi, p, T1D);
347
// result = lo ^ t0.
eor(result, T16B, lo, t0);
350
// KernelGenerator form of MacroAssembler::ghash_multiply: the same instruction
// sequence, split into 11 indexed steps so that unroll() can interleave
// several multiplications.
// NOTE(review): some member declarations, one constructor parameter line and
// the switch header are not visible in this excerpt.
class GHASHMultiplyGenerator: public KernelGenerator {
351
FloatRegister _result_lo, _result_hi, _b,
352
_a, _vzr, _a1_xor_a0, _p,
356
GHASHMultiplyGenerator(Assembler *as, int unrolls,
358
FloatRegister result_lo, FloatRegister result_hi,
361
FloatRegister a, FloatRegister a1_xor_a0, FloatRegister p, FloatRegister vzr,
363
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3)
364
: KernelGenerator(as, unrolls),
365
_result_lo(result_lo), _result_hi(result_hi), _b(b),
366
_a(a), _vzr(vzr), _a1_xor_a0(a1_xor_a0), _p(p),
367
_tmp1(tmp1), _tmp2(tmp2), _tmp3(tmp3) { }
369
// Register-number distance between the per-copy registers of consecutive
// unrolled copies (applied in next()).
int register_stride = 7;
371
// Emits step `index` (0 .. 10) of the multiplication.
// NOTE(review): cases 1-4 and 6 show no `break` on their own line in this
// excerpt; confirm that each is followed by one on the next line.
virtual void generate(int index) {
389
// _tmp1 = _b with its 64-bit halves swapped.
case 0: ext(_tmp1, T16B, _b, _b, 0x08); break;
390
// Product of the upper halves of _b and _a.
case 1: pmull2(_result_hi, T1Q, _b, _a, T2D);
392
case 2: eor(_tmp1, T16B, _tmp1, _b);
394
// Product of the lower halves of _b and _a.
case 3: pmull(_result_lo, T1Q, _b, _a, T1D);
396
// Middle term: lower halves of _tmp1 and _a1_xor_a0.
case 4: pmull(_tmp2, T1Q, _tmp1, _a1_xor_a0, T1D);
399
// Steps 5-8 fold both outer products into the middle term.
case 5: ext(_tmp1, T16B, _result_lo, _result_hi, 0x08); break;
400
case 6: eor(_tmp3, T16B, _result_hi, _result_lo);
402
case 7: eor(_tmp2, T16B, _tmp2, _tmp1); break;
403
case 8: eor(_tmp2, T16B, _tmp2, _tmp3); break;
406
// Steps 9-10 copy the two 64-bit lanes of _tmp2 into the result registers.
case 9: ins(_result_hi, D, _tmp2, 0, 1); break;
407
case 10: ins(_result_lo, D, _tmp2, 1, 0); break;
408
default: ShouldNotReachHere();
412
// Generator for the next unrolled copy: a copy of this one whose result,
// _b and temporary registers are moved up by register_stride. No visible line
// changes _a, _a1_xor_a0, _p or _vzr, so they appear to be shared by all
// copies.
virtual KernelGenerator* next() {
413
GHASHMultiplyGenerator* result = new GHASHMultiplyGenerator(*this);
414
result->_result_lo = as_FloatRegister(result->_result_lo->encoding() + register_stride);
415
result->_result_hi = as_FloatRegister(result->_result_hi->encoding() + register_stride);
416
result->_b = as_FloatRegister(result->_b ->encoding() + register_stride);
417
result->_tmp1 = as_FloatRegister(result->_tmp1 ->encoding() + register_stride);
418
result->_tmp2 = as_FloatRegister(result->_tmp2 ->encoding() + register_stride);
419
result->_tmp3 = as_FloatRegister(result->_tmp3 ->encoding() + register_stride);
423
// generate() is called with indexes 0 .. 10.
virtual int length() { return 11; }
430
// KernelGenerator form of MacroAssembler::ghash_reduce: the same instruction
// sequence, split into 7 indexed steps. The first generator (_once == true)
// can additionally interleave loads of further data blocks between the
// reduction steps.
// NOTE(review): the declaration of _once, one constructor parameter line and
// the switch header are not visible in this excerpt.
class GHASHReduceGenerator: public KernelGenerator {
431
FloatRegister _result, _lo, _hi, _p, _vzr, _data, _t1;
434
// _once is initialised to true here; next() clears it on the copies it makes.
GHASHReduceGenerator(Assembler *as, int unrolls,
436
FloatRegister result, FloatRegister lo, FloatRegister hi,
438
FloatRegister p, FloatRegister vzr, FloatRegister data,
441
: KernelGenerator(as, unrolls),
442
_result(result), _lo(lo), _hi(hi),
443
_p(p), _vzr(vzr), _data(data), _t1(t1), _once(true) { }
445
// Register-number distance between the per-copy registers of consecutive
// unrolled copies (applied in next() and to the load target below).
int register_stride = 7;
447
// Emits step `index` (0 .. 6) of the reduction.
virtual void generate(int index) {
448
// t0 is an alias of _result.
const FloatRegister t0 = _result;
466
// Upper halves of _hi and _p.
case 0: pmull2(t0, T1Q, _hi, _p, T2D); break;
467
// Steps 1-4 XOR 8-byte-shifted copies of t0 into _hi and _lo.
case 1: ext(_t1, T16B, t0, _vzr, 8); break;
468
case 2: eor(_hi, T16B, _hi, _t1); break;
469
case 3: ext(_t1, T16B, _vzr, t0, 8); break;
470
case 4: eor(_lo, T16B, _lo, _t1); break;
471
// Lower halves of the updated _hi and _p.
case 5: pmull(t0, T1Q, _hi, _p, T1D); break;
472
case 6: eor(_result, T16B, _lo, t0); break;
473
default: ShouldNotReachHere();
477
// Interleaved data loads: emitted only by the generator that still has
// _once set, and only when a valid _data register was supplied.
if (_data->is_valid() && _once) {
478
// One load is emitted per step index below unrolls(), so there must be at
// least as many steps as unrolled copies.
assert(length() >= unrolls(), "not enough room for inteleaved loads");
479
if (index < unrolls()) {
480
// Load 16 bytes into _data + index*register_stride and advance r2 by 16.
// NOTE(review): the pointer register is hard-coded as r2 rather than taken
// from a constructor argument -- confirm this matches the caller's data
// register.
ld1(as_FloatRegister(_data->encoding() + index*register_stride), T16B, post(r2, 0x10));
485
// Generator for the next unrolled copy: a copy of this one whose _result,
// _hi, _lo and _t1 registers are moved up by register_stride and whose
// _once flag is cleared, so it emits no interleaved loads.
virtual KernelGenerator *next() {
486
GHASHReduceGenerator *result = new GHASHReduceGenerator(*this);
487
result->_result = as_FloatRegister(result->_result->encoding() + register_stride);
488
result->_hi = as_FloatRegister(result->_hi ->encoding() + register_stride);
489
result->_lo = as_FloatRegister(result->_lo ->encoding() + register_stride);
490
result->_t1 = as_FloatRegister(result->_t1 ->encoding() + register_stride);
491
result->_once = false;
495
// generate() is called with indexes 0 .. 6.
int length() { return 7; }
499
// Emits a multiplication of `a` by `b` followed by a reduction, leaving the
// reduced product in `result`.
//   result_lo, result_hi - receive the unreduced product from ghash_multiply
//                          and are then consumed (and overwritten) by
//                          ghash_reduce.
//   a1_xor_a0            - passed through to ghash_multiply.
//   p, vzr               - passed through to ghash_reduce.
//   t1, t2, t3           - scratch registers; t1 is used by both stages.
void MacroAssembler::ghash_modmul(FloatRegister result,
500
FloatRegister result_lo, FloatRegister result_hi, FloatRegister b,
501
FloatRegister a, FloatRegister vzr, FloatRegister a1_xor_a0, FloatRegister p,
502
FloatRegister t1, FloatRegister t2, FloatRegister t3) {
503
ghash_multiply(result_lo, result_hi, a, b, a1_xor_a0, t1, t2, t3);
504
ghash_reduce(result, result_lo, result_hi, p, vzr, t1);
511
// Emits GHASH processing that handles `unrolls` data blocks per loop
// iteration. Each unrolled lane i uses the registers v0..v5 shifted up by
// i * register_stride; v28..v31 are shared by all lanes.
// Visible phases:
//   1. load the field polynomial, the state and the first subkeyH entry;
//   2. compute and store further subkeyH entries unless the last slot is
//      already non-zero;
//   3. main loop: combine state and data per lane, multiply, reduce;
//   4. tail: one modmul per lane against per-lane subkeyH entries, then XOR
//      all lanes into v0 and store it to `state`.
// NOTE(review): the parameter line that declares subkeyH, the L_ghash_loop
// label and several instructions are not visible in this excerpt -- confirm
// details against the full file.
void MacroAssembler::ghash_processBlocks_wide(address field_polynomial, Register state,
513
Register data, Register blocks, int unrolls) {
514
// Register-number distance between corresponding registers of adjacent lanes.
int register_stride = 7;
527
// All lanes must fit in the 32 vector registers.
assert(unrolls * register_stride < 32, "out of registers");
529
// Registers shared by every lane.
FloatRegister a1_xor_a0 = v28;
530
FloatRegister Hprime = v29;
531
FloatRegister vzr = v30;
532
FloatRegister p = v31;
533
// vzr = 0.
eor(vzr, T16B, vzr, vzr);
535
ldrq(p, field_polynomial);
537
// Lane 0 accumulator starts from the current state.
ldrq(v0, Address(state));
538
ldrq(Hprime, Address(subkeyH));
542
// rev64 followed by rbit: byte order within each 64-bit half is reversed,
// then the bits within each byte.
rev64(Hprime, T16B, Hprime);
543
rbit(Hprime, T16B, Hprime);
547
Label already_calculated, done;
552
// If the 16 bytes at subkeyH[unrolls - 1] are non-zero, skip the computation
// below.
ldp(rscratch1, rscratch2, Address(subkeyH, 16 * (unrolls - 1)));
553
orr(rscratch1, rscratch1, rscratch2);
554
cbnz(rscratch1, already_calculated);
556
// v6 = Hprime; every iteration below multiplies v6 by Hprime once more.
orr(v6, T16B, Hprime, Hprime);
557
for (int i = 1; i < unrolls; i++) {
558
// a1_xor_a0 = (Hprime with halves swapped) ^ Hprime.
ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08);
559
eor(a1_xor_a0, T16B, a1_xor_a0, Hprime);
560
ghash_modmul(v6, v5, v4, v6,
561
Hprime, vzr, a1_xor_a0, p,
565
// Store v1 into slot i of subkeyH (the instructions that set v1 are not
// visible here).
strq(v1, Address(subkeyH, 16 * i));
570
bind(already_calculated);
573
// Use the entry in the last slot as the multiplier for the main loop.
ldrq(v6, Address(subkeyH, 16 * (unrolls - 1)));
579
orr(Hprime, T16B, v6, v6);
583
// Zero the accumulator of every lane except lane 0, which holds the state.
for (int i = 1; i < unrolls; i++) {
584
int ofs = register_stride * i;
585
FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + ofs);
586
eor(v0_ofs, T16B, v0_ofs, v0_ofs);
589
// a1_xor_a0 for the main-loop multiplier.
ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08);
590
eor(a1_xor_a0, T16B, a1_xor_a0, Hprime);
593
// Load one 16-byte data block per lane into that lane's v2, advancing `data`.
for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) {
594
FloatRegister v2_ofs = as_FloatRegister(v2->encoding() + ofs);
595
ld1(v2_ofs, T16B, post(data, 0x10));
627
// Per lane: bit-reverse the bytes of the data block and XOR in the lane's
// accumulator; the result (in the lane's v2) is the multiplicand.
for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) {
628
FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + ofs);
629
FloatRegister v2_ofs = as_FloatRegister(v2->encoding() + ofs);
630
rbit(v2_ofs, T16B, v2_ofs);
631
eor(v2_ofs, T16B, v0_ofs, v2_ofs);
636
// Interleaved multiplications for all lanes.
GHASHMultiplyGenerator(this, unrolls,
638
Hprime, a1_xor_a0, p, vzr,
639
v1, v3, v2) .unroll();
643
// Interleaved reductions for all lanes.
GHASHReduceGenerator (this, unrolls,
647
// Loop again while at least 2 * unrolls blocks remain after this iteration.
sub(blocks, blocks, unrolls);
648
cmp(blocks, (unsigned char)(unrolls * 2));
649
br(GE, L_ghash_loop);
656
// Tail: one modmul per lane, lane i using the subkeyH entry in slot
// unrolls - i - 1.
for (int i = 0; i < unrolls; i++) {
657
int ofs = register_stride * i;
658
FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + ofs);
659
FloatRegister v1_ofs = as_FloatRegister(v1->encoding() + ofs);
660
FloatRegister v2_ofs = as_FloatRegister(v2->encoding() + ofs);
661
FloatRegister v3_ofs = as_FloatRegister(v3->encoding() + ofs);
662
FloatRegister v4_ofs = as_FloatRegister(v4->encoding() + ofs);
663
FloatRegister v5_ofs = as_FloatRegister(v5->encoding() + ofs);
665
ldrq(Hprime, Address(subkeyH, 16 * (unrolls - i - 1)));
667
rbit(v2_ofs, T16B, v2_ofs);
668
eor(v2_ofs, T16B, v0_ofs, v2_ofs);
670
// Same rev64 + rbit treatment as applied to the first subkeyH entry above.
rev64(Hprime, T16B, Hprime);
671
rbit(Hprime, T16B, Hprime);
672
ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08);
673
eor(a1_xor_a0, T16B, a1_xor_a0, Hprime);
674
// The lane's v2 serves both as the multiplicand `b` and as the third temporary.
ghash_modmul(v0_ofs, v5_ofs, v4_ofs, v2_ofs,
675
Hprime, vzr, a1_xor_a0, p,
676
v1_ofs, v3_ofs, v2_ofs);
680
// XOR the accumulators of lanes 1 .. unrolls-1 into v0.
for (int i = 1; i < unrolls; i++) {
681
FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + register_stride * i);
682
eor(v0, T16B, v0, v0_ofs);
685
sub(blocks, blocks, (unsigned char)unrolls);
690
// Write the combined result in v0 to `state`.
st1(v0, T16B, state);