#include "crc32_constants.h"
#include "crc32_braid_tbl.h"

#if defined (__clang__)
#include "fallback_builtins.h"
#endif

#define VMX_ALIGN_MASK (VMX_ALIGN-1)
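/*
 * Byte-at-a-time fallback using the braid lookup table. It handles the
 * bytes before the pointer reaches VMX (16-byte) alignment and the short
 * tail that the vector code does not cover.
 */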
static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) {
    while (len--)
        crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8);
    return crc;
}

static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);
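/*
 * Public entry point. The scalar helper above deals with the unaligned
 * head and the short tail; everything in between, rounded down to a
 * multiple of 16 bytes, goes through the vpmsum kernel below.
 */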
Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) {
    unsigned int prealign;
    unsigned int tail;

    unsigned long len = (unsigned long) _len;

    if (p == (const unsigned char *) 0x0)
        return 0;

    crc ^= 0xffffffff;

    if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
        crc = crc32_align(crc, p, len);
        goto out;
    }

    if ((unsigned long)p & VMX_ALIGN_MASK) {
        prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK);
        crc = crc32_align(crc, p, prealign);
        len -= prealign;
        p += prealign;
    }

    crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK);

    tail = len & VMX_ALIGN_MASK;
    if (tail) {
        p += len & ~VMX_ALIGN_MASK;
        crc = crc32_align(crc, p, tail);
    }

out:
    crc ^= 0xffffffff;

    return crc;
}
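/*
 * "ori 2,2,0" is the POWER no-op form that ends an instruction dispatch
 * group; the unrolled loop below appears to use it to keep the multiply,
 * xor and load streams in separate dispatch groups.
 */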
#define GROUP_ENDING_NOP __asm__("ori 2,2,0" ::: "memory")

#if BYTE_ORDER == BIG_ENDIAN

#define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb, (__vector unsigned char) vc)
#if BYTE_ORDER == LITTLE_ENDIAN
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL };
#else
static const __vector unsigned long long vperm_const ALIGNED_(16) = { 0x0F0E0D0C0B0A0908UL, 0x0706050403020100UL };
#endif

#define VEC_PERM(vr, va, vb, vc)
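/*
 * Core kernel: the buffer is processed as eight independent 16-byte lanes
 * per 128-byte block. Each lane is carried forward with vpmsumd (64-bit
 * carryless multiply) against precomputed fold constants, so eight partial
 * CRCs are computed in parallel and merged at the end.
 */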
static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) {
    const __vector unsigned long long vzero = {0,0};
    const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL};

    const __vector unsigned long long vmask_32bit =
        (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4);

    const __vector unsigned long long vmask_64bit =
        (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8);

    __vector unsigned long long vcrc;
    __vector unsigned long long vconst1, vconst2;

    __vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7;

    __vector unsigned long long v0 = {0,0};
    __vector unsigned long long v1 = {0,0};
    __vector unsigned long long v2 = {0,0};
    __vector unsigned long long v3 = {0,0};
    __vector unsigned long long v4 = {0,0};
    __vector unsigned long long v5 = {0,0};
    __vector unsigned long long v6 = {0,0};
    __vector unsigned long long v7 = {0,0};

    __vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7;

    /* Loop counter, constant-table offset and do/while continuation flag. */
    unsigned long i;
    unsigned int offset;
    int next_block;

    unsigned long chunks;
    unsigned long block_size;

    /* Length rounded down to a whole number of 128-byte blocks. */
    unsigned long length = len & 0xFFFFFFFFFFFFFF80UL;

    /* Move the initial CRC into the low half of a vector register. */
    vcrc = (__vector unsigned long long)__builtin_pack_vector_int128(0UL, crc);
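    /*
     * Short-input path (taken in the full source only for small lengths):
     * fold one 16-byte chunk at a time against the per-offset entries of
     * vcrc_short_const and accumulate the products into v0.
     */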
    vconst1 = vec_ld(offset, vcrc_short_const);
    vdata0 = vec_ld(0, (__vector unsigned long long*) p);
    VEC_PERM(vdata0, vdata0, vconst1, vperm_const);

    /* xor in the initial CRC value */
    vdata0 = vec_xor(vdata0, vcrc);

    vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
    v0 = vec_xor(v0, vdata0);

    for (i = 16; i < len; i += 16) {
        vconst1 = vec_ld(offset + i, vcrc_short_const);
        vdata0 = vec_ld(i, (__vector unsigned long long*) p);
        VEC_PERM(vdata0, vdata0, vconst1, vperm_const);
        vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw(
                (__vector unsigned int)vdata0, (__vector unsigned int)vconst1);
        v0 = vec_xor(v0, vdata0);
    }
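    /*
     * Long-input path: load a full 128-byte block into vdata0-vdata7, fold
     * the running CRC into the first lane, and enter the unrolled loop.
     */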
    vdata0 = vec_ld(0, (__vector unsigned long long*) p);
    vdata1 = vec_ld(16, (__vector unsigned long long*) p);
    VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
    VEC_PERM(vdata1, vdata1, vdata1, vperm_const);

    vdata2 = vec_ld(32, (__vector unsigned long long*) p);
    vdata3 = vec_ld(48, (__vector unsigned long long*) p);
    VEC_PERM(vdata2, vdata2, vdata2, vperm_const);
    VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

    vdata4 = vec_ld(64, (__vector unsigned long long*) p);
    vdata5 = vec_ld(80, (__vector unsigned long long*) p);
    VEC_PERM(vdata4, vdata4, vdata4, vperm_const);
    VEC_PERM(vdata5, vdata5, vdata5, vperm_const);

    vdata6 = vec_ld(96, (__vector unsigned long long*) p);
    vdata7 = vec_ld(112, (__vector unsigned long long*) p);
    VEC_PERM(vdata6, vdata6, vdata6, vperm_const);
    VEC_PERM(vdata7, vdata7, vdata7, vperm_const);

    vdata0 = vec_xor(vdata0, vcrc);
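    /*
     * The fold-constant table only covers MAX_SIZE bytes, so the input is
     * consumed in passes of at most MAX_SIZE bytes each; offset selects
     * where in vcrc_const this pass starts.
     */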
    if (block_size > MAX_SIZE) {
        block_size = MAX_SIZE;
    }

    length = length - block_size;

    offset = (MAX_SIZE/8) - (block_size/8);

    chunks = (block_size/128)-1;

    vconst1 = vec_ld(offset, vcrc_const);
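    /*
     * Prime the pipeline: issue the eight multiplies for the block just
     * loaded; their products are folded into v0-v7 one round later.
     */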
    va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
            (__vector unsigned long long)vconst1);
    va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
            (__vector unsigned long long)vconst1);
    va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
            (__vector unsigned long long)vconst1);
    va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
            (__vector unsigned long long)vconst1);
    va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
            (__vector unsigned long long)vconst1);
    va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
            (__vector unsigned long long)vconst1);
    va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
            (__vector unsigned long long)vconst1);
    va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
            (__vector unsigned long long)vconst1);

    vconst2 = vec_ld(offset, vcrc_const);

    vdata0 = vec_ld(0, (__vector unsigned long long*) p);
    VEC_PERM(vdata0, vdata0, vdata0, vperm_const);

    vdata1 = vec_ld(16, (__vector unsigned long long*) p);
    VEC_PERM(vdata1, vdata1, vdata1, vperm_const);

    vdata2 = vec_ld(32, (__vector unsigned long long*) p);
    VEC_PERM(vdata2, vdata2, vdata2, vperm_const);

    vdata3 = vec_ld(48, (__vector unsigned long long*) p);
    VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

    vdata4 = vec_ld(64, (__vector unsigned long long*) p);
    VEC_PERM(vdata4, vdata4, vdata4, vperm_const);

    vdata5 = vec_ld(80, (__vector unsigned long long*) p);
    VEC_PERM(vdata5, vdata5, vdata5, vperm_const);

    vdata6 = vec_ld(96, (__vector unsigned long long*) p);
    VEC_PERM(vdata6, vdata6, vdata6, vperm_const);

    vdata7 = vec_ld(112, (__vector unsigned long long*) p);
    VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
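    /*
     * Main loop, software-pipelined: fold the previous round's products
     * into the accumulators, issue the next multiplies (alternating
     * between vconst1 and vconst2), and load the following block.
     */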
    for (i = 0; i < chunks-2; i++) {
        vconst1 = vec_ld(offset, vcrc_const);

        v0 = vec_xor(v0, va0);
        va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
                (__vector unsigned long long)vconst2);
        vdata0 = vec_ld(0, (__vector unsigned long long*) p);
        VEC_PERM(vdata0, vdata0, vdata0, vperm_const);

        v1 = vec_xor(v1, va1);
        va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
                (__vector unsigned long long)vconst2);
        vdata1 = vec_ld(16, (__vector unsigned long long*) p);
        VEC_PERM(vdata1, vdata1, vdata1, vperm_const);

        v2 = vec_xor(v2, va2);
        va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
                (__vector unsigned long long)vconst2);
        vdata2 = vec_ld(32, (__vector unsigned long long*) p);
        VEC_PERM(vdata2, vdata2, vdata2, vperm_const);

        v3 = vec_xor(v3, va3);
        va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
                (__vector unsigned long long)vconst2);
        vdata3 = vec_ld(48, (__vector unsigned long long*) p);
        VEC_PERM(vdata3, vdata3, vdata3, vperm_const);

        vconst2 = vec_ld(offset, vcrc_const);

        v4 = vec_xor(v4, va4);
        va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
                (__vector unsigned long long)vconst1);
        vdata4 = vec_ld(64, (__vector unsigned long long*) p);
        VEC_PERM(vdata4, vdata4, vdata4, vperm_const);

        v5 = vec_xor(v5, va5);
        va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
                (__vector unsigned long long)vconst1);
        vdata5 = vec_ld(80, (__vector unsigned long long*) p);
        VEC_PERM(vdata5, vdata5, vdata5, vperm_const);

        v6 = vec_xor(v6, va6);
        va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
                (__vector unsigned long long)vconst1);
        vdata6 = vec_ld(96, (__vector unsigned long long*) p);
        VEC_PERM(vdata6, vdata6, vdata6, vperm_const);

        v7 = vec_xor(v7, va7);
        va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
                (__vector unsigned long long)vconst1);
        vdata7 = vec_ld(112, (__vector unsigned long long*) p);
        VEC_PERM(vdata7, vdata7, vdata7, vperm_const);
    }
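    /*
     * Wind-down round: no further data to load, just fold the remaining
     * outstanding products into the accumulators.
     */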
    vconst1 = vec_ld(offset, vcrc_const);

    v0 = vec_xor(v0, va0);
    va0 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata0,
            (__vector unsigned long long)vconst1);
    v1 = vec_xor(v1, va1);
    va1 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata1,
            (__vector unsigned long long)vconst1);
    v2 = vec_xor(v2, va2);
    va2 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata2,
            (__vector unsigned long long)vconst1);
    v3 = vec_xor(v3, va3);
    va3 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata3,
            (__vector unsigned long long)vconst1);
    v4 = vec_xor(v4, va4);
    va4 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata4,
            (__vector unsigned long long)vconst1);
    v5 = vec_xor(v5, va5);
    va5 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata5,
            (__vector unsigned long long)vconst1);
    v6 = vec_xor(v6, va6);
    va6 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata6,
            (__vector unsigned long long)vconst1);
    v7 = vec_xor(v7, va7);
    va7 = __builtin_crypto_vpmsumd((__vector unsigned long long)vdata7,
            (__vector unsigned long long)vconst1);

    v0 = vec_xor(v0, va0);
    v1 = vec_xor(v1, va1);
    v2 = vec_xor(v2, va2);
    v3 = vec_xor(v3, va3);
    v4 = vec_xor(v4, va4);
    v5 = vec_xor(v5, va5);
    v6 = vec_xor(v6, va6);
    v7 = vec_xor(v7, va7);
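    /*
     * vpmsumd leaves a 96-bit product in the low bits of each register;
     * because this CRC is bit-reflected, shift each accumulator left by
     * 32 bits (4 bytes) so it lines up before the next block is xored in.
     */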
    v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
            (__vector unsigned char)vzero, 4);
    v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1,
            (__vector unsigned char)vzero, 4);
    v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2,
            (__vector unsigned char)vzero, 4);
    v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3,
            (__vector unsigned char)vzero, 4);
    v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4,
            (__vector unsigned char)vzero, 4);
    v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5,
            (__vector unsigned char)vzero, 4);
    v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6,
            (__vector unsigned char)vzero, 4);
    v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7,
            (__vector unsigned char)vzero, 4);
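    /*
     * Fetch the next 128-byte block, xor it into the shifted accumulators
     * to form the next round's inputs, then clear v0-v7 for that round.
     */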
    va0 = vec_ld(0, (__vector unsigned long long*) p);
    VEC_PERM(va0, va0, va0, vperm_const);

    va1 = vec_ld(16, (__vector unsigned long long*) p);
    VEC_PERM(va1, va1, va1, vperm_const);

    va2 = vec_ld(32, (__vector unsigned long long*) p);
    VEC_PERM(va2, va2, va2, vperm_const);

    va3 = vec_ld(48, (__vector unsigned long long*) p);
    VEC_PERM(va3, va3, va3, vperm_const);

    va4 = vec_ld(64, (__vector unsigned long long*) p);
    VEC_PERM(va4, va4, va4, vperm_const);

    va5 = vec_ld(80, (__vector unsigned long long*) p);
    VEC_PERM(va5, va5, va5, vperm_const);

    va6 = vec_ld(96, (__vector unsigned long long*) p);
    VEC_PERM(va6, va6, va6, vperm_const);

    va7 = vec_ld(112, (__vector unsigned long long*) p);
    VEC_PERM(va7, va7, va7, vperm_const);

    vdata0 = vec_xor(v0, va0);
    vdata1 = vec_xor(v1, va1);
    vdata2 = vec_xor(v2, va2);
    vdata3 = vec_xor(v3, va3);
    vdata4 = vec_xor(v4, va4);
    vdata5 = vec_xor(v5, va5);
    vdata6 = vec_xor(v6, va6);
    vdata7 = vec_xor(v7, va7);

    v0 = vec_xor(v0, v0);
    v1 = vec_xor(v1, v1);
    v2 = vec_xor(v2, v2);
    v3 = vec_xor(v3, v3);
    v4 = vec_xor(v4, v4);
    v5 = vec_xor(v5, v5);
    v6 = vec_xor(v6, v6);
    v7 = vec_xor(v7, v7);

    length = length + 128;

    } while (next_block);
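    /*
     * At most 127 bytes remain. Fold each accumulator lane against the
     * matching entry of vcrc_short_const, fold in any remaining 16-byte
     * chunks, and then reduce the eight lanes to a single value.
     */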
    length = (len & 127);

    offset = 128 - length;

    v0 = vec_ld(offset, vcrc_short_const);
    v1 = vec_ld(offset + 16, vcrc_short_const);
    v2 = vec_ld(offset + 32, vcrc_short_const);
    v3 = vec_ld(offset + 48, vcrc_short_const);
    v4 = vec_ld(offset + 64, vcrc_short_const);
    v5 = vec_ld(offset + 80, vcrc_short_const);
    v6 = vec_ld(offset + 96, vcrc_short_const);
    v7 = vec_ld(offset + 112, vcrc_short_const);

    v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata0, (__vector unsigned int)v0);
    v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata1, (__vector unsigned int)v1);
    v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata2, (__vector unsigned int)v2);
    v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata3, (__vector unsigned int)v3);
    v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata4, (__vector unsigned int)v4);
    v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata5, (__vector unsigned int)v5);
    v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata6, (__vector unsigned int)v6);
    v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
            (__vector unsigned int)vdata7, (__vector unsigned int)v7);

    for (i = 0; i < length; i += 16) {
        vdata0 = vec_ld(i, (__vector unsigned long long*)p);
        VEC_PERM(vdata0, vdata0, vdata0, vperm_const);
        va0 = vec_ld(offset + i, vcrc_short_const);
        va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw(
                (__vector unsigned int)vdata0, (__vector unsigned int)va0);
        v0 = vec_xor(v0, va0);
    }
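    /* Combine the eight partial results into v0. */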
    v0 = vec_xor(v0, v1);
    v2 = vec_xor(v2, v3);
    v4 = vec_xor(v4, v5);
    v6 = vec_xor(v6, v7);

    v0 = vec_xor(v0, v2);
    v4 = vec_xor(v4, v6);

    v0 = vec_xor(v0, v4);
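    /*
     * Final reduction to 32 bits using Barrett's method; v_Barrett_const
     * holds the two precomputed constants used below.
     */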
    vconst1 = vec_ld(0, v_Barrett_const);
    vconst2 = vec_ld(16, v_Barrett_const);

    v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
            (__vector unsigned char)v0, 8);

    /* shift left one bit */
    __vector unsigned char vsht_splat = vec_splat_u8 (1);
    v0 = (__vector unsigned long long)vec_sll((__vector unsigned char)v0, vsht_splat);

    v0 = vec_and(v0, vmask_64bit);
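    /*
     * Reflected Barrett reduction: multiply the masked low word by what is
     * presumably the precomputed reciprocal, mask again, multiply by the
     * polynomial, and xor the result back into v0; the final CRC ends up
     * in one 32-bit word of v0.
     */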
    v1 = vec_and(v0, vmask_32bit);

    v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
            (__vector unsigned long long)vconst1);

    v1 = vec_and(v1, vmask_32bit);

    v1 = __builtin_crypto_vpmsumd((__vector unsigned long long)v1,
            (__vector unsigned long long)vconst2);

    v0 = vec_xor(v0, v1);

    v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0,
            (__vector unsigned char)vzero, 4);

#if BYTE_ORDER == BIG_ENDIAN