14
/* Shorthand for vec_splat_u32(0); vmx_accum32 uses it to seed its `zero` vector. */
#define vmx_zero() (vec_splat_u32(0))
16
/*
 * Walks `len` bytes of `buf` one index at a time (see the loop below), with
 * `pair` as the caller-owned state array.
 *
 * adler32_vmx calls this for the bytes in front of the first 16-byte
 * boundary and again for the bytes left over after the vector passes.
 *
 * NOTE(review): presumably pair[0] / pair[1] hold the two running Adler-32
 * sums that the loop body updates -- confirm against the full body.
 */
static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
18
/* One iteration per input byte. */
for (i = 0; i < len; ++i) {
24
/*
 * Vector accumulation step.
 *
 *   s   - state words; loaded as one 16-byte vector at entry, and s[0] / s[1]
 *         are written back at the end.
 *   buf - input bytes, read with vec_ld (16 bytes per load).
 *   len - number of 16-byte vectors to consume (the caller passes n / 16).
 *
 * vec_ld ignores the low four address bits, so both `s` and `buf` are
 * expected to be 16-byte aligned; adler32_vmx declares `pair` ALIGNED_(16)
 * and handles the unaligned head of the buffer before calling in.
 */
static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
26
/* Per-byte weights 64..1, split over four vectors; t0..t3 pair with d0..d3
 * in the vec_msum calls below, so the first byte of a 64-byte group gets
 * weight 64 and the last gets weight 1. */
const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
27
const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
28
const vector unsigned char t2 = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17};
29
const vector unsigned char t3 = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
33
/* Permute control: keep bytes 0..3 of the source, fill every other byte
 * with source byte 8. */
const vector unsigned char s0_perm = {0, 1, 2, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
34
/* Every byte is 8 << 2 = 32. Used as the count operand of vec_sro / vec_slo,
 * which shift by whole octets; per the AltiVec definition this count
 * encodes a 4-byte shift. */
const vector unsigned char shift_vec = vec_sl(vec_splat_u8(8), vec_splat_u8(2));
35
/* adacc: running per-lane byte sums. s2acc: running per-lane weighted sums. */
vector unsigned int adacc, s2acc;
36
/* Load s[0..3] as one vector. */
vector unsigned int pair_vec = vec_ld(0, s);
37
/* Seed adacc with the first word of the state; other lanes take byte 8 of
 * the state (adler32_vmx zeroes the bytes starting at pair[2]). */
adacc = vec_perm(pair_vec, pair_vec, s0_perm);
38
/* Seed s2acc from the state shifted by shift_vec; the shift direction
 * depends on byte order. */
#if BYTE_ORDER == LITTLE_ENDIAN
39
s2acc = vec_sro(pair_vec, shift_vec);
41
/* NOTE(review): presumably the big-endian alternative of the vec_sro line
 * above -- confirm the #else / #endif pairing. */
s2acc = vec_slo(pair_vec, shift_vec);
44
vector unsigned int zero = vmx_zero();
45
/* s3acc / s3acc_0 collect the byte sums seen before each group; they are
 * scaled by the group size and folded into s2acc after the loops. */
vector unsigned int s3acc = zero;
46
vector unsigned int s3acc_0 = zero;
47
/* Snapshots of the byte-sum accumulators taken before a group is added. */
vector unsigned int adacc_prev = adacc;
48
vector unsigned int adacc_prev_0 = zero;
50
/* Extra weighted-sum accumulators, one per additional vector in a group. */
vector unsigned int s2acc_0 = zero;
51
vector unsigned int s2acc_1 = zero;
52
vector unsigned int s2acc_2 = zero;
56
/* Second byte-sum accumulator (fed by d1 and d3 below). */
vector unsigned int adacc_0 = zero;
58
/* Four 16-byte vectors (64 bytes) are handled per main-loop iteration. */
int num_iter = len / 4;
61
for (int i = 0; i < num_iter; ++i) {
62
/* Load 64 consecutive bytes as four vectors. */
vector unsigned char d0 = vec_ld(0, buf);
63
vector unsigned char d1 = vec_ld(16, buf);
64
vector unsigned char d2 = vec_ld(32, buf);
65
vector unsigned char d3 = vec_ld(48, buf);
69
/* vec_sum4s: add each lane's four bytes into the accumulator lane. */
adacc = vec_sum4s(d0, adacc);
70
/* Accumulate the previous byte sums once per group; scaled later. */
s3acc = vec_add(s3acc, adacc_prev);
71
s3acc_0 = vec_add(s3acc_0, adacc_prev_0);
72
/* vec_msum: multiply bytes by their weights, sum four products per lane,
 * and add to the accumulator lane. */
s2acc = vec_msum(t0, d0, s2acc);
75
adacc_0 = vec_sum4s(d1, adacc_0);
76
s2acc_0 = vec_msum(t1, d1, s2acc_0);
77
adacc = vec_sum4s(d2, adacc);
78
s2acc_1 = vec_msum(t2, d2, s2acc_1);
79
s2acc_2 = vec_msum(t3, d3, s2acc_2);
80
adacc_0 = vec_sum4s(d3, adacc_0);
83
/* Remember this group's adacc_0 for the next s3acc_0 update. */
adacc_prev_0 = adacc_0;
87
/* Merge the two byte-sum accumulators. */
adacc = vec_add(adacc, adacc_0);
88
/* Merge the two "previous sum" accumulators and scale by 64 (<< 6), the
 * number of bytes in one four-vector group. */
s3acc = vec_add(s3acc, s3acc_0);
89
s3acc = vec_sl(s3acc, vec_splat_u32(6));
92
/* Combine the snapshots and scale by 16 (<< 4), the number of bytes in one
 * vector, for the single-vector steps that follow. */
adacc_prev = vec_add(adacc_prev_0, adacc_prev);
93
adacc_prev = vec_sl(adacc_prev, vec_splat_u32(4));
95
/* Single-vector step: same scheme as above with weights 16..1 (t3).
 * NOTE(review): d0 is declared again here, so this must sit in a new scope,
 * presumably a loop over the vectors left after the groups of four --
 * confirm. */
vector unsigned char d0 = vec_ld(0, buf);
96
adacc = vec_sum4s(d0, adacc);
97
/* adacc_prev is already scaled by 16 at this point. */
s3acc = vec_add(s3acc, adacc_prev);
98
s2acc = vec_msum(t3, d0, s2acc);
99
adacc_prev = vec_sl(adacc, vec_splat_u32(4));
106
/* Fold all partial weighted sums, then the scaled previous sums, into s2acc. */
s2acc = vec_add(s2acc, s2acc_0);
107
s2acc_2 = vec_add(s2acc_1, s2acc_2);
108
s2acc = vec_add(s2acc, s2acc_2);
110
s2acc = vec_add(s2acc, s3acc);
112
/* Horizontal reduction: add the vector to itself rotated by 8 bytes, then
 * by 4 bytes, so every 32-bit lane ends up holding the sum of all four. */
adacc = vec_add(adacc, vec_sld(adacc, adacc, 8));
113
s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 8));
114
adacc = vec_add(adacc, vec_sld(adacc, adacc, 4));
115
s2acc = vec_add(s2acc, vec_sld(s2acc, s2acc, 4));
117
/* Write one 32-bit lane of each result back to s[0] and s[1]. */
vec_ste(adacc, 0, s);
118
vec_ste(s2acc, 0, s+1);
121
/*
 * VMX (AltiVec) Adler-32 entry point.
 *
 *   adler - running checksum; its upper 16 bits are split off into sum2.
 *   buf   - input bytes.
 *   len   - number of input bytes.
 *
 * Returns (pair[1] << 16) | pair[0] after the head, vector and tail passes.
 * Inputs of length 1 and of length below 16 are delegated to adler32_len_1
 * and adler32_len_16 respectively.
 */
Z_INTERNAL uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
123
/* State shared with the helpers; 16-byte aligned because vmx_accum32 loads
 * it with vec_ld. */
uint32_t pair[16] ALIGNED_(16);
124
/* Zeroes 14 BYTES starting at pair[2] (not 14 elements).
 * NOTE(review): possibly meant 14 * sizeof(uint32_t); vmx_accum32 only
 * loads the first 16 bytes of pair, which this does cover -- confirm
 * nothing else reads pair[6..15]. */
memset(&pair[2], 0, 14);
126
/* done: bytes consumed by full 16-byte vectors; i: byte offset into buf. */
unsigned int done = 0, i;
131
/* Upper 16 bits of the incoming checksum. */
sum2 = (adler >> 16) & 0xffff;
137
/* Single-byte input: handled by the scalar helper. */
if (UNLIKELY(len == 1))
138
return adler32_len_1(adler, buf, sum2);
141
/* Null-buffer check. */
if (UNLIKELY(buf == NULL))
145
/* Fewer than 16 bytes: handled by the scalar helper. */
if (UNLIKELY(len < 16))
146
return adler32_len_16(adler, buf, len, sum2);
150
/* buf is not on a 16-byte boundary: process the leading bytes up to the
 * next boundary with the scalar helper. */
if ((uintptr_t)buf & 0xf) {
151
/* Number of bytes up to the next 16-byte boundary (1..15). */
al = 16-((uintptr_t)buf & 0xf);
155
vmx_handle_head_or_tail(pair, buf, al);
162
/* Vector passes over the aligned part, n bytes at a time. */
for (i = al; i < len; i += n) {
163
int remaining = (int)(len-i);
164
/* First pass (i == al) keeps the current n; later passes use NMAX. Either
 * way the pass never exceeds the bytes remaining. */
n = MIN(remaining, (i == al) ? n : NMAX);
169
/* vmx_accum32 takes its length in 16-byte vectors. */
vmx_accum32(pair, buf + i, n / 16);
173
/* Count only the whole vectors just consumed. */
done += (n / 16) * 16;
178
/* Bytes after the last whole vector go through the scalar helper. */
vmx_handle_head_or_tail(pair, (buf + done), len - done);
184
/* Recombine the two sums into one 32-bit checksum. */
return (pair[1] << 16) | pair[0];