opencv
1/* Adler32 for POWER8 using VSX instructions.
2* Copyright (C) 2020 IBM Corporation
3* Author: Rogerio Alves <rcardoso@linux.ibm.com>
4* For conditions of distribution and use, see copyright notice in zlib.h
5*
6* Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
7* instructions.
8*
 * When adler32 processes one byte at a time, s1_0 (_n denotes iteration n)
 * is the initial value of s1 taken from adler; it is 1 unless a different
 * initial adler value was supplied. After the first byte,
 * s1_1 = s1_0 + c[1]; after the second, s1_2 = s1_1 + c[2]; and so on.
 * Hence, s1_N = s1_(N-1) + c[N] is the value of s1 after iteration N.
 *
15*
 * Therefore, for s2 after iteration N, s2_N = s2_0 + N*s1_0 + N*c[1] +
 * (N-1)*c[2] + ... + c[N]
18*
19* In a more general way:
20*
 * s1_N = s1_0 + sum(i=1 to N)c[i]
 * s2_N = s2_0 + N*s1_0 + sum(i=1 to N)(N-i+1)*c[i]
23*
 * Where s1_N, s2_N are the values of s1, s2 after N iterations. So if we
 * can process N bytes at a time, we can compute both sums in one step.
26*
 * Since a VSX vector register holds 16 bytes, we can process
 * 16 bytes at a time; using N = 16 we have:
29*
 * s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i]
 * s2 = s2_16 = s2_0 + 16*s1_0 + sum(i=1 to 16)(16-i+1)*c[i]
32*
33* After the first iteration we calculate the adler32 checksum for 16 bytes.
34*
35* For more background about adler32 please check the RFC:
36* https://www.ietf.org/rfc/rfc1950.txt
37*/
38
39#ifdef POWER8_VSX40
41#include <altivec.h>42#include "zbuild.h"43#include "adler32_p.h"44
45/* Vector across sum unsigned int (saturate). */
46static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsigned int __b) {47__b = vec_sld(__a, __a, 8);48__b = vec_add(__b, __a);49__a = vec_sld(__b, __b, 4);50__a = vec_add(__a, __b);51
52return __a;53}
54
55Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {56uint32_t s1 = adler & 0xffff;57uint32_t s2 = (adler >> 16) & 0xffff;58
59/* in case user likes doing a byte at a time, keep it fast */60if (UNLIKELY(len == 1))61return adler32_len_1(s1, buf, s2);62
63/* If buffer is empty or len=0 we need to return adler initial value. */64if (UNLIKELY(buf == NULL))65return 1;66
67/* This is faster than VSX code for len < 64. */68if (len < 64)69return adler32_len_64(s1, buf, len, s2);70
71/* Use POWER VSX instructions for len >= 64. */72const vector unsigned int v_zeros = { 0 };73const vector unsigned char v_mul = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7,746, 5, 4, 3, 2, 1};75const vector unsigned char vsh = vec_splat_u8(4);76const vector unsigned int vmask = {0xffffffff, 0x0, 0x0, 0x0};77vector unsigned int vs1 = { 0 };78vector unsigned int vs2 = { 0 };79vector unsigned int vs1_save = { 0 };80vector unsigned int vsum1, vsum2;81vector unsigned char vbuf;82int n;83
84vs1[0] = s1;85vs2[0] = s2;86
87/* Do length bigger than NMAX in blocks of NMAX size. */88while (len >= NMAX) {89len -= NMAX;90n = NMAX / 16;91do {92vbuf = vec_xl(0, (unsigned char *) buf);93vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */94/* sum(i=1 to 16) buf[i]*(16-i+1). */95vsum2 = vec_msum(vbuf, v_mul, v_zeros);96/* Save vs1. */97vs1_save = vec_add(vs1_save, vs1);98/* Accumulate the sums. */99vs1 = vec_add(vsum1, vs1);100vs2 = vec_add(vsum2, vs2);101
102buf += 16;103} while (--n);104/* Once each block of NMAX size. */105vs1 = vec_sumsu(vs1, vsum1);106vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */107vs2 = vec_add(vs1_save, vs2);108vs2 = vec_sumsu(vs2, vsum2);109
110/* vs1[0] = (s1_i + sum(i=1 to 16)buf[i]) mod 65521. */111vs1[0] = vs1[0] % BASE;112/* vs2[0] = s2_i + 16*s1_save +113sum(i=1 to 16)(16-i+1)*buf[i] mod 65521. */
114vs2[0] = vs2[0] % BASE;115
116vs1 = vec_and(vs1, vmask);117vs2 = vec_and(vs2, vmask);118vs1_save = v_zeros;119}120
121/* len is less than NMAX one modulo is needed. */122if (len >= 16) {123while (len >= 16) {124len -= 16;125
126vbuf = vec_xl(0, (unsigned char *) buf);127
128vsum1 = vec_sum4s(vbuf, v_zeros); /* sum(i=1 to 16) buf[i]. */129/* sum(i=1 to 16) buf[i]*(16-i+1). */130vsum2 = vec_msum(vbuf, v_mul, v_zeros);131/* Save vs1. */132vs1_save = vec_add(vs1_save, vs1);133/* Accumulate the sums. */134vs1 = vec_add(vsum1, vs1);135vs2 = vec_add(vsum2, vs2);136
137buf += 16;138}139/* Since the size will be always less than NMAX we do this once. */140vs1 = vec_sumsu(vs1, vsum1);141vs1_save = vec_sll(vs1_save, vsh); /* 16*vs1_save. */142vs2 = vec_add(vs1_save, vs2);143vs2 = vec_sumsu(vs2, vsum2);144}145/* Copy result back to s1, s2 (mod 65521). */146s1 = vs1[0] % BASE;147s2 = vs2[0] % BASE;148
149/* Process tail (len < 16). */150return adler32_len_16(s1, buf, len, s2);151}
152
153#endif /* POWER8_VSX */154