opencv

adler32_power8.c
153 строки · 5.0 Кб
Перенос по словам
1
/* Adler32 for POWER8 using VSX instructions.
2
 * Copyright (C) 2020 IBM Corporation
3
 * Author: Rogerio Alves <rcardoso@linux.ibm.com>
4
 * For conditions of distribution and use, see copyright notice in zlib.h
5
 *
6
 * Calculate adler32 checksum for 16 bytes at once using POWER8+ VSX (vector)
7
 * instructions.
8
 *
9
 * If adler32 processes 1 byte at a time, on the first iteration s1 is s1_0 (_n means
10
 * iteration n) is the initial value of adler - at start  _0 is 1 unless
11
 * adler initial value is different than 1. So s1_1 = s1_0 + c[0] after
12
 * the first calculation. For the iteration s1_2 = s1_1 + c[1] and so on.
13
 * Hence, for iteration N, s1_N = s1_(N-1) + c[N] is the value of s1
14
 * after iteration N.
15
 *
16
 * Therefore, for s2 and iteration N, s2_N = s2_0 + N*s1_0 + N*c[0] +
17
 * (N-1)*c[1] + ... + c[N-1]
18
 *
19
 * In a more general way:
20
 *
21
 * s1_N = s1_0 + sum(i=1 to N)c[i]
22
 * s2_N = s2_0 + N*s1_0 + sum (i=1 to N)(N-i+1)*c[i]
23
 *
24
 * Where s1_N, s2_N are the values for s1, s2 after N iterations. So if we
25
 * can process N bytes at a time we can do this at once.
26
 *
27
 * Since VSX supports 16-byte (128-bit) vector instructions, we can process
28
 * 16 bytes at a time; using N = 16 we have:
29
 *
30
 * s1 = s1_16 = s1_(16-1) + c[16] = s1_0 + sum(i=1 to 16)c[i]
31
 * s2 = s2_16 = s2_0 + 16*s1_0 + sum(i=1 to 16)(16-i+1)*c[i]
32
 *
33
 * After the first iteration we calculate the adler32 checksum for 16 bytes.
34
 *
35
 * For more background about adler32 please check the RFC:
36
 * https://www.ietf.org/rfc/rfc1950.txt
37
 */
38

39
#ifdef POWER8_VSX
40

41
#include <altivec.h>
42
#include "zbuild.h"
43
#include "adler32_p.h"
44

45
/* Vector across sum unsigned int (saturate).  */
46
static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsigned int __b) {
47
    __b = vec_sld(__a, __a, 8);
48
    __b = vec_add(__b, __a);
49
    __a = vec_sld(__b, __b, 4);
50
    __a = vec_add(__a, __b);
51

52
    return __a;
53
}
54

55
/* Update an Adler-32 checksum with len bytes from buf, 16 bytes per step.
 *
 * adler  running checksum: s1 in the low 16 bits, s2 in the high 16 bits.
 * buf    input bytes; a NULL pointer makes the function return 1.
 * len    number of bytes available at buf.
 *
 * A single byte is delegated to adler32_len_1 and anything shorter than 64
 * bytes to adler32_len_64.  Longer inputs are consumed 16 bytes at a time with
 * vector instructions, and the remaining tail (fewer than 16 bytes) together
 * with the reduced s1/s2 is handed to adler32_len_16, whose result is
 * returned.
 *
 * NOTE(review): the len == 1 path passes buf to adler32_len_1 before the NULL
 * check is reached - confirm callers never combine NULL with len == 1.
 */
Z_INTERNAL uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
    /* Split the running checksum into its two 16-bit halves.  */
    uint32_t sum1 = adler & 0xffff;
    uint32_t sum2 = (adler >> 16) & 0xffff;

    /* Byte-at-a-time callers take the dedicated scalar helper.  */
    if (UNLIKELY(len == 1))
        return adler32_len_1(sum1, buf, sum2);

    /* A NULL buffer yields the initial Adler-32 value.  */
    if (UNLIKELY(buf == NULL))
        return 1;

    /* Below 64 bytes the scalar helper is used instead of the vector code.  */
    if (len < 64)
        return adler32_len_64(sum1, buf, len, sum2);

    /* Vector path for len >= 64.  */
    const vector unsigned int zero_vec = { 0 };
    /* Weights 16..1: byte i of a chunk counts (16-i+1) times towards s2.  */
    const vector unsigned char weights = {16, 15, 14, 13, 12, 11, 10, 9,
                                          8, 7, 6, 5, 4, 3, 2, 1};
    const vector unsigned char shift4 = vec_splat_u8(4);
    const vector unsigned int lane0_only = {0xffffffff, 0x0, 0x0, 0x0};
    vector unsigned int acc1 = { 0 };       /* per-lane partial s1 */
    vector unsigned int acc2 = { 0 };       /* per-lane partial s2 */
    vector unsigned int acc1_prev = { 0 };  /* sum of s1 values seen before each chunk */
    vector unsigned int part1 = { 0 };
    vector unsigned int part2 = { 0 };
    vector unsigned char chunk;

    /* Seed lane 0 with the incoming sums; the other lanes start at zero.  */
    acc1[0] = sum1;
    acc2[0] = sum2;

    /* Whole blocks of NMAX bytes, with one modulo reduction per block.  */
    for (; len >= NMAX; len -= NMAX) {
        for (int step = 0; step < NMAX / 16; ++step) {
            chunk = vec_xl(0, (unsigned char *) buf);
            buf += 16;

            /* Plain byte sums and weighted byte sums of this chunk.  */
            part1 = vec_sum4s(chunk, zero_vec);
            part2 = vec_msum(chunk, weights, zero_vec);

            /* Remember s1 as it was before this chunk, then accumulate.  */
            acc1_prev = vec_add(acc1_prev, acc1);
            acc1 = vec_add(part1, acc1);
            acc2 = vec_add(part2, acc2);
        }

        /* Collapse the lanes once per block.  Each chunk owes s2 sixteen
           times the s1 that preceded it, hence the shift left by 4.  */
        acc1 = vec_sumsu(acc1, part1);
        acc1_prev = vec_sll(acc1_prev, shift4);
        acc2 = vec_sumsu(vec_add(acc1_prev, acc2), part2);

        /* Reduce lane 0 modulo BASE (65521 per the original comments).  */
        acc1[0] %= BASE;
        acc2[0] %= BASE;

        /* Keep only lane 0 and restart the s1 history for the next block.  */
        acc1 = vec_and(acc1, lane0_only);
        acc2 = vec_and(acc2, lane0_only);
        acc1_prev = zero_vec;
    }

    /* Fewer than NMAX bytes remain, so a single modulo at the end suffices.  */
    if (len >= 16) {
        do {
            chunk = vec_xl(0, (unsigned char *) buf);
            buf += 16;
            len -= 16;

            part1 = vec_sum4s(chunk, zero_vec);
            part2 = vec_msum(chunk, weights, zero_vec);

            /* Remember s1 as it was before this chunk, then accumulate.  */
            acc1_prev = vec_add(acc1_prev, acc1);
            acc1 = vec_add(part1, acc1);
            acc2 = vec_add(part2, acc2);
        } while (len >= 16);

        /* Same lane collapse as above, done once for the remainder.  */
        acc1 = vec_sumsu(acc1, part1);
        acc1_prev = vec_sll(acc1_prev, shift4);
        acc2 = vec_sumsu(vec_add(acc1_prev, acc2), part2);
    }

    /* Bring the sums back to scalars, reduced modulo BASE.  */
    sum1 = acc1[0] % BASE;
    sum2 = acc2[0] % BASE;

    /* Remaining tail (len < 16) is finished by the scalar helper.  */
    return adler32_len_16(sum1, buf, len, sum2);
}
152

153
#endif /* POWER8_VSX */
154
opencv

Использование cookies