adler32_avx2.c
/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream
 * Copyright (C) 1995-2011 Mark Adler
 * Copyright (C) 2022 Adam Stylinski
 * Authors:
 *   Brian Bockelman <bockelman@gmail.com>
 *   Adam Stylinski <kungfujesus06@gmail.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef X86_AVX2

#include "../../zbuild.h"
#include <immintrin.h>
#include "../../adler32_fold.h"
#include "../../adler32_p.h"
#include "adler32_avx2_p.h"
#include "x86_intrins.h"

#ifdef X86_SSE42
extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);

#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d)
#define sub32(a, b, c) adler32_ssse3(a, b, c)
#else
#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1)
#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1)
#endif
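
/* For exposition (an added sketch, not part of the original file): a scalar
 * model of what one 32-byte block of the vector loop in adler32_fold_copy_impl
 * below computes. The helper name is illustrative. The real work is done with
 * vpsadbw (plain byte sum into vs1) and vpmaddubsw/vpmaddwd (position-weighted
 * sum into vs2), with the 32 * s1_old term deferred into vs3 and applied once
 * per outer iteration as a shift by 5. */
static inline void adler32_block32_scalar_sketch(uint32_t *s1, uint32_t *s2, const uint8_t *buf) {
    uint32_t byte_sum = 0, weighted_sum = 0;
    for (int i = 0; i < 32; ++i) {
        byte_sum += buf[i];                          /* c[i] accumulated into s1 */
        weighted_sum += (uint32_t)(32 - i) * buf[i]; /* weights 32..1, like dot2v */
    }
    *s2 += (*s1 << 5) + weighted_sum; /* s2 += 32*s1_old + sum((32-i)*c[i]) */
    *s1 += byte_sum;                  /* s1 += sum(c[i]) */
}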

static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
    if (src == NULL) return 1L;
    if (len == 0) return adler;

    uint32_t adler0, adler1;
    adler1 = (adler >> 16) & 0xffff;
    adler0 = adler & 0xffff;

rem_peel:
    if (len < 16) {
        if (COPY) {
            return adler32_copy_len_16(adler0, src, dst, len, adler1);
        } else {
            return adler32_len_16(adler0, src, len, adler1);
        }
    } else if (len < 32) {
        if (COPY) {
            return copy_sub32(adler, dst, src, len);
        } else {
            return sub32(adler, src, len);
        }
    }

    __m256i vs1, vs2;

    const __m256i dot2v = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
                                           14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    const __m256i dot3v = _mm256_set1_epi16(1);
    const __m256i zero = _mm256_setzero_si256();

    while (len >= 32) {
        vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
        vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
        __m256i vs1_0 = vs1;
        __m256i vs3 = _mm256_setzero_si256();

        size_t k = MIN(len, NMAX);
        k -= k % 32;
        len -= k;

        while (k >= 32) {
            /*
               vs1 = adler + sum(c[i])
               vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
               (a scalar model of this block update is sketched above)
            */
            __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
            src += 32;
            k -= 32;

            __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // abs-diff vs zero: byte sums in the low 16 bits of each of the 4 x 64-bit lanes

            if (COPY) {
                _mm256_storeu_si256((__m256i*)dst, vbuf);
                dst += 32;
            }

            vs1 = _mm256_add_epi32(vs1, vs1_sad);
            vs3 = _mm256_add_epi32(vs3, vs1_0);
            __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // multiply 32 uint8s by weights, pairwise add to 16 shorts
            __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // pairwise add 16 shorts to 8 int32s
            vs2 = _mm256_add_epi32(vsum2, vs2);
            vs1_0 = vs1;
        }

        /* Defer the multiplication with 32 to outside of the loop */
        vs3 = _mm256_slli_epi32(vs3, 5);
        vs2 = _mm256_add_epi32(vs2, vs3);

        /* The compiler is generating the following sequence for this integer modulus
         * when done the scalar way, in GPRs:

         adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
                 (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);

         mov    $0x80078071,%edi // move magic constant into 32 bit register %edi
         ...
         vmovd  %xmm1,%esi // move vector lane 0 to 32 bit register %esi
         mov    %rsi,%rax  // zero-extend this value to 64 bit precision in %rax
         imul   %rdi,%rsi // do a signed multiplication with magic constant and vector element
         shr    $0x2f,%rsi // shift right by 47
         imul   $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
         sub    %esi,%eax // subtract lower 32 bits of original vector value from modified one above
         ...
         // repeats for each element with vpextract instructions

         This is tricky with AVX2 for a number of reasons:
             1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
             2.) There are ways to extend vectors to 64 bit precision, but no simple way to truncate
                 back down to 32 bit precision later (there is in AVX512)
             3.) Full width integer multiplications aren't cheap

         We can, however, do a relatively cheap sequence for horizontal sums.
         Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
         previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
         that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
         performed on the maximum possible inputs before overflow.
         */

        /* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
         * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
         * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
         * what the compiler is doing to avoid integer divisions. */
        adler0 = partial_hsum256(vs1) % BASE;
        adler1 = hsum256(vs2) % BASE;
    }

    adler = adler0 | (adler1 << 16);

    if (len) {
        goto rem_peel;
    }

    return adler;
}
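
/* For exposition (an added sketch, not part of the original file): the
 * strength-reduced modulus the comment block above describes, written out in
 * C with a hypothetical helper name. The magic constant 0x80078071 is
 * ceil(2^47 / 65521), so the multiply and shift yield x / 65521 exactly for
 * any 32 bit x, and the multiply-subtract then recovers x % 65521 without an
 * integer division. */
static inline uint32_t mod_base_sketch(uint32_t x) {
    uint32_t q = (uint32_t)(((uint64_t)x * 0x80078071u) >> 47); /* q = x / 65521 */
    return x - q * 65521u;                                      /* x - q * BASE */
}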

Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
    return adler32_fold_copy_impl(adler, NULL, src, len, 0);
}

Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    return adler32_fold_copy_impl(adler, dst, src, len, 1);
}
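
/* A minimal usage sketch (an added illustration, not part of the original
 * file): these are zlib-ng internal entry points, normally reached through
 * runtime dispatch rather than called directly. The guard macro below is
 * hypothetical and never defined by the build, so this compiles to nothing. */
#ifdef ADLER32_AVX2_USAGE_SKETCH
static uint32_t checksum_and_copy_example(uint8_t *dst, const uint8_t *src, size_t len) {
    uint32_t adler = 1; /* Adler-32 of the empty string is 1 */
    return adler32_fold_copy_avx2(adler, dst, src, len); /* copy + checksum in one pass */
}
#endif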

#endif