8
#include "../generic/chunk_permute_table.h"
10
/* A chunk is one 256-bit (32-byte) integer vector. */
typedef __m256i chunk_t;
14
/* Feature macros matching the chunkmemset_2 / chunkmemset_4 / chunkmemset_8
 * helpers defined in this file.
 * NOTE(review): presumably tested by chunkset_tpl.h (included further down)
 * to enable the matching fast paths -- confirm against the template. */
#define HAVE_CHUNKMEMSET_2
15
#define HAVE_CHUNKMEMSET_4
16
#define HAVE_CHUNKMEMSET_8
21
/* Lookup table read by GET_CHUNK_MAG as perm_idx_lut[dist - 3].
 *
 * Each entry supplies two values that GET_CHUNK_MAG reads as `.idx` (an
 * offset added to permute_table) and `.remval` (stored to *chunk_rem).
 * NOTE(review): the initializers are positional; that the first value is
 * `idx` and the second is `remval` depends on the field order of
 * lut_rem_pair, which is declared outside this excerpt -- confirm.
 *
 * The array is declared with 29 entries, but only 13 initializers (and no
 * closing brace) are visible here. In the visible entries the first value
 * advances by 16 per entry and the second value counts down from 13 to 1. */
static const lut_rem_pair perm_idx_lut[29] = {
38
{11 * 32 + 16 * 2, 13},
39
{11 * 32 + 16 * 3, 12},
40
{11 * 32 + 16 * 4, 11},
41
{11 * 32 + 16 * 5, 10},
42
{11 * 32 + 16 * 6, 9},
43
{11 * 32 + 16 * 7, 8},
44
{11 * 32 + 16 * 8, 7},
45
{11 * 32 + 16 * 9, 6},
46
{11 * 32 + 16 * 10, 5},
47
{11 * 32 + 16 * 11, 4},
48
{11 * 32 + 16 * 12, 3},
49
{11 * 32 + 16 * 13, 2},
50
{11 * 32 + 16 * 14, 1}
53
/* Fill *chunk by replicating a value read from `from` into every 16-bit
 * element of the 256-bit vector.
 *
 * from:  source bytes; sizeof(tmp) bytes are read.
 * chunk: destination vector, overwritten.
 *
 * NOTE(review): the declaration of `tmp` and the function's closing brace
 * are not visible in this excerpt; `tmp` is presumably a 16-bit integer. */
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
55
/* Byte-wise copy, so `from` does not need any particular alignment. */
memcpy(&tmp, from, sizeof(tmp));
56
/* Broadcast tmp to all sixteen 16-bit elements. */
*chunk = _mm256_set1_epi16(tmp);
59
/* Fill *chunk by replicating a value read from `from` into every 32-bit
 * element of the 256-bit vector.
 *
 * from:  source bytes; sizeof(tmp) bytes are read.
 * chunk: destination vector, overwritten.
 *
 * NOTE(review): the declaration of `tmp` and the function's closing brace
 * are not visible in this excerpt; `tmp` is presumably a 32-bit integer. */
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
61
/* Byte-wise copy, so `from` does not need any particular alignment. */
memcpy(&tmp, from, sizeof(tmp));
62
/* Broadcast tmp to all eight 32-bit elements. */
*chunk = _mm256_set1_epi32(tmp);
65
/* Fill *chunk by replicating a value read from `from` into every 64-bit
 * element of the 256-bit vector.
 *
 * from:  source bytes; sizeof(tmp) bytes are read.
 * chunk: destination vector, overwritten.
 *
 * NOTE(review): the declaration of `tmp` and the function's closing brace
 * are not visible in this excerpt; `tmp` is presumably a 64-bit integer. */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
67
/* Byte-wise copy, so `from` does not need any particular alignment. */
memcpy(&tmp, from, sizeof(tmp));
68
/* Broadcast tmp to all four 64-bit elements. */
*chunk = _mm256_set1_epi64x(tmp);
71
/* Load one 32-byte chunk from `s` into *chunk.
 *
 * s:     source address; 32 bytes are read, no alignment required.
 * chunk: destination vector, overwritten.
 *
 * NOTE(review): the function's closing brace is not visible in this excerpt. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
72
/* Unaligned load. The cast drops the const qualifier of `s`; the
 * intrinsic only reads through the pointer. */
*chunk = _mm256_loadu_si256((__m256i *)s);
75
/* Store the 32-byte chunk *chunk to `out`.
 *
 * out:   destination address; 32 bytes are written, no alignment required.
 * chunk: vector to store; not modified.
 *
 * NOTE(review): the function's closing brace is not visible in this excerpt. */
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
76
/* Unaligned store. */
_mm256_storeu_si256((__m256i *)out, *chunk);
79
/* Build a chunk_t from the bytes at `buf` using a permutation selected by
 * `dist`, and report the table's remainder value for that distance.
 *
 * buf:       source bytes. The visible branches read 16 bytes from buf;
 *            the last visible branch also reads 16 bytes from buf + 16.
 * chunk_rem: out-parameter; receives the `remval` field of the table entry.
 * dist:      distance; used as perm_idx_lut[dist - 3] with no range check
 *            visible here, so callers presumably guarantee 3 <= dist <= 31
 *            (the table has 29 entries) -- confirm against callers.
 *
 * NOTE(review): this excerpt is missing several original lines, including
 * the declaration of `ret_vec`, the condition opening the first branch, the
 * `else` opening the last branch, and the final return. The comments below
 * describe only the visible statements. */
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
80
/* Table entry for this distance: permute-table offset plus remainder. */
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
85
*chunk_rem = lut_rem.remval;
88
/* Tell MemorySanitizer that bytes [dist, 32) of buf may be read.
 * NOTE(review): presumably guarded by a sanitizer-only #if that is not
 * visible in this excerpt -- confirm. */
__msan_unpoison(buf + dist, 32 - dist);
94
/* Per-byte offsets: 0 for the lower 128-bit lane, 16 for the upper lane. */
const __m256i permute_xform =
95
_mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
96
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
97
/* Aligned 32-byte load of the shuffle indices; the address
 * permute_table + lut_rem.idx must therefore be 32-byte aligned. */
__m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
98
/* Unaligned load of the first 16 source bytes. */
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
99
/* Add 16 to every index byte of the upper lane (lower lane unchanged). */
perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
100
/* Duplicate the 16 source bytes into both 128-bit lanes... */
ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
101
/* ...then shuffle bytes within each lane according to perm_vec. */
ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
102
} else if (dist == 16) {
103
/* dist == 16: no shuffle; return the 16 source bytes in both lanes. */
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
104
return _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
106
/* Remaining branch: load both 16-byte halves of the 32 bytes at buf. */
__m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
107
__m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16));
109
/* Aligned 16-byte load of the shuffle indices for the upper half. */
__m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
110
/* Byte mask: all-ones where the (signed) index is less than 16. */
__m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1);
111
/* Bytes picked out of the first 16 source bytes by those indices. */
__m128i xlane_res = _mm_shuffle_epi8(ret_vec0, perm_vec1);
114
/* Upper half: take xlane_res where the mask is set, otherwise keep the
 * byte from ret_vec1 (the second 16 source bytes). */
__m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes);
115
/* Result: lower lane = ret_vec0, upper lane = latter_half. */
ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
121
/* Map the generic template names onto their *_avx2 identifiers before
 * including the templates.
 * NOTE(review): presumably chunkset_tpl.h and inffast_tpl.h define their
 * functions under these macro names, using chunk_t and the helpers above --
 * confirm against the template headers. */
#define CHUNKSIZE chunksize_avx2
122
#define CHUNKCOPY chunkcopy_avx2
123
#define CHUNKUNROLL chunkunroll_avx2
124
#define CHUNKMEMSET chunkmemset_avx2
125
#define CHUNKMEMSET_SAFE chunkmemset_safe_avx2
127
#include "chunkset_tpl.h"
129
/* Name used for the inflate fast path when inffast_tpl.h is included. */
#define INFLATE_FAST inflate_fast_avx2
131
#include "inffast_tpl.h"