ClickHouse

Форк
0
/
isValidUTF8.cpp 
274 строки · 12.3 Кб
1
#include <DataTypes/DataTypeString.h>
2
#include <Functions/FunctionFactory.h>
3
#include <Functions/FunctionStringOrArrayToT.h>
4
#include <Common/isValidUTF8.h>
5

6
namespace DB
7
{
8
namespace ErrorCodes
9
{
10
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
11
}
12
/// inspired by https://github.com/cyb70289/utf8/
13
struct ValidUTF8Impl
14
{
15
    /*
16
MIT License
17

18
Copyright (c) 2019 Yibo Cai
19

20
Permission is hereby granted, free of charge, to any person obtaining a copy
21
of this software and associated documentation files (the "Software"), to deal
22
in the Software without restriction, including without limitation the rights
23
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
24
copies of the Software, and to permit persons to whom the Software is
25
furnished to do so, subject to the following conditions:
26

27
The above copyright notice and this permission notice shall be included in all
28
copies or substantial portions of the Software.
29

30
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
31
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
32
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
33
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
34
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
35
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
36
SOFTWARE.
37
*/
38

39
    /*
40
 * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
41
 *
42
 * Table 3-7. Well-Formed UTF-8 Byte Sequences
43
 *
44
 * +--------------------+------------+-------------+------------+-------------+
45
 * | Code Points        | First Byte | Second Byte | Third Byte | Fourth Byte |
46
 * +--------------------+------------+-------------+------------+-------------+
47
 * | U+0000..U+007F     | 00..7F     |             |            |             |
48
 * +--------------------+------------+-------------+------------+-------------+
49
 * | U+0080..U+07FF     | C2..DF     | 80..BF      |            |             |
50
 * +--------------------+------------+-------------+------------+-------------+
51
 * | U+0800..U+0FFF     | E0         | A0..BF      | 80..BF     |             |
52
 * +--------------------+------------+-------------+------------+-------------+
53
 * | U+1000..U+CFFF     | E1..EC     | 80..BF      | 80..BF     |             |
54
 * +--------------------+------------+-------------+------------+-------------+
55
 * | U+D000..U+D7FF     | ED         | 80..9F      | 80..BF     |             |
56
 * +--------------------+------------+-------------+------------+-------------+
57
 * | U+E000..U+FFFF     | EE..EF     | 80..BF      | 80..BF     |             |
58
 * +--------------------+------------+-------------+------------+-------------+
59
 * | U+10000..U+3FFFF   | F0         | 90..BF      | 80..BF     | 80..BF      |
60
 * +--------------------+------------+-------------+------------+-------------+
61
 * | U+40000..U+FFFFF   | F1..F3     | 80..BF      | 80..BF     | 80..BF      |
62
 * +--------------------+------------+-------------+------------+-------------+
63
 * | U+100000..U+10FFFF | F4         | 80..8F      | 80..BF     | 80..BF      |
64
 * +--------------------+------------+-------------+------------+-------------+
65
 */
66

67
#ifndef __SSE4_1__
68
    static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len) { return DB::UTF8::isValidUTF8(data, len); }
69
#else
70
    static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len)
71
    {
72
        /*
73
        * Map high nibble of "First Byte" to legal character length minus 1
74
        * 0x00 ~ 0xBF --> 0
75
        * 0xC0 ~ 0xDF --> 1
76
        * 0xE0 ~ 0xEF --> 2
77
        * 0xF0 ~ 0xFF --> 3
78
        */
79
        const __m128i first_len_tbl = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3);
80

81
        /* Map "First Byte" to 8-th item of range table (0xC2 ~ 0xF4) */
82
        const __m128i first_range_tbl = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8);
83

84
        /*
85
        * Range table, map range index to min and max values
86
        */
87
        const __m128i range_min_tbl
88
            = _mm_setr_epi8(0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80, 0xC2, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F);
89

90
        const __m128i range_max_tbl
91
            = _mm_setr_epi8(0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F, 0xF4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
92

93
        /*
94
        * Tables for fast handling of four special First Bytes(E0,ED,F0,F4), after
95
        * which the Second Byte are not 80~BF. It contains "range index adjustment".
96
        * +------------+---------------+------------------+----------------+
97
        * | First Byte | original range| range adjustment | adjusted range |
98
        * +------------+---------------+------------------+----------------+
99
        * | E0         | 2             | 2                | 4              |
100
        * +------------+---------------+------------------+----------------+
101
        * | ED         | 2             | 3                | 5              |
102
        * +------------+---------------+------------------+----------------+
103
        * | F0         | 3             | 3                | 6              |
104
        * +------------+---------------+------------------+----------------+
105
        * | F4         | 4             | 4                | 8              |
106
        * +------------+---------------+------------------+----------------+
107
        */
108

109
        /* index1 -> E0, index14 -> ED */
110
        const __m128i df_ee_tbl = _mm_setr_epi8(0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0);
111

112
        /* index1 -> F0, index5 -> F4 */
113
        const __m128i ef_fe_tbl = _mm_setr_epi8(0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
114

115
        __m128i prev_input = _mm_set1_epi8(0);
116
        __m128i prev_first_len = _mm_set1_epi8(0);
117
        __m128i error = _mm_set1_epi8(0);
118

119
        auto check_packed = [&](__m128i input) noexcept
120
        {
121
            /* high_nibbles = input >> 4 */
122
            const __m128i high_nibbles = _mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0F));
123

124
            /* first_len = legal character length minus 1 */
125
            /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
126
            /* first_len = first_len_tbl[high_nibbles] */
127
            __m128i first_len = _mm_shuffle_epi8(first_len_tbl, high_nibbles);
128

129
            /* First Byte: set range index to 8 for bytes within 0xC0 ~ 0xFF */
130
            /* range = first_range_tbl[high_nibbles] */
131
            __m128i range = _mm_shuffle_epi8(first_range_tbl, high_nibbles);
132

133
            /* Second Byte: set range index to first_len */
134
            /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
135
            /* range |= (first_len, prev_first_len) << 1 byte */
136
            range = _mm_or_si128(range, _mm_alignr_epi8(first_len, prev_first_len, 15));
137

138
            /* Third Byte: set range index to saturate_sub(first_len, 1) */
139
            /* 0 for 00~7F, 0 for C0~DF, 1 for E0~EF, 2 for F0~FF */
140
            __m128i tmp1;
141
            __m128i tmp2;
142
            /* tmp1 = saturate_sub(first_len, 1) */
143
            tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(1));
144
            /* tmp2 = saturate_sub(prev_first_len, 1) */
145
            tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(1));
146
            /* range |= (tmp1, tmp2) << 2 bytes */
147
            range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 14));
148

149
            /* Fourth Byte: set range index to saturate_sub(first_len, 2) */
150
            /* 0 for 00~7F, 0 for C0~DF, 0 for E0~EF, 1 for F0~FF */
151
            /* tmp1 = saturate_sub(first_len, 2) */
152
            tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(2));
153
            /* tmp2 = saturate_sub(prev_first_len, 2) */
154
            tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(2));
155
            /* range |= (tmp1, tmp2) << 3 bytes */
156
            range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 13));
157

158
            /*
159
             * Now we have below range indices calculated
160
             * Correct cases:
161
             * - 8 for C0~FF
162
             * - 3 for 1st byte after F0~FF
163
             * - 2 for 1st byte after E0~EF or 2nd byte after F0~FF
164
             * - 1 for 1st byte after C0~DF or 2nd byte after E0~EF or
165
             *         3rd byte after F0~FF
166
             * - 0 for others
167
             * Error cases:
168
             *   9,10,11 if non ascii First Byte overlaps
169
             *   E.g., F1 80 C2 90 --> 8 3 10 2, where 10 indicates error
170
             */
171

172
            /* Adjust Second Byte range for special First Bytes(E0,ED,F0,F4) */
173
            /* Overlaps lead to index 9~15, which are illegal in range table */
174
            __m128i shift1, pos, range2;
175
            /* shift1 = (input, prev_input) << 1 byte */
176
            shift1 = _mm_alignr_epi8(input, prev_input, 15);
177
            pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF));
178
            /*
179
             * shift1:  | EF  F0 ... FE | FF  00  ... ...  DE | DF  E0 ... EE |
180
             * pos:     | 0   1      15 | 16  17           239| 240 241    255|
181
             * pos-240: | 0   0      0  | 0   0            0  | 0   1      15 |
182
             * pos+112: | 112 113    127|       >= 128        |     >= 128    |
183
             */
184
            tmp1 = _mm_subs_epu8(pos, _mm_set1_epi8(0xF0));
185
            range2 = _mm_shuffle_epi8(df_ee_tbl, tmp1);
186
            tmp2 = _mm_adds_epu8(pos, _mm_set1_epi8(112));
187
            range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_tbl, tmp2));
188

189
            range = _mm_add_epi8(range, range2);
190

191
            /* Load min and max values per calculated range index */
192
            __m128i minv = _mm_shuffle_epi8(range_min_tbl, range);
193
            __m128i maxv = _mm_shuffle_epi8(range_max_tbl, range);
194

195
            /* Check value range */
196
            error = _mm_or_si128(error, _mm_cmplt_epi8(input, minv));
197
            error = _mm_or_si128(error, _mm_cmpgt_epi8(input, maxv));
198

199
            prev_input = input;
200
            prev_first_len = first_len;
201

202
            data += 16;
203
            len -= 16;
204
        };
205

206
        while (len >= 16) // NOLINT
207
            check_packed(_mm_loadu_si128(reinterpret_cast<const __m128i *>(data)));
208

209
        /// 0 <= len <= 15 for now. Reading data from data - 1 because of right padding of 15 and left padding
210
        /// Then zero some bytes from the unknown memory and check again.
211
        alignas(16) char buf[32];
212
        _mm_store_si128(reinterpret_cast<__m128i *>(buf), _mm_loadu_si128(reinterpret_cast<const __m128i *>(data - 1)));
213
        memset(buf + len + 1, 0, 16);
214
        check_packed(_mm_loadu_si128(reinterpret_cast<__m128i *>(buf + 1)));
215

216
        return _mm_testz_si128(error, error);
217
    }
218
#endif
219

220
    static constexpr bool is_fixed_to_constant = false;
221

222
    static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray<UInt8> & res)
223
    {
224
        size_t size = offsets.size();
225
        size_t prev_offset = 0;
226
        for (size_t i = 0; i < size; ++i)
227
        {
228
            res[i] = isValidUTF8(data.data() + prev_offset, offsets[i] - 1 - prev_offset);
229
            prev_offset = offsets[i];
230
        }
231
    }
232

233
    static void vectorFixedToConstant(const ColumnString::Chars & /*data*/, size_t /*n*/, UInt8 & /*res*/) {}
234

235
    static void vectorFixedToVector(const ColumnString::Chars & data, size_t n, PaddedPODArray<UInt8> & res)
236
    {
237
        size_t size = data.size() / n;
238
        for (size_t i = 0; i < size; ++i)
239
            res[i] = isValidUTF8(data.data() + i * n, n);
240
    }
241

242
    [[noreturn]] static void array(const ColumnString::Offsets &, PaddedPODArray<UInt8> &)
243
    {
244
        throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Cannot apply function isValidUTF8 to Array argument");
245
    }
246

247
    [[noreturn]] static void uuid(const ColumnUUID::Container &, size_t &, PaddedPODArray<UInt8> &)
248
    {
249
        throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Cannot apply function isValidUTF8 to UUID argument");
250
    }
251

252
    [[noreturn]] static void ipv6(const ColumnIPv6::Container &, size_t &, PaddedPODArray<UInt8> &)
253
    {
254
        throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Cannot apply function isValidUTF8 to IPv6 argument");
255
    }
256

257
    [[noreturn]] static void ipv4(const ColumnIPv4::Container &, size_t &, PaddedPODArray<UInt8> &)
258
    {
259
        throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Cannot apply function isValidUTF8 to IPv4 argument");
260
    }
261
};
262

263
struct NameIsValidUTF8
264
{
265
    static constexpr auto name = "isValidUTF8";
266
};
267
using FunctionValidUTF8 = FunctionStringOrArrayToT<ValidUTF8Impl, NameIsValidUTF8, UInt8>;
268

269
REGISTER_FUNCTION(IsValidUTF8)
270
{
271
    factory.registerFunction<FunctionValidUTF8>();
272
}
273

274
}
275

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.