ClickHouse
274 строки · 12.3 Кб
1#include <DataTypes/DataTypeString.h>2#include <Functions/FunctionFactory.h>3#include <Functions/FunctionStringOrArrayToT.h>4#include <Common/isValidUTF8.h>5
6namespace DB7{
8namespace ErrorCodes9{
10extern const int ILLEGAL_TYPE_OF_ARGUMENT;11}
12/// inspired by https://github.com/cyb70289/utf8/
13struct ValidUTF8Impl14{
15/*16MIT License
17
18Copyright (c) 2019 Yibo Cai
19
20Permission is hereby granted, free of charge, to any person obtaining a copy
21of this software and associated documentation files (the "Software"), to deal
22in the Software without restriction, including without limitation the rights
23to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
24copies of the Software, and to permit persons to whom the Software is
25furnished to do so, subject to the following conditions:
26
27The above copyright notice and this permission notice shall be included in all
28copies or substantial portions of the Software.
29
30THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
31IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
32FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
33AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
34LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
35OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
36SOFTWARE.
37*/
38
39/*40* http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
41*
42* Table 3-7. Well-Formed UTF-8 Byte Sequences
43*
44* +--------------------+------------+-------------+------------+-------------+
45* | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte |
46* +--------------------+------------+-------------+------------+-------------+
47* | U+0000..U+007F | 00..7F | | | |
48* +--------------------+------------+-------------+------------+-------------+
49* | U+0080..U+07FF | C2..DF | 80..BF | | |
50* +--------------------+------------+-------------+------------+-------------+
51* | U+0800..U+0FFF | E0 | A0..BF | 80..BF | |
52* +--------------------+------------+-------------+------------+-------------+
53* | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | |
54* +--------------------+------------+-------------+------------+-------------+
55* | U+D000..U+D7FF | ED | 80..9F | 80..BF | |
56* +--------------------+------------+-------------+------------+-------------+
57* | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | |
58* +--------------------+------------+-------------+------------+-------------+
59* | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF |
60* +--------------------+------------+-------------+------------+-------------+
61* | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF |
62* +--------------------+------------+-------------+------------+-------------+
63* | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF |
64* +--------------------+------------+-------------+------------+-------------+
65*/
66
67#ifndef __SSE4_1__68static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len) { return DB::UTF8::isValidUTF8(data, len); }69#else70static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len)71{72/*73* Map high nibble of "First Byte" to legal character length minus 1
74* 0x00 ~ 0xBF --> 0
75* 0xC0 ~ 0xDF --> 1
76* 0xE0 ~ 0xEF --> 2
77* 0xF0 ~ 0xFF --> 3
78*/
79const __m128i first_len_tbl = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3);80
81/* Map "First Byte" to 8-th item of range table (0xC2 ~ 0xF4) */82const __m128i first_range_tbl = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8);83
84/*85* Range table, map range index to min and max values
86*/
87const __m128i range_min_tbl88= _mm_setr_epi8(0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80, 0xC2, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F);89
90const __m128i range_max_tbl91= _mm_setr_epi8(0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F, 0xF4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);92
93/*94* Tables for fast handling of four special First Bytes(E0,ED,F0,F4), after
95* which the Second Byte are not 80~BF. It contains "range index adjustment".
96* +------------+---------------+------------------+----------------+
97* | First Byte | original range| range adjustment | adjusted range |
98* +------------+---------------+------------------+----------------+
99* | E0 | 2 | 2 | 4 |
100* +------------+---------------+------------------+----------------+
101* | ED | 2 | 3 | 5 |
102* +------------+---------------+------------------+----------------+
103* | F0 | 3 | 3 | 6 |
104* +------------+---------------+------------------+----------------+
105* | F4 | 4 | 4 | 8 |
106* +------------+---------------+------------------+----------------+
107*/
108
109/* index1 -> E0, index14 -> ED */110const __m128i df_ee_tbl = _mm_setr_epi8(0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0);111
112/* index1 -> F0, index5 -> F4 */113const __m128i ef_fe_tbl = _mm_setr_epi8(0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);114
115__m128i prev_input = _mm_set1_epi8(0);116__m128i prev_first_len = _mm_set1_epi8(0);117__m128i error = _mm_set1_epi8(0);118
119auto check_packed = [&](__m128i input) noexcept120{121/* high_nibbles = input >> 4 */122const __m128i high_nibbles = _mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0F));123
124/* first_len = legal character length minus 1 */125/* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */126/* first_len = first_len_tbl[high_nibbles] */127__m128i first_len = _mm_shuffle_epi8(first_len_tbl, high_nibbles);128
129/* First Byte: set range index to 8 for bytes within 0xC0 ~ 0xFF */130/* range = first_range_tbl[high_nibbles] */131__m128i range = _mm_shuffle_epi8(first_range_tbl, high_nibbles);132
133/* Second Byte: set range index to first_len */134/* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */135/* range |= (first_len, prev_first_len) << 1 byte */136range = _mm_or_si128(range, _mm_alignr_epi8(first_len, prev_first_len, 15));137
138/* Third Byte: set range index to saturate_sub(first_len, 1) */139/* 0 for 00~7F, 0 for C0~DF, 1 for E0~EF, 2 for F0~FF */140__m128i tmp1;141__m128i tmp2;142/* tmp1 = saturate_sub(first_len, 1) */143tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(1));144/* tmp2 = saturate_sub(prev_first_len, 1) */145tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(1));146/* range |= (tmp1, tmp2) << 2 bytes */147range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 14));148
149/* Fourth Byte: set range index to saturate_sub(first_len, 2) */150/* 0 for 00~7F, 0 for C0~DF, 0 for E0~EF, 1 for F0~FF */151/* tmp1 = saturate_sub(first_len, 2) */152tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(2));153/* tmp2 = saturate_sub(prev_first_len, 2) */154tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(2));155/* range |= (tmp1, tmp2) << 3 bytes */156range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 13));157
158/*159* Now we have below range indices calculated
160* Correct cases:
161* - 8 for C0~FF
162* - 3 for 1st byte after F0~FF
163* - 2 for 1st byte after E0~EF or 2nd byte after F0~FF
164* - 1 for 1st byte after C0~DF or 2nd byte after E0~EF or
165* 3rd byte after F0~FF
166* - 0 for others
167* Error cases:
168* 9,10,11 if non ascii First Byte overlaps
169* E.g., F1 80 C2 90 --> 8 3 10 2, where 10 indicates error
170*/
171
172/* Adjust Second Byte range for special First Bytes(E0,ED,F0,F4) */173/* Overlaps lead to index 9~15, which are illegal in range table */174__m128i shift1, pos, range2;175/* shift1 = (input, prev_input) << 1 byte */176shift1 = _mm_alignr_epi8(input, prev_input, 15);177pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF));178/*179* shift1: | EF F0 ... FE | FF 00 ... ... DE | DF E0 ... EE |
180* pos: | 0 1 15 | 16 17 239| 240 241 255|
181* pos-240: | 0 0 0 | 0 0 0 | 0 1 15 |
182* pos+112: | 112 113 127| >= 128 | >= 128 |
183*/
184tmp1 = _mm_subs_epu8(pos, _mm_set1_epi8(0xF0));185range2 = _mm_shuffle_epi8(df_ee_tbl, tmp1);186tmp2 = _mm_adds_epu8(pos, _mm_set1_epi8(112));187range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_tbl, tmp2));188
189range = _mm_add_epi8(range, range2);190
191/* Load min and max values per calculated range index */192__m128i minv = _mm_shuffle_epi8(range_min_tbl, range);193__m128i maxv = _mm_shuffle_epi8(range_max_tbl, range);194
195/* Check value range */196error = _mm_or_si128(error, _mm_cmplt_epi8(input, minv));197error = _mm_or_si128(error, _mm_cmpgt_epi8(input, maxv));198
199prev_input = input;200prev_first_len = first_len;201
202data += 16;203len -= 16;204};205
206while (len >= 16) // NOLINT207check_packed(_mm_loadu_si128(reinterpret_cast<const __m128i *>(data)));208
209/// 0 <= len <= 15 for now. Reading data from data - 1 because of right padding of 15 and left padding210/// Then zero some bytes from the unknown memory and check again.211alignas(16) char buf[32];212_mm_store_si128(reinterpret_cast<__m128i *>(buf), _mm_loadu_si128(reinterpret_cast<const __m128i *>(data - 1)));213memset(buf + len + 1, 0, 16);214check_packed(_mm_loadu_si128(reinterpret_cast<__m128i *>(buf + 1)));215
216return _mm_testz_si128(error, error);217}218#endif219
220static constexpr bool is_fixed_to_constant = false;221
222static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray<UInt8> & res)223{224size_t size = offsets.size();225size_t prev_offset = 0;226for (size_t i = 0; i < size; ++i)227{228res[i] = isValidUTF8(data.data() + prev_offset, offsets[i] - 1 - prev_offset);229prev_offset = offsets[i];230}231}232
233static void vectorFixedToConstant(const ColumnString::Chars & /*data*/, size_t /*n*/, UInt8 & /*res*/) {}234
235static void vectorFixedToVector(const ColumnString::Chars & data, size_t n, PaddedPODArray<UInt8> & res)236{237size_t size = data.size() / n;238for (size_t i = 0; i < size; ++i)239res[i] = isValidUTF8(data.data() + i * n, n);240}241
242[[noreturn]] static void array(const ColumnString::Offsets &, PaddedPODArray<UInt8> &)243{244throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Cannot apply function isValidUTF8 to Array argument");245}246
247[[noreturn]] static void uuid(const ColumnUUID::Container &, size_t &, PaddedPODArray<UInt8> &)248{249throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Cannot apply function isValidUTF8 to UUID argument");250}251
252[[noreturn]] static void ipv6(const ColumnIPv6::Container &, size_t &, PaddedPODArray<UInt8> &)253{254throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Cannot apply function isValidUTF8 to IPv6 argument");255}256
257[[noreturn]] static void ipv4(const ColumnIPv4::Container &, size_t &, PaddedPODArray<UInt8> &)258{259throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Cannot apply function isValidUTF8 to IPv4 argument");260}261};262
263struct NameIsValidUTF8264{
265static constexpr auto name = "isValidUTF8";266};267using FunctionValidUTF8 = FunctionStringOrArrayToT<ValidUTF8Impl, NameIsValidUTF8, UInt8>;268
269REGISTER_FUNCTION(IsValidUTF8)270{
271factory.registerFunction<FunctionValidUTF8>();272}
273
274}
275