ClickHouse
171 строка · 5.3 Кб
1#include <DataTypes/DataTypeString.h>
2#include <Functions/FunctionFactory.h>
3#include <Functions/FunctionStringToString.h>
4#include <IO/WriteBufferFromVector.h>
5#include <IO/WriteHelpers.h>
6#include <Poco/UTF8Encoding.h>
7
8#include <string_view>
9
10#include <base/simd.h>
11
12#ifdef __SSE2__
13# include <emmintrin.h>
14#endif
15
16#if defined(__aarch64__) && defined(__ARM_NEON)
17# include <arm_neon.h>
18# pragma clang diagnostic ignored "-Wreserved-identifier"
19#endif
20
21namespace DB
22{
23
/// Error codes referenced in this file. They are only declared here (`extern`);
/// the definitions live elsewhere.
namespace ErrorCodes
{
    extern const int ILLEGAL_COLUMN; /// Thrown by ToValidUTF8Impl::vectorFixed.
}
28
/// Lookup table with one entry per byte value. toValidUTF8One indexes it with the first byte
/// of a sequence to get the length of that sequence in bytes.
/// Only declared here (`extern`); the definition lives elsewhere.
extern const UInt8 length_of_utf8_sequence[256];
30
31namespace
32{
33
34struct ToValidUTF8Impl
35{
36static void toValidUTF8One(const char * begin, const char * end, WriteBuffer & write_buffer)
37{
38static constexpr std::string_view replacement = "\xEF\xBF\xBD";
39
40const char * p = begin;
41const char * valid_start = begin;
42
43/// The last recorded character was `replacement`.
44bool just_put_replacement = false;
45
46auto put_valid = [&write_buffer, &just_put_replacement](const char * data, size_t len)
47{
48if (len == 0)
49return;
50just_put_replacement = false;
51write_buffer.write(data, len);
52};
53
54auto put_replacement = [&write_buffer, &just_put_replacement]()
55{
56if (just_put_replacement)
57return;
58just_put_replacement = true;
59write_buffer.write(replacement.data(), replacement.size());
60};
61
62while (p < end)
63{
64#ifdef __SSE2__
65/// Fast skip of ASCII
66static constexpr size_t SIMD_BYTES = 16;
67const char * simd_end = p + (end - p) / SIMD_BYTES * SIMD_BYTES;
68
69while (p < simd_end && !_mm_movemask_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(p))))
70p += SIMD_BYTES;
71
72if (!(p < end))
73break;
74#elif defined(__aarch64__) && defined(__ARM_NEON)
75/// Fast skip of ASCII for aarch64.
76static constexpr size_t SIMD_BYTES = 16;
77const char * simd_end = p + (end - p) / SIMD_BYTES * SIMD_BYTES;
78/// Other options include
79/// vmaxvq_u8(input) < 0b10000000;
80/// Used by SIMDJSON, has latency 3 for M1, 6 for everything else
81/// SIMDJSON uses it for 64 byte masks, so it's a little different.
82/// vmaxvq_u32(vandq_u32(input, vdupq_n_u32(0x80808080))) // u32 version has latency 3
83/// shrn version has universally <=3 cycles, on servers 2 cycles.
84while (p < simd_end && getNibbleMask(vcgeq_u8(vld1q_u8(reinterpret_cast<const uint8_t *>(p)), vdupq_n_u8(0x80))) == 0)
85p += SIMD_BYTES;
86
87if (!(p < end))
88break;
89#endif
90
91size_t len = length_of_utf8_sequence[static_cast<unsigned char>(*p)];
92
93if (len > 4)
94{
95/// Invalid start of sequence. Skip one byte.
96put_valid(valid_start, p - valid_start);
97put_replacement();
98++p;
99valid_start = p;
100}
101else if (p + len > end)
102{
103/// Sequence was not fully written to this buffer.
104break;
105}
106else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<const unsigned char *>(p), static_cast<int>(len)))
107{
108/// Valid sequence.
109p += len;
110}
111else
112{
113/// Invalid sequence. Skip just first byte.
114put_valid(valid_start, p - valid_start);
115put_replacement();
116++p;
117valid_start = p;
118}
119}
120
121put_valid(valid_start, p - valid_start);
122
123if (p != end)
124put_replacement();
125}
126
127static void vector(
128const ColumnString::Chars & data,
129const ColumnString::Offsets & offsets,
130ColumnString::Chars & res_data,
131ColumnString::Offsets & res_offsets)
132{
133const size_t offsets_size = offsets.size();
134/// It can be larger than that, but we believe it is unlikely to happen.
135res_data.resize(data.size());
136res_offsets.resize(offsets_size);
137
138size_t prev_offset = 0;
139WriteBufferFromVector<ColumnString::Chars> write_buffer(res_data);
140for (size_t i = 0; i < offsets_size; ++i)
141{
142const char * haystack_data = reinterpret_cast<const char *>(&data[prev_offset]);
143const size_t haystack_size = offsets[i] - prev_offset - 1;
144toValidUTF8One(haystack_data, haystack_data + haystack_size, write_buffer);
145writeChar(0, write_buffer);
146res_offsets[i] = write_buffer.count();
147prev_offset = offsets[i];
148}
149write_buffer.finalize();
150}
151
152[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
153{
154throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported by toValidUTF8 function");
155}
156};
157
/// Name tag for the function; passed as the second template argument of FunctionStringToString.
struct NameToValidUTF8
{
    static constexpr const char * name = "toValidUTF8";
};
/// The function class: FunctionStringToString instantiated with ToValidUTF8Impl as the
/// implementation and NameToValidUTF8 as the name tag.
using FunctionToValidUTF8 = FunctionStringToString<ToValidUTF8Impl, NameToValidUTF8>;
163
164}
165
/// Registers FunctionToValidUTF8 in the function factory.
REGISTER_FUNCTION(ToValidUTF8)
{
    factory.registerFunction<FunctionToValidUTF8>();
}
170
171}
172