// ClickHouse
// 228 lines · 7.6 KB
1#include <Columns/ColumnString.h>
2#include <Functions/FunctionFactory.h>
3#include <Functions/FunctionStringToString.h>
4#include <Functions/HTMLCharacterReference.h>
5#include <base/find_symbols.h>
6#include <base/hex.h>
7#include <Common/StringUtils/StringUtils.h>
8
9
10namespace DB
11{
12namespace ErrorCodes
13{
14extern const int ILLEGAL_TYPE_OF_ARGUMENT;
15}
16
17namespace
18{
/// Name tag for the function: carries the identifier "decodeHTMLComponent"
/// as a compile-time constant.
struct DecodeHTMLComponentName
{
    static constexpr const char * name = "decodeHTMLComponent";
};
23
24class FunctionDecodeHTMLComponentImpl
25{
26public:
27static void vector(
28const ColumnString::Chars & data,
29const ColumnString::Offsets & offsets,
30ColumnString::Chars & res_data,
31ColumnString::Offsets & res_offsets)
32{
33/// The size of result is always not more than the size of source.
34/// Because entities decodes to the shorter byte sequence.
35/// Example: &#xx... &#xx... will decode to UTF-8 byte sequence not longer than 4 bytes.
36res_data.resize(data.size());
37
38size_t size = offsets.size();
39res_offsets.resize(size);
40
41size_t prev_offset = 0;
42size_t res_offset = 0;
43
44for (size_t i = 0; i < size; ++i)
45{
46const char * src_data = reinterpret_cast<const char *>(&data[prev_offset]);
47size_t src_size = offsets[i] - prev_offset;
48size_t dst_size = execute(src_data, src_size, reinterpret_cast<char *>(res_data.data() + res_offset));
49
50res_offset += dst_size;
51res_offsets[i] = res_offset;
52prev_offset = offsets[i];
53}
54
55res_data.resize(res_offset);
56}
57
58[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
59{
60throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function decodeHTMLComponent cannot work with FixedString argument");
61}
62
63private:
64static const int max_legal_unicode_value = 0x10FFFF;
65static const int max_decimal_length_of_unicode_point = 7; /// 1114111
66
67
68static size_t execute(const char * src, size_t src_size, char * dst)
69{
70const char * src_pos = src;
71const char * src_end = src + src_size;
72char * dst_pos = dst;
73
74// to hold char seq for lookup, reuse it
75std::vector<char> seq;
76while (true)
77{
78const char * entity_pos = find_first_symbols<'&'>(src_pos, src_end);
79
80/// Copy text between entities.
81size_t bytes_to_copy = entity_pos - src_pos;
82memcpySmallAllowReadWriteOverflow15(dst_pos, src_pos, bytes_to_copy);
83dst_pos += bytes_to_copy;
84src_pos = entity_pos;
85
86++entity_pos;
87
88const char * entity_end = find_first_symbols<';'>(entity_pos, src_end);
89
90if (entity_end == src_end)
91break;
92
93bool parsed = false;
94
95/// covers &#NNNN; or &#xNNNN hexadecimal values;
96uint32_t code_point = 0;
97if (isValidNumericEntity(entity_pos, entity_end, code_point))
98{
99codePointToUTF8(code_point, dst_pos);
100parsed = true;
101}
102else /// covers html encoded character sequences
103{
104// seq_length should also include `;` at the end
105size_t seq_length = (entity_end - entity_pos) + 1;
106seq.assign(entity_pos, entity_pos + seq_length);
107// null terminate the sequence
108seq.push_back('\0');
109// lookup the html sequence in the perfect hashmap.
110const auto * res = HTMLCharacterHash::Lookup(seq.data(), strlen(seq.data()));
111// reset so that it's reused in the next iteration
112seq.clear();
113if (res)
114{
115const auto * glyph = res->glyph;
116for (size_t i = 0; i < strlen(glyph); ++i)
117{
118*dst_pos = glyph[i];
119++dst_pos;
120}
121parsed = true;
122}
123else
124parsed = false;
125}
126
127if (parsed)
128{
129/// Skip the parsed entity.
130src_pos = entity_end + 1;
131}
132else
133{
134/// Copy one byte as is and skip it.
135*dst_pos = *src_pos;
136++dst_pos;
137++src_pos;
138}
139}
140
141/// Copy the rest of the string.
142if (src_pos < src_end)
143{
144size_t bytes_to_copy = src_end - src_pos;
145memcpySmallAllowReadWriteOverflow15(dst_pos, src_pos, bytes_to_copy);
146dst_pos += bytes_to_copy;
147}
148
149return dst_pos - dst;
150}
151
152static size_t codePointToUTF8(uint32_t code_point, char *& dst_pos)
153{
154if (code_point < (1 << 7))
155{
156dst_pos[0] = (code_point & 0x7F);
157++dst_pos;
158return 1;
159}
160else if (code_point < (1 << 11))
161{
162dst_pos[0] = ((code_point >> 6) & 0x1F) + 0xC0;
163dst_pos[1] = (code_point & 0x3F) + 0x80;
164dst_pos += 2;
165return 2;
166}
167else if (code_point < (1 << 16))
168{
169dst_pos[0] = ((code_point >> 12) & 0x0F) + 0xE0;
170dst_pos[1] = ((code_point >> 6) & 0x3F) + 0x80;
171dst_pos[2] = (code_point & 0x3F) + 0x80;
172dst_pos += 3;
173return 3;
174}
175else
176{
177dst_pos[0] = ((code_point >> 18) & 0x07) + 0xF0;
178dst_pos[1] = ((code_point >> 12) & 0x3F) + 0x80;
179dst_pos[2] = ((code_point >> 6) & 0x3F) + 0x80;
180dst_pos[3] = (code_point & 0x3F) + 0x80;
181dst_pos += 4;
182return 4;
183}
184}
185
186[[maybe_unused]] static bool isValidNumericEntity(const char * src, const char * end, uint32_t & code_point)
187{
188if (src + strlen("#") >= end)
189return false;
190if (src[0] != '#' || (end - src > 1 + max_decimal_length_of_unicode_point))
191return false;
192
193if (src + 2 < end && (src[1] == 'x' || src[1] == 'X'))
194{
195src += 2;
196for (; src < end; ++src)
197{
198if (!isHexDigit(*src))
199return false;
200code_point *= 16;
201code_point += unhex(*src);
202}
203}
204else
205{
206src += 1;
207for (; src < end; ++src)
208{
209if (!isNumericASCII(*src))
210return false;
211code_point *= 10;
212code_point += *src - '0';
213}
214}
215
216return code_point <= max_legal_unicode_value;
217}
218};
219
/// The function type: the FunctionStringToString template instantiated with the
/// decoding implementation and the name tag defined above.
using FunctionDecodeHTMLComponent = FunctionStringToString<FunctionDecodeHTMLComponentImpl, DecodeHTMLComponentName>;
221
222}
223
/// Registers FunctionDecodeHTMLComponent in the function factory.
/// NOTE(review): the name it is exposed under presumably comes from
/// DecodeHTMLComponentName::name ("decodeHTMLComponent") — confirm against FunctionStringToString.
REGISTER_FUNCTION(DecodeHTMLComponent)
{
    factory.registerFunction<FunctionDecodeHTMLComponent>();
}
228}
229