ClickHouse

Форк
0
/
decodeHTMLComponent.cpp 
228 строк · 7.6 Кб
1
#include <Columns/ColumnString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h>
#include <Functions/HTMLCharacterReference.h>
#include <base/find_symbols.h>
#include <base/hex.h>
#include <Common/StringUtils/StringUtils.h>

#include <cstring>
#include <vector>
8

9

10
namespace DB
11
{
12
namespace ErrorCodes
{
    /// Error code used by this file when the function is applied to a FixedString argument.
    /// Only declared here; the definition lives outside this file.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
16

17
namespace
18
{
19
    /// Name tag passed as a template argument to FunctionStringToString;
    /// `name` is the identifier of the function.
    struct DecodeHTMLComponentName
    {
        static constexpr auto name = "decodeHTMLComponent";
    };
23

24
    class FunctionDecodeHTMLComponentImpl
25
    {
26
    public:
27
        static void vector(
28
            const ColumnString::Chars & data,
29
            const ColumnString::Offsets & offsets,
30
            ColumnString::Chars & res_data,
31
            ColumnString::Offsets & res_offsets)
32
        {
33
            /// The size of result is always not more than the size of source.
34
            /// Because entities decodes to the shorter byte sequence.
35
            /// Example: &#xx... &#xx... will decode to UTF-8 byte sequence not longer than 4 bytes.
36
            res_data.resize(data.size());
37

38
            size_t size = offsets.size();
39
            res_offsets.resize(size);
40

41
            size_t prev_offset = 0;
42
            size_t res_offset = 0;
43

44
            for (size_t i = 0; i < size; ++i)
45
            {
46
                const char * src_data = reinterpret_cast<const char *>(&data[prev_offset]);
47
                size_t src_size = offsets[i] - prev_offset;
48
                size_t dst_size = execute(src_data, src_size, reinterpret_cast<char *>(res_data.data() + res_offset));
49

50
                res_offset += dst_size;
51
                res_offsets[i] = res_offset;
52
                prev_offset = offsets[i];
53
            }
54

55
            res_data.resize(res_offset);
56
        }
57

58
        /// FixedString input is not supported: always throws ILLEGAL_TYPE_OF_ARGUMENT.
        [[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
        {
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function decodeHTMLComponent cannot work with FixedString argument");
        }
62

63
    private:
64
        /// Upper bound accepted for a numeric character reference.
        static constexpr int max_legal_unicode_value = 0x10FFFF;
        /// Number of decimal digits in max_legal_unicode_value (1114111).
        static constexpr int max_decimal_length_of_unicode_point = 7;
66

67

68
        static size_t execute(const char * src, size_t src_size, char * dst)
69
        {
70
            const char * src_pos = src;
71
            const char * src_end = src + src_size;
72
            char * dst_pos = dst;
73

74
            // to hold char seq for lookup, reuse it
75
            std::vector<char> seq;
76
            while (true)
77
            {
78
                const char * entity_pos = find_first_symbols<'&'>(src_pos, src_end);
79

80
                /// Copy text between entities.
81
                size_t bytes_to_copy = entity_pos - src_pos;
82
                memcpySmallAllowReadWriteOverflow15(dst_pos, src_pos, bytes_to_copy);
83
                dst_pos += bytes_to_copy;
84
                src_pos = entity_pos;
85

86
                ++entity_pos;
87

88
                const char * entity_end = find_first_symbols<';'>(entity_pos, src_end);
89

90
                if (entity_end == src_end)
91
                    break;
92

93
                bool parsed = false;
94

95
                /// covers &#NNNN; or &#xNNNN hexadecimal values;
96
                uint32_t code_point = 0;
97
                if (isValidNumericEntity(entity_pos, entity_end, code_point))
98
                {
99
                    codePointToUTF8(code_point, dst_pos);
100
                    parsed = true;
101
                }
102
                else /// covers html encoded character sequences
103
                {
104
                    // seq_length should also include `;` at the end
105
                    size_t seq_length = (entity_end - entity_pos) + 1;
106
                    seq.assign(entity_pos, entity_pos + seq_length);
107
                    // null terminate the sequence
108
                    seq.push_back('\0');
109
                    // lookup the html sequence in the perfect hashmap.
110
                    const auto * res = HTMLCharacterHash::Lookup(seq.data(), strlen(seq.data()));
111
                    // reset so that it's reused in the next iteration
112
                    seq.clear();
113
                    if (res)
114
                    {
115
                        const auto * glyph = res->glyph;
116
                        for (size_t i = 0; i < strlen(glyph); ++i)
117
                        {
118
                            *dst_pos = glyph[i];
119
                            ++dst_pos;
120
                        }
121
                        parsed = true;
122
                    }
123
                    else
124
                        parsed = false;
125
                }
126

127
                if (parsed)
128
                {
129
                    /// Skip the parsed entity.
130
                    src_pos = entity_end + 1;
131
                }
132
                else
133
                {
134
                    /// Copy one byte as is and skip it.
135
                    *dst_pos = *src_pos;
136
                    ++dst_pos;
137
                    ++src_pos;
138
                }
139
            }
140

141
            /// Copy the rest of the string.
142
            if (src_pos < src_end)
143
            {
144
                size_t bytes_to_copy = src_end - src_pos;
145
                memcpySmallAllowReadWriteOverflow15(dst_pos, src_pos, bytes_to_copy);
146
                dst_pos += bytes_to_copy;
147
            }
148

149
            return dst_pos - dst;
150
        }
151

152
        /** Writes the UTF-8 encoding of code_point at dst_pos and advances dst_pos past it.
          *
          * Returns the number of bytes written (1 to 4).
          * No range validation is done here: any value of 0x10000 or above is written as four bytes.
          */
        static size_t codePointToUTF8(uint32_t code_point, char *& dst_pos)
        {
            /// Continuation byte (10xxxxxx) carrying six bits of the code point starting at `shift`.
            const auto continuation = [code_point](unsigned shift)
            {
                return static_cast<char>(0x80 | ((code_point >> shift) & 0x3F));
            };

            size_t length = 0;

            if (code_point < 0x80)
            {
                /// 0xxxxxxx
                dst_pos[0] = static_cast<char>(code_point & 0x7F);
                length = 1;
            }
            else if (code_point < 0x800)
            {
                /// 110xxxxx 10xxxxxx
                dst_pos[0] = static_cast<char>(0xC0 | ((code_point >> 6) & 0x1F));
                dst_pos[1] = continuation(0);
                length = 2;
            }
            else if (code_point < 0x10000)
            {
                /// 1110xxxx 10xxxxxx 10xxxxxx
                dst_pos[0] = static_cast<char>(0xE0 | ((code_point >> 12) & 0x0F));
                dst_pos[1] = continuation(6);
                dst_pos[2] = continuation(0);
                length = 3;
            }
            else
            {
                /// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                dst_pos[0] = static_cast<char>(0xF0 | ((code_point >> 18) & 0x07));
                dst_pos[1] = continuation(12);
                dst_pos[2] = continuation(6);
                dst_pos[3] = continuation(0);
                length = 4;
            }

            dst_pos += length;
            return length;
        }
185

186
        [[maybe_unused]] static bool isValidNumericEntity(const char * src, const char * end, uint32_t & code_point)
187
        {
188
            if (src + strlen("#") >= end)
189
                return false;
190
            if (src[0] != '#' || (end - src > 1 + max_decimal_length_of_unicode_point))
191
                return false;
192

193
            if (src + 2 < end && (src[1] == 'x' || src[1] == 'X'))
194
            {
195
                src += 2;
196
                for (; src < end; ++src)
197
                {
198
                    if (!isHexDigit(*src))
199
                        return false;
200
                    code_point *= 16;
201
                    code_point += unhex(*src);
202
                }
203
            }
204
            else
205
            {
206
                src += 1;
207
                for (; src < end; ++src)
208
                {
209
                    if (!isNumericASCII(*src))
210
                        return false;
211
                    code_point *= 10;
212
                    code_point += *src - '0';
213
                }
214
            }
215

216
            return code_point <= max_legal_unicode_value;
217
        }
218
    };
219

220
    /// The function class: FunctionStringToString instantiated with the implementation and the name tag.
    /// This is the type registered in the factory at the bottom of the file.
    using FunctionDecodeHTMLComponent = FunctionStringToString<FunctionDecodeHTMLComponentImpl, DecodeHTMLComponentName>;
221

222
}
223

224
/// Registers FunctionDecodeHTMLComponent in the function factory.
REGISTER_FUNCTION(DecodeHTMLComponent)
{
    factory.registerFunction<FunctionDecodeHTMLComponent>();
}
228
}
229

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.