ClickHouse

Форк
0
/
toValidUTF8.cpp 
171 строка · 5.3 Кб
1
#include <DataTypes/DataTypeString.h>
2
#include <Functions/FunctionFactory.h>
3
#include <Functions/FunctionStringToString.h>
4
#include <IO/WriteBufferFromVector.h>
5
#include <IO/WriteHelpers.h>
6
#include <Poco/UTF8Encoding.h>
7

8
#include <string_view>
9

10
#include <base/simd.h>
11

12
#ifdef __SSE2__
13
#    include <emmintrin.h>
14
#endif
15

16
#if defined(__aarch64__) && defined(__ARM_NEON)
17
#    include <arm_neon.h>
18
#      pragma clang diagnostic ignored "-Wreserved-identifier"
19
#endif
20

21
namespace DB
22
{
23

24
/// Error codes used in this file. Only declared here (extern); the definitions live elsewhere.
/// ILLEGAL_COLUMN is the code thrown by ToValidUTF8Impl::vectorFixed below.
namespace ErrorCodes
25
{
26
    extern const int ILLEGAL_COLUMN;
27
}
28

29
/// Lookup table indexed by a byte value; defined in another translation unit.
/// In this file it is read with the first byte of a sequence to obtain the sequence length;
/// an entry greater than 4 is treated as an invalid leading byte.
extern const UInt8 length_of_utf8_sequence[256];
30

31
namespace
32
{
33

34
struct ToValidUTF8Impl
35
{
36
    static void toValidUTF8One(const char * begin, const char * end, WriteBuffer & write_buffer)
37
    {
38
        static constexpr std::string_view replacement = "\xEF\xBF\xBD";
39

40
        const char * p = begin;
41
        const char * valid_start = begin;
42

43
        /// The last recorded character was `replacement`.
44
        bool just_put_replacement = false;
45

46
        auto put_valid = [&write_buffer, &just_put_replacement](const char * data, size_t len)
47
        {
48
            if (len == 0)
49
                return;
50
            just_put_replacement = false;
51
            write_buffer.write(data, len);
52
        };
53

54
        auto put_replacement = [&write_buffer, &just_put_replacement]()
55
        {
56
            if (just_put_replacement)
57
                return;
58
            just_put_replacement = true;
59
            write_buffer.write(replacement.data(), replacement.size());
60
        };
61

62
        while (p < end)
63
        {
64
#ifdef __SSE2__
65
            /// Fast skip of ASCII
66
            static constexpr size_t SIMD_BYTES = 16;
67
            const char * simd_end = p + (end - p) / SIMD_BYTES * SIMD_BYTES;
68

69
            while (p < simd_end && !_mm_movemask_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(p))))
70
                p += SIMD_BYTES;
71

72
            if (!(p < end))
73
                break;
74
#elif defined(__aarch64__) && defined(__ARM_NEON)
75
            /// Fast skip of ASCII for aarch64.
76
            static constexpr size_t SIMD_BYTES = 16;
77
            const char * simd_end = p + (end - p) / SIMD_BYTES * SIMD_BYTES;
78
            /// Other options include
79
            /// vmaxvq_u8(input) < 0b10000000;
80
            /// Used by SIMDJSON, has latency 3 for M1, 6 for everything else
81
            /// SIMDJSON uses it for 64 byte masks, so it's a little different.
82
            /// vmaxvq_u32(vandq_u32(input, vdupq_n_u32(0x80808080))) // u32 version has latency 3
83
            /// shrn version has universally <=3 cycles, on servers 2 cycles.
84
            while (p < simd_end && getNibbleMask(vcgeq_u8(vld1q_u8(reinterpret_cast<const uint8_t *>(p)), vdupq_n_u8(0x80))) == 0)
85
                p += SIMD_BYTES;
86

87
            if (!(p < end))
88
                break;
89
#endif
90

91
            size_t len = length_of_utf8_sequence[static_cast<unsigned char>(*p)];
92

93
            if (len > 4)
94
            {
95
                /// Invalid start of sequence. Skip one byte.
96
                put_valid(valid_start, p - valid_start);
97
                put_replacement();
98
                ++p;
99
                valid_start = p;
100
            }
101
            else if (p + len > end)
102
            {
103
                /// Sequence was not fully written to this buffer.
104
                break;
105
            }
106
            else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<const unsigned char *>(p), static_cast<int>(len)))
107
            {
108
                /// Valid sequence.
109
                p += len;
110
            }
111
            else
112
            {
113
                /// Invalid sequence. Skip just first byte.
114
                put_valid(valid_start, p - valid_start);
115
                put_replacement();
116
                ++p;
117
                valid_start = p;
118
            }
119
        }
120

121
        put_valid(valid_start, p - valid_start);
122

123
        if (p != end)
124
            put_replacement();
125
    }
126

127
    static void vector(
128
        const ColumnString::Chars & data,
129
        const ColumnString::Offsets & offsets,
130
        ColumnString::Chars & res_data,
131
        ColumnString::Offsets & res_offsets)
132
    {
133
        const size_t offsets_size = offsets.size();
134
        /// It can be larger than that, but we believe it is unlikely to happen.
135
        res_data.resize(data.size());
136
        res_offsets.resize(offsets_size);
137

138
        size_t prev_offset = 0;
139
        WriteBufferFromVector<ColumnString::Chars> write_buffer(res_data);
140
        for (size_t i = 0; i < offsets_size; ++i)
141
        {
142
            const char * haystack_data = reinterpret_cast<const char *>(&data[prev_offset]);
143
            const size_t haystack_size = offsets[i] - prev_offset - 1;
144
            toValidUTF8One(haystack_data, haystack_data + haystack_size, write_buffer);
145
            writeChar(0, write_buffer);
146
            res_offsets[i] = write_buffer.count();
147
            prev_offset = offsets[i];
148
        }
149
        write_buffer.finalize();
150
    }
151

152
    [[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
153
    {
154
        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported by toValidUTF8 function");
155
    }
156
};
157

158
/// Carries the function name as a compile-time constant; passed as the name argument
/// of the FunctionStringToString template.
struct NameToValidUTF8
{
    static constexpr auto name{"toValidUTF8"};
};
162
/// The function class: the FunctionStringToString template instantiated with
/// ToValidUTF8Impl as the implementation and NameToValidUTF8 as the name provider.
using FunctionToValidUTF8 = FunctionStringToString<ToValidUTF8Impl, NameToValidUTF8>;
163

164
}
165

166
/// Registration hook for the function.
/// NOTE(review): `factory` is not declared in this file; presumably it is introduced by the
/// REGISTER_FUNCTION macro - confirm against the macro definition.
REGISTER_FUNCTION(ToValidUTF8)
167
{
168
    /// Make FunctionToValidUTF8 known to the factory.
    factory.registerFunction<FunctionToValidUTF8>();
169
}
170

171
}
172

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.