ClickHouse
84 строки · 2.3 Кб
1#include <DataTypes/DataTypeString.h>
2#include <Columns/ColumnString.h>
3#include <Functions/FunctionFactory.h>
4#include <Functions/FunctionStringToString.h>
5
6
7namespace DB
8{
9namespace ErrorCodes
10{
11extern const int ILLEGAL_COLUMN;
12}
13
14namespace
15{
16
17/** Reverse the sequence of code points in a UTF-8 encoded string.
18* The result may not match the expected result, because modifying code points (for example, diacritics) may be applied to another symbols.
19* If the string is not encoded in UTF-8, then the behavior is undefined.
20*/
21struct ReverseUTF8Impl
22{
23static void vector(const ColumnString::Chars & data,
24const ColumnString::Offsets & offsets,
25ColumnString::Chars & res_data,
26ColumnString::Offsets & res_offsets)
27{
28res_data.resize(data.size());
29res_offsets.assign(offsets);
30size_t size = offsets.size();
31
32ColumnString::Offset prev_offset = 0;
33for (size_t i = 0; i < size; ++i)
34{
35ColumnString::Offset j = prev_offset;
36while (j < offsets[i] - 1)
37{
38if (data[j] < 0xBF)
39{
40res_data[offsets[i] + prev_offset - 2 - j] = data[j];
41j += 1;
42}
43else if (data[j] < 0xE0)
44{
45memcpy(&res_data[offsets[i] + prev_offset - 2 - j - 1], &data[j], 2);
46j += 2;
47}
48else if (data[j] < 0xF0)
49{
50memcpy(&res_data[offsets[i] + prev_offset - 2 - j - 2], &data[j], 3);
51j += 3;
52}
53else
54{
55res_data[offsets[i] + prev_offset - 2 - j] = data[j];
56j += 1;
57}
58}
59
60res_data[offsets[i] - 1] = 0;
61prev_offset = offsets[i];
62}
63}
64
65[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
66{
67throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot apply function reverseUTF8 to fixed string.");
68}
69};
70
/// Name tag supplied as a template argument to FunctionStringToString for this function.
struct NameReverseUTF8
{
    static constexpr const char * name = "reverseUTF8";
};
/// The reverseUTF8 function type: FunctionStringToString instantiated with the implementation and name tag defined in this file.
/// NOTE(review): the trailing `true` is presumably the injectivity flag of FunctionStringToString - confirm against its declaration.
using FunctionReverseUTF8 = FunctionStringToString<ReverseUTF8Impl, NameReverseUTF8, true>;
76
77}
78
/// Registers FunctionReverseUTF8 with the function factory.
REGISTER_FUNCTION(ReverseUTF8)
{
    factory.registerFunction<FunctionReverseUTF8>();
}
83
84}
85