ClickHouse

Форк
0
/
punycode.cpp 
202 строки · 7.1 Кб
1
#include "config.h"
2

3
#if USE_IDNA
4

5
#include <Columns/ColumnString.h>
6
#include <Functions/FunctionFactory.h>
7
#include <Functions/FunctionStringToString.h>
8

9
#    pragma clang diagnostic push
10
#    pragma clang diagnostic ignored "-Wnewline-eof"
11
#    include <ada/idna/punycode.h>
12
#    include <ada/idna/unicode_transcoding.h>
13
#    pragma clang diagnostic pop
14

15
namespace DB
16
{
17

18
namespace ErrorCodes
19
{
20
    extern const int BAD_ARGUMENTS;
21
    extern const int NOT_IMPLEMENTED;
22
}
23

24
/// Implementation of
25
/// - punycodeEncode(), punycodeDecode() and tryPunycodeDecode(), see https://en.wikipedia.org/wiki/Punycode
26

27
/// Selects how a conversion reacts to input it cannot process.
enum class ErrorHandling
{
    /// Throw exception
    Throw,

    /// Return empty string
    Empty,
};
32

33

34
struct PunycodeEncode
35
{
36
    /// Encoding-as-punycode can only fail if the input isn't valid UTF8. In that case, return undefined output, i.e. garbage-in, garbage-out.
37
    static void vector(
38
        const ColumnString::Chars & data,
39
        const ColumnString::Offsets & offsets,
40
        ColumnString::Chars & res_data,
41
        ColumnString::Offsets & res_offsets)
42
    {
43
        const size_t rows = offsets.size();
44
        res_data.reserve(data.size()); /// just a guess, assuming the input is all-ASCII
45
        res_offsets.reserve(rows);
46

47
        size_t prev_offset = 0;
48
        std::u32string value_utf32;
49
        std::string value_puny;
50
        for (size_t row = 0; row < rows; ++row)
51
        {
52
            const char * value = reinterpret_cast<const char *>(&data[prev_offset]);
53
            const size_t value_length = offsets[row] - prev_offset - 1;
54

55
            const size_t value_utf32_length = ada::idna::utf32_length_from_utf8(value, value_length);
56
            value_utf32.resize(value_utf32_length);
57
            const size_t codepoints = ada::idna::utf8_to_utf32(value, value_length, value_utf32.data());
58
            if (codepoints == 0)
59
                value_utf32.clear(); /// input was empty or no valid UTF-8
60

61
            const bool ok = ada::idna::utf32_to_punycode(value_utf32, value_puny);
62
            if (!ok)
63
                value_puny.clear();
64

65
            res_data.insert(value_puny.c_str(), value_puny.c_str() + value_puny.size() + 1);
66
            res_offsets.push_back(res_data.size());
67

68
            prev_offset = offsets[row];
69

70
            value_utf32.clear();
71
            value_puny.clear(); /// utf32_to_punycode() appends to its output string
72
        }
73
    }
74

75
    [[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
76
    {
77
        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Arguments of type FixedString are not allowed");
78
    }
79
};
80

81

82
template <ErrorHandling error_handling>
83
struct PunycodeDecode
84
{
85
    static void vector(
86
        const ColumnString::Chars & data,
87
        const ColumnString::Offsets & offsets,
88
        ColumnString::Chars & res_data,
89
        ColumnString::Offsets & res_offsets)
90
    {
91
        const size_t rows = offsets.size();
92
        res_data.reserve(data.size()); /// just a guess, assuming the input is all-ASCII
93
        res_offsets.reserve(rows);
94

95
        size_t prev_offset = 0;
96
        std::u32string value_utf32;
97
        std::string value_utf8;
98
        for (size_t row = 0; row < rows; ++row)
99
        {
100
            const char * value = reinterpret_cast<const char *>(&data[prev_offset]);
101
            const size_t value_length = offsets[row] - prev_offset - 1;
102

103
            const std::string_view value_punycode(value, value_length);
104
            const bool ok = ada::idna::punycode_to_utf32(value_punycode, value_utf32);
105
            if (!ok)
106
            {
107
                if constexpr (error_handling == ErrorHandling::Throw)
108
                {
109
                    throw Exception(ErrorCodes::BAD_ARGUMENTS, "'{}' is not a valid Punycode-encoded string", value_punycode);
110
                }
111
                else
112
                {
113
                    static_assert(error_handling == ErrorHandling::Empty);
114
                    value_utf32.clear();
115
                }
116
            }
117

118
            const size_t utf8_length = ada::idna::utf8_length_from_utf32(value_utf32.data(), value_utf32.size());
119
            value_utf8.resize(utf8_length);
120
            ada::idna::utf32_to_utf8(value_utf32.data(), value_utf32.size(), value_utf8.data());
121

122
            res_data.insert(value_utf8.c_str(), value_utf8.c_str() + value_utf8.size() + 1);
123
            res_offsets.push_back(res_data.size());
124

125
            prev_offset = offsets[row];
126

127
            value_utf32.clear(); /// punycode_to_utf32() appends to its output string
128
            value_utf8.clear();
129
        }
130
    }
131

132
    [[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
133
    {
134
        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Arguments of type FixedString are not allowed");
135
    }
136
};
137

138
/// Tag types which carry the SQL-visible function names.
struct NamePunycodeEncode
{
    static constexpr auto name = "punycodeEncode";
};

struct NamePunycodeDecode
{
    static constexpr auto name = "punycodeDecode";
};

struct NameTryPunycodeDecode
{
    static constexpr auto name = "tryPunycodeDecode";
};
141

142
/// String --> String functions built from the implementation structs and their name tags.
using FunctionPunycodeEncode
    = FunctionStringToString<PunycodeEncode, NamePunycodeEncode>;

/// Throwing flavor of the decoder.
using FunctionPunycodeDecode
    = FunctionStringToString<PunycodeDecode<ErrorHandling::Throw>, NamePunycodeDecode>;

/// Non-throwing flavor of the decoder, yields an empty string for invalid input.
using FunctionTryPunycodeDecode
    = FunctionStringToString<PunycodeDecode<ErrorHandling::Empty>, NameTryPunycodeDecode>;
145

146
/// Registers punycodeEncode(), punycodeDecode() and tryPunycodeDecode() in the function factory,
/// together with their user-facing documentation.
REGISTER_FUNCTION(Punycode)
{
    factory.registerFunction<FunctionPunycodeEncode>(FunctionDocumentation{
        .description=R"(
Computes a Punycode representation of a string.)",
        .syntax="punycodeEncode(str)",
        .arguments={{"str", "Input string"}},
        .returned_value="The punycode representation [String](/docs/en/sql-reference/data-types/string.md).",
        .examples={
            {"simple",
            "SELECT punycodeEncode('München') AS puny;",
            R"(
┌─puny───────┐
│ Mnchen-3ya │
└────────────┘
            )"
            }}
    });

    /// The description must talk about decoding: this function is the inverse of punycodeEncode().
    factory.registerFunction<FunctionPunycodeDecode>(FunctionDocumentation{
        .description=R"(
Computes the plaintext of a Punycode-encoded string. Throws an exception if the input is not valid Punycode.)",
        .syntax="punycodeDecode(str)",
        .arguments={{"str", "A Punycode-encoded string"}},
        .returned_value="The plaintext representation [String](/docs/en/sql-reference/data-types/string.md).",
        .examples={
            {"simple",
            "SELECT punycodeDecode('Mnchen-3ya') AS plain;",
            R"(
┌─plain───┐
│ München │
└─────────┘
            )"
            }}
    });

    /// Same as above but non-throwing. The syntax string must carry the function's own name.
    factory.registerFunction<FunctionTryPunycodeDecode>(FunctionDocumentation{
        .description=R"(
Computes the plaintext of a Punycode-encoded string. Returns an empty string if the input is not valid Punycode.)",
        .syntax="tryPunycodeDecode(str)",
        .arguments={{"str", "A Punycode-encoded string"}},
        .returned_value="The plaintext representation [String](/docs/en/sql-reference/data-types/string.md).",
        .examples={
            {"simple",
            "SELECT tryPunycodeDecode('Mnchen-3ya') AS plain;",
            R"(
┌─plain───┐
│ München │
└─────────┘
            )"
            }}
    });
}
199

200
}
201

202
#endif
203

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.