// ClickHouse
// 202 lines · 7.1 KB
1#include "config.h"
2
3#if USE_IDNA
4
5#include <Columns/ColumnString.h>
6#include <Functions/FunctionFactory.h>
7#include <Functions/FunctionStringToString.h>
8
9# pragma clang diagnostic push
10# pragma clang diagnostic ignored "-Wnewline-eof"
11# include <ada/idna/punycode.h>
12# include <ada/idna/unicode_transcoding.h>
13# pragma clang diagnostic pop
14
15namespace DB
16{
17
/// Error codes referenced by the functions in this file; they are only declared here.
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;   /// used when punycodeDecode() rejects its input
    extern const int NOT_IMPLEMENTED; /// used when a FixedString argument is passed
}
23
24/// Implementation of
25/// - punycodeEncode(), punycodeDecode() and tryPunycodeDecode(), see https://en.wikipedia.org/wiki/Punycode
26
/// Policy selecting how PunycodeDecode reacts when a row cannot be decoded.
enum class ErrorHandling
{
    /// Throw an exception.
    Throw,
    /// Return an empty string for the offending row.
    Empty,
};
32
33
34struct PunycodeEncode
35{
36/// Encoding-as-punycode can only fail if the input isn't valid UTF8. In that case, return undefined output, i.e. garbage-in, garbage-out.
37static void vector(
38const ColumnString::Chars & data,
39const ColumnString::Offsets & offsets,
40ColumnString::Chars & res_data,
41ColumnString::Offsets & res_offsets)
42{
43const size_t rows = offsets.size();
44res_data.reserve(data.size()); /// just a guess, assuming the input is all-ASCII
45res_offsets.reserve(rows);
46
47size_t prev_offset = 0;
48std::u32string value_utf32;
49std::string value_puny;
50for (size_t row = 0; row < rows; ++row)
51{
52const char * value = reinterpret_cast<const char *>(&data[prev_offset]);
53const size_t value_length = offsets[row] - prev_offset - 1;
54
55const size_t value_utf32_length = ada::idna::utf32_length_from_utf8(value, value_length);
56value_utf32.resize(value_utf32_length);
57const size_t codepoints = ada::idna::utf8_to_utf32(value, value_length, value_utf32.data());
58if (codepoints == 0)
59value_utf32.clear(); /// input was empty or no valid UTF-8
60
61const bool ok = ada::idna::utf32_to_punycode(value_utf32, value_puny);
62if (!ok)
63value_puny.clear();
64
65res_data.insert(value_puny.c_str(), value_puny.c_str() + value_puny.size() + 1);
66res_offsets.push_back(res_data.size());
67
68prev_offset = offsets[row];
69
70value_utf32.clear();
71value_puny.clear(); /// utf32_to_punycode() appends to its output string
72}
73}
74
75[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
76{
77throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Arguments of type FixedString are not allowed");
78}
79};
80
81
82template <ErrorHandling error_handling>
83struct PunycodeDecode
84{
85static void vector(
86const ColumnString::Chars & data,
87const ColumnString::Offsets & offsets,
88ColumnString::Chars & res_data,
89ColumnString::Offsets & res_offsets)
90{
91const size_t rows = offsets.size();
92res_data.reserve(data.size()); /// just a guess, assuming the input is all-ASCII
93res_offsets.reserve(rows);
94
95size_t prev_offset = 0;
96std::u32string value_utf32;
97std::string value_utf8;
98for (size_t row = 0; row < rows; ++row)
99{
100const char * value = reinterpret_cast<const char *>(&data[prev_offset]);
101const size_t value_length = offsets[row] - prev_offset - 1;
102
103const std::string_view value_punycode(value, value_length);
104const bool ok = ada::idna::punycode_to_utf32(value_punycode, value_utf32);
105if (!ok)
106{
107if constexpr (error_handling == ErrorHandling::Throw)
108{
109throw Exception(ErrorCodes::BAD_ARGUMENTS, "'{}' is not a valid Punycode-encoded string", value_punycode);
110}
111else
112{
113static_assert(error_handling == ErrorHandling::Empty);
114value_utf32.clear();
115}
116}
117
118const size_t utf8_length = ada::idna::utf8_length_from_utf32(value_utf32.data(), value_utf32.size());
119value_utf8.resize(utf8_length);
120ada::idna::utf32_to_utf8(value_utf32.data(), value_utf32.size(), value_utf8.data());
121
122res_data.insert(value_utf8.c_str(), value_utf8.c_str() + value_utf8.size() + 1);
123res_offsets.push_back(res_data.size());
124
125prev_offset = offsets[row];
126
127value_utf32.clear(); /// punycode_to_utf32() appends to its output string
128value_utf8.clear();
129}
130}
131
132[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
133{
134throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Arguments of type FixedString are not allowed");
135}
136};
137
/// Name tags carrying the SQL-visible function names.
struct NamePunycodeEncode
{
    static constexpr auto name = "punycodeEncode";
};

struct NamePunycodeDecode
{
    static constexpr auto name = "punycodeDecode";
};

struct NameTryPunycodeDecode
{
    static constexpr auto name = "tryPunycodeDecode";
};
141
142using FunctionPunycodeEncode = FunctionStringToString<PunycodeEncode, NamePunycodeEncode>;
143using FunctionPunycodeDecode = FunctionStringToString<PunycodeDecode<ErrorHandling::Throw>, NamePunycodeDecode>;
144using FunctionTryPunycodeDecode = FunctionStringToString<PunycodeDecode<ErrorHandling::Empty>, NameTryPunycodeDecode>;
145
146REGISTER_FUNCTION(Punycode)
147{
148factory.registerFunction<FunctionPunycodeEncode>(FunctionDocumentation{
149.description=R"(
150Computes a Punycode representation of a string.)",
151.syntax="punycodeEncode(str)",
152.arguments={{"str", "Input string"}},
153.returned_value="The punycode representation [String](/docs/en/sql-reference/data-types/string.md).",
154.examples={
155{"simple",
156"SELECT punycodeEncode('München') AS puny;",
157R"(
158┌─puny───────┐
159│ Mnchen-3ya │
160└────────────┘
161)"
162}}
163});
164
165factory.registerFunction<FunctionPunycodeDecode>(FunctionDocumentation{
166.description=R"(
167Computes a Punycode representation of a string. Throws an exception if the input is not valid Punycode.)",
168.syntax="punycodeDecode(str)",
169.arguments={{"str", "A Punycode-encoded string"}},
170.returned_value="The plaintext representation [String](/docs/en/sql-reference/data-types/string.md).",
171.examples={
172{"simple",
173"SELECT punycodeDecode('Mnchen-3ya') AS plain;",
174R"(
175┌─plain───┐
176│ München │
177└─────────┘
178)"
179}}
180});
181
182factory.registerFunction<FunctionTryPunycodeDecode>(FunctionDocumentation{
183.description=R"(
184Computes a Punycode representation of a string. Returns an empty string if the input is not valid Punycode.)",
185.syntax="punycodeDecode(str)",
186.arguments={{"str", "A Punycode-encoded string"}},
187.returned_value="The plaintext representation [String](/docs/en/sql-reference/data-types/string.md).",
188.examples={
189{"simple",
190"SELECT tryPunycodeDecode('Mnchen-3ya') AS plain;",
191R"(
192┌─plain───┐
193│ München │
194└─────────┘
195)"
196}}
197});
198}
199
200}
201
202#endif
203