// ClickHouse
// 114 lines · 3.2 KB
1#include <DataTypes/DataTypeString.h>2#include <Functions/FunctionStringToString.h>3#include <Functions/LowerUpperUTF8Impl.h>4#include <Functions/FunctionFactory.h>5#include <Poco/Unicode.h>6
7
8namespace DB9{
10
/// Error codes referenced by this translation unit; only declared here.
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
}
15
16namespace
17{
18
19struct InitcapUTF8Impl20{
21static void vector(22const ColumnString::Chars & data,23const ColumnString::Offsets & offsets,24ColumnString::Chars & res_data,25ColumnString::Offsets & res_offsets)26{27if (data.empty())28return;29res_data.resize(data.size());30res_offsets.assign(offsets);31array(data.data(), data.data() + data.size(), offsets, res_data.data());32}33
34[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)35{36throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function initcapUTF8 cannot work with FixedString argument");37}38
39static void processCodePoint(const UInt8 *& src, const UInt8 * src_end, UInt8 *& dst, bool& prev_alphanum)40{41size_t src_sequence_length = UTF8::seqLength(*src);42auto src_code_point = UTF8::convertUTF8ToCodePoint(src, src_end - src);43
44if (src_code_point)45{46bool alpha = Poco::Unicode::isAlpha(*src_code_point);47bool alphanum = alpha || Poco::Unicode::isDigit(*src_code_point);48
49int dst_code_point = *src_code_point;50if (alphanum && !prev_alphanum)51{52if (alpha)53dst_code_point = Poco::Unicode::toUpper(*src_code_point);54}55else if (alpha)56{57dst_code_point = Poco::Unicode::toLower(*src_code_point);58}59prev_alphanum = alphanum;60if (dst_code_point > 0)61{62size_t dst_sequence_length = UTF8::convertCodePointToUTF8(dst_code_point, dst, src_end - src);63assert(dst_sequence_length <= 4);64
65if (dst_sequence_length == src_sequence_length)66{67src += dst_sequence_length;68dst += dst_sequence_length;69return;70}71}72}73
74*dst = *src;75++dst;76++src;77prev_alphanum = false;78}79
80private:81
82static void array(const UInt8 * src, const UInt8 * src_end, const ColumnString::Offsets & offsets, UInt8 * dst)83{84const auto * offset_it = offsets.begin();85const UInt8 * begin = src;86
87/// handle remaining symbols, row by row (to avoid influence of bad UTF8 symbols from one row, to another)88while (src < src_end)89{90const UInt8 * row_end = begin + *offset_it;91chassert(row_end >= src);92bool prev_alphanum = false;93while (src < row_end)94processCodePoint(src, row_end, dst, prev_alphanum);95++offset_it;96}97}98};99
/// Carries the SQL-visible name of the function.
struct NameInitcapUTF8
{
    static constexpr const char * name = "initcapUTF8";
};
105using FunctionInitcapUTF8 = FunctionStringToString<InitcapUTF8Impl, NameInitcapUTF8>;106
107}
108
109REGISTER_FUNCTION(InitcapUTF8)110{
111factory.registerFunction<FunctionInitcapUTF8>();112}
113
114}
115