ClickHouse
224 строки · 7.7 Кб
1#include "config.h"2
3#if USE_ICU4# include <Columns/ColumnConst.h>5# include <Columns/ColumnString.h>6# include <DataTypes/DataTypeString.h>7# include <Functions/FunctionFactory.h>8# include <Functions/FunctionHelpers.h>9# include <Functions/IFunction.h>10# include <IO/WriteHelpers.h>11# include <Common/ObjectPool.h>12# include <Common/typeid_cast.h>13# include <base/range.h>14
15# include <memory>16# include <string>17# include <unicode/ucnv.h>18
19
20namespace DB21{
22
/// Error codes used by the exceptions thrown in this file.
/// These are declarations only; the constants themselves are defined elsewhere.
namespace ErrorCodes
{
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int CANNOT_CREATE_CHARSET_CONVERTER;
    extern const int CANNOT_CONVERT_CHARSET;
    extern const int ILLEGAL_COLUMN;
}
30
31namespace
32{
33
34/** convertCharset(s, from, to)
35*
36* Assuming string 's' contains bytes in charset 'from',
37* returns another string with bytes, representing same content in charset 'to'.
38* from and to must be constants.
39*
40* When bytes are illegal in 'from' charset or are not representable in 'to' charset,
41* behavior is implementation specific.
42*/
43class FunctionConvertCharset : public IFunction44{
45private:46struct Converter : private boost::noncopyable47{48UConverter * impl;49
50explicit Converter(const String & charset)51{52UErrorCode status = U_ZERO_ERROR;53impl = ucnv_open(charset.data(), &status);54
55if (U_SUCCESS(status))56ucnv_setToUCallBack(impl,57UCNV_TO_U_CALLBACK_SUBSTITUTE,58nullptr,59nullptr, nullptr,60&status);61
62if (U_SUCCESS(status))63ucnv_setFromUCallBack(impl,64UCNV_FROM_U_CALLBACK_SUBSTITUTE,65nullptr,66nullptr, nullptr,67&status);68
69if (!U_SUCCESS(status))70throw Exception(ErrorCodes::CANNOT_CREATE_CHARSET_CONVERTER, "Cannot create UConverter with charset {}, error: {}",71charset, String(u_errorName(status)));72}73
74~Converter()75{76ucnv_close(impl);77}78};79
80/// Separate converter is created for each thread.81using Pool = ObjectPoolMap<Converter, String>;82
83static Pool::Pointer getConverter(const String & charset)84{85static Pool pool;86return pool.get(charset, [&charset] { return new Converter(charset); });87}88
89static void convert(const String & from_charset, const String & to_charset,90const ColumnString::Chars & from_chars, const ColumnString::Offsets & from_offsets,91ColumnString::Chars & to_chars, ColumnString::Offsets & to_offsets)92{93auto converter_from = getConverter(from_charset);94auto converter_to = getConverter(to_charset);95
96ColumnString::Offset current_from_offset = 0;97ColumnString::Offset current_to_offset = 0;98
99size_t size = from_offsets.size();100to_offsets.resize(size);101
102PODArray<UChar> uchars;103
104for (size_t i = 0; i < size; ++i)105{106size_t from_string_size = from_offsets[i] - current_from_offset - 1;107
108/// We assume that empty string is empty in every charset.109if (0 != from_string_size)110{111/// reset state of converter112ucnv_reset(converter_from->impl);113ucnv_reset(converter_to->impl);114
115/// maximum number of code points is number of bytes in input string plus one for terminating zero116uchars.resize(from_string_size + 1);117
118UErrorCode status = U_ZERO_ERROR;119int32_t res = ucnv_toUChars(120converter_from->impl,121uchars.data(), uchars.size(),122reinterpret_cast<const char *>(&from_chars[current_from_offset]), from_string_size,123&status);124
125if (!U_SUCCESS(status))126throw Exception(ErrorCodes::CANNOT_CONVERT_CHARSET, "Cannot convert from charset {}, error: {}",127from_charset, String(u_errorName(status)));128
129auto max_to_char_size = ucnv_getMaxCharSize(converter_to->impl);130auto max_to_size = UCNV_GET_MAX_BYTES_FOR_STRING(res, max_to_char_size);131
132to_chars.resize(current_to_offset + max_to_size);133
134res = ucnv_fromUChars(135converter_to->impl,136reinterpret_cast<char *>(&to_chars[current_to_offset]), max_to_size,137uchars.data(), res,138&status);139
140if (!U_SUCCESS(status))141throw Exception(ErrorCodes::CANNOT_CONVERT_CHARSET, "Cannot convert to charset {}, error: {}",142to_charset, String(u_errorName(status)));143
144current_to_offset += res;145}146
147if (to_chars.size() < current_to_offset + 1)148to_chars.resize(current_to_offset + 1);149
150to_chars[current_to_offset] = 0;151
152++current_to_offset;153to_offsets[i] = current_to_offset;154
155current_from_offset = from_offsets[i];156}157
158to_chars.resize(current_to_offset);159}160
161public:162static constexpr auto name = "convertCharset";163static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionConvertCharset>(); }164
165String getName() const override166{167return name;168}169
170size_t getNumberOfArguments() const override { return 3; }171
172bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }173
174DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override175{176for (size_t i : collections::range(0, 3))177if (!isString(arguments[i]))178throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}, must be String",179arguments[i]->getName(), getName());180
181return std::make_shared<DataTypeString>();182}183
184bool useDefaultImplementationForConstants() const override { return true; }185ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; }186
187ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override188{189const ColumnWithTypeAndName & arg_from = arguments[0];190const ColumnWithTypeAndName & arg_charset_from = arguments[1];191const ColumnWithTypeAndName & arg_charset_to = arguments[2];192
193const ColumnConst * col_charset_from = checkAndGetColumnConstStringOrFixedString(arg_charset_from.column.get());194const ColumnConst * col_charset_to = checkAndGetColumnConstStringOrFixedString(arg_charset_to.column.get());195
196if (!col_charset_from || !col_charset_to)197throw Exception(ErrorCodes::ILLEGAL_COLUMN,198"2nd and 3rd arguments of function {} (source charset and destination charset) must "199"be constant strings.", getName());200
201String charset_from = col_charset_from->getValue<String>();202String charset_to = col_charset_to->getValue<String>();203
204if (const ColumnString * col_from = checkAndGetColumn<ColumnString>(arg_from.column.get()))205{206auto col_to = ColumnString::create();207convert(charset_from, charset_to, col_from->getChars(), col_from->getOffsets(), col_to->getChars(), col_to->getOffsets());208return col_to;209}210else211throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column passed as first argument of function {} (must be ColumnString).", getName());212}213};214
215}
216
217REGISTER_FUNCTION(ConvertCharset)218{
219factory.registerFunction<FunctionConvertCharset>();220}
221
222}
223
224#endif225