ClickHouse

Форк
0
/
convertCharset.cpp 
224 строки · 7.7 Кб
1
#include "config.h"
2

3
#if USE_ICU
4
#    include <Columns/ColumnConst.h>
5
#    include <Columns/ColumnString.h>
6
#    include <DataTypes/DataTypeString.h>
7
#    include <Functions/FunctionFactory.h>
8
#    include <Functions/FunctionHelpers.h>
9
#    include <Functions/IFunction.h>
10
#    include <IO/WriteHelpers.h>
11
#    include <Common/ObjectPool.h>
12
#    include <Common/typeid_cast.h>
13
#    include <base/range.h>
14

15
#    include <memory>
16
#    include <string>
17
#    include <unicode/ucnv.h>
18

19

20
namespace DB
21
{
22

23
/// Error codes used by the exceptions thrown in this file. They are only declared here
/// (`extern`); the definitions are not part of this file.
namespace ErrorCodes
24
{
25
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
26
    extern const int CANNOT_CREATE_CHARSET_CONVERTER;
27
    extern const int CANNOT_CONVERT_CHARSET;
28
    extern const int ILLEGAL_COLUMN;
29
}
30

31
namespace
32
{
33

34
/** convertCharset(s, from, to)
35
  *
36
  * Assuming string 's' contains bytes in charset 'from',
37
  *  returns another string with bytes, representing same content in charset 'to'.
38
  * from and to must be constants.
39
  *
40
  * When bytes are illegal in 'from' charset or are not representable in 'to' charset,
41
  *  behavior is implementation specific.
42
  */
43
class FunctionConvertCharset : public IFunction
44
{
45
private:
46
    struct Converter : private boost::noncopyable
47
    {
48
        UConverter * impl;
49

50
        explicit Converter(const String & charset)
51
        {
52
            UErrorCode status = U_ZERO_ERROR;
53
            impl = ucnv_open(charset.data(), &status);
54

55
            if (U_SUCCESS(status))
56
                ucnv_setToUCallBack(impl,
57
                    UCNV_TO_U_CALLBACK_SUBSTITUTE,
58
                    nullptr,
59
                    nullptr, nullptr,
60
                    &status);
61

62
            if (U_SUCCESS(status))
63
                ucnv_setFromUCallBack(impl,
64
                    UCNV_FROM_U_CALLBACK_SUBSTITUTE,
65
                    nullptr,
66
                    nullptr, nullptr,
67
                    &status);
68

69
            if (!U_SUCCESS(status))
70
                throw Exception(ErrorCodes::CANNOT_CREATE_CHARSET_CONVERTER, "Cannot create UConverter with charset {}, error: {}",
71
                    charset, String(u_errorName(status)));
72
        }
73

74
        ~Converter()
75
        {
76
            ucnv_close(impl);
77
        }
78
    };
79

80
    /// Separate converter is created for each thread.
81
    using Pool = ObjectPoolMap<Converter, String>;
82

83
    static Pool::Pointer getConverter(const String & charset)
84
    {
85
        static Pool pool;
86
        return pool.get(charset, [&charset] { return new Converter(charset); });
87
    }
88

89
    static void convert(const String & from_charset, const String & to_charset,
90
        const ColumnString::Chars & from_chars, const ColumnString::Offsets & from_offsets,
91
        ColumnString::Chars & to_chars, ColumnString::Offsets & to_offsets)
92
    {
93
        auto converter_from = getConverter(from_charset);
94
        auto converter_to = getConverter(to_charset);
95

96
        ColumnString::Offset current_from_offset = 0;
97
        ColumnString::Offset current_to_offset = 0;
98

99
        size_t size = from_offsets.size();
100
        to_offsets.resize(size);
101

102
        PODArray<UChar> uchars;
103

104
        for (size_t i = 0; i < size; ++i)
105
        {
106
            size_t from_string_size = from_offsets[i] - current_from_offset - 1;
107

108
            /// We assume that empty string is empty in every charset.
109
            if (0 != from_string_size)
110
            {
111
                /// reset state of converter
112
                ucnv_reset(converter_from->impl);
113
                ucnv_reset(converter_to->impl);
114

115
                /// maximum number of code points is number of bytes in input string plus one for terminating zero
116
                uchars.resize(from_string_size + 1);
117

118
                UErrorCode status = U_ZERO_ERROR;
119
                int32_t res = ucnv_toUChars(
120
                    converter_from->impl,
121
                    uchars.data(), uchars.size(),
122
                    reinterpret_cast<const char *>(&from_chars[current_from_offset]), from_string_size,
123
                    &status);
124

125
                if (!U_SUCCESS(status))
126
                    throw Exception(ErrorCodes::CANNOT_CONVERT_CHARSET, "Cannot convert from charset {}, error: {}",
127
                        from_charset, String(u_errorName(status)));
128

129
                auto max_to_char_size = ucnv_getMaxCharSize(converter_to->impl);
130
                auto max_to_size = UCNV_GET_MAX_BYTES_FOR_STRING(res, max_to_char_size);
131

132
                to_chars.resize(current_to_offset + max_to_size);
133

134
                res = ucnv_fromUChars(
135
                    converter_to->impl,
136
                    reinterpret_cast<char *>(&to_chars[current_to_offset]), max_to_size,
137
                    uchars.data(), res,
138
                    &status);
139

140
                if (!U_SUCCESS(status))
141
                    throw Exception(ErrorCodes::CANNOT_CONVERT_CHARSET, "Cannot convert to charset {}, error: {}",
142
                        to_charset, String(u_errorName(status)));
143

144
                current_to_offset += res;
145
            }
146

147
            if (to_chars.size() < current_to_offset + 1)
148
                to_chars.resize(current_to_offset + 1);
149

150
            to_chars[current_to_offset] = 0;
151

152
            ++current_to_offset;
153
            to_offsets[i] = current_to_offset;
154

155
            current_from_offset = from_offsets[i];
156
        }
157

158
        to_chars.resize(current_to_offset);
159
    }
160

161
public:
162
    static constexpr auto name = "convertCharset";
163
    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionConvertCharset>(); }
164

165
    String getName() const override
166
    {
167
        return name;
168
    }
169

170
    size_t getNumberOfArguments() const override { return 3; }
171

172
    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
173

174
    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
175
    {
176
        for (size_t i : collections::range(0, 3))
177
            if (!isString(arguments[i]))
178
                throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}, must be String",
179
                    arguments[i]->getName(), getName());
180

181
        return std::make_shared<DataTypeString>();
182
    }
183

184
    bool useDefaultImplementationForConstants() const override { return true; }
185
    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; }
186

187
    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override
188
    {
189
        const ColumnWithTypeAndName & arg_from = arguments[0];
190
        const ColumnWithTypeAndName & arg_charset_from = arguments[1];
191
        const ColumnWithTypeAndName & arg_charset_to = arguments[2];
192

193
        const ColumnConst * col_charset_from = checkAndGetColumnConstStringOrFixedString(arg_charset_from.column.get());
194
        const ColumnConst * col_charset_to = checkAndGetColumnConstStringOrFixedString(arg_charset_to.column.get());
195

196
        if (!col_charset_from || !col_charset_to)
197
            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
198
                            "2nd and 3rd arguments of function {} (source charset and destination charset) must "
199
                            "be constant strings.", getName());
200

201
        String charset_from = col_charset_from->getValue<String>();
202
        String charset_to = col_charset_to->getValue<String>();
203

204
        if (const ColumnString * col_from = checkAndGetColumn<ColumnString>(arg_from.column.get()))
205
        {
206
            auto col_to = ColumnString::create();
207
            convert(charset_from, charset_to, col_from->getChars(), col_from->getOffsets(), col_to->getChars(), col_to->getOffsets());
208
            return col_to;
209
        }
210
        else
211
            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column passed as first argument of function {} (must be ColumnString).", getName());
212
    }
213
};
214

215
}
216

217
/// Registers FunctionConvertCharset with the function factory
/// (presumably under its `name`, "convertCharset" - confirm against registerFunction).
REGISTER_FUNCTION(ConvertCharset)
218
{
219
    factory.registerFunction<FunctionConvertCharset>();
220
}
221

222
}
223

224
#endif
225

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.