ClickHouse

Форк
0
/
padString.cpp 
326 строк · 13.5 Кб
1
#include <Columns/ColumnFixedString.h>
2
#include <Columns/ColumnString.h>
3
#include <DataTypes/DataTypeString.h>
4
#include <Functions/FunctionFactory.h>
5
#include <Functions/FunctionHelpers.h>
6
#include <Functions/GatherUtils/Algorithms.h>
7
#include <Functions/GatherUtils/Sinks.h>
8
#include <Functions/GatherUtils/Sources.h>
9

10
namespace DB
11
{
12
using namespace GatherUtils;
13

14
namespace ErrorCodes
15
{
16
    extern const int ILLEGAL_COLUMN;
17
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
18
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
19
    extern const int TOO_LARGE_STRING_SIZE;
20
    extern const int INDEX_OF_POSITIONAL_ARGUMENT_IS_OUT_OF_RANGE;
21
}
22

23
namespace
24
{
25
    /// The maximum new padded length.
26
    constexpr ssize_t MAX_NEW_LENGTH = 1'000'000; /// Compared against the requested length, i.e. bytes, or code points for the UTF8 variants.
27

28
    /// Appends padding characters to a sink based on a pad string.
29
    /// Depending on how many padding characters are required to add
30
    /// the pad string can be copied only partly or be repeated multiple times.
31
    template <bool is_utf8>
32
    class PaddingChars
33
    {
34
    public:
35
        explicit PaddingChars(const String & pad_string_) : pad_string(pad_string_) { init(); }
36

37
        ALWAYS_INLINE size_t numCharsInPadString() const
38
        {
39
            if constexpr (is_utf8)
40
                return utf8_offsets.size() - 1;
41
            else
42
                return pad_string.length();
43
        }
44

45
        ALWAYS_INLINE size_t numCharsToNumBytes(size_t count) const
46
        {
47
            if constexpr (is_utf8)
48
                return utf8_offsets[count];
49
            else
50
                return count;
51
        }
52

53
        void appendTo(StringSink & res_sink, size_t num_chars) const
54
        {
55
            if (!num_chars)
56
                return;
57

58
            const size_t step = numCharsInPadString();
59
            while (true)
60
            {
61
                if (num_chars <= step)
62
                {
63
                    writeSlice(StringSource::Slice{std::bit_cast<const UInt8 *>(pad_string.data()), numCharsToNumBytes(num_chars)}, res_sink);
64
                    break;
65
                }
66
                writeSlice(StringSource::Slice{std::bit_cast<const UInt8 *>(pad_string.data()), numCharsToNumBytes(step)}, res_sink);
67
                num_chars -= step;
68
            }
69
        }
70

71
    private:
72
        void init()
73
        {
74
            if (pad_string.empty())
75
                pad_string = " ";
76

77
            if constexpr (is_utf8)
78
            {
79
                size_t offset = 0;
80
                utf8_offsets.reserve(pad_string.length() + 1);
81
                while (true)
82
                {
83
                    utf8_offsets.push_back(offset);
84
                    if (offset == pad_string.length())
85
                        break;
86
                    offset += UTF8::seqLength(pad_string[offset]);
87
                    if (offset > pad_string.length())
88
                        offset = pad_string.length();
89
                }
90
            }
91

92
            /// Not necessary, but good for performance.
93
            /// We repeat `pad_string` multiple times until it's length becomes 16 or more.
94
            /// It speeds up the function appendTo() because it allows to copy padding characters by portions of at least
95
            /// 16 bytes instead of single bytes.
96
            while (numCharsInPadString() < 16)
97
            {
98
                pad_string += pad_string;
99
                if constexpr (is_utf8)
100
                {
101
                    size_t old_size = utf8_offsets.size();
102
                    utf8_offsets.reserve((old_size - 1) * 2);
103
                    size_t base = utf8_offsets.back();
104
                    for (size_t i = 1; i != old_size; ++i)
105
                        utf8_offsets.push_back(utf8_offsets[i] + base);
106
                }
107
            }
108
        }
109

110
        String pad_string;
111

112
        /// Offsets of code points in `pad_string`:
113
        /// utf8_offsets[0] is the offset of the first code point in `pad_string`, it's always 0;
114
        /// utf8_offsets[1] is the offset of the second code point in `pad_string`;
115
        /// utf8_offsets[2] is the offset of the third code point in `pad_string`;
116
        /// ...
117
        std::vector<size_t> utf8_offsets;
118
    };
119

120
    /// Returns the number of characters in a slice.
121
    template <bool is_utf8>
122
    inline ALWAYS_INLINE size_t getLengthOfSlice(const StringSource::Slice & slice)
123
    {
124
        if constexpr (is_utf8)
125
            return UTF8::countCodePoints(slice.data, slice.size);
126
        else
127
            return slice.size;
128
    }
129

130
    /// Moves the end of a slice back by n characters.
131
    template <bool is_utf8>
132
    inline ALWAYS_INLINE StringSource::Slice removeSuffixFromSlice(const StringSource::Slice & slice, size_t suffix_length)
133
    {
134
        StringSource::Slice res = slice;
135
        if constexpr (is_utf8)
136
            res.size = UTF8StringSource::skipCodePointsBackward(slice.data + slice.size, suffix_length, slice.data) - res.data;
137
        else
138
            res.size -= std::min(suffix_length, res.size);
139
        return res;
140
    }
141

142
    /// If `is_right_pad` - it's the rightPad() function instead of leftPad().
143
    /// If `is_utf8` - lengths are measured in code points instead of bytes.
144
    template <bool is_right_pad, bool is_utf8>
145
    class FunctionPadString : public IFunction
146
    {
147
    public:
148
        static constexpr auto name = is_right_pad ? (is_utf8 ? "rightPadUTF8" : "rightPad") : (is_utf8 ? "leftPadUTF8" : "leftPad");
149
        static FunctionPtr create(const ContextPtr) { return std::make_shared<FunctionPadString>(); }
150

151
        String getName() const override { return name; }
152

153
        bool isVariadic() const override { return true; }
154
        size_t getNumberOfArguments() const override { return 0; }
155

156
        bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
157

158
        bool useDefaultImplementationForConstants() const override { return false; }
159

160
        DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
161
        {
162
            size_t number_of_arguments = arguments.size();
163

164
            if (number_of_arguments != 2 && number_of_arguments != 3)
165
                throw Exception(
166
                    ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
167
                    "Number of arguments for function {} doesn't match: passed {}, should be 2 or 3",
168
                    getName(),
169
                    number_of_arguments);
170

171
            if (!isStringOrFixedString(arguments[0]))
172
                throw Exception(
173
                    ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
174
                    "Illegal type {} of the first argument of function {}, should be string",
175
                    arguments[0]->getName(),
176
                    getName());
177

178
            if (!isInteger(arguments[1]))
179
                throw Exception(
180
                    ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
181
                    "Illegal type {} of the second argument of function {}, should be unsigned integer",
182
                    arguments[1]->getName(),
183
                    getName());
184

185
            if (number_of_arguments == 3 && !isStringOrFixedString(arguments[2]))
186
                throw Exception(
187
                    ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
188
                    "Illegal type {} of the third argument of function {}, should be const string",
189
                    arguments[2]->getName(),
190
                    getName());
191

192
            return std::make_shared<DataTypeString>();
193
        }
194

195
        ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
196
        {
197
            auto column_string = arguments[0].column;
198
            auto column_length = arguments[1].column;
199

200
            String pad_string;
201
            if (arguments.size() == 3)
202
            {
203
                auto column_pad = arguments[2].column;
204
                const ColumnConst * column_pad_const = checkAndGetColumnConst<ColumnString>(column_pad.get());
205
                if (!column_pad_const)
206
                    throw Exception(
207
                        ErrorCodes::ILLEGAL_COLUMN,
208
                        "Illegal column {}, third argument of function {} must be a constant string",
209
                        column_pad->getName(),
210
                        getName());
211

212
                pad_string = column_pad_const->getValue<String>();
213
            }
214
            PaddingChars<is_utf8> padding_chars{pad_string};
215

216
            auto col_res = ColumnString::create();
217
            StringSink res_sink{*col_res, input_rows_count};
218

219
            if (const ColumnString * col = checkAndGetColumn<ColumnString>(column_string.get()))
220
                executeForSource(StringSource{*col}, column_length, padding_chars, res_sink);
221
            else if (const ColumnFixedString * col_fixed = checkAndGetColumn<ColumnFixedString>(column_string.get()))
222
                executeForSource(FixedStringSource{*col_fixed}, column_length, padding_chars, res_sink);
223
            else if (const ColumnConst * col_const = checkAndGetColumnConst<ColumnString>(column_string.get()))
224
                executeForSource(ConstSource<StringSource>{*col_const}, column_length, padding_chars, res_sink);
225
            else if (const ColumnConst * col_const_fixed = checkAndGetColumnConst<ColumnFixedString>(column_string.get()))
226
                executeForSource(ConstSource<FixedStringSource>{*col_const_fixed}, column_length, padding_chars, res_sink);
227
            else
228
                throw Exception(
229
                    ErrorCodes::ILLEGAL_COLUMN,
230
                    "Illegal column {}, first argument of function {} must be a string",
231
                    arguments[0].column->getName(),
232
                    getName());
233

234
            return col_res;
235
        }
236

237
    private:
238
        template <typename SourceStrings>
239
        void executeForSource(
240
            SourceStrings && strings,
241
            const ColumnPtr & column_length,
242
            const PaddingChars<is_utf8> & padding_chars,
243
            StringSink & res_sink) const
244
        {
245
            if (const auto * col_const = checkAndGetColumn<ColumnConst>(column_length.get()))
246
                executeForSourceAndLength(std::forward<SourceStrings>(strings), ConstSource<GenericValueSource>{*col_const}, padding_chars, res_sink);
247
            else
248
                executeForSourceAndLength(std::forward<SourceStrings>(strings), GenericValueSource{*column_length}, padding_chars, res_sink);
249
        }
250

251
        template <typename SourceStrings, typename SourceLengths>
252
        void executeForSourceAndLength(
253
            SourceStrings && strings,
254
            SourceLengths && lengths,
255
            const PaddingChars<is_utf8> & padding_chars,
256
            StringSink & res_sink) const
257
        {
258
            bool is_const_new_length = lengths.isConst();
259
            ssize_t new_length = 0;
260

261
            /// Insert padding characters to each string from `strings`, write the result strings into `res_sink`.
262
            /// If for some input string its current length is greater than the specified new length then that string
263
            /// will be trimmed to the specified new length instead of padding.
264
            for (; !res_sink.isEnd(); res_sink.next(), strings.next(), lengths.next())
265
            {
266
                auto str = strings.getWhole();
267
                ssize_t current_length = getLengthOfSlice<is_utf8>(str);
268

269
                if (!res_sink.rowNum() || !is_const_new_length)
270
                {
271
                    /// If `is_const_new_length` is true we can get and check the new length only once.
272
                    auto new_length_slice = lengths.getWhole();
273
                    new_length = new_length_slice.elements->getInt(new_length_slice.position);
274
                    if (new_length > MAX_NEW_LENGTH)
275
                    {
276
                        throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "New padded length ({}) is too big, maximum is: {}",
277
                            std::to_string(new_length), std::to_string(MAX_NEW_LENGTH));
278
                    }
279
                    if (new_length < 0)
280
                    {
281
                        throw Exception(
282
                            ErrorCodes::INDEX_OF_POSITIONAL_ARGUMENT_IS_OUT_OF_RANGE, "New padded length ({}) is negative", std::to_string(new_length));
283
                    }
284
                    if (is_const_new_length)
285
                    {
286
                        size_t rows_count = res_sink.offsets.size();
287
                        res_sink.reserve((new_length + 1 /* zero terminator */) * rows_count);
288
                    }
289
                }
290

291
                if (new_length == current_length)
292
                {
293
                    writeSlice(str, res_sink);
294
                }
295
                else if (new_length < current_length)
296
                {
297
                    str = removeSuffixFromSlice<is_utf8>(str, current_length - new_length);
298
                    writeSlice(str, res_sink);
299
                }
300
                else if (new_length > current_length)
301
                {
302
                    if constexpr (!is_right_pad)
303
                        padding_chars.appendTo(res_sink, new_length - current_length);
304

305
                    writeSlice(str, res_sink);
306

307
                    if constexpr (is_right_pad)
308
                        padding_chars.appendTo(res_sink, new_length - current_length);
309
                }
310
            }
311
        }
312
    };
313
}
314

315
REGISTER_FUNCTION(PadString)
{
    /// Template arguments are <is_right_pad, is_utf8>.
    using LeftPad = FunctionPadString<false, false>;
    using LeftPadUTF8 = FunctionPadString<false, true>;
    using RightPad = FunctionPadString<true, false>;
    using RightPadUTF8 = FunctionPadString<true, true>;

    factory.registerFunction<LeftPad>();
    factory.registerFunction<LeftPadUTF8>();
    factory.registerFunction<RightPad>();
    factory.registerFunction<RightPadUTF8>();

    /// Case-insensitive short names for the byte-oriented variants.
    factory.registerAlias("lpad", "leftPad", FunctionFactory::CaseInsensitive);
    factory.registerAlias("rpad", "rightPad", FunctionFactory::CaseInsensitive);
}
325

326
}
327

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.