ClickHouse
326 строк · 13.5 Кб
1#include <Columns/ColumnFixedString.h>
2#include <Columns/ColumnString.h>
3#include <DataTypes/DataTypeString.h>
4#include <Functions/FunctionFactory.h>
5#include <Functions/FunctionHelpers.h>
6#include <Functions/GatherUtils/Algorithms.h>
7#include <Functions/GatherUtils/Sinks.h>
8#include <Functions/GatherUtils/Sources.h>
9
10namespace DB
11{
12using namespace GatherUtils;
13
14namespace ErrorCodes
15{
16extern const int ILLEGAL_COLUMN;
17extern const int ILLEGAL_TYPE_OF_ARGUMENT;
18extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
19extern const int TOO_LARGE_STRING_SIZE;
20extern const int INDEX_OF_POSITIONAL_ARGUMENT_IS_OUT_OF_RANGE;
21}
22
23namespace
24{
/// Upper bound for the requested length of the padded string.
/// A requested new length above this value is rejected with TOO_LARGE_STRING_SIZE.
constexpr ssize_t MAX_NEW_LENGTH = 1'000'000;
27
28/// Appends padding characters to a sink based on a pad string.
29/// Depending on how many padding characters are required to add
30/// the pad string can be copied only partly or be repeated multiple times.
31template <bool is_utf8>
32class PaddingChars
33{
34public:
35explicit PaddingChars(const String & pad_string_) : pad_string(pad_string_) { init(); }
36
37ALWAYS_INLINE size_t numCharsInPadString() const
38{
39if constexpr (is_utf8)
40return utf8_offsets.size() - 1;
41else
42return pad_string.length();
43}
44
45ALWAYS_INLINE size_t numCharsToNumBytes(size_t count) const
46{
47if constexpr (is_utf8)
48return utf8_offsets[count];
49else
50return count;
51}
52
53void appendTo(StringSink & res_sink, size_t num_chars) const
54{
55if (!num_chars)
56return;
57
58const size_t step = numCharsInPadString();
59while (true)
60{
61if (num_chars <= step)
62{
63writeSlice(StringSource::Slice{std::bit_cast<const UInt8 *>(pad_string.data()), numCharsToNumBytes(num_chars)}, res_sink);
64break;
65}
66writeSlice(StringSource::Slice{std::bit_cast<const UInt8 *>(pad_string.data()), numCharsToNumBytes(step)}, res_sink);
67num_chars -= step;
68}
69}
70
71private:
72void init()
73{
74if (pad_string.empty())
75pad_string = " ";
76
77if constexpr (is_utf8)
78{
79size_t offset = 0;
80utf8_offsets.reserve(pad_string.length() + 1);
81while (true)
82{
83utf8_offsets.push_back(offset);
84if (offset == pad_string.length())
85break;
86offset += UTF8::seqLength(pad_string[offset]);
87if (offset > pad_string.length())
88offset = pad_string.length();
89}
90}
91
92/// Not necessary, but good for performance.
93/// We repeat `pad_string` multiple times until it's length becomes 16 or more.
94/// It speeds up the function appendTo() because it allows to copy padding characters by portions of at least
95/// 16 bytes instead of single bytes.
96while (numCharsInPadString() < 16)
97{
98pad_string += pad_string;
99if constexpr (is_utf8)
100{
101size_t old_size = utf8_offsets.size();
102utf8_offsets.reserve((old_size - 1) * 2);
103size_t base = utf8_offsets.back();
104for (size_t i = 1; i != old_size; ++i)
105utf8_offsets.push_back(utf8_offsets[i] + base);
106}
107}
108}
109
110String pad_string;
111
112/// Offsets of code points in `pad_string`:
113/// utf8_offsets[0] is the offset of the first code point in `pad_string`, it's always 0;
114/// utf8_offsets[1] is the offset of the second code point in `pad_string`;
115/// utf8_offsets[2] is the offset of the third code point in `pad_string`;
116/// ...
117std::vector<size_t> utf8_offsets;
118};
119
120/// Returns the number of characters in a slice.
121template <bool is_utf8>
122inline ALWAYS_INLINE size_t getLengthOfSlice(const StringSource::Slice & slice)
123{
124if constexpr (is_utf8)
125return UTF8::countCodePoints(slice.data, slice.size);
126else
127return slice.size;
128}
129
130/// Moves the end of a slice back by n characters.
131template <bool is_utf8>
132inline ALWAYS_INLINE StringSource::Slice removeSuffixFromSlice(const StringSource::Slice & slice, size_t suffix_length)
133{
134StringSource::Slice res = slice;
135if constexpr (is_utf8)
136res.size = UTF8StringSource::skipCodePointsBackward(slice.data + slice.size, suffix_length, slice.data) - res.data;
137else
138res.size -= std::min(suffix_length, res.size);
139return res;
140}
141
142/// If `is_right_pad` - it's the rightPad() function instead of leftPad().
143/// If `is_utf8` - lengths are measured in code points instead of bytes.
144template <bool is_right_pad, bool is_utf8>
145class FunctionPadString : public IFunction
146{
147public:
148static constexpr auto name = is_right_pad ? (is_utf8 ? "rightPadUTF8" : "rightPad") : (is_utf8 ? "leftPadUTF8" : "leftPad");
149static FunctionPtr create(const ContextPtr) { return std::make_shared<FunctionPadString>(); }
150
151String getName() const override { return name; }
152
153bool isVariadic() const override { return true; }
154size_t getNumberOfArguments() const override { return 0; }
155
156bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
157
158bool useDefaultImplementationForConstants() const override { return false; }
159
160DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
161{
162size_t number_of_arguments = arguments.size();
163
164if (number_of_arguments != 2 && number_of_arguments != 3)
165throw Exception(
166ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
167"Number of arguments for function {} doesn't match: passed {}, should be 2 or 3",
168getName(),
169number_of_arguments);
170
171if (!isStringOrFixedString(arguments[0]))
172throw Exception(
173ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
174"Illegal type {} of the first argument of function {}, should be string",
175arguments[0]->getName(),
176getName());
177
178if (!isInteger(arguments[1]))
179throw Exception(
180ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
181"Illegal type {} of the second argument of function {}, should be unsigned integer",
182arguments[1]->getName(),
183getName());
184
185if (number_of_arguments == 3 && !isStringOrFixedString(arguments[2]))
186throw Exception(
187ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
188"Illegal type {} of the third argument of function {}, should be const string",
189arguments[2]->getName(),
190getName());
191
192return std::make_shared<DataTypeString>();
193}
194
195ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
196{
197auto column_string = arguments[0].column;
198auto column_length = arguments[1].column;
199
200String pad_string;
201if (arguments.size() == 3)
202{
203auto column_pad = arguments[2].column;
204const ColumnConst * column_pad_const = checkAndGetColumnConst<ColumnString>(column_pad.get());
205if (!column_pad_const)
206throw Exception(
207ErrorCodes::ILLEGAL_COLUMN,
208"Illegal column {}, third argument of function {} must be a constant string",
209column_pad->getName(),
210getName());
211
212pad_string = column_pad_const->getValue<String>();
213}
214PaddingChars<is_utf8> padding_chars{pad_string};
215
216auto col_res = ColumnString::create();
217StringSink res_sink{*col_res, input_rows_count};
218
219if (const ColumnString * col = checkAndGetColumn<ColumnString>(column_string.get()))
220executeForSource(StringSource{*col}, column_length, padding_chars, res_sink);
221else if (const ColumnFixedString * col_fixed = checkAndGetColumn<ColumnFixedString>(column_string.get()))
222executeForSource(FixedStringSource{*col_fixed}, column_length, padding_chars, res_sink);
223else if (const ColumnConst * col_const = checkAndGetColumnConst<ColumnString>(column_string.get()))
224executeForSource(ConstSource<StringSource>{*col_const}, column_length, padding_chars, res_sink);
225else if (const ColumnConst * col_const_fixed = checkAndGetColumnConst<ColumnFixedString>(column_string.get()))
226executeForSource(ConstSource<FixedStringSource>{*col_const_fixed}, column_length, padding_chars, res_sink);
227else
228throw Exception(
229ErrorCodes::ILLEGAL_COLUMN,
230"Illegal column {}, first argument of function {} must be a string",
231arguments[0].column->getName(),
232getName());
233
234return col_res;
235}
236
237private:
238template <typename SourceStrings>
239void executeForSource(
240SourceStrings && strings,
241const ColumnPtr & column_length,
242const PaddingChars<is_utf8> & padding_chars,
243StringSink & res_sink) const
244{
245if (const auto * col_const = checkAndGetColumn<ColumnConst>(column_length.get()))
246executeForSourceAndLength(std::forward<SourceStrings>(strings), ConstSource<GenericValueSource>{*col_const}, padding_chars, res_sink);
247else
248executeForSourceAndLength(std::forward<SourceStrings>(strings), GenericValueSource{*column_length}, padding_chars, res_sink);
249}
250
251template <typename SourceStrings, typename SourceLengths>
252void executeForSourceAndLength(
253SourceStrings && strings,
254SourceLengths && lengths,
255const PaddingChars<is_utf8> & padding_chars,
256StringSink & res_sink) const
257{
258bool is_const_new_length = lengths.isConst();
259ssize_t new_length = 0;
260
261/// Insert padding characters to each string from `strings`, write the result strings into `res_sink`.
262/// If for some input string its current length is greater than the specified new length then that string
263/// will be trimmed to the specified new length instead of padding.
264for (; !res_sink.isEnd(); res_sink.next(), strings.next(), lengths.next())
265{
266auto str = strings.getWhole();
267ssize_t current_length = getLengthOfSlice<is_utf8>(str);
268
269if (!res_sink.rowNum() || !is_const_new_length)
270{
271/// If `is_const_new_length` is true we can get and check the new length only once.
272auto new_length_slice = lengths.getWhole();
273new_length = new_length_slice.elements->getInt(new_length_slice.position);
274if (new_length > MAX_NEW_LENGTH)
275{
276throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "New padded length ({}) is too big, maximum is: {}",
277std::to_string(new_length), std::to_string(MAX_NEW_LENGTH));
278}
279if (new_length < 0)
280{
281throw Exception(
282ErrorCodes::INDEX_OF_POSITIONAL_ARGUMENT_IS_OUT_OF_RANGE, "New padded length ({}) is negative", std::to_string(new_length));
283}
284if (is_const_new_length)
285{
286size_t rows_count = res_sink.offsets.size();
287res_sink.reserve((new_length + 1 /* zero terminator */) * rows_count);
288}
289}
290
291if (new_length == current_length)
292{
293writeSlice(str, res_sink);
294}
295else if (new_length < current_length)
296{
297str = removeSuffixFromSlice<is_utf8>(str, current_length - new_length);
298writeSlice(str, res_sink);
299}
300else if (new_length > current_length)
301{
302if constexpr (!is_right_pad)
303padding_chars.appendTo(res_sink, new_length - current_length);
304
305writeSlice(str, res_sink);
306
307if constexpr (is_right_pad)
308padding_chars.appendTo(res_sink, new_length - current_length);
309}
310}
311}
312};
313}
314
315REGISTER_FUNCTION(PadString)
316{
317factory.registerFunction<FunctionPadString<false, false>>(); /// leftPad
318factory.registerFunction<FunctionPadString<false, true>>(); /// leftPadUTF8
319factory.registerFunction<FunctionPadString<true, false>>(); /// rightPad
320factory.registerFunction<FunctionPadString<true, true>>(); /// rightPadUTF8
321
322factory.registerAlias("lpad", "leftPad", FunctionFactory::CaseInsensitive);
323factory.registerAlias("rpad", "rightPad", FunctionFactory::CaseInsensitive);
324}
325
326}
327