ClickHouse

Форк
0
/
FunctionGenerateRandomStructure.cpp 
446 строк · 15.6 Кб
1
#include <Functions/FunctionGenerateRandomStructure.h>
2
#include <Functions/FunctionFactory.h>
3
#include <Functions/FunctionHelpers.h>
4
#include <Functions/IFunction.h>
5
#include <Columns/ColumnString.h>
6
#include <DataTypes/DataTypeString.h>
7
#include <DataTypes/DataTypeFixedString.h>
8
#include <Interpreters/Context.h>
9
#include <Common/randomSeed.h>
10
#include <Common/FunctionDocumentation.h>
11
#include <IO/WriteHelpers.h>
12
#include <IO/WriteBufferFromVector.h>
13

14
#include <pcg_random.hpp>
15

16
namespace DB
17
{
18

19
namespace ErrorCodes
20
{
21
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
22
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
23
    extern const int BAD_ARGUMENTS;
24
}
25

26
namespace
27
{
28
    /// Upper bound on the number of columns in a generated structure.
    constexpr size_t MAX_NUMBER_OF_COLUMNS = 128;
    /// Upper bound on the number of elements of a generated Tuple/Nested type.
    constexpr size_t MAX_TUPLE_ELEMENTS = 16;
    /// Upper bounds for the numeric argument written after DateTime64 and DecimalN type names.
    constexpr size_t MAX_DATETIME64_PRECISION = 9;
    constexpr size_t MAX_DECIMAL32_PRECISION = 9;
    constexpr size_t MAX_DECIMAL64_PRECISION = 18;
    constexpr size_t MAX_DECIMAL128_PRECISION = 38;
    constexpr size_t MAX_DECIMAL256_PRECISION = 76;
    /// Nesting depth after which only simple types are generated.
    constexpr size_t MAX_DEPTH = 16;
36

37
    constexpr std::array<TypeIndex, 29> simple_types
38
    {
39
        TypeIndex::Int8,
40
        TypeIndex::UInt8,
41
        TypeIndex::Int16,
42
        TypeIndex::UInt16,
43
        TypeIndex::Int32,
44
        TypeIndex::UInt32,
45
        TypeIndex::Int64,
46
        TypeIndex::UInt64,
47
        TypeIndex::Int128,
48
        TypeIndex::UInt128,
49
        TypeIndex::Int256,
50
        TypeIndex::UInt256,
51
        TypeIndex::Float32,
52
        TypeIndex::Float64,
53
        TypeIndex::Decimal32,
54
        TypeIndex::Decimal64,
55
        TypeIndex::Decimal128,
56
        TypeIndex::Decimal256,
57
        TypeIndex::Date,
58
        TypeIndex::Date32,
59
        TypeIndex::DateTime,
60
        TypeIndex::DateTime64,
61
        TypeIndex::String,
62
        TypeIndex::FixedString,
63
        TypeIndex::Enum8,
64
        TypeIndex::Enum16,
65
        TypeIndex::IPv4,
66
        TypeIndex::IPv6,
67
        TypeIndex::UUID,
68
    };
69

70
    constexpr std::array<TypeIndex, 5> complex_types
71
    {
72
        TypeIndex::Nullable,
73
        TypeIndex::LowCardinality,
74
        TypeIndex::Array,
75
        TypeIndex::Tuple,
76
        TypeIndex::Map,
77
    };
78

79
    constexpr std::array<TypeIndex, 22> map_key_types
80
    {
81
        TypeIndex::Int8,
82
        TypeIndex::UInt8,
83
        TypeIndex::Int16,
84
        TypeIndex::UInt16,
85
        TypeIndex::Int32,
86
        TypeIndex::UInt32,
87
        TypeIndex::Int64,
88
        TypeIndex::UInt64,
89
        TypeIndex::Int128,
90
        TypeIndex::UInt128,
91
        TypeIndex::Int256,
92
        TypeIndex::UInt256,
93
        TypeIndex::Date,
94
        TypeIndex::Date32,
95
        TypeIndex::DateTime,
96
        TypeIndex::String,
97
        TypeIndex::FixedString,
98
        TypeIndex::IPv4,
99
        TypeIndex::Enum8,
100
        TypeIndex::Enum16,
101
        TypeIndex::UUID,
102
        TypeIndex::LowCardinality,
103
    };
104

105
    constexpr std::array<TypeIndex, 22> suspicious_lc_types
106
    {
107
        TypeIndex::Int8,
108
        TypeIndex::UInt8,
109
        TypeIndex::Int16,
110
        TypeIndex::UInt16,
111
        TypeIndex::Int32,
112
        TypeIndex::UInt32,
113
        TypeIndex::Int64,
114
        TypeIndex::UInt64,
115
        TypeIndex::Int128,
116
        TypeIndex::UInt128,
117
        TypeIndex::Int256,
118
        TypeIndex::UInt256,
119
        TypeIndex::Float32,
120
        TypeIndex::Float64,
121
        TypeIndex::Date,
122
        TypeIndex::Date32,
123
        TypeIndex::DateTime,
124
        TypeIndex::String,
125
        TypeIndex::FixedString,
126
        TypeIndex::IPv4,
127
        TypeIndex::IPv6,
128
        TypeIndex::UUID,
129
    };
130

131
    template <bool allow_complex_types>
132
    constexpr auto getAllTypes()
133
    {
134
        constexpr size_t complex_types_size = complex_types.size() * allow_complex_types;
135
        constexpr size_t result_size = simple_types.size() + complex_types_size;
136
        std::array<TypeIndex, result_size> result;
137
        size_t index = 0;
138

139
        for (size_t i = 0; i != simple_types.size(); ++i, ++index)
140
            result[index] = simple_types[i];
141

142
        for (size_t i = 0; i != complex_types_size; ++i, ++index)
143
            result[index] = complex_types[i];
144

145
        return result;
146
    }
147

148
    /// Draws a random column count in the range [1, MAX_NUMBER_OF_COLUMNS].
    size_t generateNumberOfColumns(pcg64 & rng)
    {
        const auto zero_based_count = rng() % MAX_NUMBER_OF_COLUMNS;
        return zero_based_count + 1;
    }
152

153
    /// Writes the type nested inside LowCardinality(...), randomly wrapped in Nullable(...).
    /// With `allow_suspicious_lc_types` the nested type is drawn from `suspicious_lc_types`;
    /// otherwise only String and FixedString(N) are generated.
    void writeLowCardinalityNestedType(pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types)
    {
        const bool wrap_in_nullable = rng() % 2;
        if (wrap_in_nullable)
            writeCString("Nullable(", buf);

        /// FixedString needs a length argument; it is drawn at the moment of writing.
        const auto write_fixed_string = [&]
        {
            writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf);
        };

        if (!allow_suspicious_lc_types)
        {
            /// Support only String and FixedString.
            if (rng() % 2)
                writeCString("String", buf);
            else
                write_fixed_string();
        }
        else
        {
            const TypeIndex nested = suspicious_lc_types[rng() % suspicious_lc_types.size()];
            if (nested == TypeIndex::FixedString)
                write_fixed_string();
            else
                writeString(magic_enum::enum_name<TypeIndex>(nested), buf);
        }

        if (wrap_in_nullable)
            writeChar(')', buf);
    }
180

181
    void writeEnumValues(const String & column_name, pcg64 & rng, WriteBuffer & buf, ssize_t max_value)
182
    {
183
        /// Don't generate big enums, because it will lead to really big result
184
        /// and slowness of this function, and it can lead to `Max query size exceeded`
185
        /// while using this function with generateRandom.
186
        size_t num_values = rng() % 16 + 1;
187
        std::vector<Int16> values(num_values);
188

189
        /// Generate random numbers from range [-(max_value + 1), max_value - num_values + 1].
190
        for (Int16 & x : values)
191
            x = rng() % (2 * max_value + 3 - num_values) - max_value - 1;
192
        /// Make all numbers unique.
193
        std::sort(values.begin(), values.end());
194
        for (size_t i = 0; i < num_values; ++i)
195
            values[i] += i;
196
        std::shuffle(values.begin(), values.end(), rng);
197
        for (size_t i = 0; i != num_values; ++i)
198
        {
199
            if (i != 0)
200
                writeCString(", ", buf);
201
            writeString("'" + column_name + "V" + std::to_string(i) + "' = " + std::to_string(values[i]), buf);
202
        }
203
    }
204

205
    void writeMapKeyType(const String & column_name, pcg64 & rng, WriteBuffer & buf)
206
    {
207
        TypeIndex type = map_key_types[rng() % map_key_types.size()];
208
        switch (type)
209
        {
210
            case TypeIndex::FixedString:
211
                writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf);
212
                break;
213
            case TypeIndex::LowCardinality:
214
                writeCString("LowCardinality(", buf);
215
                /// Map key supports only String and FixedString inside LowCardinality.
216
                if (rng() % 2)
217
                    writeCString("String", buf);
218
                else
219
                    writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf);
220
                writeChar(')', buf);
221
                break;
222
            case TypeIndex::Enum8:
223
                writeCString("Enum8(", buf);
224
                writeEnumValues(column_name, rng, buf, INT8_MAX);
225
                writeChar(')', buf);
226
                break;
227
            case TypeIndex::Enum16:
228
                writeCString("Enum16(", buf);
229
                writeEnumValues(column_name, rng, buf, INT16_MAX);
230
                writeChar(')', buf);
231
                break;
232
            default:
233
                writeString(magic_enum::enum_name<TypeIndex>(type), buf);
234
                break;
235
        }
236
    }
237

238
    template <bool allow_complex_types = true>
239
    void writeRandomType(const String & column_name, pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types, size_t depth = 0)
240
    {
241
        if (allow_complex_types && depth > MAX_DEPTH)
242
            writeRandomType<false>(column_name, rng, buf, depth);
243

244
        constexpr auto all_types = getAllTypes<allow_complex_types>();
245
        auto type = all_types[rng() % all_types.size()];
246

247
        switch (type)
248
        {
249
            case TypeIndex::UInt8:
250
                if (rng() % 2)
251
                    writeCString("UInt8", buf);
252
                else
253
                    writeCString("Bool", buf);
254
                return;
255
            case TypeIndex::FixedString:
256
                writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf);
257
                return;
258
            case TypeIndex::DateTime64:
259
                writeString("DateTime64(" + std::to_string(rng() % MAX_DATETIME64_PRECISION + 1) + ")", buf);
260
                return;
261
            case TypeIndex::Decimal32:
262
                writeString("Decimal32(" + std::to_string(rng() % MAX_DECIMAL32_PRECISION + 1) + ")", buf);
263
                return;
264
            case TypeIndex::Decimal64:
265
                writeString("Decimal64(" + std::to_string(rng() % MAX_DECIMAL64_PRECISION + 1) + ")", buf);
266
                return;
267
            case TypeIndex::Decimal128:
268
                writeString("Decimal128(" + std::to_string(rng() % MAX_DECIMAL128_PRECISION + 1) + ")", buf);
269
                return;
270
            case TypeIndex::Decimal256:
271
                writeString("Decimal256(" + std::to_string(rng() % MAX_DECIMAL256_PRECISION + 1) + ")", buf);
272
                return;
273
            case TypeIndex::Enum8:
274
                writeCString("Enum8(", buf);
275
                writeEnumValues(column_name, rng, buf, INT8_MAX);
276
                writeChar(')', buf);
277
                return;
278
            case TypeIndex::Enum16:
279
                writeCString("Enum16(", buf);
280
                writeEnumValues(column_name, rng, buf, INT16_MAX);
281
                writeChar(')', buf);
282
                return;
283
            case TypeIndex::LowCardinality:
284
                writeCString("LowCardinality(", buf);
285
                writeLowCardinalityNestedType(rng, buf, allow_suspicious_lc_types);
286
                writeChar(')', buf);
287
                return;
288
            case TypeIndex::Nullable:
289
            {
290
                writeCString("Nullable(", buf);
291
                writeRandomType<false>(column_name, rng, buf, allow_suspicious_lc_types, depth + 1);
292
                writeChar(')', buf);
293
                return;
294
            }
295
            case TypeIndex::Array:
296
            {
297
                writeCString("Array(", buf);
298
                writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1);
299
                writeChar(')', buf);
300
                return;
301
            }
302
            case TypeIndex::Map:
303
            {
304
                writeCString("Map(", buf);
305
                writeMapKeyType(column_name, rng, buf);
306
                writeCString(", ", buf);
307
                writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1);
308
                writeChar(')', buf);
309
                return;
310
            }
311
            case TypeIndex::Tuple:
312
            {
313
                size_t elements = rng() % MAX_TUPLE_ELEMENTS + 1;
314
                bool generate_nested = rng() % 2;
315
                bool generate_named_tuple = rng() % 2;
316
                if (generate_nested)
317
                    writeCString("Nested(", buf);
318
                else
319
                    writeCString("Tuple(", buf);
320

321
                for (size_t i = 0; i != elements; ++i)
322
                {
323
                    if (i != 0)
324
                        writeCString(", ", buf);
325

326
                    String element_name = "e" + std::to_string(i + 1);
327
                    if (generate_named_tuple || generate_nested)
328
                    {
329
                        writeString(element_name, buf);
330
                        writeChar(' ', buf);
331
                    }
332
                    writeRandomType(element_name, rng, buf, allow_suspicious_lc_types, depth + 1);
333
                }
334
                writeChar(')', buf);
335
                return;
336
            }
337
            default:
338
                writeString(magic_enum::enum_name<TypeIndex>(type), buf);
339
                return;
340
        }
341
    }
342

343
    /// Writes `number_of_columns` comma-separated column definitions of the form `c<N> <random type>`,
    /// with N starting from 1.
    void writeRandomStructure(pcg64 & rng, size_t number_of_columns, WriteBuffer & buf, bool allow_suspicious_lc_types)
    {
        size_t written = 0;
        while (written != number_of_columns)
        {
            if (written != 0)
                writeCString(", ", buf);

            ++written;
            const String column_name = "c" + std::to_string(written);
            writeString(column_name, buf);
            writeChar(' ', buf);
            writeRandomType(column_name, rng, buf, allow_suspicious_lc_types);
        }
    }
355
}
356

357
/// Validates the argument list and returns the result type, which is always String.
/// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH if more than two arguments are passed and
/// ILLEGAL_TYPE_OF_ARGUMENT if an argument is neither an unsigned integer nor Null.
DataTypePtr FunctionGenerateRandomStructure::getReturnTypeImpl(const DataTypes & arguments) const
{
    const size_t passed = arguments.size();
    if (passed > 2)
        throw Exception(
            ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
            "Number of arguments for function {} doesn't match: passed {}, expected from 0 to 2",
            getName(), passed);

    /// 1-based position of the argument, used in the error message.
    size_t position = 0;
    for (const auto & argument : arguments)
    {
        ++position;
        if (isUInt(argument) || argument->onlyNull())
            continue;

        throw Exception(
            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
            "Illegal type {} of the {} argument of function {}, expected unsigned integer or Null",
            argument->getName(),
            position,
            getName());
    }

    return std::make_shared<DataTypeString>();
}
381

382
/// Generates one random structure and returns it as a constant String column of `input_rows_count` rows.
/// Argument 0 (optional) is the number of columns, argument 1 (optional) is the seed; an absent
/// or Null-only argument means "choose randomly". Only the first row of each argument is read.
/// Throws BAD_ARGUMENTS if the requested number of columns exceeds MAX_NUMBER_OF_COLUMNS.
ColumnPtr FunctionGenerateRandomStructure::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const
{
    size_t seed = randomSeed();
    size_t number_of_columns = 0;

    /// An argument counts as provided only if it is present and its column is not Null-only.
    const auto is_provided = [&arguments](size_t position)
    {
        return arguments.size() > position && !arguments[position].column->onlyNull();
    };

    if (is_provided(0))
    {
        number_of_columns = arguments[0].column->getUInt(0);
        if (number_of_columns > MAX_NUMBER_OF_COLUMNS)
            throw Exception(
                ErrorCodes::BAD_ARGUMENTS,
                "Maximum allowed number of columns is {}, got {}",
                MAX_NUMBER_OF_COLUMNS,
                number_of_columns);
    }

    if (is_provided(1))
        seed = arguments[1].column->getUInt(0);

    pcg64 rng(seed);

    /// Zero means the number of columns was not specified (or was given as 0): pick it randomly.
    if (number_of_columns == 0)
        number_of_columns = generateNumberOfColumns(rng);

    auto col_res = ColumnString::create();
    auto & chars = col_res->getChars();

    /// The structure is written directly into the character storage of the result column.
    WriteBufferFromVector buf(chars);
    writeRandomStructure(rng, number_of_columns, buf, allow_suspicious_lc_types);
    buf.finalize();

    /// Append a zero byte after the text and record the end offset of the single row.
    chars.push_back(0);
    col_res->getOffsets().push_back(chars.size());

    return ColumnConst::create(std::move(col_res), input_rows_count);
}
415

416
/// Generates a random structure with a random number of columns from the given seed and returns it as a string.
/// Whether suspicious LowCardinality types may be generated is taken from the context settings.
String FunctionGenerateRandomStructure::generateRandomStructure(size_t seed, const ContextPtr & context)
{
    pcg64 rng(seed);

    /// The number of columns is drawn first, so it is determined by the seed as well.
    const size_t column_count = generateNumberOfColumns(rng);
    const bool allow_suspicious_lc_types = context->getSettingsRef().allow_suspicious_low_cardinality_types;

    WriteBufferFromOwnString out;
    writeRandomStructure(rng, column_count, out, allow_suspicious_lc_types);
    return out.str();
}
424

425
/// Registers generateRandomStructure in the function factory (case-sensitive name) together with its documentation.
REGISTER_FUNCTION(GenerateRandomStructure)
{
    FunctionDocumentation documentation
    {
        .description = R"(
Generates a random table structure.
This function takes 2 optional constant arguments:
the number of columns in the result structure (random by default) and random seed (random by default)
The maximum number of columns is 128.
The function returns a value of type String.
)",
        .examples
        {
            {"random", "SELECT generateRandomStructure()", "c1 UInt32, c2 FixedString(25)"},
            {"with specified number of columns", "SELECT generateRandomStructure(3)", "c1 String, c2 Array(Int32), c3 LowCardinality(String)"},
            {"with specified seed", "SELECT generateRandomStructure(1, 42)", "c1 UInt128"},
        },
        .categories{"Random"}
    };

    factory.registerFunction<FunctionGenerateRandomStructure>(std::move(documentation), FunctionFactory::CaseSensitive);
}
445

446
}
447

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.