ClickHouse
446 строк · 15.6 Кб
1#include <Functions/FunctionGenerateRandomStructure.h>2#include <Functions/FunctionFactory.h>3#include <Functions/FunctionHelpers.h>4#include <Functions/IFunction.h>5#include <Columns/ColumnString.h>6#include <DataTypes/DataTypeString.h>7#include <DataTypes/DataTypeFixedString.h>8#include <Interpreters/Context.h>9#include <Common/randomSeed.h>10#include <Common/FunctionDocumentation.h>11#include <IO/WriteHelpers.h>12#include <IO/WriteBufferFromVector.h>13
14#include <pcg_random.hpp>15
16namespace DB17{
18
/// Error codes thrown by this file; they are only declared here.
namespace ErrorCodes
{
    /// Thrown from executeImpl when more columns are requested than allowed.
    extern const int BAD_ARGUMENTS;
    /// Thrown from getReturnTypeImpl when an argument is neither an unsigned integer nor Null.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    /// Thrown from getReturnTypeImpl when more than two arguments are passed.
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
25
26namespace
27{
/// Limits that keep the generated structure reasonably small.

/// Largest number of columns that can be generated or requested.
constexpr size_t MAX_NUMBER_OF_COLUMNS = 128;
/// Largest number of elements written into a single Tuple / Nested.
constexpr size_t MAX_TUPLE_ELEMENTS = 16;
/// Largest argument generated for DateTime64(...).
constexpr size_t MAX_DATETIME64_PRECISION = 9;
/// Largest argument generated for Decimal32(...), Decimal64(...), Decimal128(...) and Decimal256(...).
constexpr size_t MAX_DECIMAL32_PRECISION = 9;
constexpr size_t MAX_DECIMAL64_PRECISION = 18;
constexpr size_t MAX_DECIMAL128_PRECISION = 38;
constexpr size_t MAX_DECIMAL256_PRECISION = 76;
/// Nesting depth after which only simple types are generated.
constexpr size_t MAX_DEPTH = 16;

37constexpr std::array<TypeIndex, 29> simple_types38{39TypeIndex::Int8,40TypeIndex::UInt8,41TypeIndex::Int16,42TypeIndex::UInt16,43TypeIndex::Int32,44TypeIndex::UInt32,45TypeIndex::Int64,46TypeIndex::UInt64,47TypeIndex::Int128,48TypeIndex::UInt128,49TypeIndex::Int256,50TypeIndex::UInt256,51TypeIndex::Float32,52TypeIndex::Float64,53TypeIndex::Decimal32,54TypeIndex::Decimal64,55TypeIndex::Decimal128,56TypeIndex::Decimal256,57TypeIndex::Date,58TypeIndex::Date32,59TypeIndex::DateTime,60TypeIndex::DateTime64,61TypeIndex::String,62TypeIndex::FixedString,63TypeIndex::Enum8,64TypeIndex::Enum16,65TypeIndex::IPv4,66TypeIndex::IPv6,67TypeIndex::UUID,68};69
70constexpr std::array<TypeIndex, 5> complex_types71{72TypeIndex::Nullable,73TypeIndex::LowCardinality,74TypeIndex::Array,75TypeIndex::Tuple,76TypeIndex::Map,77};78
79constexpr std::array<TypeIndex, 22> map_key_types80{81TypeIndex::Int8,82TypeIndex::UInt8,83TypeIndex::Int16,84TypeIndex::UInt16,85TypeIndex::Int32,86TypeIndex::UInt32,87TypeIndex::Int64,88TypeIndex::UInt64,89TypeIndex::Int128,90TypeIndex::UInt128,91TypeIndex::Int256,92TypeIndex::UInt256,93TypeIndex::Date,94TypeIndex::Date32,95TypeIndex::DateTime,96TypeIndex::String,97TypeIndex::FixedString,98TypeIndex::IPv4,99TypeIndex::Enum8,100TypeIndex::Enum16,101TypeIndex::UUID,102TypeIndex::LowCardinality,103};104
105constexpr std::array<TypeIndex, 22> suspicious_lc_types106{107TypeIndex::Int8,108TypeIndex::UInt8,109TypeIndex::Int16,110TypeIndex::UInt16,111TypeIndex::Int32,112TypeIndex::UInt32,113TypeIndex::Int64,114TypeIndex::UInt64,115TypeIndex::Int128,116TypeIndex::UInt128,117TypeIndex::Int256,118TypeIndex::UInt256,119TypeIndex::Float32,120TypeIndex::Float64,121TypeIndex::Date,122TypeIndex::Date32,123TypeIndex::DateTime,124TypeIndex::String,125TypeIndex::FixedString,126TypeIndex::IPv4,127TypeIndex::IPv6,128TypeIndex::UUID,129};130
131template <bool allow_complex_types>132constexpr auto getAllTypes()133{134constexpr size_t complex_types_size = complex_types.size() * allow_complex_types;135constexpr size_t result_size = simple_types.size() + complex_types_size;136std::array<TypeIndex, result_size> result;137size_t index = 0;138
139for (size_t i = 0; i != simple_types.size(); ++i, ++index)140result[index] = simple_types[i];141
142for (size_t i = 0; i != complex_types_size; ++i, ++index)143result[index] = complex_types[i];144
145return result;146}147
148size_t generateNumberOfColumns(pcg64 & rng)149{150return rng() % MAX_NUMBER_OF_COLUMNS + 1;151}152
153void writeLowCardinalityNestedType(pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types)154{155bool make_nullable = rng() % 2;156if (make_nullable)157writeCString("Nullable(", buf);158
159if (allow_suspicious_lc_types)160{161TypeIndex type = suspicious_lc_types[rng() % suspicious_lc_types.size()];162
163if (type == TypeIndex::FixedString)164writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf);165else166writeString(magic_enum::enum_name<TypeIndex>(type), buf);167}168else169{170/// Support only String and FixedString.171if (rng() % 2)172writeCString("String", buf);173else174writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf);175}176
177if (make_nullable)178writeChar(')', buf);179}180
181void writeEnumValues(const String & column_name, pcg64 & rng, WriteBuffer & buf, ssize_t max_value)182{183/// Don't generate big enums, because it will lead to really big result184/// and slowness of this function, and it can lead to `Max query size exceeded`185/// while using this function with generateRandom.186size_t num_values = rng() % 16 + 1;187std::vector<Int16> values(num_values);188
189/// Generate random numbers from range [-(max_value + 1), max_value - num_values + 1].190for (Int16 & x : values)191x = rng() % (2 * max_value + 3 - num_values) - max_value - 1;192/// Make all numbers unique.193std::sort(values.begin(), values.end());194for (size_t i = 0; i < num_values; ++i)195values[i] += i;196std::shuffle(values.begin(), values.end(), rng);197for (size_t i = 0; i != num_values; ++i)198{199if (i != 0)200writeCString(", ", buf);201writeString("'" + column_name + "V" + std::to_string(i) + "' = " + std::to_string(values[i]), buf);202}203}204
205void writeMapKeyType(const String & column_name, pcg64 & rng, WriteBuffer & buf)206{207TypeIndex type = map_key_types[rng() % map_key_types.size()];208switch (type)209{210case TypeIndex::FixedString:211writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf);212break;213case TypeIndex::LowCardinality:214writeCString("LowCardinality(", buf);215/// Map key supports only String and FixedString inside LowCardinality.216if (rng() % 2)217writeCString("String", buf);218else219writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf);220writeChar(')', buf);221break;222case TypeIndex::Enum8:223writeCString("Enum8(", buf);224writeEnumValues(column_name, rng, buf, INT8_MAX);225writeChar(')', buf);226break;227case TypeIndex::Enum16:228writeCString("Enum16(", buf);229writeEnumValues(column_name, rng, buf, INT16_MAX);230writeChar(')', buf);231break;232default:233writeString(magic_enum::enum_name<TypeIndex>(type), buf);234break;235}236}237
238template <bool allow_complex_types = true>239void writeRandomType(const String & column_name, pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types, size_t depth = 0)240{241if (allow_complex_types && depth > MAX_DEPTH)242writeRandomType<false>(column_name, rng, buf, depth);243
244constexpr auto all_types = getAllTypes<allow_complex_types>();245auto type = all_types[rng() % all_types.size()];246
247switch (type)248{249case TypeIndex::UInt8:250if (rng() % 2)251writeCString("UInt8", buf);252else253writeCString("Bool", buf);254return;255case TypeIndex::FixedString:256writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf);257return;258case TypeIndex::DateTime64:259writeString("DateTime64(" + std::to_string(rng() % MAX_DATETIME64_PRECISION + 1) + ")", buf);260return;261case TypeIndex::Decimal32:262writeString("Decimal32(" + std::to_string(rng() % MAX_DECIMAL32_PRECISION + 1) + ")", buf);263return;264case TypeIndex::Decimal64:265writeString("Decimal64(" + std::to_string(rng() % MAX_DECIMAL64_PRECISION + 1) + ")", buf);266return;267case TypeIndex::Decimal128:268writeString("Decimal128(" + std::to_string(rng() % MAX_DECIMAL128_PRECISION + 1) + ")", buf);269return;270case TypeIndex::Decimal256:271writeString("Decimal256(" + std::to_string(rng() % MAX_DECIMAL256_PRECISION + 1) + ")", buf);272return;273case TypeIndex::Enum8:274writeCString("Enum8(", buf);275writeEnumValues(column_name, rng, buf, INT8_MAX);276writeChar(')', buf);277return;278case TypeIndex::Enum16:279writeCString("Enum16(", buf);280writeEnumValues(column_name, rng, buf, INT16_MAX);281writeChar(')', buf);282return;283case TypeIndex::LowCardinality:284writeCString("LowCardinality(", buf);285writeLowCardinalityNestedType(rng, buf, allow_suspicious_lc_types);286writeChar(')', buf);287return;288case TypeIndex::Nullable:289{290writeCString("Nullable(", buf);291writeRandomType<false>(column_name, rng, buf, allow_suspicious_lc_types, depth + 1);292writeChar(')', buf);293return;294}295case TypeIndex::Array:296{297writeCString("Array(", buf);298writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1);299writeChar(')', buf);300return;301}302case TypeIndex::Map:303{304writeCString("Map(", buf);305writeMapKeyType(column_name, rng, buf);306writeCString(", ", buf);307writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 
1);308writeChar(')', buf);309return;310}311case TypeIndex::Tuple:312{313size_t elements = rng() % MAX_TUPLE_ELEMENTS + 1;314bool generate_nested = rng() % 2;315bool generate_named_tuple = rng() % 2;316if (generate_nested)317writeCString("Nested(", buf);318else319writeCString("Tuple(", buf);320
321for (size_t i = 0; i != elements; ++i)322{323if (i != 0)324writeCString(", ", buf);325
326String element_name = "e" + std::to_string(i + 1);327if (generate_named_tuple || generate_nested)328{329writeString(element_name, buf);330writeChar(' ', buf);331}332writeRandomType(element_name, rng, buf, allow_suspicious_lc_types, depth + 1);333}334writeChar(')', buf);335return;336}337default:338writeString(magic_enum::enum_name<TypeIndex>(type), buf);339return;340}341}342
343void writeRandomStructure(pcg64 & rng, size_t number_of_columns, WriteBuffer & buf, bool allow_suspicious_lc_types)344{345for (size_t i = 0; i != number_of_columns; ++i)346{347if (i != 0)348writeCString(", ", buf);349String column_name = "c" + std::to_string(i + 1);350writeString(column_name, buf);351writeChar(' ', buf);352writeRandomType(column_name, rng, buf, allow_suspicious_lc_types);353}354}355}
356
357DataTypePtr FunctionGenerateRandomStructure::getReturnTypeImpl(const DataTypes & arguments) const358{
359if (arguments.size() > 2)360throw Exception(361ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,362"Number of arguments for function {} doesn't match: passed {}, expected from 0 to 2",363getName(), arguments.size());364
365
366for (size_t i = 0; i != arguments.size(); ++i)367{368if (!isUInt(arguments[i]) && !arguments[i]->onlyNull())369{370throw Exception(371ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,372"Illegal type {} of the {} argument of function {}, expected unsigned integer or Null",373arguments[i]->getName(),374i + 1,375getName());376}377}378
379return std::make_shared<DataTypeString>();380}
381
382ColumnPtr FunctionGenerateRandomStructure::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const383{
384size_t seed = randomSeed();385size_t number_of_columns = 0;386
387if (!arguments.empty() && !arguments[0].column->onlyNull())388{389number_of_columns = arguments[0].column->getUInt(0);390if (number_of_columns > MAX_NUMBER_OF_COLUMNS)391throw Exception(392ErrorCodes::BAD_ARGUMENTS,393"Maximum allowed number of columns is {}, got {}",394MAX_NUMBER_OF_COLUMNS,395number_of_columns);396}397
398if (arguments.size() > 1 && !arguments[1].column->onlyNull())399seed = arguments[1].column->getUInt(0);400
401pcg64 rng(seed);402if (number_of_columns == 0)403number_of_columns = generateNumberOfColumns(rng);404
405auto col_res = ColumnString::create();406auto & string_column = assert_cast<ColumnString &>(*col_res);407auto & chars = string_column.getChars();408WriteBufferFromVector buf(chars);409writeRandomStructure(rng, number_of_columns, buf, allow_suspicious_lc_types);410buf.finalize();411chars.push_back(0);412string_column.getOffsets().push_back(chars.size());413return ColumnConst::create(std::move(col_res), input_rows_count);414}
415
416String FunctionGenerateRandomStructure::generateRandomStructure(size_t seed, const ContextPtr & context)417{
418pcg64 rng(seed);419size_t number_of_columns = generateNumberOfColumns(rng);420WriteBufferFromOwnString buf;421writeRandomStructure(rng, number_of_columns, buf, context->getSettingsRef().allow_suspicious_low_cardinality_types);422return buf.str();423}
424
425REGISTER_FUNCTION(GenerateRandomStructure)426{
427factory.registerFunction<FunctionGenerateRandomStructure>(FunctionDocumentation428{429.description=R"(430Generates a random table structure.
431This function takes 2 optional constant arguments:
432the number of columns in the result structure (random by default) and random seed (random by default)
433The maximum number of columns is 128.
434The function returns a value of type String.
435)",436.examples{437{"random", "SELECT generateRandomStructure()", "c1 UInt32, c2 FixedString(25)"},438{"with specified number of columns", "SELECT generateRandomStructure(3)", "c1 String, c2 Array(Int32), c3 LowCardinality(String)"},439{"with specified seed", "SELECT generateRandomStructure(1, 42)", "c1 UInt128"},440},441.categories{"Random"}442},443FunctionFactory::CaseSensitive);444}
445
446}
447