ClickHouse
119 строк · 3.5 Кб
1#include <cctype>2
3#include <Functions/FunctionFactory.h>4#include <Functions/FunctionStringToString.h>5#include <Common/StringUtils/StringUtils.h>6
7
8namespace DB9{
10
/// Error codes used in this file; only declared here.
namespace ErrorCodes
{
    extern const int ILLEGAL_COLUMN;
}
/** Soundex algorithm, https://en.wikipedia.org/wiki/Soundex
  * Implemented similarly to most SQL dialects:
  * 1. Save the first letter. Map all occurrences of a, e, i, o, u, y, h, w to zero (0).
  * 2. Replace all consonants (including the first letter) with digits as follows:
  *  - b, f, p, v → 1
  *  - c, g, j, k, q, s, x, z → 2
  *  - d, t → 3
  *  - l → 4
  *  - m, n → 5
  *  - r → 6
  * 3. Replace all adjacent identical digits with a single digit, and then remove all the zero (0) digits.
  * 4. If the saved letter's digit is the same as the resulting first digit, remove the digit (keep the letter).
  * 5. Append 3 zeros if the result contains fewer than 3 digits. Remove everything except the first letter and the 3 digits after it.
  */
29
30struct SoundexImpl31{
32static constexpr auto length = 4z;33static constexpr auto soundex_map = "01230120022455012623010202";34
35static void calculate(const char * value, size_t value_length, char * out)36{37const char * cur = value;38const char * const end = value + value_length;39char * const out_end = out + length;40
41while (cur < end && !isAlphaASCII(*cur))42++cur;43
44char prev_code = '0';45if (cur < end)46{47*out = toUpperIfAlphaASCII(*cur);48++out;49prev_code = soundex_map[toUpperIfAlphaASCII(*cur) - 'A'];50++cur;51}52
53while (cur < end && !isAlphaASCII(*cur))54++cur;55
56while (cur < end && out < out_end)57{58char current_code = soundex_map[toUpperIfAlphaASCII(*cur) - 'A'];59if ((current_code != '0') && (current_code != prev_code))60{61*out = current_code;62++out;63}64prev_code = current_code;65++cur;66
67while (cur < end && !isAlphaASCII(*cur))68++cur;69}70
71while (out < out_end)72{73*out = '0';74++out;75}76}77
78static void vector(79const ColumnString::Chars & data,80const ColumnString::Offsets & offsets,81ColumnString::Chars & res_data,82ColumnString::Offsets & res_offsets)83{84const size_t size = offsets.size();85res_data.resize(size * (length + 1));86res_offsets.resize(size);87
88size_t prev_offset = 0;89for (size_t i = 0; i < size; ++i)90{91const char * value = reinterpret_cast<const char *>(&data[prev_offset]);92const size_t value_length = offsets[i] - prev_offset - 1;93const size_t out_index = i * (length + 1);94calculate(value, value_length, reinterpret_cast<char *>(&res_data[out_index]));95res_data[out_index + length] = '\0';96res_offsets[i] = (out_index + length + 1);97prev_offset = offsets[i];98}99}100
101[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)102{103throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported by soundex function");104}105};106
/// Name tag passed to FunctionStringToString; carries the function name.
struct NameSoundex
{
    static constexpr const char * name = "soundex";
};
112REGISTER_FUNCTION(Soundex)113{
114factory.registerFunction<FunctionStringToString<SoundexImpl, NameSoundex>>(115FunctionDocumentation{.description="Returns Soundex code of a string."}, FunctionFactory::CaseInsensitive);116}
117
118
119}
120