ClickHouse

Форк
0
/
soundex.cpp 
119 строк · 3.5 Кб
1
#include <cctype>
2

3
#include <Functions/FunctionFactory.h>
4
#include <Functions/FunctionStringToString.h>
5
#include <Common/StringUtils/StringUtils.h>
6

7

8
namespace DB
9
{
10

11
namespace ErrorCodes
12
{
13
    extern const int ILLEGAL_COLUMN;
14
}
15
/** Soundex algorithm, https://en.wikipedia.org/wiki/Soundex
  * Implemented similarly to most SQL dialects:
  * 1. Save the first letter. Map all occurrences of a, e, i, o, u, y, h, w to zero (0).
  * 2. Replace all consonants (including the first letter) with digits as follows:
  *  - b, f, p, v → 1
  *  - c, g, j, k, q, s, x, z → 2
  *  - d, t → 3
  *  - l → 4
  *  - m, n → 5
  *  - r → 6
  * 3. Replace all adjacent identical digits with a single digit, and then remove all the zero (0) digits.
  * 4. If the saved letter's digit is the same as the resulting first digit, remove the digit (keep the letter).
  * 5. Append zeros if the result contains fewer than 3 digits. Keep only the first letter and the 3 digits after it.
  */
29

30
struct SoundexImpl
31
{
32
    static constexpr auto length = 4z;
33
    static constexpr auto soundex_map = "01230120022455012623010202";
34

35
    static void calculate(const char * value, size_t value_length, char * out)
36
    {
37
        const char * cur = value;
38
        const char * const end = value + value_length;
39
        char * const out_end = out + length;
40

41
        while (cur < end && !isAlphaASCII(*cur))
42
            ++cur;
43

44
        char prev_code = '0';
45
        if (cur < end)
46
        {
47
            *out = toUpperIfAlphaASCII(*cur);
48
            ++out;
49
            prev_code = soundex_map[toUpperIfAlphaASCII(*cur) - 'A'];
50
            ++cur;
51
        }
52

53
        while (cur < end && !isAlphaASCII(*cur))
54
            ++cur;
55

56
        while (cur < end && out < out_end)
57
        {
58
            char current_code = soundex_map[toUpperIfAlphaASCII(*cur) - 'A'];
59
            if ((current_code != '0') && (current_code != prev_code))
60
            {
61
                *out = current_code;
62
                ++out;
63
            }
64
            prev_code = current_code;
65
            ++cur;
66

67
            while (cur < end && !isAlphaASCII(*cur))
68
                ++cur;
69
        }
70

71
        while (out < out_end)
72
        {
73
            *out = '0';
74
            ++out;
75
        }
76
    }
77

78
    static void vector(
79
        const ColumnString::Chars & data,
80
        const ColumnString::Offsets & offsets,
81
        ColumnString::Chars & res_data,
82
        ColumnString::Offsets & res_offsets)
83
    {
84
        const size_t size = offsets.size();
85
        res_data.resize(size * (length + 1));
86
        res_offsets.resize(size);
87

88
        size_t prev_offset = 0;
89
        for (size_t i = 0; i < size; ++i)
90
        {
91
            const char * value = reinterpret_cast<const char *>(&data[prev_offset]);
92
            const size_t value_length = offsets[i] - prev_offset - 1;
93
            const size_t out_index = i * (length + 1);
94
            calculate(value, value_length, reinterpret_cast<char *>(&res_data[out_index]));
95
            res_data[out_index + length] = '\0';
96
            res_offsets[i] = (out_index + length + 1);
97
            prev_offset = offsets[i];
98
        }
99
    }
100

101
    [[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
102
    {
103
        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported by soundex function");
104
    }
105
};
106

107
/// Carries the SQL-visible name of the function.
struct NameSoundex
{
    static constexpr const char * name = "soundex";
};
111

112
/// Registers `soundex` in the function factory; the name is matched case-insensitively.
REGISTER_FUNCTION(Soundex)
{
    using FunctionSoundex = FunctionStringToString<SoundexImpl, NameSoundex>;

    factory.registerFunction<FunctionSoundex>(
        FunctionDocumentation{.description="Returns Soundex code of a string."},
        FunctionFactory::CaseInsensitive);
}
117

118

119
}
120

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.