ClickHouse

FunctionsStringSimilarity.cpp
569 строк · 22.8 Кб
Перенос по словам
1
#include <Functions/FunctionsStringSimilarity.h>
2
#include <Functions/FunctionFactory.h>
3
#include <Functions/FunctionsHashing.h>
4
#include <Common/HashTable/ClearableHashMap.h>
5
#include <Common/HashTable/Hash.h>
6
#include <Common/MemorySanitizer.h>
7
#include <Common/UTF8Helpers.h>
8

9
#include <Core/Defines.h>
10

11
#include <base/unaligned.h>
12

13
#include <algorithm>
14
#include <climits>
15
#include <cstring>
16
#include <limits>
17
#include <memory>
18
#include <utility>
19

20
#ifdef __SSE4_2__
21
#    include <nmmintrin.h>
22
#endif
23

24
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
25
#    include <arm_acle.h>
26
#endif
27

28
#if (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
29
#include "vec_crc32.h"
30
#endif
31

32
namespace DB
33
{
34
/** Distance function implementation.
35
  * We calculate all the n-grams from left string and count by the index of
36
  * 16 bits hash of them in the map.
37
  * Then calculate all the n-grams from the right string and calculate
38
  * the n-gram distance on the flight by adding and subtracting from the hashmap.
39
  * Then return the map into the condition of which it was after the left string
40
  * calculation. If the right string size is big (more than 2**15 bytes),
41
  * the strings are not similar at all and we return 1.
42
  */
43
template <size_t N, class CodePoint, bool UTF8, bool case_insensitive, bool symmetric>
44
struct NgramDistanceImpl
45
{
46
    using ResultType = Float32;
47

48
    /// map_size for ngram difference.
49
    static constexpr size_t map_size = 1u << 16;
50

51
    /// If the haystack size is bigger than this, behaviour is unspecified for this function.
52
    static constexpr size_t max_string_size = 1u << 15;
53

54
    /// Default padding to read safely.
55
    static constexpr size_t default_padding = 16;
56

57
    /// Max codepoints to store at once. 16 is for batching usage and PODArray has this padding.
58
    static constexpr size_t simultaneously_codepoints_num = default_padding + N - 1;
59

60
    /** map_size of this fits mostly in L2 cache all the time.
61
      * Actually use UInt16 as addings and subtractions do not UB overflow. But think of it as a signed
62
      * integer array.
63
      */
64
    using NgramCount = UInt16;
65

66
    static ALWAYS_INLINE UInt16 calculateASCIIHash(const CodePoint * code_points)
67
    {
68
        return intHashCRC32(unalignedLoad<UInt32>(code_points)) & 0xFFFFu;
69
    }
70

71
    static ALWAYS_INLINE UInt16 calculateUTF8Hash(const CodePoint * code_points)
72
    {
73
        UInt64 combined = (static_cast<UInt64>(code_points[0]) << 32) | code_points[1];
74
#ifdef __SSE4_2__
75
        return _mm_crc32_u64(code_points[2], combined) & 0xFFFFu;
76
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
77
        return __crc32cd(code_points[2], combined) & 0xFFFFu;
78
#elif (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
79
        return crc32_ppc(code_points[2], reinterpret_cast<const unsigned char *>(&combined), sizeof(combined)) & 0xFFFFu;
80
#elif defined(__s390x__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
81
        return s390x_crc32c(code_points[2], combined) & 0xFFFFu;
82
#else
83
        return (intHashCRC32(combined) ^ intHashCRC32(code_points[2])) & 0xFFFFu;
84
#endif
85
    }
86

87
    template <size_t Offset, class Container, size_t... I>
88
    static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence<I...> &)
89
    {
90
        ((cont[Offset + I] = std::tolower(cont[Offset + I])), ...);
91
    }
92

93
    static ALWAYS_INLINE size_t readASCIICodePoints(CodePoint * code_points, const char *& pos, const char * end)
94
    {
95
        /// Offset before which we copy some data.
96
        constexpr size_t padding_offset = default_padding - N + 1;
97
        /// We have an array like this for ASCII (N == 4, other cases are similar)
98
        /// |a0|a1|a2|a3|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|
99
        /// And we copy                                ^^^^^^^^^^^^^^^ these bytes to the start
100
        /// Actually it is enough to copy 3 bytes, but memcpy for 4 bytes translates into 1 instruction
101
        memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint));
102
        /// Now we have an array
103
        /// |a13|a14|a15|a16|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|
104
        ///              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
105
        /// Doing unaligned read of 16 bytes and copy them like above
106
        /// 16 is also chosen to do two `movups`.
107
        /// Such copying allow us to have 3 codepoints from the previous read to produce the 4-grams with them.
108
        memcpy(code_points + (N - 1), pos, default_padding * sizeof(CodePoint));
109

110
        if constexpr (case_insensitive)
111
        {
112
            /// Due to PODArray padding accessing more elements should be OK
113
            __msan_unpoison(code_points + (N - 1), padding_offset * sizeof(CodePoint));
114
            /// We really need template lambdas with C++20 to do it inline
115
            unrollLowering<N - 1>(code_points, std::make_index_sequence<padding_offset>());
116
        }
117
        pos += padding_offset;
118
        if (pos > end)
119
            return default_padding - (pos - end);
120
        return default_padding;
121
    }
122

123
    static ALWAYS_INLINE size_t readUTF8CodePoints(CodePoint * code_points, const char *& pos, const char * end)
124
    {
125
        /// The same copying as described in the function above.
126
        memcpy(code_points, code_points + default_padding - N + 1, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint));
127

128
        size_t num = N - 1;
129
        while (num < default_padding && pos < end)
130
        {
131
            size_t length = UTF8::seqLength(*pos);
132

133
            if (pos + length > end)
134
                length = end - pos;
135

136
            CodePoint res;
137
            /// This is faster than just memcpy because of compiler optimizations with moving bytes.
138
            switch (length)
139
            {
140
                case 1:
141
                    res = 0;
142
                    if constexpr (std::endian::native == std::endian::little)
143
                        memcpy(&res, pos, 1);
144
                    else
145
                        reverseMemcpy(reinterpret_cast<char*>(&res) + sizeof(CodePoint) - 1, pos, 1);
146
                    break;
147
                case 2:
148
                    res = 0;
149
                    if constexpr (std::endian::native == std::endian::little)
150
                        memcpy(&res, pos, 2);
151
                    else
152
                        reverseMemcpy(reinterpret_cast<char*>(&res) + sizeof(CodePoint) - 2, pos, 2);
153
                    break;
154
                case 3:
155
                    res = 0;
156
                    if constexpr (std::endian::native == std::endian::little)
157
                        memcpy(&res, pos, 3);
158
                    else
159
                        reverseMemcpy(reinterpret_cast<char*>(&res) + sizeof(CodePoint) - 3, pos, 3);
160
                    break;
161
                default:
162
                    if constexpr (std::endian::native == std::endian::little)
163
                        memcpy(&res, pos, 4);
164
                    else
165
                        reverseMemcpy(reinterpret_cast<char*>(&res) + sizeof(CodePoint) - 4, pos, 4);
166
            }
167

168
            /// This is not a really true case insensitive utf8. We zero the 5-th bit of every byte.
169
            /// And first bit of first byte if there are two bytes.
170
            /// For ASCII it works https://catonmat.net/ascii-case-conversion-trick. For most cyrillic letters also does.
171
            /// For others, we don't care now. Lowering UTF is not a cheap operation.
172
            if constexpr (case_insensitive)
173
            {
174
                switch (length)
175
                {
176
                    case 4:
177
                        res &= ~(1u << (5 + 3 * CHAR_BIT));
178
                        [[fallthrough]];
179
                    case 3:
180
                        res &= ~(1u << (5 + 2 * CHAR_BIT));
181
                        [[fallthrough]];
182
                    case 2:
183
                        res &= ~(1u);
184
                        res &= ~(1u << (5 + CHAR_BIT));
185
                        [[fallthrough]];
186
                    default:
187
                        res &= ~(1u << 5);
188
                }
189
            }
190

191
            pos += length;
192
            code_points[num++] = res;
193
        }
194
        return num;
195
    }
196

197
    template <bool save_ngrams>
198
    static ALWAYS_INLINE inline size_t calculateNeedleStats(
199
        const char * data,
200
        const size_t size,
201
        NgramCount * ngram_stats,
202
        [[maybe_unused]] NgramCount * ngram_storage,
203
        size_t (*read_code_points)(CodePoint *, const char *&, const char *),
204
        UInt16 (*hash_functor)(const CodePoint *))
205
    {
206
        const char * start = data;
207
        const char * end = data + size;
208
        CodePoint cp[simultaneously_codepoints_num] = {};
209
        /// read_code_points returns the position of cp where it stopped reading codepoints.
210
        size_t found = read_code_points(cp, start, end);
211
        /// We need to start for the first time here, because first N - 1 codepoints mean nothing.
212
        size_t i = N - 1;
213
        size_t len = 0;
214
        do
215
        {
216
            for (; i + N <= found; ++i)
217
            {
218
                ++len;
219
                UInt16 hash = hash_functor(cp + i);
220
                if constexpr (save_ngrams)
221
                    *ngram_storage++ = hash;
222
                ++ngram_stats[hash];
223
            }
224
            i = 0;
225
        } while (start < end && (found = read_code_points(cp, start, end)));
226

227
        return len;
228
    }
229

230
    template <bool reuse_stats>
231
    static ALWAYS_INLINE inline UInt64 calculateHaystackStatsAndMetric(
232
        const char * data,
233
        const size_t size,
234
        NgramCount * ngram_stats,
235
        size_t & distance,
236
        [[maybe_unused]] UInt16 * ngram_storage,
237
        size_t (*read_code_points)(CodePoint *, const char *&, const char *),
238
        UInt16 (*hash_functor)(const CodePoint *))
239
    {
240
        size_t ngram_cnt = 0;
241
        const char * start = data;
242
        const char * end = data + size;
243
        CodePoint cp[simultaneously_codepoints_num] = {};
244

245
        /// read_code_points returns the position of cp where it stopped reading codepoints.
246
        size_t found = read_code_points(cp, start, end);
247
        /// We need to start for the first time here, because first N - 1 codepoints mean nothing.
248
        size_t iter = N - 1;
249

250
        do
251
        {
252
            for (; iter + N <= found; ++iter)
253
            {
254
                UInt16 hash = hash_functor(cp + iter);
255
                /// For symmetric version we should add when we can't subtract to get symmetric difference.
256
                if (static_cast<Int16>(ngram_stats[hash]) > 0)
257
                    --distance;
258
                else if constexpr (symmetric)
259
                    ++distance;
260
                if constexpr (reuse_stats)
261
                    ngram_storage[ngram_cnt] = hash;
262
                ++ngram_cnt;
263
                --ngram_stats[hash];
264
            }
265
            iter = 0;
266
        } while (start < end && (found = read_code_points(cp, start, end)));
267

268
        /// Return the state of hash map to its initial.
269
        if constexpr (reuse_stats)
270
        {
271
            for (size_t i = 0; i < ngram_cnt; ++i)
272
                ++ngram_stats[ngram_storage[i]];
273
        }
274
        return ngram_cnt;
275
    }
276

277
    template <class Callback, class... Args>
278
    static inline auto dispatchSearcher(Callback callback, Args &&... args)
279
    {
280
        if constexpr (!UTF8)
281
            return callback(std::forward<Args>(args)..., readASCIICodePoints, calculateASCIIHash);
282
        else
283
            return callback(std::forward<Args>(args)..., readUTF8CodePoints, calculateUTF8Hash);
284
    }
285

286
    static void constantConstant(std::string data, std::string needle, Float32 & res)
287
    {
288
        std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}};
289

290
        /// We use unsafe versions of getting ngrams, so I decided to use padded strings.
291
        const size_t needle_size = needle.size();
292
        const size_t data_size = data.size();
293
        needle.resize(needle_size + default_padding);
294
        data.resize(data_size + default_padding);
295

296
        size_t second_size = dispatchSearcher(calculateNeedleStats<false>, needle.data(), needle_size, common_stats.get(), nullptr);
297
        size_t distance = second_size;
298
        if (data_size <= max_string_size)
299
        {
300
            size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric<false>, data.data(), data_size, common_stats.get(), distance, nullptr);
301
            /// For !symmetric version we should not use first_size.
302
            if constexpr (symmetric)
303
                res = distance * 1.f / std::max(first_size + second_size, 1uz);
304
            else
305
                res = 1.f - distance * 1.f / std::max(second_size, 1uz);
306
        }
307
        else
308
        {
309
            if constexpr (symmetric)
310
                res = 1.f;
311
            else
312
                res = 0.f;
313
        }
314
    }
315

316
    static void vectorVector(
317
        const ColumnString::Chars & haystack_data,
318
        const ColumnString::Offsets & haystack_offsets,
319
        const ColumnString::Chars & needle_data,
320
        const ColumnString::Offsets & needle_offsets,
321
        PaddedPODArray<Float32> & res)
322
    {
323
        const size_t haystack_offsets_size = haystack_offsets.size();
324
        size_t prev_haystack_offset = 0;
325
        size_t prev_needle_offset = 0;
326

327
        std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}};
328

329
        /// The main motivation is to not allocate more on stack because we have already allocated a lot (128Kb).
330
        /// And we can reuse these storages in one thread because we care only about what was written to first places.
331
        std::unique_ptr<UInt16[]> needle_ngram_storage(new UInt16[max_string_size]);
332
        std::unique_ptr<UInt16[]> haystack_ngram_storage(new UInt16[max_string_size]);
333

334
        for (size_t i = 0; i < haystack_offsets_size; ++i)
335
        {
336
            const char * haystack = reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset]);
337
            const size_t haystack_size = haystack_offsets[i] - prev_haystack_offset - 1;
338
            const char * needle = reinterpret_cast<const char *>(&needle_data[prev_needle_offset]);
339
            const size_t needle_size = needle_offsets[i] - prev_needle_offset - 1;
340

341
            if (needle_size <= max_string_size && haystack_size <= max_string_size)
342
            {
343
                /// Get needle stats.
344
                const size_t needle_stats_size = dispatchSearcher(
345
                    calculateNeedleStats<true>,
346
                    needle,
347
                    needle_size,
348
                    common_stats.get(),
349
                    needle_ngram_storage.get());
350

351
                size_t distance = needle_stats_size;
352

353
                /// Combine with haystack stats, return to initial needle stats.
354
                const size_t haystack_stats_size = dispatchSearcher(
355
                    calculateHaystackStatsAndMetric<true>,
356
                    haystack,
357
                    haystack_size,
358
                    common_stats.get(),
359
                    distance,
360
                    haystack_ngram_storage.get());
361

362
                /// Return to zero array stats.
363
                for (size_t j = 0; j < needle_stats_size; ++j)
364
                    --common_stats[needle_ngram_storage[j]];
365

366
                /// For now, common stats is a zero array.
367

368

369
                /// For !symmetric version we should not use haystack_stats_size.
370
                if constexpr (symmetric)
371
                    res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, 1uz);
372
                else
373
                    res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, 1uz);
374
            }
375
            else
376
            {
377
                /// Strings are too big, we are assuming they are not the same. This is done because of limiting number
378
                /// of bigrams added and not allocating too much memory.
379
                if constexpr (symmetric)
380
                    res[i] = 1.f;
381
                else
382
                    res[i] = 0.f;
383
            }
384

385
            prev_needle_offset = needle_offsets[i];
386
            prev_haystack_offset = haystack_offsets[i];
387
        }
388
    }
389

390
    static void constantVector(
391
        std::string haystack,
392
        const ColumnString::Chars & needle_data,
393
        const ColumnString::Offsets & needle_offsets,
394
        PaddedPODArray<Float32> & res)
395
    {
396
        /// For symmetric version it is better to use vector_constant
397
        if constexpr (symmetric)
398
        {
399
            vectorConstant(needle_data, needle_offsets, std::move(haystack), res);
400
        }
401
        else
402
        {
403
            const size_t haystack_size = haystack.size();
404
            haystack.resize(haystack_size + default_padding);
405

406
            /// For logic explanation see vector_vector function.
407
            const size_t needle_offsets_size = needle_offsets.size();
408
            size_t prev_offset = 0;
409

410
            std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}};
411

412
            std::unique_ptr<UInt16[]> needle_ngram_storage(new UInt16[max_string_size]);
413
            std::unique_ptr<UInt16[]> haystack_ngram_storage(new UInt16[max_string_size]);
414

415
            for (size_t i = 0; i < needle_offsets_size; ++i)
416
            {
417
                const char * needle = reinterpret_cast<const char *>(&needle_data[prev_offset]);
418
                const size_t needle_size = needle_offsets[i] - prev_offset - 1;
419

420
                if (needle_size <= max_string_size && haystack_size <= max_string_size)
421
                {
422
                    const size_t needle_stats_size = dispatchSearcher(
423
                        calculateNeedleStats<true>,
424
                        needle,
425
                        needle_size,
426
                        common_stats.get(),
427
                        needle_ngram_storage.get());
428

429
                    size_t distance = needle_stats_size;
430

431
                    dispatchSearcher(
432
                        calculateHaystackStatsAndMetric<true>,
433
                        haystack.data(),
434
                        haystack_size,
435
                        common_stats.get(),
436
                        distance,
437
                        haystack_ngram_storage.get());
438

439
                    for (size_t j = 0; j < needle_stats_size; ++j)
440
                        --common_stats[needle_ngram_storage[j]];
441

442
                    res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, 1uz);
443
                }
444
                else
445
                {
446
                    res[i] = 0.f;
447
                }
448

449
                prev_offset = needle_offsets[i];
450
            }
451

452
        }
453
    }
454

455
    static void vectorConstant(
456
        const ColumnString::Chars & data,
457
        const ColumnString::Offsets & offsets,
458
        std::string needle,
459
        PaddedPODArray<Float32> & res)
460
    {
461
        /// zeroing our map
462
        std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}};
463

464
        /// We can reuse these storages in one thread because we care only about what was written to first places.
465
        std::unique_ptr<UInt16[]> ngram_storage(new NgramCount[max_string_size]);
466

467
        /// We use unsafe versions of getting ngrams, so I decided to use padded_data even in needle case.
468
        const size_t needle_size = needle.size();
469
        needle.resize(needle_size + default_padding);
470

471
        const size_t needle_stats_size = dispatchSearcher(calculateNeedleStats<false>, needle.data(), needle_size, common_stats.get(), nullptr);
472

473
        size_t distance = needle_stats_size;
474
        size_t prev_offset = 0;
475
        for (size_t i = 0; i < offsets.size(); ++i)
476
        {
477
            const UInt8 * haystack = &data[prev_offset];
478
            const size_t haystack_size = offsets[i] - prev_offset - 1;
479
            if (haystack_size <= max_string_size)
480
            {
481
                size_t haystack_stats_size = dispatchSearcher(
482
                    calculateHaystackStatsAndMetric<true>,
483
                    reinterpret_cast<const char *>(haystack),
484
                    haystack_size, common_stats.get(),
485
                    distance,
486
                    ngram_storage.get());
487
                /// For !symmetric version we should not use haystack_stats_size.
488
                if constexpr (symmetric)
489
                    res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, 1uz);
490
                else
491
                    res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, 1uz);
492
            }
493
            else
494
            {
495
                /// if the strings are too big, we say they are completely not the same
496
                if constexpr (symmetric)
497
                    res[i] = 1.f;
498
                else
499
                    res[i] = 0.f;
500
            }
501
            distance = needle_stats_size;
502
            prev_offset = offsets[i];
503
        }
504
    }
505
};
506

507

508
struct NameNgramDistance
509
{
510
    static constexpr auto name = "ngramDistance";
511
};
512
struct NameNgramDistanceCaseInsensitive
513
{
514
    static constexpr auto name = "ngramDistanceCaseInsensitive";
515
};
516

517
struct NameNgramDistanceUTF8
518
{
519
    static constexpr auto name = "ngramDistanceUTF8";
520
};
521

522
struct NameNgramDistanceUTF8CaseInsensitive
523
{
524
    static constexpr auto name = "ngramDistanceCaseInsensitiveUTF8";
525
};
526

527
struct NameNgramSearch
528
{
529
    static constexpr auto name = "ngramSearch";
530
};
531
struct NameNgramSearchCaseInsensitive
532
{
533
    static constexpr auto name = "ngramSearchCaseInsensitive";
534
};
535
struct NameNgramSearchUTF8
536
{
537
    static constexpr auto name = "ngramSearchUTF8";
538
};
539

540
struct NameNgramSearchUTF8CaseInsensitive
541
{
542
    static constexpr auto name = "ngramSearchCaseInsensitiveUTF8";
543
};
544

545
using FunctionNgramDistance = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, false, true>, NameNgramDistance>;
546
using FunctionNgramDistanceCaseInsensitive = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, true, true>, NameNgramDistanceCaseInsensitive>;
547
using FunctionNgramDistanceUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, false, true>, NameNgramDistanceUTF8>;
548
using FunctionNgramDistanceCaseInsensitiveUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, true, true>, NameNgramDistanceUTF8CaseInsensitive>;
549

550
using FunctionNgramSearch = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, false, false>, NameNgramSearch>;
551
using FunctionNgramSearchCaseInsensitive = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, true, false>, NameNgramSearchCaseInsensitive>;
552
using FunctionNgramSearchUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, false, false>, NameNgramSearchUTF8>;
553
using FunctionNgramSearchCaseInsensitiveUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, true, false>, NameNgramSearchUTF8CaseInsensitive>;
554

555

556
REGISTER_FUNCTION(StringSimilarity)
557
{
558
    factory.registerFunction<FunctionNgramDistance>();
559
    factory.registerFunction<FunctionNgramDistanceCaseInsensitive>();
560
    factory.registerFunction<FunctionNgramDistanceUTF8>();
561
    factory.registerFunction<FunctionNgramDistanceCaseInsensitiveUTF8>();
562

563
    factory.registerFunction<FunctionNgramSearch>();
564
    factory.registerFunction<FunctionNgramSearchCaseInsensitive>();
565
    factory.registerFunction<FunctionNgramSearchUTF8>();
566
    factory.registerFunction<FunctionNgramSearchCaseInsensitiveUTF8>();
567
}
568

569
}
570
ClickHouse

Использование cookies