ClickHouse
569 строк · 22.8 Кб
1#include <Functions/FunctionsStringSimilarity.h>
2#include <Functions/FunctionFactory.h>
3#include <Functions/FunctionsHashing.h>
4#include <Common/HashTable/ClearableHashMap.h>
5#include <Common/HashTable/Hash.h>
6#include <Common/MemorySanitizer.h>
7#include <Common/UTF8Helpers.h>
8
9#include <Core/Defines.h>
10
11#include <base/unaligned.h>
12
13#include <algorithm>
14#include <climits>
15#include <cstring>
16#include <limits>
17#include <memory>
18#include <utility>
19
20#ifdef __SSE4_2__
21# include <nmmintrin.h>
22#endif
23
24#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
25# include <arm_acle.h>
26#endif
27
28#if (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
29#include "vec_crc32.h"
30#endif
31
32namespace DB
33{
34/** Distance function implementation.
35* We calculate all the n-grams from left string and count by the index of
36* 16 bits hash of them in the map.
37* Then calculate all the n-grams from the right string and calculate
38* the n-gram distance on the flight by adding and subtracting from the hashmap.
39* Then return the map into the condition of which it was after the left string
40* calculation. If the right string size is big (more than 2**15 bytes),
41* the strings are not similar at all and we return 1.
42*/
43template <size_t N, class CodePoint, bool UTF8, bool case_insensitive, bool symmetric>
44struct NgramDistanceImpl
45{
46using ResultType = Float32;
47
48/// map_size for ngram difference.
49static constexpr size_t map_size = 1u << 16;
50
51/// If the haystack size is bigger than this, behaviour is unspecified for this function.
52static constexpr size_t max_string_size = 1u << 15;
53
54/// Default padding to read safely.
55static constexpr size_t default_padding = 16;
56
57/// Max codepoints to store at once. 16 is for batching usage and PODArray has this padding.
58static constexpr size_t simultaneously_codepoints_num = default_padding + N - 1;
59
60/** map_size of this fits mostly in L2 cache all the time.
61* Actually use UInt16 as addings and subtractions do not UB overflow. But think of it as a signed
62* integer array.
63*/
64using NgramCount = UInt16;
65
66static ALWAYS_INLINE UInt16 calculateASCIIHash(const CodePoint * code_points)
67{
68return intHashCRC32(unalignedLoad<UInt32>(code_points)) & 0xFFFFu;
69}
70
71static ALWAYS_INLINE UInt16 calculateUTF8Hash(const CodePoint * code_points)
72{
73UInt64 combined = (static_cast<UInt64>(code_points[0]) << 32) | code_points[1];
74#ifdef __SSE4_2__
75return _mm_crc32_u64(code_points[2], combined) & 0xFFFFu;
76#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
77return __crc32cd(code_points[2], combined) & 0xFFFFu;
78#elif (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
79return crc32_ppc(code_points[2], reinterpret_cast<const unsigned char *>(&combined), sizeof(combined)) & 0xFFFFu;
80#elif defined(__s390x__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
81return s390x_crc32c(code_points[2], combined) & 0xFFFFu;
82#else
83return (intHashCRC32(combined) ^ intHashCRC32(code_points[2])) & 0xFFFFu;
84#endif
85}
86
87template <size_t Offset, class Container, size_t... I>
88static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence<I...> &)
89{
90((cont[Offset + I] = std::tolower(cont[Offset + I])), ...);
91}
92
93static ALWAYS_INLINE size_t readASCIICodePoints(CodePoint * code_points, const char *& pos, const char * end)
94{
95/// Offset before which we copy some data.
96constexpr size_t padding_offset = default_padding - N + 1;
97/// We have an array like this for ASCII (N == 4, other cases are similar)
98/// |a0|a1|a2|a3|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|
99/// And we copy ^^^^^^^^^^^^^^^ these bytes to the start
100/// Actually it is enough to copy 3 bytes, but memcpy for 4 bytes translates into 1 instruction
101memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint));
102/// Now we have an array
103/// |a13|a14|a15|a16|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|
104/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
105/// Doing unaligned read of 16 bytes and copy them like above
106/// 16 is also chosen to do two `movups`.
107/// Such copying allow us to have 3 codepoints from the previous read to produce the 4-grams with them.
108memcpy(code_points + (N - 1), pos, default_padding * sizeof(CodePoint));
109
110if constexpr (case_insensitive)
111{
112/// Due to PODArray padding accessing more elements should be OK
113__msan_unpoison(code_points + (N - 1), padding_offset * sizeof(CodePoint));
114/// We really need template lambdas with C++20 to do it inline
115unrollLowering<N - 1>(code_points, std::make_index_sequence<padding_offset>());
116}
117pos += padding_offset;
118if (pos > end)
119return default_padding - (pos - end);
120return default_padding;
121}
122
123static ALWAYS_INLINE size_t readUTF8CodePoints(CodePoint * code_points, const char *& pos, const char * end)
124{
125/// The same copying as described in the function above.
126memcpy(code_points, code_points + default_padding - N + 1, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint));
127
128size_t num = N - 1;
129while (num < default_padding && pos < end)
130{
131size_t length = UTF8::seqLength(*pos);
132
133if (pos + length > end)
134length = end - pos;
135
136CodePoint res;
137/// This is faster than just memcpy because of compiler optimizations with moving bytes.
138switch (length)
139{
140case 1:
141res = 0;
142if constexpr (std::endian::native == std::endian::little)
143memcpy(&res, pos, 1);
144else
145reverseMemcpy(reinterpret_cast<char*>(&res) + sizeof(CodePoint) - 1, pos, 1);
146break;
147case 2:
148res = 0;
149if constexpr (std::endian::native == std::endian::little)
150memcpy(&res, pos, 2);
151else
152reverseMemcpy(reinterpret_cast<char*>(&res) + sizeof(CodePoint) - 2, pos, 2);
153break;
154case 3:
155res = 0;
156if constexpr (std::endian::native == std::endian::little)
157memcpy(&res, pos, 3);
158else
159reverseMemcpy(reinterpret_cast<char*>(&res) + sizeof(CodePoint) - 3, pos, 3);
160break;
161default:
162if constexpr (std::endian::native == std::endian::little)
163memcpy(&res, pos, 4);
164else
165reverseMemcpy(reinterpret_cast<char*>(&res) + sizeof(CodePoint) - 4, pos, 4);
166}
167
168/// This is not a really true case insensitive utf8. We zero the 5-th bit of every byte.
169/// And first bit of first byte if there are two bytes.
170/// For ASCII it works https://catonmat.net/ascii-case-conversion-trick. For most cyrillic letters also does.
171/// For others, we don't care now. Lowering UTF is not a cheap operation.
172if constexpr (case_insensitive)
173{
174switch (length)
175{
176case 4:
177res &= ~(1u << (5 + 3 * CHAR_BIT));
178[[fallthrough]];
179case 3:
180res &= ~(1u << (5 + 2 * CHAR_BIT));
181[[fallthrough]];
182case 2:
183res &= ~(1u);
184res &= ~(1u << (5 + CHAR_BIT));
185[[fallthrough]];
186default:
187res &= ~(1u << 5);
188}
189}
190
191pos += length;
192code_points[num++] = res;
193}
194return num;
195}
196
197template <bool save_ngrams>
198static ALWAYS_INLINE inline size_t calculateNeedleStats(
199const char * data,
200const size_t size,
201NgramCount * ngram_stats,
202[[maybe_unused]] NgramCount * ngram_storage,
203size_t (*read_code_points)(CodePoint *, const char *&, const char *),
204UInt16 (*hash_functor)(const CodePoint *))
205{
206const char * start = data;
207const char * end = data + size;
208CodePoint cp[simultaneously_codepoints_num] = {};
209/// read_code_points returns the position of cp where it stopped reading codepoints.
210size_t found = read_code_points(cp, start, end);
211/// We need to start for the first time here, because first N - 1 codepoints mean nothing.
212size_t i = N - 1;
213size_t len = 0;
214do
215{
216for (; i + N <= found; ++i)
217{
218++len;
219UInt16 hash = hash_functor(cp + i);
220if constexpr (save_ngrams)
221*ngram_storage++ = hash;
222++ngram_stats[hash];
223}
224i = 0;
225} while (start < end && (found = read_code_points(cp, start, end)));
226
227return len;
228}
229
230template <bool reuse_stats>
231static ALWAYS_INLINE inline UInt64 calculateHaystackStatsAndMetric(
232const char * data,
233const size_t size,
234NgramCount * ngram_stats,
235size_t & distance,
236[[maybe_unused]] UInt16 * ngram_storage,
237size_t (*read_code_points)(CodePoint *, const char *&, const char *),
238UInt16 (*hash_functor)(const CodePoint *))
239{
240size_t ngram_cnt = 0;
241const char * start = data;
242const char * end = data + size;
243CodePoint cp[simultaneously_codepoints_num] = {};
244
245/// read_code_points returns the position of cp where it stopped reading codepoints.
246size_t found = read_code_points(cp, start, end);
247/// We need to start for the first time here, because first N - 1 codepoints mean nothing.
248size_t iter = N - 1;
249
250do
251{
252for (; iter + N <= found; ++iter)
253{
254UInt16 hash = hash_functor(cp + iter);
255/// For symmetric version we should add when we can't subtract to get symmetric difference.
256if (static_cast<Int16>(ngram_stats[hash]) > 0)
257--distance;
258else if constexpr (symmetric)
259++distance;
260if constexpr (reuse_stats)
261ngram_storage[ngram_cnt] = hash;
262++ngram_cnt;
263--ngram_stats[hash];
264}
265iter = 0;
266} while (start < end && (found = read_code_points(cp, start, end)));
267
268/// Return the state of hash map to its initial.
269if constexpr (reuse_stats)
270{
271for (size_t i = 0; i < ngram_cnt; ++i)
272++ngram_stats[ngram_storage[i]];
273}
274return ngram_cnt;
275}
276
277template <class Callback, class... Args>
278static inline auto dispatchSearcher(Callback callback, Args &&... args)
279{
280if constexpr (!UTF8)
281return callback(std::forward<Args>(args)..., readASCIICodePoints, calculateASCIIHash);
282else
283return callback(std::forward<Args>(args)..., readUTF8CodePoints, calculateUTF8Hash);
284}
285
286static void constantConstant(std::string data, std::string needle, Float32 & res)
287{
288std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}};
289
290/// We use unsafe versions of getting ngrams, so I decided to use padded strings.
291const size_t needle_size = needle.size();
292const size_t data_size = data.size();
293needle.resize(needle_size + default_padding);
294data.resize(data_size + default_padding);
295
296size_t second_size = dispatchSearcher(calculateNeedleStats<false>, needle.data(), needle_size, common_stats.get(), nullptr);
297size_t distance = second_size;
298if (data_size <= max_string_size)
299{
300size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric<false>, data.data(), data_size, common_stats.get(), distance, nullptr);
301/// For !symmetric version we should not use first_size.
302if constexpr (symmetric)
303res = distance * 1.f / std::max(first_size + second_size, 1uz);
304else
305res = 1.f - distance * 1.f / std::max(second_size, 1uz);
306}
307else
308{
309if constexpr (symmetric)
310res = 1.f;
311else
312res = 0.f;
313}
314}
315
316static void vectorVector(
317const ColumnString::Chars & haystack_data,
318const ColumnString::Offsets & haystack_offsets,
319const ColumnString::Chars & needle_data,
320const ColumnString::Offsets & needle_offsets,
321PaddedPODArray<Float32> & res)
322{
323const size_t haystack_offsets_size = haystack_offsets.size();
324size_t prev_haystack_offset = 0;
325size_t prev_needle_offset = 0;
326
327std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}};
328
329/// The main motivation is to not allocate more on stack because we have already allocated a lot (128Kb).
330/// And we can reuse these storages in one thread because we care only about what was written to first places.
331std::unique_ptr<UInt16[]> needle_ngram_storage(new UInt16[max_string_size]);
332std::unique_ptr<UInt16[]> haystack_ngram_storage(new UInt16[max_string_size]);
333
334for (size_t i = 0; i < haystack_offsets_size; ++i)
335{
336const char * haystack = reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset]);
337const size_t haystack_size = haystack_offsets[i] - prev_haystack_offset - 1;
338const char * needle = reinterpret_cast<const char *>(&needle_data[prev_needle_offset]);
339const size_t needle_size = needle_offsets[i] - prev_needle_offset - 1;
340
341if (needle_size <= max_string_size && haystack_size <= max_string_size)
342{
343/// Get needle stats.
344const size_t needle_stats_size = dispatchSearcher(
345calculateNeedleStats<true>,
346needle,
347needle_size,
348common_stats.get(),
349needle_ngram_storage.get());
350
351size_t distance = needle_stats_size;
352
353/// Combine with haystack stats, return to initial needle stats.
354const size_t haystack_stats_size = dispatchSearcher(
355calculateHaystackStatsAndMetric<true>,
356haystack,
357haystack_size,
358common_stats.get(),
359distance,
360haystack_ngram_storage.get());
361
362/// Return to zero array stats.
363for (size_t j = 0; j < needle_stats_size; ++j)
364--common_stats[needle_ngram_storage[j]];
365
366/// For now, common stats is a zero array.
367
368
369/// For !symmetric version we should not use haystack_stats_size.
370if constexpr (symmetric)
371res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, 1uz);
372else
373res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, 1uz);
374}
375else
376{
377/// Strings are too big, we are assuming they are not the same. This is done because of limiting number
378/// of bigrams added and not allocating too much memory.
379if constexpr (symmetric)
380res[i] = 1.f;
381else
382res[i] = 0.f;
383}
384
385prev_needle_offset = needle_offsets[i];
386prev_haystack_offset = haystack_offsets[i];
387}
388}
389
390static void constantVector(
391std::string haystack,
392const ColumnString::Chars & needle_data,
393const ColumnString::Offsets & needle_offsets,
394PaddedPODArray<Float32> & res)
395{
396/// For symmetric version it is better to use vector_constant
397if constexpr (symmetric)
398{
399vectorConstant(needle_data, needle_offsets, std::move(haystack), res);
400}
401else
402{
403const size_t haystack_size = haystack.size();
404haystack.resize(haystack_size + default_padding);
405
406/// For logic explanation see vector_vector function.
407const size_t needle_offsets_size = needle_offsets.size();
408size_t prev_offset = 0;
409
410std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}};
411
412std::unique_ptr<UInt16[]> needle_ngram_storage(new UInt16[max_string_size]);
413std::unique_ptr<UInt16[]> haystack_ngram_storage(new UInt16[max_string_size]);
414
415for (size_t i = 0; i < needle_offsets_size; ++i)
416{
417const char * needle = reinterpret_cast<const char *>(&needle_data[prev_offset]);
418const size_t needle_size = needle_offsets[i] - prev_offset - 1;
419
420if (needle_size <= max_string_size && haystack_size <= max_string_size)
421{
422const size_t needle_stats_size = dispatchSearcher(
423calculateNeedleStats<true>,
424needle,
425needle_size,
426common_stats.get(),
427needle_ngram_storage.get());
428
429size_t distance = needle_stats_size;
430
431dispatchSearcher(
432calculateHaystackStatsAndMetric<true>,
433haystack.data(),
434haystack_size,
435common_stats.get(),
436distance,
437haystack_ngram_storage.get());
438
439for (size_t j = 0; j < needle_stats_size; ++j)
440--common_stats[needle_ngram_storage[j]];
441
442res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, 1uz);
443}
444else
445{
446res[i] = 0.f;
447}
448
449prev_offset = needle_offsets[i];
450}
451
452}
453}
454
455static void vectorConstant(
456const ColumnString::Chars & data,
457const ColumnString::Offsets & offsets,
458std::string needle,
459PaddedPODArray<Float32> & res)
460{
461/// zeroing our map
462std::unique_ptr<NgramCount[]> common_stats{new NgramCount[map_size]{}};
463
464/// We can reuse these storages in one thread because we care only about what was written to first places.
465std::unique_ptr<UInt16[]> ngram_storage(new NgramCount[max_string_size]);
466
467/// We use unsafe versions of getting ngrams, so I decided to use padded_data even in needle case.
468const size_t needle_size = needle.size();
469needle.resize(needle_size + default_padding);
470
471const size_t needle_stats_size = dispatchSearcher(calculateNeedleStats<false>, needle.data(), needle_size, common_stats.get(), nullptr);
472
473size_t distance = needle_stats_size;
474size_t prev_offset = 0;
475for (size_t i = 0; i < offsets.size(); ++i)
476{
477const UInt8 * haystack = &data[prev_offset];
478const size_t haystack_size = offsets[i] - prev_offset - 1;
479if (haystack_size <= max_string_size)
480{
481size_t haystack_stats_size = dispatchSearcher(
482calculateHaystackStatsAndMetric<true>,
483reinterpret_cast<const char *>(haystack),
484haystack_size, common_stats.get(),
485distance,
486ngram_storage.get());
487/// For !symmetric version we should not use haystack_stats_size.
488if constexpr (symmetric)
489res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, 1uz);
490else
491res[i] = 1.f - distance * 1.f / std::max(needle_stats_size, 1uz);
492}
493else
494{
495/// if the strings are too big, we say they are completely not the same
496if constexpr (symmetric)
497res[i] = 1.f;
498else
499res[i] = 0.f;
500}
501distance = needle_stats_size;
502prev_offset = offsets[i];
503}
504}
505};
506
507
508struct NameNgramDistance
509{
510static constexpr auto name = "ngramDistance";
511};
512struct NameNgramDistanceCaseInsensitive
513{
514static constexpr auto name = "ngramDistanceCaseInsensitive";
515};
516
517struct NameNgramDistanceUTF8
518{
519static constexpr auto name = "ngramDistanceUTF8";
520};
521
522struct NameNgramDistanceUTF8CaseInsensitive
523{
524static constexpr auto name = "ngramDistanceCaseInsensitiveUTF8";
525};
526
527struct NameNgramSearch
528{
529static constexpr auto name = "ngramSearch";
530};
531struct NameNgramSearchCaseInsensitive
532{
533static constexpr auto name = "ngramSearchCaseInsensitive";
534};
535struct NameNgramSearchUTF8
536{
537static constexpr auto name = "ngramSearchUTF8";
538};
539
540struct NameNgramSearchUTF8CaseInsensitive
541{
542static constexpr auto name = "ngramSearchCaseInsensitiveUTF8";
543};
544
545using FunctionNgramDistance = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, false, true>, NameNgramDistance>;
546using FunctionNgramDistanceCaseInsensitive = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, true, true>, NameNgramDistanceCaseInsensitive>;
547using FunctionNgramDistanceUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, false, true>, NameNgramDistanceUTF8>;
548using FunctionNgramDistanceCaseInsensitiveUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, true, true>, NameNgramDistanceUTF8CaseInsensitive>;
549
550using FunctionNgramSearch = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, false, false>, NameNgramSearch>;
551using FunctionNgramSearchCaseInsensitive = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, true, false>, NameNgramSearchCaseInsensitive>;
552using FunctionNgramSearchUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, false, false>, NameNgramSearchUTF8>;
553using FunctionNgramSearchCaseInsensitiveUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, true, false>, NameNgramSearchUTF8CaseInsensitive>;
554
555
556REGISTER_FUNCTION(StringSimilarity)
557{
558factory.registerFunction<FunctionNgramDistance>();
559factory.registerFunction<FunctionNgramDistanceCaseInsensitive>();
560factory.registerFunction<FunctionNgramDistanceUTF8>();
561factory.registerFunction<FunctionNgramDistanceCaseInsensitiveUTF8>();
562
563factory.registerFunction<FunctionNgramSearch>();
564factory.registerFunction<FunctionNgramSearchCaseInsensitive>();
565factory.registerFunction<FunctionNgramSearchUTF8>();
566factory.registerFunction<FunctionNgramSearchCaseInsensitiveUTF8>();
567}
568
569}
570