ClickHouse
105 строк · 3.5 Кб
1#include <Functions/checkHyperscanRegexp.h>
2
3#include <Common/Exception.h>
4#include <charconv>
5
6namespace DB
7{
8namespace ErrorCodes
9{
10extern const int BAD_ARGUMENTS;
11}
12
13void checkHyperscanRegexp(const std::vector<std::string_view> & regexps, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length)
14{
15if (max_hyperscan_regexp_length > 0 || max_hyperscan_regexp_total_length > 0)
16{
17size_t total_regexp_length = 0;
18for (const auto & regexp : regexps)
19{
20if (max_hyperscan_regexp_length > 0 && regexp.size() > max_hyperscan_regexp_length)
21throw Exception(ErrorCodes::BAD_ARGUMENTS, "Regexp length too large ({} > {})", regexp.size(), max_hyperscan_regexp_length);
22total_regexp_length += regexp.size();
23}
24
25if (max_hyperscan_regexp_total_length > 0 && total_regexp_length > max_hyperscan_regexp_total_length)
26throw Exception(ErrorCodes::BAD_ARGUMENTS, "Total regexp lengths too large ({} > {})",
27total_regexp_length, max_hyperscan_regexp_total_length);
28}
29}
30
31namespace
32{
33
/// Returns true if `str` starts with a decimal integer whose value is greater than 50.
/// Returns false if no number can be parsed from the beginning of `str` (e.g. empty input).
/// A number that is too large to be represented as `int` is treated as greater than 50.
bool isLargerThanFifty(std::string_view str)
{
    int number = 0;

    /// std::from_chars takes `const char *`; string_view iterators are not guaranteed to be raw pointers,
    /// so pass data()-based pointers explicitly.
    const auto result = std::from_chars(str.data(), str.data() + str.size(), number);

    /// Overflow of `int`: a non-negative value that does not fit is certainly larger than fifty.
    /// (A leading '-' means the value underflowed instead, which is not larger than fifty.)
    if (result.ec == std::errc::result_out_of_range)
        return !str.empty() && str.front() != '-';

    /// No digits at the beginning of the input.
    if (result.ec != std::errc())
        return false;

    return number > 50;
}
42
43}
44
45/// Check for sub-patterns of the form x{n} or x{n,} can be expensive. Ignore spaces before/after n and m.
46bool SlowWithHyperscanChecker::isSlowOneRepeat(std::string_view regexp)
47{
48std::string_view haystack(regexp.data(), regexp.size());
49std::string_view matches[2];
50size_t start_pos = 0;
51while (start_pos < haystack.size())
52{
53if (searcher_one_repeat.Match(haystack, start_pos, haystack.size(), re2::RE2::Anchor::UNANCHORED, matches, 2))
54{
55const auto & match = matches[0];
56start_pos = (matches[0].data() - haystack.data()) + match.size(); // new start pos = prefix before match + match length
57const auto & submatch = matches[1];
58if (isLargerThanFifty({submatch.data(), submatch.size()}))
59return true;
60}
61else
62break;
63}
64return false;
65}
66
67/// Check if sub-patterns of the form x{n,m} can be expensive. Ignore spaces before/after n and m.
68bool SlowWithHyperscanChecker::isSlowTwoRepeats(std::string_view regexp)
69{
70std::string_view haystack(regexp.data(), regexp.size());
71std::string_view matches[3];
72size_t start_pos = 0;
73while (start_pos < haystack.size())
74{
75if (searcher_two_repeats.Match(haystack, start_pos, haystack.size(), re2::RE2::Anchor::UNANCHORED, matches, 3))
76{
77const auto & match = matches[0];
78start_pos = (matches[0].data() - haystack.data()) + match.size(); // new start pos = prefix before match + match length
79const auto & submatch1 = matches[1];
80const auto & submatch2 = matches[2];
81if (isLargerThanFifty({submatch1.data(), submatch1.size()})
82|| isLargerThanFifty({submatch2.data(), submatch2.size()}))
83return true;
84}
85else
86break;
87}
88return false;
89}
90
/// Initializes the two searchers with the patterns used to locate repetition quantifiers inside a regexp.
SlowWithHyperscanChecker::SlowWithHyperscanChecker()
    /// Matches `{n}` and `{n,}` with optional whitespace around n and the comma; group 1 captures the digits of n.
    : searcher_one_repeat(R"(\{\s*([\d]+)\s*,?\s*})")
    /// Matches `{n,m}` with optional whitespace around n, the comma and m; groups 1 and 2 capture the digits of n and m.
    , searcher_two_repeats(R"(\{\s*([\d]+)\s*,\s*([\d]+)\s*\})")
{}
95
96bool SlowWithHyperscanChecker::isSlow(std::string_view regexp)
97{
98if (isSlowOneRepeat(regexp))
99return true;
100else if (isSlowTwoRepeats(regexp))
101return true;
102return false;
103}
104
105}
106