llvm-project

FuzzyMatch.cpp
402 строки · 15.3 Кб
Перенос по словам
1
//===--- FuzzyMatch.h - Approximate identifier matching  ---------*- C++-*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'),
10
// we consider the possible partial match states:
11
//
12
//     u n i q u e _ p t r
13
//   +---------------------
14
//   |A . . . . . . . . . .
15
//  u|
16
//   |. . . . . . . . . . .
17
//  _|
18
//   |. . . . . . . O . . .
19
//  p|
20
//   |. . . . . . . . . . B
21
//
22
// Each dot represents some prefix of the pattern being matched against some
23
// prefix of the word.
24
//   - A is the initial state: '' matched against ''
25
//   - O is an intermediate state: 'u_' matched against 'unique_'
26
//   - B is the target state: 'u_p' matched against 'unique_ptr'
27
//
28
// We aim to find the best path from A->B.
29
//  - Moving right (consuming a word character)
30
//    Always legal: not all word characters must match.
31
//  - Moving diagonally (consuming both a word and pattern character)
32
//    Legal if the characters match.
33
//  - Moving down (consuming a pattern character) is never legal.
34
//    Never legal: all pattern characters must match something.
35
// Characters are matched case-insensitively.
36
// The first pattern character may only match the start of a word segment.
37
//
38
// The scoring is based on heuristics:
39
//  - when matching a character, apply a bonus or penalty depending on the
40
//    match quality (does case match, do word segments align, etc)
41
//  - when skipping a character, apply a penalty if it hurts the match
42
//    (it starts a word segment, or splits the matched region, etc)
43
//
44
// These heuristics require the ability to "look backward" one character, to
45
// see whether it was matched or not. Therefore the dynamic-programming matrix
46
// has an extra dimension (last character matched).
47
// Each entry also has an additional flag indicating whether the last-but-one
48
// character matched, which is needed to trace back through the scoring table
49
// and reconstruct the match.
50
//
51
// We treat strings as byte-sequences, so only ASCII has first-class support.
52
//
53
// This algorithm was inspired by VS code's client-side filtering, and aims
54
// to be mostly-compatible.
55
//
56
//===----------------------------------------------------------------------===//
57

58
#include "FuzzyMatch.h"
59
#include "llvm/Support/Format.h"
60
#include <optional>
61

62
namespace clang {
63
namespace clangd {
64

65
constexpr int FuzzyMatcher::MaxPat;
66
constexpr int FuzzyMatcher::MaxWord;
67

68
static char lower(char C) { return C >= 'A' && C <= 'Z' ? C + ('a' - 'A') : C; }
69
// A "negative infinity" score that won't overflow.
70
// We use this to mark unreachable states and forbidden solutions.
71
// Score field is 15 bits wide, min value is -2^14, we use half of that.
72
static constexpr int AwfulScore = -(1 << 13);
73
static bool isAwful(int S) { return S < AwfulScore / 2; }
74
static constexpr int PerfectBonus = 4; // Perfect per-pattern-char score.
75

76
FuzzyMatcher::FuzzyMatcher(llvm::StringRef Pattern)
77
    : PatN(std::min<int>(MaxPat, Pattern.size())),
78
      ScoreScale(PatN ? float{1} / (PerfectBonus * PatN) : 0), WordN(0) {
79
  std::copy(Pattern.begin(), Pattern.begin() + PatN, Pat);
80
  for (int I = 0; I < PatN; ++I)
81
    LowPat[I] = lower(Pat[I]);
82
  Scores[0][0][Miss] = {0, Miss};
83
  Scores[0][0][Match] = {AwfulScore, Miss};
84
  for (int P = 0; P <= PatN; ++P)
85
    for (int W = 0; W < P; ++W)
86
      for (Action A : {Miss, Match})
87
        Scores[P][W][A] = {AwfulScore, Miss};
88
  PatTypeSet = calculateRoles(llvm::StringRef(Pat, PatN),
89
                              llvm::MutableArrayRef(PatRole, PatN));
90
}
91

92
std::optional<float> FuzzyMatcher::match(llvm::StringRef Word) {
93
  if (!(WordContainsPattern = init(Word)))
94
    return std::nullopt;
95
  if (!PatN)
96
    return 1;
97
  buildGraph();
98
  auto Best = std::max(Scores[PatN][WordN][Miss].Score,
99
                       Scores[PatN][WordN][Match].Score);
100
  if (isAwful(Best))
101
    return std::nullopt;
102
  float Score =
103
      ScoreScale * std::min(PerfectBonus * PatN, std::max<int>(0, Best));
104
  // If the pattern is as long as the word, we have an exact string match,
105
  // since every pattern character must match something.
106
  if (WordN == PatN)
107
    Score *= 2; // May not be perfect 2 if case differs in a significant way.
108
  return Score;
109
}
110

111
// We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte.
112
// The top 6 bits of the char select the byte, the bottom 2 select the offset.
113
// e.g. 'q' = 010100 01 = byte 28 (55), bits 3-2 (01) -> Lower.
114
constexpr static uint8_t CharTypes[] = {
115
    0x00, 0x00, 0x00, 0x00, // Control characters
116
    0x00, 0x00, 0x00, 0x00, // Control characters
117
    0xff, 0xff, 0xff, 0xff, // Punctuation
118
    0x55, 0x55, 0xf5, 0xff, // Numbers->Lower, more Punctuation.
119
    0xab, 0xaa, 0xaa, 0xaa, // @ and A-O
120
    0xaa, 0xaa, 0xea, 0xff, // P-Z, more Punctuation.
121
    0x57, 0x55, 0x55, 0x55, // ` and a-o
122
    0x55, 0x55, 0xd5, 0x3f, // p-z, Punctuation, DEL.
123
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // Bytes over 127 -> Lower.
124
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // (probably UTF-8).
125
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
126
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
127
};
128

129
// The Role can be determined from the Type of a character and its neighbors:
130
//
131
//   Example  | Chars | Type | Role
132
//   ---------+--------------+-----
133
//   F(o)oBar | Foo   | Ull  | Tail
134
//   Foo(B)ar | oBa   | lUl  | Head
135
//   (f)oo    | ^fo   | Ell  | Head
136
//   H(T)TP   | HTT   | UUU  | Tail
137
//
138
// Our lookup table maps a 6 bit key (Prev, Curr, Next) to a 2-bit Role.
139
// A byte packs 4 Roles. (Prev, Curr) selects a byte, Next selects the offset.
140
// e.g. Lower, Upper, Lower -> 01 10 01 -> byte 6 (aa), bits 3-2 (10) -> Head.
141
constexpr static uint8_t CharRoles[] = {
142
    // clang-format off
143
    //         Curr= Empty Lower Upper Separ
144
    /* Prev=Empty */ 0x00, 0xaa, 0xaa, 0xff, // At start, Lower|Upper->Head
145
    /* Prev=Lower */ 0x00, 0x55, 0xaa, 0xff, // In word, Upper->Head;Lower->Tail
146
    /* Prev=Upper */ 0x00, 0x55, 0x59, 0xff, // Ditto, but U(U)U->Tail
147
    /* Prev=Separ */ 0x00, 0xaa, 0xaa, 0xff, // After separator, like at start
148
    // clang-format on
149
};
150

151
template <typename T> static T packedLookup(const uint8_t *Data, int I) {
152
  return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3);
153
}
154
CharTypeSet calculateRoles(llvm::StringRef Text,
155
                           llvm::MutableArrayRef<CharRole> Roles) {
156
  assert(Text.size() == Roles.size());
157
  if (Text.size() == 0)
158
    return 0;
159
  CharType Type = packedLookup<CharType>(CharTypes, Text[0]);
160
  CharTypeSet TypeSet = 1 << Type;
161
  // Types holds a sliding window of (Prev, Curr, Next) types.
162
  // Initial value is (Empty, Empty, type of Text[0]).
163
  int Types = Type;
164
  // Rotate slides in the type of the next character.
165
  auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
166
  for (unsigned I = 0; I < Text.size() - 1; ++I) {
167
    // For each character, rotate in the next, and look up the role.
168
    Type = packedLookup<CharType>(CharTypes, Text[I + 1]);
169
    TypeSet |= 1 << Type;
170
    Rotate(Type);
171
    Roles[I] = packedLookup<CharRole>(CharRoles, Types);
172
  }
173
  // For the last character, the "next character" is Empty.
174
  Rotate(Empty);
175
  Roles[Text.size() - 1] = packedLookup<CharRole>(CharRoles, Types);
176
  return TypeSet;
177
}
178

179
// Sets up the data structures matching Word.
180
// Returns false if we can cheaply determine that no match is possible.
181
bool FuzzyMatcher::init(llvm::StringRef NewWord) {
182
  WordN = std::min<int>(MaxWord, NewWord.size());
183
  if (PatN > WordN)
184
    return false;
185
  std::copy(NewWord.begin(), NewWord.begin() + WordN, Word);
186
  if (PatN == 0)
187
    return true;
188
  for (int I = 0; I < WordN; ++I)
189
    LowWord[I] = lower(Word[I]);
190

191
  // Cheap subsequence check.
192
  for (int W = 0, P = 0; P != PatN; ++W) {
193
    if (W == WordN)
194
      return false;
195
    if (LowWord[W] == LowPat[P])
196
      ++P;
197
  }
198

199
  // FIXME: some words are hard to tokenize algorithmically.
200
  // e.g. vsprintf is V S Print F, and should match [pri] but not [int].
201
  // We could add a tokenization dictionary for common stdlib names.
202
  WordTypeSet = calculateRoles(llvm::StringRef(Word, WordN),
203
                               llvm::MutableArrayRef(WordRole, WordN));
204
  return true;
205
}
206

207
// The forwards pass finds the mappings of Pattern onto Word.
208
// Score = best score achieved matching Word[..W] against Pat[..P].
209
// Unlike other tables, indices range from 0 to N *inclusive*
210
// Matched = whether we chose to match Word[W] with Pat[P] or not.
211
//
212
// Points are mostly assigned to matched characters, with 1 being a good score
213
// and 3 being a great one. So we treat the score range as [0, 3 * PatN].
214
// This range is not strict: we can apply larger bonuses/penalties, or penalize
215
// non-matched characters.
216
void FuzzyMatcher::buildGraph() {
217
  for (int W = 0; W < WordN; ++W) {
218
    Scores[0][W + 1][Miss] = {Scores[0][W][Miss].Score - skipPenalty(W, Miss),
219
                              Miss};
220
    Scores[0][W + 1][Match] = {AwfulScore, Miss};
221
  }
222
  for (int P = 0; P < PatN; ++P) {
223
    for (int W = P; W < WordN; ++W) {
224
      auto &Score = Scores[P + 1][W + 1], &PreMiss = Scores[P + 1][W];
225

226
      auto MatchMissScore = PreMiss[Match].Score;
227
      auto MissMissScore = PreMiss[Miss].Score;
228
      if (P < PatN - 1) { // Skipping trailing characters is always free.
229
        MatchMissScore -= skipPenalty(W, Match);
230
        MissMissScore -= skipPenalty(W, Miss);
231
      }
232
      Score[Miss] = (MatchMissScore > MissMissScore)
233
                        ? ScoreInfo{MatchMissScore, Match}
234
                        : ScoreInfo{MissMissScore, Miss};
235

236
      auto &PreMatch = Scores[P][W];
237
      auto MatchMatchScore =
238
          allowMatch(P, W, Match)
239
              ? PreMatch[Match].Score + matchBonus(P, W, Match)
240
              : AwfulScore;
241
      auto MissMatchScore = allowMatch(P, W, Miss)
242
                                ? PreMatch[Miss].Score + matchBonus(P, W, Miss)
243
                                : AwfulScore;
244
      Score[Match] = (MatchMatchScore > MissMatchScore)
245
                         ? ScoreInfo{MatchMatchScore, Match}
246
                         : ScoreInfo{MissMatchScore, Miss};
247
    }
248
  }
249
}
250

251
bool FuzzyMatcher::allowMatch(int P, int W, Action Last) const {
252
  if (LowPat[P] != LowWord[W])
253
    return false;
254
  // We require a "strong" match:
255
  // - for the first pattern character.  [foo] !~ "barefoot"
256
  // - after a gap.                      [pat] !~ "patnther"
257
  if (Last == Miss) {
258
    // We're banning matches outright, so conservatively accept some other cases
259
    // where our segmentation might be wrong:
260
    //  - allow matching B in ABCDef (but not in NDEBUG)
261
    //  - we'd like to accept print in sprintf, but too many false positives
262
    if (WordRole[W] == Tail &&
263
        (Word[W] == LowWord[W] || !(WordTypeSet & 1 << Lower)))
264
      return false;
265
  }
266
  return true;
267
}
268

269
int FuzzyMatcher::skipPenalty(int W, Action Last) const {
270
  if (W == 0) // Skipping the first character.
271
    return 3;
272
  if (WordRole[W] == Head) // Skipping a segment.
273
    return 1; // We want to keep this lower than a consecutive match bonus.
274
  // Instead of penalizing non-consecutive matches, we give a bonus to a
275
  // consecutive match in matchBonus. This produces a better score distribution
276
  // than penalties in case of small patterns, e.g. 'up' for 'unique_ptr'.
277
  return 0;
278
}
279

280
int FuzzyMatcher::matchBonus(int P, int W, Action Last) const {
281
  assert(LowPat[P] == LowWord[W]);
282
  int S = 1;
283
  bool IsPatSingleCase =
284
      (PatTypeSet == 1 << Lower) || (PatTypeSet == 1 << Upper);
285
  // Bonus: case matches, or a Head in the pattern aligns with one in the word.
286
  // Single-case patterns lack segmentation signals and we assume any character
287
  // can be a head of a segment.
288
  if (Pat[P] == Word[W] ||
289
      (WordRole[W] == Head && (IsPatSingleCase || PatRole[P] == Head)))
290
    ++S;
291
  // Bonus: a consecutive match. First character match also gets a bonus to
292
  // ensure prefix final match score normalizes to 1.0.
293
  if (W == 0 || Last == Match)
294
    S += 2;
295
  // Penalty: matching inside a segment (and previous char wasn't matched).
296
  if (WordRole[W] == Tail && P && Last == Miss)
297
    S -= 3;
298
  // Penalty: a Head in the pattern matches in the middle of a word segment.
299
  if (PatRole[P] == Head && WordRole[W] == Tail)
300
    --S;
301
  // Penalty: matching the first pattern character in the middle of a segment.
302
  if (P == 0 && WordRole[W] == Tail)
303
    S -= 4;
304
  assert(S <= PerfectBonus);
305
  return S;
306
}
307

308
llvm::SmallString<256> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const {
309
  llvm::SmallString<256> Result;
310
  OS << "=== Match \"" << llvm::StringRef(Word, WordN) << "\" against ["
311
     << llvm::StringRef(Pat, PatN) << "] ===\n";
312
  if (PatN == 0) {
313
    OS << "Pattern is empty: perfect match.\n";
314
    return Result = llvm::StringRef(Word, WordN);
315
  }
316
  if (WordN == 0) {
317
    OS << "Word is empty: no match.\n";
318
    return Result;
319
  }
320
  if (!WordContainsPattern) {
321
    OS << "Substring check failed.\n";
322
    return Result;
323
  }
324
  if (isAwful(std::max(Scores[PatN][WordN][Match].Score,
325
                       Scores[PatN][WordN][Miss].Score))) {
326
    OS << "Substring check passed, but all matches are forbidden\n";
327
  }
328
  if (!(PatTypeSet & 1 << Upper))
329
    OS << "Lowercase query, so scoring ignores case\n";
330

331
  // Traverse Matched table backwards to reconstruct the Pattern/Word mapping.
332
  // The Score table has cumulative scores, subtracting along this path gives
333
  // us the per-letter scores.
334
  Action Last =
335
      (Scores[PatN][WordN][Match].Score > Scores[PatN][WordN][Miss].Score)
336
          ? Match
337
          : Miss;
338
  int S[MaxWord];
339
  Action A[MaxWord];
340
  for (int W = WordN - 1, P = PatN - 1; W >= 0; --W) {
341
    A[W] = Last;
342
    const auto &Cell = Scores[P + 1][W + 1][Last];
343
    if (Last == Match)
344
      --P;
345
    const auto &Prev = Scores[P + 1][W][Cell.Prev];
346
    S[W] = Cell.Score - Prev.Score;
347
    Last = Cell.Prev;
348
  }
349
  for (int I = 0; I < WordN; ++I) {
350
    if (A[I] == Match && (I == 0 || A[I - 1] == Miss))
351
      Result.push_back('[');
352
    if (A[I] == Miss && I > 0 && A[I - 1] == Match)
353
      Result.push_back(']');
354
    Result.push_back(Word[I]);
355
  }
356
  if (A[WordN - 1] == Match)
357
    Result.push_back(']');
358

359
  for (char C : llvm::StringRef(Word, WordN))
360
    OS << " " << C << " ";
361
  OS << "\n";
362
  for (int I = 0, J = 0; I < WordN; I++)
363
    OS << " " << (A[I] == Match ? Pat[J++] : ' ') << " ";
364
  OS << "\n";
365
  for (int I = 0; I < WordN; I++)
366
    OS << llvm::format("%2d ", S[I]);
367
  OS << "\n";
368

369
  OS << "\nSegmentation:";
370
  OS << "\n'" << llvm::StringRef(Word, WordN) << "'\n ";
371
  for (int I = 0; I < WordN; ++I)
372
    OS << "?-+ "[static_cast<int>(WordRole[I])];
373
  OS << "\n[" << llvm::StringRef(Pat, PatN) << "]\n ";
374
  for (int I = 0; I < PatN; ++I)
375
    OS << "?-+ "[static_cast<int>(PatRole[I])];
376
  OS << "\n";
377

378
  OS << "\nScoring table (last-Miss, last-Match):\n";
379
  OS << " |    ";
380
  for (char C : llvm::StringRef(Word, WordN))
381
    OS << "  " << C << " ";
382
  OS << "\n";
383
  OS << "-+----" << std::string(WordN * 4, '-') << "\n";
384
  for (int I = 0; I <= PatN; ++I) {
385
    for (Action A : {Miss, Match}) {
386
      OS << ((I && A == Miss) ? Pat[I - 1] : ' ') << "|";
387
      for (int J = 0; J <= WordN; ++J) {
388
        if (!isAwful(Scores[I][J][A].Score))
389
          OS << llvm::format("%3d%c", Scores[I][J][A].Score,
390
                             Scores[I][J][A].Prev == Match ? '*' : ' ');
391
        else
392
          OS << "    ";
393
      }
394
      OS << "\n";
395
    }
396
  }
397

398
  return Result;
399
}
400

401
} // namespace clangd
402
} // namespace clang
403
llvm-project

Использование cookies