git
/
kwset.c
811 строк · 22.2 Кб
1/*
2* This file has been copied from commit e7ac713d^ in the GNU grep git
3* repository. A few small changes have been made to adapt the code to
4* Git.
5*/
6
7/* kwset.c - search for any of a set of keywords.
8Copyright 1989, 1998, 2000, 2005 Free Software Foundation, Inc.
9
10This program is free software; you can redistribute it and/or modify
11it under the terms of the GNU General Public License as published by
12the Free Software Foundation; either version 2, or (at your option)
13any later version.
14
15This program is distributed in the hope that it will be useful,
16but WITHOUT ANY WARRANTY; without even the implied warranty of
17MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18GNU General Public License for more details.
19
20You should have received a copy of the GNU General Public License
21along with this program; if not, see <https://www.gnu.org/licenses/>. */
22
23/* Written August 1989 by Mike Haertel.
24The author may be reached (Email) at the address mike@ai.mit.edu,
25or (US mail) as Mike Haertel c/o Free Software Foundation. */
26
27/* The algorithm implemented by these routines bears a startling resemblance
28to one discovered by Beate Commentz-Walter, although it is not identical.
29See "A String Matching Algorithm Fast on the Average," Technical Report,
30IBM-Germany, Scientific Center Heidelberg, Tiergartenstrasse 15, D-6900
31Heidelberg, Germany. See also Aho, A.V., and M. Corasick, "Efficient
32String Matching: An Aid to Bibliographic Search," CACM June 1975,
33Vol. 18, No. 6, which describes the failure function used below. */
34
35#include "git-compat-util.h"36
37#include "kwset.h"38#include "compat/obstack.h"39
40#define NCHAR (UCHAR_MAX + 1)41/* adapter for `xmalloc()`, which takes `size_t`, not `long` */
42static void *obstack_chunk_alloc(long size)43{
44if (size < 0)45BUG("Cannot allocate a negative amount: %ld", size);46return xmalloc(size);47}
48#define obstack_chunk_free free49
50#define U(c) ((unsigned char) (c))51
52/* For case-insensitive kwset */
53const unsigned char tolower_trans_tbl[256] = {540x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,550x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,560x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,570x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,58' ', '!', '"', '#', '$', '%', '&', 0x27,59'(', ')', '*', '+', ',', '-', '.', '/',60'0', '1', '2', '3', '4', '5', '6', '7',61'8', '9', ':', ';', '<', '=', '>', '?',62'@', 'a', 'b', 'c', 'd', 'e', 'f', 'g',63'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',64'p', 'q', 'r', 's', 't', 'u', 'v', 'w',65'x', 'y', 'z', '[', 0x5c, ']', '^', '_',66'`', 'a', 'b', 'c', 'd', 'e', 'f', 'g',67'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',68'p', 'q', 'r', 's', 't', 'u', 'v', 'w',69'x', 'y', 'z', '{', '|', '}', '~', 0x7f,700x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,710x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,720x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,730x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,740xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,750xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,760xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,770xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,780xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,790xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,800xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,810xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,820xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,830xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,840xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,850xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,86};87
88/* Balanced tree of edges and labels leaving a given trie node. */
89struct tree90{
91struct tree *llink; /* Left link; MUST be first field. */92struct tree *rlink; /* Right link (to larger labels). */93struct trie *trie; /* Trie node pointed to by this edge. */94unsigned char label; /* Label on this edge. */95char balance; /* Difference in depths of subtrees. */96};97
98/* Node of a trie representing a set of reversed keywords. */
99struct trie100{
101unsigned int accepting; /* Word index of accepted word, or zero. */102struct tree *links; /* Tree of edges leaving this node. */103struct trie *parent; /* Parent of this node. */104struct trie *next; /* List of all trie nodes in level order. */105struct trie *fail; /* Aho-Corasick failure function. */106int depth; /* Depth of this node from the root. */107int shift; /* Shift function for search failures. */108int maxshift; /* Max shift of self and descendants. */109};110
111/* Structure returned opaquely to the caller, containing everything. */
112struct kwset113{
114struct obstack obstack; /* Obstack for node allocation. */115int words; /* Number of words in the trie. */116struct trie *trie; /* The trie itself. */117int mind; /* Minimum depth of an accepting node. */118int maxd; /* Maximum depth of any node. */119unsigned char delta[NCHAR]; /* Delta table for rapid search. */120struct trie *next[NCHAR]; /* Table of children of the root. */121char *target; /* Target string if there's only one. */122int mind2; /* Used in Boyer-Moore search for one string. */123unsigned char const *trans; /* Character translation table. */124};125
126/* Allocate and initialize a keyword set object, returning an opaque
127pointer to it. Return NULL if memory is not available. */
128kwset_t
129kwsalloc (unsigned char const *trans)130{
131struct kwset *kwset;132
133kwset = (struct kwset *) xmalloc(sizeof (struct kwset));134
135obstack_init(&kwset->obstack);136kwset->words = 0;137kwset->trie138= (struct trie *) obstack_alloc(&kwset->obstack, sizeof (struct trie));139if (!kwset->trie)140{141kwsfree((kwset_t) kwset);142return NULL;143}144kwset->trie->accepting = 0;145kwset->trie->links = NULL;146kwset->trie->parent = NULL;147kwset->trie->next = NULL;148kwset->trie->fail = NULL;149kwset->trie->depth = 0;150kwset->trie->shift = 0;151kwset->mind = INT_MAX;152kwset->maxd = -1;153kwset->target = NULL;154kwset->trans = trans;155
156return (kwset_t) kwset;157}
158
159/* This upper bound is valid for CHAR_BIT >= 4 and
160exact for CHAR_BIT in { 4..11, 13, 15, 17, 19 }. */
161#define DEPTH_SIZE (CHAR_BIT + CHAR_BIT/2)162
163/* Add the given string to the contents of the keyword set. Return NULL
164for success, an error message otherwise. */
165const char *166kwsincr (kwset_t kws, char const *text, size_t len)167{
168struct kwset *kwset;169register struct trie *trie;170register unsigned char label;171register struct tree *link;172register int depth;173struct tree *links[DEPTH_SIZE];174enum { L, R } dirs[DEPTH_SIZE];175struct tree *t, *r, *l, *rl, *lr;176
177kwset = (struct kwset *) kws;178trie = kwset->trie;179text += len;180
181/* Descend the trie (built of reversed keywords) character-by-character,182installing new nodes when necessary. */
183while (len--)184{185label = kwset->trans ? kwset->trans[U(*--text)] : *--text;186
187/* Descend the tree of outgoing links for this trie node,188looking for the current character and keeping track
189of the path followed. */
190link = trie->links;191links[0] = (struct tree *) &trie->links;192dirs[0] = L;193depth = 1;194
195while (link && label != link->label)196{197links[depth] = link;198if (label < link->label)199dirs[depth++] = L, link = link->llink;200else201dirs[depth++] = R, link = link->rlink;202}203
204/* The current character doesn't have an outgoing link at205this trie node, so build a new trie node and install
206a link in the current trie node's tree. */
207if (!link)208{209link = (struct tree *) obstack_alloc(&kwset->obstack,210sizeof (struct tree));211if (!link)212return "memory exhausted";213link->llink = NULL;214link->rlink = NULL;215link->trie = (struct trie *) obstack_alloc(&kwset->obstack,216sizeof (struct trie));217if (!link->trie)218{219obstack_free(&kwset->obstack, link);220return "memory exhausted";221}222link->trie->accepting = 0;223link->trie->links = NULL;224link->trie->parent = trie;225link->trie->next = NULL;226link->trie->fail = NULL;227link->trie->depth = trie->depth + 1;228link->trie->shift = 0;229link->label = label;230link->balance = 0;231
232/* Install the new tree node in its parent. */233if (dirs[--depth] == L)234links[depth]->llink = link;235else236links[depth]->rlink = link;237
238/* Back up the tree fixing the balance flags. */239while (depth && !links[depth]->balance)240{241if (dirs[depth] == L)242--links[depth]->balance;243else244++links[depth]->balance;245--depth;246}247
248/* Rebalance the tree by pointer rotations if necessary. */249if (depth && ((dirs[depth] == L && --links[depth]->balance)250|| (dirs[depth] == R && ++links[depth]->balance)))251{252switch (links[depth]->balance)253{254case (char) -2:255switch (dirs[depth + 1])256{257case L:258r = links[depth], t = r->llink, rl = t->rlink;259t->rlink = r, r->llink = rl;260t->balance = r->balance = 0;261break;262case R:263r = links[depth], l = r->llink, t = l->rlink;264rl = t->rlink, lr = t->llink;265t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;266l->balance = t->balance != 1 ? 0 : -1;267r->balance = t->balance != (char) -1 ? 0 : 1;268t->balance = 0;269break;270default:271abort ();272}273break;274case 2:275switch (dirs[depth + 1])276{277case R:278l = links[depth], t = l->rlink, lr = t->llink;279t->llink = l, l->rlink = lr;280t->balance = l->balance = 0;281break;282case L:283l = links[depth], r = l->rlink, t = r->llink;284lr = t->llink, rl = t->rlink;285t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;286l->balance = t->balance != 1 ? 0 : -1;287r->balance = t->balance != (char) -1 ? 0 : 1;288t->balance = 0;289break;290default:291abort ();292}293break;294default:295abort ();296}297
298if (dirs[depth - 1] == L)299links[depth - 1]->llink = t;300else301links[depth - 1]->rlink = t;302}303}304
305trie = link->trie;306}307
308/* Mark the node we finally reached as accepting, encoding the309index number of this word in the keyword set so far. */
310if (!trie->accepting)311trie->accepting = 1 + 2 * kwset->words;312++kwset->words;313
314/* Keep track of the longest and shortest string of the keyword set. */315if (trie->depth < kwset->mind)316kwset->mind = trie->depth;317if (trie->depth > kwset->maxd)318kwset->maxd = trie->depth;319
320return NULL;321}
322
323/* Enqueue the trie nodes referenced from the given tree in the
324given queue. */
325static void326enqueue (struct tree *tree, struct trie **last)327{
328if (!tree)329return;330enqueue(tree->llink, last);331enqueue(tree->rlink, last);332(*last) = (*last)->next = tree->trie;333}
334
335/* Compute the Aho-Corasick failure function for the trie nodes referenced
336from the given tree, given the failure function for their parent as
337well as a last resort failure node. */
338static void339treefails (register struct tree const *tree, struct trie const *fail,340struct trie *recourse)341{
342register struct tree *link;343
344if (!tree)345return;346
347treefails(tree->llink, fail, recourse);348treefails(tree->rlink, fail, recourse);349
350/* Find, in the chain of fails going back to the root, the first351node that has a descendant on the current label. */
352while (fail)353{354link = fail->links;355while (link && tree->label != link->label)356if (tree->label < link->label)357link = link->llink;358else359link = link->rlink;360if (link)361{362tree->trie->fail = link->trie;363return;364}365fail = fail->fail;366}367
368tree->trie->fail = recourse;369}
370
371/* Set delta entries for the links of the given tree such that
372the preexisting delta value is larger than the current depth. */
373static void374treedelta (register struct tree const *tree,375register unsigned int depth,376unsigned char delta[])377{
378if (!tree)379return;380treedelta(tree->llink, depth, delta);381treedelta(tree->rlink, depth, delta);382if (depth < delta[tree->label])383delta[tree->label] = depth;384}
385
386/* Return true if A has every label in B. */
387static int388hasevery (register struct tree const *a, register struct tree const *b)389{
390if (!b)391return 1;392if (!hasevery(a, b->llink))393return 0;394if (!hasevery(a, b->rlink))395return 0;396while (a && b->label != a->label)397if (b->label < a->label)398a = a->llink;399else400a = a->rlink;401return !!a;402}
403
404/* Compute a vector, indexed by character code, of the trie nodes
405referenced from the given tree. */
406static void407treenext (struct tree const *tree, struct trie *next[])408{
409if (!tree)410return;411treenext(tree->llink, next);412treenext(tree->rlink, next);413next[tree->label] = tree->trie;414}
415
416/* Compute the shift for each trie node, as well as the delta
417table and next cache for the given keyword set. */
418const char *419kwsprep (kwset_t kws)420{
421register struct kwset *kwset;422register int i;423register struct trie *curr;424register unsigned char const *trans;425unsigned char delta[NCHAR];426
427kwset = (struct kwset *) kws;428
429/* Initial values for the delta table; will be changed later. The430delta entry for a given character is the smallest depth of any
431node at which an outgoing edge is labeled by that character. */
432memset(delta, kwset->mind < UCHAR_MAX ? kwset->mind : UCHAR_MAX, NCHAR);433
434/* Check if we can use the simple boyer-moore algorithm, instead435of the hairy commentz-walter algorithm. */
436if (kwset->words == 1 && kwset->trans == NULL)437{438char c;439
440/* Looking for just one string. Extract it from the trie. */441kwset->target = obstack_alloc(&kwset->obstack, kwset->mind);442if (!kwset->target)443return "memory exhausted";444for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i)445{446kwset->target[i] = curr->links->label;447curr = curr->links->trie;448}449/* Build the Boyer Moore delta. Boy that's easy compared to CW. */450for (i = 0; i < kwset->mind; ++i)451delta[U(kwset->target[i])] = kwset->mind - (i + 1);452/* Find the minimal delta2 shift that we might make after453a backwards match has failed. */
454c = kwset->target[kwset->mind - 1];455for (i = kwset->mind - 2; i >= 0; --i)456if (kwset->target[i] == c)457break;458kwset->mind2 = kwset->mind - (i + 1);459}460else461{462register struct trie *fail;463struct trie *last, *next[NCHAR];464
465/* Traverse the nodes of the trie in level order, simultaneously466computing the delta table, failure function, and shift function. */
467for (curr = last = kwset->trie; curr; curr = curr->next)468{469/* Enqueue the immediate descendants in the level order queue. */470enqueue(curr->links, &last);471
472curr->shift = kwset->mind;473curr->maxshift = kwset->mind;474
475/* Update the delta table for the descendants of this node. */476treedelta(curr->links, curr->depth, delta);477
478/* Compute the failure function for the descendants of this node. */479treefails(curr->links, curr->fail, kwset->trie);480
481/* Update the shifts at each node in the current node's chain482of fails back to the root. */
483for (fail = curr->fail; fail; fail = fail->fail)484{485/* If the current node has some outgoing edge that the fail486doesn't, then the shift at the fail should be no larger
487than the difference of their depths. */
488if (!hasevery(fail->links, curr->links))489if (curr->depth - fail->depth < fail->shift)490fail->shift = curr->depth - fail->depth;491
492/* If the current node is accepting then the shift at the493fail and its descendants should be no larger than the
494difference of their depths. */
495if (curr->accepting && fail->maxshift > curr->depth - fail->depth)496fail->maxshift = curr->depth - fail->depth;497}498}499
500/* Traverse the trie in level order again, fixing up all nodes whose501shift exceeds their inherited maxshift. */
502for (curr = kwset->trie->next; curr; curr = curr->next)503{504if (curr->maxshift > curr->parent->maxshift)505curr->maxshift = curr->parent->maxshift;506if (curr->shift > curr->maxshift)507curr->shift = curr->maxshift;508}509
510/* Create a vector, indexed by character code, of the outgoing links511from the root node. */
512for (i = 0; i < NCHAR; ++i)513next[i] = NULL;514treenext(kwset->trie->links, next);515
516if ((trans = kwset->trans))517for (i = 0; i < NCHAR; ++i)518kwset->next[i] = next[U(trans[i])];519else520COPY_ARRAY(kwset->next, next, NCHAR);521}522
523/* Fix things up for any translation table. */524if ((trans = kwset->trans))525for (i = 0; i < NCHAR; ++i)526kwset->delta[i] = delta[U(trans[i])];527else528memcpy(kwset->delta, delta, NCHAR);529
530return NULL;531}
532
533/* Fast boyer-moore search. */
534static size_t535bmexec (kwset_t kws, char const *text, size_t size)536{
537struct kwset const *kwset;538register unsigned char const *d1;539register char const *ep, *sp, *tp;540register int d, gc, i, len, md2;541
542kwset = (struct kwset const *) kws;543len = kwset->mind;544
545if (len == 0)546return 0;547if (len > size)548return -1;549if (len == 1)550{551tp = memchr (text, kwset->target[0], size);552return tp ? tp - text : -1;553}554
555d1 = kwset->delta;556sp = kwset->target + len;557gc = U(sp[-2]);558md2 = kwset->mind2;559tp = text + len;560
561/* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */562if (size > 12 * len)563/* 11 is not a bug, the initial offset happens only once. */564for (ep = text + size - 11 * len;;)565{566while (tp <= ep)567{568d = d1[U(tp[-1])], tp += d;569d = d1[U(tp[-1])], tp += d;570if (d == 0)571goto found;572d = d1[U(tp[-1])], tp += d;573d = d1[U(tp[-1])], tp += d;574d = d1[U(tp[-1])], tp += d;575if (d == 0)576goto found;577d = d1[U(tp[-1])], tp += d;578d = d1[U(tp[-1])], tp += d;579d = d1[U(tp[-1])], tp += d;580if (d == 0)581goto found;582d = d1[U(tp[-1])], tp += d;583d = d1[U(tp[-1])], tp += d;584}585break;586found:587if (U(tp[-2]) == gc)588{589for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i)590;591if (i > len)592return tp - len - text;593}594tp += md2;595}596
597/* Now we have only a few characters left to search. We598carefully avoid ever producing an out-of-bounds pointer. */
599ep = text + size;600d = d1[U(tp[-1])];601while (d <= ep - tp)602{603d = d1[U((tp += d)[-1])];604if (d != 0)605continue;606if (U(tp[-2]) == gc)607{608for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i)609;610if (i > len)611return tp - len - text;612}613d = md2;614}615
616return -1;617}
618
619/* Hairy multiple string search. */
620static size_t621cwexec (kwset_t kws, char const *text, size_t len, struct kwsmatch *kwsmatch)622{
623struct kwset const *kwset;624struct trie * const *next;625struct trie const *trie;626struct trie const *accept;627char const *beg, *lim, *mch, *lmch;628register unsigned char c;629register unsigned char const *delta;630register int d;631register char const *end, *qlim;632register struct tree const *tree;633register unsigned char const *trans;634
635accept = NULL;636
637/* Initialize register copies and look for easy ways out. */638kwset = (struct kwset *) kws;639if (len < kwset->mind)640return -1;641next = kwset->next;642delta = kwset->delta;643trans = kwset->trans;644lim = text + len;645end = text;646if ((d = kwset->mind) != 0)647mch = NULL;648else649{650mch = text, accept = kwset->trie;651goto match;652}653
654if (len >= 4 * kwset->mind)655qlim = lim - 4 * kwset->mind;656else657qlim = NULL;658
659while (lim - end >= d)660{661if (qlim && end <= qlim)662{663end += d - 1;664while ((d = delta[c = *end]) && end < qlim)665{666end += d;667end += delta[U(*end)];668end += delta[U(*end)];669}670++end;671}672else673d = delta[c = (end += d)[-1]];674if (d)675continue;676beg = end - 1;677trie = next[c];678if (trie->accepting)679{680mch = beg;681accept = trie;682}683d = trie->shift;684while (beg > text)685{686c = trans ? trans[U(*--beg)] : *--beg;687tree = trie->links;688while (tree && c != tree->label)689if (c < tree->label)690tree = tree->llink;691else692tree = tree->rlink;693if (tree)694{695trie = tree->trie;696if (trie->accepting)697{698mch = beg;699accept = trie;700}701}702else703break;704d = trie->shift;705}706if (mch)707goto match;708}709return -1;710
711match:712/* Given a known match, find the longest possible match anchored713at or before its starting point. This is nearly a verbatim
714copy of the preceding main search loops. */
715if (lim - mch > kwset->maxd)716lim = mch + kwset->maxd;717lmch = NULL;718d = 1;719while (lim - end >= d)720{721if ((d = delta[c = (end += d)[-1]]) != 0)722continue;723beg = end - 1;724if (!(trie = next[c]))725{726d = 1;727continue;728}729if (trie->accepting && beg <= mch)730{731lmch = beg;732accept = trie;733}734d = trie->shift;735while (beg > text)736{737c = trans ? trans[U(*--beg)] : *--beg;738tree = trie->links;739while (tree && c != tree->label)740if (c < tree->label)741tree = tree->llink;742else743tree = tree->rlink;744if (tree)745{746trie = tree->trie;747if (trie->accepting && beg <= mch)748{749lmch = beg;750accept = trie;751}752}753else754break;755d = trie->shift;756}757if (lmch)758{759mch = lmch;760goto match;761}762if (!d)763d = 1;764}765
766if (kwsmatch)767{768kwsmatch->index = accept->accepting / 2;769kwsmatch->offset[0] = mch - text;770kwsmatch->size[0] = accept->depth;771}772return mch - text;773}
774
775/* Search through the given text for a match of any member of the
776given keyword set. Return a pointer to the first character of
777the matching substring, or NULL if no match is found. If FOUNDLEN
778is non-NULL store in the referenced location the length of the
779matching substring. Similarly, if FOUNDIDX is non-NULL, store
780in the referenced location the index number of the particular
781keyword matched. */
782size_t
783kwsexec (kwset_t kws, char const *text, size_t size,784struct kwsmatch *kwsmatch)785{
786struct kwset const *kwset = (struct kwset *) kws;787if (kwset->words == 1 && kwset->trans == NULL)788{789size_t ret = bmexec (kws, text, size);790if (kwsmatch != NULL && ret != (size_t) -1)791{792kwsmatch->index = 0;793kwsmatch->offset[0] = ret;794kwsmatch->size[0] = kwset->mind;795}796return ret;797}798else799return cwexec(kws, text, size, kwsmatch);800}
801
802/* Free the components of the given keyword set. */
803void
804kwsfree (kwset_t kws)805{
806struct kwset *kwset;807
808kwset = (struct kwset *) kws;809obstack_free(&kwset->obstack, NULL);810free(kws);811}
812