embox
667 строк · 17.0 Кб
1/***************************************************************
2T-Rex a tiny regular expression library
3
4Copyright (C) 2003-2006 Alberto Demichelis
5
6This software is provided 'as-is', without any express
7or implied warranty. In no event will the authors be held
8liable for any damages arising from the use of this software.
9
10Permission is granted to anyone to use this software for
11any purpose, including commercial applications, and to alter
12it and redistribute it freely, subject to the following restrictions:
13
141. The origin of this software must not be misrepresented;
15you must not claim that you wrote the original software.
16If you use this software in a product, an acknowledgment
17in the product documentation would be appreciated but
18is not required.
19
202. Altered source versions must be plainly marked as such,
21and must not be misrepresented as being the original software.
22
233. This notice may not be removed or altered from any
24source distribution.
25
26****************************************************************/
27#include <string.h>28#include <stdlib.h>29#include <ctype.h>30#include <setjmp.h>31#include "trex.h"32
33#ifdef _UINCODE34#define scisprint iswprint35#define scstrlen wcslen36#define scprintf wprintf37#define _SC(x) L(x)38#else39#define scisprint isprint40#define scstrlen strlen41#define scprintf printf42#define _SC(x) (x)43#endif44
45#ifdef _DEBUG46#include <stdio.h>47
48static const TRexChar *g_nnames[] =49{
50_SC("NONE"),_SC("OP_GREEDY"), _SC("OP_OR"),51_SC("OP_EXPR"),_SC("OP_NOCAPEXPR"),_SC("OP_DOT"), _SC("OP_CLASS"),52_SC("OP_CCLASS"),_SC("OP_NCLASS"),_SC("OP_RANGE"),_SC("OP_CHAR"),53_SC("OP_EOL"),_SC("OP_BOL"),_SC("OP_WB")54};55
56#endif57#define OP_GREEDY (MAX_CHAR+1) // * + ? {n}58#define OP_OR (MAX_CHAR+2)59#define OP_EXPR (MAX_CHAR+3) //parentesis ()60#define OP_NOCAPEXPR (MAX_CHAR+4) //parentesis (?:)61#define OP_DOT (MAX_CHAR+5)62#define OP_CLASS (MAX_CHAR+6)63#define OP_CCLASS (MAX_CHAR+7)64#define OP_NCLASS (MAX_CHAR+8) //negates class the [^65#define OP_RANGE (MAX_CHAR+9)66#define OP_CHAR (MAX_CHAR+10)67#define OP_EOL (MAX_CHAR+11)68#define OP_BOL (MAX_CHAR+12)69#define OP_WB (MAX_CHAR+13)70
71#define TREX_SYMBOL_ANY_CHAR ('.')72#define TREX_SYMBOL_GREEDY_ONE_OR_MORE ('+')73#define TREX_SYMBOL_GREEDY_ZERO_OR_MORE ('*')74#define TREX_SYMBOL_GREEDY_ZERO_OR_ONE ('?')75#define TREX_SYMBOL_BRANCH ('|')76#define TREX_SYMBOL_END_OF_STRING ('$')77#define TREX_SYMBOL_BEGINNING_OF_STRING ('^')78#define TREX_SYMBOL_ESCAPE_CHAR ('\\')79
80
81typedef int TRexNodeType;82
83typedef struct tagTRexNode{84TRexNodeType type;85int left;86int right;87int next;88}TRexNode;89
90struct TRex{91const TRexChar *_eol;92const TRexChar *_bol;93const TRexChar *_p;94int _first;95int _op;96TRexNode *_nodes;97int _nallocated;98int _nsize;99int _nsubexpr;100TRexMatch *_matches;101int _currsubexp;102void *_jmpbuf;103TRexChar *_error;104};105
106static int trex_list(TRex *exp);107
108static int trex_newnode(TRex *exp, TRexNodeType type)109{
110TRexNode n;111int newid;112n.type = type;113n.next = n.right = n.left = -1;114if(type == OP_EXPR)115n.right = exp->_nsubexpr++;116if(exp->_nallocated < (exp->_nsize + 1)) {117//int oldsize = exp->_nallocated;118exp->_nallocated *= 2;119exp->_nodes = (TRexNode *)realloc(exp->_nodes, exp->_nallocated * sizeof(TRexNode));120}121exp->_nodes[exp->_nsize++] = n;122newid = exp->_nsize - 1;123return (int)newid;124}
125
126static void trex_error(TRex *exp,const TRexChar *error)127{
128if(exp->_error) strcpy(exp->_error, error);129longjmp(*((jmp_buf*)exp->_jmpbuf),-1);130}
131
132static void trex_expect(TRex *exp, int n){133if((*exp->_p) != n)134trex_error(exp, _SC("expected paren"));135exp->_p++;136}
137
138static TRexChar trex_escapechar(TRex *exp)139{
140if(*exp->_p == TREX_SYMBOL_ESCAPE_CHAR){141exp->_p++;142switch(*exp->_p) {143case 'v': exp->_p++; return '\v';144case 'n': exp->_p++; return '\n';145case 't': exp->_p++; return '\t';146case 'r': exp->_p++; return '\r';147case 'f': exp->_p++; return '\f';148default: return (*exp->_p++);149}150} else if(!scisprint(*exp->_p)) trex_error(exp,_SC("letter expected"));151return (*exp->_p++);152}
153
154static int trex_charclass(TRex *exp,int classid)155{
156int n = trex_newnode(exp,OP_CCLASS);157exp->_nodes[n].left = classid;158return n;159}
160
161static int trex_charnode(TRex *exp,TRexBool isclass)162{
163TRexChar t;164if(*exp->_p == TREX_SYMBOL_ESCAPE_CHAR) {165exp->_p++;166switch(*exp->_p) {167case 'n': exp->_p++; return trex_newnode(exp,'\n');168case 't': exp->_p++; return trex_newnode(exp,'\t');169case 'r': exp->_p++; return trex_newnode(exp,'\r');170case 'f': exp->_p++; return trex_newnode(exp,'\f');171case 'v': exp->_p++; return trex_newnode(exp,'\v');172case 'a': case 'A': case 'w': case 'W': case 's': case 'S':173case 'd': case 'D': case 'x': case 'X': case 'c': case 'C':174case 'p': case 'P': case 'l': case 'u':175{176t = *exp->_p; exp->_p++;177return trex_charclass(exp,t);178}179case 'b':180case 'B':181if(!isclass) {182int node = trex_newnode(exp,OP_WB);183exp->_nodes[node].left = *exp->_p;184exp->_p++;185return node;186} //else default187default:188t = *exp->_p; exp->_p++;189return trex_newnode(exp,t);190}191}192else if(!scisprint(*exp->_p)) {193
194trex_error(exp,_SC("letter expected"));195}196t = *exp->_p; exp->_p++;197return trex_newnode(exp,t);198}
199static int trex_class(TRex *exp)200{
201int ret = -1;202int first = -1,chain;203if(*exp->_p == TREX_SYMBOL_BEGINNING_OF_STRING){204ret = trex_newnode(exp,OP_NCLASS);205exp->_p++;206}else ret = trex_newnode(exp,OP_CLASS);207
208if(*exp->_p == ']') trex_error(exp,_SC("empty class"));209chain = ret;210while(*exp->_p != ']' && exp->_p != exp->_eol) {211if(*exp->_p == '-' && first != -1){212int r,t;213if(*exp->_p++ == ']') trex_error(exp,_SC("unfinished range"));214r = trex_newnode(exp,OP_RANGE);215if(first>*exp->_p) trex_error(exp,_SC("invalid range"));216if(exp->_nodes[first].type == OP_CCLASS) trex_error(exp,_SC("cannot use character classes in ranges"));217exp->_nodes[r].left = exp->_nodes[first].type;218t = trex_escapechar(exp);219exp->_nodes[r].right = t;220exp->_nodes[chain].next = r;221chain = r;222first = -1;223}224else{225if(first!=-1){226int c = first;227exp->_nodes[chain].next = c;228chain = c;229first = trex_charnode(exp,TRex_True);230}231else{232first = trex_charnode(exp,TRex_True);233}234}235}236if(first!=-1){237int c = first;238exp->_nodes[chain].next = c;239chain = c;240first = -1;241}242/* hack? */243exp->_nodes[ret].left = exp->_nodes[ret].next;244exp->_nodes[ret].next = -1;245return ret;246}
247
248static int trex_parsenumber(TRex *exp)249{
250int ret = *exp->_p-'0';251int positions = 10;252exp->_p++;253while(isdigit(*exp->_p)) {254ret = ret*10+(*exp->_p++-'0');255if(positions==1000000000) trex_error(exp,_SC("overflow in numeric constant"));256positions *= 10;257};258return ret;259}
260
261static int trex_element(TRex *exp)262{
263int ret = -1;264switch(*exp->_p)265{266case '(': {267int expr,newn;268exp->_p++;269
270
271if(*exp->_p =='?') {272exp->_p++;273trex_expect(exp,':');274expr = trex_newnode(exp,OP_NOCAPEXPR);275}276else277expr = trex_newnode(exp,OP_EXPR);278newn = trex_list(exp);279exp->_nodes[expr].left = newn;280ret = expr;281trex_expect(exp,')');282}283break;284case '[':285exp->_p++;286ret = trex_class(exp);287trex_expect(exp,']');288break;289case TREX_SYMBOL_END_OF_STRING: exp->_p++; ret = trex_newnode(exp,OP_EOL);break;290case TREX_SYMBOL_ANY_CHAR: exp->_p++; ret = trex_newnode(exp,OP_DOT);break;291default:292ret = trex_charnode(exp,TRex_False);293break;294}295
296{297TRexBool isgreedy = TRex_False;298unsigned short p0 = 0, p1 = 0;299switch(*exp->_p){300case TREX_SYMBOL_GREEDY_ZERO_OR_MORE: p0 = 0; p1 = 0xFFFF; exp->_p++; isgreedy = TRex_True; break;301case TREX_SYMBOL_GREEDY_ONE_OR_MORE: p0 = 1; p1 = 0xFFFF; exp->_p++; isgreedy = TRex_True; break;302case TREX_SYMBOL_GREEDY_ZERO_OR_ONE: p0 = 0; p1 = 1; exp->_p++; isgreedy = TRex_True; break;303case '{':304exp->_p++;305if(!isdigit(*exp->_p)) trex_error(exp,_SC("number expected"));306p0 = (unsigned short)trex_parsenumber(exp);307/*******************************/308switch(*exp->_p) {309case '}':310p1 = p0; exp->_p++;311break;312case ',':313exp->_p++;314p1 = 0xFFFF;315if(isdigit(*exp->_p)){316p1 = (unsigned short)trex_parsenumber(exp);317}318trex_expect(exp,'}');319break;320default:321trex_error(exp,_SC(", or } expected"));322}323/*******************************/324isgreedy = TRex_True;325break;326
327}328if(isgreedy) {329//int op;330int nnode = trex_newnode(exp,OP_GREEDY);331//op = OP_GREEDY;332exp->_nodes[nnode].left = ret;333exp->_nodes[nnode].right = ((p0)<<16)|p1;334ret = nnode;335}336}337if((*exp->_p != TREX_SYMBOL_BRANCH) && (*exp->_p != ')') && (*exp->_p != TREX_SYMBOL_GREEDY_ZERO_OR_MORE) && (*exp->_p != TREX_SYMBOL_GREEDY_ONE_OR_MORE) && (*exp->_p != '\0')) {338int nnode = trex_element(exp);339exp->_nodes[ret].next = nnode;340}341
342return ret;343}
344
345static int trex_list(TRex *exp)346{
347int ret=-1,e;348if(*exp->_p == TREX_SYMBOL_BEGINNING_OF_STRING) {349exp->_p++;350ret = trex_newnode(exp,OP_BOL);351}352e = trex_element(exp);353if(ret != -1) {354exp->_nodes[ret].next = e;355}356else ret = e;357
358if(*exp->_p == TREX_SYMBOL_BRANCH) {359int temp,tright;360exp->_p++;361temp = trex_newnode(exp,OP_OR);362exp->_nodes[temp].left = ret;363tright = trex_list(exp);364exp->_nodes[temp].right = tright;365ret = temp;366}367return ret;368}
369
370static TRexBool trex_matchcclass(int cclass,TRexChar c)371{
372switch(cclass) {373case 'a': return isalpha(c)?TRex_True:TRex_False;374case 'A': return !isalpha(c)?TRex_True:TRex_False;375case 'w': return (isalnum(c) || c == '_')?TRex_True:TRex_False;376case 'W': return (!isalnum(c) && c != '_')?TRex_True:TRex_False;377case 's': return isspace(c)?TRex_True:TRex_False;378case 'S': return !isspace(c)?TRex_True:TRex_False;379case 'd': return isdigit(c)?TRex_True:TRex_False;380case 'D': return !isdigit(c)?TRex_True:TRex_False;381case 'x': return isxdigit(c)?TRex_True:TRex_False;382case 'X': return !isxdigit(c)?TRex_True:TRex_False;383case 'c': return iscntrl(c)?TRex_True:TRex_False;384case 'C': return !iscntrl(c)?TRex_True:TRex_False;385case 'p': return ispunct(c)?TRex_True:TRex_False;386case 'P': return !ispunct(c)?TRex_True:TRex_False;387case 'l': return islower(c)?TRex_True:TRex_False;388case 'u': return isupper(c)?TRex_True:TRex_False;389}390return TRex_False; /*cannot happen*/391}
392
393static TRexBool trex_matchclass(TRex* exp,TRexNode *node,TRexChar c)394{
395do {396switch(node->type) {397case OP_RANGE:398if(c >= node->left && c <= node->right) return TRex_True;399break;400case OP_CCLASS:401if(trex_matchcclass(node->left,c)) return TRex_True;402break;403default:404if(c == node->type)return TRex_True;405}406} while((node->next != -1) && (node = &exp->_nodes[node->next]));407return TRex_False;408}
409
410static const TRexChar *trex_matchnode(TRex* exp,TRexNode *node,const TRexChar *str,TRexNode *next)411{
412
413TRexNodeType type = node->type;414switch(type) {415case OP_GREEDY: {416//TRexNode *greedystop = (node->next != -1) ? &exp->_nodes[node->next] : NULL;417TRexNode *greedystop = NULL;418int p0 = (node->right >> 16)&0x0000FFFF, p1 = node->right&0x0000FFFF, nmaches = 0;419const TRexChar *s=str, *good = str;420
421if(node->next != -1) {422greedystop = &exp->_nodes[node->next];423}424else {425greedystop = next;426}427
428while((nmaches == 0xFFFF || nmaches < p1)) {429
430const TRexChar *stop;431if(!(s = trex_matchnode(exp,&exp->_nodes[node->left],s,greedystop)))432break;433nmaches++;434good=s;435if(greedystop) {436//checks that 0 matches satisfy the expression(if so skips)437//if not would always stop(for instance if is a '?')438if(greedystop->type != OP_GREEDY ||439(greedystop->type == OP_GREEDY && ((greedystop->right >> 16)&0x0000FFFF) != 0))440{441TRexNode *gnext = NULL;442if(greedystop->next != -1) {443gnext = &exp->_nodes[greedystop->next];444}else if(next && next->next != -1){445gnext = &exp->_nodes[next->next];446}447stop = trex_matchnode(exp,greedystop,s,gnext);448if(stop) {449//if satisfied stop it450if(p0 == p1 && p0 == nmaches) break;451else if(nmaches >= p0 && p1 == 0xFFFF) break;452else if(nmaches >= p0 && nmaches <= p1) break;453}454}455}456
457if(s >= exp->_eol)458break;459}460if(p0 == p1 && p0 == nmaches) return good;461else if(nmaches >= p0 && p1 == 0xFFFF) return good;462else if(nmaches >= p0 && nmaches <= p1) return good;463return NULL;464}465case OP_OR: {466const TRexChar *asd = str;467TRexNode *temp=&exp->_nodes[node->left];468while( (asd = trex_matchnode(exp,temp,asd,NULL)) ) {469if(temp->next != -1)470temp = &exp->_nodes[temp->next];471else472return asd;473}474asd = str;475temp = &exp->_nodes[node->right];476while( (asd = trex_matchnode(exp,temp,asd,NULL)) ) {477if(temp->next != -1)478temp = &exp->_nodes[temp->next];479else480return asd;481}482return NULL;483break;484}485case OP_EXPR:486case OP_NOCAPEXPR:{487TRexNode *n = &exp->_nodes[node->left];488const TRexChar *cur = str;489int capture = -1;490if(node->type != OP_NOCAPEXPR && node->right == exp->_currsubexp) {491capture = exp->_currsubexp;492exp->_matches[capture].begin = cur;493exp->_currsubexp++;494}495
496do {497TRexNode *subnext = NULL;498if(n->next != -1) {499subnext = &exp->_nodes[n->next];500}else {501subnext = next;502}503if(!(cur = trex_matchnode(exp,n,cur,subnext))) {504if(capture != -1){505exp->_matches[capture].begin = 0;506exp->_matches[capture].len = 0;507}508return NULL;509}510} while((n->next != -1) && (n = &exp->_nodes[n->next]));511
512if(capture != -1)513exp->_matches[capture].len = cur - exp->_matches[capture].begin;514return cur;515}516case OP_WB:517if((str == exp->_bol && !isspace(*str))518|| (str == exp->_eol && !isspace(*(str-1)))519|| (!isspace(*str) && isspace(*(str+1)))520|| (isspace(*str) && !isspace(*(str+1))) ) {521return (node->left == 'b')?str:NULL;522}523return (node->left == 'b')?NULL:str;524case OP_BOL:525if(str == exp->_bol) return str;526return NULL;527case OP_EOL:528if(str == exp->_eol) return str;529return NULL;530case OP_DOT:{531str++;532}533return str;534case OP_NCLASS:535case OP_CLASS:536if(trex_matchclass(exp,&exp->_nodes[node->left],*str)?(type == OP_CLASS?TRex_True:TRex_False):(type == OP_NCLASS?TRex_True:TRex_False)) {537str++;538return str;539}540return NULL;541case OP_CCLASS:542if(trex_matchcclass(node->left,*str)) {543str++;544return str;545}546return NULL;547default: /* char */548if(*str != node->type) return NULL;549str++;550return str;551}552return NULL;553}
554
555/* public api */
556TRex *trex_compile(const TRexChar *pattern,TRexChar *error)557{
558TRex *exp = (TRex *)malloc(sizeof(TRex));559exp->_eol = exp->_bol = NULL;560exp->_p = pattern;561exp->_nallocated = (int)scstrlen(pattern) * sizeof(TRexChar);562exp->_nodes = (TRexNode *)malloc(exp->_nallocated * sizeof(TRexNode));563exp->_nsize = 0;564exp->_matches = 0;565exp->_nsubexpr = 0;566exp->_first = trex_newnode(exp,OP_EXPR);567exp->_error = error;568exp->_jmpbuf = malloc(sizeof(jmp_buf));569if(setjmp(*((jmp_buf*)exp->_jmpbuf)) == 0) {570int res = trex_list(exp);571exp->_nodes[exp->_first].left = res;572if(*exp->_p!='\0')573trex_error(exp,_SC("unexpected character"));574#ifdef _DEBUG575{576int nsize,i;577TRexNode *t;578nsize = exp->_nsize;579t = &exp->_nodes[0];580scprintf(_SC("\n"));581for(i = 0;i < nsize; i++) {582if(exp->_nodes[i].type>MAX_CHAR)583scprintf(_SC("[%02d] %10s "),i,g_nnames[exp->_nodes[i].type-MAX_CHAR]);584else585scprintf(_SC("[%02d] %10c "),i,exp->_nodes[i].type);586scprintf(_SC("left %02d right %02d next %02d\n"),exp->_nodes[i].left,exp->_nodes[i].right,exp->_nodes[i].next);587}588scprintf(_SC("\n"));589}590#endif591exp->_matches = (TRexMatch *) malloc(exp->_nsubexpr * sizeof(TRexMatch));592memset(exp->_matches,0,exp->_nsubexpr * sizeof(TRexMatch));593}594else{595trex_free(exp);596return NULL;597}598return exp;599}
600
601void trex_free(TRex *exp)602{
603if(exp) {604if(exp->_nodes) free(exp->_nodes);605if(exp->_jmpbuf) free(exp->_jmpbuf);606if(exp->_matches) free(exp->_matches);607free(exp);608}609}
610
611TRexBool trex_match(TRex* exp,const TRexChar* text)612{
613const TRexChar* res = NULL;614exp->_bol = text;615exp->_eol = text + scstrlen(text);616exp->_currsubexp = 0;617res = trex_matchnode(exp,exp->_nodes,text,NULL);618if(res == NULL || res != exp->_eol)619return TRex_False;620return TRex_True;621}
622
623TRexBool trex_searchrange(TRex* exp,const TRexChar* text_begin,const TRexChar* text_end,const TRexChar** out_begin, const TRexChar** out_end)624{
625const TRexChar *cur = NULL;626int node = exp->_first;627if(text_begin >= text_end) return TRex_False;628exp->_bol = text_begin;629exp->_eol = text_end;630do {631cur = text_begin;632while(node != -1) {633exp->_currsubexp = 0;634cur = trex_matchnode(exp,&exp->_nodes[node],cur,NULL);635if(!cur)636break;637node = exp->_nodes[node].next;638}639text_begin++;640} while(cur == NULL && text_begin != text_end);641
642if(cur == NULL)643return TRex_False;644
645--text_begin;646
647if(out_begin) *out_begin = text_begin;648if(out_end) *out_end = cur;649return TRex_True;650}
651
652TRexBool trex_search(TRex* exp,const TRexChar* text, const TRexChar** out_begin, const TRexChar** out_end)653{
654return trex_searchrange(exp,text,text + scstrlen(text),out_begin,out_end);655}
656
657int trex_getsubexpcount(TRex* exp)658{
659return exp->_nsubexpr;660}
661
662TRexBool trex_getsubexp(TRex* exp, int n, TRexMatch *subexp)663{
664if( n<0 || n >= exp->_nsubexpr) return TRex_False;665*subexp = exp->_matches[n];666return TRex_True;667}
668
669