Home | History | Annotate | Download | only in rexpr
      1 /*
      2  * This file contains code for
      3  *
      4  *      int rexpr(char *expr, char *s);
      5  *
      6  * which answers
      7  *
      8  *      1 if 's' is in the language described by the regular expression 'expr'
      9  *      0 if it is not
     10  *     -1 if the regular expression is invalid
     11  *
     12  * Language membership is determined by constructing a non-deterministic
     13  * finite automata (NFA) from the regular expression.  A depth-
     14  * first-search is performed on the NFA (graph) to check for a match of 's'.
     15  * Each non-epsilon arc consumes one character from 's'.  Backtracking is
     16  * performed to check all possible paths through the NFA.
     17  *
     18  * Regular expressions follow the meta-language:
     19  *
     20  * <regExpr>        ::= <andExpr> ( '|' <andExpr> )*
     21  *
     22  * <andExpr>        ::= <expr> ( <expr> )*
     23  *
     24  * <expr>           ::= {'~'} '[' <atomList> ']' <repeatSymbol>
     25  *                      | '(' <regExpr> ')' <repeatSymbol>
     26  *                      | '{' <regExpr> '}' <repeatSymbol>
     27  *                      | <atom> <repeatSymbol>
     28  *
     29  * <repeatSymbol>   ::= { '*' | '+' }
     30  *
     31  * <atomList>       ::= <atom> ( <atom> )*
     32  *                      | { <atomList> } <atom> '-' <atom> { <atomList> }
     33  *
     34  * <atom>           ::= Token[Atom]
     35  *
     36  * Notes:
     37  *		~	means complement the set in [..]. i.e. all characters not listed
     38  *		*	means match 0 or more times (can be on expression or atom)
     39  *		+	means match 1 or more times (can be on expression or atom)
     40  *		{}	optional
     41  *		()	grouping
     42  *		[]	set of atoms
     43  *		x-y	all characters from x to y (found only in [..])
     44  *		\xx the character with value xx
     45  *
     46  * Examples:
     47  *		[a-z]+
     48  *			match 1 or more lower-case letters (e.g. variable)
     49  *
     50  *		0x[0-9A-Fa-f]+
     51  *			match a hex number with 0x on front (e.g. 0xA1FF)
     52  *
     53  *		[0-9]+.[0-9]+{e[0-9]+}
     54  *			match a floating point number (e.g. 3.14e21)
     55  *
     56  * Code example:
     57  *		if ( rexpr("[a-zA-Z][a-zA-Z0-9]+", str) ) then str is keyword
     58  *
     59  * Terence Parr
     60  * Purdue University
     61  * April 1991
     62  */
     63 
     64 #include <stdio.h>
     65 #include <ctype.h>
     66 #ifdef __STDC__
     67 #include <stdlib.h>
     68 #else
     69 #include <malloc.h>
     70 #endif
     71 #include "rexpr.h"
     72 
     73 #ifdef __USE_PROTOS
     74 static int regExpr( GraphPtr g );
     75 static int andExpr( GraphPtr g );
     76 static int expr( GraphPtr g );
     77 static int repeatSymbol( GraphPtr g );
     78 static int atomList( char *p, int complement );
     79 static void next( void );
     80 static ArcPtr newGraphArc( void );
     81 static NodePtr newNode( void );
     82 static int ArcBetweenGraphNode( NodePtr i, NodePtr j, int label );
     83 static Graph BuildNFA_atom( int label );
     84 static Graph BuildNFA_AB( Graph A, Graph B );
     85 static Graph BuildNFA_AorB( Graph A, Graph B );
     86 static Graph BuildNFA_set( char *s );
     87 static Graph BuildNFA_Astar( Graph A );
     88 static Graph BuildNFA_Aplus( Graph A );
     89 static Graph BuildNFA_Aoptional( Graph A );
     90 #else
     91 static int regExpr();
     92 static int andExpr();
     93 static int expr();
     94 static int repeatSymbol();
     95 static int atomList();
     96 static void next();
     97 static ArcPtr newGraphArc();
     98 static NodePtr newNode();
     99 static int ArcBetweenGraphNode();
    100 static Graph BuildNFA_atom();
    101 static Graph BuildNFA_AB();
    102 static Graph BuildNFA_AorB();
    103 static Graph BuildNFA_set();
    104 static Graph BuildNFA_Astar();
    105 static Graph BuildNFA_Aplus();
    106 static Graph BuildNFA_Aoptional();
    107 #endif
    108 
    109 static char *_c;
    110 static int token, tokchar;
    111 static NodePtr accept;
    112 static NodePtr freelist = NULL;
    113 
    114 /*
    115  * return 1 if s in language described by expr
    116  *        0 if s is not
    117  *       -1 if expr is an invalid regular expression
    118  */
    119 #ifdef __USE_PROTOS
    120 static int rexpr(char *expr,char *s)
    121 #else
    122 static int rexpr(expr, s)
    123 char *expr, *s;
    124 #endif
    125 {
    126 	NodePtr p,q;
    127 	Graph nfa;
    128 	int result;
    129 
    130 	fprintf(stderr, "rexpr(%s,%s);\n", expr,s);
    131 	freelist = NULL;
    132 	_c = expr;
    133 	next();
    134 	if ( regExpr(&nfa) == -1 ) return -1;
    135 	accept = nfa.right;
    136 	result = match(nfa.left, s);
    137 	/* free all your memory */
    138 	p = q = freelist;
    139 	while ( p!=NULL ) { q = p->track; free(p); p = q; }
    140 	return result;
    141 }
    142 
    143 /*
    144  * do a depth-first-search on the NFA looking for a path from start to
    145  * accept state labelled with the characters of 's'.
    146  */
    147 
    148 #ifdef __USE_PROTOS
    149 static int match(NodePtr automaton,char *s)
    150 #else
    151 static int match(automaton, s)
    152 NodePtr automaton;
    153 char *s;
    154 #endif
    155 {
    156 	ArcPtr p;
    157 
    158 	if ( automaton == accept && *s == '\0' ) return 1;	/* match */
    159 
    160 	for (p=automaton->arcs; p!=NULL; p=p->next)			/* try all arcs */
    161 	{
    162 		if ( p->label == Epsilon )
    163 		{
    164 			if ( match(p->target, s) ) return 1;
    165 		}
    166 		else if ( p->label == *s )
    167 				if ( match(p->target, s+1) ) return 1;
    168 	}
    169 	return 0;
    170 }
    171 
    172 /*
    173  * <regExpr>        ::= <andExpr> ( '|' {<andExpr>} )*
    174  *
    175  * Return -1 if syntax error
    176  * Return  0 if none found
    177  * Return  1 if a regExrp was found
    178  */
    179 
    180 #ifdef __USE_PROTOS
    181 static int regExpr(GraphPtr g)
    182 #else
    183 static int regExpr(g)
    184 GraphPtr g;
    185 #endif
    186 {
    187 	Graph g1, g2;
    188 
    189 	if ( andExpr(&g1) == -1 )
    190 	{
    191 		return -1;
    192 	}
    193 
    194 	while ( token == '|' )
    195 	{
    196 		int a;
    197 		next();
    198 		a = andExpr(&g2);
    199 		if ( a == -1 ) return -1;	/* syntax error below */
    200 		else if ( !a ) return 1;	/* empty alternative */
    201 		g1 = BuildNFA_AorB(g1, g2);
    202 	}
    203 
    204 	if ( token!='\0' ) return -1;
    205 
    206 	*g = g1;
    207 	return 1;
    208 }
    209 
    210 /*
    211  * <andExpr>        ::= <expr> ( <expr> )*
    212  */
    213 
    214 #ifdef __USE_PROTOS
    215 static int andExpr(GraphPtr g)
    216 #else
    217 static int andExpr(g)
    218 GraphPtr g;
    219 #endif
    220 {
    221 	Graph g1, g2;
    222 
    223 	if ( expr(&g1) == -1 )
    224 	{
    225 		return -1;
    226 	}
    227 
    228 	while ( token==Atom || token=='{' || token=='(' || token=='~' || token=='[' )
    229 	{
    230 		if (expr(&g2) == -1) return -1;
    231 		g1 = BuildNFA_AB(g1, g2);
    232 	}
    233 
    234 	*g = g1;
    235 	return 1;
    236 }
    237 
    238 /*
    239  * <expr>           ::=    {'~'} '[' <atomList> ']' <repeatSymbol>
    240  *                      | '(' <regExpr> ')' <repeatSymbol>
    241  *                      | '{' <regExpr> '}' <repeatSymbol>
    242  *                      | <atom> <repeatSymbol>
    243  */
    244 
    245 #ifdef __USE_PROTOS
    246 static int expr(GraphPtr g)
    247 #else
    248 static int expr(g)
    249 GraphPtr g;
    250 #endif
    251 {
    252 	int complement = 0;
    253 	char s[257];    /* alloc space for string of char in [] */
    254 
    255 	if ( token == '~' || token == '[' )
    256 	{
    257 		if ( token == '~' ) {complement = 1; next();}
    258 		if ( token != '[' ) return -1;
    259 		next();
    260 		if ( atomList( s, complement ) == -1 ) return -1;
    261 		*g = BuildNFA_set( s );
    262 		if ( token != ']' ) return -1;
    263 		next();
    264 		repeatSymbol( g );
    265 		return 1;
    266 	}
    267 	if ( token == '(' )
    268 	{
    269 		next();
    270 		if ( regExpr( g ) == -1 ) return -1;
    271 		if ( token != ')' ) return -1;
    272 		next();
    273 		repeatSymbol( g );
    274 		return 1;
    275 	}
    276 	if ( token == '{' )
    277 	{
    278 		next();
    279 		if ( regExpr( g ) == -1 ) return -1;
    280 		if ( token != '}' ) return -1;
    281 		next();
    282 		/* S p e c i a l  C a s e   O p t i o n a l  {  } */
    283 		if ( token != '*' && token != '+' )
    284 		{
    285 			*g = BuildNFA_Aoptional( *g );
    286 		}
    287 		repeatSymbol( g );
    288 		return 1;
    289 	}
    290 	if ( token == Atom )
    291 	{
    292 		*g = BuildNFA_atom( tokchar );
    293 		next();
    294 		repeatSymbol( g );
    295 		return 1;
    296 	}
    297 
    298 	return -1;
    299 }
    300 
    301 /*
    302  * <repeatSymbol>   ::= { '*' | '+' }
    303  */
    304 #ifdef __USE_PROTOS
    305 static int repeatSymbol(GraphPtr g)
    306 #else
    307 static int repeatSymbol(g)
    308 GraphPtr g;
    309 #endif
    310 {
    311 	switch ( token )
    312 	{
    313 		case '*' : *g = BuildNFA_Astar( *g ); next(); break;
    314 		case '+' : *g = BuildNFA_Aplus( *g ); next(); break;
    315 	}
    316 	return 1;
    317 }
    318 
    319 /*
    320  * <atomList>       ::= <atom> { <atom> }*
    321  *                      { <atomList> } <atom> '-' <atom> { <atomList> }
    322  *
    323  * a-b is same as ab
    324  * q-a is same as q
    325  */
    326 
    327 #ifdef __USE_PROTOS
    328 static int atomList(char *p, int complement)
    329 #else
    330 static int atomList(p, complement)
    331 char *p;
    332 int complement;
    333 #endif
    334 {
    335 	static unsigned char set[256];		/* no duplicates */
    336 	int first, last, i;
    337 	char *s = p;
    338 
    339 	if ( token != Atom ) return -1;
    340 
    341 	for (i=0; i<256; i++) set[i] = 0;
    342 	while ( token == Atom )
    343 	{
    344 		if ( !set[tokchar] ) *s++ = tokchar;
    345 		set[tokchar] = 1;    			/* Add atom to set */
    346 		next();
    347 		if ( token == '-' )         	/* have we found '-' */
    348 		{
    349 			first = *(s-1);             /* Get last char */
    350 			next();
    351 			if ( token != Atom ) return -1;
    352 			else
    353 			{
    354 				last = tokchar;
    355 			}
    356 			for (i = first+1; i <= last; i++)
    357 			{
    358 				if ( !set[tokchar] ) *s++ = i;
    359 				set[i] = 1;    			/* Add atom to set */
    360 			}
    361 			next();
    362 		}
    363 	}
    364 	*s = '\0';
    365 	if ( complement )
    366 	{
    367 		for (i=0; i<256; i++) set[i] = !set[i];
    368 		for (i=1,s=p; i<256; i++) if ( set[i] ) *s++ = i;
    369 		*s = '\0';
    370 	}
    371 	return 1;
    372 }
    373 
    374 /* a somewhat stupid lexical analyzer */
    375 
    376 #ifdef __USE_PROTOS
    377 static void next(void)
    378 #else
    379 static void next()
    380 #endif
    381 {
    382 	while ( *_c==' ' || *_c=='\t' || *_c=='\n' ) _c++;
    383 	if ( *_c=='\\' )
    384 	{
    385 		_c++;
    386 		if ( isdigit(*_c) )
    387 		{
    388 			int n=0;
    389 			while ( isdigit(*_c) )
    390 			{
    391 				n = n*10 + (*_c++ - '0');
    392 			}
    393 			if ( n>255 ) n=255;
    394 			tokchar = n;
    395 		}
    396 		else
    397 		{
    398 			switch (*_c)
    399 			{
    400 				case 'n' : tokchar = '\n'; break;
    401 				case 't' : tokchar = '\t'; break;
    402 				case 'r' : tokchar = '\r'; break;
    403 				default  : tokchar = *_c;
    404 			}
    405 			_c++;
    406 		}
    407 		token = Atom;
    408 	}
    409 	else if ( isgraph(*_c) && *_c!='[' && *_c!='(' && *_c!='{' &&
    410 			  *_c!='-' && *_c!='}' && *_c!=')' && *_c!=']' &&
    411 			  *_c!='+' && *_c!='*' && *_c!='~' && *_c!='|' )
    412 	{
    413 		token = Atom;
    414 		tokchar = *_c++;
    415 	}
    416 	else
    417 	{
    418 		token = tokchar = *_c++;
    419 	}
    420 }
    421 
    422 /* N F A  B u i l d i n g  R o u t i n e s */
    423 
    424 #ifdef __USE_PROTOS
    425 static ArcPtr newGraphArc(void)
    426 #else
    427 static ArcPtr newGraphArc()
    428 #endif
    429 {
    430 	ArcPtr p;
    431 	p = (ArcPtr) calloc(1, sizeof(Arc));
    432 	if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);}
    433 	if ( freelist != NULL ) p->track = (ArcPtr) freelist;
    434 	freelist = (NodePtr) p;
    435 	return p;
    436 }
    437 
    438 #ifdef __USE_PROTOS
    439 static NodePtr newNode(void)
    440 #else
    441 static NodePtr newNode()
    442 #endif
    443 {
    444 	NodePtr p;
    445 	p = (NodePtr) calloc(1, sizeof(Node));
    446 	if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);}
    447 	if ( freelist != NULL ) p->track = freelist;
    448 	freelist = p;
    449 	return p;
    450 }
    451 
    452 #ifdef __USE_PROTOS
    453 static void ArcBetweenGraphNodes(NodePtr i,NodePtr j,int label)
    454 #else
    455 static void ArcBetweenGraphNodes(i, j, label)
    456 NodePtr i, j;
    457 int label;
    458 #endif
    459 {
    460 	ArcPtr a;
    461 
    462 	a = newGraphArc();
    463 	if ( i->arcs == NULL ) i->arctail = i->arcs = a;
    464 	else {(i->arctail)->next = a; i->arctail = a;}
    465 	a->label = label;
    466 	a->target = j;
    467 }
    468 
    469 #ifdef __USE_PROTOS
    470 static Graph BuildNFA_atom(int label)
    471 #else
    472 static Graph BuildNFA_atom(label)
    473 int label;
    474 #endif
    475 {
    476 	Graph g;
    477 
    478 	g.left = newNode();
    479 	g.right = newNode();
    480 	ArcBetweenGraphNodes(g.left, g.right, label);
    481 	return( g );
    482 }
    483 
    484 #ifdef __USE_PROTOS
    485 static Graph BuildNFA_AB(Graph A,Graph B)
    486 #else
    487 static Graph BuildNFA_AB(A, B)
    488 Graph A, B;
    489 #endif
    490 {
    491 	Graph g;
    492 
    493 	ArcBetweenGraphNodes(A.right, B.left, Epsilon);
    494 	g.left = A.left;
    495 	g.right = B.right;
    496 	return( g );
    497 }
    498 
    499 #ifdef __USE_PROTOS
    500 static Graph BuildNFA_AorB(Graph A,Graph B)
    501 #else
    502 static Graph BuildNFA_AorB(A, B)
    503 Graph A, B;
    504 #endif
    505 {
    506 	Graph g;
    507 
    508 	g.left = newNode();
    509 	ArcBetweenGraphNodes(g.left, A.left, Epsilon);
    510 	ArcBetweenGraphNodes(g.left, B.left, Epsilon);
    511 	g.right = newNode();
    512 	ArcBetweenGraphNodes(A.right, g.right, Epsilon);
    513 	ArcBetweenGraphNodes(B.right, g.right, Epsilon);
    514 	return( g );
    515 }
    516 
    517 #ifdef __USE_PROTOS
    518 static Graph BuildNFA_set(char *s)
    519 #else
    520 static Graph BuildNFA_set( s )
    521 char *s;
    522 #endif
    523 {
    524 	Graph g;
    525 
    526 	if ( s == NULL ) return g;
    527 
    528 	g.left = newNode();
    529 	g.right = newNode();
    530 	while ( *s != '\0' )
    531 	{
    532 		ArcBetweenGraphNodes(g.left, g.right, *s++);
    533 	}
    534 	return g;
    535 }
    536 
    537 #ifdef __USE_PROTOS
    538 static Graph BuildNFA_Astar(Graph A)
    539 #else
    540 static Graph BuildNFA_Astar( A )
    541 Graph A;
    542 #endif
    543 {
    544 	Graph g;
    545 
    546 	g.left = newNode();
    547 	g.right = newNode();
    548 
    549 	ArcBetweenGraphNodes(g.left, A.left, Epsilon);
    550 	ArcBetweenGraphNodes(g.left, g.right, Epsilon);
    551 	ArcBetweenGraphNodes(A.right, g.right, Epsilon);
    552 	ArcBetweenGraphNodes(A.right, A.left, Epsilon);
    553 
    554 	return( g );
    555 }
    556 
    557 #ifdef __USE_PROTOS
    558 static Graph BuildNFA_Aplus(Graph A)
    559 #else
    560 static Graph BuildNFA_Aplus( A )
    561 Graph A;
    562 #endif
    563 {
    564 	ArcBetweenGraphNodes(A.right, A.left, Epsilon);
    565 
    566 	return( A );
    567 }
    568 
    569 #ifdef __USE_PROTOS
    570 static Graph BuildNFA_Aoptional(Graph A)
    571 #else
    572 static Graph BuildNFA_Aoptional( A )
    573 Graph A;
    574 #endif
    575 {
    576 	Graph g;
    577 
    578 	g.left = newNode();
    579 	g.right = newNode();
    580 
    581 	ArcBetweenGraphNodes(g.left, A.left, Epsilon);
    582 	ArcBetweenGraphNodes(g.left, g.right, Epsilon);
    583 	ArcBetweenGraphNodes(A.right, g.right, Epsilon);
    584 
    585 	return( g );
    586 }
    587