1 /* 2 * This file contains code for 3 * 4 * int rexpr(char *expr, char *s); 5 * 6 * which answers 7 * 8 * 1 if 's' is in the language described by the regular expression 'expr' 9 * 0 if it is not 10 * -1 if the regular expression is invalid 11 * 12 * Language membership is determined by constructing a non-deterministic 13 * finite automata (NFA) from the regular expression. A depth- 14 * first-search is performed on the NFA (graph) to check for a match of 's'. 15 * Each non-epsilon arc consumes one character from 's'. Backtracking is 16 * performed to check all possible paths through the NFA. 17 * 18 * Regular expressions follow the meta-language: 19 * 20 * <regExpr> ::= <andExpr> ( '|' <andExpr> )* 21 * 22 * <andExpr> ::= <expr> ( <expr> )* 23 * 24 * <expr> ::= {'~'} '[' <atomList> ']' <repeatSymbol> 25 * | '(' <regExpr> ')' <repeatSymbol> 26 * | '{' <regExpr> '}' <repeatSymbol> 27 * | <atom> <repeatSymbol> 28 * 29 * <repeatSymbol> ::= { '*' | '+' } 30 * 31 * <atomList> ::= <atom> ( <atom> )* 32 * | { <atomList> } <atom> '-' <atom> { <atomList> } 33 * 34 * <atom> ::= Token[Atom] 35 * 36 * Notes: 37 * ~ means complement the set in [..]. i.e. all characters not listed 38 * * means match 0 or more times (can be on expression or atom) 39 * + means match 1 or more times (can be on expression or atom) 40 * {} optional 41 * () grouping 42 * [] set of atoms 43 * x-y all characters from x to y (found only in [..]) 44 * \xx the character with value xx 45 * 46 * Examples: 47 * [a-z]+ 48 * match 1 or more lower-case letters (e.g. variable) 49 * 50 * 0x[0-9A-Fa-f]+ 51 * match a hex number with 0x on front (e.g. 0xA1FF) 52 * 53 * [0-9]+.[0-9]+{e[0-9]+} 54 * match a floating point number (e.g. 3.14e21) 55 * 56 * Code example: 57 * if ( rexpr("[a-zA-Z][a-zA-Z0-9]+", str) ) then str is keyword 58 * 59 * Terence Parr 60 * Purdue University 61 * April 1991 62 */ 63 64 #include <stdio.h> 65 #include <ctype.h> 66 #ifdef __STDC__ 67 #include <stdlib.h> 68 #else 69 #include <malloc.h> 70 #endif 71 #include "rexpr.h" 72 73 #ifdef __USE_PROTOS 74 static int regExpr( GraphPtr g ); 75 static int andExpr( GraphPtr g ); 76 static int expr( GraphPtr g ); 77 static int repeatSymbol( GraphPtr g ); 78 static int atomList( char *p, int complement ); 79 static void next( void ); 80 static ArcPtr newGraphArc( void ); 81 static NodePtr newNode( void ); 82 static int ArcBetweenGraphNode( NodePtr i, NodePtr j, int label ); 83 static Graph BuildNFA_atom( int label ); 84 static Graph BuildNFA_AB( Graph A, Graph B ); 85 static Graph BuildNFA_AorB( Graph A, Graph B ); 86 static Graph BuildNFA_set( char *s ); 87 static Graph BuildNFA_Astar( Graph A ); 88 static Graph BuildNFA_Aplus( Graph A ); 89 static Graph BuildNFA_Aoptional( Graph A ); 90 #else 91 static int regExpr(); 92 static int andExpr(); 93 static int expr(); 94 static int repeatSymbol(); 95 static int atomList(); 96 static void next(); 97 static ArcPtr newGraphArc(); 98 static NodePtr newNode(); 99 static int ArcBetweenGraphNode(); 100 static Graph BuildNFA_atom(); 101 static Graph BuildNFA_AB(); 102 static Graph BuildNFA_AorB(); 103 static Graph BuildNFA_set(); 104 static Graph BuildNFA_Astar(); 105 static Graph BuildNFA_Aplus(); 106 static Graph BuildNFA_Aoptional(); 107 #endif 108 109 static char *_c; 110 static int token, tokchar; 111 static NodePtr accept; 112 static NodePtr freelist = NULL; 113 114 /* 115 * return 1 if s in language described by expr 116 * 0 if s is not 117 * -1 if expr is an invalid regular expression 118 */ 119 #ifdef __USE_PROTOS 120 static int rexpr(char *expr,char *s) 121 #else 122 static int rexpr(expr, s) 123 char *expr, *s; 124 #endif 125 { 126 NodePtr p,q; 127 Graph nfa; 128 int result; 129 130 fprintf(stderr, "rexpr(%s,%s);\n", expr,s); 131 freelist = NULL; 132 _c = expr; 133 next(); 134 if ( regExpr(&nfa) == -1 ) return -1; 135 accept = nfa.right; 136 result = match(nfa.left, s); 137 /* free all your memory */ 138 p = q = freelist; 139 while ( p!=NULL ) { q = p->track; free(p); p = q; } 140 return result; 141 } 142 143 /* 144 * do a depth-first-search on the NFA looking for a path from start to 145 * accept state labelled with the characters of 's'. 146 */ 147 148 #ifdef __USE_PROTOS 149 static int match(NodePtr automaton,char *s) 150 #else 151 static int match(automaton, s) 152 NodePtr automaton; 153 char *s; 154 #endif 155 { 156 ArcPtr p; 157 158 if ( automaton == accept && *s == '\0' ) return 1; /* match */ 159 160 for (p=automaton->arcs; p!=NULL; p=p->next) /* try all arcs */ 161 { 162 if ( p->label == Epsilon ) 163 { 164 if ( match(p->target, s) ) return 1; 165 } 166 else if ( p->label == *s ) 167 if ( match(p->target, s+1) ) return 1; 168 } 169 return 0; 170 } 171 172 /* 173 * <regExpr> ::= <andExpr> ( '|' {<andExpr>} )* 174 * 175 * Return -1 if syntax error 176 * Return 0 if none found 177 * Return 1 if a regExrp was found 178 */ 179 180 #ifdef __USE_PROTOS 181 static int regExpr(GraphPtr g) 182 #else 183 static int regExpr(g) 184 GraphPtr g; 185 #endif 186 { 187 Graph g1, g2; 188 189 if ( andExpr(&g1) == -1 ) 190 { 191 return -1; 192 } 193 194 while ( token == '|' ) 195 { 196 int a; 197 next(); 198 a = andExpr(&g2); 199 if ( a == -1 ) return -1; /* syntax error below */ 200 else if ( !a ) return 1; /* empty alternative */ 201 g1 = BuildNFA_AorB(g1, g2); 202 } 203 204 if ( token!='\0' ) return -1; 205 206 *g = g1; 207 return 1; 208 } 209 210 /* 211 * <andExpr> ::= <expr> ( <expr> )* 212 */ 213 214 #ifdef __USE_PROTOS 215 static int andExpr(GraphPtr g) 216 #else 217 static int andExpr(g) 218 GraphPtr g; 219 #endif 220 { 221 Graph g1, g2; 222 223 if ( expr(&g1) == -1 ) 224 { 225 return -1; 226 } 227 228 while ( token==Atom || token=='{' || token=='(' || token=='~' || token=='[' ) 229 { 230 if (expr(&g2) == -1) return -1; 231 g1 = BuildNFA_AB(g1, g2); 232 } 233 234 *g = g1; 235 return 1; 236 } 237 238 /* 239 * <expr> ::= {'~'} '[' <atomList> ']' <repeatSymbol> 240 * | '(' <regExpr> ')' <repeatSymbol> 241 * | '{' <regExpr> '}' <repeatSymbol> 242 * | <atom> <repeatSymbol> 243 */ 244 245 #ifdef __USE_PROTOS 246 static int expr(GraphPtr g) 247 #else 248 static int expr(g) 249 GraphPtr g; 250 #endif 251 { 252 int complement = 0; 253 char s[257]; /* alloc space for string of char in [] */ 254 255 if ( token == '~' || token == '[' ) 256 { 257 if ( token == '~' ) {complement = 1; next();} 258 if ( token != '[' ) return -1; 259 next(); 260 if ( atomList( s, complement ) == -1 ) return -1; 261 *g = BuildNFA_set( s ); 262 if ( token != ']' ) return -1; 263 next(); 264 repeatSymbol( g ); 265 return 1; 266 } 267 if ( token == '(' ) 268 { 269 next(); 270 if ( regExpr( g ) == -1 ) return -1; 271 if ( token != ')' ) return -1; 272 next(); 273 repeatSymbol( g ); 274 return 1; 275 } 276 if ( token == '{' ) 277 { 278 next(); 279 if ( regExpr( g ) == -1 ) return -1; 280 if ( token != '}' ) return -1; 281 next(); 282 /* S p e c i a l C a s e O p t i o n a l { } */ 283 if ( token != '*' && token != '+' ) 284 { 285 *g = BuildNFA_Aoptional( *g ); 286 } 287 repeatSymbol( g ); 288 return 1; 289 } 290 if ( token == Atom ) 291 { 292 *g = BuildNFA_atom( tokchar ); 293 next(); 294 repeatSymbol( g ); 295 return 1; 296 } 297 298 return -1; 299 } 300 301 /* 302 * <repeatSymbol> ::= { '*' | '+' } 303 */ 304 #ifdef __USE_PROTOS 305 static int repeatSymbol(GraphPtr g) 306 #else 307 static int repeatSymbol(g) 308 GraphPtr g; 309 #endif 310 { 311 switch ( token ) 312 { 313 case '*' : *g = BuildNFA_Astar( *g ); next(); break; 314 case '+' : *g = BuildNFA_Aplus( *g ); next(); break; 315 } 316 return 1; 317 } 318 319 /* 320 * <atomList> ::= <atom> { <atom> }* 321 * { <atomList> } <atom> '-' <atom> { <atomList> } 322 * 323 * a-b is same as ab 324 * q-a is same as q 325 */ 326 327 #ifdef __USE_PROTOS 328 static int atomList(char *p, int complement) 329 #else 330 static int atomList(p, complement) 331 char *p; 332 int complement; 333 #endif 334 { 335 static unsigned char set[256]; /* no duplicates */ 336 int first, last, i; 337 char *s = p; 338 339 if ( token != Atom ) return -1; 340 341 for (i=0; i<256; i++) set[i] = 0; 342 while ( token == Atom ) 343 { 344 if ( !set[tokchar] ) *s++ = tokchar; 345 set[tokchar] = 1; /* Add atom to set */ 346 next(); 347 if ( token == '-' ) /* have we found '-' */ 348 { 349 first = *(s-1); /* Get last char */ 350 next(); 351 if ( token != Atom ) return -1; 352 else 353 { 354 last = tokchar; 355 } 356 for (i = first+1; i <= last; i++) 357 { 358 if ( !set[tokchar] ) *s++ = i; 359 set[i] = 1; /* Add atom to set */ 360 } 361 next(); 362 } 363 } 364 *s = '\0'; 365 if ( complement ) 366 { 367 for (i=0; i<256; i++) set[i] = !set[i]; 368 for (i=1,s=p; i<256; i++) if ( set[i] ) *s++ = i; 369 *s = '\0'; 370 } 371 return 1; 372 } 373 374 /* a somewhat stupid lexical analyzer */ 375 376 #ifdef __USE_PROTOS 377 static void next(void) 378 #else 379 static void next() 380 #endif 381 { 382 while ( *_c==' ' || *_c=='\t' || *_c=='\n' ) _c++; 383 if ( *_c=='\\' ) 384 { 385 _c++; 386 if ( isdigit(*_c) ) 387 { 388 int n=0; 389 while ( isdigit(*_c) ) 390 { 391 n = n*10 + (*_c++ - '0'); 392 } 393 if ( n>255 ) n=255; 394 tokchar = n; 395 } 396 else 397 { 398 switch (*_c) 399 { 400 case 'n' : tokchar = '\n'; break; 401 case 't' : tokchar = '\t'; break; 402 case 'r' : tokchar = '\r'; break; 403 default : tokchar = *_c; 404 } 405 _c++; 406 } 407 token = Atom; 408 } 409 else if ( isgraph(*_c) && *_c!='[' && *_c!='(' && *_c!='{' && 410 *_c!='-' && *_c!='}' && *_c!=')' && *_c!=']' && 411 *_c!='+' && *_c!='*' && *_c!='~' && *_c!='|' ) 412 { 413 token = Atom; 414 tokchar = *_c++; 415 } 416 else 417 { 418 token = tokchar = *_c++; 419 } 420 } 421 422 /* N F A B u i l d i n g R o u t i n e s */ 423 424 #ifdef __USE_PROTOS 425 static ArcPtr newGraphArc(void) 426 #else 427 static ArcPtr newGraphArc() 428 #endif 429 { 430 ArcPtr p; 431 p = (ArcPtr) calloc(1, sizeof(Arc)); 432 if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);} 433 if ( freelist != NULL ) p->track = (ArcPtr) freelist; 434 freelist = (NodePtr) p; 435 return p; 436 } 437 438 #ifdef __USE_PROTOS 439 static NodePtr newNode(void) 440 #else 441 static NodePtr newNode() 442 #endif 443 { 444 NodePtr p; 445 p = (NodePtr) calloc(1, sizeof(Node)); 446 if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);} 447 if ( freelist != NULL ) p->track = freelist; 448 freelist = p; 449 return p; 450 } 451 452 #ifdef __USE_PROTOS 453 static void ArcBetweenGraphNodes(NodePtr i,NodePtr j,int label) 454 #else 455 static void ArcBetweenGraphNodes(i, j, label) 456 NodePtr i, j; 457 int label; 458 #endif 459 { 460 ArcPtr a; 461 462 a = newGraphArc(); 463 if ( i->arcs == NULL ) i->arctail = i->arcs = a; 464 else {(i->arctail)->next = a; i->arctail = a;} 465 a->label = label; 466 a->target = j; 467 } 468 469 #ifdef __USE_PROTOS 470 static Graph BuildNFA_atom(int label) 471 #else 472 static Graph BuildNFA_atom(label) 473 int label; 474 #endif 475 { 476 Graph g; 477 478 g.left = newNode(); 479 g.right = newNode(); 480 ArcBetweenGraphNodes(g.left, g.right, label); 481 return( g ); 482 } 483 484 #ifdef __USE_PROTOS 485 static Graph BuildNFA_AB(Graph A,Graph B) 486 #else 487 static Graph BuildNFA_AB(A, B) 488 Graph A, B; 489 #endif 490 { 491 Graph g; 492 493 ArcBetweenGraphNodes(A.right, B.left, Epsilon); 494 g.left = A.left; 495 g.right = B.right; 496 return( g ); 497 } 498 499 #ifdef __USE_PROTOS 500 static Graph BuildNFA_AorB(Graph A,Graph B) 501 #else 502 static Graph BuildNFA_AorB(A, B) 503 Graph A, B; 504 #endif 505 { 506 Graph g; 507 508 g.left = newNode(); 509 ArcBetweenGraphNodes(g.left, A.left, Epsilon); 510 ArcBetweenGraphNodes(g.left, B.left, Epsilon); 511 g.right = newNode(); 512 ArcBetweenGraphNodes(A.right, g.right, Epsilon); 513 ArcBetweenGraphNodes(B.right, g.right, Epsilon); 514 return( g ); 515 } 516 517 #ifdef __USE_PROTOS 518 static Graph BuildNFA_set(char *s) 519 #else 520 static Graph BuildNFA_set( s ) 521 char *s; 522 #endif 523 { 524 Graph g; 525 526 if ( s == NULL ) return g; 527 528 g.left = newNode(); 529 g.right = newNode(); 530 while ( *s != '\0' ) 531 { 532 ArcBetweenGraphNodes(g.left, g.right, *s++); 533 } 534 return g; 535 } 536 537 #ifdef __USE_PROTOS 538 static Graph BuildNFA_Astar(Graph A) 539 #else 540 static Graph BuildNFA_Astar( A ) 541 Graph A; 542 #endif 543 { 544 Graph g; 545 546 g.left = newNode(); 547 g.right = newNode(); 548 549 ArcBetweenGraphNodes(g.left, A.left, Epsilon); 550 ArcBetweenGraphNodes(g.left, g.right, Epsilon); 551 ArcBetweenGraphNodes(A.right, g.right, Epsilon); 552 ArcBetweenGraphNodes(A.right, A.left, Epsilon); 553 554 return( g ); 555 } 556 557 #ifdef __USE_PROTOS 558 static Graph BuildNFA_Aplus(Graph A) 559 #else 560 static Graph BuildNFA_Aplus( A ) 561 Graph A; 562 #endif 563 { 564 ArcBetweenGraphNodes(A.right, A.left, Epsilon); 565 566 return( A ); 567 } 568 569 #ifdef __USE_PROTOS 570 static Graph BuildNFA_Aoptional(Graph A) 571 #else 572 static Graph BuildNFA_Aoptional( A ) 573 Graph A; 574 #endif 575 { 576 Graph g; 577 578 g.left = newNode(); 579 g.right = newNode(); 580 581 ArcBetweenGraphNodes(g.left, A.left, Epsilon); 582 ArcBetweenGraphNodes(g.left, g.right, Epsilon); 583 ArcBetweenGraphNodes(A.right, g.right, Epsilon); 584 585 return( g ); 586 } 587