1 #include <time.h> 2 #include <string.h> 3 #include <stdio.h> 4 #include <ctype.h> 5 6 #include "tools/re2c/globals.h" 7 #include "tools/re2c/parse.h" 8 #include "tools/re2c/dfa.h" 9 10 static Symbol *first = NULL; 11 12 void 13 Symbol_init(Symbol *r, const SubStr *str) 14 { 15 r->next = first; 16 Str_init(&r->name, str); 17 r->re = NULL; 18 first = r; 19 } 20 21 Symbol * 22 Symbol_find(const SubStr *str) 23 { 24 Symbol *sym; 25 for(sym = first; sym; sym = sym->next) 26 if(SubStr_eq(&sym->name, str)) return sym; 27 return Symbol_new(str); 28 } 29 30 /* 31 void showIns(FILE *o, const Ins *i, const Ins *base){ 32 o.width(3); 33 o << &i - &base << ": "; 34 switch(i.i.tag){ 35 case CHAR: { 36 o << "match "; 37 for(const Ins *j = &(&i)[1]; j < (Ins*) i.i.link; ++j) 38 prtCh(o, j->c.value); 39 break; 40 } case GOTO: 41 o << "goto " << ((Ins*) i.i.link - &base); 42 break; 43 case FORK: 44 o << "fork " << ((Ins*) i.i.link - &base); 45 break; 46 case CTXT: 47 o << "term " << ((RuleOp*) i.i.link)->accept; 48 break; 49 case TERM: 50 o << "term " << ((RuleOp*) i.i.link)->accept; 51 break; 52 } 53 o << "\n"; 54 } 55 */ 56 57 static unsigned int 58 AltOp_fixedLength(RegExp *r) 59 { 60 unsigned int l1 = RegExp_fixedLength(r->d.AltCatOp.exp1); 61 /* XXX? Should be exp2? */ 62 unsigned int l2 = RegExp_fixedLength(r->d.AltCatOp.exp1); 63 if(l1 != l2 || l1 == ~0u) 64 return ~0u; 65 return l1; 66 } 67 68 static unsigned int 69 CatOp_fixedLength(RegExp *r) 70 { 71 unsigned int l1, l2; 72 if((l1 = RegExp_fixedLength(r->d.AltCatOp.exp1)) != ~0u ) 73 if((l2 = RegExp_fixedLength(r->d.AltCatOp.exp2)) != ~0u) 74 return l1+l2; 75 return ~0u; 76 } 77 78 unsigned int 79 RegExp_fixedLength(RegExp *r) 80 { 81 switch (r->type) { 82 case NULLOP: 83 return 0; 84 case MATCHOP: 85 return 1; 86 case ALTOP: 87 return AltOp_fixedLength(r); 88 case CATOP: 89 return CatOp_fixedLength(r); 90 default: 91 return ~0u; 92 } 93 return ~0u; 94 } 95 96 void 97 RegExp_calcSize(RegExp *re, Char *rep) 98 { 99 Range *r; 100 unsigned int c; 101 102 switch (re->type) { 103 case NULLOP: 104 re->size = 0; 105 break; 106 case MATCHOP: 107 re->size = 1; 108 for(r = re->d.match; r; r = r->next) 109 for(c = r->lb; c < r->ub; ++c) 110 if(rep[c] == c) 111 ++re->size; 112 break; 113 case RULEOP: 114 RegExp_calcSize(re->d.RuleOp.exp, rep); 115 RegExp_calcSize(re->d.RuleOp.ctx, rep); 116 re->size = re->d.RuleOp.exp->size + re->d.RuleOp.ctx->size + 1; 117 break; 118 case ALTOP: 119 RegExp_calcSize(re->d.AltCatOp.exp1, rep); 120 RegExp_calcSize(re->d.AltCatOp.exp2, rep); 121 re->size = re->d.AltCatOp.exp1->size 122 + re->d.AltCatOp.exp2->size + 2; 123 break; 124 case CATOP: 125 RegExp_calcSize(re->d.AltCatOp.exp1, rep); 126 RegExp_calcSize(re->d.AltCatOp.exp2, rep); 127 re->size = re->d.AltCatOp.exp1->size + re->d.AltCatOp.exp2->size; 128 break; 129 case CLOSEOP: 130 RegExp_calcSize(re->d.exp, rep); 131 re->size = re->d.exp->size + 1; 132 break; 133 case CLOSEVOP: 134 RegExp_calcSize(re->d.CloseVOp.exp, rep); 135 136 if (re->d.CloseVOp.max >= 0) 137 re->size = (re->d.CloseVOp.exp->size * re->d.CloseVOp.min) + 138 ((1 + re->d.CloseVOp.exp->size) * 139 (re->d.CloseVOp.max - re->d.CloseVOp.min)); 140 else 141 re->size = (re->d.CloseVOp.exp->size * re->d.CloseVOp.min) + 1; 142 break; 143 } 144 } 145 146 static void 147 MatchOp_compile(RegExp *re, Char *rep, Ins *i) 148 { 149 Ins *j; 150 unsigned int bump; 151 Range *r; 152 unsigned int c; 153 154 i->i.tag = CHAR; 155 i->i.link = &i[re->size]; 156 j = &i[1]; 157 bump = re->size; 158 for(r = re->d.match; r; r = r->next){ 159 for(c = r->lb; c < r->ub; ++c){ 160 if(rep[c] == c){ 161 j->c.value = c; 162 j->c.bump = --bump; 163 j++; 164 } 165 } 166 } 167 } 168 169 static void 170 AltOp_compile(RegExp *re, Char *rep, Ins *i){ 171 Ins *j; 172 173 i->i.tag = FORK; 174 j = &i[re->d.AltCatOp.exp1->size + 1]; 175 i->i.link = &j[1]; 176 RegExp_compile(re->d.AltCatOp.exp1, rep, &i[1]); 177 j->i.tag = GOTO; 178 j->i.link = &j[re->d.AltCatOp.exp2->size + 1]; 179 RegExp_compile(re->d.AltCatOp.exp2, rep, &j[1]); 180 } 181 182 void 183 RegExp_compile(RegExp *re, Char *rep, Ins *i) 184 { 185 Ins *jumppoint; 186 int st = 0; 187 188 switch (re->type) { 189 case NULLOP: 190 break; 191 case MATCHOP: 192 MatchOp_compile(re, rep, i); 193 break; 194 case RULEOP: 195 re->d.RuleOp.ins = i; 196 RegExp_compile(re->d.RuleOp.exp, rep, &i[0]); 197 i += re->d.RuleOp.exp->size; 198 RegExp_compile(re->d.RuleOp.ctx, rep, &i[0]); 199 i += re->d.RuleOp.ctx->size; 200 i->i.tag = TERM; 201 i->i.link = re; 202 break; 203 case ALTOP: 204 AltOp_compile(re, rep, i); 205 break; 206 case CATOP: 207 RegExp_compile(re->d.AltCatOp.exp1, rep, &i[0]); 208 RegExp_compile(re->d.AltCatOp.exp2, rep, 209 &i[re->d.AltCatOp.exp1->size]); 210 break; 211 case CLOSEOP: 212 RegExp_compile(re->d.exp, rep, &i[0]); 213 i += re->d.exp->size; 214 i->i.tag = FORK; 215 i->i.link = i - re->d.exp->size; 216 break; 217 case CLOSEVOP: 218 jumppoint = i + ((1 + re->d.CloseVOp.exp->size) * 219 (re->d.CloseVOp.max - re->d.CloseVOp.min)); 220 for(st = re->d.CloseVOp.min; st < re->d.CloseVOp.max; st++) { 221 i->i.tag = FORK; 222 i->i.link = jumppoint; 223 i+=1; 224 RegExp_compile(re->d.CloseVOp.exp, rep, &i[0]); 225 i += re->d.CloseVOp.exp->size; 226 } 227 for(st = 0; st < re->d.CloseVOp.min; st++) { 228 RegExp_compile(re->d.CloseVOp.exp, rep, &i[0]); 229 i += re->d.CloseVOp.exp->size; 230 if(re->d.CloseVOp.max < 0 && st == 0) { 231 i->i.tag = FORK; 232 i->i.link = i - re->d.CloseVOp.exp->size; 233 i++; 234 } 235 } 236 break; 237 } 238 } 239 240 static void 241 MatchOp_split(RegExp *re, CharSet *s) 242 { 243 Range *r; 244 unsigned int c; 245 246 for(r = re->d.match; r; r = r->next){ 247 for(c = r->lb; c < r->ub; ++c){ 248 CharPtn *x = s->rep[c], *a = x->nxt; 249 if(!a){ 250 if(x->card == 1) 251 continue; 252 x->nxt = a = s->freeHead; 253 if(!(s->freeHead = s->freeHead->nxt)) 254 s->freeTail = &s->freeHead; 255 a->nxt = NULL; 256 x->fix = s->fix; 257 s->fix = x; 258 } 259 if(--(x->card) == 0){ 260 *s->freeTail = x; 261 *(s->freeTail = &x->nxt) = NULL; 262 } 263 s->rep[c] = a; 264 ++(a->card); 265 } 266 } 267 for(; s->fix; s->fix = s->fix->fix) 268 if(s->fix->card) 269 s->fix->nxt = NULL; 270 } 271 272 void 273 RegExp_split(RegExp *re, CharSet *s) 274 { 275 switch (re->type) { 276 case NULLOP: 277 break; 278 case MATCHOP: 279 MatchOp_split(re, s); 280 break; 281 case RULEOP: 282 RegExp_split(re->d.RuleOp.exp, s); 283 RegExp_split(re->d.RuleOp.ctx, s); 284 break; 285 case ALTOP: 286 /* FALLTHROUGH */ 287 case CATOP: 288 RegExp_split(re->d.AltCatOp.exp1, s); 289 RegExp_split(re->d.AltCatOp.exp2, s); 290 break; 291 case CLOSEOP: 292 RegExp_split(re->d.exp, s); 293 break; 294 case CLOSEVOP: 295 RegExp_split(re->d.CloseVOp.exp, s); 296 break; 297 } 298 } 299 300 void 301 RegExp_display(RegExp *re, FILE *o) 302 { 303 switch (re->type) { 304 case NULLOP: 305 fputc('_', o); 306 break; 307 case MATCHOP: 308 Range_out(o, re->d.match); 309 break; 310 case RULEOP: 311 RegExp_display(re->d.RuleOp.exp, o); 312 fputc('/', o); 313 RegExp_display(re->d.RuleOp.ctx, o); 314 fputc(';', o); 315 break; 316 case ALTOP: 317 RegExp_display(re->d.AltCatOp.exp1, o); 318 fputc('|', o); 319 RegExp_display(re->d.AltCatOp.exp2, o); 320 break; 321 case CATOP: 322 RegExp_display(re->d.AltCatOp.exp1, o); 323 RegExp_display(re->d.AltCatOp.exp2, o); 324 break; 325 case CLOSEOP: 326 RegExp_display(re->d.exp, o); 327 fputc('+', o); 328 break; 329 } 330 } 331 332 void 333 Range_out(FILE *o, const Range *r) 334 { 335 if(!r) 336 return; 337 338 if((r->ub - r->lb) == 1){ 339 prtCh(o, r->lb); 340 } else { 341 prtCh(o, r->lb); 342 fputc('-', o); 343 prtCh(o, r->ub-1); 344 } 345 Range_out(o, r->next); 346 } 347 348 static Range *doUnion(Range *r1, Range *r2){ 349 Range *r, **rP = &r; 350 for(;;){ 351 Range *s; 352 if(r1->lb <= r2->lb){ 353 s = Range_new_copy(r1); 354 } else { 355 s = Range_new_copy(r2); 356 } 357 *rP = s; 358 rP = &s->next; 359 for(;;){ 360 if(r1->lb <= r2->lb){ 361 if(r1->lb > s->ub) 362 break; 363 if(r1->ub > s->ub) 364 s->ub = r1->ub; 365 if(!(r1 = r1->next)){ 366 unsigned int ub = 0; 367 for(; r2 && r2->lb <= s->ub; r2 = r2->next) 368 ub = r2->ub; 369 if(ub > s->ub) 370 s->ub = ub; 371 *rP = r2; 372 return r; 373 } 374 } else { 375 if(r2->lb > s->ub) 376 break; 377 if(r2->ub > s->ub) 378 s->ub = r2->ub; 379 if(!(r2 = r2->next)){ 380 unsigned int ub = 0; 381 for(; r1 && r1->lb <= s->ub; r1 = r1->next) 382 ub = r1->ub; 383 if(ub > s->ub) 384 s->ub = ub; 385 *rP = r1; 386 return r; 387 } 388 } 389 } 390 } 391 *rP = NULL; 392 return r; 393 } 394 395 static Range *doDiff(Range *r1, Range *r2){ 396 Range *r, *s, **rP = &r; 397 for(; r1; r1 = r1->next){ 398 unsigned int lb = r1->lb; 399 for(; r2 && r2->ub <= r1->lb; r2 = r2->next); 400 for(; r2 && r2->lb < r1->ub; r2 = r2->next){ 401 if(lb < r2->lb){ 402 *rP = s = Range_new(lb, r2->lb); 403 rP = &s->next; 404 } 405 if((lb = r2->ub) >= r1->ub) 406 goto noMore; 407 } 408 *rP = s = Range_new(lb, r1->ub); 409 rP = &s->next; 410 noMore:; 411 } 412 *rP = NULL; 413 return r; 414 } 415 416 static RegExp *merge(RegExp *m1, RegExp *m2){ 417 if(!m1) 418 return m2; 419 if(!m2) 420 return m1; 421 return RegExp_new_MatchOp(doUnion(m1->d.match, m2->d.match)); 422 } 423 424 RegExp *mkDiff(RegExp *e1, RegExp *e2){ 425 RegExp *m1, *m2; 426 Range *r; 427 if(!(m1 = RegExp_isA(e1, MATCHOP))) 428 return NULL; 429 if(!(m2 = RegExp_isA(e2, MATCHOP))) 430 return NULL; 431 r = doDiff(m1->d.match, m2->d.match); 432 return r? RegExp_new_MatchOp(r) : RegExp_new_NullOp(); 433 } 434 435 static RegExp *doAlt(RegExp *e1, RegExp *e2){ 436 if(!e1) 437 return e2; 438 if(!e2) 439 return e1; 440 return RegExp_new_AltOp(e1, e2); 441 } 442 443 RegExp *mkAlt(RegExp *e1, RegExp *e2){ 444 RegExp *a; 445 RegExp *m1, *m2; 446 if((a = RegExp_isA(e1, ALTOP))){ 447 if((m1 = RegExp_isA(a->d.AltCatOp.exp1, MATCHOP))) 448 e1 = a->d.AltCatOp.exp2; 449 } else if((m1 = RegExp_isA(e1, MATCHOP))){ 450 e1 = NULL; 451 } 452 if((a = RegExp_isA(e2, ALTOP))){ 453 if((m2 = RegExp_isA(a->d.AltCatOp.exp1, MATCHOP))) 454 e2 = a->d.AltCatOp.exp2; 455 } else if((m2 = RegExp_isA(e2, MATCHOP))){ 456 e2 = NULL; 457 } 458 return doAlt(merge(m1, m2), doAlt(e1, e2)); 459 } 460 461 static unsigned char unescape(SubStr *s){ 462 unsigned char c; 463 unsigned char v; 464 s->len--; 465 if((c = *s->str++) != '\\' || s->len == 0) 466 return xlat[c]; 467 s->len--; 468 switch(c = *s->str++){ 469 case 'n': 470 return xlat['\n']; 471 case 't': 472 return xlat['\t']; 473 case 'v': 474 return xlat['\v']; 475 case 'b': 476 return xlat['\b']; 477 case 'r': 478 return xlat['\r']; 479 case 'f': 480 return xlat['\f']; 481 case 'a': 482 return xlat['\a']; 483 case '0': case '1': case '2': case '3': 484 case '4': case '5': case '6': case '7': { 485 v = c - '0'; 486 for(; s->len != 0 && '0' <= (c = *s->str) && c <= '7'; s->len--, s->str++) 487 v = v*8 + (c - '0'); 488 return v; 489 } default: 490 return xlat[c]; 491 } 492 } 493 494 static Range *getRange(SubStr *s){ 495 unsigned char lb = unescape(s), ub; 496 if(s->len < 2 || *s->str != '-'){ 497 ub = lb; 498 } else { 499 s->len--; s->str++; 500 ub = unescape(s); 501 if(ub < lb){ 502 unsigned char tmp; 503 tmp = lb; lb = ub; ub = tmp; 504 } 505 } 506 return Range_new(lb, ub+1); 507 } 508 509 static RegExp *matchChar(unsigned int c){ 510 return RegExp_new_MatchOp(Range_new(c, c+1)); 511 } 512 513 RegExp *strToRE(SubStr s){ 514 RegExp *re; 515 s.len -= 2; s.str += 1; 516 if(s.len == 0) 517 return RegExp_new_NullOp(); 518 re = matchChar(unescape(&s)); 519 while(s.len > 0) 520 re = RegExp_new_CatOp(re, matchChar(unescape(&s))); 521 return re; 522 } 523 524 RegExp *strToCaseInsensitiveRE(SubStr s){ 525 unsigned char c; 526 RegExp *re, *reL, *reU; 527 s.len -= 2; s.str += 1; 528 if(s.len == 0) 529 return RegExp_new_NullOp(); 530 c = unescape(&s); 531 if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) { 532 reL = matchChar(tolower(c)); 533 reU = matchChar(toupper(c)); 534 re = mkAlt(reL, reU); 535 } else { 536 re = matchChar(c); 537 } 538 while(s.len > 0) { 539 c = unescape(&s); 540 if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) { 541 reL = matchChar(tolower(c)); 542 reU = matchChar(toupper(c)); 543 re = RegExp_new_CatOp(re, mkAlt(reL, reU)); 544 } else { 545 re = RegExp_new_CatOp(re, matchChar(c)); 546 } 547 } 548 return re; 549 } 550 551 RegExp *ranToRE(SubStr s){ 552 Range *r; 553 s.len -= 2; s.str += 1; 554 if(s.len == 0) 555 return RegExp_new_NullOp(); 556 r = getRange(&s); 557 while(s.len > 0) 558 r = doUnion(r, getRange(&s)); 559 return RegExp_new_MatchOp(r); 560 } 561 562 RegExp *invToRE(SubStr s) 563 { 564 RegExp *any, *ran, *inv; 565 SubStr *ss; 566 567 568 s.len--; 569 s.str++; 570 571 ss = SubStr_new("[\\000-\\377]", strlen("[\\000-\\377]")); 572 any = ranToRE(*ss); 573 free(ss); 574 if (s.len <= 2) 575 return any; 576 577 ran = ranToRE(s); 578 inv = mkDiff(any, ran); 579 580 free(ran); 581 free(any); 582 583 return inv; 584 } 585 586 RegExp *mkDot() 587 { 588 SubStr *ss = SubStr_new("[\\000-\\377]", strlen("[\\000-\\377]")); 589 RegExp * any = ranToRE(*ss); 590 RegExp * ran = matchChar('\n'); 591 RegExp * inv = mkDiff(any, ran); 592 593 free(ss); 594 free(ran); 595 free(any); 596 597 return inv; 598 } 599 600 RegExp * 601 RegExp_new_RuleOp(RegExp *e, RegExp *c, Token *t, unsigned int a) 602 { 603 RegExp *r = malloc(sizeof(RegExp)); 604 r->type = RULEOP; 605 r->d.RuleOp.exp = e; 606 r->d.RuleOp.ctx = c; 607 r->d.RuleOp.ins = NULL; 608 r->d.RuleOp.accept = a; 609 r->d.RuleOp.code = t; 610 return r; 611 } 612 613 static void optimize(Ins *i){ 614 while(!isMarked(i)){ 615 mark(i); 616 if(i->i.tag == CHAR){ 617 i = (Ins*) i->i.link; 618 } else if(i->i.tag == GOTO || i->i.tag == FORK){ 619 Ins *target = (Ins*) i->i.link; 620 optimize(target); 621 if(target->i.tag == GOTO) 622 i->i.link = target->i.link == target? i : target; 623 if(i->i.tag == FORK){ 624 Ins *follow = (Ins*) &i[1]; 625 optimize(follow); 626 if(follow->i.tag == GOTO && follow->i.link == follow){ 627 i->i.tag = GOTO; 628 } else if(i->i.link == i){ 629 i->i.tag = GOTO; 630 i->i.link = follow; 631 } 632 } 633 return; 634 } else { 635 ++i; 636 } 637 } 638 } 639 640 void genCode(FILE *o, RegExp *re){ 641 CharSet cs; 642 unsigned int j; 643 Char rep[nChars]; 644 Ins *ins, *eoi; 645 DFA *dfa; 646 647 memset(&cs, 0, sizeof(cs)); 648 for(j = 0; j < nChars; ++j){ 649 cs.rep[j] = &cs.ptn[0]; 650 cs.ptn[j].nxt = &cs.ptn[j+1]; 651 } 652 cs.freeHead = &cs.ptn[1]; 653 *(cs.freeTail = &cs.ptn[nChars-1].nxt) = NULL; 654 cs.ptn[0].card = nChars; 655 cs.ptn[0].nxt = NULL; 656 RegExp_split(re, &cs); 657 /* 658 for(unsigned int k = 0; k < nChars;){ 659 for(j = k; ++k < nChars && cs.rep[k] == cs.rep[j];); 660 printSpan(cerr, j, k); 661 cerr << "\t" << cs.rep[j] - &cs.ptn[0] << endl; 662 } 663 */ 664 for(j = 0; j < nChars; ++j){ 665 if(!cs.rep[j]->nxt) 666 cs.rep[j]->nxt = &cs.ptn[j]; 667 rep[j] = (Char) (cs.rep[j]->nxt - &cs.ptn[0]); 668 } 669 670 RegExp_calcSize(re, rep); 671 ins = malloc(sizeof(Ins)*(re->size+1)); 672 memset(ins, 0, (re->size+1)*sizeof(Ins)); 673 RegExp_compile(re, rep, ins); 674 eoi = &ins[re->size]; 675 eoi->i.tag = GOTO; 676 eoi->i.link = eoi; 677 678 optimize(ins); 679 for(j = 0; j < re->size;){ 680 unmark(&ins[j]); 681 if(ins[j].i.tag == CHAR){ 682 j = (Ins*) ins[j].i.link - ins; 683 } else { 684 j++; 685 } 686 } 687 688 dfa = DFA_new(ins, re->size, 0, 256, rep); 689 DFA_emit(dfa, o); 690 DFA_delete(dfa); 691 free(ins); 692 } 693