1 /*- 2 * This code is derived from OpenBSD's libc/regex, original license follows: 3 * 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)engine.c 8.5 (Berkeley) 3/20/94 36 */ 37 38 /* 39 * The matching engine and friends. This file is #included by regexec.c 40 * after suitable #defines of a variety of macros used herein, so that 41 * different state representations can be used without duplicating masses 42 * of code. 43 */ 44 45 #ifdef SNAMES 46 #define matcher smatcher 47 #define fast sfast 48 #define slow sslow 49 #define dissect sdissect 50 #define backref sbackref 51 #define step sstep 52 #define print sprint 53 #define at sat 54 #define match smat 55 #define nope snope 56 #endif 57 #ifdef LNAMES 58 #define matcher lmatcher 59 #define fast lfast 60 #define slow lslow 61 #define dissect ldissect 62 #define backref lbackref 63 #define step lstep 64 #define print lprint 65 #define at lat 66 #define match lmat 67 #define nope lnope 68 #endif 69 70 /* another structure passed up and down to avoid zillions of parameters */ 71 struct match { 72 struct re_guts *g; 73 int eflags; 74 llvm_regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 75 const char *offp; /* offsets work from here */ 76 const char *beginp; /* start of string -- virtual NUL precedes */ 77 const char *endp; /* end of string -- virtual NUL here */ 78 const char *coldp; /* can be no match starting before here */ 79 const char **lastpos; /* [nplus+1] */ 80 STATEVARS; 81 states st; /* current states */ 82 states fresh; /* states for a fresh start */ 83 states tmp; /* temporary */ 84 states empty; /* empty set of states */ 85 }; 86 87 static int matcher(struct re_guts *, const char *, size_t, 88 llvm_regmatch_t[], int); 89 static const char *dissect(struct match *, const char *, const char *, sopno, 90 sopno); 91 static const char *backref(struct match *, const char *, const char *, sopno, 92 sopno, sopno, int); 93 static const char *fast(struct match *, const char *, const char *, sopno, sopno); 94 static const char *slow(struct match *, const char *, const char *, sopno, sopno); 95 static states step(struct re_guts *, sopno, sopno, states, int, states); 96 #define MAX_RECURSION 100 97 #define BOL (OUT+1) 98 #define EOL (BOL+1) 99 #define BOLEOL (BOL+2) 100 #define NOTHING (BOL+3) 101 #define BOW (BOL+4) 102 #define EOW (BOL+5) 103 #define CODEMAX (BOL+5) /* highest code used */ 104 #define NONCHAR(c) ((c) > CHAR_MAX) 105 #define NNONCHAR (CODEMAX-CHAR_MAX) 106 #ifdef REDEBUG 107 static void print(struct match *, char *, states, int, FILE *); 108 #endif 109 #ifdef REDEBUG 110 static void at(struct match *, char *, char *, char *, sopno, sopno); 111 #endif 112 #ifdef REDEBUG 113 static char *pchar(int); 114 #endif 115 116 #ifdef REDEBUG 117 #define SP(t, s, c) print(m, t, s, c, stdout) 118 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 119 #define NOTE(str) { if (m->eflags®_TRACE) (void)printf("=%s\n", (str)); } 120 static int nope = 0; 121 #else 122 #define SP(t, s, c) /* nothing */ 123 #define AT(t, p1, p2, s1, s2) /* nothing */ 124 #define NOTE(s) /* nothing */ 125 #endif 126 127 /* 128 - matcher - the actual matching engine 129 */ 130 static int /* 0 success, REG_NOMATCH failure */ 131 matcher(struct re_guts *g, const char *string, size_t nmatch, 132 llvm_regmatch_t pmatch[], 133 int eflags) 134 { 135 const char *endp; 136 size_t i; 137 struct match mv; 138 struct match *m = &mv; 139 const char *dp; 140 const sopno gf = g->firststate+1; /* +1 for OEND */ 141 const sopno gl = g->laststate; 142 const char *start; 143 const char *stop; 144 145 /* simplify the situation where possible */ 146 if (g->cflags®_NOSUB) 147 nmatch = 0; 148 if (eflags®_STARTEND) { 149 start = string + pmatch[0].rm_so; 150 stop = string + pmatch[0].rm_eo; 151 } else { 152 start = string; 153 stop = start + strlen(start); 154 } 155 if (stop < start) 156 return(REG_INVARG); 157 158 /* prescreening; this does wonders for this rather slow code */ 159 if (g->must != NULL) { 160 for (dp = start; dp < stop; dp++) 161 if (*dp == g->must[0] && stop - dp >= g->mlen && 162 memcmp(dp, g->must, (size_t)g->mlen) == 0) 163 break; 164 if (dp == stop) /* we didn't find g->must */ 165 return(REG_NOMATCH); 166 } 167 168 /* match struct setup */ 169 m->g = g; 170 m->eflags = eflags; 171 m->pmatch = NULL; 172 m->lastpos = NULL; 173 m->offp = string; 174 m->beginp = start; 175 m->endp = stop; 176 STATESETUP(m, 4); 177 SETUP(m->st); 178 SETUP(m->fresh); 179 SETUP(m->tmp); 180 SETUP(m->empty); 181 CLEAR(m->empty); 182 183 /* this loop does only one repetition except for backrefs */ 184 for (;;) { 185 endp = fast(m, start, stop, gf, gl); 186 if (endp == NULL) { /* a miss */ 187 free(m->pmatch); 188 free((void*)m->lastpos); 189 STATETEARDOWN(m); 190 return(REG_NOMATCH); 191 } 192 if (nmatch == 0 && !g->backrefs) 193 break; /* no further info needed */ 194 195 /* where? */ 196 assert(m->coldp != NULL); 197 for (;;) { 198 NOTE("finding start"); 199 endp = slow(m, m->coldp, stop, gf, gl); 200 if (endp != NULL) 201 break; 202 assert(m->coldp < m->endp); 203 m->coldp++; 204 } 205 if (nmatch == 1 && !g->backrefs) 206 break; /* no further info needed */ 207 208 /* oh my, they want the subexpressions... */ 209 if (m->pmatch == NULL) 210 m->pmatch = (llvm_regmatch_t *)malloc((m->g->nsub + 1) * 211 sizeof(llvm_regmatch_t)); 212 if (m->pmatch == NULL) { 213 STATETEARDOWN(m); 214 return(REG_ESPACE); 215 } 216 for (i = 1; i <= m->g->nsub; i++) 217 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 218 if (!g->backrefs && !(m->eflags®_BACKR)) { 219 NOTE("dissecting"); 220 dp = dissect(m, m->coldp, endp, gf, gl); 221 } else { 222 if (g->nplus > 0 && m->lastpos == NULL) 223 m->lastpos = (const char **)malloc((g->nplus+1) * 224 sizeof(char *)); 225 if (g->nplus > 0 && m->lastpos == NULL) { 226 free(m->pmatch); 227 STATETEARDOWN(m); 228 return(REG_ESPACE); 229 } 230 NOTE("backref dissect"); 231 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 232 } 233 if (dp != NULL) 234 break; 235 236 /* uh-oh... we couldn't find a subexpression-level match */ 237 assert(g->backrefs); /* must be back references doing it */ 238 assert(g->nplus == 0 || m->lastpos != NULL); 239 for (;;) { 240 if (dp != NULL || endp <= m->coldp) 241 break; /* defeat */ 242 NOTE("backoff"); 243 endp = slow(m, m->coldp, endp-1, gf, gl); 244 if (endp == NULL) 245 break; /* defeat */ 246 /* try it on a shorter possibility */ 247 #ifndef NDEBUG 248 for (i = 1; i <= m->g->nsub; i++) { 249 assert(m->pmatch[i].rm_so == -1); 250 assert(m->pmatch[i].rm_eo == -1); 251 } 252 #endif 253 NOTE("backoff dissect"); 254 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 255 } 256 assert(dp == NULL || dp == endp); 257 if (dp != NULL) /* found a shorter one */ 258 break; 259 260 /* despite initial appearances, there is no match here */ 261 NOTE("false alarm"); 262 if (m->coldp == stop) 263 break; 264 start = m->coldp + 1; /* recycle starting later */ 265 } 266 267 /* fill in the details if requested */ 268 if (nmatch > 0) { 269 pmatch[0].rm_so = m->coldp - m->offp; 270 pmatch[0].rm_eo = endp - m->offp; 271 } 272 if (nmatch > 1) { 273 assert(m->pmatch != NULL); 274 for (i = 1; i < nmatch; i++) 275 if (i <= m->g->nsub) 276 pmatch[i] = m->pmatch[i]; 277 else { 278 pmatch[i].rm_so = -1; 279 pmatch[i].rm_eo = -1; 280 } 281 } 282 283 if (m->pmatch != NULL) 284 free((char *)m->pmatch); 285 if (m->lastpos != NULL) 286 free((char *)m->lastpos); 287 STATETEARDOWN(m); 288 return(0); 289 } 290 291 /* 292 - dissect - figure out what matched what, no back references 293 */ 294 static const char * /* == stop (success) always */ 295 dissect(struct match *m, const char *start, const char *stop, sopno startst, 296 sopno stopst) 297 { 298 int i; 299 sopno ss; /* start sop of current subRE */ 300 sopno es; /* end sop of current subRE */ 301 const char *sp; /* start of string matched by it */ 302 const char *stp; /* string matched by it cannot pass here */ 303 const char *rest; /* start of rest of string */ 304 const char *tail; /* string unmatched by rest of RE */ 305 sopno ssub; /* start sop of subsubRE */ 306 sopno esub; /* end sop of subsubRE */ 307 const char *ssp; /* start of string matched by subsubRE */ 308 const char *sep; /* end of string matched by subsubRE */ 309 const char *oldssp; /* previous ssp */ 310 311 AT("diss", start, stop, startst, stopst); 312 sp = start; 313 for (ss = startst; ss < stopst; ss = es) { 314 /* identify end of subRE */ 315 es = ss; 316 switch (OP(m->g->strip[es])) { 317 case OPLUS_: 318 case OQUEST_: 319 es += OPND(m->g->strip[es]); 320 break; 321 case OCH_: 322 while (OP(m->g->strip[es]) != O_CH) 323 es += OPND(m->g->strip[es]); 324 break; 325 } 326 es++; 327 328 /* figure out what it matched */ 329 switch (OP(m->g->strip[ss])) { 330 case OEND: 331 assert(nope); 332 break; 333 case OCHAR: 334 sp++; 335 break; 336 case OBOL: 337 case OEOL: 338 case OBOW: 339 case OEOW: 340 break; 341 case OANY: 342 case OANYOF: 343 sp++; 344 break; 345 case OBACK_: 346 case O_BACK: 347 assert(nope); 348 break; 349 /* cases where length of match is hard to find */ 350 case OQUEST_: 351 stp = stop; 352 for (;;) { 353 /* how long could this one be? */ 354 rest = slow(m, sp, stp, ss, es); 355 assert(rest != NULL); /* it did match */ 356 /* could the rest match the rest? */ 357 tail = slow(m, rest, stop, es, stopst); 358 if (tail == stop) 359 break; /* yes! */ 360 /* no -- try a shorter match for this one */ 361 stp = rest - 1; 362 assert(stp >= sp); /* it did work */ 363 } 364 ssub = ss + 1; 365 esub = es - 1; 366 /* did innards match? */ 367 if (slow(m, sp, rest, ssub, esub) != NULL) { 368 const char *dp = dissect(m, sp, rest, ssub, esub); 369 (void)dp; /* avoid warning if assertions off */ 370 assert(dp == rest); 371 } else /* no */ 372 assert(sp == rest); 373 sp = rest; 374 break; 375 case OPLUS_: 376 stp = stop; 377 for (;;) { 378 /* how long could this one be? */ 379 rest = slow(m, sp, stp, ss, es); 380 assert(rest != NULL); /* it did match */ 381 /* could the rest match the rest? */ 382 tail = slow(m, rest, stop, es, stopst); 383 if (tail == stop) 384 break; /* yes! */ 385 /* no -- try a shorter match for this one */ 386 stp = rest - 1; 387 assert(stp >= sp); /* it did work */ 388 } 389 ssub = ss + 1; 390 esub = es - 1; 391 ssp = sp; 392 oldssp = ssp; 393 for (;;) { /* find last match of innards */ 394 sep = slow(m, ssp, rest, ssub, esub); 395 if (sep == NULL || sep == ssp) 396 break; /* failed or matched null */ 397 oldssp = ssp; /* on to next try */ 398 ssp = sep; 399 } 400 if (sep == NULL) { 401 /* last successful match */ 402 sep = ssp; 403 ssp = oldssp; 404 } 405 assert(sep == rest); /* must exhaust substring */ 406 assert(slow(m, ssp, sep, ssub, esub) == rest); 407 { 408 const char *dp = dissect(m, ssp, sep, ssub, esub); 409 (void)dp; /* avoid warning if assertions off */ 410 assert(dp == sep); 411 } 412 sp = rest; 413 break; 414 case OCH_: 415 stp = stop; 416 for (;;) { 417 /* how long could this one be? */ 418 rest = slow(m, sp, stp, ss, es); 419 assert(rest != NULL); /* it did match */ 420 /* could the rest match the rest? */ 421 tail = slow(m, rest, stop, es, stopst); 422 if (tail == stop) 423 break; /* yes! */ 424 /* no -- try a shorter match for this one */ 425 stp = rest - 1; 426 assert(stp >= sp); /* it did work */ 427 } 428 ssub = ss + 1; 429 esub = ss + OPND(m->g->strip[ss]) - 1; 430 assert(OP(m->g->strip[esub]) == OOR1); 431 for (;;) { /* find first matching branch */ 432 if (slow(m, sp, rest, ssub, esub) == rest) 433 break; /* it matched all of it */ 434 /* that one missed, try next one */ 435 assert(OP(m->g->strip[esub]) == OOR1); 436 esub++; 437 assert(OP(m->g->strip[esub]) == OOR2); 438 ssub = esub + 1; 439 esub += OPND(m->g->strip[esub]); 440 if (OP(m->g->strip[esub]) == OOR2) 441 esub--; 442 else 443 assert(OP(m->g->strip[esub]) == O_CH); 444 } 445 { 446 const char *dp = dissect(m, sp, rest, ssub, esub); 447 (void)dp; /* avoid warning if assertions off */ 448 assert(dp == rest); 449 } 450 sp = rest; 451 break; 452 case O_PLUS: 453 case O_QUEST: 454 case OOR1: 455 case OOR2: 456 case O_CH: 457 assert(nope); 458 break; 459 case OLPAREN: 460 i = OPND(m->g->strip[ss]); 461 assert(0 < i && i <= m->g->nsub); 462 m->pmatch[i].rm_so = sp - m->offp; 463 break; 464 case ORPAREN: 465 i = OPND(m->g->strip[ss]); 466 assert(0 < i && i <= m->g->nsub); 467 m->pmatch[i].rm_eo = sp - m->offp; 468 break; 469 default: /* uh oh */ 470 assert(nope); 471 break; 472 } 473 } 474 475 assert(sp == stop); 476 return(sp); 477 } 478 479 /* 480 - backref - figure out what matched what, figuring in back references 481 */ 482 static const char * /* == stop (success) or NULL (failure) */ 483 backref(struct match *m, const char *start, const char *stop, sopno startst, 484 sopno stopst, sopno lev, int rec) /* PLUS nesting level */ 485 { 486 int i; 487 sopno ss; /* start sop of current subRE */ 488 const char *sp; /* start of string matched by it */ 489 sopno ssub; /* start sop of subsubRE */ 490 sopno esub; /* end sop of subsubRE */ 491 const char *ssp; /* start of string matched by subsubRE */ 492 const char *dp; 493 size_t len; 494 int hard; 495 sop s; 496 llvm_regoff_t offsave; 497 cset *cs; 498 499 AT("back", start, stop, startst, stopst); 500 sp = start; 501 502 /* get as far as we can with easy stuff */ 503 hard = 0; 504 for (ss = startst; !hard && ss < stopst; ss++) 505 switch (OP(s = m->g->strip[ss])) { 506 case OCHAR: 507 if (sp == stop || *sp++ != (char)OPND(s)) 508 return(NULL); 509 break; 510 case OANY: 511 if (sp == stop) 512 return(NULL); 513 sp++; 514 break; 515 case OANYOF: 516 cs = &m->g->sets[OPND(s)]; 517 if (sp == stop || !CHIN(cs, *sp++)) 518 return(NULL); 519 break; 520 case OBOL: 521 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 522 (sp < m->endp && *(sp-1) == '\n' && 523 (m->g->cflags®_NEWLINE)) ) 524 { /* yes */ } 525 else 526 return(NULL); 527 break; 528 case OEOL: 529 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 530 (sp < m->endp && *sp == '\n' && 531 (m->g->cflags®_NEWLINE)) ) 532 { /* yes */ } 533 else 534 return(NULL); 535 break; 536 case OBOW: 537 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 538 (sp < m->endp && *(sp-1) == '\n' && 539 (m->g->cflags®_NEWLINE)) || 540 (sp > m->beginp && 541 !ISWORD(*(sp-1))) ) && 542 (sp < m->endp && ISWORD(*sp)) ) 543 { /* yes */ } 544 else 545 return(NULL); 546 break; 547 case OEOW: 548 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 549 (sp < m->endp && *sp == '\n' && 550 (m->g->cflags®_NEWLINE)) || 551 (sp < m->endp && !ISWORD(*sp)) ) && 552 (sp > m->beginp && ISWORD(*(sp-1))) ) 553 { /* yes */ } 554 else 555 return(NULL); 556 break; 557 case O_QUEST: 558 break; 559 case OOR1: /* matches null but needs to skip */ 560 ss++; 561 s = m->g->strip[ss]; 562 do { 563 assert(OP(s) == OOR2); 564 ss += OPND(s); 565 } while (OP(s = m->g->strip[ss]) != O_CH); 566 /* note that the ss++ gets us past the O_CH */ 567 break; 568 default: /* have to make a choice */ 569 hard = 1; 570 break; 571 } 572 if (!hard) { /* that was it! */ 573 if (sp != stop) 574 return(NULL); 575 return(sp); 576 } 577 ss--; /* adjust for the for's final increment */ 578 579 /* the hard stuff */ 580 AT("hard", sp, stop, ss, stopst); 581 s = m->g->strip[ss]; 582 switch (OP(s)) { 583 case OBACK_: /* the vilest depths */ 584 i = OPND(s); 585 assert(0 < i && i <= m->g->nsub); 586 if (m->pmatch[i].rm_eo == -1) 587 return(NULL); 588 assert(m->pmatch[i].rm_so != -1); 589 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 590 if (len == 0 && rec++ > MAX_RECURSION) 591 return(NULL); 592 assert(stop - m->beginp >= len); 593 if (sp > stop - len) 594 return(NULL); /* not enough left to match */ 595 ssp = m->offp + m->pmatch[i].rm_so; 596 if (memcmp(sp, ssp, len) != 0) 597 return(NULL); 598 while (m->g->strip[ss] != SOP(O_BACK, i)) 599 ss++; 600 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 601 break; 602 case OQUEST_: /* to null or not */ 603 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 604 if (dp != NULL) 605 return(dp); /* not */ 606 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 607 break; 608 case OPLUS_: 609 assert(m->lastpos != NULL); 610 assert(lev+1 <= m->g->nplus); 611 m->lastpos[lev+1] = sp; 612 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 613 break; 614 case O_PLUS: 615 if (sp == m->lastpos[lev]) /* last pass matched null */ 616 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 617 /* try another pass */ 618 m->lastpos[lev] = sp; 619 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 620 if (dp == NULL) 621 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 622 else 623 return(dp); 624 break; 625 case OCH_: /* find the right one, if any */ 626 ssub = ss + 1; 627 esub = ss + OPND(s) - 1; 628 assert(OP(m->g->strip[esub]) == OOR1); 629 for (;;) { /* find first matching branch */ 630 dp = backref(m, sp, stop, ssub, esub, lev, rec); 631 if (dp != NULL) 632 return(dp); 633 /* that one missed, try next one */ 634 if (OP(m->g->strip[esub]) == O_CH) 635 return(NULL); /* there is none */ 636 esub++; 637 assert(OP(m->g->strip[esub]) == OOR2); 638 ssub = esub + 1; 639 esub += OPND(m->g->strip[esub]); 640 if (OP(m->g->strip[esub]) == OOR2) 641 esub--; 642 else 643 assert(OP(m->g->strip[esub]) == O_CH); 644 } 645 break; 646 case OLPAREN: /* must undo assignment if rest fails */ 647 i = OPND(s); 648 assert(0 < i && i <= m->g->nsub); 649 offsave = m->pmatch[i].rm_so; 650 m->pmatch[i].rm_so = sp - m->offp; 651 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 652 if (dp != NULL) 653 return(dp); 654 m->pmatch[i].rm_so = offsave; 655 return(NULL); 656 break; 657 case ORPAREN: /* must undo assignment if rest fails */ 658 i = OPND(s); 659 assert(0 < i && i <= m->g->nsub); 660 offsave = m->pmatch[i].rm_eo; 661 m->pmatch[i].rm_eo = sp - m->offp; 662 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 663 if (dp != NULL) 664 return(dp); 665 m->pmatch[i].rm_eo = offsave; 666 return(NULL); 667 break; 668 default: /* uh oh */ 669 assert(nope); 670 break; 671 } 672 673 /* "can't happen" */ 674 assert(nope); 675 /* NOTREACHED */ 676 return NULL; 677 } 678 679 /* 680 - fast - step through the string at top speed 681 */ 682 static const char * /* where tentative match ended, or NULL */ 683 fast(struct match *m, const char *start, const char *stop, sopno startst, 684 sopno stopst) 685 { 686 states st = m->st; 687 states fresh = m->fresh; 688 states tmp = m->tmp; 689 const char *p = start; 690 int c = (start == m->beginp) ? OUT : *(start-1); 691 int lastc; /* previous c */ 692 int flagch; 693 int i; 694 const char *coldp; /* last p after which no match was underway */ 695 696 CLEAR(st); 697 SET1(st, startst); 698 st = step(m->g, startst, stopst, st, NOTHING, st); 699 ASSIGN(fresh, st); 700 SP("start", st, *p); 701 coldp = NULL; 702 for (;;) { 703 /* next character */ 704 lastc = c; 705 c = (p == m->endp) ? OUT : *p; 706 if (EQ(st, fresh)) 707 coldp = p; 708 709 /* is there an EOL and/or BOL between lastc and c? */ 710 flagch = '\0'; 711 i = 0; 712 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 713 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 714 flagch = BOL; 715 i = m->g->nbol; 716 } 717 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 718 (c == OUT && !(m->eflags®_NOTEOL)) ) { 719 flagch = (flagch == BOL) ? BOLEOL : EOL; 720 i += m->g->neol; 721 } 722 if (i != 0) { 723 for (; i > 0; i--) 724 st = step(m->g, startst, stopst, st, flagch, st); 725 SP("boleol", st, c); 726 } 727 728 /* how about a word boundary? */ 729 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 730 (c != OUT && ISWORD(c)) ) { 731 flagch = BOW; 732 } 733 if ( (lastc != OUT && ISWORD(lastc)) && 734 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 735 flagch = EOW; 736 } 737 if (flagch == BOW || flagch == EOW) { 738 st = step(m->g, startst, stopst, st, flagch, st); 739 SP("boweow", st, c); 740 } 741 742 /* are we done? */ 743 if (ISSET(st, stopst) || p == stop) 744 break; /* NOTE BREAK OUT */ 745 746 /* no, we must deal with this character */ 747 ASSIGN(tmp, st); 748 ASSIGN(st, fresh); 749 assert(c != OUT); 750 st = step(m->g, startst, stopst, tmp, c, st); 751 SP("aft", st, c); 752 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 753 p++; 754 } 755 756 assert(coldp != NULL); 757 m->coldp = coldp; 758 if (ISSET(st, stopst)) 759 return(p+1); 760 else 761 return(NULL); 762 } 763 764 /* 765 - slow - step through the string more deliberately 766 */ 767 static const char * /* where it ended */ 768 slow(struct match *m, const char *start, const char *stop, sopno startst, 769 sopno stopst) 770 { 771 states st = m->st; 772 states empty = m->empty; 773 states tmp = m->tmp; 774 const char *p = start; 775 int c = (start == m->beginp) ? OUT : *(start-1); 776 int lastc; /* previous c */ 777 int flagch; 778 int i; 779 const char *matchp; /* last p at which a match ended */ 780 781 AT("slow", start, stop, startst, stopst); 782 CLEAR(st); 783 SET1(st, startst); 784 SP("sstart", st, *p); 785 st = step(m->g, startst, stopst, st, NOTHING, st); 786 matchp = NULL; 787 for (;;) { 788 /* next character */ 789 lastc = c; 790 c = (p == m->endp) ? OUT : *p; 791 792 /* is there an EOL and/or BOL between lastc and c? */ 793 flagch = '\0'; 794 i = 0; 795 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 796 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 797 flagch = BOL; 798 i = m->g->nbol; 799 } 800 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 801 (c == OUT && !(m->eflags®_NOTEOL)) ) { 802 flagch = (flagch == BOL) ? BOLEOL : EOL; 803 i += m->g->neol; 804 } 805 if (i != 0) { 806 for (; i > 0; i--) 807 st = step(m->g, startst, stopst, st, flagch, st); 808 SP("sboleol", st, c); 809 } 810 811 /* how about a word boundary? */ 812 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 813 (c != OUT && ISWORD(c)) ) { 814 flagch = BOW; 815 } 816 if ( (lastc != OUT && ISWORD(lastc)) && 817 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 818 flagch = EOW; 819 } 820 if (flagch == BOW || flagch == EOW) { 821 st = step(m->g, startst, stopst, st, flagch, st); 822 SP("sboweow", st, c); 823 } 824 825 /* are we done? */ 826 if (ISSET(st, stopst)) 827 matchp = p; 828 if (EQ(st, empty) || p == stop) 829 break; /* NOTE BREAK OUT */ 830 831 /* no, we must deal with this character */ 832 ASSIGN(tmp, st); 833 ASSIGN(st, empty); 834 assert(c != OUT); 835 st = step(m->g, startst, stopst, tmp, c, st); 836 SP("saft", st, c); 837 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 838 p++; 839 } 840 841 return(matchp); 842 } 843 844 845 /* 846 - step - map set of states reachable before char to set reachable after 847 */ 848 static states 849 step(struct re_guts *g, 850 sopno start, /* start state within strip */ 851 sopno stop, /* state after stop state within strip */ 852 states bef, /* states reachable before */ 853 int ch, /* character or NONCHAR code */ 854 states aft) /* states already known reachable after */ 855 { 856 cset *cs; 857 sop s; 858 sopno pc; 859 onestate here; /* note, macros know this name */ 860 sopno look; 861 int i; 862 863 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 864 s = g->strip[pc]; 865 switch (OP(s)) { 866 case OEND: 867 assert(pc == stop-1); 868 break; 869 case OCHAR: 870 /* only characters can match */ 871 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 872 if (ch == (char)OPND(s)) 873 FWD(aft, bef, 1); 874 break; 875 case OBOL: 876 if (ch == BOL || ch == BOLEOL) 877 FWD(aft, bef, 1); 878 break; 879 case OEOL: 880 if (ch == EOL || ch == BOLEOL) 881 FWD(aft, bef, 1); 882 break; 883 case OBOW: 884 if (ch == BOW) 885 FWD(aft, bef, 1); 886 break; 887 case OEOW: 888 if (ch == EOW) 889 FWD(aft, bef, 1); 890 break; 891 case OANY: 892 if (!NONCHAR(ch)) 893 FWD(aft, bef, 1); 894 break; 895 case OANYOF: 896 cs = &g->sets[OPND(s)]; 897 if (!NONCHAR(ch) && CHIN(cs, ch)) 898 FWD(aft, bef, 1); 899 break; 900 case OBACK_: /* ignored here */ 901 case O_BACK: 902 FWD(aft, aft, 1); 903 break; 904 case OPLUS_: /* forward, this is just an empty */ 905 FWD(aft, aft, 1); 906 break; 907 case O_PLUS: /* both forward and back */ 908 FWD(aft, aft, 1); 909 i = ISSETBACK(aft, OPND(s)); 910 BACK(aft, aft, OPND(s)); 911 if (!i && ISSETBACK(aft, OPND(s))) { 912 /* oho, must reconsider loop body */ 913 pc -= OPND(s) + 1; 914 INIT(here, pc); 915 } 916 break; 917 case OQUEST_: /* two branches, both forward */ 918 FWD(aft, aft, 1); 919 FWD(aft, aft, OPND(s)); 920 break; 921 case O_QUEST: /* just an empty */ 922 FWD(aft, aft, 1); 923 break; 924 case OLPAREN: /* not significant here */ 925 case ORPAREN: 926 FWD(aft, aft, 1); 927 break; 928 case OCH_: /* mark the first two branches */ 929 FWD(aft, aft, 1); 930 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 931 FWD(aft, aft, OPND(s)); 932 break; 933 case OOR1: /* done a branch, find the O_CH */ 934 if (ISSTATEIN(aft, here)) { 935 for (look = 1; 936 OP(s = g->strip[pc+look]) != O_CH; 937 look += OPND(s)) 938 assert(OP(s) == OOR2); 939 FWD(aft, aft, look); 940 } 941 break; 942 case OOR2: /* propagate OCH_'s marking */ 943 FWD(aft, aft, 1); 944 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 945 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 946 FWD(aft, aft, OPND(s)); 947 } 948 break; 949 case O_CH: /* just empty */ 950 FWD(aft, aft, 1); 951 break; 952 default: /* ooooops... */ 953 assert(nope); 954 break; 955 } 956 } 957 958 return(aft); 959 } 960 961 #ifdef REDEBUG 962 /* 963 - print - print a set of states 964 */ 965 static void 966 print(struct match *m, char *caption, states st, int ch, FILE *d) 967 { 968 struct re_guts *g = m->g; 969 int i; 970 int first = 1; 971 972 if (!(m->eflags®_TRACE)) 973 return; 974 975 (void)fprintf(d, "%s", caption); 976 if (ch != '\0') 977 (void)fprintf(d, " %s", pchar(ch)); 978 for (i = 0; i < g->nstates; i++) 979 if (ISSET(st, i)) { 980 (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 981 first = 0; 982 } 983 (void)fprintf(d, "\n"); 984 } 985 986 /* 987 - at - print current situation 988 */ 989 static void 990 at(struct match *m, char *title, char *start, char *stop, sopno startst, 991 sopno stopst) 992 { 993 if (!(m->eflags®_TRACE)) 994 return; 995 996 (void)printf("%s %s-", title, pchar(*start)); 997 (void)printf("%s ", pchar(*stop)); 998 (void)printf("%ld-%ld\n", (long)startst, (long)stopst); 999 } 1000 1001 #ifndef PCHARDONE 1002 #define PCHARDONE /* never again */ 1003 /* 1004 - pchar - make a character printable 1005 * 1006 * Is this identical to regchar() over in debug.c? Well, yes. But a 1007 * duplicate here avoids having a debugging-capable regexec.o tied to 1008 * a matching debug.o, and this is convenient. It all disappears in 1009 * the non-debug compilation anyway, so it doesn't matter much. 1010 */ 1011 static char * /* -> representation */ 1012 pchar(int ch) 1013 { 1014 static char pbuf[10]; 1015 1016 if (isprint(ch) || ch == ' ') 1017 (void)snprintf(pbuf, sizeof pbuf, "%c", ch); 1018 else 1019 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); 1020 return(pbuf); 1021 } 1022 #endif 1023 #endif 1024 1025 #undef matcher 1026 #undef fast 1027 #undef slow 1028 #undef dissect 1029 #undef backref 1030 #undef step 1031 #undef print 1032 #undef at 1033 #undef match 1034 #undef nope 1035