1 /* $OpenBSD: engine.c,v 1.15 2005/08/05 13:03:00 espie Exp $ */ 2 3 /*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)engine.c 8.5 (Berkeley) 3/20/94 36 */ 37 38 /* 39 * The matching engine and friends. This file is #included by regexec.c 40 * after suitable #defines of a variety of macros used herein, so that 41 * different state representations can be used without duplicating masses 42 * of code. 43 */ 44 45 #ifdef SNAMES 46 #define matcher smatcher 47 #define fast sfast 48 #define slow sslow 49 #define dissect sdissect 50 #define backref sbackref 51 #define step sstep 52 #define print sprint 53 #define at sat 54 #define match smat 55 #define nope snope 56 #endif 57 #ifdef LNAMES 58 #define matcher lmatcher 59 #define fast lfast 60 #define slow lslow 61 #define dissect ldissect 62 #define backref lbackref 63 #define step lstep 64 #define print lprint 65 #define at lat 66 #define match lmat 67 #define nope lnope 68 #endif 69 70 /* another structure passed up and down to avoid zillions of parameters */ 71 struct match { 72 struct re_guts *g; 73 int eflags; 74 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 75 char *offp; /* offsets work from here */ 76 char *beginp; /* start of string -- virtual NUL precedes */ 77 char *endp; /* end of string -- virtual NUL here */ 78 char *coldp; /* can be no match starting before here */ 79 char **lastpos; /* [nplus+1] */ 80 STATEVARS; 81 states st; /* current states */ 82 states fresh; /* states for a fresh start */ 83 states tmp; /* temporary */ 84 states empty; /* empty set of states */ 85 }; 86 87 static int matcher(struct re_guts *, char *, size_t, regmatch_t[], int); 88 static char *dissect(struct match *, char *, char *, sopno, sopno); 89 static char *backref(struct match *, char *, char *, sopno, sopno, sopno, int); 90 static char *fast(struct match *, char *, char *, sopno, sopno); 91 static char *slow(struct match *, char *, char *, sopno, sopno); 92 static states step(struct re_guts *, sopno, sopno, states, int, states); 93 #define MAX_RECURSION 100 94 #define BOL (OUT+1) 95 #define EOL (BOL+1) 96 #define BOLEOL (BOL+2) 97 #define NOTHING (BOL+3) 98 #define BOW (BOL+4) 99 #define EOW (BOL+5) 100 #define CODEMAX (BOL+5) /* highest code used */ 101 #define NONCHAR(c) ((c) > CHAR_MAX) 102 #define NNONCHAR (CODEMAX-CHAR_MAX) 103 #ifdef REDEBUG 104 static void print(struct match *, char *, states, int, FILE *); 105 #endif 106 #ifdef REDEBUG 107 static void at(struct match *, char *, char *, char *, sopno, sopno); 108 #endif 109 #ifdef REDEBUG 110 static char *pchar(int); 111 #endif 112 113 #ifdef REDEBUG 114 #define SP(t, s, c) print(m, t, s, c, stdout) 115 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 116 #define NOTE(str) { if (m->eflags®_TRACE) (void)printf("=%s\n", (str)); } 117 static int nope = 0; 118 #else 119 #define SP(t, s, c) /* nothing */ 120 #define AT(t, p1, p2, s1, s2) /* nothing */ 121 #define NOTE(s) /* nothing */ 122 #endif 123 124 /* 125 - matcher - the actual matching engine 126 */ 127 static int /* 0 success, REG_NOMATCH failure */ 128 matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], 129 int eflags) 130 { 131 char *endp; 132 int i; 133 struct match mv; 134 struct match *m = &mv; 135 char *dp; 136 const sopno gf = g->firststate+1; /* +1 for OEND */ 137 const sopno gl = g->laststate; 138 char *start; 139 char *stop; 140 141 /* simplify the situation where possible */ 142 if (g->cflags®_NOSUB) 143 nmatch = 0; 144 if (eflags®_STARTEND) { 145 start = string + pmatch[0].rm_so; 146 stop = string + pmatch[0].rm_eo; 147 } else { 148 start = string; 149 stop = start + strlen(start); 150 } 151 if (stop < start) 152 return(REG_INVARG); 153 154 /* prescreening; this does wonders for this rather slow code */ 155 if (g->must != NULL) { 156 for (dp = start; dp < stop; dp++) 157 if (*dp == g->must[0] && stop - dp >= g->mlen && 158 memcmp(dp, g->must, (size_t)g->mlen) == 0) 159 break; 160 if (dp == stop) /* we didn't find g->must */ 161 return(REG_NOMATCH); 162 } 163 164 /* match struct setup */ 165 m->g = g; 166 m->eflags = eflags; 167 m->pmatch = NULL; 168 m->lastpos = NULL; 169 m->offp = string; 170 m->beginp = start; 171 m->endp = (char*)stop; 172 STATESETUP(m, 4); 173 SETUP(m->st); 174 SETUP(m->fresh); 175 SETUP(m->tmp); 176 SETUP(m->empty); 177 CLEAR(m->empty); 178 179 /* this loop does only one repetition except for backrefs */ 180 for (;;) { 181 endp = fast(m, start, stop, gf, gl); 182 if (endp == NULL) { /* a miss */ 183 free(m->pmatch); 184 free(m->lastpos); 185 STATETEARDOWN(m); 186 return(REG_NOMATCH); 187 } 188 if (nmatch == 0 && !g->backrefs) 189 break; /* no further info needed */ 190 191 /* where? */ 192 assert(m->coldp != NULL); 193 for (;;) { 194 NOTE("finding start"); 195 endp = slow(m, m->coldp, stop, gf, gl); 196 if (endp != NULL) 197 break; 198 assert(m->coldp < m->endp); 199 m->coldp++; 200 } 201 if (nmatch == 1 && !g->backrefs) 202 break; /* no further info needed */ 203 204 /* oh my, he wants the subexpressions... */ 205 if (m->pmatch == NULL) 206 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 207 sizeof(regmatch_t)); 208 if (m->pmatch == NULL) { 209 STATETEARDOWN(m); 210 return(REG_ESPACE); 211 } 212 for (i = 1; i <= (int)m->g->nsub; i++) 213 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 214 if (!g->backrefs && !(m->eflags®_BACKR)) { 215 NOTE("dissecting"); 216 dp = dissect(m, m->coldp, endp, gf, gl); 217 } else { 218 if (g->nplus > 0 && m->lastpos == NULL) 219 m->lastpos = (char **)malloc((g->nplus+1) * 220 sizeof(char *)); 221 if (g->nplus > 0 && m->lastpos == NULL) { 222 free(m->pmatch); 223 STATETEARDOWN(m); 224 return(REG_ESPACE); 225 } 226 NOTE("backref dissect"); 227 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 228 } 229 if (dp != NULL) 230 break; 231 232 /* uh-oh... we couldn't find a subexpression-level match */ 233 assert(g->backrefs); /* must be back references doing it */ 234 assert(g->nplus == 0 || m->lastpos != NULL); 235 for (;;) { 236 if (dp != NULL || endp <= m->coldp) 237 break; /* defeat */ 238 NOTE("backoff"); 239 endp = slow(m, m->coldp, endp-1, gf, gl); 240 if (endp == NULL) 241 break; /* defeat */ 242 /* try it on a shorter possibility */ 243 #ifndef NDEBUG 244 for (i = 1; i <= m->g->nsub; i++) { 245 assert(m->pmatch[i].rm_so == -1); 246 assert(m->pmatch[i].rm_eo == -1); 247 } 248 #endif 249 NOTE("backoff dissect"); 250 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 251 } 252 assert(dp == NULL || dp == endp); 253 if (dp != NULL) /* found a shorter one */ 254 break; 255 256 /* despite initial appearances, there is no match here */ 257 NOTE("false alarm"); 258 if (m->coldp == stop) 259 break; 260 start = m->coldp + 1; /* recycle starting later */ 261 } 262 263 /* fill in the details if requested */ 264 if (nmatch > 0) { 265 pmatch[0].rm_so = m->coldp - m->offp; 266 pmatch[0].rm_eo = endp - m->offp; 267 } 268 if (nmatch > 1) { 269 assert(m->pmatch != NULL); 270 for (i = 1; i < (ssize_t)nmatch; i++) 271 if (i <= (int)m->g->nsub) 272 pmatch[i] = m->pmatch[i]; 273 else { 274 pmatch[i].rm_so = -1; 275 pmatch[i].rm_eo = -1; 276 } 277 } 278 279 if (m->pmatch != NULL) 280 free((char *)m->pmatch); 281 if (m->lastpos != NULL) 282 free((char *)m->lastpos); 283 STATETEARDOWN(m); 284 return(0); 285 } 286 287 /* 288 - dissect - figure out what matched what, no back references 289 */ 290 static char * /* == stop (success) always */ 291 dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 292 { 293 int i; 294 sopno ss; /* start sop of current subRE */ 295 sopno es; /* end sop of current subRE */ 296 char *sp; /* start of string matched by it */ 297 char *stp; /* string matched by it cannot pass here */ 298 char *rest; /* start of rest of string */ 299 char *tail; /* string unmatched by rest of RE */ 300 sopno ssub; /* start sop of subsubRE */ 301 sopno esub; /* end sop of subsubRE */ 302 char *ssp; /* start of string matched by subsubRE */ 303 char *sep; /* end of string matched by subsubRE */ 304 char *oldssp; /* previous ssp */ 305 306 AT("diss", start, stop, startst, stopst); 307 sp = start; 308 for (ss = startst; ss < stopst; ss = es) { 309 /* identify end of subRE */ 310 es = ss; 311 switch (OP(m->g->strip[es])) { 312 case OPLUS_: 313 case OQUEST_: 314 es += OPND(m->g->strip[es]); 315 break; 316 case OCH_: 317 while (OP(m->g->strip[es]) != O_CH) 318 es += OPND(m->g->strip[es]); 319 break; 320 } 321 es++; 322 323 /* figure out what it matched */ 324 switch (OP(m->g->strip[ss])) { 325 case OEND: 326 assert(nope); 327 break; 328 case OCHAR: 329 sp++; 330 break; 331 case OBOL: 332 case OEOL: 333 case OBOW: 334 case OEOW: 335 break; 336 case OANY: 337 case OANYOF: 338 sp++; 339 break; 340 case OBACK_: 341 case O_BACK: 342 assert(nope); 343 break; 344 /* cases where length of match is hard to find */ 345 case OQUEST_: 346 stp = stop; 347 for (;;) { 348 /* how long could this one be? */ 349 rest = slow(m, sp, stp, ss, es); 350 assert(rest != NULL); /* it did match */ 351 /* could the rest match the rest? */ 352 tail = slow(m, rest, stop, es, stopst); 353 if (tail == stop) 354 break; /* yes! */ 355 /* no -- try a shorter match for this one */ 356 stp = rest - 1; 357 assert(stp >= sp); /* it did work */ 358 } 359 ssub = ss + 1; 360 esub = es - 1; 361 /* did innards match? */ 362 if (slow(m, sp, rest, ssub, esub) != NULL) { 363 if (dissect(m, sp, rest, ssub, esub) != rest) { 364 assert(0 && "dissect(...) should return rest"); 365 } 366 } else /* no */ 367 assert(sp == rest); 368 sp = rest; 369 break; 370 case OPLUS_: 371 stp = stop; 372 for (;;) { 373 /* how long could this one be? */ 374 rest = slow(m, sp, stp, ss, es); 375 assert(rest != NULL); /* it did match */ 376 /* could the rest match the rest? */ 377 tail = slow(m, rest, stop, es, stopst); 378 if (tail == stop) 379 break; /* yes! */ 380 /* no -- try a shorter match for this one */ 381 stp = rest - 1; 382 assert(stp >= sp); /* it did work */ 383 } 384 ssub = ss + 1; 385 esub = es - 1; 386 ssp = sp; 387 oldssp = ssp; 388 for (;;) { /* find last match of innards */ 389 sep = slow(m, ssp, rest, ssub, esub); 390 if (sep == NULL || sep == ssp) 391 break; /* failed or matched null */ 392 oldssp = ssp; /* on to next try */ 393 ssp = sep; 394 } 395 if (sep == NULL) { 396 /* last successful match */ 397 sep = ssp; 398 ssp = oldssp; 399 } 400 assert(sep == rest); /* must exhaust substring */ 401 assert(slow(m, ssp, sep, ssub, esub) == rest); 402 if (dissect(m, ssp, sep, ssub, esub) != sep) { 403 assert(0 && "dissect(...) should return sep"); 404 } 405 sp = rest; 406 break; 407 case OCH_: 408 stp = stop; 409 for (;;) { 410 /* how long could this one be? */ 411 rest = slow(m, sp, stp, ss, es); 412 assert(rest != NULL); /* it did match */ 413 /* could the rest match the rest? */ 414 tail = slow(m, rest, stop, es, stopst); 415 if (tail == stop) 416 break; /* yes! */ 417 /* no -- try a shorter match for this one */ 418 stp = rest - 1; 419 assert(stp >= sp); /* it did work */ 420 } 421 ssub = ss + 1; 422 esub = ss + OPND(m->g->strip[ss]) - 1; 423 assert(OP(m->g->strip[esub]) == OOR1); 424 for (;;) { /* find first matching branch */ 425 if (slow(m, sp, rest, ssub, esub) == rest) 426 break; /* it matched all of it */ 427 /* that one missed, try next one */ 428 assert(OP(m->g->strip[esub]) == OOR1); 429 esub++; 430 assert(OP(m->g->strip[esub]) == OOR2); 431 ssub = esub + 1; 432 esub += OPND(m->g->strip[esub]); 433 if (OP(m->g->strip[esub]) == OOR2) 434 esub--; 435 else 436 assert(OP(m->g->strip[esub]) == O_CH); 437 } 438 if (dissect(m, sp, rest, ssub, esub) != rest) { 439 assert(0 && "dissect(...) should return rest"); 440 } 441 sp = rest; 442 break; 443 case O_PLUS: 444 case O_QUEST: 445 case OOR1: 446 case OOR2: 447 case O_CH: 448 assert(nope); 449 break; 450 case OLPAREN: 451 i = OPND(m->g->strip[ss]); 452 assert(0 < i && i <= m->g->nsub); 453 m->pmatch[i].rm_so = sp - m->offp; 454 break; 455 case ORPAREN: 456 i = OPND(m->g->strip[ss]); 457 assert(0 < i && i <= m->g->nsub); 458 m->pmatch[i].rm_eo = sp - m->offp; 459 break; 460 default: /* uh oh */ 461 assert(nope); 462 break; 463 } 464 } 465 466 assert(sp == stop); 467 return(sp); 468 } 469 470 /* 471 - backref - figure out what matched what, figuring in back references 472 */ 473 static char * /* == stop (success) or NULL (failure) */ 474 backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, 475 sopno lev, int rec) /* PLUS nesting level */ 476 { 477 int i; 478 sopno ss; /* start sop of current subRE */ 479 char *sp; /* start of string matched by it */ 480 sopno ssub; /* start sop of subsubRE */ 481 sopno esub; /* end sop of subsubRE */ 482 char *ssp; /* start of string matched by subsubRE */ 483 char *dp; 484 size_t len; 485 int hard; 486 sop s; 487 regoff_t offsave; 488 cset *cs; 489 490 AT("back", start, stop, startst, stopst); 491 sp = start; 492 493 /* get as far as we can with easy stuff */ 494 hard = 0; 495 for (ss = startst; !hard && ss < stopst; ss++) 496 switch (OP(s = m->g->strip[ss])) { 497 case OCHAR: 498 if (sp == stop || *sp++ != (char)OPND(s)) 499 return(NULL); 500 break; 501 case OANY: 502 if (sp == stop) 503 return(NULL); 504 sp++; 505 break; 506 case OANYOF: 507 cs = &m->g->sets[OPND(s)]; 508 if (sp == stop || !CHIN(cs, *sp++)) 509 return(NULL); 510 break; 511 case OBOL: 512 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 513 (sp < m->endp && *(sp-1) == '\n' && 514 (m->g->cflags®_NEWLINE)) ) 515 { /* yes */ } 516 else 517 return(NULL); 518 break; 519 case OEOL: 520 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 521 (sp < m->endp && *sp == '\n' && 522 (m->g->cflags®_NEWLINE)) ) 523 { /* yes */ } 524 else 525 return(NULL); 526 break; 527 case OBOW: 528 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 529 (sp < m->endp && *(sp-1) == '\n' && 530 (m->g->cflags®_NEWLINE)) || 531 (sp > m->beginp && 532 !ISWORD(*(sp-1))) ) && 533 (sp < m->endp && ISWORD(*sp)) ) 534 { /* yes */ } 535 else 536 return(NULL); 537 break; 538 case OEOW: 539 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 540 (sp < m->endp && *sp == '\n' && 541 (m->g->cflags®_NEWLINE)) || 542 (sp < m->endp && !ISWORD(*sp)) ) && 543 (sp > m->beginp && ISWORD(*(sp-1))) ) 544 { /* yes */ } 545 else 546 return(NULL); 547 break; 548 case O_QUEST: 549 break; 550 case OOR1: /* matches null but needs to skip */ 551 ss++; 552 s = m->g->strip[ss]; 553 do { 554 assert(OP(s) == OOR2); 555 ss += OPND(s); 556 } while (OP(s = m->g->strip[ss]) != O_CH); 557 /* note that the ss++ gets us past the O_CH */ 558 break; 559 default: /* have to make a choice */ 560 hard = 1; 561 break; 562 } 563 if (!hard) { /* that was it! */ 564 if (sp != stop) 565 return(NULL); 566 return(sp); 567 } 568 ss--; /* adjust for the for's final increment */ 569 570 /* the hard stuff */ 571 AT("hard", sp, stop, ss, stopst); 572 s = m->g->strip[ss]; 573 switch (OP(s)) { 574 case OBACK_: /* the vilest depths */ 575 i = OPND(s); 576 assert(0 < i && i <= m->g->nsub); 577 if (m->pmatch[i].rm_eo == -1) 578 return(NULL); 579 assert(m->pmatch[i].rm_so != -1); 580 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 581 if (len == 0 && rec++ > MAX_RECURSION) 582 return(NULL); 583 assert(stop - m->beginp >= len); 584 if (sp > stop - len) 585 return(NULL); /* not enough left to match */ 586 ssp = m->offp + m->pmatch[i].rm_so; 587 if (memcmp(sp, ssp, len) != 0) 588 return(NULL); 589 while (m->g->strip[ss] != SOP(O_BACK, i)) 590 ss++; 591 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 592 break; 593 case OQUEST_: /* to null or not */ 594 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 595 if (dp != NULL) 596 return(dp); /* not */ 597 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 598 break; 599 case OPLUS_: 600 assert(m->lastpos != NULL); 601 assert(lev+1 <= m->g->nplus); 602 m->lastpos[lev+1] = sp; 603 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 604 break; 605 case O_PLUS: 606 if (sp == m->lastpos[lev]) /* last pass matched null */ 607 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 608 /* try another pass */ 609 m->lastpos[lev] = sp; 610 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 611 if (dp == NULL) 612 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 613 else 614 return(dp); 615 break; 616 case OCH_: /* find the right one, if any */ 617 ssub = ss + 1; 618 esub = ss + OPND(s) - 1; 619 assert(OP(m->g->strip[esub]) == OOR1); 620 for (;;) { /* find first matching branch */ 621 dp = backref(m, sp, stop, ssub, esub, lev, rec); 622 if (dp != NULL) 623 return(dp); 624 /* that one missed, try next one */ 625 if (OP(m->g->strip[esub]) == O_CH) 626 return(NULL); /* there is none */ 627 esub++; 628 assert(OP(m->g->strip[esub]) == OOR2); 629 ssub = esub + 1; 630 esub += OPND(m->g->strip[esub]); 631 if (OP(m->g->strip[esub]) == OOR2) 632 esub--; 633 else 634 assert(OP(m->g->strip[esub]) == O_CH); 635 } 636 break; 637 case OLPAREN: /* must undo assignment if rest fails */ 638 i = OPND(s); 639 assert(0 < i && i <= m->g->nsub); 640 offsave = m->pmatch[i].rm_so; 641 m->pmatch[i].rm_so = sp - m->offp; 642 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 643 if (dp != NULL) 644 return(dp); 645 m->pmatch[i].rm_so = offsave; 646 return(NULL); 647 break; 648 case ORPAREN: /* must undo assignment if rest fails */ 649 i = OPND(s); 650 assert(0 < i && i <= m->g->nsub); 651 offsave = m->pmatch[i].rm_eo; 652 m->pmatch[i].rm_eo = sp - m->offp; 653 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 654 if (dp != NULL) 655 return(dp); 656 m->pmatch[i].rm_eo = offsave; 657 return(NULL); 658 break; 659 default: /* uh oh */ 660 assert(nope); 661 break; 662 } 663 664 /* "can't happen" */ 665 assert(nope); 666 /* NOTREACHED */ 667 return 0; 668 } 669 670 /* 671 - fast - step through the string at top speed 672 */ 673 static char * /* where tentative match ended, or NULL */ 674 fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 675 { 676 states st = m->st; 677 states fresh = m->fresh; 678 states tmp = m->tmp; 679 char *p = start; 680 int c = (start == m->beginp) ? OUT : *(start-1); 681 int lastc; /* previous c */ 682 int flagch; 683 int i; 684 char *coldp; /* last p after which no match was underway */ 685 686 CLEAR(st); 687 SET1(st, startst); 688 st = step(m->g, startst, stopst, st, NOTHING, st); 689 ASSIGN(fresh, st); 690 SP("start", st, *p); 691 coldp = NULL; 692 for (;;) { 693 /* next character */ 694 lastc = c; 695 c = (p == m->endp) ? OUT : *p; 696 if (EQ(st, fresh)) 697 coldp = p; 698 699 /* is there an EOL and/or BOL between lastc and c? */ 700 flagch = '\0'; 701 i = 0; 702 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 703 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 704 flagch = BOL; 705 i = m->g->nbol; 706 } 707 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 708 (c == OUT && !(m->eflags®_NOTEOL)) ) { 709 flagch = (flagch == BOL) ? BOLEOL : EOL; 710 i += m->g->neol; 711 } 712 if (i != 0) { 713 for (; i > 0; i--) 714 st = step(m->g, startst, stopst, st, flagch, st); 715 SP("boleol", st, c); 716 } 717 718 /* how about a word boundary? */ 719 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 720 (c != OUT && ISWORD(c)) ) { 721 flagch = BOW; 722 } 723 if ( (lastc != OUT && ISWORD(lastc)) && 724 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 725 flagch = EOW; 726 } 727 if (flagch == BOW || flagch == EOW) { 728 st = step(m->g, startst, stopst, st, flagch, st); 729 SP("boweow", st, c); 730 } 731 732 /* are we done? */ 733 if (ISSET(st, stopst) || p == stop) 734 break; /* NOTE BREAK OUT */ 735 736 /* no, we must deal with this character */ 737 ASSIGN(tmp, st); 738 ASSIGN(st, fresh); 739 assert(c != OUT); 740 st = step(m->g, startst, stopst, tmp, c, st); 741 SP("aft", st, c); 742 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 743 p++; 744 } 745 746 assert(coldp != NULL); 747 m->coldp = coldp; 748 if (ISSET(st, stopst)) 749 return(p+1); 750 else 751 return(NULL); 752 } 753 754 /* 755 - slow - step through the string more deliberately 756 */ 757 static char * /* where it ended */ 758 slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 759 { 760 states st = m->st; 761 states empty = m->empty; 762 states tmp = m->tmp; 763 char *p = start; 764 int c = (start == m->beginp) ? OUT : *(start-1); 765 int lastc; /* previous c */ 766 int flagch; 767 int i; 768 char *matchp; /* last p at which a match ended */ 769 770 AT("slow", start, stop, startst, stopst); 771 CLEAR(st); 772 SET1(st, startst); 773 SP("sstart", st, *p); 774 st = step(m->g, startst, stopst, st, NOTHING, st); 775 matchp = NULL; 776 for (;;) { 777 /* next character */ 778 lastc = c; 779 c = (p == m->endp) ? OUT : *p; 780 781 /* is there an EOL and/or BOL between lastc and c? */ 782 flagch = '\0'; 783 i = 0; 784 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 785 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 786 flagch = BOL; 787 i = m->g->nbol; 788 } 789 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 790 (c == OUT && !(m->eflags®_NOTEOL)) ) { 791 flagch = (flagch == BOL) ? BOLEOL : EOL; 792 i += m->g->neol; 793 } 794 if (i != 0) { 795 for (; i > 0; i--) 796 st = step(m->g, startst, stopst, st, flagch, st); 797 SP("sboleol", st, c); 798 } 799 800 /* how about a word boundary? */ 801 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 802 (c != OUT && ISWORD(c)) ) { 803 flagch = BOW; 804 } 805 if ( (lastc != OUT && ISWORD(lastc)) && 806 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 807 flagch = EOW; 808 } 809 if (flagch == BOW || flagch == EOW) { 810 st = step(m->g, startst, stopst, st, flagch, st); 811 SP("sboweow", st, c); 812 } 813 814 /* are we done? */ 815 if (ISSET(st, stopst)) 816 matchp = p; 817 if (EQ(st, empty) || p == stop) 818 break; /* NOTE BREAK OUT */ 819 820 /* no, we must deal with this character */ 821 ASSIGN(tmp, st); 822 ASSIGN(st, empty); 823 assert(c != OUT); 824 st = step(m->g, startst, stopst, tmp, c, st); 825 SP("saft", st, c); 826 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 827 p++; 828 } 829 830 return(matchp); 831 } 832 833 834 /* 835 - step - map set of states reachable before char to set reachable after 836 */ 837 static states 838 step(struct re_guts *g, 839 sopno start, /* start state within strip */ 840 sopno stop, /* state after stop state within strip */ 841 states bef, /* states reachable before */ 842 int ch, /* character or NONCHAR code */ 843 states aft) /* states already known reachable after */ 844 { 845 cset *cs; 846 sop s; 847 sopno pc; 848 onestate here; /* note, macros know this name */ 849 sopno look; 850 int i; 851 852 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 853 s = g->strip[pc]; 854 switch (OP(s)) { 855 case OEND: 856 assert(pc == stop-1); 857 break; 858 case OCHAR: 859 /* only characters can match */ 860 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 861 if (ch == (char)OPND(s)) 862 FWD(aft, bef, 1); 863 break; 864 case OBOL: 865 if (ch == BOL || ch == BOLEOL) 866 FWD(aft, bef, 1); 867 break; 868 case OEOL: 869 if (ch == EOL || ch == BOLEOL) 870 FWD(aft, bef, 1); 871 break; 872 case OBOW: 873 if (ch == BOW) 874 FWD(aft, bef, 1); 875 break; 876 case OEOW: 877 if (ch == EOW) 878 FWD(aft, bef, 1); 879 break; 880 case OANY: 881 if (!NONCHAR(ch)) 882 FWD(aft, bef, 1); 883 break; 884 case OANYOF: 885 cs = &g->sets[OPND(s)]; 886 if (!NONCHAR(ch) && CHIN(cs, ch)) 887 FWD(aft, bef, 1); 888 break; 889 case OBACK_: /* ignored here */ 890 case O_BACK: 891 FWD(aft, aft, 1); 892 break; 893 case OPLUS_: /* forward, this is just an empty */ 894 FWD(aft, aft, 1); 895 break; 896 case O_PLUS: /* both forward and back */ 897 FWD(aft, aft, 1); 898 i = ISSETBACK(aft, OPND(s)); 899 BACK(aft, aft, OPND(s)); 900 if (!i && ISSETBACK(aft, OPND(s))) { 901 /* oho, must reconsider loop body */ 902 pc -= OPND(s) + 1; 903 INIT(here, pc); 904 } 905 break; 906 case OQUEST_: /* two branches, both forward */ 907 FWD(aft, aft, 1); 908 FWD(aft, aft, OPND(s)); 909 break; 910 case O_QUEST: /* just an empty */ 911 FWD(aft, aft, 1); 912 break; 913 case OLPAREN: /* not significant here */ 914 case ORPAREN: 915 FWD(aft, aft, 1); 916 break; 917 case OCH_: /* mark the first two branches */ 918 FWD(aft, aft, 1); 919 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 920 FWD(aft, aft, OPND(s)); 921 break; 922 case OOR1: /* done a branch, find the O_CH */ 923 if (ISSTATEIN(aft, here)) { 924 for (look = 1; 925 OP(s = g->strip[pc+look]) != O_CH; 926 look += OPND(s)) 927 assert(OP(s) == OOR2); 928 FWD(aft, aft, look); 929 } 930 break; 931 case OOR2: /* propagate OCH_'s marking */ 932 FWD(aft, aft, 1); 933 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 934 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 935 FWD(aft, aft, OPND(s)); 936 } 937 break; 938 case O_CH: /* just empty */ 939 FWD(aft, aft, 1); 940 break; 941 default: /* ooooops... */ 942 assert(nope); 943 break; 944 } 945 } 946 947 return(aft); 948 } 949 950 #ifdef REDEBUG 951 /* 952 - print - print a set of states 953 */ 954 static void 955 print(struct match *m, char *caption, states st, int ch, FILE *d) 956 { 957 struct re_guts *g = m->g; 958 int i; 959 int first = 1; 960 961 if (!(m->eflags®_TRACE)) 962 return; 963 964 (void)fprintf(d, "%s", caption); 965 if (ch != '\0') 966 (void)fprintf(d, " %s", pchar(ch)); 967 for (i = 0; i < g->nstates; i++) 968 if (ISSET(st, i)) { 969 (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 970 first = 0; 971 } 972 (void)fprintf(d, "\n"); 973 } 974 975 /* 976 - at - print current situation 977 */ 978 static void 979 at(struct match *m, char *title, char *start, char *stop, sopno startst, 980 sopno stopst) 981 { 982 if (!(m->eflags®_TRACE)) 983 return; 984 985 (void)printf("%s %s-", title, pchar(*start)); 986 (void)printf("%s ", pchar(*stop)); 987 (void)printf("%ld-%ld\n", (long)startst, (long)stopst); 988 } 989 990 #ifndef PCHARDONE 991 #define PCHARDONE /* never again */ 992 /* 993 - pchar - make a character printable 994 * 995 * Is this identical to regchar() over in debug.c? Well, yes. But a 996 * duplicate here avoids having a debugging-capable regexec.o tied to 997 * a matching debug.o, and this is convenient. It all disappears in 998 * the non-debug compilation anyway, so it doesn't matter much. 999 */ 1000 static char * /* -> representation */ 1001 pchar(int ch) 1002 { 1003 static char pbuf[10]; 1004 1005 if (isprint(ch) || ch == ' ') 1006 (void)_snprintf(pbuf, sizeof pbuf, "%c", ch); 1007 else 1008 (void)_snprintf(pbuf, sizeof pbuf, "\\%o", ch); 1009 return(pbuf); 1010 } 1011 #endif 1012 #endif 1013 1014 #undef matcher 1015 #undef fast 1016 #undef slow 1017 #undef dissect 1018 #undef backref 1019 #undef step 1020 #undef print 1021 #undef at 1022 #undef match 1023 #undef nope 1024