1 /* $OpenBSD: engine.c,v 1.15 2005/08/05 13:03:00 espie Exp $ */ 2 3 /*- 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)engine.c 8.5 (Berkeley) 3/20/94 36 */ 37 38 /* 39 * The matching engine and friends. This file is #included by regexec.c 40 * after suitable #defines of a variety of macros used herein, so that 41 * different state representations can be used without duplicating masses 42 * of code. 43 */ 44 45 #ifdef SNAMES 46 #define matcher smatcher 47 #define fast sfast 48 #define slow sslow 49 #define dissect sdissect 50 #define backref sbackref 51 #define step sstep 52 #define print sprint 53 #define at sat 54 #define match smat 55 #define nope snope 56 #endif 57 #ifdef LNAMES 58 #define matcher lmatcher 59 #define fast lfast 60 #define slow lslow 61 #define dissect ldissect 62 #define backref lbackref 63 #define step lstep 64 #define print lprint 65 #define at lat 66 #define match lmat 67 #define nope lnope 68 #endif 69 70 /* another structure passed up and down to avoid zillions of parameters */ 71 struct match { 72 struct re_guts *g; 73 int eflags; 74 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 75 char *offp; /* offsets work from here */ 76 char *beginp; /* start of string -- virtual NUL precedes */ 77 char *endp; /* end of string -- virtual NUL here */ 78 char *coldp; /* can be no match starting before here */ 79 char **lastpos; /* [nplus+1] */ 80 STATEVARS; 81 states st; /* current states */ 82 states fresh; /* states for a fresh start */ 83 states tmp; /* temporary */ 84 states empty; /* empty set of states */ 85 }; 86 87 static int matcher(struct re_guts *, char *, size_t, regmatch_t[], int); 88 static char *dissect(struct match *, char *, char *, sopno, sopno); 89 static char *backref(struct match *, char *, char *, sopno, sopno, sopno, int); 90 static char *fast(struct match *, char *, char *, sopno, sopno); 91 static char *slow(struct match *, char *, char *, sopno, sopno); 92 static states step(struct re_guts *, sopno, sopno, states, int, states); 93 #define MAX_RECURSION 100 94 #define BOL (OUT+1) 95 #define EOL (BOL+1) 96 #define BOLEOL (BOL+2) 97 #define NOTHING (BOL+3) 98 #define BOW (BOL+4) 99 #define EOW (BOL+5) 100 #define CODEMAX (BOL+5) /* highest code used */ 101 #define NONCHAR(c) ((c) > CHAR_MAX) 102 #define NNONCHAR (CODEMAX-CHAR_MAX) 103 #ifdef REDEBUG 104 static void print(struct match *, char *, states, int, FILE *); 105 #endif 106 #ifdef REDEBUG 107 static void at(struct match *, char *, char *, char *, sopno, sopno); 108 #endif 109 #ifdef REDEBUG 110 static char *pchar(int); 111 #endif 112 113 #ifdef REDEBUG 114 #define SP(t, s, c) print(m, t, s, c, stdout) 115 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 116 #define NOTE(str) { if (m->eflags®_TRACE) (void)printf("=%s\n", (str)); } 117 static int nope = 0; 118 #else 119 #define SP(t, s, c) /* nothing */ 120 #define AT(t, p1, p2, s1, s2) /* nothing */ 121 #define NOTE(s) /* nothing */ 122 #endif 123 124 /* 125 - matcher - the actual matching engine 126 */ 127 static int /* 0 success, REG_NOMATCH failure */ 128 matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], 129 int eflags) 130 { 131 char *endp; 132 int i; 133 struct match mv; 134 struct match *m = &mv; 135 char *dp; 136 const sopno gf = g->firststate+1; /* +1 for OEND */ 137 const sopno gl = g->laststate; 138 char *start; 139 char *stop; 140 141 /* simplify the situation where possible */ 142 if (g->cflags®_NOSUB) 143 nmatch = 0; 144 if (eflags®_STARTEND) { 145 start = string + pmatch[0].rm_so; 146 stop = string + pmatch[0].rm_eo; 147 } else { 148 start = string; 149 stop = start + strlen(start); 150 } 151 if (stop < start) 152 return(REG_INVARG); 153 154 /* prescreening; this does wonders for this rather slow code */ 155 if (g->must != NULL) { 156 for (dp = start; dp < stop; dp++) 157 if (*dp == g->must[0] && stop - dp >= g->mlen && 158 memcmp(dp, g->must, (size_t)g->mlen) == 0) 159 break; 160 if (dp == stop) /* we didn't find g->must */ 161 return(REG_NOMATCH); 162 } 163 164 /* match struct setup */ 165 m->g = g; 166 m->eflags = eflags; 167 m->pmatch = NULL; 168 m->lastpos = NULL; 169 m->offp = string; 170 m->beginp = start; 171 m->endp = (char*)stop; 172 STATESETUP(m, 4); 173 SETUP(m->st); 174 SETUP(m->fresh); 175 SETUP(m->tmp); 176 SETUP(m->empty); 177 CLEAR(m->empty); 178 179 /* this loop does only one repetition except for backrefs */ 180 for (;;) { 181 endp = fast(m, start, stop, gf, gl); 182 if (endp == NULL) { /* a miss */ 183 free(m->pmatch); 184 free(m->lastpos); 185 STATETEARDOWN(m); 186 return(REG_NOMATCH); 187 } 188 if (nmatch == 0 && !g->backrefs) 189 break; /* no further info needed */ 190 191 /* where? */ 192 assert(m->coldp != NULL); 193 for (;;) { 194 NOTE("finding start"); 195 endp = slow(m, m->coldp, stop, gf, gl); 196 if (endp != NULL) 197 break; 198 assert(m->coldp < m->endp); 199 m->coldp++; 200 } 201 if (nmatch == 1 && !g->backrefs) 202 break; /* no further info needed */ 203 204 /* oh my, he wants the subexpressions... */ 205 if (m->pmatch == NULL) 206 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 207 sizeof(regmatch_t)); 208 if (m->pmatch == NULL) { 209 STATETEARDOWN(m); 210 return(REG_ESPACE); 211 } 212 for (i = 1; i <= (int)m->g->nsub; i++) 213 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 214 if (!g->backrefs && !(m->eflags®_BACKR)) { 215 NOTE("dissecting"); 216 dp = dissect(m, m->coldp, endp, gf, gl); 217 } else { 218 if (g->nplus > 0 && m->lastpos == NULL) 219 m->lastpos = (char **)malloc((g->nplus+1) * 220 sizeof(char *)); 221 if (g->nplus > 0 && m->lastpos == NULL) { 222 free(m->pmatch); 223 STATETEARDOWN(m); 224 return(REG_ESPACE); 225 } 226 NOTE("backref dissect"); 227 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 228 } 229 if (dp != NULL) 230 break; 231 232 /* uh-oh... we couldn't find a subexpression-level match */ 233 assert(g->backrefs); /* must be back references doing it */ 234 assert(g->nplus == 0 || m->lastpos != NULL); 235 for (;;) { 236 if (dp != NULL || endp <= m->coldp) 237 break; /* defeat */ 238 NOTE("backoff"); 239 endp = slow(m, m->coldp, endp-1, gf, gl); 240 if (endp == NULL) 241 break; /* defeat */ 242 /* try it on a shorter possibility */ 243 #ifndef NDEBUG 244 for (i = 1; i <= m->g->nsub; i++) { 245 assert(m->pmatch[i].rm_so == -1); 246 assert(m->pmatch[i].rm_eo == -1); 247 } 248 #endif 249 NOTE("backoff dissect"); 250 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 251 } 252 assert(dp == NULL || dp == endp); 253 if (dp != NULL) /* found a shorter one */ 254 break; 255 256 /* despite initial appearances, there is no match here */ 257 NOTE("false alarm"); 258 if (m->coldp == stop) 259 break; 260 start = m->coldp + 1; /* recycle starting later */ 261 } 262 263 /* fill in the details if requested */ 264 if (nmatch > 0) { 265 pmatch[0].rm_so = m->coldp - m->offp; 266 pmatch[0].rm_eo = endp - m->offp; 267 } 268 if (nmatch > 1) { 269 assert(m->pmatch != NULL); 270 for (i = 1; i < (ssize_t)nmatch; i++) 271 if (i <= (int)m->g->nsub) 272 pmatch[i] = m->pmatch[i]; 273 else { 274 pmatch[i].rm_so = -1; 275 pmatch[i].rm_eo = -1; 276 } 277 } 278 279 if (m->pmatch != NULL) 280 free((char *)m->pmatch); 281 if (m->lastpos != NULL) 282 free((char *)m->lastpos); 283 STATETEARDOWN(m); 284 return(0); 285 } 286 287 /* 288 - dissect - figure out what matched what, no back references 289 */ 290 static char * /* == stop (success) always */ 291 dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 292 { 293 int i; 294 sopno ss; /* start sop of current subRE */ 295 sopno es; /* end sop of current subRE */ 296 char *sp; /* start of string matched by it */ 297 char *stp; /* string matched by it cannot pass here */ 298 char *rest; /* start of rest of string */ 299 char *tail; /* string unmatched by rest of RE */ 300 sopno ssub; /* start sop of subsubRE */ 301 sopno esub; /* end sop of subsubRE */ 302 char *ssp; /* start of string matched by subsubRE */ 303 char *sep; /* end of string matched by subsubRE */ 304 char *oldssp; /* previous ssp */ 305 char *dp; 306 307 AT("diss", start, stop, startst, stopst); 308 sp = start; 309 for (ss = startst; ss < stopst; ss = es) { 310 /* identify end of subRE */ 311 es = ss; 312 switch (OP(m->g->strip[es])) { 313 case OPLUS_: 314 case OQUEST_: 315 es += OPND(m->g->strip[es]); 316 break; 317 case OCH_: 318 while (OP(m->g->strip[es]) != O_CH) 319 es += OPND(m->g->strip[es]); 320 break; 321 } 322 es++; 323 324 /* figure out what it matched */ 325 switch (OP(m->g->strip[ss])) { 326 case OEND: 327 assert(nope); 328 break; 329 case OCHAR: 330 sp++; 331 break; 332 case OBOL: 333 case OEOL: 334 case OBOW: 335 case OEOW: 336 break; 337 case OANY: 338 case OANYOF: 339 sp++; 340 break; 341 case OBACK_: 342 case O_BACK: 343 assert(nope); 344 break; 345 /* cases where length of match is hard to find */ 346 case OQUEST_: 347 stp = stop; 348 for (;;) { 349 /* how long could this one be? */ 350 rest = slow(m, sp, stp, ss, es); 351 assert(rest != NULL); /* it did match */ 352 /* could the rest match the rest? */ 353 tail = slow(m, rest, stop, es, stopst); 354 if (tail == stop) 355 break; /* yes! */ 356 /* no -- try a shorter match for this one */ 357 stp = rest - 1; 358 assert(stp >= sp); /* it did work */ 359 } 360 ssub = ss + 1; 361 esub = es - 1; 362 /* did innards match? */ 363 if (slow(m, sp, rest, ssub, esub) != NULL) { 364 dp = dissect(m, sp, rest, ssub, esub); 365 assert(dp == rest); 366 } else /* no */ 367 assert(sp == rest); 368 sp = rest; 369 break; 370 case OPLUS_: 371 stp = stop; 372 for (;;) { 373 /* how long could this one be? */ 374 rest = slow(m, sp, stp, ss, es); 375 assert(rest != NULL); /* it did match */ 376 /* could the rest match the rest? */ 377 tail = slow(m, rest, stop, es, stopst); 378 if (tail == stop) 379 break; /* yes! */ 380 /* no -- try a shorter match for this one */ 381 stp = rest - 1; 382 assert(stp >= sp); /* it did work */ 383 } 384 ssub = ss + 1; 385 esub = es - 1; 386 ssp = sp; 387 oldssp = ssp; 388 for (;;) { /* find last match of innards */ 389 sep = slow(m, ssp, rest, ssub, esub); 390 if (sep == NULL || sep == ssp) 391 break; /* failed or matched null */ 392 oldssp = ssp; /* on to next try */ 393 ssp = sep; 394 } 395 if (sep == NULL) { 396 /* last successful match */ 397 sep = ssp; 398 ssp = oldssp; 399 } 400 assert(sep == rest); /* must exhaust substring */ 401 assert(slow(m, ssp, sep, ssub, esub) == rest); 402 dp = dissect(m, ssp, sep, ssub, esub); 403 assert(dp == sep); 404 sp = rest; 405 break; 406 case OCH_: 407 stp = stop; 408 for (;;) { 409 /* how long could this one be? */ 410 rest = slow(m, sp, stp, ss, es); 411 assert(rest != NULL); /* it did match */ 412 /* could the rest match the rest? */ 413 tail = slow(m, rest, stop, es, stopst); 414 if (tail == stop) 415 break; /* yes! */ 416 /* no -- try a shorter match for this one */ 417 stp = rest - 1; 418 assert(stp >= sp); /* it did work */ 419 } 420 ssub = ss + 1; 421 esub = ss + OPND(m->g->strip[ss]) - 1; 422 assert(OP(m->g->strip[esub]) == OOR1); 423 for (;;) { /* find first matching branch */ 424 if (slow(m, sp, rest, ssub, esub) == rest) 425 break; /* it matched all of it */ 426 /* that one missed, try next one */ 427 assert(OP(m->g->strip[esub]) == OOR1); 428 esub++; 429 assert(OP(m->g->strip[esub]) == OOR2); 430 ssub = esub + 1; 431 esub += OPND(m->g->strip[esub]); 432 if (OP(m->g->strip[esub]) == OOR2) 433 esub--; 434 else 435 assert(OP(m->g->strip[esub]) == O_CH); 436 } 437 dp = dissect(m, sp, rest, ssub, esub); 438 assert(dp == rest); 439 sp = rest; 440 break; 441 case O_PLUS: 442 case O_QUEST: 443 case OOR1: 444 case OOR2: 445 case O_CH: 446 assert(nope); 447 break; 448 case OLPAREN: 449 i = OPND(m->g->strip[ss]); 450 assert(0 < i && i <= m->g->nsub); 451 m->pmatch[i].rm_so = sp - m->offp; 452 break; 453 case ORPAREN: 454 i = OPND(m->g->strip[ss]); 455 assert(0 < i && i <= m->g->nsub); 456 m->pmatch[i].rm_eo = sp - m->offp; 457 break; 458 default: /* uh oh */ 459 assert(nope); 460 break; 461 } 462 } 463 464 assert(sp == stop); 465 return(sp); 466 } 467 468 /* 469 - backref - figure out what matched what, figuring in back references 470 */ 471 static char * /* == stop (success) or NULL (failure) */ 472 backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, 473 sopno lev, int rec) /* PLUS nesting level */ 474 { 475 int i; 476 sopno ss; /* start sop of current subRE */ 477 char *sp; /* start of string matched by it */ 478 sopno ssub; /* start sop of subsubRE */ 479 sopno esub; /* end sop of subsubRE */ 480 char *ssp; /* start of string matched by subsubRE */ 481 char *dp; 482 size_t len; 483 int hard; 484 sop s; 485 regoff_t offsave; 486 cset *cs; 487 488 AT("back", start, stop, startst, stopst); 489 sp = start; 490 491 /* get as far as we can with easy stuff */ 492 hard = 0; 493 for (ss = startst; !hard && ss < stopst; ss++) 494 switch (OP(s = m->g->strip[ss])) { 495 case OCHAR: 496 if (sp == stop || *sp++ != (char)OPND(s)) 497 return(NULL); 498 break; 499 case OANY: 500 if (sp == stop) 501 return(NULL); 502 sp++; 503 break; 504 case OANYOF: 505 cs = &m->g->sets[OPND(s)]; 506 if (sp == stop || !CHIN(cs, *sp++)) 507 return(NULL); 508 break; 509 case OBOL: 510 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 511 (sp < m->endp && *(sp-1) == '\n' && 512 (m->g->cflags®_NEWLINE)) ) 513 { /* yes */ } 514 else 515 return(NULL); 516 break; 517 case OEOL: 518 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 519 (sp < m->endp && *sp == '\n' && 520 (m->g->cflags®_NEWLINE)) ) 521 { /* yes */ } 522 else 523 return(NULL); 524 break; 525 case OBOW: 526 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 527 (sp < m->endp && *(sp-1) == '\n' && 528 (m->g->cflags®_NEWLINE)) || 529 (sp > m->beginp && 530 !ISWORD(*(sp-1))) ) && 531 (sp < m->endp && ISWORD(*sp)) ) 532 { /* yes */ } 533 else 534 return(NULL); 535 break; 536 case OEOW: 537 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 538 (sp < m->endp && *sp == '\n' && 539 (m->g->cflags®_NEWLINE)) || 540 (sp < m->endp && !ISWORD(*sp)) ) && 541 (sp > m->beginp && ISWORD(*(sp-1))) ) 542 { /* yes */ } 543 else 544 return(NULL); 545 break; 546 case O_QUEST: 547 break; 548 case OOR1: /* matches null but needs to skip */ 549 ss++; 550 s = m->g->strip[ss]; 551 do { 552 assert(OP(s) == OOR2); 553 ss += OPND(s); 554 } while (OP(s = m->g->strip[ss]) != O_CH); 555 /* note that the ss++ gets us past the O_CH */ 556 break; 557 default: /* have to make a choice */ 558 hard = 1; 559 break; 560 } 561 if (!hard) { /* that was it! */ 562 if (sp != stop) 563 return(NULL); 564 return(sp); 565 } 566 ss--; /* adjust for the for's final increment */ 567 568 /* the hard stuff */ 569 AT("hard", sp, stop, ss, stopst); 570 s = m->g->strip[ss]; 571 switch (OP(s)) { 572 case OBACK_: /* the vilest depths */ 573 i = OPND(s); 574 assert(0 < i && i <= m->g->nsub); 575 if (m->pmatch[i].rm_eo == -1) 576 return(NULL); 577 assert(m->pmatch[i].rm_so != -1); 578 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 579 if (len == 0 && rec++ > MAX_RECURSION) 580 return(NULL); 581 assert(stop - m->beginp >= len); 582 if (sp > stop - len) 583 return(NULL); /* not enough left to match */ 584 ssp = m->offp + m->pmatch[i].rm_so; 585 if (memcmp(sp, ssp, len) != 0) 586 return(NULL); 587 while (m->g->strip[ss] != SOP(O_BACK, i)) 588 ss++; 589 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 590 break; 591 case OQUEST_: /* to null or not */ 592 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 593 if (dp != NULL) 594 return(dp); /* not */ 595 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 596 break; 597 case OPLUS_: 598 assert(m->lastpos != NULL); 599 assert(lev+1 <= m->g->nplus); 600 m->lastpos[lev+1] = sp; 601 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 602 break; 603 case O_PLUS: 604 if (sp == m->lastpos[lev]) /* last pass matched null */ 605 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 606 /* try another pass */ 607 m->lastpos[lev] = sp; 608 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 609 if (dp == NULL) 610 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 611 else 612 return(dp); 613 break; 614 case OCH_: /* find the right one, if any */ 615 ssub = ss + 1; 616 esub = ss + OPND(s) - 1; 617 assert(OP(m->g->strip[esub]) == OOR1); 618 for (;;) { /* find first matching branch */ 619 dp = backref(m, sp, stop, ssub, esub, lev, rec); 620 if (dp != NULL) 621 return(dp); 622 /* that one missed, try next one */ 623 if (OP(m->g->strip[esub]) == O_CH) 624 return(NULL); /* there is none */ 625 esub++; 626 assert(OP(m->g->strip[esub]) == OOR2); 627 ssub = esub + 1; 628 esub += OPND(m->g->strip[esub]); 629 if (OP(m->g->strip[esub]) == OOR2) 630 esub--; 631 else 632 assert(OP(m->g->strip[esub]) == O_CH); 633 } 634 break; 635 case OLPAREN: /* must undo assignment if rest fails */ 636 i = OPND(s); 637 assert(0 < i && i <= m->g->nsub); 638 offsave = m->pmatch[i].rm_so; 639 m->pmatch[i].rm_so = sp - m->offp; 640 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 641 if (dp != NULL) 642 return(dp); 643 m->pmatch[i].rm_so = offsave; 644 return(NULL); 645 break; 646 case ORPAREN: /* must undo assignment if rest fails */ 647 i = OPND(s); 648 assert(0 < i && i <= m->g->nsub); 649 offsave = m->pmatch[i].rm_eo; 650 m->pmatch[i].rm_eo = sp - m->offp; 651 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 652 if (dp != NULL) 653 return(dp); 654 m->pmatch[i].rm_eo = offsave; 655 return(NULL); 656 break; 657 default: /* uh oh */ 658 assert(nope); 659 break; 660 } 661 662 /* "can't happen" */ 663 assert(nope); 664 /* NOTREACHED */ 665 return 0; 666 } 667 668 /* 669 - fast - step through the string at top speed 670 */ 671 static char * /* where tentative match ended, or NULL */ 672 fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 673 { 674 states st = m->st; 675 states fresh = m->fresh; 676 states tmp = m->tmp; 677 char *p = start; 678 int c = (start == m->beginp) ? OUT : *(start-1); 679 int lastc; /* previous c */ 680 int flagch; 681 int i; 682 char *coldp; /* last p after which no match was underway */ 683 684 CLEAR(st); 685 SET1(st, startst); 686 st = step(m->g, startst, stopst, st, NOTHING, st); 687 ASSIGN(fresh, st); 688 SP("start", st, *p); 689 coldp = NULL; 690 for (;;) { 691 /* next character */ 692 lastc = c; 693 c = (p == m->endp) ? OUT : *p; 694 if (EQ(st, fresh)) 695 coldp = p; 696 697 /* is there an EOL and/or BOL between lastc and c? */ 698 flagch = '\0'; 699 i = 0; 700 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 701 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 702 flagch = BOL; 703 i = m->g->nbol; 704 } 705 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 706 (c == OUT && !(m->eflags®_NOTEOL)) ) { 707 flagch = (flagch == BOL) ? BOLEOL : EOL; 708 i += m->g->neol; 709 } 710 if (i != 0) { 711 for (; i > 0; i--) 712 st = step(m->g, startst, stopst, st, flagch, st); 713 SP("boleol", st, c); 714 } 715 716 /* how about a word boundary? */ 717 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 718 (c != OUT && ISWORD(c)) ) { 719 flagch = BOW; 720 } 721 if ( (lastc != OUT && ISWORD(lastc)) && 722 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 723 flagch = EOW; 724 } 725 if (flagch == BOW || flagch == EOW) { 726 st = step(m->g, startst, stopst, st, flagch, st); 727 SP("boweow", st, c); 728 } 729 730 /* are we done? */ 731 if (ISSET(st, stopst) || p == stop) 732 break; /* NOTE BREAK OUT */ 733 734 /* no, we must deal with this character */ 735 ASSIGN(tmp, st); 736 ASSIGN(st, fresh); 737 assert(c != OUT); 738 st = step(m->g, startst, stopst, tmp, c, st); 739 SP("aft", st, c); 740 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 741 p++; 742 } 743 744 assert(coldp != NULL); 745 m->coldp = coldp; 746 if (ISSET(st, stopst)) 747 return(p+1); 748 else 749 return(NULL); 750 } 751 752 /* 753 - slow - step through the string more deliberately 754 */ 755 static char * /* where it ended */ 756 slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst) 757 { 758 states st = m->st; 759 states empty = m->empty; 760 states tmp = m->tmp; 761 char *p = start; 762 int c = (start == m->beginp) ? OUT : *(start-1); 763 int lastc; /* previous c */ 764 int flagch; 765 int i; 766 char *matchp; /* last p at which a match ended */ 767 768 AT("slow", start, stop, startst, stopst); 769 CLEAR(st); 770 SET1(st, startst); 771 SP("sstart", st, *p); 772 st = step(m->g, startst, stopst, st, NOTHING, st); 773 matchp = NULL; 774 for (;;) { 775 /* next character */ 776 lastc = c; 777 c = (p == m->endp) ? OUT : *p; 778 779 /* is there an EOL and/or BOL between lastc and c? */ 780 flagch = '\0'; 781 i = 0; 782 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 783 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 784 flagch = BOL; 785 i = m->g->nbol; 786 } 787 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 788 (c == OUT && !(m->eflags®_NOTEOL)) ) { 789 flagch = (flagch == BOL) ? BOLEOL : EOL; 790 i += m->g->neol; 791 } 792 if (i != 0) { 793 for (; i > 0; i--) 794 st = step(m->g, startst, stopst, st, flagch, st); 795 SP("sboleol", st, c); 796 } 797 798 /* how about a word boundary? */ 799 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 800 (c != OUT && ISWORD(c)) ) { 801 flagch = BOW; 802 } 803 if ( (lastc != OUT && ISWORD(lastc)) && 804 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 805 flagch = EOW; 806 } 807 if (flagch == BOW || flagch == EOW) { 808 st = step(m->g, startst, stopst, st, flagch, st); 809 SP("sboweow", st, c); 810 } 811 812 /* are we done? */ 813 if (ISSET(st, stopst)) 814 matchp = p; 815 if (EQ(st, empty) || p == stop) 816 break; /* NOTE BREAK OUT */ 817 818 /* no, we must deal with this character */ 819 ASSIGN(tmp, st); 820 ASSIGN(st, empty); 821 assert(c != OUT); 822 st = step(m->g, startst, stopst, tmp, c, st); 823 SP("saft", st, c); 824 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 825 p++; 826 } 827 828 return(matchp); 829 } 830 831 832 /* 833 - step - map set of states reachable before char to set reachable after 834 */ 835 static states 836 step(struct re_guts *g, 837 sopno start, /* start state within strip */ 838 sopno stop, /* state after stop state within strip */ 839 states bef, /* states reachable before */ 840 int ch, /* character or NONCHAR code */ 841 states aft) /* states already known reachable after */ 842 { 843 cset *cs; 844 sop s; 845 sopno pc; 846 onestate here; /* note, macros know this name */ 847 sopno look; 848 int i; 849 850 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 851 s = g->strip[pc]; 852 switch (OP(s)) { 853 case OEND: 854 assert(pc == stop-1); 855 break; 856 case OCHAR: 857 /* only characters can match */ 858 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 859 if (ch == (char)OPND(s)) 860 FWD(aft, bef, 1); 861 break; 862 case OBOL: 863 if (ch == BOL || ch == BOLEOL) 864 FWD(aft, bef, 1); 865 break; 866 case OEOL: 867 if (ch == EOL || ch == BOLEOL) 868 FWD(aft, bef, 1); 869 break; 870 case OBOW: 871 if (ch == BOW) 872 FWD(aft, bef, 1); 873 break; 874 case OEOW: 875 if (ch == EOW) 876 FWD(aft, bef, 1); 877 break; 878 case OANY: 879 if (!NONCHAR(ch)) 880 FWD(aft, bef, 1); 881 break; 882 case OANYOF: 883 cs = &g->sets[OPND(s)]; 884 if (!NONCHAR(ch) && CHIN(cs, ch)) 885 FWD(aft, bef, 1); 886 break; 887 case OBACK_: /* ignored here */ 888 case O_BACK: 889 FWD(aft, aft, 1); 890 break; 891 case OPLUS_: /* forward, this is just an empty */ 892 FWD(aft, aft, 1); 893 break; 894 case O_PLUS: /* both forward and back */ 895 FWD(aft, aft, 1); 896 i = ISSETBACK(aft, OPND(s)); 897 BACK(aft, aft, OPND(s)); 898 if (!i && ISSETBACK(aft, OPND(s))) { 899 /* oho, must reconsider loop body */ 900 pc -= OPND(s) + 1; 901 INIT(here, pc); 902 } 903 break; 904 case OQUEST_: /* two branches, both forward */ 905 FWD(aft, aft, 1); 906 FWD(aft, aft, OPND(s)); 907 break; 908 case O_QUEST: /* just an empty */ 909 FWD(aft, aft, 1); 910 break; 911 case OLPAREN: /* not significant here */ 912 case ORPAREN: 913 FWD(aft, aft, 1); 914 break; 915 case OCH_: /* mark the first two branches */ 916 FWD(aft, aft, 1); 917 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 918 FWD(aft, aft, OPND(s)); 919 break; 920 case OOR1: /* done a branch, find the O_CH */ 921 if (ISSTATEIN(aft, here)) { 922 for (look = 1; 923 OP(s = g->strip[pc+look]) != O_CH; 924 look += OPND(s)) 925 assert(OP(s) == OOR2); 926 FWD(aft, aft, look); 927 } 928 break; 929 case OOR2: /* propagate OCH_'s marking */ 930 FWD(aft, aft, 1); 931 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 932 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 933 FWD(aft, aft, OPND(s)); 934 } 935 break; 936 case O_CH: /* just empty */ 937 FWD(aft, aft, 1); 938 break; 939 default: /* ooooops... */ 940 assert(nope); 941 break; 942 } 943 } 944 945 return(aft); 946 } 947 948 #ifdef REDEBUG 949 /* 950 - print - print a set of states 951 */ 952 static void 953 print(struct match *m, char *caption, states st, int ch, FILE *d) 954 { 955 struct re_guts *g = m->g; 956 int i; 957 int first = 1; 958 959 if (!(m->eflags®_TRACE)) 960 return; 961 962 (void)fprintf(d, "%s", caption); 963 if (ch != '\0') 964 (void)fprintf(d, " %s", pchar(ch)); 965 for (i = 0; i < g->nstates; i++) 966 if (ISSET(st, i)) { 967 (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 968 first = 0; 969 } 970 (void)fprintf(d, "\n"); 971 } 972 973 /* 974 - at - print current situation 975 */ 976 static void 977 at(struct match *m, char *title, char *start, char *stop, sopno startst, 978 sopno stopst) 979 { 980 if (!(m->eflags®_TRACE)) 981 return; 982 983 (void)printf("%s %s-", title, pchar(*start)); 984 (void)printf("%s ", pchar(*stop)); 985 (void)printf("%ld-%ld\n", (long)startst, (long)stopst); 986 } 987 988 #ifndef PCHARDONE 989 #define PCHARDONE /* never again */ 990 /* 991 - pchar - make a character printable 992 * 993 * Is this identical to regchar() over in debug.c? Well, yes. But a 994 * duplicate here avoids having a debugging-capable regexec.o tied to 995 * a matching debug.o, and this is convenient. It all disappears in 996 * the non-debug compilation anyway, so it doesn't matter much. 997 */ 998 static char * /* -> representation */ 999 pchar(int ch) 1000 { 1001 static char pbuf[10]; 1002 1003 if (isprint(ch) || ch == ' ') 1004 (void)_snprintf(pbuf, sizeof pbuf, "%c", ch); 1005 else 1006 (void)_snprintf(pbuf, sizeof pbuf, "\\%o", ch); 1007 return(pbuf); 1008 } 1009 #endif 1010 #endif 1011 1012 #undef matcher 1013 #undef fast 1014 #undef slow 1015 #undef dissect 1016 #undef backref 1017 #undef step 1018 #undef print 1019 #undef at 1020 #undef match 1021 #undef nope 1022