1 /*- 2 * This code is derived from OpenBSD's libc/regex, original license follows: 3 * 4 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 5 * Copyright (c) 1992, 1993, 1994 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Henry Spencer. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 36 */ 37 38 #include <sys/types.h> 39 #include <stdio.h> 40 #include <string.h> 41 #include <ctype.h> 42 #include <limits.h> 43 #include <stdlib.h> 44 #include "regex_impl.h" 45 46 #include "regutils.h" 47 #include "regex2.h" 48 49 #include "regcclass.h" 50 #include "regcname.h" 51 52 /* 53 * parse structure, passed up and down to avoid global variables and 54 * other clumsinesses 55 */ 56 struct parse { 57 char *next; /* next character in RE */ 58 char *end; /* end of string (-> NUL normally) */ 59 int error; /* has an error been seen? */ 60 sop *strip; /* malloced strip */ 61 sopno ssize; /* malloced strip size (allocated) */ 62 sopno slen; /* malloced strip length (used) */ 63 int ncsalloc; /* number of csets allocated */ 64 struct re_guts *g; 65 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 66 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 67 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 68 }; 69 70 static void p_ere(struct parse *, int); 71 static void p_ere_exp(struct parse *); 72 static void p_str(struct parse *); 73 static void p_bre(struct parse *, int, int); 74 static int p_simp_re(struct parse *, int); 75 static int p_count(struct parse *); 76 static void p_bracket(struct parse *); 77 static void p_b_term(struct parse *, cset *); 78 static void p_b_cclass(struct parse *, cset *); 79 static void p_b_eclass(struct parse *, cset *); 80 static char p_b_symbol(struct parse *); 81 static char p_b_coll_elem(struct parse *, int); 82 static char othercase(int); 83 static void bothcases(struct parse *, int); 84 static void ordinary(struct parse *, int); 85 static void nonnewline(struct parse *); 86 static void repeat(struct parse *, sopno, int, int); 87 static int seterr(struct parse *, int); 88 static cset *allocset(struct parse *); 89 static void freeset(struct parse *, cset *); 90 static int freezeset(struct parse *, cset *); 91 static int firstch(struct parse *, cset *); 92 static int nch(struct parse *, cset *); 93 static void mcadd(struct parse *, cset *, const char *); 94 static void mcinvert(struct parse *, cset *); 95 static void mccase(struct parse *, cset *); 96 static int isinsets(struct re_guts *, int); 97 static int samesets(struct re_guts *, int, int); 98 static void categorize(struct parse *, struct re_guts *); 99 static sopno dupl(struct parse *, sopno, sopno); 100 static void doemit(struct parse *, sop, size_t); 101 static void doinsert(struct parse *, sop, size_t, sopno); 102 static void dofwd(struct parse *, sopno, sop); 103 static void enlarge(struct parse *, sopno); 104 static void stripsnug(struct parse *, struct re_guts *); 105 static void findmust(struct parse *, struct re_guts *); 106 static sopno pluscount(struct parse *, struct re_guts *); 107 108 static char nuls[10]; /* place to point scanner in event of error */ 109 110 /* 111 * macros for use with parse structure 112 * BEWARE: these know that the parse structure is named `p' !!! 113 */ 114 #define PEEK() (*p->next) 115 #define PEEK2() (*(p->next+1)) 116 #define MORE() (p->next < p->end) 117 #define MORE2() (p->next+1 < p->end) 118 #define SEE(c) (MORE() && PEEK() == (c)) 119 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 120 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 121 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 122 #define NEXT() (p->next++) 123 #define NEXT2() (p->next += 2) 124 #define NEXTn(n) (p->next += (n)) 125 #define GETNEXT() (*p->next++) 126 #define SETERROR(e) seterr(p, (e)) 127 #define REQUIRE(co, e) (void)((co) || SETERROR(e)) 128 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 129 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 130 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 131 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 132 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 133 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 134 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 135 #define HERE() (p->slen) 136 #define THERE() (p->slen - 1) 137 #define THERETHERE() (p->slen - 2) 138 #define DROP(n) (p->slen -= (n)) 139 140 #ifdef _POSIX2_RE_DUP_MAX 141 #define DUPMAX _POSIX2_RE_DUP_MAX 142 #else 143 #define DUPMAX 255 144 #endif 145 #define INFINITY (DUPMAX + 1) 146 147 #ifndef NDEBUG 148 static int never = 0; /* for use in asserts; shuts lint up */ 149 #else 150 #define never 0 /* some <assert.h>s have bugs too */ 151 #endif 152 153 /* 154 - llvm_regcomp - interface for parser and compilation 155 */ 156 int /* 0 success, otherwise REG_something */ 157 llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags) 158 { 159 struct parse pa; 160 struct re_guts *g; 161 struct parse *p = &pa; 162 int i; 163 size_t len; 164 #ifdef REDEBUG 165 # define GOODFLAGS(f) (f) 166 #else 167 # define GOODFLAGS(f) ((f)&~REG_DUMP) 168 #endif 169 170 cflags = GOODFLAGS(cflags); 171 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 172 return(REG_INVARG); 173 174 if (cflags®_PEND) { 175 if (preg->re_endp < pattern) 176 return(REG_INVARG); 177 len = preg->re_endp - pattern; 178 } else 179 len = strlen((const char *)pattern); 180 181 /* do the mallocs early so failure handling is easy */ 182 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 183 (NC-1)*sizeof(cat_t)); 184 if (g == NULL) 185 return(REG_ESPACE); 186 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 187 p->strip = (sop *)calloc(p->ssize, sizeof(sop)); 188 p->slen = 0; 189 if (p->strip == NULL) { 190 free((char *)g); 191 return(REG_ESPACE); 192 } 193 194 /* set things up */ 195 p->g = g; 196 p->next = (char *)pattern; /* convenience; we do not modify it */ 197 p->end = p->next + len; 198 p->error = 0; 199 p->ncsalloc = 0; 200 for (i = 0; i < NPAREN; i++) { 201 p->pbegin[i] = 0; 202 p->pend[i] = 0; 203 } 204 g->csetsize = NC; 205 g->sets = NULL; 206 g->setbits = NULL; 207 g->ncsets = 0; 208 g->cflags = cflags; 209 g->iflags = 0; 210 g->nbol = 0; 211 g->neol = 0; 212 g->must = NULL; 213 g->mlen = 0; 214 g->nsub = 0; 215 g->ncategories = 1; /* category 0 is "everything else" */ 216 g->categories = &g->catspace[-(CHAR_MIN)]; 217 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 218 g->backrefs = 0; 219 220 /* do it */ 221 EMIT(OEND, 0); 222 g->firststate = THERE(); 223 if (cflags®_EXTENDED) 224 p_ere(p, OUT); 225 else if (cflags®_NOSPEC) 226 p_str(p); 227 else 228 p_bre(p, OUT, OUT); 229 EMIT(OEND, 0); 230 g->laststate = THERE(); 231 232 /* tidy up loose ends and fill things in */ 233 categorize(p, g); 234 stripsnug(p, g); 235 findmust(p, g); 236 g->nplus = pluscount(p, g); 237 g->magic = MAGIC2; 238 preg->re_nsub = g->nsub; 239 preg->re_g = g; 240 preg->re_magic = MAGIC1; 241 #ifndef REDEBUG 242 /* not debugging, so can't rely on the assert() in llvm_regexec() */ 243 if (g->iflags®EX_BAD) 244 SETERROR(REG_ASSERT); 245 #endif 246 247 /* win or lose, we're done */ 248 if (p->error != 0) /* lose */ 249 llvm_regfree(preg); 250 return(p->error); 251 } 252 253 /* 254 - p_ere - ERE parser top level, concatenation and alternation 255 */ 256 static void 257 p_ere(struct parse *p, int stop) /* character this ERE should end at */ 258 { 259 char c; 260 sopno prevback = 0; 261 sopno prevfwd = 0; 262 sopno conc; 263 int first = 1; /* is this the first alternative? */ 264 265 for (;;) { 266 /* do a bunch of concatenated expressions */ 267 conc = HERE(); 268 while (MORE() && (c = PEEK()) != '|' && c != stop) 269 p_ere_exp(p); 270 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 271 272 if (!EAT('|')) 273 break; /* NOTE BREAK OUT */ 274 275 if (first) { 276 INSERT(OCH_, conc); /* offset is wrong */ 277 prevfwd = conc; 278 prevback = conc; 279 first = 0; 280 } 281 ASTERN(OOR1, prevback); 282 prevback = THERE(); 283 AHEAD(prevfwd); /* fix previous offset */ 284 prevfwd = HERE(); 285 EMIT(OOR2, 0); /* offset is very wrong */ 286 } 287 288 if (!first) { /* tail-end fixups */ 289 AHEAD(prevfwd); 290 ASTERN(O_CH, prevback); 291 } 292 293 assert(!MORE() || SEE(stop)); 294 } 295 296 /* 297 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 298 */ 299 static void 300 p_ere_exp(struct parse *p) 301 { 302 char c; 303 sopno pos; 304 int count; 305 int count2; 306 sopno subno; 307 int wascaret = 0; 308 309 assert(MORE()); /* caller should have ensured this */ 310 c = GETNEXT(); 311 312 pos = HERE(); 313 switch (c) { 314 case '(': 315 REQUIRE(MORE(), REG_EPAREN); 316 p->g->nsub++; 317 subno = p->g->nsub; 318 if (subno < NPAREN) 319 p->pbegin[subno] = HERE(); 320 EMIT(OLPAREN, subno); 321 if (!SEE(')')) 322 p_ere(p, ')'); 323 if (subno < NPAREN) { 324 p->pend[subno] = HERE(); 325 assert(p->pend[subno] != 0); 326 } 327 EMIT(ORPAREN, subno); 328 MUSTEAT(')', REG_EPAREN); 329 break; 330 #ifndef POSIX_MISTAKE 331 case ')': /* happens only if no current unmatched ( */ 332 /* 333 * You may ask, why the ifndef? Because I didn't notice 334 * this until slightly too late for 1003.2, and none of the 335 * other 1003.2 regular-expression reviewers noticed it at 336 * all. So an unmatched ) is legal POSIX, at least until 337 * we can get it fixed. 338 */ 339 SETERROR(REG_EPAREN); 340 break; 341 #endif 342 case '^': 343 EMIT(OBOL, 0); 344 p->g->iflags |= USEBOL; 345 p->g->nbol++; 346 wascaret = 1; 347 break; 348 case '$': 349 EMIT(OEOL, 0); 350 p->g->iflags |= USEEOL; 351 p->g->neol++; 352 break; 353 case '|': 354 SETERROR(REG_EMPTY); 355 break; 356 case '*': 357 case '+': 358 case '?': 359 SETERROR(REG_BADRPT); 360 break; 361 case '.': 362 if (p->g->cflags®_NEWLINE) 363 nonnewline(p); 364 else 365 EMIT(OANY, 0); 366 break; 367 case '[': 368 p_bracket(p); 369 break; 370 case '\\': 371 REQUIRE(MORE(), REG_EESCAPE); 372 c = GETNEXT(); 373 ordinary(p, c); 374 break; 375 case '{': /* okay as ordinary except if digit follows */ 376 REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 377 /* FALLTHROUGH */ 378 default: 379 ordinary(p, c); 380 break; 381 } 382 383 if (!MORE()) 384 return; 385 c = PEEK(); 386 /* we call { a repetition if followed by a digit */ 387 if (!( c == '*' || c == '+' || c == '?' || 388 (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) 389 return; /* no repetition, we're done */ 390 NEXT(); 391 392 REQUIRE(!wascaret, REG_BADRPT); 393 switch (c) { 394 case '*': /* implemented as +? */ 395 /* this case does not require the (y|) trick, noKLUDGE */ 396 INSERT(OPLUS_, pos); 397 ASTERN(O_PLUS, pos); 398 INSERT(OQUEST_, pos); 399 ASTERN(O_QUEST, pos); 400 break; 401 case '+': 402 INSERT(OPLUS_, pos); 403 ASTERN(O_PLUS, pos); 404 break; 405 case '?': 406 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 407 INSERT(OCH_, pos); /* offset slightly wrong */ 408 ASTERN(OOR1, pos); /* this one's right */ 409 AHEAD(pos); /* fix the OCH_ */ 410 EMIT(OOR2, 0); /* offset very wrong... */ 411 AHEAD(THERE()); /* ...so fix it */ 412 ASTERN(O_CH, THERETHERE()); 413 break; 414 case '{': 415 count = p_count(p); 416 if (EAT(',')) { 417 if (isdigit((uch)PEEK())) { 418 count2 = p_count(p); 419 REQUIRE(count <= count2, REG_BADBR); 420 } else /* single number with comma */ 421 count2 = INFINITY; 422 } else /* just a single number */ 423 count2 = count; 424 repeat(p, pos, count, count2); 425 if (!EAT('}')) { /* error heuristics */ 426 while (MORE() && PEEK() != '}') 427 NEXT(); 428 REQUIRE(MORE(), REG_EBRACE); 429 SETERROR(REG_BADBR); 430 } 431 break; 432 } 433 434 if (!MORE()) 435 return; 436 c = PEEK(); 437 if (!( c == '*' || c == '+' || c == '?' || 438 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 439 return; 440 SETERROR(REG_BADRPT); 441 } 442 443 /* 444 - p_str - string (no metacharacters) "parser" 445 */ 446 static void 447 p_str(struct parse *p) 448 { 449 REQUIRE(MORE(), REG_EMPTY); 450 while (MORE()) 451 ordinary(p, GETNEXT()); 452 } 453 454 /* 455 - p_bre - BRE parser top level, anchoring and concatenation 456 * Giving end1 as OUT essentially eliminates the end1/end2 check. 457 * 458 * This implementation is a bit of a kludge, in that a trailing $ is first 459 * taken as an ordinary character and then revised to be an anchor. The 460 * only undesirable side effect is that '$' gets included as a character 461 * category in such cases. This is fairly harmless; not worth fixing. 462 * The amount of lookahead needed to avoid this kludge is excessive. 463 */ 464 static void 465 p_bre(struct parse *p, 466 int end1, /* first terminating character */ 467 int end2) /* second terminating character */ 468 { 469 sopno start = HERE(); 470 int first = 1; /* first subexpression? */ 471 int wasdollar = 0; 472 473 if (EAT('^')) { 474 EMIT(OBOL, 0); 475 p->g->iflags |= USEBOL; 476 p->g->nbol++; 477 } 478 while (MORE() && !SEETWO(end1, end2)) { 479 wasdollar = p_simp_re(p, first); 480 first = 0; 481 } 482 if (wasdollar) { /* oops, that was a trailing anchor */ 483 DROP(1); 484 EMIT(OEOL, 0); 485 p->g->iflags |= USEEOL; 486 p->g->neol++; 487 } 488 489 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 490 } 491 492 /* 493 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 494 */ 495 static int /* was the simple RE an unbackslashed $? */ 496 p_simp_re(struct parse *p, 497 int starordinary) /* is a leading * an ordinary character? */ 498 { 499 int c; 500 int count; 501 int count2; 502 sopno pos; 503 int i; 504 sopno subno; 505 # define BACKSL (1<<CHAR_BIT) 506 507 pos = HERE(); /* repetion op, if any, covers from here */ 508 509 assert(MORE()); /* caller should have ensured this */ 510 c = GETNEXT(); 511 if (c == '\\') { 512 REQUIRE(MORE(), REG_EESCAPE); 513 c = BACKSL | GETNEXT(); 514 } 515 switch (c) { 516 case '.': 517 if (p->g->cflags®_NEWLINE) 518 nonnewline(p); 519 else 520 EMIT(OANY, 0); 521 break; 522 case '[': 523 p_bracket(p); 524 break; 525 case BACKSL|'{': 526 SETERROR(REG_BADRPT); 527 break; 528 case BACKSL|'(': 529 p->g->nsub++; 530 subno = p->g->nsub; 531 if (subno < NPAREN) 532 p->pbegin[subno] = HERE(); 533 EMIT(OLPAREN, subno); 534 /* the MORE here is an error heuristic */ 535 if (MORE() && !SEETWO('\\', ')')) 536 p_bre(p, '\\', ')'); 537 if (subno < NPAREN) { 538 p->pend[subno] = HERE(); 539 assert(p->pend[subno] != 0); 540 } 541 EMIT(ORPAREN, subno); 542 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 543 break; 544 case BACKSL|')': /* should not get here -- must be user */ 545 case BACKSL|'}': 546 SETERROR(REG_EPAREN); 547 break; 548 case BACKSL|'1': 549 case BACKSL|'2': 550 case BACKSL|'3': 551 case BACKSL|'4': 552 case BACKSL|'5': 553 case BACKSL|'6': 554 case BACKSL|'7': 555 case BACKSL|'8': 556 case BACKSL|'9': 557 i = (c&~BACKSL) - '0'; 558 assert(i < NPAREN); 559 if (p->pend[i] != 0) { 560 assert(i <= p->g->nsub); 561 EMIT(OBACK_, i); 562 assert(p->pbegin[i] != 0); 563 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 564 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 565 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 566 EMIT(O_BACK, i); 567 } else 568 SETERROR(REG_ESUBREG); 569 p->g->backrefs = 1; 570 break; 571 case '*': 572 REQUIRE(starordinary, REG_BADRPT); 573 /* FALLTHROUGH */ 574 default: 575 ordinary(p, (char)c); 576 break; 577 } 578 579 if (EAT('*')) { /* implemented as +? */ 580 /* this case does not require the (y|) trick, noKLUDGE */ 581 INSERT(OPLUS_, pos); 582 ASTERN(O_PLUS, pos); 583 INSERT(OQUEST_, pos); 584 ASTERN(O_QUEST, pos); 585 } else if (EATTWO('\\', '{')) { 586 count = p_count(p); 587 if (EAT(',')) { 588 if (MORE() && isdigit((uch)PEEK())) { 589 count2 = p_count(p); 590 REQUIRE(count <= count2, REG_BADBR); 591 } else /* single number with comma */ 592 count2 = INFINITY; 593 } else /* just a single number */ 594 count2 = count; 595 repeat(p, pos, count, count2); 596 if (!EATTWO('\\', '}')) { /* error heuristics */ 597 while (MORE() && !SEETWO('\\', '}')) 598 NEXT(); 599 REQUIRE(MORE(), REG_EBRACE); 600 SETERROR(REG_BADBR); 601 } 602 } else if (c == '$') /* $ (but not \$) ends it */ 603 return(1); 604 605 return(0); 606 } 607 608 /* 609 - p_count - parse a repetition count 610 */ 611 static int /* the value */ 612 p_count(struct parse *p) 613 { 614 int count = 0; 615 int ndigits = 0; 616 617 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 618 count = count*10 + (GETNEXT() - '0'); 619 ndigits++; 620 } 621 622 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 623 return(count); 624 } 625 626 /* 627 - p_bracket - parse a bracketed character list 628 * 629 * Note a significant property of this code: if the allocset() did SETERROR, 630 * no set operations are done. 631 */ 632 static void 633 p_bracket(struct parse *p) 634 { 635 cset *cs; 636 int invert = 0; 637 638 /* Dept of Truly Sickening Special-Case Kludges */ 639 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 640 EMIT(OBOW, 0); 641 NEXTn(6); 642 return; 643 } 644 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 645 EMIT(OEOW, 0); 646 NEXTn(6); 647 return; 648 } 649 650 if ((cs = allocset(p)) == NULL) { 651 /* allocset did set error status in p */ 652 return; 653 } 654 655 if (EAT('^')) 656 invert++; /* make note to invert set at end */ 657 if (EAT(']')) 658 CHadd(cs, ']'); 659 else if (EAT('-')) 660 CHadd(cs, '-'); 661 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 662 p_b_term(p, cs); 663 if (EAT('-')) 664 CHadd(cs, '-'); 665 MUSTEAT(']', REG_EBRACK); 666 667 if (p->error != 0) { /* don't mess things up further */ 668 freeset(p, cs); 669 return; 670 } 671 672 if (p->g->cflags®_ICASE) { 673 int i; 674 int ci; 675 676 for (i = p->g->csetsize - 1; i >= 0; i--) 677 if (CHIN(cs, i) && isalpha(i)) { 678 ci = othercase(i); 679 if (ci != i) 680 CHadd(cs, ci); 681 } 682 if (cs->multis != NULL) 683 mccase(p, cs); 684 } 685 if (invert) { 686 int i; 687 688 for (i = p->g->csetsize - 1; i >= 0; i--) 689 if (CHIN(cs, i)) 690 CHsub(cs, i); 691 else 692 CHadd(cs, i); 693 if (p->g->cflags®_NEWLINE) 694 CHsub(cs, '\n'); 695 if (cs->multis != NULL) 696 mcinvert(p, cs); 697 } 698 699 assert(cs->multis == NULL); /* xxx */ 700 701 if (nch(p, cs) == 1) { /* optimize singleton sets */ 702 ordinary(p, firstch(p, cs)); 703 freeset(p, cs); 704 } else 705 EMIT(OANYOF, freezeset(p, cs)); 706 } 707 708 /* 709 - p_b_term - parse one term of a bracketed character list 710 */ 711 static void 712 p_b_term(struct parse *p, cset *cs) 713 { 714 char c; 715 char start, finish; 716 int i; 717 718 /* classify what we've got */ 719 switch ((MORE()) ? PEEK() : '\0') { 720 case '[': 721 c = (MORE2()) ? PEEK2() : '\0'; 722 break; 723 case '-': 724 SETERROR(REG_ERANGE); 725 return; /* NOTE RETURN */ 726 break; 727 default: 728 c = '\0'; 729 break; 730 } 731 732 switch (c) { 733 case ':': /* character class */ 734 NEXT2(); 735 REQUIRE(MORE(), REG_EBRACK); 736 c = PEEK(); 737 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 738 p_b_cclass(p, cs); 739 REQUIRE(MORE(), REG_EBRACK); 740 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 741 break; 742 case '=': /* equivalence class */ 743 NEXT2(); 744 REQUIRE(MORE(), REG_EBRACK); 745 c = PEEK(); 746 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 747 p_b_eclass(p, cs); 748 REQUIRE(MORE(), REG_EBRACK); 749 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 750 break; 751 default: /* symbol, ordinary character, or range */ 752 /* xxx revision needed for multichar stuff */ 753 start = p_b_symbol(p); 754 if (SEE('-') && MORE2() && PEEK2() != ']') { 755 /* range */ 756 NEXT(); 757 if (EAT('-')) 758 finish = '-'; 759 else 760 finish = p_b_symbol(p); 761 } else 762 finish = start; 763 /* xxx what about signed chars here... */ 764 REQUIRE(start <= finish, REG_ERANGE); 765 for (i = start; i <= finish; i++) 766 CHadd(cs, i); 767 break; 768 } 769 } 770 771 /* 772 - p_b_cclass - parse a character-class name and deal with it 773 */ 774 static void 775 p_b_cclass(struct parse *p, cset *cs) 776 { 777 char *sp = p->next; 778 struct cclass *cp; 779 size_t len; 780 const char *u; 781 char c; 782 783 while (MORE() && isalpha((uch)PEEK())) 784 NEXT(); 785 len = p->next - sp; 786 for (cp = cclasses; cp->name != NULL; cp++) 787 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 788 break; 789 if (cp->name == NULL) { 790 /* oops, didn't find it */ 791 SETERROR(REG_ECTYPE); 792 return; 793 } 794 795 u = cp->chars; 796 while ((c = *u++) != '\0') 797 CHadd(cs, c); 798 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 799 MCadd(p, cs, u); 800 } 801 802 /* 803 - p_b_eclass - parse an equivalence-class name and deal with it 804 * 805 * This implementation is incomplete. xxx 806 */ 807 static void 808 p_b_eclass(struct parse *p, cset *cs) 809 { 810 char c; 811 812 c = p_b_coll_elem(p, '='); 813 CHadd(cs, c); 814 } 815 816 /* 817 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 818 */ 819 static char /* value of symbol */ 820 p_b_symbol(struct parse *p) 821 { 822 char value; 823 824 REQUIRE(MORE(), REG_EBRACK); 825 if (!EATTWO('[', '.')) 826 return(GETNEXT()); 827 828 /* collating symbol */ 829 value = p_b_coll_elem(p, '.'); 830 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 831 return(value); 832 } 833 834 /* 835 - p_b_coll_elem - parse a collating-element name and look it up 836 */ 837 static char /* value of collating element */ 838 p_b_coll_elem(struct parse *p, 839 int endc) /* name ended by endc,']' */ 840 { 841 char *sp = p->next; 842 struct cname *cp; 843 int len; 844 845 while (MORE() && !SEETWO(endc, ']')) 846 NEXT(); 847 if (!MORE()) { 848 SETERROR(REG_EBRACK); 849 return(0); 850 } 851 len = p->next - sp; 852 for (cp = cnames; cp->name != NULL; cp++) 853 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 854 return(cp->code); /* known name */ 855 if (len == 1) 856 return(*sp); /* single character */ 857 SETERROR(REG_ECOLLATE); /* neither */ 858 return(0); 859 } 860 861 /* 862 - othercase - return the case counterpart of an alphabetic 863 */ 864 static char /* if no counterpart, return ch */ 865 othercase(int ch) 866 { 867 ch = (uch)ch; 868 assert(isalpha(ch)); 869 if (isupper(ch)) 870 return ((uch)tolower(ch)); 871 else if (islower(ch)) 872 return ((uch)toupper(ch)); 873 else /* peculiar, but could happen */ 874 return(ch); 875 } 876 877 /* 878 - bothcases - emit a dualcase version of a two-case character 879 * 880 * Boy, is this implementation ever a kludge... 881 */ 882 static void 883 bothcases(struct parse *p, int ch) 884 { 885 char *oldnext = p->next; 886 char *oldend = p->end; 887 char bracket[3]; 888 889 ch = (uch)ch; 890 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 891 p->next = bracket; 892 p->end = bracket+2; 893 bracket[0] = ch; 894 bracket[1] = ']'; 895 bracket[2] = '\0'; 896 p_bracket(p); 897 assert(p->next == bracket+2); 898 p->next = oldnext; 899 p->end = oldend; 900 } 901 902 /* 903 - ordinary - emit an ordinary character 904 */ 905 static void 906 ordinary(struct parse *p, int ch) 907 { 908 cat_t *cap = p->g->categories; 909 910 if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) 911 bothcases(p, ch); 912 else { 913 EMIT(OCHAR, (uch)ch); 914 if (cap[ch] == 0) 915 cap[ch] = p->g->ncategories++; 916 } 917 } 918 919 /* 920 - nonnewline - emit REG_NEWLINE version of OANY 921 * 922 * Boy, is this implementation ever a kludge... 923 */ 924 static void 925 nonnewline(struct parse *p) 926 { 927 char *oldnext = p->next; 928 char *oldend = p->end; 929 char bracket[4]; 930 931 p->next = bracket; 932 p->end = bracket+3; 933 bracket[0] = '^'; 934 bracket[1] = '\n'; 935 bracket[2] = ']'; 936 bracket[3] = '\0'; 937 p_bracket(p); 938 assert(p->next == bracket+3); 939 p->next = oldnext; 940 p->end = oldend; 941 } 942 943 /* 944 - repeat - generate code for a bounded repetition, recursively if needed 945 */ 946 static void 947 repeat(struct parse *p, 948 sopno start, /* operand from here to end of strip */ 949 int from, /* repeated from this number */ 950 int to) /* to this number of times (maybe INFINITY) */ 951 { 952 sopno finish = HERE(); 953 # define N 2 954 # define INF 3 955 # define REP(f, t) ((f)*8 + (t)) 956 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 957 sopno copy; 958 959 if (p->error != 0) /* head off possible runaway recursion */ 960 return; 961 962 assert(from <= to); 963 964 switch (REP(MAP(from), MAP(to))) { 965 case REP(0, 0): /* must be user doing this */ 966 DROP(finish-start); /* drop the operand */ 967 break; 968 case REP(0, 1): /* as x{1,1}? */ 969 case REP(0, N): /* as x{1,n}? */ 970 case REP(0, INF): /* as x{1,}? */ 971 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 972 INSERT(OCH_, start); /* offset is wrong... */ 973 repeat(p, start+1, 1, to); 974 ASTERN(OOR1, start); 975 AHEAD(start); /* ... fix it */ 976 EMIT(OOR2, 0); 977 AHEAD(THERE()); 978 ASTERN(O_CH, THERETHERE()); 979 break; 980 case REP(1, 1): /* trivial case */ 981 /* done */ 982 break; 983 case REP(1, N): /* as x?x{1,n-1} */ 984 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 985 INSERT(OCH_, start); 986 ASTERN(OOR1, start); 987 AHEAD(start); 988 EMIT(OOR2, 0); /* offset very wrong... */ 989 AHEAD(THERE()); /* ...so fix it */ 990 ASTERN(O_CH, THERETHERE()); 991 copy = dupl(p, start+1, finish+1); 992 assert(copy == finish+4); 993 repeat(p, copy, 1, to-1); 994 break; 995 case REP(1, INF): /* as x+ */ 996 INSERT(OPLUS_, start); 997 ASTERN(O_PLUS, start); 998 break; 999 case REP(N, N): /* as xx{m-1,n-1} */ 1000 copy = dupl(p, start, finish); 1001 repeat(p, copy, from-1, to-1); 1002 break; 1003 case REP(N, INF): /* as xx{n-1,INF} */ 1004 copy = dupl(p, start, finish); 1005 repeat(p, copy, from-1, to); 1006 break; 1007 default: /* "can't happen" */ 1008 SETERROR(REG_ASSERT); /* just in case */ 1009 break; 1010 } 1011 } 1012 1013 /* 1014 - seterr - set an error condition 1015 */ 1016 static int /* useless but makes type checking happy */ 1017 seterr(struct parse *p, int e) 1018 { 1019 if (p->error == 0) /* keep earliest error condition */ 1020 p->error = e; 1021 p->next = nuls; /* try to bring things to a halt */ 1022 p->end = nuls; 1023 return(0); /* make the return value well-defined */ 1024 } 1025 1026 /* 1027 - allocset - allocate a set of characters for [] 1028 */ 1029 static cset * 1030 allocset(struct parse *p) 1031 { 1032 int no = p->g->ncsets++; 1033 size_t nc; 1034 size_t nbytes; 1035 cset *cs; 1036 size_t css = (size_t)p->g->csetsize; 1037 int i; 1038 1039 if (no >= p->ncsalloc) { /* need another column of space */ 1040 void *ptr; 1041 1042 p->ncsalloc += CHAR_BIT; 1043 nc = p->ncsalloc; 1044 assert(nc % CHAR_BIT == 0); 1045 nbytes = nc / CHAR_BIT * css; 1046 1047 ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset)); 1048 if (ptr == NULL) 1049 goto nomem; 1050 p->g->sets = ptr; 1051 1052 ptr = (uch *)realloc((char *)p->g->setbits, nbytes); 1053 if (ptr == NULL) 1054 goto nomem; 1055 p->g->setbits = ptr; 1056 1057 for (i = 0; i < no; i++) 1058 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1059 1060 (void) memset((char *)p->g->setbits + (nbytes - css), 0, css); 1061 } 1062 /* XXX should not happen */ 1063 if (p->g->sets == NULL || p->g->setbits == NULL) 1064 goto nomem; 1065 1066 cs = &p->g->sets[no]; 1067 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1068 cs->mask = 1 << ((no) % CHAR_BIT); 1069 cs->hash = 0; 1070 cs->smultis = 0; 1071 cs->multis = NULL; 1072 1073 return(cs); 1074 nomem: 1075 free(p->g->sets); 1076 p->g->sets = NULL; 1077 free(p->g->setbits); 1078 p->g->setbits = NULL; 1079 1080 SETERROR(REG_ESPACE); 1081 /* caller's responsibility not to do set ops */ 1082 return(NULL); 1083 } 1084 1085 /* 1086 - freeset - free a now-unused set 1087 */ 1088 static void 1089 freeset(struct parse *p, cset *cs) 1090 { 1091 size_t i; 1092 cset *top = &p->g->sets[p->g->ncsets]; 1093 size_t css = (size_t)p->g->csetsize; 1094 1095 for (i = 0; i < css; i++) 1096 CHsub(cs, i); 1097 if (cs == top-1) /* recover only the easy case */ 1098 p->g->ncsets--; 1099 } 1100 1101 /* 1102 - freezeset - final processing on a set of characters 1103 * 1104 * The main task here is merging identical sets. This is usually a waste 1105 * of time (although the hash code minimizes the overhead), but can win 1106 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1107 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1108 * the same value! 1109 */ 1110 static int /* set number */ 1111 freezeset(struct parse *p, cset *cs) 1112 { 1113 uch h = cs->hash; 1114 size_t i; 1115 cset *top = &p->g->sets[p->g->ncsets]; 1116 cset *cs2; 1117 size_t css = (size_t)p->g->csetsize; 1118 1119 /* look for an earlier one which is the same */ 1120 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1121 if (cs2->hash == h && cs2 != cs) { 1122 /* maybe */ 1123 for (i = 0; i < css; i++) 1124 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1125 break; /* no */ 1126 if (i == css) 1127 break; /* yes */ 1128 } 1129 1130 if (cs2 < top) { /* found one */ 1131 freeset(p, cs); 1132 cs = cs2; 1133 } 1134 1135 return((int)(cs - p->g->sets)); 1136 } 1137 1138 /* 1139 - firstch - return first character in a set (which must have at least one) 1140 */ 1141 static int /* character; there is no "none" value */ 1142 firstch(struct parse *p, cset *cs) 1143 { 1144 size_t i; 1145 size_t css = (size_t)p->g->csetsize; 1146 1147 for (i = 0; i < css; i++) 1148 if (CHIN(cs, i)) 1149 return((char)i); 1150 assert(never); 1151 return(0); /* arbitrary */ 1152 } 1153 1154 /* 1155 - nch - number of characters in a set 1156 */ 1157 static int 1158 nch(struct parse *p, cset *cs) 1159 { 1160 size_t i; 1161 size_t css = (size_t)p->g->csetsize; 1162 int n = 0; 1163 1164 for (i = 0; i < css; i++) 1165 if (CHIN(cs, i)) 1166 n++; 1167 return(n); 1168 } 1169 1170 /* 1171 - mcadd - add a collating element to a cset 1172 */ 1173 static void 1174 mcadd( struct parse *p, cset *cs, const char *cp) 1175 { 1176 size_t oldend = cs->smultis; 1177 void *np; 1178 1179 cs->smultis += strlen(cp) + 1; 1180 np = realloc(cs->multis, cs->smultis); 1181 if (np == NULL) { 1182 if (cs->multis) 1183 free(cs->multis); 1184 cs->multis = NULL; 1185 SETERROR(REG_ESPACE); 1186 return; 1187 } 1188 cs->multis = np; 1189 1190 llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1); 1191 } 1192 1193 /* 1194 - mcinvert - invert the list of collating elements in a cset 1195 * 1196 * This would have to know the set of possibilities. Implementation 1197 * is deferred. 1198 */ 1199 /* ARGSUSED */ 1200 static void 1201 mcinvert(struct parse *p, cset *cs) 1202 { 1203 assert(cs->multis == NULL); /* xxx */ 1204 } 1205 1206 /* 1207 - mccase - add case counterparts of the list of collating elements in a cset 1208 * 1209 * This would have to know the set of possibilities. Implementation 1210 * is deferred. 1211 */ 1212 /* ARGSUSED */ 1213 static void 1214 mccase(struct parse *p, cset *cs) 1215 { 1216 assert(cs->multis == NULL); /* xxx */ 1217 } 1218 1219 /* 1220 - isinsets - is this character in any sets? 1221 */ 1222 static int /* predicate */ 1223 isinsets(struct re_guts *g, int c) 1224 { 1225 uch *col; 1226 int i; 1227 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1228 unsigned uc = (uch)c; 1229 1230 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1231 if (col[uc] != 0) 1232 return(1); 1233 return(0); 1234 } 1235 1236 /* 1237 - samesets - are these two characters in exactly the same sets? 1238 */ 1239 static int /* predicate */ 1240 samesets(struct re_guts *g, int c1, int c2) 1241 { 1242 uch *col; 1243 int i; 1244 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1245 unsigned uc1 = (uch)c1; 1246 unsigned uc2 = (uch)c2; 1247 1248 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1249 if (col[uc1] != col[uc2]) 1250 return(0); 1251 return(1); 1252 } 1253 1254 /* 1255 - categorize - sort out character categories 1256 */ 1257 static void 1258 categorize(struct parse *p, struct re_guts *g) 1259 { 1260 cat_t *cats = g->categories; 1261 int c; 1262 int c2; 1263 cat_t cat; 1264 1265 /* avoid making error situations worse */ 1266 if (p->error != 0) 1267 return; 1268 1269 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1270 if (cats[c] == 0 && isinsets(g, c)) { 1271 cat = g->ncategories++; 1272 cats[c] = cat; 1273 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1274 if (cats[c2] == 0 && samesets(g, c, c2)) 1275 cats[c2] = cat; 1276 } 1277 } 1278 1279 /* 1280 - dupl - emit a duplicate of a bunch of sops 1281 */ 1282 static sopno /* start of duplicate */ 1283 dupl(struct parse *p, 1284 sopno start, /* from here */ 1285 sopno finish) /* to this less one */ 1286 { 1287 sopno ret = HERE(); 1288 sopno len = finish - start; 1289 1290 assert(finish >= start); 1291 if (len == 0) 1292 return(ret); 1293 enlarge(p, p->ssize + len); /* this many unexpected additions */ 1294 assert(p->ssize >= p->slen + len); 1295 (void) memmove((char *)(p->strip + p->slen), 1296 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 1297 p->slen += len; 1298 return(ret); 1299 } 1300 1301 /* 1302 - doemit - emit a strip operator 1303 * 1304 * It might seem better to implement this as a macro with a function as 1305 * hard-case backup, but it's just too big and messy unless there are 1306 * some changes to the data structures. Maybe later. 1307 */ 1308 static void 1309 doemit(struct parse *p, sop op, size_t opnd) 1310 { 1311 /* avoid making error situations worse */ 1312 if (p->error != 0) 1313 return; 1314 1315 /* deal with oversize operands ("can't happen", more or less) */ 1316 assert(opnd < 1<<OPSHIFT); 1317 1318 /* deal with undersized strip */ 1319 if (p->slen >= p->ssize) 1320 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 1321 assert(p->slen < p->ssize); 1322 1323 /* finally, it's all reduced to the easy case */ 1324 p->strip[p->slen++] = SOP(op, opnd); 1325 } 1326 1327 /* 1328 - doinsert - insert a sop into the strip 1329 */ 1330 static void 1331 doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1332 { 1333 sopno sn; 1334 sop s; 1335 int i; 1336 1337 /* avoid making error situations worse */ 1338 if (p->error != 0) 1339 return; 1340 1341 sn = HERE(); 1342 EMIT(op, opnd); /* do checks, ensure space */ 1343 assert(HERE() == sn+1); 1344 s = p->strip[sn]; 1345 1346 /* adjust paren pointers */ 1347 assert(pos > 0); 1348 for (i = 1; i < NPAREN; i++) { 1349 if (p->pbegin[i] >= pos) { 1350 p->pbegin[i]++; 1351 } 1352 if (p->pend[i] >= pos) { 1353 p->pend[i]++; 1354 } 1355 } 1356 1357 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 1358 (HERE()-pos-1)*sizeof(sop)); 1359 p->strip[pos] = s; 1360 } 1361 1362 /* 1363 - dofwd - complete a forward reference 1364 */ 1365 static void 1366 dofwd(struct parse *p, sopno pos, sop value) 1367 { 1368 /* avoid making error situations worse */ 1369 if (p->error != 0) 1370 return; 1371 1372 assert(value < 1<<OPSHIFT); 1373 p->strip[pos] = OP(p->strip[pos]) | value; 1374 } 1375 1376 /* 1377 - enlarge - enlarge the strip 1378 */ 1379 static void 1380 enlarge(struct parse *p, sopno size) 1381 { 1382 sop *sp; 1383 1384 if (p->ssize >= size) 1385 return; 1386 1387 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 1388 if (sp == NULL) { 1389 SETERROR(REG_ESPACE); 1390 return; 1391 } 1392 p->strip = sp; 1393 p->ssize = size; 1394 } 1395 1396 /* 1397 - stripsnug - compact the strip 1398 */ 1399 static void 1400 stripsnug(struct parse *p, struct re_guts *g) 1401 { 1402 g->nstates = p->slen; 1403 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 1404 if (g->strip == NULL) { 1405 SETERROR(REG_ESPACE); 1406 g->strip = p->strip; 1407 } 1408 } 1409 1410 /* 1411 - findmust - fill in must and mlen with longest mandatory literal string 1412 * 1413 * This algorithm could do fancy things like analyzing the operands of | 1414 * for common subsequences. Someday. This code is simple and finds most 1415 * of the interesting cases. 1416 * 1417 * Note that must and mlen got initialized during setup. 1418 */ 1419 static void 1420 findmust(struct parse *p, struct re_guts *g) 1421 { 1422 sop *scan; 1423 sop *start = 0; /* start initialized in the default case, after that */ 1424 sop *newstart = 0; /* newstart was initialized in the OCHAR case */ 1425 sopno newlen; 1426 sop s; 1427 char *cp; 1428 sopno i; 1429 1430 /* avoid making error situations worse */ 1431 if (p->error != 0) 1432 return; 1433 1434 /* find the longest OCHAR sequence in strip */ 1435 newlen = 0; 1436 scan = g->strip + 1; 1437 do { 1438 s = *scan++; 1439 switch (OP(s)) { 1440 case OCHAR: /* sequence member */ 1441 if (newlen == 0) /* new sequence */ 1442 newstart = scan - 1; 1443 newlen++; 1444 break; 1445 case OPLUS_: /* things that don't break one */ 1446 case OLPAREN: 1447 case ORPAREN: 1448 break; 1449 case OQUEST_: /* things that must be skipped */ 1450 case OCH_: 1451 scan--; 1452 do { 1453 scan += OPND(s); 1454 s = *scan; 1455 /* assert() interferes w debug printouts */ 1456 if (OP(s) != O_QUEST && OP(s) != O_CH && 1457 OP(s) != OOR2) { 1458 g->iflags |= REGEX_BAD; 1459 return; 1460 } 1461 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1462 /* fallthrough */ 1463 default: /* things that break a sequence */ 1464 if (newlen > g->mlen) { /* ends one */ 1465 start = newstart; 1466 g->mlen = newlen; 1467 } 1468 newlen = 0; 1469 break; 1470 } 1471 } while (OP(s) != OEND); 1472 1473 if (g->mlen == 0) /* there isn't one */ 1474 return; 1475 1476 /* turn it into a character string */ 1477 g->must = malloc((size_t)g->mlen + 1); 1478 if (g->must == NULL) { /* argh; just forget it */ 1479 g->mlen = 0; 1480 return; 1481 } 1482 cp = g->must; 1483 scan = start; 1484 for (i = g->mlen; i > 0; i--) { 1485 while (OP(s = *scan++) != OCHAR) 1486 continue; 1487 assert(cp < g->must + g->mlen); 1488 *cp++ = (char)OPND(s); 1489 } 1490 assert(cp == g->must + g->mlen); 1491 *cp++ = '\0'; /* just on general principles */ 1492 } 1493 1494 /* 1495 - pluscount - count + nesting 1496 */ 1497 static sopno /* nesting depth */ 1498 pluscount(struct parse *p, struct re_guts *g) 1499 { 1500 sop *scan; 1501 sop s; 1502 sopno plusnest = 0; 1503 sopno maxnest = 0; 1504 1505 if (p->error != 0) 1506 return(0); /* there may not be an OEND */ 1507 1508 scan = g->strip + 1; 1509 do { 1510 s = *scan++; 1511 switch (OP(s)) { 1512 case OPLUS_: 1513 plusnest++; 1514 break; 1515 case O_PLUS: 1516 if (plusnest > maxnest) 1517 maxnest = plusnest; 1518 plusnest--; 1519 break; 1520 } 1521 } while (OP(s) != OEND); 1522 if (plusnest != 0) 1523 g->iflags |= REGEX_BAD; 1524 return(maxnest); 1525 } 1526