1 /* Extended regular expression matching and search library, version 2 0.12. (Implements POSIX draft P10003.2/D11.2, except for 3 internationalization features.) 4 5 Copyright (C) 1993, 1994, 1995, 1996 Free Software Foundation, Inc. 6 7 This program is free software; you can redistribute it and/or modify 8 it under the terms of the GNU General Public License as published by 9 the Free Software Foundation; either version 2, or (at your option) 10 any later version. 11 12 This program is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License 18 along with this program; if not, write to the Free Software 19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 20 USA. */ 21 22 /* AIX requires this to be the first thing in the file. */ 23 #if defined (_AIX) && !defined (REGEX_MALLOC) 24 #pragma alloca 25 #endif 26 27 #undef _GNU_SOURCE 28 #define _GNU_SOURCE 29 30 #include "cs_config.h" 31 #include "util/osdep.h" 32 33 #ifdef HAVE_CONFIG_H 34 #include <config.h> 35 #endif 36 37 /* We need this for `regex.h', and perhaps for the Emacs include files. */ 38 #include <sys/types.h> 39 40 /* This is for other GNU distributions with internationalized messages. */ 41 #if HAVE_LIBINTL_H || defined (_LIBC) 42 # include <libintl.h> 43 #else 44 # define gettext(msgid) (msgid) 45 #endif 46 47 #ifndef gettext_noop 48 /* This define is so xgettext can find the internationalizable 49 strings. */ 50 #define gettext_noop(String) String 51 #endif 52 53 /* The `emacs' switch turns on certain matching commands 54 that make sense only in Emacs. */ 55 #ifdef emacs 56 57 #include "lisp.h" 58 #include "buffer.h" 59 #include "syntax.h" 60 61 #else /* not emacs */ 62 63 /* If we are not linking with Emacs proper, 64 we can't use the relocating allocator 65 even if config.h says that we can. */ 66 #undef REL_ALLOC 67 68 #if defined (STDC_HEADERS) || defined (_LIBC) 69 #include <stdlib.h> 70 #else 71 char *malloc (); 72 char *realloc (); 73 #endif 74 75 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow. 76 If nothing else has been done, use the method below. */ 77 #ifdef INHIBIT_STRING_HEADER 78 #if !(defined (HAVE_BZERO) && defined (HAVE_BCOPY)) 79 #if !defined (bzero) && !defined (bcopy) 80 #undef INHIBIT_STRING_HEADER 81 #endif 82 #endif 83 #endif 84 85 /* This is the normal way of making sure we have a bcopy and a bzero. 86 This is used in most programs--a few other programs avoid this 87 by defining INHIBIT_STRING_HEADER. */ 88 #ifndef INHIBIT_STRING_HEADER 89 #if defined (HAVE_STRING_H) || defined (STDC_HEADERS) || defined (_LIBC) 90 #include <string.h> 91 #ifndef bcmp 92 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n)) 93 #endif 94 #ifndef bcopy 95 #define bcopy(s, d, n) memcpy ((d), (s), (n)) 96 #endif 97 #ifndef bzero 98 #define bzero(s, n) memset ((s), 0, (n)) 99 #endif 100 #else 101 #include <strings.h> 102 #endif 103 #endif 104 105 /* Define the syntax stuff for \<, \>, etc. */ 106 107 /* This must be nonzero for the wordchar and notwordchar pattern 108 commands in re_match_2. */ 109 #ifndef Sword 110 #define Sword 1 111 #endif 112 113 #ifdef SWITCH_ENUM_BUG 114 #define SWITCH_ENUM_CAST(x) ((int)(x)) 115 #else 116 #define SWITCH_ENUM_CAST(x) (x) 117 #endif 118 119 #ifdef SYNTAX_TABLE 120 121 extern char *re_syntax_table; 122 123 #else /* not SYNTAX_TABLE */ 124 125 /* How many characters in the character set. */ 126 #define CHAR_SET_SIZE 256 127 128 static char re_syntax_table[CHAR_SET_SIZE]; 129 130 static void 131 init_syntax_once () 132 { 133 register int c; 134 static int done = 0; 135 136 if (done) 137 return; 138 139 bzero (re_syntax_table, sizeof re_syntax_table); 140 141 for (c = 'a'; c <= 'z'; c++) 142 re_syntax_table[c] = Sword; 143 144 for (c = 'A'; c <= 'Z'; c++) 145 re_syntax_table[c] = Sword; 146 147 for (c = '0'; c <= '9'; c++) 148 re_syntax_table[c] = Sword; 149 150 re_syntax_table['_'] = Sword; 151 152 done = 1; 153 } 154 155 #endif /* not SYNTAX_TABLE */ 156 157 #define SYNTAX(c) re_syntax_table[c] 158 159 #endif /* not emacs */ 160 161 /* Get the interface, including the syntax bits. */ 163 #include "regex.h" 164 165 /* isalpha etc. are used for the character classes. */ 166 #include <ctype.h> 167 168 /* Jim Meyering writes: 169 170 "... Some ctype macros are valid only for character codes that 171 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when 172 using /bin/cc or gcc but without giving an ansi option). So, all 173 ctype uses should be through macros like ISPRINT... If 174 STDC_HEADERS is defined, then autoconf has verified that the ctype 175 macros don't need to be guarded with references to isascii. ... 176 Defining IN_CTYPE_DOMAIN to 1 should let any compiler worth its salt 177 eliminate the && through constant folding." */ 178 179 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) 180 #define IN_CTYPE_DOMAIN(c) 1 181 #else 182 #define IN_CTYPE_DOMAIN(c) isascii(c) 183 #endif 184 185 #ifdef isblank 186 #define ISBLANK(c) (IN_CTYPE_DOMAIN (c) && isblank (c)) 187 #else 188 #define ISBLANK(c) ((c) == ' ' || (c) == '\t') 189 #endif 190 #ifdef isgraph 191 #define ISGRAPH(c) (IN_CTYPE_DOMAIN (c) && isgraph (c)) 192 #else 193 #define ISGRAPH(c) (IN_CTYPE_DOMAIN (c) && isprint (c) && !isspace (c)) 194 #endif 195 196 #define ISPRINT(c) (IN_CTYPE_DOMAIN (c) && isprint (c)) 197 #define ISDIGIT(c) (IN_CTYPE_DOMAIN (c) && isdigit (c)) 198 #define ISALNUM(c) (IN_CTYPE_DOMAIN (c) && isalnum (c)) 199 #define ISALPHA(c) (IN_CTYPE_DOMAIN (c) && isalpha (c)) 200 #define ISCNTRL(c) (IN_CTYPE_DOMAIN (c) && iscntrl (c)) 201 #define ISLOWER(c) (IN_CTYPE_DOMAIN (c) && islower (c)) 202 #define ISPUNCT(c) (IN_CTYPE_DOMAIN (c) && ispunct (c)) 203 #define ISSPACE(c) (IN_CTYPE_DOMAIN (c) && isspace (c)) 204 #define ISUPPER(c) (IN_CTYPE_DOMAIN (c) && isupper (c)) 205 #define ISXDIGIT(c) (IN_CTYPE_DOMAIN (c) && isxdigit (c)) 206 207 #ifndef NULL 208 #define NULL (void *)0 209 #endif 210 211 /* We remove any previous definition of `SIGN_EXTEND_CHAR', 212 since ours (we hope) works properly with all combinations of 213 machines, compilers, `char' and `unsigned char' argument types. 214 (Per Bothner suggested the basic approach.) */ 215 #undef SIGN_EXTEND_CHAR 216 #if __STDC__ 217 #define SIGN_EXTEND_CHAR(c) ((signed char) (c)) 218 #else /* not __STDC__ */ 219 /* As in Harbison and Steele. */ 220 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) 221 #endif 222 223 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we 225 use `alloca' instead of `malloc'. This is because using malloc in 226 re_search* or re_match* could cause memory leaks when C-g is used in 227 Emacs; also, malloc is slower and causes storage fragmentation. On 228 the other hand, malloc is more portable, and easier to debug. 229 230 Because we sometimes use alloca, some routines have to be macros, 231 not functions -- `alloca'-allocated space disappears at the end of the 232 function it is called in. */ 233 234 #ifdef REGEX_MALLOC 235 236 #define REGEX_ALLOCATE malloc 237 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) 238 #define REGEX_FREE free 239 240 #else /* not REGEX_MALLOC */ 241 242 /* Emacs already defines alloca, sometimes. */ 243 #ifndef alloca 244 245 /* Make alloca work the best possible way. */ 246 #ifdef __GNUC__ 247 #define alloca __builtin_alloca 248 #else /* not __GNUC__ */ 249 #if HAVE_ALLOCA_H 250 #include <alloca.h> 251 #else /* not __GNUC__ or HAVE_ALLOCA_H */ 252 #if 0 /* It is a bad idea to declare alloca. We always cast the result. */ 253 #ifndef _AIX /* Already did AIX, up at the top. */ 254 char *alloca (); 255 #endif /* not _AIX */ 256 #endif 257 #endif /* not HAVE_ALLOCA_H */ 258 #endif /* not __GNUC__ */ 259 260 #endif /* not alloca */ 261 262 #define REGEX_ALLOCATE alloca 263 264 /* Assumes a `char *destination' variable. */ 265 #define REGEX_REALLOCATE(source, osize, nsize) \ 266 (destination = (char *) alloca (nsize), \ 267 bcopy (source, destination, osize), \ 268 destination) 269 270 /* No need to do anything to free, after alloca. */ 271 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */ 272 273 #endif /* not REGEX_MALLOC */ 274 275 /* Define how to allocate the failure stack. */ 276 277 #if defined (REL_ALLOC) && defined (REGEX_MALLOC) 278 279 #define REGEX_ALLOCATE_STACK(size) \ 280 r_alloc (&failure_stack_ptr, (size)) 281 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \ 282 r_re_alloc (&failure_stack_ptr, (nsize)) 283 #define REGEX_FREE_STACK(ptr) \ 284 r_alloc_free (&failure_stack_ptr) 285 286 #else /* not using relocating allocator */ 287 288 #ifdef REGEX_MALLOC 289 290 #define REGEX_ALLOCATE_STACK malloc 291 #define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize) 292 #define REGEX_FREE_STACK free 293 294 #else /* not REGEX_MALLOC */ 295 296 #define REGEX_ALLOCATE_STACK alloca 297 298 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \ 299 REGEX_REALLOCATE (source, osize, nsize) 300 /* No need to explicitly free anything. */ 301 #define REGEX_FREE_STACK(arg) 302 303 #endif /* not REGEX_MALLOC */ 304 #endif /* not using relocating allocator */ 305 306 307 /* True if `size1' is non-NULL and PTR is pointing anywhere inside 308 `string1' or just past its end. This works if PTR is NULL, which is 309 a good thing. */ 310 #define FIRST_STRING_P(ptr) \ 311 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) 312 313 /* (Re)Allocate N items of type T using malloc, or fail. */ 314 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) 315 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) 316 #define RETALLOC_IF(addr, n, t) \ 317 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t) 318 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) 319 320 #define BYTEWIDTH 8 /* In bits. */ 321 322 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) 323 324 #undef MAX 325 #undef MIN 326 #define MAX(a, b) ((a) > (b) ? (a) : (b)) 327 #define MIN(a, b) ((a) < (b) ? (a) : (b)) 328 329 typedef char boolean; 330 #define false 0 331 #define true 1 332 333 static int re_match_2_internal (); 334 335 /* These are the command codes that appear in compiled regular 337 expressions. Some opcodes are followed by argument bytes. A 338 command code can specify any interpretation whatsoever for its 339 arguments. Zero bytes may appear in the compiled regular expression. */ 340 341 typedef enum 342 { 343 no_op = 0, 344 345 /* Succeed right away--no more backtracking. */ 346 succeed, 347 348 /* Followed by one byte giving n, then by n literal bytes. */ 349 exactn, 350 351 /* Matches any (more or less) character. */ 352 anychar, 353 354 /* Matches any one char belonging to specified set. First 355 following byte is number of bitmap bytes. Then come bytes 356 for a bitmap saying which chars are in. Bits in each byte 357 are ordered low-bit-first. A character is in the set if its 358 bit is 1. A character too large to have a bit in the map is 359 automatically not in the set. */ 360 charset, 361 362 /* Same parameters as charset, but match any character that is 363 not one of those specified. */ 364 charset_not, 365 366 /* Start remembering the text that is matched, for storing in a 367 register. Followed by one byte with the register number, in 368 the range 0 to one less than the pattern buffer's re_nsub 369 field. Then followed by one byte with the number of groups 370 inner to this one. (This last has to be part of the 371 start_memory only because we need it in the on_failure_jump 372 of re_match_2.) */ 373 start_memory, 374 375 /* Stop remembering the text that is matched and store it in a 376 memory register. Followed by one byte with the register 377 number, in the range 0 to one less than `re_nsub' in the 378 pattern buffer, and one byte with the number of inner groups, 379 just like `start_memory'. (We need the number of inner 380 groups here because we don't have any easy way of finding the 381 corresponding start_memory when we're at a stop_memory.) */ 382 stop_memory, 383 384 /* Match a duplicate of something remembered. Followed by one 385 byte containing the register number. */ 386 duplicate, 387 388 /* Fail unless at beginning of line. */ 389 begline, 390 391 /* Fail unless at end of line. */ 392 endline, 393 394 /* Succeeds if at beginning of buffer (if emacs) or at beginning 395 of string to be matched (if not). */ 396 begbuf, 397 398 /* Analogously, for end of buffer/string. */ 399 endbuf, 400 401 /* Followed by two byte relative address to which to jump. */ 402 jump, 403 404 /* Same as jump, but marks the end of an alternative. */ 405 jump_past_alt, 406 407 /* Followed by two-byte relative address of place to resume at 408 in case of failure. */ 409 on_failure_jump, 410 411 /* Like on_failure_jump, but pushes a placeholder instead of the 412 current string position when executed. */ 413 on_failure_keep_string_jump, 414 415 /* Throw away latest failure point and then jump to following 416 two-byte relative address. */ 417 pop_failure_jump, 418 419 /* Change to pop_failure_jump if know won't have to backtrack to 420 match; otherwise change to jump. This is used to jump 421 back to the beginning of a repeat. If what follows this jump 422 clearly won't match what the repeat does, such that we can be 423 sure that there is no use backtracking out of repetitions 424 already matched, then we change it to a pop_failure_jump. 425 Followed by two-byte address. */ 426 maybe_pop_jump, 427 428 /* Jump to following two-byte address, and push a dummy failure 429 point. This failure point will be thrown away if an attempt 430 is made to use it for a failure. A `+' construct makes this 431 before the first repeat. Also used as an intermediary kind 432 of jump when compiling an alternative. */ 433 dummy_failure_jump, 434 435 /* Push a dummy failure point and continue. Used at the end of 436 alternatives. */ 437 push_dummy_failure, 438 439 /* Followed by two-byte relative address and two-byte number n. 440 After matching N times, jump to the address upon failure. */ 441 succeed_n, 442 443 /* Followed by two-byte relative address, and two-byte number n. 444 Jump to the address N times, then fail. */ 445 jump_n, 446 447 /* Set the following two-byte relative address to the 448 subsequent two-byte number. The address *includes* the two 449 bytes of number. */ 450 set_number_at, 451 452 wordchar, /* Matches any word-constituent character. */ 453 notwordchar, /* Matches any char that is not a word-constituent. */ 454 455 wordbeg, /* Succeeds if at word beginning. */ 456 wordend, /* Succeeds if at word end. */ 457 458 wordbound, /* Succeeds if at a word boundary. */ 459 notwordbound /* Succeeds if not at a word boundary. */ 460 461 #ifdef emacs 462 ,before_dot, /* Succeeds if before point. */ 463 at_dot, /* Succeeds if at point. */ 464 after_dot, /* Succeeds if after point. */ 465 466 /* Matches any character whose syntax is specified. Followed by 467 a byte which contains a syntax code, e.g., Sword. */ 468 syntaxspec, 469 470 /* Matches any character whose syntax is not that specified. */ 471 notsyntaxspec 472 #endif /* emacs */ 473 } re_opcode_t; 474 475 /* Common operations on the compiled pattern. */ 477 478 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ 479 480 #define STORE_NUMBER(destination, number) \ 481 do { \ 482 (destination)[0] = (number) & 0377; \ 483 (destination)[1] = (number) >> 8; \ 484 } while (0) 485 486 /* Same as STORE_NUMBER, except increment DESTINATION to 487 the byte after where the number is stored. Therefore, DESTINATION 488 must be an lvalue. */ 489 490 #define STORE_NUMBER_AND_INCR(destination, number) \ 491 do { \ 492 STORE_NUMBER (destination, number); \ 493 (destination) += 2; \ 494 } while (0) 495 496 /* Put into DESTINATION a number stored in two contiguous bytes starting 497 at SOURCE. */ 498 499 #define EXTRACT_NUMBER(destination, source) \ 500 do { \ 501 (destination) = *(source) & 0377; \ 502 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ 503 } while (0) 504 505 #ifdef DEBUG 506 static void 507 extract_number (dest, source) 508 int *dest; 509 unsigned char *source; 510 { 511 int temp = SIGN_EXTEND_CHAR (*(source + 1)); 512 *dest = *source & 0377; 513 *dest += temp << 8; 514 } 515 516 #ifndef EXTRACT_MACROS /* To debug the macros. */ 517 #undef EXTRACT_NUMBER 518 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) 519 #endif /* not EXTRACT_MACROS */ 520 521 #endif /* DEBUG */ 522 523 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. 524 SOURCE must be an lvalue. */ 525 526 #define EXTRACT_NUMBER_AND_INCR(destination, source) \ 527 do { \ 528 EXTRACT_NUMBER (destination, source); \ 529 (source) += 2; \ 530 } while (0) 531 532 #ifdef DEBUG 533 static void 534 extract_number_and_incr (destination, source) 535 int *destination; 536 unsigned char **source; 537 { 538 extract_number (destination, *source); 539 *source += 2; 540 } 541 542 #ifndef EXTRACT_MACROS 543 #undef EXTRACT_NUMBER_AND_INCR 544 #define EXTRACT_NUMBER_AND_INCR(dest, src) \ 545 extract_number_and_incr (&dest, &src) 546 #endif /* not EXTRACT_MACROS */ 547 548 #endif /* DEBUG */ 549 550 /* If DEBUG is defined, Regex prints many voluminous messages about what 552 it is doing (if the variable `debug' is nonzero). If linked with the 553 main program in `iregex.c', you can enter patterns and strings 554 interactively. And if linked with the main program in `main.c' and 555 the other test files, you can run the already-written tests. */ 556 557 #ifdef DEBUG 558 559 /* We use standard I/O for debugging. */ 560 #include <stdio.h> 561 562 /* It is useful to test things that ``must'' be true when debugging. */ 563 #include <assert.h> 564 565 static int debug = 0; 566 567 #define DEBUG_STATEMENT(e) e 568 #define DEBUG_PRINT1(x) if (debug) printf (x) 569 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) 570 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) 571 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) 572 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ 573 if (debug) print_partial_compiled_pattern (s, e) 574 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ 575 if (debug) print_double_string (w, s1, sz1, s2, sz2) 576 577 578 /* Print the fastmap in human-readable form. */ 579 580 void 581 print_fastmap (fastmap) 582 char *fastmap; 583 { 584 unsigned was_a_range = 0; 585 unsigned i = 0; 586 587 while (i < (1 << BYTEWIDTH)) 588 { 589 if (fastmap[i++]) 590 { 591 was_a_range = 0; 592 putchar (i - 1); 593 while (i < (1 << BYTEWIDTH) && fastmap[i]) 594 { 595 was_a_range = 1; 596 i++; 597 } 598 if (was_a_range) 599 { 600 printf ("-"); 601 putchar (i - 1); 602 } 603 } 604 } 605 putchar ('\n'); 606 } 607 608 609 /* Print a compiled pattern string in human-readable form, starting at 610 the START pointer into it and ending just before the pointer END. */ 611 612 void 613 print_partial_compiled_pattern (start, end) 614 unsigned char *start; 615 unsigned char *end; 616 { 617 int mcnt, mcnt2; 618 unsigned char *p = start; 619 unsigned char *pend = end; 620 621 if (start == NULL) 622 { 623 printf ("(null)\n"); 624 return; 625 } 626 627 /* Loop over pattern commands. */ 628 while (p < pend) 629 { 630 printf ("%d:\t", p - start); 631 632 switch ((re_opcode_t) *p++) 633 { 634 case no_op: 635 printf ("/no_op"); 636 break; 637 638 case exactn: 639 mcnt = *p++; 640 printf ("/exactn/%d", mcnt); 641 do 642 { 643 putchar ('/'); 644 putchar (*p++); 645 } 646 while (--mcnt); 647 break; 648 649 case start_memory: 650 mcnt = *p++; 651 printf ("/start_memory/%d/%d", mcnt, *p++); 652 break; 653 654 case stop_memory: 655 mcnt = *p++; 656 printf ("/stop_memory/%d/%d", mcnt, *p++); 657 break; 658 659 case duplicate: 660 printf ("/duplicate/%d", *p++); 661 break; 662 663 case anychar: 664 printf ("/anychar"); 665 break; 666 667 case charset: 668 case charset_not: 669 { 670 register int c, last = -100; 671 register int in_range = 0; 672 673 printf ("/charset [%s", 674 (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); 675 676 assert (p + *p < pend); 677 678 for (c = 0; c < 256; c++) 679 if (c / 8 < *p 680 && (p[1 + (c/8)] & (1 << (c % 8)))) 681 { 682 /* Are we starting a range? */ 683 if (last + 1 == c && ! in_range) 684 { 685 putchar ('-'); 686 in_range = 1; 687 } 688 /* Have we broken a range? */ 689 else if (last + 1 != c && in_range) 690 { 691 putchar (last); 692 in_range = 0; 693 } 694 695 if (! in_range) 696 putchar (c); 697 698 last = c; 699 } 700 701 if (in_range) 702 putchar (last); 703 704 putchar (']'); 705 706 p += 1 + *p; 707 } 708 break; 709 710 case begline: 711 printf ("/begline"); 712 break; 713 714 case endline: 715 printf ("/endline"); 716 break; 717 718 case on_failure_jump: 719 extract_number_and_incr (&mcnt, &p); 720 printf ("/on_failure_jump to %d", p + mcnt - start); 721 break; 722 723 case on_failure_keep_string_jump: 724 extract_number_and_incr (&mcnt, &p); 725 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start); 726 break; 727 728 case dummy_failure_jump: 729 extract_number_and_incr (&mcnt, &p); 730 printf ("/dummy_failure_jump to %d", p + mcnt - start); 731 break; 732 733 case push_dummy_failure: 734 printf ("/push_dummy_failure"); 735 break; 736 737 case maybe_pop_jump: 738 extract_number_and_incr (&mcnt, &p); 739 printf ("/maybe_pop_jump to %d", p + mcnt - start); 740 break; 741 742 case pop_failure_jump: 743 extract_number_and_incr (&mcnt, &p); 744 printf ("/pop_failure_jump to %d", p + mcnt - start); 745 break; 746 747 case jump_past_alt: 748 extract_number_and_incr (&mcnt, &p); 749 printf ("/jump_past_alt to %d", p + mcnt - start); 750 break; 751 752 case jump: 753 extract_number_and_incr (&mcnt, &p); 754 printf ("/jump to %d", p + mcnt - start); 755 break; 756 757 case succeed_n: 758 extract_number_and_incr (&mcnt, &p); 759 extract_number_and_incr (&mcnt2, &p); 760 printf ("/succeed_n to %d, %d times", p + mcnt - start, mcnt2); 761 break; 762 763 case jump_n: 764 extract_number_and_incr (&mcnt, &p); 765 extract_number_and_incr (&mcnt2, &p); 766 printf ("/jump_n to %d, %d times", p + mcnt - start, mcnt2); 767 break; 768 769 case set_number_at: 770 extract_number_and_incr (&mcnt, &p); 771 extract_number_and_incr (&mcnt2, &p); 772 printf ("/set_number_at location %d to %d", p + mcnt - start, mcnt2); 773 break; 774 775 case wordbound: 776 printf ("/wordbound"); 777 break; 778 779 case notwordbound: 780 printf ("/notwordbound"); 781 break; 782 783 case wordbeg: 784 printf ("/wordbeg"); 785 break; 786 787 case wordend: 788 printf ("/wordend"); 789 790 #ifdef emacs 791 case before_dot: 792 printf ("/before_dot"); 793 break; 794 795 case at_dot: 796 printf ("/at_dot"); 797 break; 798 799 case after_dot: 800 printf ("/after_dot"); 801 break; 802 803 case syntaxspec: 804 printf ("/syntaxspec"); 805 mcnt = *p++; 806 printf ("/%d", mcnt); 807 break; 808 809 case notsyntaxspec: 810 printf ("/notsyntaxspec"); 811 mcnt = *p++; 812 printf ("/%d", mcnt); 813 break; 814 #endif /* emacs */ 815 816 case wordchar: 817 printf ("/wordchar"); 818 break; 819 820 case notwordchar: 821 printf ("/notwordchar"); 822 break; 823 824 case begbuf: 825 printf ("/begbuf"); 826 break; 827 828 case endbuf: 829 printf ("/endbuf"); 830 break; 831 832 default: 833 printf ("?%d", *(p-1)); 834 } 835 836 putchar ('\n'); 837 } 838 839 printf ("%d:\tend of pattern.\n", p - start); 840 } 841 842 843 void 844 print_compiled_pattern (bufp) 845 struct re_pattern_buffer *bufp; 846 { 847 unsigned char *buffer = bufp->buffer; 848 849 print_partial_compiled_pattern (buffer, buffer + bufp->used); 850 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated); 851 852 if (bufp->fastmap_accurate && bufp->fastmap) 853 { 854 printf ("fastmap: "); 855 print_fastmap (bufp->fastmap); 856 } 857 858 printf ("re_nsub: %d\t", bufp->re_nsub); 859 printf ("regs_alloc: %d\t", bufp->regs_allocated); 860 printf ("can_be_null: %d\t", bufp->can_be_null); 861 printf ("newline_anchor: %d\n", bufp->newline_anchor); 862 printf ("no_sub: %d\t", bufp->no_sub); 863 printf ("not_bol: %d\t", bufp->not_bol); 864 printf ("not_eol: %d\t", bufp->not_eol); 865 printf ("syntax: %d\n", bufp->syntax); 866 /* Perhaps we should print the translate table? */ 867 } 868 869 870 void 871 print_double_string (where, string1, size1, string2, size2) 872 const char *where; 873 const char *string1; 874 const char *string2; 875 int size1; 876 int size2; 877 { 878 unsigned this_char; 879 880 if (where == NULL) 881 printf ("(null)"); 882 else 883 { 884 if (FIRST_STRING_P (where)) 885 { 886 for (this_char = where - string1; this_char < size1; this_char++) 887 putchar (string1[this_char]); 888 889 where = string2; 890 } 891 892 for (this_char = where - string2; this_char < size2; this_char++) 893 putchar (string2[this_char]); 894 } 895 } 896 897 #else /* not DEBUG */ 898 899 #undef assert 900 #define assert(e) 901 902 #define DEBUG_STATEMENT(e) 903 #define DEBUG_PRINT1(x) 904 #define DEBUG_PRINT2(x1, x2) 905 #define DEBUG_PRINT3(x1, x2, x3) 906 #define DEBUG_PRINT4(x1, x2, x3, x4) 907 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) 908 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) 909 910 #endif /* not DEBUG */ 911 912 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 914 also be assigned to arbitrarily: each pattern buffer stores its own 915 syntax, so it can be changed between regex compilations. */ 916 /* This has no initializer because initialized variables in Emacs 917 become read-only after dumping. */ 918 reg_syntax_t re_syntax_options; 919 920 921 /* Specify the precise syntax of regexps for compilation. This provides 922 for compatibility for various utilities which historically have 923 different, incompatible syntaxes. 924 925 The argument SYNTAX is a bit mask comprised of the various bits 926 defined in regex.h. We return the old syntax. */ 927 928 reg_syntax_t 929 re_set_syntax (syntax) 930 reg_syntax_t syntax; 931 { 932 reg_syntax_t ret = re_syntax_options; 933 934 re_syntax_options = syntax; 935 return ret; 936 } 937 938 /* This table gives an error message for each of the error codes listed 940 in regex.h. Obviously the order here has to be same as there. 941 POSIX doesn't require that we do anything for REG_NOERROR, 942 but why not be nice? */ 943 944 static const char *re_error_msgid[] = 945 { 946 gettext_noop ("Success"), /* REG_NOERROR */ 947 gettext_noop ("No match"), /* REG_NOMATCH */ 948 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */ 949 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */ 950 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */ 951 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */ 952 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */ 953 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */ 954 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */ 955 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */ 956 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */ 957 gettext_noop ("Invalid range end"), /* REG_ERANGE */ 958 gettext_noop ("Memory exhausted"), /* REG_ESPACE */ 959 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */ 960 gettext_noop ("Premature end of regular expression"), /* REG_EEND */ 961 gettext_noop ("Regular expression too big"), /* REG_ESIZE */ 962 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */ 963 }; 964 965 /* Avoiding alloca during matching, to placate r_alloc. */ 967 968 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the 969 searching and matching functions should not call alloca. On some 970 systems, alloca is implemented in terms of malloc, and if we're 971 using the relocating allocator routines, then malloc could cause a 972 relocation, which might (if the strings being searched are in the 973 ralloc heap) shift the data out from underneath the regexp 974 routines. 975 976 Here's another reason to avoid allocation: Emacs 977 processes input from X in a signal handler; processing X input may 978 call malloc; if input arrives while a matching routine is calling 979 malloc, then we're scrod. But Emacs can't just block input while 980 calling matching routines; then we don't notice interrupts when 981 they come in. So, Emacs blocks input around all regexp calls 982 except the matching calls, which it leaves unprotected, in the 983 faith that they will not malloc. */ 984 985 /* Normally, this is fine. */ 986 #define MATCH_MAY_ALLOCATE 987 988 /* When using GNU C, we are not REALLY using the C alloca, no matter 989 what config.h may say. So don't take precautions for it. */ 990 #ifdef __GNUC__ 991 #undef C_ALLOCA 992 #endif 993 994 /* The match routines may not allocate if (1) they would do it with malloc 995 and (2) it's not safe for them to use malloc. 996 Note that if REL_ALLOC is defined, matching would not use malloc for the 997 failure stack, but we would still use it for the register vectors; 998 so REL_ALLOC should not affect this. */ 999 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs) 1000 #undef MATCH_MAY_ALLOCATE 1001 #endif 1002 1003 1004 /* Failure stack declarations and macros; both re_compile_fastmap and 1006 re_match_2 use a failure stack. These have to be macros because of 1007 REGEX_ALLOCATE_STACK. */ 1008 1009 1010 /* Number of failure points for which to initially allocate space 1011 when matching. If this number is exceeded, we allocate more 1012 space, so it is not a hard limit. */ 1013 #ifndef INIT_FAILURE_ALLOC 1014 #define INIT_FAILURE_ALLOC 5 1015 #endif 1016 1017 /* Roughly the maximum number of failure points on the stack. Would be 1018 exactly that if always used MAX_FAILURE_ITEMS items each time we failed. 1019 This is a variable only so users of regex can assign to it; we never 1020 change it ourselves. */ 1021 #if defined (MATCH_MAY_ALLOCATE) 1022 /* 4400 was enough to cause a crash on Alpha OSF/1, 1023 whose default stack limit is 2mb. */ 1024 int re_max_failures = 20000; 1025 #else 1026 int re_max_failures = 2000; 1027 #endif 1028 1029 union fail_stack_elt 1030 { 1031 unsigned char *pointer; 1032 int integer; 1033 }; 1034 1035 typedef union fail_stack_elt fail_stack_elt_t; 1036 1037 typedef struct 1038 { 1039 fail_stack_elt_t *stack; 1040 unsigned size; 1041 unsigned avail; /* Offset of next open position. */ 1042 } fail_stack_type; 1043 1044 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0) 1045 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) 1046 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) 1047 1048 1049 /* Define macros to initialize and free the failure stack. 1050 Do `return -2' if the alloc fails. */ 1051 1052 #ifdef MATCH_MAY_ALLOCATE 1053 #define INIT_FAIL_STACK() \ 1054 do { \ 1055 fail_stack.stack = (fail_stack_elt_t *) \ 1056 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \ 1057 \ 1058 if (fail_stack.stack == NULL) \ 1059 return -2; \ 1060 \ 1061 fail_stack.size = INIT_FAILURE_ALLOC; \ 1062 fail_stack.avail = 0; \ 1063 } while (0) 1064 1065 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack) 1066 #else 1067 #define INIT_FAIL_STACK() \ 1068 do { \ 1069 fail_stack.avail = 0; \ 1070 } while (0) 1071 1072 #define RESET_FAIL_STACK() 1073 #endif 1074 1075 1076 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items. 1077 1078 Return 1 if succeeds, and 0 if either ran out of memory 1079 allocating space for it or it was already too large. 1080 1081 REGEX_REALLOCATE_STACK requires `destination' be declared. */ 1082 1083 #define DOUBLE_FAIL_STACK(fail_stack) \ 1084 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \ 1085 ? 0 \ 1086 : ((fail_stack).stack = (fail_stack_elt_t *) \ 1087 REGEX_REALLOCATE_STACK ((fail_stack).stack, \ 1088 (fail_stack).size * sizeof (fail_stack_elt_t), \ 1089 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ 1090 \ 1091 (fail_stack).stack == NULL \ 1092 ? 0 \ 1093 : ((fail_stack).size <<= 1, \ 1094 1))) 1095 1096 1097 /* Push pointer POINTER on FAIL_STACK. 1098 Return 1 if was able to do so and 0 if ran out of memory allocating 1099 space to do so. */ 1100 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \ 1101 ((FAIL_STACK_FULL () \ 1102 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \ 1103 ? 0 \ 1104 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \ 1105 1)) 1106 1107 /* Push a pointer value onto the failure stack. 1108 Assumes the variable `fail_stack'. Probably should only 1109 be called from within `PUSH_FAILURE_POINT'. */ 1110 #define PUSH_FAILURE_POINTER(item) \ 1111 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item) 1112 1113 /* This pushes an integer-valued item onto the failure stack. 1114 Assumes the variable `fail_stack'. Probably should only 1115 be called from within `PUSH_FAILURE_POINT'. */ 1116 #define PUSH_FAILURE_INT(item) \ 1117 fail_stack.stack[fail_stack.avail++].integer = (item) 1118 1119 /* Push a fail_stack_elt_t value onto the failure stack. 1120 Assumes the variable `fail_stack'. Probably should only 1121 be called from within `PUSH_FAILURE_POINT'. */ 1122 #define PUSH_FAILURE_ELT(item) \ 1123 fail_stack.stack[fail_stack.avail++] = (item) 1124 1125 /* These three POP... operations complement the three PUSH... operations. 1126 All assume that `fail_stack' is nonempty. */ 1127 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer 1128 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer 1129 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail] 1130 1131 /* Used to omit pushing failure point id's when we're not debugging. */ 1132 #ifdef DEBUG 1133 #define DEBUG_PUSH PUSH_FAILURE_INT 1134 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT () 1135 #else 1136 #define DEBUG_PUSH(item) 1137 #define DEBUG_POP(item_addr) 1138 #endif 1139 1140 1141 /* Push the information about the state we will need 1142 if we ever fail back to it. 1143 1144 Requires variables fail_stack, regstart, regend, reg_info, and 1145 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be 1146 declared. 1147 1148 Does `return FAILURE_CODE' if runs out of memory. */ 1149 1150 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \ 1151 do { \ 1152 char *destination; \ 1153 /* Must be int, so when we don't save any registers, the arithmetic \ 1154 of 0 + -1 isn't done as unsigned. */ \ 1155 int this_reg; \ 1156 \ 1157 DEBUG_STATEMENT (failure_id++); \ 1158 DEBUG_STATEMENT (nfailure_points_pushed++); \ 1159 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \ 1160 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\ 1161 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\ 1162 \ 1163 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \ 1164 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \ 1165 \ 1166 /* Ensure we have enough space allocated for what we will push. */ \ 1167 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \ 1168 { \ 1169 if (!DOUBLE_FAIL_STACK (fail_stack)) \ 1170 return failure_code; \ 1171 \ 1172 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \ 1173 (fail_stack).size); \ 1174 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\ 1175 } \ 1176 \ 1177 /* Push the info, starting with the registers. */ \ 1178 DEBUG_PRINT1 ("\n"); \ 1179 \ 1180 if (1) \ 1181 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \ 1182 this_reg++) \ 1183 { \ 1184 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \ 1185 DEBUG_STATEMENT (num_regs_pushed++); \ 1186 \ 1187 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ 1188 PUSH_FAILURE_POINTER (regstart[this_reg]); \ 1189 \ 1190 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ 1191 PUSH_FAILURE_POINTER (regend[this_reg]); \ 1192 \ 1193 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \ 1194 DEBUG_PRINT2 (" match_null=%d", \ 1195 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \ 1196 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \ 1197 DEBUG_PRINT2 (" matched_something=%d", \ 1198 MATCHED_SOMETHING (reg_info[this_reg])); \ 1199 DEBUG_PRINT2 (" ever_matched=%d", \ 1200 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \ 1201 DEBUG_PRINT1 ("\n"); \ 1202 PUSH_FAILURE_ELT (reg_info[this_reg].word); \ 1203 } \ 1204 \ 1205 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\ 1206 PUSH_FAILURE_INT (lowest_active_reg); \ 1207 \ 1208 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\ 1209 PUSH_FAILURE_INT (highest_active_reg); \ 1210 \ 1211 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \ 1212 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \ 1213 PUSH_FAILURE_POINTER (pattern_place); \ 1214 \ 1215 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \ 1216 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \ 1217 size2); \ 1218 DEBUG_PRINT1 ("'\n"); \ 1219 PUSH_FAILURE_POINTER (string_place); \ 1220 \ 1221 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \ 1222 DEBUG_PUSH (failure_id); \ 1223 } while (0) 1224 1225 /* This is the number of items that are pushed and popped on the stack 1226 for each register. */ 1227 #define NUM_REG_ITEMS 3 1228 1229 /* Individual items aside from the registers. */ 1230 #ifdef DEBUG 1231 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ 1232 #else 1233 #define NUM_NONREG_ITEMS 4 1234 #endif 1235 1236 /* We push at most this many items on the stack. */ 1237 /* We used to use (num_regs - 1), which is the number of registers 1238 this regexp will save; but that was changed to 5 1239 to avoid stack overflow for a regexp with lots of parens. */ 1240 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS) 1241 1242 /* We actually push this many items. */ 1243 #define NUM_FAILURE_ITEMS \ 1244 (((0 \ 1245 ? 0 : highest_active_reg - lowest_active_reg + 1) \ 1246 * NUM_REG_ITEMS) \ 1247 + NUM_NONREG_ITEMS) 1248 1249 /* How many items can still be added to the stack without overflowing it. */ 1250 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) 1251 1252 1253 /* Pops what PUSH_FAIL_STACK pushes. 1254 1255 We restore into the parameters, all of which should be lvalues: 1256 STR -- the saved data position. 1257 PAT -- the saved pattern position. 1258 LOW_REG, HIGH_REG -- the highest and lowest active registers. 1259 REGSTART, REGEND -- arrays of string positions. 1260 REG_INFO -- array of information about each subexpression. 1261 1262 Also assumes the variables `fail_stack' and (if debugging), `bufp', 1263 `pend', `string1', `size1', `string2', and `size2'. */ 1264 1265 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\ 1266 { \ 1267 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \ 1268 int this_reg; \ 1269 const unsigned char *string_temp; \ 1270 \ 1271 assert (!FAIL_STACK_EMPTY ()); \ 1272 \ 1273 /* Remove failure points and point to how many regs pushed. */ \ 1274 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \ 1275 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \ 1276 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \ 1277 \ 1278 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \ 1279 \ 1280 DEBUG_POP (&failure_id); \ 1281 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \ 1282 \ 1283 /* If the saved string location is NULL, it came from an \ 1284 on_failure_keep_string_jump opcode, and we want to throw away the \ 1285 saved NULL, thus retaining our current position in the string. */ \ 1286 string_temp = POP_FAILURE_POINTER (); \ 1287 if (string_temp != NULL) \ 1288 str = (const char *) string_temp; \ 1289 \ 1290 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \ 1291 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ 1292 DEBUG_PRINT1 ("'\n"); \ 1293 \ 1294 pat = (unsigned char *) POP_FAILURE_POINTER (); \ 1295 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \ 1296 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ 1297 \ 1298 /* Restore register info. */ \ 1299 high_reg = (unsigned) POP_FAILURE_INT (); \ 1300 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \ 1301 \ 1302 low_reg = (unsigned) POP_FAILURE_INT (); \ 1303 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \ 1304 \ 1305 if (1) \ 1306 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \ 1307 { \ 1308 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \ 1309 \ 1310 reg_info[this_reg].word = POP_FAILURE_ELT (); \ 1311 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \ 1312 \ 1313 regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \ 1314 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ 1315 \ 1316 regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \ 1317 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ 1318 } \ 1319 else \ 1320 { \ 1321 for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \ 1322 { \ 1323 reg_info[this_reg].word.integer = 0; \ 1324 regend[this_reg] = 0; \ 1325 regstart[this_reg] = 0; \ 1326 } \ 1327 highest_active_reg = high_reg; \ 1328 } \ 1329 \ 1330 set_regs_matched_done = 0; \ 1331 DEBUG_STATEMENT (nfailure_points_popped++); \ 1332 } /* POP_FAILURE_POINT */ 1333 1334 1335 1336 /* Structure for per-register (a.k.a. per-group) information. 1338 Other register information, such as the 1339 starting and ending positions (which are addresses), and the list of 1340 inner groups (which is a bits list) are maintained in separate 1341 variables. 1342 1343 We are making a (strictly speaking) nonportable assumption here: that 1344 the compiler will pack our bit fields into something that fits into 1345 the type of `word', i.e., is something that fits into one item on the 1346 failure stack. */ 1347 1348 typedef union 1349 { 1350 fail_stack_elt_t word; 1351 struct 1352 { 1353 /* This field is one if this group can match the empty string, 1354 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ 1355 #define MATCH_NULL_UNSET_VALUE 3 1356 unsigned match_null_string_p : 2; 1357 unsigned is_active : 1; 1358 unsigned matched_something : 1; 1359 unsigned ever_matched_something : 1; 1360 } bits; 1361 } register_info_type; 1362 1363 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) 1364 #define IS_ACTIVE(R) ((R).bits.is_active) 1365 #define MATCHED_SOMETHING(R) ((R).bits.matched_something) 1366 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) 1367 1368 1369 /* Call this when have matched a real character; it sets `matched' flags 1370 for the subexpressions which we are currently inside. Also records 1371 that those subexprs have matched. */ 1372 #define SET_REGS_MATCHED() \ 1373 do \ 1374 { \ 1375 if (!set_regs_matched_done) \ 1376 { \ 1377 unsigned r; \ 1378 set_regs_matched_done = 1; \ 1379 for (r = lowest_active_reg; r <= highest_active_reg; r++) \ 1380 { \ 1381 MATCHED_SOMETHING (reg_info[r]) \ 1382 = EVER_MATCHED_SOMETHING (reg_info[r]) \ 1383 = 1; \ 1384 } \ 1385 } \ 1386 } \ 1387 while (0) 1388 1389 /* Registers are set to a sentinel when they haven't yet matched. */ 1390 static char reg_unset_dummy; 1391 #define REG_UNSET_VALUE (®_unset_dummy) 1392 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE) 1393 1394 /* Subroutine declarations and macros for regex_compile. */ 1396 1397 static void store_op1 (), store_op2 (); 1398 static void insert_op1 (), insert_op2 (); 1399 static boolean at_begline_loc_p (), at_endline_loc_p (); 1400 static boolean group_in_compile_stack (); 1401 static reg_errcode_t compile_range (); 1402 1403 /* Fetch the next character in the uncompiled pattern---translating it 1404 if necessary. Also cast from a signed character in the constant 1405 string passed to us by the user to an unsigned char that we can use 1406 as an array index (in, e.g., `translate'). */ 1407 #ifndef PATFETCH 1408 #define PATFETCH(c) \ 1409 do {if (p == pend) return REG_EEND; \ 1410 c = (unsigned char) *p++; \ 1411 if (translate) c = (unsigned char) translate[c]; \ 1412 } while (0) 1413 #endif 1414 1415 /* Fetch the next character in the uncompiled pattern, with no 1416 translation. */ 1417 #define PATFETCH_RAW(c) \ 1418 do {if (p == pend) return REG_EEND; \ 1419 c = (unsigned char) *p++; \ 1420 } while (0) 1421 1422 /* Go backwards one character in the pattern. */ 1423 #define PATUNFETCH p-- 1424 1425 1426 /* If `translate' is non-null, return translate[D], else just D. We 1427 cast the subscript to translate because some data is declared as 1428 `char *', to avoid warnings when a string constant is passed. But 1429 when we use a character as a subscript we must make it unsigned. */ 1430 #ifndef TRANSLATE 1431 #define TRANSLATE(d) \ 1432 (translate ? (char) translate[(unsigned char) (d)] : (d)) 1433 #endif 1434 1435 1436 /* Macros for outputting the compiled pattern into `buffer'. */ 1437 1438 /* If the buffer isn't allocated when it comes in, use this. */ 1439 #define INIT_BUF_SIZE 32 1440 1441 /* Make sure we have at least N more bytes of space in buffer. */ 1442 #define GET_BUFFER_SPACE(n) \ 1443 while (b - bufp->buffer + (n) > bufp->allocated) \ 1444 EXTEND_BUFFER () 1445 1446 /* Make sure we have one more byte of buffer space and then add C to it. */ 1447 #define BUF_PUSH(c) \ 1448 do { \ 1449 GET_BUFFER_SPACE (1); \ 1450 *b++ = (unsigned char) (c); \ 1451 } while (0) 1452 1453 1454 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */ 1455 #define BUF_PUSH_2(c1, c2) \ 1456 do { \ 1457 GET_BUFFER_SPACE (2); \ 1458 *b++ = (unsigned char) (c1); \ 1459 *b++ = (unsigned char) (c2); \ 1460 } while (0) 1461 1462 1463 /* As with BUF_PUSH_2, except for three bytes. */ 1464 #define BUF_PUSH_3(c1, c2, c3) \ 1465 do { \ 1466 GET_BUFFER_SPACE (3); \ 1467 *b++ = (unsigned char) (c1); \ 1468 *b++ = (unsigned char) (c2); \ 1469 *b++ = (unsigned char) (c3); \ 1470 } while (0) 1471 1472 1473 /* Store a jump with opcode OP at LOC to location TO. We store a 1474 relative address offset by the three bytes the jump itself occupies. */ 1475 #define STORE_JUMP(op, loc, to) \ 1476 store_op1 (op, loc, (to) - (loc) - 3) 1477 1478 /* Likewise, for a two-argument jump. */ 1479 #define STORE_JUMP2(op, loc, to, arg) \ 1480 store_op2 (op, loc, (to) - (loc) - 3, arg) 1481 1482 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ 1483 #define INSERT_JUMP(op, loc, to) \ 1484 insert_op1 (op, loc, (to) - (loc) - 3, b) 1485 1486 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */ 1487 #define INSERT_JUMP2(op, loc, to, arg) \ 1488 insert_op2 (op, loc, (to) - (loc) - 3, arg, b) 1489 1490 1491 /* This is not an arbitrary limit: the arguments which represent offsets 1492 into the pattern are two bytes long. So if 2^16 bytes turns out to 1493 be too small, many things would have to change. */ 1494 #define MAX_BUF_SIZE (1L << 16) 1495 1496 1497 /* Extend the buffer by twice its current size via realloc and 1498 reset the pointers that pointed into the old block to point to the 1499 correct places in the new one. If extending the buffer results in it 1500 being larger than MAX_BUF_SIZE, then flag memory exhausted. */ 1501 #define EXTEND_BUFFER() \ 1502 do { \ 1503 unsigned char *old_buffer = bufp->buffer; \ 1504 if (bufp->allocated == MAX_BUF_SIZE) \ 1505 return REG_ESIZE; \ 1506 bufp->allocated <<= 1; \ 1507 if (bufp->allocated > MAX_BUF_SIZE) \ 1508 bufp->allocated = MAX_BUF_SIZE; \ 1509 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\ 1510 if (bufp->buffer == NULL) \ 1511 return REG_ESPACE; \ 1512 /* If the buffer moved, move all the pointers into it. */ \ 1513 if (old_buffer != bufp->buffer) \ 1514 { \ 1515 b = (b - old_buffer) + bufp->buffer; \ 1516 begalt = (begalt - old_buffer) + bufp->buffer; \ 1517 if (fixup_alt_jump) \ 1518 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\ 1519 if (laststart) \ 1520 laststart = (laststart - old_buffer) + bufp->buffer; \ 1521 if (pending_exact) \ 1522 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ 1523 } \ 1524 } while (0) 1525 1526 1527 /* Since we have one byte reserved for the register number argument to 1528 {start,stop}_memory, the maximum number of groups we can report 1529 things about is what fits in that byte. */ 1530 #define MAX_REGNUM 255 1531 1532 /* But patterns can have more than `MAX_REGNUM' registers. We just 1533 ignore the excess. */ 1534 typedef unsigned regnum_t; 1535 1536 1537 /* Macros for the compile stack. */ 1538 1539 /* Since offsets can go either forwards or backwards, this type needs to 1540 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ 1541 typedef int pattern_offset_t; 1542 1543 typedef struct 1544 { 1545 pattern_offset_t begalt_offset; 1546 pattern_offset_t fixup_alt_jump; 1547 pattern_offset_t inner_group_offset; 1548 pattern_offset_t laststart_offset; 1549 regnum_t regnum; 1550 } compile_stack_elt_t; 1551 1552 1553 typedef struct 1554 { 1555 compile_stack_elt_t *stack; 1556 unsigned size; 1557 unsigned avail; /* Offset of next open position. */ 1558 } compile_stack_type; 1559 1560 1561 #define INIT_COMPILE_STACK_SIZE 32 1562 1563 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0) 1564 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) 1565 1566 /* The next available element. */ 1567 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) 1568 1569 1570 /* Set the bit for character C in a list. */ 1571 #define SET_LIST_BIT(c) \ 1572 (b[((unsigned char) (c)) / BYTEWIDTH] \ 1573 |= 1 << (((unsigned char) c) % BYTEWIDTH)) 1574 1575 1576 /* Get the next unsigned number in the uncompiled pattern. */ 1577 #define GET_UNSIGNED_NUMBER(num) \ 1578 { if (p != pend) \ 1579 { \ 1580 PATFETCH (c); \ 1581 while (ISDIGIT (c)) \ 1582 { \ 1583 if (num < 0) \ 1584 num = 0; \ 1585 num = num * 10 + c - '0'; \ 1586 if (p == pend) \ 1587 break; \ 1588 PATFETCH (c); \ 1589 } \ 1590 } \ 1591 } 1592 1593 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ 1594 1595 #define IS_CHAR_CLASS(string) \ 1596 (STREQ (string, "alpha") || STREQ (string, "upper") \ 1597 || STREQ (string, "lower") || STREQ (string, "digit") \ 1598 || STREQ (string, "alnum") || STREQ (string, "xdigit") \ 1599 || STREQ (string, "space") || STREQ (string, "print") \ 1600 || STREQ (string, "punct") || STREQ (string, "graph") \ 1601 || STREQ (string, "cntrl") || STREQ (string, "blank")) 1602 1603 #ifndef MATCH_MAY_ALLOCATE 1605 1606 /* If we cannot allocate large objects within re_match_2_internal, 1607 we make the fail stack and register vectors global. 1608 The fail stack, we grow to the maximum size when a regexp 1609 is compiled. 1610 The register vectors, we adjust in size each time we 1611 compile a regexp, according to the number of registers it needs. */ 1612 1613 static fail_stack_type fail_stack; 1614 1615 /* Size with which the following vectors are currently allocated. 1616 That is so we can make them bigger as needed, 1617 but never make them smaller. */ 1618 static int regs_allocated_size; 1619 1620 static const char ** regstart, ** regend; 1621 static const char ** old_regstart, ** old_regend; 1622 static const char **best_regstart, **best_regend; 1623 static register_info_type *reg_info; 1624 static const char **reg_dummy; 1625 static register_info_type *reg_info_dummy; 1626 1627 /* Make the register vectors big enough for NUM_REGS registers, 1628 but don't make them smaller. */ 1629 1630 static 1631 regex_grow_registers (num_regs) 1632 int num_regs; 1633 { 1634 if (num_regs > regs_allocated_size) 1635 { 1636 RETALLOC_IF (regstart, num_regs, const char *); 1637 RETALLOC_IF (regend, num_regs, const char *); 1638 RETALLOC_IF (old_regstart, num_regs, const char *); 1639 RETALLOC_IF (old_regend, num_regs, const char *); 1640 RETALLOC_IF (best_regstart, num_regs, const char *); 1641 RETALLOC_IF (best_regend, num_regs, const char *); 1642 RETALLOC_IF (reg_info, num_regs, register_info_type); 1643 RETALLOC_IF (reg_dummy, num_regs, const char *); 1644 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type); 1645 1646 regs_allocated_size = num_regs; 1647 } 1648 } 1649 1650 #endif /* not MATCH_MAY_ALLOCATE */ 1651 1652 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. 1654 Returns one of error codes defined in `regex.h', or zero for success. 1655 1656 Assumes the `allocated' (and perhaps `buffer') and `translate' 1657 fields are set in BUFP on entry. 1658 1659 If it succeeds, results are put in BUFP (if it returns an error, the 1660 contents of BUFP are undefined): 1661 `buffer' is the compiled pattern; 1662 `syntax' is set to SYNTAX; 1663 `used' is set to the length of the compiled pattern; 1664 `fastmap_accurate' is zero; 1665 `re_nsub' is the number of subexpressions in PATTERN; 1666 `not_bol' and `not_eol' are zero; 1667 1668 The `fastmap' and `newline_anchor' fields are neither 1669 examined nor set. */ 1670 1671 /* Return, freeing storage we allocated. */ 1672 #define FREE_STACK_RETURN(value) \ 1673 return (free (compile_stack.stack), value) 1674 1675 static reg_errcode_t 1676 regex_compile (pattern, size, syntax, bufp) 1677 const char *pattern; 1678 int size; 1679 reg_syntax_t syntax; 1680 struct re_pattern_buffer *bufp; 1681 { 1682 /* We fetch characters from PATTERN here. Even though PATTERN is 1683 `char *' (i.e., signed), we declare these variables as unsigned, so 1684 they can be reliably used as array indices. */ 1685 register unsigned char c, c1; 1686 1687 /* A random temporary spot in PATTERN. */ 1688 const char *p1; 1689 1690 /* Points to the end of the buffer, where we should append. */ 1691 register unsigned char *b; 1692 1693 /* Keeps track of unclosed groups. */ 1694 compile_stack_type compile_stack; 1695 1696 /* Points to the current (ending) position in the pattern. */ 1697 const char *p = pattern; 1698 const char *pend = pattern + size; 1699 1700 /* How to translate the characters in the pattern. */ 1701 RE_TRANSLATE_TYPE translate = bufp->translate; 1702 1703 /* Address of the count-byte of the most recently inserted `exactn' 1704 command. This makes it possible to tell if a new exact-match 1705 character can be added to that command or if the character requires 1706 a new `exactn' command. */ 1707 unsigned char *pending_exact = 0; 1708 1709 /* Address of start of the most recently finished expression. 1710 This tells, e.g., postfix * where to find the start of its 1711 operand. Reset at the beginning of groups and alternatives. */ 1712 unsigned char *laststart = 0; 1713 1714 /* Address of beginning of regexp, or inside of last group. */ 1715 unsigned char *begalt; 1716 1717 /* Place in the uncompiled pattern (i.e., the {) to 1718 which to go back if the interval is invalid. */ 1719 const char *beg_interval; 1720 1721 /* Address of the place where a forward jump should go to the end of 1722 the containing expression. Each alternative of an `or' -- except the 1723 last -- ends with a forward jump of this sort. */ 1724 unsigned char *fixup_alt_jump = 0; 1725 1726 /* Counts open-groups as they are encountered. Remembered for the 1727 matching close-group on the compile stack, so the same register 1728 number is put in the stop_memory as the start_memory. */ 1729 regnum_t regnum = 0; 1730 1731 #ifdef DEBUG 1732 DEBUG_PRINT1 ("\nCompiling pattern: "); 1733 if (debug) 1734 { 1735 unsigned debug_count; 1736 1737 for (debug_count = 0; debug_count < size; debug_count++) 1738 putchar (pattern[debug_count]); 1739 putchar ('\n'); 1740 } 1741 #endif /* DEBUG */ 1742 1743 /* Initialize the compile stack. */ 1744 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); 1745 if (compile_stack.stack == NULL) 1746 return REG_ESPACE; 1747 1748 compile_stack.size = INIT_COMPILE_STACK_SIZE; 1749 compile_stack.avail = 0; 1750 1751 /* Initialize the pattern buffer. */ 1752 bufp->syntax = syntax; 1753 bufp->fastmap_accurate = 0; 1754 bufp->not_bol = bufp->not_eol = 0; 1755 1756 /* Set `used' to zero, so that if we return an error, the pattern 1757 printer (for debugging) will think there's no pattern. We reset it 1758 at the end. */ 1759 bufp->used = 0; 1760 1761 /* Always count groups, whether or not bufp->no_sub is set. */ 1762 bufp->re_nsub = 0; 1763 1764 #if !defined (emacs) && !defined (SYNTAX_TABLE) 1765 /* Initialize the syntax table. */ 1766 init_syntax_once (); 1767 #endif 1768 1769 if (bufp->allocated == 0) 1770 { 1771 if (bufp->buffer) 1772 { /* If zero allocated, but buffer is non-null, try to realloc 1773 enough space. This loses if buffer's address is bogus, but 1774 that is the user's responsibility. */ 1775 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); 1776 } 1777 else 1778 { /* Caller did not allocate a buffer. Do it for them. */ 1779 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); 1780 } 1781 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE); 1782 1783 bufp->allocated = INIT_BUF_SIZE; 1784 } 1785 1786 begalt = b = bufp->buffer; 1787 1788 /* Loop through the uncompiled pattern until we're at the end. */ 1789 while (p != pend) 1790 { 1791 PATFETCH (c); 1792 1793 switch (c) 1794 { 1795 case '^': 1796 { 1797 if ( /* If at start of pattern, it's an operator. */ 1798 p == pattern + 1 1799 /* If context independent, it's an operator. */ 1800 || syntax & RE_CONTEXT_INDEP_ANCHORS 1801 /* Otherwise, depends on what's come before. */ 1802 || at_begline_loc_p (pattern, p, syntax)) 1803 BUF_PUSH (begline); 1804 else 1805 goto normal_char; 1806 } 1807 break; 1808 1809 1810 case '$': 1811 { 1812 if ( /* If at end of pattern, it's an operator. */ 1813 p == pend 1814 /* If context independent, it's an operator. */ 1815 || syntax & RE_CONTEXT_INDEP_ANCHORS 1816 /* Otherwise, depends on what's next. */ 1817 || at_endline_loc_p (p, pend, syntax)) 1818 BUF_PUSH (endline); 1819 else 1820 goto normal_char; 1821 } 1822 break; 1823 1824 1825 case '+': 1826 case '?': 1827 if ((syntax & RE_BK_PLUS_QM) 1828 || (syntax & RE_LIMITED_OPS)) 1829 goto normal_char; 1830 handle_plus: 1831 case '*': 1832 /* If there is no previous pattern... */ 1833 if (!laststart) 1834 { 1835 if (syntax & RE_CONTEXT_INVALID_OPS) 1836 FREE_STACK_RETURN (REG_BADRPT); 1837 else if (!(syntax & RE_CONTEXT_INDEP_OPS)) 1838 goto normal_char; 1839 } 1840 1841 { 1842 /* Are we optimizing this jump? */ 1843 boolean keep_string_p = false; 1844 1845 /* 1 means zero (many) matches is allowed. */ 1846 char zero_times_ok = 0, many_times_ok = 0; 1847 1848 /* If there is a sequence of repetition chars, collapse it 1849 down to just one (the right one). We can't combine 1850 interval operators with these because of, e.g., `a{2}*', 1851 which should only match an even number of `a's. */ 1852 1853 for (;;) 1854 { 1855 zero_times_ok |= c != '+'; 1856 many_times_ok |= c != '?'; 1857 1858 if (p == pend) 1859 break; 1860 1861 PATFETCH (c); 1862 1863 if (c == '*' 1864 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) 1865 ; 1866 1867 else if (syntax & RE_BK_PLUS_QM && c == '\\') 1868 { 1869 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 1870 1871 PATFETCH (c1); 1872 if (!(c1 == '+' || c1 == '?')) 1873 { 1874 PATUNFETCH; 1875 PATUNFETCH; 1876 break; 1877 } 1878 1879 c = c1; 1880 } 1881 else 1882 { 1883 PATUNFETCH; 1884 break; 1885 } 1886 1887 /* If we get here, we found another repeat character. */ 1888 } 1889 1890 /* Star, etc. applied to an empty pattern is equivalent 1891 to an empty pattern. */ 1892 if (!laststart) 1893 break; 1894 1895 /* Now we know whether or not zero matches is allowed 1896 and also whether or not two or more matches is allowed. */ 1897 if (many_times_ok) 1898 { /* More than one repetition is allowed, so put in at the 1899 end a backward relative jump from `b' to before the next 1900 jump we're going to put in below (which jumps from 1901 laststart to after this jump). 1902 1903 But if we are at the `*' in the exact sequence `.*\n', 1904 insert an unconditional jump backwards to the ., 1905 instead of the beginning of the loop. This way we only 1906 push a failure point once, instead of every time 1907 through the loop. */ 1908 assert (p - 1 > pattern); 1909 1910 /* Allocate the space for the jump. */ 1911 GET_BUFFER_SPACE (3); 1912 1913 /* We know we are not at the first character of the pattern, 1914 because laststart was nonzero. And we've already 1915 incremented `p', by the way, to be the character after 1916 the `*'. Do we have to do something analogous here 1917 for null bytes, because of RE_DOT_NOT_NULL? */ 1918 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') 1919 && zero_times_ok 1920 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') 1921 && !(syntax & RE_DOT_NEWLINE)) 1922 { /* We have .*\n. */ 1923 STORE_JUMP (jump, b, laststart); 1924 keep_string_p = true; 1925 } 1926 else 1927 /* Anything else. */ 1928 STORE_JUMP (maybe_pop_jump, b, laststart - 3); 1929 1930 /* We've added more stuff to the buffer. */ 1931 b += 3; 1932 } 1933 1934 /* On failure, jump from laststart to b + 3, which will be the 1935 end of the buffer after this jump is inserted. */ 1936 GET_BUFFER_SPACE (3); 1937 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump 1938 : on_failure_jump, 1939 laststart, b + 3); 1940 pending_exact = 0; 1941 b += 3; 1942 1943 if (!zero_times_ok) 1944 { 1945 /* At least one repetition is required, so insert a 1946 `dummy_failure_jump' before the initial 1947 `on_failure_jump' instruction of the loop. This 1948 effects a skip over that instruction the first time 1949 we hit that loop. */ 1950 GET_BUFFER_SPACE (3); 1951 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); 1952 b += 3; 1953 } 1954 } 1955 break; 1956 1957 1958 case '.': 1959 laststart = b; 1960 BUF_PUSH (anychar); 1961 break; 1962 1963 1964 case '[': 1965 { 1966 boolean had_char_class = false; 1967 1968 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 1969 1970 /* Ensure that we have enough space to push a charset: the 1971 opcode, the length count, and the bitset; 34 bytes in all. */ 1972 GET_BUFFER_SPACE (34); 1973 1974 laststart = b; 1975 1976 /* We test `*p == '^' twice, instead of using an if 1977 statement, so we only need one BUF_PUSH. */ 1978 BUF_PUSH (*p == '^' ? charset_not : charset); 1979 if (*p == '^') 1980 p++; 1981 1982 /* Remember the first position in the bracket expression. */ 1983 p1 = p; 1984 1985 /* Push the number of bytes in the bitmap. */ 1986 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); 1987 1988 /* Clear the whole map. */ 1989 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH); 1990 1991 /* charset_not matches newline according to a syntax bit. */ 1992 if ((re_opcode_t) b[-2] == charset_not 1993 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) 1994 SET_LIST_BIT ('\n'); 1995 1996 /* Read in characters and ranges, setting map bits. */ 1997 for (;;) 1998 { 1999 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2000 2001 PATFETCH (c); 2002 2003 /* \ might escape characters inside [...] and [^...]. */ 2004 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') 2005 { 2006 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 2007 2008 PATFETCH (c1); 2009 SET_LIST_BIT (c1); 2010 continue; 2011 } 2012 2013 /* Could be the end of the bracket expression. If it's 2014 not (i.e., when the bracket expression is `[]' so 2015 far), the ']' character bit gets set way below. */ 2016 if (c == ']' && p != p1 + 1) 2017 break; 2018 2019 /* Look ahead to see if it's a range when the last thing 2020 was a character class. */ 2021 if (had_char_class && c == '-' && *p != ']') 2022 FREE_STACK_RETURN (REG_ERANGE); 2023 2024 /* Look ahead to see if it's a range when the last thing 2025 was a character: if this is a hyphen not at the 2026 beginning or the end of a list, then it's the range 2027 operator. */ 2028 if (c == '-' 2029 && !(p - 2 >= pattern && p[-2] == '[') 2030 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') 2031 && *p != ']') 2032 { 2033 reg_errcode_t ret 2034 = compile_range (&p, pend, translate, syntax, b); 2035 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 2036 } 2037 2038 else if (p[0] == '-' && p[1] != ']') 2039 { /* This handles ranges made up of characters only. */ 2040 reg_errcode_t ret; 2041 2042 /* Move past the `-'. */ 2043 PATFETCH (c1); 2044 2045 ret = compile_range (&p, pend, translate, syntax, b); 2046 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 2047 } 2048 2049 /* See if we're at the beginning of a possible character 2050 class. */ 2051 2052 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') 2053 { /* Leave room for the null. */ 2054 char str[CHAR_CLASS_MAX_LENGTH + 1]; 2055 2056 PATFETCH (c); 2057 c1 = 0; 2058 2059 /* If pattern is `[[:'. */ 2060 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2061 2062 for (;;) 2063 { 2064 PATFETCH (c); 2065 if (c == ':' || c == ']' || p == pend 2066 || c1 == CHAR_CLASS_MAX_LENGTH) 2067 break; 2068 str[c1++] = c; 2069 } 2070 str[c1] = '\0'; 2071 2072 /* If isn't a word bracketed by `[:' and:`]': 2073 undo the ending character, the letters, and leave 2074 the leading `:' and `[' (but set bits for them). */ 2075 if (c == ':' && *p == ']') 2076 { 2077 int ch; 2078 boolean is_alnum = STREQ (str, "alnum"); 2079 boolean is_alpha = STREQ (str, "alpha"); 2080 boolean is_blank = STREQ (str, "blank"); 2081 boolean is_cntrl = STREQ (str, "cntrl"); 2082 boolean is_digit = STREQ (str, "digit"); 2083 boolean is_graph = STREQ (str, "graph"); 2084 boolean is_lower = STREQ (str, "lower"); 2085 boolean is_print = STREQ (str, "print"); 2086 boolean is_punct = STREQ (str, "punct"); 2087 boolean is_space = STREQ (str, "space"); 2088 boolean is_upper = STREQ (str, "upper"); 2089 boolean is_xdigit = STREQ (str, "xdigit"); 2090 2091 if (!IS_CHAR_CLASS (str)) 2092 FREE_STACK_RETURN (REG_ECTYPE); 2093 2094 /* Throw away the ] at the end of the character 2095 class. */ 2096 PATFETCH (c); 2097 2098 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2099 2100 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) 2101 { 2102 int translated = TRANSLATE (ch); 2103 /* This was split into 3 if's to 2104 avoid an arbitrary limit in some compiler. */ 2105 if ( (is_alnum && ISALNUM (ch)) 2106 || (is_alpha && ISALPHA (ch)) 2107 || (is_blank && ISBLANK (ch)) 2108 || (is_cntrl && ISCNTRL (ch))) 2109 SET_LIST_BIT (translated); 2110 if ( (is_digit && ISDIGIT (ch)) 2111 || (is_graph && ISGRAPH (ch)) 2112 || (is_lower && ISLOWER (ch)) 2113 || (is_print && ISPRINT (ch))) 2114 SET_LIST_BIT (translated); 2115 if ( (is_punct && ISPUNCT (ch)) 2116 || (is_space && ISSPACE (ch)) 2117 || (is_upper && ISUPPER (ch)) 2118 || (is_xdigit && ISXDIGIT (ch))) 2119 SET_LIST_BIT (translated); 2120 } 2121 had_char_class = true; 2122 } 2123 else 2124 { 2125 c1++; 2126 while (c1--) 2127 PATUNFETCH; 2128 SET_LIST_BIT ('['); 2129 SET_LIST_BIT (':'); 2130 had_char_class = false; 2131 } 2132 } 2133 else 2134 { 2135 had_char_class = false; 2136 SET_LIST_BIT (c); 2137 } 2138 } 2139 2140 /* Discard any (non)matching list bytes that are all 0 at the 2141 end of the map. Decrease the map-length byte too. */ 2142 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) 2143 b[-1]--; 2144 b += b[-1]; 2145 } 2146 break; 2147 2148 2149 case '(': 2150 if (syntax & RE_NO_BK_PARENS) 2151 goto handle_open; 2152 else 2153 goto normal_char; 2154 2155 2156 case ')': 2157 if (syntax & RE_NO_BK_PARENS) 2158 goto handle_close; 2159 else 2160 goto normal_char; 2161 2162 2163 case '\n': 2164 if (syntax & RE_NEWLINE_ALT) 2165 goto handle_alt; 2166 else 2167 goto normal_char; 2168 2169 2170 case '|': 2171 if (syntax & RE_NO_BK_VBAR) 2172 goto handle_alt; 2173 else 2174 goto normal_char; 2175 2176 2177 case '{': 2178 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) 2179 goto handle_interval; 2180 else 2181 goto normal_char; 2182 2183 2184 case '\\': 2185 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 2186 2187 /* Do not translate the character after the \, so that we can 2188 distinguish, e.g., \B from \b, even if we normally would 2189 translate, e.g., B to b. */ 2190 PATFETCH_RAW (c); 2191 2192 switch (c) 2193 { 2194 case '(': 2195 if (syntax & RE_NO_BK_PARENS) 2196 goto normal_backslash; 2197 2198 handle_open: 2199 bufp->re_nsub++; 2200 regnum++; 2201 2202 if (COMPILE_STACK_FULL) 2203 { 2204 RETALLOC (compile_stack.stack, compile_stack.size << 1, 2205 compile_stack_elt_t); 2206 if (compile_stack.stack == NULL) return REG_ESPACE; 2207 2208 compile_stack.size <<= 1; 2209 } 2210 2211 /* These are the values to restore when we hit end of this 2212 group. They are all relative offsets, so that if the 2213 whole pattern moves because of realloc, they will still 2214 be valid. */ 2215 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; 2216 COMPILE_STACK_TOP.fixup_alt_jump 2217 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; 2218 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; 2219 COMPILE_STACK_TOP.regnum = regnum; 2220 2221 /* We will eventually replace the 0 with the number of 2222 groups inner to this one. But do not push a 2223 start_memory for groups beyond the last one we can 2224 represent in the compiled pattern. */ 2225 if (regnum <= MAX_REGNUM) 2226 { 2227 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2; 2228 BUF_PUSH_3 (start_memory, regnum, 0); 2229 } 2230 2231 compile_stack.avail++; 2232 2233 fixup_alt_jump = 0; 2234 laststart = 0; 2235 begalt = b; 2236 /* If we've reached MAX_REGNUM groups, then this open 2237 won't actually generate any code, so we'll have to 2238 clear pending_exact explicitly. */ 2239 pending_exact = 0; 2240 break; 2241 2242 2243 case ')': 2244 if (syntax & RE_NO_BK_PARENS) goto normal_backslash; 2245 2246 if (COMPILE_STACK_EMPTY) 2247 { 2248 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) 2249 goto normal_backslash; 2250 else 2251 FREE_STACK_RETURN (REG_ERPAREN); 2252 } 2253 2254 handle_close: 2255 if (fixup_alt_jump) 2256 { /* Push a dummy failure point at the end of the 2257 alternative for a possible future 2258 `pop_failure_jump' to pop. See comments at 2259 `push_dummy_failure' in `re_match_2'. */ 2260 BUF_PUSH (push_dummy_failure); 2261 2262 /* We allocated space for this jump when we assigned 2263 to `fixup_alt_jump', in the `handle_alt' case below. */ 2264 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1); 2265 } 2266 2267 /* See similar code for backslashed left paren above. */ 2268 if (COMPILE_STACK_EMPTY) 2269 { 2270 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) 2271 goto normal_char; 2272 else 2273 FREE_STACK_RETURN (REG_ERPAREN); 2274 } 2275 2276 /* Since we just checked for an empty stack above, this 2277 ``can't happen''. */ 2278 assert (compile_stack.avail != 0); 2279 { 2280 /* We don't just want to restore into `regnum', because 2281 later groups should continue to be numbered higher, 2282 as in `(ab)c(de)' -- the second group is #2. */ 2283 regnum_t this_group_regnum; 2284 2285 compile_stack.avail--; 2286 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; 2287 fixup_alt_jump 2288 = COMPILE_STACK_TOP.fixup_alt_jump 2289 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1 2290 : 0; 2291 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset; 2292 this_group_regnum = COMPILE_STACK_TOP.regnum; 2293 /* If we've reached MAX_REGNUM groups, then this open 2294 won't actually generate any code, so we'll have to 2295 clear pending_exact explicitly. */ 2296 pending_exact = 0; 2297 2298 /* We're at the end of the group, so now we know how many 2299 groups were inside this one. */ 2300 if (this_group_regnum <= MAX_REGNUM) 2301 { 2302 unsigned char *inner_group_loc 2303 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset; 2304 2305 *inner_group_loc = regnum - this_group_regnum; 2306 BUF_PUSH_3 (stop_memory, this_group_regnum, 2307 regnum - this_group_regnum); 2308 } 2309 } 2310 break; 2311 2312 2313 case '|': /* `\|'. */ 2314 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) 2315 goto normal_backslash; 2316 handle_alt: 2317 if (syntax & RE_LIMITED_OPS) 2318 goto normal_char; 2319 2320 /* Insert before the previous alternative a jump which 2321 jumps to this alternative if the former fails. */ 2322 GET_BUFFER_SPACE (3); 2323 INSERT_JUMP (on_failure_jump, begalt, b + 6); 2324 pending_exact = 0; 2325 b += 3; 2326 2327 /* The alternative before this one has a jump after it 2328 which gets executed if it gets matched. Adjust that 2329 jump so it will jump to this alternative's analogous 2330 jump (put in below, which in turn will jump to the next 2331 (if any) alternative's such jump, etc.). The last such 2332 jump jumps to the correct final destination. A picture: 2333 _____ _____ 2334 | | | | 2335 | v | v 2336 a | b | c 2337 2338 If we are at `b', then fixup_alt_jump right now points to a 2339 three-byte space after `a'. We'll put in the jump, set 2340 fixup_alt_jump to right after `b', and leave behind three 2341 bytes which we'll fill in when we get to after `c'. */ 2342 2343 if (fixup_alt_jump) 2344 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); 2345 2346 /* Mark and leave space for a jump after this alternative, 2347 to be filled in later either by next alternative or 2348 when know we're at the end of a series of alternatives. */ 2349 fixup_alt_jump = b; 2350 GET_BUFFER_SPACE (3); 2351 b += 3; 2352 2353 laststart = 0; 2354 begalt = b; 2355 break; 2356 2357 2358 case '{': 2359 /* If \{ is a literal. */ 2360 if (!(syntax & RE_INTERVALS) 2361 /* If we're at `\{' and it's not the open-interval 2362 operator. */ 2363 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 2364 || (p - 2 == pattern && p == pend)) 2365 goto normal_backslash; 2366 2367 handle_interval: 2368 { 2369 /* If got here, then the syntax allows intervals. */ 2370 2371 /* At least (most) this many matches must be made. */ 2372 int lower_bound = -1, upper_bound = -1; 2373 2374 beg_interval = p - 1; 2375 2376 if (p == pend) 2377 { 2378 if (syntax & RE_NO_BK_BRACES) 2379 goto unfetch_interval; 2380 else 2381 FREE_STACK_RETURN (REG_EBRACE); 2382 } 2383 2384 GET_UNSIGNED_NUMBER (lower_bound); 2385 2386 if (c == ',') 2387 { 2388 GET_UNSIGNED_NUMBER (upper_bound); 2389 if (upper_bound < 0) upper_bound = RE_DUP_MAX; 2390 } 2391 else 2392 /* Interval such as `{1}' => match exactly once. */ 2393 upper_bound = lower_bound; 2394 2395 if (lower_bound < 0 || upper_bound > RE_DUP_MAX 2396 || lower_bound > upper_bound) 2397 { 2398 if (syntax & RE_NO_BK_BRACES) 2399 goto unfetch_interval; 2400 else 2401 FREE_STACK_RETURN (REG_BADBR); 2402 } 2403 2404 if (!(syntax & RE_NO_BK_BRACES)) 2405 { 2406 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE); 2407 2408 PATFETCH (c); 2409 } 2410 2411 if (c != '}') 2412 { 2413 if (syntax & RE_NO_BK_BRACES) 2414 goto unfetch_interval; 2415 else 2416 FREE_STACK_RETURN (REG_BADBR); 2417 } 2418 2419 /* We just parsed a valid interval. */ 2420 2421 /* If it's invalid to have no preceding re. */ 2422 if (!laststart) 2423 { 2424 if (syntax & RE_CONTEXT_INVALID_OPS) 2425 FREE_STACK_RETURN (REG_BADRPT); 2426 else if (syntax & RE_CONTEXT_INDEP_OPS) 2427 laststart = b; 2428 else 2429 goto unfetch_interval; 2430 } 2431 2432 /* If the upper bound is zero, don't want to succeed at 2433 all; jump from `laststart' to `b + 3', which will be 2434 the end of the buffer after we insert the jump. */ 2435 if (upper_bound == 0) 2436 { 2437 GET_BUFFER_SPACE (3); 2438 INSERT_JUMP (jump, laststart, b + 3); 2439 b += 3; 2440 } 2441 2442 /* Otherwise, we have a nontrivial interval. When 2443 we're all done, the pattern will look like: 2444 set_number_at <jump count> <upper bound> 2445 set_number_at <succeed_n count> <lower bound> 2446 succeed_n <after jump addr> <succeed_n count> 2447 <body of loop> 2448 jump_n <succeed_n addr> <jump count> 2449 (The upper bound and `jump_n' are omitted if 2450 `upper_bound' is 1, though.) */ 2451 else 2452 { /* If the upper bound is > 1, we need to insert 2453 more at the end of the loop. */ 2454 unsigned nbytes = 10 + (upper_bound > 1) * 10; 2455 2456 GET_BUFFER_SPACE (nbytes); 2457 2458 /* Initialize lower bound of the `succeed_n', even 2459 though it will be set during matching by its 2460 attendant `set_number_at' (inserted next), 2461 because `re_compile_fastmap' needs to know. 2462 Jump to the `jump_n' we might insert below. */ 2463 INSERT_JUMP2 (succeed_n, laststart, 2464 b + 5 + (upper_bound > 1) * 5, 2465 lower_bound); 2466 b += 5; 2467 2468 /* Code to initialize the lower bound. Insert 2469 before the `succeed_n'. The `5' is the last two 2470 bytes of this `set_number_at', plus 3 bytes of 2471 the following `succeed_n'. */ 2472 insert_op2 (set_number_at, laststart, 5, lower_bound, b); 2473 b += 5; 2474 2475 if (upper_bound > 1) 2476 { /* More than one repetition is allowed, so 2477 append a backward jump to the `succeed_n' 2478 that starts this interval. 2479 2480 When we've reached this during matching, 2481 we'll have matched the interval once, so 2482 jump back only `upper_bound - 1' times. */ 2483 STORE_JUMP2 (jump_n, b, laststart + 5, 2484 upper_bound - 1); 2485 b += 5; 2486 2487 /* The location we want to set is the second 2488 parameter of the `jump_n'; that is `b-2' as 2489 an absolute address. `laststart' will be 2490 the `set_number_at' we're about to insert; 2491 `laststart+3' the number to set, the source 2492 for the relative address. But we are 2493 inserting into the middle of the pattern -- 2494 so everything is getting moved up by 5. 2495 Conclusion: (b - 2) - (laststart + 3) + 5, 2496 i.e., b - laststart. 2497 2498 We insert this at the beginning of the loop 2499 so that if we fail during matching, we'll 2500 reinitialize the bounds. */ 2501 insert_op2 (set_number_at, laststart, b - laststart, 2502 upper_bound - 1, b); 2503 b += 5; 2504 } 2505 } 2506 pending_exact = 0; 2507 beg_interval = NULL; 2508 } 2509 break; 2510 2511 unfetch_interval: 2512 /* If an invalid interval, match the characters as literals. */ 2513 assert (beg_interval); 2514 p = beg_interval; 2515 beg_interval = NULL; 2516 2517 /* normal_char and normal_backslash need `c'. */ 2518 PATFETCH (c); 2519 2520 if (!(syntax & RE_NO_BK_BRACES)) 2521 { 2522 if (p > pattern && p[-1] == '\\') 2523 goto normal_backslash; 2524 } 2525 goto normal_char; 2526 2527 #ifdef emacs 2528 /* There is no way to specify the before_dot and after_dot 2529 operators. rms says this is ok. --karl */ 2530 case '=': 2531 BUF_PUSH (at_dot); 2532 break; 2533 2534 case 's': 2535 laststart = b; 2536 PATFETCH (c); 2537 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]); 2538 break; 2539 2540 case 'S': 2541 laststart = b; 2542 PATFETCH (c); 2543 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); 2544 break; 2545 #endif /* emacs */ 2546 2547 2548 case 'w': 2549 laststart = b; 2550 BUF_PUSH (wordchar); 2551 break; 2552 2553 2554 case 'W': 2555 laststart = b; 2556 BUF_PUSH (notwordchar); 2557 break; 2558 2559 2560 case '<': 2561 BUF_PUSH (wordbeg); 2562 break; 2563 2564 case '>': 2565 BUF_PUSH (wordend); 2566 break; 2567 2568 case 'b': 2569 BUF_PUSH (wordbound); 2570 break; 2571 2572 case 'B': 2573 BUF_PUSH (notwordbound); 2574 break; 2575 2576 case '`': 2577 BUF_PUSH (begbuf); 2578 break; 2579 2580 case '\'': 2581 BUF_PUSH (endbuf); 2582 break; 2583 2584 case '1': case '2': case '3': case '4': case '5': 2585 case '6': case '7': case '8': case '9': 2586 if (syntax & RE_NO_BK_REFS) 2587 goto normal_char; 2588 2589 c1 = c - '0'; 2590 2591 if (c1 > regnum) 2592 FREE_STACK_RETURN (REG_ESUBREG); 2593 2594 /* Can't back reference to a subexpression if inside of it. */ 2595 if (group_in_compile_stack (compile_stack, c1)) 2596 goto normal_char; 2597 2598 laststart = b; 2599 BUF_PUSH_2 (duplicate, c1); 2600 break; 2601 2602 2603 case '+': 2604 case '?': 2605 if (syntax & RE_BK_PLUS_QM) 2606 goto handle_plus; 2607 else 2608 goto normal_backslash; 2609 2610 default: 2611 normal_backslash: 2612 /* You might think it would be useful for \ to mean 2613 not to translate; but if we don't translate it 2614 it will never match anything. */ 2615 c = TRANSLATE (c); 2616 goto normal_char; 2617 } 2618 break; 2619 2620 2621 default: 2622 /* Expects the character in `c'. */ 2623 normal_char: 2624 /* If no exactn currently being built. */ 2625 if (!pending_exact 2626 2627 /* If last exactn not at current position. */ 2628 || pending_exact + *pending_exact + 1 != b 2629 2630 /* We have only one byte following the exactn for the count. */ 2631 || *pending_exact == (1 << BYTEWIDTH) - 1 2632 2633 /* If followed by a repetition operator. */ 2634 || *p == '*' || *p == '^' 2635 || ((syntax & RE_BK_PLUS_QM) 2636 ? *p == '\\' && (p[1] == '+' || p[1] == '?') 2637 : (*p == '+' || *p == '?')) 2638 || ((syntax & RE_INTERVALS) 2639 && ((syntax & RE_NO_BK_BRACES) 2640 ? *p == '{' 2641 : (p[0] == '\\' && p[1] == '{')))) 2642 { 2643 /* Start building a new exactn. */ 2644 2645 laststart = b; 2646 2647 BUF_PUSH_2 (exactn, 0); 2648 pending_exact = b - 1; 2649 } 2650 2651 BUF_PUSH (c); 2652 (*pending_exact)++; 2653 break; 2654 } /* switch (c) */ 2655 } /* while p != pend */ 2656 2657 2658 /* Through the pattern now. */ 2659 2660 if (fixup_alt_jump) 2661 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); 2662 2663 if (!COMPILE_STACK_EMPTY) 2664 FREE_STACK_RETURN (REG_EPAREN); 2665 2666 /* If we don't want backtracking, force success 2667 the first time we reach the end of the compiled pattern. */ 2668 if (syntax & RE_NO_POSIX_BACKTRACKING) 2669 BUF_PUSH (succeed); 2670 2671 free (compile_stack.stack); 2672 2673 /* We have succeeded; set the length of the buffer. */ 2674 bufp->used = b - bufp->buffer; 2675 2676 #ifdef DEBUG 2677 if (debug) 2678 { 2679 DEBUG_PRINT1 ("\nCompiled pattern: \n"); 2680 print_compiled_pattern (bufp); 2681 } 2682 #endif /* DEBUG */ 2683 2684 #ifndef MATCH_MAY_ALLOCATE 2685 /* Initialize the failure stack to the largest possible stack. This 2686 isn't necessary unless we're trying to avoid calling alloca in 2687 the search and match routines. */ 2688 { 2689 int num_regs = bufp->re_nsub + 1; 2690 2691 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size 2692 is strictly greater than re_max_failures, the largest possible stack 2693 is 2 * re_max_failures failure points. */ 2694 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) 2695 { 2696 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS); 2697 2698 #ifdef emacs 2699 if (! fail_stack.stack) 2700 fail_stack.stack 2701 = (fail_stack_elt_t *) xmalloc (fail_stack.size 2702 * sizeof (fail_stack_elt_t)); 2703 else 2704 fail_stack.stack 2705 = (fail_stack_elt_t *) xrealloc (fail_stack.stack, 2706 (fail_stack.size 2707 * sizeof (fail_stack_elt_t))); 2708 #else /* not emacs */ 2709 if (! fail_stack.stack) 2710 fail_stack.stack 2711 = (fail_stack_elt_t *) malloc (fail_stack.size 2712 * sizeof (fail_stack_elt_t)); 2713 else 2714 fail_stack.stack 2715 = (fail_stack_elt_t *) realloc (fail_stack.stack, 2716 (fail_stack.size 2717 * sizeof (fail_stack_elt_t))); 2718 #endif /* not emacs */ 2719 } 2720 2721 regex_grow_registers (num_regs); 2722 } 2723 #endif /* not MATCH_MAY_ALLOCATE */ 2724 2725 return REG_NOERROR; 2726 } /* regex_compile */ 2727 2728 /* Subroutines for `regex_compile'. */ 2730 2731 /* Store OP at LOC followed by two-byte integer parameter ARG. */ 2732 2733 static void 2734 store_op1 (op, loc, arg) 2735 re_opcode_t op; 2736 unsigned char *loc; 2737 int arg; 2738 { 2739 *loc = (unsigned char) op; 2740 STORE_NUMBER (loc + 1, arg); 2741 } 2742 2743 2744 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */ 2745 2746 static void 2747 store_op2 (op, loc, arg1, arg2) 2748 re_opcode_t op; 2749 unsigned char *loc; 2750 int arg1, arg2; 2751 { 2752 *loc = (unsigned char) op; 2753 STORE_NUMBER (loc + 1, arg1); 2754 STORE_NUMBER (loc + 3, arg2); 2755 } 2756 2757 2758 /* Copy the bytes from LOC to END to open up three bytes of space at LOC 2759 for OP followed by two-byte integer parameter ARG. */ 2760 2761 static void 2762 insert_op1 (op, loc, arg, end) 2763 re_opcode_t op; 2764 unsigned char *loc; 2765 int arg; 2766 unsigned char *end; 2767 { 2768 register unsigned char *pfrom = end; 2769 register unsigned char *pto = end + 3; 2770 2771 while (pfrom != loc) 2772 *--pto = *--pfrom; 2773 2774 store_op1 (op, loc, arg); 2775 } 2776 2777 2778 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */ 2779 2780 static void 2781 insert_op2 (op, loc, arg1, arg2, end) 2782 re_opcode_t op; 2783 unsigned char *loc; 2784 int arg1, arg2; 2785 unsigned char *end; 2786 { 2787 register unsigned char *pfrom = end; 2788 register unsigned char *pto = end + 5; 2789 2790 while (pfrom != loc) 2791 *--pto = *--pfrom; 2792 2793 store_op2 (op, loc, arg1, arg2); 2794 } 2795 2796 2797 /* P points to just after a ^ in PATTERN. Return true if that ^ comes 2798 after an alternative or a begin-subexpression. We assume there is at 2799 least one character before the ^. */ 2800 2801 static boolean 2802 at_begline_loc_p (pattern, p, syntax) 2803 const char *pattern, *p; 2804 reg_syntax_t syntax; 2805 { 2806 const char *prev = p - 2; 2807 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; 2808 2809 return 2810 /* After a subexpression? */ 2811 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) 2812 /* After an alternative? */ 2813 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)); 2814 } 2815 2816 2817 /* The dual of at_begline_loc_p. This one is for $. We assume there is 2818 at least one character after the $, i.e., `P < PEND'. */ 2819 2820 static boolean 2821 at_endline_loc_p (p, pend, syntax) 2822 const char *p, *pend; 2823 int syntax; 2824 { 2825 const char *next = p; 2826 boolean next_backslash = *next == '\\'; 2827 const char *next_next = p + 1 < pend ? p + 1 : 0; 2828 2829 return 2830 /* Before a subexpression? */ 2831 (syntax & RE_NO_BK_PARENS ? *next == ')' 2832 : next_backslash && next_next && *next_next == ')') 2833 /* Before an alternative? */ 2834 || (syntax & RE_NO_BK_VBAR ? *next == '|' 2835 : next_backslash && next_next && *next_next == '|'); 2836 } 2837 2838 2839 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and 2840 false if it's not. */ 2841 2842 static boolean 2843 group_in_compile_stack (compile_stack, regnum) 2844 compile_stack_type compile_stack; 2845 regnum_t regnum; 2846 { 2847 int this_element; 2848 2849 for (this_element = compile_stack.avail - 1; 2850 this_element >= 0; 2851 this_element--) 2852 if (compile_stack.stack[this_element].regnum == regnum) 2853 return true; 2854 2855 return false; 2856 } 2857 2858 2859 /* Read the ending character of a range (in a bracket expression) from the 2860 uncompiled pattern *P_PTR (which ends at PEND). We assume the 2861 starting character is in `P[-2]'. (`P[-1]' is the character `-'.) 2862 Then we set the translation of all bits between the starting and 2863 ending characters (inclusive) in the compiled pattern B. 2864 2865 Return an error code. 2866 2867 We use these short variable names so we can use the same macros as 2868 `regex_compile' itself. */ 2869 2870 static reg_errcode_t 2871 compile_range (p_ptr, pend, translate, syntax, b) 2872 const char **p_ptr, *pend; 2873 RE_TRANSLATE_TYPE translate; 2874 reg_syntax_t syntax; 2875 unsigned char *b; 2876 { 2877 unsigned this_char; 2878 2879 const char *p = *p_ptr; 2880 int range_start, range_end; 2881 2882 if (p == pend) 2883 return REG_ERANGE; 2884 2885 /* Even though the pattern is a signed `char *', we need to fetch 2886 with unsigned char *'s; if the high bit of the pattern character 2887 is set, the range endpoints will be negative if we fetch using a 2888 signed char *. 2889 2890 We also want to fetch the endpoints without translating them; the 2891 appropriate translation is done in the bit-setting loop below. */ 2892 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */ 2893 range_start = ((const unsigned char *) p)[-2]; 2894 range_end = ((const unsigned char *) p)[0]; 2895 2896 /* Have to increment the pointer into the pattern string, so the 2897 caller isn't still at the ending character. */ 2898 (*p_ptr)++; 2899 2900 /* If the start is after the end, the range is empty. */ 2901 if (range_start > range_end) 2902 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; 2903 2904 /* Here we see why `this_char' has to be larger than an `unsigned 2905 char' -- the range is inclusive, so if `range_end' == 0xff 2906 (assuming 8-bit characters), we would otherwise go into an infinite 2907 loop, since all characters <= 0xff. */ 2908 for (this_char = range_start; this_char <= range_end; this_char++) 2909 { 2910 SET_LIST_BIT (TRANSLATE (this_char)); 2911 } 2912 2913 return REG_NOERROR; 2914 } 2915 2916 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in 2918 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible 2919 characters can start a string that matches the pattern. This fastmap 2920 is used by re_search to skip quickly over impossible starting points. 2921 2922 The caller must supply the address of a (1 << BYTEWIDTH)-byte data 2923 area as BUFP->fastmap. 2924 2925 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in 2926 the pattern buffer. 2927 2928 Returns 0 if we succeed, -2 if an internal error. */ 2929 2930 int 2931 re_compile_fastmap (bufp) 2932 struct re_pattern_buffer *bufp; 2933 { 2934 int j, k; 2935 #ifdef MATCH_MAY_ALLOCATE 2936 fail_stack_type fail_stack; 2937 #endif 2938 #ifndef REGEX_MALLOC 2939 char *destination; 2940 #endif 2941 /* We don't push any register information onto the failure stack. */ 2942 unsigned num_regs = 0; 2943 2944 register char *fastmap = bufp->fastmap; 2945 unsigned char *pattern = bufp->buffer; 2946 unsigned long size = bufp->used; 2947 unsigned char *p = pattern; 2948 register unsigned char *pend = pattern + size; 2949 2950 /* This holds the pointer to the failure stack, when 2951 it is allocated relocatably. */ 2952 #ifdef REL_ALLOC 2953 fail_stack_elt_t *failure_stack_ptr; 2954 #endif 2955 2956 /* Assume that each path through the pattern can be null until 2957 proven otherwise. We set this false at the bottom of switch 2958 statement, to which we get only if a particular path doesn't 2959 match the empty string. */ 2960 boolean path_can_be_null = true; 2961 2962 /* We aren't doing a `succeed_n' to begin with. */ 2963 boolean succeed_n_p = false; 2964 2965 assert (fastmap != NULL && p != NULL); 2966 2967 INIT_FAIL_STACK (); 2968 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ 2969 bufp->fastmap_accurate = 1; /* It will be when we're done. */ 2970 bufp->can_be_null = 0; 2971 2972 while (1) 2973 { 2974 if (p == pend || *p == succeed) 2975 { 2976 /* We have reached the (effective) end of pattern. */ 2977 if (!FAIL_STACK_EMPTY ()) 2978 { 2979 bufp->can_be_null |= path_can_be_null; 2980 2981 /* Reset for next path. */ 2982 path_can_be_null = true; 2983 2984 p = fail_stack.stack[--fail_stack.avail].pointer; 2985 2986 continue; 2987 } 2988 else 2989 break; 2990 } 2991 2992 /* We should never be about to go beyond the end of the pattern. */ 2993 assert (p < pend); 2994 2995 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) 2996 { 2997 2998 /* I guess the idea here is to simply not bother with a fastmap 2999 if a backreference is used, since it's too hard to figure out 3000 the fastmap for the corresponding group. Setting 3001 `can_be_null' stops `re_search_2' from using the fastmap, so 3002 that is all we do. */ 3003 case duplicate: 3004 bufp->can_be_null = 1; 3005 goto done; 3006 3007 3008 /* Following are the cases which match a character. These end 3009 with `break'. */ 3010 3011 case exactn: 3012 fastmap[p[1]] = 1; 3013 break; 3014 3015 3016 case charset: 3017 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 3018 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) 3019 fastmap[j] = 1; 3020 break; 3021 3022 3023 case charset_not: 3024 /* Chars beyond end of map must be allowed. */ 3025 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) 3026 fastmap[j] = 1; 3027 3028 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 3029 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) 3030 fastmap[j] = 1; 3031 break; 3032 3033 3034 case wordchar: 3035 for (j = 0; j < (1 << BYTEWIDTH); j++) 3036 if (SYNTAX (j) == Sword) 3037 fastmap[j] = 1; 3038 break; 3039 3040 3041 case notwordchar: 3042 for (j = 0; j < (1 << BYTEWIDTH); j++) 3043 if (SYNTAX (j) != Sword) 3044 fastmap[j] = 1; 3045 break; 3046 3047 3048 case anychar: 3049 { 3050 int fastmap_newline = fastmap['\n']; 3051 3052 /* `.' matches anything ... */ 3053 for (j = 0; j < (1 << BYTEWIDTH); j++) 3054 fastmap[j] = 1; 3055 3056 /* ... except perhaps newline. */ 3057 if (!(bufp->syntax & RE_DOT_NEWLINE)) 3058 fastmap['\n'] = fastmap_newline; 3059 3060 /* Return if we have already set `can_be_null'; if we have, 3061 then the fastmap is irrelevant. Something's wrong here. */ 3062 else if (bufp->can_be_null) 3063 goto done; 3064 3065 /* Otherwise, have to check alternative paths. */ 3066 break; 3067 } 3068 3069 #ifdef emacs 3070 case syntaxspec: 3071 k = *p++; 3072 for (j = 0; j < (1 << BYTEWIDTH); j++) 3073 if (SYNTAX (j) == (enum syntaxcode) k) 3074 fastmap[j] = 1; 3075 break; 3076 3077 3078 case notsyntaxspec: 3079 k = *p++; 3080 for (j = 0; j < (1 << BYTEWIDTH); j++) 3081 if (SYNTAX (j) != (enum syntaxcode) k) 3082 fastmap[j] = 1; 3083 break; 3084 3085 3086 /* All cases after this match the empty string. These end with 3087 `continue'. */ 3088 3089 3090 case before_dot: 3091 case at_dot: 3092 case after_dot: 3093 continue; 3094 #endif /* emacs */ 3095 3096 3097 case no_op: 3098 case begline: 3099 case endline: 3100 case begbuf: 3101 case endbuf: 3102 case wordbound: 3103 case notwordbound: 3104 case wordbeg: 3105 case wordend: 3106 case push_dummy_failure: 3107 continue; 3108 3109 3110 case jump_n: 3111 case pop_failure_jump: 3112 case maybe_pop_jump: 3113 case jump: 3114 case jump_past_alt: 3115 case dummy_failure_jump: 3116 EXTRACT_NUMBER_AND_INCR (j, p); 3117 p += j; 3118 if (j > 0) 3119 continue; 3120 3121 /* Jump backward implies we just went through the body of a 3122 loop and matched nothing. Opcode jumped to should be 3123 `on_failure_jump' or `succeed_n'. Just treat it like an 3124 ordinary jump. For a * loop, it has pushed its failure 3125 point already; if so, discard that as redundant. */ 3126 if ((re_opcode_t) *p != on_failure_jump 3127 && (re_opcode_t) *p != succeed_n) 3128 continue; 3129 3130 p++; 3131 EXTRACT_NUMBER_AND_INCR (j, p); 3132 p += j; 3133 3134 /* If what's on the stack is where we are now, pop it. */ 3135 if (!FAIL_STACK_EMPTY () 3136 && fail_stack.stack[fail_stack.avail - 1].pointer == p) 3137 fail_stack.avail--; 3138 3139 continue; 3140 3141 3142 case on_failure_jump: 3143 case on_failure_keep_string_jump: 3144 handle_on_failure_jump: 3145 EXTRACT_NUMBER_AND_INCR (j, p); 3146 3147 /* For some patterns, e.g., `(a?)?', `p+j' here points to the 3148 end of the pattern. We don't want to push such a point, 3149 since when we restore it above, entering the switch will 3150 increment `p' past the end of the pattern. We don't need 3151 to push such a point since we obviously won't find any more 3152 fastmap entries beyond `pend'. Such a pattern can match 3153 the null string, though. */ 3154 if (p + j < pend) 3155 { 3156 if (!PUSH_PATTERN_OP (p + j, fail_stack)) 3157 { 3158 RESET_FAIL_STACK (); 3159 return -2; 3160 } 3161 } 3162 else 3163 bufp->can_be_null = 1; 3164 3165 if (succeed_n_p) 3166 { 3167 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ 3168 succeed_n_p = false; 3169 } 3170 3171 continue; 3172 3173 3174 case succeed_n: 3175 /* Get to the number of times to succeed. */ 3176 p += 2; 3177 3178 /* Increment p past the n for when k != 0. */ 3179 EXTRACT_NUMBER_AND_INCR (k, p); 3180 if (k == 0) 3181 { 3182 p -= 4; 3183 succeed_n_p = true; /* Spaghetti code alert. */ 3184 goto handle_on_failure_jump; 3185 } 3186 continue; 3187 3188 3189 case set_number_at: 3190 p += 4; 3191 continue; 3192 3193 3194 case start_memory: 3195 case stop_memory: 3196 p += 2; 3197 continue; 3198 3199 3200 default: 3201 abort (); /* We have listed all the cases. */ 3202 } /* switch *p++ */ 3203 3204 /* Getting here means we have found the possible starting 3205 characters for one path of the pattern -- and that the empty 3206 string does not match. We need not follow this path further. 3207 Instead, look at the next alternative (remembered on the 3208 stack), or quit if no more. The test at the top of the loop 3209 does these things. */ 3210 path_can_be_null = false; 3211 p = pend; 3212 } /* while p */ 3213 3214 /* Set `can_be_null' for the last path (also the first path, if the 3215 pattern is empty). */ 3216 bufp->can_be_null |= path_can_be_null; 3217 3218 done: 3219 RESET_FAIL_STACK (); 3220 return 0; 3221 } /* re_compile_fastmap */ 3222 3223 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and 3225 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 3226 this memory for recording register information. STARTS and ENDS 3227 must be allocated using the malloc library routine, and must each 3228 be at least NUM_REGS * sizeof (regoff_t) bytes long. 3229 3230 If NUM_REGS == 0, then subsequent matches should allocate their own 3231 register data. 3232 3233 Unless this function is called, the first search or match using 3234 PATTERN_BUFFER will allocate its own register data, without 3235 freeing the old data. */ 3236 3237 void 3238 re_set_registers (bufp, regs, num_regs, starts, ends) 3239 struct re_pattern_buffer *bufp; 3240 struct re_registers *regs; 3241 unsigned num_regs; 3242 regoff_t *starts, *ends; 3243 { 3244 if (num_regs) 3245 { 3246 bufp->regs_allocated = REGS_REALLOCATE; 3247 regs->num_regs = num_regs; 3248 regs->start = starts; 3249 regs->end = ends; 3250 } 3251 else 3252 { 3253 bufp->regs_allocated = REGS_UNALLOCATED; 3254 regs->num_regs = 0; 3255 regs->start = regs->end = (regoff_t *) 0; 3256 } 3257 } 3258 3259 /* Searching routines. */ 3261 3262 /* Like re_search_2, below, but only one string is specified, and 3263 doesn't let you say where to stop matching. */ 3264 3265 int 3266 re_search (bufp, string, size, startpos, range, regs) 3267 struct re_pattern_buffer *bufp; 3268 const char *string; 3269 int size, startpos, range; 3270 struct re_registers *regs; 3271 { 3272 return re_search_2 (bufp, NULL, 0, string, size, startpos, range, 3273 regs, size); 3274 } 3275 3276 3277 /* Using the compiled pattern in BUFP->buffer, first tries to match the 3278 virtual concatenation of STRING1 and STRING2, starting first at index 3279 STARTPOS, then at STARTPOS + 1, and so on. 3280 3281 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. 3282 3283 RANGE is how far to scan while trying to match. RANGE = 0 means try 3284 only at STARTPOS; in general, the last start tried is STARTPOS + 3285 RANGE. 3286 3287 In REGS, return the indices of the virtual concatenation of STRING1 3288 and STRING2 that matched the entire BUFP->buffer and its contained 3289 subexpressions. 3290 3291 Do not consider matching one past the index STOP in the virtual 3292 concatenation of STRING1 and STRING2. 3293 3294 We return either the position in the strings at which the match was 3295 found, -1 if no match, or -2 if error (such as failure 3296 stack overflow). */ 3297 3298 int 3299 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) 3300 struct re_pattern_buffer *bufp; 3301 const char *string1, *string2; 3302 int size1, size2; 3303 int startpos; 3304 int range; 3305 struct re_registers *regs; 3306 int stop; 3307 { 3308 int val; 3309 register char *fastmap = bufp->fastmap; 3310 register RE_TRANSLATE_TYPE translate = bufp->translate; 3311 int total_size = size1 + size2; 3312 int endpos = startpos + range; 3313 int anchored_start = 0; 3314 3315 /* Check for out-of-range STARTPOS. */ 3316 if (startpos < 0 || startpos > total_size) 3317 return -1; 3318 3319 /* Fix up RANGE if it might eventually take us outside 3320 the virtual concatenation of STRING1 and STRING2. 3321 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */ 3322 if (endpos < 0) 3323 range = 0 - startpos; 3324 else if (endpos > total_size) 3325 range = total_size - startpos; 3326 3327 /* If the search isn't to be a backwards one, don't waste time in a 3328 search for a pattern that must be anchored. */ 3329 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0) 3330 { 3331 if (startpos > 0) 3332 return -1; 3333 else 3334 range = 1; 3335 } 3336 3337 #ifdef emacs 3338 /* In a forward search for something that starts with \=. 3339 don't keep searching past point. */ 3340 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0) 3341 { 3342 range = PT - startpos; 3343 if (range <= 0) 3344 return -1; 3345 } 3346 #endif /* emacs */ 3347 3348 /* Update the fastmap now if not correct already. */ 3349 if (fastmap && !bufp->fastmap_accurate) 3350 if (re_compile_fastmap (bufp) == -2) 3351 return -2; 3352 3353 /* See whether the pattern is anchored. */ 3354 if (bufp->buffer[0] == begline) 3355 anchored_start = 1; 3356 3357 /* Loop through the string, looking for a place to start matching. */ 3358 for (;;) 3359 { 3360 /* If the pattern is anchored, 3361 skip quickly past places we cannot match. 3362 We don't bother to treat startpos == 0 specially 3363 because that case doesn't repeat. */ 3364 if (anchored_start && startpos > 0) 3365 { 3366 if (! (bufp->newline_anchor 3367 && ((startpos <= size1 ? string1[startpos - 1] 3368 : string2[startpos - size1 - 1]) 3369 == '\n'))) 3370 goto advance; 3371 } 3372 3373 /* If a fastmap is supplied, skip quickly over characters that 3374 cannot be the start of a match. If the pattern can match the 3375 null string, however, we don't need to skip characters; we want 3376 the first null string. */ 3377 if (fastmap && startpos < total_size && !bufp->can_be_null) 3378 { 3379 if (range > 0) /* Searching forwards. */ 3380 { 3381 register const char *d; 3382 register int lim = 0; 3383 int irange = range; 3384 3385 if (startpos < size1 && startpos + range >= size1) 3386 lim = range - (size1 - startpos); 3387 3388 d = (startpos >= size1 ? string2 - size1 : string1) + startpos; 3389 3390 /* Written out as an if-else to avoid testing `translate' 3391 inside the loop. */ 3392 if (translate) 3393 while (range > lim 3394 && !fastmap[(unsigned char) 3395 translate[(unsigned char) *d++]]) 3396 range--; 3397 else 3398 while (range > lim && !fastmap[(unsigned char) *d++]) 3399 range--; 3400 3401 startpos += irange - range; 3402 } 3403 else /* Searching backwards. */ 3404 { 3405 register char c = (size1 == 0 || startpos >= size1 3406 ? string2[startpos - size1] 3407 : string1[startpos]); 3408 3409 if (!fastmap[(unsigned char) TRANSLATE (c)]) 3410 goto advance; 3411 } 3412 } 3413 3414 /* If can't match the null string, and that's all we have left, fail. */ 3415 if (range >= 0 && startpos == total_size && fastmap 3416 && !bufp->can_be_null) 3417 return -1; 3418 3419 val = re_match_2_internal (bufp, string1, size1, string2, size2, 3420 startpos, regs, stop); 3421 #ifndef REGEX_MALLOC 3422 #ifdef C_ALLOCA 3423 alloca (0); 3424 #endif 3425 #endif 3426 3427 if (val >= 0) 3428 return startpos; 3429 3430 if (val == -2) 3431 return -2; 3432 3433 advance: 3434 if (!range) 3435 break; 3436 else if (range > 0) 3437 { 3438 range--; 3439 startpos++; 3440 } 3441 else 3442 { 3443 range++; 3444 startpos--; 3445 } 3446 } 3447 return -1; 3448 } /* re_search_2 */ 3449 3450 /* Declarations and macros for re_match_2. */ 3452 3453 static int bcmp_translate (); 3454 static boolean alt_match_null_string_p (), 3455 common_op_match_null_string_p (), 3456 group_match_null_string_p (); 3457 3458 /* This converts PTR, a pointer into one of the search strings `string1' 3459 and `string2' into an offset from the beginning of that string. */ 3460 #define POINTER_TO_OFFSET(ptr) \ 3461 (FIRST_STRING_P (ptr) \ 3462 ? ((regoff_t) ((ptr) - string1)) \ 3463 : ((regoff_t) ((ptr) - string2 + size1))) 3464 3465 /* Macros for dealing with the split strings in re_match_2. */ 3466 3467 #define MATCHING_IN_FIRST_STRING (dend == end_match_1) 3468 3469 /* Call before fetching a character with *d. This switches over to 3470 string2 if necessary. */ 3471 #define PREFETCH() \ 3472 while (d == dend) \ 3473 { \ 3474 /* End of string2 => fail. */ \ 3475 if (dend == end_match_2) \ 3476 goto fail; \ 3477 /* End of string1 => advance to string2. */ \ 3478 d = string2; \ 3479 dend = end_match_2; \ 3480 } 3481 3482 3483 /* Test if at very beginning or at very end of the virtual concatenation 3484 of `string1' and `string2'. If only one string, it's `string2'. */ 3485 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2) 3486 #define AT_STRINGS_END(d) ((d) == end2) 3487 3488 3489 /* Test if D points to a character which is word-constituent. We have 3490 two special cases to check for: if past the end of string1, look at 3491 the first character in string2; and if before the beginning of 3492 string2, look at the last character in string1. */ 3493 #define WORDCHAR_P(d) \ 3494 (SYNTAX ((d) == end1 ? *string2 \ 3495 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \ 3496 == Sword) 3497 3498 /* Disabled due to a compiler bug -- see comment at case wordbound */ 3499 #if 0 3500 /* Test if the character before D and the one at D differ with respect 3501 to being word-constituent. */ 3502 #define AT_WORD_BOUNDARY(d) \ 3503 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \ 3504 || WORDCHAR_P (d - 1) != WORDCHAR_P (d)) 3505 #endif 3506 3507 /* Free everything we malloc. */ 3508 #ifdef MATCH_MAY_ALLOCATE 3509 #define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else 3510 #define FREE_VARIABLES() \ 3511 do { \ 3512 REGEX_FREE_STACK (fail_stack.stack); \ 3513 FREE_VAR (regstart); \ 3514 FREE_VAR (regend); \ 3515 FREE_VAR (old_regstart); \ 3516 FREE_VAR (old_regend); \ 3517 FREE_VAR (best_regstart); \ 3518 FREE_VAR (best_regend); \ 3519 FREE_VAR (reg_info); \ 3520 FREE_VAR (reg_dummy); \ 3521 FREE_VAR (reg_info_dummy); \ 3522 } while (0) 3523 #else 3524 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */ 3525 #endif /* not MATCH_MAY_ALLOCATE */ 3526 3527 /* These values must meet several constraints. They must not be valid 3528 register values; since we have a limit of 255 registers (because 3529 we use only one byte in the pattern for the register number), we can 3530 use numbers larger than 255. They must differ by 1, because of 3531 NUM_FAILURE_ITEMS above. And the value for the lowest register must 3532 be larger than the value for the highest register, so we do not try 3533 to actually save any registers when none are active. */ 3534 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH) 3535 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1) 3536 3537 /* Matching routines. */ 3539 3540 #ifndef emacs /* Emacs never uses this. */ 3541 /* re_match is like re_match_2 except it takes only a single string. */ 3542 3543 int 3544 re_match (bufp, string, size, pos, regs) 3545 struct re_pattern_buffer *bufp; 3546 const char *string; 3547 int size, pos; 3548 struct re_registers *regs; 3549 { 3550 int result = re_match_2_internal (bufp, NULL, 0, string, size, 3551 pos, regs, size); 3552 alloca (0); 3553 return result; 3554 } 3555 #endif /* not emacs */ 3556 3557 3558 /* re_match_2 matches the compiled pattern in BUFP against the 3559 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 3560 and SIZE2, respectively). We start matching at POS, and stop 3561 matching at STOP. 3562 3563 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we 3564 store offsets for the substring each group matched in REGS. See the 3565 documentation for exactly how many groups we fill. 3566 3567 We return -1 if no match, -2 if an internal error (such as the 3568 failure stack overflowing). Otherwise, we return the length of the 3569 matched substring. */ 3570 3571 int 3572 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) 3573 struct re_pattern_buffer *bufp; 3574 const char *string1, *string2; 3575 int size1, size2; 3576 int pos; 3577 struct re_registers *regs; 3578 int stop; 3579 { 3580 int result = re_match_2_internal (bufp, string1, size1, string2, size2, 3581 pos, regs, stop); 3582 alloca (0); 3583 return result; 3584 } 3585 3586 /* This is a separate function so that we can force an alloca cleanup 3587 afterwards. */ 3588 static int 3589 re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) 3590 struct re_pattern_buffer *bufp; 3591 const char *string1, *string2; 3592 int size1, size2; 3593 int pos; 3594 struct re_registers *regs; 3595 int stop; 3596 { 3597 /* General temporaries. */ 3598 int mcnt; 3599 unsigned char *p1; 3600 3601 /* Just past the end of the corresponding string. */ 3602 const char *end1, *end2; 3603 3604 /* Pointers into string1 and string2, just past the last characters in 3605 each to consider matching. */ 3606 const char *end_match_1, *end_match_2; 3607 3608 /* Where we are in the data, and the end of the current string. */ 3609 const char *d, *dend; 3610 3611 /* Where we are in the pattern, and the end of the pattern. */ 3612 unsigned char *p = bufp->buffer; 3613 register unsigned char *pend = p + bufp->used; 3614 3615 /* Mark the opcode just after a start_memory, so we can test for an 3616 empty subpattern when we get to the stop_memory. */ 3617 unsigned char *just_past_start_mem = 0; 3618 3619 /* We use this to map every character in the string. */ 3620 RE_TRANSLATE_TYPE translate = bufp->translate; 3621 3622 /* Failure point stack. Each place that can handle a failure further 3623 down the line pushes a failure point on this stack. It consists of 3624 restart, regend, and reg_info for all registers corresponding to 3625 the subexpressions we're currently inside, plus the number of such 3626 registers, and, finally, two char *'s. The first char * is where 3627 to resume scanning the pattern; the second one is where to resume 3628 scanning the strings. If the latter is zero, the failure point is 3629 a ``dummy''; if a failure happens and the failure point is a dummy, 3630 it gets discarded and the next next one is tried. */ 3631 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ 3632 fail_stack_type fail_stack; 3633 #endif 3634 #ifdef DEBUG 3635 static unsigned failure_id = 0; 3636 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0; 3637 #endif 3638 3639 /* This holds the pointer to the failure stack, when 3640 it is allocated relocatably. */ 3641 #ifdef REL_ALLOC 3642 fail_stack_elt_t *failure_stack_ptr; 3643 #endif 3644 3645 /* We fill all the registers internally, independent of what we 3646 return, for use in backreferences. The number here includes 3647 an element for register zero. */ 3648 unsigned num_regs = bufp->re_nsub + 1; 3649 3650 /* The currently active registers. */ 3651 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG; 3652 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG; 3653 3654 /* Information on the contents of registers. These are pointers into 3655 the input strings; they record just what was matched (on this 3656 attempt) by a subexpression part of the pattern, that is, the 3657 regnum-th regstart pointer points to where in the pattern we began 3658 matching and the regnum-th regend points to right after where we 3659 stopped matching the regnum-th subexpression. (The zeroth register 3660 keeps track of what the whole pattern matches.) */ 3661 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 3662 const char **regstart, **regend; 3663 #endif 3664 3665 /* If a group that's operated upon by a repetition operator fails to 3666 match anything, then the register for its start will need to be 3667 restored because it will have been set to wherever in the string we 3668 are when we last see its open-group operator. Similarly for a 3669 register's end. */ 3670 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 3671 const char **old_regstart, **old_regend; 3672 #endif 3673 3674 /* The is_active field of reg_info helps us keep track of which (possibly 3675 nested) subexpressions we are currently in. The matched_something 3676 field of reg_info[reg_num] helps us tell whether or not we have 3677 matched any of the pattern so far this time through the reg_num-th 3678 subexpression. These two fields get reset each time through any 3679 loop their register is in. */ 3680 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ 3681 register_info_type *reg_info; 3682 #endif 3683 3684 /* The following record the register info as found in the above 3685 variables when we find a match better than any we've seen before. 3686 This happens as we backtrack through the failure points, which in 3687 turn happens only if we have not yet matched the entire string. */ 3688 unsigned best_regs_set = false; 3689 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 3690 const char **best_regstart, **best_regend; 3691 #endif 3692 3693 /* Logically, this is `best_regend[0]'. But we don't want to have to 3694 allocate space for that if we're not allocating space for anything 3695 else (see below). Also, we never need info about register 0 for 3696 any of the other register vectors, and it seems rather a kludge to 3697 treat `best_regend' differently than the rest. So we keep track of 3698 the end of the best match so far in a separate variable. We 3699 initialize this to NULL so that when we backtrack the first time 3700 and need to test it, it's not garbage. */ 3701 const char *match_end = NULL; 3702 3703 /* This helps SET_REGS_MATCHED avoid doing redundant work. */ 3704 int set_regs_matched_done = 0; 3705 3706 /* Used when we pop values we don't care about. */ 3707 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 3708 const char **reg_dummy; 3709 register_info_type *reg_info_dummy; 3710 #endif 3711 3712 #ifdef DEBUG 3713 /* Counts the total number of registers pushed. */ 3714 unsigned num_regs_pushed = 0; 3715 #endif 3716 3717 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n"); 3718 3719 INIT_FAIL_STACK (); 3720 3721 #ifdef MATCH_MAY_ALLOCATE 3722 /* Do not bother to initialize all the register variables if there are 3723 no groups in the pattern, as it takes a fair amount of time. If 3724 there are groups, we include space for register 0 (the whole 3725 pattern), even though we never use it, since it simplifies the 3726 array indexing. We should fix this. */ 3727 if (bufp->re_nsub) 3728 { 3729 regstart = REGEX_TALLOC (num_regs, const char *); 3730 regend = REGEX_TALLOC (num_regs, const char *); 3731 old_regstart = REGEX_TALLOC (num_regs, const char *); 3732 old_regend = REGEX_TALLOC (num_regs, const char *); 3733 best_regstart = REGEX_TALLOC (num_regs, const char *); 3734 best_regend = REGEX_TALLOC (num_regs, const char *); 3735 reg_info = REGEX_TALLOC (num_regs, register_info_type); 3736 reg_dummy = REGEX_TALLOC (num_regs, const char *); 3737 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type); 3738 3739 if (!(regstart && regend && old_regstart && old_regend && reg_info 3740 && best_regstart && best_regend && reg_dummy && reg_info_dummy)) 3741 { 3742 FREE_VARIABLES (); 3743 return -2; 3744 } 3745 } 3746 else 3747 { 3748 /* We must initialize all our variables to NULL, so that 3749 `FREE_VARIABLES' doesn't try to free them. */ 3750 regstart = regend = old_regstart = old_regend = best_regstart 3751 = best_regend = reg_dummy = NULL; 3752 reg_info = reg_info_dummy = (register_info_type *) NULL; 3753 } 3754 #endif /* MATCH_MAY_ALLOCATE */ 3755 3756 /* The starting position is bogus. */ 3757 if (pos < 0 || pos > size1 + size2) 3758 { 3759 FREE_VARIABLES (); 3760 return -1; 3761 } 3762 3763 /* Initialize subexpression text positions to -1 to mark ones that no 3764 start_memory/stop_memory has been seen for. Also initialize the 3765 register information struct. */ 3766 for (mcnt = 1; mcnt < num_regs; mcnt++) 3767 { 3768 regstart[mcnt] = regend[mcnt] 3769 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; 3770 3771 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; 3772 IS_ACTIVE (reg_info[mcnt]) = 0; 3773 MATCHED_SOMETHING (reg_info[mcnt]) = 0; 3774 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; 3775 } 3776 3777 /* We move `string1' into `string2' if the latter's empty -- but not if 3778 `string1' is null. */ 3779 if (size2 == 0 && string1 != NULL) 3780 { 3781 string2 = string1; 3782 size2 = size1; 3783 string1 = 0; 3784 size1 = 0; 3785 } 3786 end1 = string1 + size1; 3787 end2 = string2 + size2; 3788 3789 /* Compute where to stop matching, within the two strings. */ 3790 if (stop <= size1) 3791 { 3792 end_match_1 = string1 + stop; 3793 end_match_2 = string2; 3794 } 3795 else 3796 { 3797 end_match_1 = end1; 3798 end_match_2 = string2 + stop - size1; 3799 } 3800 3801 /* `p' scans through the pattern as `d' scans through the data. 3802 `dend' is the end of the input string that `d' points within. `d' 3803 is advanced into the following input string whenever necessary, but 3804 this happens before fetching; therefore, at the beginning of the 3805 loop, `d' can be pointing at the end of a string, but it cannot 3806 equal `string2'. */ 3807 if (size1 > 0 && pos <= size1) 3808 { 3809 d = string1 + pos; 3810 dend = end_match_1; 3811 } 3812 else 3813 { 3814 d = string2 + pos - size1; 3815 dend = end_match_2; 3816 } 3817 3818 DEBUG_PRINT1 ("The compiled pattern is: "); 3819 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); 3820 DEBUG_PRINT1 ("The string to match is: `"); 3821 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); 3822 DEBUG_PRINT1 ("'\n"); 3823 3824 /* This loops over pattern commands. It exits by returning from the 3825 function if the match is complete, or it drops through if the match 3826 fails at this starting point in the input data. */ 3827 for (;;) 3828 { 3829 DEBUG_PRINT2 ("\n0x%x: ", p); 3830 3831 if (p == pend) 3832 { /* End of pattern means we might have succeeded. */ 3833 DEBUG_PRINT1 ("end of pattern ... "); 3834 3835 /* If we haven't matched the entire string, and we want the 3836 longest match, try backtracking. */ 3837 if (d != end_match_2) 3838 { 3839 /* 1 if this match ends in the same string (string1 or string2) 3840 as the best previous match. */ 3841 boolean same_str_p = (FIRST_STRING_P (match_end) 3842 == MATCHING_IN_FIRST_STRING); 3843 /* 1 if this match is the best seen so far. */ 3844 boolean best_match_p; 3845 3846 /* AIX compiler got confused when this was combined 3847 with the previous declaration. */ 3848 if (same_str_p) 3849 best_match_p = d > match_end; 3850 else 3851 best_match_p = !MATCHING_IN_FIRST_STRING; 3852 3853 DEBUG_PRINT1 ("backtracking.\n"); 3854 3855 if (!FAIL_STACK_EMPTY ()) 3856 { /* More failure points to try. */ 3857 3858 /* If exceeds best match so far, save it. */ 3859 if (!best_regs_set || best_match_p) 3860 { 3861 best_regs_set = true; 3862 match_end = d; 3863 3864 DEBUG_PRINT1 ("\nSAVING match as best so far.\n"); 3865 3866 for (mcnt = 1; mcnt < num_regs; mcnt++) 3867 { 3868 best_regstart[mcnt] = regstart[mcnt]; 3869 best_regend[mcnt] = regend[mcnt]; 3870 } 3871 } 3872 goto fail; 3873 } 3874 3875 /* If no failure points, don't restore garbage. And if 3876 last match is real best match, don't restore second 3877 best one. */ 3878 else if (best_regs_set && !best_match_p) 3879 { 3880 restore_best_regs: 3881 /* Restore best match. It may happen that `dend == 3882 end_match_1' while the restored d is in string2. 3883 For example, the pattern `x.*y.*z' against the 3884 strings `x-' and `y-z-', if the two strings are 3885 not consecutive in memory. */ 3886 DEBUG_PRINT1 ("Restoring best registers.\n"); 3887 3888 d = match_end; 3889 dend = ((d >= string1 && d <= end1) 3890 ? end_match_1 : end_match_2); 3891 3892 for (mcnt = 1; mcnt < num_regs; mcnt++) 3893 { 3894 regstart[mcnt] = best_regstart[mcnt]; 3895 regend[mcnt] = best_regend[mcnt]; 3896 } 3897 } 3898 } /* d != end_match_2 */ 3899 3900 succeed_label: 3901 DEBUG_PRINT1 ("Accepting match.\n"); 3902 3903 /* If caller wants register contents data back, do it. */ 3904 if (regs && !bufp->no_sub) 3905 { 3906 /* Have the register data arrays been allocated? */ 3907 if (bufp->regs_allocated == REGS_UNALLOCATED) 3908 { /* No. So allocate them with malloc. We need one 3909 extra element beyond `num_regs' for the `-1' marker 3910 GNU code uses. */ 3911 regs->num_regs = MAX (RE_NREGS, num_regs + 1); 3912 regs->start = TALLOC (regs->num_regs, regoff_t); 3913 regs->end = TALLOC (regs->num_regs, regoff_t); 3914 if (regs->start == NULL || regs->end == NULL) 3915 { 3916 FREE_VARIABLES (); 3917 return -2; 3918 } 3919 bufp->regs_allocated = REGS_REALLOCATE; 3920 } 3921 else if (bufp->regs_allocated == REGS_REALLOCATE) 3922 { /* Yes. If we need more elements than were already 3923 allocated, reallocate them. If we need fewer, just 3924 leave it alone. */ 3925 if (regs->num_regs < num_regs + 1) 3926 { 3927 regs->num_regs = num_regs + 1; 3928 RETALLOC (regs->start, regs->num_regs, regoff_t); 3929 RETALLOC (regs->end, regs->num_regs, regoff_t); 3930 if (regs->start == NULL || regs->end == NULL) 3931 { 3932 FREE_VARIABLES (); 3933 return -2; 3934 } 3935 } 3936 } 3937 else 3938 { 3939 /* These braces fend off a "empty body in an else-statement" 3940 warning under GCC when assert expands to nothing. */ 3941 assert (bufp->regs_allocated == REGS_FIXED); 3942 } 3943 3944 /* Convert the pointer data in `regstart' and `regend' to 3945 indices. Register zero has to be set differently, 3946 since we haven't kept track of any info for it. */ 3947 if (regs->num_regs > 0) 3948 { 3949 regs->start[0] = pos; 3950 regs->end[0] = (MATCHING_IN_FIRST_STRING 3951 ? ((regoff_t) (d - string1)) 3952 : ((regoff_t) (d - string2 + size1))); 3953 } 3954 3955 /* Go through the first `min (num_regs, regs->num_regs)' 3956 registers, since that is all we initialized. */ 3957 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++) 3958 { 3959 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt])) 3960 regs->start[mcnt] = regs->end[mcnt] = -1; 3961 else 3962 { 3963 regs->start[mcnt] 3964 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]); 3965 regs->end[mcnt] 3966 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]); 3967 } 3968 } 3969 3970 /* If the regs structure we return has more elements than 3971 were in the pattern, set the extra elements to -1. If 3972 we (re)allocated the registers, this is the case, 3973 because we always allocate enough to have at least one 3974 -1 at the end. */ 3975 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++) 3976 regs->start[mcnt] = regs->end[mcnt] = -1; 3977 } /* regs && !bufp->no_sub */ 3978 3979 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", 3980 nfailure_points_pushed, nfailure_points_popped, 3981 nfailure_points_pushed - nfailure_points_popped); 3982 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed); 3983 3984 mcnt = d - pos - (MATCHING_IN_FIRST_STRING 3985 ? string1 3986 : string2 - size1); 3987 3988 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt); 3989 3990 FREE_VARIABLES (); 3991 return mcnt; 3992 } 3993 3994 /* Otherwise match next pattern command. */ 3995 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) 3996 { 3997 /* Ignore these. Used to ignore the n of succeed_n's which 3998 currently have n == 0. */ 3999 case no_op: 4000 DEBUG_PRINT1 ("EXECUTING no_op.\n"); 4001 break; 4002 4003 case succeed: 4004 DEBUG_PRINT1 ("EXECUTING succeed.\n"); 4005 goto succeed_label; 4006 4007 /* Match the next n pattern characters exactly. The following 4008 byte in the pattern defines n, and the n bytes after that 4009 are the characters to match. */ 4010 case exactn: 4011 mcnt = *p++; 4012 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt); 4013 4014 /* This is written out as an if-else so we don't waste time 4015 testing `translate' inside the loop. */ 4016 if (translate) 4017 { 4018 do 4019 { 4020 PREFETCH (); 4021 if ((unsigned char) translate[(unsigned char) *d++] 4022 != (unsigned char) *p++) 4023 goto fail; 4024 } 4025 while (--mcnt); 4026 } 4027 else 4028 { 4029 do 4030 { 4031 PREFETCH (); 4032 if (*d++ != (char) *p++) goto fail; 4033 } 4034 while (--mcnt); 4035 } 4036 SET_REGS_MATCHED (); 4037 break; 4038 4039 4040 /* Match any character except possibly a newline or a null. */ 4041 case anychar: 4042 DEBUG_PRINT1 ("EXECUTING anychar.\n"); 4043 4044 PREFETCH (); 4045 4046 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n') 4047 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000')) 4048 goto fail; 4049 4050 SET_REGS_MATCHED (); 4051 DEBUG_PRINT2 (" Matched `%d'.\n", *d); 4052 d++; 4053 break; 4054 4055 4056 case charset: 4057 case charset_not: 4058 { 4059 register unsigned char c; 4060 boolean not = (re_opcode_t) *(p - 1) == charset_not; 4061 4062 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); 4063 4064 PREFETCH (); 4065 c = TRANSLATE (*d); /* The character to match. */ 4066 4067 /* Cast to `unsigned' instead of `unsigned char' in case the 4068 bit list is a full 32 bytes long. */ 4069 if (c < (unsigned) (*p * BYTEWIDTH) 4070 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 4071 not = !not; 4072 4073 p += 1 + *p; 4074 4075 if (!not) goto fail; 4076 4077 SET_REGS_MATCHED (); 4078 d++; 4079 break; 4080 } 4081 4082 4083 /* The beginning of a group is represented by start_memory. 4084 The arguments are the register number in the next byte, and the 4085 number of groups inner to this one in the next. The text 4086 matched within the group is recorded (in the internal 4087 registers data structure) under the register number. */ 4088 case start_memory: 4089 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]); 4090 4091 /* Find out if this group can match the empty string. */ 4092 p1 = p; /* To send to group_match_null_string_p. */ 4093 4094 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) 4095 REG_MATCH_NULL_STRING_P (reg_info[*p]) 4096 = group_match_null_string_p (&p1, pend, reg_info); 4097 4098 /* Save the position in the string where we were the last time 4099 we were at this open-group operator in case the group is 4100 operated upon by a repetition operator, e.g., with `(a*)*b' 4101 against `ab'; then we want to ignore where we are now in 4102 the string in case this attempt to match fails. */ 4103 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) 4104 ? REG_UNSET (regstart[*p]) ? d : regstart[*p] 4105 : regstart[*p]; 4106 DEBUG_PRINT2 (" old_regstart: %d\n", 4107 POINTER_TO_OFFSET (old_regstart[*p])); 4108 4109 regstart[*p] = d; 4110 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p])); 4111 4112 IS_ACTIVE (reg_info[*p]) = 1; 4113 MATCHED_SOMETHING (reg_info[*p]) = 0; 4114 4115 /* Clear this whenever we change the register activity status. */ 4116 set_regs_matched_done = 0; 4117 4118 /* This is the new highest active register. */ 4119 highest_active_reg = *p; 4120 4121 /* If nothing was active before, this is the new lowest active 4122 register. */ 4123 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) 4124 lowest_active_reg = *p; 4125 4126 /* Move past the register number and inner group count. */ 4127 p += 2; 4128 just_past_start_mem = p; 4129 4130 break; 4131 4132 4133 /* The stop_memory opcode represents the end of a group. Its 4134 arguments are the same as start_memory's: the register 4135 number, and the number of inner groups. */ 4136 case stop_memory: 4137 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]); 4138 4139 /* We need to save the string position the last time we were at 4140 this close-group operator in case the group is operated 4141 upon by a repetition operator, e.g., with `((a*)*(b*)*)*' 4142 against `aba'; then we want to ignore where we are now in 4143 the string in case this attempt to match fails. */ 4144 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) 4145 ? REG_UNSET (regend[*p]) ? d : regend[*p] 4146 : regend[*p]; 4147 DEBUG_PRINT2 (" old_regend: %d\n", 4148 POINTER_TO_OFFSET (old_regend[*p])); 4149 4150 regend[*p] = d; 4151 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p])); 4152 4153 /* This register isn't active anymore. */ 4154 IS_ACTIVE (reg_info[*p]) = 0; 4155 4156 /* Clear this whenever we change the register activity status. */ 4157 set_regs_matched_done = 0; 4158 4159 /* If this was the only register active, nothing is active 4160 anymore. */ 4161 if (lowest_active_reg == highest_active_reg) 4162 { 4163 lowest_active_reg = NO_LOWEST_ACTIVE_REG; 4164 highest_active_reg = NO_HIGHEST_ACTIVE_REG; 4165 } 4166 else 4167 { /* We must scan for the new highest active register, since 4168 it isn't necessarily one less than now: consider 4169 (a(b)c(d(e)f)g). When group 3 ends, after the f), the 4170 new highest active register is 1. */ 4171 unsigned char r = *p - 1; 4172 while (r > 0 && !IS_ACTIVE (reg_info[r])) 4173 r--; 4174 4175 /* If we end up at register zero, that means that we saved 4176 the registers as the result of an `on_failure_jump', not 4177 a `start_memory', and we jumped to past the innermost 4178 `stop_memory'. For example, in ((.)*) we save 4179 registers 1 and 2 as a result of the *, but when we pop 4180 back to the second ), we are at the stop_memory 1. 4181 Thus, nothing is active. */ 4182 if (r == 0) 4183 { 4184 lowest_active_reg = NO_LOWEST_ACTIVE_REG; 4185 highest_active_reg = NO_HIGHEST_ACTIVE_REG; 4186 } 4187 else 4188 highest_active_reg = r; 4189 } 4190 4191 /* If just failed to match something this time around with a 4192 group that's operated on by a repetition operator, try to 4193 force exit from the ``loop'', and restore the register 4194 information for this group that we had before trying this 4195 last match. */ 4196 if ((!MATCHED_SOMETHING (reg_info[*p]) 4197 || just_past_start_mem == p - 1) 4198 && (p + 2) < pend) 4199 { 4200 boolean is_a_jump_n = false; 4201 4202 p1 = p + 2; 4203 mcnt = 0; 4204 switch ((re_opcode_t) *p1++) 4205 { 4206 case jump_n: 4207 is_a_jump_n = true; 4208 case pop_failure_jump: 4209 case maybe_pop_jump: 4210 case jump: 4211 case dummy_failure_jump: 4212 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4213 if (is_a_jump_n) 4214 p1 += 2; 4215 break; 4216 4217 default: 4218 /* do nothing */ ; 4219 } 4220 p1 += mcnt; 4221 4222 /* If the next operation is a jump backwards in the pattern 4223 to an on_failure_jump right before the start_memory 4224 corresponding to this stop_memory, exit from the loop 4225 by forcing a failure after pushing on the stack the 4226 on_failure_jump's jump in the pattern, and d. */ 4227 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump 4228 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) 4229 { 4230 /* If this group ever matched anything, then restore 4231 what its registers were before trying this last 4232 failed match, e.g., with `(a*)*b' against `ab' for 4233 regstart[1], and, e.g., with `((a*)*(b*)*)*' 4234 against `aba' for regend[3]. 4235 4236 Also restore the registers for inner groups for, 4237 e.g., `((a*)(b*))*' against `aba' (register 3 would 4238 otherwise get trashed). */ 4239 4240 if (EVER_MATCHED_SOMETHING (reg_info[*p])) 4241 { 4242 unsigned r; 4243 4244 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; 4245 4246 /* Restore this and inner groups' (if any) registers. */ 4247 for (r = *p; r < *p + *(p + 1); r++) 4248 { 4249 regstart[r] = old_regstart[r]; 4250 4251 /* xx why this test? */ 4252 if (old_regend[r] >= regstart[r]) 4253 regend[r] = old_regend[r]; 4254 } 4255 } 4256 p1++; 4257 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4258 PUSH_FAILURE_POINT (p1 + mcnt, d, -2); 4259 4260 goto fail; 4261 } 4262 } 4263 4264 /* Move past the register number and the inner group count. */ 4265 p += 2; 4266 break; 4267 4268 4269 /* \<digit> has been turned into a `duplicate' command which is 4270 followed by the numeric value of <digit> as the register number. */ 4271 case duplicate: 4272 { 4273 register const char *d2, *dend2; 4274 int regno = *p++; /* Get which register to match against. */ 4275 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno); 4276 4277 /* Can't back reference a group which we've never matched. */ 4278 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno])) 4279 goto fail; 4280 4281 /* Where in input to try to start matching. */ 4282 d2 = regstart[regno]; 4283 4284 /* Where to stop matching; if both the place to start and 4285 the place to stop matching are in the same string, then 4286 set to the place to stop, otherwise, for now have to use 4287 the end of the first string. */ 4288 4289 dend2 = ((FIRST_STRING_P (regstart[regno]) 4290 == FIRST_STRING_P (regend[regno])) 4291 ? regend[regno] : end_match_1); 4292 for (;;) 4293 { 4294 /* If necessary, advance to next segment in register 4295 contents. */ 4296 while (d2 == dend2) 4297 { 4298 if (dend2 == end_match_2) break; 4299 if (dend2 == regend[regno]) break; 4300 4301 /* End of string1 => advance to string2. */ 4302 d2 = string2; 4303 dend2 = regend[regno]; 4304 } 4305 /* At end of register contents => success */ 4306 if (d2 == dend2) break; 4307 4308 /* If necessary, advance to next segment in data. */ 4309 PREFETCH (); 4310 4311 /* How many characters left in this segment to match. */ 4312 mcnt = dend - d; 4313 4314 /* Want how many consecutive characters we can match in 4315 one shot, so, if necessary, adjust the count. */ 4316 if (mcnt > dend2 - d2) 4317 mcnt = dend2 - d2; 4318 4319 /* Compare that many; failure if mismatch, else move 4320 past them. */ 4321 if (translate 4322 ? bcmp_translate (d, d2, mcnt, translate) 4323 : bcmp (d, d2, mcnt)) 4324 goto fail; 4325 d += mcnt, d2 += mcnt; 4326 4327 /* Do this because we've match some characters. */ 4328 SET_REGS_MATCHED (); 4329 } 4330 } 4331 break; 4332 4333 4334 /* begline matches the empty string at the beginning of the string 4335 (unless `not_bol' is set in `bufp'), and, if 4336 `newline_anchor' is set, after newlines. */ 4337 case begline: 4338 DEBUG_PRINT1 ("EXECUTING begline.\n"); 4339 4340 if (AT_STRINGS_BEG (d)) 4341 { 4342 if (!bufp->not_bol) break; 4343 } 4344 else if (d[-1] == '\n' && bufp->newline_anchor) 4345 { 4346 break; 4347 } 4348 /* In all other cases, we fail. */ 4349 goto fail; 4350 4351 4352 /* endline is the dual of begline. */ 4353 case endline: 4354 DEBUG_PRINT1 ("EXECUTING endline.\n"); 4355 4356 if (AT_STRINGS_END (d)) 4357 { 4358 if (!bufp->not_eol) break; 4359 } 4360 4361 /* We have to ``prefetch'' the next character. */ 4362 else if ((d == end1 ? *string2 : *d) == '\n' 4363 && bufp->newline_anchor) 4364 { 4365 break; 4366 } 4367 goto fail; 4368 4369 4370 /* Match at the very beginning of the data. */ 4371 case begbuf: 4372 DEBUG_PRINT1 ("EXECUTING begbuf.\n"); 4373 if (AT_STRINGS_BEG (d)) 4374 break; 4375 goto fail; 4376 4377 4378 /* Match at the very end of the data. */ 4379 case endbuf: 4380 DEBUG_PRINT1 ("EXECUTING endbuf.\n"); 4381 if (AT_STRINGS_END (d)) 4382 break; 4383 goto fail; 4384 4385 4386 /* on_failure_keep_string_jump is used to optimize `.*\n'. It 4387 pushes NULL as the value for the string on the stack. Then 4388 `pop_failure_point' will keep the current value for the 4389 string, instead of restoring it. To see why, consider 4390 matching `foo\nbar' against `.*\n'. The .* matches the foo; 4391 then the . fails against the \n. But the next thing we want 4392 to do is match the \n against the \n; if we restored the 4393 string value, we would be back at the foo. 4394 4395 Because this is used only in specific cases, we don't need to 4396 check all the things that `on_failure_jump' does, to make 4397 sure the right things get saved on the stack. Hence we don't 4398 share its code. The only reason to push anything on the 4399 stack at all is that otherwise we would have to change 4400 `anychar's code to do something besides goto fail in this 4401 case; that seems worse than this. */ 4402 case on_failure_keep_string_jump: 4403 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump"); 4404 4405 EXTRACT_NUMBER_AND_INCR (mcnt, p); 4406 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt); 4407 4408 PUSH_FAILURE_POINT (p + mcnt, NULL, -2); 4409 break; 4410 4411 4412 /* Uses of on_failure_jump: 4413 4414 Each alternative starts with an on_failure_jump that points 4415 to the beginning of the next alternative. Each alternative 4416 except the last ends with a jump that in effect jumps past 4417 the rest of the alternatives. (They really jump to the 4418 ending jump of the following alternative, because tensioning 4419 these jumps is a hassle.) 4420 4421 Repeats start with an on_failure_jump that points past both 4422 the repetition text and either the following jump or 4423 pop_failure_jump back to this on_failure_jump. */ 4424 case on_failure_jump: 4425 on_failure: 4426 DEBUG_PRINT1 ("EXECUTING on_failure_jump"); 4427 4428 EXTRACT_NUMBER_AND_INCR (mcnt, p); 4429 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt); 4430 4431 /* If this on_failure_jump comes right before a group (i.e., 4432 the original * applied to a group), save the information 4433 for that group and all inner ones, so that if we fail back 4434 to this point, the group's information will be correct. 4435 For example, in \(a*\)*\1, we need the preceding group, 4436 and in \(zz\(a*\)b*\)\2, we need the inner group. */ 4437 4438 /* We can't use `p' to check ahead because we push 4439 a failure point to `p + mcnt' after we do this. */ 4440 p1 = p; 4441 4442 /* We need to skip no_op's before we look for the 4443 start_memory in case this on_failure_jump is happening as 4444 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 4445 against aba. */ 4446 while (p1 < pend && (re_opcode_t) *p1 == no_op) 4447 p1++; 4448 4449 if (p1 < pend && (re_opcode_t) *p1 == start_memory) 4450 { 4451 /* We have a new highest active register now. This will 4452 get reset at the start_memory we are about to get to, 4453 but we will have saved all the registers relevant to 4454 this repetition op, as described above. */ 4455 highest_active_reg = *(p1 + 1) + *(p1 + 2); 4456 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) 4457 lowest_active_reg = *(p1 + 1); 4458 } 4459 4460 DEBUG_PRINT1 (":\n"); 4461 PUSH_FAILURE_POINT (p + mcnt, d, -2); 4462 break; 4463 4464 4465 /* A smart repeat ends with `maybe_pop_jump'. 4466 We change it to either `pop_failure_jump' or `jump'. */ 4467 case maybe_pop_jump: 4468 EXTRACT_NUMBER_AND_INCR (mcnt, p); 4469 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); 4470 { 4471 register unsigned char *p2 = p; 4472 4473 /* Compare the beginning of the repeat with what in the 4474 pattern follows its end. If we can establish that there 4475 is nothing that they would both match, i.e., that we 4476 would have to backtrack because of (as in, e.g., `a*a') 4477 then we can change to pop_failure_jump, because we'll 4478 never have to backtrack. 4479 4480 This is not true in the case of alternatives: in 4481 `(a|ab)*' we do need to backtrack to the `ab' alternative 4482 (e.g., if the string was `ab'). But instead of trying to 4483 detect that here, the alternative has put on a dummy 4484 failure point which is what we will end up popping. */ 4485 4486 /* Skip over open/close-group commands. 4487 If what follows this loop is a ...+ construct, 4488 look at what begins its body, since we will have to 4489 match at least one of that. */ 4490 while (1) 4491 { 4492 if (p2 + 2 < pend 4493 && ((re_opcode_t) *p2 == stop_memory 4494 || (re_opcode_t) *p2 == start_memory)) 4495 p2 += 3; 4496 else if (p2 + 6 < pend 4497 && (re_opcode_t) *p2 == dummy_failure_jump) 4498 p2 += 6; 4499 else 4500 break; 4501 } 4502 4503 p1 = p + mcnt; 4504 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding 4505 to the `maybe_finalize_jump' of this case. Examine what 4506 follows. */ 4507 4508 /* If we're at the end of the pattern, we can change. */ 4509 if (p2 == pend) 4510 { 4511 /* Consider what happens when matching ":\(.*\)" 4512 against ":/". I don't really understand this code 4513 yet. */ 4514 p[-3] = (unsigned char) pop_failure_jump; 4515 DEBUG_PRINT1 4516 (" End of pattern: change to `pop_failure_jump'.\n"); 4517 } 4518 4519 else if ((re_opcode_t) *p2 == exactn 4520 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) 4521 { 4522 register unsigned char c 4523 = *p2 == (unsigned char) endline ? '\n' : p2[2]; 4524 4525 if ((re_opcode_t) p1[3] == exactn && p1[5] != c) 4526 { 4527 p[-3] = (unsigned char) pop_failure_jump; 4528 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", 4529 c, p1[5]); 4530 } 4531 4532 else if ((re_opcode_t) p1[3] == charset 4533 || (re_opcode_t) p1[3] == charset_not) 4534 { 4535 int not = (re_opcode_t) p1[3] == charset_not; 4536 4537 if (c < (unsigned char) (p1[4] * BYTEWIDTH) 4538 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 4539 not = !not; 4540 4541 /* `not' is equal to 1 if c would match, which means 4542 that we can't change to pop_failure_jump. */ 4543 if (!not) 4544 { 4545 p[-3] = (unsigned char) pop_failure_jump; 4546 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 4547 } 4548 } 4549 } 4550 else if ((re_opcode_t) *p2 == charset) 4551 { 4552 #ifdef DEBUG 4553 register unsigned char c 4554 = *p2 == (unsigned char) endline ? '\n' : p2[2]; 4555 #endif 4556 4557 if ((re_opcode_t) p1[3] == exactn 4558 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5] 4559 && (p2[2 + p1[5] / BYTEWIDTH] 4560 & (1 << (p1[5] % BYTEWIDTH))))) 4561 { 4562 p[-3] = (unsigned char) pop_failure_jump; 4563 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", 4564 c, p1[5]); 4565 } 4566 4567 else if ((re_opcode_t) p1[3] == charset_not) 4568 { 4569 int idx; 4570 /* We win if the charset_not inside the loop 4571 lists every character listed in the charset after. */ 4572 for (idx = 0; idx < (int) p2[1]; idx++) 4573 if (! (p2[2 + idx] == 0 4574 || (idx < (int) p1[4] 4575 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0)))) 4576 break; 4577 4578 if (idx == p2[1]) 4579 { 4580 p[-3] = (unsigned char) pop_failure_jump; 4581 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 4582 } 4583 } 4584 else if ((re_opcode_t) p1[3] == charset) 4585 { 4586 int idx; 4587 /* We win if the charset inside the loop 4588 has no overlap with the one after the loop. */ 4589 for (idx = 0; 4590 idx < (int) p2[1] && idx < (int) p1[4]; 4591 idx++) 4592 if ((p2[2 + idx] & p1[5 + idx]) != 0) 4593 break; 4594 4595 if (idx == p2[1] || idx == p1[4]) 4596 { 4597 p[-3] = (unsigned char) pop_failure_jump; 4598 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 4599 } 4600 } 4601 } 4602 } 4603 p -= 2; /* Point at relative address again. */ 4604 if ((re_opcode_t) p[-1] != pop_failure_jump) 4605 { 4606 p[-1] = (unsigned char) jump; 4607 DEBUG_PRINT1 (" Match => jump.\n"); 4608 goto unconditional_jump; 4609 } 4610 /* Note fall through. */ 4611 4612 4613 /* The end of a simple repeat has a pop_failure_jump back to 4614 its matching on_failure_jump, where the latter will push a 4615 failure point. The pop_failure_jump takes off failure 4616 points put on by this pop_failure_jump's matching 4617 on_failure_jump; we got through the pattern to here from the 4618 matching on_failure_jump, so didn't fail. */ 4619 case pop_failure_jump: 4620 { 4621 /* We need to pass separate storage for the lowest and 4622 highest registers, even though we don't care about the 4623 actual values. Otherwise, we will restore only one 4624 register from the stack, since lowest will == highest in 4625 `pop_failure_point'. */ 4626 unsigned dummy_low_reg, dummy_high_reg; 4627 unsigned char *pdummy; 4628 const char *sdummy; 4629 4630 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n"); 4631 POP_FAILURE_POINT (sdummy, pdummy, 4632 dummy_low_reg, dummy_high_reg, 4633 reg_dummy, reg_dummy, reg_info_dummy); 4634 } 4635 /* Note fall through. */ 4636 4637 4638 /* Unconditionally jump (without popping any failure points). */ 4639 case jump: 4640 unconditional_jump: 4641 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ 4642 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); 4643 p += mcnt; /* Do the jump. */ 4644 DEBUG_PRINT2 ("(to 0x%x).\n", p); 4645 break; 4646 4647 4648 /* We need this opcode so we can detect where alternatives end 4649 in `group_match_null_string_p' et al. */ 4650 case jump_past_alt: 4651 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n"); 4652 goto unconditional_jump; 4653 4654 4655 /* Normally, the on_failure_jump pushes a failure point, which 4656 then gets popped at pop_failure_jump. We will end up at 4657 pop_failure_jump, also, and with a pattern of, say, `a+', we 4658 are skipping over the on_failure_jump, so we have to push 4659 something meaningless for pop_failure_jump to pop. */ 4660 case dummy_failure_jump: 4661 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n"); 4662 /* It doesn't matter what we push for the string here. What 4663 the code at `fail' tests is the value for the pattern. */ 4664 PUSH_FAILURE_POINT (0, 0, -2); 4665 goto unconditional_jump; 4666 4667 4668 /* At the end of an alternative, we need to push a dummy failure 4669 point in case we are followed by a `pop_failure_jump', because 4670 we don't want the failure point for the alternative to be 4671 popped. For example, matching `(a|ab)*' against `aab' 4672 requires that we match the `ab' alternative. */ 4673 case push_dummy_failure: 4674 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n"); 4675 /* See comments just above at `dummy_failure_jump' about the 4676 two zeroes. */ 4677 PUSH_FAILURE_POINT (0, 0, -2); 4678 break; 4679 4680 /* Have to succeed matching what follows at least n times. 4681 After that, handle like `on_failure_jump'. */ 4682 case succeed_n: 4683 EXTRACT_NUMBER (mcnt, p + 2); 4684 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); 4685 4686 assert (mcnt >= 0); 4687 /* Originally, this is how many times we HAVE to succeed. */ 4688 if (mcnt > 0) 4689 { 4690 mcnt--; 4691 p += 2; 4692 STORE_NUMBER_AND_INCR (p, mcnt); 4693 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt); 4694 } 4695 else if (mcnt == 0) 4696 { 4697 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2); 4698 p[2] = (unsigned char) no_op; 4699 p[3] = (unsigned char) no_op; 4700 goto on_failure; 4701 } 4702 break; 4703 4704 case jump_n: 4705 EXTRACT_NUMBER (mcnt, p + 2); 4706 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); 4707 4708 /* Originally, this is how many times we CAN jump. */ 4709 if (mcnt) 4710 { 4711 mcnt--; 4712 STORE_NUMBER (p + 2, mcnt); 4713 goto unconditional_jump; 4714 } 4715 /* If don't have to jump any more, skip over the rest of command. */ 4716 else 4717 p += 4; 4718 break; 4719 4720 case set_number_at: 4721 { 4722 DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); 4723 4724 EXTRACT_NUMBER_AND_INCR (mcnt, p); 4725 p1 = p + mcnt; 4726 EXTRACT_NUMBER_AND_INCR (mcnt, p); 4727 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt); 4728 STORE_NUMBER (p1, mcnt); 4729 break; 4730 } 4731 4732 #if 0 4733 /* The DEC Alpha C compiler 3.x generates incorrect code for the 4734 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of 4735 AT_WORD_BOUNDARY, so this code is disabled. Expanding the 4736 macro and introducing temporary variables works around the bug. */ 4737 4738 case wordbound: 4739 DEBUG_PRINT1 ("EXECUTING wordbound.\n"); 4740 if (AT_WORD_BOUNDARY (d)) 4741 break; 4742 goto fail; 4743 4744 case notwordbound: 4745 DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); 4746 if (AT_WORD_BOUNDARY (d)) 4747 goto fail; 4748 break; 4749 #else 4750 case wordbound: 4751 { 4752 boolean prevchar, thischar; 4753 4754 DEBUG_PRINT1 ("EXECUTING wordbound.\n"); 4755 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)) 4756 break; 4757 4758 prevchar = WORDCHAR_P (d - 1); 4759 thischar = WORDCHAR_P (d); 4760 if (prevchar != thischar) 4761 break; 4762 goto fail; 4763 } 4764 4765 case notwordbound: 4766 { 4767 boolean prevchar, thischar; 4768 4769 DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); 4770 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)) 4771 goto fail; 4772 4773 prevchar = WORDCHAR_P (d - 1); 4774 thischar = WORDCHAR_P (d); 4775 if (prevchar != thischar) 4776 goto fail; 4777 break; 4778 } 4779 #endif 4780 4781 case wordbeg: 4782 DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); 4783 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) 4784 break; 4785 goto fail; 4786 4787 case wordend: 4788 DEBUG_PRINT1 ("EXECUTING wordend.\n"); 4789 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) 4790 && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) 4791 break; 4792 goto fail; 4793 4794 #ifdef emacs 4795 case before_dot: 4796 DEBUG_PRINT1 ("EXECUTING before_dot.\n"); 4797 if (PTR_CHAR_POS ((unsigned char *) d) >= PT) 4798 goto fail; 4799 break; 4800 4801 case at_dot: 4802 DEBUG_PRINT1 ("EXECUTING at_dot.\n"); 4803 if (PTR_CHAR_POS ((unsigned char *) d) != PT) 4804 goto fail; 4805 break; 4806 4807 case after_dot: 4808 DEBUG_PRINT1 ("EXECUTING after_dot.\n"); 4809 if (PTR_CHAR_POS ((unsigned char *) d) <= PT) 4810 goto fail; 4811 break; 4812 4813 case syntaxspec: 4814 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); 4815 mcnt = *p++; 4816 goto matchsyntax; 4817 4818 case wordchar: 4819 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n"); 4820 mcnt = (int) Sword; 4821 matchsyntax: 4822 PREFETCH (); 4823 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */ 4824 d++; 4825 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt) 4826 goto fail; 4827 SET_REGS_MATCHED (); 4828 break; 4829 4830 case notsyntaxspec: 4831 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); 4832 mcnt = *p++; 4833 goto matchnotsyntax; 4834 4835 case notwordchar: 4836 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n"); 4837 mcnt = (int) Sword; 4838 matchnotsyntax: 4839 PREFETCH (); 4840 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */ 4841 d++; 4842 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt) 4843 goto fail; 4844 SET_REGS_MATCHED (); 4845 break; 4846 4847 #else /* not emacs */ 4848 case wordchar: 4849 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); 4850 PREFETCH (); 4851 if (!WORDCHAR_P (d)) 4852 goto fail; 4853 SET_REGS_MATCHED (); 4854 d++; 4855 break; 4856 4857 case notwordchar: 4858 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); 4859 PREFETCH (); 4860 if (WORDCHAR_P (d)) 4861 goto fail; 4862 SET_REGS_MATCHED (); 4863 d++; 4864 break; 4865 #endif /* not emacs */ 4866 4867 default: 4868 abort (); 4869 } 4870 continue; /* Successfully executed one pattern command; keep going. */ 4871 4872 4873 /* We goto here if a matching operation fails. */ 4874 fail: 4875 if (!FAIL_STACK_EMPTY ()) 4876 { /* A restart point is known. Restore to that state. */ 4877 DEBUG_PRINT1 ("\nFAIL:\n"); 4878 POP_FAILURE_POINT (d, p, 4879 lowest_active_reg, highest_active_reg, 4880 regstart, regend, reg_info); 4881 4882 /* If this failure point is a dummy, try the next one. */ 4883 if (!p) 4884 goto fail; 4885 4886 /* If we failed to the end of the pattern, don't examine *p. */ 4887 assert (p <= pend); 4888 if (p < pend) 4889 { 4890 boolean is_a_jump_n = false; 4891 4892 /* If failed to a backwards jump that's part of a repetition 4893 loop, need to pop this failure point and use the next one. */ 4894 switch ((re_opcode_t) *p) 4895 { 4896 case jump_n: 4897 is_a_jump_n = true; 4898 case maybe_pop_jump: 4899 case pop_failure_jump: 4900 case jump: 4901 p1 = p + 1; 4902 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4903 p1 += mcnt; 4904 4905 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) 4906 || (!is_a_jump_n 4907 && (re_opcode_t) *p1 == on_failure_jump)) 4908 goto fail; 4909 break; 4910 default: 4911 /* do nothing */ ; 4912 } 4913 } 4914 4915 if (d >= string1 && d <= end1) 4916 dend = end_match_1; 4917 } 4918 else 4919 break; /* Matching at this starting point really fails. */ 4920 } /* for (;;) */ 4921 4922 if (best_regs_set) 4923 goto restore_best_regs; 4924 4925 FREE_VARIABLES (); 4926 4927 return -1; /* Failure to match. */ 4928 } /* re_match_2 */ 4929 4930 /* Subroutine definitions for re_match_2. */ 4932 4933 4934 /* We are passed P pointing to a register number after a start_memory. 4935 4936 Return true if the pattern up to the corresponding stop_memory can 4937 match the empty string, and false otherwise. 4938 4939 If we find the matching stop_memory, sets P to point to one past its number. 4940 Otherwise, sets P to an undefined byte less than or equal to END. 4941 4942 We don't handle duplicates properly (yet). */ 4943 4944 static boolean 4945 group_match_null_string_p (p, end, reg_info) 4946 unsigned char **p, *end; 4947 register_info_type *reg_info; 4948 { 4949 int mcnt; 4950 /* Point to after the args to the start_memory. */ 4951 unsigned char *p1 = *p + 2; 4952 4953 while (p1 < end) 4954 { 4955 /* Skip over opcodes that can match nothing, and return true or 4956 false, as appropriate, when we get to one that can't, or to the 4957 matching stop_memory. */ 4958 4959 switch ((re_opcode_t) *p1) 4960 { 4961 /* Could be either a loop or a series of alternatives. */ 4962 case on_failure_jump: 4963 p1++; 4964 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4965 4966 /* If the next operation is not a jump backwards in the 4967 pattern. */ 4968 4969 if (mcnt >= 0) 4970 { 4971 /* Go through the on_failure_jumps of the alternatives, 4972 seeing if any of the alternatives cannot match nothing. 4973 The last alternative starts with only a jump, 4974 whereas the rest start with on_failure_jump and end 4975 with a jump, e.g., here is the pattern for `a|b|c': 4976 4977 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 4978 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 4979 /exactn/1/c 4980 4981 So, we have to first go through the first (n-1) 4982 alternatives and then deal with the last one separately. */ 4983 4984 4985 /* Deal with the first (n-1) alternatives, which start 4986 with an on_failure_jump (see above) that jumps to right 4987 past a jump_past_alt. */ 4988 4989 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) 4990 { 4991 /* `mcnt' holds how many bytes long the alternative 4992 is, including the ending `jump_past_alt' and 4993 its number. */ 4994 4995 if (!alt_match_null_string_p (p1, p1 + mcnt - 3, 4996 reg_info)) 4997 return false; 4998 4999 /* Move to right after this alternative, including the 5000 jump_past_alt. */ 5001 p1 += mcnt; 5002 5003 /* Break if it's the beginning of an n-th alternative 5004 that doesn't begin with an on_failure_jump. */ 5005 if ((re_opcode_t) *p1 != on_failure_jump) 5006 break; 5007 5008 /* Still have to check that it's not an n-th 5009 alternative that starts with an on_failure_jump. */ 5010 p1++; 5011 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 5012 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) 5013 { 5014 /* Get to the beginning of the n-th alternative. */ 5015 p1 -= 3; 5016 break; 5017 } 5018 } 5019 5020 /* Deal with the last alternative: go back and get number 5021 of the `jump_past_alt' just before it. `mcnt' contains 5022 the length of the alternative. */ 5023 EXTRACT_NUMBER (mcnt, p1 - 2); 5024 5025 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) 5026 return false; 5027 5028 p1 += mcnt; /* Get past the n-th alternative. */ 5029 } /* if mcnt > 0 */ 5030 break; 5031 5032 5033 case stop_memory: 5034 assert (p1[1] == **p); 5035 *p = p1 + 2; 5036 return true; 5037 5038 5039 default: 5040 if (!common_op_match_null_string_p (&p1, end, reg_info)) 5041 return false; 5042 } 5043 } /* while p1 < end */ 5044 5045 return false; 5046 } /* group_match_null_string_p */ 5047 5048 5049 /* Similar to group_match_null_string_p, but doesn't deal with alternatives: 5050 It expects P to be the first byte of a single alternative and END one 5051 byte past the last. The alternative can contain groups. */ 5052 5053 static boolean 5054 alt_match_null_string_p (p, end, reg_info) 5055 unsigned char *p, *end; 5056 register_info_type *reg_info; 5057 { 5058 int mcnt; 5059 unsigned char *p1 = p; 5060 5061 while (p1 < end) 5062 { 5063 /* Skip over opcodes that can match nothing, and break when we get 5064 to one that can't. */ 5065 5066 switch ((re_opcode_t) *p1) 5067 { 5068 /* It's a loop. */ 5069 case on_failure_jump: 5070 p1++; 5071 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 5072 p1 += mcnt; 5073 break; 5074 5075 default: 5076 if (!common_op_match_null_string_p (&p1, end, reg_info)) 5077 return false; 5078 } 5079 } /* while p1 < end */ 5080 5081 return true; 5082 } /* alt_match_null_string_p */ 5083 5084 5085 /* Deals with the ops common to group_match_null_string_p and 5086 alt_match_null_string_p. 5087 5088 Sets P to one after the op and its arguments, if any. */ 5089 5090 static boolean 5091 common_op_match_null_string_p (p, end, reg_info) 5092 unsigned char **p, *end; 5093 register_info_type *reg_info; 5094 { 5095 int mcnt; 5096 boolean ret; 5097 int reg_no; 5098 unsigned char *p1 = *p; 5099 5100 switch ((re_opcode_t) *p1++) 5101 { 5102 case no_op: 5103 case begline: 5104 case endline: 5105 case begbuf: 5106 case endbuf: 5107 case wordbeg: 5108 case wordend: 5109 case wordbound: 5110 case notwordbound: 5111 #ifdef emacs 5112 case before_dot: 5113 case at_dot: 5114 case after_dot: 5115 #endif 5116 break; 5117 5118 case start_memory: 5119 reg_no = *p1; 5120 assert (reg_no > 0 && reg_no <= MAX_REGNUM); 5121 ret = group_match_null_string_p (&p1, end, reg_info); 5122 5123 /* Have to set this here in case we're checking a group which 5124 contains a group and a back reference to it. */ 5125 5126 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE) 5127 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; 5128 5129 if (!ret) 5130 return false; 5131 break; 5132 5133 /* If this is an optimized succeed_n for zero times, make the jump. */ 5134 case jump: 5135 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 5136 if (mcnt >= 0) 5137 p1 += mcnt; 5138 else 5139 return false; 5140 break; 5141 5142 case succeed_n: 5143 /* Get to the number of times to succeed. */ 5144 p1 += 2; 5145 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 5146 5147 if (mcnt == 0) 5148 { 5149 p1 -= 4; 5150 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 5151 p1 += mcnt; 5152 } 5153 else 5154 return false; 5155 break; 5156 5157 case duplicate: 5158 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1])) 5159 return false; 5160 break; 5161 5162 case set_number_at: 5163 p1 += 4; 5164 5165 default: 5166 /* All other opcodes mean we cannot match the empty string. */ 5167 return false; 5168 } 5169 5170 *p = p1; 5171 return true; 5172 } /* common_op_match_null_string_p */ 5173 5174 5175 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN 5176 bytes; nonzero otherwise. */ 5177 5178 static int 5179 bcmp_translate (s1, s2, len, translate) 5180 unsigned char *s1, *s2; 5181 register int len; 5182 RE_TRANSLATE_TYPE translate; 5183 { 5184 register unsigned char *p1 = s1, *p2 = s2; 5185 while (len) 5186 { 5187 if (translate[*p1++] != translate[*p2++]) return 1; 5188 len--; 5189 } 5190 return 0; 5191 } 5192 5193 /* Entry points for GNU code. */ 5195 5196 /* re_compile_pattern is the GNU regular expression compiler: it 5197 compiles PATTERN (of length SIZE) and puts the result in BUFP. 5198 Returns 0 if the pattern was valid, otherwise an error string. 5199 5200 Assumes the `allocated' (and perhaps `buffer') and `translate' fields 5201 are set in BUFP on entry. 5202 5203 We call regex_compile to do the actual compilation. */ 5204 5205 const char * 5206 re_compile_pattern (pattern, length, bufp) 5207 const char *pattern; 5208 int length; 5209 struct re_pattern_buffer *bufp; 5210 { 5211 reg_errcode_t ret; 5212 5213 /* GNU code is written to assume at least RE_NREGS registers will be set 5214 (and at least one extra will be -1). */ 5215 bufp->regs_allocated = REGS_UNALLOCATED; 5216 5217 /* And GNU code determines whether or not to get register information 5218 by passing null for the REGS argument to re_match, etc., not by 5219 setting no_sub. */ 5220 bufp->no_sub = 0; 5221 5222 /* Match anchors at newline. */ 5223 bufp->newline_anchor = 1; 5224 5225 ret = regex_compile (pattern, length, re_syntax_options, bufp); 5226 5227 if (!ret) 5228 return NULL; 5229 return gettext (re_error_msgid[(int) ret]); 5230 } 5231 5232 /* Entry points compatible with 4.2 BSD regex library. We don't define 5234 them unless specifically requested. */ 5235 5236 #if defined (_REGEX_RE_COMP) || defined (_LIBC) 5237 5238 /* BSD has one and only one pattern buffer. */ 5239 static struct re_pattern_buffer re_comp_buf; 5240 5241 char * 5242 #ifdef _LIBC 5243 /* Make these definitions weak in libc, so POSIX programs can redefine 5244 these names if they don't use our functions, and still use 5245 regcomp/regexec below without link errors. */ 5246 weak_function 5247 #endif 5248 re_comp (s) 5249 const char *s; 5250 { 5251 reg_errcode_t ret; 5252 5253 if (!s) 5254 { 5255 if (!re_comp_buf.buffer) 5256 return gettext ("No previous regular expression"); 5257 return 0; 5258 } 5259 5260 if (!re_comp_buf.buffer) 5261 { 5262 re_comp_buf.buffer = (unsigned char *) malloc (200); 5263 if (re_comp_buf.buffer == NULL) 5264 return gettext (re_error_msgid[(int) REG_ESPACE]); 5265 re_comp_buf.allocated = 200; 5266 5267 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH); 5268 if (re_comp_buf.fastmap == NULL) 5269 return gettext (re_error_msgid[(int) REG_ESPACE]); 5270 } 5271 5272 /* Since `re_exec' always passes NULL for the `regs' argument, we 5273 don't need to initialize the pattern buffer fields which affect it. */ 5274 5275 /* Match anchors at newlines. */ 5276 re_comp_buf.newline_anchor = 1; 5277 5278 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); 5279 5280 if (!ret) 5281 return NULL; 5282 5283 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 5284 return (char *) gettext (re_error_msgid[(int) ret]); 5285 } 5286 5287 5288 int 5289 #ifdef _LIBC 5290 weak_function 5291 #endif 5292 re_exec (s) 5293 const char *s; 5294 { 5295 const int len = strlen (s); 5296 return 5297 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0); 5298 } 5299 #endif /* _REGEX_RE_COMP */ 5300 5301 /* POSIX.2 functions. Don't define these for Emacs. */ 5303 5304 #ifndef emacs 5305 5306 /* regcomp takes a regular expression as a string and compiles it. 5307 5308 PREG is a regex_t *. We do not expect any fields to be initialized, 5309 since POSIX says we shouldn't. Thus, we set 5310 5311 `buffer' to the compiled pattern; 5312 `used' to the length of the compiled pattern; 5313 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 5314 REG_EXTENDED bit in CFLAGS is set; otherwise, to 5315 RE_SYNTAX_POSIX_BASIC; 5316 `newline_anchor' to REG_NEWLINE being set in CFLAGS; 5317 `fastmap' and `fastmap_accurate' to zero; 5318 `re_nsub' to the number of subexpressions in PATTERN. 5319 5320 PATTERN is the address of the pattern string. 5321 5322 CFLAGS is a series of bits which affect compilation. 5323 5324 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 5325 use POSIX basic syntax. 5326 5327 If REG_NEWLINE is set, then . and [^...] don't match newline. 5328 Also, regexec will try a match beginning after every newline. 5329 5330 If REG_ICASE is set, then we considers upper- and lowercase 5331 versions of letters to be equivalent when matching. 5332 5333 If REG_NOSUB is set, then when PREG is passed to regexec, that 5334 routine will report only success or failure, and nothing about the 5335 registers. 5336 5337 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 5338 the return codes and their meanings.) */ 5339 5340 int 5341 regcomp (preg, pattern, cflags) 5342 regex_t *preg; 5343 const char *pattern; 5344 int cflags; 5345 { 5346 reg_errcode_t ret; 5347 unsigned syntax 5348 = (cflags & REG_EXTENDED) ? 5349 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC; 5350 5351 /* regex_compile will allocate the space for the compiled pattern. */ 5352 preg->buffer = 0; 5353 preg->allocated = 0; 5354 preg->used = 0; 5355 5356 /* Don't bother to use a fastmap when searching. This simplifies the 5357 REG_NEWLINE case: if we used a fastmap, we'd have to put all the 5358 characters after newlines into the fastmap. This way, we just try 5359 every character. */ 5360 preg->fastmap = 0; 5361 5362 if (cflags & REG_ICASE) 5363 { 5364 unsigned i; 5365 5366 preg->translate 5367 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE 5368 * sizeof (*(RE_TRANSLATE_TYPE)0)); 5369 if (preg->translate == NULL) 5370 return (int) REG_ESPACE; 5371 5372 /* Map uppercase characters to corresponding lowercase ones. */ 5373 for (i = 0; i < CHAR_SET_SIZE; i++) 5374 preg->translate[i] = ISUPPER (i) ? tolower (i) : i; 5375 } 5376 else 5377 preg->translate = NULL; 5378 5379 /* If REG_NEWLINE is set, newlines are treated differently. */ 5380 if (cflags & REG_NEWLINE) 5381 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 5382 syntax &= ~RE_DOT_NEWLINE; 5383 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 5384 /* It also changes the matching behavior. */ 5385 preg->newline_anchor = 1; 5386 } 5387 else 5388 preg->newline_anchor = 0; 5389 5390 preg->no_sub = !!(cflags & REG_NOSUB); 5391 5392 /* POSIX says a null character in the pattern terminates it, so we 5393 can use strlen here in compiling the pattern. */ 5394 ret = regex_compile (pattern, strlen (pattern), syntax, preg); 5395 5396 /* POSIX doesn't distinguish between an unmatched open-group and an 5397 unmatched close-group: both are REG_EPAREN. */ 5398 if (ret == REG_ERPAREN) ret = REG_EPAREN; 5399 5400 return (int) ret; 5401 } 5402 5403 5404 /* regexec searches for a given pattern, specified by PREG, in the 5405 string STRING. 5406 5407 If NMATCH is zero or REG_NOSUB was set in the cflags argument to 5408 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at 5409 least NMATCH elements, and we set them to the offsets of the 5410 corresponding matched substrings. 5411 5412 EFLAGS specifies `execution flags' which affect matching: if 5413 REG_NOTBOL is set, then ^ does not match at the beginning of the 5414 string; if REG_NOTEOL is set, then $ does not match at the end. 5415 5416 We return 0 if we find a match and REG_NOMATCH if not. */ 5417 5418 int 5419 regexec (preg, string, nmatch, pmatch, eflags) 5420 const regex_t *preg; 5421 const char *string; 5422 size_t nmatch; 5423 regmatch_t pmatch[]; 5424 int eflags; 5425 { 5426 int ret; 5427 struct re_registers regs; 5428 regex_t private_preg; 5429 int len = strlen (string); 5430 boolean want_reg_info = !preg->no_sub && nmatch > 0; 5431 5432 private_preg = *preg; 5433 5434 private_preg.not_bol = !!(eflags & REG_NOTBOL); 5435 private_preg.not_eol = !!(eflags & REG_NOTEOL); 5436 5437 /* The user has told us exactly how many registers to return 5438 information about, via `nmatch'. We have to pass that on to the 5439 matching routines. */ 5440 private_preg.regs_allocated = REGS_FIXED; 5441 5442 if (want_reg_info) 5443 { 5444 regs.num_regs = nmatch; 5445 regs.start = TALLOC (nmatch, regoff_t); 5446 regs.end = TALLOC (nmatch, regoff_t); 5447 if (regs.start == NULL || regs.end == NULL) 5448 return (int) REG_NOMATCH; 5449 } 5450 5451 /* Perform the searching operation. */ 5452 ret = re_search (&private_preg, string, len, 5453 /* start: */ 0, /* range: */ len, 5454 want_reg_info ? ®s : (struct re_registers *) 0); 5455 5456 /* Copy the register information to the POSIX structure. */ 5457 if (want_reg_info) 5458 { 5459 if (ret >= 0) 5460 { 5461 unsigned r; 5462 5463 for (r = 0; r < nmatch; r++) 5464 { 5465 pmatch[r].rm_so = regs.start[r]; 5466 pmatch[r].rm_eo = regs.end[r]; 5467 } 5468 } 5469 5470 /* If we needed the temporary register info, free the space now. */ 5471 free (regs.start); 5472 free (regs.end); 5473 } 5474 5475 /* We want zero return to mean success, unlike `re_search'. */ 5476 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; 5477 } 5478 5479 5480 /* Returns a message corresponding to an error code, ERRCODE, returned 5481 from either regcomp or regexec. We don't use PREG here. */ 5482 5483 size_t 5484 regerror (errcode, preg, errbuf, errbuf_size) 5485 int errcode; 5486 const regex_t *preg; 5487 char *errbuf; 5488 size_t errbuf_size; 5489 { 5490 const char *msg; 5491 size_t msg_size; 5492 5493 if (errcode < 0 5494 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0]))) 5495 /* Only error codes returned by the rest of the code should be passed 5496 to this routine. If we are given anything else, or if other regex 5497 code generates an invalid error code, then the program has a bug. 5498 Dump core so we can fix it. */ 5499 abort (); 5500 5501 msg = gettext (re_error_msgid[errcode]); 5502 5503 msg_size = strlen (msg) + 1; /* Includes the null. */ 5504 5505 if (errbuf_size != 0) 5506 { 5507 if (msg_size > errbuf_size) 5508 { 5509 strncpy (errbuf, msg, errbuf_size - 1); 5510 errbuf[errbuf_size - 1] = 0; 5511 } 5512 else 5513 strcpy (errbuf, msg); 5514 } 5515 5516 return msg_size; 5517 } 5518 5519 5520 /* Free dynamically allocated space used by PREG. */ 5521 5522 void 5523 regfree (preg) 5524 regex_t *preg; 5525 { 5526 if (preg->buffer != NULL) 5527 free (preg->buffer); 5528 preg->buffer = NULL; 5529 5530 preg->allocated = 0; 5531 preg->used = 0; 5532 5533 if (preg->fastmap != NULL) 5534 free (preg->fastmap); 5535 preg->fastmap = NULL; 5536 preg->fastmap_accurate = 0; 5537 5538 if (preg->translate != NULL) 5539 free (preg->translate); 5540 preg->translate = NULL; 5541 } 5542 5543 #endif /* not emacs */ 5544