1 /* Extended regular expression matching and search library, 2 version 0.12. 3 (Implements POSIX draft P1003.2/D11.2, except for some of the 4 internationalization features.) 5 6 Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 7 2002, 2005, 2010, 2013 Free Software Foundation, Inc. 8 This file is part of the GNU C Library. 9 10 The GNU C Library is free software; you can redistribute it and/or 11 modify it under the terms of the GNU Lesser General Public 12 License as published by the Free Software Foundation; either 13 version 2.1 of the License, or (at your option) any later version. 14 15 The GNU C Library is distributed in the hope that it will be useful, 16 but WITHOUT ANY WARRANTY; without even the implied warranty of 17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 Lesser General Public License for more details. 19 20 You should have received a copy of the GNU Lesser General Public 21 License along with the GNU C Library; if not, write to the Free 22 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 23 02110-1301 USA. */ 24 25 /* This file has been modified for usage in libiberty. It includes "xregex.h" 26 instead of <regex.h>. The "xregex.h" header file renames all external 27 routines with an "x" prefix so they do not collide with the native regex 28 routines or with other components regex routines. */ 29 /* AIX requires this to be the first thing in the file. */ 30 #if defined _AIX && !defined __GNUC__ && !defined REGEX_MALLOC 31 #pragma alloca 32 #endif 33 34 #undef _GNU_SOURCE 35 #define _GNU_SOURCE 36 37 #ifndef INSIDE_RECURSION 38 # ifdef HAVE_CONFIG_H 39 # include <config.h> 40 # endif 41 #endif 42 43 #include <ansidecl.h> 44 45 #ifndef INSIDE_RECURSION 46 47 # if defined STDC_HEADERS && !defined emacs 48 # include <stddef.h> 49 # define PTR_INT_TYPE ptrdiff_t 50 # else 51 /* We need this for `regex.h', and perhaps for the Emacs include files. 
*/ 52 # include <sys/types.h> 53 # define PTR_INT_TYPE long 54 # endif 55 56 # define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC) 57 58 /* For platform which support the ISO C amendement 1 functionality we 59 support user defined character classes. */ 60 # if defined _LIBC || WIDE_CHAR_SUPPORT 61 /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */ 62 # include <wchar.h> 63 # include <wctype.h> 64 # endif 65 66 # ifdef _LIBC 67 /* We have to keep the namespace clean. */ 68 # define regfree(preg) __regfree (preg) 69 # define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef) 70 # define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags) 71 # define regerror(errcode, preg, errbuf, errbuf_size) \ 72 __regerror(errcode, preg, errbuf, errbuf_size) 73 # define re_set_registers(bu, re, nu, st, en) \ 74 __re_set_registers (bu, re, nu, st, en) 75 # define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \ 76 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) 77 # define re_match(bufp, string, size, pos, regs) \ 78 __re_match (bufp, string, size, pos, regs) 79 # define re_search(bufp, string, size, startpos, range, regs) \ 80 __re_search (bufp, string, size, startpos, range, regs) 81 # define re_compile_pattern(pattern, length, bufp) \ 82 __re_compile_pattern (pattern, length, bufp) 83 # define re_set_syntax(syntax) __re_set_syntax (syntax) 84 # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \ 85 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop) 86 # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp) 87 88 # define btowc __btowc 89 90 /* We are also using some library internals. */ 91 # include <locale/localeinfo.h> 92 # include <locale/elem-hash.h> 93 # include <langinfo.h> 94 # include <locale/coll-lookup.h> 95 # endif 96 97 /* This is for other GNU distributions with internationalized messages. 
*/ 98 # if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC 99 # include <libintl.h> 100 # ifdef _LIBC 101 # undef gettext 102 # define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES) 103 # endif 104 # else 105 # define gettext(msgid) (msgid) 106 # endif 107 108 # ifndef gettext_noop 109 /* This define is so xgettext can find the internationalizable 110 strings. */ 111 # define gettext_noop(String) String 112 # endif 113 114 /* The `emacs' switch turns on certain matching commands 115 that make sense only in Emacs. */ 116 # ifdef emacs 117 118 # include "lisp.h" 119 # include "buffer.h" 120 # include "syntax.h" 121 122 # else /* not emacs */ 123 124 /* If we are not linking with Emacs proper, 125 we can't use the relocating allocator 126 even if config.h says that we can. */ 127 # undef REL_ALLOC 128 129 # if defined STDC_HEADERS || defined _LIBC 130 # include <stdlib.h> 131 # else 132 char *malloc (); 133 char *realloc (); 134 # endif 135 136 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow. 137 If nothing else has been done, use the method below. */ 138 # ifdef INHIBIT_STRING_HEADER 139 # if !(defined HAVE_BZERO && defined HAVE_BCOPY) 140 # if !defined bzero && !defined bcopy 141 # undef INHIBIT_STRING_HEADER 142 # endif 143 # endif 144 # endif 145 146 /* This is the normal way of making sure we have a bcopy and a bzero. 147 This is used in most programs--a few other programs avoid this 148 by defining INHIBIT_STRING_HEADER. 
*/ 149 # ifndef INHIBIT_STRING_HEADER 150 # if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC 151 # include <string.h> 152 # ifndef bzero 153 # ifndef _LIBC 154 # define bzero(s, n) ((void) memset (s, '\0', n)) 155 # else 156 # define bzero(s, n) __bzero (s, n) 157 # endif 158 # endif 159 # else 160 # include <strings.h> 161 # ifndef memcmp 162 # define memcmp(s1, s2, n) bcmp (s1, s2, n) 163 # endif 164 # ifndef memcpy 165 # define memcpy(d, s, n) (bcopy (s, d, n), (d)) 166 # endif 167 # endif 168 # endif 169 170 /* Define the syntax stuff for \<, \>, etc. */ 171 172 /* This must be nonzero for the wordchar and notwordchar pattern 173 commands in re_match_2. */ 174 # ifndef Sword 175 # define Sword 1 176 # endif 177 178 # ifdef SWITCH_ENUM_BUG 179 # define SWITCH_ENUM_CAST(x) ((int)(x)) 180 # else 181 # define SWITCH_ENUM_CAST(x) (x) 182 # endif 183 184 # endif /* not emacs */ 185 186 # if defined _LIBC || HAVE_LIMITS_H 187 # include <limits.h> 188 # endif 189 190 # ifndef MB_LEN_MAX 191 # define MB_LEN_MAX 1 192 # endif 193 194 /* Get the interface, including the syntax bits. */ 196 # include "xregex.h" /* change for libiberty */ 197 198 /* isalpha etc. are used for the character classes. */ 199 # include <ctype.h> 200 201 /* Jim Meyering writes: 202 203 "... Some ctype macros are valid only for character codes that 204 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when 205 using /bin/cc or gcc but without giving an ansi option). So, all 206 ctype uses should be through macros like ISPRINT... If 207 STDC_HEADERS is defined, then autoconf has verified that the ctype 208 macros don't need to be guarded with references to isascii. ... 209 Defining isascii to 1 should let any compiler worth its salt 210 eliminate the && through constant folding." 211 Solaris defines some of these symbols so we must undefine them first. 
*/ 212 213 # undef ISASCII 214 # if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII) 215 # define ISASCII(c) 1 216 # else 217 # define ISASCII(c) isascii(c) 218 # endif 219 220 # ifdef isblank 221 # define ISBLANK(c) (ISASCII (c) && isblank (c)) 222 # else 223 # define ISBLANK(c) ((c) == ' ' || (c) == '\t') 224 # endif 225 # ifdef isgraph 226 # define ISGRAPH(c) (ISASCII (c) && isgraph (c)) 227 # else 228 # define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c)) 229 # endif 230 231 # undef ISPRINT 232 # define ISPRINT(c) (ISASCII (c) && isprint (c)) 233 # define ISDIGIT(c) (ISASCII (c) && isdigit (c)) 234 # define ISALNUM(c) (ISASCII (c) && isalnum (c)) 235 # define ISALPHA(c) (ISASCII (c) && isalpha (c)) 236 # define ISCNTRL(c) (ISASCII (c) && iscntrl (c)) 237 # define ISLOWER(c) (ISASCII (c) && islower (c)) 238 # define ISPUNCT(c) (ISASCII (c) && ispunct (c)) 239 # define ISSPACE(c) (ISASCII (c) && isspace (c)) 240 # define ISUPPER(c) (ISASCII (c) && isupper (c)) 241 # define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) 242 243 # ifdef _tolower 244 # define TOLOWER(c) _tolower(c) 245 # else 246 # define TOLOWER(c) tolower(c) 247 # endif 248 249 # ifndef NULL 250 # define NULL (void *)0 251 # endif 252 253 /* We remove any previous definition of `SIGN_EXTEND_CHAR', 254 since ours (we hope) works properly with all combinations of 255 machines, compilers, `char' and `unsigned char' argument types. 256 (Per Bothner suggested the basic approach.) */ 257 # undef SIGN_EXTEND_CHAR 258 # if __STDC__ 259 # define SIGN_EXTEND_CHAR(c) ((signed char) (c)) 260 # else /* not __STDC__ */ 261 /* As in Harbison and Steele. */ 262 # define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) 263 # endif 264 265 # ifndef emacs 267 /* How many characters in the character set. 
*/ 268 # define CHAR_SET_SIZE 256 269 270 # ifdef SYNTAX_TABLE 271 272 extern char *re_syntax_table; 273 274 # else /* not SYNTAX_TABLE */ 275 276 static char re_syntax_table[CHAR_SET_SIZE]; 277 278 static void init_syntax_once (void); 279 280 static void 281 init_syntax_once (void) 282 { 283 register int c; 284 static int done = 0; 285 286 if (done) 287 return; 288 bzero (re_syntax_table, sizeof re_syntax_table); 289 290 for (c = 0; c < CHAR_SET_SIZE; ++c) 291 if (ISALNUM (c)) 292 re_syntax_table[c] = Sword; 293 294 re_syntax_table['_'] = Sword; 295 296 done = 1; 297 } 298 299 # endif /* not SYNTAX_TABLE */ 300 301 # define SYNTAX(c) re_syntax_table[(unsigned char) (c)] 302 303 # endif /* emacs */ 304 305 /* Integer type for pointers. */ 307 # if !defined _LIBC && !defined HAVE_UINTPTR_T 308 typedef unsigned long int uintptr_t; 309 # endif 310 311 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we 312 use `alloca' instead of `malloc'. This is because using malloc in 313 re_search* or re_match* could cause memory leaks when C-g is used in 314 Emacs; also, malloc is slower and causes storage fragmentation. On 315 the other hand, malloc is more portable, and easier to debug. 316 317 Because we sometimes use alloca, some routines have to be macros, 318 not functions -- `alloca'-allocated space disappears at the end of the 319 function it is called in. */ 320 321 # ifdef REGEX_MALLOC 322 323 # define REGEX_ALLOCATE malloc 324 # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) 325 # define REGEX_FREE free 326 327 # else /* not REGEX_MALLOC */ 328 329 /* Emacs already defines alloca, sometimes. */ 330 # ifndef alloca 331 332 /* Make alloca work the best possible way. 
*/ 333 # ifdef __GNUC__ 334 # define alloca __builtin_alloca 335 # else /* not __GNUC__ */ 336 # if HAVE_ALLOCA_H 337 # include <alloca.h> 338 # endif /* HAVE_ALLOCA_H */ 339 # endif /* not __GNUC__ */ 340 341 # endif /* not alloca */ 342 343 # define REGEX_ALLOCATE alloca 344 345 /* Assumes a `char *destination' variable. */ 346 # define REGEX_REALLOCATE(source, osize, nsize) \ 347 (destination = (char *) alloca (nsize), \ 348 memcpy (destination, source, osize)) 349 350 /* No need to do anything to free, after alloca. */ 351 # define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */ 352 353 # endif /* not REGEX_MALLOC */ 354 355 /* Define how to allocate the failure stack. */ 356 357 # if defined REL_ALLOC && defined REGEX_MALLOC 358 359 # define REGEX_ALLOCATE_STACK(size) \ 360 r_alloc (&failure_stack_ptr, (size)) 361 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \ 362 r_re_alloc (&failure_stack_ptr, (nsize)) 363 # define REGEX_FREE_STACK(ptr) \ 364 r_alloc_free (&failure_stack_ptr) 365 366 # else /* not using relocating allocator */ 367 368 # ifdef REGEX_MALLOC 369 370 # define REGEX_ALLOCATE_STACK malloc 371 # define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize) 372 # define REGEX_FREE_STACK free 373 374 # else /* not REGEX_MALLOC */ 375 376 # define REGEX_ALLOCATE_STACK alloca 377 378 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \ 379 REGEX_REALLOCATE (source, osize, nsize) 380 /* No need to explicitly free anything. */ 381 # define REGEX_FREE_STACK(arg) 382 383 # endif /* not REGEX_MALLOC */ 384 # endif /* not using relocating allocator */ 385 386 387 /* True if `size1' is non-NULL and PTR is pointing anywhere inside 388 `string1' or just past its end. This works if PTR is NULL, which is 389 a good thing. */ 390 # define FIRST_STRING_P(ptr) \ 391 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) 392 393 /* (Re)Allocate N items of type T using malloc, or fail. 
*/ 394 # define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) 395 # define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) 396 # define RETALLOC_IF(addr, n, t) \ 397 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t) 398 # define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) 399 400 # define BYTEWIDTH 8 /* In bits. */ 401 402 # define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) 403 404 # undef MAX 405 # undef MIN 406 # define MAX(a, b) ((a) > (b) ? (a) : (b)) 407 # define MIN(a, b) ((a) < (b) ? (a) : (b)) 408 409 typedef char boolean; 410 # define false 0 411 # define true 1 412 413 static reg_errcode_t byte_regex_compile (const char *pattern, size_t size, 414 reg_syntax_t syntax, 415 struct re_pattern_buffer *bufp); 416 417 static int byte_re_match_2_internal (struct re_pattern_buffer *bufp, 418 const char *string1, int size1, 419 const char *string2, int size2, 420 int pos, 421 struct re_registers *regs, 422 int stop); 423 static int byte_re_search_2 (struct re_pattern_buffer *bufp, 424 const char *string1, int size1, 425 const char *string2, int size2, 426 int startpos, int range, 427 struct re_registers *regs, int stop); 428 static int byte_re_compile_fastmap (struct re_pattern_buffer *bufp); 429 430 #ifdef MBS_SUPPORT 431 static reg_errcode_t wcs_regex_compile (const char *pattern, size_t size, 432 reg_syntax_t syntax, 433 struct re_pattern_buffer *bufp); 434 435 436 static int wcs_re_match_2_internal (struct re_pattern_buffer *bufp, 437 const char *cstring1, int csize1, 438 const char *cstring2, int csize2, 439 int pos, 440 struct re_registers *regs, 441 int stop, 442 wchar_t *string1, int size1, 443 wchar_t *string2, int size2, 444 int *mbs_offset1, int *mbs_offset2); 445 static int wcs_re_search_2 (struct re_pattern_buffer *bufp, 446 const char *string1, int size1, 447 const char *string2, int size2, 448 int startpos, int range, 449 struct re_registers *regs, int stop); 450 static int wcs_re_compile_fastmap 
(struct re_pattern_buffer *bufp); 451 #endif 452 453 /* These are the command codes that appear in compiled regular 455 expressions. Some opcodes are followed by argument bytes. A 456 command code can specify any interpretation whatsoever for its 457 arguments. Zero bytes may appear in the compiled regular expression. */ 458 459 typedef enum 460 { 461 no_op = 0, 462 463 /* Succeed right away--no more backtracking. */ 464 succeed, 465 466 /* Followed by one byte giving n, then by n literal bytes. */ 467 exactn, 468 469 # ifdef MBS_SUPPORT 470 /* Same as exactn, but contains binary data. */ 471 exactn_bin, 472 # endif 473 474 /* Matches any (more or less) character. */ 475 anychar, 476 477 /* Matches any one char belonging to specified set. First 478 following byte is number of bitmap bytes. Then come bytes 479 for a bitmap saying which chars are in. Bits in each byte 480 are ordered low-bit-first. A character is in the set if its 481 bit is 1. A character too large to have a bit in the map is 482 automatically not in the set. */ 483 /* ifdef MBS_SUPPORT, following element is length of character 484 classes, length of collating symbols, length of equivalence 485 classes, length of character ranges, and length of characters. 486 Next, character class element, collating symbols elements, 487 equivalence class elements, range elements, and character 488 elements follow. 489 See regex_compile function. */ 490 charset, 491 492 /* Same parameters as charset, but match any character that is 493 not one of those specified. */ 494 charset_not, 495 496 /* Start remembering the text that is matched, for storing in a 497 register. Followed by one byte with the register number, in 498 the range 0 to one less than the pattern buffer's re_nsub 499 field. Then followed by one byte with the number of groups 500 inner to this one. (This last has to be part of the 501 start_memory only because we need it in the on_failure_jump 502 of re_match_2.) 
*/ 503 start_memory, 504 505 /* Stop remembering the text that is matched and store it in a 506 memory register. Followed by one byte with the register 507 number, in the range 0 to one less than `re_nsub' in the 508 pattern buffer, and one byte with the number of inner groups, 509 just like `start_memory'. (We need the number of inner 510 groups here because we don't have any easy way of finding the 511 corresponding start_memory when we're at a stop_memory.) */ 512 stop_memory, 513 514 /* Match a duplicate of something remembered. Followed by one 515 byte containing the register number. */ 516 duplicate, 517 518 /* Fail unless at beginning of line. */ 519 begline, 520 521 /* Fail unless at end of line. */ 522 endline, 523 524 /* Succeeds if at beginning of buffer (if emacs) or at beginning 525 of string to be matched (if not). */ 526 begbuf, 527 528 /* Analogously, for end of buffer/string. */ 529 endbuf, 530 531 /* Followed by two byte relative address to which to jump. */ 532 jump, 533 534 /* Same as jump, but marks the end of an alternative. */ 535 jump_past_alt, 536 537 /* Followed by two-byte relative address of place to resume at 538 in case of failure. */ 539 /* ifdef MBS_SUPPORT, the size of address is 1. */ 540 on_failure_jump, 541 542 /* Like on_failure_jump, but pushes a placeholder instead of the 543 current string position when executed. */ 544 on_failure_keep_string_jump, 545 546 /* Throw away latest failure point and then jump to following 547 two-byte relative address. */ 548 /* ifdef MBS_SUPPORT, the size of address is 1. */ 549 pop_failure_jump, 550 551 /* Change to pop_failure_jump if know won't have to backtrack to 552 match; otherwise change to jump. This is used to jump 553 back to the beginning of a repeat. If what follows this jump 554 clearly won't match what the repeat does, such that we can be 555 sure that there is no use backtracking out of repetitions 556 already matched, then we change it to a pop_failure_jump. 
557 Followed by two-byte address. */ 558 /* ifdef MBS_SUPPORT, the size of address is 1. */ 559 maybe_pop_jump, 560 561 /* Jump to following two-byte address, and push a dummy failure 562 point. This failure point will be thrown away if an attempt 563 is made to use it for a failure. A `+' construct makes this 564 before the first repeat. Also used as an intermediary kind 565 of jump when compiling an alternative. */ 566 /* ifdef MBS_SUPPORT, the size of address is 1. */ 567 dummy_failure_jump, 568 569 /* Push a dummy failure point and continue. Used at the end of 570 alternatives. */ 571 push_dummy_failure, 572 573 /* Followed by two-byte relative address and two-byte number n. 574 After matching N times, jump to the address upon failure. */ 575 /* ifdef MBS_SUPPORT, the size of address is 1. */ 576 succeed_n, 577 578 /* Followed by two-byte relative address, and two-byte number n. 579 Jump to the address N times, then fail. */ 580 /* ifdef MBS_SUPPORT, the size of address is 1. */ 581 jump_n, 582 583 /* Set the following two-byte relative address to the 584 subsequent two-byte number. The address *includes* the two 585 bytes of number. */ 586 /* ifdef MBS_SUPPORT, the size of address is 1. */ 587 set_number_at, 588 589 wordchar, /* Matches any word-constituent character. */ 590 notwordchar, /* Matches any char that is not a word-constituent. */ 591 592 wordbeg, /* Succeeds if at word beginning. */ 593 wordend, /* Succeeds if at word end. */ 594 595 wordbound, /* Succeeds if at a word boundary. */ 596 notwordbound /* Succeeds if not at a word boundary. */ 597 598 # ifdef emacs 599 ,before_dot, /* Succeeds if before point. */ 600 at_dot, /* Succeeds if at point. */ 601 after_dot, /* Succeeds if after point. */ 602 603 /* Matches any character whose syntax is specified. Followed by 604 a byte which contains a syntax code, e.g., Sword. */ 605 syntaxspec, 606 607 /* Matches any character whose syntax is not that specified. 
*/ 608 notsyntaxspec 609 # endif /* emacs */ 610 } re_opcode_t; 611 #endif /* not INSIDE_RECURSION */ 612 613 615 #ifdef BYTE 616 # define CHAR_T char 617 # define UCHAR_T unsigned char 618 # define COMPILED_BUFFER_VAR bufp->buffer 619 # define OFFSET_ADDRESS_SIZE 2 620 # define PREFIX(name) byte_##name 621 # define ARG_PREFIX(name) name 622 # define PUT_CHAR(c) putchar (c) 623 #else 624 # ifdef WCHAR 625 # define CHAR_T wchar_t 626 # define UCHAR_T wchar_t 627 # define COMPILED_BUFFER_VAR wc_buffer 628 # define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */ 629 # define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_T)+1) 630 # define PREFIX(name) wcs_##name 631 # define ARG_PREFIX(name) c##name 632 /* Should we use wide stream?? */ 633 # define PUT_CHAR(c) printf ("%C", c); 634 # define TRUE 1 635 # define FALSE 0 636 # else 637 # ifdef MBS_SUPPORT 638 # define WCHAR 639 # define INSIDE_RECURSION 640 # include "regex.c" 641 # undef INSIDE_RECURSION 642 # endif 643 # define BYTE 644 # define INSIDE_RECURSION 645 # include "regex.c" 646 # undef INSIDE_RECURSION 647 # endif 648 #endif 649 650 #ifdef INSIDE_RECURSION 651 /* Common operations on the compiled pattern. */ 652 653 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ 654 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */ 655 656 # ifdef WCHAR 657 # define STORE_NUMBER(destination, number) \ 658 do { \ 659 *(destination) = (UCHAR_T)(number); \ 660 } while (0) 661 # else /* BYTE */ 662 # define STORE_NUMBER(destination, number) \ 663 do { \ 664 (destination)[0] = (number) & 0377; \ 665 (destination)[1] = (number) >> 8; \ 666 } while (0) 667 # endif /* WCHAR */ 668 669 /* Same as STORE_NUMBER, except increment DESTINATION to 670 the byte after where the number is stored. Therefore, DESTINATION 671 must be an lvalue. */ 672 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. 
*/ 673 674 # define STORE_NUMBER_AND_INCR(destination, number) \ 675 do { \ 676 STORE_NUMBER (destination, number); \ 677 (destination) += OFFSET_ADDRESS_SIZE; \ 678 } while (0) 679 680 /* Put into DESTINATION a number stored in two contiguous bytes starting 681 at SOURCE. */ 682 /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */ 683 684 # ifdef WCHAR 685 # define EXTRACT_NUMBER(destination, source) \ 686 do { \ 687 (destination) = *(source); \ 688 } while (0) 689 # else /* BYTE */ 690 # define EXTRACT_NUMBER(destination, source) \ 691 do { \ 692 (destination) = *(source) & 0377; \ 693 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ 694 } while (0) 695 # endif 696 697 # ifdef DEBUG 698 static void PREFIX(extract_number) (int *dest, UCHAR_T *source); 699 static void 700 PREFIX(extract_number) (int *dest, UCHAR_T *source) 701 { 702 # ifdef WCHAR 703 *dest = *source; 704 # else /* BYTE */ 705 int temp = SIGN_EXTEND_CHAR (*(source + 1)); 706 *dest = *source & 0377; 707 *dest += temp << 8; 708 # endif 709 } 710 711 # ifndef EXTRACT_MACROS /* To debug the macros. */ 712 # undef EXTRACT_NUMBER 713 # define EXTRACT_NUMBER(dest, src) PREFIX(extract_number) (&dest, src) 714 # endif /* not EXTRACT_MACROS */ 715 716 # endif /* DEBUG */ 717 718 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. 719 SOURCE must be an lvalue. 
*/ 720 721 # define EXTRACT_NUMBER_AND_INCR(destination, source) \ 722 do { \ 723 EXTRACT_NUMBER (destination, source); \ 724 (source) += OFFSET_ADDRESS_SIZE; \ 725 } while (0) 726 727 # ifdef DEBUG 728 static void PREFIX(extract_number_and_incr) (int *destination, 729 UCHAR_T **source); 730 static void 731 PREFIX(extract_number_and_incr) (int *destination, UCHAR_T **source) 732 { 733 PREFIX(extract_number) (destination, *source); 734 *source += OFFSET_ADDRESS_SIZE; 735 } 736 737 # ifndef EXTRACT_MACROS 738 # undef EXTRACT_NUMBER_AND_INCR 739 # define EXTRACT_NUMBER_AND_INCR(dest, src) \ 740 PREFIX(extract_number_and_incr) (&dest, &src) 741 # endif /* not EXTRACT_MACROS */ 742 743 # endif /* DEBUG */ 744 745 746 748 /* If DEBUG is defined, Regex prints many voluminous messages about what 749 it is doing (if the variable `debug' is nonzero). If linked with the 750 main program in `iregex.c', you can enter patterns and strings 751 interactively. And if linked with the main program in `main.c' and 752 the other test files, you can run the already-written tests. */ 753 754 # ifdef DEBUG 755 756 # ifndef DEFINED_ONCE 757 758 /* We use standard I/O for debugging. */ 759 # include <stdio.h> 760 761 /* It is useful to test things that ``must'' be true when debugging. */ 762 # include <assert.h> 763 764 static int debug; 765 766 # define DEBUG_STATEMENT(e) e 767 # define DEBUG_PRINT1(x) if (debug) printf (x) 768 # define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) 769 # define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) 770 # define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) 771 # endif /* not DEFINED_ONCE */ 772 773 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ 774 if (debug) PREFIX(print_partial_compiled_pattern) (s, e) 775 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ 776 if (debug) PREFIX(print_double_string) (w, s1, sz1, s2, sz2) 777 778 779 /* Print the fastmap in human-readable form. 
*/ 780 781 # ifndef DEFINED_ONCE 782 void 783 print_fastmap (char *fastmap) 784 { 785 unsigned was_a_range = 0; 786 unsigned i = 0; 787 788 while (i < (1 << BYTEWIDTH)) 789 { 790 if (fastmap[i++]) 791 { 792 was_a_range = 0; 793 putchar (i - 1); 794 while (i < (1 << BYTEWIDTH) && fastmap[i]) 795 { 796 was_a_range = 1; 797 i++; 798 } 799 if (was_a_range) 800 { 801 printf ("-"); 802 putchar (i - 1); 803 } 804 } 805 } 806 putchar ('\n'); 807 } 808 # endif /* not DEFINED_ONCE */ 809 810 811 /* Print a compiled pattern string in human-readable form, starting at 812 the START pointer into it and ending just before the pointer END. */ 813 814 void 815 PREFIX(print_partial_compiled_pattern) (UCHAR_T *start, UCHAR_T *end) 816 { 817 int mcnt, mcnt2; 818 UCHAR_T *p1; 819 UCHAR_T *p = start; 820 UCHAR_T *pend = end; 821 822 if (start == NULL) 823 { 824 printf ("(null)\n"); 825 return; 826 } 827 828 /* Loop over pattern commands. */ 829 while (p < pend) 830 { 831 # ifdef _LIBC 832 printf ("%td:\t", p - start); 833 # else 834 printf ("%ld:\t", (long int) (p - start)); 835 # endif 836 837 switch ((re_opcode_t) *p++) 838 { 839 case no_op: 840 printf ("/no_op"); 841 break; 842 843 case exactn: 844 mcnt = *p++; 845 printf ("/exactn/%d", mcnt); 846 do 847 { 848 putchar ('/'); 849 PUT_CHAR (*p++); 850 } 851 while (--mcnt); 852 break; 853 854 # ifdef MBS_SUPPORT 855 case exactn_bin: 856 mcnt = *p++; 857 printf ("/exactn_bin/%d", mcnt); 858 do 859 { 860 printf("/%lx", (long int) *p++); 861 } 862 while (--mcnt); 863 break; 864 # endif /* MBS_SUPPORT */ 865 866 case start_memory: 867 mcnt = *p++; 868 printf ("/start_memory/%d/%ld", mcnt, (long int) *p++); 869 break; 870 871 case stop_memory: 872 mcnt = *p++; 873 printf ("/stop_memory/%d/%ld", mcnt, (long int) *p++); 874 break; 875 876 case duplicate: 877 printf ("/duplicate/%ld", (long int) *p++); 878 break; 879 880 case anychar: 881 printf ("/anychar"); 882 break; 883 884 case charset: 885 case charset_not: 886 { 887 # ifdef WCHAR 888 int 
i, length; 889 wchar_t *workp = p; 890 printf ("/charset [%s", 891 (re_opcode_t) *(workp - 1) == charset_not ? "^" : ""); 892 p += 5; 893 length = *workp++; /* the length of char_classes */ 894 for (i=0 ; i<length ; i++) 895 printf("[:%lx:]", (long int) *p++); 896 length = *workp++; /* the length of collating_symbol */ 897 for (i=0 ; i<length ;) 898 { 899 printf("[."); 900 while(*p != 0) 901 PUT_CHAR((i++,*p++)); 902 i++,p++; 903 printf(".]"); 904 } 905 length = *workp++; /* the length of equivalence_class */ 906 for (i=0 ; i<length ;) 907 { 908 printf("[="); 909 while(*p != 0) 910 PUT_CHAR((i++,*p++)); 911 i++,p++; 912 printf("=]"); 913 } 914 length = *workp++; /* the length of char_range */ 915 for (i=0 ; i<length ; i++) 916 { 917 wchar_t range_start = *p++; 918 wchar_t range_end = *p++; 919 printf("%C-%C", range_start, range_end); 920 } 921 length = *workp++; /* the length of char */ 922 for (i=0 ; i<length ; i++) 923 printf("%C", *p++); 924 putchar (']'); 925 # else 926 register int c, last = -100; 927 register int in_range = 0; 928 929 printf ("/charset [%s", 930 (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); 931 932 assert (p + *p < pend); 933 934 for (c = 0; c < 256; c++) 935 if (c / 8 < *p 936 && (p[1 + (c/8)] & (1 << (c % 8)))) 937 { 938 /* Are we starting a range? */ 939 if (last + 1 == c && ! in_range) 940 { 941 putchar ('-'); 942 in_range = 1; 943 } 944 /* Have we broken a range? */ 945 else if (last + 1 != c && in_range) 946 { 947 putchar (last); 948 in_range = 0; 949 } 950 951 if (! 
in_range) 952 putchar (c); 953 954 last = c; 955 } 956 957 if (in_range) 958 putchar (last); 959 960 putchar (']'); 961 962 p += 1 + *p; 963 # endif /* WCHAR */ 964 } 965 break; 966 967 case begline: 968 printf ("/begline"); 969 break; 970 971 case endline: 972 printf ("/endline"); 973 break; 974 975 case on_failure_jump: 976 PREFIX(extract_number_and_incr) (&mcnt, &p); 977 # ifdef _LIBC 978 printf ("/on_failure_jump to %td", p + mcnt - start); 979 # else 980 printf ("/on_failure_jump to %ld", (long int) (p + mcnt - start)); 981 # endif 982 break; 983 984 case on_failure_keep_string_jump: 985 PREFIX(extract_number_and_incr) (&mcnt, &p); 986 # ifdef _LIBC 987 printf ("/on_failure_keep_string_jump to %td", p + mcnt - start); 988 # else 989 printf ("/on_failure_keep_string_jump to %ld", 990 (long int) (p + mcnt - start)); 991 # endif 992 break; 993 994 case dummy_failure_jump: 995 PREFIX(extract_number_and_incr) (&mcnt, &p); 996 # ifdef _LIBC 997 printf ("/dummy_failure_jump to %td", p + mcnt - start); 998 # else 999 printf ("/dummy_failure_jump to %ld", (long int) (p + mcnt - start)); 1000 # endif 1001 break; 1002 1003 case push_dummy_failure: 1004 printf ("/push_dummy_failure"); 1005 break; 1006 1007 case maybe_pop_jump: 1008 PREFIX(extract_number_and_incr) (&mcnt, &p); 1009 # ifdef _LIBC 1010 printf ("/maybe_pop_jump to %td", p + mcnt - start); 1011 # else 1012 printf ("/maybe_pop_jump to %ld", (long int) (p + mcnt - start)); 1013 # endif 1014 break; 1015 1016 case pop_failure_jump: 1017 PREFIX(extract_number_and_incr) (&mcnt, &p); 1018 # ifdef _LIBC 1019 printf ("/pop_failure_jump to %td", p + mcnt - start); 1020 # else 1021 printf ("/pop_failure_jump to %ld", (long int) (p + mcnt - start)); 1022 # endif 1023 break; 1024 1025 case jump_past_alt: 1026 PREFIX(extract_number_and_incr) (&mcnt, &p); 1027 # ifdef _LIBC 1028 printf ("/jump_past_alt to %td", p + mcnt - start); 1029 # else 1030 printf ("/jump_past_alt to %ld", (long int) (p + mcnt - start)); 1031 # endif 
1032 break; 1033 1034 case jump: 1035 PREFIX(extract_number_and_incr) (&mcnt, &p); 1036 # ifdef _LIBC 1037 printf ("/jump to %td", p + mcnt - start); 1038 # else 1039 printf ("/jump to %ld", (long int) (p + mcnt - start)); 1040 # endif 1041 break; 1042 1043 case succeed_n: 1044 PREFIX(extract_number_and_incr) (&mcnt, &p); 1045 p1 = p + mcnt; 1046 PREFIX(extract_number_and_incr) (&mcnt2, &p); 1047 # ifdef _LIBC 1048 printf ("/succeed_n to %td, %d times", p1 - start, mcnt2); 1049 # else 1050 printf ("/succeed_n to %ld, %d times", 1051 (long int) (p1 - start), mcnt2); 1052 # endif 1053 break; 1054 1055 case jump_n: 1056 PREFIX(extract_number_and_incr) (&mcnt, &p); 1057 p1 = p + mcnt; 1058 PREFIX(extract_number_and_incr) (&mcnt2, &p); 1059 printf ("/jump_n to %d, %d times", p1 - start, mcnt2); 1060 break; 1061 1062 case set_number_at: 1063 PREFIX(extract_number_and_incr) (&mcnt, &p); 1064 p1 = p + mcnt; 1065 PREFIX(extract_number_and_incr) (&mcnt2, &p); 1066 # ifdef _LIBC 1067 printf ("/set_number_at location %td to %d", p1 - start, mcnt2); 1068 # else 1069 printf ("/set_number_at location %ld to %d", 1070 (long int) (p1 - start), mcnt2); 1071 # endif 1072 break; 1073 1074 case wordbound: 1075 printf ("/wordbound"); 1076 break; 1077 1078 case notwordbound: 1079 printf ("/notwordbound"); 1080 break; 1081 1082 case wordbeg: 1083 printf ("/wordbeg"); 1084 break; 1085 1086 case wordend: 1087 printf ("/wordend"); 1088 break; 1089 1090 # ifdef emacs 1091 case before_dot: 1092 printf ("/before_dot"); 1093 break; 1094 1095 case at_dot: 1096 printf ("/at_dot"); 1097 break; 1098 1099 case after_dot: 1100 printf ("/after_dot"); 1101 break; 1102 1103 case syntaxspec: 1104 printf ("/syntaxspec"); 1105 mcnt = *p++; 1106 printf ("/%d", mcnt); 1107 break; 1108 1109 case notsyntaxspec: 1110 printf ("/notsyntaxspec"); 1111 mcnt = *p++; 1112 printf ("/%d", mcnt); 1113 break; 1114 # endif /* emacs */ 1115 1116 case wordchar: 1117 printf ("/wordchar"); 1118 break; 1119 1120 case 
notwordchar: 1121 printf ("/notwordchar"); 1122 break; 1123 1124 case begbuf: 1125 printf ("/begbuf"); 1126 break; 1127 1128 case endbuf: 1129 printf ("/endbuf"); 1130 break; 1131 1132 default: 1133 printf ("?%ld", (long int) *(p-1)); 1134 } 1135 1136 putchar ('\n'); 1137 } 1138 1139 # ifdef _LIBC 1140 printf ("%td:\tend of pattern.\n", p - start); 1141 # else 1142 printf ("%ld:\tend of pattern.\n", (long int) (p - start)); 1143 # endif 1144 } 1145 1146 1147 void 1148 PREFIX(print_compiled_pattern) (struct re_pattern_buffer *bufp) 1149 { 1150 UCHAR_T *buffer = (UCHAR_T*) bufp->buffer; 1151 1152 PREFIX(print_partial_compiled_pattern) (buffer, buffer 1153 + bufp->used / sizeof(UCHAR_T)); 1154 printf ("%ld bytes used/%ld bytes allocated.\n", 1155 bufp->used, bufp->allocated); 1156 1157 if (bufp->fastmap_accurate && bufp->fastmap) 1158 { 1159 printf ("fastmap: "); 1160 print_fastmap (bufp->fastmap); 1161 } 1162 1163 # ifdef _LIBC 1164 printf ("re_nsub: %Zd\t", bufp->re_nsub); 1165 # else 1166 printf ("re_nsub: %ld\t", (long int) bufp->re_nsub); 1167 # endif 1168 printf ("regs_alloc: %d\t", bufp->regs_allocated); 1169 printf ("can_be_null: %d\t", bufp->can_be_null); 1170 printf ("newline_anchor: %d\n", bufp->newline_anchor); 1171 printf ("no_sub: %d\t", bufp->no_sub); 1172 printf ("not_bol: %d\t", bufp->not_bol); 1173 printf ("not_eol: %d\t", bufp->not_eol); 1174 printf ("syntax: %lx\n", bufp->syntax); 1175 /* Perhaps we should print the translate table? 
*/ 1176 } 1177 1178 1179 void 1180 PREFIX(print_double_string) (const CHAR_T *where, const CHAR_T *string1, 1181 int size1, const CHAR_T *string2, int size2) 1182 { 1183 int this_char; 1184 1185 if (where == NULL) 1186 printf ("(null)"); 1187 else 1188 { 1189 int cnt; 1190 1191 if (FIRST_STRING_P (where)) 1192 { 1193 for (this_char = where - string1; this_char < size1; this_char++) 1194 PUT_CHAR (string1[this_char]); 1195 1196 where = string2; 1197 } 1198 1199 cnt = 0; 1200 for (this_char = where - string2; this_char < size2; this_char++) 1201 { 1202 PUT_CHAR (string2[this_char]); 1203 if (++cnt > 100) 1204 { 1205 fputs ("...", stdout); 1206 break; 1207 } 1208 } 1209 } 1210 } 1211 1212 # ifndef DEFINED_ONCE 1213 void 1214 printchar (int c) 1215 { 1216 putc (c, stderr); 1217 } 1218 # endif 1219 1220 # else /* not DEBUG */ 1221 1222 # ifndef DEFINED_ONCE 1223 # undef assert 1224 # define assert(e) 1225 1226 # define DEBUG_STATEMENT(e) 1227 # define DEBUG_PRINT1(x) 1228 # define DEBUG_PRINT2(x1, x2) 1229 # define DEBUG_PRINT3(x1, x2, x3) 1230 # define DEBUG_PRINT4(x1, x2, x3, x4) 1231 # endif /* not DEFINED_ONCE */ 1232 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) 1233 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) 1234 1235 # endif /* not DEBUG */ 1236 1237 1238 1240 # ifdef WCHAR 1241 /* This convert a multibyte string to a wide character string. 1242 And write their correspondances to offset_buffer(see below) 1243 and write whether each wchar_t is binary data to is_binary. 1244 This assume invalid multibyte sequences as binary data. 1245 We assume offset_buffer and is_binary is already allocated 1246 enough space. 
*/ 1247 1248 static size_t convert_mbs_to_wcs (CHAR_T *dest, const unsigned char* src, 1249 size_t len, int *offset_buffer, 1250 char *is_binary); 1251 static size_t 1252 convert_mbs_to_wcs (CHAR_T *dest, const unsigned char*src, size_t len, 1253 int *offset_buffer, char *is_binary) 1254 /* It hold correspondances between src(char string) and 1255 dest(wchar_t string) for optimization. 1256 e.g. src = "xxxyzz" 1257 dest = {'X', 'Y', 'Z'} 1258 (each "xxx", "y" and "zz" represent one multibyte character 1259 corresponding to 'X', 'Y' and 'Z'.) 1260 offset_buffer = {0, 0+3("xxx"), 0+3+1("y"), 0+3+1+2("zz")} 1261 = {0, 3, 4, 6} 1262 */ 1263 { 1264 wchar_t *pdest = dest; 1265 const unsigned char *psrc = src; 1266 size_t wc_count = 0; 1267 1268 mbstate_t mbs; 1269 int i, consumed; 1270 size_t mb_remain = len; 1271 size_t mb_count = 0; 1272 1273 /* Initialize the conversion state. */ 1274 memset (&mbs, 0, sizeof (mbstate_t)); 1275 1276 offset_buffer[0] = 0; 1277 for( ; mb_remain > 0 ; ++wc_count, ++pdest, mb_remain -= consumed, 1278 psrc += consumed) 1279 { 1280 #ifdef _LIBC 1281 consumed = __mbrtowc (pdest, psrc, mb_remain, &mbs); 1282 #else 1283 consumed = mbrtowc (pdest, psrc, mb_remain, &mbs); 1284 #endif 1285 1286 if (consumed <= 0) 1287 /* failed to convert. maybe src contains binary data. 1288 So we consume 1 byte manualy. */ 1289 { 1290 *pdest = *psrc; 1291 consumed = 1; 1292 is_binary[wc_count] = TRUE; 1293 } 1294 else 1295 is_binary[wc_count] = FALSE; 1296 /* In sjis encoding, we use yen sign as escape character in 1297 place of reverse solidus. So we convert 0x5c(yen sign in 1298 sjis) to not 0xa5(yen sign in UCS2) but 0x5c(reverse 1299 solidus in UCS2). */ 1300 if (consumed == 1 && (int) *psrc == 0x5c && (int) *pdest == 0xa5) 1301 *pdest = (wchar_t) *psrc; 1302 1303 offset_buffer[wc_count + 1] = mb_count += consumed; 1304 } 1305 1306 /* Fill remain of the buffer with sentinel. 
*/ 1307 for (i = wc_count + 1 ; i <= len ; i++) 1308 offset_buffer[i] = mb_count + 1; 1309 1310 return wc_count; 1311 } 1312 1313 # endif /* WCHAR */ 1314 1315 #else /* not INSIDE_RECURSION */ 1316 1317 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 1318 also be assigned to arbitrarily: each pattern buffer stores its own 1319 syntax, so it can be changed between regex compilations. */ 1320 /* This has no initializer because initialized variables in Emacs 1321 become read-only after dumping. */ 1322 reg_syntax_t re_syntax_options; 1323 1324 1325 /* Specify the precise syntax of regexps for compilation. This provides 1326 for compatibility for various utilities which historically have 1327 different, incompatible syntaxes. 1328 1329 The argument SYNTAX is a bit mask comprised of the various bits 1330 defined in regex.h. We return the old syntax. */ 1331 1332 reg_syntax_t 1333 re_set_syntax (reg_syntax_t syntax) 1334 { 1335 reg_syntax_t ret = re_syntax_options; 1336 1337 re_syntax_options = syntax; 1338 # ifdef DEBUG 1339 if (syntax & RE_DEBUG) 1340 debug = 1; 1341 else if (debug) /* was on but now is not */ 1342 debug = 0; 1343 # endif /* DEBUG */ 1344 return ret; 1345 } 1346 # ifdef _LIBC 1347 weak_alias (__re_set_syntax, re_set_syntax) 1348 # endif 1349 1350 /* This table gives an error message for each of the error codes listed 1352 in regex.h. Obviously the order here has to be same as there. 1353 POSIX doesn't require that we do anything for REG_NOERROR, 1354 but why not be nice? 
*/ 1355 1356 static const char *re_error_msgid[] = 1357 { 1358 gettext_noop ("Success"), /* REG_NOERROR */ 1359 gettext_noop ("No match"), /* REG_NOMATCH */ 1360 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */ 1361 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */ 1362 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */ 1363 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */ 1364 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */ 1365 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */ 1366 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */ 1367 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */ 1368 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */ 1369 gettext_noop ("Invalid range end"), /* REG_ERANGE */ 1370 gettext_noop ("Memory exhausted"), /* REG_ESPACE */ 1371 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */ 1372 gettext_noop ("Premature end of regular expression"), /* REG_EEND */ 1373 gettext_noop ("Regular expression too big"), /* REG_ESIZE */ 1374 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 1375 }; 1376 1377 #endif /* INSIDE_RECURSION */ 1379 1380 #ifndef DEFINED_ONCE 1381 /* Avoiding alloca during matching, to placate r_alloc. */ 1382 1383 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the 1384 searching and matching functions should not call alloca. On some 1385 systems, alloca is implemented in terms of malloc, and if we're 1386 using the relocating allocator routines, then malloc could cause a 1387 relocation, which might (if the strings being searched are in the 1388 ralloc heap) shift the data out from underneath the regexp 1389 routines. 1390 1391 Here's another reason to avoid allocation: Emacs 1392 processes input from X in a signal handler; processing X input may 1393 call malloc; if input arrives while a matching routine is calling 1394 malloc, then we're scrod. 
But Emacs can't just block input while 1395 calling matching routines; then we don't notice interrupts when 1396 they come in. So, Emacs blocks input around all regexp calls 1397 except the matching calls, which it leaves unprotected, in the 1398 faith that they will not malloc. */ 1399 1400 /* Normally, this is fine. */ 1401 # define MATCH_MAY_ALLOCATE 1402 1403 /* When using GNU C, we are not REALLY using the C alloca, no matter 1404 what config.h may say. So don't take precautions for it. */ 1405 # ifdef __GNUC__ 1406 # undef C_ALLOCA 1407 # endif 1408 1409 /* The match routines may not allocate if (1) they would do it with malloc 1410 and (2) it's not safe for them to use malloc. 1411 Note that if REL_ALLOC is defined, matching would not use malloc for the 1412 failure stack, but we would still use it for the register vectors; 1413 so REL_ALLOC should not affect this. */ 1414 # if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs 1415 # undef MATCH_MAY_ALLOCATE 1416 # endif 1417 #endif /* not DEFINED_ONCE */ 1418 1419 #ifdef INSIDE_RECURSION 1421 /* Failure stack declarations and macros; both re_compile_fastmap and 1422 re_match_2 use a failure stack. These have to be macros because of 1423 REGEX_ALLOCATE_STACK. */ 1424 1425 1426 /* Number of failure points for which to initially allocate space 1427 when matching. If this number is exceeded, we allocate more 1428 space, so it is not a hard limit. */ 1429 # ifndef INIT_FAILURE_ALLOC 1430 # define INIT_FAILURE_ALLOC 5 1431 # endif 1432 1433 /* Roughly the maximum number of failure points on the stack. Would be 1434 exactly that if always used MAX_FAILURE_ITEMS items each time we failed. 1435 This is a variable only so users of regex can assign to it; we never 1436 change it ourselves. */ 1437 1438 # ifdef INT_IS_16BIT 1439 1440 # ifndef DEFINED_ONCE 1441 # if defined MATCH_MAY_ALLOCATE 1442 /* 4400 was enough to cause a crash on Alpha OSF/1, 1443 whose default stack limit is 2mb. 
*/ 1444 long int re_max_failures = 4000; 1445 # else 1446 long int re_max_failures = 2000; 1447 # endif 1448 # endif 1449 1450 union PREFIX(fail_stack_elt) 1451 { 1452 UCHAR_T *pointer; 1453 long int integer; 1454 }; 1455 1456 typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t); 1457 1458 typedef struct 1459 { 1460 PREFIX(fail_stack_elt_t) *stack; 1461 unsigned long int size; 1462 unsigned long int avail; /* Offset of next open position. */ 1463 } PREFIX(fail_stack_type); 1464 1465 # else /* not INT_IS_16BIT */ 1466 1467 # ifndef DEFINED_ONCE 1468 # if defined MATCH_MAY_ALLOCATE 1469 /* 4400 was enough to cause a crash on Alpha OSF/1, 1470 whose default stack limit is 2mb. */ 1471 int re_max_failures = 4000; 1472 # else 1473 int re_max_failures = 2000; 1474 # endif 1475 # endif 1476 1477 union PREFIX(fail_stack_elt) 1478 { 1479 UCHAR_T *pointer; 1480 int integer; 1481 }; 1482 1483 typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t); 1484 1485 typedef struct 1486 { 1487 PREFIX(fail_stack_elt_t) *stack; 1488 unsigned size; 1489 unsigned avail; /* Offset of next open position. */ 1490 } PREFIX(fail_stack_type); 1491 1492 # endif /* INT_IS_16BIT */ 1493 1494 # ifndef DEFINED_ONCE 1495 # define FAIL_STACK_EMPTY() (fail_stack.avail == 0) 1496 # define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) 1497 # define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) 1498 # endif 1499 1500 1501 /* Define macros to initialize and free the failure stack. 1502 Do `return -2' if the alloc fails. 
*/ 1503 1504 # ifdef MATCH_MAY_ALLOCATE 1505 # define INIT_FAIL_STACK() \ 1506 do { \ 1507 fail_stack.stack = (PREFIX(fail_stack_elt_t) *) \ 1508 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (PREFIX(fail_stack_elt_t))); \ 1509 \ 1510 if (fail_stack.stack == NULL) \ 1511 return -2; \ 1512 \ 1513 fail_stack.size = INIT_FAILURE_ALLOC; \ 1514 fail_stack.avail = 0; \ 1515 } while (0) 1516 1517 # define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack) 1518 # else 1519 # define INIT_FAIL_STACK() \ 1520 do { \ 1521 fail_stack.avail = 0; \ 1522 } while (0) 1523 1524 # define RESET_FAIL_STACK() 1525 # endif 1526 1527 1528 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items. 1529 1530 Return 1 if succeeds, and 0 if either ran out of memory 1531 allocating space for it or it was already too large. 1532 1533 REGEX_REALLOCATE_STACK requires `destination' be declared. */ 1534 1535 # define DOUBLE_FAIL_STACK(fail_stack) \ 1536 ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \ 1537 ? 0 \ 1538 : ((fail_stack).stack = (PREFIX(fail_stack_elt_t) *) \ 1539 REGEX_REALLOCATE_STACK ((fail_stack).stack, \ 1540 (fail_stack).size * sizeof (PREFIX(fail_stack_elt_t)), \ 1541 ((fail_stack).size << 1) * sizeof (PREFIX(fail_stack_elt_t))),\ 1542 \ 1543 (fail_stack).stack == NULL \ 1544 ? 0 \ 1545 : ((fail_stack).size <<= 1, \ 1546 1))) 1547 1548 1549 /* Push pointer POINTER on FAIL_STACK. 1550 Return 1 if was able to do so and 0 if ran out of memory allocating 1551 space to do so. */ 1552 # define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \ 1553 ((FAIL_STACK_FULL () \ 1554 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \ 1555 ? 0 \ 1556 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \ 1557 1)) 1558 1559 /* Push a pointer value onto the failure stack. 1560 Assumes the variable `fail_stack'. Probably should only 1561 be called from within `PUSH_FAILURE_POINT'. 
*/ 1562 # define PUSH_FAILURE_POINTER(item) \ 1563 fail_stack.stack[fail_stack.avail++].pointer = (UCHAR_T *) (item) 1564 1565 /* This pushes an integer-valued item onto the failure stack. 1566 Assumes the variable `fail_stack'. Probably should only 1567 be called from within `PUSH_FAILURE_POINT'. */ 1568 # define PUSH_FAILURE_INT(item) \ 1569 fail_stack.stack[fail_stack.avail++].integer = (item) 1570 1571 /* Push a fail_stack_elt_t value onto the failure stack. 1572 Assumes the variable `fail_stack'. Probably should only 1573 be called from within `PUSH_FAILURE_POINT'. */ 1574 # define PUSH_FAILURE_ELT(item) \ 1575 fail_stack.stack[fail_stack.avail++] = (item) 1576 1577 /* These three POP... operations complement the three PUSH... operations. 1578 All assume that `fail_stack' is nonempty. */ 1579 # define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer 1580 # define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer 1581 # define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail] 1582 1583 /* Used to omit pushing failure point id's when we're not debugging. */ 1584 # ifdef DEBUG 1585 # define DEBUG_PUSH PUSH_FAILURE_INT 1586 # define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT () 1587 # else 1588 # define DEBUG_PUSH(item) 1589 # define DEBUG_POP(item_addr) 1590 # endif 1591 1592 1593 /* Push the information about the state we will need 1594 if we ever fail back to it. 1595 1596 Requires variables fail_stack, regstart, regend, reg_info, and 1597 num_regs_pushed be declared. DOUBLE_FAIL_STACK requires `destination' 1598 be declared. 1599 1600 Does `return FAILURE_CODE' if runs out of memory. */ 1601 1602 # define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \ 1603 do { \ 1604 char *destination; \ 1605 /* Must be int, so when we don't save any registers, the arithmetic \ 1606 of 0 + -1 isn't done as unsigned. 
*/ \ 1607 /* Can't be int, since there is not a shred of a guarantee that int \ 1608 is wide enough to hold a value of something to which pointer can \ 1609 be assigned */ \ 1610 active_reg_t this_reg; \ 1611 \ 1612 DEBUG_STATEMENT (failure_id++); \ 1613 DEBUG_STATEMENT (nfailure_points_pushed++); \ 1614 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \ 1615 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\ 1616 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\ 1617 \ 1618 DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS); \ 1619 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \ 1620 \ 1621 /* Ensure we have enough space allocated for what we will push. */ \ 1622 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \ 1623 { \ 1624 if (!DOUBLE_FAIL_STACK (fail_stack)) \ 1625 return failure_code; \ 1626 \ 1627 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \ 1628 (fail_stack).size); \ 1629 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\ 1630 } \ 1631 \ 1632 /* Push the info, starting with the registers. 
*/ \ 1633 DEBUG_PRINT1 ("\n"); \ 1634 \ 1635 if (1) \ 1636 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \ 1637 this_reg++) \ 1638 { \ 1639 DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg); \ 1640 DEBUG_STATEMENT (num_regs_pushed++); \ 1641 \ 1642 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \ 1643 PUSH_FAILURE_POINTER (regstart[this_reg]); \ 1644 \ 1645 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \ 1646 PUSH_FAILURE_POINTER (regend[this_reg]); \ 1647 \ 1648 DEBUG_PRINT2 (" info: %p\n ", \ 1649 reg_info[this_reg].word.pointer); \ 1650 DEBUG_PRINT2 (" match_null=%d", \ 1651 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \ 1652 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \ 1653 DEBUG_PRINT2 (" matched_something=%d", \ 1654 MATCHED_SOMETHING (reg_info[this_reg])); \ 1655 DEBUG_PRINT2 (" ever_matched=%d", \ 1656 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \ 1657 DEBUG_PRINT1 ("\n"); \ 1658 PUSH_FAILURE_ELT (reg_info[this_reg].word); \ 1659 } \ 1660 \ 1661 DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg);\ 1662 PUSH_FAILURE_INT (lowest_active_reg); \ 1663 \ 1664 DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg);\ 1665 PUSH_FAILURE_INT (highest_active_reg); \ 1666 \ 1667 DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place); \ 1668 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \ 1669 PUSH_FAILURE_POINTER (pattern_place); \ 1670 \ 1671 DEBUG_PRINT2 (" Pushing string %p: `", string_place); \ 1672 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \ 1673 size2); \ 1674 DEBUG_PRINT1 ("'\n"); \ 1675 PUSH_FAILURE_POINTER (string_place); \ 1676 \ 1677 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \ 1678 DEBUG_PUSH (failure_id); \ 1679 } while (0) 1680 1681 # ifndef DEFINED_ONCE 1682 /* This is the number of items that are pushed and popped on the stack 1683 for each register. 
*/ 1684 # define NUM_REG_ITEMS 3 1685 1686 /* Individual items aside from the registers. */ 1687 # ifdef DEBUG 1688 # define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ 1689 # else 1690 # define NUM_NONREG_ITEMS 4 1691 # endif 1692 1693 /* We push at most this many items on the stack. */ 1694 /* We used to use (num_regs - 1), which is the number of registers 1695 this regexp will save; but that was changed to 5 1696 to avoid stack overflow for a regexp with lots of parens. */ 1697 # define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS) 1698 1699 /* We actually push this many items. */ 1700 # define NUM_FAILURE_ITEMS \ 1701 (((0 \ 1702 ? 0 : highest_active_reg - lowest_active_reg + 1) \ 1703 * NUM_REG_ITEMS) \ 1704 + NUM_NONREG_ITEMS) 1705 1706 /* How many items can still be added to the stack without overflowing it. */ 1707 # define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) 1708 # endif /* not DEFINED_ONCE */ 1709 1710 1711 /* Pops what PUSH_FAIL_STACK pushes. 1712 1713 We restore into the parameters, all of which should be lvalues: 1714 STR -- the saved data position. 1715 PAT -- the saved pattern position. 1716 LOW_REG, HIGH_REG -- the highest and lowest active registers. 1717 REGSTART, REGEND -- arrays of string positions. 1718 REG_INFO -- array of information about each subexpression. 1719 1720 Also assumes the variables `fail_stack' and (if debugging), `bufp', 1721 `pend', `string1', `size1', `string2', and `size2'. */ 1722 # define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\ 1723 { \ 1724 DEBUG_STATEMENT (unsigned failure_id;) \ 1725 active_reg_t this_reg; \ 1726 const UCHAR_T *string_temp; \ 1727 \ 1728 assert (!FAIL_STACK_EMPTY ()); \ 1729 \ 1730 /* Remove failure points and point to how many regs pushed. 
*/ \ 1731 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \ 1732 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \ 1733 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \ 1734 \ 1735 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \ 1736 \ 1737 DEBUG_POP (&failure_id); \ 1738 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \ 1739 \ 1740 /* If the saved string location is NULL, it came from an \ 1741 on_failure_keep_string_jump opcode, and we want to throw away the \ 1742 saved NULL, thus retaining our current position in the string. */ \ 1743 string_temp = POP_FAILURE_POINTER (); \ 1744 if (string_temp != NULL) \ 1745 str = (const CHAR_T *) string_temp; \ 1746 \ 1747 DEBUG_PRINT2 (" Popping string %p: `", str); \ 1748 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ 1749 DEBUG_PRINT1 ("'\n"); \ 1750 \ 1751 pat = (UCHAR_T *) POP_FAILURE_POINTER (); \ 1752 DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \ 1753 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ 1754 \ 1755 /* Restore register info. 
*/ \ 1756 high_reg = (active_reg_t) POP_FAILURE_INT (); \ 1757 DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg); \ 1758 \ 1759 low_reg = (active_reg_t) POP_FAILURE_INT (); \ 1760 DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg); \ 1761 \ 1762 if (1) \ 1763 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \ 1764 { \ 1765 DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg); \ 1766 \ 1767 reg_info[this_reg].word = POP_FAILURE_ELT (); \ 1768 DEBUG_PRINT2 (" info: %p\n", \ 1769 reg_info[this_reg].word.pointer); \ 1770 \ 1771 regend[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER (); \ 1772 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \ 1773 \ 1774 regstart[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER (); \ 1775 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \ 1776 } \ 1777 else \ 1778 { \ 1779 for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \ 1780 { \ 1781 reg_info[this_reg].word.integer = 0; \ 1782 regend[this_reg] = 0; \ 1783 regstart[this_reg] = 0; \ 1784 } \ 1785 highest_active_reg = high_reg; \ 1786 } \ 1787 \ 1788 set_regs_matched_done = 0; \ 1789 DEBUG_STATEMENT (nfailure_points_popped++); \ 1790 } /* POP_FAILURE_POINT */ 1791 1792 /* Structure for per-register (a.k.a. per-group) information. 1794 Other register information, such as the 1795 starting and ending positions (which are addresses), and the list of 1796 inner groups (which is a bits list) are maintained in separate 1797 variables. 1798 1799 We are making a (strictly speaking) nonportable assumption here: that 1800 the compiler will pack our bit fields into something that fits into 1801 the type of `word', i.e., is something that fits into one item on the 1802 failure stack. */ 1803 1804 1805 /* Declarations and macros for re_match_2. */ 1806 1807 typedef union 1808 { 1809 PREFIX(fail_stack_elt_t) word; 1810 struct 1811 { 1812 /* This field is one if this group can match the empty string, 1813 zero if not. 
If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ 1814 # define MATCH_NULL_UNSET_VALUE 3 1815 unsigned match_null_string_p : 2; 1816 unsigned is_active : 1; 1817 unsigned matched_something : 1; 1818 unsigned ever_matched_something : 1; 1819 } bits; 1820 } PREFIX(register_info_type); 1821 1822 # ifndef DEFINED_ONCE 1823 # define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) 1824 # define IS_ACTIVE(R) ((R).bits.is_active) 1825 # define MATCHED_SOMETHING(R) ((R).bits.matched_something) 1826 # define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) 1827 1828 1829 /* Call this when have matched a real character; it sets `matched' flags 1830 for the subexpressions which we are currently inside. Also records 1831 that those subexprs have matched. */ 1832 # define SET_REGS_MATCHED() \ 1833 do \ 1834 { \ 1835 if (!set_regs_matched_done) \ 1836 { \ 1837 active_reg_t r; \ 1838 set_regs_matched_done = 1; \ 1839 for (r = lowest_active_reg; r <= highest_active_reg; r++) \ 1840 { \ 1841 MATCHED_SOMETHING (reg_info[r]) \ 1842 = EVER_MATCHED_SOMETHING (reg_info[r]) \ 1843 = 1; \ 1844 } \ 1845 } \ 1846 } \ 1847 while (0) 1848 # endif /* not DEFINED_ONCE */ 1849 1850 /* Registers are set to a sentinel when they haven't yet matched. */ 1851 static CHAR_T PREFIX(reg_unset_dummy); 1852 # define REG_UNSET_VALUE (&PREFIX(reg_unset_dummy)) 1853 # define REG_UNSET(e) ((e) == REG_UNSET_VALUE) 1854 1855 /* Subroutine declarations and macros for regex_compile. 
*/ 1856 static void PREFIX(store_op1) (re_opcode_t op, UCHAR_T *loc, int arg); 1857 static void PREFIX(store_op2) (re_opcode_t op, UCHAR_T *loc, 1858 int arg1, int arg2); 1859 static void PREFIX(insert_op1) (re_opcode_t op, UCHAR_T *loc, 1860 int arg, UCHAR_T *end); 1861 static void PREFIX(insert_op2) (re_opcode_t op, UCHAR_T *loc, 1862 int arg1, int arg2, UCHAR_T *end); 1863 static boolean PREFIX(at_begline_loc_p) (const CHAR_T *pattern, 1864 const CHAR_T *p, 1865 reg_syntax_t syntax); 1866 static boolean PREFIX(at_endline_loc_p) (const CHAR_T *p, 1867 const CHAR_T *pend, 1868 reg_syntax_t syntax); 1869 # ifdef WCHAR 1870 static reg_errcode_t wcs_compile_range (CHAR_T range_start, 1871 const CHAR_T **p_ptr, 1872 const CHAR_T *pend, 1873 char *translate, 1874 reg_syntax_t syntax, 1875 UCHAR_T *b, 1876 CHAR_T *char_set); 1877 static void insert_space (int num, CHAR_T *loc, CHAR_T *end); 1878 # else /* BYTE */ 1879 static reg_errcode_t byte_compile_range (unsigned int range_start, 1880 const char **p_ptr, 1881 const char *pend, 1882 char *translate, 1883 reg_syntax_t syntax, 1884 unsigned char *b); 1885 # endif /* WCHAR */ 1886 1887 /* Fetch the next character in the uncompiled pattern---translating it 1888 if necessary. Also cast from a signed character in the constant 1889 string passed to us by the user to an unsigned char that we can use 1890 as an array index (in, e.g., `translate'). */ 1891 /* ifdef MBS_SUPPORT, we translate only if character <= 0xff, 1892 because it is impossible to allocate 4GB array for some encodings 1893 which have 4 byte character_set like UCS4. 
*/ 1894 # ifndef PATFETCH 1895 # ifdef WCHAR 1896 # define PATFETCH(c) \ 1897 do {if (p == pend) return REG_EEND; \ 1898 c = (UCHAR_T) *p++; \ 1899 if (translate && (c <= 0xff)) c = (UCHAR_T) translate[c]; \ 1900 } while (0) 1901 # else /* BYTE */ 1902 # define PATFETCH(c) \ 1903 do {if (p == pend) return REG_EEND; \ 1904 c = (unsigned char) *p++; \ 1905 if (translate) c = (unsigned char) translate[c]; \ 1906 } while (0) 1907 # endif /* WCHAR */ 1908 # endif 1909 1910 /* Fetch the next character in the uncompiled pattern, with no 1911 translation. */ 1912 # define PATFETCH_RAW(c) \ 1913 do {if (p == pend) return REG_EEND; \ 1914 c = (UCHAR_T) *p++; \ 1915 } while (0) 1916 1917 /* Go backwards one character in the pattern. */ 1918 # define PATUNFETCH p-- 1919 1920 1921 /* If `translate' is non-null, return translate[D], else just D. We 1922 cast the subscript to translate because some data is declared as 1923 `char *', to avoid warnings when a string constant is passed. But 1924 when we use a character as a subscript we must make it unsigned. */ 1925 /* ifdef MBS_SUPPORT, we translate only if character <= 0xff, 1926 because it is impossible to allocate 4GB array for some encodings 1927 which have 4 byte character_set like UCS4. */ 1928 1929 # ifndef TRANSLATE 1930 # ifdef WCHAR 1931 # define TRANSLATE(d) \ 1932 ((translate && ((UCHAR_T) (d)) <= 0xff) \ 1933 ? (char) translate[(unsigned char) (d)] : (d)) 1934 # else /* BYTE */ 1935 # define TRANSLATE(d) \ 1936 (translate ? (char) translate[(unsigned char) (d)] : (char) (d)) 1937 # endif /* WCHAR */ 1938 # endif 1939 1940 1941 /* Macros for outputting the compiled pattern into `buffer'. */ 1942 1943 /* If the buffer isn't allocated when it comes in, use this. */ 1944 # define INIT_BUF_SIZE (32 * sizeof(UCHAR_T)) 1945 1946 /* Make sure we have at least N more bytes of space in buffer. 
*/ 1947 # ifdef WCHAR 1948 # define GET_BUFFER_SPACE(n) \ 1949 while (((unsigned long)b - (unsigned long)COMPILED_BUFFER_VAR \ 1950 + (n)*sizeof(CHAR_T)) > bufp->allocated) \ 1951 EXTEND_BUFFER () 1952 # else /* BYTE */ 1953 # define GET_BUFFER_SPACE(n) \ 1954 while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \ 1955 EXTEND_BUFFER () 1956 # endif /* WCHAR */ 1957 1958 /* Make sure we have one more byte of buffer space and then add C to it. */ 1959 # define BUF_PUSH(c) \ 1960 do { \ 1961 GET_BUFFER_SPACE (1); \ 1962 *b++ = (UCHAR_T) (c); \ 1963 } while (0) 1964 1965 1966 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */ 1967 # define BUF_PUSH_2(c1, c2) \ 1968 do { \ 1969 GET_BUFFER_SPACE (2); \ 1970 *b++ = (UCHAR_T) (c1); \ 1971 *b++ = (UCHAR_T) (c2); \ 1972 } while (0) 1973 1974 1975 /* As with BUF_PUSH_2, except for three bytes. */ 1976 # define BUF_PUSH_3(c1, c2, c3) \ 1977 do { \ 1978 GET_BUFFER_SPACE (3); \ 1979 *b++ = (UCHAR_T) (c1); \ 1980 *b++ = (UCHAR_T) (c2); \ 1981 *b++ = (UCHAR_T) (c3); \ 1982 } while (0) 1983 1984 /* Store a jump with opcode OP at LOC to location TO. We store a 1985 relative address offset by the three bytes the jump itself occupies. */ 1986 # define STORE_JUMP(op, loc, to) \ 1987 PREFIX(store_op1) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE))) 1988 1989 /* Likewise, for a two-argument jump. */ 1990 # define STORE_JUMP2(op, loc, to, arg) \ 1991 PREFIX(store_op2) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), arg) 1992 1993 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ 1994 # define INSERT_JUMP(op, loc, to) \ 1995 PREFIX(insert_op1) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), b) 1996 1997 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. 
*/ 1998 # define INSERT_JUMP2(op, loc, to, arg) \ 1999 PREFIX(insert_op2) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)),\ 2000 arg, b) 2001 2002 /* This is not an arbitrary limit: the arguments which represent offsets 2003 into the pattern are two bytes long. So if 2^16 bytes turns out to 2004 be too small, many things would have to change. */ 2005 /* Any other compiler which, like MSC, has allocation limit below 2^16 2006 bytes will have to use approach similar to what was done below for 2007 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up 2008 reallocating to 0 bytes. Such thing is not going to work too well. 2009 You have been warned!! */ 2010 # ifndef DEFINED_ONCE 2011 # if defined _MSC_VER && !defined WIN32 2012 /* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. 2013 The REALLOC define eliminates a flurry of conversion warnings, 2014 but is not required. */ 2015 # define MAX_BUF_SIZE 65500L 2016 # define REALLOC(p,s) realloc ((p), (size_t) (s)) 2017 # else 2018 # define MAX_BUF_SIZE (1L << 16) 2019 # define REALLOC(p,s) realloc ((p), (s)) 2020 # endif 2021 2022 /* Extend the buffer by twice its current size via realloc and 2023 reset the pointers that pointed into the old block to point to the 2024 correct places in the new one. If extending the buffer results in it 2025 being larger than MAX_BUF_SIZE, then flag memory exhausted. 
*/ 2026 # if __BOUNDED_POINTERS__ 2027 # define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated) 2028 # define MOVE_BUFFER_POINTER(P) \ 2029 (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr) 2030 # define ELSE_EXTEND_BUFFER_HIGH_BOUND \ 2031 else \ 2032 { \ 2033 SET_HIGH_BOUND (b); \ 2034 SET_HIGH_BOUND (begalt); \ 2035 if (fixup_alt_jump) \ 2036 SET_HIGH_BOUND (fixup_alt_jump); \ 2037 if (laststart) \ 2038 SET_HIGH_BOUND (laststart); \ 2039 if (pending_exact) \ 2040 SET_HIGH_BOUND (pending_exact); \ 2041 } 2042 # else 2043 # define MOVE_BUFFER_POINTER(P) (P) += incr 2044 # define ELSE_EXTEND_BUFFER_HIGH_BOUND 2045 # endif 2046 # endif /* not DEFINED_ONCE */ 2047 2048 # ifdef WCHAR 2049 # define EXTEND_BUFFER() \ 2050 do { \ 2051 UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \ 2052 int wchar_count; \ 2053 if (bufp->allocated + sizeof(UCHAR_T) > MAX_BUF_SIZE) \ 2054 return REG_ESIZE; \ 2055 bufp->allocated <<= 1; \ 2056 if (bufp->allocated > MAX_BUF_SIZE) \ 2057 bufp->allocated = MAX_BUF_SIZE; \ 2058 /* How many characters the new buffer can have? */ \ 2059 wchar_count = bufp->allocated / sizeof(UCHAR_T); \ 2060 if (wchar_count == 0) wchar_count = 1; \ 2061 /* Truncate the buffer to CHAR_T align. */ \ 2062 bufp->allocated = wchar_count * sizeof(UCHAR_T); \ 2063 RETALLOC (COMPILED_BUFFER_VAR, wchar_count, UCHAR_T); \ 2064 bufp->buffer = (char*)COMPILED_BUFFER_VAR; \ 2065 if (COMPILED_BUFFER_VAR == NULL) \ 2066 return REG_ESPACE; \ 2067 /* If the buffer moved, move all the pointers into it. 
*/ \ 2068 if (old_buffer != COMPILED_BUFFER_VAR) \ 2069 { \ 2070 PTR_INT_TYPE incr = COMPILED_BUFFER_VAR - old_buffer; \ 2071 MOVE_BUFFER_POINTER (b); \ 2072 MOVE_BUFFER_POINTER (begalt); \ 2073 if (fixup_alt_jump) \ 2074 MOVE_BUFFER_POINTER (fixup_alt_jump); \ 2075 if (laststart) \ 2076 MOVE_BUFFER_POINTER (laststart); \ 2077 if (pending_exact) \ 2078 MOVE_BUFFER_POINTER (pending_exact); \ 2079 } \ 2080 ELSE_EXTEND_BUFFER_HIGH_BOUND \ 2081 } while (0) 2082 # else /* BYTE */ 2083 # define EXTEND_BUFFER() \ 2084 do { \ 2085 UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \ 2086 if (bufp->allocated == MAX_BUF_SIZE) \ 2087 return REG_ESIZE; \ 2088 bufp->allocated <<= 1; \ 2089 if (bufp->allocated > MAX_BUF_SIZE) \ 2090 bufp->allocated = MAX_BUF_SIZE; \ 2091 bufp->buffer = (UCHAR_T *) REALLOC (COMPILED_BUFFER_VAR, \ 2092 bufp->allocated); \ 2093 if (COMPILED_BUFFER_VAR == NULL) \ 2094 return REG_ESPACE; \ 2095 /* If the buffer moved, move all the pointers into it. */ \ 2096 if (old_buffer != COMPILED_BUFFER_VAR) \ 2097 { \ 2098 PTR_INT_TYPE incr = COMPILED_BUFFER_VAR - old_buffer; \ 2099 MOVE_BUFFER_POINTER (b); \ 2100 MOVE_BUFFER_POINTER (begalt); \ 2101 if (fixup_alt_jump) \ 2102 MOVE_BUFFER_POINTER (fixup_alt_jump); \ 2103 if (laststart) \ 2104 MOVE_BUFFER_POINTER (laststart); \ 2105 if (pending_exact) \ 2106 MOVE_BUFFER_POINTER (pending_exact); \ 2107 } \ 2108 ELSE_EXTEND_BUFFER_HIGH_BOUND \ 2109 } while (0) 2110 # endif /* WCHAR */ 2111 2112 # ifndef DEFINED_ONCE 2113 /* Since we have one byte reserved for the register number argument to 2114 {start,stop}_memory, the maximum number of groups we can report 2115 things about is what fits in that byte. */ 2116 # define MAX_REGNUM 255 2117 2118 /* But patterns can have more than `MAX_REGNUM' registers. We just 2119 ignore the excess. */ 2120 typedef unsigned regnum_t; 2121 2122 2123 /* Macros for the compile stack. 
*/ 2124 2125 /* Since offsets can go either forwards or backwards, this type needs to 2126 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ 2127 /* int may be not enough when sizeof(int) == 2. */ 2128 typedef long pattern_offset_t; 2129 2130 typedef struct 2131 { 2132 pattern_offset_t begalt_offset; 2133 pattern_offset_t fixup_alt_jump; 2134 pattern_offset_t inner_group_offset; 2135 pattern_offset_t laststart_offset; 2136 regnum_t regnum; 2137 } compile_stack_elt_t; 2138 2139 2140 typedef struct 2141 { 2142 compile_stack_elt_t *stack; 2143 unsigned size; 2144 unsigned avail; /* Offset of next open position. */ 2145 } compile_stack_type; 2146 2147 2148 # define INIT_COMPILE_STACK_SIZE 32 2149 2150 # define COMPILE_STACK_EMPTY (compile_stack.avail == 0) 2151 # define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) 2152 2153 /* The next available element. */ 2154 # define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) 2155 2156 # endif /* not DEFINED_ONCE */ 2157 2158 /* Set the bit for character C in a list. */ 2159 # ifndef DEFINED_ONCE 2160 # define SET_LIST_BIT(c) \ 2161 (b[((unsigned char) (c)) / BYTEWIDTH] \ 2162 |= 1 << (((unsigned char) c) % BYTEWIDTH)) 2163 # endif /* DEFINED_ONCE */ 2164 2165 /* Get the next unsigned number in the uncompiled pattern. */ 2166 # define GET_UNSIGNED_NUMBER(num) \ 2167 { \ 2168 while (p != pend) \ 2169 { \ 2170 PATFETCH (c); \ 2171 if (c < '0' || c > '9') \ 2172 break; \ 2173 if (num <= RE_DUP_MAX) \ 2174 { \ 2175 if (num < 0) \ 2176 num = 0; \ 2177 num = num * 10 + c - '0'; \ 2178 } \ 2179 } \ 2180 } 2181 2182 # ifndef DEFINED_ONCE 2183 # if defined _LIBC || WIDE_CHAR_SUPPORT 2184 /* The GNU C library provides support for user-defined character classes 2185 and the functions from ISO C amendement 1. 
*/ 2186 # ifdef CHARCLASS_NAME_MAX 2187 # define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX 2188 # else 2189 /* This shouldn't happen but some implementation might still have this 2190 problem. Use a reasonable default value. */ 2191 # define CHAR_CLASS_MAX_LENGTH 256 2192 # endif 2193 2194 # ifdef _LIBC 2195 # define IS_CHAR_CLASS(string) __wctype (string) 2196 # else 2197 # define IS_CHAR_CLASS(string) wctype (string) 2198 # endif 2199 # else 2200 # define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ 2201 2202 # define IS_CHAR_CLASS(string) \ 2203 (STREQ (string, "alpha") || STREQ (string, "upper") \ 2204 || STREQ (string, "lower") || STREQ (string, "digit") \ 2205 || STREQ (string, "alnum") || STREQ (string, "xdigit") \ 2206 || STREQ (string, "space") || STREQ (string, "print") \ 2207 || STREQ (string, "punct") || STREQ (string, "graph") \ 2208 || STREQ (string, "cntrl") || STREQ (string, "blank")) 2209 # endif 2210 # endif /* DEFINED_ONCE */ 2211 2212 # ifndef MATCH_MAY_ALLOCATE 2214 2215 /* If we cannot allocate large objects within re_match_2_internal, 2216 we make the fail stack and register vectors global. 2217 The fail stack, we grow to the maximum size when a regexp 2218 is compiled. 2219 The register vectors, we adjust in size each time we 2220 compile a regexp, according to the number of registers it needs. */ 2221 2222 static PREFIX(fail_stack_type) fail_stack; 2223 2224 /* Size with which the following vectors are currently allocated. 2225 That is so we can make them bigger as needed, 2226 but never make them smaller. 
*/ 2227 # ifdef DEFINED_ONCE 2228 static int regs_allocated_size; 2229 2230 static const char ** regstart, ** regend; 2231 static const char ** old_regstart, ** old_regend; 2232 static const char **best_regstart, **best_regend; 2233 static const char **reg_dummy; 2234 # endif /* DEFINED_ONCE */ 2235 2236 static PREFIX(register_info_type) *PREFIX(reg_info); 2237 static PREFIX(register_info_type) *PREFIX(reg_info_dummy); 2238 2239 /* Make the register vectors big enough for NUM_REGS registers, 2240 but don't make them smaller. */ 2241 2242 static void 2243 PREFIX(regex_grow_registers) (int num_regs) 2244 { 2245 if (num_regs > regs_allocated_size) 2246 { 2247 RETALLOC_IF (regstart, num_regs, const char *); 2248 RETALLOC_IF (regend, num_regs, const char *); 2249 RETALLOC_IF (old_regstart, num_regs, const char *); 2250 RETALLOC_IF (old_regend, num_regs, const char *); 2251 RETALLOC_IF (best_regstart, num_regs, const char *); 2252 RETALLOC_IF (best_regend, num_regs, const char *); 2253 RETALLOC_IF (PREFIX(reg_info), num_regs, PREFIX(register_info_type)); 2254 RETALLOC_IF (reg_dummy, num_regs, const char *); 2255 RETALLOC_IF (PREFIX(reg_info_dummy), num_regs, PREFIX(register_info_type)); 2256 2257 regs_allocated_size = num_regs; 2258 } 2259 } 2260 2261 # endif /* not MATCH_MAY_ALLOCATE */ 2262 2263 # ifndef DEFINED_ONCE 2265 static boolean group_in_compile_stack (compile_stack_type compile_stack, 2266 regnum_t regnum); 2267 # endif /* not DEFINED_ONCE */ 2268 2269 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. 2270 Returns one of error codes defined in `regex.h', or zero for success. 2271 2272 Assumes the `allocated' (and perhaps `buffer') and `translate' 2273 fields are set in BUFP on entry. 
2274 2275 If it succeeds, results are put in BUFP (if it returns an error, the 2276 contents of BUFP are undefined): 2277 `buffer' is the compiled pattern; 2278 `syntax' is set to SYNTAX; 2279 `used' is set to the length of the compiled pattern; 2280 `fastmap_accurate' is zero; 2281 `re_nsub' is the number of subexpressions in PATTERN; 2282 `not_bol' and `not_eol' are zero; 2283 2284 The `fastmap' and `newline_anchor' fields are neither 2285 examined nor set. */ 2286 2287 /* Return, freeing storage we allocated. */ 2288 # ifdef WCHAR 2289 # define FREE_STACK_RETURN(value) \ 2290 return (free(pattern), free(mbs_offset), free(is_binary), free (compile_stack.stack), value) 2291 # else 2292 # define FREE_STACK_RETURN(value) \ 2293 return (free (compile_stack.stack), value) 2294 # endif /* WCHAR */ 2295 2296 static reg_errcode_t 2297 PREFIX(regex_compile) (const char *ARG_PREFIX(pattern), 2298 size_t ARG_PREFIX(size), reg_syntax_t syntax, 2299 struct re_pattern_buffer *bufp) 2300 { 2301 /* We fetch characters from PATTERN here. Even though PATTERN is 2302 `char *' (i.e., signed), we declare these variables as unsigned, so 2303 they can be reliably used as array indices. */ 2304 register UCHAR_T c, c1; 2305 2306 #ifdef WCHAR 2307 /* A temporary space to keep wchar_t pattern and compiled pattern. */ 2308 CHAR_T *pattern, *COMPILED_BUFFER_VAR; 2309 size_t size; 2310 /* offset buffer for optimization. See convert_mbs_to_wc. */ 2311 int *mbs_offset = NULL; 2312 /* It hold whether each wchar_t is binary data or not. */ 2313 char *is_binary = NULL; 2314 /* A flag whether exactn is handling binary data or not. */ 2315 char is_exactn_bin = FALSE; 2316 #endif /* WCHAR */ 2317 2318 /* A random temporary spot in PATTERN. */ 2319 const CHAR_T *p1; 2320 2321 /* Points to the end of the buffer, where we should append. */ 2322 register UCHAR_T *b; 2323 2324 /* Keeps track of unclosed groups. 
*/ 2325 compile_stack_type compile_stack; 2326 2327 /* Points to the current (ending) position in the pattern. */ 2328 #ifdef WCHAR 2329 const CHAR_T *p; 2330 const CHAR_T *pend; 2331 #else /* BYTE */ 2332 const CHAR_T *p = pattern; 2333 const CHAR_T *pend = pattern + size; 2334 #endif /* WCHAR */ 2335 2336 /* How to translate the characters in the pattern. */ 2337 RE_TRANSLATE_TYPE translate = bufp->translate; 2338 2339 /* Address of the count-byte of the most recently inserted `exactn' 2340 command. This makes it possible to tell if a new exact-match 2341 character can be added to that command or if the character requires 2342 a new `exactn' command. */ 2343 UCHAR_T *pending_exact = 0; 2344 2345 /* Address of start of the most recently finished expression. 2346 This tells, e.g., postfix * where to find the start of its 2347 operand. Reset at the beginning of groups and alternatives. */ 2348 UCHAR_T *laststart = 0; 2349 2350 /* Address of beginning of regexp, or inside of last group. */ 2351 UCHAR_T *begalt; 2352 2353 /* Address of the place where a forward jump should go to the end of 2354 the containing expression. Each alternative of an `or' -- except the 2355 last -- ends with a forward jump of this sort. */ 2356 UCHAR_T *fixup_alt_jump = 0; 2357 2358 /* Counts open-groups as they are encountered. Remembered for the 2359 matching close-group on the compile stack, so the same register 2360 number is put in the stop_memory as the start_memory. */ 2361 regnum_t regnum = 0; 2362 2363 #ifdef WCHAR 2364 /* Initialize the wchar_t PATTERN and offset_buffer. 
*/ 2365 p = pend = pattern = TALLOC(csize + 1, CHAR_T); 2366 mbs_offset = TALLOC(csize + 1, int); 2367 is_binary = TALLOC(csize + 1, char); 2368 if (pattern == NULL || mbs_offset == NULL || is_binary == NULL) 2369 { 2370 free(pattern); 2371 free(mbs_offset); 2372 free(is_binary); 2373 return REG_ESPACE; 2374 } 2375 pattern[csize] = L'\0'; /* sentinel */ 2376 size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary); 2377 pend = p + size; 2378 if (size < 0) 2379 { 2380 free(pattern); 2381 free(mbs_offset); 2382 free(is_binary); 2383 return REG_BADPAT; 2384 } 2385 #endif 2386 2387 #ifdef DEBUG 2388 DEBUG_PRINT1 ("\nCompiling pattern: "); 2389 if (debug) 2390 { 2391 unsigned debug_count; 2392 2393 for (debug_count = 0; debug_count < size; debug_count++) 2394 PUT_CHAR (pattern[debug_count]); 2395 putchar ('\n'); 2396 } 2397 #endif /* DEBUG */ 2398 2399 /* Initialize the compile stack. */ 2400 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); 2401 if (compile_stack.stack == NULL) 2402 { 2403 #ifdef WCHAR 2404 free(pattern); 2405 free(mbs_offset); 2406 free(is_binary); 2407 #endif 2408 return REG_ESPACE; 2409 } 2410 2411 compile_stack.size = INIT_COMPILE_STACK_SIZE; 2412 compile_stack.avail = 0; 2413 2414 /* Initialize the pattern buffer. */ 2415 bufp->syntax = syntax; 2416 bufp->fastmap_accurate = 0; 2417 bufp->not_bol = bufp->not_eol = 0; 2418 2419 /* Set `used' to zero, so that if we return an error, the pattern 2420 printer (for debugging) will think there's no pattern. We reset it 2421 at the end. */ 2422 bufp->used = 0; 2423 2424 /* Always count groups, whether or not bufp->no_sub is set. */ 2425 bufp->re_nsub = 0; 2426 2427 #if !defined emacs && !defined SYNTAX_TABLE 2428 /* Initialize the syntax table. */ 2429 init_syntax_once (); 2430 #endif 2431 2432 if (bufp->allocated == 0) 2433 { 2434 if (bufp->buffer) 2435 { /* If zero allocated, but buffer is non-null, try to realloc 2436 enough space. 
This loses if buffer's address is bogus, but 2437 that is the user's responsibility. */ 2438 #ifdef WCHAR 2439 /* Free bufp->buffer and allocate an array for wchar_t pattern 2440 buffer. */ 2441 free(bufp->buffer); 2442 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE/sizeof(UCHAR_T), 2443 UCHAR_T); 2444 #else 2445 RETALLOC (COMPILED_BUFFER_VAR, INIT_BUF_SIZE, UCHAR_T); 2446 #endif /* WCHAR */ 2447 } 2448 else 2449 { /* Caller did not allocate a buffer. Do it for them. */ 2450 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE / sizeof(UCHAR_T), 2451 UCHAR_T); 2452 } 2453 2454 if (!COMPILED_BUFFER_VAR) FREE_STACK_RETURN (REG_ESPACE); 2455 #ifdef WCHAR 2456 bufp->buffer = (char*)COMPILED_BUFFER_VAR; 2457 #endif /* WCHAR */ 2458 bufp->allocated = INIT_BUF_SIZE; 2459 } 2460 #ifdef WCHAR 2461 else 2462 COMPILED_BUFFER_VAR = (UCHAR_T*) bufp->buffer; 2463 #endif 2464 2465 begalt = b = COMPILED_BUFFER_VAR; 2466 2467 /* Loop through the uncompiled pattern until we're at the end. */ 2468 while (p != pend) 2469 { 2470 PATFETCH (c); 2471 2472 switch (c) 2473 { 2474 case '^': 2475 { 2476 if ( /* If at start of pattern, it's an operator. */ 2477 p == pattern + 1 2478 /* If context independent, it's an operator. */ 2479 || syntax & RE_CONTEXT_INDEP_ANCHORS 2480 /* Otherwise, depends on what's come before. */ 2481 || PREFIX(at_begline_loc_p) (pattern, p, syntax)) 2482 BUF_PUSH (begline); 2483 else 2484 goto normal_char; 2485 } 2486 break; 2487 2488 2489 case '$': 2490 { 2491 if ( /* If at end of pattern, it's an operator. */ 2492 p == pend 2493 /* If context independent, it's an operator. */ 2494 || syntax & RE_CONTEXT_INDEP_ANCHORS 2495 /* Otherwise, depends on what's next. 
*/ 2496 || PREFIX(at_endline_loc_p) (p, pend, syntax)) 2497 BUF_PUSH (endline); 2498 else 2499 goto normal_char; 2500 } 2501 break; 2502 2503 2504 case '+': 2505 case '?': 2506 if ((syntax & RE_BK_PLUS_QM) 2507 || (syntax & RE_LIMITED_OPS)) 2508 goto normal_char; 2509 handle_plus: 2510 case '*': 2511 /* If there is no previous pattern... */ 2512 if (!laststart) 2513 { 2514 if (syntax & RE_CONTEXT_INVALID_OPS) 2515 FREE_STACK_RETURN (REG_BADRPT); 2516 else if (!(syntax & RE_CONTEXT_INDEP_OPS)) 2517 goto normal_char; 2518 } 2519 2520 { 2521 /* Are we optimizing this jump? */ 2522 boolean keep_string_p = false; 2523 2524 /* 1 means zero (many) matches is allowed. */ 2525 char zero_times_ok = 0, many_times_ok = 0; 2526 2527 /* If there is a sequence of repetition chars, collapse it 2528 down to just one (the right one). We can't combine 2529 interval operators with these because of, e.g., `a{2}*', 2530 which should only match an even number of `a's. */ 2531 2532 for (;;) 2533 { 2534 zero_times_ok |= c != '+'; 2535 many_times_ok |= c != '?'; 2536 2537 if (p == pend) 2538 break; 2539 2540 PATFETCH (c); 2541 2542 if (c == '*' 2543 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) 2544 ; 2545 2546 else if (syntax & RE_BK_PLUS_QM && c == '\\') 2547 { 2548 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 2549 2550 PATFETCH (c1); 2551 if (!(c1 == '+' || c1 == '?')) 2552 { 2553 PATUNFETCH; 2554 PATUNFETCH; 2555 break; 2556 } 2557 2558 c = c1; 2559 } 2560 else 2561 { 2562 PATUNFETCH; 2563 break; 2564 } 2565 2566 /* If we get here, we found another repeat character. */ 2567 } 2568 2569 /* Star, etc. applied to an empty pattern is equivalent 2570 to an empty pattern. */ 2571 if (!laststart) 2572 break; 2573 2574 /* Now we know whether or not zero matches is allowed 2575 and also whether or not two or more matches is allowed. 
*/ 2576 if (many_times_ok) 2577 { /* More than one repetition is allowed, so put in at the 2578 end a backward relative jump from `b' to before the next 2579 jump we're going to put in below (which jumps from 2580 laststart to after this jump). 2581 2582 But if we are at the `*' in the exact sequence `.*\n', 2583 insert an unconditional jump backwards to the ., 2584 instead of the beginning of the loop. This way we only 2585 push a failure point once, instead of every time 2586 through the loop. */ 2587 assert (p - 1 > pattern); 2588 2589 /* Allocate the space for the jump. */ 2590 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); 2591 2592 /* We know we are not at the first character of the pattern, 2593 because laststart was nonzero. And we've already 2594 incremented `p', by the way, to be the character after 2595 the `*'. Do we have to do something analogous here 2596 for null bytes, because of RE_DOT_NOT_NULL? */ 2597 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') 2598 && zero_times_ok 2599 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') 2600 && !(syntax & RE_DOT_NEWLINE)) 2601 { /* We have .*\n. */ 2602 STORE_JUMP (jump, b, laststart); 2603 keep_string_p = true; 2604 } 2605 else 2606 /* Anything else. */ 2607 STORE_JUMP (maybe_pop_jump, b, laststart - 2608 (1 + OFFSET_ADDRESS_SIZE)); 2609 2610 /* We've added more stuff to the buffer. */ 2611 b += 1 + OFFSET_ADDRESS_SIZE; 2612 } 2613 2614 /* On failure, jump from laststart to b + 3, which will be the 2615 end of the buffer after this jump is inserted. */ 2616 /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE' instead of 2617 'b + 3'. */ 2618 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); 2619 INSERT_JUMP (keep_string_p ? 
on_failure_keep_string_jump 2620 : on_failure_jump, 2621 laststart, b + 1 + OFFSET_ADDRESS_SIZE); 2622 pending_exact = 0; 2623 b += 1 + OFFSET_ADDRESS_SIZE; 2624 2625 if (!zero_times_ok) 2626 { 2627 /* At least one repetition is required, so insert a 2628 `dummy_failure_jump' before the initial 2629 `on_failure_jump' instruction of the loop. This 2630 effects a skip over that instruction the first time 2631 we hit that loop. */ 2632 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); 2633 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 2634 2 + 2 * OFFSET_ADDRESS_SIZE); 2635 b += 1 + OFFSET_ADDRESS_SIZE; 2636 } 2637 } 2638 break; 2639 2640 2641 case '.': 2642 laststart = b; 2643 BUF_PUSH (anychar); 2644 break; 2645 2646 2647 case '[': 2648 { 2649 boolean had_char_class = false; 2650 #ifdef WCHAR 2651 CHAR_T range_start = 0xffffffff; 2652 #else 2653 unsigned int range_start = 0xffffffff; 2654 #endif 2655 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2656 2657 #ifdef WCHAR 2658 /* We assume a charset(_not) structure as a wchar_t array. 2659 charset[0] = (re_opcode_t) charset(_not) 2660 charset[1] = l (= length of char_classes) 2661 charset[2] = m (= length of collating_symbols) 2662 charset[3] = n (= length of equivalence_classes) 2663 charset[4] = o (= length of char_ranges) 2664 charset[5] = p (= length of chars) 2665 2666 charset[6] = char_class (wctype_t) 2667 charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t) 2668 ... 2669 charset[l+5] = char_class (wctype_t) 2670 2671 charset[l+6] = collating_symbol (wchar_t) 2672 ... 2673 charset[l+m+5] = collating_symbol (wchar_t) 2674 ifdef _LIBC we use the index if 2675 _NL_COLLATE_SYMB_EXTRAMB instead of 2676 wchar_t string. 2677 2678 charset[l+m+6] = equivalence_classes (wchar_t) 2679 ... 2680 charset[l+m+n+5] = equivalence_classes (wchar_t) 2681 ifdef _LIBC we use the index in 2682 _NL_COLLATE_WEIGHT instead of 2683 wchar_t string. 2684 2685 charset[l+m+n+6] = range_start 2686 charset[l+m+n+7] = range_end 2687 ... 
2688 charset[l+m+n+2o+4] = range_start 2689 charset[l+m+n+2o+5] = range_end 2690 ifdef _LIBC we use the value looked up 2691 in _NL_COLLATE_COLLSEQ instead of 2692 wchar_t character. 2693 2694 charset[l+m+n+2o+6] = char 2695 ... 2696 charset[l+m+n+2o+p+5] = char 2697 2698 */ 2699 2700 /* We need at least 6 spaces: the opcode, the length of 2701 char_classes, the length of collating_symbols, the length of 2702 equivalence_classes, the length of char_ranges, the length of 2703 chars. */ 2704 GET_BUFFER_SPACE (6); 2705 2706 /* Save b as laststart. And We use laststart as the pointer 2707 to the first element of the charset here. 2708 In other words, laststart[i] indicates charset[i]. */ 2709 laststart = b; 2710 2711 /* We test `*p == '^' twice, instead of using an if 2712 statement, so we only need one BUF_PUSH. */ 2713 BUF_PUSH (*p == '^' ? charset_not : charset); 2714 if (*p == '^') 2715 p++; 2716 2717 /* Push the length of char_classes, the length of 2718 collating_symbols, the length of equivalence_classes, the 2719 length of char_ranges and the length of chars. */ 2720 BUF_PUSH_3 (0, 0, 0); 2721 BUF_PUSH_2 (0, 0); 2722 2723 /* Remember the first position in the bracket expression. */ 2724 p1 = p; 2725 2726 /* charset_not matches newline according to a syntax bit. */ 2727 if ((re_opcode_t) b[-6] == charset_not 2728 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) 2729 { 2730 BUF_PUSH('\n'); 2731 laststart[5]++; /* Update the length of characters */ 2732 } 2733 2734 /* Read in characters and ranges, setting map bits. */ 2735 for (;;) 2736 { 2737 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2738 2739 PATFETCH (c); 2740 2741 /* \ might escape characters inside [...] and [^...]. 
*/ 2742 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') 2743 { 2744 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 2745 2746 PATFETCH (c1); 2747 BUF_PUSH(c1); 2748 laststart[5]++; /* Update the length of chars */ 2749 range_start = c1; 2750 continue; 2751 } 2752 2753 /* Could be the end of the bracket expression. If it's 2754 not (i.e., when the bracket expression is `[]' so 2755 far), the ']' character bit gets set way below. */ 2756 if (c == ']' && p != p1 + 1) 2757 break; 2758 2759 /* Look ahead to see if it's a range when the last thing 2760 was a character class. */ 2761 if (had_char_class && c == '-' && *p != ']') 2762 FREE_STACK_RETURN (REG_ERANGE); 2763 2764 /* Look ahead to see if it's a range when the last thing 2765 was a character: if this is a hyphen not at the 2766 beginning or the end of a list, then it's the range 2767 operator. */ 2768 if (c == '-' 2769 && !(p - 2 >= pattern && p[-2] == '[') 2770 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') 2771 && *p != ']') 2772 { 2773 reg_errcode_t ret; 2774 /* Allocate the space for range_start and range_end. */ 2775 GET_BUFFER_SPACE (2); 2776 /* Update the pointer to indicate end of buffer. */ 2777 b += 2; 2778 ret = wcs_compile_range (range_start, &p, pend, translate, 2779 syntax, b, laststart); 2780 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 2781 range_start = 0xffffffff; 2782 } 2783 else if (p[0] == '-' && p[1] != ']') 2784 { /* This handles ranges made up of characters only. */ 2785 reg_errcode_t ret; 2786 2787 /* Move past the `-'. */ 2788 PATFETCH (c1); 2789 /* Allocate the space for range_start and range_end. */ 2790 GET_BUFFER_SPACE (2); 2791 /* Update the pointer to indicate end of buffer. */ 2792 b += 2; 2793 ret = wcs_compile_range (c, &p, pend, translate, syntax, b, 2794 laststart); 2795 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 2796 range_start = 0xffffffff; 2797 } 2798 2799 /* See if we're at the beginning of a possible character 2800 class. 
*/ 2801 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') 2802 { /* Leave room for the null. */ 2803 char str[CHAR_CLASS_MAX_LENGTH + 1]; 2804 2805 PATFETCH (c); 2806 c1 = 0; 2807 2808 /* If pattern is `[[:'. */ 2809 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2810 2811 for (;;) 2812 { 2813 PATFETCH (c); 2814 if ((c == ':' && *p == ']') || p == pend) 2815 break; 2816 if (c1 < CHAR_CLASS_MAX_LENGTH) 2817 str[c1++] = c; 2818 else 2819 /* This is in any case an invalid class name. */ 2820 str[0] = '\0'; 2821 } 2822 str[c1] = '\0'; 2823 2824 /* If isn't a word bracketed by `[:' and `:]': 2825 undo the ending character, the letters, and leave 2826 the leading `:' and `[' (but store them as character). */ 2827 if (c == ':' && *p == ']') 2828 { 2829 wctype_t wt; 2830 uintptr_t alignedp; 2831 2832 /* Query the character class as wctype_t. */ 2833 wt = IS_CHAR_CLASS (str); 2834 if (wt == 0) 2835 FREE_STACK_RETURN (REG_ECTYPE); 2836 2837 /* Throw away the ] at the end of the character 2838 class. */ 2839 PATFETCH (c); 2840 2841 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2842 2843 /* Allocate the space for character class. */ 2844 GET_BUFFER_SPACE(CHAR_CLASS_SIZE); 2845 /* Update the pointer to indicate end of buffer. */ 2846 b += CHAR_CLASS_SIZE; 2847 /* Move data which follow character classes 2848 not to violate the data. */ 2849 insert_space(CHAR_CLASS_SIZE, 2850 laststart + 6 + laststart[1], 2851 b - 1); 2852 alignedp = ((uintptr_t)(laststart + 6 + laststart[1]) 2853 + __alignof__(wctype_t) - 1) 2854 & ~(uintptr_t)(__alignof__(wctype_t) - 1); 2855 /* Store the character class. 
*/ 2856 *((wctype_t*)alignedp) = wt; 2857 /* Update length of char_classes */ 2858 laststart[1] += CHAR_CLASS_SIZE; 2859 2860 had_char_class = true; 2861 } 2862 else 2863 { 2864 c1++; 2865 while (c1--) 2866 PATUNFETCH; 2867 BUF_PUSH ('['); 2868 BUF_PUSH (':'); 2869 laststart[5] += 2; /* Update the length of characters */ 2870 range_start = ':'; 2871 had_char_class = false; 2872 } 2873 } 2874 else if (syntax & RE_CHAR_CLASSES && c == '[' && (*p == '=' 2875 || *p == '.')) 2876 { 2877 CHAR_T str[128]; /* Should be large enough. */ 2878 CHAR_T delim = *p; /* '=' or '.' */ 2879 # ifdef _LIBC 2880 uint32_t nrules = 2881 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 2882 # endif 2883 PATFETCH (c); 2884 c1 = 0; 2885 2886 /* If pattern is `[[=' or '[[.'. */ 2887 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2888 2889 for (;;) 2890 { 2891 PATFETCH (c); 2892 if ((c == delim && *p == ']') || p == pend) 2893 break; 2894 if (c1 < sizeof (str) - 1) 2895 str[c1++] = c; 2896 else 2897 /* This is in any case an invalid class name. */ 2898 str[0] = '\0'; 2899 } 2900 str[c1] = '\0'; 2901 2902 if (c == delim && *p == ']' && str[0] != '\0') 2903 { 2904 unsigned int i, offset; 2905 /* If we have no collation data we use the default 2906 collation in which each character is in a class 2907 by itself. It also means that ASCII is the 2908 character set and therefore we cannot have character 2909 with more than one byte in the multibyte 2910 representation. */ 2911 2912 /* If not defined _LIBC, we push the name and 2913 `\0' for the sake of matching performance. */ 2914 int datasize = c1 + 1; 2915 2916 # ifdef _LIBC 2917 int32_t idx = 0; 2918 if (nrules == 0) 2919 # endif 2920 { 2921 if (c1 != 1) 2922 FREE_STACK_RETURN (REG_ECOLLATE); 2923 } 2924 # ifdef _LIBC 2925 else 2926 { 2927 const int32_t *table; 2928 const int32_t *weights; 2929 const int32_t *extra; 2930 const int32_t *indirect; 2931 wint_t *cp; 2932 2933 /* This #include defines a local function! 
*/ 2934 # include <locale/weightwc.h> 2935 2936 if(delim == '=') 2937 { 2938 /* We push the index for equivalence class. */ 2939 cp = (wint_t*)str; 2940 2941 table = (const int32_t *) 2942 _NL_CURRENT (LC_COLLATE, 2943 _NL_COLLATE_TABLEWC); 2944 weights = (const int32_t *) 2945 _NL_CURRENT (LC_COLLATE, 2946 _NL_COLLATE_WEIGHTWC); 2947 extra = (const int32_t *) 2948 _NL_CURRENT (LC_COLLATE, 2949 _NL_COLLATE_EXTRAWC); 2950 indirect = (const int32_t *) 2951 _NL_CURRENT (LC_COLLATE, 2952 _NL_COLLATE_INDIRECTWC); 2953 2954 idx = findidx ((const wint_t**)&cp); 2955 if (idx == 0 || cp < (wint_t*) str + c1) 2956 /* This is no valid character. */ 2957 FREE_STACK_RETURN (REG_ECOLLATE); 2958 2959 str[0] = (wchar_t)idx; 2960 } 2961 else /* delim == '.' */ 2962 { 2963 /* We push collation sequence value 2964 for collating symbol. */ 2965 int32_t table_size; 2966 const int32_t *symb_table; 2967 const unsigned char *extra; 2968 int32_t idx; 2969 int32_t elem; 2970 int32_t second; 2971 int32_t hash; 2972 char char_str[c1]; 2973 2974 /* We have to convert the name to a single-byte 2975 string. This is possible since the names 2976 consist of ASCII characters and the internal 2977 representation is UCS4. */ 2978 for (i = 0; i < c1; ++i) 2979 char_str[i] = str[i]; 2980 2981 table_size = 2982 _NL_CURRENT_WORD (LC_COLLATE, 2983 _NL_COLLATE_SYMB_HASH_SIZEMB); 2984 symb_table = (const int32_t *) 2985 _NL_CURRENT (LC_COLLATE, 2986 _NL_COLLATE_SYMB_TABLEMB); 2987 extra = (const unsigned char *) 2988 _NL_CURRENT (LC_COLLATE, 2989 _NL_COLLATE_SYMB_EXTRAMB); 2990 2991 /* Locate the character in the hashing table. */ 2992 hash = elem_hash (char_str, c1); 2993 2994 idx = 0; 2995 elem = hash % table_size; 2996 second = hash % (table_size - 2); 2997 while (symb_table[2 * elem] != 0) 2998 { 2999 /* First compare the hashing value. 
*/ 3000 if (symb_table[2 * elem] == hash 3001 && c1 == extra[symb_table[2 * elem + 1]] 3002 && memcmp (char_str, 3003 &extra[symb_table[2 * elem + 1] 3004 + 1], c1) == 0) 3005 { 3006 /* Yep, this is the entry. */ 3007 idx = symb_table[2 * elem + 1]; 3008 idx += 1 + extra[idx]; 3009 break; 3010 } 3011 3012 /* Next entry. */ 3013 elem += second; 3014 } 3015 3016 if (symb_table[2 * elem] != 0) 3017 { 3018 /* Compute the index of the byte sequence 3019 in the table. */ 3020 idx += 1 + extra[idx]; 3021 /* Adjust for the alignment. */ 3022 idx = (idx + 3) & ~3; 3023 3024 str[0] = (wchar_t) idx + 4; 3025 } 3026 else if (symb_table[2 * elem] == 0 && c1 == 1) 3027 { 3028 /* No valid character. Match it as a 3029 single byte character. */ 3030 had_char_class = false; 3031 BUF_PUSH(str[0]); 3032 /* Update the length of characters */ 3033 laststart[5]++; 3034 range_start = str[0]; 3035 3036 /* Throw away the ] at the end of the 3037 collating symbol. */ 3038 PATFETCH (c); 3039 /* exit from the switch block. */ 3040 continue; 3041 } 3042 else 3043 FREE_STACK_RETURN (REG_ECOLLATE); 3044 } 3045 datasize = 1; 3046 } 3047 # endif 3048 /* Throw away the ] at the end of the equivalence 3049 class (or collating symbol). */ 3050 PATFETCH (c); 3051 3052 /* Allocate the space for the equivalence class 3053 (or collating symbol) (and '\0' if needed). */ 3054 GET_BUFFER_SPACE(datasize); 3055 /* Update the pointer to indicate end of buffer. */ 3056 b += datasize; 3057 3058 if (delim == '=') 3059 { /* equivalence class */ 3060 /* Calculate the offset of char_ranges, 3061 which is next to equivalence_classes. */ 3062 offset = laststart[1] + laststart[2] 3063 + laststart[3] +6; 3064 /* Insert space. */ 3065 insert_space(datasize, laststart + offset, b - 1); 3066 3067 /* Write the equivalence_class and \0. */ 3068 for (i = 0 ; i < datasize ; i++) 3069 laststart[offset + i] = str[i]; 3070 3071 /* Update the length of equivalence_classes. 
*/ 3072 laststart[3] += datasize; 3073 had_char_class = true; 3074 } 3075 else /* delim == '.' */ 3076 { /* collating symbol */ 3077 /* Calculate the offset of the equivalence_classes, 3078 which is next to collating_symbols. */ 3079 offset = laststart[1] + laststart[2] + 6; 3080 /* Insert space and write the collationg_symbol 3081 and \0. */ 3082 insert_space(datasize, laststart + offset, b-1); 3083 for (i = 0 ; i < datasize ; i++) 3084 laststart[offset + i] = str[i]; 3085 3086 /* In re_match_2_internal if range_start < -1, we 3087 assume -range_start is the offset of the 3088 collating symbol which is specified as 3089 the character of the range start. So we assign 3090 -(laststart[1] + laststart[2] + 6) to 3091 range_start. */ 3092 range_start = -(laststart[1] + laststart[2] + 6); 3093 /* Update the length of collating_symbol. */ 3094 laststart[2] += datasize; 3095 had_char_class = false; 3096 } 3097 } 3098 else 3099 { 3100 c1++; 3101 while (c1--) 3102 PATUNFETCH; 3103 BUF_PUSH ('['); 3104 BUF_PUSH (delim); 3105 laststart[5] += 2; /* Update the length of characters */ 3106 range_start = delim; 3107 had_char_class = false; 3108 } 3109 } 3110 else 3111 { 3112 had_char_class = false; 3113 BUF_PUSH(c); 3114 laststart[5]++; /* Update the length of characters */ 3115 range_start = c; 3116 } 3117 } 3118 3119 #else /* BYTE */ 3120 /* Ensure that we have enough space to push a charset: the 3121 opcode, the length count, and the bitset; 34 bytes in all. */ 3122 GET_BUFFER_SPACE (34); 3123 3124 laststart = b; 3125 3126 /* We test `*p == '^' twice, instead of using an if 3127 statement, so we only need one BUF_PUSH. */ 3128 BUF_PUSH (*p == '^' ? charset_not : charset); 3129 if (*p == '^') 3130 p++; 3131 3132 /* Remember the first position in the bracket expression. */ 3133 p1 = p; 3134 3135 /* Push the number of bytes in the bitmap. */ 3136 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); 3137 3138 /* Clear the whole map. 
*/ 3139 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH); 3140 3141 /* charset_not matches newline according to a syntax bit. */ 3142 if ((re_opcode_t) b[-2] == charset_not 3143 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) 3144 SET_LIST_BIT ('\n'); 3145 3146 /* Read in characters and ranges, setting map bits. */ 3147 for (;;) 3148 { 3149 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 3150 3151 PATFETCH (c); 3152 3153 /* \ might escape characters inside [...] and [^...]. */ 3154 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') 3155 { 3156 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 3157 3158 PATFETCH (c1); 3159 SET_LIST_BIT (c1); 3160 range_start = c1; 3161 continue; 3162 } 3163 3164 /* Could be the end of the bracket expression. If it's 3165 not (i.e., when the bracket expression is `[]' so 3166 far), the ']' character bit gets set way below. */ 3167 if (c == ']' && p != p1 + 1) 3168 break; 3169 3170 /* Look ahead to see if it's a range when the last thing 3171 was a character class. */ 3172 if (had_char_class && c == '-' && *p != ']') 3173 FREE_STACK_RETURN (REG_ERANGE); 3174 3175 /* Look ahead to see if it's a range when the last thing 3176 was a character: if this is a hyphen not at the 3177 beginning or the end of a list, then it's the range 3178 operator. */ 3179 if (c == '-' 3180 && !(p - 2 >= pattern && p[-2] == '[') 3181 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') 3182 && *p != ']') 3183 { 3184 reg_errcode_t ret 3185 = byte_compile_range (range_start, &p, pend, translate, 3186 syntax, b); 3187 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 3188 range_start = 0xffffffff; 3189 } 3190 3191 else if (p[0] == '-' && p[1] != ']') 3192 { /* This handles ranges made up of characters only. */ 3193 reg_errcode_t ret; 3194 3195 /* Move past the `-'. 
*/ 3196 PATFETCH (c1); 3197 3198 ret = byte_compile_range (c, &p, pend, translate, syntax, b); 3199 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 3200 range_start = 0xffffffff; 3201 } 3202 3203 /* See if we're at the beginning of a possible character 3204 class. */ 3205 3206 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') 3207 { /* Leave room for the null. */ 3208 char str[CHAR_CLASS_MAX_LENGTH + 1]; 3209 3210 PATFETCH (c); 3211 c1 = 0; 3212 3213 /* If pattern is `[[:'. */ 3214 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 3215 3216 for (;;) 3217 { 3218 PATFETCH (c); 3219 if ((c == ':' && *p == ']') || p == pend) 3220 break; 3221 if (c1 < CHAR_CLASS_MAX_LENGTH) 3222 str[c1++] = c; 3223 else 3224 /* This is in any case an invalid class name. */ 3225 str[0] = '\0'; 3226 } 3227 str[c1] = '\0'; 3228 3229 /* If isn't a word bracketed by `[:' and `:]': 3230 undo the ending character, the letters, and leave 3231 the leading `:' and `[' (but set bits for them). */ 3232 if (c == ':' && *p == ']') 3233 { 3234 # if defined _LIBC || WIDE_CHAR_SUPPORT 3235 boolean is_lower = STREQ (str, "lower"); 3236 boolean is_upper = STREQ (str, "upper"); 3237 wctype_t wt; 3238 int ch; 3239 3240 wt = IS_CHAR_CLASS (str); 3241 if (wt == 0) 3242 FREE_STACK_RETURN (REG_ECTYPE); 3243 3244 /* Throw away the ] at the end of the character 3245 class. 
*/ 3246 PATFETCH (c); 3247 3248 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 3249 3250 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch) 3251 { 3252 # ifdef _LIBC 3253 if (__iswctype (__btowc (ch), wt)) 3254 SET_LIST_BIT (ch); 3255 # else 3256 if (iswctype (btowc (ch), wt)) 3257 SET_LIST_BIT (ch); 3258 # endif 3259 3260 if (translate && (is_upper || is_lower) 3261 && (ISUPPER (ch) || ISLOWER (ch))) 3262 SET_LIST_BIT (ch); 3263 } 3264 3265 had_char_class = true; 3266 # else 3267 int ch; 3268 boolean is_alnum = STREQ (str, "alnum"); 3269 boolean is_alpha = STREQ (str, "alpha"); 3270 boolean is_blank = STREQ (str, "blank"); 3271 boolean is_cntrl = STREQ (str, "cntrl"); 3272 boolean is_digit = STREQ (str, "digit"); 3273 boolean is_graph = STREQ (str, "graph"); 3274 boolean is_lower = STREQ (str, "lower"); 3275 boolean is_print = STREQ (str, "print"); 3276 boolean is_punct = STREQ (str, "punct"); 3277 boolean is_space = STREQ (str, "space"); 3278 boolean is_upper = STREQ (str, "upper"); 3279 boolean is_xdigit = STREQ (str, "xdigit"); 3280 3281 if (!IS_CHAR_CLASS (str)) 3282 FREE_STACK_RETURN (REG_ECTYPE); 3283 3284 /* Throw away the ] at the end of the character 3285 class. */ 3286 PATFETCH (c); 3287 3288 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 3289 3290 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) 3291 { 3292 /* This was split into 3 if's to 3293 avoid an arbitrary limit in some compiler. 
*/ 3294 if ( (is_alnum && ISALNUM (ch)) 3295 || (is_alpha && ISALPHA (ch)) 3296 || (is_blank && ISBLANK (ch)) 3297 || (is_cntrl && ISCNTRL (ch))) 3298 SET_LIST_BIT (ch); 3299 if ( (is_digit && ISDIGIT (ch)) 3300 || (is_graph && ISGRAPH (ch)) 3301 || (is_lower && ISLOWER (ch)) 3302 || (is_print && ISPRINT (ch))) 3303 SET_LIST_BIT (ch); 3304 if ( (is_punct && ISPUNCT (ch)) 3305 || (is_space && ISSPACE (ch)) 3306 || (is_upper && ISUPPER (ch)) 3307 || (is_xdigit && ISXDIGIT (ch))) 3308 SET_LIST_BIT (ch); 3309 if ( translate && (is_upper || is_lower) 3310 && (ISUPPER (ch) || ISLOWER (ch))) 3311 SET_LIST_BIT (ch); 3312 } 3313 had_char_class = true; 3314 # endif /* libc || wctype.h */ 3315 } 3316 else 3317 { 3318 c1++; 3319 while (c1--) 3320 PATUNFETCH; 3321 SET_LIST_BIT ('['); 3322 SET_LIST_BIT (':'); 3323 range_start = ':'; 3324 had_char_class = false; 3325 } 3326 } 3327 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=') 3328 { 3329 unsigned char str[MB_LEN_MAX + 1]; 3330 # ifdef _LIBC 3331 uint32_t nrules = 3332 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3333 # endif 3334 3335 PATFETCH (c); 3336 c1 = 0; 3337 3338 /* If pattern is `[[='. */ 3339 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 3340 3341 for (;;) 3342 { 3343 PATFETCH (c); 3344 if ((c == '=' && *p == ']') || p == pend) 3345 break; 3346 if (c1 < MB_LEN_MAX) 3347 str[c1++] = c; 3348 else 3349 /* This is in any case an invalid class name. */ 3350 str[0] = '\0'; 3351 } 3352 str[c1] = '\0'; 3353 3354 if (c == '=' && *p == ']' && str[0] != '\0') 3355 { 3356 /* If we have no collation data we use the default 3357 collation in which each character is in a class 3358 by itself. It also means that ASCII is the 3359 character set and therefore we cannot have character 3360 with more than one byte in the multibyte 3361 representation. 
*/ 3362 # ifdef _LIBC 3363 if (nrules == 0) 3364 # endif 3365 { 3366 if (c1 != 1) 3367 FREE_STACK_RETURN (REG_ECOLLATE); 3368 3369 /* Throw away the ] at the end of the equivalence 3370 class. */ 3371 PATFETCH (c); 3372 3373 /* Set the bit for the character. */ 3374 SET_LIST_BIT (str[0]); 3375 } 3376 # ifdef _LIBC 3377 else 3378 { 3379 /* Try to match the byte sequence in `str' against 3380 those known to the collate implementation. 3381 First find out whether the bytes in `str' are 3382 actually from exactly one character. */ 3383 const int32_t *table; 3384 const unsigned char *weights; 3385 const unsigned char *extra; 3386 const int32_t *indirect; 3387 int32_t idx; 3388 const unsigned char *cp = str; 3389 int ch; 3390 3391 /* This #include defines a local function! */ 3392 # include <locale/weight.h> 3393 3394 table = (const int32_t *) 3395 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3396 weights = (const unsigned char *) 3397 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); 3398 extra = (const unsigned char *) 3399 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); 3400 indirect = (const int32_t *) 3401 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); 3402 3403 idx = findidx (&cp); 3404 if (idx == 0 || cp < str + c1) 3405 /* This is no valid character. */ 3406 FREE_STACK_RETURN (REG_ECOLLATE); 3407 3408 /* Throw away the ] at the end of the equivalence 3409 class. */ 3410 PATFETCH (c); 3411 3412 /* Now we have to go through the whole table 3413 and find all characters which have the same 3414 first level weight. 3415 3416 XXX Note that this is not entirely correct. 3417 we would have to match multibyte sequences 3418 but this is not possible with the current 3419 implementation. */ 3420 for (ch = 1; ch < 256; ++ch) 3421 /* XXX This test would have to be changed if we 3422 would allow matching multibyte sequences. */ 3423 if (table[ch] > 0) 3424 { 3425 int32_t idx2 = table[ch]; 3426 size_t len = weights[idx2]; 3427 3428 /* Test whether the lenghts match. 
*/ 3429 if (weights[idx] == len) 3430 { 3431 /* They do. New compare the bytes of 3432 the weight. */ 3433 size_t cnt = 0; 3434 3435 while (cnt < len 3436 && (weights[idx + 1 + cnt] 3437 == weights[idx2 + 1 + cnt])) 3438 ++cnt; 3439 3440 if (cnt == len) 3441 /* They match. Mark the character as 3442 acceptable. */ 3443 SET_LIST_BIT (ch); 3444 } 3445 } 3446 } 3447 # endif 3448 had_char_class = true; 3449 } 3450 else 3451 { 3452 c1++; 3453 while (c1--) 3454 PATUNFETCH; 3455 SET_LIST_BIT ('['); 3456 SET_LIST_BIT ('='); 3457 range_start = '='; 3458 had_char_class = false; 3459 } 3460 } 3461 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.') 3462 { 3463 unsigned char str[128]; /* Should be large enough. */ 3464 # ifdef _LIBC 3465 uint32_t nrules = 3466 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3467 # endif 3468 3469 PATFETCH (c); 3470 c1 = 0; 3471 3472 /* If pattern is `[[.'. */ 3473 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 3474 3475 for (;;) 3476 { 3477 PATFETCH (c); 3478 if ((c == '.' && *p == ']') || p == pend) 3479 break; 3480 if (c1 < sizeof (str)) 3481 str[c1++] = c; 3482 else 3483 /* This is in any case an invalid class name. */ 3484 str[0] = '\0'; 3485 } 3486 str[c1] = '\0'; 3487 3488 if (c == '.' && *p == ']' && str[0] != '\0') 3489 { 3490 /* If we have no collation data we use the default 3491 collation in which each character is the name 3492 for its own class which contains only the one 3493 character. It also means that ASCII is the 3494 character set and therefore we cannot have character 3495 with more than one byte in the multibyte 3496 representation. */ 3497 # ifdef _LIBC 3498 if (nrules == 0) 3499 # endif 3500 { 3501 if (c1 != 1) 3502 FREE_STACK_RETURN (REG_ECOLLATE); 3503 3504 /* Throw away the ] at the end of the equivalence 3505 class. */ 3506 PATFETCH (c); 3507 3508 /* Set the bit for the character. 
*/ 3509 SET_LIST_BIT (str[0]); 3510 range_start = ((const unsigned char *) str)[0]; 3511 } 3512 # ifdef _LIBC 3513 else 3514 { 3515 /* Try to match the byte sequence in `str' against 3516 those known to the collate implementation. 3517 First find out whether the bytes in `str' are 3518 actually from exactly one character. */ 3519 int32_t table_size; 3520 const int32_t *symb_table; 3521 const unsigned char *extra; 3522 int32_t idx; 3523 int32_t elem; 3524 int32_t second; 3525 int32_t hash; 3526 3527 table_size = 3528 _NL_CURRENT_WORD (LC_COLLATE, 3529 _NL_COLLATE_SYMB_HASH_SIZEMB); 3530 symb_table = (const int32_t *) 3531 _NL_CURRENT (LC_COLLATE, 3532 _NL_COLLATE_SYMB_TABLEMB); 3533 extra = (const unsigned char *) 3534 _NL_CURRENT (LC_COLLATE, 3535 _NL_COLLATE_SYMB_EXTRAMB); 3536 3537 /* Locate the character in the hashing table. */ 3538 hash = elem_hash (str, c1); 3539 3540 idx = 0; 3541 elem = hash % table_size; 3542 second = hash % (table_size - 2); 3543 while (symb_table[2 * elem] != 0) 3544 { 3545 /* First compare the hashing value. */ 3546 if (symb_table[2 * elem] == hash 3547 && c1 == extra[symb_table[2 * elem + 1]] 3548 && memcmp (str, 3549 &extra[symb_table[2 * elem + 1] 3550 + 1], 3551 c1) == 0) 3552 { 3553 /* Yep, this is the entry. */ 3554 idx = symb_table[2 * elem + 1]; 3555 idx += 1 + extra[idx]; 3556 break; 3557 } 3558 3559 /* Next entry. */ 3560 elem += second; 3561 } 3562 3563 if (symb_table[2 * elem] == 0) 3564 /* This is no valid character. */ 3565 FREE_STACK_RETURN (REG_ECOLLATE); 3566 3567 /* Throw away the ] at the end of the equivalence 3568 class. */ 3569 PATFETCH (c); 3570 3571 /* Now add the multibyte character(s) we found 3572 to the accept list. 3573 3574 XXX Note that this is not entirely correct. 3575 we would have to match multibyte sequences 3576 but this is not possible with the current 3577 implementation. 
Also, we have to match 3578 collating symbols, which expand to more than 3579 one file, as a whole and not allow the 3580 individual bytes. */ 3581 c1 = extra[idx++]; 3582 if (c1 == 1) 3583 range_start = extra[idx]; 3584 while (c1-- > 0) 3585 { 3586 SET_LIST_BIT (extra[idx]); 3587 ++idx; 3588 } 3589 } 3590 # endif 3591 had_char_class = false; 3592 } 3593 else 3594 { 3595 c1++; 3596 while (c1--) 3597 PATUNFETCH; 3598 SET_LIST_BIT ('['); 3599 SET_LIST_BIT ('.'); 3600 range_start = '.'; 3601 had_char_class = false; 3602 } 3603 } 3604 else 3605 { 3606 had_char_class = false; 3607 SET_LIST_BIT (c); 3608 range_start = c; 3609 } 3610 } 3611 3612 /* Discard any (non)matching list bytes that are all 0 at the 3613 end of the map. Decrease the map-length byte too. */ 3614 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) 3615 b[-1]--; 3616 b += b[-1]; 3617 #endif /* WCHAR */ 3618 } 3619 break; 3620 3621 3622 case '(': 3623 if (syntax & RE_NO_BK_PARENS) 3624 goto handle_open; 3625 else 3626 goto normal_char; 3627 3628 3629 case ')': 3630 if (syntax & RE_NO_BK_PARENS) 3631 goto handle_close; 3632 else 3633 goto normal_char; 3634 3635 3636 case '\n': 3637 if (syntax & RE_NEWLINE_ALT) 3638 goto handle_alt; 3639 else 3640 goto normal_char; 3641 3642 3643 case '|': 3644 if (syntax & RE_NO_BK_VBAR) 3645 goto handle_alt; 3646 else 3647 goto normal_char; 3648 3649 3650 case '{': 3651 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) 3652 goto handle_interval; 3653 else 3654 goto normal_char; 3655 3656 3657 case '\\': 3658 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 3659 3660 /* Do not translate the character after the \, so that we can 3661 distinguish, e.g., \B from \b, even if we normally would 3662 translate, e.g., B to b. 
*/ 3663 PATFETCH_RAW (c); 3664 3665 switch (c) 3666 { 3667 case '(': 3668 if (syntax & RE_NO_BK_PARENS) 3669 goto normal_backslash; 3670 3671 handle_open: 3672 bufp->re_nsub++; 3673 regnum++; 3674 3675 if (COMPILE_STACK_FULL) 3676 { 3677 RETALLOC (compile_stack.stack, compile_stack.size << 1, 3678 compile_stack_elt_t); 3679 if (compile_stack.stack == NULL) return REG_ESPACE; 3680 3681 compile_stack.size <<= 1; 3682 } 3683 3684 /* These are the values to restore when we hit end of this 3685 group. They are all relative offsets, so that if the 3686 whole pattern moves because of realloc, they will still 3687 be valid. */ 3688 COMPILE_STACK_TOP.begalt_offset = begalt - COMPILED_BUFFER_VAR; 3689 COMPILE_STACK_TOP.fixup_alt_jump 3690 = fixup_alt_jump ? fixup_alt_jump - COMPILED_BUFFER_VAR + 1 : 0; 3691 COMPILE_STACK_TOP.laststart_offset = b - COMPILED_BUFFER_VAR; 3692 COMPILE_STACK_TOP.regnum = regnum; 3693 3694 /* We will eventually replace the 0 with the number of 3695 groups inner to this one. But do not push a 3696 start_memory for groups beyond the last one we can 3697 represent in the compiled pattern. */ 3698 if (regnum <= MAX_REGNUM) 3699 { 3700 COMPILE_STACK_TOP.inner_group_offset = b 3701 - COMPILED_BUFFER_VAR + 2; 3702 BUF_PUSH_3 (start_memory, regnum, 0); 3703 } 3704 3705 compile_stack.avail++; 3706 3707 fixup_alt_jump = 0; 3708 laststart = 0; 3709 begalt = b; 3710 /* If we've reached MAX_REGNUM groups, then this open 3711 won't actually generate any code, so we'll have to 3712 clear pending_exact explicitly. 
*/ 3713 pending_exact = 0; 3714 break; 3715 3716 3717 case ')': 3718 if (syntax & RE_NO_BK_PARENS) goto normal_backslash; 3719 3720 if (COMPILE_STACK_EMPTY) 3721 { 3722 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) 3723 goto normal_backslash; 3724 else 3725 FREE_STACK_RETURN (REG_ERPAREN); 3726 } 3727 3728 handle_close: 3729 if (fixup_alt_jump) 3730 { /* Push a dummy failure point at the end of the 3731 alternative for a possible future 3732 `pop_failure_jump' to pop. See comments at 3733 `push_dummy_failure' in `re_match_2'. */ 3734 BUF_PUSH (push_dummy_failure); 3735 3736 /* We allocated space for this jump when we assigned 3737 to `fixup_alt_jump', in the `handle_alt' case below. */ 3738 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1); 3739 } 3740 3741 /* See similar code for backslashed left paren above. */ 3742 if (COMPILE_STACK_EMPTY) 3743 { 3744 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) 3745 goto normal_char; 3746 else 3747 FREE_STACK_RETURN (REG_ERPAREN); 3748 } 3749 3750 /* Since we just checked for an empty stack above, this 3751 ``can't happen''. */ 3752 assert (compile_stack.avail != 0); 3753 { 3754 /* We don't just want to restore into `regnum', because 3755 later groups should continue to be numbered higher, 3756 as in `(ab)c(de)' -- the second group is #2. */ 3757 regnum_t this_group_regnum; 3758 3759 compile_stack.avail--; 3760 begalt = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.begalt_offset; 3761 fixup_alt_jump 3762 = COMPILE_STACK_TOP.fixup_alt_jump 3763 ? COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.fixup_alt_jump - 1 3764 : 0; 3765 laststart = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.laststart_offset; 3766 this_group_regnum = COMPILE_STACK_TOP.regnum; 3767 /* If we've reached MAX_REGNUM groups, then this open 3768 won't actually generate any code, so we'll have to 3769 clear pending_exact explicitly. */ 3770 pending_exact = 0; 3771 3772 /* We're at the end of the group, so now we know how many 3773 groups were inside this one. 
*/ 3774 if (this_group_regnum <= MAX_REGNUM) 3775 { 3776 UCHAR_T *inner_group_loc 3777 = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.inner_group_offset; 3778 3779 *inner_group_loc = regnum - this_group_regnum; 3780 BUF_PUSH_3 (stop_memory, this_group_regnum, 3781 regnum - this_group_regnum); 3782 } 3783 } 3784 break; 3785 3786 3787 case '|': /* `\|'. */ 3788 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) 3789 goto normal_backslash; 3790 handle_alt: 3791 if (syntax & RE_LIMITED_OPS) 3792 goto normal_char; 3793 3794 /* Insert before the previous alternative a jump which 3795 jumps to this alternative if the former fails. */ 3796 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); 3797 INSERT_JUMP (on_failure_jump, begalt, 3798 b + 2 + 2 * OFFSET_ADDRESS_SIZE); 3799 pending_exact = 0; 3800 b += 1 + OFFSET_ADDRESS_SIZE; 3801 3802 /* The alternative before this one has a jump after it 3803 which gets executed if it gets matched. Adjust that 3804 jump so it will jump to this alternative's analogous 3805 jump (put in below, which in turn will jump to the next 3806 (if any) alternative's such jump, etc.). The last such 3807 jump jumps to the correct final destination. A picture: 3808 _____ _____ 3809 | | | | 3810 | v | v 3811 a | b | c 3812 3813 If we are at `b', then fixup_alt_jump right now points to a 3814 three-byte space after `a'. We'll put in the jump, set 3815 fixup_alt_jump to right after `b', and leave behind three 3816 bytes which we'll fill in when we get to after `c'. */ 3817 3818 if (fixup_alt_jump) 3819 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); 3820 3821 /* Mark and leave space for a jump after this alternative, 3822 to be filled in later either by next alternative or 3823 when know we're at the end of a series of alternatives. */ 3824 fixup_alt_jump = b; 3825 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); 3826 b += 1 + OFFSET_ADDRESS_SIZE; 3827 3828 laststart = 0; 3829 begalt = b; 3830 break; 3831 3832 3833 case '{': 3834 /* If \{ is a literal. 
*/ 3835 if (!(syntax & RE_INTERVALS) 3836 /* If we're at `\{' and it's not the open-interval 3837 operator. */ 3838 || (syntax & RE_NO_BK_BRACES)) 3839 goto normal_backslash; 3840 3841 handle_interval: 3842 { 3843 /* If got here, then the syntax allows intervals. */ 3844 3845 /* At least (most) this many matches must be made. */ 3846 int lower_bound = -1, upper_bound = -1; 3847 3848 /* Place in the uncompiled pattern (i.e., just after 3849 the '{') to go back to if the interval is invalid. */ 3850 const CHAR_T *beg_interval = p; 3851 3852 if (p == pend) 3853 goto invalid_interval; 3854 3855 GET_UNSIGNED_NUMBER (lower_bound); 3856 3857 if (c == ',') 3858 { 3859 GET_UNSIGNED_NUMBER (upper_bound); 3860 if (upper_bound < 0) 3861 upper_bound = RE_DUP_MAX; 3862 } 3863 else 3864 /* Interval such as `{1}' => match exactly once. */ 3865 upper_bound = lower_bound; 3866 3867 if (! (0 <= lower_bound && lower_bound <= upper_bound)) 3868 goto invalid_interval; 3869 3870 if (!(syntax & RE_NO_BK_BRACES)) 3871 { 3872 if (c != '\\' || p == pend) 3873 goto invalid_interval; 3874 PATFETCH (c); 3875 } 3876 3877 if (c != '}') 3878 goto invalid_interval; 3879 3880 /* If it's invalid to have no preceding re. */ 3881 if (!laststart) 3882 { 3883 if (syntax & RE_CONTEXT_INVALID_OPS 3884 && !(syntax & RE_INVALID_INTERVAL_ORD)) 3885 FREE_STACK_RETURN (REG_BADRPT); 3886 else if (syntax & RE_CONTEXT_INDEP_OPS) 3887 laststart = b; 3888 else 3889 goto unfetch_interval; 3890 } 3891 3892 /* We just parsed a valid interval. */ 3893 3894 if (RE_DUP_MAX < upper_bound) 3895 FREE_STACK_RETURN (REG_BADBR); 3896 3897 /* If the upper bound is zero, don't want to succeed at 3898 all; jump from `laststart' to `b + 3', which will be 3899 the end of the buffer after we insert the jump. */ 3900 /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE' 3901 instead of 'b + 3'. 
*/ 3902 if (upper_bound == 0) 3903 { 3904 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); 3905 INSERT_JUMP (jump, laststart, b + 1 3906 + OFFSET_ADDRESS_SIZE); 3907 b += 1 + OFFSET_ADDRESS_SIZE; 3908 } 3909 3910 /* Otherwise, we have a nontrivial interval. When 3911 we're all done, the pattern will look like: 3912 set_number_at <jump count> <upper bound> 3913 set_number_at <succeed_n count> <lower bound> 3914 succeed_n <after jump addr> <succeed_n count> 3915 <body of loop> 3916 jump_n <succeed_n addr> <jump count> 3917 (The upper bound and `jump_n' are omitted if 3918 `upper_bound' is 1, though.) */ 3919 else 3920 { /* If the upper bound is > 1, we need to insert 3921 more at the end of the loop. */ 3922 unsigned nbytes = 2 + 4 * OFFSET_ADDRESS_SIZE + 3923 (upper_bound > 1) * (2 + 4 * OFFSET_ADDRESS_SIZE); 3924 3925 GET_BUFFER_SPACE (nbytes); 3926 3927 /* Initialize lower bound of the `succeed_n', even 3928 though it will be set during matching by its 3929 attendant `set_number_at' (inserted next), 3930 because `re_compile_fastmap' needs to know. 3931 Jump to the `jump_n' we might insert below. */ 3932 INSERT_JUMP2 (succeed_n, laststart, 3933 b + 1 + 2 * OFFSET_ADDRESS_SIZE 3934 + (upper_bound > 1) * (1 + 2 * OFFSET_ADDRESS_SIZE) 3935 , lower_bound); 3936 b += 1 + 2 * OFFSET_ADDRESS_SIZE; 3937 3938 /* Code to initialize the lower bound. Insert 3939 before the `succeed_n'. The `5' is the last two 3940 bytes of this `set_number_at', plus 3 bytes of 3941 the following `succeed_n'. */ 3942 /* ifdef WCHAR, The '1+2*OFFSET_ADDRESS_SIZE' 3943 is the 'set_number_at', plus '1+OFFSET_ADDRESS_SIZE' 3944 of the following `succeed_n'. */ 3945 PREFIX(insert_op2) (set_number_at, laststart, 1 3946 + 2 * OFFSET_ADDRESS_SIZE, lower_bound, b); 3947 b += 1 + 2 * OFFSET_ADDRESS_SIZE; 3948 3949 if (upper_bound > 1) 3950 { /* More than one repetition is allowed, so 3951 append a backward jump to the `succeed_n' 3952 that starts this interval. 
3953 3954 When we've reached this during matching, 3955 we'll have matched the interval once, so 3956 jump back only `upper_bound - 1' times. */ 3957 STORE_JUMP2 (jump_n, b, laststart 3958 + 2 * OFFSET_ADDRESS_SIZE + 1, 3959 upper_bound - 1); 3960 b += 1 + 2 * OFFSET_ADDRESS_SIZE; 3961 3962 /* The location we want to set is the second 3963 parameter of the `jump_n'; that is `b-2' as 3964 an absolute address. `laststart' will be 3965 the `set_number_at' we're about to insert; 3966 `laststart+3' the number to set, the source 3967 for the relative address. But we are 3968 inserting into the middle of the pattern -- 3969 so everything is getting moved up by 5. 3970 Conclusion: (b - 2) - (laststart + 3) + 5, 3971 i.e., b - laststart. 3972 3973 We insert this at the beginning of the loop 3974 so that if we fail during matching, we'll 3975 reinitialize the bounds. */ 3976 PREFIX(insert_op2) (set_number_at, laststart, 3977 b - laststart, 3978 upper_bound - 1, b); 3979 b += 1 + 2 * OFFSET_ADDRESS_SIZE; 3980 } 3981 } 3982 pending_exact = 0; 3983 break; 3984 3985 invalid_interval: 3986 if (!(syntax & RE_INVALID_INTERVAL_ORD)) 3987 FREE_STACK_RETURN (p == pend ? REG_EBRACE : REG_BADBR); 3988 unfetch_interval: 3989 /* Match the characters as literals. */ 3990 p = beg_interval; 3991 c = '{'; 3992 if (syntax & RE_NO_BK_BRACES) 3993 goto normal_char; 3994 else 3995 goto normal_backslash; 3996 } 3997 3998 #ifdef emacs 3999 /* There is no way to specify the before_dot and after_dot 4000 operators. rms says this is ok. 
--karl */ 4001 case '=': 4002 BUF_PUSH (at_dot); 4003 break; 4004 4005 case 's': 4006 laststart = b; 4007 PATFETCH (c); 4008 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]); 4009 break; 4010 4011 case 'S': 4012 laststart = b; 4013 PATFETCH (c); 4014 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); 4015 break; 4016 #endif /* emacs */ 4017 4018 4019 case 'w': 4020 if (syntax & RE_NO_GNU_OPS) 4021 goto normal_char; 4022 laststart = b; 4023 BUF_PUSH (wordchar); 4024 break; 4025 4026 4027 case 'W': 4028 if (syntax & RE_NO_GNU_OPS) 4029 goto normal_char; 4030 laststart = b; 4031 BUF_PUSH (notwordchar); 4032 break; 4033 4034 4035 case '<': 4036 if (syntax & RE_NO_GNU_OPS) 4037 goto normal_char; 4038 BUF_PUSH (wordbeg); 4039 break; 4040 4041 case '>': 4042 if (syntax & RE_NO_GNU_OPS) 4043 goto normal_char; 4044 BUF_PUSH (wordend); 4045 break; 4046 4047 case 'b': 4048 if (syntax & RE_NO_GNU_OPS) 4049 goto normal_char; 4050 BUF_PUSH (wordbound); 4051 break; 4052 4053 case 'B': 4054 if (syntax & RE_NO_GNU_OPS) 4055 goto normal_char; 4056 BUF_PUSH (notwordbound); 4057 break; 4058 4059 case '`': 4060 if (syntax & RE_NO_GNU_OPS) 4061 goto normal_char; 4062 BUF_PUSH (begbuf); 4063 break; 4064 4065 case '\'': 4066 if (syntax & RE_NO_GNU_OPS) 4067 goto normal_char; 4068 BUF_PUSH (endbuf); 4069 break; 4070 4071 case '1': case '2': case '3': case '4': case '5': 4072 case '6': case '7': case '8': case '9': 4073 if (syntax & RE_NO_BK_REFS) 4074 goto normal_char; 4075 4076 c1 = c - '0'; 4077 4078 if (c1 > regnum) 4079 FREE_STACK_RETURN (REG_ESUBREG); 4080 4081 /* Can't back reference to a subexpression if inside of it. 
*/ 4082 if (group_in_compile_stack (compile_stack, (regnum_t) c1)) 4083 goto normal_char; 4084 4085 laststart = b; 4086 BUF_PUSH_2 (duplicate, c1); 4087 break; 4088 4089 4090 case '+': 4091 case '?': 4092 if (syntax & RE_BK_PLUS_QM) 4093 goto handle_plus; 4094 else 4095 goto normal_backslash; 4096 4097 default: 4098 normal_backslash: 4099 /* You might think it would be useful for \ to mean 4100 not to translate; but if we don't translate it 4101 it will never match anything. */ 4102 c = TRANSLATE (c); 4103 goto normal_char; 4104 } 4105 break; 4106 4107 4108 default: 4109 /* Expects the character in `c'. */ 4110 normal_char: 4111 /* If no exactn currently being built. */ 4112 if (!pending_exact 4113 #ifdef WCHAR 4114 /* If last exactn handle binary(or character) and 4115 new exactn handle character(or binary). */ 4116 || is_exactn_bin != is_binary[p - 1 - pattern] 4117 #endif /* WCHAR */ 4118 4119 /* If last exactn not at current position. */ 4120 || pending_exact + *pending_exact + 1 != b 4121 4122 /* We have only one byte following the exactn for the count. */ 4123 || *pending_exact == (1 << BYTEWIDTH) - 1 4124 4125 /* If followed by a repetition operator. */ 4126 || *p == '*' || *p == '^' 4127 || ((syntax & RE_BK_PLUS_QM) 4128 ? *p == '\\' && (p[1] == '+' || p[1] == '?') 4129 : (*p == '+' || *p == '?')) 4130 || ((syntax & RE_INTERVALS) 4131 && ((syntax & RE_NO_BK_BRACES) 4132 ? *p == '{' 4133 : (p[0] == '\\' && p[1] == '{')))) 4134 { 4135 /* Start building a new exactn. */ 4136 4137 laststart = b; 4138 4139 #ifdef WCHAR 4140 /* Is this exactn binary data or character? 
*/ 4141 is_exactn_bin = is_binary[p - 1 - pattern]; 4142 if (is_exactn_bin) 4143 BUF_PUSH_2 (exactn_bin, 0); 4144 else 4145 BUF_PUSH_2 (exactn, 0); 4146 #else 4147 BUF_PUSH_2 (exactn, 0); 4148 #endif /* WCHAR */ 4149 pending_exact = b - 1; 4150 } 4151 4152 BUF_PUSH (c); 4153 (*pending_exact)++; 4154 break; 4155 } /* switch (c) */ 4156 } /* while p != pend */ 4157 4158 4159 /* Through the pattern now. */ 4160 4161 if (fixup_alt_jump) 4162 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); 4163 4164 if (!COMPILE_STACK_EMPTY) 4165 FREE_STACK_RETURN (REG_EPAREN); 4166 4167 /* If we don't want backtracking, force success 4168 the first time we reach the end of the compiled pattern. */ 4169 if (syntax & RE_NO_POSIX_BACKTRACKING) 4170 BUF_PUSH (succeed); 4171 4172 #ifdef WCHAR 4173 free (pattern); 4174 free (mbs_offset); 4175 free (is_binary); 4176 #endif 4177 free (compile_stack.stack); 4178 4179 /* We have succeeded; set the length of the buffer. */ 4180 #ifdef WCHAR 4181 bufp->used = (uintptr_t) b - (uintptr_t) COMPILED_BUFFER_VAR; 4182 #else 4183 bufp->used = b - bufp->buffer; 4184 #endif 4185 4186 #ifdef DEBUG 4187 if (debug) 4188 { 4189 DEBUG_PRINT1 ("\nCompiled pattern: \n"); 4190 PREFIX(print_compiled_pattern) (bufp); 4191 } 4192 #endif /* DEBUG */ 4193 4194 #ifndef MATCH_MAY_ALLOCATE 4195 /* Initialize the failure stack to the largest possible stack. This 4196 isn't necessary unless we're trying to avoid calling alloca in 4197 the search and match routines. */ 4198 { 4199 int num_regs = bufp->re_nsub + 1; 4200 4201 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size 4202 is strictly greater than re_max_failures, the largest possible stack 4203 is 2 * re_max_failures failure points. */ 4204 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) 4205 { 4206 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS); 4207 4208 # ifdef emacs 4209 if (! 
fail_stack.stack) 4210 fail_stack.stack 4211 = (PREFIX(fail_stack_elt_t) *) xmalloc (fail_stack.size 4212 * sizeof (PREFIX(fail_stack_elt_t))); 4213 else 4214 fail_stack.stack 4215 = (PREFIX(fail_stack_elt_t) *) xrealloc (fail_stack.stack, 4216 (fail_stack.size 4217 * sizeof (PREFIX(fail_stack_elt_t)))); 4218 # else /* not emacs */ 4219 if (! fail_stack.stack) 4220 fail_stack.stack 4221 = (PREFIX(fail_stack_elt_t) *) malloc (fail_stack.size 4222 * sizeof (PREFIX(fail_stack_elt_t))); 4223 else 4224 fail_stack.stack 4225 = (PREFIX(fail_stack_elt_t) *) realloc (fail_stack.stack, 4226 (fail_stack.size 4227 * sizeof (PREFIX(fail_stack_elt_t)))); 4228 # endif /* not emacs */ 4229 } 4230 4231 PREFIX(regex_grow_registers) (num_regs); 4232 } 4233 #endif /* not MATCH_MAY_ALLOCATE */ 4234 4235 return REG_NOERROR; 4236 } /* regex_compile */ 4237 4238 /* Subroutines for `regex_compile'. */ 4239 4240 /* Store OP at LOC followed by two-byte integer parameter ARG. */ 4241 /* ifdef WCHAR, integer parameter is 1 wchar_t. */ 4242 4243 static void 4244 PREFIX(store_op1) (re_opcode_t op, UCHAR_T *loc, int arg) 4245 { 4246 *loc = (UCHAR_T) op; 4247 STORE_NUMBER (loc + 1, arg); 4248 } 4249 4250 4251 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */ 4252 /* ifdef WCHAR, integer parameter is 1 wchar_t. */ 4253 4254 static void 4255 PREFIX(store_op2) (re_opcode_t op, UCHAR_T *loc, int arg1, int arg2) 4256 { 4257 *loc = (UCHAR_T) op; 4258 STORE_NUMBER (loc + 1, arg1); 4259 STORE_NUMBER (loc + 1 + OFFSET_ADDRESS_SIZE, arg2); 4260 } 4261 4262 4263 /* Copy the bytes from LOC to END to open up three bytes of space at LOC 4264 for OP followed by two-byte integer parameter ARG. */ 4265 /* ifdef WCHAR, integer parameter is 1 wchar_t. 
*/ 4266 4267 static void 4268 PREFIX(insert_op1) (re_opcode_t op, UCHAR_T *loc, int arg, UCHAR_T *end) 4269 { 4270 register UCHAR_T *pfrom = end; 4271 register UCHAR_T *pto = end + 1 + OFFSET_ADDRESS_SIZE; 4272 4273 while (pfrom != loc) 4274 *--pto = *--pfrom; 4275 4276 PREFIX(store_op1) (op, loc, arg); 4277 } 4278 4279 4280 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */ 4281 /* ifdef WCHAR, integer parameter is 1 wchar_t. */ 4282 4283 static void 4284 PREFIX(insert_op2) (re_opcode_t op, UCHAR_T *loc, int arg1, 4285 int arg2, UCHAR_T *end) 4286 { 4287 register UCHAR_T *pfrom = end; 4288 register UCHAR_T *pto = end + 1 + 2 * OFFSET_ADDRESS_SIZE; 4289 4290 while (pfrom != loc) 4291 *--pto = *--pfrom; 4292 4293 PREFIX(store_op2) (op, loc, arg1, arg2); 4294 } 4295 4296 4297 /* P points to just after a ^ in PATTERN. Return true if that ^ comes 4298 after an alternative or a begin-subexpression. We assume there is at 4299 least one character before the ^. */ 4300 4301 static boolean 4302 PREFIX(at_begline_loc_p) (const CHAR_T *pattern, const CHAR_T *p, 4303 reg_syntax_t syntax) 4304 { 4305 const CHAR_T *prev = p - 2; 4306 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; 4307 4308 return 4309 /* After a subexpression? */ 4310 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) 4311 /* After an alternative? */ 4312 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)); 4313 } 4314 4315 4316 /* The dual of at_begline_loc_p. This one is for $. We assume there is 4317 at least one character after the $, i.e., `P < PEND'. */ 4318 4319 static boolean 4320 PREFIX(at_endline_loc_p) (const CHAR_T *p, const CHAR_T *pend, 4321 reg_syntax_t syntax) 4322 { 4323 const CHAR_T *next = p; 4324 boolean next_backslash = *next == '\\'; 4325 const CHAR_T *next_next = p + 1 < pend ? p + 1 : 0; 4326 4327 return 4328 /* Before a subexpression? */ 4329 (syntax & RE_NO_BK_PARENS ? 
*next == ')' 4330 : next_backslash && next_next && *next_next == ')') 4331 /* Before an alternative? */ 4332 || (syntax & RE_NO_BK_VBAR ? *next == '|' 4333 : next_backslash && next_next && *next_next == '|'); 4334 } 4335 4336 #else /* not INSIDE_RECURSION */ 4337 4338 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and 4339 false if it's not. */ 4340 4341 static boolean 4342 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum) 4343 { 4344 int this_element; 4345 4346 for (this_element = compile_stack.avail - 1; 4347 this_element >= 0; 4348 this_element--) 4349 if (compile_stack.stack[this_element].regnum == regnum) 4350 return true; 4351 4352 return false; 4353 } 4354 #endif /* not INSIDE_RECURSION */ 4355 4356 #ifdef INSIDE_RECURSION 4357 4358 #ifdef WCHAR 4359 /* This insert space, which size is "num", into the pattern at "loc". 4360 "end" must point the end of the allocated buffer. */ 4361 static void 4362 insert_space (int num, CHAR_T *loc, CHAR_T *end) 4363 { 4364 register CHAR_T *pto = end; 4365 register CHAR_T *pfrom = end - num; 4366 4367 while (pfrom >= loc) 4368 *pto-- = *pfrom--; 4369 } 4370 #endif /* WCHAR */ 4371 4372 #ifdef WCHAR 4373 static reg_errcode_t 4374 wcs_compile_range (CHAR_T range_start_char, const CHAR_T **p_ptr, 4375 const CHAR_T *pend, RE_TRANSLATE_TYPE translate, 4376 reg_syntax_t syntax, CHAR_T *b, CHAR_T *char_set) 4377 { 4378 const CHAR_T *p = *p_ptr; 4379 CHAR_T range_start, range_end; 4380 reg_errcode_t ret; 4381 # ifdef _LIBC 4382 uint32_t nrules; 4383 uint32_t start_val, end_val; 4384 # endif 4385 if (p == pend) 4386 return REG_ERANGE; 4387 4388 # ifdef _LIBC 4389 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 4390 if (nrules != 0) 4391 { 4392 const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE, 4393 _NL_COLLATE_COLLSEQWC); 4394 const unsigned char *extra = (const unsigned char *) 4395 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 4396 4397 if (range_start_char < -1) 
4398 { 4399 /* range_start is a collating symbol. */ 4400 int32_t *wextra; 4401 /* Retreive the index and get collation sequence value. */ 4402 wextra = (int32_t*)(extra + char_set[-range_start_char]); 4403 start_val = wextra[1 + *wextra]; 4404 } 4405 else 4406 start_val = collseq_table_lookup(collseq, TRANSLATE(range_start_char)); 4407 4408 end_val = collseq_table_lookup (collseq, TRANSLATE (p[0])); 4409 4410 /* Report an error if the range is empty and the syntax prohibits 4411 this. */ 4412 ret = ((syntax & RE_NO_EMPTY_RANGES) 4413 && (start_val > end_val))? REG_ERANGE : REG_NOERROR; 4414 4415 /* Insert space to the end of the char_ranges. */ 4416 insert_space(2, b - char_set[5] - 2, b - 1); 4417 *(b - char_set[5] - 2) = (wchar_t)start_val; 4418 *(b - char_set[5] - 1) = (wchar_t)end_val; 4419 char_set[4]++; /* ranges_index */ 4420 } 4421 else 4422 # endif 4423 { 4424 range_start = (range_start_char >= 0)? TRANSLATE (range_start_char): 4425 range_start_char; 4426 range_end = TRANSLATE (p[0]); 4427 /* Report an error if the range is empty and the syntax prohibits 4428 this. */ 4429 ret = ((syntax & RE_NO_EMPTY_RANGES) 4430 && (range_start > range_end))? REG_ERANGE : REG_NOERROR; 4431 4432 /* Insert space to the end of the char_ranges. */ 4433 insert_space(2, b - char_set[5] - 2, b - 1); 4434 *(b - char_set[5] - 2) = range_start; 4435 *(b - char_set[5] - 1) = range_end; 4436 char_set[4]++; /* ranges_index */ 4437 } 4438 /* Have to increment the pointer into the pattern string, so the 4439 caller isn't still at the ending character. */ 4440 (*p_ptr)++; 4441 4442 return ret; 4443 } 4444 #else /* BYTE */ 4445 /* Read the ending character of a range (in a bracket expression) from the 4446 uncompiled pattern *P_PTR (which ends at PEND). We assume the 4447 starting character is in `P[-2]'. (`P[-1]' is the character `-'.) 4448 Then we set the translation of all bits between the starting and 4449 ending characters (inclusive) in the compiled pattern B. 
4450 4451 Return an error code. 4452 4453 We use these short variable names so we can use the same macros as 4454 `regex_compile' itself. */ 4455 4456 static reg_errcode_t 4457 byte_compile_range (unsigned int range_start_char, const char **p_ptr, 4458 const char *pend, RE_TRANSLATE_TYPE translate, 4459 reg_syntax_t syntax, unsigned char *b) 4460 { 4461 unsigned this_char; 4462 const char *p = *p_ptr; 4463 reg_errcode_t ret; 4464 # if _LIBC 4465 const unsigned char *collseq; 4466 unsigned int start_colseq; 4467 unsigned int end_colseq; 4468 # else 4469 unsigned end_char; 4470 # endif 4471 4472 if (p == pend) 4473 return REG_ERANGE; 4474 4475 /* Have to increment the pointer into the pattern string, so the 4476 caller isn't still at the ending character. */ 4477 (*p_ptr)++; 4478 4479 /* Report an error if the range is empty and the syntax prohibits this. */ 4480 ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; 4481 4482 # if _LIBC 4483 collseq = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 4484 _NL_COLLATE_COLLSEQMB); 4485 4486 start_colseq = collseq[(unsigned char) TRANSLATE (range_start_char)]; 4487 end_colseq = collseq[(unsigned char) TRANSLATE (p[0])]; 4488 for (this_char = 0; this_char <= (unsigned char) -1; ++this_char) 4489 { 4490 unsigned int this_colseq = collseq[(unsigned char) TRANSLATE (this_char)]; 4491 4492 if (start_colseq <= this_colseq && this_colseq <= end_colseq) 4493 { 4494 SET_LIST_BIT (TRANSLATE (this_char)); 4495 ret = REG_NOERROR; 4496 } 4497 } 4498 # else 4499 /* Here we see why `this_char' has to be larger than an `unsigned 4500 char' -- we would otherwise go into an infinite loop, since all 4501 characters <= 0xff. */ 4502 range_start_char = TRANSLATE (range_start_char); 4503 /* TRANSLATE(p[0]) is casted to char (not unsigned char) in TRANSLATE, 4504 and some compilers cast it to int implicitly, so following for_loop 4505 may fall to (almost) infinite loop. 4506 e.g. 
If translate[p[0]] = 0xff, end_char may equals to 0xffffffff. 4507 To avoid this, we cast p[0] to unsigned int and truncate it. */ 4508 end_char = ((unsigned)TRANSLATE(p[0]) & ((1 << BYTEWIDTH) - 1)); 4509 4510 for (this_char = range_start_char; this_char <= end_char; ++this_char) 4511 { 4512 SET_LIST_BIT (TRANSLATE (this_char)); 4513 ret = REG_NOERROR; 4514 } 4515 # endif 4516 4517 return ret; 4518 } 4519 #endif /* WCHAR */ 4520 4521 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in 4523 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible 4524 characters can start a string that matches the pattern. This fastmap 4525 is used by re_search to skip quickly over impossible starting points. 4526 4527 The caller must supply the address of a (1 << BYTEWIDTH)-byte data 4528 area as BUFP->fastmap. 4529 4530 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in 4531 the pattern buffer. 4532 4533 Returns 0 if we succeed, -2 if an internal error. */ 4534 4535 #ifdef WCHAR 4536 /* local function for re_compile_fastmap. 4537 truncate wchar_t character to char. */ 4538 static unsigned char truncate_wchar (CHAR_T c); 4539 4540 static unsigned char 4541 truncate_wchar (CHAR_T c) 4542 { 4543 unsigned char buf[MB_CUR_MAX]; 4544 mbstate_t state; 4545 int retval; 4546 memset (&state, '\0', sizeof (state)); 4547 # ifdef _LIBC 4548 retval = __wcrtomb (buf, c, &state); 4549 # else 4550 retval = wcrtomb (buf, c, &state); 4551 # endif 4552 return retval > 0 ? 
buf[0] : (unsigned char) c; 4553 } 4554 #endif /* WCHAR */ 4555 4556 static int 4557 PREFIX(re_compile_fastmap) (struct re_pattern_buffer *bufp) 4558 { 4559 int j, k; 4560 #ifdef MATCH_MAY_ALLOCATE 4561 PREFIX(fail_stack_type) fail_stack; 4562 #endif 4563 #ifndef REGEX_MALLOC 4564 char *destination; 4565 #endif 4566 4567 register char *fastmap = bufp->fastmap; 4568 4569 #ifdef WCHAR 4570 /* We need to cast pattern to (wchar_t*), because we casted this compiled 4571 pattern to (char*) in regex_compile. */ 4572 UCHAR_T *pattern = (UCHAR_T*)bufp->buffer; 4573 register UCHAR_T *pend = (UCHAR_T*) (bufp->buffer + bufp->used); 4574 #else /* BYTE */ 4575 UCHAR_T *pattern = bufp->buffer; 4576 register UCHAR_T *pend = pattern + bufp->used; 4577 #endif /* WCHAR */ 4578 UCHAR_T *p = pattern; 4579 4580 #ifdef REL_ALLOC 4581 /* This holds the pointer to the failure stack, when 4582 it is allocated relocatably. */ 4583 fail_stack_elt_t *failure_stack_ptr; 4584 #endif 4585 4586 /* Assume that each path through the pattern can be null until 4587 proven otherwise. We set this false at the bottom of switch 4588 statement, to which we get only if a particular path doesn't 4589 match the empty string. */ 4590 boolean path_can_be_null = true; 4591 4592 /* We aren't doing a `succeed_n' to begin with. */ 4593 boolean succeed_n_p = false; 4594 4595 assert (fastmap != NULL && p != NULL); 4596 4597 INIT_FAIL_STACK (); 4598 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ 4599 bufp->fastmap_accurate = 1; /* It will be when we're done. */ 4600 bufp->can_be_null = 0; 4601 4602 while (1) 4603 { 4604 if (p == pend || *p == (UCHAR_T) succeed) 4605 { 4606 /* We have reached the (effective) end of pattern. */ 4607 if (!FAIL_STACK_EMPTY ()) 4608 { 4609 bufp->can_be_null |= path_can_be_null; 4610 4611 /* Reset for next path. 
*/ 4612 path_can_be_null = true; 4613 4614 p = fail_stack.stack[--fail_stack.avail].pointer; 4615 4616 continue; 4617 } 4618 else 4619 break; 4620 } 4621 4622 /* We should never be about to go beyond the end of the pattern. */ 4623 assert (p < pend); 4624 4625 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) 4626 { 4627 4628 /* I guess the idea here is to simply not bother with a fastmap 4629 if a backreference is used, since it's too hard to figure out 4630 the fastmap for the corresponding group. Setting 4631 `can_be_null' stops `re_search_2' from using the fastmap, so 4632 that is all we do. */ 4633 case duplicate: 4634 bufp->can_be_null = 1; 4635 goto done; 4636 4637 4638 /* Following are the cases which match a character. These end 4639 with `break'. */ 4640 4641 #ifdef WCHAR 4642 case exactn: 4643 fastmap[truncate_wchar(p[1])] = 1; 4644 break; 4645 #else /* BYTE */ 4646 case exactn: 4647 fastmap[p[1]] = 1; 4648 break; 4649 #endif /* WCHAR */ 4650 #ifdef MBS_SUPPORT 4651 case exactn_bin: 4652 fastmap[p[1]] = 1; 4653 break; 4654 #endif 4655 4656 #ifdef WCHAR 4657 /* It is hard to distinguish fastmap from (multi byte) characters 4658 which depends on current locale. */ 4659 case charset: 4660 case charset_not: 4661 case wordchar: 4662 case notwordchar: 4663 bufp->can_be_null = 1; 4664 goto done; 4665 #else /* BYTE */ 4666 case charset: 4667 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 4668 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) 4669 fastmap[j] = 1; 4670 break; 4671 4672 4673 case charset_not: 4674 /* Chars beyond end of map must be allowed. 
*/ 4675 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) 4676 fastmap[j] = 1; 4677 4678 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 4679 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) 4680 fastmap[j] = 1; 4681 break; 4682 4683 4684 case wordchar: 4685 for (j = 0; j < (1 << BYTEWIDTH); j++) 4686 if (SYNTAX (j) == Sword) 4687 fastmap[j] = 1; 4688 break; 4689 4690 4691 case notwordchar: 4692 for (j = 0; j < (1 << BYTEWIDTH); j++) 4693 if (SYNTAX (j) != Sword) 4694 fastmap[j] = 1; 4695 break; 4696 #endif /* WCHAR */ 4697 4698 case anychar: 4699 { 4700 int fastmap_newline = fastmap['\n']; 4701 4702 /* `.' matches anything ... */ 4703 for (j = 0; j < (1 << BYTEWIDTH); j++) 4704 fastmap[j] = 1; 4705 4706 /* ... except perhaps newline. */ 4707 if (!(bufp->syntax & RE_DOT_NEWLINE)) 4708 fastmap['\n'] = fastmap_newline; 4709 4710 /* Return if we have already set `can_be_null'; if we have, 4711 then the fastmap is irrelevant. Something's wrong here. */ 4712 else if (bufp->can_be_null) 4713 goto done; 4714 4715 /* Otherwise, have to check alternative paths. */ 4716 break; 4717 } 4718 4719 #ifdef emacs 4720 case syntaxspec: 4721 k = *p++; 4722 for (j = 0; j < (1 << BYTEWIDTH); j++) 4723 if (SYNTAX (j) == (enum syntaxcode) k) 4724 fastmap[j] = 1; 4725 break; 4726 4727 4728 case notsyntaxspec: 4729 k = *p++; 4730 for (j = 0; j < (1 << BYTEWIDTH); j++) 4731 if (SYNTAX (j) != (enum syntaxcode) k) 4732 fastmap[j] = 1; 4733 break; 4734 4735 4736 /* All cases after this match the empty string. These end with 4737 `continue'. 
*/ 4738 4739 4740 case before_dot: 4741 case at_dot: 4742 case after_dot: 4743 continue; 4744 #endif /* emacs */ 4745 4746 4747 case no_op: 4748 case begline: 4749 case endline: 4750 case begbuf: 4751 case endbuf: 4752 case wordbound: 4753 case notwordbound: 4754 case wordbeg: 4755 case wordend: 4756 case push_dummy_failure: 4757 continue; 4758 4759 4760 case jump_n: 4761 case pop_failure_jump: 4762 case maybe_pop_jump: 4763 case jump: 4764 case jump_past_alt: 4765 case dummy_failure_jump: 4766 EXTRACT_NUMBER_AND_INCR (j, p); 4767 p += j; 4768 if (j > 0) 4769 continue; 4770 4771 /* Jump backward implies we just went through the body of a 4772 loop and matched nothing. Opcode jumped to should be 4773 `on_failure_jump' or `succeed_n'. Just treat it like an 4774 ordinary jump. For a * loop, it has pushed its failure 4775 point already; if so, discard that as redundant. */ 4776 if ((re_opcode_t) *p != on_failure_jump 4777 && (re_opcode_t) *p != succeed_n) 4778 continue; 4779 4780 p++; 4781 EXTRACT_NUMBER_AND_INCR (j, p); 4782 p += j; 4783 4784 /* If what's on the stack is where we are now, pop it. */ 4785 if (!FAIL_STACK_EMPTY () 4786 && fail_stack.stack[fail_stack.avail - 1].pointer == p) 4787 fail_stack.avail--; 4788 4789 continue; 4790 4791 4792 case on_failure_jump: 4793 case on_failure_keep_string_jump: 4794 handle_on_failure_jump: 4795 EXTRACT_NUMBER_AND_INCR (j, p); 4796 4797 /* For some patterns, e.g., `(a?)?', `p+j' here points to the 4798 end of the pattern. We don't want to push such a point, 4799 since when we restore it above, entering the switch will 4800 increment `p' past the end of the pattern. We don't need 4801 to push such a point since we obviously won't find any more 4802 fastmap entries beyond `pend'. Such a pattern can match 4803 the null string, though. 
*/ 4804 if (p + j < pend) 4805 { 4806 if (!PUSH_PATTERN_OP (p + j, fail_stack)) 4807 { 4808 RESET_FAIL_STACK (); 4809 return -2; 4810 } 4811 } 4812 else 4813 bufp->can_be_null = 1; 4814 4815 if (succeed_n_p) 4816 { 4817 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ 4818 succeed_n_p = false; 4819 } 4820 4821 continue; 4822 4823 4824 case succeed_n: 4825 /* Get to the number of times to succeed. */ 4826 p += OFFSET_ADDRESS_SIZE; 4827 4828 /* Increment p past the n for when k != 0. */ 4829 EXTRACT_NUMBER_AND_INCR (k, p); 4830 if (k == 0) 4831 { 4832 p -= 2 * OFFSET_ADDRESS_SIZE; 4833 succeed_n_p = true; /* Spaghetti code alert. */ 4834 goto handle_on_failure_jump; 4835 } 4836 continue; 4837 4838 4839 case set_number_at: 4840 p += 2 * OFFSET_ADDRESS_SIZE; 4841 continue; 4842 4843 4844 case start_memory: 4845 case stop_memory: 4846 p += 2; 4847 continue; 4848 4849 4850 default: 4851 abort (); /* We have listed all the cases. */ 4852 } /* switch *p++ */ 4853 4854 /* Getting here means we have found the possible starting 4855 characters for one path of the pattern -- and that the empty 4856 string does not match. We need not follow this path further. 4857 Instead, look at the next alternative (remembered on the 4858 stack), or quit if no more. The test at the top of the loop 4859 does these things. */ 4860 path_can_be_null = false; 4861 p = pend; 4862 } /* while p */ 4863 4864 /* Set `can_be_null' for the last path (also the first path, if the 4865 pattern is empty). 
*/ 4866 bufp->can_be_null |= path_can_be_null; 4867 4868 done: 4869 RESET_FAIL_STACK (); 4870 return 0; 4871 } 4872 4873 #else /* not INSIDE_RECURSION */ 4874 4875 int 4876 re_compile_fastmap (struct re_pattern_buffer *bufp) 4877 { 4878 # ifdef MBS_SUPPORT 4879 if (MB_CUR_MAX != 1) 4880 return wcs_re_compile_fastmap(bufp); 4881 else 4882 # endif 4883 return byte_re_compile_fastmap(bufp); 4884 } /* re_compile_fastmap */ 4885 #ifdef _LIBC 4886 weak_alias (__re_compile_fastmap, re_compile_fastmap) 4887 #endif 4888 4889 4891 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and 4892 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 4893 this memory for recording register information. STARTS and ENDS 4894 must be allocated using the malloc library routine, and must each 4895 be at least NUM_REGS * sizeof (regoff_t) bytes long. 4896 4897 If NUM_REGS == 0, then subsequent matches should allocate their own 4898 register data. 4899 4900 Unless this function is called, the first search or match using 4901 PATTERN_BUFFER will allocate its own register data, without 4902 freeing the old data. */ 4903 4904 void 4905 re_set_registers (struct re_pattern_buffer *bufp, 4906 struct re_registers *regs, unsigned num_regs, 4907 regoff_t *starts, regoff_t *ends) 4908 { 4909 if (num_regs) 4910 { 4911 bufp->regs_allocated = REGS_REALLOCATE; 4912 regs->num_regs = num_regs; 4913 regs->start = starts; 4914 regs->end = ends; 4915 } 4916 else 4917 { 4918 bufp->regs_allocated = REGS_UNALLOCATED; 4919 regs->num_regs = 0; 4920 regs->start = regs->end = (regoff_t *) 0; 4921 } 4922 } 4923 #ifdef _LIBC 4924 weak_alias (__re_set_registers, re_set_registers) 4925 #endif 4926 4927 /* Searching routines. */ 4929 4930 /* Like re_search_2, below, but only one string is specified, and 4931 doesn't let you say where to stop matching. 
*/
/* STRING is passed to re_search_2 as the second string, with an empty
   first string, and SIZE is used as the stop index.  The result of
   re_search_2 is returned unchanged.  */

int
re_search (struct re_pattern_buffer *bufp, const char *string, int size,
           int startpos, int range, struct re_registers *regs)
{
  return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
                      regs, size);
}
#ifdef _LIBC
weak_alias (__re_search, re_search)
#endif


/* Using the compiled pattern in BUFP->buffer, first tries to match the
   virtual concatenation of STRING1 and STRING2, starting first at index
   STARTPOS, then at STARTPOS + 1, and so on.

   STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.

   RANGE is how far to scan while trying to match.  RANGE = 0 means try
   only at STARTPOS; in general, the last start tried is STARTPOS +
   RANGE.

   In REGS, return the indices of the virtual concatenation of STRING1
   and STRING2 that matched the entire BUFP->buffer and its contained
   subexpressions.

   Do not consider matching one past the index STOP in the virtual
   concatenation of STRING1 and STRING2.

   We return either the position in the strings at which the match was
   found, -1 if no match, or -2 if error (such as failure
   stack overflow).
*/ 4965 4966 int 4967 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, int size1, 4968 const char *string2, int size2, int startpos, int range, 4969 struct re_registers *regs, int stop) 4970 { 4971 # ifdef MBS_SUPPORT 4972 if (MB_CUR_MAX != 1) 4973 return wcs_re_search_2 (bufp, string1, size1, string2, size2, startpos, 4974 range, regs, stop); 4975 else 4976 # endif 4977 return byte_re_search_2 (bufp, string1, size1, string2, size2, startpos, 4978 range, regs, stop); 4979 } /* re_search_2 */ 4980 #ifdef _LIBC 4981 weak_alias (__re_search_2, re_search_2) 4982 #endif 4983 4984 #endif /* not INSIDE_RECURSION */ 4985 4986 #ifdef INSIDE_RECURSION 4987 4988 #ifdef MATCH_MAY_ALLOCATE 4989 # define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL 4990 #else 4991 # define FREE_VAR(var) free (var); var = NULL 4992 #endif 4993 4994 #ifdef WCHAR 4995 # define MAX_ALLOCA_SIZE 2000 4996 4997 # define FREE_WCS_BUFFERS() \ 4998 do { \ 4999 if (size1 > MAX_ALLOCA_SIZE) \ 5000 { \ 5001 free (wcs_string1); \ 5002 free (mbs_offset1); \ 5003 } \ 5004 else \ 5005 { \ 5006 FREE_VAR (wcs_string1); \ 5007 FREE_VAR (mbs_offset1); \ 5008 } \ 5009 if (size2 > MAX_ALLOCA_SIZE) \ 5010 { \ 5011 free (wcs_string2); \ 5012 free (mbs_offset2); \ 5013 } \ 5014 else \ 5015 { \ 5016 FREE_VAR (wcs_string2); \ 5017 FREE_VAR (mbs_offset2); \ 5018 } \ 5019 } while (0) 5020 5021 #endif 5022 5023 5024 static int 5025 PREFIX(re_search_2) (struct re_pattern_buffer *bufp, const char *string1, 5026 int size1, const char *string2, int size2, 5027 int startpos, int range, 5028 struct re_registers *regs, int stop) 5029 { 5030 int val; 5031 register char *fastmap = bufp->fastmap; 5032 register RE_TRANSLATE_TYPE translate = bufp->translate; 5033 int total_size = size1 + size2; 5034 int endpos = startpos + range; 5035 #ifdef WCHAR 5036 /* We need wchar_t* buffers correspond to cstring1, cstring2. 
*/ 5037 wchar_t *wcs_string1 = NULL, *wcs_string2 = NULL; 5038 /* We need the size of wchar_t buffers correspond to csize1, csize2. */ 5039 int wcs_size1 = 0, wcs_size2 = 0; 5040 /* offset buffer for optimizatoin. See convert_mbs_to_wc. */ 5041 int *mbs_offset1 = NULL, *mbs_offset2 = NULL; 5042 /* They hold whether each wchar_t is binary data or not. */ 5043 char *is_binary = NULL; 5044 #endif /* WCHAR */ 5045 5046 /* Check for out-of-range STARTPOS. */ 5047 if (startpos < 0 || startpos > total_size) 5048 return -1; 5049 5050 /* Fix up RANGE if it might eventually take us outside 5051 the virtual concatenation of STRING1 and STRING2. 5052 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */ 5053 if (endpos < 0) 5054 range = 0 - startpos; 5055 else if (endpos > total_size) 5056 range = total_size - startpos; 5057 5058 /* If the search isn't to be a backwards one, don't waste time in a 5059 search for a pattern that must be anchored. */ 5060 if (bufp->used > 0 && range > 0 5061 && ((re_opcode_t) bufp->buffer[0] == begbuf 5062 /* `begline' is like `begbuf' if it cannot match at newlines. */ 5063 || ((re_opcode_t) bufp->buffer[0] == begline 5064 && !bufp->newline_anchor))) 5065 { 5066 if (startpos > 0) 5067 return -1; 5068 else 5069 range = 1; 5070 } 5071 5072 #ifdef emacs 5073 /* In a forward search for something that starts with \=. 5074 don't keep searching past point. */ 5075 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0) 5076 { 5077 range = PT - startpos; 5078 if (range <= 0) 5079 return -1; 5080 } 5081 #endif /* emacs */ 5082 5083 /* Update the fastmap now if not correct already. */ 5084 if (fastmap && !bufp->fastmap_accurate) 5085 if (re_compile_fastmap (bufp) == -2) 5086 return -2; 5087 5088 #ifdef WCHAR 5089 /* Allocate wchar_t array for wcs_string1 and wcs_string2 and 5090 fill them with converted string. 
*/ 5091 if (size1 != 0) 5092 { 5093 if (size1 > MAX_ALLOCA_SIZE) 5094 { 5095 wcs_string1 = TALLOC (size1 + 1, CHAR_T); 5096 mbs_offset1 = TALLOC (size1 + 1, int); 5097 is_binary = TALLOC (size1 + 1, char); 5098 } 5099 else 5100 { 5101 wcs_string1 = REGEX_TALLOC (size1 + 1, CHAR_T); 5102 mbs_offset1 = REGEX_TALLOC (size1 + 1, int); 5103 is_binary = REGEX_TALLOC (size1 + 1, char); 5104 } 5105 if (!wcs_string1 || !mbs_offset1 || !is_binary) 5106 { 5107 if (size1 > MAX_ALLOCA_SIZE) 5108 { 5109 free (wcs_string1); 5110 free (mbs_offset1); 5111 free (is_binary); 5112 } 5113 else 5114 { 5115 FREE_VAR (wcs_string1); 5116 FREE_VAR (mbs_offset1); 5117 FREE_VAR (is_binary); 5118 } 5119 return -2; 5120 } 5121 wcs_size1 = convert_mbs_to_wcs(wcs_string1, string1, size1, 5122 mbs_offset1, is_binary); 5123 wcs_string1[wcs_size1] = L'\0'; /* for a sentinel */ 5124 if (size1 > MAX_ALLOCA_SIZE) 5125 free (is_binary); 5126 else 5127 FREE_VAR (is_binary); 5128 } 5129 if (size2 != 0) 5130 { 5131 if (size2 > MAX_ALLOCA_SIZE) 5132 { 5133 wcs_string2 = TALLOC (size2 + 1, CHAR_T); 5134 mbs_offset2 = TALLOC (size2 + 1, int); 5135 is_binary = TALLOC (size2 + 1, char); 5136 } 5137 else 5138 { 5139 wcs_string2 = REGEX_TALLOC (size2 + 1, CHAR_T); 5140 mbs_offset2 = REGEX_TALLOC (size2 + 1, int); 5141 is_binary = REGEX_TALLOC (size2 + 1, char); 5142 } 5143 if (!wcs_string2 || !mbs_offset2 || !is_binary) 5144 { 5145 FREE_WCS_BUFFERS (); 5146 if (size2 > MAX_ALLOCA_SIZE) 5147 free (is_binary); 5148 else 5149 FREE_VAR (is_binary); 5150 return -2; 5151 } 5152 wcs_size2 = convert_mbs_to_wcs(wcs_string2, string2, size2, 5153 mbs_offset2, is_binary); 5154 wcs_string2[wcs_size2] = L'\0'; /* for a sentinel */ 5155 if (size2 > MAX_ALLOCA_SIZE) 5156 free (is_binary); 5157 else 5158 FREE_VAR (is_binary); 5159 } 5160 #endif /* WCHAR */ 5161 5162 5163 /* Loop through the string, looking for a place to start matching. 
*/ 5164 for (;;) 5165 { 5166 /* If a fastmap is supplied, skip quickly over characters that 5167 cannot be the start of a match. If the pattern can match the 5168 null string, however, we don't need to skip characters; we want 5169 the first null string. */ 5170 if (fastmap && startpos < total_size && !bufp->can_be_null) 5171 { 5172 if (range > 0) /* Searching forwards. */ 5173 { 5174 register const char *d; 5175 register int lim = 0; 5176 int irange = range; 5177 5178 if (startpos < size1 && startpos + range >= size1) 5179 lim = range - (size1 - startpos); 5180 5181 d = (startpos >= size1 ? string2 - size1 : string1) + startpos; 5182 5183 /* Written out as an if-else to avoid testing `translate' 5184 inside the loop. */ 5185 if (translate) 5186 while (range > lim 5187 && !fastmap[(unsigned char) 5188 translate[(unsigned char) *d++]]) 5189 range--; 5190 else 5191 while (range > lim && !fastmap[(unsigned char) *d++]) 5192 range--; 5193 5194 startpos += irange - range; 5195 } 5196 else /* Searching backwards. */ 5197 { 5198 register CHAR_T c = (size1 == 0 || startpos >= size1 5199 ? string2[startpos - size1] 5200 : string1[startpos]); 5201 5202 if (!fastmap[(unsigned char) TRANSLATE (c)]) 5203 goto advance; 5204 } 5205 } 5206 5207 /* If can't match the null string, and that's all we have left, fail. 
*/ 5208 if (range >= 0 && startpos == total_size && fastmap 5209 && !bufp->can_be_null) 5210 { 5211 #ifdef WCHAR 5212 FREE_WCS_BUFFERS (); 5213 #endif 5214 return -1; 5215 } 5216 5217 #ifdef WCHAR 5218 val = wcs_re_match_2_internal (bufp, string1, size1, string2, 5219 size2, startpos, regs, stop, 5220 wcs_string1, wcs_size1, 5221 wcs_string2, wcs_size2, 5222 mbs_offset1, mbs_offset2); 5223 #else /* BYTE */ 5224 val = byte_re_match_2_internal (bufp, string1, size1, string2, 5225 size2, startpos, regs, stop); 5226 #endif /* BYTE */ 5227 5228 #ifndef REGEX_MALLOC 5229 # ifdef C_ALLOCA 5230 alloca (0); 5231 # endif 5232 #endif 5233 5234 if (val >= 0) 5235 { 5236 #ifdef WCHAR 5237 FREE_WCS_BUFFERS (); 5238 #endif 5239 return startpos; 5240 } 5241 5242 if (val == -2) 5243 { 5244 #ifdef WCHAR 5245 FREE_WCS_BUFFERS (); 5246 #endif 5247 return -2; 5248 } 5249 5250 advance: 5251 if (!range) 5252 break; 5253 else if (range > 0) 5254 { 5255 range--; 5256 startpos++; 5257 } 5258 else 5259 { 5260 range++; 5261 startpos--; 5262 } 5263 } 5264 #ifdef WCHAR 5265 FREE_WCS_BUFFERS (); 5266 #endif 5267 return -1; 5268 } 5269 5270 #ifdef WCHAR 5271 /* This converts PTR, a pointer into one of the search wchar_t strings 5272 `string1' and `string2' into an multibyte string offset from the 5273 beginning of that string. We use mbs_offset to optimize. 5274 See convert_mbs_to_wcs. */ 5275 # define POINTER_TO_OFFSET(ptr) \ 5276 (FIRST_STRING_P (ptr) \ 5277 ? ((regoff_t)(mbs_offset1 != NULL? mbs_offset1[(ptr)-string1] : 0)) \ 5278 : ((regoff_t)((mbs_offset2 != NULL? mbs_offset2[(ptr)-string2] : 0) \ 5279 + csize1))) 5280 #else /* BYTE */ 5281 /* This converts PTR, a pointer into one of the search strings `string1' 5282 and `string2' into an offset from the beginning of that string. */ 5283 # define POINTER_TO_OFFSET(ptr) \ 5284 (FIRST_STRING_P (ptr) \ 5285 ? 
((regoff_t) ((ptr) - string1)) \ 5286 : ((regoff_t) ((ptr) - string2 + size1))) 5287 #endif /* WCHAR */ 5288 5289 /* Macros for dealing with the split strings in re_match_2. */ 5290 5291 #define MATCHING_IN_FIRST_STRING (dend == end_match_1) 5292 5293 /* Call before fetching a character with *d. This switches over to 5294 string2 if necessary. */ 5295 #define PREFETCH() \ 5296 while (d == dend) \ 5297 { \ 5298 /* End of string2 => fail. */ \ 5299 if (dend == end_match_2) \ 5300 goto fail; \ 5301 /* End of string1 => advance to string2. */ \ 5302 d = string2; \ 5303 dend = end_match_2; \ 5304 } 5305 5306 /* Test if at very beginning or at very end of the virtual concatenation 5307 of `string1' and `string2'. If only one string, it's `string2'. */ 5308 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2) 5309 #define AT_STRINGS_END(d) ((d) == end2) 5310 5311 5312 /* Test if D points to a character which is word-constituent. We have 5313 two special cases to check for: if past the end of string1, look at 5314 the first character in string2; and if before the beginning of 5315 string2, look at the last character in string1. */ 5316 #ifdef WCHAR 5317 /* Use internationalized API instead of SYNTAX. */ 5318 # define WORDCHAR_P(d) \ 5319 (iswalnum ((wint_t)((d) == end1 ? *string2 \ 5320 : (d) == string2 - 1 ? *(end1 - 1) : *(d))) != 0 \ 5321 || ((d) == end1 ? *string2 \ 5322 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) == L'_') 5323 #else /* BYTE */ 5324 # define WORDCHAR_P(d) \ 5325 (SYNTAX ((d) == end1 ? *string2 \ 5326 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \ 5327 == Sword) 5328 #endif /* WCHAR */ 5329 5330 /* Disabled due to a compiler bug -- see comment at case wordbound */ 5331 #if 0 5332 /* Test if the character before D and the one at D differ with respect 5333 to being word-constituent. 
*/ 5334 #define AT_WORD_BOUNDARY(d) \ 5335 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \ 5336 || WORDCHAR_P (d - 1) != WORDCHAR_P (d)) 5337 #endif 5338 5339 /* Free everything we malloc. */ 5340 #ifdef MATCH_MAY_ALLOCATE 5341 # ifdef WCHAR 5342 # define FREE_VARIABLES() \ 5343 do { \ 5344 REGEX_FREE_STACK (fail_stack.stack); \ 5345 FREE_VAR (regstart); \ 5346 FREE_VAR (regend); \ 5347 FREE_VAR (old_regstart); \ 5348 FREE_VAR (old_regend); \ 5349 FREE_VAR (best_regstart); \ 5350 FREE_VAR (best_regend); \ 5351 FREE_VAR (reg_info); \ 5352 FREE_VAR (reg_dummy); \ 5353 FREE_VAR (reg_info_dummy); \ 5354 if (!cant_free_wcs_buf) \ 5355 { \ 5356 FREE_VAR (string1); \ 5357 FREE_VAR (string2); \ 5358 FREE_VAR (mbs_offset1); \ 5359 FREE_VAR (mbs_offset2); \ 5360 } \ 5361 } while (0) 5362 # else /* BYTE */ 5363 # define FREE_VARIABLES() \ 5364 do { \ 5365 REGEX_FREE_STACK (fail_stack.stack); \ 5366 FREE_VAR (regstart); \ 5367 FREE_VAR (regend); \ 5368 FREE_VAR (old_regstart); \ 5369 FREE_VAR (old_regend); \ 5370 FREE_VAR (best_regstart); \ 5371 FREE_VAR (best_regend); \ 5372 FREE_VAR (reg_info); \ 5373 FREE_VAR (reg_dummy); \ 5374 FREE_VAR (reg_info_dummy); \ 5375 } while (0) 5376 # endif /* WCHAR */ 5377 #else 5378 # ifdef WCHAR 5379 # define FREE_VARIABLES() \ 5380 do { \ 5381 if (!cant_free_wcs_buf) \ 5382 { \ 5383 FREE_VAR (string1); \ 5384 FREE_VAR (string2); \ 5385 FREE_VAR (mbs_offset1); \ 5386 FREE_VAR (mbs_offset2); \ 5387 } \ 5388 } while (0) 5389 # else /* BYTE */ 5390 # define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */ 5391 # endif /* WCHAR */ 5392 #endif /* not MATCH_MAY_ALLOCATE */ 5393 5394 /* These values must meet several constraints. They must not be valid 5395 register values; since we have a limit of 255 registers (because 5396 we use only one byte in the pattern for the register number), we can 5397 use numbers larger than 255. They must differ by 1, because of 5398 NUM_FAILURE_ITEMS above. 
And the value for the lowest register must 5399 be larger than the value for the highest register, so we do not try 5400 to actually save any registers when none are active. */ 5401 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH) 5402 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1) 5403 5404 #else /* not INSIDE_RECURSION */ 5406 /* Matching routines. */ 5407 5408 #ifndef emacs /* Emacs never uses this. */ 5409 /* re_match is like re_match_2 except it takes only a single string. */ 5410 5411 int 5412 re_match (struct re_pattern_buffer *bufp, const char *string, 5413 int size, int pos, struct re_registers *regs) 5414 { 5415 int result; 5416 # ifdef MBS_SUPPORT 5417 if (MB_CUR_MAX != 1) 5418 result = wcs_re_match_2_internal (bufp, NULL, 0, string, size, 5419 pos, regs, size, 5420 NULL, 0, NULL, 0, NULL, NULL); 5421 else 5422 # endif 5423 result = byte_re_match_2_internal (bufp, NULL, 0, string, size, 5424 pos, regs, size); 5425 # ifndef REGEX_MALLOC 5426 # ifdef C_ALLOCA 5427 alloca (0); 5428 # endif 5429 # endif 5430 return result; 5431 } 5432 # ifdef _LIBC 5433 weak_alias (__re_match, re_match) 5434 # endif 5435 #endif /* not emacs */ 5436 5437 #endif /* not INSIDE_RECURSION */ 5438 5439 #ifdef INSIDE_RECURSION 5440 static boolean PREFIX(group_match_null_string_p) (UCHAR_T **p, 5441 UCHAR_T *end, 5442 PREFIX(register_info_type) *reg_info); 5443 static boolean PREFIX(alt_match_null_string_p) (UCHAR_T *p, 5444 UCHAR_T *end, 5445 PREFIX(register_info_type) *reg_info); 5446 static boolean PREFIX(common_op_match_null_string_p) (UCHAR_T **p, 5447 UCHAR_T *end, 5448 PREFIX(register_info_type) *reg_info); 5449 static int PREFIX(bcmp_translate) (const CHAR_T *s1, const CHAR_T *s2, 5450 int len, char *translate); 5451 #else /* not INSIDE_RECURSION */ 5452 5453 /* re_match_2 matches the compiled pattern in BUFP against the 5454 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 5455 and SIZE2, respectively). 
We start matching at POS, and stop 5456 matching at STOP. 5457 5458 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we 5459 store offsets for the substring each group matched in REGS. See the 5460 documentation for exactly how many groups we fill. 5461 5462 We return -1 if no match, -2 if an internal error (such as the 5463 failure stack overflowing). Otherwise, we return the length of the 5464 matched substring. */ 5465 5466 int 5467 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, int size1, 5468 const char *string2, int size2, int pos, 5469 struct re_registers *regs, int stop) 5470 { 5471 int result; 5472 # ifdef MBS_SUPPORT 5473 if (MB_CUR_MAX != 1) 5474 result = wcs_re_match_2_internal (bufp, string1, size1, string2, size2, 5475 pos, regs, stop, 5476 NULL, 0, NULL, 0, NULL, NULL); 5477 else 5478 # endif 5479 result = byte_re_match_2_internal (bufp, string1, size1, string2, size2, 5480 pos, regs, stop); 5481 5482 #ifndef REGEX_MALLOC 5483 # ifdef C_ALLOCA 5484 alloca (0); 5485 # endif 5486 #endif 5487 return result; 5488 } 5489 #ifdef _LIBC 5490 weak_alias (__re_match_2, re_match_2) 5491 #endif 5492 5493 #endif /* not INSIDE_RECURSION */ 5494 5495 #ifdef INSIDE_RECURSION 5496 5497 #ifdef WCHAR 5498 static int count_mbs_length (int *, int); 5499 5500 /* This check the substring (from 0, to length) of the multibyte string, 5501 to which offset_buffer correspond. And count how many wchar_t_characters 5502 the substring occupy. We use offset_buffer to optimization. 5503 See convert_mbs_to_wcs. */ 5504 5505 static int 5506 count_mbs_length(int *offset_buffer, int length) 5507 { 5508 int upper, lower; 5509 5510 /* Check whether the size is valid. */ 5511 if (length < 0) 5512 return -1; 5513 5514 if (offset_buffer == NULL) 5515 return 0; 5516 5517 /* If there are no multibyte character, offset_buffer[i] == i. 5518 Optmize for this case. 
*/ 5519 if (offset_buffer[length] == length) 5520 return length; 5521 5522 /* Set up upper with length. (because for all i, offset_buffer[i] >= i) */ 5523 upper = length; 5524 lower = 0; 5525 5526 while (true) 5527 { 5528 int middle = (lower + upper) / 2; 5529 if (middle == lower || middle == upper) 5530 break; 5531 if (offset_buffer[middle] > length) 5532 upper = middle; 5533 else if (offset_buffer[middle] < length) 5534 lower = middle; 5535 else 5536 return middle; 5537 } 5538 5539 return -1; 5540 } 5541 #endif /* WCHAR */ 5542 5543 /* This is a separate function so that we can force an alloca cleanup 5544 afterwards. */ 5545 #ifdef WCHAR 5546 static int 5547 wcs_re_match_2_internal (struct re_pattern_buffer *bufp, 5548 const char *cstring1, int csize1, 5549 const char *cstring2, int csize2, 5550 int pos, 5551 struct re_registers *regs, 5552 int stop, 5553 /* string1 == string2 == NULL means string1/2, size1/2 and 5554 mbs_offset1/2 need seting up in this function. */ 5555 /* We need wchar_t* buffers correspond to cstring1, cstring2. */ 5556 wchar_t *string1, int size1, 5557 wchar_t *string2, int size2, 5558 /* offset buffer for optimizatoin. See convert_mbs_to_wc. */ 5559 int *mbs_offset1, int *mbs_offset2) 5560 #else /* BYTE */ 5561 static int 5562 byte_re_match_2_internal (struct re_pattern_buffer *bufp, 5563 const char *string1, int size1, 5564 const char *string2, int size2, 5565 int pos, 5566 struct re_registers *regs, int stop) 5567 #endif /* BYTE */ 5568 { 5569 /* General temporaries. */ 5570 int mcnt; 5571 UCHAR_T *p1; 5572 #ifdef WCHAR 5573 /* They hold whether each wchar_t is binary data or not. */ 5574 char *is_binary = NULL; 5575 /* If true, we can't free string1/2, mbs_offset1/2. */ 5576 int cant_free_wcs_buf = 1; 5577 #endif /* WCHAR */ 5578 5579 /* Just past the end of the corresponding string. */ 5580 const CHAR_T *end1, *end2; 5581 5582 /* Pointers into string1 and string2, just past the last characters in 5583 each to consider matching. 
*/ 5584 const CHAR_T *end_match_1, *end_match_2; 5585 5586 /* Where we are in the data, and the end of the current string. */ 5587 const CHAR_T *d, *dend; 5588 5589 /* Where we are in the pattern, and the end of the pattern. */ 5590 #ifdef WCHAR 5591 UCHAR_T *pattern, *p; 5592 register UCHAR_T *pend; 5593 #else /* BYTE */ 5594 UCHAR_T *p = bufp->buffer; 5595 register UCHAR_T *pend = p + bufp->used; 5596 #endif /* WCHAR */ 5597 5598 /* Mark the opcode just after a start_memory, so we can test for an 5599 empty subpattern when we get to the stop_memory. */ 5600 UCHAR_T *just_past_start_mem = 0; 5601 5602 /* We use this to map every character in the string. */ 5603 RE_TRANSLATE_TYPE translate = bufp->translate; 5604 5605 /* Failure point stack. Each place that can handle a failure further 5606 down the line pushes a failure point on this stack. It consists of 5607 restart, regend, and reg_info for all registers corresponding to 5608 the subexpressions we're currently inside, plus the number of such 5609 registers, and, finally, two char *'s. The first char * is where 5610 to resume scanning the pattern; the second one is where to resume 5611 scanning the strings. If the latter is zero, the failure point is 5612 a ``dummy''; if a failure happens and the failure point is a dummy, 5613 it gets discarded and the next next one is tried. */ 5614 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ 5615 PREFIX(fail_stack_type) fail_stack; 5616 #endif 5617 #ifdef DEBUG 5618 static unsigned failure_id; 5619 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0; 5620 #endif 5621 5622 #ifdef REL_ALLOC 5623 /* This holds the pointer to the failure stack, when 5624 it is allocated relocatably. */ 5625 fail_stack_elt_t *failure_stack_ptr; 5626 #endif 5627 5628 /* We fill all the registers internally, independent of what we 5629 return, for use in backreferences. The number here includes 5630 an element for register zero. 
*/ 5631 size_t num_regs = bufp->re_nsub + 1; 5632 5633 /* The currently active registers. */ 5634 active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG; 5635 active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG; 5636 5637 /* Information on the contents of registers. These are pointers into 5638 the input strings; they record just what was matched (on this 5639 attempt) by a subexpression part of the pattern, that is, the 5640 regnum-th regstart pointer points to where in the pattern we began 5641 matching and the regnum-th regend points to right after where we 5642 stopped matching the regnum-th subexpression. (The zeroth register 5643 keeps track of what the whole pattern matches.) */ 5644 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 5645 const CHAR_T **regstart, **regend; 5646 #endif 5647 5648 /* If a group that's operated upon by a repetition operator fails to 5649 match anything, then the register for its start will need to be 5650 restored because it will have been set to wherever in the string we 5651 are when we last see its open-group operator. Similarly for a 5652 register's end. */ 5653 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 5654 const CHAR_T **old_regstart, **old_regend; 5655 #endif 5656 5657 /* The is_active field of reg_info helps us keep track of which (possibly 5658 nested) subexpressions we are currently in. The matched_something 5659 field of reg_info[reg_num] helps us tell whether or not we have 5660 matched any of the pattern so far this time through the reg_num-th 5661 subexpression. These two fields get reset each time through any 5662 loop their register is in. */ 5663 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ 5664 PREFIX(register_info_type) *reg_info; 5665 #endif 5666 5667 /* The following record the register info as found in the above 5668 variables when we find a match better than any we've seen before. 
5669 This happens as we backtrack through the failure points, which in 5670 turn happens only if we have not yet matched the entire string. */ 5671 unsigned best_regs_set = false; 5672 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 5673 const CHAR_T **best_regstart, **best_regend; 5674 #endif 5675 5676 /* Logically, this is `best_regend[0]'. But we don't want to have to 5677 allocate space for that if we're not allocating space for anything 5678 else (see below). Also, we never need info about register 0 for 5679 any of the other register vectors, and it seems rather a kludge to 5680 treat `best_regend' differently than the rest. So we keep track of 5681 the end of the best match so far in a separate variable. We 5682 initialize this to NULL so that when we backtrack the first time 5683 and need to test it, it's not garbage. */ 5684 const CHAR_T *match_end = NULL; 5685 5686 /* This helps SET_REGS_MATCHED avoid doing redundant work. */ 5687 int set_regs_matched_done = 0; 5688 5689 /* Used when we pop values we don't care about. */ 5690 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 5691 const CHAR_T **reg_dummy; 5692 PREFIX(register_info_type) *reg_info_dummy; 5693 #endif 5694 5695 #ifdef DEBUG 5696 /* Counts the total number of registers pushed. */ 5697 unsigned num_regs_pushed = 0; 5698 #endif 5699 5700 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n"); 5701 5702 INIT_FAIL_STACK (); 5703 5704 #ifdef MATCH_MAY_ALLOCATE 5705 /* Do not bother to initialize all the register variables if there are 5706 no groups in the pattern, as it takes a fair amount of time. If 5707 there are groups, we include space for register 0 (the whole 5708 pattern), even though we never use it, since it simplifies the 5709 array indexing. We should fix this. 
*/ 5710 if (bufp->re_nsub) 5711 { 5712 regstart = REGEX_TALLOC (num_regs, const CHAR_T *); 5713 regend = REGEX_TALLOC (num_regs, const CHAR_T *); 5714 old_regstart = REGEX_TALLOC (num_regs, const CHAR_T *); 5715 old_regend = REGEX_TALLOC (num_regs, const CHAR_T *); 5716 best_regstart = REGEX_TALLOC (num_regs, const CHAR_T *); 5717 best_regend = REGEX_TALLOC (num_regs, const CHAR_T *); 5718 reg_info = REGEX_TALLOC (num_regs, PREFIX(register_info_type)); 5719 reg_dummy = REGEX_TALLOC (num_regs, const CHAR_T *); 5720 reg_info_dummy = REGEX_TALLOC (num_regs, PREFIX(register_info_type)); 5721 5722 if (!(regstart && regend && old_regstart && old_regend && reg_info 5723 && best_regstart && best_regend && reg_dummy && reg_info_dummy)) 5724 { 5725 FREE_VARIABLES (); 5726 return -2; 5727 } 5728 } 5729 else 5730 { 5731 /* We must initialize all our variables to NULL, so that 5732 `FREE_VARIABLES' doesn't try to free them. */ 5733 regstart = regend = old_regstart = old_regend = best_regstart 5734 = best_regend = reg_dummy = NULL; 5735 reg_info = reg_info_dummy = (PREFIX(register_info_type) *) NULL; 5736 } 5737 #endif /* MATCH_MAY_ALLOCATE */ 5738 5739 /* The starting position is bogus. */ 5740 #ifdef WCHAR 5741 if (pos < 0 || pos > csize1 + csize2) 5742 #else /* BYTE */ 5743 if (pos < 0 || pos > size1 + size2) 5744 #endif 5745 { 5746 FREE_VARIABLES (); 5747 return -1; 5748 } 5749 5750 #ifdef WCHAR 5751 /* Allocate wchar_t array for string1 and string2 and 5752 fill them with converted string. */ 5753 if (string1 == NULL && string2 == NULL) 5754 { 5755 /* We need seting up buffers here. */ 5756 5757 /* We must free wcs buffers in this function. 
*/ 5758 cant_free_wcs_buf = 0; 5759 5760 if (csize1 != 0) 5761 { 5762 string1 = REGEX_TALLOC (csize1 + 1, CHAR_T); 5763 mbs_offset1 = REGEX_TALLOC (csize1 + 1, int); 5764 is_binary = REGEX_TALLOC (csize1 + 1, char); 5765 if (!string1 || !mbs_offset1 || !is_binary) 5766 { 5767 FREE_VAR (string1); 5768 FREE_VAR (mbs_offset1); 5769 FREE_VAR (is_binary); 5770 return -2; 5771 } 5772 } 5773 if (csize2 != 0) 5774 { 5775 string2 = REGEX_TALLOC (csize2 + 1, CHAR_T); 5776 mbs_offset2 = REGEX_TALLOC (csize2 + 1, int); 5777 is_binary = REGEX_TALLOC (csize2 + 1, char); 5778 if (!string2 || !mbs_offset2 || !is_binary) 5779 { 5780 FREE_VAR (string1); 5781 FREE_VAR (mbs_offset1); 5782 FREE_VAR (string2); 5783 FREE_VAR (mbs_offset2); 5784 FREE_VAR (is_binary); 5785 return -2; 5786 } 5787 size2 = convert_mbs_to_wcs(string2, cstring2, csize2, 5788 mbs_offset2, is_binary); 5789 string2[size2] = L'\0'; /* for a sentinel */ 5790 FREE_VAR (is_binary); 5791 } 5792 } 5793 5794 /* We need to cast pattern to (wchar_t*), because we casted this compiled 5795 pattern to (char*) in regex_compile. */ 5796 p = pattern = (CHAR_T*)bufp->buffer; 5797 pend = (CHAR_T*)(bufp->buffer + bufp->used); 5798 5799 #endif /* WCHAR */ 5800 5801 /* Initialize subexpression text positions to -1 to mark ones that no 5802 start_memory/stop_memory has been seen for. Also initialize the 5803 register information struct. */ 5804 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) 5805 { 5806 regstart[mcnt] = regend[mcnt] 5807 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; 5808 5809 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; 5810 IS_ACTIVE (reg_info[mcnt]) = 0; 5811 MATCHED_SOMETHING (reg_info[mcnt]) = 0; 5812 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; 5813 } 5814 5815 /* We move `string1' into `string2' if the latter's empty -- but not if 5816 `string1' is null. 
*/ 5817 if (size2 == 0 && string1 != NULL) 5818 { 5819 string2 = string1; 5820 size2 = size1; 5821 string1 = 0; 5822 size1 = 0; 5823 #ifdef WCHAR 5824 mbs_offset2 = mbs_offset1; 5825 csize2 = csize1; 5826 mbs_offset1 = NULL; 5827 csize1 = 0; 5828 #endif 5829 } 5830 end1 = string1 + size1; 5831 end2 = string2 + size2; 5832 5833 /* Compute where to stop matching, within the two strings. */ 5834 #ifdef WCHAR 5835 if (stop <= csize1) 5836 { 5837 mcnt = count_mbs_length(mbs_offset1, stop); 5838 end_match_1 = string1 + mcnt; 5839 end_match_2 = string2; 5840 } 5841 else 5842 { 5843 if (stop > csize1 + csize2) 5844 stop = csize1 + csize2; 5845 end_match_1 = end1; 5846 mcnt = count_mbs_length(mbs_offset2, stop-csize1); 5847 end_match_2 = string2 + mcnt; 5848 } 5849 if (mcnt < 0) 5850 { /* count_mbs_length return error. */ 5851 FREE_VARIABLES (); 5852 return -1; 5853 } 5854 #else 5855 if (stop <= size1) 5856 { 5857 end_match_1 = string1 + stop; 5858 end_match_2 = string2; 5859 } 5860 else 5861 { 5862 end_match_1 = end1; 5863 end_match_2 = string2 + stop - size1; 5864 } 5865 #endif /* WCHAR */ 5866 5867 /* `p' scans through the pattern as `d' scans through the data. 5868 `dend' is the end of the input string that `d' points within. `d' 5869 is advanced into the following input string whenever necessary, but 5870 this happens before fetching; therefore, at the beginning of the 5871 loop, `d' can be pointing at the end of a string, but it cannot 5872 equal `string2'. */ 5873 #ifdef WCHAR 5874 if (size1 > 0 && pos <= csize1) 5875 { 5876 mcnt = count_mbs_length(mbs_offset1, pos); 5877 d = string1 + mcnt; 5878 dend = end_match_1; 5879 } 5880 else 5881 { 5882 mcnt = count_mbs_length(mbs_offset2, pos-csize1); 5883 d = string2 + mcnt; 5884 dend = end_match_2; 5885 } 5886 5887 if (mcnt < 0) 5888 { /* count_mbs_length return error. 
*/ 5889 FREE_VARIABLES (); 5890 return -1; 5891 } 5892 #else 5893 if (size1 > 0 && pos <= size1) 5894 { 5895 d = string1 + pos; 5896 dend = end_match_1; 5897 } 5898 else 5899 { 5900 d = string2 + pos - size1; 5901 dend = end_match_2; 5902 } 5903 #endif /* WCHAR */ 5904 5905 DEBUG_PRINT1 ("The compiled pattern is:\n"); 5906 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); 5907 DEBUG_PRINT1 ("The string to match is: `"); 5908 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); 5909 DEBUG_PRINT1 ("'\n"); 5910 5911 /* This loops over pattern commands. It exits by returning from the 5912 function if the match is complete, or it drops through if the match 5913 fails at this starting point in the input data. */ 5914 for (;;) 5915 { 5916 #ifdef _LIBC 5917 DEBUG_PRINT2 ("\n%p: ", p); 5918 #else 5919 DEBUG_PRINT2 ("\n0x%x: ", p); 5920 #endif 5921 5922 if (p == pend) 5923 { /* End of pattern means we might have succeeded. */ 5924 DEBUG_PRINT1 ("end of pattern ... "); 5925 5926 /* If we haven't matched the entire string, and we want the 5927 longest match, try backtracking. */ 5928 if (d != end_match_2) 5929 { 5930 /* 1 if this match ends in the same string (string1 or string2) 5931 as the best previous match. */ 5932 boolean same_str_p; 5933 5934 /* 1 if this match is the best seen so far. */ 5935 boolean best_match_p; 5936 5937 same_str_p = (FIRST_STRING_P (match_end) 5938 == MATCHING_IN_FIRST_STRING); 5939 5940 /* AIX compiler got confused when this was combined 5941 with the previous declaration. */ 5942 if (same_str_p) 5943 best_match_p = d > match_end; 5944 else 5945 best_match_p = !MATCHING_IN_FIRST_STRING; 5946 5947 DEBUG_PRINT1 ("backtracking.\n"); 5948 5949 if (!FAIL_STACK_EMPTY ()) 5950 { /* More failure points to try. */ 5951 5952 /* If exceeds best match so far, save it. 
*/ 5953 if (!best_regs_set || best_match_p) 5954 { 5955 best_regs_set = true; 5956 match_end = d; 5957 5958 DEBUG_PRINT1 ("\nSAVING match as best so far.\n"); 5959 5960 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) 5961 { 5962 best_regstart[mcnt] = regstart[mcnt]; 5963 best_regend[mcnt] = regend[mcnt]; 5964 } 5965 } 5966 goto fail; 5967 } 5968 5969 /* If no failure points, don't restore garbage. And if 5970 last match is real best match, don't restore second 5971 best one. */ 5972 else if (best_regs_set && !best_match_p) 5973 { 5974 restore_best_regs: 5975 /* Restore best match. It may happen that `dend == 5976 end_match_1' while the restored d is in string2. 5977 For example, the pattern `x.*y.*z' against the 5978 strings `x-' and `y-z-', if the two strings are 5979 not consecutive in memory. */ 5980 DEBUG_PRINT1 ("Restoring best registers.\n"); 5981 5982 d = match_end; 5983 dend = ((d >= string1 && d <= end1) 5984 ? end_match_1 : end_match_2); 5985 5986 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) 5987 { 5988 regstart[mcnt] = best_regstart[mcnt]; 5989 regend[mcnt] = best_regend[mcnt]; 5990 } 5991 } 5992 } /* d != end_match_2 */ 5993 5994 succeed_label: 5995 DEBUG_PRINT1 ("Accepting match.\n"); 5996 /* If caller wants register contents data back, do it. */ 5997 if (regs && !bufp->no_sub) 5998 { 5999 /* Have the register data arrays been allocated? */ 6000 if (bufp->regs_allocated == REGS_UNALLOCATED) 6001 { /* No. So allocate them with malloc. We need one 6002 extra element beyond `num_regs' for the `-1' marker 6003 GNU code uses. */ 6004 regs->num_regs = MAX (RE_NREGS, num_regs + 1); 6005 regs->start = TALLOC (regs->num_regs, regoff_t); 6006 regs->end = TALLOC (regs->num_regs, regoff_t); 6007 if (regs->start == NULL || regs->end == NULL) 6008 { 6009 FREE_VARIABLES (); 6010 return -2; 6011 } 6012 bufp->regs_allocated = REGS_REALLOCATE; 6013 } 6014 else if (bufp->regs_allocated == REGS_REALLOCATE) 6015 { /* Yes. 
If we need more elements than were already 6016 allocated, reallocate them. If we need fewer, just 6017 leave it alone. */ 6018 if (regs->num_regs < num_regs + 1) 6019 { 6020 regs->num_regs = num_regs + 1; 6021 RETALLOC (regs->start, regs->num_regs, regoff_t); 6022 RETALLOC (regs->end, regs->num_regs, regoff_t); 6023 if (regs->start == NULL || regs->end == NULL) 6024 { 6025 FREE_VARIABLES (); 6026 return -2; 6027 } 6028 } 6029 } 6030 else 6031 { 6032 /* These braces fend off a "empty body in an else-statement" 6033 warning under GCC when assert expands to nothing. */ 6034 assert (bufp->regs_allocated == REGS_FIXED); 6035 } 6036 6037 /* Convert the pointer data in `regstart' and `regend' to 6038 indices. Register zero has to be set differently, 6039 since we haven't kept track of any info for it. */ 6040 if (regs->num_regs > 0) 6041 { 6042 regs->start[0] = pos; 6043 #ifdef WCHAR 6044 if (MATCHING_IN_FIRST_STRING) 6045 regs->end[0] = mbs_offset1 != NULL ? 6046 mbs_offset1[d-string1] : 0; 6047 else 6048 regs->end[0] = csize1 + (mbs_offset2 != NULL ? 6049 mbs_offset2[d-string2] : 0); 6050 #else 6051 regs->end[0] = (MATCHING_IN_FIRST_STRING 6052 ? ((regoff_t) (d - string1)) 6053 : ((regoff_t) (d - string2 + size1))); 6054 #endif /* WCHAR */ 6055 } 6056 6057 /* Go through the first `min (num_regs, regs->num_regs)' 6058 registers, since that is all we initialized. */ 6059 for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs); 6060 mcnt++) 6061 { 6062 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt])) 6063 regs->start[mcnt] = regs->end[mcnt] = -1; 6064 else 6065 { 6066 regs->start[mcnt] 6067 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]); 6068 regs->end[mcnt] 6069 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]); 6070 } 6071 } 6072 6073 /* If the regs structure we return has more elements than 6074 were in the pattern, set the extra elements to -1. 
If 6075 we (re)allocated the registers, this is the case, 6076 because we always allocate enough to have at least one 6077 -1 at the end. */ 6078 for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++) 6079 regs->start[mcnt] = regs->end[mcnt] = -1; 6080 } /* regs && !bufp->no_sub */ 6081 6082 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", 6083 nfailure_points_pushed, nfailure_points_popped, 6084 nfailure_points_pushed - nfailure_points_popped); 6085 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed); 6086 6087 #ifdef WCHAR 6088 if (MATCHING_IN_FIRST_STRING) 6089 mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0; 6090 else 6091 mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) + 6092 csize1; 6093 mcnt -= pos; 6094 #else 6095 mcnt = d - pos - (MATCHING_IN_FIRST_STRING 6096 ? string1 6097 : string2 - size1); 6098 #endif /* WCHAR */ 6099 6100 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt); 6101 6102 FREE_VARIABLES (); 6103 return mcnt; 6104 } 6105 6106 /* Otherwise match next pattern command. */ 6107 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) 6108 { 6109 /* Ignore these. Used to ignore the n of succeed_n's which 6110 currently have n == 0. */ 6111 case no_op: 6112 DEBUG_PRINT1 ("EXECUTING no_op.\n"); 6113 break; 6114 6115 case succeed: 6116 DEBUG_PRINT1 ("EXECUTING succeed.\n"); 6117 goto succeed_label; 6118 6119 /* Match the next n pattern characters exactly. The following 6120 byte in the pattern defines n, and the n bytes after that 6121 are the characters to match. */ 6122 case exactn: 6123 #ifdef MBS_SUPPORT 6124 case exactn_bin: 6125 #endif 6126 mcnt = *p++; 6127 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt); 6128 6129 /* This is written out as an if-else so we don't waste time 6130 testing `translate' inside the loop. 
*/ 6131 if (translate) 6132 { 6133 do 6134 { 6135 PREFETCH (); 6136 #ifdef WCHAR 6137 if (*d <= 0xff) 6138 { 6139 if ((UCHAR_T) translate[(unsigned char) *d++] 6140 != (UCHAR_T) *p++) 6141 goto fail; 6142 } 6143 else 6144 { 6145 if (*d++ != (CHAR_T) *p++) 6146 goto fail; 6147 } 6148 #else 6149 if ((UCHAR_T) translate[(unsigned char) *d++] 6150 != (UCHAR_T) *p++) 6151 goto fail; 6152 #endif /* WCHAR */ 6153 } 6154 while (--mcnt); 6155 } 6156 else 6157 { 6158 do 6159 { 6160 PREFETCH (); 6161 if (*d++ != (CHAR_T) *p++) goto fail; 6162 } 6163 while (--mcnt); 6164 } 6165 SET_REGS_MATCHED (); 6166 break; 6167 6168 6169 /* Match any character except possibly a newline or a null. */ 6170 case anychar: 6171 DEBUG_PRINT1 ("EXECUTING anychar.\n"); 6172 6173 PREFETCH (); 6174 6175 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n') 6176 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000')) 6177 goto fail; 6178 6179 SET_REGS_MATCHED (); 6180 DEBUG_PRINT2 (" Matched `%ld'.\n", (long int) *d); 6181 d++; 6182 break; 6183 6184 6185 case charset: 6186 case charset_not: 6187 { 6188 register UCHAR_T c; 6189 #ifdef WCHAR 6190 unsigned int i, char_class_length, coll_symbol_length, 6191 equiv_class_length, ranges_length, chars_length, length; 6192 CHAR_T *workp, *workp2, *charset_top; 6193 #define WORK_BUFFER_SIZE 128 6194 CHAR_T str_buf[WORK_BUFFER_SIZE]; 6195 # ifdef _LIBC 6196 uint32_t nrules; 6197 # endif /* _LIBC */ 6198 #endif /* WCHAR */ 6199 boolean negate = (re_opcode_t) *(p - 1) == charset_not; 6200 6201 DEBUG_PRINT2 ("EXECUTING charset%s.\n", negate ? "_not" : ""); 6202 PREFETCH (); 6203 c = TRANSLATE (*d); /* The character to match. 
*/ 6204 #ifdef WCHAR 6205 # ifdef _LIBC 6206 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 6207 # endif /* _LIBC */ 6208 charset_top = p - 1; 6209 char_class_length = *p++; 6210 coll_symbol_length = *p++; 6211 equiv_class_length = *p++; 6212 ranges_length = *p++; 6213 chars_length = *p++; 6214 /* p points charset[6], so the address of the next instruction 6215 (charset[l+m+n+2o+k+p']) equals p[l+m+n+2*o+p'], 6216 where l=length of char_classes, m=length of collating_symbol, 6217 n=equivalence_class, o=length of char_range, 6218 p'=length of character. */ 6219 workp = p; 6220 /* Update p to indicate the next instruction. */ 6221 p += char_class_length + coll_symbol_length+ equiv_class_length + 6222 2*ranges_length + chars_length; 6223 6224 /* match with char_class? */ 6225 for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE) 6226 { 6227 wctype_t wctype; 6228 uintptr_t alignedp = ((uintptr_t)workp 6229 + __alignof__(wctype_t) - 1) 6230 & ~(uintptr_t)(__alignof__(wctype_t) - 1); 6231 wctype = *((wctype_t*)alignedp); 6232 workp += CHAR_CLASS_SIZE; 6233 # ifdef _LIBC 6234 if (__iswctype((wint_t)c, wctype)) 6235 goto char_set_matched; 6236 # else 6237 if (iswctype((wint_t)c, wctype)) 6238 goto char_set_matched; 6239 # endif 6240 } 6241 6242 /* match with collating_symbol? */ 6243 # ifdef _LIBC 6244 if (nrules != 0) 6245 { 6246 const unsigned char *extra = (const unsigned char *) 6247 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 6248 6249 for (workp2 = workp + coll_symbol_length ; workp < workp2 ; 6250 workp++) 6251 { 6252 int32_t *wextra; 6253 wextra = (int32_t*)(extra + *workp++); 6254 for (i = 0; i < *wextra; ++i) 6255 if (TRANSLATE(d[i]) != wextra[1 + i]) 6256 break; 6257 6258 if (i == *wextra) 6259 { 6260 /* Update d, however d will be incremented at 6261 char_set_matched:, we decrement d here. 
*/ 6262 d += i - 1; 6263 goto char_set_matched; 6264 } 6265 } 6266 } 6267 else /* (nrules == 0) */ 6268 # endif 6269 /* If we can't look up collation data, we use wcscoll 6270 instead. */ 6271 { 6272 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;) 6273 { 6274 const CHAR_T *backup_d = d, *backup_dend = dend; 6275 # ifdef _LIBC 6276 length = __wcslen (workp); 6277 # else 6278 length = wcslen (workp); 6279 # endif 6280 6281 /* If wcscoll(the collating symbol, whole string) > 0, 6282 any substring of the string never match with the 6283 collating symbol. */ 6284 # ifdef _LIBC 6285 if (__wcscoll (workp, d) > 0) 6286 # else 6287 if (wcscoll (workp, d) > 0) 6288 # endif 6289 { 6290 workp += length + 1; 6291 continue; 6292 } 6293 6294 /* First, we compare the collating symbol with 6295 the first character of the string. 6296 If it don't match, we add the next character to 6297 the compare buffer in turn. */ 6298 for (i = 0 ; i < WORK_BUFFER_SIZE-1 ; i++, d++) 6299 { 6300 int match; 6301 if (d == dend) 6302 { 6303 if (dend == end_match_2) 6304 break; 6305 d = string2; 6306 dend = end_match_2; 6307 } 6308 6309 /* add next character to the compare buffer. */ 6310 str_buf[i] = TRANSLATE(*d); 6311 str_buf[i+1] = '\0'; 6312 6313 # ifdef _LIBC 6314 match = __wcscoll (workp, str_buf); 6315 # else 6316 match = wcscoll (workp, str_buf); 6317 # endif 6318 if (match == 0) 6319 goto char_set_matched; 6320 6321 if (match < 0) 6322 /* (str_buf > workp) indicate (str_buf + X > workp), 6323 because for all X (str_buf + X > str_buf). 6324 So we don't need continue this loop. */ 6325 break; 6326 6327 /* Otherwise(str_buf < workp), 6328 (str_buf+next_character) may equals (workp). 6329 So we continue this loop. */ 6330 } 6331 /* not matched */ 6332 d = backup_d; 6333 dend = backup_dend; 6334 workp += length + 1; 6335 } 6336 } 6337 /* match with equivalence_class? 
*/ 6338 # ifdef _LIBC 6339 if (nrules != 0) 6340 { 6341 const CHAR_T *backup_d = d, *backup_dend = dend; 6342 /* Try to match the equivalence class against 6343 those known to the collate implementation. */ 6344 const int32_t *table; 6345 const int32_t *weights; 6346 const int32_t *extra; 6347 const int32_t *indirect; 6348 int32_t idx, idx2; 6349 wint_t *cp; 6350 size_t len; 6351 6352 /* This #include defines a local function! */ 6353 # include <locale/weightwc.h> 6354 6355 table = (const int32_t *) 6356 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEWC); 6357 weights = (const wint_t *) 6358 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTWC); 6359 extra = (const wint_t *) 6360 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAWC); 6361 indirect = (const int32_t *) 6362 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTWC); 6363 6364 /* Write 1 collating element to str_buf, and 6365 get its index. */ 6366 idx2 = 0; 6367 6368 for (i = 0 ; idx2 == 0 && i < WORK_BUFFER_SIZE - 1; i++) 6369 { 6370 cp = (wint_t*)str_buf; 6371 if (d == dend) 6372 { 6373 if (dend == end_match_2) 6374 break; 6375 d = string2; 6376 dend = end_match_2; 6377 } 6378 str_buf[i] = TRANSLATE(*(d+i)); 6379 str_buf[i+1] = '\0'; /* sentinel */ 6380 idx2 = findidx ((const wint_t**)&cp); 6381 } 6382 6383 /* Update d, however d will be incremented at 6384 char_set_matched:, we decrement d here. */ 6385 d = backup_d + ((wchar_t*)cp - (wchar_t*)str_buf - 1); 6386 if (d >= dend) 6387 { 6388 if (dend == end_match_2) 6389 d = dend; 6390 else 6391 { 6392 d = string2; 6393 dend = end_match_2; 6394 } 6395 } 6396 6397 len = weights[idx2]; 6398 6399 for (workp2 = workp + equiv_class_length ; workp < workp2 ; 6400 workp++) 6401 { 6402 idx = (int32_t)*workp; 6403 /* We already checked idx != 0 in regex_compile. 
*/ 6404 6405 if (idx2 != 0 && len == weights[idx]) 6406 { 6407 int cnt = 0; 6408 while (cnt < len && (weights[idx + 1 + cnt] 6409 == weights[idx2 + 1 + cnt])) 6410 ++cnt; 6411 6412 if (cnt == len) 6413 goto char_set_matched; 6414 } 6415 } 6416 /* not matched */ 6417 d = backup_d; 6418 dend = backup_dend; 6419 } 6420 else /* (nrules == 0) */ 6421 # endif 6422 /* If we can't look up collation data, we use wcscoll 6423 instead. */ 6424 { 6425 for (workp2 = workp + equiv_class_length ; workp < workp2 ;) 6426 { 6427 const CHAR_T *backup_d = d, *backup_dend = dend; 6428 # ifdef _LIBC 6429 length = __wcslen (workp); 6430 # else 6431 length = wcslen (workp); 6432 # endif 6433 6434 /* If wcscoll(the collating symbol, whole string) > 0, 6435 any substring of the string never match with the 6436 collating symbol. */ 6437 # ifdef _LIBC 6438 if (__wcscoll (workp, d) > 0) 6439 # else 6440 if (wcscoll (workp, d) > 0) 6441 # endif 6442 { 6443 workp += length + 1; 6444 break; 6445 } 6446 6447 /* First, we compare the equivalence class with 6448 the first character of the string. 6449 If it don't match, we add the next character to 6450 the compare buffer in turn. */ 6451 for (i = 0 ; i < WORK_BUFFER_SIZE - 1 ; i++, d++) 6452 { 6453 int match; 6454 if (d == dend) 6455 { 6456 if (dend == end_match_2) 6457 break; 6458 d = string2; 6459 dend = end_match_2; 6460 } 6461 6462 /* add next character to the compare buffer. */ 6463 str_buf[i] = TRANSLATE(*d); 6464 str_buf[i+1] = '\0'; 6465 6466 # ifdef _LIBC 6467 match = __wcscoll (workp, str_buf); 6468 # else 6469 match = wcscoll (workp, str_buf); 6470 # endif 6471 6472 if (match == 0) 6473 goto char_set_matched; 6474 6475 if (match < 0) 6476 /* (str_buf > workp) indicate (str_buf + X > workp), 6477 because for all X (str_buf + X > str_buf). 6478 So we don't need continue this loop. */ 6479 break; 6480 6481 /* Otherwise(str_buf < workp), 6482 (str_buf+next_character) may equals (workp). 6483 So we continue this loop. 
*/ 6484 } 6485 /* not matched */ 6486 d = backup_d; 6487 dend = backup_dend; 6488 workp += length + 1; 6489 } 6490 } 6491 6492 /* match with char_range? */ 6493 # ifdef _LIBC 6494 if (nrules != 0) 6495 { 6496 uint32_t collseqval; 6497 const char *collseq = (const char *) 6498 _NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQWC); 6499 6500 collseqval = collseq_table_lookup (collseq, c); 6501 6502 for (; workp < p - chars_length ;) 6503 { 6504 uint32_t start_val, end_val; 6505 6506 /* We already compute the collation sequence value 6507 of the characters (or collating symbols). */ 6508 start_val = (uint32_t) *workp++; /* range_start */ 6509 end_val = (uint32_t) *workp++; /* range_end */ 6510 6511 if (start_val <= collseqval && collseqval <= end_val) 6512 goto char_set_matched; 6513 } 6514 } 6515 else 6516 # endif 6517 { 6518 /* We set range_start_char at str_buf[0], range_end_char 6519 at str_buf[4], and compared char at str_buf[2]. */ 6520 str_buf[1] = 0; 6521 str_buf[2] = c; 6522 str_buf[3] = 0; 6523 str_buf[5] = 0; 6524 for (; workp < p - chars_length ;) 6525 { 6526 wchar_t *range_start_char, *range_end_char; 6527 6528 /* match if (range_start_char <= c <= range_end_char). */ 6529 6530 /* If range_start(or end) < 0, we assume -range_start(end) 6531 is the offset of the collating symbol which is specified 6532 as the character of the range start(end). 
*/ 6533 6534 /* range_start */ 6535 if (*workp < 0) 6536 range_start_char = charset_top - (*workp++); 6537 else 6538 { 6539 str_buf[0] = *workp++; 6540 range_start_char = str_buf; 6541 } 6542 6543 /* range_end */ 6544 if (*workp < 0) 6545 range_end_char = charset_top - (*workp++); 6546 else 6547 { 6548 str_buf[4] = *workp++; 6549 range_end_char = str_buf + 4; 6550 } 6551 6552 # ifdef _LIBC 6553 if (__wcscoll (range_start_char, str_buf+2) <= 0 6554 && __wcscoll (str_buf+2, range_end_char) <= 0) 6555 # else 6556 if (wcscoll (range_start_char, str_buf+2) <= 0 6557 && wcscoll (str_buf+2, range_end_char) <= 0) 6558 # endif 6559 goto char_set_matched; 6560 } 6561 } 6562 6563 /* match with char? */ 6564 for (; workp < p ; workp++) 6565 if (c == *workp) 6566 goto char_set_matched; 6567 6568 negate = !negate; 6569 6570 char_set_matched: 6571 if (negate) goto fail; 6572 #else 6573 /* Cast to `unsigned' instead of `unsigned char' in case the 6574 bit list is a full 32 bytes long. */ 6575 if (c < (unsigned) (*p * BYTEWIDTH) 6576 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 6577 negate = !negate; 6578 6579 p += 1 + *p; 6580 6581 if (!negate) goto fail; 6582 #undef WORK_BUFFER_SIZE 6583 #endif /* WCHAR */ 6584 SET_REGS_MATCHED (); 6585 d++; 6586 break; 6587 } 6588 6589 6590 /* The beginning of a group is represented by start_memory. 6591 The arguments are the register number in the next byte, and the 6592 number of groups inner to this one in the next. The text 6593 matched within the group is recorded (in the internal 6594 registers data structure) under the register number. */ 6595 case start_memory: 6596 DEBUG_PRINT3 ("EXECUTING start_memory %ld (%ld):\n", 6597 (long int) *p, (long int) p[1]); 6598 6599 /* Find out if this group can match the empty string. */ 6600 p1 = p; /* To send to group_match_null_string_p. 
*/ 6601 6602 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) 6603 REG_MATCH_NULL_STRING_P (reg_info[*p]) 6604 = PREFIX(group_match_null_string_p) (&p1, pend, reg_info); 6605 6606 /* Save the position in the string where we were the last time 6607 we were at this open-group operator in case the group is 6608 operated upon by a repetition operator, e.g., with `(a*)*b' 6609 against `ab'; then we want to ignore where we are now in 6610 the string in case this attempt to match fails. */ 6611 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) 6612 ? REG_UNSET (regstart[*p]) ? d : regstart[*p] 6613 : regstart[*p]; 6614 DEBUG_PRINT2 (" old_regstart: %d\n", 6615 POINTER_TO_OFFSET (old_regstart[*p])); 6616 6617 regstart[*p] = d; 6618 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p])); 6619 6620 IS_ACTIVE (reg_info[*p]) = 1; 6621 MATCHED_SOMETHING (reg_info[*p]) = 0; 6622 6623 /* Clear this whenever we change the register activity status. */ 6624 set_regs_matched_done = 0; 6625 6626 /* This is the new highest active register. */ 6627 highest_active_reg = *p; 6628 6629 /* If nothing was active before, this is the new lowest active 6630 register. */ 6631 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) 6632 lowest_active_reg = *p; 6633 6634 /* Move past the register number and inner group count. */ 6635 p += 2; 6636 just_past_start_mem = p; 6637 6638 break; 6639 6640 6641 /* The stop_memory opcode represents the end of a group. Its 6642 arguments are the same as start_memory's: the register 6643 number, and the number of inner groups. 
*/ 6644 case stop_memory: 6645 DEBUG_PRINT3 ("EXECUTING stop_memory %ld (%ld):\n", 6646 (long int) *p, (long int) p[1]); 6647 6648 /* We need to save the string position the last time we were at 6649 this close-group operator in case the group is operated 6650 upon by a repetition operator, e.g., with `((a*)*(b*)*)*' 6651 against `aba'; then we want to ignore where we are now in 6652 the string in case this attempt to match fails. */ 6653 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) 6654 ? REG_UNSET (regend[*p]) ? d : regend[*p] 6655 : regend[*p]; 6656 DEBUG_PRINT2 (" old_regend: %d\n", 6657 POINTER_TO_OFFSET (old_regend[*p])); 6658 6659 regend[*p] = d; 6660 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p])); 6661 6662 /* This register isn't active anymore. */ 6663 IS_ACTIVE (reg_info[*p]) = 0; 6664 6665 /* Clear this whenever we change the register activity status. */ 6666 set_regs_matched_done = 0; 6667 6668 /* If this was the only register active, nothing is active 6669 anymore. */ 6670 if (lowest_active_reg == highest_active_reg) 6671 { 6672 lowest_active_reg = NO_LOWEST_ACTIVE_REG; 6673 highest_active_reg = NO_HIGHEST_ACTIVE_REG; 6674 } 6675 else 6676 { /* We must scan for the new highest active register, since 6677 it isn't necessarily one less than now: consider 6678 (a(b)c(d(e)f)g). When group 3 ends, after the f), the 6679 new highest active register is 1. */ 6680 UCHAR_T r = *p - 1; 6681 while (r > 0 && !IS_ACTIVE (reg_info[r])) 6682 r--; 6683 6684 /* If we end up at register zero, that means that we saved 6685 the registers as the result of an `on_failure_jump', not 6686 a `start_memory', and we jumped to past the innermost 6687 `stop_memory'. For example, in ((.)*) we save 6688 registers 1 and 2 as a result of the *, but when we pop 6689 back to the second ), we are at the stop_memory 1. 6690 Thus, nothing is active. 
*/ 6691 if (r == 0) 6692 { 6693 lowest_active_reg = NO_LOWEST_ACTIVE_REG; 6694 highest_active_reg = NO_HIGHEST_ACTIVE_REG; 6695 } 6696 else 6697 highest_active_reg = r; 6698 } 6699 6700 /* If just failed to match something this time around with a 6701 group that's operated on by a repetition operator, try to 6702 force exit from the ``loop'', and restore the register 6703 information for this group that we had before trying this 6704 last match. */ 6705 if ((!MATCHED_SOMETHING (reg_info[*p]) 6706 || just_past_start_mem == p - 1) 6707 && (p + 2) < pend) 6708 { 6709 boolean is_a_jump_n = false; 6710 6711 p1 = p + 2; 6712 mcnt = 0; 6713 switch ((re_opcode_t) *p1++) 6714 { 6715 case jump_n: 6716 is_a_jump_n = true; 6717 case pop_failure_jump: 6718 case maybe_pop_jump: 6719 case jump: 6720 case dummy_failure_jump: 6721 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 6722 if (is_a_jump_n) 6723 p1 += OFFSET_ADDRESS_SIZE; 6724 break; 6725 6726 default: 6727 /* do nothing */ ; 6728 } 6729 p1 += mcnt; 6730 6731 /* If the next operation is a jump backwards in the pattern 6732 to an on_failure_jump right before the start_memory 6733 corresponding to this stop_memory, exit from the loop 6734 by forcing a failure after pushing on the stack the 6735 on_failure_jump's jump in the pattern, and d. */ 6736 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump 6737 && (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == start_memory 6738 && p1[2+OFFSET_ADDRESS_SIZE] == *p) 6739 { 6740 /* If this group ever matched anything, then restore 6741 what its registers were before trying this last 6742 failed match, e.g., with `(a*)*b' against `ab' for 6743 regstart[1], and, e.g., with `((a*)*(b*)*)*' 6744 against `aba' for regend[3]. 6745 6746 Also restore the registers for inner groups for, 6747 e.g., `((a*)(b*))*' against `aba' (register 3 would 6748 otherwise get trashed). 
*/ 6749 6750 if (EVER_MATCHED_SOMETHING (reg_info[*p])) 6751 { 6752 unsigned r; 6753 6754 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; 6755 6756 /* Restore this and inner groups' (if any) registers. */ 6757 for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1); 6758 r++) 6759 { 6760 regstart[r] = old_regstart[r]; 6761 6762 /* xx why this test? */ 6763 if (old_regend[r] >= regstart[r]) 6764 regend[r] = old_regend[r]; 6765 } 6766 } 6767 p1++; 6768 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 6769 PUSH_FAILURE_POINT (p1 + mcnt, d, -2); 6770 6771 goto fail; 6772 } 6773 } 6774 6775 /* Move past the register number and the inner group count. */ 6776 p += 2; 6777 break; 6778 6779 6780 /* \<digit> has been turned into a `duplicate' command which is 6781 followed by the numeric value of <digit> as the register number. */ 6782 case duplicate: 6783 { 6784 register const CHAR_T *d2, *dend2; 6785 int regno = *p++; /* Get which register to match against. */ 6786 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno); 6787 6788 /* Can't back reference a group which we've never matched. */ 6789 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno])) 6790 goto fail; 6791 6792 /* Where in input to try to start matching. */ 6793 d2 = regstart[regno]; 6794 6795 /* Where to stop matching; if both the place to start and 6796 the place to stop matching are in the same string, then 6797 set to the place to stop, otherwise, for now have to use 6798 the end of the first string. */ 6799 6800 dend2 = ((FIRST_STRING_P (regstart[regno]) 6801 == FIRST_STRING_P (regend[regno])) 6802 ? regend[regno] : end_match_1); 6803 for (;;) 6804 { 6805 /* If necessary, advance to next segment in register 6806 contents. */ 6807 while (d2 == dend2) 6808 { 6809 if (dend2 == end_match_2) break; 6810 if (dend2 == regend[regno]) break; 6811 6812 /* End of string1 => advance to string2. 
*/ 6813 d2 = string2; 6814 dend2 = regend[regno]; 6815 } 6816 /* At end of register contents => success */ 6817 if (d2 == dend2) break; 6818 6819 /* If necessary, advance to next segment in data. */ 6820 PREFETCH (); 6821 6822 /* How many characters left in this segment to match. */ 6823 mcnt = dend - d; 6824 6825 /* Want how many consecutive characters we can match in 6826 one shot, so, if necessary, adjust the count. */ 6827 if (mcnt > dend2 - d2) 6828 mcnt = dend2 - d2; 6829 6830 /* Compare that many; failure if mismatch, else move 6831 past them. */ 6832 if (translate 6833 ? PREFIX(bcmp_translate) (d, d2, mcnt, translate) 6834 : memcmp (d, d2, mcnt*sizeof(UCHAR_T))) 6835 goto fail; 6836 d += mcnt, d2 += mcnt; 6837 6838 /* Do this because we've match some characters. */ 6839 SET_REGS_MATCHED (); 6840 } 6841 } 6842 break; 6843 6844 6845 /* begline matches the empty string at the beginning of the string 6846 (unless `not_bol' is set in `bufp'), and, if 6847 `newline_anchor' is set, after newlines. */ 6848 case begline: 6849 DEBUG_PRINT1 ("EXECUTING begline.\n"); 6850 6851 if (AT_STRINGS_BEG (d)) 6852 { 6853 if (!bufp->not_bol) break; 6854 } 6855 else if (d[-1] == '\n' && bufp->newline_anchor) 6856 { 6857 break; 6858 } 6859 /* In all other cases, we fail. */ 6860 goto fail; 6861 6862 6863 /* endline is the dual of begline. */ 6864 case endline: 6865 DEBUG_PRINT1 ("EXECUTING endline.\n"); 6866 6867 if (AT_STRINGS_END (d)) 6868 { 6869 if (!bufp->not_eol) break; 6870 } 6871 6872 /* We have to ``prefetch'' the next character. */ 6873 else if ((d == end1 ? *string2 : *d) == '\n' 6874 && bufp->newline_anchor) 6875 { 6876 break; 6877 } 6878 goto fail; 6879 6880 6881 /* Match at the very beginning of the data. */ 6882 case begbuf: 6883 DEBUG_PRINT1 ("EXECUTING begbuf.\n"); 6884 if (AT_STRINGS_BEG (d)) 6885 break; 6886 goto fail; 6887 6888 6889 /* Match at the very end of the data. 
*/ 6890 case endbuf: 6891 DEBUG_PRINT1 ("EXECUTING endbuf.\n"); 6892 if (AT_STRINGS_END (d)) 6893 break; 6894 goto fail; 6895 6896 6897 /* on_failure_keep_string_jump is used to optimize `.*\n'. It 6898 pushes NULL as the value for the string on the stack. Then 6899 `pop_failure_point' will keep the current value for the 6900 string, instead of restoring it. To see why, consider 6901 matching `foo\nbar' against `.*\n'. The .* matches the foo; 6902 then the . fails against the \n. But the next thing we want 6903 to do is match the \n against the \n; if we restored the 6904 string value, we would be back at the foo. 6905 6906 Because this is used only in specific cases, we don't need to 6907 check all the things that `on_failure_jump' does, to make 6908 sure the right things get saved on the stack. Hence we don't 6909 share its code. The only reason to push anything on the 6910 stack at all is that otherwise we would have to change 6911 `anychar's code to do something besides goto fail in this 6912 case; that seems worse than this. */ 6913 case on_failure_keep_string_jump: 6914 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump"); 6915 6916 EXTRACT_NUMBER_AND_INCR (mcnt, p); 6917 #ifdef _LIBC 6918 DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt); 6919 #else 6920 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt); 6921 #endif 6922 6923 PUSH_FAILURE_POINT (p + mcnt, NULL, -2); 6924 break; 6925 6926 6927 /* Uses of on_failure_jump: 6928 6929 Each alternative starts with an on_failure_jump that points 6930 to the beginning of the next alternative. Each alternative 6931 except the last ends with a jump that in effect jumps past 6932 the rest of the alternatives. (They really jump to the 6933 ending jump of the following alternative, because tensioning 6934 these jumps is a hassle.) 6935 6936 Repeats start with an on_failure_jump that points past both 6937 the repetition text and either the following jump or 6938 pop_failure_jump back to this on_failure_jump. 
*/ 6939 case on_failure_jump: 6940 on_failure: 6941 DEBUG_PRINT1 ("EXECUTING on_failure_jump"); 6942 6943 EXTRACT_NUMBER_AND_INCR (mcnt, p); 6944 #ifdef _LIBC 6945 DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt); 6946 #else 6947 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt); 6948 #endif 6949 6950 /* If this on_failure_jump comes right before a group (i.e., 6951 the original * applied to a group), save the information 6952 for that group and all inner ones, so that if we fail back 6953 to this point, the group's information will be correct. 6954 For example, in \(a*\)*\1, we need the preceding group, 6955 and in \(zz\(a*\)b*\)\2, we need the inner group. */ 6956 6957 /* We can't use `p' to check ahead because we push 6958 a failure point to `p + mcnt' after we do this. */ 6959 p1 = p; 6960 6961 /* We need to skip no_op's before we look for the 6962 start_memory in case this on_failure_jump is happening as 6963 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 6964 against aba. */ 6965 while (p1 < pend && (re_opcode_t) *p1 == no_op) 6966 p1++; 6967 6968 if (p1 < pend && (re_opcode_t) *p1 == start_memory) 6969 { 6970 /* We have a new highest active register now. This will 6971 get reset at the start_memory we are about to get to, 6972 but we will have saved all the registers relevant to 6973 this repetition op, as described above. */ 6974 highest_active_reg = *(p1 + 1) + *(p1 + 2); 6975 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) 6976 lowest_active_reg = *(p1 + 1); 6977 } 6978 6979 DEBUG_PRINT1 (":\n"); 6980 PUSH_FAILURE_POINT (p + mcnt, d, -2); 6981 break; 6982 6983 6984 /* A smart repeat ends with `maybe_pop_jump'. 6985 We change it to either `pop_failure_jump' or `jump'. */ 6986 case maybe_pop_jump: 6987 EXTRACT_NUMBER_AND_INCR (mcnt, p); 6988 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); 6989 { 6990 register UCHAR_T *p2 = p; 6991 6992 /* Compare the beginning of the repeat with what in the 6993 pattern follows its end. 
If we can establish that there 6994 is nothing that they would both match, i.e., that we 6995 would have to backtrack because of (as in, e.g., `a*a') 6996 then we can change to pop_failure_jump, because we'll 6997 never have to backtrack. 6998 6999 This is not true in the case of alternatives: in 7000 `(a|ab)*' we do need to backtrack to the `ab' alternative 7001 (e.g., if the string was `ab'). But instead of trying to 7002 detect that here, the alternative has put on a dummy 7003 failure point which is what we will end up popping. */ 7004 7005 /* Skip over open/close-group commands. 7006 If what follows this loop is a ...+ construct, 7007 look at what begins its body, since we will have to 7008 match at least one of that. */ 7009 while (1) 7010 { 7011 if (p2 + 2 < pend 7012 && ((re_opcode_t) *p2 == stop_memory 7013 || (re_opcode_t) *p2 == start_memory)) 7014 p2 += 3; 7015 else if (p2 + 2 + 2 * OFFSET_ADDRESS_SIZE < pend 7016 && (re_opcode_t) *p2 == dummy_failure_jump) 7017 p2 += 2 + 2 * OFFSET_ADDRESS_SIZE; 7018 else 7019 break; 7020 } 7021 7022 p1 = p + mcnt; 7023 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding 7024 to the `maybe_finalize_jump' of this case. Examine what 7025 follows. */ 7026 7027 /* If we're at the end of the pattern, we can change. */ 7028 if (p2 == pend) 7029 { 7030 /* Consider what happens when matching ":\(.*\)" 7031 against ":/". I don't really understand this code 7032 yet. */ 7033 p[-(1+OFFSET_ADDRESS_SIZE)] = (UCHAR_T) 7034 pop_failure_jump; 7035 DEBUG_PRINT1 7036 (" End of pattern: change to `pop_failure_jump'.\n"); 7037 } 7038 7039 else if ((re_opcode_t) *p2 == exactn 7040 #ifdef MBS_SUPPORT 7041 || (re_opcode_t) *p2 == exactn_bin 7042 #endif 7043 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) 7044 { 7045 register UCHAR_T c 7046 = *p2 == (UCHAR_T) endline ? 
'\n' : p2[2]; 7047 7048 if (((re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn 7049 #ifdef MBS_SUPPORT 7050 || (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn_bin 7051 #endif 7052 ) && p1[3+OFFSET_ADDRESS_SIZE] != c) 7053 { 7054 p[-(1+OFFSET_ADDRESS_SIZE)] = (UCHAR_T) 7055 pop_failure_jump; 7056 #ifdef WCHAR 7057 DEBUG_PRINT3 (" %C != %C => pop_failure_jump.\n", 7058 (wint_t) c, 7059 (wint_t) p1[3+OFFSET_ADDRESS_SIZE]); 7060 #else 7061 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", 7062 (char) c, 7063 (char) p1[3+OFFSET_ADDRESS_SIZE]); 7064 #endif 7065 } 7066 7067 #ifndef WCHAR 7068 else if ((re_opcode_t) p1[3] == charset 7069 || (re_opcode_t) p1[3] == charset_not) 7070 { 7071 int negate = (re_opcode_t) p1[3] == charset_not; 7072 7073 if (c < (unsigned) (p1[4] * BYTEWIDTH) 7074 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 7075 negate = !negate; 7076 7077 /* `negate' is equal to 1 if c would match, which means 7078 that we can't change to pop_failure_jump. */ 7079 if (!negate) 7080 { 7081 p[-3] = (unsigned char) pop_failure_jump; 7082 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 7083 } 7084 } 7085 #endif /* not WCHAR */ 7086 } 7087 #ifndef WCHAR 7088 else if ((re_opcode_t) *p2 == charset) 7089 { 7090 /* We win if the first character of the loop is not part 7091 of the charset. */ 7092 if ((re_opcode_t) p1[3] == exactn 7093 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5] 7094 && (p2[2 + p1[5] / BYTEWIDTH] 7095 & (1 << (p1[5] % BYTEWIDTH))))) 7096 { 7097 p[-3] = (unsigned char) pop_failure_jump; 7098 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 7099 } 7100 7101 else if ((re_opcode_t) p1[3] == charset_not) 7102 { 7103 int idx; 7104 /* We win if the charset_not inside the loop 7105 lists every character listed in the charset after. */ 7106 for (idx = 0; idx < (int) p2[1]; idx++) 7107 if (! 
(p2[2 + idx] == 0 7108 || (idx < (int) p1[4] 7109 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0)))) 7110 break; 7111 7112 if (idx == p2[1]) 7113 { 7114 p[-3] = (unsigned char) pop_failure_jump; 7115 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 7116 } 7117 } 7118 else if ((re_opcode_t) p1[3] == charset) 7119 { 7120 int idx; 7121 /* We win if the charset inside the loop 7122 has no overlap with the one after the loop. */ 7123 for (idx = 0; 7124 idx < (int) p2[1] && idx < (int) p1[4]; 7125 idx++) 7126 if ((p2[2 + idx] & p1[5 + idx]) != 0) 7127 break; 7128 7129 if (idx == p2[1] || idx == p1[4]) 7130 { 7131 p[-3] = (unsigned char) pop_failure_jump; 7132 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 7133 } 7134 } 7135 } 7136 #endif /* not WCHAR */ 7137 } 7138 p -= OFFSET_ADDRESS_SIZE; /* Point at relative address again. */ 7139 if ((re_opcode_t) p[-1] != pop_failure_jump) 7140 { 7141 p[-1] = (UCHAR_T) jump; 7142 DEBUG_PRINT1 (" Match => jump.\n"); 7143 goto unconditional_jump; 7144 } 7145 /* Note fall through. */ 7146 7147 7148 /* The end of a simple repeat has a pop_failure_jump back to 7149 its matching on_failure_jump, where the latter will push a 7150 failure point. The pop_failure_jump takes off failure 7151 points put on by this pop_failure_jump's matching 7152 on_failure_jump; we got through the pattern to here from the 7153 matching on_failure_jump, so didn't fail. */ 7154 case pop_failure_jump: 7155 { 7156 /* We need to pass separate storage for the lowest and 7157 highest registers, even though we don't care about the 7158 actual values. Otherwise, we will restore only one 7159 register from the stack, since lowest will == highest in 7160 `pop_failure_point'. 
*/ 7161 active_reg_t dummy_low_reg, dummy_high_reg; 7162 UCHAR_T *pdummy ATTRIBUTE_UNUSED = NULL; 7163 const CHAR_T *sdummy ATTRIBUTE_UNUSED = NULL; 7164 7165 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n"); 7166 POP_FAILURE_POINT (sdummy, pdummy, 7167 dummy_low_reg, dummy_high_reg, 7168 reg_dummy, reg_dummy, reg_info_dummy); 7169 } 7170 /* Note fall through. */ 7171 7172 unconditional_jump: 7173 #ifdef _LIBC 7174 DEBUG_PRINT2 ("\n%p: ", p); 7175 #else 7176 DEBUG_PRINT2 ("\n0x%x: ", p); 7177 #endif 7178 /* Note fall through. */ 7179 7180 /* Unconditionally jump (without popping any failure points). */ 7181 case jump: 7182 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ 7183 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); 7184 p += mcnt; /* Do the jump. */ 7185 #ifdef _LIBC 7186 DEBUG_PRINT2 ("(to %p).\n", p); 7187 #else 7188 DEBUG_PRINT2 ("(to 0x%x).\n", p); 7189 #endif 7190 break; 7191 7192 7193 /* We need this opcode so we can detect where alternatives end 7194 in `group_match_null_string_p' et al. */ 7195 case jump_past_alt: 7196 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n"); 7197 goto unconditional_jump; 7198 7199 7200 /* Normally, the on_failure_jump pushes a failure point, which 7201 then gets popped at pop_failure_jump. We will end up at 7202 pop_failure_jump, also, and with a pattern of, say, `a+', we 7203 are skipping over the on_failure_jump, so we have to push 7204 something meaningless for pop_failure_jump to pop. */ 7205 case dummy_failure_jump: 7206 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n"); 7207 /* It doesn't matter what we push for the string here. What 7208 the code at `fail' tests is the value for the pattern. */ 7209 PUSH_FAILURE_POINT (NULL, NULL, -2); 7210 goto unconditional_jump; 7211 7212 7213 /* At the end of an alternative, we need to push a dummy failure 7214 point in case we are followed by a `pop_failure_jump', because 7215 we don't want the failure point for the alternative to be 7216 popped. 
For example, matching `(a|ab)*' against `aab' 7217 requires that we match the `ab' alternative. */ 7218 case push_dummy_failure: 7219 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n"); 7220 /* See comments just above at `dummy_failure_jump' about the 7221 two zeroes. */ 7222 PUSH_FAILURE_POINT (NULL, NULL, -2); 7223 break; 7224 7225 /* Have to succeed matching what follows at least n times. 7226 After that, handle like `on_failure_jump'. */ 7227 case succeed_n: 7228 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE); 7229 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); 7230 7231 assert (mcnt >= 0); 7232 /* Originally, this is how many times we HAVE to succeed. */ 7233 if (mcnt > 0) 7234 { 7235 mcnt--; 7236 p += OFFSET_ADDRESS_SIZE; 7237 STORE_NUMBER_AND_INCR (p, mcnt); 7238 #ifdef _LIBC 7239 DEBUG_PRINT3 (" Setting %p to %d.\n", p - OFFSET_ADDRESS_SIZE 7240 , mcnt); 7241 #else 7242 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - OFFSET_ADDRESS_SIZE 7243 , mcnt); 7244 #endif 7245 } 7246 else if (mcnt == 0) 7247 { 7248 #ifdef _LIBC 7249 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", 7250 p + OFFSET_ADDRESS_SIZE); 7251 #else 7252 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", 7253 p + OFFSET_ADDRESS_SIZE); 7254 #endif /* _LIBC */ 7255 7256 #ifdef WCHAR 7257 p[1] = (UCHAR_T) no_op; 7258 #else 7259 p[2] = (UCHAR_T) no_op; 7260 p[3] = (UCHAR_T) no_op; 7261 #endif /* WCHAR */ 7262 goto on_failure; 7263 } 7264 break; 7265 7266 case jump_n: 7267 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE); 7268 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); 7269 7270 /* Originally, this is how many times we CAN jump. 
*/ 7271 if (mcnt) 7272 { 7273 mcnt--; 7274 STORE_NUMBER (p + OFFSET_ADDRESS_SIZE, mcnt); 7275 7276 #ifdef _LIBC 7277 DEBUG_PRINT3 (" Setting %p to %d.\n", p + OFFSET_ADDRESS_SIZE, 7278 mcnt); 7279 #else 7280 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + OFFSET_ADDRESS_SIZE, 7281 mcnt); 7282 #endif /* _LIBC */ 7283 goto unconditional_jump; 7284 } 7285 /* If don't have to jump any more, skip over the rest of command. */ 7286 else 7287 p += 2 * OFFSET_ADDRESS_SIZE; 7288 break; 7289 7290 case set_number_at: 7291 { 7292 DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); 7293 7294 EXTRACT_NUMBER_AND_INCR (mcnt, p); 7295 p1 = p + mcnt; 7296 EXTRACT_NUMBER_AND_INCR (mcnt, p); 7297 #ifdef _LIBC 7298 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt); 7299 #else 7300 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt); 7301 #endif 7302 STORE_NUMBER (p1, mcnt); 7303 break; 7304 } 7305 7306 #if 0 7307 /* The DEC Alpha C compiler 3.x generates incorrect code for the 7308 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of 7309 AT_WORD_BOUNDARY, so this code is disabled. Expanding the 7310 macro and introducing temporary variables works around the bug. 
*/ 7311 7312 case wordbound: 7313 DEBUG_PRINT1 ("EXECUTING wordbound.\n"); 7314 if (AT_WORD_BOUNDARY (d)) 7315 break; 7316 goto fail; 7317 7318 case notwordbound: 7319 DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); 7320 if (AT_WORD_BOUNDARY (d)) 7321 goto fail; 7322 break; 7323 #else 7324 case wordbound: 7325 { 7326 boolean prevchar, thischar; 7327 7328 DEBUG_PRINT1 ("EXECUTING wordbound.\n"); 7329 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)) 7330 break; 7331 7332 prevchar = WORDCHAR_P (d - 1); 7333 thischar = WORDCHAR_P (d); 7334 if (prevchar != thischar) 7335 break; 7336 goto fail; 7337 } 7338 7339 case notwordbound: 7340 { 7341 boolean prevchar, thischar; 7342 7343 DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); 7344 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)) 7345 goto fail; 7346 7347 prevchar = WORDCHAR_P (d - 1); 7348 thischar = WORDCHAR_P (d); 7349 if (prevchar != thischar) 7350 goto fail; 7351 break; 7352 } 7353 #endif 7354 7355 case wordbeg: 7356 DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); 7357 if (!AT_STRINGS_END (d) && WORDCHAR_P (d) 7358 && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) 7359 break; 7360 goto fail; 7361 7362 case wordend: 7363 DEBUG_PRINT1 ("EXECUTING wordend.\n"); 7364 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) 7365 && (AT_STRINGS_END (d) || !WORDCHAR_P (d))) 7366 break; 7367 goto fail; 7368 7369 #ifdef emacs 7370 case before_dot: 7371 DEBUG_PRINT1 ("EXECUTING before_dot.\n"); 7372 if (PTR_CHAR_POS ((unsigned char *) d) >= point) 7373 goto fail; 7374 break; 7375 7376 case at_dot: 7377 DEBUG_PRINT1 ("EXECUTING at_dot.\n"); 7378 if (PTR_CHAR_POS ((unsigned char *) d) != point) 7379 goto fail; 7380 break; 7381 7382 case after_dot: 7383 DEBUG_PRINT1 ("EXECUTING after_dot.\n"); 7384 if (PTR_CHAR_POS ((unsigned char *) d) <= point) 7385 goto fail; 7386 break; 7387 7388 case syntaxspec: 7389 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); 7390 mcnt = *p++; 7391 goto matchsyntax; 7392 7393 case wordchar: 7394 DEBUG_PRINT1 ("EXECUTING Emacs 
wordchar.\n"); 7395 mcnt = (int) Sword; 7396 matchsyntax: 7397 PREFETCH (); 7398 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */ 7399 d++; 7400 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt) 7401 goto fail; 7402 SET_REGS_MATCHED (); 7403 break; 7404 7405 case notsyntaxspec: 7406 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); 7407 mcnt = *p++; 7408 goto matchnotsyntax; 7409 7410 case notwordchar: 7411 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n"); 7412 mcnt = (int) Sword; 7413 matchnotsyntax: 7414 PREFETCH (); 7415 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */ 7416 d++; 7417 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt) 7418 goto fail; 7419 SET_REGS_MATCHED (); 7420 break; 7421 7422 #else /* not emacs */ 7423 case wordchar: 7424 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); 7425 PREFETCH (); 7426 if (!WORDCHAR_P (d)) 7427 goto fail; 7428 SET_REGS_MATCHED (); 7429 d++; 7430 break; 7431 7432 case notwordchar: 7433 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); 7434 PREFETCH (); 7435 if (WORDCHAR_P (d)) 7436 goto fail; 7437 SET_REGS_MATCHED (); 7438 d++; 7439 break; 7440 #endif /* not emacs */ 7441 7442 default: 7443 abort (); 7444 } 7445 continue; /* Successfully executed one pattern command; keep going. */ 7446 7447 7448 /* We goto here if a matching operation fails. */ 7449 fail: 7450 if (!FAIL_STACK_EMPTY ()) 7451 { /* A restart point is known. Restore to that state. */ 7452 DEBUG_PRINT1 ("\nFAIL:\n"); 7453 POP_FAILURE_POINT (d, p, 7454 lowest_active_reg, highest_active_reg, 7455 regstart, regend, reg_info); 7456 7457 /* If this failure point is a dummy, try the next one. */ 7458 if (!p) 7459 goto fail; 7460 7461 /* If we failed to the end of the pattern, don't examine *p. */ 7462 assert (p <= pend); 7463 if (p < pend) 7464 { 7465 boolean is_a_jump_n = false; 7466 7467 /* If failed to a backwards jump that's part of a repetition 7468 loop, need to pop this failure point and use the next one. 
*/
              switch ((re_opcode_t) *p)
                {
                case jump_n:
                  is_a_jump_n = true;
                  /* No break: jump_n shares the target check below with
                     the other jump opcodes.  */
                case maybe_pop_jump:
                case pop_failure_jump:
                case jump:
                  /* Decode the jump offset stored after the opcode and
                     look at the opcode the jump lands on.  */
                  p1 = p + 1;
                  EXTRACT_NUMBER_AND_INCR (mcnt, p1);
                  p1 += mcnt;

                  /* A jump_n landing on succeed_n, or any other jump
                     landing on on_failure_jump, means fail again.  */
                  if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
                      || (!is_a_jump_n
                          && (re_opcode_t) *p1 == on_failure_jump))
                    goto fail;
                  break;
                default:
                  /* do nothing */ ;
                }
            }

          if (d >= string1 && d <= end1)
            dend = end_match_1;
        }
      else
        break;   /* Matching at this starting point really fails.  */
    } /* for (;;) */

  if (best_regs_set)
    goto restore_best_regs;

  FREE_VARIABLES ();

  return -1;                            /* Failure to match.  */
} /* re_match_2 */

/* Subroutine definitions for re_match_2.  */


/* We are passed P pointing to a register number after a start_memory.

   Return true if the pattern up to the corresponding stop_memory can
   match the empty string, and false otherwise.

   If we find the matching stop_memory, sets P to point to one past its number.
   Otherwise, sets P to an undefined byte less than or equal to END.

   REG_INFO is only handed on to alt_match_null_string_p and
   common_op_match_null_string_p; it is not read or written here.

   We don't handle duplicates properly (yet).  */

static boolean
PREFIX(group_match_null_string_p) (UCHAR_T **p, UCHAR_T *end,
                                   PREFIX(register_info_type) *reg_info)
{
  int mcnt;
  /* Point to after the args to the start_memory.  */
  UCHAR_T *p1 = *p + 2;

  while (p1 < end)
    {
      /* Skip over opcodes that can match nothing, and return true or
         false, as appropriate, when we get to one that can't, or to the
         matching stop_memory.  */

      switch ((re_opcode_t) *p1)
        {
        /* Could be either a loop or a series of alternatives.  */
        case on_failure_jump:
          p1++;
          EXTRACT_NUMBER_AND_INCR (mcnt, p1);

          /* If the next operation is not a jump backwards in the
             pattern.  */

          if (mcnt >= 0)
            {
              /* Go through the on_failure_jumps of the alternatives,
                 seeing if any of the alternatives cannot match nothing.
                 The last alternative starts with only a jump,
                 whereas the rest start with on_failure_jump and end
                 with a jump, e.g., here is the pattern for `a|b|c':

                 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
                 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
                 /exactn/1/c

                 So, we have to first go through the first (n-1)
                 alternatives and then deal with the last one separately.  */


              /* Deal with the first (n-1) alternatives, which start
                 with an on_failure_jump (see above) that jumps to right
                 past a jump_past_alt.  */

              while ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] ==
                     jump_past_alt)
                {
                  /* `mcnt' holds how many bytes long the alternative
                     is, including the ending `jump_past_alt' and
                     its number.  */

                  if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt -
                                                (1 + OFFSET_ADDRESS_SIZE),
                                                reg_info))
                    return false;

                  /* Move to right after this alternative, including the
                     jump_past_alt.  */
                  p1 += mcnt;

                  /* Break if it's the beginning of an n-th alternative
                     that doesn't begin with an on_failure_jump.  */
                  if ((re_opcode_t) *p1 != on_failure_jump)
                    break;

                  /* Still have to check that it's not an n-th
                     alternative that starts with an on_failure_jump.  */
                  p1++;
                  EXTRACT_NUMBER_AND_INCR (mcnt, p1);
                  if ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] !=
                      jump_past_alt)
                    {
                      /* Get to the beginning of the n-th alternative.  */
                      p1 -= 1 + OFFSET_ADDRESS_SIZE;
                      break;
                    }
                }

              /* Deal with the last alternative: go back and get number
                 of the `jump_past_alt' just before it.  `mcnt' contains
                 the length of the alternative.  */
              EXTRACT_NUMBER (mcnt, p1 - OFFSET_ADDRESS_SIZE);

              if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt, reg_info))
                return false;

              p1 += mcnt;       /* Get past the n-th alternative.  */
            } /* if mcnt >= 0 */
          break;


        case stop_memory:
          /* The register number after stop_memory must be the one we
             were entered with.  */
          assert (p1[1] == **p);
          *p = p1 + 2;
          return true;


        default:
          /* Any other opcode: the shared helper advances P1 past it when
             it can match nothing, and reports false otherwise.  */
          if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info))
            return false;
        }
    } /* while p1 < end */

  /* Ran into END without seeing a stop_memory.  */
  return false;
} /* group_match_null_string_p */


/* Similar to group_match_null_string_p, but doesn't deal with alternatives:
   It expects P to be the first byte of a single alternative and END one
   byte past the last.  The alternative can contain groups.

   Returns true if every opcode from P up to END can match the empty
   string, false as soon as one cannot.  */

static boolean
PREFIX(alt_match_null_string_p) (UCHAR_T *p, UCHAR_T *end,
                                 PREFIX(register_info_type) *reg_info)
{
  int mcnt;
  UCHAR_T *p1 = p;

  while (p1 < end)
    {
      /* Skip over opcodes that can match nothing, and break when we get
         to one that can't.  */

      switch ((re_opcode_t) *p1)
        {
        /* It's a loop.  */
        case on_failure_jump:
          /* Step over the opcode, read its offset, and skip that far.  */
          p1++;
          EXTRACT_NUMBER_AND_INCR (mcnt, p1);
          p1 += mcnt;
          break;

        default:
          if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info))
            return false;
        }
    }  /* while p1 < end */

  return true;
} /* alt_match_null_string_p */


/* Deals with the ops common to group_match_null_string_p and
   alt_match_null_string_p.

   Returns true if the op at *P can match the empty string, false
   otherwise (in which case P is left as it was).

   Sets P to one after the op and its arguments, if any.
*/ 7665 7666 static boolean 7667 PREFIX(common_op_match_null_string_p) (UCHAR_T **p, UCHAR_T *end, 7668 PREFIX(register_info_type) *reg_info) 7669 { 7670 int mcnt; 7671 boolean ret; 7672 int reg_no; 7673 UCHAR_T *p1 = *p; 7674 7675 switch ((re_opcode_t) *p1++) 7676 { 7677 case no_op: 7678 case begline: 7679 case endline: 7680 case begbuf: 7681 case endbuf: 7682 case wordbeg: 7683 case wordend: 7684 case wordbound: 7685 case notwordbound: 7686 #ifdef emacs 7687 case before_dot: 7688 case at_dot: 7689 case after_dot: 7690 #endif 7691 break; 7692 7693 case start_memory: 7694 reg_no = *p1; 7695 assert (reg_no > 0 && reg_no <= MAX_REGNUM); 7696 ret = PREFIX(group_match_null_string_p) (&p1, end, reg_info); 7697 7698 /* Have to set this here in case we're checking a group which 7699 contains a group and a back reference to it. */ 7700 7701 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE) 7702 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; 7703 7704 if (!ret) 7705 return false; 7706 break; 7707 7708 /* If this is an optimized succeed_n for zero times, make the jump. */ 7709 case jump: 7710 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 7711 if (mcnt >= 0) 7712 p1 += mcnt; 7713 else 7714 return false; 7715 break; 7716 7717 case succeed_n: 7718 /* Get to the number of times to succeed. */ 7719 p1 += OFFSET_ADDRESS_SIZE; 7720 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 7721 7722 if (mcnt == 0) 7723 { 7724 p1 -= 2 * OFFSET_ADDRESS_SIZE; 7725 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 7726 p1 += mcnt; 7727 } 7728 else 7729 return false; 7730 break; 7731 7732 case duplicate: 7733 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1])) 7734 return false; 7735 break; 7736 7737 case set_number_at: 7738 p1 += 2 * OFFSET_ADDRESS_SIZE; 7739 7740 default: 7741 /* All other opcodes mean we cannot match the empty string. 
*/ 7742 return false; 7743 } 7744 7745 *p = p1; 7746 return true; 7747 } /* common_op_match_null_string_p */ 7748 7749 7750 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN 7751 bytes; nonzero otherwise. */ 7752 7753 static int 7754 PREFIX(bcmp_translate) (const CHAR_T *s1, const CHAR_T *s2, register int len, 7755 RE_TRANSLATE_TYPE translate) 7756 { 7757 register const UCHAR_T *p1 = (const UCHAR_T *) s1; 7758 register const UCHAR_T *p2 = (const UCHAR_T *) s2; 7759 while (len) 7760 { 7761 #ifdef WCHAR 7762 if (((*p1<=0xff)?translate[*p1++]:*p1++) 7763 != ((*p2<=0xff)?translate[*p2++]:*p2++)) 7764 return 1; 7765 #else /* BYTE */ 7766 if (translate[*p1++] != translate[*p2++]) return 1; 7767 #endif /* WCHAR */ 7768 len--; 7769 } 7770 return 0; 7771 } 7772 7773 7775 #else /* not INSIDE_RECURSION */ 7776 7777 /* Entry points for GNU code. */ 7778 7779 /* re_compile_pattern is the GNU regular expression compiler: it 7780 compiles PATTERN (of length SIZE) and puts the result in BUFP. 7781 Returns 0 if the pattern was valid, otherwise an error string. 7782 7783 Assumes the `allocated' (and perhaps `buffer') and `translate' fields 7784 are set in BUFP on entry. 7785 7786 We call regex_compile to do the actual compilation. */ 7787 7788 const char * 7789 re_compile_pattern (const char *pattern, size_t length, 7790 struct re_pattern_buffer *bufp) 7791 { 7792 reg_errcode_t ret; 7793 7794 /* GNU code is written to assume at least RE_NREGS registers will be set 7795 (and at least one extra will be -1). */ 7796 bufp->regs_allocated = REGS_UNALLOCATED; 7797 7798 /* And GNU code determines whether or not to get register information 7799 by passing null for the REGS argument to re_match, etc., not by 7800 setting no_sub. */ 7801 bufp->no_sub = 0; 7802 7803 /* Match anchors at newline. 
*/ 7804 bufp->newline_anchor = 1; 7805 7806 # ifdef MBS_SUPPORT 7807 if (MB_CUR_MAX != 1) 7808 ret = wcs_regex_compile (pattern, length, re_syntax_options, bufp); 7809 else 7810 # endif 7811 ret = byte_regex_compile (pattern, length, re_syntax_options, bufp); 7812 7813 if (!ret) 7814 return NULL; 7815 return gettext (re_error_msgid[(int) ret]); 7816 } 7817 #ifdef _LIBC 7818 weak_alias (__re_compile_pattern, re_compile_pattern) 7819 #endif 7820 7821 /* Entry points compatible with 4.2 BSD regex library. We don't define 7823 them unless specifically requested. */ 7824 7825 #if defined _REGEX_RE_COMP || defined _LIBC 7826 7827 /* BSD has one and only one pattern buffer. */ 7828 static struct re_pattern_buffer re_comp_buf; 7829 7830 char * 7831 #ifdef _LIBC 7832 /* Make these definitions weak in libc, so POSIX programs can redefine 7833 these names if they don't use our functions, and still use 7834 regcomp/regexec below without link errors. */ 7835 weak_function 7836 #endif 7837 re_comp (const char *s) 7838 { 7839 reg_errcode_t ret; 7840 7841 if (!s) 7842 { 7843 if (!re_comp_buf.buffer) 7844 return (char *) gettext ("No previous regular expression"); 7845 return 0; 7846 } 7847 7848 if (!re_comp_buf.buffer) 7849 { 7850 re_comp_buf.buffer = (unsigned char *) malloc (200); 7851 if (re_comp_buf.buffer == NULL) 7852 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]); 7853 re_comp_buf.allocated = 200; 7854 7855 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH); 7856 if (re_comp_buf.fastmap == NULL) 7857 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]); 7858 } 7859 7860 /* Since `re_exec' always passes NULL for the `regs' argument, we 7861 don't need to initialize the pattern buffer fields which affect it. */ 7862 7863 /* Match anchors at newlines. 
*/ 7864 re_comp_buf.newline_anchor = 1; 7865 7866 # ifdef MBS_SUPPORT 7867 if (MB_CUR_MAX != 1) 7868 ret = wcs_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); 7869 else 7870 # endif 7871 ret = byte_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); 7872 7873 if (!ret) 7874 return NULL; 7875 7876 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 7877 return (char *) gettext (re_error_msgid[(int) ret]); 7878 } 7879 7880 7881 int 7882 #ifdef _LIBC 7883 weak_function 7884 #endif 7885 re_exec (const char *s) 7886 { 7887 const int len = strlen (s); 7888 return 7889 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0); 7890 } 7891 7892 #endif /* _REGEX_RE_COMP */ 7893 7894 /* POSIX.2 functions. Don't define these for Emacs. */ 7896 7897 #ifndef emacs 7898 7899 /* regcomp takes a regular expression as a string and compiles it. 7900 7901 PREG is a regex_t *. We do not expect any fields to be initialized, 7902 since POSIX says we shouldn't. Thus, we set 7903 7904 `buffer' to the compiled pattern; 7905 `used' to the length of the compiled pattern; 7906 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 7907 REG_EXTENDED bit in CFLAGS is set; otherwise, to 7908 RE_SYNTAX_POSIX_BASIC; 7909 `newline_anchor' to REG_NEWLINE being set in CFLAGS; 7910 `fastmap' to an allocated space for the fastmap; 7911 `fastmap_accurate' to zero; 7912 `re_nsub' to the number of subexpressions in PATTERN. 7913 7914 PATTERN is the address of the pattern string. 7915 7916 CFLAGS is a series of bits which affect compilation. 7917 7918 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 7919 use POSIX basic syntax. 7920 7921 If REG_NEWLINE is set, then . and [^...] don't match newline. 7922 Also, regexec will try a match beginning after every newline. 7923 7924 If REG_ICASE is set, then we considers upper- and lowercase 7925 versions of letters to be equivalent when matching. 
7926 7927 If REG_NOSUB is set, then when PREG is passed to regexec, that 7928 routine will report only success or failure, and nothing about the 7929 registers. 7930 7931 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 7932 the return codes and their meanings.) */ 7933 7934 int 7935 regcomp (regex_t *preg, const char *pattern, int cflags) 7936 { 7937 reg_errcode_t ret; 7938 reg_syntax_t syntax 7939 = (cflags & REG_EXTENDED) ? 7940 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC; 7941 7942 /* regex_compile will allocate the space for the compiled pattern. */ 7943 preg->buffer = 0; 7944 preg->allocated = 0; 7945 preg->used = 0; 7946 7947 /* Try to allocate space for the fastmap. */ 7948 preg->fastmap = (char *) malloc (1 << BYTEWIDTH); 7949 7950 if (cflags & REG_ICASE) 7951 { 7952 int i; 7953 7954 preg->translate 7955 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE 7956 * sizeof (*(RE_TRANSLATE_TYPE)0)); 7957 if (preg->translate == NULL) 7958 return (int) REG_ESPACE; 7959 7960 /* Map uppercase characters to corresponding lowercase ones. */ 7961 for (i = 0; i < CHAR_SET_SIZE; i++) 7962 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i; 7963 } 7964 else 7965 preg->translate = NULL; 7966 7967 /* If REG_NEWLINE is set, newlines are treated differently. */ 7968 if (cflags & REG_NEWLINE) 7969 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 7970 syntax &= ~RE_DOT_NEWLINE; 7971 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 7972 /* It also changes the matching behavior. */ 7973 preg->newline_anchor = 1; 7974 } 7975 else 7976 preg->newline_anchor = 0; 7977 7978 preg->no_sub = !!(cflags & REG_NOSUB); 7979 7980 /* POSIX says a null character in the pattern terminates it, so we 7981 can use strlen here in compiling the pattern. 
*/ 7982 # ifdef MBS_SUPPORT 7983 if (MB_CUR_MAX != 1) 7984 ret = wcs_regex_compile (pattern, strlen (pattern), syntax, preg); 7985 else 7986 # endif 7987 ret = byte_regex_compile (pattern, strlen (pattern), syntax, preg); 7988 7989 /* POSIX doesn't distinguish between an unmatched open-group and an 7990 unmatched close-group: both are REG_EPAREN. */ 7991 if (ret == REG_ERPAREN) ret = REG_EPAREN; 7992 7993 if (ret == REG_NOERROR && preg->fastmap) 7994 { 7995 /* Compute the fastmap now, since regexec cannot modify the pattern 7996 buffer. */ 7997 if (re_compile_fastmap (preg) == -2) 7998 { 7999 /* Some error occurred while computing the fastmap, just forget 8000 about it. */ 8001 free (preg->fastmap); 8002 preg->fastmap = NULL; 8003 } 8004 } 8005 8006 return (int) ret; 8007 } 8008 #ifdef _LIBC 8009 weak_alias (__regcomp, regcomp) 8010 #endif 8011 8012 8013 /* regexec searches for a given pattern, specified by PREG, in the 8014 string STRING. 8015 8016 If NMATCH is zero or REG_NOSUB was set in the cflags argument to 8017 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at 8018 least NMATCH elements, and we set them to the offsets of the 8019 corresponding matched substrings. 8020 8021 EFLAGS specifies `execution flags' which affect matching: if 8022 REG_NOTBOL is set, then ^ does not match at the beginning of the 8023 string; if REG_NOTEOL is set, then $ does not match at the end. 8024 8025 We return 0 if we find a match and REG_NOMATCH if not. 
*/ 8026 8027 int 8028 regexec (const regex_t *preg, const char *string, size_t nmatch, 8029 regmatch_t pmatch[], int eflags) 8030 { 8031 int ret; 8032 struct re_registers regs; 8033 regex_t private_preg; 8034 int len = strlen (string); 8035 boolean want_reg_info = !preg->no_sub && nmatch > 0; 8036 8037 private_preg = *preg; 8038 8039 private_preg.not_bol = !!(eflags & REG_NOTBOL); 8040 private_preg.not_eol = !!(eflags & REG_NOTEOL); 8041 8042 /* The user has told us exactly how many registers to return 8043 information about, via `nmatch'. We have to pass that on to the 8044 matching routines. */ 8045 private_preg.regs_allocated = REGS_FIXED; 8046 8047 if (want_reg_info) 8048 { 8049 regs.num_regs = nmatch; 8050 regs.start = TALLOC (nmatch * 2, regoff_t); 8051 if (regs.start == NULL) 8052 return (int) REG_NOMATCH; 8053 regs.end = regs.start + nmatch; 8054 } 8055 8056 /* Perform the searching operation. */ 8057 ret = re_search (&private_preg, string, len, 8058 /* start: */ 0, /* range: */ len, 8059 want_reg_info ? ®s : (struct re_registers *) 0); 8060 8061 /* Copy the register information to the POSIX structure. */ 8062 if (want_reg_info) 8063 { 8064 if (ret >= 0) 8065 { 8066 unsigned r; 8067 8068 for (r = 0; r < nmatch; r++) 8069 { 8070 pmatch[r].rm_so = regs.start[r]; 8071 pmatch[r].rm_eo = regs.end[r]; 8072 } 8073 } 8074 8075 /* If we needed the temporary register info, free the space now. */ 8076 free (regs.start); 8077 } 8078 8079 /* We want zero return to mean success, unlike `re_search'. */ 8080 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; 8081 } 8082 #ifdef _LIBC 8083 weak_alias (__regexec, regexec) 8084 #endif 8085 8086 8087 /* Returns a message corresponding to an error code, ERRCODE, returned 8088 from either regcomp or regexec. We don't use PREG here. 
*/ 8089 8090 size_t 8091 regerror (int errcode, const regex_t *preg ATTRIBUTE_UNUSED, 8092 char *errbuf, size_t errbuf_size) 8093 { 8094 const char *msg; 8095 size_t msg_size; 8096 8097 if (errcode < 0 8098 || errcode >= (int) (sizeof (re_error_msgid) 8099 / sizeof (re_error_msgid[0]))) 8100 /* Only error codes returned by the rest of the code should be passed 8101 to this routine. If we are given anything else, or if other regex 8102 code generates an invalid error code, then the program has a bug. 8103 Dump core so we can fix it. */ 8104 abort (); 8105 8106 msg = gettext (re_error_msgid[errcode]); 8107 8108 msg_size = strlen (msg) + 1; /* Includes the null. */ 8109 8110 if (errbuf_size != 0) 8111 { 8112 if (msg_size > errbuf_size) 8113 { 8114 #if defined HAVE_MEMPCPY || defined _LIBC 8115 *((char *) mempcpy (errbuf, msg, errbuf_size - 1)) = '\0'; 8116 #else 8117 (void) memcpy (errbuf, msg, errbuf_size - 1); 8118 errbuf[errbuf_size - 1] = 0; 8119 #endif 8120 } 8121 else 8122 (void) memcpy (errbuf, msg, msg_size); 8123 } 8124 8125 return msg_size; 8126 } 8127 #ifdef _LIBC 8128 weak_alias (__regerror, regerror) 8129 #endif 8130 8131 8132 /* Free dynamically allocated space used by PREG. 
*/ 8133 8134 void 8135 regfree (regex_t *preg) 8136 { 8137 free (preg->buffer); 8138 preg->buffer = NULL; 8139 8140 preg->allocated = 0; 8141 preg->used = 0; 8142 8143 free (preg->fastmap); 8144 preg->fastmap = NULL; 8145 preg->fastmap_accurate = 0; 8146 8147 free (preg->translate); 8148 preg->translate = NULL; 8149 } 8150 #ifdef _LIBC 8151 weak_alias (__regfree, regfree) 8152 #endif 8153 8154 #endif /* not emacs */ 8155 8156 #endif /* not INSIDE_RECURSION */ 8157 8158 8159 #undef STORE_NUMBER 8161 #undef STORE_NUMBER_AND_INCR 8162 #undef EXTRACT_NUMBER 8163 #undef EXTRACT_NUMBER_AND_INCR 8164 8165 #undef DEBUG_PRINT_COMPILED_PATTERN 8166 #undef DEBUG_PRINT_DOUBLE_STRING 8167 8168 #undef INIT_FAIL_STACK 8169 #undef RESET_FAIL_STACK 8170 #undef DOUBLE_FAIL_STACK 8171 #undef PUSH_PATTERN_OP 8172 #undef PUSH_FAILURE_POINTER 8173 #undef PUSH_FAILURE_INT 8174 #undef PUSH_FAILURE_ELT 8175 #undef POP_FAILURE_POINTER 8176 #undef POP_FAILURE_INT 8177 #undef POP_FAILURE_ELT 8178 #undef DEBUG_PUSH 8179 #undef DEBUG_POP 8180 #undef PUSH_FAILURE_POINT 8181 #undef POP_FAILURE_POINT 8182 8183 #undef REG_UNSET_VALUE 8184 #undef REG_UNSET 8185 8186 #undef PATFETCH 8187 #undef PATFETCH_RAW 8188 #undef PATUNFETCH 8189 #undef TRANSLATE 8190 8191 #undef INIT_BUF_SIZE 8192 #undef GET_BUFFER_SPACE 8193 #undef BUF_PUSH 8194 #undef BUF_PUSH_2 8195 #undef BUF_PUSH_3 8196 #undef STORE_JUMP 8197 #undef STORE_JUMP2 8198 #undef INSERT_JUMP 8199 #undef INSERT_JUMP2 8200 #undef EXTEND_BUFFER 8201 #undef GET_UNSIGNED_NUMBER 8202 #undef FREE_STACK_RETURN 8203 8204 # undef POINTER_TO_OFFSET 8205 # undef MATCHING_IN_FRST_STRING 8206 # undef PREFETCH 8207 # undef AT_STRINGS_BEG 8208 # undef AT_STRINGS_END 8209 # undef WORDCHAR_P 8210 # undef FREE_VAR 8211 # undef FREE_VARIABLES 8212 # undef NO_HIGHEST_ACTIVE_REG 8213 # undef NO_LOWEST_ACTIVE_REG 8214 8215 # undef CHAR_T 8216 # undef UCHAR_T 8217 # undef COMPILED_BUFFER_VAR 8218 # undef OFFSET_ADDRESS_SIZE 8219 # undef CHAR_CLASS_SIZE 8220 # undef 
PREFIX 8221 # undef ARG_PREFIX 8222 # undef PUT_CHAR 8223 # undef BYTE 8224 # undef WCHAR 8225 8226 # define DEFINED_ONCE 8227