1 /* -*- buffer-read-only: t -*- vi: set ro: */ 2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */ 3 /* Extended regular expression matching and search library. 4 Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009 5 Free Software Foundation, Inc. 6 This file is part of the GNU C Library. 7 Contributed by Isamu Hasegawa <isamu (at) yamato.ibm.com>. 8 9 This program is free software; you can redistribute it and/or modify 10 it under the terms of the GNU General Public License as published by 11 the Free Software Foundation; either version 3, or (at your option) 12 any later version. 13 14 This program is distributed in the hope that it will be useful, 15 but WITHOUT ANY WARRANTY; without even the implied warranty of 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 GNU General Public License for more details. 18 19 You should have received a copy of the GNU General Public License along 20 with this program; if not, write to the Free Software Foundation, 21 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 22 23 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, 24 size_t length, reg_syntax_t syntax); 25 static void re_compile_fastmap_iter (regex_t *bufp, 26 const re_dfastate_t *init_state, 27 char *fastmap); 28 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); 29 #ifdef RE_ENABLE_I18N 30 static void free_charset (re_charset_t *cset); 31 #endif /* RE_ENABLE_I18N */ 32 static void free_workarea_compile (regex_t *preg); 33 static reg_errcode_t create_initial_state (re_dfa_t *dfa); 34 #ifdef RE_ENABLE_I18N 35 static void optimize_utf8 (re_dfa_t *dfa); 36 #endif 37 static reg_errcode_t analyze (regex_t *preg); 38 static reg_errcode_t preorder (bin_tree_t *root, 39 reg_errcode_t (fn (void *, bin_tree_t *)), 40 void *extra); 41 static reg_errcode_t postorder (bin_tree_t *root, 42 reg_errcode_t (fn (void *, bin_tree_t *)), 43 void *extra); 44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node); 45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node); 46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, 47 bin_tree_t *node); 48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node); 49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node); 50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); 51 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint); 52 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 53 unsigned int constraint); 54 static reg_errcode_t calc_eclosure (re_dfa_t *dfa); 55 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, 56 Idx node, bool root); 57 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa); 58 static Idx fetch_number (re_string_t *input, re_token_t *token, 59 reg_syntax_t syntax); 60 static int peek_token (re_token_t *token, re_string_t *input, 61 reg_syntax_t syntax) internal_function; 62 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, 63 reg_syntax_t syntax, reg_errcode_t *err); 64 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, 65 re_token_t *token, reg_syntax_t syntax, 66 Idx nest, reg_errcode_t *err); 67 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, 68 re_token_t *token, reg_syntax_t syntax, 69 Idx nest, reg_errcode_t *err); 70 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, 71 re_token_t *token, reg_syntax_t syntax, 72 Idx nest, reg_errcode_t *err); 73 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, 74 re_token_t *token, reg_syntax_t syntax, 75 Idx nest, reg_errcode_t *err); 76 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, 77 re_dfa_t *dfa, re_token_t *token, 78 reg_syntax_t syntax, reg_errcode_t *err); 79 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, 80 re_token_t *token, reg_syntax_t syntax, 81 reg_errcode_t *err); 82 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, 83 re_string_t *regexp, 84 re_token_t *token, int token_len, 85 re_dfa_t *dfa, 86 reg_syntax_t syntax, 87 bool accept_hyphen); 88 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, 89 re_string_t *regexp, 90 re_token_t *token); 91 #ifdef RE_ENABLE_I18N 92 static reg_errcode_t build_equiv_class (bitset_t sbcset, 93 re_charset_t *mbcset, 94 Idx *equiv_class_alloc, 95 const unsigned char *name); 96 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 97 bitset_t sbcset, 98 re_charset_t *mbcset, 99 Idx *char_class_alloc, 100 const unsigned char *class_name, 101 reg_syntax_t syntax); 102 #else /* not RE_ENABLE_I18N */ 103 static reg_errcode_t build_equiv_class (bitset_t sbcset, 104 const unsigned char *name); 105 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, 106 bitset_t sbcset, 107 const unsigned char *class_name, 108 reg_syntax_t syntax); 109 #endif /* not RE_ENABLE_I18N */ 110 static bin_tree_t *build_charclass_op (re_dfa_t *dfa, 111 RE_TRANSLATE_TYPE trans, 112 const unsigned char *class_name, 113 const unsigned char *extra, 114 bool non_match, reg_errcode_t *err); 115 static bin_tree_t *create_tree (re_dfa_t *dfa, 116 bin_tree_t *left, bin_tree_t *right, 117 re_token_type_t type); 118 static bin_tree_t *create_token_tree (re_dfa_t *dfa, 119 bin_tree_t *left, bin_tree_t *right, 120 const re_token_t *token); 121 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); 122 static void free_token (re_token_t *node); 123 static reg_errcode_t free_tree (void *extra, bin_tree_t *node); 124 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node); 125 126 /* This table gives an error message for each of the error codes listed 128 in regex.h. Obviously the order here has to be same as there. 129 POSIX doesn't require that we do anything for REG_NOERROR, 130 but why not be nice? */ 131 132 static const char __re_error_msgid[] = 133 { 134 #define REG_NOERROR_IDX 0 135 gettext_noop ("Success") /* REG_NOERROR */ 136 "\0" 137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 138 gettext_noop ("No match") /* REG_NOMATCH */ 139 "\0" 140 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */ 142 "\0" 143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ 145 "\0" 146 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */ 148 "\0" 149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */ 151 "\0" 152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */ 154 "\0" 155 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 156 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ 157 "\0" 158 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") 159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ 160 "\0" 161 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */ 163 "\0" 164 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ 166 "\0" 167 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 168 gettext_noop ("Invalid range end") /* REG_ERANGE */ 169 "\0" 170 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 171 gettext_noop ("Memory exhausted") /* REG_ESPACE */ 172 "\0" 173 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ 175 "\0" 176 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 177 gettext_noop ("Premature end of regular expression") /* REG_EEND */ 178 "\0" 179 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 180 gettext_noop ("Regular expression too big") /* REG_ESIZE */ 181 "\0" 182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ 184 }; 185 186 static const size_t __re_error_msgid_idx[] = 187 { 188 REG_NOERROR_IDX, 189 REG_NOMATCH_IDX, 190 REG_BADPAT_IDX, 191 REG_ECOLLATE_IDX, 192 REG_ECTYPE_IDX, 193 REG_EESCAPE_IDX, 194 REG_ESUBREG_IDX, 195 REG_EBRACK_IDX, 196 REG_EPAREN_IDX, 197 REG_EBRACE_IDX, 198 REG_BADBR_IDX, 199 REG_ERANGE_IDX, 200 REG_ESPACE_IDX, 201 REG_BADRPT_IDX, 202 REG_EEND_IDX, 203 REG_ESIZE_IDX, 204 REG_ERPAREN_IDX 205 }; 206 207 /* Entry points for GNU code. */ 209 210 /* re_compile_pattern is the GNU regular expression compiler: it 211 compiles PATTERN (of length LENGTH) and puts the result in BUFP. 212 Returns 0 if the pattern was valid, otherwise an error string. 213 214 Assumes the `allocated' (and perhaps `buffer') and `translate' fields 215 are set in BUFP on entry. */ 216 217 #ifdef _LIBC 218 const char * 219 re_compile_pattern (pattern, length, bufp) 220 const char *pattern; 221 size_t length; 222 struct re_pattern_buffer *bufp; 223 #else /* size_t might promote */ 224 const char * 225 re_compile_pattern (const char *pattern, size_t length, 226 struct re_pattern_buffer *bufp) 227 #endif 228 { 229 reg_errcode_t ret; 230 231 /* And GNU code determines whether or not to get register information 232 by passing null for the REGS argument to re_match, etc., not by 233 setting no_sub, unless RE_NO_SUB is set. */ 234 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB); 235 236 /* Match anchors at newline. */ 237 bufp->newline_anchor = 1; 238 239 ret = re_compile_internal (bufp, pattern, length, re_syntax_options); 240 241 if (!ret) 242 return NULL; 243 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 244 } 245 #ifdef _LIBC 246 weak_alias (__re_compile_pattern, re_compile_pattern) 247 #endif 248 249 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 250 also be assigned to arbitrarily: each pattern buffer stores its own 251 syntax, so it can be changed between regex compilations. */ 252 /* This has no initializer because initialized variables in Emacs 253 become read-only after dumping. */ 254 reg_syntax_t re_syntax_options; 255 256 257 /* Specify the precise syntax of regexps for compilation. This provides 258 for compatibility for various utilities which historically have 259 different, incompatible syntaxes. 260 261 The argument SYNTAX is a bit mask comprised of the various bits 262 defined in regex.h. We return the old syntax. */ 263 264 reg_syntax_t 265 re_set_syntax (syntax) 266 reg_syntax_t syntax; 267 { 268 reg_syntax_t ret = re_syntax_options; 269 270 re_syntax_options = syntax; 271 return ret; 272 } 273 #ifdef _LIBC 274 weak_alias (__re_set_syntax, re_set_syntax) 275 #endif 276 277 int 278 re_compile_fastmap (bufp) 279 struct re_pattern_buffer *bufp; 280 { 281 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 282 char *fastmap = bufp->fastmap; 283 284 memset (fastmap, '\0', sizeof (char) * SBC_MAX); 285 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); 286 if (dfa->init_state != dfa->init_state_word) 287 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); 288 if (dfa->init_state != dfa->init_state_nl) 289 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); 290 if (dfa->init_state != dfa->init_state_begbuf) 291 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); 292 bufp->fastmap_accurate = 1; 293 return 0; 294 } 295 #ifdef _LIBC 296 weak_alias (__re_compile_fastmap, re_compile_fastmap) 297 #endif 298 299 static inline void 300 __attribute ((always_inline)) 301 re_set_fastmap (char *fastmap, bool icase, int ch) 302 { 303 fastmap[ch] = 1; 304 if (icase) 305 fastmap[tolower (ch)] = 1; 306 } 307 308 /* Helper function for re_compile_fastmap. 309 Compile fastmap for the initial_state INIT_STATE. */ 310 311 static void 312 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, 313 char *fastmap) 314 { 315 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; 316 Idx node_cnt; 317 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); 318 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) 319 { 320 Idx node = init_state->nodes.elems[node_cnt]; 321 re_token_type_t type = dfa->nodes[node].type; 322 323 if (type == CHARACTER) 324 { 325 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); 326 #ifdef RE_ENABLE_I18N 327 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 328 { 329 unsigned char buf[MB_LEN_MAX]; 330 unsigned char *p; 331 wchar_t wc; 332 mbstate_t state; 333 334 p = buf; 335 *p++ = dfa->nodes[node].opr.c; 336 while (++node < dfa->nodes_len 337 && dfa->nodes[node].type == CHARACTER 338 && dfa->nodes[node].mb_partial) 339 *p++ = dfa->nodes[node].opr.c; 340 memset (&state, '\0', sizeof (state)); 341 if (__mbrtowc (&wc, (const char *) buf, p - buf, 342 &state) == p - buf 343 && (__wcrtomb ((char *) buf, towlower (wc), &state) 344 != (size_t) -1)) 345 re_set_fastmap (fastmap, false, buf[0]); 346 } 347 #endif 348 } 349 else if (type == SIMPLE_BRACKET) 350 { 351 int i, ch; 352 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 353 { 354 int j; 355 bitset_word_t w = dfa->nodes[node].opr.sbcset[i]; 356 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 357 if (w & ((bitset_word_t) 1 << j)) 358 re_set_fastmap (fastmap, icase, ch); 359 } 360 } 361 #ifdef RE_ENABLE_I18N 362 else if (type == COMPLEX_BRACKET) 363 { 364 re_charset_t *cset = dfa->nodes[node].opr.mbcset; 365 Idx i; 366 367 # ifdef _LIBC 368 /* See if we have to try all bytes which start multiple collation 369 elements. 370 e.g. In da_DK, we want to catch 'a' since "aa" is a valid 371 collation element, and don't catch 'b' since 'b' is 372 the only collation element which starts from 'b' (and 373 it is caught by SIMPLE_BRACKET). */ 374 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0 375 && (cset->ncoll_syms || cset->nranges)) 376 { 377 const int32_t *table = (const int32_t *) 378 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 379 for (i = 0; i < SBC_MAX; ++i) 380 if (table[i] < 0) 381 re_set_fastmap (fastmap, icase, i); 382 } 383 # endif /* _LIBC */ 384 385 /* See if we have to start the match at all multibyte characters, 386 i.e. where we would not find an invalid sequence. This only 387 applies to multibyte character sets; for single byte character 388 sets, the SIMPLE_BRACKET again suffices. */ 389 if (dfa->mb_cur_max > 1 390 && (cset->nchar_classes || cset->non_match 391 # ifdef _LIBC 392 || cset->nequiv_classes 393 # endif /* _LIBC */ 394 )) 395 { 396 unsigned char c = 0; 397 do 398 { 399 mbstate_t mbs; 400 memset (&mbs, 0, sizeof (mbs)); 401 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2) 402 re_set_fastmap (fastmap, false, (int) c); 403 } 404 while (++c != 0); 405 } 406 407 else 408 { 409 /* ... Else catch all bytes which can start the mbchars. */ 410 for (i = 0; i < cset->nmbchars; ++i) 411 { 412 char buf[256]; 413 mbstate_t state; 414 memset (&state, '\0', sizeof (state)); 415 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) 416 re_set_fastmap (fastmap, icase, *(unsigned char *) buf); 417 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) 418 { 419 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) 420 != (size_t) -1) 421 re_set_fastmap (fastmap, false, *(unsigned char *) buf); 422 } 423 } 424 } 425 } 426 #endif /* RE_ENABLE_I18N */ 427 else if (type == OP_PERIOD 428 #ifdef RE_ENABLE_I18N 429 || type == OP_UTF8_PERIOD 430 #endif /* RE_ENABLE_I18N */ 431 || type == END_OF_RE) 432 { 433 memset (fastmap, '\1', sizeof (char) * SBC_MAX); 434 if (type == END_OF_RE) 435 bufp->can_be_null = 1; 436 return; 437 } 438 } 439 } 440 441 /* Entry point for POSIX code. */ 443 /* regcomp takes a regular expression as a string and compiles it. 444 445 PREG is a regex_t *. We do not expect any fields to be initialized, 446 since POSIX says we shouldn't. Thus, we set 447 448 `buffer' to the compiled pattern; 449 `used' to the length of the compiled pattern; 450 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 451 REG_EXTENDED bit in CFLAGS is set; otherwise, to 452 RE_SYNTAX_POSIX_BASIC; 453 `newline_anchor' to REG_NEWLINE being set in CFLAGS; 454 `fastmap' to an allocated space for the fastmap; 455 `fastmap_accurate' to zero; 456 `re_nsub' to the number of subexpressions in PATTERN. 457 458 PATTERN is the address of the pattern string. 459 460 CFLAGS is a series of bits which affect compilation. 461 462 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 463 use POSIX basic syntax. 464 465 If REG_NEWLINE is set, then . and [^...] don't match newline. 466 Also, regexec will try a match beginning after every newline. 467 468 If REG_ICASE is set, then we considers upper- and lowercase 469 versions of letters to be equivalent when matching. 470 471 If REG_NOSUB is set, then when PREG is passed to regexec, that 472 routine will report only success or failure, and nothing about the 473 registers. 474 475 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 476 the return codes and their meanings.) */ 477 478 int 479 regcomp (preg, pattern, cflags) 480 regex_t *_Restrict_ preg; 481 const char *_Restrict_ pattern; 482 int cflags; 483 { 484 reg_errcode_t ret; 485 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED 486 : RE_SYNTAX_POSIX_BASIC); 487 488 preg->buffer = NULL; 489 preg->allocated = 0; 490 preg->used = 0; 491 492 /* Try to allocate space for the fastmap. */ 493 preg->fastmap = re_malloc (char, SBC_MAX); 494 if (BE (preg->fastmap == NULL, 0)) 495 return REG_ESPACE; 496 497 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0; 498 499 /* If REG_NEWLINE is set, newlines are treated differently. */ 500 if (cflags & REG_NEWLINE) 501 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 502 syntax &= ~RE_DOT_NEWLINE; 503 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 504 /* It also changes the matching behavior. */ 505 preg->newline_anchor = 1; 506 } 507 else 508 preg->newline_anchor = 0; 509 preg->no_sub = !!(cflags & REG_NOSUB); 510 preg->translate = NULL; 511 512 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); 513 514 /* POSIX doesn't distinguish between an unmatched open-group and an 515 unmatched close-group: both are REG_EPAREN. */ 516 if (ret == REG_ERPAREN) 517 ret = REG_EPAREN; 518 519 /* We have already checked preg->fastmap != NULL. */ 520 if (BE (ret == REG_NOERROR, 1)) 521 /* Compute the fastmap now, since regexec cannot modify the pattern 522 buffer. This function never fails in this implementation. */ 523 (void) re_compile_fastmap (preg); 524 else 525 { 526 /* Some error occurred while compiling the expression. */ 527 re_free (preg->fastmap); 528 preg->fastmap = NULL; 529 } 530 531 return (int) ret; 532 } 533 #ifdef _LIBC 534 weak_alias (__regcomp, regcomp) 535 #endif 536 537 /* Returns a message corresponding to an error code, ERRCODE, returned 538 from either regcomp or regexec. We don't use PREG here. */ 539 540 #ifdef _LIBC 541 size_t 542 regerror (errcode, preg, errbuf, errbuf_size) 543 int errcode; 544 const regex_t *_Restrict_ preg; 545 char *_Restrict_ errbuf; 546 size_t errbuf_size; 547 #else /* size_t might promote */ 548 size_t 549 regerror (int errcode, const regex_t *_Restrict_ preg, 550 char *_Restrict_ errbuf, size_t errbuf_size) 551 #endif 552 { 553 const char *msg; 554 size_t msg_size; 555 556 if (BE (errcode < 0 557 || errcode >= (int) (sizeof (__re_error_msgid_idx) 558 / sizeof (__re_error_msgid_idx[0])), 0)) 559 /* Only error codes returned by the rest of the code should be passed 560 to this routine. If we are given anything else, or if other regex 561 code generates an invalid error code, then the program has a bug. 562 Dump core so we can fix it. */ 563 abort (); 564 565 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); 566 567 msg_size = strlen (msg) + 1; /* Includes the null. */ 568 569 if (BE (errbuf_size != 0, 1)) 570 { 571 size_t cpy_size = msg_size; 572 if (BE (msg_size > errbuf_size, 0)) 573 { 574 cpy_size = errbuf_size - 1; 575 errbuf[cpy_size] = '\0'; 576 } 577 memcpy (errbuf, msg, cpy_size); 578 } 579 580 return msg_size; 581 } 582 #ifdef _LIBC 583 weak_alias (__regerror, regerror) 584 #endif 585 586 587 #ifdef RE_ENABLE_I18N 588 /* This static array is used for the map to single-byte characters when 589 UTF-8 is used. Otherwise we would allocate memory just to initialize 590 it the same all the time. UTF-8 is the preferred encoding so this is 591 a worthwhile optimization. */ 592 static const bitset_t utf8_sb_map = 593 { 594 /* Set the first 128 bits. */ 595 # if 4 * BITSET_WORD_BITS < ASCII_CHARS 596 # error "bitset_word_t is narrower than 32 bits" 597 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS 598 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX, 599 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS 600 BITSET_WORD_MAX, BITSET_WORD_MAX, 601 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS 602 BITSET_WORD_MAX, 603 # endif 604 (BITSET_WORD_MAX 605 >> (SBC_MAX % BITSET_WORD_BITS == 0 606 ? 0 607 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS)) 608 }; 609 #endif 610 611 612 static void 613 free_dfa_content (re_dfa_t *dfa) 614 { 615 Idx i, j; 616 617 if (dfa->nodes) 618 for (i = 0; i < dfa->nodes_len; ++i) 619 free_token (dfa->nodes + i); 620 re_free (dfa->nexts); 621 for (i = 0; i < dfa->nodes_len; ++i) 622 { 623 if (dfa->eclosures != NULL) 624 re_node_set_free (dfa->eclosures + i); 625 if (dfa->inveclosures != NULL) 626 re_node_set_free (dfa->inveclosures + i); 627 if (dfa->edests != NULL) 628 re_node_set_free (dfa->edests + i); 629 } 630 re_free (dfa->edests); 631 re_free (dfa->eclosures); 632 re_free (dfa->inveclosures); 633 re_free (dfa->nodes); 634 635 if (dfa->state_table) 636 for (i = 0; i <= dfa->state_hash_mask; ++i) 637 { 638 struct re_state_table_entry *entry = dfa->state_table + i; 639 for (j = 0; j < entry->num; ++j) 640 { 641 re_dfastate_t *state = entry->array[j]; 642 free_state (state); 643 } 644 re_free (entry->array); 645 } 646 re_free (dfa->state_table); 647 #ifdef RE_ENABLE_I18N 648 if (dfa->sb_char != utf8_sb_map) 649 re_free (dfa->sb_char); 650 #endif 651 re_free (dfa->subexp_map); 652 #ifdef DEBUG 653 re_free (dfa->re_str); 654 #endif 655 656 re_free (dfa); 657 } 658 659 660 /* Free dynamically allocated space used by PREG. */ 661 662 void 663 regfree (preg) 664 regex_t *preg; 665 { 666 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 667 if (BE (dfa != NULL, 1)) 668 free_dfa_content (dfa); 669 preg->buffer = NULL; 670 preg->allocated = 0; 671 672 re_free (preg->fastmap); 673 preg->fastmap = NULL; 674 675 re_free (preg->translate); 676 preg->translate = NULL; 677 } 678 #ifdef _LIBC 679 weak_alias (__regfree, regfree) 680 #endif 681 682 /* Entry points compatible with 4.2 BSD regex library. We don't define 684 them unless specifically requested. */ 685 686 #if defined _REGEX_RE_COMP || defined _LIBC 687 688 /* BSD has one and only one pattern buffer. */ 689 static struct re_pattern_buffer re_comp_buf; 690 691 char * 692 # ifdef _LIBC 693 /* Make these definitions weak in libc, so POSIX programs can redefine 694 these names if they don't use our functions, and still use 695 regcomp/regexec above without link errors. */ 696 weak_function 697 # endif 698 re_comp (s) 699 const char *s; 700 { 701 reg_errcode_t ret; 702 char *fastmap; 703 704 if (!s) 705 { 706 if (!re_comp_buf.buffer) 707 return gettext ("No previous regular expression"); 708 return 0; 709 } 710 711 if (re_comp_buf.buffer) 712 { 713 fastmap = re_comp_buf.fastmap; 714 re_comp_buf.fastmap = NULL; 715 __regfree (&re_comp_buf); 716 memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); 717 re_comp_buf.fastmap = fastmap; 718 } 719 720 if (re_comp_buf.fastmap == NULL) 721 { 722 re_comp_buf.fastmap = (char *) malloc (SBC_MAX); 723 if (re_comp_buf.fastmap == NULL) 724 return (char *) gettext (__re_error_msgid 725 + __re_error_msgid_idx[(int) REG_ESPACE]); 726 } 727 728 /* Since `re_exec' always passes NULL for the `regs' argument, we 729 don't need to initialize the pattern buffer fields which affect it. */ 730 731 /* Match anchors at newlines. */ 732 re_comp_buf.newline_anchor = 1; 733 734 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); 735 736 if (!ret) 737 return NULL; 738 739 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 740 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); 741 } 742 743 #ifdef _LIBC 744 libc_freeres_fn (free_mem) 745 { 746 __regfree (&re_comp_buf); 747 } 748 #endif 749 750 #endif /* _REGEX_RE_COMP */ 751 752 /* Internal entry point. 754 Compile the regular expression PATTERN, whose length is LENGTH. 755 SYNTAX indicate regular expression's syntax. */ 756 757 static reg_errcode_t 758 re_compile_internal (regex_t *preg, const char * pattern, size_t length, 759 reg_syntax_t syntax) 760 { 761 reg_errcode_t err = REG_NOERROR; 762 re_dfa_t *dfa; 763 re_string_t regexp; 764 765 /* Initialize the pattern buffer. */ 766 preg->fastmap_accurate = 0; 767 preg->syntax = syntax; 768 preg->not_bol = preg->not_eol = 0; 769 preg->used = 0; 770 preg->re_nsub = 0; 771 preg->can_be_null = 0; 772 preg->regs_allocated = REGS_UNALLOCATED; 773 774 /* Initialize the dfa. */ 775 dfa = (re_dfa_t *) preg->buffer; 776 if (BE (preg->allocated < sizeof (re_dfa_t), 0)) 777 { 778 /* If zero allocated, but buffer is non-null, try to realloc 779 enough space. This loses if buffer's address is bogus, but 780 that is the user's responsibility. If ->buffer is NULL this 781 is a simple allocation. */ 782 dfa = re_realloc (preg->buffer, re_dfa_t, 1); 783 if (dfa == NULL) 784 return REG_ESPACE; 785 preg->allocated = sizeof (re_dfa_t); 786 preg->buffer = (unsigned char *) dfa; 787 } 788 preg->used = sizeof (re_dfa_t); 789 790 err = init_dfa (dfa, length); 791 if (BE (err != REG_NOERROR, 0)) 792 { 793 free_dfa_content (dfa); 794 preg->buffer = NULL; 795 preg->allocated = 0; 796 return err; 797 } 798 #ifdef DEBUG 799 /* Note: length+1 will not overflow since it is checked in init_dfa. */ 800 dfa->re_str = re_malloc (char, length + 1); 801 strncpy (dfa->re_str, pattern, length + 1); 802 #endif 803 804 __libc_lock_init (dfa->lock); 805 806 err = re_string_construct (®exp, pattern, length, preg->translate, 807 (syntax & RE_ICASE) != 0, dfa); 808 if (BE (err != REG_NOERROR, 0)) 809 { 810 re_compile_internal_free_return: 811 free_workarea_compile (preg); 812 re_string_destruct (®exp); 813 free_dfa_content (dfa); 814 preg->buffer = NULL; 815 preg->allocated = 0; 816 return err; 817 } 818 819 /* Parse the regular expression, and build a structure tree. */ 820 preg->re_nsub = 0; 821 dfa->str_tree = parse (®exp, preg, syntax, &err); 822 if (BE (dfa->str_tree == NULL, 0)) 823 goto re_compile_internal_free_return; 824 825 /* Analyze the tree and create the nfa. */ 826 err = analyze (preg); 827 if (BE (err != REG_NOERROR, 0)) 828 goto re_compile_internal_free_return; 829 830 #ifdef RE_ENABLE_I18N 831 /* If possible, do searching in single byte encoding to speed things up. */ 832 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) 833 optimize_utf8 (dfa); 834 #endif 835 836 /* Then create the initial state of the dfa. */ 837 err = create_initial_state (dfa); 838 839 /* Release work areas. */ 840 free_workarea_compile (preg); 841 re_string_destruct (®exp); 842 843 if (BE (err != REG_NOERROR, 0)) 844 { 845 free_dfa_content (dfa); 846 preg->buffer = NULL; 847 preg->allocated = 0; 848 } 849 850 return err; 851 } 852 853 /* Initialize DFA. We use the length of the regular expression PAT_LEN 854 as the initial length of some arrays. */ 855 856 static reg_errcode_t 857 init_dfa (re_dfa_t *dfa, size_t pat_len) 858 { 859 __re_size_t table_size; 860 #ifdef RE_ENABLE_I18N 861 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t)); 862 #else 863 size_t max_i18n_object_size = 0; 864 #endif 865 size_t max_object_size = 866 MAX (sizeof (struct re_state_table_entry), 867 MAX (sizeof (re_token_t), 868 MAX (sizeof (re_node_set), 869 MAX (sizeof (regmatch_t), 870 max_i18n_object_size)))); 871 872 memset (dfa, '\0', sizeof (re_dfa_t)); 873 874 /* Force allocation of str_tree_storage the first time. */ 875 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 876 877 /* Avoid overflows. The extra "/ 2" is for the table_size doubling 878 calculation below, and for similar doubling calculations 879 elsewhere. And it's <= rather than <, because some of the 880 doubling calculations add 1 afterwards. */ 881 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0)) 882 return REG_ESPACE; 883 884 dfa->nodes_alloc = pat_len + 1; 885 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); 886 887 /* table_size = 2 ^ ceil(log pat_len) */ 888 for (table_size = 1; ; table_size <<= 1) 889 if (table_size > pat_len) 890 break; 891 892 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size); 893 dfa->state_hash_mask = table_size - 1; 894 895 dfa->mb_cur_max = MB_CUR_MAX; 896 #ifdef _LIBC 897 if (dfa->mb_cur_max == 6 898 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) 899 dfa->is_utf8 = 1; 900 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) 901 != 0); 902 #else 903 if (strcmp (locale_charset (), "UTF-8") == 0) 904 dfa->is_utf8 = 1; 905 906 /* We check exhaustively in the loop below if this charset is a 907 superset of ASCII. */ 908 dfa->map_notascii = 0; 909 #endif 910 911 #ifdef RE_ENABLE_I18N 912 if (dfa->mb_cur_max > 1) 913 { 914 if (dfa->is_utf8) 915 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; 916 else 917 { 918 int i, j, ch; 919 920 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 921 if (BE (dfa->sb_char == NULL, 0)) 922 return REG_ESPACE; 923 924 /* Set the bits corresponding to single byte chars. */ 925 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 926 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 927 { 928 wint_t wch = __btowc (ch); 929 if (wch != WEOF) 930 dfa->sb_char[i] |= (bitset_word_t) 1 << j; 931 # ifndef _LIBC 932 if (isascii (ch) && wch != ch) 933 dfa->map_notascii = 1; 934 # endif 935 } 936 } 937 } 938 #endif 939 940 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0)) 941 return REG_ESPACE; 942 return REG_NOERROR; 943 } 944 945 /* Initialize WORD_CHAR table, which indicate which character is 946 "word". In this case "word" means that it is the word construction 947 character used by some operators like "\<", "\>", etc. */ 948 949 static void 950 internal_function 951 init_word_char (re_dfa_t *dfa) 952 { 953 int i, j, ch; 954 dfa->word_ops_used = 1; 955 for (i = 0, ch = 0; i < BITSET_WORDS; ++i) 956 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) 957 if (isalnum (ch) || ch == '_') 958 dfa->word_char[i] |= (bitset_word_t) 1 << j; 959 } 960 961 /* Free the work area which are only used while compiling. */ 962 963 static void 964 free_workarea_compile (regex_t *preg) 965 { 966 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 967 bin_tree_storage_t *storage, *next; 968 for (storage = dfa->str_tree_storage; storage; storage = next) 969 { 970 next = storage->next; 971 re_free (storage); 972 } 973 dfa->str_tree_storage = NULL; 974 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; 975 dfa->str_tree = NULL; 976 re_free (dfa->org_indices); 977 dfa->org_indices = NULL; 978 } 979 980 /* Create initial states for all contexts. */ 981 982 static reg_errcode_t 983 create_initial_state (re_dfa_t *dfa) 984 { 985 Idx first, i; 986 reg_errcode_t err; 987 re_node_set init_nodes; 988 989 /* Initial states have the epsilon closure of the node which is 990 the first node of the regular expression. */ 991 first = dfa->str_tree->first->node_idx; 992 dfa->init_node = first; 993 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); 994 if (BE (err != REG_NOERROR, 0)) 995 return err; 996 997 /* The back-references which are in initial states can epsilon transit, 998 since in this case all of the subexpressions can be null. 999 Then we add epsilon closures of the nodes which are the next nodes of 1000 the back-references. */ 1001 if (dfa->nbackref > 0) 1002 for (i = 0; i < init_nodes.nelem; ++i) 1003 { 1004 Idx node_idx = init_nodes.elems[i]; 1005 re_token_type_t type = dfa->nodes[node_idx].type; 1006 1007 Idx clexp_idx; 1008 if (type != OP_BACK_REF) 1009 continue; 1010 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) 1011 { 1012 re_token_t *clexp_node; 1013 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; 1014 if (clexp_node->type == OP_CLOSE_SUBEXP 1015 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx) 1016 break; 1017 } 1018 if (clexp_idx == init_nodes.nelem) 1019 continue; 1020 1021 if (type == OP_BACK_REF) 1022 { 1023 Idx dest_idx = dfa->edests[node_idx].elems[0]; 1024 if (!re_node_set_contains (&init_nodes, dest_idx)) 1025 { 1026 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx); 1027 i = 0; 1028 } 1029 } 1030 } 1031 1032 /* It must be the first time to invoke acquire_state. */ 1033 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0); 1034 /* We don't check ERR here, since the initial state must not be NULL. */ 1035 if (BE (dfa->init_state == NULL, 0)) 1036 return err; 1037 if (dfa->init_state->has_constraint) 1038 { 1039 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes, 1040 CONTEXT_WORD); 1041 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes, 1042 CONTEXT_NEWLINE); 1043 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa, 1044 &init_nodes, 1045 CONTEXT_NEWLINE 1046 | CONTEXT_BEGBUF); 1047 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL 1048 || dfa->init_state_begbuf == NULL, 0)) 1049 return err; 1050 } 1051 else 1052 dfa->init_state_word = dfa->init_state_nl 1053 = dfa->init_state_begbuf = dfa->init_state; 1054 1055 re_node_set_free (&init_nodes); 1056 return REG_NOERROR; 1057 } 1058 1059 #ifdef RE_ENABLE_I18N 1061 /* If it is possible to do searching in single byte encoding instead of UTF-8 1062 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change 1063 DFA nodes where needed. */ 1064 1065 static void 1066 optimize_utf8 (re_dfa_t *dfa) 1067 { 1068 Idx node; 1069 int i; 1070 bool mb_chars = false; 1071 bool has_period = false; 1072 1073 for (node = 0; node < dfa->nodes_len; ++node) 1074 switch (dfa->nodes[node].type) 1075 { 1076 case CHARACTER: 1077 if (dfa->nodes[node].opr.c >= ASCII_CHARS) 1078 mb_chars = true; 1079 break; 1080 case ANCHOR: 1081 switch (dfa->nodes[node].opr.ctx_type) 1082 { 1083 case LINE_FIRST: 1084 case LINE_LAST: 1085 case BUF_FIRST: 1086 case BUF_LAST: 1087 break; 1088 default: 1089 /* Word anchors etc. cannot be handled. It's okay to test 1090 opr.ctx_type since constraints (for all DFA nodes) are 1091 created by ORing one or more opr.ctx_type values. */ 1092 return; 1093 } 1094 break; 1095 case OP_PERIOD: 1096 has_period = true; 1097 break; 1098 case OP_BACK_REF: 1099 case OP_ALT: 1100 case END_OF_RE: 1101 case OP_DUP_ASTERISK: 1102 case OP_OPEN_SUBEXP: 1103 case OP_CLOSE_SUBEXP: 1104 break; 1105 case COMPLEX_BRACKET: 1106 return; 1107 case SIMPLE_BRACKET: 1108 /* Just double check. */ 1109 { 1110 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0 1111 ? 0 1112 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS); 1113 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i) 1114 { 1115 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0) 1116 return; 1117 rshift = 0; 1118 } 1119 } 1120 break; 1121 default: 1122 abort (); 1123 } 1124 1125 if (mb_chars || has_period) 1126 for (node = 0; node < dfa->nodes_len; ++node) 1127 { 1128 if (dfa->nodes[node].type == CHARACTER 1129 && dfa->nodes[node].opr.c >= ASCII_CHARS) 1130 dfa->nodes[node].mb_partial = 0; 1131 else if (dfa->nodes[node].type == OP_PERIOD) 1132 dfa->nodes[node].type = OP_UTF8_PERIOD; 1133 } 1134 1135 /* The search can be in single byte locale. */ 1136 dfa->mb_cur_max = 1; 1137 dfa->is_utf8 = 0; 1138 dfa->has_mb_node = dfa->nbackref > 0 || has_period; 1139 } 1140 #endif 1141 1142 /* Analyze the structure tree, and calculate "first", "next", "edest", 1144 "eclosure", and "inveclosure". */ 1145 1146 static reg_errcode_t 1147 analyze (regex_t *preg) 1148 { 1149 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1150 reg_errcode_t ret; 1151 1152 /* Allocate arrays. */ 1153 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc); 1154 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc); 1155 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc); 1156 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc); 1157 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL 1158 || dfa->eclosures == NULL, 0)) 1159 return REG_ESPACE; 1160 1161 dfa->subexp_map = re_malloc (Idx, preg->re_nsub); 1162 if (dfa->subexp_map != NULL) 1163 { 1164 Idx i; 1165 for (i = 0; i < preg->re_nsub; i++) 1166 dfa->subexp_map[i] = i; 1167 preorder (dfa->str_tree, optimize_subexps, dfa); 1168 for (i = 0; i < preg->re_nsub; i++) 1169 if (dfa->subexp_map[i] != i) 1170 break; 1171 if (i == preg->re_nsub) 1172 { 1173 free (dfa->subexp_map); 1174 dfa->subexp_map = NULL; 1175 } 1176 } 1177 1178 ret = postorder (dfa->str_tree, lower_subexps, preg); 1179 if (BE (ret != REG_NOERROR, 0)) 1180 return ret; 1181 ret = postorder (dfa->str_tree, calc_first, dfa); 1182 if (BE (ret != REG_NOERROR, 0)) 1183 return ret; 1184 preorder (dfa->str_tree, calc_next, dfa); 1185 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa); 1186 if (BE (ret != REG_NOERROR, 0)) 1187 return ret; 1188 ret = calc_eclosure (dfa); 1189 if (BE (ret != REG_NOERROR, 0)) 1190 return ret; 1191 1192 /* We only need this during the prune_impossible_nodes pass in regexec.c; 1193 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */ 1194 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match) 1195 || dfa->nbackref) 1196 { 1197 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len); 1198 if (BE (dfa->inveclosures == NULL, 0)) 1199 return REG_ESPACE; 1200 ret = calc_inveclosure (dfa); 1201 } 1202 1203 return ret; 1204 } 1205 1206 /* Our parse trees are very unbalanced, so we cannot use a stack to 1207 implement parse tree visits. Instead, we use parent pointers and 1208 some hairy code in these two functions. */ 1209 static reg_errcode_t 1210 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1211 void *extra) 1212 { 1213 bin_tree_t *node, *prev; 1214 1215 for (node = root; ; ) 1216 { 1217 /* Descend down the tree, preferably to the left (or to the right 1218 if that's the only child). */ 1219 while (node->left || node->right) 1220 if (node->left) 1221 node = node->left; 1222 else 1223 node = node->right; 1224 1225 do 1226 { 1227 reg_errcode_t err = fn (extra, node); 1228 if (BE (err != REG_NOERROR, 0)) 1229 return err; 1230 if (node->parent == NULL) 1231 return REG_NOERROR; 1232 prev = node; 1233 node = node->parent; 1234 } 1235 /* Go up while we have a node that is reached from the right. */ 1236 while (node->right == prev || node->right == NULL); 1237 node = node->right; 1238 } 1239 } 1240 1241 static reg_errcode_t 1242 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)), 1243 void *extra) 1244 { 1245 bin_tree_t *node; 1246 1247 for (node = root; ; ) 1248 { 1249 reg_errcode_t err = fn (extra, node); 1250 if (BE (err != REG_NOERROR, 0)) 1251 return err; 1252 1253 /* Go to the left node, or up and to the right. */ 1254 if (node->left) 1255 node = node->left; 1256 else 1257 { 1258 bin_tree_t *prev = NULL; 1259 while (node->right == prev || node->right == NULL) 1260 { 1261 prev = node; 1262 node = node->parent; 1263 if (!node) 1264 return REG_NOERROR; 1265 } 1266 node = node->right; 1267 } 1268 } 1269 } 1270 1271 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell 1272 re_search_internal to map the inner one's opr.idx to this one's. Adjust 1273 backreferences as well. Requires a preorder visit. */ 1274 static reg_errcode_t 1275 optimize_subexps (void *extra, bin_tree_t *node) 1276 { 1277 re_dfa_t *dfa = (re_dfa_t *) extra; 1278 1279 if (node->token.type == OP_BACK_REF && dfa->subexp_map) 1280 { 1281 int idx = node->token.opr.idx; 1282 node->token.opr.idx = dfa->subexp_map[idx]; 1283 dfa->used_bkref_map |= 1 << node->token.opr.idx; 1284 } 1285 1286 else if (node->token.type == SUBEXP 1287 && node->left && node->left->token.type == SUBEXP) 1288 { 1289 Idx other_idx = node->left->token.opr.idx; 1290 1291 node->left = node->left->left; 1292 if (node->left) 1293 node->left->parent = node; 1294 1295 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx]; 1296 if (other_idx < BITSET_WORD_BITS) 1297 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx); 1298 } 1299 1300 return REG_NOERROR; 1301 } 1302 1303 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation 1304 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */ 1305 static reg_errcode_t 1306 lower_subexps (void *extra, bin_tree_t *node) 1307 { 1308 regex_t *preg = (regex_t *) extra; 1309 reg_errcode_t err = REG_NOERROR; 1310 1311 if (node->left && node->left->token.type == SUBEXP) 1312 { 1313 node->left = lower_subexp (&err, preg, node->left); 1314 if (node->left) 1315 node->left->parent = node; 1316 } 1317 if (node->right && node->right->token.type == SUBEXP) 1318 { 1319 node->right = lower_subexp (&err, preg, node->right); 1320 if (node->right) 1321 node->right->parent = node; 1322 } 1323 1324 return err; 1325 } 1326 1327 static bin_tree_t * 1328 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node) 1329 { 1330 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 1331 bin_tree_t *body = node->left; 1332 bin_tree_t *op, *cls, *tree1, *tree; 1333 1334 if (preg->no_sub 1335 /* We do not optimize empty subexpressions, because otherwise we may 1336 have bad CONCAT nodes with NULL children. This is obviously not 1337 very common, so we do not lose much. An example that triggers 1338 this case is the sed "script" /\(\)/x. */ 1339 && node->left != NULL 1340 && (node->token.opr.idx >= BITSET_WORD_BITS 1341 || !(dfa->used_bkref_map 1342 & ((bitset_word_t) 1 << node->token.opr.idx)))) 1343 return node->left; 1344 1345 /* Convert the SUBEXP node to the concatenation of an 1346 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */ 1347 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP); 1348 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP); 1349 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls; 1350 tree = create_tree (dfa, op, tree1, CONCAT); 1351 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0)) 1352 { 1353 *err = REG_ESPACE; 1354 return NULL; 1355 } 1356 1357 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx; 1358 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp; 1359 return tree; 1360 } 1361 1362 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton 1363 nodes. Requires a postorder visit. */ 1364 static reg_errcode_t 1365 calc_first (void *extra, bin_tree_t *node) 1366 { 1367 re_dfa_t *dfa = (re_dfa_t *) extra; 1368 if (node->token.type == CONCAT) 1369 { 1370 node->first = node->left->first; 1371 node->node_idx = node->left->node_idx; 1372 } 1373 else 1374 { 1375 node->first = node; 1376 node->node_idx = re_dfa_add_node (dfa, node->token); 1377 if (BE (node->node_idx == REG_MISSING, 0)) 1378 return REG_ESPACE; 1379 if (node->token.type == ANCHOR) 1380 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type; 1381 } 1382 return REG_NOERROR; 1383 } 1384 1385 /* Pass 2: compute NEXT on the tree. Preorder visit. */ 1386 static reg_errcode_t 1387 calc_next (void *extra, bin_tree_t *node) 1388 { 1389 switch (node->token.type) 1390 { 1391 case OP_DUP_ASTERISK: 1392 node->left->next = node; 1393 break; 1394 case CONCAT: 1395 node->left->next = node->right->first; 1396 node->right->next = node->next; 1397 break; 1398 default: 1399 if (node->left) 1400 node->left->next = node->next; 1401 if (node->right) 1402 node->right->next = node->next; 1403 break; 1404 } 1405 return REG_NOERROR; 1406 } 1407 1408 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */ 1409 static reg_errcode_t 1410 link_nfa_nodes (void *extra, bin_tree_t *node) 1411 { 1412 re_dfa_t *dfa = (re_dfa_t *) extra; 1413 Idx idx = node->node_idx; 1414 reg_errcode_t err = REG_NOERROR; 1415 1416 switch (node->token.type) 1417 { 1418 case CONCAT: 1419 break; 1420 1421 case END_OF_RE: 1422 assert (node->next == NULL); 1423 break; 1424 1425 case OP_DUP_ASTERISK: 1426 case OP_ALT: 1427 { 1428 Idx left, right; 1429 dfa->has_plural_match = 1; 1430 if (node->left != NULL) 1431 left = node->left->first->node_idx; 1432 else 1433 left = node->next->node_idx; 1434 if (node->right != NULL) 1435 right = node->right->first->node_idx; 1436 else 1437 right = node->next->node_idx; 1438 assert (REG_VALID_INDEX (left)); 1439 assert (REG_VALID_INDEX (right)); 1440 err = re_node_set_init_2 (dfa->edests + idx, left, right); 1441 } 1442 break; 1443 1444 case ANCHOR: 1445 case OP_OPEN_SUBEXP: 1446 case OP_CLOSE_SUBEXP: 1447 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx); 1448 break; 1449 1450 case OP_BACK_REF: 1451 dfa->nexts[idx] = node->next->node_idx; 1452 if (node->token.type == OP_BACK_REF) 1453 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]); 1454 break; 1455 1456 default: 1457 assert (!IS_EPSILON_NODE (node->token.type)); 1458 dfa->nexts[idx] = node->next->node_idx; 1459 break; 1460 } 1461 1462 return err; 1463 } 1464 1465 /* Duplicate the epsilon closure of the node ROOT_NODE. 1466 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition 1467 to their own constraint. */ 1468 1469 static reg_errcode_t 1470 internal_function 1471 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node, 1472 Idx root_node, unsigned int init_constraint) 1473 { 1474 Idx org_node, clone_node; 1475 bool ok; 1476 unsigned int constraint = init_constraint; 1477 for (org_node = top_org_node, clone_node = top_clone_node;;) 1478 { 1479 Idx org_dest, clone_dest; 1480 if (dfa->nodes[org_node].type == OP_BACK_REF) 1481 { 1482 /* If the back reference epsilon-transit, its destination must 1483 also have the constraint. Then duplicate the epsilon closure 1484 of the destination of the back reference, and store it in 1485 edests of the back reference. */ 1486 org_dest = dfa->nexts[org_node]; 1487 re_node_set_empty (dfa->edests + clone_node); 1488 clone_dest = duplicate_node (dfa, org_dest, constraint); 1489 if (BE (clone_dest == REG_MISSING, 0)) 1490 return REG_ESPACE; 1491 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1492 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1493 if (BE (! ok, 0)) 1494 return REG_ESPACE; 1495 } 1496 else if (dfa->edests[org_node].nelem == 0) 1497 { 1498 /* In case of the node can't epsilon-transit, don't duplicate the 1499 destination and store the original destination as the 1500 destination of the node. */ 1501 dfa->nexts[clone_node] = dfa->nexts[org_node]; 1502 break; 1503 } 1504 else if (dfa->edests[org_node].nelem == 1) 1505 { 1506 /* In case of the node can epsilon-transit, and it has only one 1507 destination. */ 1508 org_dest = dfa->edests[org_node].elems[0]; 1509 re_node_set_empty (dfa->edests + clone_node); 1510 clone_dest = search_duplicated_node (dfa, org_dest, constraint); 1511 /* If the node is root_node itself, it means the epsilon closure 1512 has a loop. Then tie it to the destination of the root_node. */ 1513 if (org_node == root_node && clone_node != org_node) 1514 { 1515 ok = re_node_set_insert (dfa->edests + clone_node, org_dest); 1516 if (BE (! ok, 0)) 1517 return REG_ESPACE; 1518 break; 1519 } 1520 /* In case the node has another constraint, append it. */ 1521 constraint |= dfa->nodes[org_node].constraint; 1522 clone_dest = duplicate_node (dfa, org_dest, constraint); 1523 if (BE (clone_dest == REG_MISSING, 0)) 1524 return REG_ESPACE; 1525 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1526 if (BE (! ok, 0)) 1527 return REG_ESPACE; 1528 } 1529 else /* dfa->edests[org_node].nelem == 2 */ 1530 { 1531 /* In case of the node can epsilon-transit, and it has two 1532 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */ 1533 org_dest = dfa->edests[org_node].elems[0]; 1534 re_node_set_empty (dfa->edests + clone_node); 1535 /* Search for a duplicated node which satisfies the constraint. */ 1536 clone_dest = search_duplicated_node (dfa, org_dest, constraint); 1537 if (clone_dest == REG_MISSING) 1538 { 1539 /* There is no such duplicated node, create a new one. */ 1540 reg_errcode_t err; 1541 clone_dest = duplicate_node (dfa, org_dest, constraint); 1542 if (BE (clone_dest == REG_MISSING, 0)) 1543 return REG_ESPACE; 1544 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1545 if (BE (! ok, 0)) 1546 return REG_ESPACE; 1547 err = duplicate_node_closure (dfa, org_dest, clone_dest, 1548 root_node, constraint); 1549 if (BE (err != REG_NOERROR, 0)) 1550 return err; 1551 } 1552 else 1553 { 1554 /* There is a duplicated node which satisfy the constraint, 1555 use it to avoid infinite loop. */ 1556 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1557 if (BE (! ok, 0)) 1558 return REG_ESPACE; 1559 } 1560 1561 org_dest = dfa->edests[org_node].elems[1]; 1562 clone_dest = duplicate_node (dfa, org_dest, constraint); 1563 if (BE (clone_dest == REG_MISSING, 0)) 1564 return REG_ESPACE; 1565 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest); 1566 if (BE (! ok, 0)) 1567 return REG_ESPACE; 1568 } 1569 org_node = org_dest; 1570 clone_node = clone_dest; 1571 } 1572 return REG_NOERROR; 1573 } 1574 1575 /* Search for a node which is duplicated from the node ORG_NODE, and 1576 satisfies the constraint CONSTRAINT. */ 1577 1578 static Idx 1579 search_duplicated_node (const re_dfa_t *dfa, Idx org_node, 1580 unsigned int constraint) 1581 { 1582 Idx idx; 1583 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx) 1584 { 1585 if (org_node == dfa->org_indices[idx] 1586 && constraint == dfa->nodes[idx].constraint) 1587 return idx; /* Found. */ 1588 } 1589 return REG_MISSING; /* Not found. */ 1590 } 1591 1592 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT. 1593 Return the index of the new node, or REG_MISSING if insufficient storage is 1594 available. */ 1595 1596 static Idx 1597 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint) 1598 { 1599 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]); 1600 if (BE (dup_idx != REG_MISSING, 1)) 1601 { 1602 dfa->nodes[dup_idx].constraint = constraint; 1603 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint; 1604 dfa->nodes[dup_idx].duplicated = 1; 1605 1606 /* Store the index of the original node. */ 1607 dfa->org_indices[dup_idx] = org_idx; 1608 } 1609 return dup_idx; 1610 } 1611 1612 static reg_errcode_t 1613 calc_inveclosure (re_dfa_t *dfa) 1614 { 1615 Idx src, idx; 1616 bool ok; 1617 for (idx = 0; idx < dfa->nodes_len; ++idx) 1618 re_node_set_init_empty (dfa->inveclosures + idx); 1619 1620 for (src = 0; src < dfa->nodes_len; ++src) 1621 { 1622 Idx *elems = dfa->eclosures[src].elems; 1623 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) 1624 { 1625 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src); 1626 if (BE (! ok, 0)) 1627 return REG_ESPACE; 1628 } 1629 } 1630 1631 return REG_NOERROR; 1632 } 1633 1634 /* Calculate "eclosure" for all the node in DFA. */ 1635 1636 static reg_errcode_t 1637 calc_eclosure (re_dfa_t *dfa) 1638 { 1639 Idx node_idx; 1640 bool incomplete; 1641 #ifdef DEBUG 1642 assert (dfa->nodes_len > 0); 1643 #endif 1644 incomplete = false; 1645 /* For each nodes, calculate epsilon closure. */ 1646 for (node_idx = 0; ; ++node_idx) 1647 { 1648 reg_errcode_t err; 1649 re_node_set eclosure_elem; 1650 if (node_idx == dfa->nodes_len) 1651 { 1652 if (!incomplete) 1653 break; 1654 incomplete = false; 1655 node_idx = 0; 1656 } 1657 1658 #ifdef DEBUG 1659 assert (dfa->eclosures[node_idx].nelem != REG_MISSING); 1660 #endif 1661 1662 /* If we have already calculated, skip it. */ 1663 if (dfa->eclosures[node_idx].nelem != 0) 1664 continue; 1665 /* Calculate epsilon closure of `node_idx'. */ 1666 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true); 1667 if (BE (err != REG_NOERROR, 0)) 1668 return err; 1669 1670 if (dfa->eclosures[node_idx].nelem == 0) 1671 { 1672 incomplete = true; 1673 re_node_set_free (&eclosure_elem); 1674 } 1675 } 1676 return REG_NOERROR; 1677 } 1678 1679 /* Calculate epsilon closure of NODE. */ 1680 1681 static reg_errcode_t 1682 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root) 1683 { 1684 reg_errcode_t err; 1685 Idx i; 1686 bool incomplete; 1687 bool ok; 1688 re_node_set eclosure; 1689 incomplete = false; 1690 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1); 1691 if (BE (err != REG_NOERROR, 0)) 1692 return err; 1693 1694 /* This indicates that we are calculating this node now. 1695 We reference this value to avoid infinite loop. */ 1696 dfa->eclosures[node].nelem = REG_MISSING; 1697 1698 /* If the current node has constraints, duplicate all nodes 1699 since they must inherit the constraints. */ 1700 if (dfa->nodes[node].constraint 1701 && dfa->edests[node].nelem 1702 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated) 1703 { 1704 err = duplicate_node_closure (dfa, node, node, node, 1705 dfa->nodes[node].constraint); 1706 if (BE (err != REG_NOERROR, 0)) 1707 return err; 1708 } 1709 1710 /* Expand each epsilon destination nodes. */ 1711 if (IS_EPSILON_NODE(dfa->nodes[node].type)) 1712 for (i = 0; i < dfa->edests[node].nelem; ++i) 1713 { 1714 re_node_set eclosure_elem; 1715 Idx edest = dfa->edests[node].elems[i]; 1716 /* If calculating the epsilon closure of `edest' is in progress, 1717 return intermediate result. */ 1718 if (dfa->eclosures[edest].nelem == REG_MISSING) 1719 { 1720 incomplete = true; 1721 continue; 1722 } 1723 /* If we haven't calculated the epsilon closure of `edest' yet, 1724 calculate now. Otherwise use calculated epsilon closure. */ 1725 if (dfa->eclosures[edest].nelem == 0) 1726 { 1727 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false); 1728 if (BE (err != REG_NOERROR, 0)) 1729 return err; 1730 } 1731 else 1732 eclosure_elem = dfa->eclosures[edest]; 1733 /* Merge the epsilon closure of `edest'. */ 1734 re_node_set_merge (&eclosure, &eclosure_elem); 1735 /* If the epsilon closure of `edest' is incomplete, 1736 the epsilon closure of this node is also incomplete. */ 1737 if (dfa->eclosures[edest].nelem == 0) 1738 { 1739 incomplete = true; 1740 re_node_set_free (&eclosure_elem); 1741 } 1742 } 1743 1744 /* Epsilon closures include itself. */ 1745 ok = re_node_set_insert (&eclosure, node); 1746 if (BE (! ok, 0)) 1747 return REG_ESPACE; 1748 if (incomplete && !root) 1749 dfa->eclosures[node].nelem = 0; 1750 else 1751 dfa->eclosures[node] = eclosure; 1752 *new_set = eclosure; 1753 return REG_NOERROR; 1754 } 1755 1756 /* Functions for token which are used in the parser. */ 1758 1759 /* Fetch a token from INPUT. 1760 We must not use this function inside bracket expressions. */ 1761 1762 static void 1763 internal_function 1764 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax) 1765 { 1766 re_string_skip_bytes (input, peek_token (result, input, syntax)); 1767 } 1768 1769 /* Peek a token from INPUT, and return the length of the token. 1770 We must not use this function inside bracket expressions. */ 1771 1772 static int 1773 internal_function 1774 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 1775 { 1776 unsigned char c; 1777 1778 if (re_string_eoi (input)) 1779 { 1780 token->type = END_OF_RE; 1781 return 0; 1782 } 1783 1784 c = re_string_peek_byte (input, 0); 1785 token->opr.c = c; 1786 1787 token->word_char = 0; 1788 #ifdef RE_ENABLE_I18N 1789 token->mb_partial = 0; 1790 if (input->mb_cur_max > 1 && 1791 !re_string_first_byte (input, re_string_cur_idx (input))) 1792 { 1793 token->type = CHARACTER; 1794 token->mb_partial = 1; 1795 return 1; 1796 } 1797 #endif 1798 if (c == '\\') 1799 { 1800 unsigned char c2; 1801 if (re_string_cur_idx (input) + 1 >= re_string_length (input)) 1802 { 1803 token->type = BACK_SLASH; 1804 return 1; 1805 } 1806 1807 c2 = re_string_peek_byte_case (input, 1); 1808 token->opr.c = c2; 1809 token->type = CHARACTER; 1810 #ifdef RE_ENABLE_I18N 1811 if (input->mb_cur_max > 1) 1812 { 1813 wint_t wc = re_string_wchar_at (input, 1814 re_string_cur_idx (input) + 1); 1815 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1816 } 1817 else 1818 #endif 1819 token->word_char = IS_WORD_CHAR (c2) != 0; 1820 1821 switch (c2) 1822 { 1823 case '|': 1824 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR)) 1825 token->type = OP_ALT; 1826 break; 1827 case '1': case '2': case '3': case '4': case '5': 1828 case '6': case '7': case '8': case '9': 1829 if (!(syntax & RE_NO_BK_REFS)) 1830 { 1831 token->type = OP_BACK_REF; 1832 token->opr.idx = c2 - '1'; 1833 } 1834 break; 1835 case '<': 1836 if (!(syntax & RE_NO_GNU_OPS)) 1837 { 1838 token->type = ANCHOR; 1839 token->opr.ctx_type = WORD_FIRST; 1840 } 1841 break; 1842 case '>': 1843 if (!(syntax & RE_NO_GNU_OPS)) 1844 { 1845 token->type = ANCHOR; 1846 token->opr.ctx_type = WORD_LAST; 1847 } 1848 break; 1849 case 'b': 1850 if (!(syntax & RE_NO_GNU_OPS)) 1851 { 1852 token->type = ANCHOR; 1853 token->opr.ctx_type = WORD_DELIM; 1854 } 1855 break; 1856 case 'B': 1857 if (!(syntax & RE_NO_GNU_OPS)) 1858 { 1859 token->type = ANCHOR; 1860 token->opr.ctx_type = NOT_WORD_DELIM; 1861 } 1862 break; 1863 case 'w': 1864 if (!(syntax & RE_NO_GNU_OPS)) 1865 token->type = OP_WORD; 1866 break; 1867 case 'W': 1868 if (!(syntax & RE_NO_GNU_OPS)) 1869 token->type = OP_NOTWORD; 1870 break; 1871 case 's': 1872 if (!(syntax & RE_NO_GNU_OPS)) 1873 token->type = OP_SPACE; 1874 break; 1875 case 'S': 1876 if (!(syntax & RE_NO_GNU_OPS)) 1877 token->type = OP_NOTSPACE; 1878 break; 1879 case '`': 1880 if (!(syntax & RE_NO_GNU_OPS)) 1881 { 1882 token->type = ANCHOR; 1883 token->opr.ctx_type = BUF_FIRST; 1884 } 1885 break; 1886 case '\'': 1887 if (!(syntax & RE_NO_GNU_OPS)) 1888 { 1889 token->type = ANCHOR; 1890 token->opr.ctx_type = BUF_LAST; 1891 } 1892 break; 1893 case '(': 1894 if (!(syntax & RE_NO_BK_PARENS)) 1895 token->type = OP_OPEN_SUBEXP; 1896 break; 1897 case ')': 1898 if (!(syntax & RE_NO_BK_PARENS)) 1899 token->type = OP_CLOSE_SUBEXP; 1900 break; 1901 case '+': 1902 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1903 token->type = OP_DUP_PLUS; 1904 break; 1905 case '?': 1906 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM)) 1907 token->type = OP_DUP_QUESTION; 1908 break; 1909 case '{': 1910 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1911 token->type = OP_OPEN_DUP_NUM; 1912 break; 1913 case '}': 1914 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES))) 1915 token->type = OP_CLOSE_DUP_NUM; 1916 break; 1917 default: 1918 break; 1919 } 1920 return 2; 1921 } 1922 1923 token->type = CHARACTER; 1924 #ifdef RE_ENABLE_I18N 1925 if (input->mb_cur_max > 1) 1926 { 1927 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input)); 1928 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0; 1929 } 1930 else 1931 #endif 1932 token->word_char = IS_WORD_CHAR (token->opr.c); 1933 1934 switch (c) 1935 { 1936 case '\n': 1937 if (syntax & RE_NEWLINE_ALT) 1938 token->type = OP_ALT; 1939 break; 1940 case '|': 1941 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR)) 1942 token->type = OP_ALT; 1943 break; 1944 case '*': 1945 token->type = OP_DUP_ASTERISK; 1946 break; 1947 case '+': 1948 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1949 token->type = OP_DUP_PLUS; 1950 break; 1951 case '?': 1952 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM)) 1953 token->type = OP_DUP_QUESTION; 1954 break; 1955 case '{': 1956 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1957 token->type = OP_OPEN_DUP_NUM; 1958 break; 1959 case '}': 1960 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 1961 token->type = OP_CLOSE_DUP_NUM; 1962 break; 1963 case '(': 1964 if (syntax & RE_NO_BK_PARENS) 1965 token->type = OP_OPEN_SUBEXP; 1966 break; 1967 case ')': 1968 if (syntax & RE_NO_BK_PARENS) 1969 token->type = OP_CLOSE_SUBEXP; 1970 break; 1971 case '[': 1972 token->type = OP_OPEN_BRACKET; 1973 break; 1974 case '.': 1975 token->type = OP_PERIOD; 1976 break; 1977 case '^': 1978 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) && 1979 re_string_cur_idx (input) != 0) 1980 { 1981 char prev = re_string_peek_byte (input, -1); 1982 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n') 1983 break; 1984 } 1985 token->type = ANCHOR; 1986 token->opr.ctx_type = LINE_FIRST; 1987 break; 1988 case '$': 1989 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) && 1990 re_string_cur_idx (input) + 1 != re_string_length (input)) 1991 { 1992 re_token_t next; 1993 re_string_skip_bytes (input, 1); 1994 peek_token (&next, input, syntax); 1995 re_string_skip_bytes (input, -1); 1996 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP) 1997 break; 1998 } 1999 token->type = ANCHOR; 2000 token->opr.ctx_type = LINE_LAST; 2001 break; 2002 default: 2003 break; 2004 } 2005 return 1; 2006 } 2007 2008 /* Peek a token from INPUT, and return the length of the token. 2009 We must not use this function out of bracket expressions. */ 2010 2011 static int 2012 internal_function 2013 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax) 2014 { 2015 unsigned char c; 2016 if (re_string_eoi (input)) 2017 { 2018 token->type = END_OF_RE; 2019 return 0; 2020 } 2021 c = re_string_peek_byte (input, 0); 2022 token->opr.c = c; 2023 2024 #ifdef RE_ENABLE_I18N 2025 if (input->mb_cur_max > 1 && 2026 !re_string_first_byte (input, re_string_cur_idx (input))) 2027 { 2028 token->type = CHARACTER; 2029 return 1; 2030 } 2031 #endif /* RE_ENABLE_I18N */ 2032 2033 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) 2034 && re_string_cur_idx (input) + 1 < re_string_length (input)) 2035 { 2036 /* In this case, '\' escape a character. */ 2037 unsigned char c2; 2038 re_string_skip_bytes (input, 1); 2039 c2 = re_string_peek_byte (input, 0); 2040 token->opr.c = c2; 2041 token->type = CHARACTER; 2042 return 1; 2043 } 2044 if (c == '[') /* '[' is a special char in a bracket exps. */ 2045 { 2046 unsigned char c2; 2047 int token_len; 2048 if (re_string_cur_idx (input) + 1 < re_string_length (input)) 2049 c2 = re_string_peek_byte (input, 1); 2050 else 2051 c2 = 0; 2052 token->opr.c = c2; 2053 token_len = 2; 2054 switch (c2) 2055 { 2056 case '.': 2057 token->type = OP_OPEN_COLL_ELEM; 2058 break; 2059 case '=': 2060 token->type = OP_OPEN_EQUIV_CLASS; 2061 break; 2062 case ':': 2063 if (syntax & RE_CHAR_CLASSES) 2064 { 2065 token->type = OP_OPEN_CHAR_CLASS; 2066 break; 2067 } 2068 /* else fall through. */ 2069 default: 2070 token->type = CHARACTER; 2071 token->opr.c = c; 2072 token_len = 1; 2073 break; 2074 } 2075 return token_len; 2076 } 2077 switch (c) 2078 { 2079 case '-': 2080 token->type = OP_CHARSET_RANGE; 2081 break; 2082 case ']': 2083 token->type = OP_CLOSE_BRACKET; 2084 break; 2085 case '^': 2086 token->type = OP_NON_MATCH_LIST; 2087 break; 2088 default: 2089 token->type = CHARACTER; 2090 } 2091 return 1; 2092 } 2093 2094 /* Functions for parser. */ 2096 2097 /* Entry point of the parser. 2098 Parse the regular expression REGEXP and return the structure tree. 2099 If an error is occured, ERR is set by error code, and return NULL. 2100 This function build the following tree, from regular expression <reg_exp>: 2101 CAT 2102 / \ 2103 / \ 2104 <reg_exp> EOR 2105 2106 CAT means concatenation. 2107 EOR means end of regular expression. */ 2108 2109 static bin_tree_t * 2110 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax, 2111 reg_errcode_t *err) 2112 { 2113 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2114 bin_tree_t *tree, *eor, *root; 2115 re_token_t current_token; 2116 dfa->syntax = syntax; 2117 fetch_token (¤t_token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2118 tree = parse_reg_exp (regexp, preg, ¤t_token, syntax, 0, err); 2119 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2120 return NULL; 2121 eor = create_tree (dfa, NULL, NULL, END_OF_RE); 2122 if (tree != NULL) 2123 root = create_tree (dfa, tree, eor, CONCAT); 2124 else 2125 root = eor; 2126 if (BE (eor == NULL || root == NULL, 0)) 2127 { 2128 *err = REG_ESPACE; 2129 return NULL; 2130 } 2131 return root; 2132 } 2133 2134 /* This function build the following tree, from regular expression 2135 <branch1>|<branch2>: 2136 ALT 2137 / \ 2138 / \ 2139 <branch1> <branch2> 2140 2141 ALT means alternative, which represents the operator `|'. */ 2142 2143 static bin_tree_t * 2144 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2145 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2146 { 2147 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2148 bin_tree_t *tree, *branch = NULL; 2149 tree = parse_branch (regexp, preg, token, syntax, nest, err); 2150 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2151 return NULL; 2152 2153 while (token->type == OP_ALT) 2154 { 2155 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2156 if (token->type != OP_ALT && token->type != END_OF_RE 2157 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2158 { 2159 branch = parse_branch (regexp, preg, token, syntax, nest, err); 2160 if (BE (*err != REG_NOERROR && branch == NULL, 0)) 2161 return NULL; 2162 } 2163 else 2164 branch = NULL; 2165 tree = create_tree (dfa, tree, branch, OP_ALT); 2166 if (BE (tree == NULL, 0)) 2167 { 2168 *err = REG_ESPACE; 2169 return NULL; 2170 } 2171 } 2172 return tree; 2173 } 2174 2175 /* This function build the following tree, from regular expression 2176 <exp1><exp2>: 2177 CAT 2178 / \ 2179 / \ 2180 <exp1> <exp2> 2181 2182 CAT means concatenation. */ 2183 2184 static bin_tree_t * 2185 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token, 2186 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2187 { 2188 bin_tree_t *tree, *expr; 2189 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2190 tree = parse_expression (regexp, preg, token, syntax, nest, err); 2191 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2192 return NULL; 2193 2194 while (token->type != OP_ALT && token->type != END_OF_RE 2195 && (nest == 0 || token->type != OP_CLOSE_SUBEXP)) 2196 { 2197 expr = parse_expression (regexp, preg, token, syntax, nest, err); 2198 if (BE (*err != REG_NOERROR && expr == NULL, 0)) 2199 { 2200 return NULL; 2201 } 2202 if (tree != NULL && expr != NULL) 2203 { 2204 tree = create_tree (dfa, tree, expr, CONCAT); 2205 if (tree == NULL) 2206 { 2207 *err = REG_ESPACE; 2208 return NULL; 2209 } 2210 } 2211 else if (tree == NULL) 2212 tree = expr; 2213 /* Otherwise expr == NULL, we don't need to create new tree. */ 2214 } 2215 return tree; 2216 } 2217 2218 /* This function build the following tree, from regular expression a*: 2219 * 2220 | 2221 a 2222 */ 2223 2224 static bin_tree_t * 2225 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token, 2226 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2227 { 2228 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2229 bin_tree_t *tree; 2230 switch (token->type) 2231 { 2232 case CHARACTER: 2233 tree = create_token_tree (dfa, NULL, NULL, token); 2234 if (BE (tree == NULL, 0)) 2235 { 2236 *err = REG_ESPACE; 2237 return NULL; 2238 } 2239 #ifdef RE_ENABLE_I18N 2240 if (dfa->mb_cur_max > 1) 2241 { 2242 while (!re_string_eoi (regexp) 2243 && !re_string_first_byte (regexp, re_string_cur_idx (regexp))) 2244 { 2245 bin_tree_t *mbc_remain; 2246 fetch_token (token, regexp, syntax); 2247 mbc_remain = create_token_tree (dfa, NULL, NULL, token); 2248 tree = create_tree (dfa, tree, mbc_remain, CONCAT); 2249 if (BE (mbc_remain == NULL || tree == NULL, 0)) 2250 { 2251 *err = REG_ESPACE; 2252 return NULL; 2253 } 2254 } 2255 } 2256 #endif 2257 break; 2258 case OP_OPEN_SUBEXP: 2259 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err); 2260 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2261 return NULL; 2262 break; 2263 case OP_OPEN_BRACKET: 2264 tree = parse_bracket_exp (regexp, dfa, token, syntax, err); 2265 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2266 return NULL; 2267 break; 2268 case OP_BACK_REF: 2269 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1)) 2270 { 2271 *err = REG_ESUBREG; 2272 return NULL; 2273 } 2274 dfa->used_bkref_map |= 1 << token->opr.idx; 2275 tree = create_token_tree (dfa, NULL, NULL, token); 2276 if (BE (tree == NULL, 0)) 2277 { 2278 *err = REG_ESPACE; 2279 return NULL; 2280 } 2281 ++dfa->nbackref; 2282 dfa->has_mb_node = 1; 2283 break; 2284 case OP_OPEN_DUP_NUM: 2285 if (syntax & RE_CONTEXT_INVALID_DUP) 2286 { 2287 *err = REG_BADRPT; 2288 return NULL; 2289 } 2290 /* FALLTHROUGH */ 2291 case OP_DUP_ASTERISK: 2292 case OP_DUP_PLUS: 2293 case OP_DUP_QUESTION: 2294 if (syntax & RE_CONTEXT_INVALID_OPS) 2295 { 2296 *err = REG_BADRPT; 2297 return NULL; 2298 } 2299 else if (syntax & RE_CONTEXT_INDEP_OPS) 2300 { 2301 fetch_token (token, regexp, syntax); 2302 return parse_expression (regexp, preg, token, syntax, nest, err); 2303 } 2304 /* else fall through */ 2305 case OP_CLOSE_SUBEXP: 2306 if ((token->type == OP_CLOSE_SUBEXP) && 2307 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)) 2308 { 2309 *err = REG_ERPAREN; 2310 return NULL; 2311 } 2312 /* else fall through */ 2313 case OP_CLOSE_DUP_NUM: 2314 /* We treat it as a normal character. */ 2315 2316 /* Then we can these characters as normal characters. */ 2317 token->type = CHARACTER; 2318 /* mb_partial and word_char bits should be initialized already 2319 by peek_token. */ 2320 tree = create_token_tree (dfa, NULL, NULL, token); 2321 if (BE (tree == NULL, 0)) 2322 { 2323 *err = REG_ESPACE; 2324 return NULL; 2325 } 2326 break; 2327 case ANCHOR: 2328 if ((token->opr.ctx_type 2329 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST)) 2330 && dfa->word_ops_used == 0) 2331 init_word_char (dfa); 2332 if (token->opr.ctx_type == WORD_DELIM 2333 || token->opr.ctx_type == NOT_WORD_DELIM) 2334 { 2335 bin_tree_t *tree_first, *tree_last; 2336 if (token->opr.ctx_type == WORD_DELIM) 2337 { 2338 token->opr.ctx_type = WORD_FIRST; 2339 tree_first = create_token_tree (dfa, NULL, NULL, token); 2340 token->opr.ctx_type = WORD_LAST; 2341 } 2342 else 2343 { 2344 token->opr.ctx_type = INSIDE_WORD; 2345 tree_first = create_token_tree (dfa, NULL, NULL, token); 2346 token->opr.ctx_type = INSIDE_NOTWORD; 2347 } 2348 tree_last = create_token_tree (dfa, NULL, NULL, token); 2349 tree = create_tree (dfa, tree_first, tree_last, OP_ALT); 2350 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0)) 2351 { 2352 *err = REG_ESPACE; 2353 return NULL; 2354 } 2355 } 2356 else 2357 { 2358 tree = create_token_tree (dfa, NULL, NULL, token); 2359 if (BE (tree == NULL, 0)) 2360 { 2361 *err = REG_ESPACE; 2362 return NULL; 2363 } 2364 } 2365 /* We must return here, since ANCHORs can't be followed 2366 by repetition operators. 2367 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>", 2368 it must not be "<ANCHOR(^)><REPEAT(*)>". */ 2369 fetch_token (token, regexp, syntax); 2370 return tree; 2371 case OP_PERIOD: 2372 tree = create_token_tree (dfa, NULL, NULL, token); 2373 if (BE (tree == NULL, 0)) 2374 { 2375 *err = REG_ESPACE; 2376 return NULL; 2377 } 2378 if (dfa->mb_cur_max > 1) 2379 dfa->has_mb_node = 1; 2380 break; 2381 case OP_WORD: 2382 case OP_NOTWORD: 2383 tree = build_charclass_op (dfa, regexp->trans, 2384 (const unsigned char *) "alnum", 2385 (const unsigned char *) "_", 2386 token->type == OP_NOTWORD, err); 2387 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2388 return NULL; 2389 break; 2390 case OP_SPACE: 2391 case OP_NOTSPACE: 2392 tree = build_charclass_op (dfa, regexp->trans, 2393 (const unsigned char *) "space", 2394 (const unsigned char *) "", 2395 token->type == OP_NOTSPACE, err); 2396 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2397 return NULL; 2398 break; 2399 case OP_ALT: 2400 case END_OF_RE: 2401 return NULL; 2402 case BACK_SLASH: 2403 *err = REG_EESCAPE; 2404 return NULL; 2405 default: 2406 /* Must not happen? */ 2407 #ifdef DEBUG 2408 assert (0); 2409 #endif 2410 return NULL; 2411 } 2412 fetch_token (token, regexp, syntax); 2413 2414 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS 2415 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM) 2416 { 2417 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err); 2418 if (BE (*err != REG_NOERROR && tree == NULL, 0)) 2419 return NULL; 2420 /* In BRE consecutive duplications are not allowed. */ 2421 if ((syntax & RE_CONTEXT_INVALID_DUP) 2422 && (token->type == OP_DUP_ASTERISK 2423 || token->type == OP_OPEN_DUP_NUM)) 2424 { 2425 *err = REG_BADRPT; 2426 return NULL; 2427 } 2428 } 2429 2430 return tree; 2431 } 2432 2433 /* This function build the following tree, from regular expression 2434 (<reg_exp>): 2435 SUBEXP 2436 | 2437 <reg_exp> 2438 */ 2439 2440 static bin_tree_t * 2441 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token, 2442 reg_syntax_t syntax, Idx nest, reg_errcode_t *err) 2443 { 2444 re_dfa_t *dfa = (re_dfa_t *) preg->buffer; 2445 bin_tree_t *tree; 2446 size_t cur_nsub; 2447 cur_nsub = preg->re_nsub++; 2448 2449 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE); 2450 2451 /* The subexpression may be a null string. */ 2452 if (token->type == OP_CLOSE_SUBEXP) 2453 tree = NULL; 2454 else 2455 { 2456 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err); 2457 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0)) 2458 *err = REG_EPAREN; 2459 if (BE (*err != REG_NOERROR, 0)) 2460 return NULL; 2461 } 2462 2463 if (cur_nsub <= '9' - '1') 2464 dfa->completed_bkref_map |= 1 << cur_nsub; 2465 2466 tree = create_tree (dfa, tree, NULL, SUBEXP); 2467 if (BE (tree == NULL, 0)) 2468 { 2469 *err = REG_ESPACE; 2470 return NULL; 2471 } 2472 tree->token.opr.idx = cur_nsub; 2473 return tree; 2474 } 2475 2476 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */ 2477 2478 static bin_tree_t * 2479 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa, 2480 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err) 2481 { 2482 bin_tree_t *tree = NULL, *old_tree = NULL; 2483 Idx i, start, end, start_idx = re_string_cur_idx (regexp); 2484 re_token_t start_token = *token; 2485 2486 if (token->type == OP_OPEN_DUP_NUM) 2487 { 2488 end = 0; 2489 start = fetch_number (regexp, token, syntax); 2490 if (start == REG_MISSING) 2491 { 2492 if (token->type == CHARACTER && token->opr.c == ',') 2493 start = 0; /* We treat "{,m}" as "{0,m}". */ 2494 else 2495 { 2496 *err = REG_BADBR; /* <re>{} is invalid. */ 2497 return NULL; 2498 } 2499 } 2500 if (BE (start != REG_ERROR, 1)) 2501 { 2502 /* We treat "{n}" as "{n,n}". */ 2503 end = ((token->type == OP_CLOSE_DUP_NUM) ? start 2504 : ((token->type == CHARACTER && token->opr.c == ',') 2505 ? fetch_number (regexp, token, syntax) : REG_ERROR)); 2506 } 2507 if (BE (start == REG_ERROR || end == REG_ERROR, 0)) 2508 { 2509 /* Invalid sequence. */ 2510 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0)) 2511 { 2512 if (token->type == END_OF_RE) 2513 *err = REG_EBRACE; 2514 else 2515 *err = REG_BADBR; 2516 2517 return NULL; 2518 } 2519 2520 /* If the syntax bit is set, rollback. */ 2521 re_string_set_index (regexp, start_idx); 2522 *token = start_token; 2523 token->type = CHARACTER; 2524 /* mb_partial and word_char bits should be already initialized by 2525 peek_token. */ 2526 return elem; 2527 } 2528 2529 if (BE (end != REG_MISSING && start > end, 0)) 2530 { 2531 /* First number greater than second. */ 2532 *err = REG_BADBR; 2533 return NULL; 2534 } 2535 } 2536 else 2537 { 2538 start = (token->type == OP_DUP_PLUS) ? 1 : 0; 2539 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING; 2540 } 2541 2542 fetch_token (token, regexp, syntax); 2543 2544 if (BE (elem == NULL, 0)) 2545 return NULL; 2546 if (BE (start == 0 && end == 0, 0)) 2547 { 2548 postorder (elem, free_tree, NULL); 2549 return NULL; 2550 } 2551 2552 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */ 2553 if (BE (start > 0, 0)) 2554 { 2555 tree = elem; 2556 for (i = 2; i <= start; ++i) 2557 { 2558 elem = duplicate_tree (elem, dfa); 2559 tree = create_tree (dfa, tree, elem, CONCAT); 2560 if (BE (elem == NULL || tree == NULL, 0)) 2561 goto parse_dup_op_espace; 2562 } 2563 2564 if (start == end) 2565 return tree; 2566 2567 /* Duplicate ELEM before it is marked optional. */ 2568 elem = duplicate_tree (elem, dfa); 2569 old_tree = tree; 2570 } 2571 else 2572 old_tree = NULL; 2573 2574 if (elem->token.type == SUBEXP) 2575 postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx); 2576 2577 tree = create_tree (dfa, elem, NULL, 2578 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT)); 2579 if (BE (tree == NULL, 0)) 2580 goto parse_dup_op_espace; 2581 2582 /* This loop is actually executed only when end != REG_MISSING, 2583 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have 2584 already created the start+1-th copy. */ 2585 if ((Idx) -1 < 0 || end != REG_MISSING) 2586 for (i = start + 2; i <= end; ++i) 2587 { 2588 elem = duplicate_tree (elem, dfa); 2589 tree = create_tree (dfa, tree, elem, CONCAT); 2590 if (BE (elem == NULL || tree == NULL, 0)) 2591 goto parse_dup_op_espace; 2592 2593 tree = create_tree (dfa, tree, NULL, OP_ALT); 2594 if (BE (tree == NULL, 0)) 2595 goto parse_dup_op_espace; 2596 } 2597 2598 if (old_tree) 2599 tree = create_tree (dfa, old_tree, tree, CONCAT); 2600 2601 return tree; 2602 2603 parse_dup_op_espace: 2604 *err = REG_ESPACE; 2605 return NULL; 2606 } 2607 2608 /* Size of the names for collating symbol/equivalence_class/character_class. 2609 I'm not sure, but maybe enough. */ 2610 #define BRACKET_NAME_BUF_SIZE 32 2611 2612 #ifndef _LIBC 2613 /* Local function for parse_bracket_exp only used in case of NOT _LIBC. 2614 Build the range expression which starts from START_ELEM, and ends 2615 at END_ELEM. The result are written to MBCSET and SBCSET. 2616 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2617 mbcset->range_ends, is a pointer argument sinse we may 2618 update it. */ 2619 2620 static reg_errcode_t 2621 internal_function 2622 # ifdef RE_ENABLE_I18N 2623 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc, 2624 bracket_elem_t *start_elem, bracket_elem_t *end_elem) 2625 # else /* not RE_ENABLE_I18N */ 2626 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem, 2627 bracket_elem_t *end_elem) 2628 # endif /* not RE_ENABLE_I18N */ 2629 { 2630 unsigned int start_ch, end_ch; 2631 /* Equivalence Classes and Character Classes can't be a range start/end. */ 2632 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2633 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2634 0)) 2635 return REG_ERANGE; 2636 2637 /* We can handle no multi character collating elements without libc 2638 support. */ 2639 if (BE ((start_elem->type == COLL_SYM 2640 && strlen ((char *) start_elem->opr.name) > 1) 2641 || (end_elem->type == COLL_SYM 2642 && strlen ((char *) end_elem->opr.name) > 1), 0)) 2643 return REG_ECOLLATE; 2644 2645 # ifdef RE_ENABLE_I18N 2646 { 2647 wchar_t wc; 2648 wint_t start_wc; 2649 wint_t end_wc; 2650 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'}; 2651 2652 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch 2653 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2654 : 0)); 2655 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch 2656 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2657 : 0)); 2658 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM) 2659 ? __btowc (start_ch) : start_elem->opr.wch); 2660 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM) 2661 ? __btowc (end_ch) : end_elem->opr.wch); 2662 if (start_wc == WEOF || end_wc == WEOF) 2663 return REG_ECOLLATE; 2664 cmp_buf[0] = start_wc; 2665 cmp_buf[4] = end_wc; 2666 if (wcscoll (cmp_buf, cmp_buf + 4) > 0) 2667 return REG_ERANGE; 2668 2669 /* Got valid collation sequence values, add them as a new entry. 2670 However, for !_LIBC we have no collation elements: if the 2671 character set is single byte, the single byte character set 2672 that we build below suffices. parse_bracket_exp passes 2673 no MBCSET if dfa->mb_cur_max == 1. */ 2674 if (mbcset) 2675 { 2676 /* Check the space of the arrays. */ 2677 if (BE (*range_alloc == mbcset->nranges, 0)) 2678 { 2679 /* There is not enough space, need realloc. */ 2680 wchar_t *new_array_start, *new_array_end; 2681 Idx new_nranges; 2682 2683 /* +1 in case of mbcset->nranges is 0. */ 2684 new_nranges = 2 * mbcset->nranges + 1; 2685 /* Use realloc since mbcset->range_starts and mbcset->range_ends 2686 are NULL if *range_alloc == 0. */ 2687 new_array_start = re_realloc (mbcset->range_starts, wchar_t, 2688 new_nranges); 2689 new_array_end = re_realloc (mbcset->range_ends, wchar_t, 2690 new_nranges); 2691 2692 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2693 return REG_ESPACE; 2694 2695 mbcset->range_starts = new_array_start; 2696 mbcset->range_ends = new_array_end; 2697 *range_alloc = new_nranges; 2698 } 2699 2700 mbcset->range_starts[mbcset->nranges] = start_wc; 2701 mbcset->range_ends[mbcset->nranges++] = end_wc; 2702 } 2703 2704 /* Build the table for single byte characters. */ 2705 for (wc = 0; wc < SBC_MAX; ++wc) 2706 { 2707 cmp_buf[2] = wc; 2708 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0 2709 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0) 2710 bitset_set (sbcset, wc); 2711 } 2712 } 2713 # else /* not RE_ENABLE_I18N */ 2714 { 2715 unsigned int ch; 2716 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch 2717 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0] 2718 : 0)); 2719 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch 2720 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0] 2721 : 0)); 2722 if (start_ch > end_ch) 2723 return REG_ERANGE; 2724 /* Build the table for single byte characters. */ 2725 for (ch = 0; ch < SBC_MAX; ++ch) 2726 if (start_ch <= ch && ch <= end_ch) 2727 bitset_set (sbcset, ch); 2728 } 2729 # endif /* not RE_ENABLE_I18N */ 2730 return REG_NOERROR; 2731 } 2732 #endif /* not _LIBC */ 2733 2734 #ifndef _LIBC 2735 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC.. 2736 Build the collating element which is represented by NAME. 2737 The result are written to MBCSET and SBCSET. 2738 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2739 pointer argument since we may update it. */ 2740 2741 static reg_errcode_t 2742 internal_function 2743 build_collating_symbol (bitset_t sbcset, 2744 # ifdef RE_ENABLE_I18N 2745 re_charset_t *mbcset, Idx *coll_sym_alloc, 2746 # endif 2747 const unsigned char *name) 2748 { 2749 size_t name_len = strlen ((const char *) name); 2750 if (BE (name_len != 1, 0)) 2751 return REG_ECOLLATE; 2752 else 2753 { 2754 bitset_set (sbcset, name[0]); 2755 return REG_NOERROR; 2756 } 2757 } 2758 #endif /* not _LIBC */ 2759 2760 /* This function parse bracket expression like "[abc]", "[a-c]", 2761 "[[.a-a.]]" etc. */ 2762 2763 static bin_tree_t * 2764 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, 2765 reg_syntax_t syntax, reg_errcode_t *err) 2766 { 2767 #ifdef _LIBC 2768 const unsigned char *collseqmb; 2769 const char *collseqwc; 2770 uint32_t nrules; 2771 int32_t table_size; 2772 const int32_t *symb_table; 2773 const unsigned char *extra; 2774 2775 /* Local function for parse_bracket_exp used in _LIBC environement. 2776 Seek the collating symbol entry correspondings to NAME. 2777 Return the index of the symbol in the SYMB_TABLE. */ 2778 2779 auto inline int32_t 2780 __attribute ((always_inline)) 2781 seek_collating_symbol_entry (name, name_len) 2782 const unsigned char *name; 2783 size_t name_len; 2784 { 2785 int32_t hash = elem_hash ((const char *) name, name_len); 2786 int32_t elem = hash % table_size; 2787 if (symb_table[2 * elem] != 0) 2788 { 2789 int32_t second = hash % (table_size - 2) + 1; 2790 2791 do 2792 { 2793 /* First compare the hashing value. */ 2794 if (symb_table[2 * elem] == hash 2795 /* Compare the length of the name. */ 2796 && name_len == extra[symb_table[2 * elem + 1]] 2797 /* Compare the name. */ 2798 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1], 2799 name_len) == 0) 2800 { 2801 /* Yep, this is the entry. */ 2802 break; 2803 } 2804 2805 /* Next entry. */ 2806 elem += second; 2807 } 2808 while (symb_table[2 * elem] != 0); 2809 } 2810 return elem; 2811 } 2812 2813 /* Local function for parse_bracket_exp used in _LIBC environement. 2814 Look up the collation sequence value of BR_ELEM. 2815 Return the value if succeeded, UINT_MAX otherwise. */ 2816 2817 auto inline unsigned int 2818 __attribute ((always_inline)) 2819 lookup_collation_sequence_value (br_elem) 2820 bracket_elem_t *br_elem; 2821 { 2822 if (br_elem->type == SB_CHAR) 2823 { 2824 /* 2825 if (MB_CUR_MAX == 1) 2826 */ 2827 if (nrules == 0) 2828 return collseqmb[br_elem->opr.ch]; 2829 else 2830 { 2831 wint_t wc = __btowc (br_elem->opr.ch); 2832 return __collseq_table_lookup (collseqwc, wc); 2833 } 2834 } 2835 else if (br_elem->type == MB_CHAR) 2836 { 2837 return __collseq_table_lookup (collseqwc, br_elem->opr.wch); 2838 } 2839 else if (br_elem->type == COLL_SYM) 2840 { 2841 size_t sym_name_len = strlen ((char *) br_elem->opr.name); 2842 if (nrules != 0) 2843 { 2844 int32_t elem, idx; 2845 elem = seek_collating_symbol_entry (br_elem->opr.name, 2846 sym_name_len); 2847 if (symb_table[2 * elem] != 0) 2848 { 2849 /* We found the entry. */ 2850 idx = symb_table[2 * elem + 1]; 2851 /* Skip the name of collating element name. */ 2852 idx += 1 + extra[idx]; 2853 /* Skip the byte sequence of the collating element. */ 2854 idx += 1 + extra[idx]; 2855 /* Adjust for the alignment. */ 2856 idx = (idx + 3) & ~3; 2857 /* Skip the multibyte collation sequence value. */ 2858 idx += sizeof (unsigned int); 2859 /* Skip the wide char sequence of the collating element. */ 2860 idx += sizeof (unsigned int) * 2861 (1 + *(unsigned int *) (extra + idx)); 2862 /* Return the collation sequence value. */ 2863 return *(unsigned int *) (extra + idx); 2864 } 2865 else if (symb_table[2 * elem] == 0 && sym_name_len == 1) 2866 { 2867 /* No valid character. Match it as a single byte 2868 character. */ 2869 return collseqmb[br_elem->opr.name[0]]; 2870 } 2871 } 2872 else if (sym_name_len == 1) 2873 return collseqmb[br_elem->opr.name[0]]; 2874 } 2875 return UINT_MAX; 2876 } 2877 2878 /* Local function for parse_bracket_exp used in _LIBC environement. 2879 Build the range expression which starts from START_ELEM, and ends 2880 at END_ELEM. The result are written to MBCSET and SBCSET. 2881 RANGE_ALLOC is the allocated size of mbcset->range_starts, and 2882 mbcset->range_ends, is a pointer argument sinse we may 2883 update it. */ 2884 2885 auto inline reg_errcode_t 2886 __attribute ((always_inline)) 2887 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem) 2888 re_charset_t *mbcset; 2889 Idx *range_alloc; 2890 bitset_t sbcset; 2891 bracket_elem_t *start_elem, *end_elem; 2892 { 2893 unsigned int ch; 2894 uint32_t start_collseq; 2895 uint32_t end_collseq; 2896 2897 /* Equivalence Classes and Character Classes can't be a range 2898 start/end. */ 2899 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS 2900 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS, 2901 0)) 2902 return REG_ERANGE; 2903 2904 start_collseq = lookup_collation_sequence_value (start_elem); 2905 end_collseq = lookup_collation_sequence_value (end_elem); 2906 /* Check start/end collation sequence values. */ 2907 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0)) 2908 return REG_ECOLLATE; 2909 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0)) 2910 return REG_ERANGE; 2911 2912 /* Got valid collation sequence values, add them as a new entry. 2913 However, if we have no collation elements, and the character set 2914 is single byte, the single byte character set that we 2915 build below suffices. */ 2916 if (nrules > 0 || dfa->mb_cur_max > 1) 2917 { 2918 /* Check the space of the arrays. */ 2919 if (BE (*range_alloc == mbcset->nranges, 0)) 2920 { 2921 /* There is not enough space, need realloc. */ 2922 uint32_t *new_array_start; 2923 uint32_t *new_array_end; 2924 Idx new_nranges; 2925 2926 /* +1 in case of mbcset->nranges is 0. */ 2927 new_nranges = 2 * mbcset->nranges + 1; 2928 new_array_start = re_realloc (mbcset->range_starts, uint32_t, 2929 new_nranges); 2930 new_array_end = re_realloc (mbcset->range_ends, uint32_t, 2931 new_nranges); 2932 2933 if (BE (new_array_start == NULL || new_array_end == NULL, 0)) 2934 return REG_ESPACE; 2935 2936 mbcset->range_starts = new_array_start; 2937 mbcset->range_ends = new_array_end; 2938 *range_alloc = new_nranges; 2939 } 2940 2941 mbcset->range_starts[mbcset->nranges] = start_collseq; 2942 mbcset->range_ends[mbcset->nranges++] = end_collseq; 2943 } 2944 2945 /* Build the table for single byte characters. */ 2946 for (ch = 0; ch < SBC_MAX; ch++) 2947 { 2948 uint32_t ch_collseq; 2949 /* 2950 if (MB_CUR_MAX == 1) 2951 */ 2952 if (nrules == 0) 2953 ch_collseq = collseqmb[ch]; 2954 else 2955 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch)); 2956 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq) 2957 bitset_set (sbcset, ch); 2958 } 2959 return REG_NOERROR; 2960 } 2961 2962 /* Local function for parse_bracket_exp used in _LIBC environement. 2963 Build the collating element which is represented by NAME. 2964 The result are written to MBCSET and SBCSET. 2965 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a 2966 pointer argument sinse we may update it. */ 2967 2968 auto inline reg_errcode_t 2969 __attribute ((always_inline)) 2970 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name) 2971 re_charset_t *mbcset; 2972 Idx *coll_sym_alloc; 2973 bitset_t sbcset; 2974 const unsigned char *name; 2975 { 2976 int32_t elem, idx; 2977 size_t name_len = strlen ((const char *) name); 2978 if (nrules != 0) 2979 { 2980 elem = seek_collating_symbol_entry (name, name_len); 2981 if (symb_table[2 * elem] != 0) 2982 { 2983 /* We found the entry. */ 2984 idx = symb_table[2 * elem + 1]; 2985 /* Skip the name of collating element name. */ 2986 idx += 1 + extra[idx]; 2987 } 2988 else if (symb_table[2 * elem] == 0 && name_len == 1) 2989 { 2990 /* No valid character, treat it as a normal 2991 character. */ 2992 bitset_set (sbcset, name[0]); 2993 return REG_NOERROR; 2994 } 2995 else 2996 return REG_ECOLLATE; 2997 2998 /* Got valid collation sequence, add it as a new entry. */ 2999 /* Check the space of the arrays. */ 3000 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0)) 3001 { 3002 /* Not enough, realloc it. */ 3003 /* +1 in case of mbcset->ncoll_syms is 0. */ 3004 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1; 3005 /* Use realloc since mbcset->coll_syms is NULL 3006 if *alloc == 0. */ 3007 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t, 3008 new_coll_sym_alloc); 3009 if (BE (new_coll_syms == NULL, 0)) 3010 return REG_ESPACE; 3011 mbcset->coll_syms = new_coll_syms; 3012 *coll_sym_alloc = new_coll_sym_alloc; 3013 } 3014 mbcset->coll_syms[mbcset->ncoll_syms++] = idx; 3015 return REG_NOERROR; 3016 } 3017 else 3018 { 3019 if (BE (name_len != 1, 0)) 3020 return REG_ECOLLATE; 3021 else 3022 { 3023 bitset_set (sbcset, name[0]); 3024 return REG_NOERROR; 3025 } 3026 } 3027 } 3028 #endif 3029 3030 re_token_t br_token; 3031 re_bitset_ptr_t sbcset; 3032 #ifdef RE_ENABLE_I18N 3033 re_charset_t *mbcset; 3034 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0; 3035 Idx equiv_class_alloc = 0, char_class_alloc = 0; 3036 #endif /* not RE_ENABLE_I18N */ 3037 bool non_match = false; 3038 bin_tree_t *work_tree; 3039 int token_len; 3040 bool first_round = true; 3041 #ifdef _LIBC 3042 collseqmb = (const unsigned char *) 3043 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB); 3044 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3045 if (nrules) 3046 { 3047 /* 3048 if (MB_CUR_MAX > 1) 3049 */ 3050 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC); 3051 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB); 3052 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3053 _NL_COLLATE_SYMB_TABLEMB); 3054 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3055 _NL_COLLATE_SYMB_EXTRAMB); 3056 } 3057 #endif 3058 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3059 #ifdef RE_ENABLE_I18N 3060 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3061 #endif /* RE_ENABLE_I18N */ 3062 #ifdef RE_ENABLE_I18N 3063 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3064 #else 3065 if (BE (sbcset == NULL, 0)) 3066 #endif /* RE_ENABLE_I18N */ 3067 { 3068 *err = REG_ESPACE; 3069 return NULL; 3070 } 3071 3072 token_len = peek_token_bracket (token, regexp, syntax); 3073 if (BE (token->type == END_OF_RE, 0)) 3074 { 3075 *err = REG_BADPAT; 3076 goto parse_bracket_exp_free_return; 3077 } 3078 if (token->type == OP_NON_MATCH_LIST) 3079 { 3080 #ifdef RE_ENABLE_I18N 3081 mbcset->non_match = 1; 3082 #endif /* not RE_ENABLE_I18N */ 3083 non_match = true; 3084 if (syntax & RE_HAT_LISTS_NOT_NEWLINE) 3085 bitset_set (sbcset, '\n'); 3086 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3087 token_len = peek_token_bracket (token, regexp, syntax); 3088 if (BE (token->type == END_OF_RE, 0)) 3089 { 3090 *err = REG_BADPAT; 3091 goto parse_bracket_exp_free_return; 3092 } 3093 } 3094 3095 /* We treat the first ']' as a normal character. */ 3096 if (token->type == OP_CLOSE_BRACKET) 3097 token->type = CHARACTER; 3098 3099 while (1) 3100 { 3101 bracket_elem_t start_elem, end_elem; 3102 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE]; 3103 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE]; 3104 reg_errcode_t ret; 3105 int token_len2 = 0; 3106 bool is_range_exp = false; 3107 re_token_t token2; 3108 3109 start_elem.opr.name = start_name_buf; 3110 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa, 3111 syntax, first_round); 3112 if (BE (ret != REG_NOERROR, 0)) 3113 { 3114 *err = ret; 3115 goto parse_bracket_exp_free_return; 3116 } 3117 first_round = false; 3118 3119 /* Get information about the next token. We need it in any case. */ 3120 token_len = peek_token_bracket (token, regexp, syntax); 3121 3122 /* Do not check for ranges if we know they are not allowed. */ 3123 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS) 3124 { 3125 if (BE (token->type == END_OF_RE, 0)) 3126 { 3127 *err = REG_EBRACK; 3128 goto parse_bracket_exp_free_return; 3129 } 3130 if (token->type == OP_CHARSET_RANGE) 3131 { 3132 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */ 3133 token_len2 = peek_token_bracket (&token2, regexp, syntax); 3134 if (BE (token2.type == END_OF_RE, 0)) 3135 { 3136 *err = REG_EBRACK; 3137 goto parse_bracket_exp_free_return; 3138 } 3139 if (token2.type == OP_CLOSE_BRACKET) 3140 { 3141 /* We treat the last '-' as a normal character. */ 3142 re_string_skip_bytes (regexp, -token_len); 3143 token->type = CHARACTER; 3144 } 3145 else 3146 is_range_exp = true; 3147 } 3148 } 3149 3150 if (is_range_exp == true) 3151 { 3152 end_elem.opr.name = end_name_buf; 3153 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2, 3154 dfa, syntax, true); 3155 if (BE (ret != REG_NOERROR, 0)) 3156 { 3157 *err = ret; 3158 goto parse_bracket_exp_free_return; 3159 } 3160 3161 token_len = peek_token_bracket (token, regexp, syntax); 3162 3163 #ifdef _LIBC 3164 *err = build_range_exp (sbcset, mbcset, &range_alloc, 3165 &start_elem, &end_elem); 3166 #else 3167 # ifdef RE_ENABLE_I18N 3168 *err = build_range_exp (sbcset, 3169 dfa->mb_cur_max > 1 ? mbcset : NULL, 3170 &range_alloc, &start_elem, &end_elem); 3171 # else 3172 *err = build_range_exp (sbcset, &start_elem, &end_elem); 3173 # endif 3174 #endif /* RE_ENABLE_I18N */ 3175 if (BE (*err != REG_NOERROR, 0)) 3176 goto parse_bracket_exp_free_return; 3177 } 3178 else 3179 { 3180 switch (start_elem.type) 3181 { 3182 case SB_CHAR: 3183 bitset_set (sbcset, start_elem.opr.ch); 3184 break; 3185 #ifdef RE_ENABLE_I18N 3186 case MB_CHAR: 3187 /* Check whether the array has enough space. */ 3188 if (BE (mbchar_alloc == mbcset->nmbchars, 0)) 3189 { 3190 wchar_t *new_mbchars; 3191 /* Not enough, realloc it. */ 3192 /* +1 in case of mbcset->nmbchars is 0. */ 3193 mbchar_alloc = 2 * mbcset->nmbchars + 1; 3194 /* Use realloc since array is NULL if *alloc == 0. */ 3195 new_mbchars = re_realloc (mbcset->mbchars, wchar_t, 3196 mbchar_alloc); 3197 if (BE (new_mbchars == NULL, 0)) 3198 goto parse_bracket_exp_espace; 3199 mbcset->mbchars = new_mbchars; 3200 } 3201 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch; 3202 break; 3203 #endif /* RE_ENABLE_I18N */ 3204 case EQUIV_CLASS: 3205 *err = build_equiv_class (sbcset, 3206 #ifdef RE_ENABLE_I18N 3207 mbcset, &equiv_class_alloc, 3208 #endif /* RE_ENABLE_I18N */ 3209 start_elem.opr.name); 3210 if (BE (*err != REG_NOERROR, 0)) 3211 goto parse_bracket_exp_free_return; 3212 break; 3213 case COLL_SYM: 3214 *err = build_collating_symbol (sbcset, 3215 #ifdef RE_ENABLE_I18N 3216 mbcset, &coll_sym_alloc, 3217 #endif /* RE_ENABLE_I18N */ 3218 start_elem.opr.name); 3219 if (BE (*err != REG_NOERROR, 0)) 3220 goto parse_bracket_exp_free_return; 3221 break; 3222 case CHAR_CLASS: 3223 *err = build_charclass (regexp->trans, sbcset, 3224 #ifdef RE_ENABLE_I18N 3225 mbcset, &char_class_alloc, 3226 #endif /* RE_ENABLE_I18N */ 3227 start_elem.opr.name, syntax); 3228 if (BE (*err != REG_NOERROR, 0)) 3229 goto parse_bracket_exp_free_return; 3230 break; 3231 default: 3232 assert (0); 3233 break; 3234 } 3235 } 3236 if (BE (token->type == END_OF_RE, 0)) 3237 { 3238 *err = REG_EBRACK; 3239 goto parse_bracket_exp_free_return; 3240 } 3241 if (token->type == OP_CLOSE_BRACKET) 3242 break; 3243 } 3244 3245 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3246 3247 /* If it is non-matching list. */ 3248 if (non_match) 3249 bitset_not (sbcset); 3250 3251 #ifdef RE_ENABLE_I18N 3252 /* Ensure only single byte characters are set. */ 3253 if (dfa->mb_cur_max > 1) 3254 bitset_mask (sbcset, dfa->sb_char); 3255 3256 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes 3257 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes 3258 || mbcset->non_match))) 3259 { 3260 bin_tree_t *mbc_tree; 3261 int sbc_idx; 3262 /* Build a tree for complex bracket. */ 3263 dfa->has_mb_node = 1; 3264 br_token.type = COMPLEX_BRACKET; 3265 br_token.opr.mbcset = mbcset; 3266 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3267 if (BE (mbc_tree == NULL, 0)) 3268 goto parse_bracket_exp_espace; 3269 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx) 3270 if (sbcset[sbc_idx]) 3271 break; 3272 /* If there are no bits set in sbcset, there is no point 3273 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */ 3274 if (sbc_idx < BITSET_WORDS) 3275 { 3276 /* Build a tree for simple bracket. */ 3277 br_token.type = SIMPLE_BRACKET; 3278 br_token.opr.sbcset = sbcset; 3279 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3280 if (BE (work_tree == NULL, 0)) 3281 goto parse_bracket_exp_espace; 3282 3283 /* Then join them by ALT node. */ 3284 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT); 3285 if (BE (work_tree == NULL, 0)) 3286 goto parse_bracket_exp_espace; 3287 } 3288 else 3289 { 3290 re_free (sbcset); 3291 work_tree = mbc_tree; 3292 } 3293 } 3294 else 3295 #endif /* not RE_ENABLE_I18N */ 3296 { 3297 #ifdef RE_ENABLE_I18N 3298 free_charset (mbcset); 3299 #endif 3300 /* Build a tree for simple bracket. */ 3301 br_token.type = SIMPLE_BRACKET; 3302 br_token.opr.sbcset = sbcset; 3303 work_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3304 if (BE (work_tree == NULL, 0)) 3305 goto parse_bracket_exp_espace; 3306 } 3307 return work_tree; 3308 3309 parse_bracket_exp_espace: 3310 *err = REG_ESPACE; 3311 parse_bracket_exp_free_return: 3312 re_free (sbcset); 3313 #ifdef RE_ENABLE_I18N 3314 free_charset (mbcset); 3315 #endif /* RE_ENABLE_I18N */ 3316 return NULL; 3317 } 3318 3319 /* Parse an element in the bracket expression. */ 3320 3321 static reg_errcode_t 3322 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp, 3323 re_token_t *token, int token_len, re_dfa_t *dfa, 3324 reg_syntax_t syntax, bool accept_hyphen) 3325 { 3326 #ifdef RE_ENABLE_I18N 3327 int cur_char_size; 3328 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp)); 3329 if (cur_char_size > 1) 3330 { 3331 elem->type = MB_CHAR; 3332 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp)); 3333 re_string_skip_bytes (regexp, cur_char_size); 3334 return REG_NOERROR; 3335 } 3336 #endif /* RE_ENABLE_I18N */ 3337 re_string_skip_bytes (regexp, token_len); /* Skip a token. */ 3338 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS 3339 || token->type == OP_OPEN_EQUIV_CLASS) 3340 return parse_bracket_symbol (elem, regexp, token); 3341 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen) 3342 { 3343 /* A '-' must only appear as anything but a range indicator before 3344 the closing bracket. Everything else is an error. */ 3345 re_token_t token2; 3346 (void) peek_token_bracket (&token2, regexp, syntax); 3347 if (token2.type != OP_CLOSE_BRACKET) 3348 /* The actual error value is not standardized since this whole 3349 case is undefined. But ERANGE makes good sense. */ 3350 return REG_ERANGE; 3351 } 3352 elem->type = SB_CHAR; 3353 elem->opr.ch = token->opr.c; 3354 return REG_NOERROR; 3355 } 3356 3357 /* Parse a bracket symbol in the bracket expression. Bracket symbols are 3358 such as [:<character_class>:], [.<collating_element>.], and 3359 [=<equivalent_class>=]. */ 3360 3361 static reg_errcode_t 3362 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp, 3363 re_token_t *token) 3364 { 3365 unsigned char ch, delim = token->opr.c; 3366 int i = 0; 3367 if (re_string_eoi(regexp)) 3368 return REG_EBRACK; 3369 for (;; ++i) 3370 { 3371 if (i >= BRACKET_NAME_BUF_SIZE) 3372 return REG_EBRACK; 3373 if (token->type == OP_OPEN_CHAR_CLASS) 3374 ch = re_string_fetch_byte_case (regexp); 3375 else 3376 ch = re_string_fetch_byte (regexp); 3377 if (re_string_eoi(regexp)) 3378 return REG_EBRACK; 3379 if (ch == delim && re_string_peek_byte (regexp, 0) == ']') 3380 break; 3381 elem->opr.name[i] = ch; 3382 } 3383 re_string_skip_bytes (regexp, 1); 3384 elem->opr.name[i] = '\0'; 3385 switch (token->type) 3386 { 3387 case OP_OPEN_COLL_ELEM: 3388 elem->type = COLL_SYM; 3389 break; 3390 case OP_OPEN_EQUIV_CLASS: 3391 elem->type = EQUIV_CLASS; 3392 break; 3393 case OP_OPEN_CHAR_CLASS: 3394 elem->type = CHAR_CLASS; 3395 break; 3396 default: 3397 break; 3398 } 3399 return REG_NOERROR; 3400 } 3401 3402 /* Helper function for parse_bracket_exp. 3403 Build the equivalence class which is represented by NAME. 3404 The result are written to MBCSET and SBCSET. 3405 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes, 3406 is a pointer argument sinse we may update it. */ 3407 3408 static reg_errcode_t 3409 #ifdef RE_ENABLE_I18N 3410 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset, 3411 Idx *equiv_class_alloc, const unsigned char *name) 3412 #else /* not RE_ENABLE_I18N */ 3413 build_equiv_class (bitset_t sbcset, const unsigned char *name) 3414 #endif /* not RE_ENABLE_I18N */ 3415 { 3416 #ifdef _LIBC 3417 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3418 if (nrules != 0) 3419 { 3420 const int32_t *table, *indirect; 3421 const unsigned char *weights, *extra, *cp; 3422 unsigned char char_buf[2]; 3423 int32_t idx1, idx2; 3424 unsigned int ch; 3425 size_t len; 3426 /* This #include defines a local function! */ 3427 # include <locale/weight.h> 3428 /* Calculate the index for equivalence class. */ 3429 cp = name; 3430 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3431 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3432 _NL_COLLATE_WEIGHTMB); 3433 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 3434 _NL_COLLATE_EXTRAMB); 3435 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 3436 _NL_COLLATE_INDIRECTMB); 3437 idx1 = findidx (&cp); 3438 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0)) 3439 /* This isn't a valid character. */ 3440 return REG_ECOLLATE; 3441 3442 /* Build single byte matcing table for this equivalence class. */ 3443 char_buf[1] = (unsigned char) '\0'; 3444 len = weights[idx1]; 3445 for (ch = 0; ch < SBC_MAX; ++ch) 3446 { 3447 char_buf[0] = ch; 3448 cp = char_buf; 3449 idx2 = findidx (&cp); 3450 /* 3451 idx2 = table[ch]; 3452 */ 3453 if (idx2 == 0) 3454 /* This isn't a valid character. */ 3455 continue; 3456 if (len == weights[idx2]) 3457 { 3458 int cnt = 0; 3459 while (cnt <= len && 3460 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt]) 3461 ++cnt; 3462 3463 if (cnt > len) 3464 bitset_set (sbcset, ch); 3465 } 3466 } 3467 /* Check whether the array has enough space. */ 3468 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0)) 3469 { 3470 /* Not enough, realloc it. */ 3471 /* +1 in case of mbcset->nequiv_classes is 0. */ 3472 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1; 3473 /* Use realloc since the array is NULL if *alloc == 0. */ 3474 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes, 3475 int32_t, 3476 new_equiv_class_alloc); 3477 if (BE (new_equiv_classes == NULL, 0)) 3478 return REG_ESPACE; 3479 mbcset->equiv_classes = new_equiv_classes; 3480 *equiv_class_alloc = new_equiv_class_alloc; 3481 } 3482 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1; 3483 } 3484 else 3485 #endif /* _LIBC */ 3486 { 3487 if (BE (strlen ((const char *) name) != 1, 0)) 3488 return REG_ECOLLATE; 3489 bitset_set (sbcset, *name); 3490 } 3491 return REG_NOERROR; 3492 } 3493 3494 /* Helper function for parse_bracket_exp. 3495 Build the character class which is represented by NAME. 3496 The result are written to MBCSET and SBCSET. 3497 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes, 3498 is a pointer argument sinse we may update it. */ 3499 3500 static reg_errcode_t 3501 #ifdef RE_ENABLE_I18N 3502 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3503 re_charset_t *mbcset, Idx *char_class_alloc, 3504 const unsigned char *class_name, reg_syntax_t syntax) 3505 #else /* not RE_ENABLE_I18N */ 3506 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset, 3507 const unsigned char *class_name, reg_syntax_t syntax) 3508 #endif /* not RE_ENABLE_I18N */ 3509 { 3510 int i; 3511 const char *name = (const char *) class_name; 3512 3513 /* In case of REG_ICASE "upper" and "lower" match the both of 3514 upper and lower cases. */ 3515 if ((syntax & RE_ICASE) 3516 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0)) 3517 name = "alpha"; 3518 3519 #ifdef RE_ENABLE_I18N 3520 /* Check the space of the arrays. */ 3521 if (BE (*char_class_alloc == mbcset->nchar_classes, 0)) 3522 { 3523 /* Not enough, realloc it. */ 3524 /* +1 in case of mbcset->nchar_classes is 0. */ 3525 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1; 3526 /* Use realloc since array is NULL if *alloc == 0. */ 3527 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t, 3528 new_char_class_alloc); 3529 if (BE (new_char_classes == NULL, 0)) 3530 return REG_ESPACE; 3531 mbcset->char_classes = new_char_classes; 3532 *char_class_alloc = new_char_class_alloc; 3533 } 3534 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name); 3535 #endif /* RE_ENABLE_I18N */ 3536 3537 #define BUILD_CHARCLASS_LOOP(ctype_func) \ 3538 do { \ 3539 if (BE (trans != NULL, 0)) \ 3540 { \ 3541 for (i = 0; i < SBC_MAX; ++i) \ 3542 if (ctype_func (i)) \ 3543 bitset_set (sbcset, trans[i]); \ 3544 } \ 3545 else \ 3546 { \ 3547 for (i = 0; i < SBC_MAX; ++i) \ 3548 if (ctype_func (i)) \ 3549 bitset_set (sbcset, i); \ 3550 } \ 3551 } while (0) 3552 3553 if (strcmp (name, "alnum") == 0) 3554 BUILD_CHARCLASS_LOOP (isalnum); 3555 else if (strcmp (name, "cntrl") == 0) 3556 BUILD_CHARCLASS_LOOP (iscntrl); 3557 else if (strcmp (name, "lower") == 0) 3558 BUILD_CHARCLASS_LOOP (islower); 3559 else if (strcmp (name, "space") == 0) 3560 BUILD_CHARCLASS_LOOP (isspace); 3561 else if (strcmp (name, "alpha") == 0) 3562 BUILD_CHARCLASS_LOOP (isalpha); 3563 else if (strcmp (name, "digit") == 0) 3564 BUILD_CHARCLASS_LOOP (isdigit); 3565 else if (strcmp (name, "print") == 0) 3566 BUILD_CHARCLASS_LOOP (isprint); 3567 else if (strcmp (name, "upper") == 0) 3568 BUILD_CHARCLASS_LOOP (isupper); 3569 else if (strcmp (name, "blank") == 0) 3570 BUILD_CHARCLASS_LOOP (isblank); 3571 else if (strcmp (name, "graph") == 0) 3572 BUILD_CHARCLASS_LOOP (isgraph); 3573 else if (strcmp (name, "punct") == 0) 3574 BUILD_CHARCLASS_LOOP (ispunct); 3575 else if (strcmp (name, "xdigit") == 0) 3576 BUILD_CHARCLASS_LOOP (isxdigit); 3577 else 3578 return REG_ECTYPE; 3579 3580 return REG_NOERROR; 3581 } 3582 3583 static bin_tree_t * 3584 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans, 3585 const unsigned char *class_name, 3586 const unsigned char *extra, bool non_match, 3587 reg_errcode_t *err) 3588 { 3589 re_bitset_ptr_t sbcset; 3590 #ifdef RE_ENABLE_I18N 3591 re_charset_t *mbcset; 3592 Idx alloc = 0; 3593 #endif /* not RE_ENABLE_I18N */ 3594 reg_errcode_t ret; 3595 re_token_t br_token; 3596 bin_tree_t *tree; 3597 3598 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); 3599 #ifdef RE_ENABLE_I18N 3600 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1); 3601 #endif /* RE_ENABLE_I18N */ 3602 3603 #ifdef RE_ENABLE_I18N 3604 if (BE (sbcset == NULL || mbcset == NULL, 0)) 3605 #else /* not RE_ENABLE_I18N */ 3606 if (BE (sbcset == NULL, 0)) 3607 #endif /* not RE_ENABLE_I18N */ 3608 { 3609 *err = REG_ESPACE; 3610 return NULL; 3611 } 3612 3613 if (non_match) 3614 { 3615 #ifdef RE_ENABLE_I18N 3616 mbcset->non_match = 1; 3617 #endif /* not RE_ENABLE_I18N */ 3618 } 3619 3620 /* We don't care the syntax in this case. */ 3621 ret = build_charclass (trans, sbcset, 3622 #ifdef RE_ENABLE_I18N 3623 mbcset, &alloc, 3624 #endif /* RE_ENABLE_I18N */ 3625 class_name, 0); 3626 3627 if (BE (ret != REG_NOERROR, 0)) 3628 { 3629 re_free (sbcset); 3630 #ifdef RE_ENABLE_I18N 3631 free_charset (mbcset); 3632 #endif /* RE_ENABLE_I18N */ 3633 *err = ret; 3634 return NULL; 3635 } 3636 /* \w match '_' also. */ 3637 for (; *extra; extra++) 3638 bitset_set (sbcset, *extra); 3639 3640 /* If it is non-matching list. */ 3641 if (non_match) 3642 bitset_not (sbcset); 3643 3644 #ifdef RE_ENABLE_I18N 3645 /* Ensure only single byte characters are set. */ 3646 if (dfa->mb_cur_max > 1) 3647 bitset_mask (sbcset, dfa->sb_char); 3648 #endif 3649 3650 /* Build a tree for simple bracket. */ 3651 br_token.type = SIMPLE_BRACKET; 3652 br_token.opr.sbcset = sbcset; 3653 tree = create_token_tree (dfa, NULL, NULL, &br_token); 3654 if (BE (tree == NULL, 0)) 3655 goto build_word_op_espace; 3656 3657 #ifdef RE_ENABLE_I18N 3658 if (dfa->mb_cur_max > 1) 3659 { 3660 bin_tree_t *mbc_tree; 3661 /* Build a tree for complex bracket. */ 3662 br_token.type = COMPLEX_BRACKET; 3663 br_token.opr.mbcset = mbcset; 3664 dfa->has_mb_node = 1; 3665 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token); 3666 if (BE (mbc_tree == NULL, 0)) 3667 goto build_word_op_espace; 3668 /* Then join them by ALT node. */ 3669 tree = create_tree (dfa, tree, mbc_tree, OP_ALT); 3670 if (BE (mbc_tree != NULL, 1)) 3671 return tree; 3672 } 3673 else 3674 { 3675 free_charset (mbcset); 3676 return tree; 3677 } 3678 #else /* not RE_ENABLE_I18N */ 3679 return tree; 3680 #endif /* not RE_ENABLE_I18N */ 3681 3682 build_word_op_espace: 3683 re_free (sbcset); 3684 #ifdef RE_ENABLE_I18N 3685 free_charset (mbcset); 3686 #endif /* RE_ENABLE_I18N */ 3687 *err = REG_ESPACE; 3688 return NULL; 3689 } 3690 3691 /* This is intended for the expressions like "a{1,3}". 3692 Fetch a number from `input', and return the number. 3693 Return REG_MISSING if the number field is empty like "{,1}". 3694 Return REG_ERROR if an error occurred. */ 3695 3696 static Idx 3697 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax) 3698 { 3699 Idx num = REG_MISSING; 3700 unsigned char c; 3701 while (1) 3702 { 3703 fetch_token (token, input, syntax); 3704 c = token->opr.c; 3705 if (BE (token->type == END_OF_RE, 0)) 3706 return REG_ERROR; 3707 if (token->type == OP_CLOSE_DUP_NUM || c == ',') 3708 break; 3709 num = ((token->type != CHARACTER || c < '0' || '9' < c 3710 || num == REG_ERROR) 3711 ? REG_ERROR 3712 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0')); 3713 num = (num > RE_DUP_MAX) ? REG_ERROR : num; 3714 } 3715 return num; 3716 } 3717 3718 #ifdef RE_ENABLE_I18N 3720 static void 3721 free_charset (re_charset_t *cset) 3722 { 3723 re_free (cset->mbchars); 3724 # ifdef _LIBC 3725 re_free (cset->coll_syms); 3726 re_free (cset->equiv_classes); 3727 re_free (cset->range_starts); 3728 re_free (cset->range_ends); 3729 # endif 3730 re_free (cset->char_classes); 3731 re_free (cset); 3732 } 3733 #endif /* RE_ENABLE_I18N */ 3734 3735 /* Functions for binary tree operation. */ 3737 3738 /* Create a tree node. */ 3739 3740 static bin_tree_t * 3741 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3742 re_token_type_t type) 3743 { 3744 re_token_t t; 3745 t.type = type; 3746 return create_token_tree (dfa, left, right, &t); 3747 } 3748 3749 static bin_tree_t * 3750 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right, 3751 const re_token_t *token) 3752 { 3753 bin_tree_t *tree; 3754 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0)) 3755 { 3756 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1); 3757 3758 if (storage == NULL) 3759 return NULL; 3760 storage->next = dfa->str_tree_storage; 3761 dfa->str_tree_storage = storage; 3762 dfa->str_tree_storage_idx = 0; 3763 } 3764 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++]; 3765 3766 tree->parent = NULL; 3767 tree->left = left; 3768 tree->right = right; 3769 tree->token = *token; 3770 tree->token.duplicated = 0; 3771 tree->token.opt_subexp = 0; 3772 tree->first = NULL; 3773 tree->next = NULL; 3774 tree->node_idx = REG_MISSING; 3775 3776 if (left != NULL) 3777 left->parent = tree; 3778 if (right != NULL) 3779 right->parent = tree; 3780 return tree; 3781 } 3782 3783 /* Mark the tree SRC as an optional subexpression. 3784 To be called from preorder or postorder. */ 3785 3786 static reg_errcode_t 3787 mark_opt_subexp (void *extra, bin_tree_t *node) 3788 { 3789 Idx idx = (Idx) (intptr_t) extra; 3790 assert(sizeof(void*) >= sizeof(Idx)); 3791 if (node->token.type == SUBEXP && node->token.opr.idx == idx) 3792 node->token.opt_subexp = 1; 3793 3794 return REG_NOERROR; 3795 } 3796 3797 /* Free the allocated memory inside NODE. */ 3798 3799 static void 3800 free_token (re_token_t *node) 3801 { 3802 #ifdef RE_ENABLE_I18N 3803 if (node->type == COMPLEX_BRACKET && node->duplicated == 0) 3804 free_charset (node->opr.mbcset); 3805 else 3806 #endif /* RE_ENABLE_I18N */ 3807 if (node->type == SIMPLE_BRACKET && node->duplicated == 0) 3808 re_free (node->opr.sbcset); 3809 } 3810 3811 /* Worker function for tree walking. Free the allocated memory inside NODE 3812 and its children. */ 3813 3814 static reg_errcode_t 3815 free_tree (void *extra, bin_tree_t *node) 3816 { 3817 free_token (&node->token); 3818 return REG_NOERROR; 3819 } 3820 3821 3822 /* Duplicate the node SRC, and return new node. This is a preorder 3823 visit similar to the one implemented by the generic visitor, but 3824 we need more infrastructure to maintain two parallel trees --- so, 3825 it's easier to duplicate. */ 3826 3827 static bin_tree_t * 3828 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa) 3829 { 3830 const bin_tree_t *node; 3831 bin_tree_t *dup_root; 3832 bin_tree_t **p_new = &dup_root, *dup_node = root->parent; 3833 3834 for (node = root; ; ) 3835 { 3836 /* Create a new tree and link it back to the current parent. */ 3837 *p_new = create_token_tree (dfa, NULL, NULL, &node->token); 3838 if (*p_new == NULL) 3839 return NULL; 3840 (*p_new)->parent = dup_node; 3841 (*p_new)->token.duplicated = 1; 3842 dup_node = *p_new; 3843 3844 /* Go to the left node, or up and to the right. */ 3845 if (node->left) 3846 { 3847 node = node->left; 3848 p_new = &dup_node->left; 3849 } 3850 else 3851 { 3852 const bin_tree_t *prev = NULL; 3853 while (node->right == prev || node->right == NULL) 3854 { 3855 prev = node; 3856 node = node->parent; 3857 dup_node = dup_node->parent; 3858 if (!node) 3859 return dup_root; 3860 } 3861 node = node->right; 3862 p_new = &dup_node->right; 3863 } 3864 } 3865 } 3866