Home | History | Annotate | Download | only in lib
      1 /* -*- buffer-read-only: t -*- vi: set ro: */
      2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
      3 /* Extended regular expression matching and search library.
      4    Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009
      5    Free Software Foundation, Inc.
      6    This file is part of the GNU C Library.
      7    Contributed by Isamu Hasegawa <isamu (at) yamato.ibm.com>.
      8 
      9    This program is free software; you can redistribute it and/or modify
     10    it under the terms of the GNU General Public License as published by
     11    the Free Software Foundation; either version 3, or (at your option)
     12    any later version.
     13 
     14    This program is distributed in the hope that it will be useful,
     15    but WITHOUT ANY WARRANTY; without even the implied warranty of
     16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     17    GNU General Public License for more details.
     18 
     19    You should have received a copy of the GNU General Public License along
     20    with this program; if not, write to the Free Software Foundation,
     21    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
     22 
     23 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
     24 					  size_t length, reg_syntax_t syntax);
     25 static void re_compile_fastmap_iter (regex_t *bufp,
     26 				     const re_dfastate_t *init_state,
     27 				     char *fastmap);
     28 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
     29 #ifdef RE_ENABLE_I18N
     30 static void free_charset (re_charset_t *cset);
     31 #endif /* RE_ENABLE_I18N */
     32 static void free_workarea_compile (regex_t *preg);
     33 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
     34 #ifdef RE_ENABLE_I18N
     35 static void optimize_utf8 (re_dfa_t *dfa);
     36 #endif
     37 static reg_errcode_t analyze (regex_t *preg);
     38 static reg_errcode_t preorder (bin_tree_t *root,
     39 			       reg_errcode_t (fn (void *, bin_tree_t *)),
     40 			       void *extra);
     41 static reg_errcode_t postorder (bin_tree_t *root,
     42 				reg_errcode_t (fn (void *, bin_tree_t *)),
     43 				void *extra);
     44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
     45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
     46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
     47 				 bin_tree_t *node);
     48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
     49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
     50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
     51 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
     52 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
     53 				   unsigned int constraint);
     54 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
     55 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
     56 					 Idx node, bool root);
     57 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
     58 static Idx fetch_number (re_string_t *input, re_token_t *token,
     59 			 reg_syntax_t syntax);
     60 static int peek_token (re_token_t *token, re_string_t *input,
     61 			reg_syntax_t syntax) internal_function;
     62 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
     63 			  reg_syntax_t syntax, reg_errcode_t *err);
     64 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
     65 				  re_token_t *token, reg_syntax_t syntax,
     66 				  Idx nest, reg_errcode_t *err);
     67 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
     68 				 re_token_t *token, reg_syntax_t syntax,
     69 				 Idx nest, reg_errcode_t *err);
     70 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
     71 				     re_token_t *token, reg_syntax_t syntax,
     72 				     Idx nest, reg_errcode_t *err);
     73 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
     74 				  re_token_t *token, reg_syntax_t syntax,
     75 				  Idx nest, reg_errcode_t *err);
     76 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
     77 				 re_dfa_t *dfa, re_token_t *token,
     78 				 reg_syntax_t syntax, reg_errcode_t *err);
     79 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
     80 				      re_token_t *token, reg_syntax_t syntax,
     81 				      reg_errcode_t *err);
     82 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
     83 					    re_string_t *regexp,
     84 					    re_token_t *token, int token_len,
     85 					    re_dfa_t *dfa,
     86 					    reg_syntax_t syntax,
     87 					    bool accept_hyphen);
     88 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
     89 					  re_string_t *regexp,
     90 					  re_token_t *token);
     91 #ifdef RE_ENABLE_I18N
     92 static reg_errcode_t build_equiv_class (bitset_t sbcset,
     93 					re_charset_t *mbcset,
     94 					Idx *equiv_class_alloc,
     95 					const unsigned char *name);
     96 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
     97 				      bitset_t sbcset,
     98 				      re_charset_t *mbcset,
     99 				      Idx *char_class_alloc,
    100 				      const unsigned char *class_name,
    101 				      reg_syntax_t syntax);
    102 #else  /* not RE_ENABLE_I18N */
    103 static reg_errcode_t build_equiv_class (bitset_t sbcset,
    104 					const unsigned char *name);
    105 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
    106 				      bitset_t sbcset,
    107 				      const unsigned char *class_name,
    108 				      reg_syntax_t syntax);
    109 #endif /* not RE_ENABLE_I18N */
    110 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
    111 				       RE_TRANSLATE_TYPE trans,
    112 				       const unsigned char *class_name,
    113 				       const unsigned char *extra,
    114 				       bool non_match, reg_errcode_t *err);
    115 static bin_tree_t *create_tree (re_dfa_t *dfa,
    116 				bin_tree_t *left, bin_tree_t *right,
    117 				re_token_type_t type);
    118 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
    119 				      bin_tree_t *left, bin_tree_t *right,
    120 				      const re_token_t *token);
    121 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
    122 static void free_token (re_token_t *node);
    123 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
    124 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
    125 
    126 /* This table gives an error message for each of the error codes listed
    128    in regex.h.  Obviously the order here has to be same as there.
    129    POSIX doesn't require that we do anything for REG_NOERROR,
    130    but why not be nice?  */
    131 
    132 static const char __re_error_msgid[] =
    133   {
    134 #define REG_NOERROR_IDX	0
    135     gettext_noop ("Success")	/* REG_NOERROR */
    136     "\0"
    137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
    138     gettext_noop ("No match")	/* REG_NOMATCH */
    139     "\0"
    140 #define REG_BADPAT_IDX	(REG_NOMATCH_IDX + sizeof "No match")
    141     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
    142     "\0"
    143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
    144     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
    145     "\0"
    146 #define REG_ECTYPE_IDX	(REG_ECOLLATE_IDX + sizeof "Invalid collation character")
    147     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
    148     "\0"
    149 #define REG_EESCAPE_IDX	(REG_ECTYPE_IDX + sizeof "Invalid character class name")
    150     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
    151     "\0"
    152 #define REG_ESUBREG_IDX	(REG_EESCAPE_IDX + sizeof "Trailing backslash")
    153     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
    154     "\0"
    155 #define REG_EBRACK_IDX	(REG_ESUBREG_IDX + sizeof "Invalid back reference")
    156     gettext_noop ("Unmatched [ or [^")	/* REG_EBRACK */
    157     "\0"
    158 #define REG_EPAREN_IDX	(REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
    159     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
    160     "\0"
    161 #define REG_EBRACE_IDX	(REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
    162     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
    163     "\0"
    164 #define REG_BADBR_IDX	(REG_EBRACE_IDX + sizeof "Unmatched \\{")
    165     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
    166     "\0"
    167 #define REG_ERANGE_IDX	(REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
    168     gettext_noop ("Invalid range end")	/* REG_ERANGE */
    169     "\0"
    170 #define REG_ESPACE_IDX	(REG_ERANGE_IDX + sizeof "Invalid range end")
    171     gettext_noop ("Memory exhausted") /* REG_ESPACE */
    172     "\0"
    173 #define REG_BADRPT_IDX	(REG_ESPACE_IDX + sizeof "Memory exhausted")
    174     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
    175     "\0"
    176 #define REG_EEND_IDX	(REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
    177     gettext_noop ("Premature end of regular expression") /* REG_EEND */
    178     "\0"
    179 #define REG_ESIZE_IDX	(REG_EEND_IDX + sizeof "Premature end of regular expression")
    180     gettext_noop ("Regular expression too big") /* REG_ESIZE */
    181     "\0"
    182 #define REG_ERPAREN_IDX	(REG_ESIZE_IDX + sizeof "Regular expression too big")
    183     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
    184   };
    185 
    186 static const size_t __re_error_msgid_idx[] =
    187   {
    188     REG_NOERROR_IDX,
    189     REG_NOMATCH_IDX,
    190     REG_BADPAT_IDX,
    191     REG_ECOLLATE_IDX,
    192     REG_ECTYPE_IDX,
    193     REG_EESCAPE_IDX,
    194     REG_ESUBREG_IDX,
    195     REG_EBRACK_IDX,
    196     REG_EPAREN_IDX,
    197     REG_EBRACE_IDX,
    198     REG_BADBR_IDX,
    199     REG_ERANGE_IDX,
    200     REG_ESPACE_IDX,
    201     REG_BADRPT_IDX,
    202     REG_EEND_IDX,
    203     REG_ESIZE_IDX,
    204     REG_ERPAREN_IDX
    205   };
    206 
    207 /* Entry points for GNU code.  */
    209 
    210 /* re_compile_pattern is the GNU regular expression compiler: it
    211    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
    212    Returns 0 if the pattern was valid, otherwise an error string.
    213 
    214    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
    215    are set in BUFP on entry.  */
    216 
    217 #ifdef _LIBC
    218 const char *
    219 re_compile_pattern (pattern, length, bufp)
    220     const char *pattern;
    221     size_t length;
    222     struct re_pattern_buffer *bufp;
    223 #else /* size_t might promote */
    224 const char *
    225 re_compile_pattern (const char *pattern, size_t length,
    226 		    struct re_pattern_buffer *bufp)
    227 #endif
    228 {
    229   reg_errcode_t ret;
    230 
    231   /* And GNU code determines whether or not to get register information
    232      by passing null for the REGS argument to re_match, etc., not by
    233      setting no_sub, unless RE_NO_SUB is set.  */
    234   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
    235 
    236   /* Match anchors at newline.  */
    237   bufp->newline_anchor = 1;
    238 
    239   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
    240 
    241   if (!ret)
    242     return NULL;
    243   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
    244 }
    245 #ifdef _LIBC
    246 weak_alias (__re_compile_pattern, re_compile_pattern)
    247 #endif
    248 
    249 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
    250    also be assigned to arbitrarily: each pattern buffer stores its own
    251    syntax, so it can be changed between regex compilations.  */
    252 /* This has no initializer because initialized variables in Emacs
    253    become read-only after dumping.  */
    254 reg_syntax_t re_syntax_options;
    255 
    256 
    257 /* Specify the precise syntax of regexps for compilation.  This provides
    258    for compatibility for various utilities which historically have
    259    different, incompatible syntaxes.
    260 
    261    The argument SYNTAX is a bit mask comprised of the various bits
    262    defined in regex.h.  We return the old syntax.  */
    263 
    264 reg_syntax_t
    265 re_set_syntax (syntax)
    266     reg_syntax_t syntax;
    267 {
    268   reg_syntax_t ret = re_syntax_options;
    269 
    270   re_syntax_options = syntax;
    271   return ret;
    272 }
    273 #ifdef _LIBC
    274 weak_alias (__re_set_syntax, re_set_syntax)
    275 #endif
    276 
    277 int
    278 re_compile_fastmap (bufp)
    279     struct re_pattern_buffer *bufp;
    280 {
    281   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
    282   char *fastmap = bufp->fastmap;
    283 
    284   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
    285   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
    286   if (dfa->init_state != dfa->init_state_word)
    287     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
    288   if (dfa->init_state != dfa->init_state_nl)
    289     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
    290   if (dfa->init_state != dfa->init_state_begbuf)
    291     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
    292   bufp->fastmap_accurate = 1;
    293   return 0;
    294 }
    295 #ifdef _LIBC
    296 weak_alias (__re_compile_fastmap, re_compile_fastmap)
    297 #endif
    298 
    299 static inline void
    300 __attribute ((always_inline))
    301 re_set_fastmap (char *fastmap, bool icase, int ch)
    302 {
    303   fastmap[ch] = 1;
    304   if (icase)
    305     fastmap[tolower (ch)] = 1;
    306 }
    307 
    308 /* Helper function for re_compile_fastmap.
    309    Compile fastmap for the initial_state INIT_STATE.  */
    310 
    311 static void
    312 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
    313 			 char *fastmap)
    314 {
    315   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
    316   Idx node_cnt;
    317   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
    318   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
    319     {
    320       Idx node = init_state->nodes.elems[node_cnt];
    321       re_token_type_t type = dfa->nodes[node].type;
    322 
    323       if (type == CHARACTER)
    324 	{
    325 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
    326 #ifdef RE_ENABLE_I18N
    327 	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
    328 	    {
    329 	      unsigned char buf[MB_LEN_MAX];
    330 	      unsigned char *p;
    331 	      wchar_t wc;
    332 	      mbstate_t state;
    333 
    334 	      p = buf;
    335 	      *p++ = dfa->nodes[node].opr.c;
    336 	      while (++node < dfa->nodes_len
    337 		     &&	dfa->nodes[node].type == CHARACTER
    338 		     && dfa->nodes[node].mb_partial)
    339 		*p++ = dfa->nodes[node].opr.c;
    340 	      memset (&state, '\0', sizeof (state));
    341 	      if (__mbrtowc (&wc, (const char *) buf, p - buf,
    342 			     &state) == p - buf
    343 		  && (__wcrtomb ((char *) buf, towlower (wc), &state)
    344 		      != (size_t) -1))
    345 		re_set_fastmap (fastmap, false, buf[0]);
    346 	    }
    347 #endif
    348 	}
    349       else if (type == SIMPLE_BRACKET)
    350 	{
    351 	  int i, ch;
    352 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
    353 	    {
    354 	      int j;
    355 	      bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
    356 	      for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
    357 		if (w & ((bitset_word_t) 1 << j))
    358 		  re_set_fastmap (fastmap, icase, ch);
    359 	    }
    360 	}
    361 #ifdef RE_ENABLE_I18N
    362       else if (type == COMPLEX_BRACKET)
    363 	{
    364 	  re_charset_t *cset = dfa->nodes[node].opr.mbcset;
    365 	  Idx i;
    366 
    367 # ifdef _LIBC
    368 	  /* See if we have to try all bytes which start multiple collation
    369 	     elements.
    370 	     e.g. In da_DK, we want to catch 'a' since "aa" is a valid
    371 		  collation element, and don't catch 'b' since 'b' is
    372 		  the only collation element which starts from 'b' (and
    373 		  it is caught by SIMPLE_BRACKET).  */
    374 	      if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
    375 		  && (cset->ncoll_syms || cset->nranges))
    376 		{
    377 		  const int32_t *table = (const int32_t *)
    378 		    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
    379 		  for (i = 0; i < SBC_MAX; ++i)
    380 		    if (table[i] < 0)
    381 		      re_set_fastmap (fastmap, icase, i);
    382 		}
    383 # endif /* _LIBC */
    384 
    385 	  /* See if we have to start the match at all multibyte characters,
    386 	     i.e. where we would not find an invalid sequence.  This only
    387 	     applies to multibyte character sets; for single byte character
    388 	     sets, the SIMPLE_BRACKET again suffices.  */
    389 	  if (dfa->mb_cur_max > 1
    390 	      && (cset->nchar_classes || cset->non_match
    391 # ifdef _LIBC
    392 		  || cset->nequiv_classes
    393 # endif /* _LIBC */
    394 		 ))
    395 	    {
    396 	      unsigned char c = 0;
    397 	      do
    398 		{
    399 		  mbstate_t mbs;
    400 		  memset (&mbs, 0, sizeof (mbs));
    401 		  if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
    402 		    re_set_fastmap (fastmap, false, (int) c);
    403 		}
    404 	      while (++c != 0);
    405 	    }
    406 
    407 	  else
    408 	    {
    409 	      /* ... Else catch all bytes which can start the mbchars.  */
    410 	      for (i = 0; i < cset->nmbchars; ++i)
    411 		{
    412 		  char buf[256];
    413 		  mbstate_t state;
    414 		  memset (&state, '\0', sizeof (state));
    415 		  if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
    416 		    re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
    417 		  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
    418 		    {
    419 		      if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
    420 			  != (size_t) -1)
    421 			re_set_fastmap (fastmap, false, *(unsigned char *) buf);
    422 		    }
    423 		}
    424 	    }
    425 	}
    426 #endif /* RE_ENABLE_I18N */
    427       else if (type == OP_PERIOD
    428 #ifdef RE_ENABLE_I18N
    429 	       || type == OP_UTF8_PERIOD
    430 #endif /* RE_ENABLE_I18N */
    431 	       || type == END_OF_RE)
    432 	{
    433 	  memset (fastmap, '\1', sizeof (char) * SBC_MAX);
    434 	  if (type == END_OF_RE)
    435 	    bufp->can_be_null = 1;
    436 	  return;
    437 	}
    438     }
    439 }
    440 
    441 /* Entry point for POSIX code.  */
    443 /* regcomp takes a regular expression as a string and compiles it.
    444 
    445    PREG is a regex_t *.  We do not expect any fields to be initialized,
    446    since POSIX says we shouldn't.  Thus, we set
    447 
    448      `buffer' to the compiled pattern;
    449      `used' to the length of the compiled pattern;
    450      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
    451        REG_EXTENDED bit in CFLAGS is set; otherwise, to
    452        RE_SYNTAX_POSIX_BASIC;
    453      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
    454      `fastmap' to an allocated space for the fastmap;
    455      `fastmap_accurate' to zero;
    456      `re_nsub' to the number of subexpressions in PATTERN.
    457 
    458    PATTERN is the address of the pattern string.
    459 
    460    CFLAGS is a series of bits which affect compilation.
    461 
    462      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
    463      use POSIX basic syntax.
    464 
    465      If REG_NEWLINE is set, then . and [^...] don't match newline.
    466      Also, regexec will try a match beginning after every newline.
    467 
    468      If REG_ICASE is set, then we considers upper- and lowercase
    469      versions of letters to be equivalent when matching.
    470 
    471      If REG_NOSUB is set, then when PREG is passed to regexec, that
    472      routine will report only success or failure, and nothing about the
    473      registers.
    474 
    475    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
    476    the return codes and their meanings.)  */
    477 
    478 int
    479 regcomp (preg, pattern, cflags)
    480     regex_t *_Restrict_ preg;
    481     const char *_Restrict_ pattern;
    482     int cflags;
    483 {
    484   reg_errcode_t ret;
    485   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
    486 			 : RE_SYNTAX_POSIX_BASIC);
    487 
    488   preg->buffer = NULL;
    489   preg->allocated = 0;
    490   preg->used = 0;
    491 
    492   /* Try to allocate space for the fastmap.  */
    493   preg->fastmap = re_malloc (char, SBC_MAX);
    494   if (BE (preg->fastmap == NULL, 0))
    495     return REG_ESPACE;
    496 
    497   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
    498 
    499   /* If REG_NEWLINE is set, newlines are treated differently.  */
    500   if (cflags & REG_NEWLINE)
    501     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
    502       syntax &= ~RE_DOT_NEWLINE;
    503       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
    504       /* It also changes the matching behavior.  */
    505       preg->newline_anchor = 1;
    506     }
    507   else
    508     preg->newline_anchor = 0;
    509   preg->no_sub = !!(cflags & REG_NOSUB);
    510   preg->translate = NULL;
    511 
    512   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
    513 
    514   /* POSIX doesn't distinguish between an unmatched open-group and an
    515      unmatched close-group: both are REG_EPAREN.  */
    516   if (ret == REG_ERPAREN)
    517     ret = REG_EPAREN;
    518 
    519   /* We have already checked preg->fastmap != NULL.  */
    520   if (BE (ret == REG_NOERROR, 1))
    521     /* Compute the fastmap now, since regexec cannot modify the pattern
    522        buffer.  This function never fails in this implementation.  */
    523     (void) re_compile_fastmap (preg);
    524   else
    525     {
    526       /* Some error occurred while compiling the expression.  */
    527       re_free (preg->fastmap);
    528       preg->fastmap = NULL;
    529     }
    530 
    531   return (int) ret;
    532 }
    533 #ifdef _LIBC
    534 weak_alias (__regcomp, regcomp)
    535 #endif
    536 
    537 /* Returns a message corresponding to an error code, ERRCODE, returned
    538    from either regcomp or regexec.   We don't use PREG here.  */
    539 
    540 #ifdef _LIBC
    541 size_t
    542 regerror (errcode, preg, errbuf, errbuf_size)
    543     int errcode;
    544     const regex_t *_Restrict_ preg;
    545     char *_Restrict_ errbuf;
    546     size_t errbuf_size;
    547 #else /* size_t might promote */
    548 size_t
    549 regerror (int errcode, const regex_t *_Restrict_ preg,
    550 	  char *_Restrict_ errbuf, size_t errbuf_size)
    551 #endif
    552 {
    553   const char *msg;
    554   size_t msg_size;
    555 
    556   if (BE (errcode < 0
    557 	  || errcode >= (int) (sizeof (__re_error_msgid_idx)
    558 			       / sizeof (__re_error_msgid_idx[0])), 0))
    559     /* Only error codes returned by the rest of the code should be passed
    560        to this routine.  If we are given anything else, or if other regex
    561        code generates an invalid error code, then the program has a bug.
    562        Dump core so we can fix it.  */
    563     abort ();
    564 
    565   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
    566 
    567   msg_size = strlen (msg) + 1; /* Includes the null.  */
    568 
    569   if (BE (errbuf_size != 0, 1))
    570     {
    571       size_t cpy_size = msg_size;
    572       if (BE (msg_size > errbuf_size, 0))
    573 	{
    574 	  cpy_size = errbuf_size - 1;
    575 	  errbuf[cpy_size] = '\0';
    576 	}
    577       memcpy (errbuf, msg, cpy_size);
    578     }
    579 
    580   return msg_size;
    581 }
    582 #ifdef _LIBC
    583 weak_alias (__regerror, regerror)
    584 #endif
    585 
    586 
    587 #ifdef RE_ENABLE_I18N
    588 /* This static array is used for the map to single-byte characters when
    589    UTF-8 is used.  Otherwise we would allocate memory just to initialize
    590    it the same all the time.  UTF-8 is the preferred encoding so this is
    591    a worthwhile optimization.  */
    592 static const bitset_t utf8_sb_map =
    593 {
    594   /* Set the first 128 bits.  */
    595 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
    596 #  error "bitset_word_t is narrower than 32 bits"
    597 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
    598   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
    599 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
    600   BITSET_WORD_MAX, BITSET_WORD_MAX,
    601 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
    602   BITSET_WORD_MAX,
    603 # endif
    604   (BITSET_WORD_MAX
    605    >> (SBC_MAX % BITSET_WORD_BITS == 0
    606        ? 0
    607        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
    608 };
    609 #endif
    610 
    611 
    612 static void
    613 free_dfa_content (re_dfa_t *dfa)
    614 {
    615   Idx i, j;
    616 
    617   if (dfa->nodes)
    618     for (i = 0; i < dfa->nodes_len; ++i)
    619       free_token (dfa->nodes + i);
    620   re_free (dfa->nexts);
    621   for (i = 0; i < dfa->nodes_len; ++i)
    622     {
    623       if (dfa->eclosures != NULL)
    624 	re_node_set_free (dfa->eclosures + i);
    625       if (dfa->inveclosures != NULL)
    626 	re_node_set_free (dfa->inveclosures + i);
    627       if (dfa->edests != NULL)
    628 	re_node_set_free (dfa->edests + i);
    629     }
    630   re_free (dfa->edests);
    631   re_free (dfa->eclosures);
    632   re_free (dfa->inveclosures);
    633   re_free (dfa->nodes);
    634 
    635   if (dfa->state_table)
    636     for (i = 0; i <= dfa->state_hash_mask; ++i)
    637       {
    638 	struct re_state_table_entry *entry = dfa->state_table + i;
    639 	for (j = 0; j < entry->num; ++j)
    640 	  {
    641 	    re_dfastate_t *state = entry->array[j];
    642 	    free_state (state);
    643 	  }
    644         re_free (entry->array);
    645       }
    646   re_free (dfa->state_table);
    647 #ifdef RE_ENABLE_I18N
    648   if (dfa->sb_char != utf8_sb_map)
    649     re_free (dfa->sb_char);
    650 #endif
    651   re_free (dfa->subexp_map);
    652 #ifdef DEBUG
    653   re_free (dfa->re_str);
    654 #endif
    655 
    656   re_free (dfa);
    657 }
    658 
    659 
    660 /* Free dynamically allocated space used by PREG.  */
    661 
    662 void
    663 regfree (preg)
    664     regex_t *preg;
    665 {
    666   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
    667   if (BE (dfa != NULL, 1))
    668     free_dfa_content (dfa);
    669   preg->buffer = NULL;
    670   preg->allocated = 0;
    671 
    672   re_free (preg->fastmap);
    673   preg->fastmap = NULL;
    674 
    675   re_free (preg->translate);
    676   preg->translate = NULL;
    677 }
    678 #ifdef _LIBC
    679 weak_alias (__regfree, regfree)
    680 #endif
    681 
    682 /* Entry points compatible with 4.2 BSD regex library.  We don't define
    684    them unless specifically requested.  */
    685 
    686 #if defined _REGEX_RE_COMP || defined _LIBC
    687 
    688 /* BSD has one and only one pattern buffer.  */
    689 static struct re_pattern_buffer re_comp_buf;
    690 
    691 char *
    692 # ifdef _LIBC
    693 /* Make these definitions weak in libc, so POSIX programs can redefine
    694    these names if they don't use our functions, and still use
    695    regcomp/regexec above without link errors.  */
    696 weak_function
    697 # endif
    698 re_comp (s)
    699      const char *s;
    700 {
    701   reg_errcode_t ret;
    702   char *fastmap;
    703 
    704   if (!s)
    705     {
    706       if (!re_comp_buf.buffer)
    707 	return gettext ("No previous regular expression");
    708       return 0;
    709     }
    710 
    711   if (re_comp_buf.buffer)
    712     {
    713       fastmap = re_comp_buf.fastmap;
    714       re_comp_buf.fastmap = NULL;
    715       __regfree (&re_comp_buf);
    716       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
    717       re_comp_buf.fastmap = fastmap;
    718     }
    719 
    720   if (re_comp_buf.fastmap == NULL)
    721     {
    722       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
    723       if (re_comp_buf.fastmap == NULL)
    724 	return (char *) gettext (__re_error_msgid
    725 				 + __re_error_msgid_idx[(int) REG_ESPACE]);
    726     }
    727 
    728   /* Since `re_exec' always passes NULL for the `regs' argument, we
    729      don't need to initialize the pattern buffer fields which affect it.  */
    730 
    731   /* Match anchors at newlines.  */
    732   re_comp_buf.newline_anchor = 1;
    733 
    734   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
    735 
    736   if (!ret)
    737     return NULL;
    738 
    739   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
    740   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
    741 }
    742 
    743 #ifdef _LIBC
    744 libc_freeres_fn (free_mem)
    745 {
    746   __regfree (&re_comp_buf);
    747 }
    748 #endif
    749 
    750 #endif /* _REGEX_RE_COMP */
    751 
    752 /* Internal entry point.
    754    Compile the regular expression PATTERN, whose length is LENGTH.
    755    SYNTAX indicate regular expression's syntax.  */
    756 
    757 static reg_errcode_t
    758 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
    759 		     reg_syntax_t syntax)
    760 {
    761   reg_errcode_t err = REG_NOERROR;
    762   re_dfa_t *dfa;
    763   re_string_t regexp;
    764 
    765   /* Initialize the pattern buffer.  */
    766   preg->fastmap_accurate = 0;
    767   preg->syntax = syntax;
    768   preg->not_bol = preg->not_eol = 0;
    769   preg->used = 0;
    770   preg->re_nsub = 0;
    771   preg->can_be_null = 0;
    772   preg->regs_allocated = REGS_UNALLOCATED;
    773 
    774   /* Initialize the dfa.  */
    775   dfa = (re_dfa_t *) preg->buffer;
    776   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
    777     {
    778       /* If zero allocated, but buffer is non-null, try to realloc
    779 	 enough space.  This loses if buffer's address is bogus, but
    780 	 that is the user's responsibility.  If ->buffer is NULL this
    781 	 is a simple allocation.  */
    782       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
    783       if (dfa == NULL)
    784 	return REG_ESPACE;
    785       preg->allocated = sizeof (re_dfa_t);
    786       preg->buffer = (unsigned char *) dfa;
    787     }
    788   preg->used = sizeof (re_dfa_t);
    789 
    790   err = init_dfa (dfa, length);
    791   if (BE (err != REG_NOERROR, 0))
    792     {
    793       free_dfa_content (dfa);
    794       preg->buffer = NULL;
    795       preg->allocated = 0;
    796       return err;
    797     }
    798 #ifdef DEBUG
    799   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
    800   dfa->re_str = re_malloc (char, length + 1);
    801   strncpy (dfa->re_str, pattern, length + 1);
    802 #endif
    803 
    804   __libc_lock_init (dfa->lock);
    805 
    806   err = re_string_construct (&regexp, pattern, length, preg->translate,
    807 			     (syntax & RE_ICASE) != 0, dfa);
    808   if (BE (err != REG_NOERROR, 0))
    809     {
    810     re_compile_internal_free_return:
    811       free_workarea_compile (preg);
    812       re_string_destruct (&regexp);
    813       free_dfa_content (dfa);
    814       preg->buffer = NULL;
    815       preg->allocated = 0;
    816       return err;
    817     }
    818 
    819   /* Parse the regular expression, and build a structure tree.  */
    820   preg->re_nsub = 0;
    821   dfa->str_tree = parse (&regexp, preg, syntax, &err);
    822   if (BE (dfa->str_tree == NULL, 0))
    823     goto re_compile_internal_free_return;
    824 
    825   /* Analyze the tree and create the nfa.  */
    826   err = analyze (preg);
    827   if (BE (err != REG_NOERROR, 0))
    828     goto re_compile_internal_free_return;
    829 
    830 #ifdef RE_ENABLE_I18N
    831   /* If possible, do searching in single byte encoding to speed things up.  */
    832   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
    833     optimize_utf8 (dfa);
    834 #endif
    835 
    836   /* Then create the initial state of the dfa.  */
    837   err = create_initial_state (dfa);
    838 
    839   /* Release work areas.  */
    840   free_workarea_compile (preg);
    841   re_string_destruct (&regexp);
    842 
    843   if (BE (err != REG_NOERROR, 0))
    844     {
    845       free_dfa_content (dfa);
    846       preg->buffer = NULL;
    847       preg->allocated = 0;
    848     }
    849 
    850   return err;
    851 }
    852 
    853 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
    854    as the initial length of some arrays.  */
    855 
    856 static reg_errcode_t
    857 init_dfa (re_dfa_t *dfa, size_t pat_len)
    858 {
    859   __re_size_t table_size;
    860 #ifdef RE_ENABLE_I18N
    861   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
    862 #else
    863   size_t max_i18n_object_size = 0;
    864 #endif
    865   size_t max_object_size =
    866     MAX (sizeof (struct re_state_table_entry),
    867 	 MAX (sizeof (re_token_t),
    868 	      MAX (sizeof (re_node_set),
    869 		   MAX (sizeof (regmatch_t),
    870 			max_i18n_object_size))));
    871 
    872   memset (dfa, '\0', sizeof (re_dfa_t));
    873 
    874   /* Force allocation of str_tree_storage the first time.  */
    875   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
    876 
    877   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
    878      calculation below, and for similar doubling calculations
    879      elsewhere.  And it's <= rather than <, because some of the
    880      doubling calculations add 1 afterwards.  */
    881   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
    882     return REG_ESPACE;
    883 
    884   dfa->nodes_alloc = pat_len + 1;
    885   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
    886 
    887   /*  table_size = 2 ^ ceil(log pat_len) */
    888   for (table_size = 1; ; table_size <<= 1)
    889     if (table_size > pat_len)
    890       break;
    891 
    892   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
    893   dfa->state_hash_mask = table_size - 1;
    894 
    895   dfa->mb_cur_max = MB_CUR_MAX;
    896 #ifdef _LIBC
    897   if (dfa->mb_cur_max == 6
    898       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
    899     dfa->is_utf8 = 1;
    900   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
    901 		       != 0);
    902 #else
    903   if (strcmp (locale_charset (), "UTF-8") == 0)
    904     dfa->is_utf8 = 1;
    905 
    906   /* We check exhaustively in the loop below if this charset is a
    907      superset of ASCII.  */
    908   dfa->map_notascii = 0;
    909 #endif
    910 
    911 #ifdef RE_ENABLE_I18N
    912   if (dfa->mb_cur_max > 1)
    913     {
    914       if (dfa->is_utf8)
    915 	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
    916       else
    917 	{
    918 	  int i, j, ch;
    919 
    920 	  dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
    921 	  if (BE (dfa->sb_char == NULL, 0))
    922 	    return REG_ESPACE;
    923 
    924 	  /* Set the bits corresponding to single byte chars.  */
    925 	  for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
    926 	    for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
    927 	      {
    928 		wint_t wch = __btowc (ch);
    929 		if (wch != WEOF)
    930 		  dfa->sb_char[i] |= (bitset_word_t) 1 << j;
    931 # ifndef _LIBC
    932 		if (isascii (ch) && wch != ch)
    933 		  dfa->map_notascii = 1;
    934 # endif
    935 	      }
    936 	}
    937     }
    938 #endif
    939 
    940   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
    941     return REG_ESPACE;
    942   return REG_NOERROR;
    943 }
    944 
    945 /* Initialize WORD_CHAR table, which indicate which character is
    946    "word".  In this case "word" means that it is the word construction
    947    character used by some operators like "\<", "\>", etc.  */
    948 
    949 static void
    950 internal_function
    951 init_word_char (re_dfa_t *dfa)
    952 {
    953   int i, j, ch;
    954   dfa->word_ops_used = 1;
    955   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
    956     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
    957       if (isalnum (ch) || ch == '_')
    958 	dfa->word_char[i] |= (bitset_word_t) 1 << j;
    959 }
    960 
    961 /* Free the work area which are only used while compiling.  */
    962 
    963 static void
    964 free_workarea_compile (regex_t *preg)
    965 {
    966   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
    967   bin_tree_storage_t *storage, *next;
    968   for (storage = dfa->str_tree_storage; storage; storage = next)
    969     {
    970       next = storage->next;
    971       re_free (storage);
    972     }
    973   dfa->str_tree_storage = NULL;
    974   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
    975   dfa->str_tree = NULL;
    976   re_free (dfa->org_indices);
    977   dfa->org_indices = NULL;
    978 }
    979 
    980 /* Create initial states for all contexts.  */
    981 
    982 static reg_errcode_t
    983 create_initial_state (re_dfa_t *dfa)
    984 {
    985   Idx first, i;
    986   reg_errcode_t err;
    987   re_node_set init_nodes;
    988 
    989   /* Initial states have the epsilon closure of the node which is
    990      the first node of the regular expression.  */
    991   first = dfa->str_tree->first->node_idx;
    992   dfa->init_node = first;
    993   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
    994   if (BE (err != REG_NOERROR, 0))
    995     return err;
    996 
    997   /* The back-references which are in initial states can epsilon transit,
    998      since in this case all of the subexpressions can be null.
    999      Then we add epsilon closures of the nodes which are the next nodes of
   1000      the back-references.  */
   1001   if (dfa->nbackref > 0)
   1002     for (i = 0; i < init_nodes.nelem; ++i)
   1003       {
   1004 	Idx node_idx = init_nodes.elems[i];
   1005 	re_token_type_t type = dfa->nodes[node_idx].type;
   1006 
   1007 	Idx clexp_idx;
   1008 	if (type != OP_BACK_REF)
   1009 	  continue;
   1010 	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
   1011 	  {
   1012 	    re_token_t *clexp_node;
   1013 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
   1014 	    if (clexp_node->type == OP_CLOSE_SUBEXP
   1015 		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
   1016 	      break;
   1017 	  }
   1018 	if (clexp_idx == init_nodes.nelem)
   1019 	  continue;
   1020 
   1021 	if (type == OP_BACK_REF)
   1022 	  {
   1023 	    Idx dest_idx = dfa->edests[node_idx].elems[0];
   1024 	    if (!re_node_set_contains (&init_nodes, dest_idx))
   1025 	      {
   1026 		re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
   1027 		i = 0;
   1028 	      }
   1029 	  }
   1030       }
   1031 
   1032   /* It must be the first time to invoke acquire_state.  */
   1033   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
   1034   /* We don't check ERR here, since the initial state must not be NULL.  */
   1035   if (BE (dfa->init_state == NULL, 0))
   1036     return err;
   1037   if (dfa->init_state->has_constraint)
   1038     {
   1039       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
   1040 						       CONTEXT_WORD);
   1041       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
   1042 						     CONTEXT_NEWLINE);
   1043       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
   1044 							 &init_nodes,
   1045 							 CONTEXT_NEWLINE
   1046 							 | CONTEXT_BEGBUF);
   1047       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
   1048 	      || dfa->init_state_begbuf == NULL, 0))
   1049 	return err;
   1050     }
   1051   else
   1052     dfa->init_state_word = dfa->init_state_nl
   1053       = dfa->init_state_begbuf = dfa->init_state;
   1054 
   1055   re_node_set_free (&init_nodes);
   1056   return REG_NOERROR;
   1057 }
   1058 
   1059 #ifdef RE_ENABLE_I18N
   1061 /* If it is possible to do searching in single byte encoding instead of UTF-8
   1062    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
   1063    DFA nodes where needed.  */
   1064 
   1065 static void
   1066 optimize_utf8 (re_dfa_t *dfa)
   1067 {
   1068   Idx node;
   1069   int i;
   1070   bool mb_chars = false;
   1071   bool has_period = false;
   1072 
   1073   for (node = 0; node < dfa->nodes_len; ++node)
   1074     switch (dfa->nodes[node].type)
   1075       {
   1076       case CHARACTER:
   1077 	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
   1078 	  mb_chars = true;
   1079 	break;
   1080       case ANCHOR:
   1081 	switch (dfa->nodes[node].opr.ctx_type)
   1082 	  {
   1083 	  case LINE_FIRST:
   1084 	  case LINE_LAST:
   1085 	  case BUF_FIRST:
   1086 	  case BUF_LAST:
   1087 	    break;
   1088 	  default:
   1089 	    /* Word anchors etc. cannot be handled.  It's okay to test
   1090 	       opr.ctx_type since constraints (for all DFA nodes) are
   1091 	       created by ORing one or more opr.ctx_type values.  */
   1092 	    return;
   1093 	  }
   1094 	break;
   1095       case OP_PERIOD:
   1096         has_period = true;
   1097         break;
   1098       case OP_BACK_REF:
   1099       case OP_ALT:
   1100       case END_OF_RE:
   1101       case OP_DUP_ASTERISK:
   1102       case OP_OPEN_SUBEXP:
   1103       case OP_CLOSE_SUBEXP:
   1104 	break;
   1105       case COMPLEX_BRACKET:
   1106 	return;
   1107       case SIMPLE_BRACKET:
   1108 	/* Just double check.  */
   1109 	{
   1110 	  int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
   1111 			? 0
   1112 			: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
   1113 	  for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
   1114 	    {
   1115 	      if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
   1116 		return;
   1117 	      rshift = 0;
   1118 	    }
   1119 	}
   1120 	break;
   1121       default:
   1122 	abort ();
   1123       }
   1124 
   1125   if (mb_chars || has_period)
   1126     for (node = 0; node < dfa->nodes_len; ++node)
   1127       {
   1128 	if (dfa->nodes[node].type == CHARACTER
   1129 	    && dfa->nodes[node].opr.c >= ASCII_CHARS)
   1130 	  dfa->nodes[node].mb_partial = 0;
   1131 	else if (dfa->nodes[node].type == OP_PERIOD)
   1132 	  dfa->nodes[node].type = OP_UTF8_PERIOD;
   1133       }
   1134 
   1135   /* The search can be in single byte locale.  */
   1136   dfa->mb_cur_max = 1;
   1137   dfa->is_utf8 = 0;
   1138   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
   1139 }
   1140 #endif
   1141 
   1142 /* Analyze the structure tree, and calculate "first", "next", "edest",
   1144    "eclosure", and "inveclosure".  */
   1145 
   1146 static reg_errcode_t
   1147 analyze (regex_t *preg)
   1148 {
   1149   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
   1150   reg_errcode_t ret;
   1151 
   1152   /* Allocate arrays.  */
   1153   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
   1154   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
   1155   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
   1156   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
   1157   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
   1158 	  || dfa->eclosures == NULL, 0))
   1159     return REG_ESPACE;
   1160 
   1161   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
   1162   if (dfa->subexp_map != NULL)
   1163     {
   1164       Idx i;
   1165       for (i = 0; i < preg->re_nsub; i++)
   1166 	dfa->subexp_map[i] = i;
   1167       preorder (dfa->str_tree, optimize_subexps, dfa);
   1168       for (i = 0; i < preg->re_nsub; i++)
   1169 	if (dfa->subexp_map[i] != i)
   1170 	  break;
   1171       if (i == preg->re_nsub)
   1172 	{
   1173 	  free (dfa->subexp_map);
   1174 	  dfa->subexp_map = NULL;
   1175 	}
   1176     }
   1177 
   1178   ret = postorder (dfa->str_tree, lower_subexps, preg);
   1179   if (BE (ret != REG_NOERROR, 0))
   1180     return ret;
   1181   ret = postorder (dfa->str_tree, calc_first, dfa);
   1182   if (BE (ret != REG_NOERROR, 0))
   1183     return ret;
   1184   preorder (dfa->str_tree, calc_next, dfa);
   1185   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
   1186   if (BE (ret != REG_NOERROR, 0))
   1187     return ret;
   1188   ret = calc_eclosure (dfa);
   1189   if (BE (ret != REG_NOERROR, 0))
   1190     return ret;
   1191 
   1192   /* We only need this during the prune_impossible_nodes pass in regexec.c;
   1193      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
   1194   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
   1195       || dfa->nbackref)
   1196     {
   1197       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
   1198       if (BE (dfa->inveclosures == NULL, 0))
   1199         return REG_ESPACE;
   1200       ret = calc_inveclosure (dfa);
   1201     }
   1202 
   1203   return ret;
   1204 }
   1205 
   1206 /* Our parse trees are very unbalanced, so we cannot use a stack to
   1207    implement parse tree visits.  Instead, we use parent pointers and
   1208    some hairy code in these two functions.  */
   1209 static reg_errcode_t
   1210 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
   1211 	   void *extra)
   1212 {
   1213   bin_tree_t *node, *prev;
   1214 
   1215   for (node = root; ; )
   1216     {
   1217       /* Descend down the tree, preferably to the left (or to the right
   1218 	 if that's the only child).  */
   1219       while (node->left || node->right)
   1220 	if (node->left)
   1221           node = node->left;
   1222         else
   1223           node = node->right;
   1224 
   1225       do
   1226 	{
   1227 	  reg_errcode_t err = fn (extra, node);
   1228 	  if (BE (err != REG_NOERROR, 0))
   1229 	    return err;
   1230           if (node->parent == NULL)
   1231 	    return REG_NOERROR;
   1232 	  prev = node;
   1233 	  node = node->parent;
   1234 	}
   1235       /* Go up while we have a node that is reached from the right.  */
   1236       while (node->right == prev || node->right == NULL);
   1237       node = node->right;
   1238     }
   1239 }
   1240 
   1241 static reg_errcode_t
   1242 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
   1243 	  void *extra)
   1244 {
   1245   bin_tree_t *node;
   1246 
   1247   for (node = root; ; )
   1248     {
   1249       reg_errcode_t err = fn (extra, node);
   1250       if (BE (err != REG_NOERROR, 0))
   1251 	return err;
   1252 
   1253       /* Go to the left node, or up and to the right.  */
   1254       if (node->left)
   1255 	node = node->left;
   1256       else
   1257 	{
   1258 	  bin_tree_t *prev = NULL;
   1259 	  while (node->right == prev || node->right == NULL)
   1260 	    {
   1261 	      prev = node;
   1262 	      node = node->parent;
   1263 	      if (!node)
   1264 	        return REG_NOERROR;
   1265 	    }
   1266 	  node = node->right;
   1267 	}
   1268     }
   1269 }
   1270 
   1271 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
   1272    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
   1273    backreferences as well.  Requires a preorder visit.  */
   1274 static reg_errcode_t
   1275 optimize_subexps (void *extra, bin_tree_t *node)
   1276 {
   1277   re_dfa_t *dfa = (re_dfa_t *) extra;
   1278 
   1279   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
   1280     {
   1281       int idx = node->token.opr.idx;
   1282       node->token.opr.idx = dfa->subexp_map[idx];
   1283       dfa->used_bkref_map |= 1 << node->token.opr.idx;
   1284     }
   1285 
   1286   else if (node->token.type == SUBEXP
   1287            && node->left && node->left->token.type == SUBEXP)
   1288     {
   1289       Idx other_idx = node->left->token.opr.idx;
   1290 
   1291       node->left = node->left->left;
   1292       if (node->left)
   1293         node->left->parent = node;
   1294 
   1295       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
   1296       if (other_idx < BITSET_WORD_BITS)
   1297 	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
   1298     }
   1299 
   1300   return REG_NOERROR;
   1301 }
   1302 
   1303 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
   1304    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
   1305 static reg_errcode_t
   1306 lower_subexps (void *extra, bin_tree_t *node)
   1307 {
   1308   regex_t *preg = (regex_t *) extra;
   1309   reg_errcode_t err = REG_NOERROR;
   1310 
   1311   if (node->left && node->left->token.type == SUBEXP)
   1312     {
   1313       node->left = lower_subexp (&err, preg, node->left);
   1314       if (node->left)
   1315 	node->left->parent = node;
   1316     }
   1317   if (node->right && node->right->token.type == SUBEXP)
   1318     {
   1319       node->right = lower_subexp (&err, preg, node->right);
   1320       if (node->right)
   1321 	node->right->parent = node;
   1322     }
   1323 
   1324   return err;
   1325 }
   1326 
   1327 static bin_tree_t *
   1328 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
   1329 {
   1330   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
   1331   bin_tree_t *body = node->left;
   1332   bin_tree_t *op, *cls, *tree1, *tree;
   1333 
   1334   if (preg->no_sub
   1335       /* We do not optimize empty subexpressions, because otherwise we may
   1336 	 have bad CONCAT nodes with NULL children.  This is obviously not
   1337 	 very common, so we do not lose much.  An example that triggers
   1338 	 this case is the sed "script" /\(\)/x.  */
   1339       && node->left != NULL
   1340       && (node->token.opr.idx >= BITSET_WORD_BITS
   1341 	  || !(dfa->used_bkref_map
   1342 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
   1343     return node->left;
   1344 
   1345   /* Convert the SUBEXP node to the concatenation of an
   1346      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
   1347   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
   1348   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
   1349   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
   1350   tree = create_tree (dfa, op, tree1, CONCAT);
   1351   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
   1352     {
   1353       *err = REG_ESPACE;
   1354       return NULL;
   1355     }
   1356 
   1357   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
   1358   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
   1359   return tree;
   1360 }
   1361 
   1362 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
   1363    nodes.  Requires a postorder visit.  */
   1364 static reg_errcode_t
   1365 calc_first (void *extra, bin_tree_t *node)
   1366 {
   1367   re_dfa_t *dfa = (re_dfa_t *) extra;
   1368   if (node->token.type == CONCAT)
   1369     {
   1370       node->first = node->left->first;
   1371       node->node_idx = node->left->node_idx;
   1372     }
   1373   else
   1374     {
   1375       node->first = node;
   1376       node->node_idx = re_dfa_add_node (dfa, node->token);
   1377       if (BE (node->node_idx == REG_MISSING, 0))
   1378         return REG_ESPACE;
   1379       if (node->token.type == ANCHOR)
   1380         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
   1381     }
   1382   return REG_NOERROR;
   1383 }
   1384 
   1385 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
   1386 static reg_errcode_t
   1387 calc_next (void *extra, bin_tree_t *node)
   1388 {
   1389   switch (node->token.type)
   1390     {
   1391     case OP_DUP_ASTERISK:
   1392       node->left->next = node;
   1393       break;
   1394     case CONCAT:
   1395       node->left->next = node->right->first;
   1396       node->right->next = node->next;
   1397       break;
   1398     default:
   1399       if (node->left)
   1400 	node->left->next = node->next;
   1401       if (node->right)
   1402         node->right->next = node->next;
   1403       break;
   1404     }
   1405   return REG_NOERROR;
   1406 }
   1407 
   1408 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
   1409 static reg_errcode_t
   1410 link_nfa_nodes (void *extra, bin_tree_t *node)
   1411 {
   1412   re_dfa_t *dfa = (re_dfa_t *) extra;
   1413   Idx idx = node->node_idx;
   1414   reg_errcode_t err = REG_NOERROR;
   1415 
   1416   switch (node->token.type)
   1417     {
   1418     case CONCAT:
   1419       break;
   1420 
   1421     case END_OF_RE:
   1422       assert (node->next == NULL);
   1423       break;
   1424 
   1425     case OP_DUP_ASTERISK:
   1426     case OP_ALT:
   1427       {
   1428 	Idx left, right;
   1429 	dfa->has_plural_match = 1;
   1430 	if (node->left != NULL)
   1431 	  left = node->left->first->node_idx;
   1432 	else
   1433 	  left = node->next->node_idx;
   1434 	if (node->right != NULL)
   1435 	  right = node->right->first->node_idx;
   1436 	else
   1437 	  right = node->next->node_idx;
   1438 	assert (REG_VALID_INDEX (left));
   1439 	assert (REG_VALID_INDEX (right));
   1440 	err = re_node_set_init_2 (dfa->edests + idx, left, right);
   1441       }
   1442       break;
   1443 
   1444     case ANCHOR:
   1445     case OP_OPEN_SUBEXP:
   1446     case OP_CLOSE_SUBEXP:
   1447       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
   1448       break;
   1449 
   1450     case OP_BACK_REF:
   1451       dfa->nexts[idx] = node->next->node_idx;
   1452       if (node->token.type == OP_BACK_REF)
   1453 	re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
   1454       break;
   1455 
   1456     default:
   1457       assert (!IS_EPSILON_NODE (node->token.type));
   1458       dfa->nexts[idx] = node->next->node_idx;
   1459       break;
   1460     }
   1461 
   1462   return err;
   1463 }
   1464 
   1465 /* Duplicate the epsilon closure of the node ROOT_NODE.
   1466    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
   1467    to their own constraint.  */
   1468 
   1469 static reg_errcode_t
   1470 internal_function
   1471 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
   1472 			Idx root_node, unsigned int init_constraint)
   1473 {
   1474   Idx org_node, clone_node;
   1475   bool ok;
   1476   unsigned int constraint = init_constraint;
   1477   for (org_node = top_org_node, clone_node = top_clone_node;;)
   1478     {
   1479       Idx org_dest, clone_dest;
   1480       if (dfa->nodes[org_node].type == OP_BACK_REF)
   1481 	{
   1482 	  /* If the back reference epsilon-transit, its destination must
   1483 	     also have the constraint.  Then duplicate the epsilon closure
   1484 	     of the destination of the back reference, and store it in
   1485 	     edests of the back reference.  */
   1486 	  org_dest = dfa->nexts[org_node];
   1487 	  re_node_set_empty (dfa->edests + clone_node);
   1488 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
   1489 	  if (BE (clone_dest == REG_MISSING, 0))
   1490 	    return REG_ESPACE;
   1491 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
   1492 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
   1493 	  if (BE (! ok, 0))
   1494 	    return REG_ESPACE;
   1495 	}
   1496       else if (dfa->edests[org_node].nelem == 0)
   1497 	{
   1498 	  /* In case of the node can't epsilon-transit, don't duplicate the
   1499 	     destination and store the original destination as the
   1500 	     destination of the node.  */
   1501 	  dfa->nexts[clone_node] = dfa->nexts[org_node];
   1502 	  break;
   1503 	}
   1504       else if (dfa->edests[org_node].nelem == 1)
   1505 	{
   1506 	  /* In case of the node can epsilon-transit, and it has only one
   1507 	     destination.  */
   1508 	  org_dest = dfa->edests[org_node].elems[0];
   1509 	  re_node_set_empty (dfa->edests + clone_node);
   1510 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
   1511 	  /* If the node is root_node itself, it means the epsilon closure
   1512 	     has a loop.  Then tie it to the destination of the root_node.  */
   1513 	  if (org_node == root_node && clone_node != org_node)
   1514 	    {
   1515 	      ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
   1516 	      if (BE (! ok, 0))
   1517 	        return REG_ESPACE;
   1518 	      break;
   1519 	    }
   1520 	  /* In case the node has another constraint, append it.  */
   1521 	  constraint |= dfa->nodes[org_node].constraint;
   1522 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
   1523 	  if (BE (clone_dest == REG_MISSING, 0))
   1524 	    return REG_ESPACE;
   1525 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
   1526 	  if (BE (! ok, 0))
   1527 	    return REG_ESPACE;
   1528 	}
   1529       else /* dfa->edests[org_node].nelem == 2 */
   1530 	{
   1531 	  /* In case of the node can epsilon-transit, and it has two
   1532 	     destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
   1533 	  org_dest = dfa->edests[org_node].elems[0];
   1534 	  re_node_set_empty (dfa->edests + clone_node);
   1535 	  /* Search for a duplicated node which satisfies the constraint.  */
   1536 	  clone_dest = search_duplicated_node (dfa, org_dest, constraint);
   1537 	  if (clone_dest == REG_MISSING)
   1538 	    {
   1539 	      /* There is no such duplicated node, create a new one.  */
   1540 	      reg_errcode_t err;
   1541 	      clone_dest = duplicate_node (dfa, org_dest, constraint);
   1542 	      if (BE (clone_dest == REG_MISSING, 0))
   1543 		return REG_ESPACE;
   1544 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
   1545 	      if (BE (! ok, 0))
   1546 		return REG_ESPACE;
   1547 	      err = duplicate_node_closure (dfa, org_dest, clone_dest,
   1548 					    root_node, constraint);
   1549 	      if (BE (err != REG_NOERROR, 0))
   1550 		return err;
   1551 	    }
   1552 	  else
   1553 	    {
   1554 	      /* There is a duplicated node which satisfy the constraint,
   1555 		 use it to avoid infinite loop.  */
   1556 	      ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
   1557 	      if (BE (! ok, 0))
   1558 		return REG_ESPACE;
   1559 	    }
   1560 
   1561 	  org_dest = dfa->edests[org_node].elems[1];
   1562 	  clone_dest = duplicate_node (dfa, org_dest, constraint);
   1563 	  if (BE (clone_dest == REG_MISSING, 0))
   1564 	    return REG_ESPACE;
   1565 	  ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
   1566 	  if (BE (! ok, 0))
   1567 	    return REG_ESPACE;
   1568 	}
   1569       org_node = org_dest;
   1570       clone_node = clone_dest;
   1571     }
   1572   return REG_NOERROR;
   1573 }
   1574 
   1575 /* Search for a node which is duplicated from the node ORG_NODE, and
   1576    satisfies the constraint CONSTRAINT.  */
   1577 
   1578 static Idx
   1579 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
   1580 			unsigned int constraint)
   1581 {
   1582   Idx idx;
   1583   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
   1584     {
   1585       if (org_node == dfa->org_indices[idx]
   1586 	  && constraint == dfa->nodes[idx].constraint)
   1587 	return idx; /* Found.  */
   1588     }
   1589   return REG_MISSING; /* Not found.  */
   1590 }
   1591 
   1592 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
   1593    Return the index of the new node, or REG_MISSING if insufficient storage is
   1594    available.  */
   1595 
   1596 static Idx
   1597 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
   1598 {
   1599   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
   1600   if (BE (dup_idx != REG_MISSING, 1))
   1601     {
   1602       dfa->nodes[dup_idx].constraint = constraint;
   1603       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
   1604       dfa->nodes[dup_idx].duplicated = 1;
   1605 
   1606       /* Store the index of the original node.  */
   1607       dfa->org_indices[dup_idx] = org_idx;
   1608     }
   1609   return dup_idx;
   1610 }
   1611 
   1612 static reg_errcode_t
   1613 calc_inveclosure (re_dfa_t *dfa)
   1614 {
   1615   Idx src, idx;
   1616   bool ok;
   1617   for (idx = 0; idx < dfa->nodes_len; ++idx)
   1618     re_node_set_init_empty (dfa->inveclosures + idx);
   1619 
   1620   for (src = 0; src < dfa->nodes_len; ++src)
   1621     {
   1622       Idx *elems = dfa->eclosures[src].elems;
   1623       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
   1624 	{
   1625 	  ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
   1626 	  if (BE (! ok, 0))
   1627 	    return REG_ESPACE;
   1628 	}
   1629     }
   1630 
   1631   return REG_NOERROR;
   1632 }
   1633 
   1634 /* Calculate "eclosure" for all the node in DFA.  */
   1635 
   1636 static reg_errcode_t
   1637 calc_eclosure (re_dfa_t *dfa)
   1638 {
   1639   Idx node_idx;
   1640   bool incomplete;
   1641 #ifdef DEBUG
   1642   assert (dfa->nodes_len > 0);
   1643 #endif
   1644   incomplete = false;
   1645   /* For each nodes, calculate epsilon closure.  */
   1646   for (node_idx = 0; ; ++node_idx)
   1647     {
   1648       reg_errcode_t err;
   1649       re_node_set eclosure_elem;
   1650       if (node_idx == dfa->nodes_len)
   1651 	{
   1652 	  if (!incomplete)
   1653 	    break;
   1654 	  incomplete = false;
   1655 	  node_idx = 0;
   1656 	}
   1657 
   1658 #ifdef DEBUG
   1659       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
   1660 #endif
   1661 
   1662       /* If we have already calculated, skip it.  */
   1663       if (dfa->eclosures[node_idx].nelem != 0)
   1664 	continue;
   1665       /* Calculate epsilon closure of `node_idx'.  */
   1666       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
   1667       if (BE (err != REG_NOERROR, 0))
   1668 	return err;
   1669 
   1670       if (dfa->eclosures[node_idx].nelem == 0)
   1671 	{
   1672 	  incomplete = true;
   1673 	  re_node_set_free (&eclosure_elem);
   1674 	}
   1675     }
   1676   return REG_NOERROR;
   1677 }
   1678 
   1679 /* Calculate epsilon closure of NODE.  */
   1680 
   1681 static reg_errcode_t
   1682 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
   1683 {
   1684   reg_errcode_t err;
   1685   Idx i;
   1686   bool incomplete;
   1687   bool ok;
   1688   re_node_set eclosure;
   1689   incomplete = false;
   1690   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
   1691   if (BE (err != REG_NOERROR, 0))
   1692     return err;
   1693 
   1694   /* This indicates that we are calculating this node now.
   1695      We reference this value to avoid infinite loop.  */
   1696   dfa->eclosures[node].nelem = REG_MISSING;
   1697 
   1698   /* If the current node has constraints, duplicate all nodes
   1699      since they must inherit the constraints.  */
   1700   if (dfa->nodes[node].constraint
   1701       && dfa->edests[node].nelem
   1702       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
   1703     {
   1704       err = duplicate_node_closure (dfa, node, node, node,
   1705 				    dfa->nodes[node].constraint);
   1706       if (BE (err != REG_NOERROR, 0))
   1707 	return err;
   1708     }
   1709 
   1710   /* Expand each epsilon destination nodes.  */
   1711   if (IS_EPSILON_NODE(dfa->nodes[node].type))
   1712     for (i = 0; i < dfa->edests[node].nelem; ++i)
   1713       {
   1714 	re_node_set eclosure_elem;
   1715 	Idx edest = dfa->edests[node].elems[i];
   1716 	/* If calculating the epsilon closure of `edest' is in progress,
   1717 	   return intermediate result.  */
   1718 	if (dfa->eclosures[edest].nelem == REG_MISSING)
   1719 	  {
   1720 	    incomplete = true;
   1721 	    continue;
   1722 	  }
   1723 	/* If we haven't calculated the epsilon closure of `edest' yet,
   1724 	   calculate now. Otherwise use calculated epsilon closure.  */
   1725 	if (dfa->eclosures[edest].nelem == 0)
   1726 	  {
   1727 	    err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
   1728 	    if (BE (err != REG_NOERROR, 0))
   1729 	      return err;
   1730 	  }
   1731 	else
   1732 	  eclosure_elem = dfa->eclosures[edest];
   1733 	/* Merge the epsilon closure of `edest'.  */
   1734 	re_node_set_merge (&eclosure, &eclosure_elem);
   1735 	/* If the epsilon closure of `edest' is incomplete,
   1736 	   the epsilon closure of this node is also incomplete.  */
   1737 	if (dfa->eclosures[edest].nelem == 0)
   1738 	  {
   1739 	    incomplete = true;
   1740 	    re_node_set_free (&eclosure_elem);
   1741 	  }
   1742       }
   1743 
   1744   /* Epsilon closures include itself.  */
   1745   ok = re_node_set_insert (&eclosure, node);
   1746   if (BE (! ok, 0))
   1747     return REG_ESPACE;
   1748   if (incomplete && !root)
   1749     dfa->eclosures[node].nelem = 0;
   1750   else
   1751     dfa->eclosures[node] = eclosure;
   1752   *new_set = eclosure;
   1753   return REG_NOERROR;
   1754 }
   1755 
   1756 /* Functions for token which are used in the parser.  */
   1758 
   1759 /* Fetch a token from INPUT.
   1760    We must not use this function inside bracket expressions.  */
   1761 
   1762 static void
   1763 internal_function
   1764 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
   1765 {
   1766   re_string_skip_bytes (input, peek_token (result, input, syntax));
   1767 }
   1768 
   1769 /* Peek a token from INPUT, and return the length of the token.
   1770    We must not use this function inside bracket expressions.  */
   1771 
   1772 static int
   1773 internal_function
   1774 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
   1775 {
   1776   unsigned char c;
   1777 
   1778   if (re_string_eoi (input))
   1779     {
   1780       token->type = END_OF_RE;
   1781       return 0;
   1782     }
   1783 
   1784   c = re_string_peek_byte (input, 0);
   1785   token->opr.c = c;
   1786 
   1787   token->word_char = 0;
   1788 #ifdef RE_ENABLE_I18N
   1789   token->mb_partial = 0;
   1790   if (input->mb_cur_max > 1 &&
   1791       !re_string_first_byte (input, re_string_cur_idx (input)))
   1792     {
   1793       token->type = CHARACTER;
   1794       token->mb_partial = 1;
   1795       return 1;
   1796     }
   1797 #endif
   1798   if (c == '\\')
   1799     {
   1800       unsigned char c2;
   1801       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
   1802 	{
   1803 	  token->type = BACK_SLASH;
   1804 	  return 1;
   1805 	}
   1806 
   1807       c2 = re_string_peek_byte_case (input, 1);
   1808       token->opr.c = c2;
   1809       token->type = CHARACTER;
   1810 #ifdef RE_ENABLE_I18N
   1811       if (input->mb_cur_max > 1)
   1812 	{
   1813 	  wint_t wc = re_string_wchar_at (input,
   1814 					  re_string_cur_idx (input) + 1);
   1815 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
   1816 	}
   1817       else
   1818 #endif
   1819 	token->word_char = IS_WORD_CHAR (c2) != 0;
   1820 
   1821       switch (c2)
   1822 	{
   1823 	case '|':
   1824 	  if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
   1825 	    token->type = OP_ALT;
   1826 	  break;
   1827 	case '1': case '2': case '3': case '4': case '5':
   1828 	case '6': case '7': case '8': case '9':
   1829 	  if (!(syntax & RE_NO_BK_REFS))
   1830 	    {
   1831 	      token->type = OP_BACK_REF;
   1832 	      token->opr.idx = c2 - '1';
   1833 	    }
   1834 	  break;
   1835 	case '<':
   1836 	  if (!(syntax & RE_NO_GNU_OPS))
   1837 	    {
   1838 	      token->type = ANCHOR;
   1839 	      token->opr.ctx_type = WORD_FIRST;
   1840 	    }
   1841 	  break;
   1842 	case '>':
   1843 	  if (!(syntax & RE_NO_GNU_OPS))
   1844 	    {
   1845 	      token->type = ANCHOR;
   1846 	      token->opr.ctx_type = WORD_LAST;
   1847 	    }
   1848 	  break;
   1849 	case 'b':
   1850 	  if (!(syntax & RE_NO_GNU_OPS))
   1851 	    {
   1852 	      token->type = ANCHOR;
   1853 	      token->opr.ctx_type = WORD_DELIM;
   1854 	    }
   1855 	  break;
   1856 	case 'B':
   1857 	  if (!(syntax & RE_NO_GNU_OPS))
   1858 	    {
   1859 	      token->type = ANCHOR;
   1860 	      token->opr.ctx_type = NOT_WORD_DELIM;
   1861 	    }
   1862 	  break;
   1863 	case 'w':
   1864 	  if (!(syntax & RE_NO_GNU_OPS))
   1865 	    token->type = OP_WORD;
   1866 	  break;
   1867 	case 'W':
   1868 	  if (!(syntax & RE_NO_GNU_OPS))
   1869 	    token->type = OP_NOTWORD;
   1870 	  break;
   1871 	case 's':
   1872 	  if (!(syntax & RE_NO_GNU_OPS))
   1873 	    token->type = OP_SPACE;
   1874 	  break;
   1875 	case 'S':
   1876 	  if (!(syntax & RE_NO_GNU_OPS))
   1877 	    token->type = OP_NOTSPACE;
   1878 	  break;
   1879 	case '`':
   1880 	  if (!(syntax & RE_NO_GNU_OPS))
   1881 	    {
   1882 	      token->type = ANCHOR;
   1883 	      token->opr.ctx_type = BUF_FIRST;
   1884 	    }
   1885 	  break;
   1886 	case '\'':
   1887 	  if (!(syntax & RE_NO_GNU_OPS))
   1888 	    {
   1889 	      token->type = ANCHOR;
   1890 	      token->opr.ctx_type = BUF_LAST;
   1891 	    }
   1892 	  break;
   1893 	case '(':
   1894 	  if (!(syntax & RE_NO_BK_PARENS))
   1895 	    token->type = OP_OPEN_SUBEXP;
   1896 	  break;
   1897 	case ')':
   1898 	  if (!(syntax & RE_NO_BK_PARENS))
   1899 	    token->type = OP_CLOSE_SUBEXP;
   1900 	  break;
   1901 	case '+':
   1902 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
   1903 	    token->type = OP_DUP_PLUS;
   1904 	  break;
   1905 	case '?':
   1906 	  if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
   1907 	    token->type = OP_DUP_QUESTION;
   1908 	  break;
   1909 	case '{':
   1910 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
   1911 	    token->type = OP_OPEN_DUP_NUM;
   1912 	  break;
   1913 	case '}':
   1914 	  if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
   1915 	    token->type = OP_CLOSE_DUP_NUM;
   1916 	  break;
   1917 	default:
   1918 	  break;
   1919 	}
   1920       return 2;
   1921     }
   1922 
   1923   token->type = CHARACTER;
   1924 #ifdef RE_ENABLE_I18N
   1925   if (input->mb_cur_max > 1)
   1926     {
   1927       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
   1928       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
   1929     }
   1930   else
   1931 #endif
   1932     token->word_char = IS_WORD_CHAR (token->opr.c);
   1933 
   1934   switch (c)
   1935     {
   1936     case '\n':
   1937       if (syntax & RE_NEWLINE_ALT)
   1938 	token->type = OP_ALT;
   1939       break;
   1940     case '|':
   1941       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
   1942 	token->type = OP_ALT;
   1943       break;
   1944     case '*':
   1945       token->type = OP_DUP_ASTERISK;
   1946       break;
   1947     case '+':
   1948       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
   1949 	token->type = OP_DUP_PLUS;
   1950       break;
   1951     case '?':
   1952       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
   1953 	token->type = OP_DUP_QUESTION;
   1954       break;
   1955     case '{':
   1956       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
   1957 	token->type = OP_OPEN_DUP_NUM;
   1958       break;
   1959     case '}':
   1960       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
   1961 	token->type = OP_CLOSE_DUP_NUM;
   1962       break;
   1963     case '(':
   1964       if (syntax & RE_NO_BK_PARENS)
   1965 	token->type = OP_OPEN_SUBEXP;
   1966       break;
   1967     case ')':
   1968       if (syntax & RE_NO_BK_PARENS)
   1969 	token->type = OP_CLOSE_SUBEXP;
   1970       break;
   1971     case '[':
   1972       token->type = OP_OPEN_BRACKET;
   1973       break;
   1974     case '.':
   1975       token->type = OP_PERIOD;
   1976       break;
   1977     case '^':
   1978       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
   1979 	  re_string_cur_idx (input) != 0)
   1980 	{
   1981 	  char prev = re_string_peek_byte (input, -1);
   1982 	  if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
   1983 	    break;
   1984 	}
   1985       token->type = ANCHOR;
   1986       token->opr.ctx_type = LINE_FIRST;
   1987       break;
   1988     case '$':
   1989       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
   1990 	  re_string_cur_idx (input) + 1 != re_string_length (input))
   1991 	{
   1992 	  re_token_t next;
   1993 	  re_string_skip_bytes (input, 1);
   1994 	  peek_token (&next, input, syntax);
   1995 	  re_string_skip_bytes (input, -1);
   1996 	  if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
   1997 	    break;
   1998 	}
   1999       token->type = ANCHOR;
   2000       token->opr.ctx_type = LINE_LAST;
   2001       break;
   2002     default:
   2003       break;
   2004     }
   2005   return 1;
   2006 }
   2007 
   2008 /* Peek a token from INPUT, and return the length of the token.
   2009    We must not use this function out of bracket expressions.  */
   2010 
   2011 static int
   2012 internal_function
   2013 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
   2014 {
   2015   unsigned char c;
   2016   if (re_string_eoi (input))
   2017     {
   2018       token->type = END_OF_RE;
   2019       return 0;
   2020     }
   2021   c = re_string_peek_byte (input, 0);
   2022   token->opr.c = c;
   2023 
   2024 #ifdef RE_ENABLE_I18N
   2025   if (input->mb_cur_max > 1 &&
   2026       !re_string_first_byte (input, re_string_cur_idx (input)))
   2027     {
   2028       token->type = CHARACTER;
   2029       return 1;
   2030     }
   2031 #endif /* RE_ENABLE_I18N */
   2032 
   2033   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
   2034       && re_string_cur_idx (input) + 1 < re_string_length (input))
   2035     {
   2036       /* In this case, '\' escape a character.  */
   2037       unsigned char c2;
   2038       re_string_skip_bytes (input, 1);
   2039       c2 = re_string_peek_byte (input, 0);
   2040       token->opr.c = c2;
   2041       token->type = CHARACTER;
   2042       return 1;
   2043     }
   2044   if (c == '[') /* '[' is a special char in a bracket exps.  */
   2045     {
   2046       unsigned char c2;
   2047       int token_len;
   2048       if (re_string_cur_idx (input) + 1 < re_string_length (input))
   2049 	c2 = re_string_peek_byte (input, 1);
   2050       else
   2051 	c2 = 0;
   2052       token->opr.c = c2;
   2053       token_len = 2;
   2054       switch (c2)
   2055 	{
   2056 	case '.':
   2057 	  token->type = OP_OPEN_COLL_ELEM;
   2058 	  break;
   2059 	case '=':
   2060 	  token->type = OP_OPEN_EQUIV_CLASS;
   2061 	  break;
   2062 	case ':':
   2063 	  if (syntax & RE_CHAR_CLASSES)
   2064 	    {
   2065 	      token->type = OP_OPEN_CHAR_CLASS;
   2066 	      break;
   2067 	    }
   2068 	  /* else fall through.  */
   2069 	default:
   2070 	  token->type = CHARACTER;
   2071 	  token->opr.c = c;
   2072 	  token_len = 1;
   2073 	  break;
   2074 	}
   2075       return token_len;
   2076     }
   2077   switch (c)
   2078     {
   2079     case '-':
   2080       token->type = OP_CHARSET_RANGE;
   2081       break;
   2082     case ']':
   2083       token->type = OP_CLOSE_BRACKET;
   2084       break;
   2085     case '^':
   2086       token->type = OP_NON_MATCH_LIST;
   2087       break;
   2088     default:
   2089       token->type = CHARACTER;
   2090     }
   2091   return 1;
   2092 }
   2093 
   2094 /* Functions for parser.  */
   2096 
   2097 /* Entry point of the parser.
   2098    Parse the regular expression REGEXP and return the structure tree.
   2099    If an error is occured, ERR is set by error code, and return NULL.
   2100    This function build the following tree, from regular expression <reg_exp>:
   2101 	   CAT
   2102 	   / \
   2103 	  /   \
   2104    <reg_exp>  EOR
   2105 
   2106    CAT means concatenation.
   2107    EOR means end of regular expression.  */
   2108 
   2109 static bin_tree_t *
   2110 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
   2111        reg_errcode_t *err)
   2112 {
   2113   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
   2114   bin_tree_t *tree, *eor, *root;
   2115   re_token_t current_token;
   2116   dfa->syntax = syntax;
   2117   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
   2118   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
   2119   if (BE (*err != REG_NOERROR && tree == NULL, 0))
   2120     return NULL;
   2121   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
   2122   if (tree != NULL)
   2123     root = create_tree (dfa, tree, eor, CONCAT);
   2124   else
   2125     root = eor;
   2126   if (BE (eor == NULL || root == NULL, 0))
   2127     {
   2128       *err = REG_ESPACE;
   2129       return NULL;
   2130     }
   2131   return root;
   2132 }
   2133 
   2134 /* This function build the following tree, from regular expression
   2135    <branch1>|<branch2>:
   2136 	   ALT
   2137 	   / \
   2138 	  /   \
   2139    <branch1> <branch2>
   2140 
   2141    ALT means alternative, which represents the operator `|'.  */
   2142 
   2143 static bin_tree_t *
   2144 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
   2145 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
   2146 {
   2147   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
   2148   bin_tree_t *tree, *branch = NULL;
   2149   tree = parse_branch (regexp, preg, token, syntax, nest, err);
   2150   if (BE (*err != REG_NOERROR && tree == NULL, 0))
   2151     return NULL;
   2152 
   2153   while (token->type == OP_ALT)
   2154     {
   2155       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
   2156       if (token->type != OP_ALT && token->type != END_OF_RE
   2157 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
   2158 	{
   2159 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
   2160 	  if (BE (*err != REG_NOERROR && branch == NULL, 0))
   2161 	    return NULL;
   2162 	}
   2163       else
   2164 	branch = NULL;
   2165       tree = create_tree (dfa, tree, branch, OP_ALT);
   2166       if (BE (tree == NULL, 0))
   2167 	{
   2168 	  *err = REG_ESPACE;
   2169 	  return NULL;
   2170 	}
   2171     }
   2172   return tree;
   2173 }
   2174 
   2175 /* This function build the following tree, from regular expression
   2176    <exp1><exp2>:
   2177 	CAT
   2178 	/ \
   2179        /   \
   2180    <exp1> <exp2>
   2181 
   2182    CAT means concatenation.  */
   2183 
   2184 static bin_tree_t *
   2185 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
   2186 	      reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
   2187 {
   2188   bin_tree_t *tree, *expr;
   2189   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
   2190   tree = parse_expression (regexp, preg, token, syntax, nest, err);
   2191   if (BE (*err != REG_NOERROR && tree == NULL, 0))
   2192     return NULL;
   2193 
   2194   while (token->type != OP_ALT && token->type != END_OF_RE
   2195 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
   2196     {
   2197       expr = parse_expression (regexp, preg, token, syntax, nest, err);
   2198       if (BE (*err != REG_NOERROR && expr == NULL, 0))
   2199 	{
   2200 	  return NULL;
   2201 	}
   2202       if (tree != NULL && expr != NULL)
   2203 	{
   2204 	  tree = create_tree (dfa, tree, expr, CONCAT);
   2205 	  if (tree == NULL)
   2206 	    {
   2207 	      *err = REG_ESPACE;
   2208 	      return NULL;
   2209 	    }
   2210 	}
   2211       else if (tree == NULL)
   2212 	tree = expr;
   2213       /* Otherwise expr == NULL, we don't need to create new tree.  */
   2214     }
   2215   return tree;
   2216 }
   2217 
   2218 /* This function build the following tree, from regular expression a*:
   2219 	 *
   2220 	 |
   2221 	 a
   2222 */
   2223 
   2224 static bin_tree_t *
   2225 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
   2226 		  reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
   2227 {
   2228   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
   2229   bin_tree_t *tree;
   2230   switch (token->type)
   2231     {
   2232     case CHARACTER:
   2233       tree = create_token_tree (dfa, NULL, NULL, token);
   2234       if (BE (tree == NULL, 0))
   2235 	{
   2236 	  *err = REG_ESPACE;
   2237 	  return NULL;
   2238 	}
   2239 #ifdef RE_ENABLE_I18N
   2240       if (dfa->mb_cur_max > 1)
   2241 	{
   2242 	  while (!re_string_eoi (regexp)
   2243 		 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
   2244 	    {
   2245 	      bin_tree_t *mbc_remain;
   2246 	      fetch_token (token, regexp, syntax);
   2247 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
   2248 	      tree = create_tree (dfa, tree, mbc_remain, CONCAT);
   2249 	      if (BE (mbc_remain == NULL || tree == NULL, 0))
   2250 		{
   2251 		  *err = REG_ESPACE;
   2252 		  return NULL;
   2253 		}
   2254 	    }
   2255 	}
   2256 #endif
   2257       break;
   2258     case OP_OPEN_SUBEXP:
   2259       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
   2260       if (BE (*err != REG_NOERROR && tree == NULL, 0))
   2261 	return NULL;
   2262       break;
   2263     case OP_OPEN_BRACKET:
   2264       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
   2265       if (BE (*err != REG_NOERROR && tree == NULL, 0))
   2266 	return NULL;
   2267       break;
   2268     case OP_BACK_REF:
   2269       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
   2270 	{
   2271 	  *err = REG_ESUBREG;
   2272 	  return NULL;
   2273 	}
   2274       dfa->used_bkref_map |= 1 << token->opr.idx;
   2275       tree = create_token_tree (dfa, NULL, NULL, token);
   2276       if (BE (tree == NULL, 0))
   2277 	{
   2278 	  *err = REG_ESPACE;
   2279 	  return NULL;
   2280 	}
   2281       ++dfa->nbackref;
   2282       dfa->has_mb_node = 1;
   2283       break;
   2284     case OP_OPEN_DUP_NUM:
   2285       if (syntax & RE_CONTEXT_INVALID_DUP)
   2286 	{
   2287 	  *err = REG_BADRPT;
   2288 	  return NULL;
   2289 	}
   2290       /* FALLTHROUGH */
   2291     case OP_DUP_ASTERISK:
   2292     case OP_DUP_PLUS:
   2293     case OP_DUP_QUESTION:
   2294       if (syntax & RE_CONTEXT_INVALID_OPS)
   2295 	{
   2296 	  *err = REG_BADRPT;
   2297 	  return NULL;
   2298 	}
   2299       else if (syntax & RE_CONTEXT_INDEP_OPS)
   2300 	{
   2301 	  fetch_token (token, regexp, syntax);
   2302 	  return parse_expression (regexp, preg, token, syntax, nest, err);
   2303 	}
   2304       /* else fall through  */
   2305     case OP_CLOSE_SUBEXP:
   2306       if ((token->type == OP_CLOSE_SUBEXP) &&
   2307 	  !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
   2308 	{
   2309 	  *err = REG_ERPAREN;
   2310 	  return NULL;
   2311 	}
   2312       /* else fall through  */
   2313     case OP_CLOSE_DUP_NUM:
   2314       /* We treat it as a normal character.  */
   2315 
   2316       /* Then we can these characters as normal characters.  */
   2317       token->type = CHARACTER;
   2318       /* mb_partial and word_char bits should be initialized already
   2319 	 by peek_token.  */
   2320       tree = create_token_tree (dfa, NULL, NULL, token);
   2321       if (BE (tree == NULL, 0))
   2322 	{
   2323 	  *err = REG_ESPACE;
   2324 	  return NULL;
   2325 	}
   2326       break;
   2327     case ANCHOR:
   2328       if ((token->opr.ctx_type
   2329 	   & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
   2330 	  && dfa->word_ops_used == 0)
   2331 	init_word_char (dfa);
   2332       if (token->opr.ctx_type == WORD_DELIM
   2333           || token->opr.ctx_type == NOT_WORD_DELIM)
   2334 	{
   2335 	  bin_tree_t *tree_first, *tree_last;
   2336 	  if (token->opr.ctx_type == WORD_DELIM)
   2337 	    {
   2338 	      token->opr.ctx_type = WORD_FIRST;
   2339 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
   2340 	      token->opr.ctx_type = WORD_LAST;
   2341             }
   2342           else
   2343             {
   2344 	      token->opr.ctx_type = INSIDE_WORD;
   2345 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
   2346 	      token->opr.ctx_type = INSIDE_NOTWORD;
   2347             }
   2348 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
   2349 	  tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
   2350 	  if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
   2351 	    {
   2352 	      *err = REG_ESPACE;
   2353 	      return NULL;
   2354 	    }
   2355 	}
   2356       else
   2357 	{
   2358 	  tree = create_token_tree (dfa, NULL, NULL, token);
   2359 	  if (BE (tree == NULL, 0))
   2360 	    {
   2361 	      *err = REG_ESPACE;
   2362 	      return NULL;
   2363 	    }
   2364 	}
   2365       /* We must return here, since ANCHORs can't be followed
   2366 	 by repetition operators.
   2367 	 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
   2368 	     it must not be "<ANCHOR(^)><REPEAT(*)>".  */
   2369       fetch_token (token, regexp, syntax);
   2370       return tree;
   2371     case OP_PERIOD:
   2372       tree = create_token_tree (dfa, NULL, NULL, token);
   2373       if (BE (tree == NULL, 0))
   2374 	{
   2375 	  *err = REG_ESPACE;
   2376 	  return NULL;
   2377 	}
   2378       if (dfa->mb_cur_max > 1)
   2379 	dfa->has_mb_node = 1;
   2380       break;
   2381     case OP_WORD:
   2382     case OP_NOTWORD:
   2383       tree = build_charclass_op (dfa, regexp->trans,
   2384 				 (const unsigned char *) "alnum",
   2385 				 (const unsigned char *) "_",
   2386 				 token->type == OP_NOTWORD, err);
   2387       if (BE (*err != REG_NOERROR && tree == NULL, 0))
   2388 	return NULL;
   2389       break;
   2390     case OP_SPACE:
   2391     case OP_NOTSPACE:
   2392       tree = build_charclass_op (dfa, regexp->trans,
   2393 				 (const unsigned char *) "space",
   2394 				 (const unsigned char *) "",
   2395 				 token->type == OP_NOTSPACE, err);
   2396       if (BE (*err != REG_NOERROR && tree == NULL, 0))
   2397 	return NULL;
   2398       break;
   2399     case OP_ALT:
   2400     case END_OF_RE:
   2401       return NULL;
   2402     case BACK_SLASH:
   2403       *err = REG_EESCAPE;
   2404       return NULL;
   2405     default:
   2406       /* Must not happen?  */
   2407 #ifdef DEBUG
   2408       assert (0);
   2409 #endif
   2410       return NULL;
   2411     }
   2412   fetch_token (token, regexp, syntax);
   2413 
   2414   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
   2415 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
   2416     {
   2417       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
   2418       if (BE (*err != REG_NOERROR && tree == NULL, 0))
   2419 	return NULL;
   2420       /* In BRE consecutive duplications are not allowed.  */
   2421       if ((syntax & RE_CONTEXT_INVALID_DUP)
   2422 	  && (token->type == OP_DUP_ASTERISK
   2423 	      || token->type == OP_OPEN_DUP_NUM))
   2424 	{
   2425 	  *err = REG_BADRPT;
   2426 	  return NULL;
   2427 	}
   2428     }
   2429 
   2430   return tree;
   2431 }
   2432 
   2433 /* This function build the following tree, from regular expression
   2434    (<reg_exp>):
   2435 	 SUBEXP
   2436 	    |
   2437 	<reg_exp>
   2438 */
   2439 
   2440 static bin_tree_t *
   2441 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
   2442 	       reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
   2443 {
   2444   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
   2445   bin_tree_t *tree;
   2446   size_t cur_nsub;
   2447   cur_nsub = preg->re_nsub++;
   2448 
   2449   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
   2450 
   2451   /* The subexpression may be a null string.  */
   2452   if (token->type == OP_CLOSE_SUBEXP)
   2453     tree = NULL;
   2454   else
   2455     {
   2456       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
   2457       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
   2458         *err = REG_EPAREN;
   2459       if (BE (*err != REG_NOERROR, 0))
   2460 	return NULL;
   2461     }
   2462 
   2463   if (cur_nsub <= '9' - '1')
   2464     dfa->completed_bkref_map |= 1 << cur_nsub;
   2465 
   2466   tree = create_tree (dfa, tree, NULL, SUBEXP);
   2467   if (BE (tree == NULL, 0))
   2468     {
   2469       *err = REG_ESPACE;
   2470       return NULL;
   2471     }
   2472   tree->token.opr.idx = cur_nsub;
   2473   return tree;
   2474 }
   2475 
   2476 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
   2477 
   2478 static bin_tree_t *
   2479 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
   2480 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
   2481 {
   2482   bin_tree_t *tree = NULL, *old_tree = NULL;
   2483   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
   2484   re_token_t start_token = *token;
   2485 
   2486   if (token->type == OP_OPEN_DUP_NUM)
   2487     {
   2488       end = 0;
   2489       start = fetch_number (regexp, token, syntax);
   2490       if (start == REG_MISSING)
   2491 	{
   2492 	  if (token->type == CHARACTER && token->opr.c == ',')
   2493 	    start = 0; /* We treat "{,m}" as "{0,m}".  */
   2494 	  else
   2495 	    {
   2496 	      *err = REG_BADBR; /* <re>{} is invalid.  */
   2497 	      return NULL;
   2498 	    }
   2499 	}
   2500       if (BE (start != REG_ERROR, 1))
   2501 	{
   2502 	  /* We treat "{n}" as "{n,n}".  */
   2503 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
   2504 		 : ((token->type == CHARACTER && token->opr.c == ',')
   2505 		    ? fetch_number (regexp, token, syntax) : REG_ERROR));
   2506 	}
   2507       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
   2508 	{
   2509 	  /* Invalid sequence.  */
   2510 	  if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
   2511 	    {
   2512 	      if (token->type == END_OF_RE)
   2513 		*err = REG_EBRACE;
   2514 	      else
   2515 		*err = REG_BADBR;
   2516 
   2517 	      return NULL;
   2518 	    }
   2519 
   2520 	  /* If the syntax bit is set, rollback.  */
   2521 	  re_string_set_index (regexp, start_idx);
   2522 	  *token = start_token;
   2523 	  token->type = CHARACTER;
   2524 	  /* mb_partial and word_char bits should be already initialized by
   2525 	     peek_token.  */
   2526 	  return elem;
   2527 	}
   2528 
   2529       if (BE (end != REG_MISSING && start > end, 0))
   2530 	{
   2531 	  /* First number greater than second.  */
   2532 	  *err = REG_BADBR;
   2533 	  return NULL;
   2534 	}
   2535     }
   2536   else
   2537     {
   2538       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
   2539       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
   2540     }
   2541 
   2542   fetch_token (token, regexp, syntax);
   2543 
   2544   if (BE (elem == NULL, 0))
   2545     return NULL;
   2546   if (BE (start == 0 && end == 0, 0))
   2547     {
   2548       postorder (elem, free_tree, NULL);
   2549       return NULL;
   2550     }
   2551 
   2552   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
   2553   if (BE (start > 0, 0))
   2554     {
   2555       tree = elem;
   2556       for (i = 2; i <= start; ++i)
   2557 	{
   2558 	  elem = duplicate_tree (elem, dfa);
   2559 	  tree = create_tree (dfa, tree, elem, CONCAT);
   2560 	  if (BE (elem == NULL || tree == NULL, 0))
   2561 	    goto parse_dup_op_espace;
   2562 	}
   2563 
   2564       if (start == end)
   2565 	return tree;
   2566 
   2567       /* Duplicate ELEM before it is marked optional.  */
   2568       elem = duplicate_tree (elem, dfa);
   2569       old_tree = tree;
   2570     }
   2571   else
   2572     old_tree = NULL;
   2573 
   2574   if (elem->token.type == SUBEXP)
   2575     postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx);
   2576 
   2577   tree = create_tree (dfa, elem, NULL,
   2578 		      (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
   2579   if (BE (tree == NULL, 0))
   2580     goto parse_dup_op_espace;
   2581 
   2582   /* This loop is actually executed only when end != REG_MISSING,
   2583      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
   2584      already created the start+1-th copy.  */
   2585   if ((Idx) -1 < 0 || end != REG_MISSING)
   2586     for (i = start + 2; i <= end; ++i)
   2587       {
   2588 	elem = duplicate_tree (elem, dfa);
   2589 	tree = create_tree (dfa, tree, elem, CONCAT);
   2590 	if (BE (elem == NULL || tree == NULL, 0))
   2591 	  goto parse_dup_op_espace;
   2592 
   2593 	tree = create_tree (dfa, tree, NULL, OP_ALT);
   2594 	if (BE (tree == NULL, 0))
   2595 	  goto parse_dup_op_espace;
   2596       }
   2597 
   2598   if (old_tree)
   2599     tree = create_tree (dfa, old_tree, tree, CONCAT);
   2600 
   2601   return tree;
   2602 
   2603  parse_dup_op_espace:
   2604   *err = REG_ESPACE;
   2605   return NULL;
   2606 }
   2607 
   2608 /* Size of the names for collating symbol/equivalence_class/character_class.
   2609    I'm not sure, but maybe enough.  */
   2610 #define BRACKET_NAME_BUF_SIZE 32
   2611 
   2612 #ifndef _LIBC
   2613   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
   2614      Build the range expression which starts from START_ELEM, and ends
   2615      at END_ELEM.  The result are written to MBCSET and SBCSET.
   2616      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
   2617      mbcset->range_ends, is a pointer argument sinse we may
   2618      update it.  */
   2619 
   2620 static reg_errcode_t
   2621 internal_function
   2622 # ifdef RE_ENABLE_I18N
   2623 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
   2624 		 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
   2625 # else /* not RE_ENABLE_I18N */
   2626 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
   2627 		 bracket_elem_t *end_elem)
   2628 # endif /* not RE_ENABLE_I18N */
   2629 {
   2630   unsigned int start_ch, end_ch;
   2631   /* Equivalence Classes and Character Classes can't be a range start/end.  */
   2632   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
   2633 	  || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
   2634 	  0))
   2635     return REG_ERANGE;
   2636 
   2637   /* We can handle no multi character collating elements without libc
   2638      support.  */
   2639   if (BE ((start_elem->type == COLL_SYM
   2640 	   && strlen ((char *) start_elem->opr.name) > 1)
   2641 	  || (end_elem->type == COLL_SYM
   2642 	      && strlen ((char *) end_elem->opr.name) > 1), 0))
   2643     return REG_ECOLLATE;
   2644 
   2645 # ifdef RE_ENABLE_I18N
   2646   {
   2647     wchar_t wc;
   2648     wint_t start_wc;
   2649     wint_t end_wc;
   2650     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
   2651 
   2652     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
   2653 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
   2654 		   : 0));
   2655     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
   2656 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
   2657 		 : 0));
   2658     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
   2659 		? __btowc (start_ch) : start_elem->opr.wch);
   2660     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
   2661 	      ? __btowc (end_ch) : end_elem->opr.wch);
   2662     if (start_wc == WEOF || end_wc == WEOF)
   2663       return REG_ECOLLATE;
   2664     cmp_buf[0] = start_wc;
   2665     cmp_buf[4] = end_wc;
   2666     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
   2667       return REG_ERANGE;
   2668 
   2669     /* Got valid collation sequence values, add them as a new entry.
   2670        However, for !_LIBC we have no collation elements: if the
   2671        character set is single byte, the single byte character set
   2672        that we build below suffices.  parse_bracket_exp passes
   2673        no MBCSET if dfa->mb_cur_max == 1.  */
   2674     if (mbcset)
   2675       {
   2676         /* Check the space of the arrays.  */
   2677         if (BE (*range_alloc == mbcset->nranges, 0))
   2678           {
   2679 	    /* There is not enough space, need realloc.  */
   2680 	    wchar_t *new_array_start, *new_array_end;
   2681 	    Idx new_nranges;
   2682 
   2683 	    /* +1 in case of mbcset->nranges is 0.  */
   2684 	    new_nranges = 2 * mbcset->nranges + 1;
   2685 	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
   2686 	       are NULL if *range_alloc == 0.  */
   2687 	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
   2688 				          new_nranges);
   2689 	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
   2690 				        new_nranges);
   2691 
   2692 	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
   2693 	      return REG_ESPACE;
   2694 
   2695 	    mbcset->range_starts = new_array_start;
   2696 	    mbcset->range_ends = new_array_end;
   2697 	    *range_alloc = new_nranges;
   2698           }
   2699 
   2700         mbcset->range_starts[mbcset->nranges] = start_wc;
   2701         mbcset->range_ends[mbcset->nranges++] = end_wc;
   2702       }
   2703 
   2704     /* Build the table for single byte characters.  */
   2705     for (wc = 0; wc < SBC_MAX; ++wc)
   2706       {
   2707 	cmp_buf[2] = wc;
   2708 	if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
   2709 	    && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
   2710 	  bitset_set (sbcset, wc);
   2711       }
   2712   }
   2713 # else /* not RE_ENABLE_I18N */
   2714   {
   2715     unsigned int ch;
   2716     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
   2717 		: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
   2718 		   : 0));
   2719     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
   2720 	      : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
   2721 		 : 0));
   2722     if (start_ch > end_ch)
   2723       return REG_ERANGE;
   2724     /* Build the table for single byte characters.  */
   2725     for (ch = 0; ch < SBC_MAX; ++ch)
   2726       if (start_ch <= ch  && ch <= end_ch)
   2727 	bitset_set (sbcset, ch);
   2728   }
   2729 # endif /* not RE_ENABLE_I18N */
   2730   return REG_NOERROR;
   2731 }
   2732 #endif /* not _LIBC */
   2733 
   2734 #ifndef _LIBC
   2735 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
   2736    Build the collating element which is represented by NAME.
   2737    The result are written to MBCSET and SBCSET.
   2738    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
   2739    pointer argument since we may update it.  */
   2740 
   2741 static reg_errcode_t
   2742 internal_function
   2743 build_collating_symbol (bitset_t sbcset,
   2744 # ifdef RE_ENABLE_I18N
   2745 			re_charset_t *mbcset, Idx *coll_sym_alloc,
   2746 # endif
   2747 			const unsigned char *name)
   2748 {
   2749   size_t name_len = strlen ((const char *) name);
   2750   if (BE (name_len != 1, 0))
   2751     return REG_ECOLLATE;
   2752   else
   2753     {
   2754       bitset_set (sbcset, name[0]);
   2755       return REG_NOERROR;
   2756     }
   2757 }
   2758 #endif /* not _LIBC */
   2759 
   2760 /* This function parse bracket expression like "[abc]", "[a-c]",
   2761    "[[.a-a.]]" etc.  */
   2762 
   2763 static bin_tree_t *
   2764 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
   2765 		   reg_syntax_t syntax, reg_errcode_t *err)
   2766 {
   2767 #ifdef _LIBC
   2768   const unsigned char *collseqmb;
   2769   const char *collseqwc;
   2770   uint32_t nrules;
   2771   int32_t table_size;
   2772   const int32_t *symb_table;
   2773   const unsigned char *extra;
   2774 
   2775   /* Local function for parse_bracket_exp used in _LIBC environement.
   2776      Seek the collating symbol entry correspondings to NAME.
   2777      Return the index of the symbol in the SYMB_TABLE.  */
   2778 
   2779   auto inline int32_t
   2780   __attribute ((always_inline))
   2781   seek_collating_symbol_entry (name, name_len)
   2782 	 const unsigned char *name;
   2783 	 size_t name_len;
   2784     {
   2785       int32_t hash = elem_hash ((const char *) name, name_len);
   2786       int32_t elem = hash % table_size;
   2787       if (symb_table[2 * elem] != 0)
   2788 	{
   2789 	  int32_t second = hash % (table_size - 2) + 1;
   2790 
   2791 	  do
   2792 	    {
   2793 	      /* First compare the hashing value.  */
   2794 	      if (symb_table[2 * elem] == hash
   2795 		  /* Compare the length of the name.  */
   2796 		  && name_len == extra[symb_table[2 * elem + 1]]
   2797 		  /* Compare the name.  */
   2798 		  && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
   2799 			     name_len) == 0)
   2800 		{
   2801 		  /* Yep, this is the entry.  */
   2802 		  break;
   2803 		}
   2804 
   2805 	      /* Next entry.  */
   2806 	      elem += second;
   2807 	    }
   2808 	  while (symb_table[2 * elem] != 0);
   2809 	}
   2810       return elem;
   2811     }
   2812 
   2813   /* Local function for parse_bracket_exp used in _LIBC environement.
   2814      Look up the collation sequence value of BR_ELEM.
   2815      Return the value if succeeded, UINT_MAX otherwise.  */
   2816 
   2817   auto inline unsigned int
   2818   __attribute ((always_inline))
   2819   lookup_collation_sequence_value (br_elem)
   2820 	 bracket_elem_t *br_elem;
   2821     {
   2822       if (br_elem->type == SB_CHAR)
   2823 	{
   2824 	  /*
   2825 	  if (MB_CUR_MAX == 1)
   2826 	  */
   2827 	  if (nrules == 0)
   2828 	    return collseqmb[br_elem->opr.ch];
   2829 	  else
   2830 	    {
   2831 	      wint_t wc = __btowc (br_elem->opr.ch);
   2832 	      return __collseq_table_lookup (collseqwc, wc);
   2833 	    }
   2834 	}
   2835       else if (br_elem->type == MB_CHAR)
   2836 	{
   2837 	  return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
   2838 	}
   2839       else if (br_elem->type == COLL_SYM)
   2840 	{
   2841 	  size_t sym_name_len = strlen ((char *) br_elem->opr.name);
   2842 	  if (nrules != 0)
   2843 	    {
   2844 	      int32_t elem, idx;
   2845 	      elem = seek_collating_symbol_entry (br_elem->opr.name,
   2846 						  sym_name_len);
   2847 	      if (symb_table[2 * elem] != 0)
   2848 		{
   2849 		  /* We found the entry.  */
   2850 		  idx = symb_table[2 * elem + 1];
   2851 		  /* Skip the name of collating element name.  */
   2852 		  idx += 1 + extra[idx];
   2853 		  /* Skip the byte sequence of the collating element.  */
   2854 		  idx += 1 + extra[idx];
   2855 		  /* Adjust for the alignment.  */
   2856 		  idx = (idx + 3) & ~3;
   2857 		  /* Skip the multibyte collation sequence value.  */
   2858 		  idx += sizeof (unsigned int);
   2859 		  /* Skip the wide char sequence of the collating element.  */
   2860 		  idx += sizeof (unsigned int) *
   2861 		    (1 + *(unsigned int *) (extra + idx));
   2862 		  /* Return the collation sequence value.  */
   2863 		  return *(unsigned int *) (extra + idx);
   2864 		}
   2865 	      else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
   2866 		{
   2867 		  /* No valid character.  Match it as a single byte
   2868 		     character.  */
   2869 		  return collseqmb[br_elem->opr.name[0]];
   2870 		}
   2871 	    }
   2872 	  else if (sym_name_len == 1)
   2873 	    return collseqmb[br_elem->opr.name[0]];
   2874 	}
   2875       return UINT_MAX;
   2876     }
   2877 
   2878   /* Local function for parse_bracket_exp used in _LIBC environement.
   2879      Build the range expression which starts from START_ELEM, and ends
   2880      at END_ELEM.  The result are written to MBCSET and SBCSET.
   2881      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
   2882      mbcset->range_ends, is a pointer argument sinse we may
   2883      update it.  */
   2884 
   2885   auto inline reg_errcode_t
   2886   __attribute ((always_inline))
   2887   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
   2888 	 re_charset_t *mbcset;
   2889 	 Idx *range_alloc;
   2890 	 bitset_t sbcset;
   2891 	 bracket_elem_t *start_elem, *end_elem;
   2892     {
   2893       unsigned int ch;
   2894       uint32_t start_collseq;
   2895       uint32_t end_collseq;
   2896 
   2897       /* Equivalence Classes and Character Classes can't be a range
   2898 	 start/end.  */
   2899       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
   2900 	      || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
   2901 	      0))
   2902 	return REG_ERANGE;
   2903 
   2904       start_collseq = lookup_collation_sequence_value (start_elem);
   2905       end_collseq = lookup_collation_sequence_value (end_elem);
   2906       /* Check start/end collation sequence values.  */
   2907       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
   2908 	return REG_ECOLLATE;
   2909       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
   2910 	return REG_ERANGE;
   2911 
   2912       /* Got valid collation sequence values, add them as a new entry.
   2913 	 However, if we have no collation elements, and the character set
   2914 	 is single byte, the single byte character set that we
   2915 	 build below suffices. */
   2916       if (nrules > 0 || dfa->mb_cur_max > 1)
   2917 	{
   2918           /* Check the space of the arrays.  */
   2919           if (BE (*range_alloc == mbcset->nranges, 0))
   2920 	    {
   2921 	      /* There is not enough space, need realloc.  */
   2922 	      uint32_t *new_array_start;
   2923 	      uint32_t *new_array_end;
   2924 	      Idx new_nranges;
   2925 
   2926 	      /* +1 in case of mbcset->nranges is 0.  */
   2927 	      new_nranges = 2 * mbcset->nranges + 1;
   2928 	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
   2929 					    new_nranges);
   2930 	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
   2931 				          new_nranges);
   2932 
   2933 	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
   2934 	        return REG_ESPACE;
   2935 
   2936 	      mbcset->range_starts = new_array_start;
   2937 	      mbcset->range_ends = new_array_end;
   2938 	      *range_alloc = new_nranges;
   2939 	    }
   2940 
   2941           mbcset->range_starts[mbcset->nranges] = start_collseq;
   2942           mbcset->range_ends[mbcset->nranges++] = end_collseq;
   2943 	}
   2944 
   2945       /* Build the table for single byte characters.  */
   2946       for (ch = 0; ch < SBC_MAX; ch++)
   2947 	{
   2948 	  uint32_t ch_collseq;
   2949 	  /*
   2950 	  if (MB_CUR_MAX == 1)
   2951 	  */
   2952 	  if (nrules == 0)
   2953 	    ch_collseq = collseqmb[ch];
   2954 	  else
   2955 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
   2956 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
   2957 	    bitset_set (sbcset, ch);
   2958 	}
   2959       return REG_NOERROR;
   2960     }
   2961 
   2962   /* Local function for parse_bracket_exp used in _LIBC environement.
   2963      Build the collating element which is represented by NAME.
   2964      The result are written to MBCSET and SBCSET.
   2965      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
   2966      pointer argument sinse we may update it.  */
   2967 
   2968   auto inline reg_errcode_t
   2969   __attribute ((always_inline))
   2970   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
   2971 	 re_charset_t *mbcset;
   2972 	 Idx *coll_sym_alloc;
   2973 	 bitset_t sbcset;
   2974 	 const unsigned char *name;
   2975     {
   2976       int32_t elem, idx;
   2977       size_t name_len = strlen ((const char *) name);
   2978       if (nrules != 0)
   2979 	{
   2980 	  elem = seek_collating_symbol_entry (name, name_len);
   2981 	  if (symb_table[2 * elem] != 0)
   2982 	    {
   2983 	      /* We found the entry.  */
   2984 	      idx = symb_table[2 * elem + 1];
   2985 	      /* Skip the name of collating element name.  */
   2986 	      idx += 1 + extra[idx];
   2987 	    }
   2988 	  else if (symb_table[2 * elem] == 0 && name_len == 1)
   2989 	    {
   2990 	      /* No valid character, treat it as a normal
   2991 		 character.  */
   2992 	      bitset_set (sbcset, name[0]);
   2993 	      return REG_NOERROR;
   2994 	    }
   2995 	  else
   2996 	    return REG_ECOLLATE;
   2997 
   2998 	  /* Got valid collation sequence, add it as a new entry.  */
   2999 	  /* Check the space of the arrays.  */
   3000 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
   3001 	    {
   3002 	      /* Not enough, realloc it.  */
   3003 	      /* +1 in case of mbcset->ncoll_syms is 0.  */
   3004 	      Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
   3005 	      /* Use realloc since mbcset->coll_syms is NULL
   3006 		 if *alloc == 0.  */
   3007 	      int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
   3008 						   new_coll_sym_alloc);
   3009 	      if (BE (new_coll_syms == NULL, 0))
   3010 		return REG_ESPACE;
   3011 	      mbcset->coll_syms = new_coll_syms;
   3012 	      *coll_sym_alloc = new_coll_sym_alloc;
   3013 	    }
   3014 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
   3015 	  return REG_NOERROR;
   3016 	}
   3017       else
   3018 	{
   3019 	  if (BE (name_len != 1, 0))
   3020 	    return REG_ECOLLATE;
   3021 	  else
   3022 	    {
   3023 	      bitset_set (sbcset, name[0]);
   3024 	      return REG_NOERROR;
   3025 	    }
   3026 	}
   3027     }
   3028 #endif
   3029 
   3030   re_token_t br_token;
   3031   re_bitset_ptr_t sbcset;
   3032 #ifdef RE_ENABLE_I18N
   3033   re_charset_t *mbcset;
   3034   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
   3035   Idx equiv_class_alloc = 0, char_class_alloc = 0;
   3036 #endif /* not RE_ENABLE_I18N */
   3037   bool non_match = false;
   3038   bin_tree_t *work_tree;
   3039   int token_len;
   3040   bool first_round = true;
   3041 #ifdef _LIBC
   3042   collseqmb = (const unsigned char *)
   3043     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
   3044   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
   3045   if (nrules)
   3046     {
   3047       /*
   3048       if (MB_CUR_MAX > 1)
   3049       */
   3050       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
   3051       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
   3052       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
   3053 						  _NL_COLLATE_SYMB_TABLEMB);
   3054       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
   3055 						   _NL_COLLATE_SYMB_EXTRAMB);
   3056     }
   3057 #endif
   3058   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
   3059 #ifdef RE_ENABLE_I18N
   3060   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
   3061 #endif /* RE_ENABLE_I18N */
   3062 #ifdef RE_ENABLE_I18N
   3063   if (BE (sbcset == NULL || mbcset == NULL, 0))
   3064 #else
   3065   if (BE (sbcset == NULL, 0))
   3066 #endif /* RE_ENABLE_I18N */
   3067     {
   3068       *err = REG_ESPACE;
   3069       return NULL;
   3070     }
   3071 
   3072   token_len = peek_token_bracket (token, regexp, syntax);
   3073   if (BE (token->type == END_OF_RE, 0))
   3074     {
   3075       *err = REG_BADPAT;
   3076       goto parse_bracket_exp_free_return;
   3077     }
   3078   if (token->type == OP_NON_MATCH_LIST)
   3079     {
   3080 #ifdef RE_ENABLE_I18N
   3081       mbcset->non_match = 1;
   3082 #endif /* not RE_ENABLE_I18N */
   3083       non_match = true;
   3084       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
   3085 	bitset_set (sbcset, '\n');
   3086       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
   3087       token_len = peek_token_bracket (token, regexp, syntax);
   3088       if (BE (token->type == END_OF_RE, 0))
   3089 	{
   3090 	  *err = REG_BADPAT;
   3091 	  goto parse_bracket_exp_free_return;
   3092 	}
   3093     }
   3094 
   3095   /* We treat the first ']' as a normal character.  */
   3096   if (token->type == OP_CLOSE_BRACKET)
   3097     token->type = CHARACTER;
   3098 
   3099   while (1)
   3100     {
   3101       bracket_elem_t start_elem, end_elem;
   3102       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
   3103       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
   3104       reg_errcode_t ret;
   3105       int token_len2 = 0;
   3106       bool is_range_exp = false;
   3107       re_token_t token2;
   3108 
   3109       start_elem.opr.name = start_name_buf;
   3110       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
   3111 				   syntax, first_round);
   3112       if (BE (ret != REG_NOERROR, 0))
   3113 	{
   3114 	  *err = ret;
   3115 	  goto parse_bracket_exp_free_return;
   3116 	}
   3117       first_round = false;
   3118 
   3119       /* Get information about the next token.  We need it in any case.  */
   3120       token_len = peek_token_bracket (token, regexp, syntax);
   3121 
   3122       /* Do not check for ranges if we know they are not allowed.  */
   3123       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
   3124 	{
   3125 	  if (BE (token->type == END_OF_RE, 0))
   3126 	    {
   3127 	      *err = REG_EBRACK;
   3128 	      goto parse_bracket_exp_free_return;
   3129 	    }
   3130 	  if (token->type == OP_CHARSET_RANGE)
   3131 	    {
   3132 	      re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
   3133 	      token_len2 = peek_token_bracket (&token2, regexp, syntax);
   3134 	      if (BE (token2.type == END_OF_RE, 0))
   3135 		{
   3136 		  *err = REG_EBRACK;
   3137 		  goto parse_bracket_exp_free_return;
   3138 		}
   3139 	      if (token2.type == OP_CLOSE_BRACKET)
   3140 		{
   3141 		  /* We treat the last '-' as a normal character.  */
   3142 		  re_string_skip_bytes (regexp, -token_len);
   3143 		  token->type = CHARACTER;
   3144 		}
   3145 	      else
   3146 		is_range_exp = true;
   3147 	    }
   3148 	}
   3149 
   3150       if (is_range_exp == true)
   3151 	{
   3152 	  end_elem.opr.name = end_name_buf;
   3153 	  ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
   3154 				       dfa, syntax, true);
   3155 	  if (BE (ret != REG_NOERROR, 0))
   3156 	    {
   3157 	      *err = ret;
   3158 	      goto parse_bracket_exp_free_return;
   3159 	    }
   3160 
   3161 	  token_len = peek_token_bracket (token, regexp, syntax);
   3162 
   3163 #ifdef _LIBC
   3164 	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
   3165 				  &start_elem, &end_elem);
   3166 #else
   3167 # ifdef RE_ENABLE_I18N
   3168 	  *err = build_range_exp (sbcset,
   3169 				  dfa->mb_cur_max > 1 ? mbcset : NULL,
   3170 				  &range_alloc, &start_elem, &end_elem);
   3171 # else
   3172 	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
   3173 # endif
   3174 #endif /* RE_ENABLE_I18N */
   3175 	  if (BE (*err != REG_NOERROR, 0))
   3176 	    goto parse_bracket_exp_free_return;
   3177 	}
   3178       else
   3179 	{
   3180 	  switch (start_elem.type)
   3181 	    {
   3182 	    case SB_CHAR:
   3183 	      bitset_set (sbcset, start_elem.opr.ch);
   3184 	      break;
   3185 #ifdef RE_ENABLE_I18N
   3186 	    case MB_CHAR:
   3187 	      /* Check whether the array has enough space.  */
   3188 	      if (BE (mbchar_alloc == mbcset->nmbchars, 0))
   3189 		{
   3190 		  wchar_t *new_mbchars;
   3191 		  /* Not enough, realloc it.  */
   3192 		  /* +1 in case of mbcset->nmbchars is 0.  */
   3193 		  mbchar_alloc = 2 * mbcset->nmbchars + 1;
   3194 		  /* Use realloc since array is NULL if *alloc == 0.  */
   3195 		  new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
   3196 					    mbchar_alloc);
   3197 		  if (BE (new_mbchars == NULL, 0))
   3198 		    goto parse_bracket_exp_espace;
   3199 		  mbcset->mbchars = new_mbchars;
   3200 		}
   3201 	      mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
   3202 	      break;
   3203 #endif /* RE_ENABLE_I18N */
   3204 	    case EQUIV_CLASS:
   3205 	      *err = build_equiv_class (sbcset,
   3206 #ifdef RE_ENABLE_I18N
   3207 					mbcset, &equiv_class_alloc,
   3208 #endif /* RE_ENABLE_I18N */
   3209 					start_elem.opr.name);
   3210 	      if (BE (*err != REG_NOERROR, 0))
   3211 		goto parse_bracket_exp_free_return;
   3212 	      break;
   3213 	    case COLL_SYM:
   3214 	      *err = build_collating_symbol (sbcset,
   3215 #ifdef RE_ENABLE_I18N
   3216 					     mbcset, &coll_sym_alloc,
   3217 #endif /* RE_ENABLE_I18N */
   3218 					     start_elem.opr.name);
   3219 	      if (BE (*err != REG_NOERROR, 0))
   3220 		goto parse_bracket_exp_free_return;
   3221 	      break;
   3222 	    case CHAR_CLASS:
   3223 	      *err = build_charclass (regexp->trans, sbcset,
   3224 #ifdef RE_ENABLE_I18N
   3225 				      mbcset, &char_class_alloc,
   3226 #endif /* RE_ENABLE_I18N */
   3227 				      start_elem.opr.name, syntax);
   3228 	      if (BE (*err != REG_NOERROR, 0))
   3229 	       goto parse_bracket_exp_free_return;
   3230 	      break;
   3231 	    default:
   3232 	      assert (0);
   3233 	      break;
   3234 	    }
   3235 	}
   3236       if (BE (token->type == END_OF_RE, 0))
   3237 	{
   3238 	  *err = REG_EBRACK;
   3239 	  goto parse_bracket_exp_free_return;
   3240 	}
   3241       if (token->type == OP_CLOSE_BRACKET)
   3242 	break;
   3243     }
   3244 
   3245   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
   3246 
   3247   /* If it is non-matching list.  */
   3248   if (non_match)
   3249     bitset_not (sbcset);
   3250 
   3251 #ifdef RE_ENABLE_I18N
   3252   /* Ensure only single byte characters are set.  */
   3253   if (dfa->mb_cur_max > 1)
   3254     bitset_mask (sbcset, dfa->sb_char);
   3255 
   3256   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
   3257       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
   3258 						     || mbcset->non_match)))
   3259     {
   3260       bin_tree_t *mbc_tree;
   3261       int sbc_idx;
   3262       /* Build a tree for complex bracket.  */
   3263       dfa->has_mb_node = 1;
   3264       br_token.type = COMPLEX_BRACKET;
   3265       br_token.opr.mbcset = mbcset;
   3266       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
   3267       if (BE (mbc_tree == NULL, 0))
   3268 	goto parse_bracket_exp_espace;
   3269       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
   3270 	if (sbcset[sbc_idx])
   3271 	  break;
   3272       /* If there are no bits set in sbcset, there is no point
   3273 	 of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
   3274       if (sbc_idx < BITSET_WORDS)
   3275 	{
   3276           /* Build a tree for simple bracket.  */
   3277           br_token.type = SIMPLE_BRACKET;
   3278           br_token.opr.sbcset = sbcset;
   3279           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
   3280           if (BE (work_tree == NULL, 0))
   3281             goto parse_bracket_exp_espace;
   3282 
   3283           /* Then join them by ALT node.  */
   3284           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
   3285           if (BE (work_tree == NULL, 0))
   3286             goto parse_bracket_exp_espace;
   3287 	}
   3288       else
   3289 	{
   3290 	  re_free (sbcset);
   3291 	  work_tree = mbc_tree;
   3292 	}
   3293     }
   3294   else
   3295 #endif /* not RE_ENABLE_I18N */
   3296     {
   3297 #ifdef RE_ENABLE_I18N
   3298       free_charset (mbcset);
   3299 #endif
   3300       /* Build a tree for simple bracket.  */
   3301       br_token.type = SIMPLE_BRACKET;
   3302       br_token.opr.sbcset = sbcset;
   3303       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
   3304       if (BE (work_tree == NULL, 0))
   3305         goto parse_bracket_exp_espace;
   3306     }
   3307   return work_tree;
   3308 
   3309  parse_bracket_exp_espace:
   3310   *err = REG_ESPACE;
   3311  parse_bracket_exp_free_return:
   3312   re_free (sbcset);
   3313 #ifdef RE_ENABLE_I18N
   3314   free_charset (mbcset);
   3315 #endif /* RE_ENABLE_I18N */
   3316   return NULL;
   3317 }
   3318 
   3319 /* Parse an element in the bracket expression.  */
   3320 
   3321 static reg_errcode_t
   3322 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
   3323 		       re_token_t *token, int token_len, re_dfa_t *dfa,
   3324 		       reg_syntax_t syntax, bool accept_hyphen)
   3325 {
   3326 #ifdef RE_ENABLE_I18N
   3327   int cur_char_size;
   3328   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
   3329   if (cur_char_size > 1)
   3330     {
   3331       elem->type = MB_CHAR;
   3332       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
   3333       re_string_skip_bytes (regexp, cur_char_size);
   3334       return REG_NOERROR;
   3335     }
   3336 #endif /* RE_ENABLE_I18N */
   3337   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
   3338   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
   3339       || token->type == OP_OPEN_EQUIV_CLASS)
   3340     return parse_bracket_symbol (elem, regexp, token);
   3341   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
   3342     {
   3343       /* A '-' must only appear as anything but a range indicator before
   3344 	 the closing bracket.  Everything else is an error.  */
   3345       re_token_t token2;
   3346       (void) peek_token_bracket (&token2, regexp, syntax);
   3347       if (token2.type != OP_CLOSE_BRACKET)
   3348 	/* The actual error value is not standardized since this whole
   3349 	   case is undefined.  But ERANGE makes good sense.  */
   3350 	return REG_ERANGE;
   3351     }
   3352   elem->type = SB_CHAR;
   3353   elem->opr.ch = token->opr.c;
   3354   return REG_NOERROR;
   3355 }
   3356 
   3357 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
   3358    such as [:<character_class>:], [.<collating_element>.], and
   3359    [=<equivalent_class>=].  */
   3360 
   3361 static reg_errcode_t
   3362 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
   3363 		      re_token_t *token)
   3364 {
   3365   unsigned char ch, delim = token->opr.c;
   3366   int i = 0;
   3367   if (re_string_eoi(regexp))
   3368     return REG_EBRACK;
   3369   for (;; ++i)
   3370     {
   3371       if (i >= BRACKET_NAME_BUF_SIZE)
   3372 	return REG_EBRACK;
   3373       if (token->type == OP_OPEN_CHAR_CLASS)
   3374 	ch = re_string_fetch_byte_case (regexp);
   3375       else
   3376 	ch = re_string_fetch_byte (regexp);
   3377       if (re_string_eoi(regexp))
   3378 	return REG_EBRACK;
   3379       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
   3380 	break;
   3381       elem->opr.name[i] = ch;
   3382     }
   3383   re_string_skip_bytes (regexp, 1);
   3384   elem->opr.name[i] = '\0';
   3385   switch (token->type)
   3386     {
   3387     case OP_OPEN_COLL_ELEM:
   3388       elem->type = COLL_SYM;
   3389       break;
   3390     case OP_OPEN_EQUIV_CLASS:
   3391       elem->type = EQUIV_CLASS;
   3392       break;
   3393     case OP_OPEN_CHAR_CLASS:
   3394       elem->type = CHAR_CLASS;
   3395       break;
   3396     default:
   3397       break;
   3398     }
   3399   return REG_NOERROR;
   3400 }
   3401 
   3402   /* Helper function for parse_bracket_exp.
   3403      Build the equivalence class which is represented by NAME.
   3404      The result are written to MBCSET and SBCSET.
   3405      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
   3406      is a pointer argument sinse we may update it.  */
   3407 
   3408 static reg_errcode_t
   3409 #ifdef RE_ENABLE_I18N
   3410 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
   3411 		   Idx *equiv_class_alloc, const unsigned char *name)
   3412 #else /* not RE_ENABLE_I18N */
   3413 build_equiv_class (bitset_t sbcset, const unsigned char *name)
   3414 #endif /* not RE_ENABLE_I18N */
   3415 {
   3416 #ifdef _LIBC
   3417   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
   3418   if (nrules != 0)
   3419     {
   3420       const int32_t *table, *indirect;
   3421       const unsigned char *weights, *extra, *cp;
   3422       unsigned char char_buf[2];
   3423       int32_t idx1, idx2;
   3424       unsigned int ch;
   3425       size_t len;
   3426       /* This #include defines a local function!  */
   3427 # include <locale/weight.h>
   3428       /* Calculate the index for equivalence class.  */
   3429       cp = name;
   3430       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
   3431       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
   3432 					       _NL_COLLATE_WEIGHTMB);
   3433       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
   3434 						   _NL_COLLATE_EXTRAMB);
   3435       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
   3436 						_NL_COLLATE_INDIRECTMB);
   3437       idx1 = findidx (&cp);
   3438       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
   3439 	/* This isn't a valid character.  */
   3440 	return REG_ECOLLATE;
   3441 
   3442       /* Build single byte matcing table for this equivalence class.  */
   3443       char_buf[1] = (unsigned char) '\0';
   3444       len = weights[idx1];
   3445       for (ch = 0; ch < SBC_MAX; ++ch)
   3446 	{
   3447 	  char_buf[0] = ch;
   3448 	  cp = char_buf;
   3449 	  idx2 = findidx (&cp);
   3450 /*
   3451 	  idx2 = table[ch];
   3452 */
   3453 	  if (idx2 == 0)
   3454 	    /* This isn't a valid character.  */
   3455 	    continue;
   3456 	  if (len == weights[idx2])
   3457 	    {
   3458 	      int cnt = 0;
   3459 	      while (cnt <= len &&
   3460 		     weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
   3461 		++cnt;
   3462 
   3463 	      if (cnt > len)
   3464 		bitset_set (sbcset, ch);
   3465 	    }
   3466 	}
   3467       /* Check whether the array has enough space.  */
   3468       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
   3469 	{
   3470 	  /* Not enough, realloc it.  */
   3471 	  /* +1 in case of mbcset->nequiv_classes is 0.  */
   3472 	  Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
   3473 	  /* Use realloc since the array is NULL if *alloc == 0.  */
   3474 	  int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
   3475 						   int32_t,
   3476 						   new_equiv_class_alloc);
   3477 	  if (BE (new_equiv_classes == NULL, 0))
   3478 	    return REG_ESPACE;
   3479 	  mbcset->equiv_classes = new_equiv_classes;
   3480 	  *equiv_class_alloc = new_equiv_class_alloc;
   3481 	}
   3482       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
   3483     }
   3484   else
   3485 #endif /* _LIBC */
   3486     {
   3487       if (BE (strlen ((const char *) name) != 1, 0))
   3488 	return REG_ECOLLATE;
   3489       bitset_set (sbcset, *name);
   3490     }
   3491   return REG_NOERROR;
   3492 }
   3493 
   3494   /* Helper function for parse_bracket_exp.
   3495      Build the character class which is represented by NAME.
   3496      The result are written to MBCSET and SBCSET.
   3497      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
   3498      is a pointer argument sinse we may update it.  */
   3499 
   3500 static reg_errcode_t
   3501 #ifdef RE_ENABLE_I18N
   3502 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
   3503 		 re_charset_t *mbcset, Idx *char_class_alloc,
   3504 		 const unsigned char *class_name, reg_syntax_t syntax)
   3505 #else /* not RE_ENABLE_I18N */
   3506 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
   3507 		 const unsigned char *class_name, reg_syntax_t syntax)
   3508 #endif /* not RE_ENABLE_I18N */
   3509 {
   3510   int i;
   3511   const char *name = (const char *) class_name;
   3512 
   3513   /* In case of REG_ICASE "upper" and "lower" match the both of
   3514      upper and lower cases.  */
   3515   if ((syntax & RE_ICASE)
   3516       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
   3517     name = "alpha";
   3518 
   3519 #ifdef RE_ENABLE_I18N
   3520   /* Check the space of the arrays.  */
   3521   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
   3522     {
   3523       /* Not enough, realloc it.  */
   3524       /* +1 in case of mbcset->nchar_classes is 0.  */
   3525       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
   3526       /* Use realloc since array is NULL if *alloc == 0.  */
   3527       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
   3528 					       new_char_class_alloc);
   3529       if (BE (new_char_classes == NULL, 0))
   3530 	return REG_ESPACE;
   3531       mbcset->char_classes = new_char_classes;
   3532       *char_class_alloc = new_char_class_alloc;
   3533     }
   3534   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
   3535 #endif /* RE_ENABLE_I18N */
   3536 
   3537 #define BUILD_CHARCLASS_LOOP(ctype_func)	\
   3538   do {						\
   3539     if (BE (trans != NULL, 0))			\
   3540       {						\
   3541 	for (i = 0; i < SBC_MAX; ++i)		\
   3542 	  if (ctype_func (i))			\
   3543 	    bitset_set (sbcset, trans[i]);	\
   3544       }						\
   3545     else					\
   3546       {						\
   3547 	for (i = 0; i < SBC_MAX; ++i)		\
   3548 	  if (ctype_func (i))			\
   3549 	    bitset_set (sbcset, i);		\
   3550       }						\
   3551   } while (0)
   3552 
   3553   if (strcmp (name, "alnum") == 0)
   3554     BUILD_CHARCLASS_LOOP (isalnum);
   3555   else if (strcmp (name, "cntrl") == 0)
   3556     BUILD_CHARCLASS_LOOP (iscntrl);
   3557   else if (strcmp (name, "lower") == 0)
   3558     BUILD_CHARCLASS_LOOP (islower);
   3559   else if (strcmp (name, "space") == 0)
   3560     BUILD_CHARCLASS_LOOP (isspace);
   3561   else if (strcmp (name, "alpha") == 0)
   3562     BUILD_CHARCLASS_LOOP (isalpha);
   3563   else if (strcmp (name, "digit") == 0)
   3564     BUILD_CHARCLASS_LOOP (isdigit);
   3565   else if (strcmp (name, "print") == 0)
   3566     BUILD_CHARCLASS_LOOP (isprint);
   3567   else if (strcmp (name, "upper") == 0)
   3568     BUILD_CHARCLASS_LOOP (isupper);
   3569   else if (strcmp (name, "blank") == 0)
   3570     BUILD_CHARCLASS_LOOP (isblank);
   3571   else if (strcmp (name, "graph") == 0)
   3572     BUILD_CHARCLASS_LOOP (isgraph);
   3573   else if (strcmp (name, "punct") == 0)
   3574     BUILD_CHARCLASS_LOOP (ispunct);
   3575   else if (strcmp (name, "xdigit") == 0)
   3576     BUILD_CHARCLASS_LOOP (isxdigit);
   3577   else
   3578     return REG_ECTYPE;
   3579 
   3580   return REG_NOERROR;
   3581 }
   3582 
   3583 static bin_tree_t *
   3584 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
   3585 		    const unsigned char *class_name,
   3586 		    const unsigned char *extra, bool non_match,
   3587 		    reg_errcode_t *err)
   3588 {
   3589   re_bitset_ptr_t sbcset;
   3590 #ifdef RE_ENABLE_I18N
   3591   re_charset_t *mbcset;
   3592   Idx alloc = 0;
   3593 #endif /* not RE_ENABLE_I18N */
   3594   reg_errcode_t ret;
   3595   re_token_t br_token;
   3596   bin_tree_t *tree;
   3597 
   3598   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
   3599 #ifdef RE_ENABLE_I18N
   3600   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
   3601 #endif /* RE_ENABLE_I18N */
   3602 
   3603 #ifdef RE_ENABLE_I18N
   3604   if (BE (sbcset == NULL || mbcset == NULL, 0))
   3605 #else /* not RE_ENABLE_I18N */
   3606   if (BE (sbcset == NULL, 0))
   3607 #endif /* not RE_ENABLE_I18N */
   3608     {
   3609       *err = REG_ESPACE;
   3610       return NULL;
   3611     }
   3612 
   3613   if (non_match)
   3614     {
   3615 #ifdef RE_ENABLE_I18N
   3616       mbcset->non_match = 1;
   3617 #endif /* not RE_ENABLE_I18N */
   3618     }
   3619 
   3620   /* We don't care the syntax in this case.  */
   3621   ret = build_charclass (trans, sbcset,
   3622 #ifdef RE_ENABLE_I18N
   3623 			 mbcset, &alloc,
   3624 #endif /* RE_ENABLE_I18N */
   3625 			 class_name, 0);
   3626 
   3627   if (BE (ret != REG_NOERROR, 0))
   3628     {
   3629       re_free (sbcset);
   3630 #ifdef RE_ENABLE_I18N
   3631       free_charset (mbcset);
   3632 #endif /* RE_ENABLE_I18N */
   3633       *err = ret;
   3634       return NULL;
   3635     }
   3636   /* \w match '_' also.  */
   3637   for (; *extra; extra++)
   3638     bitset_set (sbcset, *extra);
   3639 
   3640   /* If it is non-matching list.  */
   3641   if (non_match)
   3642     bitset_not (sbcset);
   3643 
   3644 #ifdef RE_ENABLE_I18N
   3645   /* Ensure only single byte characters are set.  */
   3646   if (dfa->mb_cur_max > 1)
   3647     bitset_mask (sbcset, dfa->sb_char);
   3648 #endif
   3649 
   3650   /* Build a tree for simple bracket.  */
   3651   br_token.type = SIMPLE_BRACKET;
   3652   br_token.opr.sbcset = sbcset;
   3653   tree = create_token_tree (dfa, NULL, NULL, &br_token);
   3654   if (BE (tree == NULL, 0))
   3655     goto build_word_op_espace;
   3656 
   3657 #ifdef RE_ENABLE_I18N
   3658   if (dfa->mb_cur_max > 1)
   3659     {
   3660       bin_tree_t *mbc_tree;
   3661       /* Build a tree for complex bracket.  */
   3662       br_token.type = COMPLEX_BRACKET;
   3663       br_token.opr.mbcset = mbcset;
   3664       dfa->has_mb_node = 1;
   3665       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
   3666       if (BE (mbc_tree == NULL, 0))
   3667 	goto build_word_op_espace;
   3668       /* Then join them by ALT node.  */
   3669       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
   3670       if (BE (mbc_tree != NULL, 1))
   3671 	return tree;
   3672     }
   3673   else
   3674     {
   3675       free_charset (mbcset);
   3676       return tree;
   3677     }
   3678 #else /* not RE_ENABLE_I18N */
   3679   return tree;
   3680 #endif /* not RE_ENABLE_I18N */
   3681 
   3682  build_word_op_espace:
   3683   re_free (sbcset);
   3684 #ifdef RE_ENABLE_I18N
   3685   free_charset (mbcset);
   3686 #endif /* RE_ENABLE_I18N */
   3687   *err = REG_ESPACE;
   3688   return NULL;
   3689 }
   3690 
   3691 /* This is intended for the expressions like "a{1,3}".
   3692    Fetch a number from `input', and return the number.
   3693    Return REG_MISSING if the number field is empty like "{,1}".
   3694    Return REG_ERROR if an error occurred.  */
   3695 
   3696 static Idx
   3697 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
   3698 {
   3699   Idx num = REG_MISSING;
   3700   unsigned char c;
   3701   while (1)
   3702     {
   3703       fetch_token (token, input, syntax);
   3704       c = token->opr.c;
   3705       if (BE (token->type == END_OF_RE, 0))
   3706 	return REG_ERROR;
   3707       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
   3708 	break;
   3709       num = ((token->type != CHARACTER || c < '0' || '9' < c
   3710 	      || num == REG_ERROR)
   3711 	     ? REG_ERROR
   3712 	     : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
   3713       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
   3714     }
   3715   return num;
   3716 }
   3717 
   3718 #ifdef RE_ENABLE_I18N
   3720 static void
   3721 free_charset (re_charset_t *cset)
   3722 {
   3723   re_free (cset->mbchars);
   3724 # ifdef _LIBC
   3725   re_free (cset->coll_syms);
   3726   re_free (cset->equiv_classes);
   3727   re_free (cset->range_starts);
   3728   re_free (cset->range_ends);
   3729 # endif
   3730   re_free (cset->char_classes);
   3731   re_free (cset);
   3732 }
   3733 #endif /* RE_ENABLE_I18N */
   3734 
   3735 /* Functions for binary tree operation.  */
   3737 
   3738 /* Create a tree node.  */
   3739 
   3740 static bin_tree_t *
   3741 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
   3742 	     re_token_type_t type)
   3743 {
   3744   re_token_t t;
   3745   t.type = type;
   3746   return create_token_tree (dfa, left, right, &t);
   3747 }
   3748 
   3749 static bin_tree_t *
   3750 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
   3751 		   const re_token_t *token)
   3752 {
   3753   bin_tree_t *tree;
   3754   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
   3755     {
   3756       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
   3757 
   3758       if (storage == NULL)
   3759 	return NULL;
   3760       storage->next = dfa->str_tree_storage;
   3761       dfa->str_tree_storage = storage;
   3762       dfa->str_tree_storage_idx = 0;
   3763     }
   3764   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
   3765 
   3766   tree->parent = NULL;
   3767   tree->left = left;
   3768   tree->right = right;
   3769   tree->token = *token;
   3770   tree->token.duplicated = 0;
   3771   tree->token.opt_subexp = 0;
   3772   tree->first = NULL;
   3773   tree->next = NULL;
   3774   tree->node_idx = REG_MISSING;
   3775 
   3776   if (left != NULL)
   3777     left->parent = tree;
   3778   if (right != NULL)
   3779     right->parent = tree;
   3780   return tree;
   3781 }
   3782 
   3783 /* Mark the tree SRC as an optional subexpression.
   3784    To be called from preorder or postorder.  */
   3785 
   3786 static reg_errcode_t
   3787 mark_opt_subexp (void *extra, bin_tree_t *node)
   3788 {
   3789   Idx idx = (Idx) (intptr_t) extra;
   3790   assert(sizeof(void*) >= sizeof(Idx));
   3791   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
   3792     node->token.opt_subexp = 1;
   3793 
   3794   return REG_NOERROR;
   3795 }
   3796 
   3797 /* Free the allocated memory inside NODE. */
   3798 
   3799 static void
   3800 free_token (re_token_t *node)
   3801 {
   3802 #ifdef RE_ENABLE_I18N
   3803   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
   3804     free_charset (node->opr.mbcset);
   3805   else
   3806 #endif /* RE_ENABLE_I18N */
   3807     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
   3808       re_free (node->opr.sbcset);
   3809 }
   3810 
   3811 /* Worker function for tree walking.  Free the allocated memory inside NODE
   3812    and its children. */
   3813 
   3814 static reg_errcode_t
   3815 free_tree (void *extra, bin_tree_t *node)
   3816 {
   3817   free_token (&node->token);
   3818   return REG_NOERROR;
   3819 }
   3820 
   3821 
   3822 /* Duplicate the node SRC, and return new node.  This is a preorder
   3823    visit similar to the one implemented by the generic visitor, but
   3824    we need more infrastructure to maintain two parallel trees --- so,
   3825    it's easier to duplicate.  */
   3826 
   3827 static bin_tree_t *
   3828 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
   3829 {
   3830   const bin_tree_t *node;
   3831   bin_tree_t *dup_root;
   3832   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
   3833 
   3834   for (node = root; ; )
   3835     {
   3836       /* Create a new tree and link it back to the current parent.  */
   3837       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
   3838       if (*p_new == NULL)
   3839 	return NULL;
   3840       (*p_new)->parent = dup_node;
   3841       (*p_new)->token.duplicated = 1;
   3842       dup_node = *p_new;
   3843 
   3844       /* Go to the left node, or up and to the right.  */
   3845       if (node->left)
   3846 	{
   3847 	  node = node->left;
   3848 	  p_new = &dup_node->left;
   3849 	}
   3850       else
   3851 	{
   3852 	  const bin_tree_t *prev = NULL;
   3853 	  while (node->right == prev || node->right == NULL)
   3854 	    {
   3855 	      prev = node;
   3856 	      node = node->parent;
   3857 	      dup_node = dup_node->parent;
   3858 	      if (!node)
   3859 	        return dup_root;
   3860 	    }
   3861 	  node = node->right;
   3862 	  p_new = &dup_node->right;
   3863 	}
   3864     }
   3865 }
   3866