1 /* -*- buffer-read-only: t -*- vi: set ro: */ 2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */ 3 /* Extended regular expression matching and search library. 4 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 5 Free Software Foundation, Inc. 6 This file is part of the GNU C Library. 7 Contributed by Isamu Hasegawa <isamu (at) yamato.ibm.com>. 8 9 This program is free software; you can redistribute it and/or modify 10 it under the terms of the GNU General Public License as published by 11 the Free Software Foundation; either version 3, or (at your option) 12 any later version. 13 14 This program is distributed in the hope that it will be useful, 15 but WITHOUT ANY WARRANTY; without even the implied warranty of 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 GNU General Public License for more details. 18 19 You should have received a copy of the GNU General Public License along 20 with this program; if not, write to the Free Software Foundation, 21 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 22 23 static void re_string_construct_common (const char *str, Idx len, 24 re_string_t *pstr, 25 RE_TRANSLATE_TYPE trans, bool icase, 26 const re_dfa_t *dfa) internal_function; 27 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa, 28 const re_node_set *nodes, 29 re_hashval_t hash) internal_function; 30 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa, 31 const re_node_set *nodes, 32 unsigned int context, 33 re_hashval_t hash) internal_function; 34 35 /* Functions for string operation. */ 37 38 /* This function allocate the buffers. It is necessary to call 39 re_string_reconstruct before using the object. */ 40 41 static reg_errcode_t 42 internal_function 43 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len, 44 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 45 { 46 reg_errcode_t ret; 47 Idx init_buf_len; 48 49 /* Ensure at least one character fits into the buffers. */ 50 if (init_len < dfa->mb_cur_max) 51 init_len = dfa->mb_cur_max; 52 init_buf_len = (len + 1 < init_len) ? len + 1: init_len; 53 re_string_construct_common (str, len, pstr, trans, icase, dfa); 54 55 ret = re_string_realloc_buffers (pstr, init_buf_len); 56 if (BE (ret != REG_NOERROR, 0)) 57 return ret; 58 59 pstr->word_char = dfa->word_char; 60 pstr->word_ops_used = dfa->word_ops_used; 61 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 62 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len; 63 pstr->valid_raw_len = pstr->valid_len; 64 return REG_NOERROR; 65 } 66 67 /* This function allocate the buffers, and initialize them. */ 68 69 static reg_errcode_t 70 internal_function 71 re_string_construct (re_string_t *pstr, const char *str, Idx len, 72 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa) 73 { 74 reg_errcode_t ret; 75 memset (pstr, '\0', sizeof (re_string_t)); 76 re_string_construct_common (str, len, pstr, trans, icase, dfa); 77 78 if (len > 0) 79 { 80 ret = re_string_realloc_buffers (pstr, len + 1); 81 if (BE (ret != REG_NOERROR, 0)) 82 return ret; 83 } 84 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str; 85 86 if (icase) 87 { 88 #ifdef RE_ENABLE_I18N 89 if (dfa->mb_cur_max > 1) 90 { 91 while (1) 92 { 93 ret = build_wcs_upper_buffer (pstr); 94 if (BE (ret != REG_NOERROR, 0)) 95 return ret; 96 if (pstr->valid_raw_len >= len) 97 break; 98 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max) 99 break; 100 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2); 101 if (BE (ret != REG_NOERROR, 0)) 102 return ret; 103 } 104 } 105 else 106 #endif /* RE_ENABLE_I18N */ 107 build_upper_buffer (pstr); 108 } 109 else 110 { 111 #ifdef RE_ENABLE_I18N 112 if (dfa->mb_cur_max > 1) 113 build_wcs_buffer (pstr); 114 else 115 #endif /* RE_ENABLE_I18N */ 116 { 117 if (trans != NULL) 118 re_string_translate_buffer (pstr); 119 else 120 { 121 pstr->valid_len = pstr->bufs_len; 122 pstr->valid_raw_len = pstr->bufs_len; 123 } 124 } 125 } 126 127 return REG_NOERROR; 128 } 129 130 /* Helper functions for re_string_allocate, and re_string_construct. */ 131 132 static reg_errcode_t 133 internal_function 134 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len) 135 { 136 #ifdef RE_ENABLE_I18N 137 if (pstr->mb_cur_max > 1) 138 { 139 wint_t *new_wcs; 140 141 /* Avoid overflow. */ 142 size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx)); 143 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0)) 144 return REG_ESPACE; 145 146 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len); 147 if (BE (new_wcs == NULL, 0)) 148 return REG_ESPACE; 149 pstr->wcs = new_wcs; 150 if (pstr->offsets != NULL) 151 { 152 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len); 153 if (BE (new_offsets == NULL, 0)) 154 return REG_ESPACE; 155 pstr->offsets = new_offsets; 156 } 157 } 158 #endif /* RE_ENABLE_I18N */ 159 if (pstr->mbs_allocated) 160 { 161 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char, 162 new_buf_len); 163 if (BE (new_mbs == NULL, 0)) 164 return REG_ESPACE; 165 pstr->mbs = new_mbs; 166 } 167 pstr->bufs_len = new_buf_len; 168 return REG_NOERROR; 169 } 170 171 172 static void 173 internal_function 174 re_string_construct_common (const char *str, Idx len, re_string_t *pstr, 175 RE_TRANSLATE_TYPE trans, bool icase, 176 const re_dfa_t *dfa) 177 { 178 pstr->raw_mbs = (const unsigned char *) str; 179 pstr->len = len; 180 pstr->raw_len = len; 181 pstr->trans = trans; 182 pstr->icase = icase; 183 pstr->mbs_allocated = (trans != NULL || icase); 184 pstr->mb_cur_max = dfa->mb_cur_max; 185 pstr->is_utf8 = dfa->is_utf8; 186 pstr->map_notascii = dfa->map_notascii; 187 pstr->stop = pstr->len; 188 pstr->raw_stop = pstr->stop; 189 } 190 191 #ifdef RE_ENABLE_I18N 192 193 /* Build wide character buffer PSTR->WCS. 194 If the byte sequence of the string are: 195 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3> 196 Then wide character buffer will be: 197 <wc1> , WEOF , <wc2> , WEOF , <wc3> 198 We use WEOF for padding, they indicate that the position isn't 199 a first byte of a multibyte character. 200 201 Note that this function assumes PSTR->VALID_LEN elements are already 202 built and starts from PSTR->VALID_LEN. */ 203 204 static void 205 internal_function 206 build_wcs_buffer (re_string_t *pstr) 207 { 208 #ifdef _LIBC 209 unsigned char buf[MB_LEN_MAX]; 210 assert (MB_LEN_MAX >= pstr->mb_cur_max); 211 #else 212 unsigned char buf[64]; 213 #endif 214 mbstate_t prev_st; 215 Idx byte_idx, end_idx, remain_len; 216 size_t mbclen; 217 218 /* Build the buffers from pstr->valid_len to either pstr->len or 219 pstr->bufs_len. */ 220 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 221 for (byte_idx = pstr->valid_len; byte_idx < end_idx;) 222 { 223 wchar_t wc; 224 const char *p; 225 226 remain_len = end_idx - byte_idx; 227 prev_st = pstr->cur_state; 228 /* Apply the translation if we need. */ 229 if (BE (pstr->trans != NULL, 0)) 230 { 231 int i, ch; 232 233 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 234 { 235 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i]; 236 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch]; 237 } 238 p = (const char *) buf; 239 } 240 else 241 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx; 242 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 243 if (BE (mbclen == (size_t) -2, 0)) 244 { 245 /* The buffer doesn't have enough space, finish to build. */ 246 pstr->cur_state = prev_st; 247 break; 248 } 249 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0)) 250 { 251 /* We treat these cases as a singlebyte character. */ 252 mbclen = 1; 253 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 254 if (BE (pstr->trans != NULL, 0)) 255 wc = pstr->trans[wc]; 256 pstr->cur_state = prev_st; 257 } 258 259 /* Write wide character and padding. */ 260 pstr->wcs[byte_idx++] = wc; 261 /* Write paddings. */ 262 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 263 pstr->wcs[byte_idx++] = WEOF; 264 } 265 pstr->valid_len = byte_idx; 266 pstr->valid_raw_len = byte_idx; 267 } 268 269 /* Build wide character buffer PSTR->WCS like build_wcs_buffer, 270 but for REG_ICASE. */ 271 272 static reg_errcode_t 273 internal_function 274 build_wcs_upper_buffer (re_string_t *pstr) 275 { 276 mbstate_t prev_st; 277 Idx src_idx, byte_idx, end_idx, remain_len; 278 size_t mbclen; 279 #ifdef _LIBC 280 char buf[MB_LEN_MAX]; 281 assert (MB_LEN_MAX >= pstr->mb_cur_max); 282 #else 283 char buf[64]; 284 #endif 285 286 byte_idx = pstr->valid_len; 287 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 288 289 /* The following optimization assumes that ASCII characters can be 290 mapped to wide characters with a simple cast. */ 291 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed) 292 { 293 while (byte_idx < end_idx) 294 { 295 wchar_t wc; 296 297 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]) 298 && mbsinit (&pstr->cur_state)) 299 { 300 /* In case of a singlebyte character. */ 301 pstr->mbs[byte_idx] 302 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]); 303 /* The next step uses the assumption that wchar_t is encoded 304 ASCII-safe: all ASCII values can be converted like this. */ 305 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx]; 306 ++byte_idx; 307 continue; 308 } 309 310 remain_len = end_idx - byte_idx; 311 prev_st = pstr->cur_state; 312 mbclen = __mbrtowc (&wc, 313 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx 314 + byte_idx), remain_len, &pstr->cur_state); 315 if (BE (mbclen < (size_t) -2, 1)) 316 { 317 wchar_t wcu = wc; 318 if (iswlower (wc)) 319 { 320 size_t mbcdlen; 321 322 wcu = towupper (wc); 323 mbcdlen = wcrtomb (buf, wcu, &prev_st); 324 if (BE (mbclen == mbcdlen, 1)) 325 memcpy (pstr->mbs + byte_idx, buf, mbclen); 326 else 327 { 328 src_idx = byte_idx; 329 goto offsets_needed; 330 } 331 } 332 else 333 memcpy (pstr->mbs + byte_idx, 334 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen); 335 pstr->wcs[byte_idx++] = wcu; 336 /* Write paddings. */ 337 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 338 pstr->wcs[byte_idx++] = WEOF; 339 } 340 else if (mbclen == (size_t) -1 || mbclen == 0) 341 { 342 /* It is an invalid character or '\0'. Just use the byte. */ 343 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]; 344 pstr->mbs[byte_idx] = ch; 345 /* And also cast it to wide char. */ 346 pstr->wcs[byte_idx++] = (wchar_t) ch; 347 if (BE (mbclen == (size_t) -1, 0)) 348 pstr->cur_state = prev_st; 349 } 350 else 351 { 352 /* The buffer doesn't have enough space, finish to build. */ 353 pstr->cur_state = prev_st; 354 break; 355 } 356 } 357 pstr->valid_len = byte_idx; 358 pstr->valid_raw_len = byte_idx; 359 return REG_NOERROR; 360 } 361 else 362 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;) 363 { 364 wchar_t wc; 365 const char *p; 366 offsets_needed: 367 remain_len = end_idx - byte_idx; 368 prev_st = pstr->cur_state; 369 if (BE (pstr->trans != NULL, 0)) 370 { 371 int i, ch; 372 373 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i) 374 { 375 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i]; 376 buf[i] = pstr->trans[ch]; 377 } 378 p = (const char *) buf; 379 } 380 else 381 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx; 382 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state); 383 if (BE (mbclen < (size_t) -2, 1)) 384 { 385 wchar_t wcu = wc; 386 if (iswlower (wc)) 387 { 388 size_t mbcdlen; 389 390 wcu = towupper (wc); 391 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st); 392 if (BE (mbclen == mbcdlen, 1)) 393 memcpy (pstr->mbs + byte_idx, buf, mbclen); 394 else if (mbcdlen != (size_t) -1) 395 { 396 size_t i; 397 398 if (byte_idx + mbcdlen > pstr->bufs_len) 399 { 400 pstr->cur_state = prev_st; 401 break; 402 } 403 404 if (pstr->offsets == NULL) 405 { 406 pstr->offsets = re_malloc (Idx, pstr->bufs_len); 407 408 if (pstr->offsets == NULL) 409 return REG_ESPACE; 410 } 411 if (!pstr->offsets_needed) 412 { 413 for (i = 0; i < (size_t) byte_idx; ++i) 414 pstr->offsets[i] = i; 415 pstr->offsets_needed = 1; 416 } 417 418 memcpy (pstr->mbs + byte_idx, buf, mbcdlen); 419 pstr->wcs[byte_idx] = wcu; 420 pstr->offsets[byte_idx] = src_idx; 421 for (i = 1; i < mbcdlen; ++i) 422 { 423 pstr->offsets[byte_idx + i] 424 = src_idx + (i < mbclen ? i : mbclen - 1); 425 pstr->wcs[byte_idx + i] = WEOF; 426 } 427 pstr->len += mbcdlen - mbclen; 428 if (pstr->raw_stop > src_idx) 429 pstr->stop += mbcdlen - mbclen; 430 end_idx = (pstr->bufs_len > pstr->len) 431 ? pstr->len : pstr->bufs_len; 432 byte_idx += mbcdlen; 433 src_idx += mbclen; 434 continue; 435 } 436 else 437 memcpy (pstr->mbs + byte_idx, p, mbclen); 438 } 439 else 440 memcpy (pstr->mbs + byte_idx, p, mbclen); 441 442 if (BE (pstr->offsets_needed != 0, 0)) 443 { 444 size_t i; 445 for (i = 0; i < mbclen; ++i) 446 pstr->offsets[byte_idx + i] = src_idx + i; 447 } 448 src_idx += mbclen; 449 450 pstr->wcs[byte_idx++] = wcu; 451 /* Write paddings. */ 452 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;) 453 pstr->wcs[byte_idx++] = WEOF; 454 } 455 else if (mbclen == (size_t) -1 || mbclen == 0) 456 { 457 /* It is an invalid character or '\0'. Just use the byte. */ 458 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx]; 459 460 if (BE (pstr->trans != NULL, 0)) 461 ch = pstr->trans [ch]; 462 pstr->mbs[byte_idx] = ch; 463 464 if (BE (pstr->offsets_needed != 0, 0)) 465 pstr->offsets[byte_idx] = src_idx; 466 ++src_idx; 467 468 /* And also cast it to wide char. */ 469 pstr->wcs[byte_idx++] = (wchar_t) ch; 470 if (BE (mbclen == (size_t) -1, 0)) 471 pstr->cur_state = prev_st; 472 } 473 else 474 { 475 /* The buffer doesn't have enough space, finish to build. */ 476 pstr->cur_state = prev_st; 477 break; 478 } 479 } 480 pstr->valid_len = byte_idx; 481 pstr->valid_raw_len = src_idx; 482 return REG_NOERROR; 483 } 484 485 /* Skip characters until the index becomes greater than NEW_RAW_IDX. 486 Return the index. */ 487 488 static Idx 489 internal_function 490 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc) 491 { 492 mbstate_t prev_st; 493 Idx rawbuf_idx; 494 size_t mbclen; 495 wint_t wc = WEOF; 496 497 /* Skip the characters which are not necessary to check. */ 498 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len; 499 rawbuf_idx < new_raw_idx;) 500 { 501 wchar_t wc2; 502 Idx remain_len; 503 remain_len = pstr->len - rawbuf_idx; 504 prev_st = pstr->cur_state; 505 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx, 506 remain_len, &pstr->cur_state); 507 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0)) 508 { 509 /* We treat these cases as a single byte character. */ 510 if (mbclen == 0 || remain_len == 0) 511 wc = L'\0'; 512 else 513 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx); 514 mbclen = 1; 515 pstr->cur_state = prev_st; 516 } 517 else 518 wc = wc2; 519 /* Then proceed the next character. */ 520 rawbuf_idx += mbclen; 521 } 522 *last_wc = wc; 523 return rawbuf_idx; 524 } 525 #endif /* RE_ENABLE_I18N */ 526 527 /* Build the buffer PSTR->MBS, and apply the translation if we need. 528 This function is used in case of REG_ICASE. */ 529 530 static void 531 internal_function 532 build_upper_buffer (re_string_t *pstr) 533 { 534 Idx char_idx, end_idx; 535 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 536 537 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx) 538 { 539 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx]; 540 if (BE (pstr->trans != NULL, 0)) 541 ch = pstr->trans[ch]; 542 if (islower (ch)) 543 pstr->mbs[char_idx] = toupper (ch); 544 else 545 pstr->mbs[char_idx] = ch; 546 } 547 pstr->valid_len = char_idx; 548 pstr->valid_raw_len = char_idx; 549 } 550 551 /* Apply TRANS to the buffer in PSTR. */ 552 553 static void 554 internal_function 555 re_string_translate_buffer (re_string_t *pstr) 556 { 557 Idx buf_idx, end_idx; 558 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len; 559 560 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx) 561 { 562 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx]; 563 pstr->mbs[buf_idx] = pstr->trans[ch]; 564 } 565 566 pstr->valid_len = buf_idx; 567 pstr->valid_raw_len = buf_idx; 568 } 569 570 /* This function re-construct the buffers. 571 Concretely, convert to wide character in case of pstr->mb_cur_max > 1, 572 convert to upper case in case of REG_ICASE, apply translation. */ 573 574 static reg_errcode_t 575 internal_function 576 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags) 577 { 578 Idx offset; 579 580 if (BE (pstr->raw_mbs_idx <= idx, 0)) 581 offset = idx - pstr->raw_mbs_idx; 582 else 583 { 584 /* Reset buffer. */ 585 #ifdef RE_ENABLE_I18N 586 if (pstr->mb_cur_max > 1) 587 memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 588 #endif /* RE_ENABLE_I18N */ 589 pstr->len = pstr->raw_len; 590 pstr->stop = pstr->raw_stop; 591 pstr->valid_len = 0; 592 pstr->raw_mbs_idx = 0; 593 pstr->valid_raw_len = 0; 594 pstr->offsets_needed = 0; 595 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF 596 : CONTEXT_NEWLINE | CONTEXT_BEGBUF); 597 if (!pstr->mbs_allocated) 598 pstr->mbs = (unsigned char *) pstr->raw_mbs; 599 offset = idx; 600 } 601 602 if (BE (offset != 0, 1)) 603 { 604 /* Should the already checked characters be kept? */ 605 if (BE (offset < pstr->valid_raw_len, 1)) 606 { 607 /* Yes, move them to the front of the buffer. */ 608 #ifdef RE_ENABLE_I18N 609 if (BE (pstr->offsets_needed, 0)) 610 { 611 Idx low = 0, high = pstr->valid_len, mid; 612 do 613 { 614 mid = (high + low) / 2; 615 if (pstr->offsets[mid] > offset) 616 high = mid; 617 else if (pstr->offsets[mid] < offset) 618 low = mid + 1; 619 else 620 break; 621 } 622 while (low < high); 623 if (pstr->offsets[mid] < offset) 624 ++mid; 625 pstr->tip_context = re_string_context_at (pstr, mid - 1, 626 eflags); 627 /* This can be quite complicated, so handle specially 628 only the common and easy case where the character with 629 different length representation of lower and upper 630 case is present at or after offset. */ 631 if (pstr->valid_len > offset 632 && mid == offset && pstr->offsets[mid] == offset) 633 { 634 memmove (pstr->wcs, pstr->wcs + offset, 635 (pstr->valid_len - offset) * sizeof (wint_t)); 636 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset); 637 pstr->valid_len -= offset; 638 pstr->valid_raw_len -= offset; 639 for (low = 0; low < pstr->valid_len; low++) 640 pstr->offsets[low] = pstr->offsets[low + offset] - offset; 641 } 642 else 643 { 644 /* Otherwise, just find out how long the partial multibyte 645 character at offset is and fill it with WEOF/255. */ 646 pstr->len = pstr->raw_len - idx + offset; 647 pstr->stop = pstr->raw_stop - idx + offset; 648 pstr->offsets_needed = 0; 649 while (mid > 0 && pstr->offsets[mid - 1] == offset) 650 --mid; 651 while (mid < pstr->valid_len) 652 if (pstr->wcs[mid] != WEOF) 653 break; 654 else 655 ++mid; 656 if (mid == pstr->valid_len) 657 pstr->valid_len = 0; 658 else 659 { 660 pstr->valid_len = pstr->offsets[mid] - offset; 661 if (pstr->valid_len) 662 { 663 for (low = 0; low < pstr->valid_len; ++low) 664 pstr->wcs[low] = WEOF; 665 memset (pstr->mbs, 255, pstr->valid_len); 666 } 667 } 668 pstr->valid_raw_len = pstr->valid_len; 669 } 670 } 671 else 672 #endif 673 { 674 pstr->tip_context = re_string_context_at (pstr, offset - 1, 675 eflags); 676 #ifdef RE_ENABLE_I18N 677 if (pstr->mb_cur_max > 1) 678 memmove (pstr->wcs, pstr->wcs + offset, 679 (pstr->valid_len - offset) * sizeof (wint_t)); 680 #endif /* RE_ENABLE_I18N */ 681 if (BE (pstr->mbs_allocated, 0)) 682 memmove (pstr->mbs, pstr->mbs + offset, 683 pstr->valid_len - offset); 684 pstr->valid_len -= offset; 685 pstr->valid_raw_len -= offset; 686 #if DEBUG 687 assert (pstr->valid_len > 0); 688 #endif 689 } 690 } 691 else 692 { 693 #ifdef RE_ENABLE_I18N 694 /* No, skip all characters until IDX. */ 695 Idx prev_valid_len = pstr->valid_len; 696 697 if (BE (pstr->offsets_needed, 0)) 698 { 699 pstr->len = pstr->raw_len - idx + offset; 700 pstr->stop = pstr->raw_stop - idx + offset; 701 pstr->offsets_needed = 0; 702 } 703 #endif 704 pstr->valid_len = 0; 705 #ifdef RE_ENABLE_I18N 706 if (pstr->mb_cur_max > 1) 707 { 708 Idx wcs_idx; 709 wint_t wc = WEOF; 710 711 if (pstr->is_utf8) 712 { 713 const unsigned char *raw, *p, *end; 714 715 /* Special case UTF-8. Multi-byte chars start with any 716 byte other than 0x80 - 0xbf. */ 717 raw = pstr->raw_mbs + pstr->raw_mbs_idx; 718 end = raw + (offset - pstr->mb_cur_max); 719 if (end < pstr->raw_mbs) 720 end = pstr->raw_mbs; 721 p = raw + offset - 1; 722 #ifdef _LIBC 723 /* We know the wchar_t encoding is UCS4, so for the simple 724 case, ASCII characters, skip the conversion step. */ 725 if (isascii (*p) && BE (pstr->trans == NULL, 1)) 726 { 727 memset (&pstr->cur_state, '\0', sizeof (mbstate_t)); 728 /* pstr->valid_len = 0; */ 729 wc = (wchar_t) *p; 730 } 731 else 732 #endif 733 for (; p >= end; --p) 734 if ((*p & 0xc0) != 0x80) 735 { 736 mbstate_t cur_state; 737 wchar_t wc2; 738 Idx mlen = raw + pstr->len - p; 739 unsigned char buf[6]; 740 size_t mbclen; 741 742 if (BE (pstr->trans != NULL, 0)) 743 { 744 int i = mlen < 6 ? mlen : 6; 745 while (--i >= 0) 746 buf[i] = pstr->trans[p[i]]; 747 } 748 /* XXX Don't use mbrtowc, we know which conversion 749 to use (UTF-8 -> UCS4). */ 750 memset (&cur_state, 0, sizeof (cur_state)); 751 mbclen = __mbrtowc (&wc2, (const char *) p, mlen, 752 &cur_state); 753 if (raw + offset - p <= mbclen 754 && mbclen < (size_t) -2) 755 { 756 memset (&pstr->cur_state, '\0', 757 sizeof (mbstate_t)); 758 pstr->valid_len = mbclen - (raw + offset - p); 759 wc = wc2; 760 } 761 break; 762 } 763 } 764 765 if (wc == WEOF) 766 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; 767 if (wc == WEOF) 768 pstr->tip_context 769 = re_string_context_at (pstr, prev_valid_len - 1, eflags); 770 else 771 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0) 772 && IS_WIDE_WORD_CHAR (wc)) 773 ? CONTEXT_WORD 774 : ((IS_WIDE_NEWLINE (wc) 775 && pstr->newline_anchor) 776 ? CONTEXT_NEWLINE : 0)); 777 if (BE (pstr->valid_len, 0)) 778 { 779 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx) 780 pstr->wcs[wcs_idx] = WEOF; 781 if (pstr->mbs_allocated) 782 memset (pstr->mbs, 255, pstr->valid_len); 783 } 784 pstr->valid_raw_len = pstr->valid_len; 785 } 786 else 787 #endif /* RE_ENABLE_I18N */ 788 { 789 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1]; 790 pstr->valid_raw_len = 0; 791 if (pstr->trans) 792 c = pstr->trans[c]; 793 pstr->tip_context = (bitset_contain (pstr->word_char, c) 794 ? CONTEXT_WORD 795 : ((IS_NEWLINE (c) && pstr->newline_anchor) 796 ? CONTEXT_NEWLINE : 0)); 797 } 798 } 799 if (!BE (pstr->mbs_allocated, 0)) 800 pstr->mbs += offset; 801 } 802 pstr->raw_mbs_idx = idx; 803 pstr->len -= offset; 804 pstr->stop -= offset; 805 806 /* Then build the buffers. */ 807 #ifdef RE_ENABLE_I18N 808 if (pstr->mb_cur_max > 1) 809 { 810 if (pstr->icase) 811 { 812 reg_errcode_t ret = build_wcs_upper_buffer (pstr); 813 if (BE (ret != REG_NOERROR, 0)) 814 return ret; 815 } 816 else 817 build_wcs_buffer (pstr); 818 } 819 else 820 #endif /* RE_ENABLE_I18N */ 821 if (BE (pstr->mbs_allocated, 0)) 822 { 823 if (pstr->icase) 824 build_upper_buffer (pstr); 825 else if (pstr->trans != NULL) 826 re_string_translate_buffer (pstr); 827 } 828 else 829 pstr->valid_len = pstr->len; 830 831 pstr->cur_idx = 0; 832 return REG_NOERROR; 833 } 834 835 static unsigned char 836 internal_function __attribute ((pure)) 837 re_string_peek_byte_case (const re_string_t *pstr, Idx idx) 838 { 839 int ch; 840 Idx off; 841 842 /* Handle the common (easiest) cases first. */ 843 if (BE (!pstr->mbs_allocated, 1)) 844 return re_string_peek_byte (pstr, idx); 845 846 #ifdef RE_ENABLE_I18N 847 if (pstr->mb_cur_max > 1 848 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx)) 849 return re_string_peek_byte (pstr, idx); 850 #endif 851 852 off = pstr->cur_idx + idx; 853 #ifdef RE_ENABLE_I18N 854 if (pstr->offsets_needed) 855 off = pstr->offsets[off]; 856 #endif 857 858 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 859 860 #ifdef RE_ENABLE_I18N 861 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I 862 this function returns CAPITAL LETTER I instead of first byte of 863 DOTLESS SMALL LETTER I. The latter would confuse the parser, 864 since peek_byte_case doesn't advance cur_idx in any way. */ 865 if (pstr->offsets_needed && !isascii (ch)) 866 return re_string_peek_byte (pstr, idx); 867 #endif 868 869 return ch; 870 } 871 872 static unsigned char 873 internal_function __attribute ((pure)) 874 re_string_fetch_byte_case (re_string_t *pstr) 875 { 876 if (BE (!pstr->mbs_allocated, 1)) 877 return re_string_fetch_byte (pstr); 878 879 #ifdef RE_ENABLE_I18N 880 if (pstr->offsets_needed) 881 { 882 Idx off; 883 int ch; 884 885 /* For tr_TR.UTF-8 [[:islower:]] there is 886 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip 887 in that case the whole multi-byte character and return 888 the original letter. On the other side, with 889 [[: DOTLESS SMALL LETTER I return [[:I, as doing 890 anything else would complicate things too much. */ 891 892 if (!re_string_first_byte (pstr, pstr->cur_idx)) 893 return re_string_fetch_byte (pstr); 894 895 off = pstr->offsets[pstr->cur_idx]; 896 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off]; 897 898 if (! isascii (ch)) 899 return re_string_fetch_byte (pstr); 900 901 re_string_skip_bytes (pstr, 902 re_string_char_size_at (pstr, pstr->cur_idx)); 903 return ch; 904 } 905 #endif 906 907 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++]; 908 } 909 910 static void 911 internal_function 912 re_string_destruct (re_string_t *pstr) 913 { 914 #ifdef RE_ENABLE_I18N 915 re_free (pstr->wcs); 916 re_free (pstr->offsets); 917 #endif /* RE_ENABLE_I18N */ 918 if (pstr->mbs_allocated) 919 re_free (pstr->mbs); 920 } 921 922 /* Return the context at IDX in INPUT. */ 923 924 static unsigned int 925 internal_function 926 re_string_context_at (const re_string_t *input, Idx idx, int eflags) 927 { 928 int c; 929 if (BE (! REG_VALID_INDEX (idx), 0)) 930 /* In this case, we use the value stored in input->tip_context, 931 since we can't know the character in input->mbs[-1] here. */ 932 return input->tip_context; 933 if (BE (idx == input->len, 0)) 934 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF 935 : CONTEXT_NEWLINE | CONTEXT_ENDBUF); 936 #ifdef RE_ENABLE_I18N 937 if (input->mb_cur_max > 1) 938 { 939 wint_t wc; 940 Idx wc_idx = idx; 941 while(input->wcs[wc_idx] == WEOF) 942 { 943 #ifdef DEBUG 944 /* It must not happen. */ 945 assert (REG_VALID_INDEX (wc_idx)); 946 #endif 947 --wc_idx; 948 if (! REG_VALID_INDEX (wc_idx)) 949 return input->tip_context; 950 } 951 wc = input->wcs[wc_idx]; 952 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc)) 953 return CONTEXT_WORD; 954 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor 955 ? CONTEXT_NEWLINE : 0); 956 } 957 else 958 #endif 959 { 960 c = re_string_byte_at (input, idx); 961 if (bitset_contain (input->word_char, c)) 962 return CONTEXT_WORD; 963 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0; 964 } 965 } 966 967 /* Functions for set operation. */ 969 970 static reg_errcode_t 971 internal_function 972 re_node_set_alloc (re_node_set *set, Idx size) 973 { 974 set->alloc = size; 975 set->nelem = 0; 976 set->elems = re_malloc (Idx, size); 977 if (BE (set->elems == NULL, 0)) 978 return REG_ESPACE; 979 return REG_NOERROR; 980 } 981 982 static reg_errcode_t 983 internal_function 984 re_node_set_init_1 (re_node_set *set, Idx elem) 985 { 986 set->alloc = 1; 987 set->nelem = 1; 988 set->elems = re_malloc (Idx, 1); 989 if (BE (set->elems == NULL, 0)) 990 { 991 set->alloc = set->nelem = 0; 992 return REG_ESPACE; 993 } 994 set->elems[0] = elem; 995 return REG_NOERROR; 996 } 997 998 static reg_errcode_t 999 internal_function 1000 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2) 1001 { 1002 set->alloc = 2; 1003 set->elems = re_malloc (Idx, 2); 1004 if (BE (set->elems == NULL, 0)) 1005 return REG_ESPACE; 1006 if (elem1 == elem2) 1007 { 1008 set->nelem = 1; 1009 set->elems[0] = elem1; 1010 } 1011 else 1012 { 1013 set->nelem = 2; 1014 if (elem1 < elem2) 1015 { 1016 set->elems[0] = elem1; 1017 set->elems[1] = elem2; 1018 } 1019 else 1020 { 1021 set->elems[0] = elem2; 1022 set->elems[1] = elem1; 1023 } 1024 } 1025 return REG_NOERROR; 1026 } 1027 1028 static reg_errcode_t 1029 internal_function 1030 re_node_set_init_copy (re_node_set *dest, const re_node_set *src) 1031 { 1032 dest->nelem = src->nelem; 1033 if (src->nelem > 0) 1034 { 1035 dest->alloc = dest->nelem; 1036 dest->elems = re_malloc (Idx, dest->alloc); 1037 if (BE (dest->elems == NULL, 0)) 1038 { 1039 dest->alloc = dest->nelem = 0; 1040 return REG_ESPACE; 1041 } 1042 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx)); 1043 } 1044 else 1045 re_node_set_init_empty (dest); 1046 return REG_NOERROR; 1047 } 1048 1049 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to 1050 DEST. Return value indicate the error code or REG_NOERROR if succeeded. 1051 Note: We assume dest->elems is NULL, when dest->alloc is 0. */ 1052 1053 static reg_errcode_t 1054 internal_function 1055 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1, 1056 const re_node_set *src2) 1057 { 1058 Idx i1, i2, is, id, delta, sbase; 1059 if (src1->nelem == 0 || src2->nelem == 0) 1060 return REG_NOERROR; 1061 1062 /* We need dest->nelem + 2 * elems_in_intersection; this is a 1063 conservative estimate. */ 1064 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc) 1065 { 1066 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc; 1067 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc); 1068 if (BE (new_elems == NULL, 0)) 1069 return REG_ESPACE; 1070 dest->elems = new_elems; 1071 dest->alloc = new_alloc; 1072 } 1073 1074 /* Find the items in the intersection of SRC1 and SRC2, and copy 1075 into the top of DEST those that are not already in DEST itself. */ 1076 sbase = dest->nelem + src1->nelem + src2->nelem; 1077 i1 = src1->nelem - 1; 1078 i2 = src2->nelem - 1; 1079 id = dest->nelem - 1; 1080 for (;;) 1081 { 1082 if (src1->elems[i1] == src2->elems[i2]) 1083 { 1084 /* Try to find the item in DEST. Maybe we could binary search? */ 1085 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1]) 1086 --id; 1087 1088 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1]) 1089 dest->elems[--sbase] = src1->elems[i1]; 1090 1091 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2)) 1092 break; 1093 } 1094 1095 /* Lower the highest of the two items. */ 1096 else if (src1->elems[i1] < src2->elems[i2]) 1097 { 1098 if (! REG_VALID_INDEX (--i2)) 1099 break; 1100 } 1101 else 1102 { 1103 if (! REG_VALID_INDEX (--i1)) 1104 break; 1105 } 1106 } 1107 1108 id = dest->nelem - 1; 1109 is = dest->nelem + src1->nelem + src2->nelem - 1; 1110 delta = is - sbase + 1; 1111 1112 /* Now copy. When DELTA becomes zero, the remaining 1113 DEST elements are already in place; this is more or 1114 less the same loop that is in re_node_set_merge. */ 1115 dest->nelem += delta; 1116 if (delta > 0 && REG_VALID_INDEX (id)) 1117 for (;;) 1118 { 1119 if (dest->elems[is] > dest->elems[id]) 1120 { 1121 /* Copy from the top. */ 1122 dest->elems[id + delta--] = dest->elems[is--]; 1123 if (delta == 0) 1124 break; 1125 } 1126 else 1127 { 1128 /* Slide from the bottom. */ 1129 dest->elems[id + delta] = dest->elems[id]; 1130 if (! REG_VALID_INDEX (--id)) 1131 break; 1132 } 1133 } 1134 1135 /* Copy remaining SRC elements. */ 1136 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx)); 1137 1138 return REG_NOERROR; 1139 } 1140 1141 /* Calculate the union set of the sets SRC1 and SRC2. And store it to 1142 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1143 1144 static reg_errcode_t 1145 internal_function 1146 re_node_set_init_union (re_node_set *dest, const re_node_set *src1, 1147 const re_node_set *src2) 1148 { 1149 Idx i1, i2, id; 1150 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0) 1151 { 1152 dest->alloc = src1->nelem + src2->nelem; 1153 dest->elems = re_malloc (Idx, dest->alloc); 1154 if (BE (dest->elems == NULL, 0)) 1155 return REG_ESPACE; 1156 } 1157 else 1158 { 1159 if (src1 != NULL && src1->nelem > 0) 1160 return re_node_set_init_copy (dest, src1); 1161 else if (src2 != NULL && src2->nelem > 0) 1162 return re_node_set_init_copy (dest, src2); 1163 else 1164 re_node_set_init_empty (dest); 1165 return REG_NOERROR; 1166 } 1167 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;) 1168 { 1169 if (src1->elems[i1] > src2->elems[i2]) 1170 { 1171 dest->elems[id++] = src2->elems[i2++]; 1172 continue; 1173 } 1174 if (src1->elems[i1] == src2->elems[i2]) 1175 ++i2; 1176 dest->elems[id++] = src1->elems[i1++]; 1177 } 1178 if (i1 < src1->nelem) 1179 { 1180 memcpy (dest->elems + id, src1->elems + i1, 1181 (src1->nelem - i1) * sizeof (Idx)); 1182 id += src1->nelem - i1; 1183 } 1184 else if (i2 < src2->nelem) 1185 { 1186 memcpy (dest->elems + id, src2->elems + i2, 1187 (src2->nelem - i2) * sizeof (Idx)); 1188 id += src2->nelem - i2; 1189 } 1190 dest->nelem = id; 1191 return REG_NOERROR; 1192 } 1193 1194 /* Calculate the union set of the sets DEST and SRC. And store it to 1195 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */ 1196 1197 static reg_errcode_t 1198 internal_function 1199 re_node_set_merge (re_node_set *dest, const re_node_set *src) 1200 { 1201 Idx is, id, sbase, delta; 1202 if (src == NULL || src->nelem == 0) 1203 return REG_NOERROR; 1204 if (dest->alloc < 2 * src->nelem + dest->nelem) 1205 { 1206 Idx new_alloc = 2 * (src->nelem + dest->alloc); 1207 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc); 1208 if (BE (new_buffer == NULL, 0)) 1209 return REG_ESPACE; 1210 dest->elems = new_buffer; 1211 dest->alloc = new_alloc; 1212 } 1213 1214 if (BE (dest->nelem == 0, 0)) 1215 { 1216 dest->nelem = src->nelem; 1217 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx)); 1218 return REG_NOERROR; 1219 } 1220 1221 /* Copy into the top of DEST the items of SRC that are not 1222 found in DEST. Maybe we could binary search in DEST? */ 1223 for (sbase = dest->nelem + 2 * src->nelem, 1224 is = src->nelem - 1, id = dest->nelem - 1; 1225 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); ) 1226 { 1227 if (dest->elems[id] == src->elems[is]) 1228 is--, id--; 1229 else if (dest->elems[id] < src->elems[is]) 1230 dest->elems[--sbase] = src->elems[is--]; 1231 else /* if (dest->elems[id] > src->elems[is]) */ 1232 --id; 1233 } 1234 1235 if (REG_VALID_INDEX (is)) 1236 { 1237 /* If DEST is exhausted, the remaining items of SRC must be unique. */ 1238 sbase -= is + 1; 1239 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx)); 1240 } 1241 1242 id = dest->nelem - 1; 1243 is = dest->nelem + 2 * src->nelem - 1; 1244 delta = is - sbase + 1; 1245 if (delta == 0) 1246 return REG_NOERROR; 1247 1248 /* Now copy. When DELTA becomes zero, the remaining 1249 DEST elements are already in place. */ 1250 dest->nelem += delta; 1251 for (;;) 1252 { 1253 if (dest->elems[is] > dest->elems[id]) 1254 { 1255 /* Copy from the top. */ 1256 dest->elems[id + delta--] = dest->elems[is--]; 1257 if (delta == 0) 1258 break; 1259 } 1260 else 1261 { 1262 /* Slide from the bottom. */ 1263 dest->elems[id + delta] = dest->elems[id]; 1264 if (! REG_VALID_INDEX (--id)) 1265 { 1266 /* Copy remaining SRC elements. */ 1267 memcpy (dest->elems, dest->elems + sbase, 1268 delta * sizeof (Idx)); 1269 break; 1270 } 1271 } 1272 } 1273 1274 return REG_NOERROR; 1275 } 1276 1277 /* Insert the new element ELEM to the re_node_set* SET. 1278 SET should not already have ELEM. 1279 Return true if successful. */ 1280 1281 static bool 1282 internal_function 1283 re_node_set_insert (re_node_set *set, Idx elem) 1284 { 1285 Idx idx; 1286 /* In case the set is empty. */ 1287 if (set->alloc == 0) 1288 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1); 1289 1290 if (BE (set->nelem, 0) == 0) 1291 { 1292 /* We already guaranteed above that set->alloc != 0. */ 1293 set->elems[0] = elem; 1294 ++set->nelem; 1295 return true; 1296 } 1297 1298 /* Realloc if we need. */ 1299 if (set->alloc == set->nelem) 1300 { 1301 Idx *new_elems; 1302 set->alloc = set->alloc * 2; 1303 new_elems = re_realloc (set->elems, Idx, set->alloc); 1304 if (BE (new_elems == NULL, 0)) 1305 return false; 1306 set->elems = new_elems; 1307 } 1308 1309 /* Move the elements which follows the new element. Test the 1310 first element separately to skip a check in the inner loop. */ 1311 if (elem < set->elems[0]) 1312 { 1313 idx = 0; 1314 for (idx = set->nelem; idx > 0; idx--) 1315 set->elems[idx] = set->elems[idx - 1]; 1316 } 1317 else 1318 { 1319 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--) 1320 set->elems[idx] = set->elems[idx - 1]; 1321 } 1322 1323 /* Insert the new element. */ 1324 set->elems[idx] = elem; 1325 ++set->nelem; 1326 return true; 1327 } 1328 1329 /* Insert the new element ELEM to the re_node_set* SET. 1330 SET should not already have any element greater than or equal to ELEM. 1331 Return true if successful. */ 1332 1333 static bool 1334 internal_function 1335 re_node_set_insert_last (re_node_set *set, Idx elem) 1336 { 1337 /* Realloc if we need. */ 1338 if (set->alloc == set->nelem) 1339 { 1340 Idx *new_elems; 1341 set->alloc = (set->alloc + 1) * 2; 1342 new_elems = re_realloc (set->elems, Idx, set->alloc); 1343 if (BE (new_elems == NULL, 0)) 1344 return false; 1345 set->elems = new_elems; 1346 } 1347 1348 /* Insert the new element. */ 1349 set->elems[set->nelem++] = elem; 1350 return true; 1351 } 1352 1353 /* Compare two node sets SET1 and SET2. 1354 Return true if SET1 and SET2 are equivalent. */ 1355 1356 static bool 1357 internal_function __attribute ((pure)) 1358 re_node_set_compare (const re_node_set *set1, const re_node_set *set2) 1359 { 1360 Idx i; 1361 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem) 1362 return false; 1363 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; ) 1364 if (set1->elems[i] != set2->elems[i]) 1365 return false; 1366 return true; 1367 } 1368 1369 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */ 1370 1371 static Idx 1372 internal_function __attribute ((pure)) 1373 re_node_set_contains (const re_node_set *set, Idx elem) 1374 { 1375 __re_size_t idx, right, mid; 1376 if (! REG_VALID_NONZERO_INDEX (set->nelem)) 1377 return 0; 1378 1379 /* Binary search the element. */ 1380 idx = 0; 1381 right = set->nelem - 1; 1382 while (idx < right) 1383 { 1384 mid = (idx + right) / 2; 1385 if (set->elems[mid] < elem) 1386 idx = mid + 1; 1387 else 1388 right = mid; 1389 } 1390 return set->elems[idx] == elem ? idx + 1 : 0; 1391 } 1392 1393 static void 1394 internal_function 1395 re_node_set_remove_at (re_node_set *set, Idx idx) 1396 { 1397 if (idx < 0 || idx >= set->nelem) 1398 return; 1399 --set->nelem; 1400 for (; idx < set->nelem; idx++) 1401 set->elems[idx] = set->elems[idx + 1]; 1402 } 1403 1404 1406 /* Add the token TOKEN to dfa->nodes, and return the index of the token. 1407 Or return REG_MISSING if an error occurred. */ 1408 1409 static Idx 1410 internal_function 1411 re_dfa_add_node (re_dfa_t *dfa, re_token_t token) 1412 { 1413 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0)) 1414 { 1415 size_t new_nodes_alloc = dfa->nodes_alloc * 2; 1416 Idx *new_nexts, *new_indices; 1417 re_node_set *new_edests, *new_eclosures; 1418 re_token_t *new_nodes; 1419 size_t max_object_size = 1420 MAX (sizeof (re_token_t), 1421 MAX (sizeof (re_node_set), 1422 sizeof (Idx))); 1423 1424 /* Avoid overflows. */ 1425 if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0)) 1426 return REG_MISSING; 1427 1428 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc); 1429 if (BE (new_nodes == NULL, 0)) 1430 return REG_MISSING; 1431 dfa->nodes = new_nodes; 1432 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc); 1433 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc); 1434 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc); 1435 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc); 1436 if (BE (new_nexts == NULL || new_indices == NULL 1437 || new_edests == NULL || new_eclosures == NULL, 0)) 1438 return REG_MISSING; 1439 dfa->nexts = new_nexts; 1440 dfa->org_indices = new_indices; 1441 dfa->edests = new_edests; 1442 dfa->eclosures = new_eclosures; 1443 dfa->nodes_alloc = new_nodes_alloc; 1444 } 1445 dfa->nodes[dfa->nodes_len] = token; 1446 dfa->nodes[dfa->nodes_len].constraint = 0; 1447 #ifdef RE_ENABLE_I18N 1448 { 1449 int type = token.type; 1450 dfa->nodes[dfa->nodes_len].accept_mb = 1451 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET; 1452 } 1453 #endif 1454 dfa->nexts[dfa->nodes_len] = REG_MISSING; 1455 re_node_set_init_empty (dfa->edests + dfa->nodes_len); 1456 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len); 1457 return dfa->nodes_len++; 1458 } 1459 1460 static inline re_hashval_t 1461 internal_function 1462 calc_state_hash (const re_node_set *nodes, unsigned int context) 1463 { 1464 re_hashval_t hash = nodes->nelem + context; 1465 Idx i; 1466 for (i = 0 ; i < nodes->nelem ; i++) 1467 hash += nodes->elems[i]; 1468 return hash; 1469 } 1470 1471 /* Search for the state whose node_set is equivalent to NODES. 1472 Return the pointer to the state, if we found it in the DFA. 1473 Otherwise create the new one and return it. In case of an error 1474 return NULL and set the error code in ERR. 1475 Note: - We assume NULL as the invalid state, then it is possible that 1476 return value is NULL and ERR is REG_NOERROR. 1477 - We never return non-NULL value in case of any errors, it is for 1478 optimization. */ 1479 1480 static re_dfastate_t * 1481 internal_function 1482 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa, 1483 const re_node_set *nodes) 1484 { 1485 re_hashval_t hash; 1486 re_dfastate_t *new_state; 1487 struct re_state_table_entry *spot; 1488 Idx i; 1489 #ifdef lint 1490 /* Suppress bogus uninitialized-variable warnings. */ 1491 *err = REG_NOERROR; 1492 #endif 1493 if (BE (nodes->nelem == 0, 0)) 1494 { 1495 *err = REG_NOERROR; 1496 return NULL; 1497 } 1498 hash = calc_state_hash (nodes, 0); 1499 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1500 1501 for (i = 0 ; i < spot->num ; i++) 1502 { 1503 re_dfastate_t *state = spot->array[i]; 1504 if (hash != state->hash) 1505 continue; 1506 if (re_node_set_compare (&state->nodes, nodes)) 1507 return state; 1508 } 1509 1510 /* There are no appropriate state in the dfa, create the new one. */ 1511 new_state = create_ci_newstate (dfa, nodes, hash); 1512 if (BE (new_state == NULL, 0)) 1513 *err = REG_ESPACE; 1514 1515 return new_state; 1516 } 1517 1518 /* Search for the state whose node_set is equivalent to NODES and 1519 whose context is equivalent to CONTEXT. 1520 Return the pointer to the state, if we found it in the DFA. 1521 Otherwise create the new one and return it. In case of an error 1522 return NULL and set the error code in ERR. 1523 Note: - We assume NULL as the invalid state, then it is possible that 1524 return value is NULL and ERR is REG_NOERROR. 1525 - We never return non-NULL value in case of any errors, it is for 1526 optimization. */ 1527 1528 static re_dfastate_t * 1529 internal_function 1530 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa, 1531 const re_node_set *nodes, unsigned int context) 1532 { 1533 re_hashval_t hash; 1534 re_dfastate_t *new_state; 1535 struct re_state_table_entry *spot; 1536 Idx i; 1537 #ifdef lint 1538 /* Suppress bogus uninitialized-variable warnings. */ 1539 *err = REG_NOERROR; 1540 #endif 1541 if (nodes->nelem == 0) 1542 { 1543 *err = REG_NOERROR; 1544 return NULL; 1545 } 1546 hash = calc_state_hash (nodes, context); 1547 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1548 1549 for (i = 0 ; i < spot->num ; i++) 1550 { 1551 re_dfastate_t *state = spot->array[i]; 1552 if (state->hash == hash 1553 && state->context == context 1554 && re_node_set_compare (state->entrance_nodes, nodes)) 1555 return state; 1556 } 1557 /* There are no appropriate state in `dfa', create the new one. */ 1558 new_state = create_cd_newstate (dfa, nodes, context, hash); 1559 if (BE (new_state == NULL, 0)) 1560 *err = REG_ESPACE; 1561 1562 return new_state; 1563 } 1564 1565 /* Finish initialization of the new state NEWSTATE, and using its hash value 1566 HASH put in the appropriate bucket of DFA's state table. Return value 1567 indicates the error code if failed. */ 1568 1569 static reg_errcode_t 1570 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate, 1571 re_hashval_t hash) 1572 { 1573 struct re_state_table_entry *spot; 1574 reg_errcode_t err; 1575 Idx i; 1576 1577 newstate->hash = hash; 1578 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem); 1579 if (BE (err != REG_NOERROR, 0)) 1580 return REG_ESPACE; 1581 for (i = 0; i < newstate->nodes.nelem; i++) 1582 { 1583 Idx elem = newstate->nodes.elems[i]; 1584 if (!IS_EPSILON_NODE (dfa->nodes[elem].type)) 1585 if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0)) 1586 return REG_ESPACE; 1587 } 1588 1589 spot = dfa->state_table + (hash & dfa->state_hash_mask); 1590 if (BE (spot->alloc <= spot->num, 0)) 1591 { 1592 Idx new_alloc = 2 * spot->num + 2; 1593 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *, 1594 new_alloc); 1595 if (BE (new_array == NULL, 0)) 1596 return REG_ESPACE; 1597 spot->array = new_array; 1598 spot->alloc = new_alloc; 1599 } 1600 spot->array[spot->num++] = newstate; 1601 return REG_NOERROR; 1602 } 1603 1604 static void 1605 free_state (re_dfastate_t *state) 1606 { 1607 re_node_set_free (&state->non_eps_nodes); 1608 re_node_set_free (&state->inveclosure); 1609 if (state->entrance_nodes != &state->nodes) 1610 { 1611 re_node_set_free (state->entrance_nodes); 1612 re_free (state->entrance_nodes); 1613 } 1614 re_node_set_free (&state->nodes); 1615 re_free (state->word_trtable); 1616 re_free (state->trtable); 1617 re_free (state); 1618 } 1619 1620 /* Create the new state which is independ of contexts. 1621 Return the new state if succeeded, otherwise return NULL. */ 1622 1623 static re_dfastate_t * 1624 internal_function 1625 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1626 re_hashval_t hash) 1627 { 1628 Idx i; 1629 reg_errcode_t err; 1630 re_dfastate_t *newstate; 1631 1632 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1633 if (BE (newstate == NULL, 0)) 1634 return NULL; 1635 err = re_node_set_init_copy (&newstate->nodes, nodes); 1636 if (BE (err != REG_NOERROR, 0)) 1637 { 1638 re_free (newstate); 1639 return NULL; 1640 } 1641 1642 newstate->entrance_nodes = &newstate->nodes; 1643 for (i = 0 ; i < nodes->nelem ; i++) 1644 { 1645 re_token_t *node = dfa->nodes + nodes->elems[i]; 1646 re_token_type_t type = node->type; 1647 if (type == CHARACTER && !node->constraint) 1648 continue; 1649 #ifdef RE_ENABLE_I18N 1650 newstate->accept_mb |= node->accept_mb; 1651 #endif /* RE_ENABLE_I18N */ 1652 1653 /* If the state has the halt node, the state is a halt state. */ 1654 if (type == END_OF_RE) 1655 newstate->halt = 1; 1656 else if (type == OP_BACK_REF) 1657 newstate->has_backref = 1; 1658 else if (type == ANCHOR || node->constraint) 1659 newstate->has_constraint = 1; 1660 } 1661 err = register_state (dfa, newstate, hash); 1662 if (BE (err != REG_NOERROR, 0)) 1663 { 1664 free_state (newstate); 1665 newstate = NULL; 1666 } 1667 return newstate; 1668 } 1669 1670 /* Create the new state which is depend on the context CONTEXT. 1671 Return the new state if succeeded, otherwise return NULL. */ 1672 1673 static re_dfastate_t * 1674 internal_function 1675 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes, 1676 unsigned int context, re_hashval_t hash) 1677 { 1678 Idx i, nctx_nodes = 0; 1679 reg_errcode_t err; 1680 re_dfastate_t *newstate; 1681 1682 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1); 1683 if (BE (newstate == NULL, 0)) 1684 return NULL; 1685 err = re_node_set_init_copy (&newstate->nodes, nodes); 1686 if (BE (err != REG_NOERROR, 0)) 1687 { 1688 re_free (newstate); 1689 return NULL; 1690 } 1691 1692 newstate->context = context; 1693 newstate->entrance_nodes = &newstate->nodes; 1694 1695 for (i = 0 ; i < nodes->nelem ; i++) 1696 { 1697 re_token_t *node = dfa->nodes + nodes->elems[i]; 1698 re_token_type_t type = node->type; 1699 unsigned int constraint = node->constraint; 1700 1701 if (type == CHARACTER && !constraint) 1702 continue; 1703 #ifdef RE_ENABLE_I18N 1704 newstate->accept_mb |= node->accept_mb; 1705 #endif /* RE_ENABLE_I18N */ 1706 1707 /* If the state has the halt node, the state is a halt state. */ 1708 if (type == END_OF_RE) 1709 newstate->halt = 1; 1710 else if (type == OP_BACK_REF) 1711 newstate->has_backref = 1; 1712 1713 if (constraint) 1714 { 1715 if (newstate->entrance_nodes == &newstate->nodes) 1716 { 1717 newstate->entrance_nodes = re_malloc (re_node_set, 1); 1718 if (BE (newstate->entrance_nodes == NULL, 0)) 1719 { 1720 free_state (newstate); 1721 return NULL; 1722 } 1723 re_node_set_init_copy (newstate->entrance_nodes, nodes); 1724 nctx_nodes = 0; 1725 newstate->has_constraint = 1; 1726 } 1727 1728 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context)) 1729 { 1730 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes); 1731 ++nctx_nodes; 1732 } 1733 } 1734 } 1735 err = register_state (dfa, newstate, hash); 1736 if (BE (err != REG_NOERROR, 0)) 1737 { 1738 free_state (newstate); 1739 newstate = NULL; 1740 } 1741 return newstate; 1742 } 1743