1 /********************************************************************** 2 regparse.c - Oniguruma (regular expression library) 3 **********************************************************************/ 4 /*- 5 * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> 6 * All rights reserved. 7 * 8 * (C) Copyright 2015 Hewlett Packard Enterprise Development LP<BR> 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 #include "regparse.h" 33 #include "st.h" 34 35 #define WARN_BUFSIZE 256 36 37 #define CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS 38 39 40 OnigSyntaxType OnigSyntaxRuby = { 41 (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | 42 ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | 43 ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS | 44 ONIG_SYN_OP_ESC_C_CONTROL ) 45 & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) 46 , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT | 47 ONIG_SYN_OP2_OPTION_RUBY | 48 ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF | 49 ONIG_SYN_OP2_ESC_G_SUBEXP_CALL | 50 ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY | 51 ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT | 52 ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT | 53 ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL | 54 ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB | 55 ONIG_SYN_OP2_ESC_H_XDIGIT ) 56 , ( SYN_GNU_REGEX_BV | 57 ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | 58 ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND | 59 ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | 60 ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME | 61 ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | 62 ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | 63 ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) 64 , ONIG_OPTION_NONE 65 , 66 { 67 (OnigCodePoint )'\\' /* esc */ 68 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ 69 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ 70 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ 71 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ 72 , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ 73 } 74 }; 75 76 OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY; 77 78 extern void onig_null_warn(const char* s ARG_UNUSED) { } 79 80 #ifdef DEFAULT_WARN_FUNCTION 81 static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION; 82 #else 83 static OnigWarnFunc onig_warn = onig_null_warn; 84 #endif 85 86 #ifdef DEFAULT_VERB_WARN_FUNCTION 87 static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION; 88 #else 89 static OnigWarnFunc onig_verb_warn = onig_null_warn; 90 #endif 91 92 extern void onig_set_warn_func(OnigWarnFunc f) 93 { 94 onig_warn = f; 95 } 96 97 extern void onig_set_verb_warn_func(OnigWarnFunc f) 98 { 99 onig_verb_warn = f; 100 } 101 102 static void 103 bbuf_free(BBuf* bbuf) 104 { 105 if (IS_NOT_NULL(bbuf)) { 106 if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p); 107 xfree(bbuf); 108 } 109 } 110 111 static int 112 bbuf_clone(BBuf** rto, BBuf* from) 113 { 114 int r; 115 BBuf *to; 116 117 *rto = to = (BBuf* )xmalloc(sizeof(BBuf)); 118 CHECK_NULL_RETURN_MEMERR(to); 119 r = BBUF_INIT(to, from->alloc); 120 if (r != 0) return r; 121 to->used = from->used; 122 xmemcpy(to->p, from->p, from->used); 123 return 0; 124 } 125 126 #define BACKREF_REL_TO_ABS(rel_no, env) \ 127 ((env)->num_mem + 1 + (rel_no)) 128 129 #define ONOFF(v,f,negative) (negative) ? ((v) &= ~(f)) : ((v) |= (f)) 130 131 #define MBCODE_START_POS(enc) \ 132 (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80) 133 134 #define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \ 135 add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0)) 136 137 #define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\ 138 if (! ONIGENC_IS_SINGLEBYTE(enc)) {\ 139 r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\ 140 if (r) return r;\ 141 }\ 142 } while (0) 143 144 145 #define BITSET_IS_EMPTY(bs,empty) do {\ 146 int i;\ 147 empty = 1;\ 148 for (i = 0; i < (int )BITSET_SIZE; i++) {\ 149 if ((bs)[i] != 0) {\ 150 empty = 0; break;\ 151 }\ 152 }\ 153 } while (0) 154 155 static void 156 bitset_set_range(BitSetRef bs, int from, int to) 157 { 158 int i; 159 for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) { 160 BITSET_SET_BIT(bs, i); 161 } 162 } 163 164 #if 0 165 static void 166 bitset_set_all(BitSetRef bs) 167 { 168 int i; 169 for (i = 0; i < BITSET_SIZE; i++) { bs[i] = ~((Bits )0); } 170 } 171 #endif 172 173 static void 174 bitset_invert(BitSetRef bs) 175 { 176 int i; 177 for (i = 0; i < (int )BITSET_SIZE; i++) { bs[i] = ~(bs[i]); } 178 } 179 180 static void 181 bitset_invert_to(BitSetRef from, BitSetRef to) 182 { 183 int i; 184 for (i = 0; i < (int )BITSET_SIZE; i++) { to[i] = ~(from[i]); } 185 } 186 187 static void 188 bitset_and(BitSetRef dest, BitSetRef bs) 189 { 190 int i; 191 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] &= bs[i]; } 192 } 193 194 static void 195 bitset_or(BitSetRef dest, BitSetRef bs) 196 { 197 int i; 198 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] |= bs[i]; } 199 } 200 201 static void 202 bitset_copy(BitSetRef dest, BitSetRef bs) 203 { 204 int i; 205 for (i = 0; i < (int )BITSET_SIZE; i++) { dest[i] = bs[i]; } 206 } 207 208 extern int 209 onig_strncmp(const UChar* s1, const UChar* s2, int n) 210 { 211 int x; 212 213 while (n-- > 0) { 214 x = *s2++ - *s1++; 215 if (x) return x; 216 } 217 return 0; 218 } 219 220 extern void 221 onig_strcpy(UChar* dest, const UChar* src, const UChar* end) 222 { 223 int len = (int)(end - src); 224 if (len > 0) { 225 xmemcpy(dest, src, len); 226 dest[len] = (UChar )0; 227 } 228 } 229 230 #ifdef USE_NAMED_GROUP 231 static UChar* 232 strdup_with_null(OnigEncoding enc, UChar* s, UChar* end) 233 { 234 int slen, term_len, i; 235 UChar *r; 236 237 slen = (int)(end - s); 238 term_len = ONIGENC_MBC_MINLEN(enc); 239 240 r = (UChar* )xmalloc(slen + term_len); 241 CHECK_NULL_RETURN(r); 242 xmemcpy(r, s, slen); 243 244 for (i = 0; i < term_len; i++) 245 r[slen + i] = (UChar )0; 246 247 return r; 248 } 249 #endif 250 251 /* scan pattern methods */ 252 #define PEND_VALUE 0 253 254 #define PFETCH_READY UChar* pfetch_prev 255 #define PEND (p < end ? 0 : 1) 256 #define PUNFETCH p = pfetch_prev 257 #define PINC do { \ 258 pfetch_prev = p; \ 259 p += ONIGENC_MBC_ENC_LEN(enc, p); \ 260 } while (0) 261 #define PFETCH(c) do { \ 262 c = ONIGENC_MBC_TO_CODE(enc, p, end); \ 263 pfetch_prev = p; \ 264 p += ONIGENC_MBC_ENC_LEN(enc, p); \ 265 } while (0) 266 267 #define PINC_S do { \ 268 p += ONIGENC_MBC_ENC_LEN(enc, p); \ 269 } while (0) 270 #define PFETCH_S(c) do { \ 271 c = ONIGENC_MBC_TO_CODE(enc, p, end); \ 272 p += ONIGENC_MBC_ENC_LEN(enc, p); \ 273 } while (0) 274 275 #define PPEEK (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE) 276 #define PPEEK_IS(c) (PPEEK == (OnigCodePoint )c) 277 278 static UChar* 279 strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end, 280 int capa, int oldCapa) 281 { 282 UChar* r; 283 284 if (dest) 285 r = (UChar* )xrealloc(dest, capa + 1, oldCapa); 286 else 287 r = (UChar* )xmalloc(capa + 1); 288 289 CHECK_NULL_RETURN(r); 290 onig_strcpy(r + (dest_end - dest), src, src_end); 291 return r; 292 } 293 294 /* dest on static area */ 295 static UChar* 296 strcat_capa_from_static(UChar* dest, UChar* dest_end, 297 const UChar* src, const UChar* src_end, int capa) 298 { 299 UChar* r; 300 301 r = (UChar* )xmalloc(capa + 1); 302 CHECK_NULL_RETURN(r); 303 onig_strcpy(r, dest, dest_end); 304 onig_strcpy(r + (dest_end - dest), src, src_end); 305 return r; 306 } 307 308 309 #ifdef USE_ST_LIBRARY 310 311 typedef struct { 312 UChar* s; 313 UChar* end; 314 } st_str_end_key; 315 316 static int 317 str_end_cmp(st_str_end_key* x, st_str_end_key* y) 318 { 319 UChar *p, *q; 320 int c; 321 322 if ((x->end - x->s) != (y->end - y->s)) 323 return 1; 324 325 p = x->s; 326 q = y->s; 327 while (p < x->end) { 328 c = (int )*p - (int )*q; 329 if (c != 0) return c; 330 331 p++; q++; 332 } 333 334 return 0; 335 } 336 337 static int 338 str_end_hash(st_str_end_key* x) 339 { 340 UChar *p; 341 int val = 0; 342 343 p = x->s; 344 while (p < x->end) { 345 val = val * 997 + (int )*p++; 346 } 347 348 return val + (val >> 5); 349 } 350 351 extern hash_table_type* 352 onig_st_init_strend_table_with_size(int size) 353 { 354 static struct st_hash_type hashType = { 355 str_end_cmp, 356 str_end_hash, 357 }; 358 359 return (hash_table_type* ) 360 onig_st_init_table_with_size(&hashType, size); 361 } 362 363 extern int 364 onig_st_lookup_strend(hash_table_type* table, const UChar* str_key, 365 const UChar* end_key, hash_data_type *value) 366 { 367 st_str_end_key key; 368 369 key.s = (UChar* )str_key; 370 key.end = (UChar* )end_key; 371 372 return onig_st_lookup(table, (st_data_t )(UINTN)(&key), value); 373 } 374 375 extern int 376 onig_st_insert_strend(hash_table_type* table, const UChar* str_key, 377 const UChar* end_key, hash_data_type value) 378 { 379 st_str_end_key* key; 380 int result; 381 382 key = (st_str_end_key* )xmalloc(sizeof(st_str_end_key)); 383 CHECK_NULL_RETURN_MEMERR(key); 384 key->s = (UChar* )str_key; 385 key->end = (UChar* )end_key; 386 result = onig_st_insert(table, (st_data_t )(UINTN)key, value); 387 if (result) { 388 xfree(key); 389 } 390 return result; 391 } 392 393 #endif /* USE_ST_LIBRARY */ 394 395 396 #ifdef USE_NAMED_GROUP 397 398 #define INIT_NAME_BACKREFS_ALLOC_NUM 8 399 400 typedef struct { 401 UChar* name; 402 int name_len; /* byte length */ 403 int back_num; /* number of backrefs */ 404 int back_alloc; 405 int back_ref1; 406 int* back_refs; 407 } NameEntry; 408 409 #ifdef USE_ST_LIBRARY 410 411 typedef st_table NameTable; 412 typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */ 413 414 #define NAMEBUF_SIZE 24 415 #define NAMEBUF_SIZE_1 25 416 417 #ifdef ONIG_DEBUG 418 static int 419 i_print_name_entry(UChar* key, NameEntry* e, void* arg) 420 { 421 int i; 422 FILE* fp = (FILE* )arg; 423 424 fprintf(fp, "%s: ", e->name); 425 if (e->back_num == 0) 426 fputs("-", fp); 427 else if (e->back_num == 1) 428 fprintf(fp, "%d", e->back_ref1); 429 else { 430 for (i = 0; i < e->back_num; i++) { 431 if (i > 0) fprintf(fp, ", "); 432 fprintf(fp, "%d", e->back_refs[i]); 433 } 434 } 435 fputs("\n", fp); 436 return ST_CONTINUE; 437 } 438 439 extern int 440 onig_print_names(FILE* fp, regex_t* reg) 441 { 442 NameTable* t = (NameTable* )reg->name_table; 443 444 if (IS_NOT_NULL(t)) { 445 fprintf(fp, "name table\n"); 446 onig_st_foreach(t, i_print_name_entry, (HashDataType )fp); 447 fputs("\n", fp); 448 } 449 return 0; 450 } 451 #endif /* ONIG_DEBUG */ 452 453 static int 454 i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED) 455 { 456 xfree(e->name); 457 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs); 458 xfree(key); 459 xfree(e); 460 return ST_DELETE; 461 } 462 463 static int 464 names_clear(regex_t* reg) 465 { 466 NameTable* t = (NameTable* )reg->name_table; 467 468 if (IS_NOT_NULL(t)) { 469 onig_st_foreach(t, i_free_name_entry, 0); 470 } 471 return 0; 472 } 473 474 extern int 475 onig_names_free(regex_t* reg) 476 { 477 int r; 478 NameTable* t; 479 480 r = names_clear(reg); 481 if (r) return r; 482 483 t = (NameTable* )reg->name_table; 484 if (IS_NOT_NULL(t)) onig_st_free_table(t); 485 reg->name_table = (void* )NULL; 486 return 0; 487 } 488 489 static NameEntry* 490 name_find(regex_t* reg, const UChar* name, const UChar* name_end) 491 { 492 NameEntry* e; 493 NameTable* t = (NameTable* )reg->name_table; 494 495 e = (NameEntry* )NULL; 496 if (IS_NOT_NULL(t)) { 497 onig_st_lookup_strend(t, name, name_end, (HashDataType* )((void* )(&e))); 498 } 499 return e; 500 } 501 502 typedef struct { 503 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*); 504 regex_t* reg; 505 void* arg; 506 int ret; 507 OnigEncoding enc; 508 } INamesArg; 509 510 static int 511 i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg) 512 { 513 int r = (*(arg->func))(e->name, 514 e->name + e->name_len, 515 e->back_num, 516 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)), 517 arg->reg, arg->arg); 518 if (r != 0) { 519 arg->ret = r; 520 return ST_STOP; 521 } 522 return ST_CONTINUE; 523 } 524 525 extern int 526 onig_foreach_name(regex_t* reg, 527 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg) 528 { 529 INamesArg narg; 530 NameTable* t = (NameTable* )reg->name_table; 531 532 narg.ret = 0; 533 if (IS_NOT_NULL(t)) { 534 narg.func = func; 535 narg.reg = reg; 536 narg.arg = arg; 537 narg.enc = reg->enc; /* should be pattern encoding. */ 538 onig_st_foreach(t, i_names, (HashDataType )(UINTN)&narg); 539 } 540 return narg.ret; 541 } 542 543 static int 544 i_renumber_name(UChar* key ARG_UNUSED, NameEntry* e, GroupNumRemap* map) 545 { 546 int i; 547 548 if (e->back_num > 1) { 549 for (i = 0; i < e->back_num; i++) { 550 e->back_refs[i] = map[e->back_refs[i]].new_val; 551 } 552 } 553 else if (e->back_num == 1) { 554 e->back_ref1 = map[e->back_ref1].new_val; 555 } 556 557 return ST_CONTINUE; 558 } 559 560 extern int 561 onig_renumber_name_table(regex_t* reg, GroupNumRemap* map) 562 { 563 NameTable* t = (NameTable* )reg->name_table; 564 565 if (IS_NOT_NULL(t)) { 566 onig_st_foreach(t, i_renumber_name, (HashDataType )(UINTN)map); 567 } 568 return 0; 569 } 570 571 572 extern int 573 onig_number_of_names(regex_t* reg) 574 { 575 NameTable* t = (NameTable* )reg->name_table; 576 577 if (IS_NOT_NULL(t)) 578 return t->num_entries; 579 else 580 return 0; 581 } 582 583 #else /* USE_ST_LIBRARY */ 584 585 #define INIT_NAMES_ALLOC_NUM 8 586 587 typedef struct { 588 NameEntry* e; 589 int num; 590 int alloc; 591 } NameTable; 592 593 #ifdef ONIG_DEBUG 594 extern int 595 onig_print_names(FILE* fp, regex_t* reg) 596 { 597 int i, j; 598 NameEntry* e; 599 NameTable* t = (NameTable* )reg->name_table; 600 601 if (IS_NOT_NULL(t) && t->num > 0) { 602 fprintf(fp, "name table\n"); 603 for (i = 0; i < t->num; i++) { 604 e = &(t->e[i]); 605 fprintf(fp, "%s: ", e->name); 606 if (e->back_num == 0) { 607 fputs("-", fp); 608 } 609 else if (e->back_num == 1) { 610 fprintf(fp, "%d", e->back_ref1); 611 } 612 else { 613 for (j = 0; j < e->back_num; j++) { 614 if (j > 0) fprintf(fp, ", "); 615 fprintf(fp, "%d", e->back_refs[j]); 616 } 617 } 618 fputs("\n", fp); 619 } 620 fputs("\n", fp); 621 } 622 return 0; 623 } 624 #endif 625 626 static int 627 names_clear(regex_t* reg) 628 { 629 int i; 630 NameEntry* e; 631 NameTable* t = (NameTable* )reg->name_table; 632 633 if (IS_NOT_NULL(t)) { 634 for (i = 0; i < t->num; i++) { 635 e = &(t->e[i]); 636 if (IS_NOT_NULL(e->name)) { 637 xfree(e->name); 638 e->name = NULL; 639 e->name_len = 0; 640 e->back_num = 0; 641 e->back_alloc = 0; 642 if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs); 643 e->back_refs = (int* )NULL; 644 } 645 } 646 if (IS_NOT_NULL(t->e)) { 647 xfree(t->e); 648 t->e = NULL; 649 } 650 t->num = 0; 651 } 652 return 0; 653 } 654 655 extern int 656 onig_names_free(regex_t* reg) 657 { 658 int r; 659 NameTable* t; 660 661 r = names_clear(reg); 662 if (r) return r; 663 664 t = (NameTable* )reg->name_table; 665 if (IS_NOT_NULL(t)) xfree(t); 666 reg->name_table = NULL; 667 return 0; 668 } 669 670 static NameEntry* 671 name_find(regex_t* reg, UChar* name, UChar* name_end) 672 { 673 int i, len; 674 NameEntry* e; 675 NameTable* t = (NameTable* )reg->name_table; 676 677 if (IS_NOT_NULL(t)) { 678 len = name_end - name; 679 for (i = 0; i < t->num; i++) { 680 e = &(t->e[i]); 681 if (len == e->name_len && onig_strncmp(name, e->name, len) == 0) 682 return e; 683 } 684 } 685 return (NameEntry* )NULL; 686 } 687 688 extern int 689 onig_foreach_name(regex_t* reg, 690 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg) 691 { 692 int i, r; 693 NameEntry* e; 694 NameTable* t = (NameTable* )reg->name_table; 695 696 if (IS_NOT_NULL(t)) { 697 for (i = 0; i < t->num; i++) { 698 e = &(t->e[i]); 699 r = (*func)(e->name, e->name + e->name_len, e->back_num, 700 (e->back_num > 1 ? e->back_refs : &(e->back_ref1)), 701 reg, arg); 702 if (r != 0) return r; 703 } 704 } 705 return 0; 706 } 707 708 extern int 709 onig_number_of_names(regex_t* reg) 710 { 711 NameTable* t = (NameTable* )reg->name_table; 712 713 if (IS_NOT_NULL(t)) 714 return t->num; 715 else 716 return 0; 717 } 718 719 #endif /* else USE_ST_LIBRARY */ 720 721 static int 722 name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) 723 { 724 int alloc; 725 NameEntry* e; 726 NameTable* t = (NameTable* )reg->name_table; 727 728 if (name_end - name <= 0) 729 return ONIGERR_EMPTY_GROUP_NAME; 730 731 e = name_find(reg, name, name_end); 732 if (IS_NULL(e)) { 733 #ifdef USE_ST_LIBRARY 734 if (IS_NULL(t)) { 735 t = onig_st_init_strend_table_with_size(5); 736 CHECK_NULL_RETURN_MEMERR(t); 737 reg->name_table = (void* )t; 738 } 739 e = (NameEntry* )xmalloc(sizeof(NameEntry)); 740 CHECK_NULL_RETURN_MEMERR(e); 741 742 e->name = strdup_with_null(reg->enc, name, name_end); 743 if (IS_NULL(e->name)) { 744 xfree(e); return ONIGERR_MEMORY; 745 } 746 onig_st_insert_strend(t, e->name, (e->name + (name_end - name)), 747 (HashDataType )(UINTN)e); 748 749 e->name_len = (int)(name_end - name); 750 e->back_num = 0; 751 e->back_alloc = 0; 752 e->back_refs = (int* )NULL; 753 754 #else 755 756 if (IS_NULL(t)) { 757 alloc = INIT_NAMES_ALLOC_NUM; 758 t = (NameTable* )xmalloc(sizeof(NameTable)); 759 CHECK_NULL_RETURN_MEMERR(t); 760 t->e = NULL; 761 t->alloc = 0; 762 t->num = 0; 763 764 t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc); 765 if (IS_NULL(t->e)) { 766 xfree(t); 767 return ONIGERR_MEMORY; 768 } 769 t->alloc = alloc; 770 reg->name_table = t; 771 goto clear; 772 } 773 else if (t->num == t->alloc) { 774 int i; 775 776 alloc = t->alloc * 2; 777 t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc); 778 CHECK_NULL_RETURN_MEMERR(t->e); 779 t->alloc = alloc; 780 781 clear: 782 for (i = t->num; i < t->alloc; i++) { 783 t->e[i].name = NULL; 784 t->e[i].name_len = 0; 785 t->e[i].back_num = 0; 786 t->e[i].back_alloc = 0; 787 t->e[i].back_refs = (int* )NULL; 788 } 789 } 790 e = &(t->e[t->num]); 791 t->num++; 792 e->name = strdup_with_null(reg->enc, name, name_end); 793 if (IS_NULL(e->name)) return ONIGERR_MEMORY; 794 e->name_len = name_end - name; 795 #endif 796 } 797 798 if (e->back_num >= 1 && 799 ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) { 800 onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME, 801 name, name_end); 802 return ONIGERR_MULTIPLEX_DEFINED_NAME; 803 } 804 805 e->back_num++; 806 if (e->back_num == 1) { 807 e->back_ref1 = backref; 808 } 809 else { 810 if (e->back_num == 2) { 811 alloc = INIT_NAME_BACKREFS_ALLOC_NUM; 812 e->back_refs = (int* )xmalloc(sizeof(int) * alloc); 813 CHECK_NULL_RETURN_MEMERR(e->back_refs); 814 e->back_alloc = alloc; 815 e->back_refs[0] = e->back_ref1; 816 e->back_refs[1] = backref; 817 } 818 else { 819 if (e->back_num > e->back_alloc) { 820 alloc = e->back_alloc * 2; 821 e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc, sizeof(int) * e->back_alloc); 822 CHECK_NULL_RETURN_MEMERR(e->back_refs); 823 e->back_alloc = alloc; 824 } 825 e->back_refs[e->back_num - 1] = backref; 826 } 827 } 828 829 return 0; 830 } 831 832 extern int 833 onig_name_to_group_numbers(regex_t* reg, const UChar* name, 834 const UChar* name_end, int** nums) 835 { 836 NameEntry* e = name_find(reg, name, name_end); 837 838 if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE; 839 840 switch (e->back_num) { 841 case 0: 842 break; 843 case 1: 844 *nums = &(e->back_ref1); 845 break; 846 default: 847 *nums = e->back_refs; 848 break; 849 } 850 return e->back_num; 851 } 852 853 extern int 854 onig_name_to_backref_number(regex_t* reg, const UChar* name, 855 const UChar* name_end, OnigRegion *region) 856 { 857 int i, n, *nums; 858 859 n = onig_name_to_group_numbers(reg, name, name_end, &nums); 860 if (n < 0) 861 return n; 862 else if (n == 0) 863 return ONIGERR_PARSER_BUG; 864 else if (n == 1) 865 return nums[0]; 866 else { 867 if (IS_NOT_NULL(region)) { 868 for (i = n - 1; i >= 0; i--) { 869 if (region->beg[nums[i]] != ONIG_REGION_NOTPOS) 870 return nums[i]; 871 } 872 } 873 return nums[n - 1]; 874 } 875 } 876 877 #else /* USE_NAMED_GROUP */ 878 879 extern int 880 onig_name_to_group_numbers(regex_t* reg, const UChar* name, 881 const UChar* name_end, int** nums) 882 { 883 return ONIG_NO_SUPPORT_CONFIG; 884 } 885 886 extern int 887 onig_name_to_backref_number(regex_t* reg, const UChar* name, 888 const UChar* name_end, OnigRegion* region) 889 { 890 return ONIG_NO_SUPPORT_CONFIG; 891 } 892 893 extern int 894 onig_foreach_name(regex_t* reg, 895 int (*func)(const UChar*, const UChar*,int,int*,regex_t*,void*), void* arg) 896 { 897 return ONIG_NO_SUPPORT_CONFIG; 898 } 899 900 extern int 901 onig_number_of_names(regex_t* reg) 902 { 903 return 0; 904 } 905 #endif /* else USE_NAMED_GROUP */ 906 907 extern int 908 onig_noname_group_capture_is_active(regex_t* reg) 909 { 910 if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP)) 911 return 0; 912 913 #ifdef USE_NAMED_GROUP 914 if (onig_number_of_names(reg) > 0 && 915 IS_SYNTAX_BV(reg->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && 916 !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) { 917 return 0; 918 } 919 #endif 920 921 return 1; 922 } 923 924 925 #define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16 926 927 static void 928 scan_env_clear(ScanEnv* env) 929 { 930 int i; 931 932 BIT_STATUS_CLEAR(env->capture_history); 933 BIT_STATUS_CLEAR(env->bt_mem_start); 934 BIT_STATUS_CLEAR(env->bt_mem_end); 935 BIT_STATUS_CLEAR(env->backrefed_mem); 936 env->error = (UChar* )NULL; 937 env->error_end = (UChar* )NULL; 938 env->num_call = 0; 939 env->num_mem = 0; 940 #ifdef USE_NAMED_GROUP 941 env->num_named = 0; 942 #endif 943 env->mem_alloc = 0; 944 env->mem_nodes_dynamic = (Node** )NULL; 945 946 for (i = 0; i < SCANENV_MEMNODES_SIZE; i++) 947 env->mem_nodes_static[i] = NULL_NODE; 948 949 #ifdef USE_COMBINATION_EXPLOSION_CHECK 950 env->num_comb_exp_check = 0; 951 env->comb_exp_max_regnum = 0; 952 env->curr_max_regnum = 0; 953 env->has_recursion = 0; 954 #endif 955 } 956 957 static int 958 scan_env_add_mem_entry(ScanEnv* env) 959 { 960 int i, need, alloc; 961 Node** p; 962 963 need = env->num_mem + 1; 964 if (need >= SCANENV_MEMNODES_SIZE) { 965 if (env->mem_alloc <= need) { 966 if (IS_NULL(env->mem_nodes_dynamic)) { 967 alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE; 968 p = (Node** )xmalloc(sizeof(Node*) * alloc); 969 CHECK_NULL_RETURN_MEMERR(p); 970 971 xmemcpy(p, env->mem_nodes_static, 972 sizeof(Node*) * SCANENV_MEMNODES_SIZE); 973 } 974 else { 975 alloc = env->mem_alloc * 2; 976 p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc, sizeof(Node*) * env->mem_alloc); 977 } 978 CHECK_NULL_RETURN_MEMERR(p); 979 980 for (i = env->num_mem + 1; i < alloc; i++) 981 p[i] = NULL_NODE; 982 983 env->mem_nodes_dynamic = p; 984 env->mem_alloc = alloc; 985 } 986 } 987 988 env->num_mem++; 989 return env->num_mem; 990 } 991 992 static int 993 scan_env_set_mem_node(ScanEnv* env, int num, Node* node) 994 { 995 if (env->num_mem >= num) 996 SCANENV_MEM_NODES(env)[num] = node; 997 else 998 return ONIGERR_PARSER_BUG; 999 return 0; 1000 } 1001 1002 1003 #ifdef USE_PARSE_TREE_NODE_RECYCLE 1004 typedef struct _FreeNode { 1005 struct _FreeNode* next; 1006 } FreeNode; 1007 1008 static FreeNode* FreeNodeList = (FreeNode* )NULL; 1009 #endif 1010 1011 extern void 1012 onig_node_free(Node* node) 1013 { 1014 start: 1015 if (IS_NULL(node)) return ; 1016 1017 switch (NTYPE(node)) { 1018 case NT_STR: 1019 if (NSTR(node)->capa != 0 && 1020 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) { 1021 xfree(NSTR(node)->s); 1022 } 1023 break; 1024 1025 case NT_LIST: 1026 case NT_ALT: 1027 onig_node_free(NCAR(node)); 1028 { 1029 Node* next_node = NCDR(node); 1030 1031 #ifdef USE_PARSE_TREE_NODE_RECYCLE 1032 { 1033 FreeNode* n = (FreeNode* )node; 1034 1035 THREAD_ATOMIC_START; 1036 n->next = FreeNodeList; 1037 FreeNodeList = n; 1038 THREAD_ATOMIC_END; 1039 } 1040 #else 1041 xfree(node); 1042 #endif 1043 node = next_node; 1044 goto start; 1045 } 1046 break; 1047 1048 case NT_CCLASS: 1049 { 1050 CClassNode* cc = NCCLASS(node); 1051 1052 if (IS_NCCLASS_SHARE(cc)) return ; 1053 if (cc->mbuf) 1054 bbuf_free(cc->mbuf); 1055 } 1056 break; 1057 1058 case NT_QTFR: 1059 if (NQTFR(node)->target) 1060 onig_node_free(NQTFR(node)->target); 1061 break; 1062 1063 case NT_ENCLOSE: 1064 if (NENCLOSE(node)->target) 1065 onig_node_free(NENCLOSE(node)->target); 1066 break; 1067 1068 case NT_BREF: 1069 if (IS_NOT_NULL(NBREF(node)->back_dynamic)) 1070 xfree(NBREF(node)->back_dynamic); 1071 break; 1072 1073 case NT_ANCHOR: 1074 if (NANCHOR(node)->target) 1075 onig_node_free(NANCHOR(node)->target); 1076 break; 1077 } 1078 1079 #ifdef USE_PARSE_TREE_NODE_RECYCLE 1080 { 1081 FreeNode* n = (FreeNode* )node; 1082 1083 THREAD_ATOMIC_START; 1084 n->next = FreeNodeList; 1085 FreeNodeList = n; 1086 THREAD_ATOMIC_END; 1087 } 1088 #else 1089 xfree(node); 1090 #endif 1091 } 1092 1093 #ifdef USE_PARSE_TREE_NODE_RECYCLE 1094 extern int 1095 onig_free_node_list(void) 1096 { 1097 FreeNode* n; 1098 1099 /* THREAD_ATOMIC_START; */ 1100 while (IS_NOT_NULL(FreeNodeList)) { 1101 n = FreeNodeList; 1102 FreeNodeList = FreeNodeList->next; 1103 xfree(n); 1104 } 1105 /* THREAD_ATOMIC_END; */ 1106 return 0; 1107 } 1108 #endif 1109 1110 static Node* 1111 node_new(void) 1112 { 1113 Node* node; 1114 1115 #ifdef USE_PARSE_TREE_NODE_RECYCLE 1116 THREAD_ATOMIC_START; 1117 if (IS_NOT_NULL(FreeNodeList)) { 1118 node = (Node* )FreeNodeList; 1119 FreeNodeList = FreeNodeList->next; 1120 THREAD_ATOMIC_END; 1121 return node; 1122 } 1123 THREAD_ATOMIC_END; 1124 #endif 1125 1126 node = (Node* )xmalloc(sizeof(Node)); 1127 /* xmemset(node, 0, sizeof(Node)); */ 1128 return node; 1129 } 1130 1131 1132 static void 1133 initialize_cclass(CClassNode* cc) 1134 { 1135 BITSET_CLEAR(cc->bs); 1136 /* cc->base.flags = 0; */ 1137 cc->flags = 0; 1138 cc->mbuf = NULL; 1139 } 1140 1141 static Node* 1142 node_new_cclass(void) 1143 { 1144 Node* node = node_new(); 1145 CHECK_NULL_RETURN(node); 1146 1147 SET_NTYPE(node, NT_CCLASS); 1148 initialize_cclass(NCCLASS(node)); 1149 return node; 1150 } 1151 1152 static Node* 1153 node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out, 1154 const OnigCodePoint ranges[]) 1155 { 1156 int n, i; 1157 CClassNode* cc; 1158 OnigCodePoint j; 1159 1160 Node* node = node_new_cclass(); 1161 CHECK_NULL_RETURN(node); 1162 1163 cc = NCCLASS(node); 1164 if (not != 0) NCCLASS_SET_NOT(cc); 1165 1166 BITSET_CLEAR(cc->bs); 1167 if (sb_out > 0 && IS_NOT_NULL(ranges)) { 1168 n = ONIGENC_CODE_RANGE_NUM(ranges); 1169 for (i = 0; i < n; i++) { 1170 for (j = ONIGENC_CODE_RANGE_FROM(ranges, i); 1171 j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) { 1172 if (j >= sb_out) goto sb_end; 1173 1174 BITSET_SET_BIT(cc->bs, j); 1175 } 1176 } 1177 } 1178 1179 sb_end: 1180 if (IS_NULL(ranges)) { 1181 is_null: 1182 cc->mbuf = NULL; 1183 } 1184 else { 1185 BBuf* bbuf; 1186 1187 n = ONIGENC_CODE_RANGE_NUM(ranges); 1188 if (n == 0) goto is_null; 1189 1190 bbuf = (BBuf* )xmalloc(sizeof(BBuf)); 1191 CHECK_NULL_RETURN(bbuf); 1192 bbuf->alloc = n + 1; 1193 bbuf->used = n + 1; 1194 bbuf->p = (UChar* )((void* )ranges); 1195 1196 cc->mbuf = bbuf; 1197 } 1198 1199 return node; 1200 } 1201 1202 static Node* 1203 node_new_ctype(int type, int not) 1204 { 1205 Node* node = node_new(); 1206 CHECK_NULL_RETURN(node); 1207 1208 SET_NTYPE(node, NT_CTYPE); 1209 NCTYPE(node)->ctype = type; 1210 NCTYPE(node)->not = not; 1211 return node; 1212 } 1213 1214 static Node* 1215 node_new_anychar(void) 1216 { 1217 Node* node = node_new(); 1218 CHECK_NULL_RETURN(node); 1219 1220 SET_NTYPE(node, NT_CANY); 1221 return node; 1222 } 1223 1224 static Node* 1225 node_new_list(Node* left, Node* right) 1226 { 1227 Node* node = node_new(); 1228 CHECK_NULL_RETURN(node); 1229 1230 SET_NTYPE(node, NT_LIST); 1231 NCAR(node) = left; 1232 NCDR(node) = right; 1233 return node; 1234 } 1235 1236 extern Node* 1237 onig_node_new_list(Node* left, Node* right) 1238 { 1239 return node_new_list(left, right); 1240 } 1241 1242 extern Node* 1243 onig_node_list_add(Node* list, Node* x) 1244 { 1245 Node *n; 1246 1247 n = onig_node_new_list(x, NULL); 1248 if (IS_NULL(n)) return NULL_NODE; 1249 1250 if (IS_NOT_NULL(list)) { 1251 while (IS_NOT_NULL(NCDR(list))) 1252 list = NCDR(list); 1253 1254 NCDR(list) = n; 1255 } 1256 1257 return n; 1258 } 1259 1260 extern Node* 1261 onig_node_new_alt(Node* left, Node* right) 1262 { 1263 Node* node = node_new(); 1264 CHECK_NULL_RETURN(node); 1265 1266 SET_NTYPE(node, NT_ALT); 1267 NCAR(node) = left; 1268 NCDR(node) = right; 1269 return node; 1270 } 1271 1272 extern Node* 1273 onig_node_new_anchor(int type) 1274 { 1275 Node* node = node_new(); 1276 CHECK_NULL_RETURN(node); 1277 1278 SET_NTYPE(node, NT_ANCHOR); 1279 NANCHOR(node)->type = type; 1280 NANCHOR(node)->target = NULL; 1281 NANCHOR(node)->char_len = -1; 1282 return node; 1283 } 1284 1285 static Node* 1286 node_new_backref(int back_num, int* backrefs, int by_name, 1287 #ifdef USE_BACKREF_WITH_LEVEL 1288 int exist_level, int nest_level, 1289 #endif 1290 ScanEnv* env) 1291 { 1292 int i; 1293 Node* node = node_new(); 1294 1295 CHECK_NULL_RETURN(node); 1296 1297 SET_NTYPE(node, NT_BREF); 1298 NBREF(node)->state = 0; 1299 NBREF(node)->back_num = back_num; 1300 NBREF(node)->back_dynamic = (int* )NULL; 1301 if (by_name != 0) 1302 NBREF(node)->state |= NST_NAME_REF; 1303 1304 #ifdef USE_BACKREF_WITH_LEVEL 1305 if (exist_level != 0) { 1306 NBREF(node)->state |= NST_NEST_LEVEL; 1307 NBREF(node)->nest_level = nest_level; 1308 } 1309 #endif 1310 1311 for (i = 0; i < back_num; i++) { 1312 if (backrefs[i] <= env->num_mem && 1313 IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) { 1314 NBREF(node)->state |= NST_RECURSION; /* /...(\1).../ */ 1315 break; 1316 } 1317 } 1318 1319 if (back_num <= NODE_BACKREFS_SIZE) { 1320 for (i = 0; i < back_num; i++) 1321 NBREF(node)->back_static[i] = backrefs[i]; 1322 } 1323 else { 1324 int* p = (int* )xmalloc(sizeof(int) * back_num); 1325 if (IS_NULL(p)) { 1326 onig_node_free(node); 1327 return NULL; 1328 } 1329 NBREF(node)->back_dynamic = p; 1330 for (i = 0; i < back_num; i++) 1331 p[i] = backrefs[i]; 1332 } 1333 return node; 1334 } 1335 1336 #ifdef USE_SUBEXP_CALL 1337 static Node* 1338 node_new_call(UChar* name, UChar* name_end, int gnum) 1339 { 1340 Node* node = node_new(); 1341 CHECK_NULL_RETURN(node); 1342 1343 SET_NTYPE(node, NT_CALL); 1344 NCALL(node)->state = 0; 1345 NCALL(node)->target = NULL_NODE; 1346 NCALL(node)->name = name; 1347 NCALL(node)->name_end = name_end; 1348 NCALL(node)->group_num = gnum; /* call by number if gnum != 0 */ 1349 return node; 1350 } 1351 #endif 1352 1353 static Node* 1354 node_new_quantifier(int lower, int upper, int by_number) 1355 { 1356 Node* node = node_new(); 1357 CHECK_NULL_RETURN(node); 1358 1359 SET_NTYPE(node, NT_QTFR); 1360 NQTFR(node)->state = 0; 1361 NQTFR(node)->target = NULL; 1362 NQTFR(node)->lower = lower; 1363 NQTFR(node)->upper = upper; 1364 NQTFR(node)->greedy = 1; 1365 NQTFR(node)->target_empty_info = NQ_TARGET_ISNOT_EMPTY; 1366 NQTFR(node)->head_exact = NULL_NODE; 1367 NQTFR(node)->next_head_exact = NULL_NODE; 1368 NQTFR(node)->is_refered = 0; 1369 if (by_number != 0) 1370 NQTFR(node)->state |= NST_BY_NUMBER; 1371 1372 #ifdef USE_COMBINATION_EXPLOSION_CHECK 1373 NQTFR(node)->comb_exp_check_num = 0; 1374 #endif 1375 1376 return node; 1377 } 1378 1379 static Node* 1380 node_new_enclose(int type) 1381 { 1382 Node* node = node_new(); 1383 CHECK_NULL_RETURN(node); 1384 1385 SET_NTYPE(node, NT_ENCLOSE); 1386 NENCLOSE(node)->type = type; 1387 NENCLOSE(node)->state = 0; 1388 NENCLOSE(node)->regnum = 0; 1389 NENCLOSE(node)->option = 0; 1390 NENCLOSE(node)->target = NULL; 1391 NENCLOSE(node)->call_addr = -1; 1392 NENCLOSE(node)->opt_count = 0; 1393 return node; 1394 } 1395 1396 extern Node* 1397 onig_node_new_enclose(int type) 1398 { 1399 return node_new_enclose(type); 1400 } 1401 1402 static Node* 1403 node_new_enclose_memory(OnigOptionType option, int is_named) 1404 { 1405 Node* node = node_new_enclose(ENCLOSE_MEMORY); 1406 CHECK_NULL_RETURN(node); 1407 if (is_named != 0) 1408 SET_ENCLOSE_STATUS(node, NST_NAMED_GROUP); 1409 1410 #ifdef USE_SUBEXP_CALL 1411 NENCLOSE(node)->option = option; 1412 #endif 1413 return node; 1414 } 1415 1416 static Node* 1417 node_new_option(OnigOptionType option) 1418 { 1419 Node* node = node_new_enclose(ENCLOSE_OPTION); 1420 CHECK_NULL_RETURN(node); 1421 NENCLOSE(node)->option = option; 1422 return node; 1423 } 1424 1425 extern int 1426 onig_node_str_cat(Node* node, const UChar* s, const UChar* end) 1427 { 1428 int addlen = (int)(end - s); 1429 1430 if (addlen > 0) { 1431 int len = (int)(NSTR(node)->end - NSTR(node)->s); 1432 1433 if (NSTR(node)->capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) { 1434 UChar* p; 1435 int capa = len + addlen + NODE_STR_MARGIN; 1436 1437 if (capa <= NSTR(node)->capa) { 1438 onig_strcpy(NSTR(node)->s + len, s, end); 1439 } 1440 else { 1441 if (NSTR(node)->s == NSTR(node)->buf) 1442 p = strcat_capa_from_static(NSTR(node)->s, NSTR(node)->end, 1443 s, end, capa); 1444 else 1445 p = strcat_capa(NSTR(node)->s, NSTR(node)->end, s, end, capa, NSTR(node)->capa); 1446 1447 CHECK_NULL_RETURN_MEMERR(p); 1448 NSTR(node)->s = p; 1449 NSTR(node)->capa = capa; 1450 } 1451 } 1452 else { 1453 onig_strcpy(NSTR(node)->s + len, s, end); 1454 } 1455 NSTR(node)->end = NSTR(node)->s + len + addlen; 1456 } 1457 1458 return 0; 1459 } 1460 1461 extern int 1462 onig_node_str_set(Node* node, const UChar* s, const UChar* end) 1463 { 1464 onig_node_str_clear(node); 1465 return onig_node_str_cat(node, s, end); 1466 } 1467 1468 static int 1469 node_str_cat_char(Node* node, UChar c) 1470 { 1471 UChar s[1]; 1472 1473 s[0] = c; 1474 return onig_node_str_cat(node, s, s + 1); 1475 } 1476 1477 extern void 1478 onig_node_conv_to_str_node(Node* node, int flag) 1479 { 1480 SET_NTYPE(node, NT_STR); 1481 NSTR(node)->flag = flag; 1482 NSTR(node)->capa = 0; 1483 NSTR(node)->s = NSTR(node)->buf; 1484 NSTR(node)->end = NSTR(node)->buf; 1485 } 1486 1487 extern void 1488 onig_node_str_clear(Node* node) 1489 { 1490 if (NSTR(node)->capa != 0 && 1491 IS_NOT_NULL(NSTR(node)->s) && NSTR(node)->s != NSTR(node)->buf) { 1492 xfree(NSTR(node)->s); 1493 } 1494 1495 NSTR(node)->capa = 0; 1496 NSTR(node)->flag = 0; 1497 NSTR(node)->s = NSTR(node)->buf; 1498 NSTR(node)->end = NSTR(node)->buf; 1499 } 1500 1501 static Node* 1502 node_new_str(const UChar* s, const UChar* end) 1503 { 1504 Node* node = node_new(); 1505 CHECK_NULL_RETURN(node); 1506 1507 SET_NTYPE(node, NT_STR); 1508 NSTR(node)->capa = 0; 1509 NSTR(node)->flag = 0; 1510 NSTR(node)->s = NSTR(node)->buf; 1511 NSTR(node)->end = NSTR(node)->buf; 1512 if (onig_node_str_cat(node, s, end)) { 1513 onig_node_free(node); 1514 return NULL; 1515 } 1516 return node; 1517 } 1518 1519 extern Node* 1520 onig_node_new_str(const UChar* s, const UChar* end) 1521 { 1522 return node_new_str(s, end); 1523 } 1524 1525 static Node* 1526 node_new_str_raw(UChar* s, UChar* end) 1527 { 1528 Node* node = node_new_str(s, end); 1529 CHECK_NULL_RETURN(node); 1530 NSTRING_SET_RAW(node); 1531 return node; 1532 } 1533 1534 static Node* 1535 node_new_empty(void) 1536 { 1537 return node_new_str(NULL, NULL); 1538 } 1539 1540 static Node* 1541 node_new_str_raw_char(UChar c) 1542 { 1543 UChar p[1]; 1544 1545 p[0] = c; 1546 return node_new_str_raw(p, p + 1); 1547 } 1548 1549 static Node* 1550 str_node_split_last_char(StrNode* sn, OnigEncoding enc) 1551 { 1552 const UChar *p; 1553 Node* n = NULL_NODE; 1554 1555 if (sn->end > sn->s) { 1556 p = onigenc_get_prev_char_head(enc, sn->s, sn->end); 1557 if (p && p > sn->s) { /* can be splitted. */ 1558 n = node_new_str(p, sn->end); 1559 CHECK_NULL_RETURN(n); 1560 if ((sn->flag & NSTR_RAW) != 0) 1561 NSTRING_SET_RAW(n); 1562 sn->end = (UChar* )p; 1563 } 1564 } 1565 return n; 1566 } 1567 1568 static int 1569 str_node_can_be_split(StrNode* sn, OnigEncoding enc) 1570 { 1571 if (sn->end > sn->s) { 1572 return ((enclen(enc, sn->s) < sn->end - sn->s) ? 1 : 0); 1573 } 1574 return 0; 1575 } 1576 1577 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR 1578 static int 1579 node_str_head_pad(StrNode* sn, int num, UChar val) 1580 { 1581 UChar buf[NODE_STR_BUF_SIZE]; 1582 int i, len; 1583 1584 len = sn->end - sn->s; 1585 onig_strcpy(buf, sn->s, sn->end); 1586 onig_strcpy(&(sn->s[num]), buf, buf + len); 1587 sn->end += num; 1588 1589 for (i = 0; i < num; i++) { 1590 sn->s[i] = val; 1591 } 1592 } 1593 #endif 1594 1595 extern int 1596 onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) 1597 { 1598 unsigned int num, val; 1599 OnigCodePoint c; 1600 UChar* p = *src; 1601 PFETCH_READY; 1602 1603 num = 0; 1604 while (!PEND) { 1605 PFETCH(c); 1606 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { 1607 val = (unsigned int )DIGITVAL(c); 1608 if ((INT_MAX_LIMIT - val) / 10UL < num) 1609 return -1; /* overflow */ 1610 1611 num = num * 10 + val; 1612 } 1613 else { 1614 PUNFETCH; 1615 break; 1616 } 1617 } 1618 *src = p; 1619 return num; 1620 } 1621 1622 static int 1623 scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen, 1624 OnigEncoding enc) 1625 { 1626 OnigCodePoint c; 1627 unsigned int num, val; 1628 UChar* p = *src; 1629 PFETCH_READY; 1630 1631 num = 0; 1632 while (!PEND && maxlen-- != 0) { 1633 PFETCH(c); 1634 if (ONIGENC_IS_CODE_XDIGIT(enc, c)) { 1635 val = (unsigned int )XDIGITVAL(enc,c); 1636 if ((INT_MAX_LIMIT - val) / 16UL < num) 1637 return -1; /* overflow */ 1638 1639 num = (num << 4) + XDIGITVAL(enc,c); 1640 } 1641 else { 1642 PUNFETCH; 1643 break; 1644 } 1645 } 1646 *src = p; 1647 return num; 1648 } 1649 1650 static int 1651 scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen, 1652 OnigEncoding enc) 1653 { 1654 OnigCodePoint c; 1655 unsigned int num, val; 1656 UChar* p = *src; 1657 PFETCH_READY; 1658 1659 num = 0; 1660 while (!PEND && maxlen-- != 0) { 1661 PFETCH(c); 1662 if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') { 1663 val = ODIGITVAL(c); 1664 if ((INT_MAX_LIMIT - val) / 8UL < num) 1665 return -1; /* overflow */ 1666 1667 num = (num << 3) + val; 1668 } 1669 else { 1670 PUNFETCH; 1671 break; 1672 } 1673 } 1674 *src = p; 1675 return num; 1676 } 1677 1678 1679 #define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \ 1680 BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT) 1681 1682 /* data format: 1683 [n][from-1][to-1][from-2][to-2] ... [from-n][to-n] 1684 (all data size is OnigCodePoint) 1685 */ 1686 static int 1687 new_code_range(BBuf** pbuf) 1688 { 1689 #define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5) 1690 int r; 1691 OnigCodePoint n; 1692 BBuf* bbuf; 1693 1694 bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf)); 1695 CHECK_NULL_RETURN_MEMERR(*pbuf); 1696 r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE); 1697 if (r) return r; 1698 1699 n = 0; 1700 BBUF_WRITE_CODE_POINT(bbuf, 0, n); 1701 return 0; 1702 } 1703 1704 static int 1705 add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to) 1706 { 1707 int r, inc_n, pos; 1708 int low, high, bound, x; 1709 OnigCodePoint n, *data; 1710 BBuf* bbuf; 1711 1712 if (from > to) { 1713 n = from; from = to; to = n; 1714 } 1715 1716 if (IS_NULL(*pbuf)) { 1717 r = new_code_range(pbuf); 1718 if (r) return r; 1719 bbuf = *pbuf; 1720 n = 0; 1721 } 1722 else { 1723 bbuf = *pbuf; 1724 GET_CODE_POINT(n, bbuf->p); 1725 } 1726 data = (OnigCodePoint* )(bbuf->p); 1727 data++; 1728 1729 for (low = 0, bound = n; low < bound; ) { 1730 x = (low + bound) >> 1; 1731 if (from > data[x*2 + 1]) 1732 low = x + 1; 1733 else 1734 bound = x; 1735 } 1736 1737 for (high = low, bound = n; high < bound; ) { 1738 x = (high + bound) >> 1; 1739 if (to >= data[x*2] - 1) 1740 high = x + 1; 1741 else 1742 bound = x; 1743 } 1744 1745 inc_n = low + 1 - high; 1746 if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM) 1747 return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES; 1748 1749 if (inc_n != 1) { 1750 if (from > data[low*2]) 1751 from = data[low*2]; 1752 if (to < data[(high - 1)*2 + 1]) 1753 to = data[(high - 1)*2 + 1]; 1754 } 1755 1756 if (inc_n != 0 && (OnigCodePoint )high < n) { 1757 int from_pos = SIZE_CODE_POINT * (1 + high * 2); 1758 int to_pos = SIZE_CODE_POINT * (1 + (low + 1) * 2); 1759 int size = (n - high) * 2 * SIZE_CODE_POINT; 1760 1761 if (inc_n > 0) { 1762 BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size); 1763 } 1764 else { 1765 BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos); 1766 } 1767 } 1768 1769 pos = SIZE_CODE_POINT * (1 + low * 2); 1770 BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2); 1771 BBUF_WRITE_CODE_POINT(bbuf, pos, from); 1772 BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to); 1773 n += inc_n; 1774 BBUF_WRITE_CODE_POINT(bbuf, 0, n); 1775 1776 return 0; 1777 } 1778 1779 static int 1780 add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to) 1781 { 1782 if (from > to) { 1783 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) 1784 return 0; 1785 else 1786 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; 1787 } 1788 1789 return add_code_range_to_buf(pbuf, from, to); 1790 } 1791 1792 static int 1793 not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf) 1794 { 1795 int r, i, n; 1796 OnigCodePoint pre, from, *data, to = 0; 1797 1798 *pbuf = (BBuf* )NULL; 1799 if (IS_NULL(bbuf)) { 1800 set_all: 1801 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); 1802 } 1803 1804 data = (OnigCodePoint* )(bbuf->p); 1805 GET_CODE_POINT(n, data); 1806 data++; 1807 if (n <= 0) goto set_all; 1808 1809 r = 0; 1810 pre = MBCODE_START_POS(enc); 1811 for (i = 0; i < n; i++) { 1812 from = data[i*2]; 1813 to = data[i*2+1]; 1814 if (pre <= from - 1) { 1815 r = add_code_range_to_buf(pbuf, pre, from - 1); 1816 if (r != 0) return r; 1817 } 1818 if (to == ~((OnigCodePoint )0)) break; 1819 pre = to + 1; 1820 } 1821 if (to < ~((OnigCodePoint )0)) { 1822 r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0)); 1823 } 1824 return r; 1825 } 1826 1827 #define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\ 1828 BBuf *tbuf; \ 1829 int tnot; \ 1830 tnot = not1; not1 = not2; not2 = tnot; \ 1831 tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \ 1832 } while (0) 1833 1834 static int 1835 or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1, 1836 BBuf* bbuf2, int not2, BBuf** pbuf) 1837 { 1838 int r; 1839 OnigCodePoint i, n1, *data1; 1840 OnigCodePoint from, to; 1841 1842 *pbuf = (BBuf* )NULL; 1843 if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) { 1844 if (not1 != 0 || not2 != 0) 1845 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); 1846 return 0; 1847 } 1848 1849 r = 0; 1850 if (IS_NULL(bbuf2)) 1851 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2); 1852 1853 if (IS_NULL(bbuf1)) { 1854 if (not1 != 0) { 1855 return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf); 1856 } 1857 else { 1858 if (not2 == 0) { 1859 return bbuf_clone(pbuf, bbuf2); 1860 } 1861 else { 1862 return not_code_range_buf(enc, bbuf2, pbuf); 1863 } 1864 } 1865 } 1866 1867 if (not1 != 0) 1868 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2); 1869 1870 data1 = (OnigCodePoint* )(bbuf1->p); 1871 GET_CODE_POINT(n1, data1); 1872 data1++; 1873 1874 if (not2 == 0 && not1 == 0) { /* 1 OR 2 */ 1875 r = bbuf_clone(pbuf, bbuf2); 1876 } 1877 else if (not1 == 0) { /* 1 OR (not 2) */ 1878 r = not_code_range_buf(enc, bbuf2, pbuf); 1879 } 1880 if (r != 0) return r; 1881 1882 for (i = 0; i < n1; i++) { 1883 from = data1[i*2]; 1884 to = data1[i*2+1]; 1885 r = add_code_range_to_buf(pbuf, from, to); 1886 if (r != 0) return r; 1887 } 1888 return 0; 1889 } 1890 1891 static int 1892 and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1, 1893 OnigCodePoint* data, int n) 1894 { 1895 int i, r; 1896 OnigCodePoint from2, to2; 1897 1898 for (i = 0; i < n; i++) { 1899 from2 = data[i*2]; 1900 to2 = data[i*2+1]; 1901 if (from2 < from1) { 1902 if (to2 < from1) continue; 1903 else { 1904 from1 = to2 + 1; 1905 } 1906 } 1907 else if (from2 <= to1) { 1908 if (to2 < to1) { 1909 if (from1 <= from2 - 1) { 1910 r = add_code_range_to_buf(pbuf, from1, from2-1); 1911 if (r != 0) return r; 1912 } 1913 from1 = to2 + 1; 1914 } 1915 else { 1916 to1 = from2 - 1; 1917 } 1918 } 1919 else { 1920 from1 = from2; 1921 } 1922 if (from1 > to1) break; 1923 } 1924 if (from1 <= to1) { 1925 r = add_code_range_to_buf(pbuf, from1, to1); 1926 if (r != 0) return r; 1927 } 1928 return 0; 1929 } 1930 1931 static int 1932 and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) 1933 { 1934 int r; 1935 OnigCodePoint i, j, n1, n2, *data1, *data2; 1936 OnigCodePoint from, to, from1, to1, from2, to2; 1937 1938 *pbuf = (BBuf* )NULL; 1939 if (IS_NULL(bbuf1)) { 1940 if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */ 1941 return bbuf_clone(pbuf, bbuf2); 1942 return 0; 1943 } 1944 else if (IS_NULL(bbuf2)) { 1945 if (not2 != 0) 1946 return bbuf_clone(pbuf, bbuf1); 1947 return 0; 1948 } 1949 1950 if (not1 != 0) 1951 SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2); 1952 1953 data1 = (OnigCodePoint* )(bbuf1->p); 1954 data2 = (OnigCodePoint* )(bbuf2->p); 1955 GET_CODE_POINT(n1, data1); 1956 GET_CODE_POINT(n2, data2); 1957 data1++; 1958 data2++; 1959 1960 if (not2 == 0 && not1 == 0) { /* 1 AND 2 */ 1961 for (i = 0; i < n1; i++) { 1962 from1 = data1[i*2]; 1963 to1 = data1[i*2+1]; 1964 for (j = 0; j < n2; j++) { 1965 from2 = data2[j*2]; 1966 to2 = data2[j*2+1]; 1967 if (from2 > to1) break; 1968 if (to2 < from1) continue; 1969 from = MAX(from1, from2); 1970 to = MIN(to1, to2); 1971 r = add_code_range_to_buf(pbuf, from, to); 1972 if (r != 0) return r; 1973 } 1974 } 1975 } 1976 else if (not1 == 0) { /* 1 AND (not 2) */ 1977 for (i = 0; i < n1; i++) { 1978 from1 = data1[i*2]; 1979 to1 = data1[i*2+1]; 1980 r = and_code_range1(pbuf, from1, to1, data2, n2); 1981 if (r != 0) return r; 1982 } 1983 } 1984 1985 return 0; 1986 } 1987 1988 static int 1989 and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) 1990 { 1991 int r, not1, not2; 1992 BBuf *buf1, *buf2, *pbuf; 1993 BitSetRef bsr1, bsr2; 1994 BitSet bs1, bs2; 1995 1996 not1 = IS_NCCLASS_NOT(dest); 1997 bsr1 = dest->bs; 1998 buf1 = dest->mbuf; 1999 not2 = IS_NCCLASS_NOT(cc); 2000 bsr2 = cc->bs; 2001 buf2 = cc->mbuf; 2002 2003 if (not1 != 0) { 2004 bitset_invert_to(bsr1, bs1); 2005 bsr1 = bs1; 2006 } 2007 if (not2 != 0) { 2008 bitset_invert_to(bsr2, bs2); 2009 bsr2 = bs2; 2010 } 2011 bitset_and(bsr1, bsr2); 2012 if (bsr1 != dest->bs) { 2013 bitset_copy(dest->bs, bsr1); 2014 bsr1 = dest->bs; 2015 } 2016 if (not1 != 0) { 2017 bitset_invert(dest->bs); 2018 } 2019 2020 if (! ONIGENC_IS_SINGLEBYTE(enc)) { 2021 if (not1 != 0 && not2 != 0) { 2022 r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf); 2023 } 2024 else { 2025 r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf); 2026 if (r == 0 && not1 != 0) { 2027 BBuf *tbuf; 2028 r = not_code_range_buf(enc, pbuf, &tbuf); 2029 if (r != 0) { 2030 bbuf_free(pbuf); 2031 return r; 2032 } 2033 bbuf_free(pbuf); 2034 pbuf = tbuf; 2035 } 2036 } 2037 if (r != 0) return r; 2038 2039 dest->mbuf = pbuf; 2040 bbuf_free(buf1); 2041 return r; 2042 } 2043 return 0; 2044 } 2045 2046 static int 2047 or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) 2048 { 2049 int r, not1, not2; 2050 BBuf *buf1, *buf2, *pbuf; 2051 BitSetRef bsr1, bsr2; 2052 BitSet bs1, bs2; 2053 2054 not1 = IS_NCCLASS_NOT(dest); 2055 bsr1 = dest->bs; 2056 buf1 = dest->mbuf; 2057 not2 = IS_NCCLASS_NOT(cc); 2058 bsr2 = cc->bs; 2059 buf2 = cc->mbuf; 2060 2061 if (not1 != 0) { 2062 bitset_invert_to(bsr1, bs1); 2063 bsr1 = bs1; 2064 } 2065 if (not2 != 0) { 2066 bitset_invert_to(bsr2, bs2); 2067 bsr2 = bs2; 2068 } 2069 bitset_or(bsr1, bsr2); 2070 if (bsr1 != dest->bs) { 2071 bitset_copy(dest->bs, bsr1); 2072 bsr1 = dest->bs; 2073 } 2074 if (not1 != 0) { 2075 bitset_invert(dest->bs); 2076 } 2077 2078 if (! ONIGENC_IS_SINGLEBYTE(enc)) { 2079 if (not1 != 0 && not2 != 0) { 2080 r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf); 2081 } 2082 else { 2083 r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf); 2084 if (r == 0 && not1 != 0) { 2085 BBuf *tbuf; 2086 r = not_code_range_buf(enc, pbuf, &tbuf); 2087 if (r != 0) { 2088 bbuf_free(pbuf); 2089 return r; 2090 } 2091 bbuf_free(pbuf); 2092 pbuf = tbuf; 2093 } 2094 } 2095 if (r != 0) return r; 2096 2097 dest->mbuf = pbuf; 2098 bbuf_free(buf1); 2099 return r; 2100 } 2101 else 2102 return 0; 2103 } 2104 2105 static int 2106 conv_backslash_value(int c, ScanEnv* env) 2107 { 2108 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) { 2109 switch (c) { 2110 case 'n': return '\n'; 2111 case 't': return '\t'; 2112 case 'r': return '\r'; 2113 case 'f': return '\f'; 2114 case 'a': return '\007'; 2115 case 'b': return '\010'; 2116 case 'e': return '\033'; 2117 case 'v': 2118 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB)) 2119 return '\v'; 2120 break; 2121 2122 default: 2123 break; 2124 } 2125 } 2126 return c; 2127 } 2128 2129 static int 2130 is_invalid_quantifier_target(Node* node) 2131 { 2132 switch (NTYPE(node)) { 2133 case NT_ANCHOR: 2134 return 1; 2135 break; 2136 2137 case NT_ENCLOSE: 2138 /* allow enclosed elements */ 2139 /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */ 2140 break; 2141 2142 case NT_LIST: 2143 do { 2144 if (! is_invalid_quantifier_target(NCAR(node))) return 0; 2145 } while (IS_NOT_NULL(node = NCDR(node))); 2146 return 0; 2147 break; 2148 2149 case NT_ALT: 2150 do { 2151 if (is_invalid_quantifier_target(NCAR(node))) return 1; 2152 } while (IS_NOT_NULL(node = NCDR(node))); 2153 break; 2154 2155 default: 2156 break; 2157 } 2158 return 0; 2159 } 2160 2161 /* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */ 2162 static int 2163 popular_quantifier_num(QtfrNode* q) 2164 { 2165 if (q->greedy) { 2166 if (q->lower == 0) { 2167 if (q->upper == 1) return 0; 2168 else if (IS_REPEAT_INFINITE(q->upper)) return 1; 2169 } 2170 else if (q->lower == 1) { 2171 if (IS_REPEAT_INFINITE(q->upper)) return 2; 2172 } 2173 } 2174 else { 2175 if (q->lower == 0) { 2176 if (q->upper == 1) return 3; 2177 else if (IS_REPEAT_INFINITE(q->upper)) return 4; 2178 } 2179 else if (q->lower == 1) { 2180 if (IS_REPEAT_INFINITE(q->upper)) return 5; 2181 } 2182 } 2183 return -1; 2184 } 2185 2186 2187 enum ReduceType { 2188 RQ_ASIS = 0, /* as is */ 2189 RQ_DEL = 1, /* delete parent */ 2190 RQ_A, /* to '*' */ 2191 RQ_AQ, /* to '*?' */ 2192 RQ_QQ, /* to '??' */ 2193 RQ_P_QQ, /* to '+)??' */ 2194 RQ_PQ_Q /* to '+?)?' */ 2195 }; 2196 2197 static enum ReduceType ReduceTypeTable[6][6] = { 2198 {RQ_DEL, RQ_A, RQ_A, RQ_QQ, RQ_AQ, RQ_ASIS}, /* '?' */ 2199 {RQ_DEL, RQ_DEL, RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL}, /* '*' */ 2200 {RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL}, /* '+' */ 2201 {RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL, RQ_AQ, RQ_AQ}, /* '??' */ 2202 {RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL, RQ_DEL}, /* '*?' */ 2203 {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */ 2204 }; 2205 2206 extern void 2207 onig_reduce_nested_quantifier(Node* pnode, Node* cnode) 2208 { 2209 int pnum, cnum; 2210 QtfrNode *p, *c; 2211 2212 p = NQTFR(pnode); 2213 c = NQTFR(cnode); 2214 pnum = popular_quantifier_num(p); 2215 cnum = popular_quantifier_num(c); 2216 if (pnum < 0 || cnum < 0) return ; 2217 2218 switch(ReduceTypeTable[cnum][pnum]) { 2219 case RQ_DEL: 2220 CopyMem (pnode, cnode, sizeof (Node)); 2221 break; 2222 case RQ_A: 2223 p->target = c->target; 2224 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1; 2225 break; 2226 case RQ_AQ: 2227 p->target = c->target; 2228 p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0; 2229 break; 2230 case RQ_QQ: 2231 p->target = c->target; 2232 p->lower = 0; p->upper = 1; p->greedy = 0; 2233 break; 2234 case RQ_P_QQ: 2235 p->target = cnode; 2236 p->lower = 0; p->upper = 1; p->greedy = 0; 2237 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1; 2238 return ; 2239 break; 2240 case RQ_PQ_Q: 2241 p->target = cnode; 2242 p->lower = 0; p->upper = 1; p->greedy = 1; 2243 c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0; 2244 return ; 2245 break; 2246 case RQ_ASIS: 2247 p->target = cnode; 2248 return ; 2249 break; 2250 } 2251 2252 c->target = NULL_NODE; 2253 onig_node_free(cnode); 2254 } 2255 2256 2257 enum TokenSyms { 2258 TK_EOT = 0, /* end of token */ 2259 TK_RAW_BYTE = 1, 2260 TK_CHAR, 2261 TK_STRING, 2262 TK_CODE_POINT, 2263 TK_ANYCHAR, 2264 TK_CHAR_TYPE, 2265 TK_BACKREF, 2266 TK_CALL, 2267 TK_ANCHOR, 2268 TK_OP_REPEAT, 2269 TK_INTERVAL, 2270 TK_ANYCHAR_ANYTIME, /* SQL '%' == .* */ 2271 TK_ALT, 2272 TK_SUBEXP_OPEN, 2273 TK_SUBEXP_CLOSE, 2274 TK_CC_OPEN, 2275 TK_QUOTE_OPEN, 2276 TK_CHAR_PROPERTY, /* \p{...}, \P{...} */ 2277 /* in cc */ 2278 TK_CC_CLOSE, 2279 TK_CC_RANGE, 2280 TK_POSIX_BRACKET_OPEN, 2281 TK_CC_AND, /* && */ 2282 TK_CC_CC_OPEN /* [ */ 2283 }; 2284 2285 typedef struct { 2286 enum TokenSyms type; 2287 int escaped; 2288 int base; /* is number: 8, 16 (used in [....]) */ 2289 UChar* backp; 2290 union { 2291 UChar* s; 2292 int c; 2293 OnigCodePoint code; 2294 int anchor; 2295 int subtype; 2296 struct { 2297 int lower; 2298 int upper; 2299 int greedy; 2300 int possessive; 2301 } repeat; 2302 struct { 2303 int num; 2304 int ref1; 2305 int* refs; 2306 int by_name; 2307 #ifdef USE_BACKREF_WITH_LEVEL 2308 int exist_level; 2309 int level; /* \k<name+n> */ 2310 #endif 2311 } backref; 2312 struct { 2313 UChar* name; 2314 UChar* name_end; 2315 int gnum; 2316 } call; 2317 struct { 2318 int ctype; 2319 int not; 2320 } prop; 2321 } u; 2322 } OnigToken; 2323 2324 2325 static int 2326 fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) 2327 { 2328 int low, up, syn_allow, non_low = 0; 2329 int r = 0; 2330 OnigCodePoint c; 2331 OnigEncoding enc = env->enc; 2332 UChar* p = *src; 2333 PFETCH_READY; 2334 2335 syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL); 2336 2337 if (PEND) { 2338 if (syn_allow) 2339 return 1; /* "....{" : OK! */ 2340 else 2341 return ONIGERR_END_PATTERN_AT_LEFT_BRACE; /* "....{" syntax error */ 2342 } 2343 2344 if (! syn_allow) { 2345 c = PPEEK; 2346 if (c == ')' || c == '(' || c == '|') { 2347 return ONIGERR_END_PATTERN_AT_LEFT_BRACE; 2348 } 2349 } 2350 2351 low = onig_scan_unsigned_number(&p, end, env->enc); 2352 if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; 2353 if (low > ONIG_MAX_REPEAT_NUM) 2354 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; 2355 2356 if (p == *src) { /* can't read low */ 2357 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) { 2358 /* allow {,n} as {0,n} */ 2359 low = 0; 2360 non_low = 1; 2361 } 2362 else 2363 goto invalid; 2364 } 2365 2366 if (PEND) goto invalid; 2367 PFETCH(c); 2368 if (c == ',') { 2369 UChar* prev = p; 2370 up = onig_scan_unsigned_number(&p, end, env->enc); 2371 if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; 2372 if (up > ONIG_MAX_REPEAT_NUM) 2373 return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; 2374 2375 if (p == prev) { 2376 if (non_low != 0) 2377 goto invalid; 2378 up = REPEAT_INFINITE; /* {n,} : {n,infinite} */ 2379 } 2380 } 2381 else { 2382 if (non_low != 0) 2383 goto invalid; 2384 2385 PUNFETCH; 2386 up = low; /* {n} : exact n times */ 2387 r = 2; /* fixed */ 2388 } 2389 2390 if (PEND) goto invalid; 2391 PFETCH(c); 2392 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) { 2393 if (c != MC_ESC(env->syntax)) goto invalid; 2394 PFETCH(c); 2395 } 2396 if (c != '}') goto invalid; 2397 2398 if (!IS_REPEAT_INFINITE(up) && low > up) { 2399 return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE; 2400 } 2401 2402 tok->type = TK_INTERVAL; 2403 tok->u.repeat.lower = low; 2404 tok->u.repeat.upper = up; 2405 *src = p; 2406 return r; /* 0: normal {n,m}, 2: fixed {n} */ 2407 2408 invalid: 2409 if (syn_allow) 2410 return 1; /* OK */ 2411 else 2412 return ONIGERR_INVALID_REPEAT_RANGE_PATTERN; 2413 } 2414 2415 /* \M-, \C-, \c, or \... */ 2416 static int 2417 fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) 2418 { 2419 int v; 2420 OnigCodePoint c; 2421 OnigEncoding enc = env->enc; 2422 UChar* p = *src; 2423 2424 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; 2425 2426 PFETCH_S(c); 2427 switch (c) { 2428 case 'M': 2429 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) { 2430 if (PEND) return ONIGERR_END_PATTERN_AT_META; 2431 PFETCH_S(c); 2432 if (c != '-') return ONIGERR_META_CODE_SYNTAX; 2433 if (PEND) return ONIGERR_END_PATTERN_AT_META; 2434 PFETCH_S(c); 2435 if (c == MC_ESC(env->syntax)) { 2436 v = fetch_escaped_value(&p, end, env); 2437 if (v < 0) return v; 2438 c = (OnigCodePoint )v; 2439 } 2440 c = ((c & 0xff) | 0x80); 2441 } 2442 else 2443 goto backslash; 2444 break; 2445 2446 case 'C': 2447 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) { 2448 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL; 2449 PFETCH_S(c); 2450 if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX; 2451 goto control; 2452 } 2453 else 2454 goto backslash; 2455 2456 case 'c': 2457 if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) { 2458 control: 2459 if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL; 2460 PFETCH_S(c); 2461 if (c == '?') { 2462 c = 0177; 2463 } 2464 else { 2465 if (c == MC_ESC(env->syntax)) { 2466 v = fetch_escaped_value(&p, end, env); 2467 if (v < 0) return v; 2468 c = (OnigCodePoint )v; 2469 } 2470 c &= 0x9f; 2471 } 2472 break; 2473 } 2474 /* fall through */ 2475 2476 default: 2477 { 2478 backslash: 2479 c = conv_backslash_value(c, env); 2480 } 2481 break; 2482 } 2483 2484 *src = p; 2485 return c; 2486 } 2487 2488 static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env); 2489 2490 static OnigCodePoint 2491 get_name_end_code_point(OnigCodePoint start) 2492 { 2493 switch (start) { 2494 case '<': return (OnigCodePoint )'>'; break; 2495 case '\'': return (OnigCodePoint )'\''; break; 2496 default: 2497 break; 2498 } 2499 2500 return (OnigCodePoint )0; 2501 } 2502 2503 #ifdef USE_NAMED_GROUP 2504 #ifdef USE_BACKREF_WITH_LEVEL 2505 /* 2506 \k<name+n>, \k<name-n> 2507 \k<num+n>, \k<num-n> 2508 \k<-num+n>, \k<-num-n> 2509 */ 2510 static int 2511 fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, 2512 UChar** rname_end, ScanEnv* env, 2513 int* rback_num, int* rlevel) 2514 { 2515 int r, sign, is_num, exist_level; 2516 OnigCodePoint end_code; 2517 OnigCodePoint c = 0; 2518 OnigEncoding enc = env->enc; 2519 UChar *name_end; 2520 UChar *pnum_head; 2521 UChar *p = *src; 2522 PFETCH_READY; 2523 2524 *rback_num = 0; 2525 is_num = exist_level = 0; 2526 sign = 1; 2527 pnum_head = *src; 2528 2529 end_code = get_name_end_code_point(start_code); 2530 2531 name_end = end; 2532 r = 0; 2533 if (PEND) { 2534 return ONIGERR_EMPTY_GROUP_NAME; 2535 } 2536 else { 2537 PFETCH(c); 2538 if (c == end_code) 2539 return ONIGERR_EMPTY_GROUP_NAME; 2540 2541 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { 2542 is_num = 1; 2543 } 2544 else if (c == '-') { 2545 is_num = 2; 2546 sign = -1; 2547 pnum_head = p; 2548 } 2549 else if (!ONIGENC_IS_CODE_WORD(enc, c)) { 2550 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 2551 } 2552 } 2553 2554 while (!PEND) { 2555 name_end = p; 2556 PFETCH(c); 2557 if (c == end_code || c == ')' || c == '+' || c == '-') { 2558 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME; 2559 break; 2560 } 2561 2562 if (is_num != 0) { 2563 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { 2564 is_num = 1; 2565 } 2566 else { 2567 r = ONIGERR_INVALID_GROUP_NAME; 2568 is_num = 0; 2569 } 2570 } 2571 else if (!ONIGENC_IS_CODE_WORD(enc, c)) { 2572 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 2573 } 2574 } 2575 2576 if (r == 0 && c != end_code) { 2577 if (c == '+' || c == '-') { 2578 int level; 2579 int flag = (c == '-' ? -1 : 1); 2580 2581 PFETCH(c); 2582 if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err; 2583 PUNFETCH; 2584 level = onig_scan_unsigned_number(&p, end, enc); 2585 if (level < 0) return ONIGERR_TOO_BIG_NUMBER; 2586 *rlevel = (level * flag); 2587 exist_level = 1; 2588 2589 PFETCH(c); 2590 if (c == end_code) 2591 goto end; 2592 } 2593 2594 err: 2595 r = ONIGERR_INVALID_GROUP_NAME; 2596 name_end = end; 2597 } 2598 2599 end: 2600 if (r == 0) { 2601 if (is_num != 0) { 2602 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); 2603 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; 2604 else if (*rback_num == 0) goto err; 2605 2606 *rback_num *= sign; 2607 } 2608 2609 *rname_end = name_end; 2610 *src = p; 2611 return (exist_level ? 1 : 0); 2612 } 2613 else { 2614 onig_scan_env_set_error_string(env, r, *src, name_end); 2615 return r; 2616 } 2617 } 2618 #endif /* USE_BACKREF_WITH_LEVEL */ 2619 2620 /* 2621 def: 0 -> define name (don't allow number name) 2622 1 -> reference name (allow number name) 2623 */ 2624 static int 2625 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, 2626 UChar** rname_end, ScanEnv* env, int* rback_num, int ref) 2627 { 2628 int r, is_num, sign; 2629 OnigCodePoint end_code; 2630 OnigCodePoint c = 0; 2631 OnigEncoding enc = env->enc; 2632 UChar *name_end; 2633 UChar *pnum_head; 2634 UChar *p = *src; 2635 2636 *rback_num = 0; 2637 2638 end_code = get_name_end_code_point(start_code); 2639 2640 name_end = end; 2641 pnum_head = *src; 2642 r = 0; 2643 is_num = 0; 2644 sign = 1; 2645 if (PEND) { 2646 return ONIGERR_EMPTY_GROUP_NAME; 2647 } 2648 else { 2649 PFETCH_S(c); 2650 if (c == end_code) 2651 return ONIGERR_EMPTY_GROUP_NAME; 2652 2653 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { 2654 if (ref == 1) 2655 is_num = 1; 2656 else { 2657 r = ONIGERR_INVALID_GROUP_NAME; 2658 is_num = 0; 2659 } 2660 } 2661 else if (c == '-') { 2662 if (ref == 1) { 2663 is_num = 2; 2664 sign = -1; 2665 pnum_head = p; 2666 } 2667 else { 2668 r = ONIGERR_INVALID_GROUP_NAME; 2669 is_num = 0; 2670 } 2671 } 2672 else if (!ONIGENC_IS_CODE_WORD(enc, c)) { 2673 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 2674 } 2675 } 2676 2677 if (r == 0) { 2678 while (!PEND) { 2679 name_end = p; 2680 PFETCH_S(c); 2681 if (c == end_code || c == ')') { 2682 if (is_num == 2) r = ONIGERR_INVALID_GROUP_NAME; 2683 break; 2684 } 2685 2686 if (is_num != 0) { 2687 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { 2688 is_num = 1; 2689 } 2690 else { 2691 if (!ONIGENC_IS_CODE_WORD(enc, c)) 2692 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 2693 else 2694 r = ONIGERR_INVALID_GROUP_NAME; 2695 is_num = 0; 2696 } 2697 } 2698 else { 2699 if (!ONIGENC_IS_CODE_WORD(enc, c)) { 2700 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 2701 } 2702 } 2703 } 2704 2705 if (c != end_code) { 2706 r = ONIGERR_INVALID_GROUP_NAME; 2707 name_end = end; 2708 } 2709 2710 if (is_num != 0) { 2711 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); 2712 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; 2713 else if (*rback_num == 0) { 2714 r = ONIGERR_INVALID_GROUP_NAME; 2715 goto err; 2716 } 2717 2718 *rback_num *= sign; 2719 } 2720 2721 *rname_end = name_end; 2722 *src = p; 2723 return 0; 2724 } 2725 else { 2726 while (!PEND) { 2727 name_end = p; 2728 PFETCH_S(c); 2729 if (c == end_code || c == ')') 2730 break; 2731 } 2732 if (PEND) 2733 name_end = end; 2734 2735 err: 2736 onig_scan_env_set_error_string(env, r, *src, name_end); 2737 return r; 2738 } 2739 } 2740 #else 2741 static int 2742 fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, 2743 UChar** rname_end, ScanEnv* env, int* rback_num, int ref) 2744 { 2745 int r, is_num, sign; 2746 OnigCodePoint end_code; 2747 OnigCodePoint c = 0; 2748 UChar *name_end; 2749 OnigEncoding enc = env->enc; 2750 UChar *pnum_head; 2751 UChar *p = *src; 2752 PFETCH_READY; 2753 2754 *rback_num = 0; 2755 2756 end_code = get_name_end_code_point(start_code); 2757 2758 *rname_end = name_end = end; 2759 r = 0; 2760 pnum_head = *src; 2761 is_num = 0; 2762 sign = 1; 2763 2764 if (PEND) { 2765 return ONIGERR_EMPTY_GROUP_NAME; 2766 } 2767 else { 2768 PFETCH(c); 2769 if (c == end_code) 2770 return ONIGERR_EMPTY_GROUP_NAME; 2771 2772 if (ONIGENC_IS_CODE_DIGIT(enc, c)) { 2773 is_num = 1; 2774 } 2775 else if (c == '-') { 2776 is_num = 2; 2777 sign = -1; 2778 pnum_head = p; 2779 } 2780 else { 2781 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 2782 } 2783 } 2784 2785 while (!PEND) { 2786 name_end = p; 2787 2788 PFETCH(c); 2789 if (c == end_code || c == ')') break; 2790 if (! ONIGENC_IS_CODE_DIGIT(enc, c)) 2791 r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; 2792 } 2793 if (r == 0 && c != end_code) { 2794 r = ONIGERR_INVALID_GROUP_NAME; 2795 name_end = end; 2796 } 2797 2798 if (r == 0) { 2799 *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); 2800 if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; 2801 else if (*rback_num == 0) { 2802 r = ONIGERR_INVALID_GROUP_NAME; 2803 goto err; 2804 } 2805 *rback_num *= sign; 2806 2807 *rname_end = name_end; 2808 *src = p; 2809 return 0; 2810 } 2811 else { 2812 err: 2813 onig_scan_env_set_error_string(env, r, *src, name_end); 2814 return r; 2815 } 2816 } 2817 #endif /* USE_NAMED_GROUP */ 2818 2819 static void 2820 CC_ESC_WARN(ScanEnv* env, UChar *c) 2821 { 2822 if (onig_warn == onig_null_warn) return ; 2823 2824 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) && 2825 IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) { 2826 UChar buf[WARN_BUFSIZE]; 2827 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, 2828 env->pattern, env->pattern_end, 2829 (UChar* )"character class has '%s' without escape", c); 2830 (*onig_warn)((char* )buf); 2831 } 2832 } 2833 2834 static void 2835 CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c) 2836 { 2837 if (onig_warn == onig_null_warn) return ; 2838 2839 if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) { 2840 UChar buf[WARN_BUFSIZE]; 2841 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc, 2842 (env)->pattern, (env)->pattern_end, 2843 (UChar* )"regular expression has '%s' without escape", c); 2844 (*onig_warn)((char* )buf); 2845 } 2846 } 2847 2848 static UChar* 2849 find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to, 2850 UChar **next, OnigEncoding enc) 2851 { 2852 int i; 2853 OnigCodePoint x; 2854 UChar *q; 2855 UChar *p = from; 2856 2857 while (p < to) { 2858 x = ONIGENC_MBC_TO_CODE(enc, p, to); 2859 q = p + enclen(enc, p); 2860 if (x == s[0]) { 2861 for (i = 1; i < n && q < to; i++) { 2862 x = ONIGENC_MBC_TO_CODE(enc, q, to); 2863 if (x != s[i]) break; 2864 q += enclen(enc, q); 2865 } 2866 if (i >= n) { 2867 if (IS_NOT_NULL(next)) 2868 *next = q; 2869 return p; 2870 } 2871 } 2872 p = q; 2873 } 2874 return NULL_UCHARP; 2875 } 2876 2877 static int 2878 str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, 2879 OnigCodePoint bad, OnigEncoding enc, OnigSyntaxType* syn) 2880 { 2881 int i, in_esc; 2882 OnigCodePoint x; 2883 UChar *q; 2884 UChar *p = from; 2885 2886 in_esc = 0; 2887 while (p < to) { 2888 if (in_esc) { 2889 in_esc = 0; 2890 p += enclen(enc, p); 2891 } 2892 else { 2893 x = ONIGENC_MBC_TO_CODE(enc, p, to); 2894 q = p + enclen(enc, p); 2895 if (x == s[0]) { 2896 for (i = 1; i < n && q < to; i++) { 2897 x = ONIGENC_MBC_TO_CODE(enc, q, to); 2898 if (x != s[i]) break; 2899 q += enclen(enc, q); 2900 } 2901 if (i >= n) return 1; 2902 p += enclen(enc, p); 2903 } 2904 else { 2905 x = ONIGENC_MBC_TO_CODE(enc, p, to); 2906 if (x == bad) return 0; 2907 else if (x == MC_ESC(syn)) in_esc = 1; 2908 p = q; 2909 } 2910 } 2911 } 2912 return 0; 2913 } 2914 2915 static int 2916 fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) 2917 { 2918 int num; 2919 OnigCodePoint c, c2; 2920 OnigSyntaxType* syn = env->syntax; 2921 OnigEncoding enc = env->enc; 2922 UChar* prev; 2923 UChar* p = *src; 2924 PFETCH_READY; 2925 2926 if (PEND) { 2927 tok->type = TK_EOT; 2928 return tok->type; 2929 } 2930 2931 PFETCH(c); 2932 tok->type = TK_CHAR; 2933 tok->base = 0; 2934 tok->u.c = c; 2935 tok->escaped = 0; 2936 2937 if (c == ']') { 2938 tok->type = TK_CC_CLOSE; 2939 } 2940 else if (c == '-') { 2941 tok->type = TK_CC_RANGE; 2942 } 2943 else if (c == MC_ESC(syn)) { 2944 if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) 2945 goto end; 2946 2947 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; 2948 2949 PFETCH(c); 2950 tok->escaped = 1; 2951 tok->u.c = c; 2952 switch (c) { 2953 case 'w': 2954 tok->type = TK_CHAR_TYPE; 2955 tok->u.prop.ctype = ONIGENC_CTYPE_WORD; 2956 tok->u.prop.not = 0; 2957 break; 2958 case 'W': 2959 tok->type = TK_CHAR_TYPE; 2960 tok->u.prop.ctype = ONIGENC_CTYPE_WORD; 2961 tok->u.prop.not = 1; 2962 break; 2963 case 'd': 2964 tok->type = TK_CHAR_TYPE; 2965 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; 2966 tok->u.prop.not = 0; 2967 break; 2968 case 'D': 2969 tok->type = TK_CHAR_TYPE; 2970 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; 2971 tok->u.prop.not = 1; 2972 break; 2973 case 's': 2974 tok->type = TK_CHAR_TYPE; 2975 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; 2976 tok->u.prop.not = 0; 2977 break; 2978 case 'S': 2979 tok->type = TK_CHAR_TYPE; 2980 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; 2981 tok->u.prop.not = 1; 2982 break; 2983 case 'h': 2984 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; 2985 tok->type = TK_CHAR_TYPE; 2986 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; 2987 tok->u.prop.not = 0; 2988 break; 2989 case 'H': 2990 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; 2991 tok->type = TK_CHAR_TYPE; 2992 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; 2993 tok->u.prop.not = 1; 2994 break; 2995 2996 case 'p': 2997 case 'P': 2998 c2 = PPEEK; 2999 if (c2 == '{' && 3000 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { 3001 PINC; 3002 tok->type = TK_CHAR_PROPERTY; 3003 tok->u.prop.not = (c == 'P' ? 1 : 0); 3004 3005 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { 3006 PFETCH(c2); 3007 if (c2 == '^') { 3008 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0); 3009 } 3010 else 3011 PUNFETCH; 3012 } 3013 } 3014 break; 3015 3016 case 'x': 3017 if (PEND) break; 3018 3019 prev = p; 3020 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { 3021 PINC; 3022 num = scan_unsigned_hexadecimal_number(&p, end, 8, enc); 3023 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; 3024 if (!PEND) { 3025 c2 = PPEEK; 3026 if (ONIGENC_IS_CODE_XDIGIT(enc, c2)) 3027 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; 3028 } 3029 3030 if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) { 3031 PINC; 3032 tok->type = TK_CODE_POINT; 3033 tok->base = 16; 3034 tok->u.code = (OnigCodePoint )num; 3035 } 3036 else { 3037 /* can't read nothing or invalid format */ 3038 p = prev; 3039 } 3040 } 3041 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { 3042 num = scan_unsigned_hexadecimal_number(&p, end, 2, enc); 3043 if (num < 0) return ONIGERR_TOO_BIG_NUMBER; 3044 if (p == prev) { /* can't read nothing. */ 3045 num = 0; /* but, it's not error */ 3046 } 3047 tok->type = TK_RAW_BYTE; 3048 tok->base = 16; 3049 tok->u.c = num; 3050 } 3051 break; 3052 3053 case 'u': 3054 if (PEND) break; 3055 3056 prev = p; 3057 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { 3058 num = scan_unsigned_hexadecimal_number(&p, end, 4, enc); 3059 if (num < 0) return ONIGERR_TOO_BIG_NUMBER; 3060 if (p == prev) { /* can't read nothing. */ 3061 num = 0; /* but, it's not error */ 3062 } 3063 tok->type = TK_CODE_POINT; 3064 tok->base = 16; 3065 tok->u.code = (OnigCodePoint )num; 3066 } 3067 break; 3068 3069 case '0': 3070 case '1': case '2': case '3': case '4': case '5': case '6': case '7': 3071 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { 3072 PUNFETCH; 3073 prev = p; 3074 num = scan_unsigned_octal_number(&p, end, 3, enc); 3075 if (num < 0) return ONIGERR_TOO_BIG_NUMBER; 3076 if (p == prev) { /* can't read nothing. */ 3077 num = 0; /* but, it's not error */ 3078 } 3079 tok->type = TK_RAW_BYTE; 3080 tok->base = 8; 3081 tok->u.c = num; 3082 } 3083 break; 3084 3085 default: 3086 PUNFETCH; 3087 num = fetch_escaped_value(&p, end, env); 3088 if (num < 0) return num; 3089 if (tok->u.c != num) { 3090 tok->u.code = (OnigCodePoint )num; 3091 tok->type = TK_CODE_POINT; 3092 } 3093 break; 3094 } 3095 } 3096 else if (c == '[') { 3097 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) { 3098 OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' }; 3099 tok->backp = p; /* point at '[' is readed */ 3100 PINC; 3101 if (str_exist_check_with_esc(send, 2, p, end, 3102 (OnigCodePoint )']', enc, syn)) { 3103 tok->type = TK_POSIX_BRACKET_OPEN; 3104 } 3105 else { 3106 PUNFETCH; 3107 goto cc_in_cc; 3108 } 3109 } 3110 else { 3111 cc_in_cc: 3112 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) { 3113 tok->type = TK_CC_CC_OPEN; 3114 } 3115 else { 3116 CC_ESC_WARN(env, (UChar* )"["); 3117 } 3118 } 3119 } 3120 else if (c == '&') { 3121 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) && 3122 !PEND && (PPEEK_IS('&'))) { 3123 PINC; 3124 tok->type = TK_CC_AND; 3125 } 3126 } 3127 3128 end: 3129 *src = p; 3130 return tok->type; 3131 } 3132 3133 static int 3134 fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) 3135 { 3136 int r, num; 3137 OnigCodePoint c; 3138 OnigEncoding enc = env->enc; 3139 OnigSyntaxType* syn = env->syntax; 3140 UChar* prev; 3141 UChar* p = *src; 3142 PFETCH_READY; 3143 3144 start: 3145 if (PEND) { 3146 tok->type = TK_EOT; 3147 return tok->type; 3148 } 3149 3150 tok->type = TK_STRING; 3151 tok->base = 0; 3152 tok->backp = p; 3153 3154 PFETCH(c); 3155 if (IS_MC_ESC_CODE(c, syn)) { 3156 if (PEND) return ONIGERR_END_PATTERN_AT_ESCAPE; 3157 3158 tok->backp = p; 3159 PFETCH(c); 3160 3161 tok->u.c = c; 3162 tok->escaped = 1; 3163 switch (c) { 3164 case '*': 3165 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break; 3166 tok->type = TK_OP_REPEAT; 3167 tok->u.repeat.lower = 0; 3168 tok->u.repeat.upper = REPEAT_INFINITE; 3169 goto greedy_check; 3170 break; 3171 3172 case '+': 3173 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break; 3174 tok->type = TK_OP_REPEAT; 3175 tok->u.repeat.lower = 1; 3176 tok->u.repeat.upper = REPEAT_INFINITE; 3177 goto greedy_check; 3178 break; 3179 3180 case '?': 3181 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break; 3182 tok->type = TK_OP_REPEAT; 3183 tok->u.repeat.lower = 0; 3184 tok->u.repeat.upper = 1; 3185 greedy_check: 3186 if (!PEND && PPEEK_IS('?') && 3187 IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) { 3188 PFETCH(c); 3189 tok->u.repeat.greedy = 0; 3190 tok->u.repeat.possessive = 0; 3191 } 3192 else { 3193 possessive_check: 3194 if (!PEND && PPEEK_IS('+') && 3195 ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) && 3196 tok->type != TK_INTERVAL) || 3197 (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) && 3198 tok->type == TK_INTERVAL))) { 3199 PFETCH(c); 3200 tok->u.repeat.greedy = 1; 3201 tok->u.repeat.possessive = 1; 3202 } 3203 else { 3204 tok->u.repeat.greedy = 1; 3205 tok->u.repeat.possessive = 0; 3206 } 3207 } 3208 break; 3209 3210 case '{': 3211 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break; 3212 r = fetch_range_quantifier(&p, end, tok, env); 3213 if (r < 0) return r; /* error */ 3214 if (r == 0) goto greedy_check; 3215 else if (r == 2) { /* {n} */ 3216 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) 3217 goto possessive_check; 3218 3219 goto greedy_check; 3220 } 3221 /* r == 1 : normal char */ 3222 break; 3223 3224 case '|': 3225 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break; 3226 tok->type = TK_ALT; 3227 break; 3228 3229 case '(': 3230 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break; 3231 tok->type = TK_SUBEXP_OPEN; 3232 break; 3233 3234 case ')': 3235 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break; 3236 tok->type = TK_SUBEXP_CLOSE; 3237 break; 3238 3239 case 'w': 3240 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; 3241 tok->type = TK_CHAR_TYPE; 3242 tok->u.prop.ctype = ONIGENC_CTYPE_WORD; 3243 tok->u.prop.not = 0; 3244 break; 3245 3246 case 'W': 3247 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; 3248 tok->type = TK_CHAR_TYPE; 3249 tok->u.prop.ctype = ONIGENC_CTYPE_WORD; 3250 tok->u.prop.not = 1; 3251 break; 3252 3253 case 'b': 3254 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break; 3255 tok->type = TK_ANCHOR; 3256 tok->u.anchor = ANCHOR_WORD_BOUND; 3257 break; 3258 3259 case 'B': 3260 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break; 3261 tok->type = TK_ANCHOR; 3262 tok->u.anchor = ANCHOR_NOT_WORD_BOUND; 3263 break; 3264 3265 #ifdef USE_WORD_BEGIN_END 3266 case '<': 3267 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break; 3268 tok->type = TK_ANCHOR; 3269 tok->u.anchor = ANCHOR_WORD_BEGIN; 3270 break; 3271 3272 case '>': 3273 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break; 3274 tok->type = TK_ANCHOR; 3275 tok->u.anchor = ANCHOR_WORD_END; 3276 break; 3277 #endif 3278 3279 case 's': 3280 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; 3281 tok->type = TK_CHAR_TYPE; 3282 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; 3283 tok->u.prop.not = 0; 3284 break; 3285 3286 case 'S': 3287 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; 3288 tok->type = TK_CHAR_TYPE; 3289 tok->u.prop.ctype = ONIGENC_CTYPE_SPACE; 3290 tok->u.prop.not = 1; 3291 break; 3292 3293 case 'd': 3294 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; 3295 tok->type = TK_CHAR_TYPE; 3296 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; 3297 tok->u.prop.not = 0; 3298 break; 3299 3300 case 'D': 3301 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; 3302 tok->type = TK_CHAR_TYPE; 3303 tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT; 3304 tok->u.prop.not = 1; 3305 break; 3306 3307 case 'h': 3308 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; 3309 tok->type = TK_CHAR_TYPE; 3310 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; 3311 tok->u.prop.not = 0; 3312 break; 3313 3314 case 'H': 3315 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break; 3316 tok->type = TK_CHAR_TYPE; 3317 tok->u.prop.ctype = ONIGENC_CTYPE_XDIGIT; 3318 tok->u.prop.not = 1; 3319 break; 3320 3321 case 'A': 3322 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; 3323 begin_buf: 3324 tok->type = TK_ANCHOR; 3325 tok->u.subtype = ANCHOR_BEGIN_BUF; 3326 break; 3327 3328 case 'Z': 3329 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; 3330 tok->type = TK_ANCHOR; 3331 tok->u.subtype = ANCHOR_SEMI_END_BUF; 3332 break; 3333 3334 case 'z': 3335 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; 3336 end_buf: 3337 tok->type = TK_ANCHOR; 3338 tok->u.subtype = ANCHOR_END_BUF; 3339 break; 3340 3341 case 'G': 3342 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break; 3343 tok->type = TK_ANCHOR; 3344 tok->u.subtype = ANCHOR_BEGIN_POSITION; 3345 break; 3346 3347 case '`': 3348 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break; 3349 goto begin_buf; 3350 break; 3351 3352 case '\'': 3353 if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break; 3354 goto end_buf; 3355 break; 3356 3357 case 'x': 3358 if (PEND) break; 3359 3360 prev = p; 3361 if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { 3362 PINC; 3363 num = scan_unsigned_hexadecimal_number(&p, end, 8, enc); 3364 if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; 3365 if (!PEND) { 3366 if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK)) 3367 return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; 3368 } 3369 3370 if ((p > prev + enclen(enc, prev)) && !PEND && PPEEK_IS('}')) { 3371 PINC; 3372 tok->type = TK_CODE_POINT; 3373 tok->u.code = (OnigCodePoint )num; 3374 } 3375 else { 3376 /* can't read nothing or invalid format */ 3377 p = prev; 3378 } 3379 } 3380 else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { 3381 num = scan_unsigned_hexadecimal_number(&p, end, 2, enc); 3382 if (num < 0) return ONIGERR_TOO_BIG_NUMBER; 3383 if (p == prev) { /* can't read nothing. */ 3384 num = 0; /* but, it's not error */ 3385 } 3386 tok->type = TK_RAW_BYTE; 3387 tok->base = 16; 3388 tok->u.c = num; 3389 } 3390 break; 3391 3392 case 'u': 3393 if (PEND) break; 3394 3395 prev = p; 3396 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { 3397 num = scan_unsigned_hexadecimal_number(&p, end, 4, enc); 3398 if (num < 0) return ONIGERR_TOO_BIG_NUMBER; 3399 if (p == prev) { /* can't read nothing. */ 3400 num = 0; /* but, it's not error */ 3401 } 3402 tok->type = TK_CODE_POINT; 3403 tok->base = 16; 3404 tok->u.code = (OnigCodePoint )num; 3405 } 3406 break; 3407 3408 case '1': case '2': case '3': case '4': 3409 case '5': case '6': case '7': case '8': case '9': 3410 PUNFETCH; 3411 prev = p; 3412 num = onig_scan_unsigned_number(&p, end, enc); 3413 if (num < 0 || num > ONIG_MAX_BACKREF_NUM) { 3414 goto skip_backref; 3415 } 3416 3417 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) && 3418 (num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */ 3419 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { 3420 if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num])) 3421 return ONIGERR_INVALID_BACKREF; 3422 } 3423 3424 tok->type = TK_BACKREF; 3425 tok->u.backref.num = 1; 3426 tok->u.backref.ref1 = num; 3427 tok->u.backref.by_name = 0; 3428 #ifdef USE_BACKREF_WITH_LEVEL 3429 tok->u.backref.exist_level = 0; 3430 #endif 3431 break; 3432 } 3433 3434 skip_backref: 3435 if (c == '8' || c == '9') { 3436 /* normal char */ 3437 p = prev; PINC; 3438 break; 3439 } 3440 3441 p = prev; 3442 /* fall through */ 3443 case '0': 3444 if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { 3445 prev = p; 3446 num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc); 3447 if (num < 0) return ONIGERR_TOO_BIG_NUMBER; 3448 if (p == prev) { /* can't read nothing. */ 3449 num = 0; /* but, it's not error */ 3450 } 3451 tok->type = TK_RAW_BYTE; 3452 tok->base = 8; 3453 tok->u.c = num; 3454 } 3455 else if (c != '0') { 3456 PINC; 3457 } 3458 break; 3459 3460 #ifdef USE_NAMED_GROUP 3461 case 'k': 3462 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) { 3463 PFETCH(c); 3464 if (c == '<' || c == '\'') { 3465 UChar* name_end; 3466 int* backs; 3467 int back_num; 3468 3469 prev = p; 3470 3471 #ifdef USE_BACKREF_WITH_LEVEL 3472 name_end = NULL_UCHARP; /* no need. escape gcc warning. */ 3473 r = fetch_name_with_level((OnigCodePoint )c, &p, end, &name_end, 3474 env, &back_num, &tok->u.backref.level); 3475 if (r == 1) tok->u.backref.exist_level = 1; 3476 else tok->u.backref.exist_level = 0; 3477 #else 3478 r = fetch_name(&p, end, &name_end, env, &back_num, 1); 3479 #endif 3480 if (r < 0) return r; 3481 3482 if (back_num != 0) { 3483 if (back_num < 0) { 3484 back_num = BACKREF_REL_TO_ABS(back_num, env); 3485 if (back_num <= 0) 3486 return ONIGERR_INVALID_BACKREF; 3487 } 3488 3489 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { 3490 if (back_num > env->num_mem || 3491 IS_NULL(SCANENV_MEM_NODES(env)[back_num])) 3492 return ONIGERR_INVALID_BACKREF; 3493 } 3494 tok->type = TK_BACKREF; 3495 tok->u.backref.by_name = 0; 3496 tok->u.backref.num = 1; 3497 tok->u.backref.ref1 = back_num; 3498 } 3499 else { 3500 num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); 3501 if (num <= 0) { 3502 onig_scan_env_set_error_string(env, 3503 ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end); 3504 return ONIGERR_UNDEFINED_NAME_REFERENCE; 3505 } 3506 if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { 3507 int i; 3508 for (i = 0; i < num; i++) { 3509 if (backs[i] > env->num_mem || 3510 IS_NULL(SCANENV_MEM_NODES(env)[backs[i]])) 3511 return ONIGERR_INVALID_BACKREF; 3512 } 3513 } 3514 3515 tok->type = TK_BACKREF; 3516 tok->u.backref.by_name = 1; 3517 if (num == 1) { 3518 tok->u.backref.num = 1; 3519 tok->u.backref.ref1 = backs[0]; 3520 } 3521 else { 3522 tok->u.backref.num = num; 3523 tok->u.backref.refs = backs; 3524 } 3525 } 3526 } 3527 else 3528 PUNFETCH; 3529 } 3530 break; 3531 #endif 3532 3533 #ifdef USE_SUBEXP_CALL 3534 case 'g': 3535 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) { 3536 PFETCH(c); 3537 if (c == '<' || c == '\'') { 3538 int gnum; 3539 UChar* name_end; 3540 3541 prev = p; 3542 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &gnum, 1); 3543 if (r < 0) return r; 3544 3545 tok->type = TK_CALL; 3546 tok->u.call.name = prev; 3547 tok->u.call.name_end = name_end; 3548 tok->u.call.gnum = gnum; 3549 } 3550 else 3551 PUNFETCH; 3552 } 3553 break; 3554 #endif 3555 3556 case 'Q': 3557 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) { 3558 tok->type = TK_QUOTE_OPEN; 3559 } 3560 break; 3561 3562 case 'p': 3563 case 'P': 3564 if (PPEEK_IS('{') && 3565 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { 3566 PINC; 3567 tok->type = TK_CHAR_PROPERTY; 3568 tok->u.prop.not = (c == 'P' ? 1 : 0); 3569 3570 if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { 3571 PFETCH(c); 3572 if (c == '^') { 3573 tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0); 3574 } 3575 else 3576 PUNFETCH; 3577 } 3578 } 3579 break; 3580 3581 default: 3582 PUNFETCH; 3583 num = fetch_escaped_value(&p, end, env); 3584 if (num < 0) return num; 3585 /* set_raw: */ 3586 if (tok->u.c != num) { 3587 tok->type = TK_CODE_POINT; 3588 tok->u.code = (OnigCodePoint )num; 3589 } 3590 else { /* string */ 3591 p = tok->backp + enclen(enc, tok->backp); 3592 } 3593 break; 3594 } 3595 } 3596 else { 3597 tok->u.c = c; 3598 tok->escaped = 0; 3599 3600 #ifdef USE_VARIABLE_META_CHARS 3601 if ((c != ONIG_INEFFECTIVE_META_CHAR) && 3602 IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) { 3603 if (c == MC_ANYCHAR(syn)) 3604 goto any_char; 3605 else if (c == MC_ANYTIME(syn)) 3606 goto anytime; 3607 else if (c == MC_ZERO_OR_ONE_TIME(syn)) 3608 goto zero_or_one_time; 3609 else if (c == MC_ONE_OR_MORE_TIME(syn)) 3610 goto one_or_more_time; 3611 else if (c == MC_ANYCHAR_ANYTIME(syn)) { 3612 tok->type = TK_ANYCHAR_ANYTIME; 3613 goto out; 3614 } 3615 } 3616 #endif 3617 3618 switch (c) { 3619 case '.': 3620 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break; 3621 #ifdef USE_VARIABLE_META_CHARS 3622 any_char: 3623 #endif 3624 tok->type = TK_ANYCHAR; 3625 break; 3626 3627 case '*': 3628 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break; 3629 #ifdef USE_VARIABLE_META_CHARS 3630 anytime: 3631 #endif 3632 tok->type = TK_OP_REPEAT; 3633 tok->u.repeat.lower = 0; 3634 tok->u.repeat.upper = REPEAT_INFINITE; 3635 goto greedy_check; 3636 break; 3637 3638 case '+': 3639 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break; 3640 #ifdef USE_VARIABLE_META_CHARS 3641 one_or_more_time: 3642 #endif 3643 tok->type = TK_OP_REPEAT; 3644 tok->u.repeat.lower = 1; 3645 tok->u.repeat.upper = REPEAT_INFINITE; 3646 goto greedy_check; 3647 break; 3648 3649 case '?': 3650 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break; 3651 #ifdef USE_VARIABLE_META_CHARS 3652 zero_or_one_time: 3653 #endif 3654 tok->type = TK_OP_REPEAT; 3655 tok->u.repeat.lower = 0; 3656 tok->u.repeat.upper = 1; 3657 goto greedy_check; 3658 break; 3659 3660 case '{': 3661 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break; 3662 r = fetch_range_quantifier(&p, end, tok, env); 3663 if (r < 0) return r; /* error */ 3664 if (r == 0) goto greedy_check; 3665 else if (r == 2) { /* {n} */ 3666 if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) 3667 goto possessive_check; 3668 3669 goto greedy_check; 3670 } 3671 /* r == 1 : normal char */ 3672 break; 3673 3674 case '|': 3675 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break; 3676 tok->type = TK_ALT; 3677 break; 3678 3679 case '(': 3680 if (PPEEK_IS('?') && 3681 IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { 3682 PINC; 3683 if (PPEEK_IS('#')) { 3684 PFETCH(c); 3685 while (1) { 3686 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 3687 PFETCH(c); 3688 if (c == MC_ESC(syn)) { 3689 if (!PEND) PFETCH(c); 3690 } 3691 else { 3692 if (c == ')') break; 3693 } 3694 } 3695 goto start; 3696 } 3697 PUNFETCH; 3698 } 3699 3700 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; 3701 tok->type = TK_SUBEXP_OPEN; 3702 break; 3703 3704 case ')': 3705 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; 3706 tok->type = TK_SUBEXP_CLOSE; 3707 break; 3708 3709 case '^': 3710 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break; 3711 tok->type = TK_ANCHOR; 3712 tok->u.subtype = (IS_SINGLELINE(env->option) 3713 ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE); 3714 break; 3715 3716 case '$': 3717 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break; 3718 tok->type = TK_ANCHOR; 3719 tok->u.subtype = (IS_SINGLELINE(env->option) 3720 ? ANCHOR_SEMI_END_BUF : ANCHOR_END_LINE); 3721 break; 3722 3723 case '[': 3724 if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break; 3725 tok->type = TK_CC_OPEN; 3726 break; 3727 3728 case ']': 3729 if (*src > env->pattern) /* /].../ is allowed. */ 3730 CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]"); 3731 break; 3732 3733 case '#': 3734 if (IS_EXTEND(env->option)) { 3735 while (!PEND) { 3736 PFETCH(c); 3737 if (ONIGENC_IS_CODE_NEWLINE(enc, c)) 3738 break; 3739 } 3740 goto start; 3741 break; 3742 } 3743 break; 3744 3745 case ' ': case '\t': case '\n': case '\r': case '\f': 3746 if (IS_EXTEND(env->option)) 3747 goto start; 3748 break; 3749 3750 default: 3751 /* string */ 3752 break; 3753 } 3754 } 3755 3756 #ifdef USE_VARIABLE_META_CHARS 3757 out: 3758 #endif 3759 *src = p; 3760 return tok->type; 3761 } 3762 3763 static int 3764 add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not, 3765 OnigEncoding enc ARG_UNUSED, 3766 OnigCodePoint sb_out, const OnigCodePoint mbr[]) 3767 { 3768 int i, r; 3769 OnigCodePoint j; 3770 3771 int n = ONIGENC_CODE_RANGE_NUM(mbr); 3772 3773 if (not == 0) { 3774 for (i = 0; i < n; i++) { 3775 for (j = ONIGENC_CODE_RANGE_FROM(mbr, i); 3776 j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) { 3777 if (j >= sb_out) { 3778 if (j == ONIGENC_CODE_RANGE_TO(mbr, i)) i++; 3779 else if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) { 3780 r = add_code_range_to_buf(&(cc->mbuf), j, 3781 ONIGENC_CODE_RANGE_TO(mbr, i)); 3782 if (r != 0) return r; 3783 i++; 3784 } 3785 3786 goto sb_end; 3787 } 3788 BITSET_SET_BIT(cc->bs, j); 3789 } 3790 } 3791 3792 sb_end: 3793 for ( ; i < n; i++) { 3794 r = add_code_range_to_buf(&(cc->mbuf), 3795 ONIGENC_CODE_RANGE_FROM(mbr, i), 3796 ONIGENC_CODE_RANGE_TO(mbr, i)); 3797 if (r != 0) return r; 3798 } 3799 } 3800 else { 3801 OnigCodePoint prev = 0; 3802 3803 for (i = 0; i < n; i++) { 3804 for (j = prev; 3805 j < ONIGENC_CODE_RANGE_FROM(mbr, i); j++) { 3806 if (j >= sb_out) { 3807 goto sb_end2; 3808 } 3809 BITSET_SET_BIT(cc->bs, j); 3810 } 3811 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1; 3812 } 3813 for (j = prev; j < sb_out; j++) { 3814 BITSET_SET_BIT(cc->bs, j); 3815 } 3816 3817 sb_end2: 3818 prev = sb_out; 3819 3820 for (i = 0; i < n; i++) { 3821 if (prev < ONIGENC_CODE_RANGE_FROM(mbr, i)) { 3822 r = add_code_range_to_buf(&(cc->mbuf), prev, 3823 ONIGENC_CODE_RANGE_FROM(mbr, i) - 1); 3824 if (r != 0) return r; 3825 } 3826 prev = ONIGENC_CODE_RANGE_TO(mbr, i) + 1; 3827 } 3828 if (prev < 0x7fffffff) { 3829 r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff); 3830 if (r != 0) return r; 3831 } 3832 } 3833 3834 return 0; 3835 } 3836 3837 static int 3838 add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) 3839 { 3840 int c, r; 3841 const OnigCodePoint *ranges; 3842 OnigCodePoint sb_out; 3843 OnigEncoding enc = env->enc; 3844 3845 r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges); 3846 if (r == 0) { 3847 return add_ctype_to_cc_by_range(cc, ctype, not, env->enc, sb_out, ranges); 3848 } 3849 else if (r != ONIG_NO_SUPPORT_CONFIG) { 3850 return r; 3851 } 3852 3853 r = 0; 3854 switch (ctype) { 3855 case ONIGENC_CTYPE_ALPHA: 3856 case ONIGENC_CTYPE_BLANK: 3857 case ONIGENC_CTYPE_CNTRL: 3858 case ONIGENC_CTYPE_DIGIT: 3859 case ONIGENC_CTYPE_LOWER: 3860 case ONIGENC_CTYPE_PUNCT: 3861 case ONIGENC_CTYPE_SPACE: 3862 case ONIGENC_CTYPE_UPPER: 3863 case ONIGENC_CTYPE_XDIGIT: 3864 case ONIGENC_CTYPE_ASCII: 3865 case ONIGENC_CTYPE_ALNUM: 3866 if (not != 0) { 3867 for (c = 0; c < SINGLE_BYTE_SIZE; c++) { 3868 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) 3869 BITSET_SET_BIT(cc->bs, c); 3870 } 3871 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); 3872 } 3873 else { 3874 for (c = 0; c < SINGLE_BYTE_SIZE; c++) { 3875 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) 3876 BITSET_SET_BIT(cc->bs, c); 3877 } 3878 } 3879 break; 3880 3881 case ONIGENC_CTYPE_GRAPH: 3882 case ONIGENC_CTYPE_PRINT: 3883 if (not != 0) { 3884 for (c = 0; c < SINGLE_BYTE_SIZE; c++) { 3885 if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) 3886 BITSET_SET_BIT(cc->bs, c); 3887 } 3888 } 3889 else { 3890 for (c = 0; c < SINGLE_BYTE_SIZE; c++) { 3891 if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) 3892 BITSET_SET_BIT(cc->bs, c); 3893 } 3894 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); 3895 } 3896 break; 3897 3898 case ONIGENC_CTYPE_WORD: 3899 if (not == 0) { 3900 for (c = 0; c < SINGLE_BYTE_SIZE; c++) { 3901 if (IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c); 3902 } 3903 ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); 3904 } 3905 else { 3906 for (c = 0; c < SINGLE_BYTE_SIZE; c++) { 3907 if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */ 3908 && ! ONIGENC_IS_CODE_WORD(enc, c)) 3909 BITSET_SET_BIT(cc->bs, c); 3910 } 3911 } 3912 break; 3913 3914 default: 3915 return ONIGERR_PARSER_BUG; 3916 break; 3917 } 3918 3919 return r; 3920 } 3921 3922 static int 3923 parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) 3924 { 3925 #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20 3926 #define POSIX_BRACKET_NAME_MIN_LEN 4 3927 3928 static PosixBracketEntryType PBS[] = { 3929 { (UChar* )"alnum", ONIGENC_CTYPE_ALNUM, 5 }, 3930 { (UChar* )"alpha", ONIGENC_CTYPE_ALPHA, 5 }, 3931 { (UChar* )"blank", ONIGENC_CTYPE_BLANK, 5 }, 3932 { (UChar* )"cntrl", ONIGENC_CTYPE_CNTRL, 5 }, 3933 { (UChar* )"digit", ONIGENC_CTYPE_DIGIT, 5 }, 3934 { (UChar* )"graph", ONIGENC_CTYPE_GRAPH, 5 }, 3935 { (UChar* )"lower", ONIGENC_CTYPE_LOWER, 5 }, 3936 { (UChar* )"print", ONIGENC_CTYPE_PRINT, 5 }, 3937 { (UChar* )"punct", ONIGENC_CTYPE_PUNCT, 5 }, 3938 { (UChar* )"space", ONIGENC_CTYPE_SPACE, 5 }, 3939 { (UChar* )"upper", ONIGENC_CTYPE_UPPER, 5 }, 3940 { (UChar* )"xdigit", ONIGENC_CTYPE_XDIGIT, 6 }, 3941 { (UChar* )"ascii", ONIGENC_CTYPE_ASCII, 5 }, 3942 { (UChar* )"word", ONIGENC_CTYPE_WORD, 4 }, 3943 { (UChar* )NULL, -1, 0 } 3944 }; 3945 3946 PosixBracketEntryType *pb; 3947 int not, i, r; 3948 OnigCodePoint c; 3949 OnigEncoding enc = env->enc; 3950 UChar *p = *src; 3951 3952 if (PPEEK_IS('^')) { 3953 PINC_S; 3954 not = 1; 3955 } 3956 else 3957 not = 0; 3958 3959 if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MIN_LEN + 3) 3960 goto not_posix_bracket; 3961 3962 for (pb = PBS; IS_NOT_NULL(pb->name); pb++) { 3963 if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) { 3964 p = (UChar* )onigenc_step(enc, p, end, pb->len); 3965 if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0) 3966 return ONIGERR_INVALID_POSIX_BRACKET_TYPE; 3967 3968 r = add_ctype_to_cc(cc, pb->ctype, not, env); 3969 if (r != 0) return r; 3970 3971 PINC_S; PINC_S; 3972 *src = p; 3973 return 0; 3974 } 3975 } 3976 3977 not_posix_bracket: 3978 c = 0; 3979 i = 0; 3980 while (!PEND && ((c = PPEEK) != ':') && c != ']') { 3981 PINC_S; 3982 if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break; 3983 } 3984 if (c == ':' && ! PEND) { 3985 PINC_S; 3986 if (! PEND) { 3987 PFETCH_S(c); 3988 if (c == ']') 3989 return ONIGERR_INVALID_POSIX_BRACKET_TYPE; 3990 } 3991 } 3992 3993 return 1; /* 1: is not POSIX bracket, but no error. */ 3994 } 3995 3996 static int 3997 fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) 3998 { 3999 int r; 4000 OnigCodePoint c; 4001 OnigEncoding enc = env->enc; 4002 UChar *prev, *start, *p = *src; 4003 4004 r = 0; 4005 start = prev = p; 4006 4007 while (!PEND) { 4008 prev = p; 4009 PFETCH_S(c); 4010 if (c == '}') { 4011 r = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, start, prev); 4012 if (r < 0) break; 4013 4014 *src = p; 4015 return r; 4016 } 4017 else if (c == '(' || c == ')' || c == '{' || c == '|') { 4018 r = ONIGERR_INVALID_CHAR_PROPERTY_NAME; 4019 break; 4020 } 4021 } 4022 4023 onig_scan_env_set_error_string(env, r, *src, prev); 4024 return r; 4025 } 4026 4027 static int 4028 parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end, 4029 ScanEnv* env) 4030 { 4031 int r, ctype; 4032 CClassNode* cc; 4033 4034 ctype = fetch_char_property_to_ctype(src, end, env); 4035 if (ctype < 0) return ctype; 4036 4037 *np = node_new_cclass(); 4038 CHECK_NULL_RETURN_MEMERR(*np); 4039 cc = NCCLASS(*np); 4040 r = add_ctype_to_cc(cc, ctype, 0, env); 4041 if (r != 0) return r; 4042 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); 4043 4044 return 0; 4045 } 4046 4047 4048 enum CCSTATE { 4049 CCS_VALUE, 4050 CCS_RANGE, 4051 CCS_COMPLETE, 4052 CCS_START 4053 }; 4054 4055 enum CCVALTYPE { 4056 CCV_SB, 4057 CCV_CODE_POINT, 4058 CCV_CLASS 4059 }; 4060 4061 static int 4062 next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type, 4063 enum CCSTATE* state, ScanEnv* env) 4064 { 4065 int r; 4066 4067 if (*state == CCS_RANGE) 4068 return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE; 4069 4070 if (*state == CCS_VALUE && *type != CCV_CLASS) { 4071 if (*type == CCV_SB) 4072 BITSET_SET_BIT(cc->bs, (int )(*vs)); 4073 else if (*type == CCV_CODE_POINT) { 4074 r = add_code_range(&(cc->mbuf), env, *vs, *vs); 4075 if (r < 0) return r; 4076 } 4077 } 4078 4079 *state = CCS_VALUE; 4080 *type = CCV_CLASS; 4081 return 0; 4082 } 4083 4084 static int 4085 next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, 4086 int* vs_israw, int v_israw, 4087 enum CCVALTYPE intype, enum CCVALTYPE* type, 4088 enum CCSTATE* state, ScanEnv* env) 4089 { 4090 int r; 4091 4092 switch (*state) { 4093 case CCS_VALUE: 4094 if (*type == CCV_SB) 4095 BITSET_SET_BIT(cc->bs, (int )(*vs)); 4096 else if (*type == CCV_CODE_POINT) { 4097 r = add_code_range(&(cc->mbuf), env, *vs, *vs); 4098 if (r < 0) return r; 4099 } 4100 break; 4101 4102 case CCS_RANGE: 4103 if (intype == *type) { 4104 if (intype == CCV_SB) { 4105 if (*vs > 0xff || v > 0xff) 4106 return ONIGERR_INVALID_CODE_POINT_VALUE; 4107 4108 if (*vs > v) { 4109 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) 4110 goto ccs_range_end; 4111 else 4112 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; 4113 } 4114 bitset_set_range(cc->bs, (int )*vs, (int )v); 4115 } 4116 else { 4117 r = add_code_range(&(cc->mbuf), env, *vs, v); 4118 if (r < 0) return r; 4119 } 4120 } 4121 else { 4122 #if 0 4123 if (intype == CCV_CODE_POINT && *type == CCV_SB) { 4124 #endif 4125 if (*vs > v) { 4126 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) 4127 goto ccs_range_end; 4128 else 4129 return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; 4130 } 4131 bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff)); 4132 r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v); 4133 if (r < 0) return r; 4134 #if 0 4135 } 4136 else 4137 return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE; 4138 #endif 4139 } 4140 ccs_range_end: 4141 *state = CCS_COMPLETE; 4142 break; 4143 4144 case CCS_COMPLETE: 4145 case CCS_START: 4146 *state = CCS_VALUE; 4147 break; 4148 4149 default: 4150 break; 4151 } 4152 4153 *vs_israw = v_israw; 4154 *vs = v; 4155 *type = intype; 4156 return 0; 4157 } 4158 4159 static int 4160 code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, 4161 ScanEnv* env) 4162 { 4163 int in_esc; 4164 OnigCodePoint code; 4165 OnigEncoding enc = env->enc; 4166 UChar* p = from; 4167 4168 in_esc = 0; 4169 while (! PEND) { 4170 if (ignore_escaped && in_esc) { 4171 in_esc = 0; 4172 } 4173 else { 4174 PFETCH_S(code); 4175 if (code == c) return 1; 4176 if (code == MC_ESC(env->syntax)) in_esc = 1; 4177 } 4178 } 4179 return 0; 4180 } 4181 4182 static int 4183 parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, 4184 ScanEnv* env) 4185 { 4186 int r, neg, len, fetched, and_start; 4187 OnigCodePoint v, vs; 4188 UChar *p; 4189 Node* node; 4190 CClassNode *cc, *prev_cc; 4191 CClassNode work_cc; 4192 4193 enum CCSTATE state; 4194 enum CCVALTYPE val_type, in_type; 4195 int val_israw, in_israw; 4196 4197 prev_cc = (CClassNode* )NULL; 4198 *np = NULL_NODE; 4199 r = fetch_token_in_cc(tok, src, end, env); 4200 if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) { 4201 neg = 1; 4202 r = fetch_token_in_cc(tok, src, end, env); 4203 } 4204 else { 4205 neg = 0; 4206 } 4207 4208 if (r < 0) return r; 4209 if (r == TK_CC_CLOSE) { 4210 if (! code_exist_check((OnigCodePoint )']', 4211 *src, env->pattern_end, 1, env)) 4212 return ONIGERR_EMPTY_CHAR_CLASS; 4213 4214 CC_ESC_WARN(env, (UChar* )"]"); 4215 r = tok->type = TK_CHAR; /* allow []...] */ 4216 } 4217 4218 *np = node = node_new_cclass(); 4219 CHECK_NULL_RETURN_MEMERR(node); 4220 cc = NCCLASS(node); 4221 4222 and_start = 0; 4223 state = CCS_START; 4224 p = *src; 4225 while (r != TK_CC_CLOSE) { 4226 fetched = 0; 4227 switch (r) { 4228 case TK_CHAR: 4229 len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c); 4230 if (len > 1) { 4231 in_type = CCV_CODE_POINT; 4232 } 4233 else if (len < 0) { 4234 r = len; 4235 goto err; 4236 } 4237 else { 4238 sb_char: 4239 in_type = CCV_SB; 4240 } 4241 v = (OnigCodePoint )tok->u.c; 4242 in_israw = 0; 4243 goto val_entry2; 4244 break; 4245 4246 case TK_RAW_BYTE: 4247 /* tok->base != 0 : octal or hexadec. */ 4248 if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) { 4249 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; 4250 UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN; 4251 UChar* psave = p; 4252 int i, base = tok->base; 4253 4254 buf[0] = (UChar)tok->u.c; 4255 for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) { 4256 r = fetch_token_in_cc(tok, &p, end, env); 4257 if (r < 0) goto err; 4258 if (r != TK_RAW_BYTE || tok->base != base) { 4259 fetched = 1; 4260 break; 4261 } 4262 buf[i] = (UChar)tok->u.c; 4263 } 4264 4265 if (i < ONIGENC_MBC_MINLEN(env->enc)) { 4266 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; 4267 goto err; 4268 } 4269 4270 len = enclen(env->enc, buf); 4271 if (i < len) { 4272 r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; 4273 goto err; 4274 } 4275 else if (i > len) { /* fetch back */ 4276 p = psave; 4277 for (i = 1; i < len; i++) { 4278 r = fetch_token_in_cc(tok, &p, end, env); 4279 } 4280 fetched = 0; 4281 } 4282 4283 if (i == 1) { 4284 v = (OnigCodePoint )buf[0]; 4285 goto raw_single; 4286 } 4287 else { 4288 v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe); 4289 in_type = CCV_CODE_POINT; 4290 } 4291 } 4292 else { 4293 v = (OnigCodePoint )tok->u.c; 4294 raw_single: 4295 in_type = CCV_SB; 4296 } 4297 in_israw = 1; 4298 goto val_entry2; 4299 break; 4300 4301 case TK_CODE_POINT: 4302 v = tok->u.code; 4303 in_israw = 1; 4304 val_entry: 4305 len = ONIGENC_CODE_TO_MBCLEN(env->enc, v); 4306 if (len < 0) { 4307 r = len; 4308 goto err; 4309 } 4310 in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT); 4311 val_entry2: 4312 r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type, 4313 &state, env); 4314 if (r != 0) goto err; 4315 break; 4316 4317 case TK_POSIX_BRACKET_OPEN: 4318 r = parse_posix_bracket(cc, &p, end, env); 4319 if (r < 0) goto err; 4320 if (r == 1) { /* is not POSIX bracket */ 4321 CC_ESC_WARN(env, (UChar* )"["); 4322 p = tok->backp; 4323 v = (OnigCodePoint )tok->u.c; 4324 in_israw = 0; 4325 goto val_entry; 4326 } 4327 goto next_class; 4328 break; 4329 4330 case TK_CHAR_TYPE: 4331 r = add_ctype_to_cc(cc, tok->u.prop.ctype, tok->u.prop.not, env); 4332 if (r != 0) return r; 4333 4334 next_class: 4335 r = next_state_class(cc, &vs, &val_type, &state, env); 4336 if (r != 0) goto err; 4337 break; 4338 4339 case TK_CHAR_PROPERTY: 4340 { 4341 int ctype; 4342 4343 ctype = fetch_char_property_to_ctype(&p, end, env); 4344 if (ctype < 0) return ctype; 4345 r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env); 4346 if (r != 0) return r; 4347 goto next_class; 4348 } 4349 break; 4350 4351 case TK_CC_RANGE: 4352 if (state == CCS_VALUE) { 4353 r = fetch_token_in_cc(tok, &p, end, env); 4354 if (r < 0) goto err; 4355 fetched = 1; 4356 if (r == TK_CC_CLOSE) { /* allow [x-] */ 4357 range_end_val: 4358 v = (OnigCodePoint )'-'; 4359 in_israw = 0; 4360 goto val_entry; 4361 } 4362 else if (r == TK_CC_AND) { 4363 CC_ESC_WARN(env, (UChar* )"-"); 4364 goto range_end_val; 4365 } 4366 state = CCS_RANGE; 4367 } 4368 else if (state == CCS_START) { 4369 /* [-xa] is allowed */ 4370 v = (OnigCodePoint )tok->u.c; 4371 in_israw = 0; 4372 4373 r = fetch_token_in_cc(tok, &p, end, env); 4374 if (r < 0) goto err; 4375 fetched = 1; 4376 /* [--x] or [a&&-x] is warned. */ 4377 if (r == TK_CC_RANGE || and_start != 0) 4378 CC_ESC_WARN(env, (UChar* )"-"); 4379 4380 goto val_entry; 4381 } 4382 else if (state == CCS_RANGE) { 4383 CC_ESC_WARN(env, (UChar* )"-"); 4384 goto sb_char; /* [!--x] is allowed */ 4385 } 4386 else { /* CCS_COMPLETE */ 4387 r = fetch_token_in_cc(tok, &p, end, env); 4388 if (r < 0) goto err; 4389 fetched = 1; 4390 if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */ 4391 else if (r == TK_CC_AND) { 4392 CC_ESC_WARN(env, (UChar* )"-"); 4393 goto range_end_val; 4394 } 4395 4396 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) { 4397 CC_ESC_WARN(env, (UChar* )"-"); 4398 goto sb_char; /* [0-9-a] is allowed as [0-9\-a] */ 4399 } 4400 r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS; 4401 goto err; 4402 } 4403 break; 4404 4405 case TK_CC_CC_OPEN: /* [ */ 4406 { 4407 Node *anode; 4408 CClassNode* acc; 4409 4410 r = parse_char_class(&anode, tok, &p, end, env); 4411 if (r != 0) goto cc_open_err; 4412 acc = NCCLASS(anode); 4413 r = or_cclass(cc, acc, env->enc); 4414 4415 onig_node_free(anode); 4416 cc_open_err: 4417 if (r != 0) goto err; 4418 } 4419 break; 4420 4421 case TK_CC_AND: /* && */ 4422 { 4423 if (state == CCS_VALUE) { 4424 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type, 4425 &val_type, &state, env); 4426 if (r != 0) goto err; 4427 } 4428 /* initialize local variables */ 4429 and_start = 1; 4430 state = CCS_START; 4431 4432 if (IS_NOT_NULL(prev_cc)) { 4433 r = and_cclass(prev_cc, cc, env->enc); 4434 if (r != 0) goto err; 4435 bbuf_free(cc->mbuf); 4436 } 4437 else { 4438 prev_cc = cc; 4439 cc = &work_cc; 4440 } 4441 initialize_cclass(cc); 4442 } 4443 break; 4444 4445 case TK_EOT: 4446 r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS; 4447 goto err; 4448 break; 4449 default: 4450 r = ONIGERR_PARSER_BUG; 4451 goto err; 4452 break; 4453 } 4454 4455 if (fetched) 4456 r = tok->type; 4457 else { 4458 r = fetch_token_in_cc(tok, &p, end, env); 4459 if (r < 0) goto err; 4460 } 4461 } 4462 4463 if (state == CCS_VALUE) { 4464 r = next_state_val(cc, &vs, 0, &val_israw, 0, val_type, 4465 &val_type, &state, env); 4466 if (r != 0) goto err; 4467 } 4468 4469 if (IS_NOT_NULL(prev_cc)) { 4470 r = and_cclass(prev_cc, cc, env->enc); 4471 if (r != 0) goto err; 4472 bbuf_free(cc->mbuf); 4473 cc = prev_cc; 4474 } 4475 4476 if (neg != 0) 4477 NCCLASS_SET_NOT(cc); 4478 else 4479 NCCLASS_CLEAR_NOT(cc); 4480 if (IS_NCCLASS_NOT(cc) && 4481 IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) { 4482 int is_empty; 4483 4484 is_empty = (IS_NULL(cc->mbuf) ? 1 : 0); 4485 if (is_empty != 0) 4486 BITSET_IS_EMPTY(cc->bs, is_empty); 4487 4488 if (is_empty == 0) { 4489 #define NEWLINE_CODE 0x0a 4490 4491 if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) { 4492 if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1) 4493 BITSET_SET_BIT(cc->bs, NEWLINE_CODE); 4494 else 4495 add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE); 4496 } 4497 } 4498 } 4499 *src = p; 4500 return 0; 4501 4502 err: 4503 if (cc != NCCLASS(*np)) 4504 bbuf_free(cc->mbuf); 4505 onig_node_free(*np); 4506 return r; 4507 } 4508 4509 static int parse_subexp(Node** top, OnigToken* tok, int term, 4510 UChar** src, UChar* end, ScanEnv* env); 4511 4512 static int 4513 parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, 4514 ScanEnv* env) 4515 { 4516 int r, num; 4517 Node *target; 4518 OnigOptionType option; 4519 OnigCodePoint c; 4520 OnigEncoding enc = env->enc; 4521 4522 #ifdef USE_NAMED_GROUP 4523 int list_capture; 4524 #endif 4525 4526 UChar* p = *src; 4527 PFETCH_READY; 4528 4529 *np = NULL; 4530 if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; 4531 4532 option = env->option; 4533 if (PPEEK_IS('?') && 4534 IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { 4535 PINC; 4536 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 4537 4538 PFETCH(c); 4539 switch (c) { 4540 case ':': /* (?:...) grouping only */ 4541 group: 4542 r = fetch_token(tok, &p, end, env); 4543 if (r < 0) return r; 4544 r = parse_subexp(np, tok, term, &p, end, env); 4545 if (r < 0) return r; 4546 *src = p; 4547 return 1; /* group */ 4548 break; 4549 4550 case '=': 4551 *np = onig_node_new_anchor(ANCHOR_PREC_READ); 4552 break; 4553 case '!': /* preceding read */ 4554 *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT); 4555 break; 4556 case '>': /* (?>...) stop backtrack */ 4557 *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK); 4558 break; 4559 4560 #ifdef USE_NAMED_GROUP 4561 case '\'': 4562 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { 4563 goto named_group1; 4564 } 4565 else 4566 return ONIGERR_UNDEFINED_GROUP_OPTION; 4567 break; 4568 #endif 4569 4570 case '<': /* look behind (?<=...), (?<!...) */ 4571 PFETCH(c); 4572 if (c == '=') 4573 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND); 4574 else if (c == '!') 4575 *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND_NOT); 4576 #ifdef USE_NAMED_GROUP 4577 else { 4578 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { 4579 UChar *name; 4580 UChar *name_end; 4581 4582 PUNFETCH; 4583 c = '<'; 4584 4585 named_group1: 4586 list_capture = 0; 4587 4588 named_group2: 4589 name = p; 4590 r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0); 4591 if (r < 0) return r; 4592 4593 num = scan_env_add_mem_entry(env); 4594 if (num < 0) return num; 4595 if (list_capture != 0 && num >= (int )BIT_STATUS_BITS_NUM) 4596 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; 4597 4598 r = name_add(env->reg, name, name_end, num, env); 4599 if (r != 0) return r; 4600 *np = node_new_enclose_memory(env->option, 1); 4601 CHECK_NULL_RETURN_MEMERR(*np); 4602 NENCLOSE(*np)->regnum = num; 4603 if (list_capture != 0) 4604 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num); 4605 env->num_named++; 4606 } 4607 else { 4608 return ONIGERR_UNDEFINED_GROUP_OPTION; 4609 } 4610 } 4611 #else 4612 else { 4613 return ONIGERR_UNDEFINED_GROUP_OPTION; 4614 } 4615 #endif 4616 break; 4617 4618 case '@': 4619 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) { 4620 #ifdef USE_NAMED_GROUP 4621 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { 4622 PFETCH(c); 4623 if (c == '<' || c == '\'') { 4624 list_capture = 1; 4625 goto named_group2; /* (?@<name>...) */ 4626 } 4627 PUNFETCH; 4628 } 4629 #endif 4630 *np = node_new_enclose_memory(env->option, 0); 4631 CHECK_NULL_RETURN_MEMERR(*np); 4632 num = scan_env_add_mem_entry(env); 4633 if (num < 0) { 4634 onig_node_free(*np); 4635 return num; 4636 } 4637 else if (num >= (int )BIT_STATUS_BITS_NUM) { 4638 onig_node_free(*np); 4639 return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; 4640 } 4641 NENCLOSE(*np)->regnum = num; 4642 BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num); 4643 } 4644 else { 4645 return ONIGERR_UNDEFINED_GROUP_OPTION; 4646 } 4647 break; 4648 4649 #ifdef USE_POSIXLINE_OPTION 4650 case 'p': 4651 #endif 4652 case '-': case 'i': case 'm': case 's': case 'x': 4653 { 4654 int neg = 0; 4655 4656 while (1) { 4657 switch (c) { 4658 case ':': 4659 case ')': 4660 break; 4661 4662 case '-': neg = 1; break; 4663 case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break; 4664 case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break; 4665 case 's': 4666 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { 4667 ONOFF(option, ONIG_OPTION_MULTILINE, neg); 4668 } 4669 else 4670 return ONIGERR_UNDEFINED_GROUP_OPTION; 4671 break; 4672 4673 case 'm': 4674 if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { 4675 ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 1 : 0)); 4676 } 4677 else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) { 4678 ONOFF(option, ONIG_OPTION_MULTILINE, neg); 4679 } 4680 else 4681 return ONIGERR_UNDEFINED_GROUP_OPTION; 4682 break; 4683 #ifdef USE_POSIXLINE_OPTION 4684 case 'p': 4685 ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg); 4686 break; 4687 #endif 4688 default: 4689 return ONIGERR_UNDEFINED_GROUP_OPTION; 4690 } 4691 4692 if (c == ')') { 4693 *np = node_new_option(option); 4694 CHECK_NULL_RETURN_MEMERR(*np); 4695 *src = p; 4696 return 2; /* option only */ 4697 } 4698 else if (c == ':') { 4699 OnigOptionType prev = env->option; 4700 4701 env->option = option; 4702 r = fetch_token(tok, &p, end, env); 4703 if (r < 0) return r; 4704 r = parse_subexp(&target, tok, term, &p, end, env); 4705 env->option = prev; 4706 if (r < 0) return r; 4707 *np = node_new_option(option); 4708 CHECK_NULL_RETURN_MEMERR(*np); 4709 NENCLOSE(*np)->target = target; 4710 *src = p; 4711 return 0; 4712 } 4713 4714 if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; 4715 PFETCH(c); 4716 } 4717 } 4718 break; 4719 4720 default: 4721 return ONIGERR_UNDEFINED_GROUP_OPTION; 4722 } 4723 } 4724 else { 4725 if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP)) 4726 goto group; 4727 4728 *np = node_new_enclose_memory(env->option, 0); 4729 CHECK_NULL_RETURN_MEMERR(*np); 4730 num = scan_env_add_mem_entry(env); 4731 if (num < 0) return num; 4732 NENCLOSE(*np)->regnum = num; 4733 } 4734 4735 CHECK_NULL_RETURN_MEMERR(*np); 4736 r = fetch_token(tok, &p, end, env); 4737 if (r < 0) return r; 4738 r = parse_subexp(&target, tok, term, &p, end, env); 4739 if (r < 0) return r; 4740 4741 if (NTYPE(*np) == NT_ANCHOR) 4742 NANCHOR(*np)->target = target; 4743 else { 4744 NENCLOSE(*np)->target = target; 4745 if (NENCLOSE(*np)->type == ENCLOSE_MEMORY) { 4746 /* Don't move this to previous of parse_subexp() */ 4747 r = scan_env_set_mem_node(env, NENCLOSE(*np)->regnum, *np); 4748 if (r != 0) return r; 4749 } 4750 } 4751 4752 *src = p; 4753 return 0; 4754 } 4755 4756 static const char* PopularQStr[] = { 4757 "?", "*", "+", "??", "*?", "+?" 4758 }; 4759 4760 static const char* ReduceQStr[] = { 4761 "", "", "*", "*?", "??", "+ and ??", "+? and ?" 4762 }; 4763 4764 static int 4765 set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) 4766 { 4767 QtfrNode* qn; 4768 4769 qn = NQTFR(qnode); 4770 if (qn->lower == 1 && qn->upper == 1) { 4771 return 1; 4772 } 4773 4774 switch (NTYPE(target)) { 4775 case NT_STR: 4776 if (! group) { 4777 StrNode* sn = NSTR(target); 4778 if (str_node_can_be_split(sn, env->enc)) { 4779 Node* n = str_node_split_last_char(sn, env->enc); 4780 if (IS_NOT_NULL(n)) { 4781 qn->target = n; 4782 return 2; 4783 } 4784 } 4785 } 4786 break; 4787 4788 case NT_QTFR: 4789 { /* check redundant double repeat. */ 4790 /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */ 4791 QtfrNode* qnt = NQTFR(target); 4792 int nestq_num = popular_quantifier_num(qn); 4793 int targetq_num = popular_quantifier_num(qnt); 4794 if (nestq_num < 0 || targetq_num < 0) { 4795 return ONIGERR_TYPE_BUG; 4796 } 4797 4798 #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR 4799 if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) && 4800 IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) { 4801 UChar buf[WARN_BUFSIZE]; 4802 4803 switch(ReduceTypeTable[targetq_num][nestq_num]) { 4804 case RQ_ASIS: 4805 break; 4806 4807 case RQ_DEL: 4808 if (onig_verb_warn != onig_null_warn) { 4809 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, 4810 env->pattern, env->pattern_end, 4811 (UChar* )"redundant nested repeat operator"); 4812 (*onig_verb_warn)((char* )buf); 4813 } 4814 goto warn_exit; 4815 break; 4816 4817 default: 4818 if (onig_verb_warn != onig_null_warn) { 4819 onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, 4820 env->pattern, env->pattern_end, 4821 (UChar* )"nested repeat operator %s and %s was replaced with '%s'", 4822 PopularQStr[targetq_num], PopularQStr[nestq_num], 4823 ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]); 4824 (*onig_verb_warn)((char* )buf); 4825 } 4826 goto warn_exit; 4827 break; 4828 } 4829 } 4830 4831 warn_exit: 4832 #endif 4833 if (targetq_num >= 0) { 4834 if (nestq_num >= 0) { 4835 onig_reduce_nested_quantifier(qnode, target); 4836 goto q_exit; 4837 } 4838 else if (targetq_num == 1 || targetq_num == 2) { /* * or + */ 4839 /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */ 4840 if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) { 4841 qn->upper = (qn->lower == 0 ? 1 : qn->lower); 4842 } 4843 } 4844 } 4845 } 4846 break; 4847 4848 default: 4849 break; 4850 } 4851 4852 qn->target = target; 4853 q_exit: 4854 return 0; 4855 } 4856 4857 4858 #ifdef USE_SHARED_CCLASS_TABLE 4859 4860 #define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS 8 4861 4862 /* for ctype node hash table */ 4863 4864 typedef struct { 4865 OnigEncoding enc; 4866 int not; 4867 int type; 4868 } type_cclass_key; 4869 4870 static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y) 4871 { 4872 if (x->type != y->type) return 1; 4873 if (x->enc != y->enc) return 1; 4874 if (x->not != y->not) return 1; 4875 return 0; 4876 } 4877 4878 static int type_cclass_hash(type_cclass_key* key) 4879 { 4880 int i, val; 4881 UChar *p; 4882 4883 val = 0; 4884 4885 p = (UChar* )&(key->enc); 4886 for (i = 0; i < (int )sizeof(key->enc); i++) { 4887 val = val * 997 + (int )*p++; 4888 } 4889 4890 p = (UChar* )(&key->type); 4891 for (i = 0; i < (int )sizeof(key->type); i++) { 4892 val = val * 997 + (int )*p++; 4893 } 4894 4895 val += key->not; 4896 return val + (val >> 5); 4897 } 4898 4899 static struct st_hash_type type_type_cclass_hash = { 4900 type_cclass_cmp, 4901 type_cclass_hash, 4902 }; 4903 4904 static st_table* OnigTypeCClassTable; 4905 4906 4907 static int 4908 i_free_shared_class(type_cclass_key* key, Node* node, void* arg ARG_UNUSED) 4909 { 4910 if (IS_NOT_NULL(node)) { 4911 CClassNode* cc = NCCLASS(node); 4912 if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf); 4913 xfree(node); 4914 } 4915 4916 if (IS_NOT_NULL(key)) xfree(key); 4917 return ST_DELETE; 4918 } 4919 4920 extern int 4921 onig_free_shared_cclass_table(void) 4922 { 4923 if (IS_NOT_NULL(OnigTypeCClassTable)) { 4924 onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0); 4925 onig_st_free_table(OnigTypeCClassTable); 4926 OnigTypeCClassTable = NULL; 4927 } 4928 4929 return 0; 4930 } 4931 4932 #endif /* USE_SHARED_CCLASS_TABLE */ 4933 4934 4935 #ifndef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS 4936 static int 4937 clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc) 4938 { 4939 BBuf *tbuf; 4940 int r; 4941 4942 if (IS_NCCLASS_NOT(cc)) { 4943 bitset_invert(cc->bs); 4944 4945 if (! ONIGENC_IS_SINGLEBYTE(enc)) { 4946 r = not_code_range_buf(enc, cc->mbuf, &tbuf); 4947 if (r != 0) return r; 4948 4949 bbuf_free(cc->mbuf); 4950 cc->mbuf = tbuf; 4951 } 4952 4953 NCCLASS_CLEAR_NOT(cc); 4954 } 4955 4956 return 0; 4957 } 4958 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ 4959 4960 typedef struct { 4961 ScanEnv* env; 4962 CClassNode* cc; 4963 Node* alt_root; 4964 Node** ptail; 4965 } IApplyCaseFoldArg; 4966 4967 static int 4968 i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], 4969 int to_len, void* arg) 4970 { 4971 IApplyCaseFoldArg* iarg; 4972 ScanEnv* env; 4973 CClassNode* cc; 4974 BitSetRef bs; 4975 4976 iarg = (IApplyCaseFoldArg* )arg; 4977 env = iarg->env; 4978 cc = iarg->cc; 4979 bs = cc->bs; 4980 4981 if (to_len == 1) { 4982 int is_in = onig_is_code_in_cc(env->enc, from, cc); 4983 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS 4984 if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) || 4985 (is_in == 0 && IS_NCCLASS_NOT(cc))) { 4986 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { 4987 add_code_range(&(cc->mbuf), env, *to, *to); 4988 } 4989 else { 4990 BITSET_SET_BIT(bs, *to); 4991 } 4992 } 4993 #else 4994 if (is_in != 0) { 4995 if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { 4996 if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc); 4997 add_code_range(&(cc->mbuf), env, *to, *to); 4998 } 4999 else { 5000 if (IS_NCCLASS_NOT(cc)) { 5001 BITSET_CLEAR_BIT(bs, *to); 5002 } 5003 else 5004 BITSET_SET_BIT(bs, *to); 5005 } 5006 } 5007 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */ 5008 } 5009 else { 5010 int r, i, len; 5011 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; 5012 Node *snode = NULL_NODE; 5013 5014 if (onig_is_code_in_cc(env->enc, from, cc) 5015 #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS 5016 && !IS_NCCLASS_NOT(cc) 5017 #endif 5018 ) { 5019 for (i = 0; i < to_len; i++) { 5020 len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf); 5021 if (i == 0) { 5022 snode = onig_node_new_str(buf, buf + len); 5023 CHECK_NULL_RETURN_MEMERR(snode); 5024 5025 /* char-class expanded multi-char only 5026 compare with string folded at match time. */ 5027 NSTRING_SET_AMBIG(snode); 5028 } 5029 else { 5030 r = onig_node_str_cat(snode, buf, buf + len); 5031 if (r < 0) { 5032 onig_node_free(snode); 5033 return r; 5034 } 5035 } 5036 } 5037 5038 *(iarg->ptail) = onig_node_new_alt(snode, NULL_NODE); 5039 CHECK_NULL_RETURN_MEMERR(*(iarg->ptail)); 5040 iarg->ptail = &(NCDR((*(iarg->ptail)))); 5041 } 5042 } 5043 5044 return 0; 5045 } 5046 5047 static int 5048 parse_exp(Node** np, OnigToken* tok, int term, 5049 UChar** src, UChar* end, ScanEnv* env) 5050 { 5051 int r, len, group = 0; 5052 Node* qn; 5053 Node** targetp; 5054 5055 *np = NULL; 5056 if (tok->type == (enum TokenSyms )term) 5057 goto end_of_token; 5058 5059 switch (tok->type) { 5060 case TK_ALT: 5061 case TK_EOT: 5062 end_of_token: 5063 *np = node_new_empty(); 5064 return tok->type; 5065 break; 5066 5067 case TK_SUBEXP_OPEN: 5068 r = parse_enclose(np, tok, TK_SUBEXP_CLOSE, src, end, env); 5069 if (r < 0) return r; 5070 if (r == 1) group = 1; 5071 else if (r == 2) { /* option only */ 5072 Node* target; 5073 OnigOptionType prev = env->option; 5074 5075 env->option = NENCLOSE(*np)->option; 5076 r = fetch_token(tok, src, end, env); 5077 if (r < 0) return r; 5078 r = parse_subexp(&target, tok, term, src, end, env); 5079 env->option = prev; 5080 if (r < 0) return r; 5081 NENCLOSE(*np)->target = target; 5082 return tok->type; 5083 } 5084 break; 5085 5086 case TK_SUBEXP_CLOSE: 5087 if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP)) 5088 return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS; 5089 5090 if (tok->escaped) goto tk_raw_byte; 5091 else goto tk_byte; 5092 break; 5093 5094 case TK_STRING: 5095 tk_byte: 5096 { 5097 *np = node_new_str(tok->backp, *src); 5098 CHECK_NULL_RETURN_MEMERR(*np); 5099 5100 while (1) { 5101 r = fetch_token(tok, src, end, env); 5102 if (r < 0) return r; 5103 if (r != TK_STRING) break; 5104 5105 r = onig_node_str_cat(*np, tok->backp, *src); 5106 if (r < 0) return r; 5107 } 5108 5109 string_end: 5110 targetp = np; 5111 goto repeat; 5112 } 5113 break; 5114 5115 case TK_RAW_BYTE: 5116 tk_raw_byte: 5117 { 5118 *np = node_new_str_raw_char((UChar )tok->u.c); 5119 CHECK_NULL_RETURN_MEMERR(*np); 5120 len = 1; 5121 while (1) { 5122 if (len >= ONIGENC_MBC_MINLEN(env->enc)) { 5123 if (len == enclen(env->enc, NSTR(*np)->s)) { 5124 r = fetch_token(tok, src, end, env); 5125 NSTRING_CLEAR_RAW(*np); 5126 goto string_end; 5127 } 5128 } 5129 5130 r = fetch_token(tok, src, end, env); 5131 if (r < 0) return r; 5132 if (r != TK_RAW_BYTE) { 5133 /* Don't use this, it is wrong for little endian encodings. */ 5134 #ifdef USE_PAD_TO_SHORT_BYTE_CHAR 5135 int rem; 5136 if (len < ONIGENC_MBC_MINLEN(env->enc)) { 5137 rem = ONIGENC_MBC_MINLEN(env->enc) - len; 5138 (void )node_str_head_pad(NSTR(*np), rem, (UChar )0); 5139 if (len + rem == enclen(env->enc, NSTR(*np)->s)) { 5140 NSTRING_CLEAR_RAW(*np); 5141 goto string_end; 5142 } 5143 } 5144 #endif 5145 return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; 5146 } 5147 5148 r = node_str_cat_char(*np, (UChar )tok->u.c); 5149 if (r < 0) return r; 5150 5151 len++; 5152 } 5153 } 5154 break; 5155 5156 case TK_CODE_POINT: 5157 { 5158 UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; 5159 int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf); 5160 if (num < 0) return num; 5161 #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG 5162 *np = node_new_str_raw(buf, buf + num); 5163 #else 5164 *np = node_new_str(buf, buf + num); 5165 #endif 5166 CHECK_NULL_RETURN_MEMERR(*np); 5167 } 5168 break; 5169 5170 case TK_QUOTE_OPEN: 5171 { 5172 OnigCodePoint end_op[2]; 5173 UChar *qstart, *qend, *nextp; 5174 5175 end_op[0] = (OnigCodePoint )MC_ESC(env->syntax); 5176 end_op[1] = (OnigCodePoint )'E'; 5177 qstart = *src; 5178 qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc); 5179 if (IS_NULL(qend)) { 5180 nextp = qend = end; 5181 } 5182 *np = node_new_str(qstart, qend); 5183 CHECK_NULL_RETURN_MEMERR(*np); 5184 *src = nextp; 5185 } 5186 break; 5187 5188 case TK_CHAR_TYPE: 5189 { 5190 switch (tok->u.prop.ctype) { 5191 case ONIGENC_CTYPE_WORD: 5192 *np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not); 5193 CHECK_NULL_RETURN_MEMERR(*np); 5194 break; 5195 5196 case ONIGENC_CTYPE_SPACE: 5197 case ONIGENC_CTYPE_DIGIT: 5198 case ONIGENC_CTYPE_XDIGIT: 5199 { 5200 CClassNode* cc; 5201 5202 #ifdef USE_SHARED_CCLASS_TABLE 5203 const OnigCodePoint *mbr; 5204 OnigCodePoint sb_out; 5205 5206 r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype, 5207 &sb_out, &mbr); 5208 if (r == 0 && 5209 ONIGENC_CODE_RANGE_NUM(mbr) 5210 >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) { 5211 type_cclass_key key; 5212 type_cclass_key* new_key; 5213 5214 key.enc = env->enc; 5215 key.not = tok->u.prop.not; 5216 key.type = tok->u.prop.ctype; 5217 5218 THREAD_ATOMIC_START; 5219 5220 if (IS_NULL(OnigTypeCClassTable)) { 5221 OnigTypeCClassTable 5222 = onig_st_init_table_with_size(&type_type_cclass_hash, 10); 5223 if (IS_NULL(OnigTypeCClassTable)) { 5224 THREAD_ATOMIC_END; 5225 return ONIGERR_MEMORY; 5226 } 5227 } 5228 else { 5229 if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )(UINTN)&key, 5230 (st_data_t* )np)) { 5231 THREAD_ATOMIC_END; 5232 break; 5233 } 5234 } 5235 5236 *np = node_new_cclass_by_codepoint_range(tok->u.prop.not, 5237 sb_out, mbr); 5238 if (IS_NULL(*np)) { 5239 THREAD_ATOMIC_END; 5240 return ONIGERR_MEMORY; 5241 } 5242 5243 cc = NCCLASS(*np); 5244 NCCLASS_SET_SHARE(cc); 5245 new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key)); 5246 CHECK_NULL_RETURN_MEMERR(new_key); 5247 xmemcpy(new_key, &key, sizeof(type_cclass_key)); 5248 onig_st_add_direct(OnigTypeCClassTable, (st_data_t )(UINTN)new_key, 5249 (st_data_t )(UINTN)*np); 5250 5251 THREAD_ATOMIC_END; 5252 } 5253 else { 5254 #endif 5255 *np = node_new_cclass(); 5256 CHECK_NULL_RETURN_MEMERR(*np); 5257 cc = NCCLASS(*np); 5258 add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env); 5259 if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); 5260 #ifdef USE_SHARED_CCLASS_TABLE 5261 } 5262 #endif 5263 } 5264 break; 5265 5266 default: 5267 return ONIGERR_PARSER_BUG; 5268 break; 5269 } 5270 } 5271 break; 5272 5273 case TK_CHAR_PROPERTY: 5274 r = parse_char_property(np, tok, src, end, env); 5275 if (r != 0) return r; 5276 break; 5277 5278 case TK_CC_OPEN: 5279 { 5280 CClassNode* cc; 5281 5282 r = parse_char_class(np, tok, src, end, env); 5283 if (r != 0) return r; 5284 5285 cc = NCCLASS(*np); 5286 if (IS_IGNORECASE(env->option)) { 5287 IApplyCaseFoldArg iarg; 5288 5289 iarg.env = env; 5290 iarg.cc = cc; 5291 iarg.alt_root = NULL_NODE; 5292 iarg.ptail = &(iarg.alt_root); 5293 5294 r = ONIGENC_APPLY_ALL_CASE_FOLD(env->enc, env->case_fold_flag, 5295 i_apply_case_fold, &iarg); 5296 if (r != 0) { 5297 onig_node_free(iarg.alt_root); 5298 return r; 5299 } 5300 if (IS_NOT_NULL(iarg.alt_root)) { 5301 Node* work = onig_node_new_alt(*np, iarg.alt_root); 5302 if (IS_NULL(work)) { 5303 onig_node_free(iarg.alt_root); 5304 return ONIGERR_MEMORY; 5305 } 5306 *np = work; 5307 } 5308 } 5309 } 5310 break; 5311 5312 case TK_ANYCHAR: 5313 *np = node_new_anychar(); 5314 CHECK_NULL_RETURN_MEMERR(*np); 5315 break; 5316 5317 case TK_ANYCHAR_ANYTIME: 5318 *np = node_new_anychar(); 5319 CHECK_NULL_RETURN_MEMERR(*np); 5320 qn = node_new_quantifier(0, REPEAT_INFINITE, 0); 5321 CHECK_NULL_RETURN_MEMERR(qn); 5322 NQTFR(qn)->target = *np; 5323 *np = qn; 5324 break; 5325 5326 case TK_BACKREF: 5327 len = tok->u.backref.num; 5328 *np = node_new_backref(len, 5329 (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)), 5330 tok->u.backref.by_name, 5331 #ifdef USE_BACKREF_WITH_LEVEL 5332 tok->u.backref.exist_level, 5333 tok->u.backref.level, 5334 #endif 5335 env); 5336 CHECK_NULL_RETURN_MEMERR(*np); 5337 break; 5338 5339 #ifdef USE_SUBEXP_CALL 5340 case TK_CALL: 5341 { 5342 int gnum = tok->u.call.gnum; 5343 5344 if (gnum < 0) { 5345 gnum = BACKREF_REL_TO_ABS(gnum, env); 5346 if (gnum <= 0) 5347 return ONIGERR_INVALID_BACKREF; 5348 } 5349 *np = node_new_call(tok->u.call.name, tok->u.call.name_end, gnum); 5350 CHECK_NULL_RETURN_MEMERR(*np); 5351 env->num_call++; 5352 } 5353 break; 5354 #endif 5355 5356 case TK_ANCHOR: 5357 *np = onig_node_new_anchor(tok->u.anchor); 5358 CHECK_NULL_RETURN_MEMERR(*np); 5359 break; 5360 5361 case TK_OP_REPEAT: 5362 case TK_INTERVAL: 5363 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) { 5364 if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS)) 5365 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED; 5366 else 5367 *np = node_new_empty(); 5368 CHECK_NULL_RETURN_MEMERR(*np); 5369 } 5370 else { 5371 goto tk_byte; 5372 } 5373 break; 5374 5375 default: 5376 return ONIGERR_PARSER_BUG; 5377 break; 5378 } 5379 5380 { 5381 targetp = np; 5382 5383 re_entry: 5384 r = fetch_token(tok, src, end, env); 5385 if (r < 0) return r; 5386 5387 repeat: 5388 if (r == TK_OP_REPEAT || r == TK_INTERVAL) { 5389 if (is_invalid_quantifier_target(*targetp)) 5390 return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID; 5391 5392 qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper, 5393 (r == TK_INTERVAL ? 1 : 0)); 5394 CHECK_NULL_RETURN_MEMERR(qn); 5395 NQTFR(qn)->greedy = tok->u.repeat.greedy; 5396 r = set_quantifier(qn, *targetp, group, env); 5397 if (r < 0) { 5398 onig_node_free(qn); 5399 return r; 5400 } 5401 5402 if (tok->u.repeat.possessive != 0) { 5403 Node* en; 5404 en = node_new_enclose(ENCLOSE_STOP_BACKTRACK); 5405 if (IS_NULL(en)) { 5406 onig_node_free(qn); 5407 return ONIGERR_MEMORY; 5408 } 5409 NENCLOSE(en)->target = qn; 5410 qn = en; 5411 } 5412 5413 if (r == 0) { 5414 *targetp = qn; 5415 } 5416 else if (r == 1) { 5417 onig_node_free(qn); 5418 } 5419 else if (r == 2) { /* split case: /abc+/ */ 5420 Node *tmp; 5421 5422 *targetp = node_new_list(*targetp, NULL); 5423 if (IS_NULL(*targetp)) { 5424 onig_node_free(qn); 5425 return ONIGERR_MEMORY; 5426 } 5427 tmp = NCDR(*targetp) = node_new_list(qn, NULL); 5428 if (IS_NULL(tmp)) { 5429 onig_node_free(qn); 5430 return ONIGERR_MEMORY; 5431 } 5432 targetp = &(NCAR(tmp)); 5433 } 5434 goto re_entry; 5435 } 5436 } 5437 5438 return r; 5439 } 5440 5441 static int 5442 parse_branch(Node** top, OnigToken* tok, int term, 5443 UChar** src, UChar* end, ScanEnv* env) 5444 { 5445 int r; 5446 Node *node, **headp; 5447 5448 *top = NULL; 5449 r = parse_exp(&node, tok, term, src, end, env); 5450 if (r < 0) return r; 5451 5452 if (r == TK_EOT || r == term || r == TK_ALT) { 5453 *top = node; 5454 } 5455 else { 5456 *top = node_new_list(node, NULL); 5457 CHECK_NULL_RETURN_MEMERR(*top); 5458 headp = &(NCDR(*top)); 5459 while (r != TK_EOT && r != term && r != TK_ALT) { 5460 r = parse_exp(&node, tok, term, src, end, env); 5461 CHECK_NULL_RETURN_MEMERR(node); 5462 if (r < 0) return r; 5463 5464 if (NTYPE(node) == NT_LIST) { 5465 *headp = node; 5466 while (IS_NOT_NULL(NCDR(node))) node = NCDR(node); 5467 headp = &(NCDR(node)); 5468 } 5469 else { 5470 *headp = node_new_list(node, NULL); 5471 headp = &(NCDR(*headp)); 5472 } 5473 } 5474 } 5475 5476 return r; 5477 } 5478 5479 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */ 5480 static int 5481 parse_subexp(Node** top, OnigToken* tok, int term, 5482 UChar** src, UChar* end, ScanEnv* env) 5483 { 5484 int r; 5485 Node *node, **headp; 5486 5487 *top = NULL; 5488 r = parse_branch(&node, tok, term, src, end, env); 5489 if (r < 0) { 5490 onig_node_free(node); 5491 return r; 5492 } 5493 5494 if (r == term) { 5495 *top = node; 5496 } 5497 else if (r == TK_ALT) { 5498 *top = onig_node_new_alt(node, NULL); 5499 CHECK_NULL_RETURN_MEMERR(*top); 5500 headp = &(NCDR(*top)); 5501 while (r == TK_ALT) { 5502 r = fetch_token(tok, src, end, env); 5503 if (r < 0) return r; 5504 r = parse_branch(&node, tok, term, src, end, env); 5505 if (r < 0) return r; 5506 5507 *headp = onig_node_new_alt(node, NULL); 5508 headp = &(NCDR(*headp)); 5509 } 5510 5511 if (tok->type != (enum TokenSyms )term) 5512 goto err; 5513 } 5514 else { 5515 err: 5516 if (term == TK_SUBEXP_CLOSE) 5517 return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; 5518 else 5519 return ONIGERR_PARSER_BUG; 5520 } 5521 5522 return r; 5523 } 5524 5525 static int 5526 parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) 5527 { 5528 int r; 5529 OnigToken tok; 5530 5531 r = fetch_token(&tok, src, end, env); 5532 if (r < 0) return r; 5533 r = parse_subexp(top, &tok, TK_EOT, src, end, env); 5534 if (r < 0) return r; 5535 return 0; 5536 } 5537 5538 extern int 5539 onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end, 5540 regex_t* reg, ScanEnv* env) 5541 { 5542 int r; 5543 UChar* p; 5544 5545 #ifdef USE_NAMED_GROUP 5546 names_clear(reg); 5547 #endif 5548 5549 scan_env_clear(env); 5550 env->option = reg->options; 5551 env->case_fold_flag = reg->case_fold_flag; 5552 env->enc = reg->enc; 5553 env->syntax = reg->syntax; 5554 env->pattern = (UChar* )pattern; 5555 env->pattern_end = (UChar* )end; 5556 env->reg = reg; 5557 5558 *root = NULL; 5559 p = (UChar* )pattern; 5560 r = parse_regexp(root, &p, (UChar* )end, env); 5561 reg->num_mem = env->num_mem; 5562 return r; 5563 } 5564 5565 extern void 5566 onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED, 5567 UChar* arg, UChar* arg_end) 5568 { 5569 env->error = arg; 5570 env->error_end = arg_end; 5571 } 5572