1 /************************************************* 2 * Perl-Compatible Regular Expressions * 3 *************************************************/ 4 5 /* PCRE is a library of functions to support regular expressions whose syntax 6 and semantics are as close as possible to those of the Perl 5 language. 7 8 Written by Philip Hazel 9 Original API code Copyright (c) 1997-2012 University of Cambridge 10 New API code Copyright (c) 2016-2018 University of Cambridge 11 12 ----------------------------------------------------------------------------- 13 Redistribution and use in source and binary forms, with or without 14 modification, are permitted provided that the following conditions are met: 15 16 * Redistributions of source code must retain the above copyright notice, 17 this list of conditions and the following disclaimer. 18 19 * Redistributions in binary form must reproduce the above copyright 20 notice, this list of conditions and the following disclaimer in the 21 documentation and/or other materials provided with the distribution. 22 23 * Neither the name of the University of Cambridge nor the names of its 24 contributors may be used to endorse or promote products derived from 25 this software without specific prior written permission. 26 27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 POSSIBILITY OF SUCH DAMAGE. 38 ----------------------------------------------------------------------------- 39 */ 40 41 42 #ifdef HAVE_CONFIG_H 43 #include "config.h" 44 #endif 45 46 #include "pcre2_internal.h" 47 48 #define TYPE_OPTIONS (PCRE2_CONVERT_GLOB| \ 49 PCRE2_CONVERT_POSIX_BASIC|PCRE2_CONVERT_POSIX_EXTENDED) 50 51 #define ALL_OPTIONS (PCRE2_CONVERT_UTF|PCRE2_CONVERT_NO_UTF_CHECK| \ 52 PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR| \ 53 PCRE2_CONVERT_GLOB_NO_STARSTAR| \ 54 TYPE_OPTIONS) 55 56 #define DUMMY_BUFFER_SIZE 100 57 58 /* Generated pattern fragments */ 59 60 #define STR_BACKSLASH_A STR_BACKSLASH STR_A 61 #define STR_BACKSLASH_z STR_BACKSLASH STR_z 62 #define STR_COLON_RIGHT_SQUARE_BRACKET STR_COLON STR_RIGHT_SQUARE_BRACKET 63 #define STR_DOT_STAR_LOOKBEHIND STR_DOT STR_ASTERISK STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_LESS_THAN_SIGN STR_EQUALS_SIGN 64 #define STR_LOOKAHEAD_NOT_DOT STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_EXCLAMATION_MARK STR_BACKSLASH STR_DOT STR_RIGHT_PARENTHESIS 65 #define STR_QUERY_s STR_LEFT_PARENTHESIS STR_QUESTION_MARK STR_s STR_RIGHT_PARENTHESIS 66 #define STR_STAR_NUL STR_LEFT_PARENTHESIS STR_ASTERISK STR_N STR_U STR_L STR_RIGHT_PARENTHESIS 67 68 /* States for range and POSIX processing */ 69 70 enum { RANGE_NOT_STARTED, RANGE_STARTING, RANGE_STARTED }; 71 enum { POSIX_START_REGEX, POSIX_ANCHORED, POSIX_NOT_BRACKET, 72 POSIX_CLASS_NOT_STARTED, POSIX_CLASS_STARTING, POSIX_CLASS_STARTED }; 73 74 /* Macro to add a character string to the output buffer, checking for overflow. */ 75 76 #define PUTCHARS(string) \ 77 { \ 78 for (s = (char *)(string); *s != 0; s++) \ 79 { \ 80 if (p >= endp) return PCRE2_ERROR_NOMEMORY; \ 81 *p++ = *s; \ 82 } \ 83 } 84 85 /* Literals that must be escaped: \ ? * + | . ^ $ { } [ ] ( ) */ 86 87 static const char *pcre2_escaped_literals = 88 STR_BACKSLASH STR_QUESTION_MARK STR_ASTERISK STR_PLUS 89 STR_VERTICAL_LINE STR_DOT STR_CIRCUMFLEX_ACCENT STR_DOLLAR_SIGN 90 STR_LEFT_CURLY_BRACKET STR_RIGHT_CURLY_BRACKET 91 STR_LEFT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET 92 STR_LEFT_PARENTHESIS STR_RIGHT_PARENTHESIS; 93 94 /* Recognized escaped metacharacters in POSIX basic patterns. */ 95 96 static const char *posix_meta_escapes = 97 STR_LEFT_PARENTHESIS STR_RIGHT_PARENTHESIS 98 STR_LEFT_CURLY_BRACKET STR_RIGHT_CURLY_BRACKET 99 STR_1 STR_2 STR_3 STR_4 STR_5 STR_6 STR_7 STR_8 STR_9; 100 101 102 103 /************************************************* 104 * Convert a POSIX pattern * 105 *************************************************/ 106 107 /* This function handles both basic and extended POSIX patterns. 108 109 Arguments: 110 pattype the pattern type 111 pattern the pattern 112 plength length in code units 113 utf TRUE if UTF 114 use_buffer where to put the output 115 use_length length of use_buffer 116 bufflenptr where to put the used length 117 dummyrun TRUE if a dummy run 118 ccontext the convert context 119 120 Returns: 0 => success 121 !0 => error code 122 */ 123 124 static int 125 convert_posix(uint32_t pattype, PCRE2_SPTR pattern, PCRE2_SIZE plength, 126 BOOL utf, PCRE2_UCHAR *use_buffer, PCRE2_SIZE use_length, 127 PCRE2_SIZE *bufflenptr, BOOL dummyrun, pcre2_convert_context *ccontext) 128 { 129 char *s; 130 PCRE2_SPTR posix = pattern; 131 PCRE2_UCHAR *p = use_buffer; 132 PCRE2_UCHAR *pp = p; 133 PCRE2_UCHAR *endp = p + use_length - 1; /* Allow for trailing zero */ 134 PCRE2_SIZE convlength = 0; 135 136 uint32_t bracount = 0; 137 uint32_t posix_state = POSIX_START_REGEX; 138 uint32_t lastspecial = 0; 139 BOOL extended = (pattype & PCRE2_CONVERT_POSIX_EXTENDED) != 0; 140 BOOL nextisliteral = FALSE; 141 142 (void)utf; /* Not used when Unicode not supported */ 143 (void)ccontext; /* Not currently used */ 144 145 /* Initialize default for error offset as end of input. */ 146 147 *bufflenptr = plength; 148 PUTCHARS(STR_STAR_NUL); 149 150 /* Now scan the input. */ 151 152 while (plength > 0) 153 { 154 uint32_t c, sc; 155 int clength = 1; 156 157 /* Add in the length of the last item, then, if in the dummy run, pull the 158 pointer back to the start of the (temporary) buffer and then remember the 159 start of the next item. */ 160 161 convlength += p - pp; 162 if (dummyrun) p = use_buffer; 163 pp = p; 164 165 /* Pick up the next character */ 166 167 #ifndef SUPPORT_UNICODE 168 c = *posix; 169 #else 170 GETCHARLENTEST(c, posix, clength); 171 #endif 172 posix += clength; 173 plength -= clength; 174 175 sc = nextisliteral? 0 : c; 176 nextisliteral = FALSE; 177 178 /* Handle a character within a class. */ 179 180 if (posix_state >= POSIX_CLASS_NOT_STARTED) 181 { 182 if (c == CHAR_RIGHT_SQUARE_BRACKET) 183 { 184 PUTCHARS(STR_RIGHT_SQUARE_BRACKET); 185 posix_state = POSIX_NOT_BRACKET; 186 } 187 188 /* Not the end of the class */ 189 190 else 191 { 192 switch (posix_state) 193 { 194 case POSIX_CLASS_STARTED: 195 if (c <= 127 && islower(c)) break; /* Remain in started state */ 196 posix_state = POSIX_CLASS_NOT_STARTED; 197 if (c == CHAR_COLON && plength > 0 && 198 *posix == CHAR_RIGHT_SQUARE_BRACKET) 199 { 200 PUTCHARS(STR_COLON_RIGHT_SQUARE_BRACKET); 201 plength--; 202 posix++; 203 continue; /* With next character after :] */ 204 } 205 /* Fall through */ 206 207 case POSIX_CLASS_NOT_STARTED: 208 if (c == CHAR_LEFT_SQUARE_BRACKET) 209 posix_state = POSIX_CLASS_STARTING; 210 break; 211 212 case POSIX_CLASS_STARTING: 213 if (c == CHAR_COLON) posix_state = POSIX_CLASS_STARTED; 214 break; 215 } 216 217 if (c == CHAR_BACKSLASH) PUTCHARS(STR_BACKSLASH); 218 if (p + clength > endp) return PCRE2_ERROR_NOMEMORY; 219 memcpy(p, posix - clength, CU2BYTES(clength)); 220 p += clength; 221 } 222 } 223 224 /* Handle a character not within a class. */ 225 226 else switch(sc) 227 { 228 case CHAR_LEFT_SQUARE_BRACKET: 229 PUTCHARS(STR_LEFT_SQUARE_BRACKET); 230 231 #ifdef NEVER 232 /* We could handle special cases [[:<:]] and [[:>:]] (which PCRE does 233 support) but they are not part of POSIX 1003.1. */ 234 235 if (plength >= 6) 236 { 237 if (posix[0] == CHAR_LEFT_SQUARE_BRACKET && 238 posix[1] == CHAR_COLON && 239 (posix[2] == CHAR_LESS_THAN_SIGN || 240 posix[2] == CHAR_GREATER_THAN_SIGN) && 241 posix[3] == CHAR_COLON && 242 posix[4] == CHAR_RIGHT_SQUARE_BRACKET && 243 posix[5] == CHAR_RIGHT_SQUARE_BRACKET) 244 { 245 if (p + 6 > endp) return PCRE2_ERROR_NOMEMORY; 246 memcpy(p, posix, CU2BYTES(6)); 247 p += 6; 248 posix += 6; 249 plength -= 6; 250 continue; /* With next character */ 251 } 252 } 253 #endif 254 255 /* Handle start of "normal" character classes */ 256 257 posix_state = POSIX_CLASS_NOT_STARTED; 258 259 /* Handle ^ and ] as first characters */ 260 261 if (plength > 0) 262 { 263 if (*posix == CHAR_CIRCUMFLEX_ACCENT) 264 { 265 posix++; 266 plength--; 267 PUTCHARS(STR_CIRCUMFLEX_ACCENT); 268 } 269 if (plength > 0 && *posix == CHAR_RIGHT_SQUARE_BRACKET) 270 { 271 posix++; 272 plength--; 273 PUTCHARS(STR_RIGHT_SQUARE_BRACKET); 274 } 275 } 276 break; 277 278 case CHAR_BACKSLASH: 279 if (plength <= 0) return PCRE2_ERROR_END_BACKSLASH; 280 if (extended) nextisliteral = TRUE; else 281 { 282 if (*posix < 127 && strchr(posix_meta_escapes, *posix) != NULL) 283 { 284 if (isdigit(*posix)) PUTCHARS(STR_BACKSLASH); 285 if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY; 286 lastspecial = *p++ = *posix++; 287 plength--; 288 } 289 else nextisliteral = TRUE; 290 } 291 break; 292 293 case CHAR_RIGHT_PARENTHESIS: 294 if (!extended || bracount == 0) goto ESCAPE_LITERAL; 295 bracount--; 296 goto COPY_SPECIAL; 297 298 case CHAR_LEFT_PARENTHESIS: 299 bracount++; 300 /* Fall through */ 301 302 case CHAR_QUESTION_MARK: 303 case CHAR_PLUS: 304 case CHAR_LEFT_CURLY_BRACKET: 305 case CHAR_RIGHT_CURLY_BRACKET: 306 case CHAR_VERTICAL_LINE: 307 if (!extended) goto ESCAPE_LITERAL; 308 /* Fall through */ 309 310 case CHAR_DOT: 311 case CHAR_DOLLAR_SIGN: 312 posix_state = POSIX_NOT_BRACKET; 313 COPY_SPECIAL: 314 lastspecial = c; 315 if (p + 1 > endp) return PCRE2_ERROR_NOMEMORY; 316 *p++ = c; 317 break; 318 319 case CHAR_ASTERISK: 320 if (lastspecial != CHAR_ASTERISK) 321 { 322 if (!extended && (posix_state < POSIX_NOT_BRACKET || 323 lastspecial == CHAR_LEFT_PARENTHESIS)) 324 goto ESCAPE_LITERAL; 325 goto COPY_SPECIAL; 326 } 327 break; /* Ignore second and subsequent asterisks */ 328 329 case CHAR_CIRCUMFLEX_ACCENT: 330 if (extended) goto COPY_SPECIAL; 331 if (posix_state == POSIX_START_REGEX || 332 lastspecial == CHAR_LEFT_PARENTHESIS) 333 { 334 posix_state = POSIX_ANCHORED; 335 goto COPY_SPECIAL; 336 } 337 /* Fall through */ 338 339 default: 340 if (c < 128 && strchr(pcre2_escaped_literals, c) != NULL) 341 { 342 ESCAPE_LITERAL: 343 PUTCHARS(STR_BACKSLASH); 344 } 345 lastspecial = 0xff; /* Indicates nothing special */ 346 if (p + clength > endp) return PCRE2_ERROR_NOMEMORY; 347 memcpy(p, posix - clength, CU2BYTES(clength)); 348 p += clength; 349 posix_state = POSIX_NOT_BRACKET; 350 break; 351 } 352 } 353 354 if (posix_state >= POSIX_CLASS_NOT_STARTED) 355 return PCRE2_ERROR_MISSING_SQUARE_BRACKET; 356 convlength += p - pp; /* Final segment */ 357 *bufflenptr = convlength; 358 *p++ = 0; 359 return 0; 360 } 361 362 363 /************************************************* 364 * Convert a glob pattern * 365 *************************************************/ 366 367 /* Context for writing the output into a buffer. */ 368 369 typedef struct pcre2_output_context { 370 PCRE2_UCHAR *output; /* current output position */ 371 PCRE2_SPTR output_end; /* output end */ 372 PCRE2_SIZE output_size; /* size of the output */ 373 uint8_t out_str[8]; /* string copied to the output */ 374 } pcre2_output_context; 375 376 377 /* Write a character into the output. 378 379 Arguments: 380 out output context 381 chr the next character 382 */ 383 384 static void 385 convert_glob_write(pcre2_output_context *out, PCRE2_UCHAR chr) 386 { 387 out->output_size++; 388 389 if (out->output < out->output_end) 390 *out->output++ = chr; 391 } 392 393 394 /* Write a string into the output. 395 396 Arguments: 397 out output context 398 length length of out->out_str 399 */ 400 401 static void 402 convert_glob_write_str(pcre2_output_context *out, PCRE2_SIZE length) 403 { 404 uint8_t *out_str = out->out_str; 405 PCRE2_UCHAR *output = out->output; 406 PCRE2_SPTR output_end = out->output_end; 407 PCRE2_SIZE output_size = out->output_size; 408 409 do 410 { 411 output_size++; 412 413 if (output < output_end) 414 *output++ = *out_str++; 415 } 416 while (--length != 0); 417 418 out->output = output; 419 out->output_size = output_size; 420 } 421 422 423 /* Prints the separator into the output. 424 425 Arguments: 426 out output context 427 separator glob separator 428 with_escape backslash is needed before separator 429 */ 430 431 static void 432 convert_glob_print_separator(pcre2_output_context *out, 433 PCRE2_UCHAR separator, BOOL with_escape) 434 { 435 if (with_escape) 436 convert_glob_write(out, CHAR_BACKSLASH); 437 438 convert_glob_write(out, separator); 439 } 440 441 442 /* Prints a wildcard into the output. 443 444 Arguments: 445 out output context 446 separator glob separator 447 with_escape backslash is needed before separator 448 */ 449 450 static void 451 convert_glob_print_wildcard(pcre2_output_context *out, 452 PCRE2_UCHAR separator, BOOL with_escape) 453 { 454 out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET; 455 out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT; 456 convert_glob_write_str(out, 2); 457 458 convert_glob_print_separator(out, separator, with_escape); 459 460 convert_glob_write(out, CHAR_RIGHT_SQUARE_BRACKET); 461 } 462 463 464 /* Parse a posix class. 465 466 Arguments: 467 from starting point of scanning the range 468 pattern_end end of pattern 469 out output context 470 471 Returns: >0 => class index 472 0 => malformed class 473 */ 474 475 static int 476 convert_glob_parse_class(PCRE2_SPTR *from, PCRE2_SPTR pattern_end, 477 pcre2_output_context *out) 478 { 479 static const char *posix_classes = "alnum:alpha:ascii:blank:cntrl:digit:" 480 "graph:lower:print:punct:space:upper:word:xdigit:"; 481 PCRE2_SPTR start = *from + 1; 482 PCRE2_SPTR pattern = start; 483 const char *class_ptr; 484 PCRE2_UCHAR c; 485 int class_index; 486 487 while (TRUE) 488 { 489 if (pattern >= pattern_end) return 0; 490 491 c = *pattern++; 492 493 if (c < CHAR_a || c > CHAR_z) break; 494 } 495 496 if (c != CHAR_COLON || pattern >= pattern_end || 497 *pattern != CHAR_RIGHT_SQUARE_BRACKET) 498 return 0; 499 500 class_ptr = posix_classes; 501 class_index = 1; 502 503 while (TRUE) 504 { 505 if (*class_ptr == CHAR_NUL) return 0; 506 507 pattern = start; 508 509 while (*pattern == (PCRE2_UCHAR) *class_ptr) 510 { 511 if (*pattern == CHAR_COLON) 512 { 513 pattern += 2; 514 start -= 2; 515 516 do convert_glob_write(out, *start++); while (start < pattern); 517 518 *from = pattern; 519 return class_index; 520 } 521 pattern++; 522 class_ptr++; 523 } 524 525 while (*class_ptr != CHAR_COLON) class_ptr++; 526 class_ptr++; 527 class_index++; 528 } 529 } 530 531 /* Checks whether the character is in the class. 532 533 Arguments: 534 class_index class index 535 c character 536 537 Returns: !0 => character is found in the class 538 0 => otherwise 539 */ 540 541 static BOOL 542 convert_glob_char_in_class(int class_index, PCRE2_UCHAR c) 543 { 544 switch (class_index) 545 { 546 case 1: return isalnum(c); 547 case 2: return isalpha(c); 548 case 3: return 1; 549 case 4: return c == CHAR_HT || c == CHAR_SPACE; 550 case 5: return iscntrl(c); 551 case 6: return isdigit(c); 552 case 7: return isgraph(c); 553 case 8: return islower(c); 554 case 9: return isprint(c); 555 case 10: return ispunct(c); 556 case 11: return isspace(c); 557 case 12: return isupper(c); 558 case 13: return isalnum(c) || c == CHAR_UNDERSCORE; 559 default: return isxdigit(c); 560 } 561 } 562 563 /* Parse a range of characters. 564 565 Arguments: 566 from starting point of scanning the range 567 pattern_end end of pattern 568 out output context 569 separator glob separator 570 with_escape backslash is needed before separator 571 572 Returns: 0 => success 573 !0 => error code 574 */ 575 576 static int 577 convert_glob_parse_range(PCRE2_SPTR *from, PCRE2_SPTR pattern_end, 578 pcre2_output_context *out, BOOL utf, PCRE2_UCHAR separator, 579 BOOL with_escape, PCRE2_UCHAR escape, BOOL no_wildsep) 580 { 581 BOOL is_negative = FALSE; 582 BOOL separator_seen = FALSE; 583 BOOL has_prev_c; 584 PCRE2_SPTR pattern = *from; 585 PCRE2_SPTR char_start = NULL; 586 uint32_t c, prev_c; 587 int len, class_index; 588 589 (void)utf; /* Avoid compiler warning. */ 590 591 if (pattern >= pattern_end) 592 { 593 *from = pattern; 594 return PCRE2_ERROR_MISSING_SQUARE_BRACKET; 595 } 596 597 if (*pattern == CHAR_EXCLAMATION_MARK 598 || *pattern == CHAR_CIRCUMFLEX_ACCENT) 599 { 600 pattern++; 601 602 if (pattern >= pattern_end) 603 { 604 *from = pattern; 605 return PCRE2_ERROR_MISSING_SQUARE_BRACKET; 606 } 607 608 is_negative = TRUE; 609 610 out->out_str[0] = CHAR_LEFT_SQUARE_BRACKET; 611 out->out_str[1] = CHAR_CIRCUMFLEX_ACCENT; 612 len = 2; 613 614 if (!no_wildsep) 615 { 616 if (with_escape) 617 { 618 out->out_str[len] = CHAR_BACKSLASH; 619 len++; 620 } 621 out->out_str[len] = (uint8_t) separator; 622 } 623 624 convert_glob_write_str(out, len + 1); 625 } 626 else 627 convert_glob_write(out, CHAR_LEFT_SQUARE_BRACKET); 628 629 has_prev_c = FALSE; 630 prev_c = 0; 631 632 if (*pattern == CHAR_RIGHT_SQUARE_BRACKET) 633 { 634 out->out_str[0] = CHAR_BACKSLASH; 635 out->out_str[1] = CHAR_RIGHT_SQUARE_BRACKET; 636 convert_glob_write_str(out, 2); 637 has_prev_c = TRUE; 638 prev_c = CHAR_RIGHT_SQUARE_BRACKET; 639 pattern++; 640 } 641 642 while (pattern < pattern_end) 643 { 644 char_start = pattern; 645 GETCHARINCTEST(c, pattern); 646 647 if (c == CHAR_RIGHT_SQUARE_BRACKET) 648 { 649 convert_glob_write(out, c); 650 651 if (!is_negative && !no_wildsep && separator_seen) 652 { 653 out->out_str[0] = CHAR_LEFT_PARENTHESIS; 654 out->out_str[1] = CHAR_QUESTION_MARK; 655 out->out_str[2] = CHAR_LESS_THAN_SIGN; 656 out->out_str[3] = CHAR_EXCLAMATION_MARK; 657 convert_glob_write_str(out, 4); 658 659 convert_glob_print_separator(out, separator, with_escape); 660 convert_glob_write(out, CHAR_RIGHT_PARENTHESIS); 661 } 662 663 *from = pattern; 664 return 0; 665 } 666 667 if (pattern >= pattern_end) break; 668 669 if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON) 670 { 671 *from = pattern; 672 class_index = convert_glob_parse_class(from, pattern_end, out); 673 674 if (class_index != 0) 675 { 676 pattern = *from; 677 678 has_prev_c = FALSE; 679 prev_c = 0; 680 681 if (!is_negative && 682 convert_glob_char_in_class (class_index, separator)) 683 separator_seen = TRUE; 684 continue; 685 } 686 } 687 else if (c == CHAR_MINUS && has_prev_c && 688 *pattern != CHAR_RIGHT_SQUARE_BRACKET) 689 { 690 convert_glob_write(out, CHAR_MINUS); 691 692 char_start = pattern; 693 GETCHARINCTEST(c, pattern); 694 695 if (pattern >= pattern_end) break; 696 697 if (escape != 0 && c == escape) 698 { 699 char_start = pattern; 700 GETCHARINCTEST(c, pattern); 701 } 702 else if (c == CHAR_LEFT_SQUARE_BRACKET && *pattern == CHAR_COLON) 703 { 704 *from = pattern; 705 return PCRE2_ERROR_CONVERT_SYNTAX; 706 } 707 708 if (prev_c > c) 709 { 710 *from = pattern; 711 return PCRE2_ERROR_CONVERT_SYNTAX; 712 } 713 714 if (prev_c < separator && separator < c) separator_seen = TRUE; 715 716 has_prev_c = FALSE; 717 prev_c = 0; 718 } 719 else 720 { 721 if (escape != 0 && c == escape) 722 { 723 char_start = pattern; 724 GETCHARINCTEST(c, pattern); 725 726 if (pattern >= pattern_end) break; 727 } 728 729 has_prev_c = TRUE; 730 prev_c = c; 731 } 732 733 if (c == CHAR_LEFT_SQUARE_BRACKET || c == CHAR_RIGHT_SQUARE_BRACKET || 734 c == CHAR_BACKSLASH || c == CHAR_MINUS) 735 convert_glob_write(out, CHAR_BACKSLASH); 736 737 if (c == separator) separator_seen = TRUE; 738 739 do convert_glob_write(out, *char_start++); while (char_start < pattern); 740 } 741 742 *from = pattern; 743 return PCRE2_ERROR_MISSING_SQUARE_BRACKET; 744 } 745 746 747 /* Prints a (*COMMIT) into the output. 748 749 Arguments: 750 out output context 751 */ 752 753 static void 754 convert_glob_print_commit(pcre2_output_context *out) 755 { 756 out->out_str[0] = CHAR_LEFT_PARENTHESIS; 757 out->out_str[1] = CHAR_ASTERISK; 758 out->out_str[2] = CHAR_C; 759 out->out_str[3] = CHAR_O; 760 out->out_str[4] = CHAR_M; 761 out->out_str[5] = CHAR_M; 762 out->out_str[6] = CHAR_I; 763 out->out_str[7] = CHAR_T; 764 convert_glob_write_str(out, 8); 765 convert_glob_write(out, CHAR_RIGHT_PARENTHESIS); 766 } 767 768 769 /* Bash glob converter. 770 771 Arguments: 772 pattype the pattern type 773 pattern the pattern 774 plength length in code units 775 utf TRUE if UTF 776 use_buffer where to put the output 777 use_length length of use_buffer 778 bufflenptr where to put the used length 779 dummyrun TRUE if a dummy run 780 ccontext the convert context 781 782 Returns: 0 => success 783 !0 => error code 784 */ 785 786 static int 787 convert_glob(uint32_t options, PCRE2_SPTR pattern, PCRE2_SIZE plength, 788 BOOL utf, PCRE2_UCHAR *use_buffer, PCRE2_SIZE use_length, 789 PCRE2_SIZE *bufflenptr, BOOL dummyrun, pcre2_convert_context *ccontext) 790 { 791 pcre2_output_context out; 792 PCRE2_SPTR pattern_start = pattern; 793 PCRE2_SPTR pattern_end = pattern + plength; 794 PCRE2_UCHAR separator = ccontext->glob_separator; 795 PCRE2_UCHAR escape = ccontext->glob_escape; 796 PCRE2_UCHAR c; 797 BOOL no_wildsep = (options & PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR) != 0; 798 BOOL no_starstar = (options & PCRE2_CONVERT_GLOB_NO_STARSTAR) != 0; 799 BOOL in_atomic = FALSE; 800 BOOL after_starstar = FALSE; 801 BOOL no_slash_z = FALSE; 802 BOOL with_escape, is_start, after_separator; 803 int result = 0; 804 805 (void)utf; /* Avoid compiler warning. */ 806 807 #ifdef SUPPORT_UNICODE 808 if (utf && (separator >= 128 || escape >= 128)) 809 { 810 /* Currently only ASCII characters are supported. */ 811 *bufflenptr = 0; 812 return PCRE2_ERROR_CONVERT_SYNTAX; 813 } 814 #endif 815 816 with_escape = strchr(pcre2_escaped_literals, separator) != NULL; 817 818 /* Initialize default for error offset as end of input. */ 819 out.output = use_buffer; 820 out.output_end = use_buffer + use_length; 821 out.output_size = 0; 822 823 out.out_str[0] = CHAR_LEFT_PARENTHESIS; 824 out.out_str[1] = CHAR_QUESTION_MARK; 825 out.out_str[2] = CHAR_s; 826 out.out_str[3] = CHAR_RIGHT_PARENTHESIS; 827 convert_glob_write_str(&out, 4); 828 829 is_start = TRUE; 830 831 if (pattern < pattern_end && pattern[0] == CHAR_ASTERISK) 832 { 833 if (no_wildsep) 834 is_start = FALSE; 835 else if (!no_starstar && pattern + 1 < pattern_end && 836 pattern[1] == CHAR_ASTERISK) 837 is_start = FALSE; 838 } 839 840 if (is_start) 841 { 842 out.out_str[0] = CHAR_BACKSLASH; 843 out.out_str[1] = CHAR_A; 844 convert_glob_write_str(&out, 2); 845 } 846 847 while (pattern < pattern_end) 848 { 849 c = *pattern++; 850 851 if (c == CHAR_ASTERISK) 852 { 853 is_start = pattern == pattern_start + 1; 854 855 if (in_atomic) 856 { 857 convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS); 858 in_atomic = FALSE; 859 } 860 861 if (!no_starstar && pattern < pattern_end && *pattern == CHAR_ASTERISK) 862 { 863 after_separator = is_start || (pattern[-2] == separator); 864 865 do pattern++; while (pattern < pattern_end && 866 *pattern == CHAR_ASTERISK); 867 868 if (pattern >= pattern_end) 869 { 870 no_slash_z = TRUE; 871 break; 872 } 873 874 after_starstar = TRUE; 875 876 if (after_separator && escape != 0 && *pattern == escape && 877 pattern + 1 < pattern_end && pattern[1] == separator) 878 pattern++; 879 880 if (is_start) 881 { 882 if (*pattern != separator) continue; 883 884 out.out_str[0] = CHAR_LEFT_PARENTHESIS; 885 out.out_str[1] = CHAR_QUESTION_MARK; 886 out.out_str[2] = CHAR_COLON; 887 out.out_str[3] = CHAR_BACKSLASH; 888 out.out_str[4] = CHAR_A; 889 out.out_str[5] = CHAR_VERTICAL_LINE; 890 convert_glob_write_str(&out, 6); 891 892 convert_glob_print_separator(&out, separator, with_escape); 893 convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS); 894 895 pattern++; 896 continue; 897 } 898 899 convert_glob_print_commit(&out); 900 901 if (!after_separator || *pattern != separator) 902 { 903 out.out_str[0] = CHAR_DOT; 904 out.out_str[1] = CHAR_ASTERISK; 905 out.out_str[2] = CHAR_QUESTION_MARK; 906 convert_glob_write_str(&out, 3); 907 continue; 908 } 909 910 out.out_str[0] = CHAR_LEFT_PARENTHESIS; 911 out.out_str[1] = CHAR_QUESTION_MARK; 912 out.out_str[2] = CHAR_COLON; 913 out.out_str[3] = CHAR_DOT; 914 out.out_str[4] = CHAR_ASTERISK; 915 out.out_str[5] = CHAR_QUESTION_MARK; 916 917 convert_glob_write_str(&out, 6); 918 919 convert_glob_print_separator(&out, separator, with_escape); 920 921 out.out_str[0] = CHAR_RIGHT_PARENTHESIS; 922 out.out_str[1] = CHAR_QUESTION_MARK; 923 out.out_str[2] = CHAR_QUESTION_MARK; 924 convert_glob_write_str(&out, 3); 925 926 pattern++; 927 continue; 928 } 929 930 if (pattern < pattern_end && *pattern == CHAR_ASTERISK) 931 { 932 do pattern++; while (pattern < pattern_end && 933 *pattern == CHAR_ASTERISK); 934 } 935 936 if (no_wildsep) 937 { 938 if (pattern >= pattern_end) 939 { 940 no_slash_z = TRUE; 941 break; 942 } 943 944 /* Start check must be after the end check. */ 945 if (is_start) continue; 946 } 947 948 if (!is_start) 949 { 950 if (after_starstar) 951 { 952 out.out_str[0] = CHAR_LEFT_PARENTHESIS; 953 out.out_str[1] = CHAR_QUESTION_MARK; 954 out.out_str[2] = CHAR_GREATER_THAN_SIGN; 955 convert_glob_write_str(&out, 3); 956 in_atomic = TRUE; 957 } 958 else 959 convert_glob_print_commit(&out); 960 } 961 962 if (no_wildsep) 963 convert_glob_write(&out, CHAR_DOT); 964 else 965 convert_glob_print_wildcard(&out, separator, with_escape); 966 967 out.out_str[0] = CHAR_ASTERISK; 968 out.out_str[1] = CHAR_QUESTION_MARK; 969 if (pattern >= pattern_end) 970 out.out_str[1] = CHAR_PLUS; 971 convert_glob_write_str(&out, 2); 972 continue; 973 } 974 975 if (c == CHAR_QUESTION_MARK) 976 { 977 if (no_wildsep) 978 convert_glob_write(&out, CHAR_DOT); 979 else 980 convert_glob_print_wildcard(&out, separator, with_escape); 981 continue; 982 } 983 984 if (c == CHAR_LEFT_SQUARE_BRACKET) 985 { 986 result = convert_glob_parse_range(&pattern, pattern_end, 987 &out, utf, separator, with_escape, escape, no_wildsep); 988 if (result != 0) break; 989 continue; 990 } 991 992 if (escape != 0 && c == escape) 993 { 994 if (pattern >= pattern_end) 995 { 996 result = PCRE2_ERROR_CONVERT_SYNTAX; 997 break; 998 } 999 c = *pattern++; 1000 } 1001 1002 if (c < 128 && strchr(pcre2_escaped_literals, c) != NULL) 1003 convert_glob_write(&out, CHAR_BACKSLASH); 1004 1005 convert_glob_write(&out, c); 1006 } 1007 1008 if (result == 0) 1009 { 1010 if (!no_slash_z) 1011 { 1012 out.out_str[0] = CHAR_BACKSLASH; 1013 out.out_str[1] = CHAR_z; 1014 convert_glob_write_str(&out, 2); 1015 } 1016 1017 if (in_atomic) 1018 convert_glob_write(&out, CHAR_RIGHT_PARENTHESIS); 1019 1020 convert_glob_write(&out, CHAR_NUL); 1021 1022 if (!dummyrun && out.output_size != (PCRE2_SIZE) (out.output - use_buffer)) 1023 result = PCRE2_ERROR_NOMEMORY; 1024 } 1025 1026 if (result != 0) 1027 { 1028 *bufflenptr = pattern - pattern_start; 1029 return result; 1030 } 1031 1032 *bufflenptr = out.output_size - 1; 1033 return 0; 1034 } 1035 1036 1037 /************************************************* 1038 * Convert pattern * 1039 *************************************************/ 1040 1041 /* This is the external-facing function for converting other forms of pattern 1042 into PCRE2 regular expression patterns. On error, the bufflenptr argument is 1043 used to return an offset in the original pattern. 1044 1045 Arguments: 1046 pattern the input pattern 1047 plength length of input, or PCRE2_ZERO_TERMINATED 1048 options options bits 1049 buffptr pointer to pointer to output buffer 1050 bufflenptr pointer to length of output buffer 1051 ccontext convert context or NULL 1052 1053 Returns: 0 for success, else an error code (+ve or -ve) 1054 */ 1055 1056 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION 1057 pcre2_pattern_convert(PCRE2_SPTR pattern, PCRE2_SIZE plength, uint32_t options, 1058 PCRE2_UCHAR **buffptr, PCRE2_SIZE *bufflenptr, 1059 pcre2_convert_context *ccontext) 1060 { 1061 int i, rc; 1062 PCRE2_UCHAR dummy_buffer[DUMMY_BUFFER_SIZE]; 1063 PCRE2_UCHAR *use_buffer = dummy_buffer; 1064 PCRE2_SIZE use_length = DUMMY_BUFFER_SIZE; 1065 BOOL utf = (options & PCRE2_CONVERT_UTF) != 0; 1066 uint32_t pattype = options & TYPE_OPTIONS; 1067 1068 if (pattern == NULL || bufflenptr == NULL) return PCRE2_ERROR_NULL; 1069 1070 if ((options & ~ALL_OPTIONS) != 0 || /* Undefined bit set */ 1071 (pattype & (~pattype+1)) != pattype || /* More than one type set */ 1072 pattype == 0) /* No type set */ 1073 { 1074 *bufflenptr = 0; /* Error offset */ 1075 return PCRE2_ERROR_BADOPTION; 1076 } 1077 1078 if (plength == PCRE2_ZERO_TERMINATED) plength = PRIV(strlen)(pattern); 1079 if (ccontext == NULL) ccontext = 1080 (pcre2_convert_context *)(&PRIV(default_convert_context)); 1081 1082 /* Check UTF if required. */ 1083 1084 #ifndef SUPPORT_UNICODE 1085 if (utf) 1086 { 1087 *bufflenptr = 0; /* Error offset */ 1088 return PCRE2_ERROR_UNICODE_NOT_SUPPORTED; 1089 } 1090 #else 1091 if (utf && (options & PCRE2_CONVERT_NO_UTF_CHECK) == 0) 1092 { 1093 PCRE2_SIZE erroroffset; 1094 rc = PRIV(valid_utf)(pattern, plength, &erroroffset); 1095 if (rc != 0) 1096 { 1097 *bufflenptr = erroroffset; 1098 return rc; 1099 } 1100 } 1101 #endif 1102 1103 /* If buffptr is not NULL, and what it points to is not NULL, we are being 1104 provided with a buffer and a length, so set them as the buffer to use. */ 1105 1106 if (buffptr != NULL && *buffptr != NULL) 1107 { 1108 use_buffer = *buffptr; 1109 use_length = *bufflenptr; 1110 } 1111 1112 /* Call an individual converter, either just once (if a buffer was provided or 1113 just the length is needed), or twice (if a memory allocation is required). */ 1114 1115 for (i = 0; i < 2; i++) 1116 { 1117 PCRE2_UCHAR *allocated; 1118 BOOL dummyrun = buffptr == NULL || *buffptr == NULL; 1119 1120 switch(pattype) 1121 { 1122 case PCRE2_CONVERT_GLOB: 1123 rc = convert_glob(options & ~PCRE2_CONVERT_GLOB, pattern, plength, utf, 1124 use_buffer, use_length, bufflenptr, dummyrun, ccontext); 1125 break; 1126 1127 case PCRE2_CONVERT_POSIX_BASIC: 1128 case PCRE2_CONVERT_POSIX_EXTENDED: 1129 rc = convert_posix(pattype, pattern, plength, utf, use_buffer, use_length, 1130 bufflenptr, dummyrun, ccontext); 1131 break; 1132 1133 default: 1134 *bufflenptr = 0; /* Error offset */ 1135 return PCRE2_ERROR_INTERNAL; 1136 } 1137 1138 if (rc != 0 || /* Error */ 1139 buffptr == NULL || /* Just the length is required */ 1140 *buffptr != NULL) /* Buffer was provided or allocated */ 1141 return rc; 1142 1143 /* Allocate memory for the buffer, with hidden space for an allocator at 1144 the start. The next time round the loop runs the conversion for real. */ 1145 1146 allocated = PRIV(memctl_malloc)(sizeof(pcre2_memctl) + 1147 (*bufflenptr + 1)*PCRE2_CODE_UNIT_WIDTH, (pcre2_memctl *)ccontext); 1148 if (allocated == NULL) return PCRE2_ERROR_NOMEMORY; 1149 *buffptr = (PCRE2_UCHAR *)(((char *)allocated) + sizeof(pcre2_memctl)); 1150 1151 use_buffer = *buffptr; 1152 use_length = *bufflenptr + 1; 1153 } 1154 1155 /* Control should never get here. */ 1156 1157 return PCRE2_ERROR_INTERNAL; 1158 } 1159 1160 1161 /************************************************* 1162 * Free converted pattern * 1163 *************************************************/ 1164 1165 /* This frees a converted pattern that was put in newly-allocated memory. 1166 1167 Argument: the converted pattern 1168 Returns: nothing 1169 */ 1170 1171 PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION 1172 pcre2_converted_pattern_free(PCRE2_UCHAR *converted) 1173 { 1174 if (converted != NULL) 1175 { 1176 pcre2_memctl *memctl = 1177 (pcre2_memctl *)((char *)converted - sizeof(pcre2_memctl)); 1178 memctl->free(memctl, memctl->memory_data); 1179 } 1180 } 1181 1182 /* End of pcre2_convert.c */ 1183