1 /* This is JavaScriptCore's variant of the PCRE library. While this library 2 started out as a copy of PCRE, many of the features of PCRE have been 3 removed. This library now supports only the regular expression features 4 required by the JavaScript language specification, and has only the functions 5 needed by JavaScriptCore and the rest of WebKit. 6 7 Originally written by Philip Hazel 8 Copyright (c) 1997-2006 University of Cambridge 9 Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. 10 Copyright (C) 2007 Eric Seidel <eric (at) webkit.org> 11 12 ----------------------------------------------------------------------------- 13 Redistribution and use in source and binary forms, with or without 14 modification, are permitted provided that the following conditions are met: 15 16 * Redistributions of source code must retain the above copyright notice, 17 this list of conditions and the following disclaimer. 18 19 * Redistributions in binary form must reproduce the above copyright 20 notice, this list of conditions and the following disclaimer in the 21 documentation and/or other materials provided with the distribution. 22 23 * Neither the name of the University of Cambridge nor the names of its 24 contributors may be used to endorse or promote products derived from 25 this software without specific prior written permission. 26 27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 POSSIBILITY OF SUCH DAMAGE. 38 ----------------------------------------------------------------------------- 39 */ 40 41 /* This module contains the external function jsRegExpExecute(), along with 42 supporting internal functions that are not used by other modules. */ 43 44 #include "config.h" 45 46 #include "pcre_internal.h" 47 48 #include <string.h> 49 #include <wtf/ASCIICType.h> 50 #include <wtf/FastMalloc.h> 51 52 using namespace WTF; 53 54 /* Negative values for the firstchar and reqchar variables */ 55 56 #define REQ_UNSET (-2) 57 #define REQ_NONE (-1) 58 59 /************************************************* 60 * Code parameters and static tables * 61 *************************************************/ 62 63 /* Maximum number of items on the nested bracket stacks at compile time. This 64 applies to the nesting of all kinds of parentheses. It does not limit 65 un-nested, non-capturing parentheses. This number can be made bigger if 66 necessary - it is used to dimension one int and one unsigned char vector at 67 compile time. */ 68 69 #define BRASTACK_SIZE 200 70 71 /* Table for handling escaped characters in the range '0'-'z'. Positive returns 72 are simple data values; negative values are for special things like \d and so 73 on. Zero means further processing is needed (for things like \x), or the escape 74 is invalid. */ 75 76 static const short escapes[] = { 77 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ 78 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ 79 '@', 0, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */ 80 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */ 81 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */ 82 0, 0, 0, '[', '\\', ']', '^', '_', /* X - _ */ 83 '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */ 84 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */ 85 0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */ 86 0, 0, 0 /* x - z */ 87 }; 88 89 /* Error code numbers. They are given names so that they can more easily be 90 tracked. */ 91 92 enum ErrorCode { 93 ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, 94 ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17 95 }; 96 97 /* The texts of compile-time error messages. These are "char *" because they 98 are passed to the outside world. */ 99 100 static const char* errorText(ErrorCode code) 101 { 102 static const char errorTexts[] = 103 /* 1 */ 104 "\\ at end of pattern\0" 105 "\\c at end of pattern\0" 106 "character value in \\x{...} sequence is too large\0" 107 "numbers out of order in {} quantifier\0" 108 /* 5 */ 109 "number too big in {} quantifier\0" 110 "missing terminating ] for character class\0" 111 "internal error: code overflow\0" 112 "range out of order in character class\0" 113 "nothing to repeat\0" 114 /* 10 */ 115 "unmatched parentheses\0" 116 "internal error: unexpected repeat\0" 117 "unrecognized character after (?\0" 118 "failed to get memory\0" 119 "missing )\0" 120 /* 15 */ 121 "reference to non-existent subpattern\0" 122 "regular expression too large\0" 123 "parentheses nested too deeply" 124 ; 125 126 int i = code; 127 const char* text = errorTexts; 128 while (i > 1) 129 i -= !*text++; 130 return text; 131 } 132 133 /* Structure for passing "static" information around between the functions 134 doing the compiling. */ 135 136 struct CompileData { 137 CompileData() { 138 topBackref = 0; 139 backrefMap = 0; 140 reqVaryOpt = 0; 141 needOuterBracket = false; 142 numCapturingBrackets = 0; 143 } 144 int topBackref; /* Maximum back reference */ 145 unsigned backrefMap; /* Bitmap of low back refs */ 146 int reqVaryOpt; /* "After variable item" flag for reqByte */ 147 bool needOuterBracket; 148 int numCapturingBrackets; 149 }; 150 151 /* Definitions to allow mutual recursion */ 152 153 static bool compileBracket(int, int*, unsigned char**, const UChar**, const UChar*, ErrorCode*, int, int*, int*, CompileData&); 154 static bool bracketIsAnchored(const unsigned char* code); 155 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap); 156 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert); 157 158 /************************************************* 159 * Handle escapes * 160 *************************************************/ 161 162 /* This function is called when a \ has been encountered. It either returns a 163 positive value for a simple escape such as \n, or a negative value which 164 encodes one of the more complicated things such as \d. When UTF-8 is enabled, 165 a positive value greater than 255 may be returned. On entry, ptr is pointing at 166 the \. On exit, it is on the final character of the escape sequence. 167 168 Arguments: 169 ptrPtr points to the pattern position pointer 170 errorCodePtr points to the errorcode variable 171 bracount number of previous extracting brackets 172 options the options bits 173 isClass true if inside a character class 174 175 Returns: zero or positive => a data character 176 negative => a special escape sequence 177 on error, errorPtr is set 178 */ 179 180 static int checkEscape(const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int bracount, bool isClass) 181 { 182 const UChar* ptr = *ptrPtr + 1; 183 184 /* If backslash is at the end of the pattern, it's an error. */ 185 if (ptr == patternEnd) { 186 *errorCodePtr = ERR1; 187 *ptrPtr = ptr; 188 return 0; 189 } 190 191 int c = *ptr; 192 193 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in 194 a table. A non-zero result is something that can be returned immediately. 195 Otherwise further processing may be required. */ 196 197 if (c < '0' || c > 'z') { /* Not alphameric */ 198 } else if (int escapeValue = escapes[c - '0']) { 199 c = escapeValue; 200 if (isClass) { 201 if (-c == ESC_b) 202 c = '\b'; /* \b is backslash in a class */ 203 else if (-c == ESC_B) 204 c = 'B'; /* and \B is a capital B in a class (in browsers event though ECMAScript 15.10.2.19 says it raises an error) */ 205 } 206 /* Escapes that need further processing, or are illegal. */ 207 208 } else { 209 switch (c) { 210 case '1': 211 case '2': 212 case '3': 213 case '4': 214 case '5': 215 case '6': 216 case '7': 217 case '8': 218 case '9': 219 /* Escape sequences starting with a non-zero digit are backreferences, 220 unless there are insufficient brackets, in which case they are octal 221 escape sequences. Those sequences end on the first non-octal character 222 or when we overflow 0-255, whichever comes first. */ 223 224 if (!isClass) { 225 const UChar* oldptr = ptr; 226 c -= '0'; 227 while ((ptr + 1 < patternEnd) && isASCIIDigit(ptr[1]) && c <= bracount) 228 c = c * 10 + *(++ptr) - '0'; 229 if (c <= bracount) { 230 c = -(ESC_REF + c); 231 break; 232 } 233 ptr = oldptr; /* Put the pointer back and fall through */ 234 } 235 236 /* Handle an octal number following \. If the first digit is 8 or 9, 237 this is not octal. */ 238 239 if ((c = *ptr) >= '8') { 240 c = '\\'; 241 ptr -= 1; 242 break; 243 } 244 245 /* \0 always starts an octal number, but we may drop through to here with a 246 larger first octal digit. */ 247 248 case '0': { 249 c -= '0'; 250 int i; 251 for (i = 1; i <= 2; ++i) { 252 if (ptr + i >= patternEnd || ptr[i] < '0' || ptr[i] > '7') 253 break; 254 int cc = c * 8 + ptr[i] - '0'; 255 if (cc > 255) 256 break; 257 c = cc; 258 } 259 ptr += i - 1; 260 break; 261 } 262 263 case 'x': { 264 c = 0; 265 int i; 266 for (i = 1; i <= 2; ++i) { 267 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) { 268 c = 'x'; 269 i = 1; 270 break; 271 } 272 int cc = ptr[i]; 273 if (cc >= 'a') 274 cc -= 32; /* Convert to upper case */ 275 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10)); 276 } 277 ptr += i - 1; 278 break; 279 } 280 281 case 'u': { 282 c = 0; 283 int i; 284 for (i = 1; i <= 4; ++i) { 285 if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) { 286 c = 'u'; 287 i = 1; 288 break; 289 } 290 int cc = ptr[i]; 291 if (cc >= 'a') 292 cc -= 32; /* Convert to upper case */ 293 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10)); 294 } 295 ptr += i - 1; 296 break; 297 } 298 299 case 'c': 300 if (++ptr == patternEnd) { 301 *errorCodePtr = ERR2; 302 return 0; 303 } 304 305 c = *ptr; 306 307 /* To match Firefox, inside a character class, we also accept 308 numbers and '_' as control characters */ 309 if ((!isClass && !isASCIIAlpha(c)) || (!isASCIIAlphanumeric(c) && c != '_')) { 310 c = '\\'; 311 ptr -= 2; 312 break; 313 } 314 315 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding 316 is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */ 317 c = toASCIIUpper(c) ^ 0x40; 318 break; 319 } 320 } 321 322 *ptrPtr = ptr; 323 return c; 324 } 325 326 /************************************************* 327 * Check for counted repeat * 328 *************************************************/ 329 330 /* This function is called when a '{' is encountered in a place where it might 331 start a quantifier. It looks ahead to see if it really is a quantifier or not. 332 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd} 333 where the ddds are digits. 334 335 Arguments: 336 p pointer to the first char after '{' 337 338 Returns: true or false 339 */ 340 341 static bool isCountedRepeat(const UChar* p, const UChar* patternEnd) 342 { 343 if (p >= patternEnd || !isASCIIDigit(*p)) 344 return false; 345 p++; 346 while (p < patternEnd && isASCIIDigit(*p)) 347 p++; 348 if (p < patternEnd && *p == '}') 349 return true; 350 351 if (p >= patternEnd || *p++ != ',') 352 return false; 353 if (p < patternEnd && *p == '}') 354 return true; 355 356 if (p >= patternEnd || !isASCIIDigit(*p)) 357 return false; 358 p++; 359 while (p < patternEnd && isASCIIDigit(*p)) 360 p++; 361 362 return (p < patternEnd && *p == '}'); 363 } 364 365 /************************************************* 366 * Read repeat counts * 367 *************************************************/ 368 369 /* Read an item of the form {n,m} and return the values. This is called only 370 after isCountedRepeat() has confirmed that a repeat-count quantifier exists, 371 so the syntax is guaranteed to be correct, but we need to check the values. 372 373 Arguments: 374 p pointer to first char after '{' 375 minp pointer to int for min 376 maxp pointer to int for max 377 returned as -1 if no max 378 errorCodePtr points to error code variable 379 380 Returns: pointer to '}' on success; 381 current ptr on error, with errorCodePtr set non-zero 382 */ 383 384 static const UChar* readRepeatCounts(const UChar* p, int* minp, int* maxp, ErrorCode* errorCodePtr) 385 { 386 int min = 0; 387 int max = -1; 388 389 /* Read the minimum value and do a paranoid check: a negative value indicates 390 an integer overflow. */ 391 392 while (isASCIIDigit(*p)) 393 min = min * 10 + *p++ - '0'; 394 if (min < 0 || min > 65535) { 395 *errorCodePtr = ERR5; 396 return p; 397 } 398 399 /* Read the maximum value if there is one, and again do a paranoid on its size. 400 Also, max must not be less than min. */ 401 402 if (*p == '}') 403 max = min; 404 else { 405 if (*(++p) != '}') { 406 max = 0; 407 while (isASCIIDigit(*p)) 408 max = max * 10 + *p++ - '0'; 409 if (max < 0 || max > 65535) { 410 *errorCodePtr = ERR5; 411 return p; 412 } 413 if (max < min) { 414 *errorCodePtr = ERR4; 415 return p; 416 } 417 } 418 } 419 420 /* Fill in the required variables, and pass back the pointer to the terminating 421 '}'. */ 422 423 *minp = min; 424 *maxp = max; 425 return p; 426 } 427 428 /************************************************* 429 * Find first significant op code * 430 *************************************************/ 431 432 /* This is called by several functions that scan a compiled expression looking 433 for a fixed first character, or an anchoring op code etc. It skips over things 434 that do not influence this. 435 436 Arguments: 437 code pointer to the start of the group 438 Returns: pointer to the first significant opcode 439 */ 440 441 static const unsigned char* firstSignificantOpcode(const unsigned char* code) 442 { 443 while (*code == OP_BRANUMBER) 444 code += 3; 445 return code; 446 } 447 448 static const unsigned char* firstSignificantOpcodeSkippingAssertions(const unsigned char* code) 449 { 450 while (true) { 451 switch (*code) { 452 case OP_ASSERT_NOT: 453 advanceToEndOfBracket(code); 454 code += 1 + LINK_SIZE; 455 break; 456 case OP_WORD_BOUNDARY: 457 case OP_NOT_WORD_BOUNDARY: 458 ++code; 459 break; 460 case OP_BRANUMBER: 461 code += 3; 462 break; 463 default: 464 return code; 465 } 466 } 467 } 468 469 /************************************************* 470 * Get othercase range * 471 *************************************************/ 472 473 /* This function is passed the start and end of a class range, in UTF-8 mode 474 with UCP support. It searches up the characters, looking for internal ranges of 475 characters in the "other" case. Each call returns the next one, updating the 476 start address. 477 478 Arguments: 479 cptr points to starting character value; updated 480 d end value 481 ocptr where to put start of othercase range 482 odptr where to put end of othercase range 483 484 Yield: true when range returned; false when no more 485 */ 486 487 static bool getOthercaseRange(int* cptr, int d, int* ocptr, int* odptr) 488 { 489 int c, othercase = 0; 490 491 for (c = *cptr; c <= d; c++) { 492 if ((othercase = jsc_pcre_ucp_othercase(c)) >= 0) 493 break; 494 } 495 496 if (c > d) 497 return false; 498 499 *ocptr = othercase; 500 int next = othercase + 1; 501 502 for (++c; c <= d; c++) { 503 if (jsc_pcre_ucp_othercase(c) != next) 504 break; 505 next++; 506 } 507 508 *odptr = next - 1; 509 *cptr = c; 510 511 return true; 512 } 513 514 /************************************************* 515 * Convert character value to UTF-8 * 516 *************************************************/ 517 518 /* This function takes an integer value in the range 0 - 0x7fffffff 519 and encodes it as a UTF-8 character in 0 to 6 bytes. 520 521 Arguments: 522 cvalue the character value 523 buffer pointer to buffer for result - at least 6 bytes long 524 525 Returns: number of characters placed in the buffer 526 */ 527 528 static int encodeUTF8(int cvalue, unsigned char *buffer) 529 { 530 int i; 531 for (i = 0; i < jsc_pcre_utf8_table1_size; i++) 532 if (cvalue <= jsc_pcre_utf8_table1[i]) 533 break; 534 buffer += i; 535 for (int j = i; j > 0; j--) { 536 *buffer-- = 0x80 | (cvalue & 0x3f); 537 cvalue >>= 6; 538 } 539 *buffer = jsc_pcre_utf8_table2[i] | cvalue; 540 return i + 1; 541 } 542 543 /************************************************* 544 * Compile one branch * 545 *************************************************/ 546 547 /* Scan the pattern, compiling it into the code vector. 548 549 Arguments: 550 options the option bits 551 brackets points to number of extracting brackets used 552 codePtr points to the pointer to the current code point 553 ptrPtr points to the current pattern pointer 554 errorCodePtr points to error code variable 555 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE) 556 reqbyteptr set to the last literal character required, else < 0 557 cd contains pointers to tables etc. 558 559 Returns: true on success 560 false, with *errorCodePtr set non-zero on error 561 */ 562 563 static inline bool safelyCheckNextChar(const UChar* ptr, const UChar* patternEnd, UChar expected) 564 { 565 return ((ptr + 1 < patternEnd) && ptr[1] == expected); 566 } 567 568 static bool 569 compileBranch(int options, int* brackets, unsigned char** codePtr, 570 const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int *firstbyteptr, 571 int* reqbyteptr, CompileData& cd) 572 { 573 int repeatType, opType; 574 int repeatMin = 0, repeat_max = 0; /* To please picky compilers */ 575 int bravalue = 0; 576 int reqvary, tempreqvary; 577 int c; 578 unsigned char* code = *codePtr; 579 unsigned char* tempcode; 580 bool didGroupSetFirstByte = false; 581 const UChar* ptr = *ptrPtr; 582 const UChar* tempptr; 583 unsigned char* previous = NULL; 584 unsigned char classbits[32]; 585 586 bool class_utf8; 587 unsigned char* class_utf8data; 588 unsigned char utf8_char[6]; 589 590 /* Initialize no first byte, no required byte. REQ_UNSET means "no char 591 matching encountered yet". It gets changed to REQ_NONE if we hit something that 592 matches a non-fixed char first char; reqByte just remains unset if we never 593 find one. 594 595 When we hit a repeat whose minimum is zero, we may have to adjust these values 596 to take the zero repeat into account. This is implemented by setting them to 597 zeroFirstByte and zeroReqByte when such a repeat is encountered. The individual 598 item types that can be repeated set these backoff variables appropriately. */ 599 600 int firstByte = REQ_UNSET; 601 int reqByte = REQ_UNSET; 602 int zeroReqByte = REQ_UNSET; 603 int zeroFirstByte = REQ_UNSET; 604 605 /* The variable reqCaseOpt contains either the REQ_IGNORE_CASE value or zero, 606 according to the current setting of the ignores-case flag. REQ_IGNORE_CASE is a bit 607 value > 255. It is added into the firstByte or reqByte variables to record the 608 case status of the value. This is used only for ASCII characters. */ 609 610 int reqCaseOpt = (options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0; 611 612 /* Switch on next character until the end of the branch */ 613 614 for (;; ptr++) { 615 bool negateClass; 616 bool shouldFlipNegation; /* If a negative special such as \S is used, we should negate the whole class to properly support Unicode. */ 617 int classCharCount; 618 int classLastChar; 619 int skipBytes; 620 int subReqByte; 621 int subFirstByte; 622 int mcLength; 623 unsigned char mcbuffer[8]; 624 625 /* Next byte in the pattern */ 626 627 c = ptr < patternEnd ? *ptr : 0; 628 629 /* Fill in length of a previous callout, except when the next thing is 630 a quantifier. */ 631 632 bool isQuantifier = c == '*' || c == '+' || c == '?' || (c == '{' && isCountedRepeat(ptr + 1, patternEnd)); 633 634 switch (c) { 635 /* The branch terminates at end of string, |, or ). */ 636 637 case 0: 638 if (ptr < patternEnd) 639 goto NORMAL_CHAR; 640 // End of string; fall through 641 case '|': 642 case ')': 643 *firstbyteptr = firstByte; 644 *reqbyteptr = reqByte; 645 *codePtr = code; 646 *ptrPtr = ptr; 647 return true; 648 649 /* Handle single-character metacharacters. In multiline mode, ^ disables 650 the setting of any following char as a first character. */ 651 652 case '^': 653 if (options & MatchAcrossMultipleLinesOption) { 654 if (firstByte == REQ_UNSET) 655 firstByte = REQ_NONE; 656 *code++ = OP_BOL; 657 } else 658 *code++ = OP_CIRC; 659 previous = NULL; 660 break; 661 662 case '$': 663 previous = NULL; 664 if (options & MatchAcrossMultipleLinesOption) 665 *code++ = OP_EOL; 666 else 667 *code++ = OP_DOLL; 668 break; 669 670 /* There can never be a first char if '.' is first, whatever happens about 671 repeats. The value of reqByte doesn't change either. */ 672 673 case '.': 674 if (firstByte == REQ_UNSET) 675 firstByte = REQ_NONE; 676 zeroFirstByte = firstByte; 677 zeroReqByte = reqByte; 678 previous = code; 679 *code++ = OP_NOT_NEWLINE; 680 break; 681 682 /* Character classes. If the included characters are all < 256, we build a 683 32-byte bitmap of the permitted characters, except in the special case 684 where there is only one such character. For negated classes, we build the 685 map as usual, then invert it at the end. However, we use a different opcode 686 so that data characters > 255 can be handled correctly. 687 688 If the class contains characters outside the 0-255 range, a different 689 opcode is compiled. It may optionally have a bit map for characters < 256, 690 but those above are are explicitly listed afterwards. A flag byte tells 691 whether the bitmap is present, and whether this is a negated class or not. 692 */ 693 694 case '[': { 695 previous = code; 696 shouldFlipNegation = false; 697 698 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if 699 they are encountered at the top level, so we'll do that too. */ 700 701 /* If the first character is '^', set the negation flag and skip it. */ 702 703 if (ptr + 1 >= patternEnd) { 704 *errorCodePtr = ERR6; 705 return false; 706 } 707 708 if (ptr[1] == '^') { 709 negateClass = true; 710 ++ptr; 711 } else 712 negateClass = false; 713 714 /* Keep a count of chars with values < 256 so that we can optimize the case 715 of just a single character (as long as it's < 256). For higher valued UTF-8 716 characters, we don't yet do any optimization. */ 717 718 classCharCount = 0; 719 classLastChar = -1; 720 721 class_utf8 = false; /* No chars >= 256 */ 722 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */ 723 724 /* Initialize the 32-char bit map to all zeros. We have to build the 725 map in a temporary bit of store, in case the class contains only 1 726 character (< 256), because in that case the compiled code doesn't use the 727 bit map. */ 728 729 memset(classbits, 0, 32 * sizeof(unsigned char)); 730 731 /* Process characters until ] is reached. The first pass 732 through the regex checked the overall syntax, so we don't need to be very 733 strict here. At the start of the loop, c contains the first byte of the 734 character. */ 735 736 while ((++ptr < patternEnd) && (c = *ptr) != ']') { 737 /* Backslash may introduce a single character, or it may introduce one 738 of the specials, which just set a flag. Escaped items are checked for 739 validity in the pre-compiling pass. The sequence \b is a special case. 740 Inside a class (and only there) it is treated as backspace. Elsewhere 741 it marks a word boundary. Other escapes have preset maps ready to 742 or into the one we are building. We assume they have more than one 743 character in them, so set classCharCount bigger than one. */ 744 745 if (c == '\\') { 746 c = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, true); 747 if (c < 0) { 748 classCharCount += 2; /* Greater than 1 is what matters */ 749 switch (-c) { 750 case ESC_d: 751 for (c = 0; c < 32; c++) 752 classbits[c] |= classBitmapForChar(c + cbit_digit); 753 continue; 754 755 case ESC_D: 756 shouldFlipNegation = true; 757 for (c = 0; c < 32; c++) 758 classbits[c] |= ~classBitmapForChar(c + cbit_digit); 759 continue; 760 761 case ESC_w: 762 for (c = 0; c < 32; c++) 763 classbits[c] |= classBitmapForChar(c + cbit_word); 764 continue; 765 766 case ESC_W: 767 shouldFlipNegation = true; 768 for (c = 0; c < 32; c++) 769 classbits[c] |= ~classBitmapForChar(c + cbit_word); 770 continue; 771 772 case ESC_s: 773 for (c = 0; c < 32; c++) 774 classbits[c] |= classBitmapForChar(c + cbit_space); 775 continue; 776 777 case ESC_S: 778 shouldFlipNegation = true; 779 for (c = 0; c < 32; c++) 780 classbits[c] |= ~classBitmapForChar(c + cbit_space); 781 continue; 782 783 /* Unrecognized escapes are faulted if PCRE is running in its 784 strict mode. By default, for compatibility with Perl, they are 785 treated as literals. */ 786 787 default: 788 c = *ptr; /* The final character */ 789 classCharCount -= 2; /* Undo the default count from above */ 790 } 791 } 792 793 /* Fall through if we have a single character (c >= 0). This may be 794 > 256 in UTF-8 mode. */ 795 796 } /* End of backslash handling */ 797 798 /* A single character may be followed by '-' to form a range. However, 799 Perl does not permit ']' to be the end of the range. A '-' character 800 here is treated as a literal. */ 801 802 if ((ptr + 2 < patternEnd) && ptr[1] == '-' && ptr[2] != ']') { 803 ptr += 2; 804 805 int d = *ptr; 806 807 /* The second part of a range can be a single-character escape, but 808 not any of the other escapes. Perl 5.6 treats a hyphen as a literal 809 in such circumstances. */ 810 811 if (d == '\\') { 812 const UChar* oldptr = ptr; 813 d = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, true); 814 815 /* \X is literal X; any other special means the '-' was literal */ 816 if (d < 0) { 817 ptr = oldptr - 2; 818 goto LONE_SINGLE_CHARACTER; /* A few lines below */ 819 } 820 } 821 822 /* The check that the two values are in the correct order happens in 823 the pre-pass. Optimize one-character ranges */ 824 825 if (d == c) 826 goto LONE_SINGLE_CHARACTER; /* A few lines below */ 827 828 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless 829 matching, we have to use an XCLASS with extra data items. Caseless 830 matching for characters > 127 is available only if UCP support is 831 available. */ 832 833 if ((d > 255 || ((options & IgnoreCaseOption) && d > 127))) { 834 class_utf8 = true; 835 836 /* With UCP support, we can find the other case equivalents of 837 the relevant characters. There may be several ranges. Optimize how 838 they fit with the basic range. */ 839 840 if (options & IgnoreCaseOption) { 841 int occ, ocd; 842 int cc = c; 843 int origd = d; 844 while (getOthercaseRange(&cc, origd, &occ, &ocd)) { 845 if (occ >= c && ocd <= d) 846 continue; /* Skip embedded ranges */ 847 848 if (occ < c && ocd >= c - 1) /* Extend the basic range */ 849 { /* if there is overlap, */ 850 c = occ; /* noting that if occ < c */ 851 continue; /* we can't have ocd > d */ 852 } /* because a subrange is */ 853 if (ocd > d && occ <= d + 1) /* always shorter than */ 854 { /* the basic range. */ 855 d = ocd; 856 continue; 857 } 858 859 if (occ == ocd) 860 *class_utf8data++ = XCL_SINGLE; 861 else { 862 *class_utf8data++ = XCL_RANGE; 863 class_utf8data += encodeUTF8(occ, class_utf8data); 864 } 865 class_utf8data += encodeUTF8(ocd, class_utf8data); 866 } 867 } 868 869 /* Now record the original range, possibly modified for UCP caseless 870 overlapping ranges. */ 871 872 *class_utf8data++ = XCL_RANGE; 873 class_utf8data += encodeUTF8(c, class_utf8data); 874 class_utf8data += encodeUTF8(d, class_utf8data); 875 876 /* With UCP support, we are done. Without UCP support, there is no 877 caseless matching for UTF-8 characters > 127; we can use the bit map 878 for the smaller ones. */ 879 880 continue; /* With next character in the class */ 881 } 882 883 /* We use the bit map for all cases when not in UTF-8 mode; else 884 ranges that lie entirely within 0-127 when there is UCP support; else 885 for partial ranges without UCP support. */ 886 887 for (; c <= d; c++) { 888 classbits[c/8] |= (1 << (c&7)); 889 if (options & IgnoreCaseOption) { 890 int uc = flipCase(c); 891 classbits[uc/8] |= (1 << (uc&7)); 892 } 893 classCharCount++; /* in case a one-char range */ 894 classLastChar = c; 895 } 896 897 continue; /* Go get the next char in the class */ 898 } 899 900 /* Handle a lone single character - we can get here for a normal 901 non-escape char, or after \ that introduces a single character or for an 902 apparent range that isn't. */ 903 904 LONE_SINGLE_CHARACTER: 905 906 /* Handle a character that cannot go in the bit map */ 907 908 if ((c > 255 || ((options & IgnoreCaseOption) && c > 127))) { 909 class_utf8 = true; 910 *class_utf8data++ = XCL_SINGLE; 911 class_utf8data += encodeUTF8(c, class_utf8data); 912 913 if (options & IgnoreCaseOption) { 914 int othercase; 915 if ((othercase = jsc_pcre_ucp_othercase(c)) >= 0) { 916 *class_utf8data++ = XCL_SINGLE; 917 class_utf8data += encodeUTF8(othercase, class_utf8data); 918 } 919 } 920 } else { 921 /* Handle a single-byte character */ 922 classbits[c/8] |= (1 << (c&7)); 923 if (options & IgnoreCaseOption) { 924 c = flipCase(c); 925 classbits[c/8] |= (1 << (c&7)); 926 } 927 classCharCount++; 928 classLastChar = c; 929 } 930 } 931 932 /* If classCharCount is 1, we saw precisely one character whose value is 933 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we 934 can optimize the negative case only if there were no characters >= 128 935 because OP_NOT and the related opcodes like OP_NOTSTAR operate on 936 single-bytes only. This is an historical hangover. Maybe one day we can 937 tidy these opcodes to handle multi-byte characters. 938 939 The optimization throws away the bit map. We turn the item into a 940 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note 941 that OP_NOT does not support multibyte characters. In the positive case, it 942 can cause firstByte to be set. Otherwise, there can be no first char if 943 this item is first, whatever repeat count may follow. In the case of 944 reqByte, save the previous value for reinstating. */ 945 946 if (classCharCount == 1 && (!class_utf8 && (!negateClass || classLastChar < 128))) { 947 zeroReqByte = reqByte; 948 949 /* The OP_NOT opcode works on one-byte characters only. */ 950 951 if (negateClass) { 952 if (firstByte == REQ_UNSET) 953 firstByte = REQ_NONE; 954 zeroFirstByte = firstByte; 955 *code++ = OP_NOT; 956 *code++ = classLastChar; 957 break; 958 } 959 960 /* For a single, positive character, get the value into c, and 961 then we can handle this with the normal one-character code. */ 962 963 c = classLastChar; 964 goto NORMAL_CHAR; 965 } /* End of 1-char optimization */ 966 967 /* The general case - not the one-char optimization. If this is the first 968 thing in the branch, there can be no first char setting, whatever the 969 repeat count. Any reqByte setting must remain unchanged after any kind of 970 repeat. */ 971 972 if (firstByte == REQ_UNSET) firstByte = REQ_NONE; 973 zeroFirstByte = firstByte; 974 zeroReqByte = reqByte; 975 976 /* If there are characters with values > 255, we have to compile an 977 extended class, with its own opcode. If there are no characters < 256, 978 we can omit the bitmap. */ 979 980 if (class_utf8 && !shouldFlipNegation) { 981 *class_utf8data++ = XCL_END; /* Marks the end of extra data */ 982 *code++ = OP_XCLASS; 983 code += LINK_SIZE; 984 *code = negateClass? XCL_NOT : 0; 985 986 /* If the map is required, install it, and move on to the end of 987 the extra data */ 988 989 if (classCharCount > 0) { 990 *code++ |= XCL_MAP; 991 memcpy(code, classbits, 32); 992 code = class_utf8data; 993 } 994 995 /* If the map is not required, slide down the extra data. */ 996 997 else { 998 int len = class_utf8data - (code + 33); 999 memmove(code + 1, code + 33, len); 1000 code += len + 1; 1001 } 1002 1003 /* Now fill in the complete length of the item */ 1004 1005 putLinkValue(previous + 1, code - previous); 1006 break; /* End of class handling */ 1007 } 1008 1009 /* If there are no characters > 255, negate the 32-byte map if necessary, 1010 and copy it into the code vector. If this is the first thing in the branch, 1011 there can be no first char setting, whatever the repeat count. Any reqByte 1012 setting must remain unchanged after any kind of repeat. */ 1013 1014 *code++ = (negateClass == shouldFlipNegation) ? OP_CLASS : OP_NCLASS; 1015 if (negateClass) 1016 for (c = 0; c < 32; c++) 1017 code[c] = ~classbits[c]; 1018 else 1019 memcpy(code, classbits, 32); 1020 code += 32; 1021 break; 1022 } 1023 1024 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this 1025 has been tested above. */ 1026 1027 case '{': 1028 if (!isQuantifier) 1029 goto NORMAL_CHAR; 1030 ptr = readRepeatCounts(ptr + 1, &repeatMin, &repeat_max, errorCodePtr); 1031 if (*errorCodePtr) 1032 goto FAILED; 1033 goto REPEAT; 1034 1035 case '*': 1036 repeatMin = 0; 1037 repeat_max = -1; 1038 goto REPEAT; 1039 1040 case '+': 1041 repeatMin = 1; 1042 repeat_max = -1; 1043 goto REPEAT; 1044 1045 case '?': 1046 repeatMin = 0; 1047 repeat_max = 1; 1048 1049 REPEAT: 1050 if (!previous) { 1051 *errorCodePtr = ERR9; 1052 goto FAILED; 1053 } 1054 1055 if (repeatMin == 0) { 1056 firstByte = zeroFirstByte; /* Adjust for zero repeat */ 1057 reqByte = zeroReqByte; /* Ditto */ 1058 } 1059 1060 /* Remember whether this is a variable length repeat */ 1061 1062 reqvary = (repeatMin == repeat_max) ? 0 : REQ_VARY; 1063 1064 opType = 0; /* Default single-char op codes */ 1065 1066 /* Save start of previous item, in case we have to move it up to make space 1067 for an inserted OP_ONCE for the additional '+' extension. */ 1068 /* FIXME: Probably don't need this because we don't use OP_ONCE. */ 1069 1070 tempcode = previous; 1071 1072 /* If the next character is '+', we have a possessive quantifier. This 1073 implies greediness, whatever the setting of the PCRE_UNGREEDY option. 1074 If the next character is '?' this is a minimizing repeat, by default, 1075 but if PCRE_UNGREEDY is set, it works the other way round. We change the 1076 repeat type to the non-default. */ 1077 1078 if (safelyCheckNextChar(ptr, patternEnd, '?')) { 1079 repeatType = 1; 1080 ptr++; 1081 } else 1082 repeatType = 0; 1083 1084 /* If previous was a character match, abolish the item and generate a 1085 repeat item instead. If a char item has a minumum of more than one, ensure 1086 that it is set in reqByte - it might not be if a sequence such as x{3} is 1087 the first thing in a branch because the x will have gone into firstByte 1088 instead. */ 1089 1090 if (*previous == OP_CHAR || *previous == OP_CHAR_IGNORING_CASE) { 1091 /* Deal with UTF-8 characters that take up more than one byte. It's 1092 easier to write this out separately than try to macrify it. Use c to 1093 hold the length of the character in bytes, plus 0x80 to flag that it's a 1094 length rather than a small character. */ 1095 1096 if (code[-1] & 0x80) { 1097 unsigned char *lastchar = code - 1; 1098 while((*lastchar & 0xc0) == 0x80) 1099 lastchar--; 1100 c = code - lastchar; /* Length of UTF-8 character */ 1101 memcpy(utf8_char, lastchar, c); /* Save the char */ 1102 c |= 0x80; /* Flag c as a length */ 1103 } 1104 else { 1105 c = code[-1]; 1106 if (repeatMin > 1) 1107 reqByte = c | reqCaseOpt | cd.reqVaryOpt; 1108 } 1109 1110 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ 1111 } 1112 1113 else if (*previous == OP_ASCII_CHAR || *previous == OP_ASCII_LETTER_IGNORING_CASE) { 1114 c = previous[1]; 1115 if (repeatMin > 1) 1116 reqByte = c | reqCaseOpt | cd.reqVaryOpt; 1117 goto OUTPUT_SINGLE_REPEAT; 1118 } 1119 1120 /* If previous was a single negated character ([^a] or similar), we use 1121 one of the special opcodes, replacing it. The code is shared with single- 1122 character repeats by setting opt_type to add a suitable offset into 1123 repeatType. OP_NOT is currently used only for single-byte chars. */ 1124 1125 else if (*previous == OP_NOT) { 1126 opType = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */ 1127 c = previous[1]; 1128 goto OUTPUT_SINGLE_REPEAT; 1129 } 1130 1131 /* If previous was a character type match (\d or similar), abolish it and 1132 create a suitable repeat item. The code is shared with single-character 1133 repeats by setting opType to add a suitable offset into repeatType. */ 1134 1135 else if (*previous <= OP_NOT_NEWLINE) { 1136 opType = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ 1137 c = *previous; 1138 1139 OUTPUT_SINGLE_REPEAT: 1140 int prop_type = -1; 1141 int prop_value = -1; 1142 1143 unsigned char* oldcode = code; 1144 code = previous; /* Usually overwrite previous item */ 1145 1146 /* If the maximum is zero then the minimum must also be zero; Perl allows 1147 this case, so we do too - by simply omitting the item altogether. */ 1148 1149 if (repeat_max == 0) 1150 goto END_REPEAT; 1151 1152 /* Combine the opType with the repeatType */ 1153 1154 repeatType += opType; 1155 1156 /* A minimum of zero is handled either as the special case * or ?, or as 1157 an UPTO, with the maximum given. */ 1158 1159 if (repeatMin == 0) { 1160 if (repeat_max == -1) 1161 *code++ = OP_STAR + repeatType; 1162 else if (repeat_max == 1) 1163 *code++ = OP_QUERY + repeatType; 1164 else { 1165 *code++ = OP_UPTO + repeatType; 1166 put2ByteValueAndAdvance(code, repeat_max); 1167 } 1168 } 1169 1170 /* A repeat minimum of 1 is optimized into some special cases. If the 1171 maximum is unlimited, we use OP_PLUS. Otherwise, the original item it 1172 left in place and, if the maximum is greater than 1, we use OP_UPTO with 1173 one less than the maximum. */ 1174 1175 else if (repeatMin == 1) { 1176 if (repeat_max == -1) 1177 *code++ = OP_PLUS + repeatType; 1178 else { 1179 code = oldcode; /* leave previous item in place */ 1180 if (repeat_max == 1) 1181 goto END_REPEAT; 1182 *code++ = OP_UPTO + repeatType; 1183 put2ByteValueAndAdvance(code, repeat_max - 1); 1184 } 1185 } 1186 1187 /* The case {n,n} is just an EXACT, while the general case {n,m} is 1188 handled as an EXACT followed by an UPTO. */ 1189 1190 else { 1191 *code++ = OP_EXACT + opType; /* NB EXACT doesn't have repeatType */ 1192 put2ByteValueAndAdvance(code, repeatMin); 1193 1194 /* If the maximum is unlimited, insert an OP_STAR. Before doing so, 1195 we have to insert the character for the previous code. For a repeated 1196 Unicode property match, there are two extra bytes that define the 1197 required property. In UTF-8 mode, long characters have their length in 1198 c, with the 0x80 bit as a flag. */ 1199 1200 if (repeat_max < 0) { 1201 if (c >= 128) { 1202 memcpy(code, utf8_char, c & 7); 1203 code += c & 7; 1204 } else { 1205 *code++ = c; 1206 if (prop_type >= 0) { 1207 *code++ = prop_type; 1208 *code++ = prop_value; 1209 } 1210 } 1211 *code++ = OP_STAR + repeatType; 1212 } 1213 1214 /* Else insert an UPTO if the max is greater than the min, again 1215 preceded by the character, for the previously inserted code. */ 1216 1217 else if (repeat_max != repeatMin) { 1218 if (c >= 128) { 1219 memcpy(code, utf8_char, c & 7); 1220 code += c & 7; 1221 } else 1222 *code++ = c; 1223 if (prop_type >= 0) { 1224 *code++ = prop_type; 1225 *code++ = prop_value; 1226 } 1227 repeat_max -= repeatMin; 1228 *code++ = OP_UPTO + repeatType; 1229 put2ByteValueAndAdvance(code, repeat_max); 1230 } 1231 } 1232 1233 /* The character or character type itself comes last in all cases. */ 1234 1235 if (c >= 128) { 1236 memcpy(code, utf8_char, c & 7); 1237 code += c & 7; 1238 } else 1239 *code++ = c; 1240 1241 /* For a repeated Unicode property match, there are two extra bytes that 1242 define the required property. */ 1243 1244 if (prop_type >= 0) { 1245 *code++ = prop_type; 1246 *code++ = prop_value; 1247 } 1248 } 1249 1250 /* If previous was a character class or a back reference, we put the repeat 1251 stuff after it, but just skip the item if the repeat was {0,0}. */ 1252 1253 else if (*previous == OP_CLASS || 1254 *previous == OP_NCLASS || 1255 *previous == OP_XCLASS || 1256 *previous == OP_REF) 1257 { 1258 if (repeat_max == 0) { 1259 code = previous; 1260 goto END_REPEAT; 1261 } 1262 1263 if (repeatMin == 0 && repeat_max == -1) 1264 *code++ = OP_CRSTAR + repeatType; 1265 else if (repeatMin == 1 && repeat_max == -1) 1266 *code++ = OP_CRPLUS + repeatType; 1267 else if (repeatMin == 0 && repeat_max == 1) 1268 *code++ = OP_CRQUERY + repeatType; 1269 else { 1270 *code++ = OP_CRRANGE + repeatType; 1271 put2ByteValueAndAdvance(code, repeatMin); 1272 if (repeat_max == -1) 1273 repeat_max = 0; /* 2-byte encoding for max */ 1274 put2ByteValueAndAdvance(code, repeat_max); 1275 } 1276 } 1277 1278 /* If previous was a bracket group, we may have to replicate it in certain 1279 cases. */ 1280 1281 else if (*previous >= OP_BRA) { 1282 int ketoffset = 0; 1283 int len = code - previous; 1284 unsigned char* bralink = NULL; 1285 1286 /* If the maximum repeat count is unlimited, find the end of the bracket 1287 by scanning through from the start, and compute the offset back to it 1288 from the current code pointer. There may be an OP_OPT setting following 1289 the final KET, so we can't find the end just by going back from the code 1290 pointer. */ 1291 1292 if (repeat_max == -1) { 1293 const unsigned char* ket = previous; 1294 advanceToEndOfBracket(ket); 1295 ketoffset = code - ket; 1296 } 1297 1298 /* The case of a zero minimum is special because of the need to stick 1299 OP_BRAZERO in front of it, and because the group appears once in the 1300 data, whereas in other cases it appears the minimum number of times. For 1301 this reason, it is simplest to treat this case separately, as otherwise 1302 the code gets far too messy. There are several special subcases when the 1303 minimum is zero. */ 1304 1305 if (repeatMin == 0) { 1306 /* If the maximum is also zero, we just omit the group from the output 1307 altogether. */ 1308 1309 if (repeat_max == 0) { 1310 code = previous; 1311 goto END_REPEAT; 1312 } 1313 1314 /* If the maximum is 1 or unlimited, we just have to stick in the 1315 BRAZERO and do no more at this point. However, we do need to adjust 1316 any OP_RECURSE calls inside the group that refer to the group itself or 1317 any internal group, because the offset is from the start of the whole 1318 regex. Temporarily terminate the pattern while doing this. */ 1319 1320 if (repeat_max <= 1) { 1321 *code = OP_END; 1322 memmove(previous+1, previous, len); 1323 code++; 1324 *previous++ = OP_BRAZERO + repeatType; 1325 } 1326 1327 /* If the maximum is greater than 1 and limited, we have to replicate 1328 in a nested fashion, sticking OP_BRAZERO before each set of brackets. 1329 The first one has to be handled carefully because it's the original 1330 copy, which has to be moved up. The remainder can be handled by code 1331 that is common with the non-zero minimum case below. We have to 1332 adjust the value of repeat_max, since one less copy is required. */ 1333 1334 else { 1335 *code = OP_END; 1336 memmove(previous + 2 + LINK_SIZE, previous, len); 1337 code += 2 + LINK_SIZE; 1338 *previous++ = OP_BRAZERO + repeatType; 1339 *previous++ = OP_BRA; 1340 1341 /* We chain together the bracket offset fields that have to be 1342 filled in later when the ends of the brackets are reached. */ 1343 1344 int offset = (!bralink) ? 0 : previous - bralink; 1345 bralink = previous; 1346 putLinkValueAllowZeroAndAdvance(previous, offset); 1347 } 1348 1349 repeat_max--; 1350 } 1351 1352 /* If the minimum is greater than zero, replicate the group as many 1353 times as necessary, and adjust the maximum to the number of subsequent 1354 copies that we need. If we set a first char from the group, and didn't 1355 set a required char, copy the latter from the former. */ 1356 1357 else { 1358 if (repeatMin > 1) { 1359 if (didGroupSetFirstByte && reqByte < 0) 1360 reqByte = firstByte; 1361 for (int i = 1; i < repeatMin; i++) { 1362 memcpy(code, previous, len); 1363 code += len; 1364 } 1365 } 1366 if (repeat_max > 0) 1367 repeat_max -= repeatMin; 1368 } 1369 1370 /* This code is common to both the zero and non-zero minimum cases. If 1371 the maximum is limited, it replicates the group in a nested fashion, 1372 remembering the bracket starts on a stack. In the case of a zero minimum, 1373 the first one was set up above. In all cases the repeat_max now specifies 1374 the number of additional copies needed. */ 1375 1376 if (repeat_max >= 0) { 1377 for (int i = repeat_max - 1; i >= 0; i--) { 1378 *code++ = OP_BRAZERO + repeatType; 1379 1380 /* All but the final copy start a new nesting, maintaining the 1381 chain of brackets outstanding. */ 1382 1383 if (i != 0) { 1384 *code++ = OP_BRA; 1385 int offset = (!bralink) ? 0 : code - bralink; 1386 bralink = code; 1387 putLinkValueAllowZeroAndAdvance(code, offset); 1388 } 1389 1390 memcpy(code, previous, len); 1391 code += len; 1392 } 1393 1394 /* Now chain through the pending brackets, and fill in their length 1395 fields (which are holding the chain links pro tem). */ 1396 1397 while (bralink) { 1398 int offset = code - bralink + 1; 1399 unsigned char* bra = code - offset; 1400 int oldlinkoffset = getLinkValueAllowZero(bra + 1); 1401 bralink = (!oldlinkoffset) ? 0 : bralink - oldlinkoffset; 1402 *code++ = OP_KET; 1403 putLinkValueAndAdvance(code, offset); 1404 putLinkValue(bra + 1, offset); 1405 } 1406 } 1407 1408 /* If the maximum is unlimited, set a repeater in the final copy. We 1409 can't just offset backwards from the current code point, because we 1410 don't know if there's been an options resetting after the ket. The 1411 correct offset was computed above. */ 1412 1413 else 1414 code[-ketoffset] = OP_KETRMAX + repeatType; 1415 } 1416 1417 // A quantifier after an assertion is mostly meaningless, but it 1418 // can nullify the assertion if it has a 0 minimum. 1419 else if (*previous == OP_ASSERT || *previous == OP_ASSERT_NOT) { 1420 if (repeatMin == 0) { 1421 code = previous; 1422 goto END_REPEAT; 1423 } 1424 } 1425 1426 /* Else there's some kind of shambles */ 1427 1428 else { 1429 *errorCodePtr = ERR11; 1430 goto FAILED; 1431 } 1432 1433 /* In all case we no longer have a previous item. We also set the 1434 "follows varying string" flag for subsequently encountered reqbytes if 1435 it isn't already set and we have just passed a varying length item. */ 1436 1437 END_REPEAT: 1438 previous = NULL; 1439 cd.reqVaryOpt |= reqvary; 1440 break; 1441 1442 /* Start of nested bracket sub-expression, or comment or lookahead or 1443 lookbehind or option setting or condition. First deal with special things 1444 that can come after a bracket; all are introduced by ?, and the appearance 1445 of any of them means that this is not a referencing group. They were 1446 checked for validity in the first pass over the string, so we don't have to 1447 check for syntax errors here. */ 1448 1449 case '(': 1450 skipBytes = 0; 1451 1452 if (*(++ptr) == '?') { 1453 switch (*(++ptr)) { 1454 case ':': /* Non-extracting bracket */ 1455 bravalue = OP_BRA; 1456 ptr++; 1457 break; 1458 1459 case '=': /* Positive lookahead */ 1460 bravalue = OP_ASSERT; 1461 ptr++; 1462 break; 1463 1464 case '!': /* Negative lookahead */ 1465 bravalue = OP_ASSERT_NOT; 1466 ptr++; 1467 break; 1468 1469 /* Character after (? not specially recognized */ 1470 1471 default: 1472 *errorCodePtr = ERR12; 1473 goto FAILED; 1474 } 1475 } 1476 1477 /* Else we have a referencing group; adjust the opcode. If the bracket 1478 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and 1479 arrange for the true number to follow later, in an OP_BRANUMBER item. */ 1480 1481 else { 1482 if (++(*brackets) > EXTRACT_BASIC_MAX) { 1483 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1; 1484 code[1 + LINK_SIZE] = OP_BRANUMBER; 1485 put2ByteValue(code + 2 + LINK_SIZE, *brackets); 1486 skipBytes = 3; 1487 } 1488 else 1489 bravalue = OP_BRA + *brackets; 1490 } 1491 1492 /* Process nested bracketed re. We copy code into a non-variable 1493 in order to be able to pass its address because some compilers 1494 complain otherwise. Pass in a new setting for the ims options 1495 if they have changed. */ 1496 1497 previous = code; 1498 *code = bravalue; 1499 tempcode = code; 1500 tempreqvary = cd.reqVaryOpt; /* Save value before bracket */ 1501 1502 if (!compileBracket( 1503 options, 1504 brackets, /* Extracting bracket count */ 1505 &tempcode, /* Where to put code (updated) */ 1506 &ptr, /* Input pointer (updated) */ 1507 patternEnd, 1508 errorCodePtr, /* Where to put an error message */ 1509 skipBytes, /* Skip over OP_BRANUMBER */ 1510 &subFirstByte, /* For possible first char */ 1511 &subReqByte, /* For possible last char */ 1512 cd)) /* Tables block */ 1513 goto FAILED; 1514 1515 /* At the end of compiling, code is still pointing to the start of the 1516 group, while tempcode has been updated to point past the end of the group 1517 and any option resetting that may follow it. The pattern pointer (ptr) 1518 is on the bracket. */ 1519 1520 /* Handle updating of the required and first characters. Update for normal 1521 brackets of all kinds, and conditions with two branches (see code above). 1522 If the bracket is followed by a quantifier with zero repeat, we have to 1523 back off. Hence the definition of zeroReqByte and zeroFirstByte outside the 1524 main loop so that they can be accessed for the back off. */ 1525 1526 zeroReqByte = reqByte; 1527 zeroFirstByte = firstByte; 1528 didGroupSetFirstByte = false; 1529 1530 if (bravalue >= OP_BRA) { 1531 /* If we have not yet set a firstByte in this branch, take it from the 1532 subpattern, remembering that it was set here so that a repeat of more 1533 than one can replicate it as reqByte if necessary. If the subpattern has 1534 no firstByte, set "none" for the whole branch. In both cases, a zero 1535 repeat forces firstByte to "none". */ 1536 1537 if (firstByte == REQ_UNSET) { 1538 if (subFirstByte >= 0) { 1539 firstByte = subFirstByte; 1540 didGroupSetFirstByte = true; 1541 } 1542 else 1543 firstByte = REQ_NONE; 1544 zeroFirstByte = REQ_NONE; 1545 } 1546 1547 /* If firstByte was previously set, convert the subpattern's firstByte 1548 into reqByte if there wasn't one, using the vary flag that was in 1549 existence beforehand. */ 1550 1551 else if (subFirstByte >= 0 && subReqByte < 0) 1552 subReqByte = subFirstByte | tempreqvary; 1553 1554 /* If the subpattern set a required byte (or set a first byte that isn't 1555 really the first byte - see above), set it. */ 1556 1557 if (subReqByte >= 0) 1558 reqByte = subReqByte; 1559 } 1560 1561 /* For a forward assertion, we take the reqByte, if set. This can be 1562 helpful if the pattern that follows the assertion doesn't set a different 1563 char. For example, it's useful for /(?=abcde).+/. We can't set firstByte 1564 for an assertion, however because it leads to incorrect effect for patterns 1565 such as /(?=a)a.+/ when the "real" "a" would then become a reqByte instead 1566 of a firstByte. This is overcome by a scan at the end if there's no 1567 firstByte, looking for an asserted first char. */ 1568 1569 else if (bravalue == OP_ASSERT && subReqByte >= 0) 1570 reqByte = subReqByte; 1571 1572 /* Now update the main code pointer to the end of the group. */ 1573 1574 code = tempcode; 1575 1576 /* Error if hit end of pattern */ 1577 1578 if (ptr >= patternEnd || *ptr != ')') { 1579 *errorCodePtr = ERR14; 1580 goto FAILED; 1581 } 1582 break; 1583 1584 /* Check \ for being a real metacharacter; if not, fall through and handle 1585 it as a data character at the start of a string. Escape items are checked 1586 for validity in the pre-compiling pass. */ 1587 1588 case '\\': 1589 tempptr = ptr; 1590 c = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, false); 1591 1592 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values 1593 are arranged to be the negation of the corresponding OP_values. For the 1594 back references, the values are ESC_REF plus the reference number. Only 1595 back references and those types that consume a character may be repeated. 1596 We can test for values between ESC_b and ESC_w for the latter; this may 1597 have to change if any new ones are ever created. */ 1598 1599 if (c < 0) { 1600 /* For metasequences that actually match a character, we disable the 1601 setting of a first character if it hasn't already been set. */ 1602 1603 if (firstByte == REQ_UNSET && -c > ESC_b && -c <= ESC_w) 1604 firstByte = REQ_NONE; 1605 1606 /* Set values to reset to if this is followed by a zero repeat. */ 1607 1608 zeroFirstByte = firstByte; 1609 zeroReqByte = reqByte; 1610 1611 /* Back references are handled specially */ 1612 1613 if (-c >= ESC_REF) { 1614 int number = -c - ESC_REF; 1615 previous = code; 1616 *code++ = OP_REF; 1617 put2ByteValueAndAdvance(code, number); 1618 } 1619 1620 /* For the rest, we can obtain the OP value by negating the escape 1621 value */ 1622 1623 else { 1624 previous = (-c > ESC_b && -c <= ESC_w) ? code : NULL; 1625 *code++ = -c; 1626 } 1627 continue; 1628 } 1629 1630 /* Fall through. */ 1631 1632 /* Handle a literal character. It is guaranteed not to be whitespace or # 1633 when the extended flag is set. If we are in UTF-8 mode, it may be a 1634 multi-byte literal character. */ 1635 1636 default: 1637 NORMAL_CHAR: 1638 1639 previous = code; 1640 1641 if (c < 128) { 1642 mcLength = 1; 1643 mcbuffer[0] = c; 1644 1645 if ((options & IgnoreCaseOption) && (c | 0x20) >= 'a' && (c | 0x20) <= 'z') { 1646 *code++ = OP_ASCII_LETTER_IGNORING_CASE; 1647 *code++ = c | 0x20; 1648 } else { 1649 *code++ = OP_ASCII_CHAR; 1650 *code++ = c; 1651 } 1652 } else { 1653 mcLength = encodeUTF8(c, mcbuffer); 1654 1655 *code++ = (options & IgnoreCaseOption) ? OP_CHAR_IGNORING_CASE : OP_CHAR; 1656 for (c = 0; c < mcLength; c++) 1657 *code++ = mcbuffer[c]; 1658 } 1659 1660 /* Set the first and required bytes appropriately. If no previous first 1661 byte, set it from this character, but revert to none on a zero repeat. 1662 Otherwise, leave the firstByte value alone, and don't change it on a zero 1663 repeat. */ 1664 1665 if (firstByte == REQ_UNSET) { 1666 zeroFirstByte = REQ_NONE; 1667 zeroReqByte = reqByte; 1668 1669 /* If the character is more than one byte long, we can set firstByte 1670 only if it is not to be matched caselessly. */ 1671 1672 if (mcLength == 1 || reqCaseOpt == 0) { 1673 firstByte = mcbuffer[0] | reqCaseOpt; 1674 if (mcLength != 1) 1675 reqByte = code[-1] | cd.reqVaryOpt; 1676 } 1677 else 1678 firstByte = reqByte = REQ_NONE; 1679 } 1680 1681 /* firstByte was previously set; we can set reqByte only the length is 1682 1 or the matching is caseful. */ 1683 1684 else { 1685 zeroFirstByte = firstByte; 1686 zeroReqByte = reqByte; 1687 if (mcLength == 1 || reqCaseOpt == 0) 1688 reqByte = code[-1] | reqCaseOpt | cd.reqVaryOpt; 1689 } 1690 1691 break; /* End of literal character handling */ 1692 } 1693 } /* end of big loop */ 1694 1695 /* Control never reaches here by falling through, only by a goto for all the 1696 error states. Pass back the position in the pattern so that it can be displayed 1697 to the user for diagnosing the error. */ 1698 1699 FAILED: 1700 *ptrPtr = ptr; 1701 return false; 1702 } 1703 1704 /************************************************* 1705 * Compile sequence of alternatives * 1706 *************************************************/ 1707 1708 /* On entry, ptr is pointing past the bracket character, but on return 1709 it points to the closing bracket, or vertical bar, or end of string. 1710 The code variable is pointing at the byte into which the BRA operator has been 1711 stored. If the ims options are changed at the start (for a (?ims: group) or 1712 during any branch, we need to insert an OP_OPT item at the start of every 1713 following branch to ensure they get set correctly at run time, and also pass 1714 the new options into every subsequent branch compile. 1715 1716 Argument: 1717 options option bits, including any changes for this subpattern 1718 brackets -> int containing the number of extracting brackets used 1719 codePtr -> the address of the current code pointer 1720 ptrPtr -> the address of the current pattern pointer 1721 errorCodePtr -> pointer to error code variable 1722 skipBytes skip this many bytes at start (for OP_BRANUMBER) 1723 firstbyteptr place to put the first required character, or a negative number 1724 reqbyteptr place to put the last required character, or a negative number 1725 cd points to the data block with tables pointers etc. 1726 1727 Returns: true on success 1728 */ 1729 1730 static bool 1731 compileBracket(int options, int* brackets, unsigned char** codePtr, 1732 const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int skipBytes, 1733 int* firstbyteptr, int* reqbyteptr, CompileData& cd) 1734 { 1735 const UChar* ptr = *ptrPtr; 1736 unsigned char* code = *codePtr; 1737 unsigned char* lastBranch = code; 1738 unsigned char* start_bracket = code; 1739 int firstByte = REQ_UNSET; 1740 int reqByte = REQ_UNSET; 1741 1742 /* Offset is set zero to mark that this bracket is still open */ 1743 1744 putLinkValueAllowZero(code + 1, 0); 1745 code += 1 + LINK_SIZE + skipBytes; 1746 1747 /* Loop for each alternative branch */ 1748 1749 while (true) { 1750 /* Now compile the branch */ 1751 1752 int branchFirstByte; 1753 int branchReqByte; 1754 if (!compileBranch(options, brackets, &code, &ptr, patternEnd, errorCodePtr, 1755 &branchFirstByte, &branchReqByte, cd)) { 1756 *ptrPtr = ptr; 1757 return false; 1758 } 1759 1760 /* If this is the first branch, the firstByte and reqByte values for the 1761 branch become the values for the regex. */ 1762 1763 if (*lastBranch != OP_ALT) { 1764 firstByte = branchFirstByte; 1765 reqByte = branchReqByte; 1766 } 1767 1768 /* If this is not the first branch, the first char and reqByte have to 1769 match the values from all the previous branches, except that if the previous 1770 value for reqByte didn't have REQ_VARY set, it can still match, and we set 1771 REQ_VARY for the regex. */ 1772 1773 else { 1774 /* If we previously had a firstByte, but it doesn't match the new branch, 1775 we have to abandon the firstByte for the regex, but if there was previously 1776 no reqByte, it takes on the value of the old firstByte. */ 1777 1778 if (firstByte >= 0 && firstByte != branchFirstByte) { 1779 if (reqByte < 0) 1780 reqByte = firstByte; 1781 firstByte = REQ_NONE; 1782 } 1783 1784 /* If we (now or from before) have no firstByte, a firstByte from the 1785 branch becomes a reqByte if there isn't a branch reqByte. */ 1786 1787 if (firstByte < 0 && branchFirstByte >= 0 && branchReqByte < 0) 1788 branchReqByte = branchFirstByte; 1789 1790 /* Now ensure that the reqbytes match */ 1791 1792 if ((reqByte & ~REQ_VARY) != (branchReqByte & ~REQ_VARY)) 1793 reqByte = REQ_NONE; 1794 else 1795 reqByte |= branchReqByte; /* To "or" REQ_VARY */ 1796 } 1797 1798 /* Reached end of expression, either ')' or end of pattern. Go back through 1799 the alternative branches and reverse the chain of offsets, with the field in 1800 the BRA item now becoming an offset to the first alternative. If there are 1801 no alternatives, it points to the end of the group. The length in the 1802 terminating ket is always the length of the whole bracketed item. If any of 1803 the ims options were changed inside the group, compile a resetting op-code 1804 following, except at the very end of the pattern. Return leaving the pointer 1805 at the terminating char. */ 1806 1807 if (ptr >= patternEnd || *ptr != '|') { 1808 int length = code - lastBranch; 1809 do { 1810 int prevLength = getLinkValueAllowZero(lastBranch + 1); 1811 putLinkValue(lastBranch + 1, length); 1812 length = prevLength; 1813 lastBranch -= length; 1814 } while (length > 0); 1815 1816 /* Fill in the ket */ 1817 1818 *code = OP_KET; 1819 putLinkValue(code + 1, code - start_bracket); 1820 code += 1 + LINK_SIZE; 1821 1822 /* Set values to pass back */ 1823 1824 *codePtr = code; 1825 *ptrPtr = ptr; 1826 *firstbyteptr = firstByte; 1827 *reqbyteptr = reqByte; 1828 return true; 1829 } 1830 1831 /* Another branch follows; insert an "or" node. Its length field points back 1832 to the previous branch while the bracket remains open. At the end the chain 1833 is reversed. It's done like this so that the start of the bracket has a 1834 zero offset until it is closed, making it possible to detect recursion. */ 1835 1836 *code = OP_ALT; 1837 putLinkValue(code + 1, code - lastBranch); 1838 lastBranch = code; 1839 code += 1 + LINK_SIZE; 1840 ptr++; 1841 } 1842 ASSERT_NOT_REACHED(); 1843 } 1844 1845 /************************************************* 1846 * Check for anchored expression * 1847 *************************************************/ 1848 1849 /* Try to find out if this is an anchored regular expression. Consider each 1850 alternative branch. If they all start OP_CIRC, or with a bracket 1851 all of whose alternatives start OP_CIRC (recurse ad lib), then 1852 it's anchored. 1853 1854 Arguments: 1855 code points to start of expression (the bracket) 1856 captureMap a bitmap of which brackets we are inside while testing; this 1857 handles up to substring 31; all brackets after that share 1858 the zero bit 1859 backrefMap the back reference bitmap 1860 */ 1861 1862 static bool branchIsAnchored(const unsigned char* code) 1863 { 1864 const unsigned char* scode = firstSignificantOpcode(code); 1865 int op = *scode; 1866 1867 /* Brackets */ 1868 if (op >= OP_BRA || op == OP_ASSERT) 1869 return bracketIsAnchored(scode); 1870 1871 /* Check for explicit anchoring */ 1872 return op == OP_CIRC; 1873 } 1874 1875 static bool bracketIsAnchored(const unsigned char* code) 1876 { 1877 do { 1878 if (!branchIsAnchored(code + 1 + LINK_SIZE)) 1879 return false; 1880 code += getLinkValue(code + 1); 1881 } while (*code == OP_ALT); /* Loop for each alternative */ 1882 return true; 1883 } 1884 1885 /************************************************* 1886 * Check for starting with ^ or .* * 1887 *************************************************/ 1888 1889 /* This is called to find out if every branch starts with ^ or .* so that 1890 "first char" processing can be done to speed things up in multiline 1891 matching and for non-DOTALL patterns that start with .* (which must start at 1892 the beginning or after \n) 1893 1894 Except when the .* appears inside capturing parentheses, and there is a 1895 subsequent back reference to those parentheses. By keeping a bitmap of the 1896 first 31 back references, we can catch some of the more common cases more 1897 precisely; all the greater back references share a single bit. 1898 1899 Arguments: 1900 code points to start of expression (the bracket) 1901 captureMap a bitmap of which brackets we are inside while testing; this 1902 handles up to substring 31; all brackets after that share 1903 the zero bit 1904 backrefMap the back reference bitmap 1905 */ 1906 1907 static bool branchNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap) 1908 { 1909 const unsigned char* scode = firstSignificantOpcode(code); 1910 int op = *scode; 1911 1912 /* Capturing brackets */ 1913 if (op > OP_BRA) { 1914 int captureNum = op - OP_BRA; 1915 if (captureNum > EXTRACT_BASIC_MAX) 1916 captureNum = get2ByteValue(scode + 2 + LINK_SIZE); 1917 int bracketMask = (captureNum < 32) ? (1 << captureNum) : 1; 1918 return bracketNeedsLineStart(scode, captureMap | bracketMask, backrefMap); 1919 } 1920 1921 /* Other brackets */ 1922 if (op == OP_BRA || op == OP_ASSERT) 1923 return bracketNeedsLineStart(scode, captureMap, backrefMap); 1924 1925 /* .* means "start at start or after \n" if it isn't in brackets that 1926 may be referenced. */ 1927 1928 if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR) 1929 return scode[1] == OP_NOT_NEWLINE && !(captureMap & backrefMap); 1930 1931 /* Explicit ^ */ 1932 return op == OP_CIRC || op == OP_BOL; 1933 } 1934 1935 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap) 1936 { 1937 do { 1938 if (!branchNeedsLineStart(code + 1 + LINK_SIZE, captureMap, backrefMap)) 1939 return false; 1940 code += getLinkValue(code + 1); 1941 } while (*code == OP_ALT); /* Loop for each alternative */ 1942 return true; 1943 } 1944 1945 /************************************************* 1946 * Check for asserted fixed first char * 1947 *************************************************/ 1948 1949 /* During compilation, the "first char" settings from forward assertions are 1950 discarded, because they can cause conflicts with actual literals that follow. 1951 However, if we end up without a first char setting for an unanchored pattern, 1952 it is worth scanning the regex to see if there is an initial asserted first 1953 char. If all branches start with the same asserted char, or with a bracket all 1954 of whose alternatives start with the same asserted char (recurse ad lib), then 1955 we return that char, otherwise -1. 1956 1957 Arguments: 1958 code points to start of expression (the bracket) 1959 options pointer to the options (used to check casing changes) 1960 inassert true if in an assertion 1961 1962 Returns: -1 or the fixed first char 1963 */ 1964 1965 static int branchFindFirstAssertedCharacter(const unsigned char* code, bool inassert) 1966 { 1967 const unsigned char* scode = firstSignificantOpcodeSkippingAssertions(code); 1968 int op = *scode; 1969 1970 if (op >= OP_BRA) 1971 op = OP_BRA; 1972 1973 switch (op) { 1974 default: 1975 return -1; 1976 1977 case OP_BRA: 1978 case OP_ASSERT: 1979 return bracketFindFirstAssertedCharacter(scode, op == OP_ASSERT); 1980 1981 case OP_EXACT: 1982 scode += 2; 1983 /* Fall through */ 1984 1985 case OP_CHAR: 1986 case OP_CHAR_IGNORING_CASE: 1987 case OP_ASCII_CHAR: 1988 case OP_ASCII_LETTER_IGNORING_CASE: 1989 case OP_PLUS: 1990 case OP_MINPLUS: 1991 if (!inassert) 1992 return -1; 1993 return scode[1]; 1994 } 1995 } 1996 1997 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert) 1998 { 1999 int c = -1; 2000 do { 2001 int d = branchFindFirstAssertedCharacter(code + 1 + LINK_SIZE, inassert); 2002 if (d < 0) 2003 return -1; 2004 if (c < 0) 2005 c = d; 2006 else if (c != d) 2007 return -1; 2008 code += getLinkValue(code + 1); 2009 } while (*code == OP_ALT); 2010 return c; 2011 } 2012 2013 static inline int multiplyWithOverflowCheck(int a, int b) 2014 { 2015 if (!a || !b) 2016 return 0; 2017 if (a > MAX_PATTERN_SIZE / b) 2018 return -1; 2019 return a * b; 2020 } 2021 2022 static int calculateCompiledPatternLength(const UChar* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase, 2023 CompileData& cd, ErrorCode& errorcode) 2024 { 2025 /* Make a pass over the pattern to compute the 2026 amount of store required to hold the compiled code. This does not have to be 2027 perfect as long as errors are overestimates. */ 2028 2029 if (patternLength > MAX_PATTERN_SIZE) { 2030 errorcode = ERR16; 2031 return -1; 2032 } 2033 2034 int length = 1 + LINK_SIZE; /* For initial BRA plus length */ 2035 int branch_extra = 0; 2036 int lastitemlength = 0; 2037 unsigned brastackptr = 0; 2038 int brastack[BRASTACK_SIZE]; 2039 unsigned char bralenstack[BRASTACK_SIZE]; 2040 int bracount = 0; 2041 2042 const UChar* ptr = (const UChar*)(pattern - 1); 2043 const UChar* patternEnd = (const UChar*)(pattern + patternLength); 2044 2045 while (++ptr < patternEnd) { 2046 int minRepeats = 0, maxRepeats = 0; 2047 int c = *ptr; 2048 2049 switch (c) { 2050 /* A backslashed item may be an escaped data character or it may be a 2051 character type. */ 2052 2053 case '\\': 2054 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, false); 2055 if (errorcode != 0) 2056 return -1; 2057 2058 lastitemlength = 1; /* Default length of last item for repeats */ 2059 2060 if (c >= 0) { /* Data character */ 2061 length += 2; /* For a one-byte character */ 2062 2063 if (c > 127) { 2064 int i; 2065 for (i = 0; i < jsc_pcre_utf8_table1_size; i++) 2066 if (c <= jsc_pcre_utf8_table1[i]) break; 2067 length += i; 2068 lastitemlength += i; 2069 } 2070 2071 continue; 2072 } 2073 2074 /* Other escapes need one byte */ 2075 2076 length++; 2077 2078 /* A back reference needs an additional 2 bytes, plus either one or 5 2079 bytes for a repeat. We also need to keep the value of the highest 2080 back reference. */ 2081 2082 if (c <= -ESC_REF) { 2083 int refnum = -c - ESC_REF; 2084 cd.backrefMap |= (refnum < 32) ? (1 << refnum) : 1; 2085 if (refnum > cd.topBackref) 2086 cd.topBackref = refnum; 2087 length += 2; /* For single back reference */ 2088 if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) { 2089 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode); 2090 if (errorcode) 2091 return -1; 2092 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) || 2093 (minRepeats == 1 && maxRepeats == -1)) 2094 length++; 2095 else 2096 length += 5; 2097 if (safelyCheckNextChar(ptr, patternEnd, '?')) 2098 ptr++; 2099 } 2100 } 2101 continue; 2102 2103 case '^': /* Single-byte metacharacters */ 2104 case '.': 2105 case '$': 2106 length++; 2107 lastitemlength = 1; 2108 continue; 2109 2110 case '*': /* These repeats won't be after brackets; */ 2111 case '+': /* those are handled separately */ 2112 case '?': 2113 length++; 2114 goto POSSESSIVE; 2115 2116 /* This covers the cases of braced repeats after a single char, metachar, 2117 class, or back reference. */ 2118 2119 case '{': 2120 if (!isCountedRepeat(ptr + 1, patternEnd)) 2121 goto NORMAL_CHAR; 2122 ptr = readRepeatCounts(ptr + 1, &minRepeats, &maxRepeats, &errorcode); 2123 if (errorcode != 0) 2124 return -1; 2125 2126 /* These special cases just insert one extra opcode */ 2127 2128 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) || 2129 (minRepeats == 1 && maxRepeats == -1)) 2130 length++; 2131 2132 /* These cases might insert additional copies of a preceding character. */ 2133 2134 else { 2135 if (minRepeats != 1) { 2136 length -= lastitemlength; /* Uncount the original char or metachar */ 2137 if (minRepeats > 0) 2138 length += 3 + lastitemlength; 2139 } 2140 length += lastitemlength + ((maxRepeats > 0) ? 3 : 1); 2141 } 2142 2143 if (safelyCheckNextChar(ptr, patternEnd, '?')) 2144 ptr++; /* Needs no extra length */ 2145 2146 POSSESSIVE: /* Test for possessive quantifier */ 2147 if (safelyCheckNextChar(ptr, patternEnd, '+')) { 2148 ptr++; 2149 length += 2 + 2 * LINK_SIZE; /* Allow for atomic brackets */ 2150 } 2151 continue; 2152 2153 /* An alternation contains an offset to the next branch or ket. If any ims 2154 options changed in the previous branch(es), and/or if we are in a 2155 lookbehind assertion, extra space will be needed at the start of the 2156 branch. This is handled by branch_extra. */ 2157 2158 case '|': 2159 if (brastackptr == 0) 2160 cd.needOuterBracket = true; 2161 length += 1 + LINK_SIZE + branch_extra; 2162 continue; 2163 2164 /* A character class uses 33 characters provided that all the character 2165 values are less than 256. Otherwise, it uses a bit map for low valued 2166 characters, and individual items for others. Don't worry about character 2167 types that aren't allowed in classes - they'll get picked up during the 2168 compile. A character class that contains only one single-byte character 2169 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this 2170 where we can. (In UTF-8 mode we can do this only for chars < 128.) */ 2171 2172 case '[': { 2173 int class_optcount; 2174 if (*(++ptr) == '^') { 2175 class_optcount = 10; /* Greater than one */ 2176 ptr++; 2177 } 2178 else 2179 class_optcount = 0; 2180 2181 bool class_utf8 = false; 2182 2183 for (; ptr < patternEnd && *ptr != ']'; ++ptr) { 2184 /* Check for escapes */ 2185 2186 if (*ptr == '\\') { 2187 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true); 2188 if (errorcode != 0) 2189 return -1; 2190 2191 /* Handle escapes that turn into characters */ 2192 2193 if (c >= 0) 2194 goto NON_SPECIAL_CHARACTER; 2195 2196 /* Escapes that are meta-things. The normal ones just affect the 2197 bit map, but Unicode properties require an XCLASS extended item. */ 2198 2199 else 2200 class_optcount = 10; /* \d, \s etc; make sure > 1 */ 2201 } 2202 2203 /* Anything else increments the possible optimization count. We have to 2204 detect ranges here so that we can compute the number of extra ranges for 2205 caseless wide characters when UCP support is available. If there are wide 2206 characters, we are going to have to use an XCLASS, even for single 2207 characters. */ 2208 2209 else { 2210 c = *ptr; 2211 2212 /* Come here from handling \ above when it escapes to a char value */ 2213 2214 NON_SPECIAL_CHARACTER: 2215 class_optcount++; 2216 2217 int d = -1; 2218 if (safelyCheckNextChar(ptr, patternEnd, '-')) { 2219 const UChar* hyptr = ptr++; 2220 if (safelyCheckNextChar(ptr, patternEnd, '\\')) { 2221 ptr++; 2222 d = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true); 2223 if (errorcode != 0) 2224 return -1; 2225 } 2226 else if ((ptr + 1 < patternEnd) && ptr[1] != ']') 2227 d = *++ptr; 2228 if (d < 0) 2229 ptr = hyptr; /* go back to hyphen as data */ 2230 } 2231 2232 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or > 2233 127 for caseless matching, we will need to use an XCLASS. */ 2234 2235 if (d >= 0) { 2236 class_optcount = 10; /* Ensure > 1 */ 2237 if (d < c) { 2238 errorcode = ERR8; 2239 return -1; 2240 } 2241 2242 if ((d > 255 || (ignoreCase && d > 127))) { 2243 unsigned char buffer[6]; 2244 if (!class_utf8) /* Allow for XCLASS overhead */ 2245 { 2246 class_utf8 = true; 2247 length += LINK_SIZE + 2; 2248 } 2249 2250 /* If we have UCP support, find out how many extra ranges are 2251 needed to map the other case of characters within this range. We 2252 have to mimic the range optimization here, because extending the 2253 range upwards might push d over a boundary that makes it use 2254 another byte in the UTF-8 representation. */ 2255 2256 if (ignoreCase) { 2257 int occ, ocd; 2258 int cc = c; 2259 int origd = d; 2260 while (getOthercaseRange(&cc, origd, &occ, &ocd)) { 2261 if (occ >= c && ocd <= d) 2262 continue; /* Skip embedded */ 2263 2264 if (occ < c && ocd >= c - 1) /* Extend the basic range */ 2265 { /* if there is overlap, */ 2266 c = occ; /* noting that if occ < c */ 2267 continue; /* we can't have ocd > d */ 2268 } /* because a subrange is */ 2269 if (ocd > d && occ <= d + 1) /* always shorter than */ 2270 { /* the basic range. */ 2271 d = ocd; 2272 continue; 2273 } 2274 2275 /* An extra item is needed */ 2276 2277 length += 1 + encodeUTF8(occ, buffer) + 2278 ((occ == ocd) ? 0 : encodeUTF8(ocd, buffer)); 2279 } 2280 } 2281 2282 /* The length of the (possibly extended) range */ 2283 2284 length += 1 + encodeUTF8(c, buffer) + encodeUTF8(d, buffer); 2285 } 2286 2287 } 2288 2289 /* We have a single character. There is nothing to be done unless we 2290 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must 2291 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP 2292 support. */ 2293 2294 else { 2295 if ((c > 255 || (ignoreCase && c > 127))) { 2296 unsigned char buffer[6]; 2297 class_optcount = 10; /* Ensure > 1 */ 2298 if (!class_utf8) /* Allow for XCLASS overhead */ 2299 { 2300 class_utf8 = true; 2301 length += LINK_SIZE + 2; 2302 } 2303 length += (ignoreCase ? 2 : 1) * (1 + encodeUTF8(c, buffer)); 2304 } 2305 } 2306 } 2307 } 2308 2309 if (ptr >= patternEnd) { /* Missing terminating ']' */ 2310 errorcode = ERR6; 2311 return -1; 2312 } 2313 2314 /* We can optimize when there was only one optimizable character. 2315 Note that this does not detect the case of a negated single character. 2316 In that case we do an incorrect length computation, but it's not a serious 2317 problem because the computed length is too large rather than too small. */ 2318 2319 if (class_optcount == 1) 2320 goto NORMAL_CHAR; 2321 2322 /* Here, we handle repeats for the class opcodes. */ 2323 { 2324 length += 33; 2325 2326 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier, 2327 we also need extra for wrapping the whole thing in a sub-pattern. */ 2328 2329 if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) { 2330 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode); 2331 if (errorcode != 0) 2332 return -1; 2333 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) || 2334 (minRepeats == 1 && maxRepeats == -1)) 2335 length++; 2336 else 2337 length += 5; 2338 if (safelyCheckNextChar(ptr, patternEnd, '+')) { 2339 ptr++; 2340 length += 2 + 2 * LINK_SIZE; 2341 } else if (safelyCheckNextChar(ptr, patternEnd, '?')) 2342 ptr++; 2343 } 2344 } 2345 continue; 2346 } 2347 2348 /* Brackets may be genuine groups or special things */ 2349 2350 case '(': { 2351 int branch_newextra = 0; 2352 int bracket_length = 1 + LINK_SIZE; 2353 bool capturing = false; 2354 2355 /* Handle special forms of bracket, which all start (? */ 2356 2357 if (safelyCheckNextChar(ptr, patternEnd, '?')) { 2358 switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0)) { 2359 /* Non-referencing groups and lookaheads just move the pointer on, and 2360 then behave like a non-special bracket, except that they don't increment 2361 the count of extracting brackets. Ditto for the "once only" bracket, 2362 which is in Perl from version 5.005. */ 2363 2364 case ':': 2365 case '=': 2366 case '!': 2367 ptr += 2; 2368 break; 2369 2370 /* Else loop checking valid options until ) is met. Anything else is an 2371 error. If we are without any brackets, i.e. at top level, the settings 2372 act as if specified in the options, so massage the options immediately. 2373 This is for backward compatibility with Perl 5.004. */ 2374 2375 default: 2376 errorcode = ERR12; 2377 return -1; 2378 } 2379 } else 2380 capturing = 1; 2381 2382 /* Capturing brackets must be counted so we can process escapes in a 2383 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need 2384 an additional 3 bytes of memory per capturing bracket. */ 2385 2386 if (capturing) { 2387 bracount++; 2388 if (bracount > EXTRACT_BASIC_MAX) 2389 bracket_length += 3; 2390 } 2391 2392 /* Save length for computing whole length at end if there's a repeat that 2393 requires duplication of the group. Also save the current value of 2394 branch_extra, and start the new group with the new value. If non-zero, this 2395 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */ 2396 2397 if (brastackptr >= sizeof(brastack)/sizeof(int)) { 2398 errorcode = ERR17; 2399 return -1; 2400 } 2401 2402 bralenstack[brastackptr] = branch_extra; 2403 branch_extra = branch_newextra; 2404 2405 brastack[brastackptr++] = length; 2406 length += bracket_length; 2407 continue; 2408 } 2409 2410 /* Handle ket. Look for subsequent maxRepeats/minRepeats; for certain sets of values we 2411 have to replicate this bracket up to that many times. If brastackptr is 2412 0 this is an unmatched bracket which will generate an error, but take care 2413 not to try to access brastack[-1] when computing the length and restoring 2414 the branch_extra value. */ 2415 2416 case ')': { 2417 int duplength; 2418 length += 1 + LINK_SIZE; 2419 if (brastackptr > 0) { 2420 duplength = length - brastack[--brastackptr]; 2421 branch_extra = bralenstack[brastackptr]; 2422 } 2423 else 2424 duplength = 0; 2425 2426 /* Leave ptr at the final char; for readRepeatCounts this happens 2427 automatically; for the others we need an increment. */ 2428 2429 if ((ptr + 1 < patternEnd) && (c = ptr[1]) == '{' && isCountedRepeat(ptr + 2, patternEnd)) { 2430 ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode); 2431 if (errorcode) 2432 return -1; 2433 } else if (c == '*') { 2434 minRepeats = 0; 2435 maxRepeats = -1; 2436 ptr++; 2437 } else if (c == '+') { 2438 minRepeats = 1; 2439 maxRepeats = -1; 2440 ptr++; 2441 } else if (c == '?') { 2442 minRepeats = 0; 2443 maxRepeats = 1; 2444 ptr++; 2445 } else { 2446 minRepeats = 1; 2447 maxRepeats = 1; 2448 } 2449 2450 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the 2451 group, and if the maximum is greater than zero, we have to replicate 2452 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting 2453 bracket set. */ 2454 2455 int repeatsLength; 2456 if (minRepeats == 0) { 2457 length++; 2458 if (maxRepeats > 0) { 2459 repeatsLength = multiplyWithOverflowCheck(maxRepeats - 1, duplength + 3 + 2 * LINK_SIZE); 2460 if (repeatsLength < 0) { 2461 errorcode = ERR16; 2462 return -1; 2463 } 2464 length += repeatsLength; 2465 if (length > MAX_PATTERN_SIZE) { 2466 errorcode = ERR16; 2467 return -1; 2468 } 2469 } 2470 } 2471 2472 /* When the minimum is greater than zero, we have to replicate up to 2473 minval-1 times, with no additions required in the copies. Then, if there 2474 is a limited maximum we have to replicate up to maxval-1 times allowing 2475 for a BRAZERO item before each optional copy and nesting brackets for all 2476 but one of the optional copies. */ 2477 2478 else { 2479 repeatsLength = multiplyWithOverflowCheck(minRepeats - 1, duplength); 2480 if (repeatsLength < 0) { 2481 errorcode = ERR16; 2482 return -1; 2483 } 2484 length += repeatsLength; 2485 if (maxRepeats > minRepeats) { /* Need this test as maxRepeats=-1 means no limit */ 2486 repeatsLength = multiplyWithOverflowCheck(maxRepeats - minRepeats, duplength + 3 + 2 * LINK_SIZE); 2487 if (repeatsLength < 0) { 2488 errorcode = ERR16; 2489 return -1; 2490 } 2491 length += repeatsLength - (2 + 2 * LINK_SIZE); 2492 } 2493 if (length > MAX_PATTERN_SIZE) { 2494 errorcode = ERR16; 2495 return -1; 2496 } 2497 } 2498 2499 /* Allow space for once brackets for "possessive quantifier" */ 2500 2501 if (safelyCheckNextChar(ptr, patternEnd, '+')) { 2502 ptr++; 2503 length += 2 + 2 * LINK_SIZE; 2504 } 2505 continue; 2506 } 2507 2508 /* Non-special character. It won't be space or # in extended mode, so it is 2509 always a genuine character. If we are in a \Q...\E sequence, check for the 2510 end; if not, we have a literal. */ 2511 2512 default: 2513 NORMAL_CHAR: 2514 length += 2; /* For a one-byte character */ 2515 lastitemlength = 1; /* Default length of last item for repeats */ 2516 2517 if (c > 127) { 2518 int i; 2519 for (i = 0; i < jsc_pcre_utf8_table1_size; i++) 2520 if (c <= jsc_pcre_utf8_table1[i]) 2521 break; 2522 length += i; 2523 lastitemlength += i; 2524 } 2525 2526 continue; 2527 } 2528 } 2529 2530 length += 2 + LINK_SIZE; /* For final KET and END */ 2531 2532 cd.numCapturingBrackets = bracount; 2533 return length; 2534 } 2535 2536 /************************************************* 2537 * Compile a Regular Expression * 2538 *************************************************/ 2539 2540 /* This function takes a string and returns a pointer to a block of store 2541 holding a compiled version of the expression. The original API for this 2542 function had no error code return variable; it is retained for backwards 2543 compatibility. The new function is given a new name. 2544 2545 Arguments: 2546 pattern the regular expression 2547 options various option bits 2548 errorCodePtr pointer to error code variable (pcre_compile2() only) 2549 can be NULL if you don't want a code value 2550 errorPtr pointer to pointer to error text 2551 erroroffset ptr offset in pattern where error was detected 2552 tables pointer to character tables or NULL 2553 2554 Returns: pointer to compiled data block, or NULL on error, 2555 with errorPtr and erroroffset set 2556 */ 2557 2558 static inline JSRegExp* returnError(ErrorCode errorcode, const char** errorPtr) 2559 { 2560 *errorPtr = errorText(errorcode); 2561 return 0; 2562 } 2563 2564 JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength, 2565 JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline, 2566 unsigned* numSubpatterns, const char** errorPtr) 2567 { 2568 /* We can't pass back an error message if errorPtr is NULL; I guess the best we 2569 can do is just return NULL, but we can set a code value if there is a code pointer. */ 2570 if (!errorPtr) 2571 return 0; 2572 *errorPtr = NULL; 2573 2574 CompileData cd; 2575 2576 ErrorCode errorcode = ERR0; 2577 /* Call this once just to count the brackets. */ 2578 calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode); 2579 /* Call it again to compute the length. */ 2580 int length = calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode); 2581 if (errorcode) 2582 return returnError(errorcode, errorPtr); 2583 2584 if (length > MAX_PATTERN_SIZE) 2585 return returnError(ERR16, errorPtr); 2586 2587 size_t size = length + sizeof(JSRegExp); 2588 #if REGEXP_HISTOGRAM 2589 size_t stringOffset = (size + sizeof(UChar) - 1) / sizeof(UChar) * sizeof(UChar); 2590 size = stringOffset + patternLength * sizeof(UChar); 2591 #endif 2592 JSRegExp* re = reinterpret_cast<JSRegExp*>(new char[size]); 2593 2594 if (!re) 2595 return returnError(ERR13, errorPtr); 2596 2597 re->options = (ignoreCase ? IgnoreCaseOption : 0) | (multiline ? MatchAcrossMultipleLinesOption : 0); 2598 2599 /* The starting points of the name/number translation table and of the code are 2600 passed around in the compile data block. */ 2601 2602 const unsigned char* codeStart = (const unsigned char*)(re + 1); 2603 2604 /* Set up a starting, non-extracting bracket, then compile the expression. On 2605 error, errorcode will be set non-zero, so we don't need to look at the result 2606 of the function here. */ 2607 2608 const UChar* ptr = (const UChar*)pattern; 2609 const UChar* patternEnd = pattern + patternLength; 2610 unsigned char* code = const_cast<unsigned char*>(codeStart); 2611 int firstByte, reqByte; 2612 int bracketCount = 0; 2613 if (!cd.needOuterBracket) 2614 compileBranch(re->options, &bracketCount, &code, &ptr, patternEnd, &errorcode, &firstByte, &reqByte, cd); 2615 else { 2616 *code = OP_BRA; 2617 compileBracket(re->options, &bracketCount, &code, &ptr, patternEnd, &errorcode, 0, &firstByte, &reqByte, cd); 2618 } 2619 re->topBracket = bracketCount; 2620 re->topBackref = cd.topBackref; 2621 2622 /* If not reached end of pattern on success, there's an excess bracket. */ 2623 2624 if (errorcode == 0 && ptr < patternEnd) 2625 errorcode = ERR10; 2626 2627 /* Fill in the terminating state and check for disastrous overflow, but 2628 if debugging, leave the test till after things are printed out. */ 2629 2630 *code++ = OP_END; 2631 2632 ASSERT(code - codeStart <= length); 2633 if (code - codeStart > length) 2634 errorcode = ERR7; 2635 2636 /* Give an error if there's back reference to a non-existent capturing 2637 subpattern. */ 2638 2639 if (re->topBackref > re->topBracket) 2640 errorcode = ERR15; 2641 2642 /* Failed to compile, or error while post-processing */ 2643 2644 if (errorcode != ERR0) { 2645 delete [] reinterpret_cast<char*>(re); 2646 return returnError(errorcode, errorPtr); 2647 } 2648 2649 /* If the anchored option was not passed, set the flag if we can determine that 2650 the pattern is anchored by virtue of ^ characters or \A or anything else (such 2651 as starting with .* when DOTALL is set). 2652 2653 Otherwise, if we know what the first character has to be, save it, because that 2654 speeds up unanchored matches no end. If not, see if we can set the 2655 UseMultiLineFirstByteOptimizationOption flag. This is helpful for multiline matches when all branches 2656 start with ^. and also when all branches start with .* for non-DOTALL matches. 2657 */ 2658 2659 if (cd.needOuterBracket ? bracketIsAnchored(codeStart) : branchIsAnchored(codeStart)) 2660 re->options |= IsAnchoredOption; 2661 else { 2662 if (firstByte < 0) { 2663 firstByte = (cd.needOuterBracket 2664 ? bracketFindFirstAssertedCharacter(codeStart, false) 2665 : branchFindFirstAssertedCharacter(codeStart, false)) 2666 | ((re->options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0); 2667 } 2668 if (firstByte >= 0) { 2669 int ch = firstByte & 255; 2670 if (ch < 127) { 2671 re->firstByte = ((firstByte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? ch : firstByte; 2672 re->options |= UseFirstByteOptimizationOption; 2673 } 2674 } else { 2675 if (cd.needOuterBracket ? bracketNeedsLineStart(codeStart, 0, cd.backrefMap) : branchNeedsLineStart(codeStart, 0, cd.backrefMap)) 2676 re->options |= UseMultiLineFirstByteOptimizationOption; 2677 } 2678 } 2679 2680 /* For an anchored pattern, we use the "required byte" only if it follows a 2681 variable length item in the regex. Remove the caseless flag for non-caseable 2682 bytes. */ 2683 2684 if (reqByte >= 0 && (!(re->options & IsAnchoredOption) || (reqByte & REQ_VARY))) { 2685 int ch = reqByte & 255; 2686 if (ch < 127) { 2687 re->reqByte = ((reqByte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? (reqByte & ~REQ_IGNORE_CASE) : reqByte; 2688 re->options |= UseRequiredByteOptimizationOption; 2689 } 2690 } 2691 2692 #if REGEXP_HISTOGRAM 2693 re->stringOffset = stringOffset; 2694 re->stringLength = patternLength; 2695 memcpy(reinterpret_cast<char*>(re) + stringOffset, pattern, patternLength * 2); 2696 #endif 2697 2698 if (numSubpatterns) 2699 *numSubpatterns = re->topBracket; 2700 return re; 2701 } 2702 2703 void jsRegExpFree(JSRegExp* re) 2704 { 2705 delete [] reinterpret_cast<char*>(re); 2706 } 2707