1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 // 4 // file: regexcmp.cpp 5 // 6 // Copyright (C) 2002-2016 International Business Machines Corporation and others. 7 // All Rights Reserved. 8 // 9 // This file contains the ICU regular expression compiler, which is responsible 10 // for processing a regular expression pattern into the compiled form that 11 // is used by the match finding engine. 12 // 13 14 #include "unicode/utypes.h" 15 16 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 17 18 #include "unicode/ustring.h" 19 #include "unicode/unistr.h" 20 #include "unicode/uniset.h" 21 #include "unicode/uchar.h" 22 #include "unicode/uchriter.h" 23 #include "unicode/parsepos.h" 24 #include "unicode/parseerr.h" 25 #include "unicode/regex.h" 26 #include "unicode/utf.h" 27 #include "unicode/utf16.h" 28 #include "patternprops.h" 29 #include "putilimp.h" 30 #include "cmemory.h" 31 #include "cstr.h" 32 #include "cstring.h" 33 #include "uvectr32.h" 34 #include "uvectr64.h" 35 #include "uassert.h" 36 #include "uinvchar.h" 37 38 #include "regeximp.h" 39 #include "regexcst.h" // Contains state table for the regex pattern parser. 40 // generated by a Perl script. 41 #include "regexcmp.h" 42 #include "regexst.h" 43 #include "regextxt.h" 44 45 46 47 U_NAMESPACE_BEGIN 48 49 50 //------------------------------------------------------------------------------ 51 // 52 // Constructor. 53 // 54 //------------------------------------------------------------------------------ 55 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : 56 fParenStack(status), fSetStack(status), fSetOpStack(status) 57 { 58 // Lazy init of all shared global sets (needed for init()'s empty text) 59 RegexStaticSets::initGlobals(&status); 60 61 fStatus = &status; 62 63 fRXPat = rxp; 64 fScanIndex = 0; 65 fLastChar = -1; 66 fPeekChar = -1; 67 fLineNum = 1; 68 fCharNum = 0; 69 fQuoteMode = FALSE; 70 fInBackslashQuote = FALSE; 71 fModeFlags = fRXPat->fFlags | 0x80000000; 72 fEOLComments = TRUE; 73 74 fMatchOpenParen = -1; 75 fMatchCloseParen = -1; 76 fCaptureName = NULL; 77 fLastSetLiteral = U_SENTINEL; 78 79 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) { 80 status = rxp->fDeferredStatus; 81 } 82 } 83 84 static const UChar chAmp = 0x26; // '&' 85 static const UChar chDash = 0x2d; // '-' 86 87 88 //------------------------------------------------------------------------------ 89 // 90 // Destructor 91 // 92 //------------------------------------------------------------------------------ 93 RegexCompile::~RegexCompile() { 94 delete fCaptureName; // Normally will be NULL, but can exist if pattern 95 // compilation stops with a syntax error. 96 } 97 98 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) { 99 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec)); 100 } 101 102 //------------------------------------------------------------------------------ 103 // 104 // Compile regex pattern. The state machine for rexexp pattern parsing is here. 105 // The state tables are hand-written in the file regexcst.txt, 106 // and converted to the form used here by a perl 107 // script regexcst.pl 108 // 109 //------------------------------------------------------------------------------ 110 void RegexCompile::compile( 111 const UnicodeString &pat, // Source pat to be compiled. 112 UParseError &pp, // Error position info 113 UErrorCode &e) // Error Code 114 { 115 fRXPat->fPatternString = new UnicodeString(pat); 116 UText patternText = UTEXT_INITIALIZER; 117 utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e); 118 119 if (U_SUCCESS(e)) { 120 compile(&patternText, pp, e); 121 utext_close(&patternText); 122 } 123 } 124 125 // 126 // compile, UText mode 127 // All the work is actually done here. 128 // 129 void RegexCompile::compile( 130 UText *pat, // Source pat to be compiled. 131 UParseError &pp, // Error position info 132 UErrorCode &e) // Error Code 133 { 134 fStatus = &e; 135 fParseErr = &pp; 136 fStackPtr = 0; 137 fStack[fStackPtr] = 0; 138 139 if (U_FAILURE(*fStatus)) { 140 return; 141 } 142 143 // There should be no pattern stuff in the RegexPattern object. They can not be reused. 144 U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0); 145 146 // Prepare the RegexPattern object to receive the compiled pattern. 147 fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus); 148 if (U_FAILURE(*fStatus)) { 149 return; 150 } 151 fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets; 152 fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8; 153 154 155 // Initialize the pattern scanning state machine 156 fPatternLength = utext_nativeLength(pat); 157 uint16_t state = 1; 158 const RegexTableEl *tableEl; 159 160 // UREGEX_LITERAL force entire pattern to be treated as a literal string. 161 if (fModeFlags & UREGEX_LITERAL) { 162 fQuoteMode = TRUE; 163 } 164 165 nextChar(fC); // Fetch the first char from the pattern string. 166 167 // 168 // Main loop for the regex pattern parsing state machine. 169 // Runs once per state transition. 170 // Each time through optionally performs, depending on the state table, 171 // - an advance to the the next pattern char 172 // - an action to be performed. 173 // - pushing or popping a state to/from the local state return stack. 174 // file regexcst.txt is the source for the state table. The logic behind 175 // recongizing the pattern syntax is there, not here. 176 // 177 for (;;) { 178 // Bail out if anything has gone wrong. 179 // Regex pattern parsing stops on the first error encountered. 180 if (U_FAILURE(*fStatus)) { 181 break; 182 } 183 184 U_ASSERT(state != 0); 185 186 // Find the state table element that matches the input char from the pattern, or the 187 // class of the input character. Start with the first table row for this 188 // state, then linearly scan forward until we find a row that matches the 189 // character. The last row for each state always matches all characters, so 190 // the search will stop there, if not before. 191 // 192 tableEl = &gRuleParseStateTable[state]; 193 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ", 194 fC.fChar, fLineNum, fCharNum, RegexStateNames[state])); 195 196 for (;;) { // loop through table rows belonging to this state, looking for one 197 // that matches the current input char. 198 REGEX_SCAN_DEBUG_PRINTF((".")); 199 if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) { 200 // Table row specified an individual character, not a set, and 201 // the input character is not quoted, and 202 // the input character matched it. 203 break; 204 } 205 if (tableEl->fCharClass == 255) { 206 // Table row specified default, match anything character class. 207 break; 208 } 209 if (tableEl->fCharClass == 254 && fC.fQuoted) { 210 // Table row specified "quoted" and the char was quoted. 211 break; 212 } 213 if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) { 214 // Table row specified eof and we hit eof on the input. 215 break; 216 } 217 218 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class && 219 fC.fQuoted == FALSE && // char is not escaped && 220 fC.fChar != (UChar32)-1) { // char is not EOF 221 U_ASSERT(tableEl->fCharClass <= 137); 222 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) { 223 // Table row specified a character class, or set of characters, 224 // and the current char matches it. 225 break; 226 } 227 } 228 229 // No match on this row, advance to the next row for this state, 230 tableEl++; 231 } 232 REGEX_SCAN_DEBUG_PRINTF(("\n")); 233 234 // 235 // We've found the row of the state table that matches the current input 236 // character from the rules string. 237 // Perform any action specified by this row in the state table. 238 if (doParseActions(tableEl->fAction) == FALSE) { 239 // Break out of the state machine loop if the 240 // the action signalled some kind of error, or 241 // the action was to exit, occurs on normal end-of-rules-input. 242 break; 243 } 244 245 if (tableEl->fPushState != 0) { 246 fStackPtr++; 247 if (fStackPtr >= kStackSize) { 248 error(U_REGEX_INTERNAL_ERROR); 249 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n")); 250 fStackPtr--; 251 } 252 fStack[fStackPtr] = tableEl->fPushState; 253 } 254 255 // 256 // NextChar. This is where characters are actually fetched from the pattern. 257 // Happens under control of the 'n' tag in the state table. 258 // 259 if (tableEl->fNextChar) { 260 nextChar(fC); 261 } 262 263 // Get the next state from the table entry, or from the 264 // state stack if the next state was specified as "pop". 265 if (tableEl->fNextState != 255) { 266 state = tableEl->fNextState; 267 } else { 268 state = fStack[fStackPtr]; 269 fStackPtr--; 270 if (fStackPtr < 0) { 271 // state stack underflow 272 // This will occur if the user pattern has mis-matched parentheses, 273 // with extra close parens. 274 // 275 fStackPtr++; 276 error(U_REGEX_MISMATCHED_PAREN); 277 } 278 } 279 280 } 281 282 if (U_FAILURE(*fStatus)) { 283 // Bail out if the pattern had errors. 284 // Set stack cleanup: a successful compile would have left it empty, 285 // but errors can leave temporary sets hanging around. 286 while (!fSetStack.empty()) { 287 delete (UnicodeSet *)fSetStack.pop(); 288 } 289 return; 290 } 291 292 // 293 // The pattern has now been read and processed, and the compiled code generated. 294 // 295 296 // 297 // The pattern's fFrameSize so far has accumulated the requirements for 298 // storage for capture parentheses, counters, etc. that are encountered 299 // in the pattern. Add space for the two variables that are always 300 // present in the saved state: the input string position (int64_t) and 301 // the position in the compiled pattern. 302 // 303 allocateStackData(RESTACKFRAME_HDRCOUNT); 304 305 // 306 // Optimization pass 1: NOPs, back-references, and case-folding 307 // 308 stripNOPs(); 309 310 // 311 // Get bounds for the minimum and maximum length of a string that this 312 // pattern can match. Used to avoid looking for matches in strings that 313 // are too short. 314 // 315 fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1); 316 317 // 318 // Optimization pass 2: match start type 319 // 320 matchStartType(); 321 322 // 323 // Set up fast latin-1 range sets 324 // 325 int32_t numSets = fRXPat->fSets->size(); 326 fRXPat->fSets8 = new Regex8BitSet[numSets]; 327 // Null pointer check. 328 if (fRXPat->fSets8 == NULL) { 329 e = *fStatus = U_MEMORY_ALLOCATION_ERROR; 330 return; 331 } 332 int32_t i; 333 for (i=0; i<numSets; i++) { 334 UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i); 335 fRXPat->fSets8[i].init(s); 336 } 337 338 } 339 340 341 342 343 344 //------------------------------------------------------------------------------ 345 // 346 // doParseAction Do some action during regex pattern parsing. 347 // Called by the parse state machine. 348 // 349 // Generation of the match engine PCode happens here, or 350 // in functions called from the parse actions defined here. 351 // 352 // 353 //------------------------------------------------------------------------------ 354 UBool RegexCompile::doParseActions(int32_t action) 355 { 356 UBool returnVal = TRUE; 357 358 switch ((Regex_PatternParseAction)action) { 359 360 case doPatStart: 361 // Start of pattern compiles to: 362 //0 SAVE 2 Fall back to position of FAIL 363 //1 jmp 3 364 //2 FAIL Stop if we ever reach here. 365 //3 NOP Dummy, so start of pattern looks the same as 366 // the start of an ( grouping. 367 //4 NOP Resreved, will be replaced by a save if there are 368 // OR | operators at the top level 369 appendOp(URX_STATE_SAVE, 2); 370 appendOp(URX_JMP, 3); 371 appendOp(URX_FAIL, 0); 372 373 // Standard open nonCapture paren action emits the two NOPs and 374 // sets up the paren stack frame. 375 doParseActions(doOpenNonCaptureParen); 376 break; 377 378 case doPatFinish: 379 // We've scanned to the end of the pattern 380 // The end of pattern compiles to: 381 // URX_END 382 // which will stop the runtime match engine. 383 // Encountering end of pattern also behaves like a close paren, 384 // and forces fixups of the State Save at the beginning of the compiled pattern 385 // and of any OR operations at the top level. 386 // 387 handleCloseParen(); 388 if (fParenStack.size() > 0) { 389 // Missing close paren in pattern. 390 error(U_REGEX_MISMATCHED_PAREN); 391 } 392 393 // add the END operation to the compiled pattern. 394 appendOp(URX_END, 0); 395 396 // Terminate the pattern compilation state machine. 397 returnVal = FALSE; 398 break; 399 400 401 402 case doOrOperator: 403 // Scanning a '|', as in (A|B) 404 { 405 // Generate code for any pending literals preceding the '|' 406 fixLiterals(FALSE); 407 408 // Insert a SAVE operation at the start of the pattern section preceding 409 // this OR at this level. This SAVE will branch the match forward 410 // to the right hand side of the OR in the event that the left hand 411 // side fails to match and backtracks. Locate the position for the 412 // save from the location on the top of the parentheses stack. 413 int32_t savePosition = fParenStack.popi(); 414 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition); 415 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location 416 op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); 417 fRXPat->fCompiledPat->setElementAt(op, savePosition); 418 419 // Append an JMP operation into the compiled pattern. The operand for 420 // the JMP will eventually be the location following the ')' for the 421 // group. This will be patched in later, when the ')' is encountered. 422 appendOp(URX_JMP, 0); 423 424 // Push the position of the newly added JMP op onto the parentheses stack. 425 // This registers if for fixup when this block's close paren is encountered. 426 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); 427 428 // Append a NOP to the compiled pattern. This is the slot reserved 429 // for a SAVE in the event that there is yet another '|' following 430 // this one. 431 appendOp(URX_NOP, 0); 432 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); 433 } 434 break; 435 436 437 case doBeginNamedCapture: 438 // Scanning (?<letter. 439 // The first letter of the name will come through again under doConinueNamedCapture. 440 fCaptureName = new UnicodeString(); 441 if (fCaptureName == NULL) { 442 error(U_MEMORY_ALLOCATION_ERROR); 443 } 444 break; 445 446 case doContinueNamedCapture: 447 fCaptureName->append(fC.fChar); 448 break; 449 450 case doBadNamedCapture: 451 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); 452 break; 453 454 case doOpenCaptureParen: 455 // Open Capturing Paren, possibly named. 456 // Compile to a 457 // - NOP, which later may be replaced by a save-state if the 458 // parenthesized group gets a * quantifier, followed by 459 // - START_CAPTURE n where n is stack frame offset to the capture group variables. 460 // - NOP, which may later be replaced by a save-state if there 461 // is an '|' alternation within the parens. 462 // 463 // Each capture group gets three slots in the save stack frame: 464 // 0: Capture Group start position (in input string being matched.) 465 // 1: Capture Group end position. 466 // 2: Start of Match-in-progress. 467 // The first two locations are for a completed capture group, and are 468 // referred to by back references and the like. 469 // The third location stores the capture start position when an START_CAPTURE is 470 // encountered. This will be promoted to a completed capture when (and if) the corresponding 471 // END_CAPTURE is encountered. 472 { 473 fixLiterals(); 474 appendOp(URX_NOP, 0); 475 int32_t varsLoc = allocateStackData(3); // Reserve three slots in match stack frame. 476 appendOp(URX_START_CAPTURE, varsLoc); 477 appendOp(URX_NOP, 0); 478 479 // On the Parentheses stack, start a new frame and add the postions 480 // of the two NOPs. Depending on what follows in the pattern, the 481 // NOPs may be changed to SAVE_STATE or JMP ops, with a target 482 // address of the end of the parenthesized group. 483 fParenStack.push(fModeFlags, *fStatus); // Match mode state 484 fParenStack.push(capturing, *fStatus); // Frame type. 485 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location 486 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc 487 488 // Save the mapping from group number to stack frame variable position. 489 fRXPat->fGroupMap->addElement(varsLoc, *fStatus); 490 491 // If this is a named capture group, add the name->group number mapping. 492 if (fCaptureName != NULL) { 493 int32_t groupNumber = fRXPat->fGroupMap->size(); 494 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus); 495 fCaptureName = NULL; // hash table takes ownership of the name (key) string. 496 if (previousMapping > 0 && U_SUCCESS(*fStatus)) { 497 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); 498 } 499 } 500 } 501 break; 502 503 case doOpenNonCaptureParen: 504 // Open non-caputuring (grouping only) Paren. 505 // Compile to a 506 // - NOP, which later may be replaced by a save-state if the 507 // parenthesized group gets a * quantifier, followed by 508 // - NOP, which may later be replaced by a save-state if there 509 // is an '|' alternation within the parens. 510 { 511 fixLiterals(); 512 appendOp(URX_NOP, 0); 513 appendOp(URX_NOP, 0); 514 515 // On the Parentheses stack, start a new frame and add the postions 516 // of the two NOPs. 517 fParenStack.push(fModeFlags, *fStatus); // Match mode state 518 fParenStack.push(plain, *fStatus); // Begin a new frame. 519 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 520 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc 521 } 522 break; 523 524 525 case doOpenAtomicParen: 526 // Open Atomic Paren. (?> 527 // Compile to a 528 // - NOP, which later may be replaced if the parenthesized group 529 // has a quantifier, followed by 530 // - STO_SP save state stack position, so it can be restored at the ")" 531 // - NOP, which may later be replaced by a save-state if there 532 // is an '|' alternation within the parens. 533 { 534 fixLiterals(); 535 appendOp(URX_NOP, 0); 536 int32_t varLoc = allocateData(1); // Reserve a data location for saving the state stack ptr. 537 appendOp(URX_STO_SP, varLoc); 538 appendOp(URX_NOP, 0); 539 540 // On the Parentheses stack, start a new frame and add the postions 541 // of the two NOPs. Depending on what follows in the pattern, the 542 // NOPs may be changed to SAVE_STATE or JMP ops, with a target 543 // address of the end of the parenthesized group. 544 fParenStack.push(fModeFlags, *fStatus); // Match mode state 545 fParenStack.push(atomic, *fStatus); // Frame type. 546 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP 547 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP 548 } 549 break; 550 551 552 case doOpenLookAhead: 553 // Positive Look-ahead (?= stuff ) 554 // 555 // Note: Addition of transparent input regions, with the need to 556 // restore the original regions when failing out of a lookahead 557 // block, complicated this sequence. Some conbined opcodes 558 // might make sense - or might not, lookahead aren't that common. 559 // 560 // Caution: min match length optimization knows about this 561 // sequence; don't change without making updates there too. 562 // 563 // Compiles to 564 // 1 START_LA dataLoc Saves SP, Input Pos 565 // 2. STATE_SAVE 4 on failure of lookahead, goto 4 566 // 3 JMP 6 continue ... 567 // 568 // 4. LA_END Look Ahead failed. Restore regions. 569 // 5. BACKTRACK and back track again. 570 // 571 // 6. NOP reserved for use by quantifiers on the block. 572 // Look-ahead can't have quantifiers, but paren stack 573 // compile time conventions require the slot anyhow. 574 // 7. NOP may be replaced if there is are '|' ops in the block. 575 // 8. code for parenthesized stuff. 576 // 9. LA_END 577 // 578 // Two data slots are reserved, for saving the stack ptr and the input position. 579 { 580 fixLiterals(); 581 int32_t dataLoc = allocateData(2); 582 appendOp(URX_LA_START, dataLoc); 583 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); 584 appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3); 585 appendOp(URX_LA_END, dataLoc); 586 appendOp(URX_BACKTRACK, 0); 587 appendOp(URX_NOP, 0); 588 appendOp(URX_NOP, 0); 589 590 // On the Parentheses stack, start a new frame and add the postions 591 // of the NOPs. 592 fParenStack.push(fModeFlags, *fStatus); // Match mode state 593 fParenStack.push(lookAhead, *fStatus); // Frame type. 594 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 595 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location 596 } 597 break; 598 599 case doOpenLookAheadNeg: 600 // Negated Lookahead. (?! stuff ) 601 // Compiles to 602 // 1. START_LA dataloc 603 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state, 604 // // which continues with the match. 605 // 3. NOP // Std. Open Paren sequence, for possible '|' 606 // 4. code for parenthesized stuff. 607 // 5. END_LA // Cut back stack, remove saved state from step 2. 608 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails. 609 // 7. END_LA // Restore match region, in case look-ahead was using 610 // an alternate (transparent) region. 611 { 612 fixLiterals(); 613 int32_t dataLoc = allocateData(2); 614 appendOp(URX_LA_START, dataLoc); 615 appendOp(URX_STATE_SAVE, 0); // dest address will be patched later. 616 appendOp(URX_NOP, 0); 617 618 // On the Parentheses stack, start a new frame and add the postions 619 // of the StateSave and NOP. 620 fParenStack.push(fModeFlags, *fStatus); // Match mode state 621 fParenStack.push(negLookAhead, *fStatus); // Frame type 622 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location 623 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location 624 625 // Instructions #5 - #7 will be added when the ')' is encountered. 626 } 627 break; 628 629 case doOpenLookBehind: 630 { 631 // Compile a (?<= look-behind open paren. 632 // 633 // Compiles to 634 // 0 URX_LB_START dataLoc 635 // 1 URX_LB_CONT dataLoc 636 // 2 MinMatchLen 637 // 3 MaxMatchLen 638 // 4 URX_NOP Standard '(' boilerplate. 639 // 5 URX_NOP Reserved slot for use with '|' ops within (block). 640 // 6 <code for LookBehind expression> 641 // 7 URX_LB_END dataLoc # Check match len, restore input len 642 // 8 URX_LA_END dataLoc # Restore stack, input pos 643 // 644 // Allocate a block of matcher data, to contain (when running a match) 645 // 0: Stack ptr on entry 646 // 1: Input Index on entry 647 // 2: Start index of match current match attempt. 648 // 3: Original Input String len. 649 650 // Generate match code for any pending literals. 651 fixLiterals(); 652 653 // Allocate data space 654 int32_t dataLoc = allocateData(4); 655 656 // Emit URX_LB_START 657 appendOp(URX_LB_START, dataLoc); 658 659 // Emit URX_LB_CONT 660 appendOp(URX_LB_CONT, dataLoc); 661 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later. 662 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later. 663 664 // Emit the NOPs 665 appendOp(URX_NOP, 0); 666 appendOp(URX_NOP, 0); 667 668 // On the Parentheses stack, start a new frame and add the postions 669 // of the URX_LB_CONT and the NOP. 670 fParenStack.push(fModeFlags, *fStatus); // Match mode state 671 fParenStack.push(lookBehind, *fStatus); // Frame type 672 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 673 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location 674 675 // The final two instructions will be added when the ')' is encountered. 676 } 677 678 break; 679 680 case doOpenLookBehindNeg: 681 { 682 // Compile a (?<! negated look-behind open paren. 683 // 684 // Compiles to 685 // 0 URX_LB_START dataLoc # Save entry stack, input len 686 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions 687 // 2 MinMatchLen 688 // 3 MaxMatchLen 689 // 4 continueLoc (9) 690 // 5 URX_NOP Standard '(' boilerplate. 691 // 6 URX_NOP Reserved slot for use with '|' ops within (block). 692 // 7 <code for LookBehind expression> 693 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL 694 // 9 ... 695 // 696 // Allocate a block of matcher data, to contain (when running a match) 697 // 0: Stack ptr on entry 698 // 1: Input Index on entry 699 // 2: Start index of match current match attempt. 700 // 3: Original Input String len. 701 702 // Generate match code for any pending literals. 703 fixLiterals(); 704 705 // Allocate data space 706 int32_t dataLoc = allocateData(4); 707 708 // Emit URX_LB_START 709 appendOp(URX_LB_START, dataLoc); 710 711 // Emit URX_LBN_CONT 712 appendOp(URX_LBN_CONT, dataLoc); 713 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later. 714 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later. 715 appendOp(URX_RESERVED_OP, 0); // Continue Loc. To be filled later. 716 717 // Emit the NOPs 718 appendOp(URX_NOP, 0); 719 appendOp(URX_NOP, 0); 720 721 // On the Parentheses stack, start a new frame and add the postions 722 // of the URX_LB_CONT and the NOP. 723 fParenStack.push(fModeFlags, *fStatus); // Match mode state 724 fParenStack.push(lookBehindN, *fStatus); // Frame type 725 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 726 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location 727 728 // The final two instructions will be added when the ')' is encountered. 729 } 730 break; 731 732 case doConditionalExpr: 733 // Conditionals such as (?(1)a:b) 734 case doPerlInline: 735 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them. 736 error(U_REGEX_UNIMPLEMENTED); 737 break; 738 739 740 case doCloseParen: 741 handleCloseParen(); 742 if (fParenStack.size() <= 0) { 743 // Extra close paren, or missing open paren. 744 error(U_REGEX_MISMATCHED_PAREN); 745 } 746 break; 747 748 case doNOP: 749 break; 750 751 752 case doBadOpenParenType: 753 case doRuleError: 754 error(U_REGEX_RULE_SYNTAX); 755 break; 756 757 758 case doMismatchedParenErr: 759 error(U_REGEX_MISMATCHED_PAREN); 760 break; 761 762 case doPlus: 763 // Normal '+' compiles to 764 // 1. stuff to be repeated (already built) 765 // 2. jmp-sav 1 766 // 3. ... 767 // 768 // Or, if the item to be repeated can match a zero length string, 769 // 1. STO_INP_LOC data-loc 770 // 2. body of stuff to be repeated 771 // 3. JMP_SAV_X 2 772 // 4. ... 773 774 // 775 // Or, if the item to be repeated is simple 776 // 1. Item to be repeated. 777 // 2. LOOP_SR_I set number (assuming repeated item is a set ref) 778 // 3. LOOP_C stack location 779 { 780 int32_t topLoc = blockTopLoc(FALSE); // location of item #1 781 int32_t frameLoc; 782 783 // Check for simple constructs, which may get special optimized code. 784 if (topLoc == fRXPat->fCompiledPat->size() - 1) { 785 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc); 786 787 if (URX_TYPE(repeatedOp) == URX_SETREF) { 788 // Emit optimized code for [char set]+ 789 appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp)); 790 frameLoc = allocateStackData(1); 791 appendOp(URX_LOOP_C, frameLoc); 792 break; 793 } 794 795 if (URX_TYPE(repeatedOp) == URX_DOTANY || 796 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || 797 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { 798 // Emit Optimized code for .+ operations. 799 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0); 800 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { 801 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode. 802 loopOpI |= 1; 803 } 804 if (fModeFlags & UREGEX_UNIX_LINES) { 805 loopOpI |= 2; 806 } 807 appendOp(loopOpI); 808 frameLoc = allocateStackData(1); 809 appendOp(URX_LOOP_C, frameLoc); 810 break; 811 } 812 813 } 814 815 // General case. 816 817 // Check for minimum match length of zero, which requires 818 // extra loop-breaking code. 819 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) { 820 // Zero length match is possible. 821 // Emit the code sequence that can handle it. 822 insertOp(topLoc); 823 frameLoc = allocateStackData(1); 824 825 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc); 826 fRXPat->fCompiledPat->setElementAt(op, topLoc); 827 828 appendOp(URX_JMP_SAV_X, topLoc+1); 829 } else { 830 // Simpler code when the repeated body must match something non-empty 831 appendOp(URX_JMP_SAV, topLoc); 832 } 833 } 834 break; 835 836 case doNGPlus: 837 // Non-greedy '+?' compiles to 838 // 1. stuff to be repeated (already built) 839 // 2. state-save 1 840 // 3. ... 841 { 842 int32_t topLoc = blockTopLoc(FALSE); 843 appendOp(URX_STATE_SAVE, topLoc); 844 } 845 break; 846 847 848 case doOpt: 849 // Normal (greedy) ? quantifier. 850 // Compiles to 851 // 1. state save 3 852 // 2. body of optional block 853 // 3. ... 854 // Insert the state save into the compiled pattern, and we're done. 855 { 856 int32_t saveStateLoc = blockTopLoc(TRUE); 857 int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()); 858 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); 859 } 860 break; 861 862 case doNGOpt: 863 // Non-greedy ?? quantifier 864 // compiles to 865 // 1. jmp 4 866 // 2. body of optional block 867 // 3 jmp 5 868 // 4. state save 2 869 // 5 ... 870 // This code is less than ideal, with two jmps instead of one, because we can only 871 // insert one instruction at the top of the block being iterated. 872 { 873 int32_t jmp1_loc = blockTopLoc(TRUE); 874 int32_t jmp2_loc = fRXPat->fCompiledPat->size(); 875 876 int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+1); 877 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); 878 879 appendOp(URX_JMP, jmp2_loc+2); 880 881 appendOp(URX_STATE_SAVE, jmp1_loc+1); 882 } 883 break; 884 885 886 case doStar: 887 // Normal (greedy) * quantifier. 888 // Compiles to 889 // 1. STATE_SAVE 4 890 // 2. body of stuff being iterated over 891 // 3. JMP_SAV 2 892 // 4. ... 893 // 894 // Or, if the body is a simple [Set], 895 // 1. LOOP_SR_I set number 896 // 2. LOOP_C stack location 897 // ... 898 // 899 // Or if this is a .* 900 // 1. LOOP_DOT_I (. matches all mode flag) 901 // 2. LOOP_C stack location 902 // 903 // Or, if the body can match a zero-length string, to inhibit infinite loops, 904 // 1. STATE_SAVE 5 905 // 2. STO_INP_LOC data-loc 906 // 3. body of stuff 907 // 4. JMP_SAV_X 2 908 // 5. ... 909 { 910 // location of item #1, the STATE_SAVE 911 int32_t topLoc = blockTopLoc(FALSE); 912 int32_t dataLoc = -1; 913 914 // Check for simple *, where the construct being repeated 915 // compiled to single opcode, and might be optimizable. 916 if (topLoc == fRXPat->fCompiledPat->size() - 1) { 917 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc); 918 919 if (URX_TYPE(repeatedOp) == URX_SETREF) { 920 // Emit optimized code for a [char set]* 921 int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp)); 922 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); 923 dataLoc = allocateStackData(1); 924 appendOp(URX_LOOP_C, dataLoc); 925 break; 926 } 927 928 if (URX_TYPE(repeatedOp) == URX_DOTANY || 929 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || 930 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { 931 // Emit Optimized code for .* operations. 932 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0); 933 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { 934 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode. 935 loopOpI |= 1; 936 } 937 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) { 938 loopOpI |= 2; 939 } 940 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); 941 dataLoc = allocateStackData(1); 942 appendOp(URX_LOOP_C, dataLoc); 943 break; 944 } 945 } 946 947 // Emit general case code for this * 948 // The optimizations did not apply. 949 950 int32_t saveStateLoc = blockTopLoc(TRUE); 951 int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+1); 952 953 // Check for minimum match length of zero, which requires 954 // extra loop-breaking code. 955 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) { 956 insertOp(saveStateLoc); 957 dataLoc = allocateStackData(1); 958 959 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc); 960 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); 961 jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+2); 962 } 963 964 // Locate the position in the compiled pattern where the match will continue 965 // after completing the *. (4 or 5 in the comment above) 966 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; 967 968 // Put together the save state op and store it into the compiled code. 969 int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc); 970 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); 971 972 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern. 973 appendOp(jmpOp); 974 } 975 break; 976 977 case doNGStar: 978 // Non-greedy *? quantifier 979 // compiles to 980 // 1. JMP 3 981 // 2. body of stuff being iterated over 982 // 3. STATE_SAVE 2 983 // 4 ... 984 { 985 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1. 986 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3. 987 int32_t jmpOp = buildOp(URX_JMP, saveLoc); 988 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); 989 appendOp(URX_STATE_SAVE, jmpLoc+1); 990 } 991 break; 992 993 994 case doIntervalInit: 995 // The '{' opening an interval quantifier was just scanned. 996 // Init the counter varaiables that will accumulate the values as the digits 997 // are scanned. 998 fIntervalLow = 0; 999 fIntervalUpper = -1; 1000 break; 1001 1002 case doIntevalLowerDigit: 1003 // Scanned a digit from the lower value of an {lower,upper} interval 1004 { 1005 int32_t digitValue = u_charDigitValue(fC.fChar); 1006 U_ASSERT(digitValue >= 0); 1007 int64_t val = (int64_t)fIntervalLow*10 + digitValue; 1008 if (val > INT32_MAX) { 1009 error(U_REGEX_NUMBER_TOO_BIG); 1010 } else { 1011 fIntervalLow = (int32_t)val; 1012 } 1013 } 1014 break; 1015 1016 case doIntervalUpperDigit: 1017 // Scanned a digit from the upper value of an {lower,upper} interval 1018 { 1019 if (fIntervalUpper < 0) { 1020 fIntervalUpper = 0; 1021 } 1022 int32_t digitValue = u_charDigitValue(fC.fChar); 1023 U_ASSERT(digitValue >= 0); 1024 int64_t val = (int64_t)fIntervalUpper*10 + digitValue; 1025 if (val > INT32_MAX) { 1026 error(U_REGEX_NUMBER_TOO_BIG); 1027 } else { 1028 fIntervalUpper = (int32_t)val; 1029 } 1030 } 1031 break; 1032 1033 case doIntervalSame: 1034 // Scanned a single value interval like {27}. Upper = Lower. 1035 fIntervalUpper = fIntervalLow; 1036 break; 1037 1038 case doInterval: 1039 // Finished scanning a normal {lower,upper} interval. Generate the code for it. 1040 if (compileInlineInterval() == FALSE) { 1041 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); 1042 } 1043 break; 1044 1045 case doPossessiveInterval: 1046 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it. 1047 { 1048 // Remember the loc for the top of the block being looped over. 1049 // (Can not reserve a slot in the compiled pattern at this time, because 1050 // compileInterval needs to reserve also, and blockTopLoc can only reserve 1051 // once per block.) 1052 int32_t topLoc = blockTopLoc(FALSE); 1053 1054 // Produce normal looping code. 1055 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); 1056 1057 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP 1058 // just as if the loop was inclosed in atomic parentheses. 1059 1060 // First the STO_SP before the start of the loop 1061 insertOp(topLoc); 1062 1063 int32_t varLoc = allocateData(1); // Reserve a data location for saving the 1064 int32_t op = buildOp(URX_STO_SP, varLoc); 1065 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1066 1067 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); 1068 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc); 1069 loopOp++; // point LoopOp after the just-inserted STO_SP 1070 fRXPat->fCompiledPat->push(loopOp, *fStatus); 1071 1072 // Then the LD_SP after the end of the loop 1073 appendOp(URX_LD_SP, varLoc); 1074 } 1075 1076 break; 1077 1078 case doNGInterval: 1079 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it. 1080 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG); 1081 break; 1082 1083 case doIntervalError: 1084 error(U_REGEX_BAD_INTERVAL); 1085 break; 1086 1087 case doLiteralChar: 1088 // We've just scanned a "normal" character from the pattern, 1089 literalChar(fC.fChar); 1090 break; 1091 1092 1093 case doEscapedLiteralChar: 1094 // We've just scanned an backslashed escaped character with no 1095 // special meaning. It represents itself. 1096 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && 1097 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] 1098 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] 1099 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1100 } 1101 literalChar(fC.fChar); 1102 break; 1103 1104 1105 case doDotAny: 1106 // scanned a ".", match any single character. 1107 { 1108 fixLiterals(FALSE); 1109 if (fModeFlags & UREGEX_DOTALL) { 1110 appendOp(URX_DOTANY_ALL, 0); 1111 } else if (fModeFlags & UREGEX_UNIX_LINES) { 1112 appendOp(URX_DOTANY_UNIX, 0); 1113 } else { 1114 appendOp(URX_DOTANY, 0); 1115 } 1116 } 1117 break; 1118 1119 case doCaret: 1120 { 1121 fixLiterals(FALSE); 1122 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1123 appendOp(URX_CARET, 0); 1124 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1125 appendOp(URX_CARET_M, 0); 1126 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1127 appendOp(URX_CARET, 0); // Only testing true start of input. 1128 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1129 appendOp(URX_CARET_M_UNIX, 0); 1130 } 1131 } 1132 break; 1133 1134 case doDollar: 1135 { 1136 fixLiterals(FALSE); 1137 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1138 appendOp(URX_DOLLAR, 0); 1139 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1140 appendOp(URX_DOLLAR_M, 0); 1141 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1142 appendOp(URX_DOLLAR_D, 0); 1143 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1144 appendOp(URX_DOLLAR_MD, 0); 1145 } 1146 } 1147 break; 1148 1149 case doBackslashA: 1150 fixLiterals(FALSE); 1151 appendOp(URX_CARET, 0); 1152 break; 1153 1154 case doBackslashB: 1155 { 1156 #if UCONFIG_NO_BREAK_ITERATION==1 1157 if (fModeFlags & UREGEX_UWORD) { 1158 error(U_UNSUPPORTED_ERROR); 1159 } 1160 #endif 1161 fixLiterals(FALSE); 1162 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B; 1163 appendOp(op, 1); 1164 } 1165 break; 1166 1167 case doBackslashb: 1168 { 1169 #if UCONFIG_NO_BREAK_ITERATION==1 1170 if (fModeFlags & UREGEX_UWORD) { 1171 error(U_UNSUPPORTED_ERROR); 1172 } 1173 #endif 1174 fixLiterals(FALSE); 1175 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B; 1176 appendOp(op, 0); 1177 } 1178 break; 1179 1180 case doBackslashD: 1181 fixLiterals(FALSE); 1182 appendOp(URX_BACKSLASH_D, 1); 1183 break; 1184 1185 case doBackslashd: 1186 fixLiterals(FALSE); 1187 appendOp(URX_BACKSLASH_D, 0); 1188 break; 1189 1190 case doBackslashG: 1191 fixLiterals(FALSE); 1192 appendOp(URX_BACKSLASH_G, 0); 1193 break; 1194 1195 case doBackslashH: 1196 fixLiterals(FALSE); 1197 appendOp(URX_BACKSLASH_H, 1); 1198 break; 1199 1200 case doBackslashh: 1201 fixLiterals(FALSE); 1202 appendOp(URX_BACKSLASH_H, 0); 1203 break; 1204 1205 case doBackslashR: 1206 fixLiterals(FALSE); 1207 appendOp(URX_BACKSLASH_R, 0); 1208 break; 1209 1210 case doBackslashS: 1211 fixLiterals(FALSE); 1212 appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET); 1213 break; 1214 1215 case doBackslashs: 1216 fixLiterals(FALSE); 1217 appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET); 1218 break; 1219 1220 case doBackslashV: 1221 fixLiterals(FALSE); 1222 appendOp(URX_BACKSLASH_V, 1); 1223 break; 1224 1225 case doBackslashv: 1226 fixLiterals(FALSE); 1227 appendOp(URX_BACKSLASH_V, 0); 1228 break; 1229 1230 case doBackslashW: 1231 fixLiterals(FALSE); 1232 appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET); 1233 break; 1234 1235 case doBackslashw: 1236 fixLiterals(FALSE); 1237 appendOp(URX_STATIC_SETREF, URX_ISWORD_SET); 1238 break; 1239 1240 case doBackslashX: 1241 fixLiterals(FALSE); 1242 appendOp(URX_BACKSLASH_X, 0); 1243 break; 1244 1245 1246 case doBackslashZ: 1247 fixLiterals(FALSE); 1248 appendOp(URX_DOLLAR, 0); 1249 break; 1250 1251 case doBackslashz: 1252 fixLiterals(FALSE); 1253 appendOp(URX_BACKSLASH_Z, 0); 1254 break; 1255 1256 case doEscapeError: 1257 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1258 break; 1259 1260 case doExit: 1261 fixLiterals(FALSE); 1262 returnVal = FALSE; 1263 break; 1264 1265 case doProperty: 1266 { 1267 fixLiterals(FALSE); 1268 UnicodeSet *theSet = scanProp(); 1269 compileSet(theSet); 1270 } 1271 break; 1272 1273 case doNamedChar: 1274 { 1275 UChar32 c = scanNamedChar(); 1276 literalChar(c); 1277 } 1278 break; 1279 1280 1281 case doBackRef: 1282 // BackReference. Somewhat unusual in that the front-end can not completely parse 1283 // the regular expression, because the number of digits to be consumed 1284 // depends on the number of capture groups that have been defined. So 1285 // we have to do it here instead. 1286 { 1287 int32_t numCaptureGroups = fRXPat->fGroupMap->size(); 1288 int32_t groupNum = 0; 1289 UChar32 c = fC.fChar; 1290 1291 for (;;) { 1292 // Loop once per digit, for max allowed number of digits in a back reference. 1293 int32_t digit = u_charDigitValue(c); 1294 groupNum = groupNum * 10 + digit; 1295 if (groupNum >= numCaptureGroups) { 1296 break; 1297 } 1298 c = peekCharLL(); 1299 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) { 1300 break; 1301 } 1302 nextCharLL(); 1303 } 1304 1305 // Scan of the back reference in the source regexp is complete. Now generate 1306 // the compiled code for it. 1307 // Because capture groups can be forward-referenced by back-references, 1308 // we fill the operand with the capture group number. At the end 1309 // of compilation, it will be changed to the variable's location. 1310 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal escape sequence, 1311 // and shouldn't enter this code path at all. 1312 fixLiterals(FALSE); 1313 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1314 appendOp(URX_BACKREF_I, groupNum); 1315 } else { 1316 appendOp(URX_BACKREF, groupNum); 1317 } 1318 } 1319 break; 1320 1321 case doBeginNamedBackRef: 1322 U_ASSERT(fCaptureName == NULL); 1323 fCaptureName = new UnicodeString; 1324 if (fCaptureName == NULL) { 1325 error(U_MEMORY_ALLOCATION_ERROR); 1326 } 1327 break; 1328 1329 case doContinueNamedBackRef: 1330 fCaptureName->append(fC.fChar); 1331 break; 1332 1333 case doCompleteNamedBackRef: 1334 { 1335 int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName); 1336 if (groupNumber == 0) { 1337 // Group name has not been defined. 1338 // Could be a forward reference. If we choose to support them at some 1339 // future time, extra mechanism will be required at this point. 1340 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); 1341 } else { 1342 // Given the number, handle identically to a \n numbered back reference. 1343 // See comments above, under doBackRef 1344 fixLiterals(FALSE); 1345 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1346 appendOp(URX_BACKREF_I, groupNumber); 1347 } else { 1348 appendOp(URX_BACKREF, groupNumber); 1349 } 1350 } 1351 delete fCaptureName; 1352 fCaptureName = NULL; 1353 break; 1354 } 1355 1356 case doPossessivePlus: 1357 // Possessive ++ quantifier. 1358 // Compiles to 1359 // 1. STO_SP 1360 // 2. body of stuff being iterated over 1361 // 3. STATE_SAVE 5 1362 // 4. JMP 2 1363 // 5. LD_SP 1364 // 6. ... 1365 // 1366 // Note: TODO: This is pretty inefficient. A mass of saved state is built up 1367 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056 1368 // 1369 { 1370 // Emit the STO_SP 1371 int32_t topLoc = blockTopLoc(TRUE); 1372 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr. 1373 int32_t op = buildOp(URX_STO_SP, stoLoc); 1374 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1375 1376 // Emit the STATE_SAVE 1377 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); 1378 1379 // Emit the JMP 1380 appendOp(URX_JMP, topLoc+1); 1381 1382 // Emit the LD_SP 1383 appendOp(URX_LD_SP, stoLoc); 1384 } 1385 break; 1386 1387 case doPossessiveStar: 1388 // Possessive *+ quantifier. 1389 // Compiles to 1390 // 1. STO_SP loc 1391 // 2. STATE_SAVE 5 1392 // 3. body of stuff being iterated over 1393 // 4. JMP 2 1394 // 5. LD_SP loc 1395 // 6 ... 1396 // TODO: do something to cut back the state stack each time through the loop. 1397 { 1398 // Reserve two slots at the top of the block. 1399 int32_t topLoc = blockTopLoc(TRUE); 1400 insertOp(topLoc); 1401 1402 // emit STO_SP loc 1403 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr. 1404 int32_t op = buildOp(URX_STO_SP, stoLoc); 1405 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1406 1407 // Emit the SAVE_STATE 5 1408 int32_t L7 = fRXPat->fCompiledPat->size()+1; 1409 op = buildOp(URX_STATE_SAVE, L7); 1410 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); 1411 1412 // Append the JMP operation. 1413 appendOp(URX_JMP, topLoc+1); 1414 1415 // Emit the LD_SP loc 1416 appendOp(URX_LD_SP, stoLoc); 1417 } 1418 break; 1419 1420 case doPossessiveOpt: 1421 // Possessive ?+ quantifier. 1422 // Compiles to 1423 // 1. STO_SP loc 1424 // 2. SAVE_STATE 5 1425 // 3. body of optional block 1426 // 4. LD_SP loc 1427 // 5. ... 1428 // 1429 { 1430 // Reserve two slots at the top of the block. 1431 int32_t topLoc = blockTopLoc(TRUE); 1432 insertOp(topLoc); 1433 1434 // Emit the STO_SP 1435 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr. 1436 int32_t op = buildOp(URX_STO_SP, stoLoc); 1437 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1438 1439 // Emit the SAVE_STATE 1440 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; 1441 op = buildOp(URX_STATE_SAVE, continueLoc); 1442 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); 1443 1444 // Emit the LD_SP 1445 appendOp(URX_LD_SP, stoLoc); 1446 } 1447 break; 1448 1449 1450 case doBeginMatchMode: 1451 fNewModeFlags = fModeFlags; 1452 fSetModeFlag = TRUE; 1453 break; 1454 1455 case doMatchMode: // (?i) and similar 1456 { 1457 int32_t bit = 0; 1458 switch (fC.fChar) { 1459 case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break; 1460 case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break; 1461 case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break; 1462 case 0x73: /* 's' */ bit = UREGEX_DOTALL; break; 1463 case 0x75: /* 'u' */ bit = 0; /* Unicode casing */ break; 1464 case 0x77: /* 'w' */ bit = UREGEX_UWORD; break; 1465 case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break; 1466 case 0x2d: /* '-' */ fSetModeFlag = FALSE; break; 1467 default: 1468 U_ASSERT(FALSE); // Should never happen. Other chars are filtered out 1469 // by the scanner. 1470 } 1471 if (fSetModeFlag) { 1472 fNewModeFlags |= bit; 1473 } else { 1474 fNewModeFlags &= ~bit; 1475 } 1476 } 1477 break; 1478 1479 case doSetMatchMode: 1480 // Emit code to match any pending literals, using the not-yet changed match mode. 1481 fixLiterals(); 1482 1483 // We've got a (?i) or similar. The match mode is being changed, but 1484 // the change is not scoped to a parenthesized block. 1485 U_ASSERT(fNewModeFlags < 0); 1486 fModeFlags = fNewModeFlags; 1487 1488 break; 1489 1490 1491 case doMatchModeParen: 1492 // We've got a (?i: or similar. Begin a parenthesized block, save old 1493 // mode flags so they can be restored at the close of the block. 1494 // 1495 // Compile to a 1496 // - NOP, which later may be replaced by a save-state if the 1497 // parenthesized group gets a * quantifier, followed by 1498 // - NOP, which may later be replaced by a save-state if there 1499 // is an '|' alternation within the parens. 1500 { 1501 fixLiterals(FALSE); 1502 appendOp(URX_NOP, 0); 1503 appendOp(URX_NOP, 0); 1504 1505 // On the Parentheses stack, start a new frame and add the postions 1506 // of the two NOPs (a normal non-capturing () frame, except for the 1507 // saving of the orignal mode flags.) 1508 fParenStack.push(fModeFlags, *fStatus); 1509 fParenStack.push(flags, *fStatus); // Frame Marker 1510 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP 1511 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP 1512 1513 // Set the current mode flags to the new values. 1514 U_ASSERT(fNewModeFlags < 0); 1515 fModeFlags = fNewModeFlags; 1516 } 1517 break; 1518 1519 case doBadModeFlag: 1520 error(U_REGEX_INVALID_FLAG); 1521 break; 1522 1523 case doSuppressComments: 1524 // We have just scanned a '(?'. We now need to prevent the character scanner from 1525 // treating a '#' as a to-the-end-of-line comment. 1526 // (This Perl compatibility just gets uglier and uglier to do...) 1527 fEOLComments = FALSE; 1528 break; 1529 1530 1531 case doSetAddAmp: 1532 { 1533 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1534 set->add(chAmp); 1535 } 1536 break; 1537 1538 case doSetAddDash: 1539 { 1540 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1541 set->add(chDash); 1542 } 1543 break; 1544 1545 case doSetBackslash_s: 1546 { 1547 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1548 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); 1549 break; 1550 } 1551 1552 case doSetBackslash_S: 1553 { 1554 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1555 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); 1556 SSet.complement(); 1557 set->addAll(SSet); 1558 break; 1559 } 1560 1561 case doSetBackslash_d: 1562 { 1563 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1564 // TODO - make a static set, ticket 6058. 1565 addCategory(set, U_GC_ND_MASK, *fStatus); 1566 break; 1567 } 1568 1569 case doSetBackslash_D: 1570 { 1571 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1572 UnicodeSet digits; 1573 // TODO - make a static set, ticket 6058. 1574 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus); 1575 digits.complement(); 1576 set->addAll(digits); 1577 break; 1578 } 1579 1580 case doSetBackslash_h: 1581 { 1582 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1583 UnicodeSet h; 1584 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus); 1585 h.add((UChar32)9); // Tab 1586 set->addAll(h); 1587 break; 1588 } 1589 1590 case doSetBackslash_H: 1591 { 1592 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1593 UnicodeSet h; 1594 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus); 1595 h.add((UChar32)9); // Tab 1596 h.complement(); 1597 set->addAll(h); 1598 break; 1599 } 1600 1601 case doSetBackslash_v: 1602 { 1603 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1604 set->add((UChar32)0x0a, (UChar32)0x0d); // add range 1605 set->add((UChar32)0x85); 1606 set->add((UChar32)0x2028, (UChar32)0x2029); 1607 break; 1608 } 1609 1610 case doSetBackslash_V: 1611 { 1612 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1613 UnicodeSet v; 1614 v.add((UChar32)0x0a, (UChar32)0x0d); // add range 1615 v.add((UChar32)0x85); 1616 v.add((UChar32)0x2028, (UChar32)0x2029); 1617 v.complement(); 1618 set->addAll(v); 1619 break; 1620 } 1621 1622 case doSetBackslash_w: 1623 { 1624 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1625 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); 1626 break; 1627 } 1628 1629 case doSetBackslash_W: 1630 { 1631 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1632 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); 1633 SSet.complement(); 1634 set->addAll(SSet); 1635 break; 1636 } 1637 1638 case doSetBegin: 1639 fixLiterals(FALSE); 1640 fSetStack.push(new UnicodeSet(), *fStatus); 1641 fSetOpStack.push(setStart, *fStatus); 1642 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1643 fSetOpStack.push(setCaseClose, *fStatus); 1644 } 1645 break; 1646 1647 case doSetBeginDifference1: 1648 // We have scanned something like [[abc]-[ 1649 // Set up a new UnicodeSet for the set beginning with the just-scanned '[' 1650 // Push a Difference operator, which will cause the new set to be subtracted from what 1651 // went before once it is created. 1652 setPushOp(setDifference1); 1653 fSetOpStack.push(setStart, *fStatus); 1654 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1655 fSetOpStack.push(setCaseClose, *fStatus); 1656 } 1657 break; 1658 1659 case doSetBeginIntersection1: 1660 // We have scanned something like [[abc]&[ 1661 // Need both the '&' operator and the open '[' operator. 1662 setPushOp(setIntersection1); 1663 fSetOpStack.push(setStart, *fStatus); 1664 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1665 fSetOpStack.push(setCaseClose, *fStatus); 1666 } 1667 break; 1668 1669 case doSetBeginUnion: 1670 // We have scanned something like [[abc][ 1671 // Need to handle the union operation explicitly [[abc] | [ 1672 setPushOp(setUnion); 1673 fSetOpStack.push(setStart, *fStatus); 1674 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1675 fSetOpStack.push(setCaseClose, *fStatus); 1676 } 1677 break; 1678 1679 case doSetDifference2: 1680 // We have scanned something like [abc-- 1681 // Consider this to unambiguously be a set difference operator. 1682 setPushOp(setDifference2); 1683 break; 1684 1685 case doSetEnd: 1686 // Have encountered the ']' that closes a set. 1687 // Force the evaluation of any pending operations within this set, 1688 // leave the completed set on the top of the set stack. 1689 setEval(setEnd); 1690 U_ASSERT(fSetOpStack.peeki()==setStart); 1691 fSetOpStack.popi(); 1692 break; 1693 1694 case doSetFinish: 1695 { 1696 // Finished a complete set expression, including all nested sets. 1697 // The close bracket has already triggered clearing out pending set operators, 1698 // the operator stack should be empty and the operand stack should have just 1699 // one entry, the result set. 1700 U_ASSERT(fSetOpStack.empty()); 1701 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop(); 1702 U_ASSERT(fSetStack.empty()); 1703 compileSet(theSet); 1704 break; 1705 } 1706 1707 case doSetIntersection2: 1708 // Have scanned something like [abc&& 1709 setPushOp(setIntersection2); 1710 break; 1711 1712 case doSetLiteral: 1713 // Union the just-scanned literal character into the set being built. 1714 // This operation is the highest precedence set operation, so we can always do 1715 // it immediately, without waiting to see what follows. It is necessary to perform 1716 // any pending '-' or '&' operation first, because these have the same precedence 1717 // as union-ing in a literal' 1718 { 1719 setEval(setUnion); 1720 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1721 s->add(fC.fChar); 1722 fLastSetLiteral = fC.fChar; 1723 break; 1724 } 1725 1726 case doSetLiteralEscaped: 1727 // A back-slash escaped literal character was encountered. 1728 // Processing is the same as with setLiteral, above, with the addition of 1729 // the optional check for errors on escaped ASCII letters. 1730 { 1731 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && 1732 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] 1733 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] 1734 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1735 } 1736 setEval(setUnion); 1737 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1738 s->add(fC.fChar); 1739 fLastSetLiteral = fC.fChar; 1740 break; 1741 } 1742 1743 case doSetNamedChar: 1744 // Scanning a \N{UNICODE CHARACTER NAME} 1745 // Aside from the source of the character, the processing is identical to doSetLiteral, 1746 // above. 1747 { 1748 UChar32 c = scanNamedChar(); 1749 setEval(setUnion); 1750 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1751 s->add(c); 1752 fLastSetLiteral = c; 1753 break; 1754 } 1755 1756 case doSetNamedRange: 1757 // We have scanned literal-\N{CHAR NAME}. Add the range to the set. 1758 // The left character is already in the set, and is saved in fLastSetLiteral. 1759 // The right side needs to be picked up, the scan is at the 'N'. 1760 // Lower Limit > Upper limit being an error matches both Java 1761 // and ICU UnicodeSet behavior. 1762 { 1763 UChar32 c = scanNamedChar(); 1764 if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) { 1765 error(U_REGEX_INVALID_RANGE); 1766 } 1767 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1768 s->add(fLastSetLiteral, c); 1769 fLastSetLiteral = c; 1770 break; 1771 } 1772 1773 1774 case doSetNegate: 1775 // Scanned a '^' at the start of a set. 1776 // Push the negation operator onto the set op stack. 1777 // A twist for case-insensitive matching: 1778 // the case closure operation must happen _before_ negation. 1779 // But the case closure operation will already be on the stack if it's required. 1780 // This requires checking for case closure, and swapping the stack order 1781 // if it is present. 1782 { 1783 int32_t tosOp = fSetOpStack.peeki(); 1784 if (tosOp == setCaseClose) { 1785 fSetOpStack.popi(); 1786 fSetOpStack.push(setNegation, *fStatus); 1787 fSetOpStack.push(setCaseClose, *fStatus); 1788 } else { 1789 fSetOpStack.push(setNegation, *fStatus); 1790 } 1791 } 1792 break; 1793 1794 case doSetNoCloseError: 1795 error(U_REGEX_MISSING_CLOSE_BRACKET); 1796 break; 1797 1798 case doSetOpError: 1799 error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal. 1800 break; 1801 1802 case doSetPosixProp: 1803 { 1804 UnicodeSet *s = scanPosixProp(); 1805 if (s != NULL) { 1806 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); 1807 tos->addAll(*s); 1808 delete s; 1809 } // else error. scanProp() reported the error status already. 1810 } 1811 break; 1812 1813 case doSetProp: 1814 // Scanned a \p \P within [brackets]. 1815 { 1816 UnicodeSet *s = scanProp(); 1817 if (s != NULL) { 1818 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); 1819 tos->addAll(*s); 1820 delete s; 1821 } // else error. scanProp() reported the error status already. 1822 } 1823 break; 1824 1825 1826 case doSetRange: 1827 // We have scanned literal-literal. Add the range to the set. 1828 // The left character is already in the set, and is saved in fLastSetLiteral. 1829 // The right side is the current character. 1830 // Lower Limit > Upper limit being an error matches both Java 1831 // and ICU UnicodeSet behavior. 1832 { 1833 1834 if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) { 1835 error(U_REGEX_INVALID_RANGE); 1836 } 1837 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1838 s->add(fLastSetLiteral, fC.fChar); 1839 break; 1840 } 1841 1842 default: 1843 U_ASSERT(FALSE); 1844 error(U_REGEX_INTERNAL_ERROR); 1845 break; 1846 } 1847 1848 if (U_FAILURE(*fStatus)) { 1849 returnVal = FALSE; 1850 } 1851 1852 return returnVal; 1853 } 1854 1855 1856 1857 //------------------------------------------------------------------------------ 1858 // 1859 // literalChar We've encountered a literal character from the pattern, 1860 // or an escape sequence that reduces to a character. 1861 // Add it to the string containing all literal chars/strings from 1862 // the pattern. 1863 // 1864 //------------------------------------------------------------------------------ 1865 void RegexCompile::literalChar(UChar32 c) { 1866 fLiteralChars.append(c); 1867 } 1868 1869 1870 //------------------------------------------------------------------------------ 1871 // 1872 // fixLiterals When compiling something that can follow a literal 1873 // string in a pattern, emit the code to match the 1874 // accumulated literal string. 1875 // 1876 // Optionally, split the last char of the string off into 1877 // a single "ONE_CHAR" operation, so that quantifiers can 1878 // apply to that char alone. Example: abc* 1879 // The * must apply to the 'c' only. 1880 // 1881 //------------------------------------------------------------------------------ 1882 void RegexCompile::fixLiterals(UBool split) { 1883 1884 // If no literal characters have been scanned but not yet had code generated 1885 // for them, nothing needs to be done. 1886 if (fLiteralChars.length() == 0) { 1887 return; 1888 } 1889 1890 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1); 1891 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); 1892 1893 // Split: We need to ensure that the last item in the compiled pattern 1894 // refers only to the last literal scanned in the pattern, so that 1895 // quantifiers (*, +, etc.) affect only it, and not a longer string. 1896 // Split before case folding for case insensitive matches. 1897 1898 if (split) { 1899 fLiteralChars.truncate(indexOfLastCodePoint); 1900 fixLiterals(FALSE); // Recursive call, emit code to match the first part of the string. 1901 // Note that the truncated literal string may be empty, in which case 1902 // nothing will be emitted. 1903 1904 literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal. 1905 fixLiterals(FALSE); // Second recursive call, code for the final code point. 1906 return; 1907 } 1908 1909 // If we are doing case-insensitive matching, case fold the string. This may expand 1910 // the string, e.g. the German sharp-s turns into "ss" 1911 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1912 fLiteralChars.foldCase(); 1913 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1); 1914 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); 1915 } 1916 1917 if (indexOfLastCodePoint == 0) { 1918 // Single character, emit a URX_ONECHAR op to match it. 1919 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && 1920 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) { 1921 appendOp(URX_ONECHAR_I, lastCodePoint); 1922 } else { 1923 appendOp(URX_ONECHAR, lastCodePoint); 1924 } 1925 } else { 1926 // Two or more chars, emit a URX_STRING to match them. 1927 if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) { 1928 error(U_REGEX_PATTERN_TOO_BIG); 1929 } 1930 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1931 appendOp(URX_STRING_I, fRXPat->fLiteralText.length()); 1932 } else { 1933 // TODO here: add optimization to split case sensitive strings of length two 1934 // into two single char ops, for efficiency. 1935 appendOp(URX_STRING, fRXPat->fLiteralText.length()); 1936 } 1937 appendOp(URX_STRING_LEN, fLiteralChars.length()); 1938 1939 // Add this string into the accumulated strings of the compiled pattern. 1940 fRXPat->fLiteralText.append(fLiteralChars); 1941 } 1942 1943 fLiteralChars.remove(); 1944 } 1945 1946 1947 int32_t RegexCompile::buildOp(int32_t type, int32_t val) { 1948 if (U_FAILURE(*fStatus)) { 1949 return 0; 1950 } 1951 if (type < 0 || type > 255) { 1952 U_ASSERT(FALSE); 1953 error(U_REGEX_INTERNAL_ERROR); 1954 type = URX_RESERVED_OP; 1955 } 1956 if (val > 0x00ffffff) { 1957 U_ASSERT(FALSE); 1958 error(U_REGEX_INTERNAL_ERROR); 1959 val = 0; 1960 } 1961 if (val < 0) { 1962 if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) { 1963 U_ASSERT(FALSE); 1964 error(U_REGEX_INTERNAL_ERROR); 1965 return -1; 1966 } 1967 if (URX_TYPE(val) != 0xff) { 1968 U_ASSERT(FALSE); 1969 error(U_REGEX_INTERNAL_ERROR); 1970 return -1; 1971 } 1972 type = URX_RESERVED_OP_N; 1973 } 1974 return (type << 24) | val; 1975 } 1976 1977 1978 //------------------------------------------------------------------------------ 1979 // 1980 // appendOp() Append a new instruction onto the compiled pattern 1981 // Includes error checking, limiting the size of the 1982 // pattern to lengths that can be represented in the 1983 // 24 bit operand field of an instruction. 1984 // 1985 //------------------------------------------------------------------------------ 1986 void RegexCompile::appendOp(int32_t op) { 1987 if (U_FAILURE(*fStatus)) { 1988 return; 1989 } 1990 fRXPat->fCompiledPat->addElement(op, *fStatus); 1991 if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) { 1992 error(U_REGEX_PATTERN_TOO_BIG); 1993 } 1994 } 1995 1996 void RegexCompile::appendOp(int32_t type, int32_t val) { 1997 appendOp(buildOp(type, val)); 1998 } 1999 2000 2001 //------------------------------------------------------------------------------ 2002 // 2003 // insertOp() Insert a slot for a new opcode into the already 2004 // compiled pattern code. 2005 // 2006 // Fill the slot with a NOP. Our caller will replace it 2007 // with what they really wanted. 2008 // 2009 //------------------------------------------------------------------------------ 2010 void RegexCompile::insertOp(int32_t where) { 2011 UVector64 *code = fRXPat->fCompiledPat; 2012 U_ASSERT(where>0 && where < code->size()); 2013 2014 int32_t nop = buildOp(URX_NOP, 0); 2015 code->insertElementAt(nop, where, *fStatus); 2016 2017 // Walk through the pattern, looking for any ops with targets that 2018 // were moved down by the insert. Fix them. 2019 int32_t loc; 2020 for (loc=0; loc<code->size(); loc++) { 2021 int32_t op = (int32_t)code->elementAti(loc); 2022 int32_t opType = URX_TYPE(op); 2023 int32_t opValue = URX_VAL(op); 2024 if ((opType == URX_JMP || 2025 opType == URX_JMPX || 2026 opType == URX_STATE_SAVE || 2027 opType == URX_CTR_LOOP || 2028 opType == URX_CTR_LOOP_NG || 2029 opType == URX_JMP_SAV || 2030 opType == URX_JMP_SAV_X || 2031 opType == URX_RELOC_OPRND) && opValue > where) { 2032 // Target location for this opcode is after the insertion point and 2033 // needs to be incremented to adjust for the insertion. 2034 opValue++; 2035 op = buildOp(opType, opValue); 2036 code->setElementAt(op, loc); 2037 } 2038 } 2039 2040 // Now fix up the parentheses stack. All positive values in it are locations in 2041 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.) 2042 for (loc=0; loc<fParenStack.size(); loc++) { 2043 int32_t x = fParenStack.elementAti(loc); 2044 U_ASSERT(x < code->size()); 2045 if (x>where) { 2046 x++; 2047 fParenStack.setElementAt(x, loc); 2048 } 2049 } 2050 2051 if (fMatchCloseParen > where) { 2052 fMatchCloseParen++; 2053 } 2054 if (fMatchOpenParen > where) { 2055 fMatchOpenParen++; 2056 } 2057 } 2058 2059 2060 //------------------------------------------------------------------------------ 2061 // 2062 // allocateData() Allocate storage in the matcher's static data area. 2063 // Return the index for the newly allocated data. 2064 // The storage won't actually exist until we are running a match 2065 // operation, but the storage indexes are inserted into various 2066 // opcodes while compiling the pattern. 2067 // 2068 //------------------------------------------------------------------------------ 2069 int32_t RegexCompile::allocateData(int32_t size) { 2070 if (U_FAILURE(*fStatus)) { 2071 return 0; 2072 } 2073 if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) { 2074 error(U_REGEX_INTERNAL_ERROR); 2075 return 0; 2076 } 2077 int32_t dataIndex = fRXPat->fDataSize; 2078 fRXPat->fDataSize += size; 2079 if (fRXPat->fDataSize >= 0x00fffff0) { 2080 error(U_REGEX_INTERNAL_ERROR); 2081 } 2082 return dataIndex; 2083 } 2084 2085 2086 //------------------------------------------------------------------------------ 2087 // 2088 // allocateStackData() Allocate space in the back-tracking stack frame. 2089 // Return the index for the newly allocated data. 2090 // The frame indexes are inserted into various 2091 // opcodes while compiling the pattern, meaning that frame 2092 // size must be restricted to the size that will fit 2093 // as an operand (24 bits). 2094 // 2095 //------------------------------------------------------------------------------ 2096 int32_t RegexCompile::allocateStackData(int32_t size) { 2097 if (U_FAILURE(*fStatus)) { 2098 return 0; 2099 } 2100 if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) { 2101 error(U_REGEX_INTERNAL_ERROR); 2102 return 0; 2103 } 2104 int32_t dataIndex = fRXPat->fFrameSize; 2105 fRXPat->fFrameSize += size; 2106 if (fRXPat->fFrameSize >= 0x00fffff0) { 2107 error(U_REGEX_PATTERN_TOO_BIG); 2108 } 2109 return dataIndex; 2110 } 2111 2112 2113 //------------------------------------------------------------------------------ 2114 // 2115 // blockTopLoc() Find or create a location in the compiled pattern 2116 // at the start of the operation or block that has 2117 // just been compiled. Needed when a quantifier (* or 2118 // whatever) appears, and we need to add an operation 2119 // at the start of the thing being quantified. 2120 // 2121 // (Parenthesized Blocks) have a slot with a NOP that 2122 // is reserved for this purpose. .* or similar don't 2123 // and a slot needs to be added. 2124 // 2125 // parameter reserveLoc : TRUE - ensure that there is space to add an opcode 2126 // at the returned location. 2127 // FALSE - just return the address, 2128 // do not reserve a location there. 2129 // 2130 //------------------------------------------------------------------------------ 2131 int32_t RegexCompile::blockTopLoc(UBool reserveLoc) { 2132 int32_t theLoc; 2133 fixLiterals(TRUE); // Emit code for any pending literals. 2134 // If last item was a string, emit separate op for the its last char. 2135 if (fRXPat->fCompiledPat->size() == fMatchCloseParen) 2136 { 2137 // The item just processed is a parenthesized block. 2138 theLoc = fMatchOpenParen; // A slot is already reserved for us. 2139 U_ASSERT(theLoc > 0); 2140 U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP); 2141 } 2142 else { 2143 // Item just compiled is a single thing, a ".", or a single char, a string or a set reference. 2144 // No slot for STATE_SAVE was pre-reserved in the compiled code. 2145 // We need to make space now. 2146 theLoc = fRXPat->fCompiledPat->size()-1; 2147 int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc); 2148 if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) { 2149 // Strings take two opcode, we want the position of the first one. 2150 // We can have a string at this point if a single character case-folded to two. 2151 theLoc--; 2152 } 2153 if (reserveLoc) { 2154 int32_t nop = buildOp(URX_NOP, 0); 2155 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); 2156 } 2157 } 2158 return theLoc; 2159 } 2160 2161 2162 2163 //------------------------------------------------------------------------------ 2164 // 2165 // handleCloseParen When compiling a close paren, we need to go back 2166 // and fix up any JMP or SAVE operations within the 2167 // parenthesized block that need to target the end 2168 // of the block. The locations of these are kept on 2169 // the paretheses stack. 2170 // 2171 // This function is called both when encountering a 2172 // real ) and at the end of the pattern. 2173 // 2174 //------------------------------------------------------------------------------ 2175 void RegexCompile::handleCloseParen() { 2176 int32_t patIdx; 2177 int32_t patOp; 2178 if (fParenStack.size() <= 0) { 2179 error(U_REGEX_MISMATCHED_PAREN); 2180 return; 2181 } 2182 2183 // Emit code for any pending literals. 2184 fixLiterals(FALSE); 2185 2186 // Fixup any operations within the just-closed parenthesized group 2187 // that need to reference the end of the (block). 2188 // (The first one popped from the stack is an unused slot for 2189 // alternation (OR) state save, but applying the fixup to it does no harm.) 2190 for (;;) { 2191 patIdx = fParenStack.popi(); 2192 if (patIdx < 0) { 2193 // value < 0 flags the start of the frame on the paren stack. 2194 break; 2195 } 2196 U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size()); 2197 patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx); 2198 U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set. 2199 patOp |= fRXPat->fCompiledPat->size(); // Set it now. 2200 fRXPat->fCompiledPat->setElementAt(patOp, patIdx); 2201 fMatchOpenParen = patIdx; 2202 } 2203 2204 // At the close of any parenthesized block, restore the match mode flags to 2205 // the value they had at the open paren. Saved value is 2206 // at the top of the paren stack. 2207 fModeFlags = fParenStack.popi(); 2208 U_ASSERT(fModeFlags < 0); 2209 2210 // DO any additional fixups, depending on the specific kind of 2211 // parentesized grouping this is 2212 2213 switch (patIdx) { 2214 case plain: 2215 case flags: 2216 // No additional fixups required. 2217 // (Grouping-only parentheses) 2218 break; 2219 case capturing: 2220 // Capturing Parentheses. 2221 // Insert a End Capture op into the pattern. 2222 // The frame offset of the variables for this cg is obtained from the 2223 // start capture op and put it into the end-capture op. 2224 { 2225 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); 2226 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); 2227 2228 int32_t frameVarLocation = URX_VAL(captureOp); 2229 appendOp(URX_END_CAPTURE, frameVarLocation); 2230 } 2231 break; 2232 case atomic: 2233 // Atomic Parenthesis. 2234 // Insert a LD_SP operation to restore the state stack to the position 2235 // it was when the atomic parens were entered. 2236 { 2237 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); 2238 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); 2239 int32_t stoLoc = URX_VAL(stoOp); 2240 appendOp(URX_LD_SP, stoLoc); 2241 } 2242 break; 2243 2244 case lookAhead: 2245 { 2246 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5); 2247 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); 2248 int32_t dataLoc = URX_VAL(startOp); 2249 appendOp(URX_LA_END, dataLoc); 2250 } 2251 break; 2252 2253 case negLookAhead: 2254 { 2255 // See comment at doOpenLookAheadNeg 2256 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1); 2257 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); 2258 int32_t dataLoc = URX_VAL(startOp); 2259 appendOp(URX_LA_END, dataLoc); 2260 appendOp(URX_BACKTRACK, 0); 2261 appendOp(URX_LA_END, dataLoc); 2262 2263 // Patch the URX_SAVE near the top of the block. 2264 // The destination of the SAVE is the final LA_END that was just added. 2265 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen); 2266 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); 2267 int32_t dest = fRXPat->fCompiledPat->size()-1; 2268 saveOp = buildOp(URX_STATE_SAVE, dest); 2269 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); 2270 } 2271 break; 2272 2273 case lookBehind: 2274 { 2275 // See comment at doOpenLookBehind. 2276 2277 // Append the URX_LB_END and URX_LA_END to the compiled pattern. 2278 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4); 2279 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); 2280 int32_t dataLoc = URX_VAL(startOp); 2281 appendOp(URX_LB_END, dataLoc); 2282 appendOp(URX_LA_END, dataLoc); 2283 2284 // Determine the min and max bounds for the length of the 2285 // string that the pattern can match. 2286 // An unbounded upper limit is an error. 2287 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; 2288 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); 2289 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); 2290 if (URX_TYPE(maxML) != 0) { 2291 error(U_REGEX_LOOK_BEHIND_LIMIT); 2292 break; 2293 } 2294 if (maxML == INT32_MAX) { 2295 error(U_REGEX_LOOK_BEHIND_LIMIT); 2296 break; 2297 } 2298 U_ASSERT(minML <= maxML); 2299 2300 // Insert the min and max match len bounds into the URX_LB_CONT op that 2301 // appears at the top of the look-behind block, at location fMatchOpenParen+1 2302 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2); 2303 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1); 2304 2305 } 2306 break; 2307 2308 2309 2310 case lookBehindN: 2311 { 2312 // See comment at doOpenLookBehindNeg. 2313 2314 // Append the URX_LBN_END to the compiled pattern. 2315 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5); 2316 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); 2317 int32_t dataLoc = URX_VAL(startOp); 2318 appendOp(URX_LBN_END, dataLoc); 2319 2320 // Determine the min and max bounds for the length of the 2321 // string that the pattern can match. 2322 // An unbounded upper limit is an error. 2323 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; 2324 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); 2325 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); 2326 if (URX_TYPE(maxML) != 0) { 2327 error(U_REGEX_LOOK_BEHIND_LIMIT); 2328 break; 2329 } 2330 if (maxML == INT32_MAX) { 2331 error(U_REGEX_LOOK_BEHIND_LIMIT); 2332 break; 2333 } 2334 U_ASSERT(minML <= maxML); 2335 2336 // Insert the min and max match len bounds into the URX_LB_CONT op that 2337 // appears at the top of the look-behind block, at location fMatchOpenParen+1 2338 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3); 2339 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2); 2340 2341 // Insert the pattern location to continue at after a successful match 2342 // as the last operand of the URX_LBN_CONT 2343 int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()); 2344 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1); 2345 } 2346 break; 2347 2348 2349 2350 default: 2351 U_ASSERT(FALSE); 2352 } 2353 2354 // remember the next location in the compiled pattern. 2355 // The compilation of Quantifiers will look at this to see whether its looping 2356 // over a parenthesized block or a single item 2357 fMatchCloseParen = fRXPat->fCompiledPat->size(); 2358 } 2359 2360 2361 2362 //------------------------------------------------------------------------------ 2363 // 2364 // compileSet Compile the pattern operations for a reference to a 2365 // UnicodeSet. 2366 // 2367 //------------------------------------------------------------------------------ 2368 void RegexCompile::compileSet(UnicodeSet *theSet) 2369 { 2370 if (theSet == NULL) { 2371 return; 2372 } 2373 // Remove any strings from the set. 2374 // There shoudn't be any, but just in case. 2375 // (Case Closure can add them; if we had a simple case closure avaialble that 2376 // ignored strings, that would be better.) 2377 theSet->removeAllStrings(); 2378 int32_t setSize = theSet->size(); 2379 2380 switch (setSize) { 2381 case 0: 2382 { 2383 // Set of no elements. Always fails to match. 2384 appendOp(URX_BACKTRACK, 0); 2385 delete theSet; 2386 } 2387 break; 2388 2389 case 1: 2390 { 2391 // The set contains only a single code point. Put it into 2392 // the compiled pattern as a single char operation rather 2393 // than a set, and discard the set itself. 2394 literalChar(theSet->charAt(0)); 2395 delete theSet; 2396 } 2397 break; 2398 2399 default: 2400 { 2401 // The set contains two or more chars. (the normal case) 2402 // Put it into the compiled pattern as a set. 2403 int32_t setNumber = fRXPat->fSets->size(); 2404 fRXPat->fSets->addElement(theSet, *fStatus); 2405 appendOp(URX_SETREF, setNumber); 2406 } 2407 } 2408 } 2409 2410 2411 //------------------------------------------------------------------------------ 2412 // 2413 // compileInterval Generate the code for a {min, max} style interval quantifier. 2414 // Except for the specific opcodes used, the code is the same 2415 // for all three types (greedy, non-greedy, possessive) of 2416 // intervals. The opcodes are supplied as parameters. 2417 // (There are two sets of opcodes - greedy & possessive use the 2418 // same ones, while non-greedy has it's own.) 2419 // 2420 // The code for interval loops has this form: 2421 // 0 CTR_INIT counter loc (in stack frame) 2422 // 1 5 patt address of CTR_LOOP at bottom of block 2423 // 2 min count 2424 // 3 max count (-1 for unbounded) 2425 // 4 ... block to be iterated over 2426 // 5 CTR_LOOP 2427 // 2428 // In 2429 //------------------------------------------------------------------------------ 2430 void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp) 2431 { 2432 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes 2433 // four slots in the compiled code. Reserve them. 2434 int32_t topOfBlock = blockTopLoc(TRUE); 2435 insertOp(topOfBlock); 2436 insertOp(topOfBlock); 2437 insertOp(topOfBlock); 2438 2439 // The operands for the CTR_INIT opcode include the index in the matcher data 2440 // of the counter. Allocate it now. There are two data items 2441 // counterLoc --> Loop counter 2442 // +1 --> Input index (for breaking non-progressing loops) 2443 // (Only present if unbounded upper limit on loop) 2444 int32_t dataSize = fIntervalUpper < 0 ? 2 : 1; 2445 int32_t counterLoc = allocateStackData(dataSize); 2446 2447 int32_t op = buildOp(InitOp, counterLoc); 2448 fRXPat->fCompiledPat->setElementAt(op, topOfBlock); 2449 2450 // The second operand of CTR_INIT is the location following the end of the loop. 2451 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the 2452 // compilation of something later on causes the code to grow and the target 2453 // position to move. 2454 int32_t loopEnd = fRXPat->fCompiledPat->size(); 2455 op = buildOp(URX_RELOC_OPRND, loopEnd); 2456 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); 2457 2458 // Followed by the min and max counts. 2459 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); 2460 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); 2461 2462 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op. 2463 // Goes at end of the block being looped over, so just append to the code so far. 2464 appendOp(LoopOp, topOfBlock); 2465 2466 if ((fIntervalLow & 0xff000000) != 0 || 2467 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { 2468 error(U_REGEX_NUMBER_TOO_BIG); 2469 } 2470 2471 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) { 2472 error(U_REGEX_MAX_LT_MIN); 2473 } 2474 } 2475 2476 2477 2478 UBool RegexCompile::compileInlineInterval() { 2479 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { 2480 // Too big to inline. Fail, which will cause looping code to be generated. 2481 // (Upper < Lower picks up unbounded upper and errors, both.) 2482 return FALSE; 2483 } 2484 2485 int32_t topOfBlock = blockTopLoc(FALSE); 2486 if (fIntervalUpper == 0) { 2487 // Pathological case. Attempt no matches, as if the block doesn't exist. 2488 // Discard the generated code for the block. 2489 // If the block included parens, discard the info pertaining to them as well. 2490 fRXPat->fCompiledPat->setSize(topOfBlock); 2491 if (fMatchOpenParen >= topOfBlock) { 2492 fMatchOpenParen = -1; 2493 } 2494 if (fMatchCloseParen >= topOfBlock) { 2495 fMatchCloseParen = -1; 2496 } 2497 return TRUE; 2498 } 2499 2500 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { 2501 // The thing being repeated is not a single op, but some 2502 // more complex block. Do it as a loop, not inlines. 2503 // Note that things "repeated" a max of once are handled as inline, because 2504 // the one copy of the code already generated is just fine. 2505 return FALSE; 2506 } 2507 2508 // Pick up the opcode that is to be repeated 2509 // 2510 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock); 2511 2512 // Compute the pattern location where the inline sequence 2513 // will end, and set up the state save op that will be needed. 2514 // 2515 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1 2516 + fIntervalUpper + (fIntervalUpper-fIntervalLow); 2517 int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc); 2518 if (fIntervalLow == 0) { 2519 insertOp(topOfBlock); 2520 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock); 2521 } 2522 2523 2524 2525 // Loop, emitting the op for the thing being repeated each time. 2526 // Loop starts at 1 because one instance of the op already exists in the pattern, 2527 // it was put there when it was originally encountered. 2528 int32_t i; 2529 for (i=1; i<fIntervalUpper; i++ ) { 2530 if (i >= fIntervalLow) { 2531 appendOp(saveOp); 2532 } 2533 appendOp(op); 2534 } 2535 return TRUE; 2536 } 2537 2538 2539 2540 //------------------------------------------------------------------------------ 2541 // 2542 // caseInsensitiveStart given a single code point from a pattern string, determine the 2543 // set of characters that could potentially begin a case-insensitive 2544 // match of a string beginning with that character, using full Unicode 2545 // case insensitive matching. 2546 // 2547 // This is used in optimizing find(). 2548 // 2549 // closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but 2550 // misses cases like this: 2551 // A string from the pattern begins with 'ss' (although all we know 2552 // in this context is that it begins with 's') 2553 // The pattern could match a string beginning with a German sharp-s 2554 // 2555 // To the ordinary case closure for a character c, we add all other 2556 // characters cx where the case closure of cx incudes a string form that begins 2557 // with the original character c. 2558 // 2559 // This function could be made smarter. The full pattern string is available 2560 // and it would be possible to verify that the extra characters being added 2561 // to the starting set fully match, rather than having just a first-char of the 2562 // folded form match. 2563 // 2564 //------------------------------------------------------------------------------ 2565 void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) { 2566 2567 // Machine Generated below. 2568 // It may need updating with new versions of Unicode. 2569 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed. 2570 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing 2571 2572 // Machine Generated Data. Do not hand edit. 2573 static const UChar32 RECaseFixCodePoints[] = { 2574 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc, 2575 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565, 2576 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07, 2577 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61, 2578 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000}; 2579 2580 static const int16_t RECaseFixStringOffsets[] = { 2581 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10, 2582 0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f, 2583 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43, 2584 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57, 2585 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0}; 2586 2587 static const int16_t RECaseFixCounts[] = { 2588 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1, 2589 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1, 2590 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 2591 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 2592 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0}; 2593 2594 static const UChar RECaseFixData[] = { 2595 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf, 2596 0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3, 2597 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3, 2598 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3, 2599 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14, 2600 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83, 2601 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90, 2602 0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95, 2603 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2, 2604 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7, 2605 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0}; 2606 2607 // End of machine generated data. 2608 2609 if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) { 2610 // This function should never be called with an invalid input character. 2611 U_ASSERT(FALSE); 2612 starterChars->clear(); 2613 } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { 2614 UChar32 caseFoldedC = u_foldCase(c, U_FOLD_CASE_DEFAULT); 2615 starterChars->set(caseFoldedC, caseFoldedC); 2616 2617 int32_t i; 2618 for (i=0; RECaseFixCodePoints[i]<c ; i++) { 2619 // Simple linear search through the sorted list of interesting code points. 2620 } 2621 2622 if (RECaseFixCodePoints[i] == c) { 2623 int32_t dataIndex = RECaseFixStringOffsets[i]; 2624 int32_t numCharsToAdd = RECaseFixCounts[i]; 2625 UChar32 cpToAdd = 0; 2626 for (int32_t j=0; j<numCharsToAdd; j++) { 2627 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd); 2628 starterChars->add(cpToAdd); 2629 } 2630 } 2631 2632 starterChars->closeOver(USET_CASE_INSENSITIVE); 2633 starterChars->removeAllStrings(); 2634 } else { 2635 // Not a cased character. Just return it alone. 2636 starterChars->set(c, c); 2637 } 2638 } 2639 2640 2641 // Increment with overflow check. 2642 // val and delta will both be positive. 2643 2644 static int32_t safeIncrement(int32_t val, int32_t delta) { 2645 if (INT32_MAX - val > delta) { 2646 return val + delta; 2647 } else { 2648 return INT32_MAX; 2649 } 2650 } 2651 2652 2653 //------------------------------------------------------------------------------ 2654 // 2655 // matchStartType Determine how a match can start. 2656 // Used to optimize find() operations. 2657 // 2658 // Operation is very similar to minMatchLength(). Walk the compiled 2659 // pattern, keeping an on-going minimum-match-length. For any 2660 // op where the min match coming in is zero, add that ops possible 2661 // starting matches to the possible starts for the overall pattern. 2662 // 2663 //------------------------------------------------------------------------------ 2664 void RegexCompile::matchStartType() { 2665 if (U_FAILURE(*fStatus)) { 2666 return; 2667 } 2668 2669 2670 int32_t loc; // Location in the pattern of the current op being processed. 2671 int32_t op; // The op being processed 2672 int32_t opType; // The opcode type of the op 2673 int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern 2674 int32_t numInitialStrings = 0; // Number of strings encountered that could match at start. 2675 2676 UBool atStart = TRUE; // True if no part of the pattern yet encountered 2677 // could have advanced the position in a match. 2678 // (Maximum match length so far == 0) 2679 2680 // forwardedLength is a vector holding minimum-match-length values that 2681 // are propagated forward in the pattern by JMP or STATE_SAVE operations. 2682 // It must be one longer than the pattern being checked because some ops 2683 // will jmp to a end-of-block+1 location from within a block, and we must 2684 // count those when checking the block. 2685 int32_t end = fRXPat->fCompiledPat->size(); 2686 UVector32 forwardedLength(end+1, *fStatus); 2687 forwardedLength.setSize(end+1); 2688 for (loc=3; loc<end; loc++) { 2689 forwardedLength.setElementAt(INT32_MAX, loc); 2690 } 2691 2692 for (loc = 3; loc<end; loc++) { 2693 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2694 opType = URX_TYPE(op); 2695 2696 // The loop is advancing linearly through the pattern. 2697 // If the op we are now at was the destination of a branch in the pattern, 2698 // and that path has a shorter minimum length than the current accumulated value, 2699 // replace the current accumulated value. 2700 if (forwardedLength.elementAti(loc) < currentLen) { 2701 currentLen = forwardedLength.elementAti(loc); 2702 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 2703 } 2704 2705 switch (opType) { 2706 // Ops that don't change the total length matched 2707 case URX_RESERVED_OP: 2708 case URX_END: 2709 case URX_FAIL: 2710 case URX_STRING_LEN: 2711 case URX_NOP: 2712 case URX_START_CAPTURE: 2713 case URX_END_CAPTURE: 2714 case URX_BACKSLASH_B: 2715 case URX_BACKSLASH_BU: 2716 case URX_BACKSLASH_G: 2717 case URX_BACKSLASH_Z: 2718 case URX_DOLLAR: 2719 case URX_DOLLAR_M: 2720 case URX_DOLLAR_D: 2721 case URX_DOLLAR_MD: 2722 case URX_RELOC_OPRND: 2723 case URX_STO_INP_LOC: 2724 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 2725 case URX_BACKREF_I: 2726 2727 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 2728 case URX_LD_SP: 2729 break; 2730 2731 case URX_CARET: 2732 if (atStart) { 2733 fRXPat->fStartType = START_START; 2734 } 2735 break; 2736 2737 case URX_CARET_M: 2738 case URX_CARET_M_UNIX: 2739 if (atStart) { 2740 fRXPat->fStartType = START_LINE; 2741 } 2742 break; 2743 2744 case URX_ONECHAR: 2745 if (currentLen == 0) { 2746 // This character could appear at the start of a match. 2747 // Add it to the set of possible starting characters. 2748 fRXPat->fInitialChars->add(URX_VAL(op)); 2749 numInitialStrings += 2; 2750 } 2751 currentLen = safeIncrement(currentLen, 1); 2752 atStart = FALSE; 2753 break; 2754 2755 2756 case URX_SETREF: 2757 if (currentLen == 0) { 2758 int32_t sn = URX_VAL(op); 2759 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); 2760 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn); 2761 fRXPat->fInitialChars->addAll(*s); 2762 numInitialStrings += 2; 2763 } 2764 currentLen = safeIncrement(currentLen, 1); 2765 atStart = FALSE; 2766 break; 2767 2768 case URX_LOOP_SR_I: 2769 // [Set]*, like a SETREF, above, in what it can match, 2770 // but may not match at all, so currentLen is not incremented. 2771 if (currentLen == 0) { 2772 int32_t sn = URX_VAL(op); 2773 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); 2774 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn); 2775 fRXPat->fInitialChars->addAll(*s); 2776 numInitialStrings += 2; 2777 } 2778 atStart = FALSE; 2779 break; 2780 2781 case URX_LOOP_DOT_I: 2782 if (currentLen == 0) { 2783 // .* at the start of a pattern. 2784 // Any character can begin the match. 2785 fRXPat->fInitialChars->clear(); 2786 fRXPat->fInitialChars->complement(); 2787 numInitialStrings += 2; 2788 } 2789 atStart = FALSE; 2790 break; 2791 2792 2793 case URX_STATIC_SETREF: 2794 if (currentLen == 0) { 2795 int32_t sn = URX_VAL(op); 2796 U_ASSERT(sn>0 && sn<URX_LAST_SET); 2797 const UnicodeSet *s = fRXPat->fStaticSets[sn]; 2798 fRXPat->fInitialChars->addAll(*s); 2799 numInitialStrings += 2; 2800 } 2801 currentLen = safeIncrement(currentLen, 1); 2802 atStart = FALSE; 2803 break; 2804 2805 2806 2807 case URX_STAT_SETREF_N: 2808 if (currentLen == 0) { 2809 int32_t sn = URX_VAL(op); 2810 const UnicodeSet *s = fRXPat->fStaticSets[sn]; 2811 UnicodeSet sc(*s); 2812 sc.complement(); 2813 fRXPat->fInitialChars->addAll(sc); 2814 numInitialStrings += 2; 2815 } 2816 currentLen = safeIncrement(currentLen, 1); 2817 atStart = FALSE; 2818 break; 2819 2820 2821 2822 case URX_BACKSLASH_D: 2823 // Digit Char 2824 if (currentLen == 0) { 2825 UnicodeSet s; 2826 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus); 2827 if (URX_VAL(op) != 0) { 2828 s.complement(); 2829 } 2830 fRXPat->fInitialChars->addAll(s); 2831 numInitialStrings += 2; 2832 } 2833 currentLen = safeIncrement(currentLen, 1); 2834 atStart = FALSE; 2835 break; 2836 2837 2838 case URX_BACKSLASH_H: 2839 // Horiz white space 2840 if (currentLen == 0) { 2841 UnicodeSet s; 2842 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus); 2843 s.add((UChar32)9); // Tab 2844 if (URX_VAL(op) != 0) { 2845 s.complement(); 2846 } 2847 fRXPat->fInitialChars->addAll(s); 2848 numInitialStrings += 2; 2849 } 2850 currentLen = safeIncrement(currentLen, 1); 2851 atStart = FALSE; 2852 break; 2853 2854 2855 case URX_BACKSLASH_R: // Any line ending sequence 2856 case URX_BACKSLASH_V: // Any line ending code point, with optional negation 2857 if (currentLen == 0) { 2858 UnicodeSet s; 2859 s.add((UChar32)0x0a, (UChar32)0x0d); // add range 2860 s.add((UChar32)0x85); 2861 s.add((UChar32)0x2028, (UChar32)0x2029); 2862 if (URX_VAL(op) != 0) { 2863 // Complement option applies to URX_BACKSLASH_V only. 2864 s.complement(); 2865 } 2866 fRXPat->fInitialChars->addAll(s); 2867 numInitialStrings += 2; 2868 } 2869 currentLen = safeIncrement(currentLen, 1); 2870 atStart = FALSE; 2871 break; 2872 2873 2874 2875 case URX_ONECHAR_I: 2876 // Case Insensitive Single Character. 2877 if (currentLen == 0) { 2878 UChar32 c = URX_VAL(op); 2879 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { 2880 UnicodeSet starters(c, c); 2881 starters.closeOver(USET_CASE_INSENSITIVE); 2882 // findCaseInsensitiveStarters(c, &starters); 2883 // For ONECHAR_I, no need to worry about text chars that expand on folding into strings. 2884 // The expanded folding can't match the pattern. 2885 fRXPat->fInitialChars->addAll(starters); 2886 } else { 2887 // Char has no case variants. Just add it as-is to the 2888 // set of possible starting chars. 2889 fRXPat->fInitialChars->add(c); 2890 } 2891 numInitialStrings += 2; 2892 } 2893 currentLen = safeIncrement(currentLen, 1); 2894 atStart = FALSE; 2895 break; 2896 2897 2898 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 2899 case URX_DOTANY_ALL: // . matches one or two. 2900 case URX_DOTANY: 2901 case URX_DOTANY_UNIX: 2902 if (currentLen == 0) { 2903 // These constructs are all bad news when they appear at the start 2904 // of a match. Any character can begin the match. 2905 fRXPat->fInitialChars->clear(); 2906 fRXPat->fInitialChars->complement(); 2907 numInitialStrings += 2; 2908 } 2909 currentLen = safeIncrement(currentLen, 1); 2910 atStart = FALSE; 2911 break; 2912 2913 2914 case URX_JMPX: 2915 loc++; // Except for extra operand on URX_JMPX, same as URX_JMP. 2916 U_FALLTHROUGH; 2917 case URX_JMP: 2918 { 2919 int32_t jmpDest = URX_VAL(op); 2920 if (jmpDest < loc) { 2921 // Loop of some kind. Can safely ignore, the worst that will happen 2922 // is that we understate the true minimum length 2923 currentLen = forwardedLength.elementAti(loc+1); 2924 2925 } else { 2926 // Forward jump. Propagate the current min length to the target loc of the jump. 2927 U_ASSERT(jmpDest <= end+1); 2928 if (forwardedLength.elementAti(jmpDest) > currentLen) { 2929 forwardedLength.setElementAt(currentLen, jmpDest); 2930 } 2931 } 2932 } 2933 atStart = FALSE; 2934 break; 2935 2936 case URX_JMP_SAV: 2937 case URX_JMP_SAV_X: 2938 // Combo of state save to the next loc, + jmp backwards. 2939 // Net effect on min. length computation is nothing. 2940 atStart = FALSE; 2941 break; 2942 2943 case URX_BACKTRACK: 2944 // Fails are kind of like a branch, except that the min length was 2945 // propagated already, by the state save. 2946 currentLen = forwardedLength.elementAti(loc+1); 2947 atStart = FALSE; 2948 break; 2949 2950 2951 case URX_STATE_SAVE: 2952 { 2953 // State Save, for forward jumps, propagate the current minimum. 2954 // of the state save. 2955 int32_t jmpDest = URX_VAL(op); 2956 if (jmpDest > loc) { 2957 if (currentLen < forwardedLength.elementAti(jmpDest)) { 2958 forwardedLength.setElementAt(currentLen, jmpDest); 2959 } 2960 } 2961 } 2962 atStart = FALSE; 2963 break; 2964 2965 2966 2967 2968 case URX_STRING: 2969 { 2970 loc++; 2971 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2972 int32_t stringLen = URX_VAL(stringLenOp); 2973 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); 2974 U_ASSERT(stringLenOp >= 2); 2975 if (currentLen == 0) { 2976 // Add the starting character of this string to the set of possible starting 2977 // characters for this pattern. 2978 int32_t stringStartIdx = URX_VAL(op); 2979 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); 2980 fRXPat->fInitialChars->add(c); 2981 2982 // Remember this string. After the entire pattern has been checked, 2983 // if nothing else is identified that can start a match, we'll use it. 2984 numInitialStrings++; 2985 fRXPat->fInitialStringIdx = stringStartIdx; 2986 fRXPat->fInitialStringLen = stringLen; 2987 } 2988 2989 currentLen = safeIncrement(currentLen, stringLen); 2990 atStart = FALSE; 2991 } 2992 break; 2993 2994 case URX_STRING_I: 2995 { 2996 // Case-insensitive string. Unlike exact-match strings, we won't 2997 // attempt a string search for possible match positions. But we 2998 // do update the set of possible starting characters. 2999 loc++; 3000 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3001 int32_t stringLen = URX_VAL(stringLenOp); 3002 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); 3003 U_ASSERT(stringLenOp >= 2); 3004 if (currentLen == 0) { 3005 // Add the starting character of this string to the set of possible starting 3006 // characters for this pattern. 3007 int32_t stringStartIdx = URX_VAL(op); 3008 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); 3009 UnicodeSet s; 3010 findCaseInsensitiveStarters(c, &s); 3011 fRXPat->fInitialChars->addAll(s); 3012 numInitialStrings += 2; // Matching on an initial string not possible. 3013 } 3014 currentLen = safeIncrement(currentLen, stringLen); 3015 atStart = FALSE; 3016 } 3017 break; 3018 3019 case URX_CTR_INIT: 3020 case URX_CTR_INIT_NG: 3021 { 3022 // Loop Init Ops. These don't change the min length, but they are 4 word ops 3023 // so location must be updated accordingly. 3024 // Loop Init Ops. 3025 // If the min loop count == 0 3026 // move loc forwards to the end of the loop, skipping over the body. 3027 // If the min count is > 0, 3028 // continue normal processing of the body of the loop. 3029 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1); 3030 loopEndLoc = URX_VAL(loopEndLoc); 3031 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2); 3032 if (minLoopCount == 0) { 3033 // Min Loop Count of 0, treat like a forward branch and 3034 // move the current minimum length up to the target 3035 // (end of loop) location. 3036 U_ASSERT(loopEndLoc <= end+1); 3037 if (forwardedLength.elementAti(loopEndLoc) > currentLen) { 3038 forwardedLength.setElementAt(currentLen, loopEndLoc); 3039 } 3040 } 3041 loc+=3; // Skips over operands of CTR_INIT 3042 } 3043 atStart = FALSE; 3044 break; 3045 3046 3047 case URX_CTR_LOOP: 3048 case URX_CTR_LOOP_NG: 3049 // Loop ops. 3050 // The jump is conditional, backwards only. 3051 atStart = FALSE; 3052 break; 3053 3054 case URX_LOOP_C: 3055 // More loop ops. These state-save to themselves. 3056 // don't change the minimum match 3057 atStart = FALSE; 3058 break; 3059 3060 3061 case URX_LA_START: 3062 case URX_LB_START: 3063 { 3064 // Look-around. Scan forward until the matching look-ahead end, 3065 // without processing the look-around block. This is overly pessimistic. 3066 3067 // Keep track of the nesting depth of look-around blocks. Boilerplate code for 3068 // lookahead contains two LA_END instructions, so count goes up by two 3069 // for each LA_START. 3070 int32_t depth = (opType == URX_LA_START? 2: 1); 3071 for (;;) { 3072 loc++; 3073 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3074 if (URX_TYPE(op) == URX_LA_START) { 3075 depth+=2; 3076 } 3077 if (URX_TYPE(op) == URX_LB_START) { 3078 depth++; 3079 } 3080 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) { 3081 depth--; 3082 if (depth == 0) { 3083 break; 3084 } 3085 } 3086 if (URX_TYPE(op) == URX_STATE_SAVE) { 3087 // Need this because neg lookahead blocks will FAIL to outside 3088 // of the block. 3089 int32_t jmpDest = URX_VAL(op); 3090 if (jmpDest > loc) { 3091 if (currentLen < forwardedLength.elementAti(jmpDest)) { 3092 forwardedLength.setElementAt(currentLen, jmpDest); 3093 } 3094 } 3095 } 3096 U_ASSERT(loc <= end); 3097 } 3098 } 3099 break; 3100 3101 case URX_LA_END: 3102 case URX_LB_CONT: 3103 case URX_LB_END: 3104 case URX_LBN_CONT: 3105 case URX_LBN_END: 3106 U_ASSERT(FALSE); // Shouldn't get here. These ops should be 3107 // consumed by the scan in URX_LA_START and LB_START 3108 3109 break; 3110 3111 default: 3112 U_ASSERT(FALSE); 3113 } 3114 3115 } 3116 3117 3118 // We have finished walking through the ops. Check whether some forward jump 3119 // propagated a shorter length to location end+1. 3120 if (forwardedLength.elementAti(end+1) < currentLen) { 3121 currentLen = forwardedLength.elementAti(end+1); 3122 } 3123 3124 3125 fRXPat->fInitialChars8->init(fRXPat->fInitialChars); 3126 3127 3128 // Sort out what we should check for when looking for candidate match start positions. 3129 // In order of preference, 3130 // 1. Start of input text buffer. 3131 // 2. A literal string. 3132 // 3. Start of line in multi-line mode. 3133 // 4. A single literal character. 3134 // 5. A character from a set of characters. 3135 // 3136 if (fRXPat->fStartType == START_START) { 3137 // Match only at the start of an input text string. 3138 // start type is already set. We're done. 3139 } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) { 3140 // Match beginning only with a literal string. 3141 UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx); 3142 U_ASSERT(fRXPat->fInitialChars->contains(c)); 3143 fRXPat->fStartType = START_STRING; 3144 fRXPat->fInitialChar = c; 3145 } else if (fRXPat->fStartType == START_LINE) { 3146 // Match at start of line in Multi-Line mode. 3147 // Nothing to do here; everything is already set. 3148 } else if (fRXPat->fMinMatchLen == 0) { 3149 // Zero length match possible. We could start anywhere. 3150 fRXPat->fStartType = START_NO_INFO; 3151 } else if (fRXPat->fInitialChars->size() == 1) { 3152 // All matches begin with the same char. 3153 fRXPat->fStartType = START_CHAR; 3154 fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0); 3155 U_ASSERT(fRXPat->fInitialChar != (UChar32)-1); 3156 } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE && 3157 fRXPat->fMinMatchLen > 0) { 3158 // Matches start with a set of character smaller than the set of all chars. 3159 fRXPat->fStartType = START_SET; 3160 } else { 3161 // Matches can start with anything 3162 fRXPat->fStartType = START_NO_INFO; 3163 } 3164 3165 return; 3166 } 3167 3168 3169 3170 //------------------------------------------------------------------------------ 3171 // 3172 // minMatchLength Calculate the length of the shortest string that could 3173 // match the specified pattern. 3174 // Length is in 16 bit code units, not code points. 3175 // 3176 // The calculated length may not be exact. The returned 3177 // value may be shorter than the actual minimum; it must 3178 // never be longer. 3179 // 3180 // start and end are the range of p-code operations to be 3181 // examined. The endpoints are included in the range. 3182 // 3183 //------------------------------------------------------------------------------ 3184 int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { 3185 if (U_FAILURE(*fStatus)) { 3186 return 0; 3187 } 3188 3189 U_ASSERT(start <= end); 3190 U_ASSERT(end < fRXPat->fCompiledPat->size()); 3191 3192 3193 int32_t loc; 3194 int32_t op; 3195 int32_t opType; 3196 int32_t currentLen = 0; 3197 3198 3199 // forwardedLength is a vector holding minimum-match-length values that 3200 // are propagated forward in the pattern by JMP or STATE_SAVE operations. 3201 // It must be one longer than the pattern being checked because some ops 3202 // will jmp to a end-of-block+1 location from within a block, and we must 3203 // count those when checking the block. 3204 UVector32 forwardedLength(end+2, *fStatus); 3205 forwardedLength.setSize(end+2); 3206 for (loc=start; loc<=end+1; loc++) { 3207 forwardedLength.setElementAt(INT32_MAX, loc); 3208 } 3209 3210 for (loc = start; loc<=end; loc++) { 3211 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3212 opType = URX_TYPE(op); 3213 3214 // The loop is advancing linearly through the pattern. 3215 // If the op we are now at was the destination of a branch in the pattern, 3216 // and that path has a shorter minimum length than the current accumulated value, 3217 // replace the current accumulated value. 3218 // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some 3219 // no-match-possible cases. 3220 if (forwardedLength.elementAti(loc) < currentLen) { 3221 currentLen = forwardedLength.elementAti(loc); 3222 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 3223 } 3224 3225 switch (opType) { 3226 // Ops that don't change the total length matched 3227 case URX_RESERVED_OP: 3228 case URX_END: 3229 case URX_STRING_LEN: 3230 case URX_NOP: 3231 case URX_START_CAPTURE: 3232 case URX_END_CAPTURE: 3233 case URX_BACKSLASH_B: 3234 case URX_BACKSLASH_BU: 3235 case URX_BACKSLASH_G: 3236 case URX_BACKSLASH_Z: 3237 case URX_CARET: 3238 case URX_DOLLAR: 3239 case URX_DOLLAR_M: 3240 case URX_DOLLAR_D: 3241 case URX_DOLLAR_MD: 3242 case URX_RELOC_OPRND: 3243 case URX_STO_INP_LOC: 3244 case URX_CARET_M: 3245 case URX_CARET_M_UNIX: 3246 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 3247 case URX_BACKREF_I: 3248 3249 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 3250 case URX_LD_SP: 3251 3252 case URX_JMP_SAV: 3253 case URX_JMP_SAV_X: 3254 break; 3255 3256 3257 // Ops that match a minimum of one character (one or two 16 bit code units.) 3258 // 3259 case URX_ONECHAR: 3260 case URX_STATIC_SETREF: 3261 case URX_STAT_SETREF_N: 3262 case URX_SETREF: 3263 case URX_BACKSLASH_D: 3264 case URX_BACKSLASH_H: 3265 case URX_BACKSLASH_R: 3266 case URX_BACKSLASH_V: 3267 case URX_ONECHAR_I: 3268 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 3269 case URX_DOTANY_ALL: // . matches one or two. 3270 case URX_DOTANY: 3271 case URX_DOTANY_UNIX: 3272 currentLen = safeIncrement(currentLen, 1); 3273 break; 3274 3275 3276 case URX_JMPX: 3277 loc++; // URX_JMPX has an extra operand, ignored here, 3278 // otherwise processed identically to URX_JMP. 3279 U_FALLTHROUGH; 3280 case URX_JMP: 3281 { 3282 int32_t jmpDest = URX_VAL(op); 3283 if (jmpDest < loc) { 3284 // Loop of some kind. Can safely ignore, the worst that will happen 3285 // is that we understate the true minimum length 3286 currentLen = forwardedLength.elementAti(loc+1); 3287 } else { 3288 // Forward jump. Propagate the current min length to the target loc of the jump. 3289 U_ASSERT(jmpDest <= end+1); 3290 if (forwardedLength.elementAti(jmpDest) > currentLen) { 3291 forwardedLength.setElementAt(currentLen, jmpDest); 3292 } 3293 } 3294 } 3295 break; 3296 3297 case URX_BACKTRACK: 3298 { 3299 // Back-tracks are kind of like a branch, except that the min length was 3300 // propagated already, by the state save. 3301 currentLen = forwardedLength.elementAti(loc+1); 3302 } 3303 break; 3304 3305 3306 case URX_STATE_SAVE: 3307 { 3308 // State Save, for forward jumps, propagate the current minimum. 3309 // of the state save. 3310 int32_t jmpDest = URX_VAL(op); 3311 if (jmpDest > loc) { 3312 if (currentLen < forwardedLength.elementAti(jmpDest)) { 3313 forwardedLength.setElementAt(currentLen, jmpDest); 3314 } 3315 } 3316 } 3317 break; 3318 3319 3320 case URX_STRING: 3321 { 3322 loc++; 3323 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3324 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp)); 3325 } 3326 break; 3327 3328 3329 case URX_STRING_I: 3330 { 3331 loc++; 3332 // TODO: with full case folding, matching input text may be shorter than 3333 // the string we have here. More smarts could put some bounds on it. 3334 // Assume a min length of one for now. A min length of zero causes 3335 // optimization failures for a pattern like "string"+ 3336 // currentLen += URX_VAL(stringLenOp); 3337 currentLen = safeIncrement(currentLen, 1); 3338 } 3339 break; 3340 3341 case URX_CTR_INIT: 3342 case URX_CTR_INIT_NG: 3343 { 3344 // Loop Init Ops. 3345 // If the min loop count == 0 3346 // move loc forwards to the end of the loop, skipping over the body. 3347 // If the min count is > 0, 3348 // continue normal processing of the body of the loop. 3349 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1); 3350 loopEndLoc = URX_VAL(loopEndLoc); 3351 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2); 3352 if (minLoopCount == 0) { 3353 loc = loopEndLoc; 3354 } else { 3355 loc+=3; // Skips over operands of CTR_INIT 3356 } 3357 } 3358 break; 3359 3360 3361 case URX_CTR_LOOP: 3362 case URX_CTR_LOOP_NG: 3363 // Loop ops. 3364 // The jump is conditional, backwards only. 3365 break; 3366 3367 case URX_LOOP_SR_I: 3368 case URX_LOOP_DOT_I: 3369 case URX_LOOP_C: 3370 // More loop ops. These state-save to themselves. 3371 // don't change the minimum match - could match nothing at all. 3372 break; 3373 3374 3375 case URX_LA_START: 3376 case URX_LB_START: 3377 { 3378 // Look-around. Scan forward until the matching look-ahead end, 3379 // without processing the look-around block. This is overly pessimistic for look-ahead, 3380 // it assumes that the look-ahead match might be zero-length. 3381 // TODO: Positive lookahead could recursively do the block, then continue 3382 // with the longer of the block or the value coming in. Ticket 6060 3383 int32_t depth = (opType == URX_LA_START? 2: 1);; 3384 for (;;) { 3385 loc++; 3386 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3387 if (URX_TYPE(op) == URX_LA_START) { 3388 // The boilerplate for look-ahead includes two LA_END insturctions, 3389 // Depth will be decremented by each one when it is seen. 3390 depth += 2; 3391 } 3392 if (URX_TYPE(op) == URX_LB_START) { 3393 depth++; 3394 } 3395 if (URX_TYPE(op) == URX_LA_END) { 3396 depth--; 3397 if (depth == 0) { 3398 break; 3399 } 3400 } 3401 if (URX_TYPE(op)==URX_LBN_END) { 3402 depth--; 3403 if (depth == 0) { 3404 break; 3405 } 3406 } 3407 if (URX_TYPE(op) == URX_STATE_SAVE) { 3408 // Need this because neg lookahead blocks will FAIL to outside 3409 // of the block. 3410 int32_t jmpDest = URX_VAL(op); 3411 if (jmpDest > loc) { 3412 if (currentLen < forwardedLength.elementAti(jmpDest)) { 3413 forwardedLength.setElementAt(currentLen, jmpDest); 3414 } 3415 } 3416 } 3417 U_ASSERT(loc <= end); 3418 } 3419 } 3420 break; 3421 3422 case URX_LA_END: 3423 case URX_LB_CONT: 3424 case URX_LB_END: 3425 case URX_LBN_CONT: 3426 case URX_LBN_END: 3427 // Only come here if the matching URX_LA_START or URX_LB_START was not in the 3428 // range being sized, which happens when measuring size of look-behind blocks. 3429 break; 3430 3431 default: 3432 U_ASSERT(FALSE); 3433 } 3434 3435 } 3436 3437 // We have finished walking through the ops. Check whether some forward jump 3438 // propagated a shorter length to location end+1. 3439 if (forwardedLength.elementAti(end+1) < currentLen) { 3440 currentLen = forwardedLength.elementAti(end+1); 3441 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 3442 } 3443 3444 return currentLen; 3445 } 3446 3447 //------------------------------------------------------------------------------ 3448 // 3449 // maxMatchLength Calculate the length of the longest string that could 3450 // match the specified pattern. 3451 // Length is in 16 bit code units, not code points. 3452 // 3453 // The calculated length may not be exact. The returned 3454 // value may be longer than the actual maximum; it must 3455 // never be shorter. 3456 // 3457 //------------------------------------------------------------------------------ 3458 int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { 3459 if (U_FAILURE(*fStatus)) { 3460 return 0; 3461 } 3462 U_ASSERT(start <= end); 3463 U_ASSERT(end < fRXPat->fCompiledPat->size()); 3464 3465 3466 int32_t loc; 3467 int32_t op; 3468 int32_t opType; 3469 int32_t currentLen = 0; 3470 UVector32 forwardedLength(end+1, *fStatus); 3471 forwardedLength.setSize(end+1); 3472 3473 for (loc=start; loc<=end; loc++) { 3474 forwardedLength.setElementAt(0, loc); 3475 } 3476 3477 for (loc = start; loc<=end; loc++) { 3478 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3479 opType = URX_TYPE(op); 3480 3481 // The loop is advancing linearly through the pattern. 3482 // If the op we are now at was the destination of a branch in the pattern, 3483 // and that path has a longer maximum length than the current accumulated value, 3484 // replace the current accumulated value. 3485 if (forwardedLength.elementAti(loc) > currentLen) { 3486 currentLen = forwardedLength.elementAti(loc); 3487 } 3488 3489 switch (opType) { 3490 // Ops that don't change the total length matched 3491 case URX_RESERVED_OP: 3492 case URX_END: 3493 case URX_STRING_LEN: 3494 case URX_NOP: 3495 case URX_START_CAPTURE: 3496 case URX_END_CAPTURE: 3497 case URX_BACKSLASH_B: 3498 case URX_BACKSLASH_BU: 3499 case URX_BACKSLASH_G: 3500 case URX_BACKSLASH_Z: 3501 case URX_CARET: 3502 case URX_DOLLAR: 3503 case URX_DOLLAR_M: 3504 case URX_DOLLAR_D: 3505 case URX_DOLLAR_MD: 3506 case URX_RELOC_OPRND: 3507 case URX_STO_INP_LOC: 3508 case URX_CARET_M: 3509 case URX_CARET_M_UNIX: 3510 3511 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 3512 case URX_LD_SP: 3513 3514 case URX_LB_END: 3515 case URX_LB_CONT: 3516 case URX_LBN_CONT: 3517 case URX_LBN_END: 3518 break; 3519 3520 3521 // Ops that increase that cause an unbounded increase in the length 3522 // of a matched string, or that increase it a hard to characterize way. 3523 // Call the max length unbounded, and stop further checking. 3524 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 3525 case URX_BACKREF_I: 3526 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 3527 currentLen = INT32_MAX; 3528 break; 3529 3530 3531 // Ops that match a max of one character (possibly two 16 bit code units.) 3532 // 3533 case URX_STATIC_SETREF: 3534 case URX_STAT_SETREF_N: 3535 case URX_SETREF: 3536 case URX_BACKSLASH_D: 3537 case URX_BACKSLASH_H: 3538 case URX_BACKSLASH_R: 3539 case URX_BACKSLASH_V: 3540 case URX_ONECHAR_I: 3541 case URX_DOTANY_ALL: 3542 case URX_DOTANY: 3543 case URX_DOTANY_UNIX: 3544 currentLen = safeIncrement(currentLen, 2); 3545 break; 3546 3547 // Single literal character. Increase current max length by one or two, 3548 // depending on whether the char is in the supplementary range. 3549 case URX_ONECHAR: 3550 currentLen = safeIncrement(currentLen, 1); 3551 if (URX_VAL(op) > 0x10000) { 3552 currentLen = safeIncrement(currentLen, 1); 3553 } 3554 break; 3555 3556 // Jumps. 3557 // 3558 case URX_JMP: 3559 case URX_JMPX: 3560 case URX_JMP_SAV: 3561 case URX_JMP_SAV_X: 3562 { 3563 int32_t jmpDest = URX_VAL(op); 3564 if (jmpDest < loc) { 3565 // Loop of some kind. Max match length is unbounded. 3566 currentLen = INT32_MAX; 3567 } else { 3568 // Forward jump. Propagate the current min length to the target loc of the jump. 3569 if (forwardedLength.elementAti(jmpDest) < currentLen) { 3570 forwardedLength.setElementAt(currentLen, jmpDest); 3571 } 3572 currentLen = 0; 3573 } 3574 } 3575 break; 3576 3577 case URX_BACKTRACK: 3578 // back-tracks are kind of like a branch, except that the max length was 3579 // propagated already, by the state save. 3580 currentLen = forwardedLength.elementAti(loc+1); 3581 break; 3582 3583 3584 case URX_STATE_SAVE: 3585 { 3586 // State Save, for forward jumps, propagate the current minimum. 3587 // of the state save. 3588 // For backwards jumps, they create a loop, maximum 3589 // match length is unbounded. 3590 int32_t jmpDest = URX_VAL(op); 3591 if (jmpDest > loc) { 3592 if (currentLen > forwardedLength.elementAti(jmpDest)) { 3593 forwardedLength.setElementAt(currentLen, jmpDest); 3594 } 3595 } else { 3596 currentLen = INT32_MAX; 3597 } 3598 } 3599 break; 3600 3601 3602 3603 3604 case URX_STRING: 3605 { 3606 loc++; 3607 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3608 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp)); 3609 break; 3610 } 3611 3612 case URX_STRING_I: 3613 // TODO: This code assumes that any user string that matches will be no longer 3614 // than our compiled string, with case insensitive matching. 3615 // Our compiled string has been case-folded already. 3616 // 3617 // Any matching user string will have no more code points than our 3618 // compiled (folded) string. Folding may add code points, but 3619 // not remove them. 3620 // 3621 // There is a potential problem if a supplemental code point 3622 // case-folds to a BMP code point. In this case our compiled string 3623 // could be shorter (in code units) than a matching user string. 3624 // 3625 // At this time (Unicode 6.1) there are no such characters, and this case 3626 // is not being handled. A test, intltest regex/Bug9283, will fail if 3627 // any problematic characters are added to Unicode. 3628 // 3629 // If this happens, we can make a set of the BMP chars that the 3630 // troublesome supplementals fold to, scan our string, and bump the 3631 // currentLen one extra for each that is found. 3632 // 3633 { 3634 loc++; 3635 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3636 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp)); 3637 } 3638 break; 3639 3640 case URX_CTR_INIT: 3641 case URX_CTR_INIT_NG: 3642 // For Loops, recursively call this function on the pattern for the loop body, 3643 // then multiply the result by the maximum loop count. 3644 { 3645 int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1)); 3646 if (loopEndLoc == loc+4) { 3647 // Loop has an empty body. No affect on max match length. 3648 // Continue processing with code after the loop end. 3649 loc = loopEndLoc; 3650 break; 3651 } 3652 3653 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3)); 3654 if (maxLoopCount == -1) { 3655 // Unbounded Loop. No upper bound on match length. 3656 currentLen = INT32_MAX; 3657 break; 3658 } 3659 3660 U_ASSERT(loopEndLoc >= loc+4); 3661 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call. 3662 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount; 3663 if (updatedLen >= INT32_MAX) { 3664 currentLen = INT32_MAX; 3665 break; 3666 } 3667 currentLen = (int32_t)updatedLen; 3668 loc = loopEndLoc; 3669 break; 3670 } 3671 3672 case URX_CTR_LOOP: 3673 case URX_CTR_LOOP_NG: 3674 // These opcodes will be skipped over by code for URX_CRT_INIT. 3675 // We shouldn't encounter them here. 3676 U_ASSERT(FALSE); 3677 break; 3678 3679 case URX_LOOP_SR_I: 3680 case URX_LOOP_DOT_I: 3681 case URX_LOOP_C: 3682 // For anything to do with loops, make the match length unbounded. 3683 currentLen = INT32_MAX; 3684 break; 3685 3686 3687 3688 case URX_LA_START: 3689 case URX_LA_END: 3690 // Look-ahead. Just ignore, treat the look-ahead block as if 3691 // it were normal pattern. Gives a too-long match length, 3692 // but good enough for now. 3693 break; 3694 3695 // End of look-ahead ops should always be consumed by the processing at 3696 // the URX_LA_START op. 3697 // U_ASSERT(FALSE); 3698 // break; 3699 3700 case URX_LB_START: 3701 { 3702 // Look-behind. Scan forward until the matching look-around end, 3703 // without processing the look-behind block. 3704 int32_t depth = 0; 3705 for (;;) { 3706 loc++; 3707 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3708 if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) { 3709 depth++; 3710 } 3711 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) { 3712 if (depth == 0) { 3713 break; 3714 } 3715 depth--; 3716 } 3717 U_ASSERT(loc < end); 3718 } 3719 } 3720 break; 3721 3722 default: 3723 U_ASSERT(FALSE); 3724 } 3725 3726 3727 if (currentLen == INT32_MAX) { 3728 // The maximum length is unbounded. 3729 // Stop further processing of the pattern. 3730 break; 3731 } 3732 3733 } 3734 return currentLen; 3735 3736 } 3737 3738 3739 //------------------------------------------------------------------------------ 3740 // 3741 // stripNOPs Remove any NOP operations from the compiled pattern code. 3742 // Extra NOPs are inserted for some constructs during the initial 3743 // code generation to provide locations that may be patched later. 3744 // Many end up unneeded, and are removed by this function. 3745 // 3746 // In order to minimize the number of passes through the pattern, 3747 // back-reference fixup is also performed here (adjusting 3748 // back-reference operands to point to the correct frame offsets). 3749 // 3750 //------------------------------------------------------------------------------ 3751 void RegexCompile::stripNOPs() { 3752 3753 if (U_FAILURE(*fStatus)) { 3754 return; 3755 } 3756 3757 int32_t end = fRXPat->fCompiledPat->size(); 3758 UVector32 deltas(end, *fStatus); 3759 3760 // Make a first pass over the code, computing the amount that things 3761 // will be offset at each location in the original code. 3762 int32_t loc; 3763 int32_t d = 0; 3764 for (loc=0; loc<end; loc++) { 3765 deltas.addElement(d, *fStatus); 3766 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3767 if (URX_TYPE(op) == URX_NOP) { 3768 d++; 3769 } 3770 } 3771 3772 UnicodeString caseStringBuffer; 3773 3774 // Make a second pass over the code, removing the NOPs by moving following 3775 // code up, and patching operands that refer to code locations that 3776 // are being moved. The array of offsets from the first step is used 3777 // to compute the new operand values. 3778 int32_t src; 3779 int32_t dst = 0; 3780 for (src=0; src<end; src++) { 3781 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src); 3782 int32_t opType = URX_TYPE(op); 3783 switch (opType) { 3784 case URX_NOP: 3785 break; 3786 3787 case URX_STATE_SAVE: 3788 case URX_JMP: 3789 case URX_CTR_LOOP: 3790 case URX_CTR_LOOP_NG: 3791 case URX_RELOC_OPRND: 3792 case URX_JMPX: 3793 case URX_JMP_SAV: 3794 case URX_JMP_SAV_X: 3795 // These are instructions with operands that refer to code locations. 3796 { 3797 int32_t operandAddress = URX_VAL(op); 3798 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size()); 3799 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress); 3800 op = buildOp(opType, fixedOperandAddress); 3801 fRXPat->fCompiledPat->setElementAt(op, dst); 3802 dst++; 3803 break; 3804 } 3805 3806 case URX_BACKREF: 3807 case URX_BACKREF_I: 3808 { 3809 int32_t where = URX_VAL(op); 3810 if (where > fRXPat->fGroupMap->size()) { 3811 error(U_REGEX_INVALID_BACK_REF); 3812 break; 3813 } 3814 where = fRXPat->fGroupMap->elementAti(where-1); 3815 op = buildOp(opType, where); 3816 fRXPat->fCompiledPat->setElementAt(op, dst); 3817 dst++; 3818 3819 fRXPat->fNeedsAltInput = TRUE; 3820 break; 3821 } 3822 case URX_RESERVED_OP: 3823 case URX_RESERVED_OP_N: 3824 case URX_BACKTRACK: 3825 case URX_END: 3826 case URX_ONECHAR: 3827 case URX_STRING: 3828 case URX_STRING_LEN: 3829 case URX_START_CAPTURE: 3830 case URX_END_CAPTURE: 3831 case URX_STATIC_SETREF: 3832 case URX_STAT_SETREF_N: 3833 case URX_SETREF: 3834 case URX_DOTANY: 3835 case URX_FAIL: 3836 case URX_BACKSLASH_B: 3837 case URX_BACKSLASH_BU: 3838 case URX_BACKSLASH_G: 3839 case URX_BACKSLASH_X: 3840 case URX_BACKSLASH_Z: 3841 case URX_DOTANY_ALL: 3842 case URX_BACKSLASH_D: 3843 case URX_CARET: 3844 case URX_DOLLAR: 3845 case URX_CTR_INIT: 3846 case URX_CTR_INIT_NG: 3847 case URX_DOTANY_UNIX: 3848 case URX_STO_SP: 3849 case URX_LD_SP: 3850 case URX_STO_INP_LOC: 3851 case URX_LA_START: 3852 case URX_LA_END: 3853 case URX_ONECHAR_I: 3854 case URX_STRING_I: 3855 case URX_DOLLAR_M: 3856 case URX_CARET_M: 3857 case URX_CARET_M_UNIX: 3858 case URX_LB_START: 3859 case URX_LB_CONT: 3860 case URX_LB_END: 3861 case URX_LBN_CONT: 3862 case URX_LBN_END: 3863 case URX_LOOP_SR_I: 3864 case URX_LOOP_DOT_I: 3865 case URX_LOOP_C: 3866 case URX_DOLLAR_D: 3867 case URX_DOLLAR_MD: 3868 case URX_BACKSLASH_H: 3869 case URX_BACKSLASH_R: 3870 case URX_BACKSLASH_V: 3871 // These instructions are unaltered by the relocation. 3872 fRXPat->fCompiledPat->setElementAt(op, dst); 3873 dst++; 3874 break; 3875 3876 default: 3877 // Some op is unaccounted for. 3878 U_ASSERT(FALSE); 3879 error(U_REGEX_INTERNAL_ERROR); 3880 } 3881 } 3882 3883 fRXPat->fCompiledPat->setSize(dst); 3884 } 3885 3886 3887 3888 3889 //------------------------------------------------------------------------------ 3890 // 3891 // Error Report a rule parse error. 3892 // Only report it if no previous error has been recorded. 3893 // 3894 //------------------------------------------------------------------------------ 3895 void RegexCompile::error(UErrorCode e) { 3896 if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) { 3897 *fStatus = e; 3898 // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public 3899 // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are 3900 // int64_t. If the values of the latter are out of range for the former, 3901 // set them to the appropriate "field not supported" values. 3902 if (fLineNum > 0x7FFFFFFF) { 3903 fParseErr->line = 0; 3904 fParseErr->offset = -1; 3905 } else if (fCharNum > 0x7FFFFFFF) { 3906 fParseErr->line = (int32_t)fLineNum; 3907 fParseErr->offset = -1; 3908 } else { 3909 fParseErr->line = (int32_t)fLineNum; 3910 fParseErr->offset = (int32_t)fCharNum; 3911 } 3912 3913 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context 3914 3915 // Fill in the context. 3916 // Note: extractBetween() pins supplied indicies to the string bounds. 3917 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext)); 3918 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext)); 3919 utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status); 3920 utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status); 3921 } 3922 } 3923 3924 3925 // 3926 // Assorted Unicode character constants. 3927 // Numeric because there is no portable way to enter them as literals. 3928 // (Think EBCDIC). 3929 // 3930 static const UChar chCR = 0x0d; // New lines, for terminating comments. 3931 static const UChar chLF = 0x0a; // Line Feed 3932 static const UChar chPound = 0x23; // '#', introduces a comment. 3933 static const UChar chDigit0 = 0x30; // '0' 3934 static const UChar chDigit7 = 0x37; // '9' 3935 static const UChar chColon = 0x3A; // ':' 3936 static const UChar chE = 0x45; // 'E' 3937 static const UChar chQ = 0x51; // 'Q' 3938 //static const UChar chN = 0x4E; // 'N' 3939 static const UChar chP = 0x50; // 'P' 3940 static const UChar chBackSlash = 0x5c; // '\' introduces a char escape 3941 //static const UChar chLBracket = 0x5b; // '[' 3942 static const UChar chRBracket = 0x5d; // ']' 3943 static const UChar chUp = 0x5e; // '^' 3944 static const UChar chLowerP = 0x70; 3945 static const UChar chLBrace = 0x7b; // '{' 3946 static const UChar chRBrace = 0x7d; // '}' 3947 static const UChar chNEL = 0x85; // NEL newline variant 3948 static const UChar chLS = 0x2028; // Unicode Line Separator 3949 3950 3951 //------------------------------------------------------------------------------ 3952 // 3953 // nextCharLL Low Level Next Char from the regex pattern. 3954 // Get a char from the string, keep track of input position 3955 // for error reporting. 3956 // 3957 //------------------------------------------------------------------------------ 3958 UChar32 RegexCompile::nextCharLL() { 3959 UChar32 ch; 3960 3961 if (fPeekChar != -1) { 3962 ch = fPeekChar; 3963 fPeekChar = -1; 3964 return ch; 3965 } 3966 3967 // assume we're already in the right place 3968 ch = UTEXT_NEXT32(fRXPat->fPattern); 3969 if (ch == U_SENTINEL) { 3970 return ch; 3971 } 3972 3973 if (ch == chCR || 3974 ch == chNEL || 3975 ch == chLS || 3976 (ch == chLF && fLastChar != chCR)) { 3977 // Character is starting a new line. Bump up the line number, and 3978 // reset the column to 0. 3979 fLineNum++; 3980 fCharNum=0; 3981 } 3982 else { 3983 // Character is not starting a new line. Except in the case of a 3984 // LF following a CR, increment the column position. 3985 if (ch != chLF) { 3986 fCharNum++; 3987 } 3988 } 3989 fLastChar = ch; 3990 return ch; 3991 } 3992 3993 //------------------------------------------------------------------------------ 3994 // 3995 // peekCharLL Low Level Character Scanning, sneak a peek at the next 3996 // character without actually getting it. 3997 // 3998 //------------------------------------------------------------------------------ 3999 UChar32 RegexCompile::peekCharLL() { 4000 if (fPeekChar == -1) { 4001 fPeekChar = nextCharLL(); 4002 } 4003 return fPeekChar; 4004 } 4005 4006 4007 //------------------------------------------------------------------------------ 4008 // 4009 // nextChar for pattern scanning. At this level, we handle stripping 4010 // out comments and processing some backslash character escapes. 4011 // The rest of the pattern grammar is handled at the next level up. 4012 // 4013 //------------------------------------------------------------------------------ 4014 void RegexCompile::nextChar(RegexPatternChar &c) { 4015 4016 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 4017 c.fChar = nextCharLL(); 4018 c.fQuoted = FALSE; 4019 4020 if (fQuoteMode) { 4021 c.fQuoted = TRUE; 4022 if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) || 4023 c.fChar == (UChar32)-1) { 4024 fQuoteMode = FALSE; // Exit quote mode, 4025 nextCharLL(); // discard the E 4026 nextChar(c); // recurse to get the real next char 4027 } 4028 } 4029 else if (fInBackslashQuote) { 4030 // The current character immediately follows a '\' 4031 // Don't check for any further escapes, just return it as-is. 4032 // Don't set c.fQuoted, because that would prevent the state machine from 4033 // dispatching on the character. 4034 fInBackslashQuote = FALSE; 4035 } 4036 else 4037 { 4038 // We are not in a \Q quoted region \E of the source. 4039 // 4040 if (fModeFlags & UREGEX_COMMENTS) { 4041 // 4042 // We are in free-spacing and comments mode. 4043 // Scan through any white space and comments, until we 4044 // reach a significant character or the end of inut. 4045 for (;;) { 4046 if (c.fChar == (UChar32)-1) { 4047 break; // End of Input 4048 } 4049 if (c.fChar == chPound && fEOLComments == TRUE) { 4050 // Start of a comment. Consume the rest of it, until EOF or a new line 4051 for (;;) { 4052 c.fChar = nextCharLL(); 4053 if (c.fChar == (UChar32)-1 || // EOF 4054 c.fChar == chCR || 4055 c.fChar == chLF || 4056 c.fChar == chNEL || 4057 c.fChar == chLS) { 4058 break; 4059 } 4060 } 4061 } 4062 // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061. 4063 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) { 4064 break; 4065 } 4066 c.fChar = nextCharLL(); 4067 } 4068 } 4069 4070 // 4071 // check for backslash escaped characters. 4072 // 4073 if (c.fChar == chBackSlash) { 4074 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 4075 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) { 4076 // 4077 // A '\' sequence that is handled by ICU's standard unescapeAt function. 4078 // Includes \uxxxx, \n, \r, many others. 4079 // Return the single equivalent character. 4080 // 4081 nextCharLL(); // get & discard the peeked char. 4082 c.fQuoted = TRUE; 4083 4084 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) { 4085 int32_t endIndex = (int32_t)pos; 4086 c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents); 4087 4088 if (endIndex == pos) { 4089 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 4090 } 4091 fCharNum += endIndex - pos; 4092 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex); 4093 } else { 4094 int32_t offset = 0; 4095 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern); 4096 4097 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos); 4098 c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); 4099 4100 if (offset == 0) { 4101 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 4102 } else if (context.lastOffset == offset) { 4103 UTEXT_PREVIOUS32(fRXPat->fPattern); 4104 } else if (context.lastOffset != offset-1) { 4105 utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1); 4106 } 4107 fCharNum += offset; 4108 } 4109 } 4110 else if (peekCharLL() == chDigit0) { 4111 // Octal Escape, using Java Regexp Conventions 4112 // which are \0 followed by 1-3 octal digits. 4113 // Different from ICU Unescape handling of Octal, which does not 4114 // require the leading 0. 4115 // Java also has the convention of only consuming 2 octal digits if 4116 // the three digit number would be > 0xff 4117 // 4118 c.fChar = 0; 4119 nextCharLL(); // Consume the initial 0. 4120 int index; 4121 for (index=0; index<3; index++) { 4122 int32_t ch = peekCharLL(); 4123 if (ch<chDigit0 || ch>chDigit7) { 4124 if (index==0) { 4125 // \0 is not followed by any octal digits. 4126 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 4127 } 4128 break; 4129 } 4130 c.fChar <<= 3; 4131 c.fChar += ch&7; 4132 if (c.fChar <= 255) { 4133 nextCharLL(); 4134 } else { 4135 // The last digit made the number too big. Forget we saw it. 4136 c.fChar >>= 3; 4137 } 4138 } 4139 c.fQuoted = TRUE; 4140 } 4141 else if (peekCharLL() == chQ) { 4142 // "\Q" enter quote mode, which will continue until "\E" 4143 fQuoteMode = TRUE; 4144 nextCharLL(); // discard the 'Q'. 4145 nextChar(c); // recurse to get the real next char. 4146 } 4147 else 4148 { 4149 // We are in a '\' escape that will be handled by the state table scanner. 4150 // Just return the backslash, but remember that the following char is to 4151 // be taken literally. 4152 fInBackslashQuote = TRUE; 4153 } 4154 } 4155 } 4156 4157 // re-enable # to end-of-line comments, in case they were disabled. 4158 // They are disabled by the parser upon seeing '(?', but this lasts for 4159 // the fetching of the next character only. 4160 fEOLComments = TRUE; 4161 4162 // putc(c.fChar, stdout); 4163 } 4164 4165 4166 4167 //------------------------------------------------------------------------------ 4168 // 4169 // scanNamedChar 4170 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. 4171 // 4172 // The scan position will be at the 'N'. On return 4173 // the scan position should be just after the '}' 4174 // 4175 // Return the UChar32 4176 // 4177 //------------------------------------------------------------------------------ 4178 UChar32 RegexCompile::scanNamedChar() { 4179 if (U_FAILURE(*fStatus)) { 4180 return 0; 4181 } 4182 4183 nextChar(fC); 4184 if (fC.fChar != chLBrace) { 4185 error(U_REGEX_PROPERTY_SYNTAX); 4186 return 0; 4187 } 4188 4189 UnicodeString charName; 4190 for (;;) { 4191 nextChar(fC); 4192 if (fC.fChar == chRBrace) { 4193 break; 4194 } 4195 if (fC.fChar == -1) { 4196 error(U_REGEX_PROPERTY_SYNTAX); 4197 return 0; 4198 } 4199 charName.append(fC.fChar); 4200 } 4201 4202 char name[100]; 4203 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) || 4204 (uint32_t)charName.length()>=sizeof(name)) { 4205 // All Unicode character names have only invariant characters. 4206 // The API to get a character, given a name, accepts only char *, forcing us to convert, 4207 // which requires this error check 4208 error(U_REGEX_PROPERTY_SYNTAX); 4209 return 0; 4210 } 4211 charName.extract(0, charName.length(), name, sizeof(name), US_INV); 4212 4213 UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus); 4214 if (U_FAILURE(*fStatus)) { 4215 error(U_REGEX_PROPERTY_SYNTAX); 4216 } 4217 4218 nextChar(fC); // Continue overall regex pattern processing with char after the '}' 4219 return theChar; 4220 } 4221 4222 //------------------------------------------------------------------------------ 4223 // 4224 // scanProp Construct a UnicodeSet from the text at the current scan 4225 // position, which will be of the form \p{whaterver} 4226 // 4227 // The scan position will be at the 'p' or 'P'. On return 4228 // the scan position should be just after the '}' 4229 // 4230 // Return a UnicodeSet, constructed from the \P pattern, 4231 // or NULL if the pattern is invalid. 4232 // 4233 //------------------------------------------------------------------------------ 4234 UnicodeSet *RegexCompile::scanProp() { 4235 UnicodeSet *uset = NULL; 4236 4237 if (U_FAILURE(*fStatus)) { 4238 return NULL; 4239 } 4240 (void)chLowerP; // Suppress compiler unused variable warning. 4241 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP); 4242 UBool negated = (fC.fChar == chP); 4243 4244 UnicodeString propertyName; 4245 nextChar(fC); 4246 if (fC.fChar != chLBrace) { 4247 error(U_REGEX_PROPERTY_SYNTAX); 4248 return NULL; 4249 } 4250 for (;;) { 4251 nextChar(fC); 4252 if (fC.fChar == chRBrace) { 4253 break; 4254 } 4255 if (fC.fChar == -1) { 4256 // Hit the end of the input string without finding the closing '}' 4257 error(U_REGEX_PROPERTY_SYNTAX); 4258 return NULL; 4259 } 4260 propertyName.append(fC.fChar); 4261 } 4262 uset = createSetForProperty(propertyName, negated); 4263 nextChar(fC); // Move input scan to position following the closing '}' 4264 return uset; 4265 } 4266 4267 //------------------------------------------------------------------------------ 4268 // 4269 // scanPosixProp Construct a UnicodeSet from the text at the current scan 4270 // position, which is expected be of the form [:property expression:] 4271 // 4272 // The scan position will be at the opening ':'. On return 4273 // the scan position must be on the closing ']' 4274 // 4275 // Return a UnicodeSet constructed from the pattern, 4276 // or NULL if this is not a valid POSIX-style set expression. 4277 // If not a property expression, restore the initial scan position 4278 // (to the opening ':') 4279 // 4280 // Note: the opening '[:' is not sufficient to guarantee that 4281 // this is a [:property:] expression. 4282 // [:'+=,] is a perfectly good ordinary set expression that 4283 // happens to include ':' as one of its characters. 4284 // 4285 //------------------------------------------------------------------------------ 4286 UnicodeSet *RegexCompile::scanPosixProp() { 4287 UnicodeSet *uset = NULL; 4288 4289 if (U_FAILURE(*fStatus)) { 4290 return NULL; 4291 } 4292 4293 U_ASSERT(fC.fChar == chColon); 4294 4295 // Save the scanner state. 4296 // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062 4297 int64_t savedScanIndex = fScanIndex; 4298 int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 4299 UBool savedQuoteMode = fQuoteMode; 4300 UBool savedInBackslashQuote = fInBackslashQuote; 4301 UBool savedEOLComments = fEOLComments; 4302 int64_t savedLineNum = fLineNum; 4303 int64_t savedCharNum = fCharNum; 4304 UChar32 savedLastChar = fLastChar; 4305 UChar32 savedPeekChar = fPeekChar; 4306 RegexPatternChar savedfC = fC; 4307 4308 // Scan for a closing ]. A little tricky because there are some perverse 4309 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression, 4310 // ending on the second closing ]. 4311 4312 UnicodeString propName; 4313 UBool negated = FALSE; 4314 4315 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:] 4316 nextChar(fC); 4317 if (fC.fChar == chUp) { 4318 negated = TRUE; 4319 nextChar(fC); 4320 } 4321 4322 // Scan for the closing ":]", collecting the property name along the way. 4323 UBool sawPropSetTerminator = FALSE; 4324 for (;;) { 4325 propName.append(fC.fChar); 4326 nextChar(fC); 4327 if (fC.fQuoted || fC.fChar == -1) { 4328 // Escaped characters or end of input - either says this isn't a [:Property:] 4329 break; 4330 } 4331 if (fC.fChar == chColon) { 4332 nextChar(fC); 4333 if (fC.fChar == chRBracket) { 4334 sawPropSetTerminator = TRUE; 4335 } 4336 break; 4337 } 4338 } 4339 4340 if (sawPropSetTerminator) { 4341 uset = createSetForProperty(propName, negated); 4342 } 4343 else 4344 { 4345 // No closing ":]". 4346 // Restore the original scan position. 4347 // The main scanner will retry the input as a normal set expression, 4348 // not a [:Property:] expression. 4349 fScanIndex = savedScanIndex; 4350 fQuoteMode = savedQuoteMode; 4351 fInBackslashQuote = savedInBackslashQuote; 4352 fEOLComments = savedEOLComments; 4353 fLineNum = savedLineNum; 4354 fCharNum = savedCharNum; 4355 fLastChar = savedLastChar; 4356 fPeekChar = savedPeekChar; 4357 fC = savedfC; 4358 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex); 4359 } 4360 return uset; 4361 } 4362 4363 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) { 4364 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f); 4365 addCategory(set, U_GC_CF_MASK, ec); 4366 } 4367 4368 // 4369 // Create a Unicode Set from a Unicode Property expression. 4370 // This is common code underlying both \p{...} ane [:...:] expressions. 4371 // Includes trying the Java "properties" that aren't supported as 4372 // normal ICU UnicodeSet properties 4373 // 4374 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) { 4375 4376 if (U_FAILURE(*fStatus)) { 4377 return nullptr; 4378 } 4379 LocalPointer<UnicodeSet> set; 4380 UErrorCode status = U_ZERO_ERROR; 4381 4382 do { // non-loop, exists to allow breaks from the block. 4383 // 4384 // First try the property as we received it 4385 // 4386 UnicodeString setExpr; 4387 uint32_t usetFlags = 0; 4388 setExpr.append(u"[\\p{", -1); 4389 setExpr.append(propName); 4390 setExpr.append(u"}]", -1); 4391 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 4392 usetFlags |= USET_CASE_INSENSITIVE; 4393 } 4394 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, NULL, status), status); 4395 if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) { 4396 break; 4397 } 4398 4399 // 4400 // The incoming property wasn't directly recognized by ICU. 4401 4402 // Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet. 4403 // Java accepts 'word' with mixed case. 4404 // Java accepts 'all' only in all lower case. 4405 4406 status = U_ZERO_ERROR; 4407 if (propName.caseCompare(u"word", -1, 0) == 0) { 4408 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])), status); 4409 break; 4410 } 4411 if (propName.compare(u"all", -1) == 0) { 4412 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status); 4413 break; 4414 } 4415 4416 4417 // Do Java InBlock expressions 4418 // 4419 UnicodeString mPropName = propName; 4420 if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) { 4421 status = U_ZERO_ERROR; 4422 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status); 4423 if (U_FAILURE(status)) { 4424 break; 4425 } 4426 UnicodeString blockName(mPropName, 2); // Property with the leading "In" removed. 4427 set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status); 4428 break; 4429 } 4430 4431 // Check for the Java form "IsBooleanPropertyValue", which we will recast 4432 // as "BooleanPropertyValue". The property value can be either a 4433 // a General Category or a Script Name. 4434 4435 if (propName.startsWith(u"Is", 2) && propName.length()>=3) { 4436 mPropName.remove(0, 2); // Strip the "Is" 4437 if (mPropName.indexOf(u'=') >= 0) { 4438 // Reject any "Is..." property expression containing an '=', that is, 4439 // any non-binary property expression. 4440 status = U_REGEX_PROPERTY_SYNTAX; 4441 break; 4442 } 4443 4444 if (mPropName.caseCompare(u"assigned", -1, 0) == 0) { 4445 mPropName.setTo(u"unassigned", -1); 4446 negated = !negated; 4447 } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) { 4448 mPropName.setTo(u"Titlecase_Letter", -1); 4449 } 4450 4451 mPropName.insert(0, u"[\\p{", -1); 4452 mPropName.append(u"}]", -1); 4453 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status); 4454 4455 if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) { 4456 set->closeOver(USET_CASE_INSENSITIVE); 4457 } 4458 break; 4459 4460 } 4461 4462 if (propName.startsWith(u"java", -1)) { 4463 status = U_ZERO_ERROR; 4464 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status); 4465 if (U_FAILURE(status)) { 4466 break; 4467 } 4468 // 4469 // Try the various Java specific properties. 4470 // These all begin with "java" 4471 // 4472 if (propName.compare(u"javaDefined", -1) == 0) { 4473 addCategory(set.getAlias(), U_GC_CN_MASK, status); 4474 set->complement(); 4475 } 4476 else if (propName.compare(u"javaDigit", -1) == 0) { 4477 addCategory(set.getAlias(), U_GC_ND_MASK, status); 4478 } 4479 else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) { 4480 addIdentifierIgnorable(set.getAlias(), status); 4481 } 4482 else if (propName.compare(u"javaISOControl", -1) == 0) { 4483 set->add(0, 0x1F).add(0x7F, 0x9F); 4484 } 4485 else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) { 4486 addCategory(set.getAlias(), U_GC_L_MASK, status); 4487 addCategory(set.getAlias(), U_GC_SC_MASK, status); 4488 addCategory(set.getAlias(), U_GC_PC_MASK, status); 4489 addCategory(set.getAlias(), U_GC_ND_MASK, status); 4490 addCategory(set.getAlias(), U_GC_NL_MASK, status); 4491 addCategory(set.getAlias(), U_GC_MC_MASK, status); 4492 addCategory(set.getAlias(), U_GC_MN_MASK, status); 4493 addIdentifierIgnorable(set.getAlias(), status); 4494 } 4495 else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) { 4496 addCategory(set.getAlias(), U_GC_L_MASK, status); 4497 addCategory(set.getAlias(), U_GC_NL_MASK, status); 4498 addCategory(set.getAlias(), U_GC_SC_MASK, status); 4499 addCategory(set.getAlias(), U_GC_PC_MASK, status); 4500 } 4501 else if (propName.compare(u"javaLetter", -1) == 0) { 4502 addCategory(set.getAlias(), U_GC_L_MASK, status); 4503 } 4504 else if (propName.compare(u"javaLetterOrDigit", -1) == 0) { 4505 addCategory(set.getAlias(), U_GC_L_MASK, status); 4506 addCategory(set.getAlias(), U_GC_ND_MASK, status); 4507 } 4508 else if (propName.compare(u"javaLowerCase", -1) == 0) { 4509 addCategory(set.getAlias(), U_GC_LL_MASK, status); 4510 } 4511 else if (propName.compare(u"javaMirrored", -1) == 0) { 4512 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status); 4513 } 4514 else if (propName.compare(u"javaSpaceChar", -1) == 0) { 4515 addCategory(set.getAlias(), U_GC_Z_MASK, status); 4516 } 4517 else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) { 4518 set->add(0x10000, UnicodeSet::MAX_VALUE); 4519 } 4520 else if (propName.compare(u"javaTitleCase", -1) == 0) { 4521 addCategory(set.getAlias(), U_GC_LT_MASK, status); 4522 } 4523 else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) { 4524 addCategory(set.getAlias(), U_GC_L_MASK, status); 4525 addCategory(set.getAlias(), U_GC_NL_MASK, status); 4526 } 4527 else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) { 4528 addCategory(set.getAlias(), U_GC_L_MASK, status); 4529 addCategory(set.getAlias(), U_GC_PC_MASK, status); 4530 addCategory(set.getAlias(), U_GC_ND_MASK, status); 4531 addCategory(set.getAlias(), U_GC_NL_MASK, status); 4532 addCategory(set.getAlias(), U_GC_MC_MASK, status); 4533 addCategory(set.getAlias(), U_GC_MN_MASK, status); 4534 addIdentifierIgnorable(set.getAlias(), status); 4535 } 4536 else if (propName.compare(u"javaUpperCase", -1) == 0) { 4537 addCategory(set.getAlias(), U_GC_LU_MASK, status); 4538 } 4539 else if (propName.compare(u"javaValidCodePoint", -1) == 0) { 4540 set->add(0, UnicodeSet::MAX_VALUE); 4541 } 4542 else if (propName.compare(u"javaWhitespace", -1) == 0) { 4543 addCategory(set.getAlias(), U_GC_Z_MASK, status); 4544 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f)); 4545 set->add(9, 0x0d).add(0x1c, 0x1f); 4546 } else { 4547 status = U_REGEX_PROPERTY_SYNTAX; 4548 } 4549 4550 if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) { 4551 set->closeOver(USET_CASE_INSENSITIVE); 4552 } 4553 break; 4554 } 4555 4556 // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility 4557 // extensions matched it. 4558 status = U_REGEX_PROPERTY_SYNTAX; 4559 } while (false); // End of do loop block. Code above breaks out of the block on success or hard failure. 4560 4561 if (U_SUCCESS(status)) { 4562 U_ASSERT(set.isValid()); 4563 if (negated) { 4564 set->complement(); 4565 } 4566 return set.orphan(); 4567 } else { 4568 if (status == U_ILLEGAL_ARGUMENT_ERROR) { 4569 status = U_REGEX_PROPERTY_SYNTAX; 4570 } 4571 error(status); 4572 return nullptr; 4573 } 4574 } 4575 4576 4577 // 4578 // SetEval Part of the evaluation of [set expressions]. 4579 // Perform any pending (stacked) operations with precedence 4580 // equal or greater to that of the next operator encountered 4581 // in the expression. 4582 // 4583 void RegexCompile::setEval(int32_t nextOp) { 4584 UnicodeSet *rightOperand = NULL; 4585 UnicodeSet *leftOperand = NULL; 4586 for (;;) { 4587 U_ASSERT(fSetOpStack.empty()==FALSE); 4588 int32_t pendingSetOperation = fSetOpStack.peeki(); 4589 if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) { 4590 break; 4591 } 4592 fSetOpStack.popi(); 4593 U_ASSERT(fSetStack.empty() == FALSE); 4594 rightOperand = (UnicodeSet *)fSetStack.peek(); 4595 switch (pendingSetOperation) { 4596 case setNegation: 4597 rightOperand->complement(); 4598 break; 4599 case setCaseClose: 4600 // TODO: need a simple close function. Ticket 6065 4601 rightOperand->closeOver(USET_CASE_INSENSITIVE); 4602 rightOperand->removeAllStrings(); 4603 break; 4604 case setDifference1: 4605 case setDifference2: 4606 fSetStack.pop(); 4607 leftOperand = (UnicodeSet *)fSetStack.peek(); 4608 leftOperand->removeAll(*rightOperand); 4609 delete rightOperand; 4610 break; 4611 case setIntersection1: 4612 case setIntersection2: 4613 fSetStack.pop(); 4614 leftOperand = (UnicodeSet *)fSetStack.peek(); 4615 leftOperand->retainAll(*rightOperand); 4616 delete rightOperand; 4617 break; 4618 case setUnion: 4619 fSetStack.pop(); 4620 leftOperand = (UnicodeSet *)fSetStack.peek(); 4621 leftOperand->addAll(*rightOperand); 4622 delete rightOperand; 4623 break; 4624 default: 4625 U_ASSERT(FALSE); 4626 break; 4627 } 4628 } 4629 } 4630 4631 void RegexCompile::setPushOp(int32_t op) { 4632 setEval(op); 4633 fSetOpStack.push(op, *fStatus); 4634 fSetStack.push(new UnicodeSet(), *fStatus); 4635 } 4636 4637 U_NAMESPACE_END 4638 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 4639 4640