1 // 2 // file: regexcmp.cpp 3 // 4 // Copyright (C) 2002-2010 International Business Machines Corporation and others. 5 // All Rights Reserved. 6 // 7 // This file contains the ICU regular expression compiler, which is responsible 8 // for processing a regular expression pattern into the compiled form that 9 // is used by the match finding engine. 10 // 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 15 16 #include "unicode/ustring.h" 17 #include "unicode/unistr.h" 18 #include "unicode/uniset.h" 19 #include "unicode/uchar.h" 20 #include "unicode/uchriter.h" 21 #include "unicode/parsepos.h" 22 #include "unicode/parseerr.h" 23 #include "unicode/regex.h" 24 #include "util.h" 25 #include "putilimp.h" 26 #include "cmemory.h" 27 #include "cstring.h" 28 #include "uvectr32.h" 29 #include "uvectr64.h" 30 #include "uassert.h" 31 #include "ucln_in.h" 32 #include "uinvchar.h" 33 34 #include "regeximp.h" 35 #include "regexcst.h" // Contains state table for the regex pattern parser. 36 // generated by a Perl script. 37 #include "regexcmp.h" 38 #include "regexst.h" 39 #include "regextxt.h" 40 41 42 43 U_NAMESPACE_BEGIN 44 45 46 //------------------------------------------------------------------------------ 47 // 48 // Constructor. 49 // 50 //------------------------------------------------------------------------------ 51 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : 52 fParenStack(status), fSetStack(status), fSetOpStack(status) 53 { 54 // Lazy init of all shared global sets (needed for init()'s empty text) 55 RegexStaticSets::initGlobals(&status); 56 57 fStatus = &status; 58 59 fRXPat = rxp; 60 fScanIndex = 0; 61 fPeekChar = -1; 62 fLineNum = 1; 63 fCharNum = 0; 64 fQuoteMode = FALSE; 65 fInBackslashQuote = FALSE; 66 fModeFlags = fRXPat->fFlags | 0x80000000; 67 fEOLComments = TRUE; 68 69 fMatchOpenParen = -1; 70 fMatchCloseParen = -1; 71 fStringOpStart = -1; 72 73 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) { 74 status = rxp->fDeferredStatus; 75 } 76 } 77 78 static const UChar chAmp = 0x26; // '&' 79 static const UChar chDash = 0x2d; // '-' 80 81 82 //------------------------------------------------------------------------------ 83 // 84 // Destructor 85 // 86 //------------------------------------------------------------------------------ 87 RegexCompile::~RegexCompile() { 88 } 89 90 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) { 91 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec)); 92 } 93 94 //------------------------------------------------------------------------------ 95 // 96 // Compile regex pattern. The state machine for rexexp pattern parsing is here. 97 // The state tables are hand-written in the file regexcst.txt, 98 // and converted to the form used here by a perl 99 // script regexcst.pl 100 // 101 //------------------------------------------------------------------------------ 102 void RegexCompile::compile( 103 const UnicodeString &pat, // Source pat to be compiled. 104 UParseError &pp, // Error position info 105 UErrorCode &e) // Error Code 106 { 107 fRXPat->fPatternString = new UnicodeString(pat); 108 UText patternText = UTEXT_INITIALIZER; 109 utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e); 110 111 if (U_SUCCESS(e)) { 112 compile(&patternText, pp, e); 113 utext_close(&patternText); 114 } 115 } 116 117 // 118 // compile, UText mode 119 // All the work is actually done here. 120 // 121 void RegexCompile::compile( 122 UText *pat, // Source pat to be compiled. 123 UParseError &pp, // Error position info 124 UErrorCode &e) // Error Code 125 { 126 fStatus = &e; 127 fParseErr = &pp; 128 fStackPtr = 0; 129 fStack[fStackPtr] = 0; 130 131 if (U_FAILURE(*fStatus)) { 132 return; 133 } 134 135 // There should be no pattern stuff in the RegexPattern object. They can not be reused. 136 U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0); 137 138 // Prepare the RegexPattern object to receive the compiled pattern. 139 fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus); 140 fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets; 141 fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8; 142 143 144 // Initialize the pattern scanning state machine 145 fPatternLength = utext_nativeLength(pat); 146 uint16_t state = 1; 147 const RegexTableEl *tableEl; 148 nextChar(fC); // Fetch the first char from the pattern string. 149 150 // 151 // Main loop for the regex pattern parsing state machine. 152 // Runs once per state transition. 153 // Each time through optionally performs, depending on the state table, 154 // - an advance to the the next pattern char 155 // - an action to be performed. 156 // - pushing or popping a state to/from the local state return stack. 157 // file regexcst.txt is the source for the state table. The logic behind 158 // recongizing the pattern syntax is there, not here. 159 // 160 for (;;) { 161 // Bail out if anything has gone wrong. 162 // Regex pattern parsing stops on the first error encountered. 163 if (U_FAILURE(*fStatus)) { 164 break; 165 } 166 167 U_ASSERT(state != 0); 168 169 // Find the state table element that matches the input char from the pattern, or the 170 // class of the input character. Start with the first table row for this 171 // state, then linearly scan forward until we find a row that matches the 172 // character. The last row for each state always matches all characters, so 173 // the search will stop there, if not before. 174 // 175 tableEl = &gRuleParseStateTable[state]; 176 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ", 177 fC.fChar, fLineNum, fCharNum, RegexStateNames[state])); 178 179 for (;;) { // loop through table rows belonging to this state, looking for one 180 // that matches the current input char. 181 REGEX_SCAN_DEBUG_PRINTF((".")); 182 if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) { 183 // Table row specified an individual character, not a set, and 184 // the input character is not quoted, and 185 // the input character matched it. 186 break; 187 } 188 if (tableEl->fCharClass == 255) { 189 // Table row specified default, match anything character class. 190 break; 191 } 192 if (tableEl->fCharClass == 254 && fC.fQuoted) { 193 // Table row specified "quoted" and the char was quoted. 194 break; 195 } 196 if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) { 197 // Table row specified eof and we hit eof on the input. 198 break; 199 } 200 201 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class && 202 fC.fQuoted == FALSE && // char is not escaped && 203 fC.fChar != (UChar32)-1) { // char is not EOF 204 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) { 205 // Table row specified a character class, or set of characters, 206 // and the current char matches it. 207 break; 208 } 209 } 210 211 // No match on this row, advance to the next row for this state, 212 tableEl++; 213 } 214 REGEX_SCAN_DEBUG_PRINTF(("\n")); 215 216 // 217 // We've found the row of the state table that matches the current input 218 // character from the rules string. 219 // Perform any action specified by this row in the state table. 220 if (doParseActions(tableEl->fAction) == FALSE) { 221 // Break out of the state machine loop if the 222 // the action signalled some kind of error, or 223 // the action was to exit, occurs on normal end-of-rules-input. 224 break; 225 } 226 227 if (tableEl->fPushState != 0) { 228 fStackPtr++; 229 if (fStackPtr >= kStackSize) { 230 error(U_REGEX_INTERNAL_ERROR); 231 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n")); 232 fStackPtr--; 233 } 234 fStack[fStackPtr] = tableEl->fPushState; 235 } 236 237 // 238 // NextChar. This is where characters are actually fetched from the pattern. 239 // Happens under control of the 'n' tag in the state table. 240 // 241 if (tableEl->fNextChar) { 242 nextChar(fC); 243 } 244 245 // Get the next state from the table entry, or from the 246 // state stack if the next state was specified as "pop". 247 if (tableEl->fNextState != 255) { 248 state = tableEl->fNextState; 249 } else { 250 state = fStack[fStackPtr]; 251 fStackPtr--; 252 if (fStackPtr < 0) { 253 // state stack underflow 254 // This will occur if the user pattern has mis-matched parentheses, 255 // with extra close parens. 256 // 257 fStackPtr++; 258 error(U_REGEX_MISMATCHED_PAREN); 259 } 260 } 261 262 } 263 264 if (U_FAILURE(*fStatus)) { 265 // Bail out if the pattern had errors. 266 // Set stack cleanup: a successful compile would have left it empty, 267 // but errors can leave temporary sets hanging around. 268 while (!fSetStack.empty()) { 269 delete (UnicodeSet *)fSetStack.pop(); 270 } 271 return; 272 } 273 274 // 275 // The pattern has now been read and processed, and the compiled code generated. 276 // 277 278 // 279 // Compute the number of digits requried for the largest capture group number. 280 // 281 fRXPat->fMaxCaptureDigits = 1; 282 int32_t n = 10; 283 int32_t groupCount = fRXPat->fGroupMap->size(); 284 while (n <= groupCount) { 285 fRXPat->fMaxCaptureDigits++; 286 n *= 10; 287 } 288 289 // 290 // The pattern's fFrameSize so far has accumulated the requirements for 291 // storage for capture parentheses, counters, etc. that are encountered 292 // in the pattern. Add space for the two variables that are always 293 // present in the saved state: the input string position (int64_t) and 294 // the position in the compiled pattern. 295 // 296 fRXPat->fFrameSize+=RESTACKFRAME_HDRCOUNT; 297 298 // 299 // Optimization pass 1: NOPs, back-references, and case-folding 300 // 301 stripNOPs(); 302 303 // 304 // Get bounds for the minimum and maximum length of a string that this 305 // pattern can match. Used to avoid looking for matches in strings that 306 // are too short. 307 // 308 fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1); 309 310 // 311 // Optimization pass 2: match start type 312 // 313 matchStartType(); 314 315 // 316 // Set up fast latin-1 range sets 317 // 318 int32_t numSets = fRXPat->fSets->size(); 319 fRXPat->fSets8 = new Regex8BitSet[numSets]; 320 // Null pointer check. 321 if (fRXPat->fSets8 == NULL) { 322 e = *fStatus = U_MEMORY_ALLOCATION_ERROR; 323 return; 324 } 325 int32_t i; 326 for (i=0; i<numSets; i++) { 327 UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i); 328 fRXPat->fSets8[i].init(s); 329 } 330 331 } 332 333 334 335 336 337 //------------------------------------------------------------------------------ 338 // 339 // doParseAction Do some action during regex pattern parsing. 340 // Called by the parse state machine. 341 // 342 // Generation of the match engine PCode happens here, or 343 // in functions called from the parse actions defined here. 344 // 345 // 346 //------------------------------------------------------------------------------ 347 UBool RegexCompile::doParseActions(int32_t action) 348 { 349 UBool returnVal = TRUE; 350 351 switch ((Regex_PatternParseAction)action) { 352 353 case doPatStart: 354 // Start of pattern compiles to: 355 //0 SAVE 2 Fall back to position of FAIL 356 //1 jmp 3 357 //2 FAIL Stop if we ever reach here. 358 //3 NOP Dummy, so start of pattern looks the same as 359 // the start of an ( grouping. 360 //4 NOP Resreved, will be replaced by a save if there are 361 // OR | operators at the top level 362 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus); 363 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus); 364 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus); 365 366 // Standard open nonCapture paren action emits the two NOPs and 367 // sets up the paren stack frame. 368 doParseActions(doOpenNonCaptureParen); 369 break; 370 371 case doPatFinish: 372 // We've scanned to the end of the pattern 373 // The end of pattern compiles to: 374 // URX_END 375 // which will stop the runtime match engine. 376 // Encountering end of pattern also behaves like a close paren, 377 // and forces fixups of the State Save at the beginning of the compiled pattern 378 // and of any OR operations at the top level. 379 // 380 handleCloseParen(); 381 if (fParenStack.size() > 0) { 382 // Missing close paren in pattern. 383 error(U_REGEX_MISMATCHED_PAREN); 384 } 385 386 // add the END operation to the compiled pattern. 387 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus); 388 389 // Terminate the pattern compilation state machine. 390 returnVal = FALSE; 391 break; 392 393 394 395 case doOrOperator: 396 // Scanning a '|', as in (A|B) 397 { 398 // Insert a SAVE operation at the start of the pattern section preceding 399 // this OR at this level. This SAVE will branch the match forward 400 // to the right hand side of the OR in the event that the left hand 401 // side fails to match and backtracks. Locate the position for the 402 // save from the location on the top of the parentheses stack. 403 int32_t savePosition = fParenStack.popi(); 404 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition); 405 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location 406 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); 407 fRXPat->fCompiledPat->setElementAt(op, savePosition); 408 409 // Append an JMP operation into the compiled pattern. The operand for 410 // the JMP will eventually be the location following the ')' for the 411 // group. This will be patched in later, when the ')' is encountered. 412 op = URX_BUILD(URX_JMP, 0); 413 fRXPat->fCompiledPat->addElement(op, *fStatus); 414 415 // Push the position of the newly added JMP op onto the parentheses stack. 416 // This registers if for fixup when this block's close paren is encountered. 417 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); 418 419 // Append a NOP to the compiled pattern. This is the slot reserved 420 // for a SAVE in the event that there is yet another '|' following 421 // this one. 422 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 423 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); 424 } 425 break; 426 427 428 case doOpenCaptureParen: 429 // Open Paren. 430 // Compile to a 431 // - NOP, which later may be replaced by a save-state if the 432 // parenthesized group gets a * quantifier, followed by 433 // - START_CAPTURE n where n is stack frame offset to the capture group variables. 434 // - NOP, which may later be replaced by a save-state if there 435 // is an '|' alternation within the parens. 436 // 437 // Each capture group gets three slots in the save stack frame: 438 // 0: Capture Group start position (in input string being matched.) 439 // 1: Capture Group end position. 440 // 2: Start of Match-in-progress. 441 // The first two locations are for a completed capture group, and are 442 // referred to by back references and the like. 443 // The third location stores the capture start position when an START_CAPTURE is 444 // encountered. This will be promoted to a completed capture when (and if) the corresponding 445 // END_CAPTURE is encountered. 446 { 447 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 448 int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots in match stack frame. 449 fRXPat->fFrameSize += 3; 450 int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc); 451 fRXPat->fCompiledPat->addElement(cop, *fStatus); 452 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 453 454 // On the Parentheses stack, start a new frame and add the postions 455 // of the two NOPs. Depending on what follows in the pattern, the 456 // NOPs may be changed to SAVE_STATE or JMP ops, with a target 457 // address of the end of the parenthesized group. 458 fParenStack.push(fModeFlags, *fStatus); // Match mode state 459 fParenStack.push(capturing, *fStatus); // Frame type. 460 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location 461 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc 462 463 // Save the mapping from group number to stack frame variable position. 464 fRXPat->fGroupMap->addElement(varsLoc, *fStatus); 465 } 466 break; 467 468 case doOpenNonCaptureParen: 469 // Open non-caputuring (grouping only) Paren. 470 // Compile to a 471 // - NOP, which later may be replaced by a save-state if the 472 // parenthesized group gets a * quantifier, followed by 473 // - NOP, which may later be replaced by a save-state if there 474 // is an '|' alternation within the parens. 475 { 476 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 477 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 478 479 // On the Parentheses stack, start a new frame and add the postions 480 // of the two NOPs. 481 fParenStack.push(fModeFlags, *fStatus); // Match mode state 482 fParenStack.push(plain, *fStatus); // Begin a new frame. 483 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 484 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc 485 } 486 break; 487 488 489 case doOpenAtomicParen: 490 // Open Atomic Paren. (?> 491 // Compile to a 492 // - NOP, which later may be replaced if the parenthesized group 493 // has a quantifier, followed by 494 // - STO_SP save state stack position, so it can be restored at the ")" 495 // - NOP, which may later be replaced by a save-state if there 496 // is an '|' alternation within the parens. 497 { 498 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 499 int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the 500 fRXPat->fDataSize += 1; // state stack ptr. 501 int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc); 502 fRXPat->fCompiledPat->addElement(stoOp, *fStatus); 503 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 504 505 // On the Parentheses stack, start a new frame and add the postions 506 // of the two NOPs. Depending on what follows in the pattern, the 507 // NOPs may be changed to SAVE_STATE or JMP ops, with a target 508 // address of the end of the parenthesized group. 509 fParenStack.push(fModeFlags, *fStatus); // Match mode state 510 fParenStack.push(atomic, *fStatus); // Frame type. 511 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP 512 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP 513 } 514 break; 515 516 517 case doOpenLookAhead: 518 // Positive Look-ahead (?= stuff ) 519 // 520 // Note: Addition of transparent input regions, with the need to 521 // restore the original regions when failing out of a lookahead 522 // block, complicated this sequence. Some conbined opcodes 523 // might make sense - or might not, lookahead aren't that common. 524 // 525 // Caution: min match length optimization knows about this 526 // sequence; don't change without making updates there too. 527 // 528 // Compiles to 529 // 1 START_LA dataLoc Saves SP, Input Pos 530 // 2. STATE_SAVE 4 on failure of lookahead, goto 4 531 // 3 JMP 6 continue ... 532 // 533 // 4. LA_END Look Ahead failed. Restore regions. 534 // 5. BACKTRACK and back track again. 535 // 536 // 6. NOP reserved for use by quantifiers on the block. 537 // Look-ahead can't have quantifiers, but paren stack 538 // compile time conventions require the slot anyhow. 539 // 7. NOP may be replaced if there is are '|' ops in the block. 540 // 8. code for parenthesized stuff. 541 // 9. LA_END 542 // 543 // Two data slots are reserved, for saving the stack ptr and the input position. 544 { 545 int32_t dataLoc = fRXPat->fDataSize; 546 fRXPat->fDataSize += 2; 547 int32_t op = URX_BUILD(URX_LA_START, dataLoc); 548 fRXPat->fCompiledPat->addElement(op, *fStatus); 549 550 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); 551 fRXPat->fCompiledPat->addElement(op, *fStatus); 552 553 op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3); 554 fRXPat->fCompiledPat->addElement(op, *fStatus); 555 556 op = URX_BUILD(URX_LA_END, dataLoc); 557 fRXPat->fCompiledPat->addElement(op, *fStatus); 558 559 op = URX_BUILD(URX_BACKTRACK, 0); 560 fRXPat->fCompiledPat->addElement(op, *fStatus); 561 562 op = URX_BUILD(URX_NOP, 0); 563 fRXPat->fCompiledPat->addElement(op, *fStatus); 564 fRXPat->fCompiledPat->addElement(op, *fStatus); 565 566 // On the Parentheses stack, start a new frame and add the postions 567 // of the NOPs. 568 fParenStack.push(fModeFlags, *fStatus); // Match mode state 569 fParenStack.push(lookAhead, *fStatus); // Frame type. 570 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 571 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location 572 } 573 break; 574 575 case doOpenLookAheadNeg: 576 // Negated Lookahead. (?! stuff ) 577 // Compiles to 578 // 1. START_LA dataloc 579 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state, 580 // // which continues with the match. 581 // 3. NOP // Std. Open Paren sequence, for possible '|' 582 // 4. code for parenthesized stuff. 583 // 5. END_LA // Cut back stack, remove saved state from step 2. 584 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails. 585 // 7. END_LA // Restore match region, in case look-ahead was using 586 // an alternate (transparent) region. 587 { 588 int32_t dataLoc = fRXPat->fDataSize; 589 fRXPat->fDataSize += 2; 590 int32_t op = URX_BUILD(URX_LA_START, dataLoc); 591 fRXPat->fCompiledPat->addElement(op, *fStatus); 592 593 op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patched later. 594 fRXPat->fCompiledPat->addElement(op, *fStatus); 595 596 op = URX_BUILD(URX_NOP, 0); 597 fRXPat->fCompiledPat->addElement(op, *fStatus); 598 599 // On the Parentheses stack, start a new frame and add the postions 600 // of the StateSave and NOP. 601 fParenStack.push(fModeFlags, *fStatus); // Match mode state 602 fParenStack.push(negLookAhead, *fStatus); // Frame type 603 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location 604 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location 605 606 // Instructions #5 - #7 will be added when the ')' is encountered. 607 } 608 break; 609 610 case doOpenLookBehind: 611 { 612 // Compile a (?<= look-behind open paren. 613 // 614 // Compiles to 615 // 0 URX_LB_START dataLoc 616 // 1 URX_LB_CONT dataLoc 617 // 2 MinMatchLen 618 // 3 MaxMatchLen 619 // 4 URX_NOP Standard '(' boilerplate. 620 // 5 URX_NOP Reserved slot for use with '|' ops within (block). 621 // 6 <code for LookBehind expression> 622 // 7 URX_LB_END dataLoc # Check match len, restore input len 623 // 8 URX_LA_END dataLoc # Restore stack, input pos 624 // 625 // Allocate a block of matcher data, to contain (when running a match) 626 // 0: Stack ptr on entry 627 // 1: Input Index on entry 628 // 2: Start index of match current match attempt. 629 // 3: Original Input String len. 630 631 // Allocate data space 632 int32_t dataLoc = fRXPat->fDataSize; 633 fRXPat->fDataSize += 4; 634 635 // Emit URX_LB_START 636 int32_t op = URX_BUILD(URX_LB_START, dataLoc); 637 fRXPat->fCompiledPat->addElement(op, *fStatus); 638 639 // Emit URX_LB_CONT 640 op = URX_BUILD(URX_LB_CONT, dataLoc); 641 fRXPat->fCompiledPat->addElement(op, *fStatus); 642 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later. 643 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later. 644 645 // Emit the NOP 646 op = URX_BUILD(URX_NOP, 0); 647 fRXPat->fCompiledPat->addElement(op, *fStatus); 648 fRXPat->fCompiledPat->addElement(op, *fStatus); 649 650 // On the Parentheses stack, start a new frame and add the postions 651 // of the URX_LB_CONT and the NOP. 652 fParenStack.push(fModeFlags, *fStatus); // Match mode state 653 fParenStack.push(lookBehind, *fStatus); // Frame type 654 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 655 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location 656 657 // The final two instructions will be added when the ')' is encountered. 658 } 659 660 break; 661 662 case doOpenLookBehindNeg: 663 { 664 // Compile a (?<! negated look-behind open paren. 665 // 666 // Compiles to 667 // 0 URX_LB_START dataLoc # Save entry stack, input len 668 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions 669 // 2 MinMatchLen 670 // 3 MaxMatchLen 671 // 4 continueLoc (9) 672 // 5 URX_NOP Standard '(' boilerplate. 673 // 6 URX_NOP Reserved slot for use with '|' ops within (block). 674 // 7 <code for LookBehind expression> 675 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL 676 // 9 ... 677 // 678 // Allocate a block of matcher data, to contain (when running a match) 679 // 0: Stack ptr on entry 680 // 1: Input Index on entry 681 // 2: Start index of match current match attempt. 682 // 3: Original Input String len. 683 684 // Allocate data space 685 int32_t dataLoc = fRXPat->fDataSize; 686 fRXPat->fDataSize += 4; 687 688 // Emit URX_LB_START 689 int32_t op = URX_BUILD(URX_LB_START, dataLoc); 690 fRXPat->fCompiledPat->addElement(op, *fStatus); 691 692 // Emit URX_LBN_CONT 693 op = URX_BUILD(URX_LBN_CONT, dataLoc); 694 fRXPat->fCompiledPat->addElement(op, *fStatus); 695 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later. 696 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later. 697 fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc. To be filled later. 698 699 // Emit the NOP 700 op = URX_BUILD(URX_NOP, 0); 701 fRXPat->fCompiledPat->addElement(op, *fStatus); 702 fRXPat->fCompiledPat->addElement(op, *fStatus); 703 704 // On the Parentheses stack, start a new frame and add the postions 705 // of the URX_LB_CONT and the NOP. 706 fParenStack.push(fModeFlags, *fStatus); // Match mode state 707 fParenStack.push(lookBehindN, *fStatus); // Frame type 708 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 709 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location 710 711 // The final two instructions will be added when the ')' is encountered. 712 } 713 break; 714 715 case doConditionalExpr: 716 // Conditionals such as (?(1)a:b) 717 case doPerlInline: 718 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them. 719 error(U_REGEX_UNIMPLEMENTED); 720 break; 721 722 723 case doCloseParen: 724 handleCloseParen(); 725 if (fParenStack.size() <= 0) { 726 // Extra close paren, or missing open paren. 727 error(U_REGEX_MISMATCHED_PAREN); 728 } 729 break; 730 731 case doNOP: 732 break; 733 734 735 case doBadOpenParenType: 736 case doRuleError: 737 error(U_REGEX_RULE_SYNTAX); 738 break; 739 740 741 case doMismatchedParenErr: 742 error(U_REGEX_MISMATCHED_PAREN); 743 break; 744 745 case doPlus: 746 // Normal '+' compiles to 747 // 1. stuff to be repeated (already built) 748 // 2. jmp-sav 1 749 // 3. ... 750 // 751 // Or, if the item to be repeated can match a zero length string, 752 // 1. STO_INP_LOC data-loc 753 // 2. body of stuff to be repeated 754 // 3. JMP_SAV_X 2 755 // 4. ... 756 757 // 758 // Or, if the item to be repeated is simple 759 // 1. Item to be repeated. 760 // 2. LOOP_SR_I set number (assuming repeated item is a set ref) 761 // 3. LOOP_C stack location 762 { 763 int32_t topLoc = blockTopLoc(FALSE); // location of item #1 764 int32_t frameLoc; 765 766 // Check for simple constructs, which may get special optimized code. 767 if (topLoc == fRXPat->fCompiledPat->size() - 1) { 768 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc); 769 770 if (URX_TYPE(repeatedOp) == URX_SETREF) { 771 // Emit optimized code for [char set]+ 772 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp)); 773 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); 774 frameLoc = fRXPat->fFrameSize; 775 fRXPat->fFrameSize++; 776 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); 777 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); 778 break; 779 } 780 781 if (URX_TYPE(repeatedOp) == URX_DOTANY || 782 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || 783 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { 784 // Emit Optimized code for .+ operations. 785 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); 786 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { 787 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode. 788 loopOpI |= 1; 789 } 790 if (fModeFlags & UREGEX_UNIX_LINES) { 791 loopOpI |= 2; 792 } 793 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); 794 frameLoc = fRXPat->fFrameSize; 795 fRXPat->fFrameSize++; 796 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); 797 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); 798 break; 799 } 800 801 } 802 803 // General case. 804 805 // Check for minimum match length of zero, which requires 806 // extra loop-breaking code. 807 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) { 808 // Zero length match is possible. 809 // Emit the code sequence that can handle it. 810 insertOp(topLoc); 811 frameLoc = fRXPat->fFrameSize; 812 fRXPat->fFrameSize++; 813 814 int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc); 815 fRXPat->fCompiledPat->setElementAt(op, topLoc); 816 817 op = URX_BUILD(URX_JMP_SAV_X, topLoc+1); 818 fRXPat->fCompiledPat->addElement(op, *fStatus); 819 } else { 820 // Simpler code when the repeated body must match something non-empty 821 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc); 822 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); 823 } 824 } 825 break; 826 827 case doNGPlus: 828 // Non-greedy '+?' compiles to 829 // 1. stuff to be repeated (already built) 830 // 2. state-save 1 831 // 3. ... 832 { 833 int32_t topLoc = blockTopLoc(FALSE); 834 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc); 835 fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus); 836 } 837 break; 838 839 840 case doOpt: 841 // Normal (greedy) ? quantifier. 842 // Compiles to 843 // 1. state save 3 844 // 2. body of optional block 845 // 3. ... 846 // Insert the state save into the compiled pattern, and we're done. 847 { 848 int32_t saveStateLoc = blockTopLoc(TRUE); 849 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()); 850 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); 851 } 852 break; 853 854 case doNGOpt: 855 // Non-greedy ?? quantifier 856 // compiles to 857 // 1. jmp 4 858 // 2. body of optional block 859 // 3 jmp 5 860 // 4. state save 2 861 // 5 ... 862 // This code is less than ideal, with two jmps instead of one, because we can only 863 // insert one instruction at the top of the block being iterated. 864 { 865 int32_t jmp1_loc = blockTopLoc(TRUE); 866 int32_t jmp2_loc = fRXPat->fCompiledPat->size(); 867 868 int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1); 869 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); 870 871 int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2); 872 fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus); 873 874 int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1); 875 fRXPat->fCompiledPat->addElement(save_op, *fStatus); 876 } 877 break; 878 879 880 case doStar: 881 // Normal (greedy) * quantifier. 882 // Compiles to 883 // 1. STATE_SAVE 4 884 // 2. body of stuff being iterated over 885 // 3. JMP_SAV 2 886 // 4. ... 887 // 888 // Or, if the body is a simple [Set], 889 // 1. LOOP_SR_I set number 890 // 2. LOOP_C stack location 891 // ... 892 // 893 // Or if this is a .* 894 // 1. LOOP_DOT_I (. matches all mode flag) 895 // 2. LOOP_C stack location 896 // 897 // Or, if the body can match a zero-length string, to inhibit infinite loops, 898 // 1. STATE_SAVE 5 899 // 2. STO_INP_LOC data-loc 900 // 3. body of stuff 901 // 4. JMP_SAV_X 2 902 // 5. ... 903 { 904 // location of item #1, the STATE_SAVE 905 int32_t topLoc = blockTopLoc(FALSE); 906 int32_t dataLoc = -1; 907 908 // Check for simple *, where the construct being repeated 909 // compiled to single opcode, and might be optimizable. 910 if (topLoc == fRXPat->fCompiledPat->size() - 1) { 911 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc); 912 913 if (URX_TYPE(repeatedOp) == URX_SETREF) { 914 // Emit optimized code for a [char set]* 915 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp)); 916 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); 917 dataLoc = fRXPat->fFrameSize; 918 fRXPat->fFrameSize++; 919 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); 920 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); 921 break; 922 } 923 924 if (URX_TYPE(repeatedOp) == URX_DOTANY || 925 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || 926 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { 927 // Emit Optimized code for .* operations. 928 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); 929 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { 930 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode. 931 loopOpI |= 1; 932 } 933 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) { 934 loopOpI |= 2; 935 } 936 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); 937 dataLoc = fRXPat->fFrameSize; 938 fRXPat->fFrameSize++; 939 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); 940 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); 941 break; 942 } 943 } 944 945 // Emit general case code for this * 946 // The optimizations did not apply. 947 948 int32_t saveStateLoc = blockTopLoc(TRUE); 949 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1); 950 951 // Check for minimum match length of zero, which requires 952 // extra loop-breaking code. 953 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) { 954 insertOp(saveStateLoc); 955 dataLoc = fRXPat->fFrameSize; 956 fRXPat->fFrameSize++; 957 958 int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc); 959 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); 960 jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2); 961 } 962 963 // Locate the position in the compiled pattern where the match will continue 964 // after completing the *. (4 or 5 in the comment above) 965 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; 966 967 // Put together the save state op store it into the compiled code. 968 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); 969 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); 970 971 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern. 972 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); 973 } 974 break; 975 976 case doNGStar: 977 // Non-greedy *? quantifier 978 // compiles to 979 // 1. JMP 3 980 // 2. body of stuff being iterated over 981 // 3. STATE_SAVE 2 982 // 4 ... 983 { 984 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1. 985 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3. 986 int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc); 987 int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1); 988 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); 989 fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus); 990 } 991 break; 992 993 994 case doIntervalInit: 995 // The '{' opening an interval quantifier was just scanned. 996 // Init the counter varaiables that will accumulate the values as the digits 997 // are scanned. 998 fIntervalLow = 0; 999 fIntervalUpper = -1; 1000 break; 1001 1002 case doIntevalLowerDigit: 1003 // Scanned a digit from the lower value of an {lower,upper} interval 1004 { 1005 int32_t digitValue = u_charDigitValue(fC.fChar); 1006 U_ASSERT(digitValue >= 0); 1007 fIntervalLow = fIntervalLow*10 + digitValue; 1008 if (fIntervalLow < 0) { 1009 error(U_REGEX_NUMBER_TOO_BIG); 1010 } 1011 } 1012 break; 1013 1014 case doIntervalUpperDigit: 1015 // Scanned a digit from the upper value of an {lower,upper} interval 1016 { 1017 if (fIntervalUpper < 0) { 1018 fIntervalUpper = 0; 1019 } 1020 int32_t digitValue = u_charDigitValue(fC.fChar); 1021 U_ASSERT(digitValue >= 0); 1022 fIntervalUpper = fIntervalUpper*10 + digitValue; 1023 if (fIntervalUpper < 0) { 1024 error(U_REGEX_NUMBER_TOO_BIG); 1025 } 1026 } 1027 break; 1028 1029 case doIntervalSame: 1030 // Scanned a single value interval like {27}. Upper = Lower. 1031 fIntervalUpper = fIntervalLow; 1032 break; 1033 1034 case doInterval: 1035 // Finished scanning a normal {lower,upper} interval. Generate the code for it. 1036 if (compileInlineInterval() == FALSE) { 1037 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); 1038 } 1039 break; 1040 1041 case doPossessiveInterval: 1042 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it. 1043 { 1044 // Remember the loc for the top of the block being looped over. 1045 // (Can not reserve a slot in the compiled pattern at this time, because 1046 // compileInterval needs to reserve also, and blockTopLoc can only reserve 1047 // once per block.) 1048 int32_t topLoc = blockTopLoc(FALSE); 1049 1050 // Produce normal looping code. 1051 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); 1052 1053 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP 1054 // just as if the loop was inclosed in atomic parentheses. 1055 1056 // First the STO_SP before the start of the loop 1057 insertOp(topLoc); 1058 int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the 1059 fRXPat->fDataSize += 1; // state stack ptr. 1060 int32_t op = URX_BUILD(URX_STO_SP, varLoc); 1061 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1062 1063 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); 1064 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc); 1065 loopOp++; // point LoopOp after the just-inserted STO_SP 1066 fRXPat->fCompiledPat->push(loopOp, *fStatus); 1067 1068 // Then the LD_SP after the end of the loop 1069 op = URX_BUILD(URX_LD_SP, varLoc); 1070 fRXPat->fCompiledPat->addElement(op, *fStatus); 1071 } 1072 1073 break; 1074 1075 case doNGInterval: 1076 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it. 1077 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG); 1078 break; 1079 1080 case doIntervalError: 1081 error(U_REGEX_BAD_INTERVAL); 1082 break; 1083 1084 case doLiteralChar: 1085 // We've just scanned a "normal" character from the pattern, 1086 literalChar(fC.fChar); 1087 break; 1088 1089 1090 case doEscapedLiteralChar: 1091 // We've just scanned an backslashed escaped character with no 1092 // special meaning. It represents itself. 1093 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && 1094 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] 1095 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] 1096 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1097 } 1098 literalChar(fC.fChar); 1099 break; 1100 1101 1102 case doDotAny: 1103 // scanned a ".", match any single character. 1104 { 1105 int32_t op; 1106 if (fModeFlags & UREGEX_DOTALL) { 1107 op = URX_BUILD(URX_DOTANY_ALL, 0); 1108 } else if (fModeFlags & UREGEX_UNIX_LINES) { 1109 op = URX_BUILD(URX_DOTANY_UNIX, 0); 1110 } else { 1111 op = URX_BUILD(URX_DOTANY, 0); 1112 } 1113 fRXPat->fCompiledPat->addElement(op, *fStatus); 1114 } 1115 break; 1116 1117 case doCaret: 1118 { 1119 int32_t op = 0; 1120 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1121 op = URX_CARET; 1122 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1123 op = URX_CARET_M; 1124 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1125 op = URX_CARET; // Only testing true start of input. 1126 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1127 op = URX_CARET_M_UNIX; 1128 } 1129 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); 1130 } 1131 break; 1132 1133 case doDollar: 1134 { 1135 int32_t op = 0; 1136 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1137 op = URX_DOLLAR; 1138 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1139 op = URX_DOLLAR_M; 1140 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1141 op = URX_DOLLAR_D; 1142 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1143 op = URX_DOLLAR_MD; 1144 } 1145 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); 1146 } 1147 break; 1148 1149 case doBackslashA: 1150 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus); 1151 break; 1152 1153 case doBackslashB: 1154 { 1155 #if UCONFIG_NO_BREAK_ITERATION==1 1156 if (fModeFlags & UREGEX_UWORD) { 1157 error(U_UNSUPPORTED_ERROR); 1158 } 1159 #endif 1160 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B; 1161 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus); 1162 } 1163 break; 1164 1165 case doBackslashb: 1166 { 1167 #if UCONFIG_NO_BREAK_ITERATION==1 1168 if (fModeFlags & UREGEX_UWORD) { 1169 error(U_UNSUPPORTED_ERROR); 1170 } 1171 #endif 1172 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B; 1173 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); 1174 } 1175 break; 1176 1177 case doBackslashD: 1178 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus); 1179 break; 1180 1181 case doBackslashd: 1182 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus); 1183 break; 1184 1185 case doBackslashG: 1186 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus); 1187 break; 1188 1189 case doBackslashS: 1190 fRXPat->fCompiledPat->addElement( 1191 URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus); 1192 break; 1193 1194 case doBackslashs: 1195 fRXPat->fCompiledPat->addElement( 1196 URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus); 1197 break; 1198 1199 case doBackslashW: 1200 fRXPat->fCompiledPat->addElement( 1201 URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus); 1202 break; 1203 1204 case doBackslashw: 1205 fRXPat->fCompiledPat->addElement( 1206 URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus); 1207 break; 1208 1209 case doBackslashX: 1210 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus); 1211 break; 1212 1213 1214 case doBackslashZ: 1215 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus); 1216 break; 1217 1218 case doBackslashz: 1219 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus); 1220 break; 1221 1222 case doEscapeError: 1223 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1224 break; 1225 1226 case doExit: 1227 returnVal = FALSE; 1228 break; 1229 1230 case doProperty: 1231 { 1232 UnicodeSet *theSet = scanProp(); 1233 compileSet(theSet); 1234 } 1235 break; 1236 1237 case doNamedChar: 1238 { 1239 UChar32 c = scanNamedChar(); 1240 literalChar(c); 1241 } 1242 break; 1243 1244 1245 case doBackRef: 1246 // BackReference. Somewhat unusual in that the front-end can not completely parse 1247 // the regular expression, because the number of digits to be consumed 1248 // depends on the number of capture groups that have been defined. So 1249 // we have to do it here instead. 1250 { 1251 int32_t numCaptureGroups = fRXPat->fGroupMap->size(); 1252 int32_t groupNum = 0; 1253 UChar32 c = fC.fChar; 1254 1255 for (;;) { 1256 // Loop once per digit, for max allowed number of digits in a back reference. 1257 int32_t digit = u_charDigitValue(c); 1258 groupNum = groupNum * 10 + digit; 1259 if (groupNum >= numCaptureGroups) { 1260 break; 1261 } 1262 c = peekCharLL(); 1263 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) { 1264 break; 1265 } 1266 nextCharLL(); 1267 } 1268 1269 // Scan of the back reference in the source regexp is complete. Now generate 1270 // the compiled code for it. 1271 // Because capture groups can be forward-referenced by back-references, 1272 // we fill the operand with the capture group number. At the end 1273 // of compilation, it will be changed to the variable's location. 1274 U_ASSERT(groupNum > 0); 1275 int32_t op; 1276 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1277 op = URX_BUILD(URX_BACKREF_I, groupNum); 1278 } else { 1279 op = URX_BUILD(URX_BACKREF, groupNum); 1280 } 1281 fRXPat->fCompiledPat->addElement(op, *fStatus); 1282 } 1283 break; 1284 1285 1286 case doPossessivePlus: 1287 // Possessive ++ quantifier. 1288 // Compiles to 1289 // 1. STO_SP 1290 // 2. body of stuff being iterated over 1291 // 3. STATE_SAVE 5 1292 // 4. JMP 2 1293 // 5. LD_SP 1294 // 6. ... 1295 // 1296 // Note: TODO: This is pretty inefficient. A mass of saved state is built up 1297 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056 1298 // 1299 { 1300 // Emit the STO_SP 1301 int32_t topLoc = blockTopLoc(TRUE); 1302 int32_t stoLoc = fRXPat->fDataSize; 1303 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr. 1304 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); 1305 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1306 1307 // Emit the STATE_SAVE 1308 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); 1309 fRXPat->fCompiledPat->addElement(op, *fStatus); 1310 1311 // Emit the JMP 1312 op = URX_BUILD(URX_JMP, topLoc+1); 1313 fRXPat->fCompiledPat->addElement(op, *fStatus); 1314 1315 // Emit the LD_SP 1316 op = URX_BUILD(URX_LD_SP, stoLoc); 1317 fRXPat->fCompiledPat->addElement(op, *fStatus); 1318 } 1319 break; 1320 1321 case doPossessiveStar: 1322 // Possessive *+ quantifier. 1323 // Compiles to 1324 // 1. STO_SP loc 1325 // 2. STATE_SAVE 5 1326 // 3. body of stuff being iterated over 1327 // 4. JMP 2 1328 // 5. LD_SP loc 1329 // 6 ... 1330 // TODO: do something to cut back the state stack each time through the loop. 1331 { 1332 // Reserve two slots at the top of the block. 1333 int32_t topLoc = blockTopLoc(TRUE); 1334 insertOp(topLoc); 1335 1336 // emit STO_SP loc 1337 int32_t stoLoc = fRXPat->fDataSize; 1338 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr. 1339 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); 1340 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1341 1342 // Emit the SAVE_STATE 5 1343 int32_t L7 = fRXPat->fCompiledPat->size()+1; 1344 op = URX_BUILD(URX_STATE_SAVE, L7); 1345 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); 1346 1347 // Append the JMP operation. 1348 op = URX_BUILD(URX_JMP, topLoc+1); 1349 fRXPat->fCompiledPat->addElement(op, *fStatus); 1350 1351 // Emit the LD_SP loc 1352 op = URX_BUILD(URX_LD_SP, stoLoc); 1353 fRXPat->fCompiledPat->addElement(op, *fStatus); 1354 } 1355 break; 1356 1357 case doPossessiveOpt: 1358 // Possessive ?+ quantifier. 1359 // Compiles to 1360 // 1. STO_SP loc 1361 // 2. SAVE_STATE 5 1362 // 3. body of optional block 1363 // 4. LD_SP loc 1364 // 5. ... 1365 // 1366 { 1367 // Reserve two slots at the top of the block. 1368 int32_t topLoc = blockTopLoc(TRUE); 1369 insertOp(topLoc); 1370 1371 // Emit the STO_SP 1372 int32_t stoLoc = fRXPat->fDataSize; 1373 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr. 1374 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); 1375 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1376 1377 // Emit the SAVE_STATE 1378 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; 1379 op = URX_BUILD(URX_STATE_SAVE, continueLoc); 1380 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); 1381 1382 // Emit the LD_SP 1383 op = URX_BUILD(URX_LD_SP, stoLoc); 1384 fRXPat->fCompiledPat->addElement(op, *fStatus); 1385 } 1386 break; 1387 1388 1389 case doBeginMatchMode: 1390 fNewModeFlags = fModeFlags; 1391 fSetModeFlag = TRUE; 1392 break; 1393 1394 case doMatchMode: // (?i) and similar 1395 { 1396 int32_t bit = 0; 1397 switch (fC.fChar) { 1398 case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break; 1399 case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break; 1400 case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break; 1401 case 0x73: /* 's' */ bit = UREGEX_DOTALL; break; 1402 case 0x75: /* 'u' */ bit = 0; /* Unicode casing */ break; 1403 case 0x77: /* 'w' */ bit = UREGEX_UWORD; break; 1404 case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break; 1405 case 0x2d: /* '-' */ fSetModeFlag = FALSE; break; 1406 default: 1407 U_ASSERT(FALSE); // Should never happen. Other chars are filtered out 1408 // by the scanner. 1409 } 1410 if (fSetModeFlag) { 1411 fNewModeFlags |= bit; 1412 } else { 1413 fNewModeFlags &= ~bit; 1414 } 1415 } 1416 break; 1417 1418 case doSetMatchMode: 1419 // We've got a (?i) or similar. The match mode is being changed, but 1420 // the change is not scoped to a parenthesized block. 1421 U_ASSERT(fNewModeFlags < 0); 1422 fModeFlags = fNewModeFlags; 1423 1424 // Prevent any string from spanning across the change of match mode. 1425 // Otherwise the pattern "abc(?i)def" would make a single string of "abcdef" 1426 fixLiterals(); 1427 break; 1428 1429 1430 case doMatchModeParen: 1431 // We've got a (?i: or similar. Begin a parenthesized block, save old 1432 // mode flags so they can be restored at the close of the block. 1433 // 1434 // Compile to a 1435 // - NOP, which later may be replaced by a save-state if the 1436 // parenthesized group gets a * quantifier, followed by 1437 // - NOP, which may later be replaced by a save-state if there 1438 // is an '|' alternation within the parens. 1439 { 1440 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 1441 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 1442 1443 // On the Parentheses stack, start a new frame and add the postions 1444 // of the two NOPs (a normal non-capturing () frame, except for the 1445 // saving of the orignal mode flags.) 1446 fParenStack.push(fModeFlags, *fStatus); 1447 fParenStack.push(flags, *fStatus); // Frame Marker 1448 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP 1449 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP 1450 1451 // Set the current mode flags to the new values. 1452 U_ASSERT(fNewModeFlags < 0); 1453 fModeFlags = fNewModeFlags; 1454 } 1455 break; 1456 1457 case doBadModeFlag: 1458 error(U_REGEX_INVALID_FLAG); 1459 break; 1460 1461 case doSuppressComments: 1462 // We have just scanned a '(?'. We now need to prevent the character scanner from 1463 // treating a '#' as a to-the-end-of-line comment. 1464 // (This Perl compatibility just gets uglier and uglier to do...) 1465 fEOLComments = FALSE; 1466 break; 1467 1468 1469 case doSetAddAmp: 1470 { 1471 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1472 set->add(chAmp); 1473 } 1474 break; 1475 1476 case doSetAddDash: 1477 { 1478 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1479 set->add(chDash); 1480 } 1481 break; 1482 1483 case doSetBackslash_s: 1484 { 1485 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1486 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); 1487 break; 1488 } 1489 1490 case doSetBackslash_S: 1491 { 1492 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1493 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); 1494 SSet.complement(); 1495 set->addAll(SSet); 1496 break; 1497 } 1498 1499 case doSetBackslash_d: 1500 { 1501 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1502 // TODO - make a static set, ticket 6058. 1503 addCategory(set, U_GC_ND_MASK, *fStatus); 1504 break; 1505 } 1506 1507 case doSetBackslash_D: 1508 { 1509 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1510 UnicodeSet digits; 1511 // TODO - make a static set, ticket 6058. 1512 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus); 1513 digits.complement(); 1514 set->addAll(digits); 1515 break; 1516 } 1517 1518 case doSetBackslash_w: 1519 { 1520 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1521 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); 1522 break; 1523 } 1524 1525 case doSetBackslash_W: 1526 { 1527 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1528 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); 1529 SSet.complement(); 1530 set->addAll(SSet); 1531 break; 1532 } 1533 1534 case doSetBegin: 1535 fSetStack.push(new UnicodeSet(), *fStatus); 1536 fSetOpStack.push(setStart, *fStatus); 1537 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1538 fSetOpStack.push(setCaseClose, *fStatus); 1539 } 1540 break; 1541 1542 case doSetBeginDifference1: 1543 // We have scanned something like [[abc]-[ 1544 // Set up a new UnicodeSet for the set beginning with the just-scanned '[' 1545 // Push a Difference operator, which will cause the new set to be subtracted from what 1546 // went before once it is created. 1547 setPushOp(setDifference1); 1548 fSetOpStack.push(setStart, *fStatus); 1549 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1550 fSetOpStack.push(setCaseClose, *fStatus); 1551 } 1552 break; 1553 1554 case doSetBeginIntersection1: 1555 // We have scanned something like [[abc]&[ 1556 // Need both the '&' operator and the open '[' operator. 1557 setPushOp(setIntersection1); 1558 fSetOpStack.push(setStart, *fStatus); 1559 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1560 fSetOpStack.push(setCaseClose, *fStatus); 1561 } 1562 break; 1563 1564 case doSetBeginUnion: 1565 // We have scanned something like [[abc][ 1566 // Need to handle the union operation explicitly [[abc] | [ 1567 setPushOp(setUnion); 1568 fSetOpStack.push(setStart, *fStatus); 1569 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1570 fSetOpStack.push(setCaseClose, *fStatus); 1571 } 1572 break; 1573 1574 case doSetDifference2: 1575 // We have scanned something like [abc-- 1576 // Consider this to unambiguously be a set difference operator. 1577 setPushOp(setDifference2); 1578 break; 1579 1580 case doSetEnd: 1581 // Have encountered the ']' that closes a set. 1582 // Force the evaluation of any pending operations within this set, 1583 // leave the completed set on the top of the set stack. 1584 setEval(setEnd); 1585 U_ASSERT(fSetOpStack.peeki()==setStart); 1586 fSetOpStack.popi(); 1587 break; 1588 1589 case doSetFinish: 1590 { 1591 // Finished a complete set expression, including all nested sets. 1592 // The close bracket has already triggered clearing out pending set operators, 1593 // the operator stack should be empty and the operand stack should have just 1594 // one entry, the result set. 1595 U_ASSERT(fSetOpStack.empty()); 1596 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop(); 1597 U_ASSERT(fSetStack.empty()); 1598 compileSet(theSet); 1599 break; 1600 } 1601 1602 case doSetIntersection2: 1603 // Have scanned something like [abc&& 1604 setPushOp(setIntersection2); 1605 break; 1606 1607 case doSetLiteral: 1608 // Union the just-scanned literal character into the set being built. 1609 // This operation is the highest precedence set operation, so we can always do 1610 // it immediately, without waiting to see what follows. It is necessary to perform 1611 // any pending '-' or '&' operation first, because these have the same precedence 1612 // as union-ing in a literal' 1613 { 1614 setEval(setUnion); 1615 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1616 s->add(fC.fChar); 1617 fLastSetLiteral = fC.fChar; 1618 break; 1619 } 1620 1621 case doSetLiteralEscaped: 1622 // A back-slash escaped literal character was encountered. 1623 // Processing is the same as with setLiteral, above, with the addition of 1624 // the optional check for errors on escaped ASCII letters. 1625 { 1626 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && 1627 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] 1628 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] 1629 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1630 } 1631 setEval(setUnion); 1632 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1633 s->add(fC.fChar); 1634 fLastSetLiteral = fC.fChar; 1635 break; 1636 } 1637 1638 case doSetNamedChar: 1639 // Scanning a \N{UNICODE CHARACTER NAME} 1640 // Aside from the source of the character, the processing is identical to doSetLiteral, 1641 // above. 1642 { 1643 UChar32 c = scanNamedChar(); 1644 setEval(setUnion); 1645 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1646 s->add(c); 1647 fLastSetLiteral = c; 1648 break; 1649 } 1650 1651 case doSetNamedRange: 1652 // We have scanned literal-\N{CHAR NAME}. Add the range to the set. 1653 // The left character is already in the set, and is saved in fLastSetLiteral. 1654 // The right side needs to be picked up, the scan is at the 'N'. 1655 // Lower Limit > Upper limit being an error matches both Java 1656 // and ICU UnicodeSet behavior. 1657 { 1658 UChar32 c = scanNamedChar(); 1659 if (U_SUCCESS(*fStatus) && fLastSetLiteral > c) { 1660 error(U_REGEX_INVALID_RANGE); 1661 } 1662 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1663 s->add(fLastSetLiteral, c); 1664 fLastSetLiteral = c; 1665 break; 1666 } 1667 1668 1669 case doSetNegate: 1670 // Scanned a '^' at the start of a set. 1671 // Push the negation operator onto the set op stack. 1672 // A twist for case-insensitive matching: 1673 // the case closure operation must happen _before_ negation. 1674 // But the case closure operation will already be on the stack if it's required. 1675 // This requires checking for case closure, and swapping the stack order 1676 // if it is present. 1677 { 1678 int32_t tosOp = fSetOpStack.peeki(); 1679 if (tosOp == setCaseClose) { 1680 fSetOpStack.popi(); 1681 fSetOpStack.push(setNegation, *fStatus); 1682 fSetOpStack.push(setCaseClose, *fStatus); 1683 } else { 1684 fSetOpStack.push(setNegation, *fStatus); 1685 } 1686 } 1687 break; 1688 1689 case doSetNoCloseError: 1690 error(U_REGEX_MISSING_CLOSE_BRACKET); 1691 break; 1692 1693 case doSetOpError: 1694 error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal. 1695 break; 1696 1697 case doSetPosixProp: 1698 { 1699 UnicodeSet *s = scanPosixProp(); 1700 if (s != NULL) { 1701 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); 1702 tos->addAll(*s); 1703 delete s; 1704 } // else error. scanProp() reported the error status already. 1705 } 1706 break; 1707 1708 case doSetProp: 1709 // Scanned a \p \P within [brackets]. 1710 { 1711 UnicodeSet *s = scanProp(); 1712 if (s != NULL) { 1713 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); 1714 tos->addAll(*s); 1715 delete s; 1716 } // else error. scanProp() reported the error status already. 1717 } 1718 break; 1719 1720 1721 case doSetRange: 1722 // We have scanned literal-literal. Add the range to the set. 1723 // The left character is already in the set, and is saved in fLastSetLiteral. 1724 // The right side is the current character. 1725 // Lower Limit > Upper limit being an error matches both Java 1726 // and ICU UnicodeSet behavior. 1727 { 1728 if (fLastSetLiteral > fC.fChar) { 1729 error(U_REGEX_INVALID_RANGE); 1730 } 1731 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1732 s->add(fLastSetLiteral, fC.fChar); 1733 break; 1734 } 1735 1736 1737 default: 1738 U_ASSERT(FALSE); 1739 error(U_REGEX_INTERNAL_ERROR); 1740 break; 1741 } 1742 1743 if (U_FAILURE(*fStatus)) { 1744 returnVal = FALSE; 1745 } 1746 1747 return returnVal; 1748 } 1749 1750 1751 1752 //------------------------------------------------------------------------------ 1753 // 1754 // literalChar We've encountered a literal character from the pattern, 1755 // or an escape sequence that reduces to a character. 1756 // Add it to the string containing all literal chars/strings from 1757 // the pattern. 1758 // If we are in a pattern string already, add the new char to it. 1759 // If we aren't in a pattern string, begin one now. 1760 // 1761 //------------------------------------------------------------------------------ 1762 void RegexCompile::literalChar(UChar32 c) { 1763 int32_t op; // An operation in the compiled pattern. 1764 int32_t opType; 1765 int32_t patternLoc; // A position in the compiled pattern. 1766 int32_t stringLen; 1767 1768 1769 // If the last thing compiled into the pattern was not a literal char, 1770 // force this new literal char to begin a new string, and not append to the previous. 1771 op = (int32_t)fRXPat->fCompiledPat->lastElementi(); 1772 opType = URX_TYPE(op); 1773 if (!(opType == URX_STRING_LEN || opType == URX_ONECHAR || opType == URX_ONECHAR_I)) { 1774 fixLiterals(); 1775 } 1776 1777 if (fStringOpStart == -1) { 1778 // First char of a string in the pattern. 1779 // Emit a OneChar op into the compiled pattern. 1780 emitONE_CHAR(c); 1781 1782 // Mark that we might actually be starting a string here 1783 fStringOpStart = fRXPat->fLiteralText.length(); 1784 return; 1785 } 1786 1787 op = (int32_t)fRXPat->fCompiledPat->lastElementi(); 1788 opType = URX_TYPE(op); 1789 U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN); 1790 1791 // If the most recently emitted op is a URX_ONECHAR, 1792 if (opType == URX_ONECHAR || opType == URX_ONECHAR_I) { 1793 if (U16_IS_TRAIL(c) && U16_IS_LEAD(URX_VAL(op))) { 1794 // The most recently emitted op is a ONECHAR that was the first half 1795 // of a surrogate pair. Update the ONECHAR's operand to be the 1796 // supplementary code point resulting from both halves of the pair. 1797 c = U16_GET_SUPPLEMENTARY(URX_VAL(op), c); 1798 op = URX_BUILD(opType, c); 1799 patternLoc = fRXPat->fCompiledPat->size() - 1; 1800 fRXPat->fCompiledPat->setElementAt(op, patternLoc); 1801 return; 1802 } 1803 1804 // The most recently emitted op is a ONECHAR. 1805 // We've now received another adjacent char. Change the ONECHAR op 1806 // to a string op. 1807 fRXPat->fLiteralText.append(URX_VAL(op)); 1808 1809 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1810 op = URX_BUILD(URX_STRING_I, fStringOpStart); 1811 } else { 1812 op = URX_BUILD(URX_STRING, fStringOpStart); 1813 } 1814 patternLoc = fRXPat->fCompiledPat->size() - 1; 1815 fRXPat->fCompiledPat->setElementAt(op, patternLoc); 1816 op = URX_BUILD(URX_STRING_LEN, 0); 1817 fRXPat->fCompiledPat->addElement(op, *fStatus); 1818 } 1819 1820 // We are adding onto an existing string 1821 fRXPat->fLiteralText.append(c); 1822 1823 // The pattern contains a URX_SRING / URX_STRING_LEN. Update the 1824 // string length to reflect the new char we just added to the string. 1825 stringLen = fRXPat->fLiteralText.length() - fStringOpStart; 1826 op = URX_BUILD(URX_STRING_LEN, stringLen); 1827 patternLoc = fRXPat->fCompiledPat->size() - 1; 1828 fRXPat->fCompiledPat->setElementAt(op, patternLoc); 1829 } 1830 1831 1832 1833 //------------------------------------------------------------------------------ 1834 // 1835 // emitONE_CHAR emit a ONE_CHAR op into the generated code. 1836 // Choose cased or uncased version, depending on the 1837 // match mode and whether the character itself is cased. 1838 // 1839 //------------------------------------------------------------------------------ 1840 void RegexCompile::emitONE_CHAR(UChar32 c) { 1841 int32_t op; 1842 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && 1843 u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { 1844 // We have a cased character, and are in case insensitive matching mode. 1845 //c = u_foldCase(c, U_FOLD_CASE_DEFAULT); // !!!: handled in stripNOPs() now 1846 op = URX_BUILD(URX_ONECHAR_I, c); 1847 } else { 1848 // Uncased char, or case sensitive match mode. 1849 // Either way, just generate a literal compare of the char. 1850 op = URX_BUILD(URX_ONECHAR, c); 1851 } 1852 fRXPat->fCompiledPat->addElement(op, *fStatus); 1853 } 1854 1855 1856 //------------------------------------------------------------------------------ 1857 // 1858 // fixLiterals When compiling something that can follow a literal 1859 // string in a pattern, we need to "fix" any preceding 1860 // string, which will cause any subsequent literals to 1861 // begin a new string, rather than appending to the 1862 // old one. 1863 // 1864 // Optionally, split the last char of the string off into 1865 // a single "ONE_CHAR" operation, so that quantifiers can 1866 // apply to that char alone. Example: abc* 1867 // The * must apply to the 'c' only. 1868 // 1869 //------------------------------------------------------------------------------ 1870 void RegexCompile::fixLiterals(UBool split) { 1871 int32_t stringStart = fStringOpStart; // start index of the current literal string 1872 int32_t op; // An op from/for the compiled pattern. 1873 int32_t opType; // An opcode type from the compiled pattern. 1874 int32_t stringLastCharIdx; 1875 UChar32 lastChar; 1876 int32_t stringNextToLastCharIdx; 1877 UChar32 nextToLastChar; 1878 int32_t stringLen; 1879 1880 fStringOpStart = -1; 1881 if (!split) { 1882 return; 1883 } 1884 1885 // Split: We need to ensure that the last item in the compiled pattern does 1886 // not refer to a literal string of more than one char. If it does, 1887 // separate the last char from the rest of the string. 1888 1889 // If the last operation from the compiled pattern is not a string, 1890 // nothing needs to be done 1891 op = (int32_t)fRXPat->fCompiledPat->lastElementi(); 1892 opType = URX_TYPE(op); 1893 if (opType != URX_STRING_LEN) { 1894 return; 1895 } 1896 stringLen = URX_VAL(op); 1897 1898 // 1899 // Find the position of the last code point in the string (might be a surrogate pair) 1900 // 1901 stringLastCharIdx = fRXPat->fLiteralText.length(); 1902 stringLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1); 1903 lastChar = fRXPat->fLiteralText.char32At(stringLastCharIdx); 1904 1905 // The string should always be at least two code points long, meaning that there 1906 // should be something before the last char position that we just found. 1907 U_ASSERT(stringLastCharIdx > stringStart); 1908 stringNextToLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1); 1909 U_ASSERT(stringNextToLastCharIdx >= stringStart); 1910 nextToLastChar = fRXPat->fLiteralText.char32At(stringNextToLastCharIdx); 1911 1912 if (stringNextToLastCharIdx > stringStart) { 1913 // The length of string remaining after removing one char is two or more. 1914 // Leave the string in the compiled pattern, shorten it by one char, 1915 // and append a URX_ONECHAR op for the last char. 1916 stringLen -= (fRXPat->fLiteralText.length() - stringLastCharIdx); 1917 op = URX_BUILD(URX_STRING_LEN, stringLen); 1918 fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1); 1919 emitONE_CHAR(lastChar); 1920 } else { 1921 // The original string consisted of exactly two characters. Replace 1922 // the existing compiled URX_STRING/URX_STRING_LEN ops with a pair 1923 // of URX_ONECHARs. 1924 fRXPat->fCompiledPat->setSize(fRXPat->fCompiledPat->size() -2); 1925 emitONE_CHAR(nextToLastChar); 1926 emitONE_CHAR(lastChar); 1927 } 1928 } 1929 1930 1931 1932 1933 1934 1935 //------------------------------------------------------------------------------ 1936 // 1937 // insertOp() Insert a slot for a new opcode into the already 1938 // compiled pattern code. 1939 // 1940 // Fill the slot with a NOP. Our caller will replace it 1941 // with what they really wanted. 1942 // 1943 //------------------------------------------------------------------------------ 1944 void RegexCompile::insertOp(int32_t where) { 1945 UVector64 *code = fRXPat->fCompiledPat; 1946 U_ASSERT(where>0 && where < code->size()); 1947 1948 int32_t nop = URX_BUILD(URX_NOP, 0); 1949 code->insertElementAt(nop, where, *fStatus); 1950 1951 // Walk through the pattern, looking for any ops with targets that 1952 // were moved down by the insert. Fix them. 1953 int32_t loc; 1954 for (loc=0; loc<code->size(); loc++) { 1955 int32_t op = (int32_t)code->elementAti(loc); 1956 int32_t opType = URX_TYPE(op); 1957 int32_t opValue = URX_VAL(op); 1958 if ((opType == URX_JMP || 1959 opType == URX_JMPX || 1960 opType == URX_STATE_SAVE || 1961 opType == URX_CTR_LOOP || 1962 opType == URX_CTR_LOOP_NG || 1963 opType == URX_JMP_SAV || 1964 opType == URX_RELOC_OPRND) && opValue > where) { 1965 // Target location for this opcode is after the insertion point and 1966 // needs to be incremented to adjust for the insertion. 1967 opValue++; 1968 op = URX_BUILD(opType, opValue); 1969 code->setElementAt(op, loc); 1970 } 1971 } 1972 1973 // Now fix up the parentheses stack. All positive values in it are locations in 1974 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.) 1975 for (loc=0; loc<fParenStack.size(); loc++) { 1976 int32_t x = fParenStack.elementAti(loc); 1977 U_ASSERT(x < code->size()); 1978 if (x>where) { 1979 x++; 1980 fParenStack.setElementAt(x, loc); 1981 } 1982 } 1983 1984 if (fMatchCloseParen > where) { 1985 fMatchCloseParen++; 1986 } 1987 if (fMatchOpenParen > where) { 1988 fMatchOpenParen++; 1989 } 1990 } 1991 1992 1993 1994 //------------------------------------------------------------------------------ 1995 // 1996 // blockTopLoc() Find or create a location in the compiled pattern 1997 // at the start of the operation or block that has 1998 // just been compiled. Needed when a quantifier (* or 1999 // whatever) appears, and we need to add an operation 2000 // at the start of the thing being quantified. 2001 // 2002 // (Parenthesized Blocks) have a slot with a NOP that 2003 // is reserved for this purpose. .* or similar don't 2004 // and a slot needs to be added. 2005 // 2006 // parameter reserveLoc : TRUE - ensure that there is space to add an opcode 2007 // at the returned location. 2008 // FALSE - just return the address, 2009 // do not reserve a location there. 2010 // 2011 //------------------------------------------------------------------------------ 2012 int32_t RegexCompile::blockTopLoc(UBool reserveLoc) { 2013 int32_t theLoc; 2014 if (fRXPat->fCompiledPat->size() == fMatchCloseParen) 2015 { 2016 // The item just processed is a parenthesized block. 2017 theLoc = fMatchOpenParen; // A slot is already reserved for us. 2018 U_ASSERT(theLoc > 0); 2019 U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP); 2020 } 2021 else { 2022 // Item just compiled is a single thing, a ".", or a single char, or a set reference. 2023 // No slot for STATE_SAVE was pre-reserved in the compiled code. 2024 // We need to make space now. 2025 fixLiterals(TRUE); // If last item was a string, separate the last char. 2026 theLoc = fRXPat->fCompiledPat->size()-1; 2027 if (reserveLoc) { 2028 /*int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);*/ 2029 int32_t nop = URX_BUILD(URX_NOP, 0); 2030 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); 2031 } 2032 } 2033 return theLoc; 2034 } 2035 2036 2037 2038 //------------------------------------------------------------------------------ 2039 // 2040 // handleCloseParen When compiling a close paren, we need to go back 2041 // and fix up any JMP or SAVE operations within the 2042 // parenthesized block that need to target the end 2043 // of the block. The locations of these are kept on 2044 // the paretheses stack. 2045 // 2046 // This function is called both when encountering a 2047 // real ) and at the end of the pattern. 2048 // 2049 //------------------------------------------------------------------------------ 2050 void RegexCompile::handleCloseParen() { 2051 int32_t patIdx; 2052 int32_t patOp; 2053 if (fParenStack.size() <= 0) { 2054 error(U_REGEX_MISMATCHED_PAREN); 2055 return; 2056 } 2057 2058 // Force any literal chars that may follow the close paren to start a new string, 2059 // and not attach to any preceding it. 2060 fixLiterals(FALSE); 2061 2062 // Fixup any operations within the just-closed parenthesized group 2063 // that need to reference the end of the (block). 2064 // (The first one popped from the stack is an unused slot for 2065 // alternation (OR) state save, but applying the fixup to it does no harm.) 2066 for (;;) { 2067 patIdx = fParenStack.popi(); 2068 if (patIdx < 0) { 2069 // value < 0 flags the start of the frame on the paren stack. 2070 break; 2071 } 2072 U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size()); 2073 patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx); 2074 U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set. 2075 patOp |= fRXPat->fCompiledPat->size(); // Set it now. 2076 fRXPat->fCompiledPat->setElementAt(patOp, patIdx); 2077 fMatchOpenParen = patIdx; 2078 } 2079 2080 // At the close of any parenthesized block, restore the match mode flags to 2081 // the value they had at the open paren. Saved value is 2082 // at the top of the paren stack. 2083 fModeFlags = fParenStack.popi(); 2084 U_ASSERT(fModeFlags < 0); 2085 2086 // DO any additional fixups, depending on the specific kind of 2087 // parentesized grouping this is 2088 2089 switch (patIdx) { 2090 case plain: 2091 case flags: 2092 // No additional fixups required. 2093 // (Grouping-only parentheses) 2094 break; 2095 case capturing: 2096 // Capturing Parentheses. 2097 // Insert a End Capture op into the pattern. 2098 // The frame offset of the variables for this cg is obtained from the 2099 // start capture op and put it into the end-capture op. 2100 { 2101 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); 2102 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); 2103 2104 int32_t frameVarLocation = URX_VAL(captureOp); 2105 int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation); 2106 fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus); 2107 } 2108 break; 2109 case atomic: 2110 // Atomic Parenthesis. 2111 // Insert a LD_SP operation to restore the state stack to the position 2112 // it was when the atomic parens were entered. 2113 { 2114 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); 2115 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); 2116 int32_t stoLoc = URX_VAL(stoOp); 2117 int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc); 2118 fRXPat->fCompiledPat->addElement(ldOp, *fStatus); 2119 } 2120 break; 2121 2122 case lookAhead: 2123 { 2124 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5); 2125 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); 2126 int32_t dataLoc = URX_VAL(startOp); 2127 int32_t op = URX_BUILD(URX_LA_END, dataLoc); 2128 fRXPat->fCompiledPat->addElement(op, *fStatus); 2129 } 2130 break; 2131 2132 case negLookAhead: 2133 { 2134 // See comment at doOpenLookAheadNeg 2135 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1); 2136 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); 2137 int32_t dataLoc = URX_VAL(startOp); 2138 int32_t op = URX_BUILD(URX_LA_END, dataLoc); 2139 fRXPat->fCompiledPat->addElement(op, *fStatus); 2140 op = URX_BUILD(URX_BACKTRACK, 0); 2141 fRXPat->fCompiledPat->addElement(op, *fStatus); 2142 op = URX_BUILD(URX_LA_END, dataLoc); 2143 fRXPat->fCompiledPat->addElement(op, *fStatus); 2144 2145 // Patch the URX_SAVE near the top of the block. 2146 // The destination of the SAVE is the final LA_END that was just added. 2147 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen); 2148 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); 2149 int32_t dest = fRXPat->fCompiledPat->size()-1; 2150 saveOp = URX_BUILD(URX_STATE_SAVE, dest); 2151 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); 2152 } 2153 break; 2154 2155 case lookBehind: 2156 { 2157 // See comment at doOpenLookBehind. 2158 2159 // Append the URX_LB_END and URX_LA_END to the compiled pattern. 2160 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4); 2161 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); 2162 int32_t dataLoc = URX_VAL(startOp); 2163 int32_t op = URX_BUILD(URX_LB_END, dataLoc); 2164 fRXPat->fCompiledPat->addElement(op, *fStatus); 2165 op = URX_BUILD(URX_LA_END, dataLoc); 2166 fRXPat->fCompiledPat->addElement(op, *fStatus); 2167 2168 // Determine the min and max bounds for the length of the 2169 // string that the pattern can match. 2170 // An unbounded upper limit is an error. 2171 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; 2172 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); 2173 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); 2174 if (maxML == INT32_MAX) { 2175 error(U_REGEX_LOOK_BEHIND_LIMIT); 2176 break; 2177 } 2178 U_ASSERT(minML <= maxML); 2179 2180 // Insert the min and max match len bounds into the URX_LB_CONT op that 2181 // appears at the top of the look-behind block, at location fMatchOpenParen+1 2182 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2); 2183 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1); 2184 2185 } 2186 break; 2187 2188 2189 2190 case lookBehindN: 2191 { 2192 // See comment at doOpenLookBehindNeg. 2193 2194 // Append the URX_LBN_END to the compiled pattern. 2195 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5); 2196 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); 2197 int32_t dataLoc = URX_VAL(startOp); 2198 int32_t op = URX_BUILD(URX_LBN_END, dataLoc); 2199 fRXPat->fCompiledPat->addElement(op, *fStatus); 2200 2201 // Determine the min and max bounds for the length of the 2202 // string that the pattern can match. 2203 // An unbounded upper limit is an error. 2204 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; 2205 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); 2206 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); 2207 if (maxML == INT32_MAX) { 2208 error(U_REGEX_LOOK_BEHIND_LIMIT); 2209 break; 2210 } 2211 U_ASSERT(minML <= maxML); 2212 2213 // Insert the min and max match len bounds into the URX_LB_CONT op that 2214 // appears at the top of the look-behind block, at location fMatchOpenParen+1 2215 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3); 2216 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2); 2217 2218 // Insert the pattern location to continue at after a successful match 2219 // as the last operand of the URX_LBN_CONT 2220 op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()); 2221 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1); 2222 } 2223 break; 2224 2225 2226 2227 default: 2228 U_ASSERT(FALSE); 2229 } 2230 2231 // remember the next location in the compiled pattern. 2232 // The compilation of Quantifiers will look at this to see whether its looping 2233 // over a parenthesized block or a single item 2234 fMatchCloseParen = fRXPat->fCompiledPat->size(); 2235 } 2236 2237 2238 2239 //------------------------------------------------------------------------------ 2240 // 2241 // compileSet Compile the pattern operations for a reference to a 2242 // UnicodeSet. 2243 // 2244 //------------------------------------------------------------------------------ 2245 void RegexCompile::compileSet(UnicodeSet *theSet) 2246 { 2247 if (theSet == NULL) { 2248 return; 2249 } 2250 // Remove any strings from the set. 2251 // There shoudn't be any, but just in case. 2252 // (Case Closure can add them; if we had a simple case closure avaialble that 2253 // ignored strings, that would be better.) 2254 theSet->removeAllStrings(); 2255 int32_t setSize = theSet->size(); 2256 2257 switch (setSize) { 2258 case 0: 2259 { 2260 // Set of no elements. Always fails to match. 2261 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus); 2262 delete theSet; 2263 } 2264 break; 2265 2266 case 1: 2267 { 2268 // The set contains only a single code point. Put it into 2269 // the compiled pattern as a single char operation rather 2270 // than a set, and discard the set itself. 2271 literalChar(theSet->charAt(0)); 2272 delete theSet; 2273 } 2274 break; 2275 2276 default: 2277 { 2278 // The set contains two or more chars. (the normal case) 2279 // Put it into the compiled pattern as a set. 2280 int32_t setNumber = fRXPat->fSets->size(); 2281 fRXPat->fSets->addElement(theSet, *fStatus); 2282 int32_t setOp = URX_BUILD(URX_SETREF, setNumber); 2283 fRXPat->fCompiledPat->addElement(setOp, *fStatus); 2284 } 2285 } 2286 } 2287 2288 2289 //------------------------------------------------------------------------------ 2290 // 2291 // compileInterval Generate the code for a {min, max} style interval quantifier. 2292 // Except for the specific opcodes used, the code is the same 2293 // for all three types (greedy, non-greedy, possessive) of 2294 // intervals. The opcodes are supplied as parameters. 2295 // 2296 // The code for interval loops has this form: 2297 // 0 CTR_INIT counter loc (in stack frame) 2298 // 1 5 patt address of CTR_LOOP at bottom of block 2299 // 2 min count 2300 // 3 max count (-1 for unbounded) 2301 // 4 ... block to be iterated over 2302 // 5 CTR_LOOP 2303 // 2304 // In 2305 //------------------------------------------------------------------------------ 2306 void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp) 2307 { 2308 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes 2309 // four slots in the compiled code. Reserve them. 2310 int32_t topOfBlock = blockTopLoc(TRUE); 2311 insertOp(topOfBlock); 2312 insertOp(topOfBlock); 2313 insertOp(topOfBlock); 2314 2315 // The operands for the CTR_INIT opcode include the index in the matcher data 2316 // of the counter. Allocate it now. 2317 int32_t counterLoc = fRXPat->fFrameSize; 2318 fRXPat->fFrameSize++; 2319 2320 int32_t op = URX_BUILD(InitOp, counterLoc); 2321 fRXPat->fCompiledPat->setElementAt(op, topOfBlock); 2322 2323 // The second operand of CTR_INIT is the location following the end of the loop. 2324 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the 2325 // compilation of something later on causes the code to grow and the target 2326 // position to move. 2327 int32_t loopEnd = fRXPat->fCompiledPat->size(); 2328 op = URX_BUILD(URX_RELOC_OPRND, loopEnd); 2329 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); 2330 2331 // Followed by the min and max counts. 2332 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); 2333 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); 2334 2335 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op. 2336 // Goes at end of the block being looped over, so just append to the code so far. 2337 op = URX_BUILD(LoopOp, topOfBlock); 2338 fRXPat->fCompiledPat->addElement(op, *fStatus); 2339 2340 if ((fIntervalLow & 0xff000000) != 0 || 2341 fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0) { 2342 error(U_REGEX_NUMBER_TOO_BIG); 2343 } 2344 2345 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) { 2346 error(U_REGEX_MAX_LT_MIN); 2347 } 2348 } 2349 2350 2351 2352 UBool RegexCompile::compileInlineInterval() { 2353 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { 2354 // Too big to inline. Fail, which will cause looping code to be generated. 2355 // (Upper < Lower picks up unbounded upper and errors, both.) 2356 return FALSE; 2357 } 2358 2359 int32_t topOfBlock = blockTopLoc(FALSE); 2360 if (fIntervalUpper == 0) { 2361 // Pathological case. Attempt no matches, as if the block doesn't exist. 2362 fRXPat->fCompiledPat->setSize(topOfBlock); 2363 return TRUE; 2364 } 2365 2366 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { 2367 // The thing being repeated is not a single op, but some 2368 // more complex block. Do it as a loop, not inlines. 2369 // Note that things "repeated" a max of once are handled as inline, because 2370 // the one copy of the code already generated is just fine. 2371 return FALSE; 2372 } 2373 2374 // Pick up the opcode that is to be repeated 2375 // 2376 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock); 2377 2378 // Compute the pattern location where the inline sequence 2379 // will end, and set up the state save op that will be needed. 2380 // 2381 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1 2382 + fIntervalUpper + (fIntervalUpper-fIntervalLow); 2383 int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc); 2384 if (fIntervalLow == 0) { 2385 insertOp(topOfBlock); 2386 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock); 2387 } 2388 2389 2390 2391 // Loop, emitting the op for the thing being repeated each time. 2392 // Loop starts at 1 because one instance of the op already exists in the pattern, 2393 // it was put there when it was originally encountered. 2394 int32_t i; 2395 for (i=1; i<fIntervalUpper; i++ ) { 2396 if (i == fIntervalLow) { 2397 fRXPat->fCompiledPat->addElement(saveOp, *fStatus); 2398 } 2399 if (i > fIntervalLow) { 2400 fRXPat->fCompiledPat->addElement(saveOp, *fStatus); 2401 } 2402 fRXPat->fCompiledPat->addElement(op, *fStatus); 2403 } 2404 return TRUE; 2405 } 2406 2407 2408 2409 //------------------------------------------------------------------------------ 2410 // 2411 // matchStartType Determine how a match can start. 2412 // Used to optimize find() operations. 2413 // 2414 // Operation is very similar to minMatchLength(). Walk the compiled 2415 // pattern, keeping an on-going minimum-match-length. For any 2416 // op where the min match coming in is zero, add that ops possible 2417 // starting matches to the possible starts for the overall pattern. 2418 // 2419 //------------------------------------------------------------------------------ 2420 void RegexCompile::matchStartType() { 2421 if (U_FAILURE(*fStatus)) { 2422 return; 2423 } 2424 2425 2426 int32_t loc; // Location in the pattern of the current op being processed. 2427 int32_t op; // The op being processed 2428 int32_t opType; // The opcode type of the op 2429 int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern 2430 int32_t numInitialStrings = 0; // Number of strings encountered that could match at start. 2431 2432 UBool atStart = TRUE; // True if no part of the pattern yet encountered 2433 // could have advanced the position in a match. 2434 // (Maximum match length so far == 0) 2435 2436 // forwardedLength is a vector holding minimum-match-length values that 2437 // are propagated forward in the pattern by JMP or STATE_SAVE operations. 2438 // It must be one longer than the pattern being checked because some ops 2439 // will jmp to a end-of-block+1 location from within a block, and we must 2440 // count those when checking the block. 2441 int32_t end = fRXPat->fCompiledPat->size(); 2442 UVector32 forwardedLength(end+1, *fStatus); 2443 forwardedLength.setSize(end+1); 2444 for (loc=3; loc<end; loc++) { 2445 forwardedLength.setElementAt(INT32_MAX, loc); 2446 } 2447 2448 for (loc = 3; loc<end; loc++) { 2449 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2450 opType = URX_TYPE(op); 2451 2452 // The loop is advancing linearly through the pattern. 2453 // If the op we are now at was the destination of a branch in the pattern, 2454 // and that path has a shorter minimum length than the current accumulated value, 2455 // replace the current accumulated value. 2456 if (forwardedLength.elementAti(loc) < currentLen) { 2457 currentLen = forwardedLength.elementAti(loc); 2458 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 2459 } 2460 2461 switch (opType) { 2462 // Ops that don't change the total length matched 2463 case URX_RESERVED_OP: 2464 case URX_END: 2465 case URX_FAIL: 2466 case URX_STRING_LEN: 2467 case URX_NOP: 2468 case URX_START_CAPTURE: 2469 case URX_END_CAPTURE: 2470 case URX_BACKSLASH_B: 2471 case URX_BACKSLASH_BU: 2472 case URX_BACKSLASH_G: 2473 case URX_BACKSLASH_Z: 2474 case URX_DOLLAR: 2475 case URX_DOLLAR_M: 2476 case URX_DOLLAR_D: 2477 case URX_DOLLAR_MD: 2478 case URX_RELOC_OPRND: 2479 case URX_STO_INP_LOC: 2480 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 2481 case URX_BACKREF_I: 2482 2483 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 2484 case URX_LD_SP: 2485 break; 2486 2487 case URX_CARET: 2488 if (atStart) { 2489 fRXPat->fStartType = START_START; 2490 } 2491 break; 2492 2493 case URX_CARET_M: 2494 case URX_CARET_M_UNIX: 2495 if (atStart) { 2496 fRXPat->fStartType = START_LINE; 2497 } 2498 break; 2499 2500 case URX_ONECHAR: 2501 if (currentLen == 0) { 2502 // This character could appear at the start of a match. 2503 // Add it to the set of possible starting characters. 2504 fRXPat->fInitialChars->add(URX_VAL(op)); 2505 numInitialStrings += 2; 2506 } 2507 currentLen++; 2508 atStart = FALSE; 2509 break; 2510 2511 2512 case URX_SETREF: 2513 if (currentLen == 0) { 2514 int32_t sn = URX_VAL(op); 2515 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); 2516 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn); 2517 fRXPat->fInitialChars->addAll(*s); 2518 numInitialStrings += 2; 2519 } 2520 currentLen++; 2521 atStart = FALSE; 2522 break; 2523 2524 case URX_LOOP_SR_I: 2525 // [Set]*, like a SETREF, above, in what it can match, 2526 // but may not match at all, so currentLen is not incremented. 2527 if (currentLen == 0) { 2528 int32_t sn = URX_VAL(op); 2529 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); 2530 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn); 2531 fRXPat->fInitialChars->addAll(*s); 2532 numInitialStrings += 2; 2533 } 2534 atStart = FALSE; 2535 break; 2536 2537 case URX_LOOP_DOT_I: 2538 if (currentLen == 0) { 2539 // .* at the start of a pattern. 2540 // Any character can begin the match. 2541 fRXPat->fInitialChars->clear(); 2542 fRXPat->fInitialChars->complement(); 2543 numInitialStrings += 2; 2544 } 2545 atStart = FALSE; 2546 break; 2547 2548 2549 case URX_STATIC_SETREF: 2550 if (currentLen == 0) { 2551 int32_t sn = URX_VAL(op); 2552 U_ASSERT(sn>0 && sn<URX_LAST_SET); 2553 const UnicodeSet *s = fRXPat->fStaticSets[sn]; 2554 fRXPat->fInitialChars->addAll(*s); 2555 numInitialStrings += 2; 2556 } 2557 currentLen++; 2558 atStart = FALSE; 2559 break; 2560 2561 2562 2563 case URX_STAT_SETREF_N: 2564 if (currentLen == 0) { 2565 int32_t sn = URX_VAL(op); 2566 const UnicodeSet *s = fRXPat->fStaticSets[sn]; 2567 UnicodeSet sc(*s); 2568 sc.complement(); 2569 fRXPat->fInitialChars->addAll(sc); 2570 numInitialStrings += 2; 2571 } 2572 currentLen++; 2573 atStart = FALSE; 2574 break; 2575 2576 2577 2578 case URX_BACKSLASH_D: 2579 // Digit Char 2580 if (currentLen == 0) { 2581 UnicodeSet s; 2582 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus); 2583 if (URX_VAL(op) != 0) { 2584 s.complement(); 2585 } 2586 fRXPat->fInitialChars->addAll(s); 2587 numInitialStrings += 2; 2588 } 2589 currentLen++; 2590 atStart = FALSE; 2591 break; 2592 2593 2594 case URX_ONECHAR_I: 2595 // Case Insensitive Single Character. 2596 if (currentLen == 0) { 2597 UChar32 c = URX_VAL(op); 2598 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { 2599 // character may have distinct cased forms. Add all of them 2600 // to the set of possible starting match chars. 2601 UnicodeSet s(c, c); 2602 s.closeOver(USET_CASE_INSENSITIVE); 2603 fRXPat->fInitialChars->addAll(s); 2604 } else { 2605 // Char has no case variants. Just add it as-is to the 2606 // set of possible starting chars. 2607 fRXPat->fInitialChars->add(c); 2608 } 2609 numInitialStrings += 2; 2610 } 2611 currentLen++; 2612 atStart = FALSE; 2613 break; 2614 2615 2616 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 2617 case URX_DOTANY_ALL: // . matches one or two. 2618 case URX_DOTANY: 2619 case URX_DOTANY_UNIX: 2620 if (currentLen == 0) { 2621 // These constructs are all bad news when they appear at the start 2622 // of a match. Any character can begin the match. 2623 fRXPat->fInitialChars->clear(); 2624 fRXPat->fInitialChars->complement(); 2625 numInitialStrings += 2; 2626 } 2627 currentLen++; 2628 atStart = FALSE; 2629 break; 2630 2631 2632 case URX_JMPX: 2633 loc++; // Except for extra operand on URX_JMPX, same as URX_JMP. 2634 case URX_JMP: 2635 { 2636 int32_t jmpDest = URX_VAL(op); 2637 if (jmpDest < loc) { 2638 // Loop of some kind. Can safely ignore, the worst that will happen 2639 // is that we understate the true minimum length 2640 currentLen = forwardedLength.elementAti(loc+1); 2641 2642 } else { 2643 // Forward jump. Propagate the current min length to the target loc of the jump. 2644 U_ASSERT(jmpDest <= end+1); 2645 if (forwardedLength.elementAti(jmpDest) > currentLen) { 2646 forwardedLength.setElementAt(currentLen, jmpDest); 2647 } 2648 } 2649 } 2650 atStart = FALSE; 2651 break; 2652 2653 case URX_JMP_SAV: 2654 case URX_JMP_SAV_X: 2655 // Combo of state save to the next loc, + jmp backwards. 2656 // Net effect on min. length computation is nothing. 2657 atStart = FALSE; 2658 break; 2659 2660 case URX_BACKTRACK: 2661 // Fails are kind of like a branch, except that the min length was 2662 // propagated already, by the state save. 2663 currentLen = forwardedLength.elementAti(loc+1); 2664 atStart = FALSE; 2665 break; 2666 2667 2668 case URX_STATE_SAVE: 2669 { 2670 // State Save, for forward jumps, propagate the current minimum. 2671 // of the state save. 2672 int32_t jmpDest = URX_VAL(op); 2673 if (jmpDest > loc) { 2674 if (currentLen < forwardedLength.elementAti(jmpDest)) { 2675 forwardedLength.setElementAt(currentLen, jmpDest); 2676 } 2677 } 2678 } 2679 atStart = FALSE; 2680 break; 2681 2682 2683 2684 2685 case URX_STRING: 2686 { 2687 loc++; 2688 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2689 int32_t stringLen = URX_VAL(stringLenOp); 2690 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); 2691 U_ASSERT(stringLenOp >= 2); 2692 if (currentLen == 0) { 2693 // Add the starting character of this string to the set of possible starting 2694 // characters for this pattern. 2695 int32_t stringStartIdx = URX_VAL(op); 2696 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); 2697 fRXPat->fInitialChars->add(c); 2698 2699 // Remember this string. After the entire pattern has been checked, 2700 // if nothing else is identified that can start a match, we'll use it. 2701 numInitialStrings++; 2702 fRXPat->fInitialStringIdx = stringStartIdx; 2703 fRXPat->fInitialStringLen = stringLen; 2704 } 2705 2706 currentLen += stringLen; 2707 atStart = FALSE; 2708 } 2709 break; 2710 2711 case URX_STRING_I: 2712 { 2713 // Case-insensitive string. Unlike exact-match strings, we won't 2714 // attempt a string search for possible match positions. But we 2715 // do update the set of possible starting characters. 2716 loc++; 2717 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2718 int32_t stringLen = URX_VAL(stringLenOp); 2719 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); 2720 U_ASSERT(stringLenOp >= 2); 2721 if (currentLen == 0) { 2722 // Add the starting character of this string to the set of possible starting 2723 // characters for this pattern. 2724 int32_t stringStartIdx = URX_VAL(op); 2725 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); 2726 UnicodeSet s(c, c); 2727 s.closeOver(USET_CASE_INSENSITIVE); 2728 fRXPat->fInitialChars->addAll(s); 2729 numInitialStrings += 2; // Matching on an initial string not possible. 2730 } 2731 currentLen += stringLen; 2732 atStart = FALSE; 2733 } 2734 break; 2735 2736 case URX_CTR_INIT: 2737 case URX_CTR_INIT_NG: 2738 { 2739 // Loop Init Ops. These don't change the min length, but they are 4 word ops 2740 // so location must be updated accordingly. 2741 // Loop Init Ops. 2742 // If the min loop count == 0 2743 // move loc forwards to the end of the loop, skipping over the body. 2744 // If the min count is > 0, 2745 // continue normal processing of the body of the loop. 2746 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1); 2747 loopEndLoc = URX_VAL(loopEndLoc); 2748 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2); 2749 if (minLoopCount == 0) { 2750 // Min Loop Count of 0, treat like a forward branch and 2751 // move the current minimum length up to the target 2752 // (end of loop) location. 2753 U_ASSERT(loopEndLoc <= end+1); 2754 if (forwardedLength.elementAti(loopEndLoc) > currentLen) { 2755 forwardedLength.setElementAt(currentLen, loopEndLoc); 2756 } 2757 } 2758 loc+=3; // Skips over operands of CTR_INIT 2759 } 2760 atStart = FALSE; 2761 break; 2762 2763 2764 case URX_CTR_LOOP: 2765 case URX_CTR_LOOP_NG: 2766 // Loop ops. 2767 // The jump is conditional, backwards only. 2768 atStart = FALSE; 2769 break; 2770 2771 case URX_LOOP_C: 2772 // More loop ops. These state-save to themselves. 2773 // don't change the minimum match 2774 atStart = FALSE; 2775 break; 2776 2777 2778 case URX_LA_START: 2779 case URX_LB_START: 2780 { 2781 // Look-around. Scan forward until the matching look-ahead end, 2782 // without processing the look-around block. This is overly pessimistic. 2783 2784 // Keep track of the nesting depth of look-around blocks. Boilerplate code for 2785 // lookahead contains two LA_END instructions, so count goes up by two 2786 // for each LA_START. 2787 int32_t depth = (opType == URX_LA_START? 2: 1); 2788 for (;;) { 2789 loc++; 2790 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2791 if (URX_TYPE(op) == URX_LA_START) { 2792 depth+=2; 2793 } 2794 if (URX_TYPE(op) == URX_LB_START) { 2795 depth++; 2796 } 2797 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) { 2798 depth--; 2799 if (depth == 0) { 2800 break; 2801 } 2802 } 2803 if (URX_TYPE(op) == URX_STATE_SAVE) { 2804 // Need this because neg lookahead blocks will FAIL to outside 2805 // of the block. 2806 int32_t jmpDest = URX_VAL(op); 2807 if (jmpDest > loc) { 2808 if (currentLen < forwardedLength.elementAti(jmpDest)) { 2809 forwardedLength.setElementAt(currentLen, jmpDest); 2810 } 2811 } 2812 } 2813 U_ASSERT(loc <= end); 2814 } 2815 } 2816 break; 2817 2818 case URX_LA_END: 2819 case URX_LB_CONT: 2820 case URX_LB_END: 2821 case URX_LBN_CONT: 2822 case URX_LBN_END: 2823 U_ASSERT(FALSE); // Shouldn't get here. These ops should be 2824 // consumed by the scan in URX_LA_START and LB_START 2825 2826 break; 2827 2828 default: 2829 U_ASSERT(FALSE); 2830 } 2831 2832 } 2833 2834 2835 // We have finished walking through the ops. Check whether some forward jump 2836 // propagated a shorter length to location end+1. 2837 if (forwardedLength.elementAti(end+1) < currentLen) { 2838 currentLen = forwardedLength.elementAti(end+1); 2839 } 2840 2841 2842 fRXPat->fInitialChars8->init(fRXPat->fInitialChars); 2843 2844 2845 // Sort out what we should check for when looking for candidate match start positions. 2846 // In order of preference, 2847 // 1. Start of input text buffer. 2848 // 2. A literal string. 2849 // 3. Start of line in multi-line mode. 2850 // 4. A single literal character. 2851 // 5. A character from a set of characters. 2852 // 2853 if (fRXPat->fStartType == START_START) { 2854 // Match only at the start of an input text string. 2855 // start type is already set. We're done. 2856 } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) { 2857 // Match beginning only with a literal string. 2858 UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx); 2859 U_ASSERT(fRXPat->fInitialChars->contains(c)); 2860 fRXPat->fStartType = START_STRING; 2861 fRXPat->fInitialChar = c; 2862 } else if (fRXPat->fStartType == START_LINE) { 2863 // Match at start of line in Multi-Line mode. 2864 // Nothing to do here; everything is already set. 2865 } else if (fRXPat->fMinMatchLen == 0) { 2866 // Zero length match possible. We could start anywhere. 2867 fRXPat->fStartType = START_NO_INFO; 2868 } else if (fRXPat->fInitialChars->size() == 1) { 2869 // All matches begin with the same char. 2870 fRXPat->fStartType = START_CHAR; 2871 fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0); 2872 U_ASSERT(fRXPat->fInitialChar != (UChar32)-1); 2873 } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE && 2874 fRXPat->fMinMatchLen > 0) { 2875 // Matches start with a set of character smaller than the set of all chars. 2876 fRXPat->fStartType = START_SET; 2877 } else { 2878 // Matches can start with anything 2879 fRXPat->fStartType = START_NO_INFO; 2880 } 2881 2882 return; 2883 } 2884 2885 2886 2887 //------------------------------------------------------------------------------ 2888 // 2889 // minMatchLength Calculate the length of the shortest string that could 2890 // match the specified pattern. 2891 // Length is in 16 bit code units, not code points. 2892 // 2893 // The calculated length may not be exact. The returned 2894 // value may be shorter than the actual minimum; it must 2895 // never be longer. 2896 // 2897 // start and end are the range of p-code operations to be 2898 // examined. The endpoints are included in the range. 2899 // 2900 //------------------------------------------------------------------------------ 2901 int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { 2902 if (U_FAILURE(*fStatus)) { 2903 return 0; 2904 } 2905 2906 U_ASSERT(start <= end); 2907 U_ASSERT(end < fRXPat->fCompiledPat->size()); 2908 2909 2910 int32_t loc; 2911 int32_t op; 2912 int32_t opType; 2913 int32_t currentLen = 0; 2914 2915 2916 // forwardedLength is a vector holding minimum-match-length values that 2917 // are propagated forward in the pattern by JMP or STATE_SAVE operations. 2918 // It must be one longer than the pattern being checked because some ops 2919 // will jmp to a end-of-block+1 location from within a block, and we must 2920 // count those when checking the block. 2921 UVector32 forwardedLength(end+2, *fStatus); 2922 forwardedLength.setSize(end+2); 2923 for (loc=start; loc<=end+1; loc++) { 2924 forwardedLength.setElementAt(INT32_MAX, loc); 2925 } 2926 2927 for (loc = start; loc<=end; loc++) { 2928 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2929 opType = URX_TYPE(op); 2930 2931 // The loop is advancing linearly through the pattern. 2932 // If the op we are now at was the destination of a branch in the pattern, 2933 // and that path has a shorter minimum length than the current accumulated value, 2934 // replace the current accumulated value. 2935 // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some 2936 // no-match-possible cases. 2937 if (forwardedLength.elementAti(loc) < currentLen) { 2938 currentLen = forwardedLength.elementAti(loc); 2939 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 2940 } 2941 2942 switch (opType) { 2943 // Ops that don't change the total length matched 2944 case URX_RESERVED_OP: 2945 case URX_END: 2946 case URX_STRING_LEN: 2947 case URX_NOP: 2948 case URX_START_CAPTURE: 2949 case URX_END_CAPTURE: 2950 case URX_BACKSLASH_B: 2951 case URX_BACKSLASH_BU: 2952 case URX_BACKSLASH_G: 2953 case URX_BACKSLASH_Z: 2954 case URX_CARET: 2955 case URX_DOLLAR: 2956 case URX_DOLLAR_M: 2957 case URX_DOLLAR_D: 2958 case URX_DOLLAR_MD: 2959 case URX_RELOC_OPRND: 2960 case URX_STO_INP_LOC: 2961 case URX_CARET_M: 2962 case URX_CARET_M_UNIX: 2963 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 2964 case URX_BACKREF_I: 2965 2966 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 2967 case URX_LD_SP: 2968 2969 case URX_JMP_SAV: 2970 case URX_JMP_SAV_X: 2971 break; 2972 2973 2974 // Ops that match a minimum of one character (one or two 16 bit code units.) 2975 // 2976 case URX_ONECHAR: 2977 case URX_STATIC_SETREF: 2978 case URX_STAT_SETREF_N: 2979 case URX_SETREF: 2980 case URX_BACKSLASH_D: 2981 case URX_ONECHAR_I: 2982 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 2983 case URX_DOTANY_ALL: // . matches one or two. 2984 case URX_DOTANY: 2985 case URX_DOTANY_UNIX: 2986 currentLen++; 2987 break; 2988 2989 2990 case URX_JMPX: 2991 loc++; // URX_JMPX has an extra operand, ignored here, 2992 // otherwise processed identically to URX_JMP. 2993 case URX_JMP: 2994 { 2995 int32_t jmpDest = URX_VAL(op); 2996 if (jmpDest < loc) { 2997 // Loop of some kind. Can safely ignore, the worst that will happen 2998 // is that we understate the true minimum length 2999 currentLen = forwardedLength.elementAti(loc+1); 3000 } else { 3001 // Forward jump. Propagate the current min length to the target loc of the jump. 3002 U_ASSERT(jmpDest <= end+1); 3003 if (forwardedLength.elementAti(jmpDest) > currentLen) { 3004 forwardedLength.setElementAt(currentLen, jmpDest); 3005 } 3006 } 3007 } 3008 break; 3009 3010 case URX_BACKTRACK: 3011 { 3012 // Back-tracks are kind of like a branch, except that the min length was 3013 // propagated already, by the state save. 3014 currentLen = forwardedLength.elementAti(loc+1); 3015 } 3016 break; 3017 3018 3019 case URX_STATE_SAVE: 3020 { 3021 // State Save, for forward jumps, propagate the current minimum. 3022 // of the state save. 3023 int32_t jmpDest = URX_VAL(op); 3024 if (jmpDest > loc) { 3025 if (currentLen < forwardedLength.elementAti(jmpDest)) { 3026 forwardedLength.setElementAt(currentLen, jmpDest); 3027 } 3028 } 3029 } 3030 break; 3031 3032 3033 case URX_STRING: 3034 case URX_STRING_I: 3035 { 3036 loc++; 3037 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3038 currentLen += URX_VAL(stringLenOp); 3039 } 3040 break; 3041 3042 3043 case URX_CTR_INIT: 3044 case URX_CTR_INIT_NG: 3045 { 3046 // Loop Init Ops. 3047 // If the min loop count == 0 3048 // move loc forwards to the end of the loop, skipping over the body. 3049 // If the min count is > 0, 3050 // continue normal processing of the body of the loop. 3051 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1); 3052 loopEndLoc = URX_VAL(loopEndLoc); 3053 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2); 3054 if (minLoopCount == 0) { 3055 loc = loopEndLoc; 3056 } else { 3057 loc+=3; // Skips over operands of CTR_INIT 3058 } 3059 } 3060 break; 3061 3062 3063 case URX_CTR_LOOP: 3064 case URX_CTR_LOOP_NG: 3065 // Loop ops. 3066 // The jump is conditional, backwards only. 3067 break; 3068 3069 case URX_LOOP_SR_I: 3070 case URX_LOOP_DOT_I: 3071 case URX_LOOP_C: 3072 // More loop ops. These state-save to themselves. 3073 // don't change the minimum match - could match nothing at all. 3074 break; 3075 3076 3077 case URX_LA_START: 3078 case URX_LB_START: 3079 { 3080 // Look-around. Scan forward until the matching look-ahead end, 3081 // without processing the look-around block. This is overly pessimistic for look-ahead, 3082 // it assumes that the look-ahead match might be zero-length. 3083 // TODO: Positive lookahead could recursively do the block, then continue 3084 // with the longer of the block or the value coming in. Ticket 6060 3085 int32_t depth = (opType == URX_LA_START? 2: 1);; 3086 for (;;) { 3087 loc++; 3088 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3089 if (URX_TYPE(op) == URX_LA_START) { 3090 // The boilerplate for look-ahead includes two LA_END insturctions, 3091 // Depth will be decremented by each one when it is seen. 3092 depth += 2; 3093 } 3094 if (URX_TYPE(op) == URX_LB_START) { 3095 depth++; 3096 } 3097 if (URX_TYPE(op) == URX_LA_END) { 3098 depth--; 3099 if (depth == 0) { 3100 break; 3101 } 3102 } 3103 if (URX_TYPE(op)==URX_LBN_END) { 3104 depth--; 3105 if (depth == 0) { 3106 break; 3107 } 3108 } 3109 if (URX_TYPE(op) == URX_STATE_SAVE) { 3110 // Need this because neg lookahead blocks will FAIL to outside 3111 // of the block. 3112 int32_t jmpDest = URX_VAL(op); 3113 if (jmpDest > loc) { 3114 if (currentLen < forwardedLength.elementAti(jmpDest)) { 3115 forwardedLength.setElementAt(currentLen, jmpDest); 3116 } 3117 } 3118 } 3119 U_ASSERT(loc <= end); 3120 } 3121 } 3122 break; 3123 3124 case URX_LA_END: 3125 case URX_LB_CONT: 3126 case URX_LB_END: 3127 case URX_LBN_CONT: 3128 case URX_LBN_END: 3129 // Only come here if the matching URX_LA_START or URX_LB_START was not in the 3130 // range being sized, which happens when measuring size of look-behind blocks. 3131 break; 3132 3133 default: 3134 U_ASSERT(FALSE); 3135 } 3136 3137 } 3138 3139 // We have finished walking through the ops. Check whether some forward jump 3140 // propagated a shorter length to location end+1. 3141 if (forwardedLength.elementAti(end+1) < currentLen) { 3142 currentLen = forwardedLength.elementAti(end+1); 3143 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 3144 } 3145 3146 return currentLen; 3147 } 3148 3149 3150 3151 //------------------------------------------------------------------------------ 3152 // 3153 // maxMatchLength Calculate the length of the longest string that could 3154 // match the specified pattern. 3155 // Length is in 16 bit code units, not code points. 3156 // 3157 // The calculated length may not be exact. The returned 3158 // value may be longer than the actual maximum; it must 3159 // never be shorter. 3160 // 3161 //------------------------------------------------------------------------------ 3162 int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { 3163 if (U_FAILURE(*fStatus)) { 3164 return 0; 3165 } 3166 U_ASSERT(start <= end); 3167 U_ASSERT(end < fRXPat->fCompiledPat->size()); 3168 3169 3170 int32_t loc; 3171 int32_t op; 3172 int32_t opType; 3173 int32_t currentLen = 0; 3174 UVector32 forwardedLength(end+1, *fStatus); 3175 forwardedLength.setSize(end+1); 3176 3177 for (loc=start; loc<=end; loc++) { 3178 forwardedLength.setElementAt(0, loc); 3179 } 3180 3181 for (loc = start; loc<=end; loc++) { 3182 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3183 opType = URX_TYPE(op); 3184 3185 // The loop is advancing linearly through the pattern. 3186 // If the op we are now at was the destination of a branch in the pattern, 3187 // and that path has a longer maximum length than the current accumulated value, 3188 // replace the current accumulated value. 3189 if (forwardedLength.elementAti(loc) > currentLen) { 3190 currentLen = forwardedLength.elementAti(loc); 3191 } 3192 3193 switch (opType) { 3194 // Ops that don't change the total length matched 3195 case URX_RESERVED_OP: 3196 case URX_END: 3197 case URX_STRING_LEN: 3198 case URX_NOP: 3199 case URX_START_CAPTURE: 3200 case URX_END_CAPTURE: 3201 case URX_BACKSLASH_B: 3202 case URX_BACKSLASH_BU: 3203 case URX_BACKSLASH_G: 3204 case URX_BACKSLASH_Z: 3205 case URX_CARET: 3206 case URX_DOLLAR: 3207 case URX_DOLLAR_M: 3208 case URX_DOLLAR_D: 3209 case URX_DOLLAR_MD: 3210 case URX_RELOC_OPRND: 3211 case URX_STO_INP_LOC: 3212 case URX_CARET_M: 3213 case URX_CARET_M_UNIX: 3214 3215 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 3216 case URX_LD_SP: 3217 3218 case URX_LB_END: 3219 case URX_LB_CONT: 3220 case URX_LBN_CONT: 3221 case URX_LBN_END: 3222 break; 3223 3224 3225 // Ops that increase that cause an unbounded increase in the length 3226 // of a matched string, or that increase it a hard to characterize way. 3227 // Call the max length unbounded, and stop further checking. 3228 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 3229 case URX_BACKREF_I: 3230 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 3231 currentLen = INT32_MAX; 3232 break; 3233 3234 3235 // Ops that match a max of one character (possibly two 16 bit code units.) 3236 // 3237 case URX_STATIC_SETREF: 3238 case URX_STAT_SETREF_N: 3239 case URX_SETREF: 3240 case URX_BACKSLASH_D: 3241 case URX_ONECHAR_I: 3242 case URX_DOTANY_ALL: 3243 case URX_DOTANY: 3244 case URX_DOTANY_UNIX: 3245 currentLen+=2; 3246 break; 3247 3248 // Single literal character. Increase current max length by one or two, 3249 // depending on whether the char is in the supplementary range. 3250 case URX_ONECHAR: 3251 currentLen++; 3252 if (URX_VAL(op) > 0x10000) { 3253 currentLen++; 3254 } 3255 break; 3256 3257 // Jumps. 3258 // 3259 case URX_JMP: 3260 case URX_JMPX: 3261 case URX_JMP_SAV: 3262 case URX_JMP_SAV_X: 3263 { 3264 int32_t jmpDest = URX_VAL(op); 3265 if (jmpDest < loc) { 3266 // Loop of some kind. Max match length is unbounded. 3267 currentLen = INT32_MAX; 3268 } else { 3269 // Forward jump. Propagate the current min length to the target loc of the jump. 3270 if (forwardedLength.elementAti(jmpDest) < currentLen) { 3271 forwardedLength.setElementAt(currentLen, jmpDest); 3272 } 3273 currentLen = 0; 3274 } 3275 } 3276 break; 3277 3278 case URX_BACKTRACK: 3279 // back-tracks are kind of like a branch, except that the max length was 3280 // propagated already, by the state save. 3281 currentLen = forwardedLength.elementAti(loc+1); 3282 break; 3283 3284 3285 case URX_STATE_SAVE: 3286 { 3287 // State Save, for forward jumps, propagate the current minimum. 3288 // of the state save. 3289 // For backwards jumps, they create a loop, maximum 3290 // match length is unbounded. 3291 int32_t jmpDest = URX_VAL(op); 3292 if (jmpDest > loc) { 3293 if (currentLen > forwardedLength.elementAti(jmpDest)) { 3294 forwardedLength.setElementAt(currentLen, jmpDest); 3295 } 3296 } else { 3297 currentLen = INT32_MAX; 3298 } 3299 } 3300 break; 3301 3302 3303 3304 3305 case URX_STRING: 3306 case URX_STRING_I: 3307 { 3308 loc++; 3309 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3310 currentLen += URX_VAL(stringLenOp); 3311 } 3312 break; 3313 3314 3315 case URX_CTR_INIT: 3316 case URX_CTR_INIT_NG: 3317 case URX_CTR_LOOP: 3318 case URX_CTR_LOOP_NG: 3319 case URX_LOOP_SR_I: 3320 case URX_LOOP_DOT_I: 3321 case URX_LOOP_C: 3322 // For anything to do with loops, make the match length unbounded. 3323 // Note: INIT instructions are multi-word. Can ignore because 3324 // INT32_MAX length will stop the per-instruction loop. 3325 currentLen = INT32_MAX; 3326 break; 3327 3328 3329 3330 case URX_LA_START: 3331 case URX_LA_END: 3332 // Look-ahead. Just ignore, treat the look-ahead block as if 3333 // it were normal pattern. Gives a too-long match length, 3334 // but good enough for now. 3335 break; 3336 3337 // End of look-ahead ops should always be consumed by the processing at 3338 // the URX_LA_START op. 3339 // U_ASSERT(FALSE); 3340 // break; 3341 3342 case URX_LB_START: 3343 { 3344 // Look-behind. Scan forward until the matching look-around end, 3345 // without processing the look-behind block. 3346 int32_t depth = 0; 3347 for (;;) { 3348 loc++; 3349 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3350 if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) { 3351 depth++; 3352 } 3353 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) { 3354 if (depth == 0) { 3355 break; 3356 } 3357 depth--; 3358 } 3359 U_ASSERT(loc < end); 3360 } 3361 } 3362 break; 3363 3364 default: 3365 U_ASSERT(FALSE); 3366 } 3367 3368 3369 if (currentLen == INT32_MAX) { 3370 // The maximum length is unbounded. 3371 // Stop further processing of the pattern. 3372 break; 3373 } 3374 3375 } 3376 return currentLen; 3377 3378 } 3379 3380 3381 //------------------------------------------------------------------------------ 3382 // 3383 // stripNOPs Remove any NOP operations from the compiled pattern code. 3384 // Extra NOPs are inserted for some constructs during the initial 3385 // code generation to provide locations that may be patched later. 3386 // Many end up unneeded, and are removed by this function. 3387 // 3388 // In order to minimize the number of passes through the pattern, 3389 // back-reference fixup is also performed here (adjusting 3390 // back-reference operands to point to the correct frame offsets). 3391 // 3392 // In addition, case-insensitive character and string literals are 3393 // now case-folded here, rather than when first parsed or at match 3394 // time. 3395 // 3396 //------------------------------------------------------------------------------ 3397 void RegexCompile::stripNOPs() { 3398 3399 if (U_FAILURE(*fStatus)) { 3400 return; 3401 } 3402 3403 int32_t end = fRXPat->fCompiledPat->size(); 3404 UVector32 deltas(end, *fStatus); 3405 3406 // Make a first pass over the code, computing the amount that things 3407 // will be offset at each location in the original code. 3408 int32_t loc; 3409 int32_t d = 0; 3410 for (loc=0; loc<end; loc++) { 3411 deltas.addElement(d, *fStatus); 3412 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3413 if (URX_TYPE(op) == URX_NOP) { 3414 d++; 3415 } 3416 } 3417 3418 UnicodeString caseStringBuffer; 3419 int32_t stringDelta = 0; 3420 3421 // Make a second pass over the code, removing the NOPs by moving following 3422 // code up, and patching operands that refer to code locations that 3423 // are being moved. The array of offsets from the first step is used 3424 // to compute the new operand values. 3425 int32_t src; 3426 int32_t dst = 0; 3427 for (src=0; src<end; src++) { 3428 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src); 3429 int32_t opType = URX_TYPE(op); 3430 switch (opType) { 3431 case URX_NOP: 3432 break; 3433 3434 case URX_STATE_SAVE: 3435 case URX_JMP: 3436 case URX_CTR_LOOP: 3437 case URX_CTR_LOOP_NG: 3438 case URX_RELOC_OPRND: 3439 case URX_JMPX: 3440 case URX_JMP_SAV: 3441 case URX_JMP_SAV_X: 3442 // These are instructions with operands that refer to code locations. 3443 { 3444 int32_t operandAddress = URX_VAL(op); 3445 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size()); 3446 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress); 3447 op = URX_BUILD(opType, fixedOperandAddress); 3448 fRXPat->fCompiledPat->setElementAt(op, dst); 3449 dst++; 3450 break; 3451 } 3452 3453 case URX_ONECHAR_I: 3454 { 3455 UChar32 c = URX_VAL(op); 3456 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { 3457 // We have a cased character to fold 3458 c = u_foldCase(c, U_FOLD_CASE_DEFAULT); 3459 op = URX_BUILD(URX_ONECHAR_I, c); 3460 } 3461 3462 fRXPat->fCompiledPat->setElementAt(op, dst); 3463 dst++; 3464 break; 3465 } 3466 case URX_STRING_I: 3467 { 3468 op = URX_BUILD(URX_STRING_I, URX_VAL(op)+stringDelta); 3469 3470 src++; 3471 int32_t lengthOp = (int32_t)fRXPat->fCompiledPat->elementAti(src); 3472 3473 caseStringBuffer.setTo(fRXPat->fLiteralText, URX_VAL(op), URX_VAL(lengthOp)); 3474 caseStringBuffer.foldCase(U_FOLD_CASE_DEFAULT); 3475 3476 int32_t newLen = caseStringBuffer.length(); 3477 if (newLen <= URX_VAL(lengthOp)) { 3478 // don't shift if we don't have to, take the tiny memory hit of a smaller string 3479 fRXPat->fLiteralText.replace(URX_VAL(op), newLen, caseStringBuffer); 3480 } else { 3481 // shift other strings over...at least UnicodeString handles this for us! 3482 fRXPat->fLiteralText.replace(URX_VAL(op), URX_VAL(lengthOp), caseStringBuffer); 3483 stringDelta += newLen - URX_VAL(lengthOp); 3484 } 3485 lengthOp = URX_BUILD(URX_STRING_LEN, newLen); 3486 3487 fRXPat->fCompiledPat->setElementAt(op, dst); 3488 fRXPat->fCompiledPat->setElementAt(lengthOp, dst+1); 3489 dst += 2; 3490 break; 3491 } 3492 case URX_BACKREF: 3493 case URX_BACKREF_I: 3494 { 3495 int32_t where = URX_VAL(op); 3496 if (where > fRXPat->fGroupMap->size()) { 3497 error(U_REGEX_INVALID_BACK_REF); 3498 break; 3499 } 3500 where = fRXPat->fGroupMap->elementAti(where-1); 3501 op = URX_BUILD(opType, where); 3502 fRXPat->fCompiledPat->setElementAt(op, dst); 3503 dst++; 3504 3505 fRXPat->fNeedsAltInput = TRUE; 3506 break; 3507 } 3508 case URX_STRING: 3509 op = URX_BUILD(URX_STRING, URX_VAL(op)+stringDelta); 3510 // continue 3511 case URX_RESERVED_OP: 3512 case URX_RESERVED_OP_N: 3513 case URX_BACKTRACK: 3514 case URX_END: 3515 case URX_ONECHAR: 3516 case URX_STRING_LEN: 3517 case URX_START_CAPTURE: 3518 case URX_END_CAPTURE: 3519 case URX_STATIC_SETREF: 3520 case URX_STAT_SETREF_N: 3521 case URX_SETREF: 3522 case URX_DOTANY: 3523 case URX_FAIL: 3524 case URX_BACKSLASH_B: 3525 case URX_BACKSLASH_BU: 3526 case URX_BACKSLASH_G: 3527 case URX_BACKSLASH_X: 3528 case URX_BACKSLASH_Z: 3529 case URX_DOTANY_ALL: 3530 case URX_BACKSLASH_D: 3531 case URX_CARET: 3532 case URX_DOLLAR: 3533 case URX_CTR_INIT: 3534 case URX_CTR_INIT_NG: 3535 case URX_DOTANY_UNIX: 3536 case URX_STO_SP: 3537 case URX_LD_SP: 3538 case URX_STO_INP_LOC: 3539 case URX_LA_START: 3540 case URX_LA_END: 3541 case URX_DOLLAR_M: 3542 case URX_CARET_M: 3543 case URX_CARET_M_UNIX: 3544 case URX_LB_START: 3545 case URX_LB_CONT: 3546 case URX_LB_END: 3547 case URX_LBN_CONT: 3548 case URX_LBN_END: 3549 case URX_LOOP_SR_I: 3550 case URX_LOOP_DOT_I: 3551 case URX_LOOP_C: 3552 case URX_DOLLAR_D: 3553 case URX_DOLLAR_MD: 3554 // These instructions are unaltered by the relocation. 3555 fRXPat->fCompiledPat->setElementAt(op, dst); 3556 dst++; 3557 break; 3558 3559 default: 3560 // Some op is unaccounted for. 3561 U_ASSERT(FALSE); 3562 error(U_REGEX_INTERNAL_ERROR); 3563 } 3564 } 3565 3566 fRXPat->fCompiledPat->setSize(dst); 3567 } 3568 3569 3570 3571 3572 //------------------------------------------------------------------------------ 3573 // 3574 // Error Report a rule parse error. 3575 // Only report it if no previous error has been recorded. 3576 // 3577 //------------------------------------------------------------------------------ 3578 void RegexCompile::error(UErrorCode e) { 3579 if (U_SUCCESS(*fStatus)) { 3580 *fStatus = e; 3581 // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public 3582 // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are 3583 // int64_t. If the values of the latter are out of range for the former, 3584 // set them to the appropriate "field not supported" values. 3585 if (fLineNum > 0x7FFFFFFF) { 3586 fParseErr->line = 0; 3587 fParseErr->offset = -1; 3588 } else if (fCharNum > 0x7FFFFFFF) { 3589 fParseErr->line = (int32_t)fLineNum; 3590 fParseErr->offset = -1; 3591 } else { 3592 fParseErr->line = (int32_t)fLineNum; 3593 fParseErr->offset = (int32_t)fCharNum; 3594 } 3595 3596 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context 3597 3598 // Fill in the context. 3599 // Note: extractBetween() pins supplied indicies to the string bounds. 3600 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext)); 3601 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext)); 3602 utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status); 3603 utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status); 3604 } 3605 } 3606 3607 3608 // 3609 // Assorted Unicode character constants. 3610 // Numeric because there is no portable way to enter them as literals. 3611 // (Think EBCDIC). 3612 // 3613 static const UChar chCR = 0x0d; // New lines, for terminating comments. 3614 static const UChar chLF = 0x0a; // Line Feed 3615 static const UChar chPound = 0x23; // '#', introduces a comment. 3616 static const UChar chDigit0 = 0x30; // '0' 3617 static const UChar chDigit7 = 0x37; // '9' 3618 static const UChar chColon = 0x3A; // ':' 3619 static const UChar chE = 0x45; // 'E' 3620 static const UChar chQ = 0x51; // 'Q' 3621 static const UChar chN = 0x4E; // 'N' 3622 static const UChar chP = 0x50; // 'P' 3623 static const UChar chBackSlash = 0x5c; // '\' introduces a char escape 3624 static const UChar chLBracket = 0x5b; // '[' 3625 static const UChar chRBracket = 0x5d; // ']' 3626 static const UChar chUp = 0x5e; // '^' 3627 static const UChar chLowerP = 0x70; 3628 static const UChar chLBrace = 0x7b; // '{' 3629 static const UChar chRBrace = 0x7d; // '}' 3630 static const UChar chNEL = 0x85; // NEL newline variant 3631 static const UChar chLS = 0x2028; // Unicode Line Separator 3632 3633 3634 //------------------------------------------------------------------------------ 3635 // 3636 // nextCharLL Low Level Next Char from the regex pattern. 3637 // Get a char from the string, keep track of input position 3638 // for error reporting. 3639 // 3640 //------------------------------------------------------------------------------ 3641 UChar32 RegexCompile::nextCharLL() { 3642 UChar32 ch; 3643 3644 if (fPeekChar != -1) { 3645 ch = fPeekChar; 3646 fPeekChar = -1; 3647 return ch; 3648 } 3649 3650 // assume we're already in the right place 3651 ch = UTEXT_NEXT32(fRXPat->fPattern); 3652 if (ch == U_SENTINEL) { 3653 return ch; 3654 } 3655 3656 if (ch == chCR || 3657 ch == chNEL || 3658 ch == chLS || 3659 ch == chLF && fLastChar != chCR) { 3660 // Character is starting a new line. Bump up the line number, and 3661 // reset the column to 0. 3662 fLineNum++; 3663 fCharNum=0; 3664 } 3665 else { 3666 // Character is not starting a new line. Except in the case of a 3667 // LF following a CR, increment the column position. 3668 if (ch != chLF) { 3669 fCharNum++; 3670 } 3671 } 3672 fLastChar = ch; 3673 return ch; 3674 } 3675 3676 //------------------------------------------------------------------------------ 3677 // 3678 // peekCharLL Low Level Character Scanning, sneak a peek at the next 3679 // character without actually getting it. 3680 // 3681 //------------------------------------------------------------------------------ 3682 UChar32 RegexCompile::peekCharLL() { 3683 if (fPeekChar == -1) { 3684 fPeekChar = nextCharLL(); 3685 } 3686 return fPeekChar; 3687 } 3688 3689 3690 //------------------------------------------------------------------------------ 3691 // 3692 // nextChar for pattern scanning. At this level, we handle stripping 3693 // out comments and processing some backslash character escapes. 3694 // The rest of the pattern grammar is handled at the next level up. 3695 // 3696 //------------------------------------------------------------------------------ 3697 void RegexCompile::nextChar(RegexPatternChar &c) { 3698 3699 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 3700 c.fChar = nextCharLL(); 3701 c.fQuoted = FALSE; 3702 3703 if (fQuoteMode) { 3704 c.fQuoted = TRUE; 3705 if ((c.fChar==chBackSlash && peekCharLL()==chE) || c.fChar == (UChar32)-1) { 3706 fQuoteMode = FALSE; // Exit quote mode, 3707 nextCharLL(); // discard the E 3708 nextChar(c); // recurse to get the real next char 3709 } 3710 } 3711 else if (fInBackslashQuote) { 3712 // The current character immediately follows a '\' 3713 // Don't check for any further escapes, just return it as-is. 3714 // Don't set c.fQuoted, because that would prevent the state machine from 3715 // dispatching on the character. 3716 fInBackslashQuote = FALSE; 3717 } 3718 else 3719 { 3720 // We are not in a \Q quoted region \E of the source. 3721 // 3722 if (fModeFlags & UREGEX_COMMENTS) { 3723 // 3724 // We are in free-spacing and comments mode. 3725 // Scan through any white space and comments, until we 3726 // reach a significant character or the end of inut. 3727 for (;;) { 3728 if (c.fChar == (UChar32)-1) { 3729 break; // End of Input 3730 } 3731 if (c.fChar == chPound && fEOLComments == TRUE) { 3732 // Start of a comment. Consume the rest of it, until EOF or a new line 3733 for (;;) { 3734 c.fChar = nextCharLL(); 3735 if (c.fChar == (UChar32)-1 || // EOF 3736 c.fChar == chCR || 3737 c.fChar == chLF || 3738 c.fChar == chNEL || 3739 c.fChar == chLS) { 3740 break; 3741 } 3742 } 3743 } 3744 // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061. 3745 if (uprv_isRuleWhiteSpace(c.fChar) == FALSE) { 3746 break; 3747 } 3748 c.fChar = nextCharLL(); 3749 } 3750 } 3751 3752 // 3753 // check for backslash escaped characters. 3754 // 3755 if (c.fChar == chBackSlash) { 3756 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 3757 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) { 3758 // 3759 // A '\' sequence that is handled by ICU's standard unescapeAt function. 3760 // Includes \uxxxx, \n, \r, many others. 3761 // Return the single equivalent character. 3762 // 3763 nextCharLL(); // get & discard the peeked char. 3764 c.fQuoted = TRUE; 3765 3766 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) { 3767 int32_t endIndex = (int32_t)pos; 3768 c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents); 3769 3770 if (endIndex == pos) { 3771 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 3772 } 3773 fCharNum += endIndex - pos; 3774 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex); 3775 } else { 3776 int32_t offset = 0; 3777 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern); 3778 3779 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos); 3780 c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); 3781 3782 if (offset == 0) { 3783 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 3784 } else if (context.lastOffset == offset) { 3785 UTEXT_PREVIOUS32(fRXPat->fPattern); 3786 } else if (context.lastOffset != offset-1) { 3787 utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1); 3788 } 3789 fCharNum += offset; 3790 } 3791 } 3792 else if (peekCharLL() == chDigit0) { 3793 // Octal Escape, using Java Regexp Conventions 3794 // which are \0 followed by 1-3 octal digits. 3795 // Different from ICU Unescape handling of Octal, which does not 3796 // require the leading 0. 3797 // Java also has the convention of only consuming 2 octal digits if 3798 // the three digit number would be > 0xff 3799 // 3800 c.fChar = 0; 3801 nextCharLL(); // Consume the initial 0. 3802 int index; 3803 for (index=0; index<3; index++) { 3804 int32_t ch = peekCharLL(); 3805 if (ch<chDigit0 || ch>chDigit7) { 3806 if (index==0) { 3807 // \0 is not followed by any octal digits. 3808 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 3809 } 3810 break; 3811 } 3812 c.fChar <<= 3; 3813 c.fChar += ch&7; 3814 if (c.fChar <= 255) { 3815 nextCharLL(); 3816 } else { 3817 // The last digit made the number too big. Forget we saw it. 3818 c.fChar >>= 3; 3819 } 3820 } 3821 c.fQuoted = TRUE; 3822 } 3823 else if (peekCharLL() == chQ) { 3824 // "\Q" enter quote mode, which will continue until "\E" 3825 fQuoteMode = TRUE; 3826 nextCharLL(); // discard the 'Q'. 3827 nextChar(c); // recurse to get the real next char. 3828 } 3829 else 3830 { 3831 // We are in a '\' escape that will be handled by the state table scanner. 3832 // Just return the backslash, but remember that the following char is to 3833 // be taken literally. 3834 fInBackslashQuote = TRUE; 3835 } 3836 } 3837 } 3838 3839 // re-enable # to end-of-line comments, in case they were disabled. 3840 // They are disabled by the parser upon seeing '(?', but this lasts for 3841 // the fetching of the next character only. 3842 fEOLComments = TRUE; 3843 3844 // putc(c.fChar, stdout); 3845 } 3846 3847 3848 3849 //------------------------------------------------------------------------------ 3850 // 3851 // scanNamedChar 3852 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. 3853 // 3854 // The scan position will be at the 'N'. On return 3855 // the scan position should be just after the '}' 3856 // 3857 // Return the UChar32 3858 // 3859 //------------------------------------------------------------------------------ 3860 UChar32 RegexCompile::scanNamedChar() { 3861 if (U_FAILURE(*fStatus)) { 3862 return 0; 3863 } 3864 3865 nextChar(fC); 3866 if (fC.fChar != chLBrace) { 3867 error(U_REGEX_PROPERTY_SYNTAX); 3868 return 0; 3869 } 3870 3871 UnicodeString charName; 3872 for (;;) { 3873 nextChar(fC); 3874 if (fC.fChar == chRBrace) { 3875 break; 3876 } 3877 if (fC.fChar == -1) { 3878 error(U_REGEX_PROPERTY_SYNTAX); 3879 return 0; 3880 } 3881 charName.append(fC.fChar); 3882 } 3883 3884 char name[100]; 3885 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) || 3886 (uint32_t)charName.length()>=sizeof(name)) { 3887 // All Unicode character names have only invariant characters. 3888 // The API to get a character, given a name, accepts only char *, forcing us to convert, 3889 // which requires this error check 3890 error(U_REGEX_PROPERTY_SYNTAX); 3891 return 0; 3892 } 3893 charName.extract(0, charName.length(), name, sizeof(name), US_INV); 3894 3895 UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus); 3896 if (U_FAILURE(*fStatus)) { 3897 error(U_REGEX_PROPERTY_SYNTAX); 3898 } 3899 3900 nextChar(fC); // Continue overall regex pattern processing with char after the '}' 3901 return theChar; 3902 } 3903 3904 //------------------------------------------------------------------------------ 3905 // 3906 // scanProp Construct a UnicodeSet from the text at the current scan 3907 // position, which will be of the form \p{whaterver} 3908 // 3909 // The scan position will be at the 'p' or 'P'. On return 3910 // the scan position should be just after the '}' 3911 // 3912 // Return a UnicodeSet, constructed from the \P pattern, 3913 // or NULL if the pattern is invalid. 3914 // 3915 //------------------------------------------------------------------------------ 3916 UnicodeSet *RegexCompile::scanProp() { 3917 UnicodeSet *uset = NULL; 3918 3919 if (U_FAILURE(*fStatus)) { 3920 return NULL; 3921 } 3922 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP); 3923 UBool negated = (fC.fChar == chP); 3924 3925 UnicodeString propertyName; 3926 nextChar(fC); 3927 if (fC.fChar != chLBrace) { 3928 error(U_REGEX_PROPERTY_SYNTAX); 3929 return NULL; 3930 } 3931 for (;;) { 3932 nextChar(fC); 3933 if (fC.fChar == chRBrace) { 3934 break; 3935 } 3936 if (fC.fChar == -1) { 3937 // Hit the end of the input string without finding the closing '}' 3938 error(U_REGEX_PROPERTY_SYNTAX); 3939 return NULL; 3940 } 3941 propertyName.append(fC.fChar); 3942 } 3943 uset = createSetForProperty(propertyName, negated); 3944 nextChar(fC); // Move input scan to position following the closing '}' 3945 return uset; 3946 } 3947 3948 //------------------------------------------------------------------------------ 3949 // 3950 // scanPosixProp Construct a UnicodeSet from the text at the current scan 3951 // position, which is expected be of the form [:property expression:] 3952 // 3953 // The scan position will be at the opening ':'. On return 3954 // the scan position must be on the closing ']' 3955 // 3956 // Return a UnicodeSet constructed from the pattern, 3957 // or NULL if this is not a valid POSIX-style set expression. 3958 // If not a property expression, restore the initial scan position 3959 // (to the opening ':') 3960 // 3961 // Note: the opening '[:' is not sufficient to guarantee that 3962 // this is a [:property:] expression. 3963 // [:'+=,] is a perfectly good ordinary set expression that 3964 // happens to include ':' as one of its characters. 3965 // 3966 //------------------------------------------------------------------------------ 3967 UnicodeSet *RegexCompile::scanPosixProp() { 3968 UnicodeSet *uset = NULL; 3969 3970 if (U_FAILURE(*fStatus)) { 3971 return NULL; 3972 } 3973 3974 U_ASSERT(fC.fChar == chColon); 3975 3976 // Save the scanner state. 3977 // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062 3978 int64_t savedScanIndex = fScanIndex; 3979 int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 3980 UBool savedQuoteMode = fQuoteMode; 3981 UBool savedInBackslashQuote = fInBackslashQuote; 3982 UBool savedEOLComments = fEOLComments; 3983 int64_t savedLineNum = fLineNum; 3984 int64_t savedCharNum = fCharNum; 3985 UChar32 savedLastChar = fLastChar; 3986 UChar32 savedPeekChar = fPeekChar; 3987 RegexPatternChar savedfC = fC; 3988 3989 // Scan for a closing ]. A little tricky because there are some perverse 3990 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression, 3991 // ending on the second closing ]. 3992 3993 UnicodeString propName; 3994 UBool negated = FALSE; 3995 3996 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:] 3997 nextChar(fC); 3998 if (fC.fChar == chUp) { 3999 negated = TRUE; 4000 nextChar(fC); 4001 } 4002 4003 // Scan for the closing ":]", collecting the property name along the way. 4004 UBool sawPropSetTerminator = FALSE; 4005 for (;;) { 4006 propName.append(fC.fChar); 4007 nextChar(fC); 4008 if (fC.fQuoted || fC.fChar == -1) { 4009 // Escaped characters or end of input - either says this isn't a [:Property:] 4010 break; 4011 } 4012 if (fC.fChar == chColon) { 4013 nextChar(fC); 4014 if (fC.fChar == chRBracket) { 4015 sawPropSetTerminator = TRUE; 4016 } 4017 break; 4018 } 4019 } 4020 4021 if (sawPropSetTerminator) { 4022 uset = createSetForProperty(propName, negated); 4023 } 4024 else 4025 { 4026 // No closing ":]". 4027 // Restore the original scan position. 4028 // The main scanner will retry the input as a normal set expression, 4029 // not a [:Property:] expression. 4030 fScanIndex = savedScanIndex; 4031 fQuoteMode = savedQuoteMode; 4032 fInBackslashQuote = savedInBackslashQuote; 4033 fEOLComments = savedEOLComments; 4034 fLineNum = savedLineNum; 4035 fCharNum = savedCharNum; 4036 fLastChar = savedLastChar; 4037 fPeekChar = savedPeekChar; 4038 fC = savedfC; 4039 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex); 4040 } 4041 return uset; 4042 } 4043 4044 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) { 4045 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f); 4046 addCategory(set, U_GC_CF_MASK, ec); 4047 } 4048 4049 // 4050 // Create a Unicode Set from a Unicode Property expression. 4051 // This is common code underlying both \p{...} ane [:...:] expressions. 4052 // Includes trying the Java "properties" that aren't supported as 4053 // normal ICU UnicodeSet properties 4054 // 4055 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{" 4056 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{" 4057 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) { 4058 UnicodeString setExpr; 4059 UnicodeSet *set; 4060 uint32_t usetFlags = 0; 4061 4062 if (U_FAILURE(*fStatus)) { 4063 return NULL; 4064 } 4065 4066 // 4067 // First try the property as we received it 4068 // 4069 if (negated) { 4070 setExpr.append(negSetPrefix, -1); 4071 } else { 4072 setExpr.append(posSetPrefix, -1); 4073 } 4074 setExpr.append(propName); 4075 setExpr.append(chRBrace); 4076 setExpr.append(chRBracket); 4077 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 4078 usetFlags |= USET_CASE_INSENSITIVE; 4079 } 4080 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); 4081 if (U_SUCCESS(*fStatus)) { 4082 return set; 4083 } 4084 delete set; 4085 set = NULL; 4086 4087 // 4088 // The property as it was didn't work. 4089 // Do emergency fixes - 4090 // InGreek -> InGreek or Coptic, that being the official Unicode name for that block. 4091 // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols. 4092 // 4093 // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols" 4094 // is accepted by Java. The property part of the name is compared 4095 // case-insenstively. The spaces must be exactly as shown, either 4096 // all there, or all omitted, with exactly one at each position 4097 // if they are present. From checking against JDK 1.6 4098 // 4099 // This code should be removed when ICU properties support the Java compatibility names 4100 // (ICU 4.0?) 4101 // 4102 UnicodeString mPropName = propName; 4103 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) { 4104 mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic"); 4105 } 4106 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 || 4107 mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) { 4108 mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols"); 4109 } 4110 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { 4111 mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint"); 4112 } 4113 4114 // See if the property looks like a Java "InBlockName", which 4115 // we will recast as "Block=BlockName" 4116 // 4117 static const UChar IN[] = {0x49, 0x6E, 0}; // "In" 4118 static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // "Block=" 4119 if (mPropName.startsWith(IN, 2) && propName.length()>=3) { 4120 setExpr.truncate(4); // Leaves "[\p{", or "[\P{" 4121 setExpr.append(BLOCK, -1); 4122 setExpr.append(UnicodeString(mPropName, 2)); // Property with the leading "In" removed. 4123 setExpr.append(chRBrace); 4124 setExpr.append(chRBracket); 4125 *fStatus = U_ZERO_ERROR; 4126 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); 4127 if (U_SUCCESS(*fStatus)) { 4128 return set; 4129 } 4130 delete set; 4131 set = NULL; 4132 } 4133 4134 if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) || 4135 propName.compare(UNICODE_STRING_SIMPLE("all")) == 0) 4136 { 4137 UErrorCode localStatus = U_ZERO_ERROR; 4138 //setExpr.remove(); 4139 set = new UnicodeSet(); 4140 // 4141 // Try the various Java specific properties. 4142 // These all begin with "java" 4143 // 4144 if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) { 4145 addCategory(set, U_GC_CN_MASK, localStatus); 4146 set->complement(); 4147 } 4148 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) { 4149 addCategory(set, U_GC_ND_MASK, localStatus); 4150 } 4151 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) { 4152 addIdentifierIgnorable(set, localStatus); 4153 } 4154 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) { 4155 set->add(0, 0x1F).add(0x7F, 0x9F); 4156 } 4157 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) { 4158 addCategory(set, U_GC_L_MASK, localStatus); 4159 addCategory(set, U_GC_SC_MASK, localStatus); 4160 addCategory(set, U_GC_PC_MASK, localStatus); 4161 addCategory(set, U_GC_ND_MASK, localStatus); 4162 addCategory(set, U_GC_NL_MASK, localStatus); 4163 addCategory(set, U_GC_MC_MASK, localStatus); 4164 addCategory(set, U_GC_MN_MASK, localStatus); 4165 addIdentifierIgnorable(set, localStatus); 4166 } 4167 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) { 4168 addCategory(set, U_GC_L_MASK, localStatus); 4169 addCategory(set, U_GC_NL_MASK, localStatus); 4170 addCategory(set, U_GC_SC_MASK, localStatus); 4171 addCategory(set, U_GC_PC_MASK, localStatus); 4172 } 4173 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) { 4174 addCategory(set, U_GC_L_MASK, localStatus); 4175 } 4176 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) { 4177 addCategory(set, U_GC_L_MASK, localStatus); 4178 addCategory(set, U_GC_ND_MASK, localStatus); 4179 } 4180 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) { 4181 addCategory(set, U_GC_LL_MASK, localStatus); 4182 } 4183 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) { 4184 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus); 4185 } 4186 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) { 4187 addCategory(set, U_GC_Z_MASK, localStatus); 4188 } 4189 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) { 4190 set->add(0x10000, UnicodeSet::MAX_VALUE); 4191 } 4192 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) { 4193 addCategory(set, U_GC_LT_MASK, localStatus); 4194 } 4195 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) { 4196 addCategory(set, U_GC_L_MASK, localStatus); 4197 addCategory(set, U_GC_NL_MASK, localStatus); 4198 } 4199 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) { 4200 addCategory(set, U_GC_L_MASK, localStatus); 4201 addCategory(set, U_GC_PC_MASK, localStatus); 4202 addCategory(set, U_GC_ND_MASK, localStatus); 4203 addCategory(set, U_GC_NL_MASK, localStatus); 4204 addCategory(set, U_GC_MC_MASK, localStatus); 4205 addCategory(set, U_GC_MN_MASK, localStatus); 4206 addIdentifierIgnorable(set, localStatus); 4207 } 4208 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) { 4209 addCategory(set, U_GC_LU_MASK, localStatus); 4210 } 4211 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) { 4212 set->add(0, UnicodeSet::MAX_VALUE); 4213 } 4214 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) { 4215 addCategory(set, U_GC_Z_MASK, localStatus); 4216 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f)); 4217 set->add(9, 0x0d).add(0x1c, 0x1f); 4218 } 4219 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { 4220 set->add(0, UnicodeSet::MAX_VALUE); 4221 } 4222 4223 if (U_SUCCESS(localStatus) && !set->isEmpty()) { 4224 *fStatus = U_ZERO_ERROR; 4225 if (usetFlags & USET_CASE_INSENSITIVE) { 4226 set->closeOver(USET_CASE_INSENSITIVE); 4227 } 4228 if (negated) { 4229 set->complement(); 4230 } 4231 return set; 4232 } 4233 delete set; 4234 set = NULL; 4235 } 4236 error(*fStatus); 4237 return NULL; 4238 } 4239 4240 4241 4242 // 4243 // SetEval Part of the evaluation of [set expressions]. 4244 // Perform any pending (stacked) operations with precedence 4245 // equal or greater to that of the next operator encountered 4246 // in the expression. 4247 // 4248 void RegexCompile::setEval(int32_t nextOp) { 4249 UnicodeSet *rightOperand = NULL; 4250 UnicodeSet *leftOperand = NULL; 4251 for (;;) { 4252 U_ASSERT(fSetOpStack.empty()==FALSE); 4253 int32_t pendingSetOperation = fSetOpStack.peeki(); 4254 if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) { 4255 break; 4256 } 4257 fSetOpStack.popi(); 4258 U_ASSERT(fSetStack.empty() == FALSE); 4259 rightOperand = (UnicodeSet *)fSetStack.peek(); 4260 switch (pendingSetOperation) { 4261 case setNegation: 4262 rightOperand->complement(); 4263 break; 4264 case setCaseClose: 4265 // TODO: need a simple close function. Ticket 6065 4266 rightOperand->closeOver(USET_CASE_INSENSITIVE); 4267 rightOperand->removeAllStrings(); 4268 break; 4269 case setDifference1: 4270 case setDifference2: 4271 fSetStack.pop(); 4272 leftOperand = (UnicodeSet *)fSetStack.peek(); 4273 leftOperand->removeAll(*rightOperand); 4274 delete rightOperand; 4275 break; 4276 case setIntersection1: 4277 case setIntersection2: 4278 fSetStack.pop(); 4279 leftOperand = (UnicodeSet *)fSetStack.peek(); 4280 leftOperand->retainAll(*rightOperand); 4281 delete rightOperand; 4282 break; 4283 case setUnion: 4284 fSetStack.pop(); 4285 leftOperand = (UnicodeSet *)fSetStack.peek(); 4286 leftOperand->addAll(*rightOperand); 4287 delete rightOperand; 4288 break; 4289 default: 4290 U_ASSERT(FALSE); 4291 break; 4292 } 4293 } 4294 } 4295 4296 void RegexCompile::setPushOp(int32_t op) { 4297 setEval(op); 4298 fSetOpStack.push(op, *fStatus); 4299 fSetStack.push(new UnicodeSet(), *fStatus); 4300 } 4301 4302 U_NAMESPACE_END 4303 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 4304 4305