1 // 2 // file: regexcmp.cpp 3 // 4 // Copyright (C) 2002-2010 International Business Machines Corporation and others. 5 // All Rights Reserved. 6 // 7 // This file contains the ICU regular expression compiler, which is responsible 8 // for processing a regular expression pattern into the compiled form that 9 // is used by the match finding engine. 10 // 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 15 16 #include "unicode/ustring.h" 17 #include "unicode/unistr.h" 18 #include "unicode/uniset.h" 19 #include "unicode/uchar.h" 20 #include "unicode/uchriter.h" 21 #include "unicode/parsepos.h" 22 #include "unicode/parseerr.h" 23 #include "unicode/regex.h" 24 #include "util.h" 25 #include "putilimp.h" 26 #include "cmemory.h" 27 #include "cstring.h" 28 #include "uvectr32.h" 29 #include "uvectr64.h" 30 #include "uassert.h" 31 #include "ucln_in.h" 32 #include "uinvchar.h" 33 34 #include "regeximp.h" 35 #include "regexcst.h" // Contains state table for the regex pattern parser. 36 // generated by a Perl script. 37 #include "regexcmp.h" 38 #include "regexst.h" 39 #include "regextxt.h" 40 41 42 43 U_NAMESPACE_BEGIN 44 45 46 //------------------------------------------------------------------------------ 47 // 48 // Constructor. 49 // 50 //------------------------------------------------------------------------------ 51 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : 52 fParenStack(status), fSetStack(status), fSetOpStack(status) 53 { 54 // Lazy init of all shared global sets (needed for init()'s empty text) 55 RegexStaticSets::initGlobals(&status); 56 57 fStatus = &status; 58 59 fRXPat = rxp; 60 fScanIndex = 0; 61 fLastChar = -1; 62 fPeekChar = -1; 63 fLineNum = 1; 64 fCharNum = 0; 65 fQuoteMode = FALSE; 66 fInBackslashQuote = FALSE; 67 fModeFlags = fRXPat->fFlags | 0x80000000; 68 fEOLComments = TRUE; 69 70 fMatchOpenParen = -1; 71 fMatchCloseParen = -1; 72 fStringOpStart = -1; 73 74 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) { 75 status = rxp->fDeferredStatus; 76 } 77 } 78 79 static const UChar chAmp = 0x26; // '&' 80 static const UChar chDash = 0x2d; // '-' 81 82 83 //------------------------------------------------------------------------------ 84 // 85 // Destructor 86 // 87 //------------------------------------------------------------------------------ 88 RegexCompile::~RegexCompile() { 89 } 90 91 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) { 92 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec)); 93 } 94 95 //------------------------------------------------------------------------------ 96 // 97 // Compile regex pattern. The state machine for rexexp pattern parsing is here. 98 // The state tables are hand-written in the file regexcst.txt, 99 // and converted to the form used here by a perl 100 // script regexcst.pl 101 // 102 //------------------------------------------------------------------------------ 103 void RegexCompile::compile( 104 const UnicodeString &pat, // Source pat to be compiled. 105 UParseError &pp, // Error position info 106 UErrorCode &e) // Error Code 107 { 108 fRXPat->fPatternString = new UnicodeString(pat); 109 UText patternText = UTEXT_INITIALIZER; 110 utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e); 111 112 if (U_SUCCESS(e)) { 113 compile(&patternText, pp, e); 114 utext_close(&patternText); 115 } 116 } 117 118 // 119 // compile, UText mode 120 // All the work is actually done here. 121 // 122 void RegexCompile::compile( 123 UText *pat, // Source pat to be compiled. 124 UParseError &pp, // Error position info 125 UErrorCode &e) // Error Code 126 { 127 fStatus = &e; 128 fParseErr = &pp; 129 fStackPtr = 0; 130 fStack[fStackPtr] = 0; 131 132 if (U_FAILURE(*fStatus)) { 133 return; 134 } 135 136 // There should be no pattern stuff in the RegexPattern object. They can not be reused. 137 U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0); 138 139 // Prepare the RegexPattern object to receive the compiled pattern. 140 fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus); 141 fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets; 142 fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8; 143 144 145 // Initialize the pattern scanning state machine 146 fPatternLength = utext_nativeLength(pat); 147 uint16_t state = 1; 148 const RegexTableEl *tableEl; 149 nextChar(fC); // Fetch the first char from the pattern string. 150 151 // 152 // Main loop for the regex pattern parsing state machine. 153 // Runs once per state transition. 154 // Each time through optionally performs, depending on the state table, 155 // - an advance to the the next pattern char 156 // - an action to be performed. 157 // - pushing or popping a state to/from the local state return stack. 158 // file regexcst.txt is the source for the state table. The logic behind 159 // recongizing the pattern syntax is there, not here. 160 // 161 for (;;) { 162 // Bail out if anything has gone wrong. 163 // Regex pattern parsing stops on the first error encountered. 164 if (U_FAILURE(*fStatus)) { 165 break; 166 } 167 168 U_ASSERT(state != 0); 169 170 // Find the state table element that matches the input char from the pattern, or the 171 // class of the input character. Start with the first table row for this 172 // state, then linearly scan forward until we find a row that matches the 173 // character. The last row for each state always matches all characters, so 174 // the search will stop there, if not before. 175 // 176 tableEl = &gRuleParseStateTable[state]; 177 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ", 178 fC.fChar, fLineNum, fCharNum, RegexStateNames[state])); 179 180 for (;;) { // loop through table rows belonging to this state, looking for one 181 // that matches the current input char. 182 REGEX_SCAN_DEBUG_PRINTF((".")); 183 if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) { 184 // Table row specified an individual character, not a set, and 185 // the input character is not quoted, and 186 // the input character matched it. 187 break; 188 } 189 if (tableEl->fCharClass == 255) { 190 // Table row specified default, match anything character class. 191 break; 192 } 193 if (tableEl->fCharClass == 254 && fC.fQuoted) { 194 // Table row specified "quoted" and the char was quoted. 195 break; 196 } 197 if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) { 198 // Table row specified eof and we hit eof on the input. 199 break; 200 } 201 202 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class && 203 fC.fQuoted == FALSE && // char is not escaped && 204 fC.fChar != (UChar32)-1) { // char is not EOF 205 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) { 206 // Table row specified a character class, or set of characters, 207 // and the current char matches it. 208 break; 209 } 210 } 211 212 // No match on this row, advance to the next row for this state, 213 tableEl++; 214 } 215 REGEX_SCAN_DEBUG_PRINTF(("\n")); 216 217 // 218 // We've found the row of the state table that matches the current input 219 // character from the rules string. 220 // Perform any action specified by this row in the state table. 221 if (doParseActions(tableEl->fAction) == FALSE) { 222 // Break out of the state machine loop if the 223 // the action signalled some kind of error, or 224 // the action was to exit, occurs on normal end-of-rules-input. 225 break; 226 } 227 228 if (tableEl->fPushState != 0) { 229 fStackPtr++; 230 if (fStackPtr >= kStackSize) { 231 error(U_REGEX_INTERNAL_ERROR); 232 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n")); 233 fStackPtr--; 234 } 235 fStack[fStackPtr] = tableEl->fPushState; 236 } 237 238 // 239 // NextChar. This is where characters are actually fetched from the pattern. 240 // Happens under control of the 'n' tag in the state table. 241 // 242 if (tableEl->fNextChar) { 243 nextChar(fC); 244 } 245 246 // Get the next state from the table entry, or from the 247 // state stack if the next state was specified as "pop". 248 if (tableEl->fNextState != 255) { 249 state = tableEl->fNextState; 250 } else { 251 state = fStack[fStackPtr]; 252 fStackPtr--; 253 if (fStackPtr < 0) { 254 // state stack underflow 255 // This will occur if the user pattern has mis-matched parentheses, 256 // with extra close parens. 257 // 258 fStackPtr++; 259 error(U_REGEX_MISMATCHED_PAREN); 260 } 261 } 262 263 } 264 265 if (U_FAILURE(*fStatus)) { 266 // Bail out if the pattern had errors. 267 // Set stack cleanup: a successful compile would have left it empty, 268 // but errors can leave temporary sets hanging around. 269 while (!fSetStack.empty()) { 270 delete (UnicodeSet *)fSetStack.pop(); 271 } 272 return; 273 } 274 275 // 276 // The pattern has now been read and processed, and the compiled code generated. 277 // 278 279 // 280 // Compute the number of digits requried for the largest capture group number. 281 // 282 fRXPat->fMaxCaptureDigits = 1; 283 int32_t n = 10; 284 int32_t groupCount = fRXPat->fGroupMap->size(); 285 while (n <= groupCount) { 286 fRXPat->fMaxCaptureDigits++; 287 n *= 10; 288 } 289 290 // 291 // The pattern's fFrameSize so far has accumulated the requirements for 292 // storage for capture parentheses, counters, etc. that are encountered 293 // in the pattern. Add space for the two variables that are always 294 // present in the saved state: the input string position (int64_t) and 295 // the position in the compiled pattern. 296 // 297 fRXPat->fFrameSize+=RESTACKFRAME_HDRCOUNT; 298 299 // 300 // Optimization pass 1: NOPs, back-references, and case-folding 301 // 302 stripNOPs(); 303 304 // 305 // Get bounds for the minimum and maximum length of a string that this 306 // pattern can match. Used to avoid looking for matches in strings that 307 // are too short. 308 // 309 fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1); 310 311 // 312 // Optimization pass 2: match start type 313 // 314 matchStartType(); 315 316 // 317 // Set up fast latin-1 range sets 318 // 319 int32_t numSets = fRXPat->fSets->size(); 320 fRXPat->fSets8 = new Regex8BitSet[numSets]; 321 // Null pointer check. 322 if (fRXPat->fSets8 == NULL) { 323 e = *fStatus = U_MEMORY_ALLOCATION_ERROR; 324 return; 325 } 326 int32_t i; 327 for (i=0; i<numSets; i++) { 328 UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i); 329 fRXPat->fSets8[i].init(s); 330 } 331 332 } 333 334 335 336 337 338 //------------------------------------------------------------------------------ 339 // 340 // doParseAction Do some action during regex pattern parsing. 341 // Called by the parse state machine. 342 // 343 // Generation of the match engine PCode happens here, or 344 // in functions called from the parse actions defined here. 345 // 346 // 347 //------------------------------------------------------------------------------ 348 UBool RegexCompile::doParseActions(int32_t action) 349 { 350 UBool returnVal = TRUE; 351 352 switch ((Regex_PatternParseAction)action) { 353 354 case doPatStart: 355 // Start of pattern compiles to: 356 //0 SAVE 2 Fall back to position of FAIL 357 //1 jmp 3 358 //2 FAIL Stop if we ever reach here. 359 //3 NOP Dummy, so start of pattern looks the same as 360 // the start of an ( grouping. 361 //4 NOP Resreved, will be replaced by a save if there are 362 // OR | operators at the top level 363 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus); 364 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus); 365 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus); 366 367 // Standard open nonCapture paren action emits the two NOPs and 368 // sets up the paren stack frame. 369 doParseActions(doOpenNonCaptureParen); 370 break; 371 372 case doPatFinish: 373 // We've scanned to the end of the pattern 374 // The end of pattern compiles to: 375 // URX_END 376 // which will stop the runtime match engine. 377 // Encountering end of pattern also behaves like a close paren, 378 // and forces fixups of the State Save at the beginning of the compiled pattern 379 // and of any OR operations at the top level. 380 // 381 handleCloseParen(); 382 if (fParenStack.size() > 0) { 383 // Missing close paren in pattern. 384 error(U_REGEX_MISMATCHED_PAREN); 385 } 386 387 // add the END operation to the compiled pattern. 388 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus); 389 390 // Terminate the pattern compilation state machine. 391 returnVal = FALSE; 392 break; 393 394 395 396 case doOrOperator: 397 // Scanning a '|', as in (A|B) 398 { 399 // Insert a SAVE operation at the start of the pattern section preceding 400 // this OR at this level. This SAVE will branch the match forward 401 // to the right hand side of the OR in the event that the left hand 402 // side fails to match and backtracks. Locate the position for the 403 // save from the location on the top of the parentheses stack. 404 int32_t savePosition = fParenStack.popi(); 405 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition); 406 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location 407 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); 408 fRXPat->fCompiledPat->setElementAt(op, savePosition); 409 410 // Append an JMP operation into the compiled pattern. The operand for 411 // the JMP will eventually be the location following the ')' for the 412 // group. This will be patched in later, when the ')' is encountered. 413 op = URX_BUILD(URX_JMP, 0); 414 fRXPat->fCompiledPat->addElement(op, *fStatus); 415 416 // Push the position of the newly added JMP op onto the parentheses stack. 417 // This registers if for fixup when this block's close paren is encountered. 418 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); 419 420 // Append a NOP to the compiled pattern. This is the slot reserved 421 // for a SAVE in the event that there is yet another '|' following 422 // this one. 423 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 424 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); 425 } 426 break; 427 428 429 case doOpenCaptureParen: 430 // Open Paren. 431 // Compile to a 432 // - NOP, which later may be replaced by a save-state if the 433 // parenthesized group gets a * quantifier, followed by 434 // - START_CAPTURE n where n is stack frame offset to the capture group variables. 435 // - NOP, which may later be replaced by a save-state if there 436 // is an '|' alternation within the parens. 437 // 438 // Each capture group gets three slots in the save stack frame: 439 // 0: Capture Group start position (in input string being matched.) 440 // 1: Capture Group end position. 441 // 2: Start of Match-in-progress. 442 // The first two locations are for a completed capture group, and are 443 // referred to by back references and the like. 444 // The third location stores the capture start position when an START_CAPTURE is 445 // encountered. This will be promoted to a completed capture when (and if) the corresponding 446 // END_CAPTURE is encountered. 447 { 448 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 449 int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots in match stack frame. 450 fRXPat->fFrameSize += 3; 451 int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc); 452 fRXPat->fCompiledPat->addElement(cop, *fStatus); 453 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 454 455 // On the Parentheses stack, start a new frame and add the postions 456 // of the two NOPs. Depending on what follows in the pattern, the 457 // NOPs may be changed to SAVE_STATE or JMP ops, with a target 458 // address of the end of the parenthesized group. 459 fParenStack.push(fModeFlags, *fStatus); // Match mode state 460 fParenStack.push(capturing, *fStatus); // Frame type. 461 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location 462 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc 463 464 // Save the mapping from group number to stack frame variable position. 465 fRXPat->fGroupMap->addElement(varsLoc, *fStatus); 466 } 467 break; 468 469 case doOpenNonCaptureParen: 470 // Open non-caputuring (grouping only) Paren. 471 // Compile to a 472 // - NOP, which later may be replaced by a save-state if the 473 // parenthesized group gets a * quantifier, followed by 474 // - NOP, which may later be replaced by a save-state if there 475 // is an '|' alternation within the parens. 476 { 477 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 478 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 479 480 // On the Parentheses stack, start a new frame and add the postions 481 // of the two NOPs. 482 fParenStack.push(fModeFlags, *fStatus); // Match mode state 483 fParenStack.push(plain, *fStatus); // Begin a new frame. 484 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 485 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc 486 } 487 break; 488 489 490 case doOpenAtomicParen: 491 // Open Atomic Paren. (?> 492 // Compile to a 493 // - NOP, which later may be replaced if the parenthesized group 494 // has a quantifier, followed by 495 // - STO_SP save state stack position, so it can be restored at the ")" 496 // - NOP, which may later be replaced by a save-state if there 497 // is an '|' alternation within the parens. 498 { 499 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 500 int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the 501 fRXPat->fDataSize += 1; // state stack ptr. 502 int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc); 503 fRXPat->fCompiledPat->addElement(stoOp, *fStatus); 504 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 505 506 // On the Parentheses stack, start a new frame and add the postions 507 // of the two NOPs. Depending on what follows in the pattern, the 508 // NOPs may be changed to SAVE_STATE or JMP ops, with a target 509 // address of the end of the parenthesized group. 510 fParenStack.push(fModeFlags, *fStatus); // Match mode state 511 fParenStack.push(atomic, *fStatus); // Frame type. 512 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP 513 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP 514 } 515 break; 516 517 518 case doOpenLookAhead: 519 // Positive Look-ahead (?= stuff ) 520 // 521 // Note: Addition of transparent input regions, with the need to 522 // restore the original regions when failing out of a lookahead 523 // block, complicated this sequence. Some conbined opcodes 524 // might make sense - or might not, lookahead aren't that common. 525 // 526 // Caution: min match length optimization knows about this 527 // sequence; don't change without making updates there too. 528 // 529 // Compiles to 530 // 1 START_LA dataLoc Saves SP, Input Pos 531 // 2. STATE_SAVE 4 on failure of lookahead, goto 4 532 // 3 JMP 6 continue ... 533 // 534 // 4. LA_END Look Ahead failed. Restore regions. 535 // 5. BACKTRACK and back track again. 536 // 537 // 6. NOP reserved for use by quantifiers on the block. 538 // Look-ahead can't have quantifiers, but paren stack 539 // compile time conventions require the slot anyhow. 540 // 7. NOP may be replaced if there is are '|' ops in the block. 541 // 8. code for parenthesized stuff. 542 // 9. LA_END 543 // 544 // Two data slots are reserved, for saving the stack ptr and the input position. 545 { 546 int32_t dataLoc = fRXPat->fDataSize; 547 fRXPat->fDataSize += 2; 548 int32_t op = URX_BUILD(URX_LA_START, dataLoc); 549 fRXPat->fCompiledPat->addElement(op, *fStatus); 550 551 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); 552 fRXPat->fCompiledPat->addElement(op, *fStatus); 553 554 op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3); 555 fRXPat->fCompiledPat->addElement(op, *fStatus); 556 557 op = URX_BUILD(URX_LA_END, dataLoc); 558 fRXPat->fCompiledPat->addElement(op, *fStatus); 559 560 op = URX_BUILD(URX_BACKTRACK, 0); 561 fRXPat->fCompiledPat->addElement(op, *fStatus); 562 563 op = URX_BUILD(URX_NOP, 0); 564 fRXPat->fCompiledPat->addElement(op, *fStatus); 565 fRXPat->fCompiledPat->addElement(op, *fStatus); 566 567 // On the Parentheses stack, start a new frame and add the postions 568 // of the NOPs. 569 fParenStack.push(fModeFlags, *fStatus); // Match mode state 570 fParenStack.push(lookAhead, *fStatus); // Frame type. 571 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 572 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location 573 } 574 break; 575 576 case doOpenLookAheadNeg: 577 // Negated Lookahead. (?! stuff ) 578 // Compiles to 579 // 1. START_LA dataloc 580 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state, 581 // // which continues with the match. 582 // 3. NOP // Std. Open Paren sequence, for possible '|' 583 // 4. code for parenthesized stuff. 584 // 5. END_LA // Cut back stack, remove saved state from step 2. 585 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails. 586 // 7. END_LA // Restore match region, in case look-ahead was using 587 // an alternate (transparent) region. 588 { 589 int32_t dataLoc = fRXPat->fDataSize; 590 fRXPat->fDataSize += 2; 591 int32_t op = URX_BUILD(URX_LA_START, dataLoc); 592 fRXPat->fCompiledPat->addElement(op, *fStatus); 593 594 op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patched later. 595 fRXPat->fCompiledPat->addElement(op, *fStatus); 596 597 op = URX_BUILD(URX_NOP, 0); 598 fRXPat->fCompiledPat->addElement(op, *fStatus); 599 600 // On the Parentheses stack, start a new frame and add the postions 601 // of the StateSave and NOP. 602 fParenStack.push(fModeFlags, *fStatus); // Match mode state 603 fParenStack.push(negLookAhead, *fStatus); // Frame type 604 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location 605 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location 606 607 // Instructions #5 - #7 will be added when the ')' is encountered. 608 } 609 break; 610 611 case doOpenLookBehind: 612 { 613 // Compile a (?<= look-behind open paren. 614 // 615 // Compiles to 616 // 0 URX_LB_START dataLoc 617 // 1 URX_LB_CONT dataLoc 618 // 2 MinMatchLen 619 // 3 MaxMatchLen 620 // 4 URX_NOP Standard '(' boilerplate. 621 // 5 URX_NOP Reserved slot for use with '|' ops within (block). 622 // 6 <code for LookBehind expression> 623 // 7 URX_LB_END dataLoc # Check match len, restore input len 624 // 8 URX_LA_END dataLoc # Restore stack, input pos 625 // 626 // Allocate a block of matcher data, to contain (when running a match) 627 // 0: Stack ptr on entry 628 // 1: Input Index on entry 629 // 2: Start index of match current match attempt. 630 // 3: Original Input String len. 631 632 // Allocate data space 633 int32_t dataLoc = fRXPat->fDataSize; 634 fRXPat->fDataSize += 4; 635 636 // Emit URX_LB_START 637 int32_t op = URX_BUILD(URX_LB_START, dataLoc); 638 fRXPat->fCompiledPat->addElement(op, *fStatus); 639 640 // Emit URX_LB_CONT 641 op = URX_BUILD(URX_LB_CONT, dataLoc); 642 fRXPat->fCompiledPat->addElement(op, *fStatus); 643 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later. 644 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later. 645 646 // Emit the NOP 647 op = URX_BUILD(URX_NOP, 0); 648 fRXPat->fCompiledPat->addElement(op, *fStatus); 649 fRXPat->fCompiledPat->addElement(op, *fStatus); 650 651 // On the Parentheses stack, start a new frame and add the postions 652 // of the URX_LB_CONT and the NOP. 653 fParenStack.push(fModeFlags, *fStatus); // Match mode state 654 fParenStack.push(lookBehind, *fStatus); // Frame type 655 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 656 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location 657 658 // The final two instructions will be added when the ')' is encountered. 659 } 660 661 break; 662 663 case doOpenLookBehindNeg: 664 { 665 // Compile a (?<! negated look-behind open paren. 666 // 667 // Compiles to 668 // 0 URX_LB_START dataLoc # Save entry stack, input len 669 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions 670 // 2 MinMatchLen 671 // 3 MaxMatchLen 672 // 4 continueLoc (9) 673 // 5 URX_NOP Standard '(' boilerplate. 674 // 6 URX_NOP Reserved slot for use with '|' ops within (block). 675 // 7 <code for LookBehind expression> 676 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL 677 // 9 ... 678 // 679 // Allocate a block of matcher data, to contain (when running a match) 680 // 0: Stack ptr on entry 681 // 1: Input Index on entry 682 // 2: Start index of match current match attempt. 683 // 3: Original Input String len. 684 685 // Allocate data space 686 int32_t dataLoc = fRXPat->fDataSize; 687 fRXPat->fDataSize += 4; 688 689 // Emit URX_LB_START 690 int32_t op = URX_BUILD(URX_LB_START, dataLoc); 691 fRXPat->fCompiledPat->addElement(op, *fStatus); 692 693 // Emit URX_LBN_CONT 694 op = URX_BUILD(URX_LBN_CONT, dataLoc); 695 fRXPat->fCompiledPat->addElement(op, *fStatus); 696 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later. 697 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later. 698 fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc. To be filled later. 699 700 // Emit the NOP 701 op = URX_BUILD(URX_NOP, 0); 702 fRXPat->fCompiledPat->addElement(op, *fStatus); 703 fRXPat->fCompiledPat->addElement(op, *fStatus); 704 705 // On the Parentheses stack, start a new frame and add the postions 706 // of the URX_LB_CONT and the NOP. 707 fParenStack.push(fModeFlags, *fStatus); // Match mode state 708 fParenStack.push(lookBehindN, *fStatus); // Frame type 709 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 710 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location 711 712 // The final two instructions will be added when the ')' is encountered. 713 } 714 break; 715 716 case doConditionalExpr: 717 // Conditionals such as (?(1)a:b) 718 case doPerlInline: 719 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them. 720 error(U_REGEX_UNIMPLEMENTED); 721 break; 722 723 724 case doCloseParen: 725 handleCloseParen(); 726 if (fParenStack.size() <= 0) { 727 // Extra close paren, or missing open paren. 728 error(U_REGEX_MISMATCHED_PAREN); 729 } 730 break; 731 732 case doNOP: 733 break; 734 735 736 case doBadOpenParenType: 737 case doRuleError: 738 error(U_REGEX_RULE_SYNTAX); 739 break; 740 741 742 case doMismatchedParenErr: 743 error(U_REGEX_MISMATCHED_PAREN); 744 break; 745 746 case doPlus: 747 // Normal '+' compiles to 748 // 1. stuff to be repeated (already built) 749 // 2. jmp-sav 1 750 // 3. ... 751 // 752 // Or, if the item to be repeated can match a zero length string, 753 // 1. STO_INP_LOC data-loc 754 // 2. body of stuff to be repeated 755 // 3. JMP_SAV_X 2 756 // 4. ... 757 758 // 759 // Or, if the item to be repeated is simple 760 // 1. Item to be repeated. 761 // 2. LOOP_SR_I set number (assuming repeated item is a set ref) 762 // 3. LOOP_C stack location 763 { 764 int32_t topLoc = blockTopLoc(FALSE); // location of item #1 765 int32_t frameLoc; 766 767 // Check for simple constructs, which may get special optimized code. 768 if (topLoc == fRXPat->fCompiledPat->size() - 1) { 769 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc); 770 771 if (URX_TYPE(repeatedOp) == URX_SETREF) { 772 // Emit optimized code for [char set]+ 773 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp)); 774 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); 775 frameLoc = fRXPat->fFrameSize; 776 fRXPat->fFrameSize++; 777 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); 778 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); 779 break; 780 } 781 782 if (URX_TYPE(repeatedOp) == URX_DOTANY || 783 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || 784 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { 785 // Emit Optimized code for .+ operations. 786 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); 787 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { 788 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode. 789 loopOpI |= 1; 790 } 791 if (fModeFlags & UREGEX_UNIX_LINES) { 792 loopOpI |= 2; 793 } 794 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); 795 frameLoc = fRXPat->fFrameSize; 796 fRXPat->fFrameSize++; 797 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); 798 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); 799 break; 800 } 801 802 } 803 804 // General case. 805 806 // Check for minimum match length of zero, which requires 807 // extra loop-breaking code. 808 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) { 809 // Zero length match is possible. 810 // Emit the code sequence that can handle it. 811 insertOp(topLoc); 812 frameLoc = fRXPat->fFrameSize; 813 fRXPat->fFrameSize++; 814 815 int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc); 816 fRXPat->fCompiledPat->setElementAt(op, topLoc); 817 818 op = URX_BUILD(URX_JMP_SAV_X, topLoc+1); 819 fRXPat->fCompiledPat->addElement(op, *fStatus); 820 } else { 821 // Simpler code when the repeated body must match something non-empty 822 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc); 823 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); 824 } 825 } 826 break; 827 828 case doNGPlus: 829 // Non-greedy '+?' compiles to 830 // 1. stuff to be repeated (already built) 831 // 2. state-save 1 832 // 3. ... 833 { 834 int32_t topLoc = blockTopLoc(FALSE); 835 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc); 836 fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus); 837 } 838 break; 839 840 841 case doOpt: 842 // Normal (greedy) ? quantifier. 843 // Compiles to 844 // 1. state save 3 845 // 2. body of optional block 846 // 3. ... 847 // Insert the state save into the compiled pattern, and we're done. 848 { 849 int32_t saveStateLoc = blockTopLoc(TRUE); 850 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()); 851 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); 852 } 853 break; 854 855 case doNGOpt: 856 // Non-greedy ?? quantifier 857 // compiles to 858 // 1. jmp 4 859 // 2. body of optional block 860 // 3 jmp 5 861 // 4. state save 2 862 // 5 ... 863 // This code is less than ideal, with two jmps instead of one, because we can only 864 // insert one instruction at the top of the block being iterated. 865 { 866 int32_t jmp1_loc = blockTopLoc(TRUE); 867 int32_t jmp2_loc = fRXPat->fCompiledPat->size(); 868 869 int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1); 870 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); 871 872 int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2); 873 fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus); 874 875 int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1); 876 fRXPat->fCompiledPat->addElement(save_op, *fStatus); 877 } 878 break; 879 880 881 case doStar: 882 // Normal (greedy) * quantifier. 883 // Compiles to 884 // 1. STATE_SAVE 4 885 // 2. body of stuff being iterated over 886 // 3. JMP_SAV 2 887 // 4. ... 888 // 889 // Or, if the body is a simple [Set], 890 // 1. LOOP_SR_I set number 891 // 2. LOOP_C stack location 892 // ... 893 // 894 // Or if this is a .* 895 // 1. LOOP_DOT_I (. matches all mode flag) 896 // 2. LOOP_C stack location 897 // 898 // Or, if the body can match a zero-length string, to inhibit infinite loops, 899 // 1. STATE_SAVE 5 900 // 2. STO_INP_LOC data-loc 901 // 3. body of stuff 902 // 4. JMP_SAV_X 2 903 // 5. ... 904 { 905 // location of item #1, the STATE_SAVE 906 int32_t topLoc = blockTopLoc(FALSE); 907 int32_t dataLoc = -1; 908 909 // Check for simple *, where the construct being repeated 910 // compiled to single opcode, and might be optimizable. 911 if (topLoc == fRXPat->fCompiledPat->size() - 1) { 912 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc); 913 914 if (URX_TYPE(repeatedOp) == URX_SETREF) { 915 // Emit optimized code for a [char set]* 916 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp)); 917 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); 918 dataLoc = fRXPat->fFrameSize; 919 fRXPat->fFrameSize++; 920 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); 921 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); 922 break; 923 } 924 925 if (URX_TYPE(repeatedOp) == URX_DOTANY || 926 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || 927 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { 928 // Emit Optimized code for .* operations. 929 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); 930 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { 931 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode. 932 loopOpI |= 1; 933 } 934 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) { 935 loopOpI |= 2; 936 } 937 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); 938 dataLoc = fRXPat->fFrameSize; 939 fRXPat->fFrameSize++; 940 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); 941 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); 942 break; 943 } 944 } 945 946 // Emit general case code for this * 947 // The optimizations did not apply. 948 949 int32_t saveStateLoc = blockTopLoc(TRUE); 950 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1); 951 952 // Check for minimum match length of zero, which requires 953 // extra loop-breaking code. 954 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) { 955 insertOp(saveStateLoc); 956 dataLoc = fRXPat->fFrameSize; 957 fRXPat->fFrameSize++; 958 959 int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc); 960 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); 961 jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2); 962 } 963 964 // Locate the position in the compiled pattern where the match will continue 965 // after completing the *. (4 or 5 in the comment above) 966 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; 967 968 // Put together the save state op store it into the compiled code. 969 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); 970 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); 971 972 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern. 973 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); 974 } 975 break; 976 977 case doNGStar: 978 // Non-greedy *? quantifier 979 // compiles to 980 // 1. JMP 3 981 // 2. body of stuff being iterated over 982 // 3. STATE_SAVE 2 983 // 4 ... 984 { 985 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1. 986 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3. 987 int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc); 988 int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1); 989 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); 990 fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus); 991 } 992 break; 993 994 995 case doIntervalInit: 996 // The '{' opening an interval quantifier was just scanned. 997 // Init the counter varaiables that will accumulate the values as the digits 998 // are scanned. 999 fIntervalLow = 0; 1000 fIntervalUpper = -1; 1001 break; 1002 1003 case doIntevalLowerDigit: 1004 // Scanned a digit from the lower value of an {lower,upper} interval 1005 { 1006 int32_t digitValue = u_charDigitValue(fC.fChar); 1007 U_ASSERT(digitValue >= 0); 1008 fIntervalLow = fIntervalLow*10 + digitValue; 1009 if (fIntervalLow < 0) { 1010 error(U_REGEX_NUMBER_TOO_BIG); 1011 } 1012 } 1013 break; 1014 1015 case doIntervalUpperDigit: 1016 // Scanned a digit from the upper value of an {lower,upper} interval 1017 { 1018 if (fIntervalUpper < 0) { 1019 fIntervalUpper = 0; 1020 } 1021 int32_t digitValue = u_charDigitValue(fC.fChar); 1022 U_ASSERT(digitValue >= 0); 1023 fIntervalUpper = fIntervalUpper*10 + digitValue; 1024 if (fIntervalUpper < 0) { 1025 error(U_REGEX_NUMBER_TOO_BIG); 1026 } 1027 } 1028 break; 1029 1030 case doIntervalSame: 1031 // Scanned a single value interval like {27}. Upper = Lower. 1032 fIntervalUpper = fIntervalLow; 1033 break; 1034 1035 case doInterval: 1036 // Finished scanning a normal {lower,upper} interval. Generate the code for it. 1037 if (compileInlineInterval() == FALSE) { 1038 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); 1039 } 1040 break; 1041 1042 case doPossessiveInterval: 1043 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it. 1044 { 1045 // Remember the loc for the top of the block being looped over. 1046 // (Can not reserve a slot in the compiled pattern at this time, because 1047 // compileInterval needs to reserve also, and blockTopLoc can only reserve 1048 // once per block.) 1049 int32_t topLoc = blockTopLoc(FALSE); 1050 1051 // Produce normal looping code. 1052 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); 1053 1054 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP 1055 // just as if the loop was inclosed in atomic parentheses. 1056 1057 // First the STO_SP before the start of the loop 1058 insertOp(topLoc); 1059 int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the 1060 fRXPat->fDataSize += 1; // state stack ptr. 1061 int32_t op = URX_BUILD(URX_STO_SP, varLoc); 1062 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1063 1064 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); 1065 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc); 1066 loopOp++; // point LoopOp after the just-inserted STO_SP 1067 fRXPat->fCompiledPat->push(loopOp, *fStatus); 1068 1069 // Then the LD_SP after the end of the loop 1070 op = URX_BUILD(URX_LD_SP, varLoc); 1071 fRXPat->fCompiledPat->addElement(op, *fStatus); 1072 } 1073 1074 break; 1075 1076 case doNGInterval: 1077 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it. 1078 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG); 1079 break; 1080 1081 case doIntervalError: 1082 error(U_REGEX_BAD_INTERVAL); 1083 break; 1084 1085 case doLiteralChar: 1086 // We've just scanned a "normal" character from the pattern, 1087 literalChar(fC.fChar); 1088 break; 1089 1090 1091 case doEscapedLiteralChar: 1092 // We've just scanned an backslashed escaped character with no 1093 // special meaning. It represents itself. 1094 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && 1095 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] 1096 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] 1097 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1098 } 1099 literalChar(fC.fChar); 1100 break; 1101 1102 1103 case doDotAny: 1104 // scanned a ".", match any single character. 1105 { 1106 int32_t op; 1107 if (fModeFlags & UREGEX_DOTALL) { 1108 op = URX_BUILD(URX_DOTANY_ALL, 0); 1109 } else if (fModeFlags & UREGEX_UNIX_LINES) { 1110 op = URX_BUILD(URX_DOTANY_UNIX, 0); 1111 } else { 1112 op = URX_BUILD(URX_DOTANY, 0); 1113 } 1114 fRXPat->fCompiledPat->addElement(op, *fStatus); 1115 } 1116 break; 1117 1118 case doCaret: 1119 { 1120 int32_t op = 0; 1121 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1122 op = URX_CARET; 1123 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1124 op = URX_CARET_M; 1125 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1126 op = URX_CARET; // Only testing true start of input. 1127 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1128 op = URX_CARET_M_UNIX; 1129 } 1130 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); 1131 } 1132 break; 1133 1134 case doDollar: 1135 { 1136 int32_t op = 0; 1137 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1138 op = URX_DOLLAR; 1139 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1140 op = URX_DOLLAR_M; 1141 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1142 op = URX_DOLLAR_D; 1143 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1144 op = URX_DOLLAR_MD; 1145 } 1146 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); 1147 } 1148 break; 1149 1150 case doBackslashA: 1151 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus); 1152 break; 1153 1154 case doBackslashB: 1155 { 1156 #if UCONFIG_NO_BREAK_ITERATION==1 1157 if (fModeFlags & UREGEX_UWORD) { 1158 error(U_UNSUPPORTED_ERROR); 1159 } 1160 #endif 1161 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B; 1162 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus); 1163 } 1164 break; 1165 1166 case doBackslashb: 1167 { 1168 #if UCONFIG_NO_BREAK_ITERATION==1 1169 if (fModeFlags & UREGEX_UWORD) { 1170 error(U_UNSUPPORTED_ERROR); 1171 } 1172 #endif 1173 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B; 1174 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); 1175 } 1176 break; 1177 1178 case doBackslashD: 1179 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus); 1180 break; 1181 1182 case doBackslashd: 1183 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus); 1184 break; 1185 1186 case doBackslashG: 1187 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus); 1188 break; 1189 1190 case doBackslashS: 1191 fRXPat->fCompiledPat->addElement( 1192 URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus); 1193 break; 1194 1195 case doBackslashs: 1196 fRXPat->fCompiledPat->addElement( 1197 URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus); 1198 break; 1199 1200 case doBackslashW: 1201 fRXPat->fCompiledPat->addElement( 1202 URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus); 1203 break; 1204 1205 case doBackslashw: 1206 fRXPat->fCompiledPat->addElement( 1207 URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus); 1208 break; 1209 1210 case doBackslashX: 1211 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus); 1212 break; 1213 1214 1215 case doBackslashZ: 1216 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus); 1217 break; 1218 1219 case doBackslashz: 1220 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus); 1221 break; 1222 1223 case doEscapeError: 1224 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1225 break; 1226 1227 case doExit: 1228 returnVal = FALSE; 1229 break; 1230 1231 case doProperty: 1232 { 1233 UnicodeSet *theSet = scanProp(); 1234 compileSet(theSet); 1235 } 1236 break; 1237 1238 case doNamedChar: 1239 { 1240 UChar32 c = scanNamedChar(); 1241 literalChar(c); 1242 } 1243 break; 1244 1245 1246 case doBackRef: 1247 // BackReference. Somewhat unusual in that the front-end can not completely parse 1248 // the regular expression, because the number of digits to be consumed 1249 // depends on the number of capture groups that have been defined. So 1250 // we have to do it here instead. 1251 { 1252 int32_t numCaptureGroups = fRXPat->fGroupMap->size(); 1253 int32_t groupNum = 0; 1254 UChar32 c = fC.fChar; 1255 1256 for (;;) { 1257 // Loop once per digit, for max allowed number of digits in a back reference. 1258 int32_t digit = u_charDigitValue(c); 1259 groupNum = groupNum * 10 + digit; 1260 if (groupNum >= numCaptureGroups) { 1261 break; 1262 } 1263 c = peekCharLL(); 1264 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) { 1265 break; 1266 } 1267 nextCharLL(); 1268 } 1269 1270 // Scan of the back reference in the source regexp is complete. Now generate 1271 // the compiled code for it. 1272 // Because capture groups can be forward-referenced by back-references, 1273 // we fill the operand with the capture group number. At the end 1274 // of compilation, it will be changed to the variable's location. 1275 U_ASSERT(groupNum > 0); 1276 int32_t op; 1277 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1278 op = URX_BUILD(URX_BACKREF_I, groupNum); 1279 } else { 1280 op = URX_BUILD(URX_BACKREF, groupNum); 1281 } 1282 fRXPat->fCompiledPat->addElement(op, *fStatus); 1283 } 1284 break; 1285 1286 1287 case doPossessivePlus: 1288 // Possessive ++ quantifier. 1289 // Compiles to 1290 // 1. STO_SP 1291 // 2. body of stuff being iterated over 1292 // 3. STATE_SAVE 5 1293 // 4. JMP 2 1294 // 5. LD_SP 1295 // 6. ... 1296 // 1297 // Note: TODO: This is pretty inefficient. A mass of saved state is built up 1298 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056 1299 // 1300 { 1301 // Emit the STO_SP 1302 int32_t topLoc = blockTopLoc(TRUE); 1303 int32_t stoLoc = fRXPat->fDataSize; 1304 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr. 1305 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); 1306 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1307 1308 // Emit the STATE_SAVE 1309 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); 1310 fRXPat->fCompiledPat->addElement(op, *fStatus); 1311 1312 // Emit the JMP 1313 op = URX_BUILD(URX_JMP, topLoc+1); 1314 fRXPat->fCompiledPat->addElement(op, *fStatus); 1315 1316 // Emit the LD_SP 1317 op = URX_BUILD(URX_LD_SP, stoLoc); 1318 fRXPat->fCompiledPat->addElement(op, *fStatus); 1319 } 1320 break; 1321 1322 case doPossessiveStar: 1323 // Possessive *+ quantifier. 1324 // Compiles to 1325 // 1. STO_SP loc 1326 // 2. STATE_SAVE 5 1327 // 3. body of stuff being iterated over 1328 // 4. JMP 2 1329 // 5. LD_SP loc 1330 // 6 ... 1331 // TODO: do something to cut back the state stack each time through the loop. 1332 { 1333 // Reserve two slots at the top of the block. 1334 int32_t topLoc = blockTopLoc(TRUE); 1335 insertOp(topLoc); 1336 1337 // emit STO_SP loc 1338 int32_t stoLoc = fRXPat->fDataSize; 1339 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr. 1340 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); 1341 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1342 1343 // Emit the SAVE_STATE 5 1344 int32_t L7 = fRXPat->fCompiledPat->size()+1; 1345 op = URX_BUILD(URX_STATE_SAVE, L7); 1346 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); 1347 1348 // Append the JMP operation. 1349 op = URX_BUILD(URX_JMP, topLoc+1); 1350 fRXPat->fCompiledPat->addElement(op, *fStatus); 1351 1352 // Emit the LD_SP loc 1353 op = URX_BUILD(URX_LD_SP, stoLoc); 1354 fRXPat->fCompiledPat->addElement(op, *fStatus); 1355 } 1356 break; 1357 1358 case doPossessiveOpt: 1359 // Possessive ?+ quantifier. 1360 // Compiles to 1361 // 1. STO_SP loc 1362 // 2. SAVE_STATE 5 1363 // 3. body of optional block 1364 // 4. LD_SP loc 1365 // 5. ... 1366 // 1367 { 1368 // Reserve two slots at the top of the block. 1369 int32_t topLoc = blockTopLoc(TRUE); 1370 insertOp(topLoc); 1371 1372 // Emit the STO_SP 1373 int32_t stoLoc = fRXPat->fDataSize; 1374 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr. 1375 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); 1376 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1377 1378 // Emit the SAVE_STATE 1379 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; 1380 op = URX_BUILD(URX_STATE_SAVE, continueLoc); 1381 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); 1382 1383 // Emit the LD_SP 1384 op = URX_BUILD(URX_LD_SP, stoLoc); 1385 fRXPat->fCompiledPat->addElement(op, *fStatus); 1386 } 1387 break; 1388 1389 1390 case doBeginMatchMode: 1391 fNewModeFlags = fModeFlags; 1392 fSetModeFlag = TRUE; 1393 break; 1394 1395 case doMatchMode: // (?i) and similar 1396 { 1397 int32_t bit = 0; 1398 switch (fC.fChar) { 1399 case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break; 1400 case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break; 1401 case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break; 1402 case 0x73: /* 's' */ bit = UREGEX_DOTALL; break; 1403 case 0x75: /* 'u' */ bit = 0; /* Unicode casing */ break; 1404 case 0x77: /* 'w' */ bit = UREGEX_UWORD; break; 1405 case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break; 1406 case 0x2d: /* '-' */ fSetModeFlag = FALSE; break; 1407 default: 1408 U_ASSERT(FALSE); // Should never happen. Other chars are filtered out 1409 // by the scanner. 1410 } 1411 if (fSetModeFlag) { 1412 fNewModeFlags |= bit; 1413 } else { 1414 fNewModeFlags &= ~bit; 1415 } 1416 } 1417 break; 1418 1419 case doSetMatchMode: 1420 // We've got a (?i) or similar. The match mode is being changed, but 1421 // the change is not scoped to a parenthesized block. 1422 U_ASSERT(fNewModeFlags < 0); 1423 fModeFlags = fNewModeFlags; 1424 1425 // Prevent any string from spanning across the change of match mode. 1426 // Otherwise the pattern "abc(?i)def" would make a single string of "abcdef" 1427 fixLiterals(); 1428 break; 1429 1430 1431 case doMatchModeParen: 1432 // We've got a (?i: or similar. Begin a parenthesized block, save old 1433 // mode flags so they can be restored at the close of the block. 1434 // 1435 // Compile to a 1436 // - NOP, which later may be replaced by a save-state if the 1437 // parenthesized group gets a * quantifier, followed by 1438 // - NOP, which may later be replaced by a save-state if there 1439 // is an '|' alternation within the parens. 1440 { 1441 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 1442 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 1443 1444 // On the Parentheses stack, start a new frame and add the postions 1445 // of the two NOPs (a normal non-capturing () frame, except for the 1446 // saving of the orignal mode flags.) 1447 fParenStack.push(fModeFlags, *fStatus); 1448 fParenStack.push(flags, *fStatus); // Frame Marker 1449 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP 1450 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP 1451 1452 // Set the current mode flags to the new values. 1453 U_ASSERT(fNewModeFlags < 0); 1454 fModeFlags = fNewModeFlags; 1455 } 1456 break; 1457 1458 case doBadModeFlag: 1459 error(U_REGEX_INVALID_FLAG); 1460 break; 1461 1462 case doSuppressComments: 1463 // We have just scanned a '(?'. We now need to prevent the character scanner from 1464 // treating a '#' as a to-the-end-of-line comment. 1465 // (This Perl compatibility just gets uglier and uglier to do...) 1466 fEOLComments = FALSE; 1467 break; 1468 1469 1470 case doSetAddAmp: 1471 { 1472 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1473 set->add(chAmp); 1474 } 1475 break; 1476 1477 case doSetAddDash: 1478 { 1479 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1480 set->add(chDash); 1481 } 1482 break; 1483 1484 case doSetBackslash_s: 1485 { 1486 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1487 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); 1488 break; 1489 } 1490 1491 case doSetBackslash_S: 1492 { 1493 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1494 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); 1495 SSet.complement(); 1496 set->addAll(SSet); 1497 break; 1498 } 1499 1500 case doSetBackslash_d: 1501 { 1502 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1503 // TODO - make a static set, ticket 6058. 1504 addCategory(set, U_GC_ND_MASK, *fStatus); 1505 break; 1506 } 1507 1508 case doSetBackslash_D: 1509 { 1510 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1511 UnicodeSet digits; 1512 // TODO - make a static set, ticket 6058. 1513 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus); 1514 digits.complement(); 1515 set->addAll(digits); 1516 break; 1517 } 1518 1519 case doSetBackslash_w: 1520 { 1521 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1522 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); 1523 break; 1524 } 1525 1526 case doSetBackslash_W: 1527 { 1528 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1529 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); 1530 SSet.complement(); 1531 set->addAll(SSet); 1532 break; 1533 } 1534 1535 case doSetBegin: 1536 fSetStack.push(new UnicodeSet(), *fStatus); 1537 fSetOpStack.push(setStart, *fStatus); 1538 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1539 fSetOpStack.push(setCaseClose, *fStatus); 1540 } 1541 break; 1542 1543 case doSetBeginDifference1: 1544 // We have scanned something like [[abc]-[ 1545 // Set up a new UnicodeSet for the set beginning with the just-scanned '[' 1546 // Push a Difference operator, which will cause the new set to be subtracted from what 1547 // went before once it is created. 1548 setPushOp(setDifference1); 1549 fSetOpStack.push(setStart, *fStatus); 1550 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1551 fSetOpStack.push(setCaseClose, *fStatus); 1552 } 1553 break; 1554 1555 case doSetBeginIntersection1: 1556 // We have scanned something like [[abc]&[ 1557 // Need both the '&' operator and the open '[' operator. 1558 setPushOp(setIntersection1); 1559 fSetOpStack.push(setStart, *fStatus); 1560 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1561 fSetOpStack.push(setCaseClose, *fStatus); 1562 } 1563 break; 1564 1565 case doSetBeginUnion: 1566 // We have scanned something like [[abc][ 1567 // Need to handle the union operation explicitly [[abc] | [ 1568 setPushOp(setUnion); 1569 fSetOpStack.push(setStart, *fStatus); 1570 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1571 fSetOpStack.push(setCaseClose, *fStatus); 1572 } 1573 break; 1574 1575 case doSetDifference2: 1576 // We have scanned something like [abc-- 1577 // Consider this to unambiguously be a set difference operator. 1578 setPushOp(setDifference2); 1579 break; 1580 1581 case doSetEnd: 1582 // Have encountered the ']' that closes a set. 1583 // Force the evaluation of any pending operations within this set, 1584 // leave the completed set on the top of the set stack. 1585 setEval(setEnd); 1586 U_ASSERT(fSetOpStack.peeki()==setStart); 1587 fSetOpStack.popi(); 1588 break; 1589 1590 case doSetFinish: 1591 { 1592 // Finished a complete set expression, including all nested sets. 1593 // The close bracket has already triggered clearing out pending set operators, 1594 // the operator stack should be empty and the operand stack should have just 1595 // one entry, the result set. 1596 U_ASSERT(fSetOpStack.empty()); 1597 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop(); 1598 U_ASSERT(fSetStack.empty()); 1599 compileSet(theSet); 1600 break; 1601 } 1602 1603 case doSetIntersection2: 1604 // Have scanned something like [abc&& 1605 setPushOp(setIntersection2); 1606 break; 1607 1608 case doSetLiteral: 1609 // Union the just-scanned literal character into the set being built. 1610 // This operation is the highest precedence set operation, so we can always do 1611 // it immediately, without waiting to see what follows. It is necessary to perform 1612 // any pending '-' or '&' operation first, because these have the same precedence 1613 // as union-ing in a literal' 1614 { 1615 setEval(setUnion); 1616 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1617 s->add(fC.fChar); 1618 fLastSetLiteral = fC.fChar; 1619 break; 1620 } 1621 1622 case doSetLiteralEscaped: 1623 // A back-slash escaped literal character was encountered. 1624 // Processing is the same as with setLiteral, above, with the addition of 1625 // the optional check for errors on escaped ASCII letters. 1626 { 1627 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && 1628 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] 1629 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] 1630 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1631 } 1632 setEval(setUnion); 1633 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1634 s->add(fC.fChar); 1635 fLastSetLiteral = fC.fChar; 1636 break; 1637 } 1638 1639 case doSetNamedChar: 1640 // Scanning a \N{UNICODE CHARACTER NAME} 1641 // Aside from the source of the character, the processing is identical to doSetLiteral, 1642 // above. 1643 { 1644 UChar32 c = scanNamedChar(); 1645 setEval(setUnion); 1646 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1647 s->add(c); 1648 fLastSetLiteral = c; 1649 break; 1650 } 1651 1652 case doSetNamedRange: 1653 // We have scanned literal-\N{CHAR NAME}. Add the range to the set. 1654 // The left character is already in the set, and is saved in fLastSetLiteral. 1655 // The right side needs to be picked up, the scan is at the 'N'. 1656 // Lower Limit > Upper limit being an error matches both Java 1657 // and ICU UnicodeSet behavior. 1658 { 1659 UChar32 c = scanNamedChar(); 1660 if (U_SUCCESS(*fStatus) && fLastSetLiteral > c) { 1661 error(U_REGEX_INVALID_RANGE); 1662 } 1663 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1664 s->add(fLastSetLiteral, c); 1665 fLastSetLiteral = c; 1666 break; 1667 } 1668 1669 1670 case doSetNegate: 1671 // Scanned a '^' at the start of a set. 1672 // Push the negation operator onto the set op stack. 1673 // A twist for case-insensitive matching: 1674 // the case closure operation must happen _before_ negation. 1675 // But the case closure operation will already be on the stack if it's required. 1676 // This requires checking for case closure, and swapping the stack order 1677 // if it is present. 1678 { 1679 int32_t tosOp = fSetOpStack.peeki(); 1680 if (tosOp == setCaseClose) { 1681 fSetOpStack.popi(); 1682 fSetOpStack.push(setNegation, *fStatus); 1683 fSetOpStack.push(setCaseClose, *fStatus); 1684 } else { 1685 fSetOpStack.push(setNegation, *fStatus); 1686 } 1687 } 1688 break; 1689 1690 case doSetNoCloseError: 1691 error(U_REGEX_MISSING_CLOSE_BRACKET); 1692 break; 1693 1694 case doSetOpError: 1695 error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal. 1696 break; 1697 1698 case doSetPosixProp: 1699 { 1700 UnicodeSet *s = scanPosixProp(); 1701 if (s != NULL) { 1702 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); 1703 tos->addAll(*s); 1704 delete s; 1705 } // else error. scanProp() reported the error status already. 1706 } 1707 break; 1708 1709 case doSetProp: 1710 // Scanned a \p \P within [brackets]. 1711 { 1712 UnicodeSet *s = scanProp(); 1713 if (s != NULL) { 1714 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); 1715 tos->addAll(*s); 1716 delete s; 1717 } // else error. scanProp() reported the error status already. 1718 } 1719 break; 1720 1721 1722 case doSetRange: 1723 // We have scanned literal-literal. Add the range to the set. 1724 // The left character is already in the set, and is saved in fLastSetLiteral. 1725 // The right side is the current character. 1726 // Lower Limit > Upper limit being an error matches both Java 1727 // and ICU UnicodeSet behavior. 1728 { 1729 if (fLastSetLiteral > fC.fChar) { 1730 error(U_REGEX_INVALID_RANGE); 1731 } 1732 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1733 s->add(fLastSetLiteral, fC.fChar); 1734 break; 1735 } 1736 1737 1738 default: 1739 U_ASSERT(FALSE); 1740 error(U_REGEX_INTERNAL_ERROR); 1741 break; 1742 } 1743 1744 if (U_FAILURE(*fStatus)) { 1745 returnVal = FALSE; 1746 } 1747 1748 return returnVal; 1749 } 1750 1751 1752 1753 //------------------------------------------------------------------------------ 1754 // 1755 // literalChar We've encountered a literal character from the pattern, 1756 // or an escape sequence that reduces to a character. 1757 // Add it to the string containing all literal chars/strings from 1758 // the pattern. 1759 // If we are in a pattern string already, add the new char to it. 1760 // If we aren't in a pattern string, begin one now. 1761 // 1762 //------------------------------------------------------------------------------ 1763 void RegexCompile::literalChar(UChar32 c) { 1764 int32_t op; // An operation in the compiled pattern. 1765 int32_t opType; 1766 int32_t patternLoc; // A position in the compiled pattern. 1767 int32_t stringLen; 1768 1769 1770 // If the last thing compiled into the pattern was not a literal char, 1771 // force this new literal char to begin a new string, and not append to the previous. 1772 op = (int32_t)fRXPat->fCompiledPat->lastElementi(); 1773 opType = URX_TYPE(op); 1774 if (!(opType == URX_STRING_LEN || opType == URX_ONECHAR || opType == URX_ONECHAR_I)) { 1775 fixLiterals(); 1776 } 1777 1778 if (fStringOpStart == -1) { 1779 // First char of a string in the pattern. 1780 // Emit a OneChar op into the compiled pattern. 1781 emitONE_CHAR(c); 1782 1783 // Mark that we might actually be starting a string here 1784 fStringOpStart = fRXPat->fLiteralText.length(); 1785 return; 1786 } 1787 1788 op = (int32_t)fRXPat->fCompiledPat->lastElementi(); 1789 opType = URX_TYPE(op); 1790 U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN); 1791 1792 // If the most recently emitted op is a URX_ONECHAR, 1793 if (opType == URX_ONECHAR || opType == URX_ONECHAR_I) { 1794 if (U16_IS_TRAIL(c) && U16_IS_LEAD(URX_VAL(op))) { 1795 // The most recently emitted op is a ONECHAR that was the first half 1796 // of a surrogate pair. Update the ONECHAR's operand to be the 1797 // supplementary code point resulting from both halves of the pair. 1798 c = U16_GET_SUPPLEMENTARY(URX_VAL(op), c); 1799 op = URX_BUILD(opType, c); 1800 patternLoc = fRXPat->fCompiledPat->size() - 1; 1801 fRXPat->fCompiledPat->setElementAt(op, patternLoc); 1802 return; 1803 } 1804 1805 // The most recently emitted op is a ONECHAR. 1806 // We've now received another adjacent char. Change the ONECHAR op 1807 // to a string op. 1808 fRXPat->fLiteralText.append(URX_VAL(op)); 1809 1810 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1811 op = URX_BUILD(URX_STRING_I, fStringOpStart); 1812 } else { 1813 op = URX_BUILD(URX_STRING, fStringOpStart); 1814 } 1815 patternLoc = fRXPat->fCompiledPat->size() - 1; 1816 fRXPat->fCompiledPat->setElementAt(op, patternLoc); 1817 op = URX_BUILD(URX_STRING_LEN, 0); 1818 fRXPat->fCompiledPat->addElement(op, *fStatus); 1819 } 1820 1821 // We are adding onto an existing string 1822 fRXPat->fLiteralText.append(c); 1823 1824 // The pattern contains a URX_SRING / URX_STRING_LEN. Update the 1825 // string length to reflect the new char we just added to the string. 1826 stringLen = fRXPat->fLiteralText.length() - fStringOpStart; 1827 op = URX_BUILD(URX_STRING_LEN, stringLen); 1828 patternLoc = fRXPat->fCompiledPat->size() - 1; 1829 fRXPat->fCompiledPat->setElementAt(op, patternLoc); 1830 } 1831 1832 1833 1834 //------------------------------------------------------------------------------ 1835 // 1836 // emitONE_CHAR emit a ONE_CHAR op into the generated code. 1837 // Choose cased or uncased version, depending on the 1838 // match mode and whether the character itself is cased. 1839 // 1840 //------------------------------------------------------------------------------ 1841 void RegexCompile::emitONE_CHAR(UChar32 c) { 1842 int32_t op; 1843 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && 1844 u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { 1845 // We have a cased character, and are in case insensitive matching mode. 1846 //c = u_foldCase(c, U_FOLD_CASE_DEFAULT); // !!!: handled in stripNOPs() now 1847 op = URX_BUILD(URX_ONECHAR_I, c); 1848 } else { 1849 // Uncased char, or case sensitive match mode. 1850 // Either way, just generate a literal compare of the char. 1851 op = URX_BUILD(URX_ONECHAR, c); 1852 } 1853 fRXPat->fCompiledPat->addElement(op, *fStatus); 1854 } 1855 1856 1857 //------------------------------------------------------------------------------ 1858 // 1859 // fixLiterals When compiling something that can follow a literal 1860 // string in a pattern, we need to "fix" any preceding 1861 // string, which will cause any subsequent literals to 1862 // begin a new string, rather than appending to the 1863 // old one. 1864 // 1865 // Optionally, split the last char of the string off into 1866 // a single "ONE_CHAR" operation, so that quantifiers can 1867 // apply to that char alone. Example: abc* 1868 // The * must apply to the 'c' only. 1869 // 1870 //------------------------------------------------------------------------------ 1871 void RegexCompile::fixLiterals(UBool split) { 1872 int32_t stringStart = fStringOpStart; // start index of the current literal string 1873 int32_t op; // An op from/for the compiled pattern. 1874 int32_t opType; // An opcode type from the compiled pattern. 1875 int32_t stringLastCharIdx; 1876 UChar32 lastChar; 1877 int32_t stringNextToLastCharIdx; 1878 UChar32 nextToLastChar; 1879 int32_t stringLen; 1880 1881 fStringOpStart = -1; 1882 if (!split) { 1883 return; 1884 } 1885 1886 // Split: We need to ensure that the last item in the compiled pattern does 1887 // not refer to a literal string of more than one char. If it does, 1888 // separate the last char from the rest of the string. 1889 1890 // If the last operation from the compiled pattern is not a string, 1891 // nothing needs to be done 1892 op = (int32_t)fRXPat->fCompiledPat->lastElementi(); 1893 opType = URX_TYPE(op); 1894 if (opType != URX_STRING_LEN) { 1895 return; 1896 } 1897 stringLen = URX_VAL(op); 1898 1899 // 1900 // Find the position of the last code point in the string (might be a surrogate pair) 1901 // 1902 stringLastCharIdx = fRXPat->fLiteralText.length(); 1903 stringLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1); 1904 lastChar = fRXPat->fLiteralText.char32At(stringLastCharIdx); 1905 1906 // The string should always be at least two code points long, meaning that there 1907 // should be something before the last char position that we just found. 1908 U_ASSERT(stringLastCharIdx > stringStart); 1909 stringNextToLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1); 1910 U_ASSERT(stringNextToLastCharIdx >= stringStart); 1911 nextToLastChar = fRXPat->fLiteralText.char32At(stringNextToLastCharIdx); 1912 1913 if (stringNextToLastCharIdx > stringStart) { 1914 // The length of string remaining after removing one char is two or more. 1915 // Leave the string in the compiled pattern, shorten it by one char, 1916 // and append a URX_ONECHAR op for the last char. 1917 stringLen -= (fRXPat->fLiteralText.length() - stringLastCharIdx); 1918 op = URX_BUILD(URX_STRING_LEN, stringLen); 1919 fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1); 1920 emitONE_CHAR(lastChar); 1921 } else { 1922 // The original string consisted of exactly two characters. Replace 1923 // the existing compiled URX_STRING/URX_STRING_LEN ops with a pair 1924 // of URX_ONECHARs. 1925 fRXPat->fCompiledPat->setSize(fRXPat->fCompiledPat->size() -2); 1926 emitONE_CHAR(nextToLastChar); 1927 emitONE_CHAR(lastChar); 1928 } 1929 } 1930 1931 1932 1933 1934 1935 1936 //------------------------------------------------------------------------------ 1937 // 1938 // insertOp() Insert a slot for a new opcode into the already 1939 // compiled pattern code. 1940 // 1941 // Fill the slot with a NOP. Our caller will replace it 1942 // with what they really wanted. 1943 // 1944 //------------------------------------------------------------------------------ 1945 void RegexCompile::insertOp(int32_t where) { 1946 UVector64 *code = fRXPat->fCompiledPat; 1947 U_ASSERT(where>0 && where < code->size()); 1948 1949 int32_t nop = URX_BUILD(URX_NOP, 0); 1950 code->insertElementAt(nop, where, *fStatus); 1951 1952 // Walk through the pattern, looking for any ops with targets that 1953 // were moved down by the insert. Fix them. 1954 int32_t loc; 1955 for (loc=0; loc<code->size(); loc++) { 1956 int32_t op = (int32_t)code->elementAti(loc); 1957 int32_t opType = URX_TYPE(op); 1958 int32_t opValue = URX_VAL(op); 1959 if ((opType == URX_JMP || 1960 opType == URX_JMPX || 1961 opType == URX_STATE_SAVE || 1962 opType == URX_CTR_LOOP || 1963 opType == URX_CTR_LOOP_NG || 1964 opType == URX_JMP_SAV || 1965 // BEGIN Android-added 1966 // ICU ticket:8666. 1967 opType == URX_JMP_SAV_X || 1968 // END Android-added 1969 opType == URX_RELOC_OPRND) && opValue > where) { 1970 // Target location for this opcode is after the insertion point and 1971 // needs to be incremented to adjust for the insertion. 1972 opValue++; 1973 op = URX_BUILD(opType, opValue); 1974 code->setElementAt(op, loc); 1975 } 1976 } 1977 1978 // Now fix up the parentheses stack. All positive values in it are locations in 1979 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.) 1980 for (loc=0; loc<fParenStack.size(); loc++) { 1981 int32_t x = fParenStack.elementAti(loc); 1982 U_ASSERT(x < code->size()); 1983 if (x>where) { 1984 x++; 1985 fParenStack.setElementAt(x, loc); 1986 } 1987 } 1988 1989 if (fMatchCloseParen > where) { 1990 fMatchCloseParen++; 1991 } 1992 if (fMatchOpenParen > where) { 1993 fMatchOpenParen++; 1994 } 1995 } 1996 1997 1998 1999 //------------------------------------------------------------------------------ 2000 // 2001 // blockTopLoc() Find or create a location in the compiled pattern 2002 // at the start of the operation or block that has 2003 // just been compiled. Needed when a quantifier (* or 2004 // whatever) appears, and we need to add an operation 2005 // at the start of the thing being quantified. 2006 // 2007 // (Parenthesized Blocks) have a slot with a NOP that 2008 // is reserved for this purpose. .* or similar don't 2009 // and a slot needs to be added. 2010 // 2011 // parameter reserveLoc : TRUE - ensure that there is space to add an opcode 2012 // at the returned location. 2013 // FALSE - just return the address, 2014 // do not reserve a location there. 2015 // 2016 //------------------------------------------------------------------------------ 2017 int32_t RegexCompile::blockTopLoc(UBool reserveLoc) { 2018 int32_t theLoc; 2019 if (fRXPat->fCompiledPat->size() == fMatchCloseParen) 2020 { 2021 // The item just processed is a parenthesized block. 2022 theLoc = fMatchOpenParen; // A slot is already reserved for us. 2023 U_ASSERT(theLoc > 0); 2024 U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP); 2025 } 2026 else { 2027 // Item just compiled is a single thing, a ".", or a single char, or a set reference. 2028 // No slot for STATE_SAVE was pre-reserved in the compiled code. 2029 // We need to make space now. 2030 fixLiterals(TRUE); // If last item was a string, separate the last char. 2031 theLoc = fRXPat->fCompiledPat->size()-1; 2032 if (reserveLoc) { 2033 /*int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);*/ 2034 int32_t nop = URX_BUILD(URX_NOP, 0); 2035 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); 2036 } 2037 } 2038 return theLoc; 2039 } 2040 2041 2042 2043 //------------------------------------------------------------------------------ 2044 // 2045 // handleCloseParen When compiling a close paren, we need to go back 2046 // and fix up any JMP or SAVE operations within the 2047 // parenthesized block that need to target the end 2048 // of the block. The locations of these are kept on 2049 // the paretheses stack. 2050 // 2051 // This function is called both when encountering a 2052 // real ) and at the end of the pattern. 2053 // 2054 //------------------------------------------------------------------------------ 2055 void RegexCompile::handleCloseParen() { 2056 int32_t patIdx; 2057 int32_t patOp; 2058 if (fParenStack.size() <= 0) { 2059 error(U_REGEX_MISMATCHED_PAREN); 2060 return; 2061 } 2062 2063 // Force any literal chars that may follow the close paren to start a new string, 2064 // and not attach to any preceding it. 2065 fixLiterals(FALSE); 2066 2067 // Fixup any operations within the just-closed parenthesized group 2068 // that need to reference the end of the (block). 2069 // (The first one popped from the stack is an unused slot for 2070 // alternation (OR) state save, but applying the fixup to it does no harm.) 2071 for (;;) { 2072 patIdx = fParenStack.popi(); 2073 if (patIdx < 0) { 2074 // value < 0 flags the start of the frame on the paren stack. 2075 break; 2076 } 2077 U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size()); 2078 patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx); 2079 U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set. 2080 patOp |= fRXPat->fCompiledPat->size(); // Set it now. 2081 fRXPat->fCompiledPat->setElementAt(patOp, patIdx); 2082 fMatchOpenParen = patIdx; 2083 } 2084 2085 // At the close of any parenthesized block, restore the match mode flags to 2086 // the value they had at the open paren. Saved value is 2087 // at the top of the paren stack. 2088 fModeFlags = fParenStack.popi(); 2089 U_ASSERT(fModeFlags < 0); 2090 2091 // DO any additional fixups, depending on the specific kind of 2092 // parentesized grouping this is 2093 2094 switch (patIdx) { 2095 case plain: 2096 case flags: 2097 // No additional fixups required. 2098 // (Grouping-only parentheses) 2099 break; 2100 case capturing: 2101 // Capturing Parentheses. 2102 // Insert a End Capture op into the pattern. 2103 // The frame offset of the variables for this cg is obtained from the 2104 // start capture op and put it into the end-capture op. 2105 { 2106 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); 2107 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); 2108 2109 int32_t frameVarLocation = URX_VAL(captureOp); 2110 int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation); 2111 fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus); 2112 } 2113 break; 2114 case atomic: 2115 // Atomic Parenthesis. 2116 // Insert a LD_SP operation to restore the state stack to the position 2117 // it was when the atomic parens were entered. 2118 { 2119 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); 2120 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); 2121 int32_t stoLoc = URX_VAL(stoOp); 2122 int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc); 2123 fRXPat->fCompiledPat->addElement(ldOp, *fStatus); 2124 } 2125 break; 2126 2127 case lookAhead: 2128 { 2129 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5); 2130 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); 2131 int32_t dataLoc = URX_VAL(startOp); 2132 int32_t op = URX_BUILD(URX_LA_END, dataLoc); 2133 fRXPat->fCompiledPat->addElement(op, *fStatus); 2134 } 2135 break; 2136 2137 case negLookAhead: 2138 { 2139 // See comment at doOpenLookAheadNeg 2140 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1); 2141 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); 2142 int32_t dataLoc = URX_VAL(startOp); 2143 int32_t op = URX_BUILD(URX_LA_END, dataLoc); 2144 fRXPat->fCompiledPat->addElement(op, *fStatus); 2145 op = URX_BUILD(URX_BACKTRACK, 0); 2146 fRXPat->fCompiledPat->addElement(op, *fStatus); 2147 op = URX_BUILD(URX_LA_END, dataLoc); 2148 fRXPat->fCompiledPat->addElement(op, *fStatus); 2149 2150 // Patch the URX_SAVE near the top of the block. 2151 // The destination of the SAVE is the final LA_END that was just added. 2152 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen); 2153 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); 2154 int32_t dest = fRXPat->fCompiledPat->size()-1; 2155 saveOp = URX_BUILD(URX_STATE_SAVE, dest); 2156 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); 2157 } 2158 break; 2159 2160 case lookBehind: 2161 { 2162 // See comment at doOpenLookBehind. 2163 2164 // Append the URX_LB_END and URX_LA_END to the compiled pattern. 2165 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4); 2166 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); 2167 int32_t dataLoc = URX_VAL(startOp); 2168 int32_t op = URX_BUILD(URX_LB_END, dataLoc); 2169 fRXPat->fCompiledPat->addElement(op, *fStatus); 2170 op = URX_BUILD(URX_LA_END, dataLoc); 2171 fRXPat->fCompiledPat->addElement(op, *fStatus); 2172 2173 // Determine the min and max bounds for the length of the 2174 // string that the pattern can match. 2175 // An unbounded upper limit is an error. 2176 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; 2177 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); 2178 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); 2179 if (maxML == INT32_MAX) { 2180 error(U_REGEX_LOOK_BEHIND_LIMIT); 2181 break; 2182 } 2183 U_ASSERT(minML <= maxML); 2184 2185 // Insert the min and max match len bounds into the URX_LB_CONT op that 2186 // appears at the top of the look-behind block, at location fMatchOpenParen+1 2187 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2); 2188 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1); 2189 2190 } 2191 break; 2192 2193 2194 2195 case lookBehindN: 2196 { 2197 // See comment at doOpenLookBehindNeg. 2198 2199 // Append the URX_LBN_END to the compiled pattern. 2200 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5); 2201 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); 2202 int32_t dataLoc = URX_VAL(startOp); 2203 int32_t op = URX_BUILD(URX_LBN_END, dataLoc); 2204 fRXPat->fCompiledPat->addElement(op, *fStatus); 2205 2206 // Determine the min and max bounds for the length of the 2207 // string that the pattern can match. 2208 // An unbounded upper limit is an error. 2209 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; 2210 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); 2211 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); 2212 if (maxML == INT32_MAX) { 2213 error(U_REGEX_LOOK_BEHIND_LIMIT); 2214 break; 2215 } 2216 U_ASSERT(minML <= maxML); 2217 2218 // Insert the min and max match len bounds into the URX_LB_CONT op that 2219 // appears at the top of the look-behind block, at location fMatchOpenParen+1 2220 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3); 2221 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2); 2222 2223 // Insert the pattern location to continue at after a successful match 2224 // as the last operand of the URX_LBN_CONT 2225 op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()); 2226 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1); 2227 } 2228 break; 2229 2230 2231 2232 default: 2233 U_ASSERT(FALSE); 2234 } 2235 2236 // remember the next location in the compiled pattern. 2237 // The compilation of Quantifiers will look at this to see whether its looping 2238 // over a parenthesized block or a single item 2239 fMatchCloseParen = fRXPat->fCompiledPat->size(); 2240 } 2241 2242 2243 2244 //------------------------------------------------------------------------------ 2245 // 2246 // compileSet Compile the pattern operations for a reference to a 2247 // UnicodeSet. 2248 // 2249 //------------------------------------------------------------------------------ 2250 void RegexCompile::compileSet(UnicodeSet *theSet) 2251 { 2252 if (theSet == NULL) { 2253 return; 2254 } 2255 // Remove any strings from the set. 2256 // There shoudn't be any, but just in case. 2257 // (Case Closure can add them; if we had a simple case closure avaialble that 2258 // ignored strings, that would be better.) 2259 theSet->removeAllStrings(); 2260 int32_t setSize = theSet->size(); 2261 2262 switch (setSize) { 2263 case 0: 2264 { 2265 // Set of no elements. Always fails to match. 2266 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus); 2267 delete theSet; 2268 } 2269 break; 2270 2271 case 1: 2272 { 2273 // The set contains only a single code point. Put it into 2274 // the compiled pattern as a single char operation rather 2275 // than a set, and discard the set itself. 2276 literalChar(theSet->charAt(0)); 2277 delete theSet; 2278 } 2279 break; 2280 2281 default: 2282 { 2283 // The set contains two or more chars. (the normal case) 2284 // Put it into the compiled pattern as a set. 2285 int32_t setNumber = fRXPat->fSets->size(); 2286 fRXPat->fSets->addElement(theSet, *fStatus); 2287 int32_t setOp = URX_BUILD(URX_SETREF, setNumber); 2288 fRXPat->fCompiledPat->addElement(setOp, *fStatus); 2289 } 2290 } 2291 } 2292 2293 2294 //------------------------------------------------------------------------------ 2295 // 2296 // compileInterval Generate the code for a {min, max} style interval quantifier. 2297 // Except for the specific opcodes used, the code is the same 2298 // for all three types (greedy, non-greedy, possessive) of 2299 // intervals. The opcodes are supplied as parameters. 2300 // 2301 // The code for interval loops has this form: 2302 // 0 CTR_INIT counter loc (in stack frame) 2303 // 1 5 patt address of CTR_LOOP at bottom of block 2304 // 2 min count 2305 // 3 max count (-1 for unbounded) 2306 // 4 ... block to be iterated over 2307 // 5 CTR_LOOP 2308 // 2309 // In 2310 //------------------------------------------------------------------------------ 2311 void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp) 2312 { 2313 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes 2314 // four slots in the compiled code. Reserve them. 2315 int32_t topOfBlock = blockTopLoc(TRUE); 2316 insertOp(topOfBlock); 2317 insertOp(topOfBlock); 2318 insertOp(topOfBlock); 2319 2320 // The operands for the CTR_INIT opcode include the index in the matcher data 2321 // of the counter. Allocate it now. 2322 int32_t counterLoc = fRXPat->fFrameSize; 2323 fRXPat->fFrameSize++; 2324 2325 int32_t op = URX_BUILD(InitOp, counterLoc); 2326 fRXPat->fCompiledPat->setElementAt(op, topOfBlock); 2327 2328 // The second operand of CTR_INIT is the location following the end of the loop. 2329 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the 2330 // compilation of something later on causes the code to grow and the target 2331 // position to move. 2332 int32_t loopEnd = fRXPat->fCompiledPat->size(); 2333 op = URX_BUILD(URX_RELOC_OPRND, loopEnd); 2334 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); 2335 2336 // Followed by the min and max counts. 2337 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); 2338 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); 2339 2340 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op. 2341 // Goes at end of the block being looped over, so just append to the code so far. 2342 op = URX_BUILD(LoopOp, topOfBlock); 2343 fRXPat->fCompiledPat->addElement(op, *fStatus); 2344 2345 if ((fIntervalLow & 0xff000000) != 0 || 2346 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { 2347 error(U_REGEX_NUMBER_TOO_BIG); 2348 } 2349 2350 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) { 2351 error(U_REGEX_MAX_LT_MIN); 2352 } 2353 } 2354 2355 2356 2357 UBool RegexCompile::compileInlineInterval() { 2358 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { 2359 // Too big to inline. Fail, which will cause looping code to be generated. 2360 // (Upper < Lower picks up unbounded upper and errors, both.) 2361 return FALSE; 2362 } 2363 2364 int32_t topOfBlock = blockTopLoc(FALSE); 2365 if (fIntervalUpper == 0) { 2366 // Pathological case. Attempt no matches, as if the block doesn't exist. 2367 fRXPat->fCompiledPat->setSize(topOfBlock); 2368 return TRUE; 2369 } 2370 2371 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { 2372 // The thing being repeated is not a single op, but some 2373 // more complex block. Do it as a loop, not inlines. 2374 // Note that things "repeated" a max of once are handled as inline, because 2375 // the one copy of the code already generated is just fine. 2376 return FALSE; 2377 } 2378 2379 // Pick up the opcode that is to be repeated 2380 // 2381 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock); 2382 2383 // Compute the pattern location where the inline sequence 2384 // will end, and set up the state save op that will be needed. 2385 // 2386 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1 2387 + fIntervalUpper + (fIntervalUpper-fIntervalLow); 2388 int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc); 2389 if (fIntervalLow == 0) { 2390 insertOp(topOfBlock); 2391 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock); 2392 } 2393 2394 2395 2396 // Loop, emitting the op for the thing being repeated each time. 2397 // Loop starts at 1 because one instance of the op already exists in the pattern, 2398 // it was put there when it was originally encountered. 2399 int32_t i; 2400 for (i=1; i<fIntervalUpper; i++ ) { 2401 if (i == fIntervalLow) { 2402 fRXPat->fCompiledPat->addElement(saveOp, *fStatus); 2403 } 2404 if (i > fIntervalLow) { 2405 fRXPat->fCompiledPat->addElement(saveOp, *fStatus); 2406 } 2407 fRXPat->fCompiledPat->addElement(op, *fStatus); 2408 } 2409 return TRUE; 2410 } 2411 2412 2413 2414 //------------------------------------------------------------------------------ 2415 // 2416 // matchStartType Determine how a match can start. 2417 // Used to optimize find() operations. 2418 // 2419 // Operation is very similar to minMatchLength(). Walk the compiled 2420 // pattern, keeping an on-going minimum-match-length. For any 2421 // op where the min match coming in is zero, add that ops possible 2422 // starting matches to the possible starts for the overall pattern. 2423 // 2424 //------------------------------------------------------------------------------ 2425 void RegexCompile::matchStartType() { 2426 if (U_FAILURE(*fStatus)) { 2427 return; 2428 } 2429 2430 2431 int32_t loc; // Location in the pattern of the current op being processed. 2432 int32_t op; // The op being processed 2433 int32_t opType; // The opcode type of the op 2434 int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern 2435 int32_t numInitialStrings = 0; // Number of strings encountered that could match at start. 2436 2437 UBool atStart = TRUE; // True if no part of the pattern yet encountered 2438 // could have advanced the position in a match. 2439 // (Maximum match length so far == 0) 2440 2441 // forwardedLength is a vector holding minimum-match-length values that 2442 // are propagated forward in the pattern by JMP or STATE_SAVE operations. 2443 // It must be one longer than the pattern being checked because some ops 2444 // will jmp to a end-of-block+1 location from within a block, and we must 2445 // count those when checking the block. 2446 int32_t end = fRXPat->fCompiledPat->size(); 2447 UVector32 forwardedLength(end+1, *fStatus); 2448 forwardedLength.setSize(end+1); 2449 for (loc=3; loc<end; loc++) { 2450 forwardedLength.setElementAt(INT32_MAX, loc); 2451 } 2452 2453 for (loc = 3; loc<end; loc++) { 2454 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2455 opType = URX_TYPE(op); 2456 2457 // The loop is advancing linearly through the pattern. 2458 // If the op we are now at was the destination of a branch in the pattern, 2459 // and that path has a shorter minimum length than the current accumulated value, 2460 // replace the current accumulated value. 2461 if (forwardedLength.elementAti(loc) < currentLen) { 2462 currentLen = forwardedLength.elementAti(loc); 2463 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 2464 } 2465 2466 switch (opType) { 2467 // Ops that don't change the total length matched 2468 case URX_RESERVED_OP: 2469 case URX_END: 2470 case URX_FAIL: 2471 case URX_STRING_LEN: 2472 case URX_NOP: 2473 case URX_START_CAPTURE: 2474 case URX_END_CAPTURE: 2475 case URX_BACKSLASH_B: 2476 case URX_BACKSLASH_BU: 2477 case URX_BACKSLASH_G: 2478 case URX_BACKSLASH_Z: 2479 case URX_DOLLAR: 2480 case URX_DOLLAR_M: 2481 case URX_DOLLAR_D: 2482 case URX_DOLLAR_MD: 2483 case URX_RELOC_OPRND: 2484 case URX_STO_INP_LOC: 2485 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 2486 case URX_BACKREF_I: 2487 2488 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 2489 case URX_LD_SP: 2490 break; 2491 2492 case URX_CARET: 2493 if (atStart) { 2494 fRXPat->fStartType = START_START; 2495 } 2496 break; 2497 2498 case URX_CARET_M: 2499 case URX_CARET_M_UNIX: 2500 if (atStart) { 2501 fRXPat->fStartType = START_LINE; 2502 } 2503 break; 2504 2505 case URX_ONECHAR: 2506 if (currentLen == 0) { 2507 // This character could appear at the start of a match. 2508 // Add it to the set of possible starting characters. 2509 fRXPat->fInitialChars->add(URX_VAL(op)); 2510 numInitialStrings += 2; 2511 } 2512 currentLen++; 2513 atStart = FALSE; 2514 break; 2515 2516 2517 case URX_SETREF: 2518 if (currentLen == 0) { 2519 int32_t sn = URX_VAL(op); 2520 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); 2521 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn); 2522 fRXPat->fInitialChars->addAll(*s); 2523 numInitialStrings += 2; 2524 } 2525 currentLen++; 2526 atStart = FALSE; 2527 break; 2528 2529 case URX_LOOP_SR_I: 2530 // [Set]*, like a SETREF, above, in what it can match, 2531 // but may not match at all, so currentLen is not incremented. 2532 if (currentLen == 0) { 2533 int32_t sn = URX_VAL(op); 2534 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); 2535 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn); 2536 fRXPat->fInitialChars->addAll(*s); 2537 numInitialStrings += 2; 2538 } 2539 atStart = FALSE; 2540 break; 2541 2542 case URX_LOOP_DOT_I: 2543 if (currentLen == 0) { 2544 // .* at the start of a pattern. 2545 // Any character can begin the match. 2546 fRXPat->fInitialChars->clear(); 2547 fRXPat->fInitialChars->complement(); 2548 numInitialStrings += 2; 2549 } 2550 atStart = FALSE; 2551 break; 2552 2553 2554 case URX_STATIC_SETREF: 2555 if (currentLen == 0) { 2556 int32_t sn = URX_VAL(op); 2557 U_ASSERT(sn>0 && sn<URX_LAST_SET); 2558 const UnicodeSet *s = fRXPat->fStaticSets[sn]; 2559 fRXPat->fInitialChars->addAll(*s); 2560 numInitialStrings += 2; 2561 } 2562 currentLen++; 2563 atStart = FALSE; 2564 break; 2565 2566 2567 2568 case URX_STAT_SETREF_N: 2569 if (currentLen == 0) { 2570 int32_t sn = URX_VAL(op); 2571 const UnicodeSet *s = fRXPat->fStaticSets[sn]; 2572 UnicodeSet sc(*s); 2573 sc.complement(); 2574 fRXPat->fInitialChars->addAll(sc); 2575 numInitialStrings += 2; 2576 } 2577 currentLen++; 2578 atStart = FALSE; 2579 break; 2580 2581 2582 2583 case URX_BACKSLASH_D: 2584 // Digit Char 2585 if (currentLen == 0) { 2586 UnicodeSet s; 2587 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus); 2588 if (URX_VAL(op) != 0) { 2589 s.complement(); 2590 } 2591 fRXPat->fInitialChars->addAll(s); 2592 numInitialStrings += 2; 2593 } 2594 currentLen++; 2595 atStart = FALSE; 2596 break; 2597 2598 2599 case URX_ONECHAR_I: 2600 // Case Insensitive Single Character. 2601 if (currentLen == 0) { 2602 UChar32 c = URX_VAL(op); 2603 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { 2604 // character may have distinct cased forms. Add all of them 2605 // to the set of possible starting match chars. 2606 UnicodeSet s(c, c); 2607 s.closeOver(USET_CASE_INSENSITIVE); 2608 fRXPat->fInitialChars->addAll(s); 2609 } else { 2610 // Char has no case variants. Just add it as-is to the 2611 // set of possible starting chars. 2612 fRXPat->fInitialChars->add(c); 2613 } 2614 numInitialStrings += 2; 2615 } 2616 currentLen++; 2617 atStart = FALSE; 2618 break; 2619 2620 2621 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 2622 case URX_DOTANY_ALL: // . matches one or two. 2623 case URX_DOTANY: 2624 case URX_DOTANY_UNIX: 2625 if (currentLen == 0) { 2626 // These constructs are all bad news when they appear at the start 2627 // of a match. Any character can begin the match. 2628 fRXPat->fInitialChars->clear(); 2629 fRXPat->fInitialChars->complement(); 2630 numInitialStrings += 2; 2631 } 2632 currentLen++; 2633 atStart = FALSE; 2634 break; 2635 2636 2637 case URX_JMPX: 2638 loc++; // Except for extra operand on URX_JMPX, same as URX_JMP. 2639 case URX_JMP: 2640 { 2641 int32_t jmpDest = URX_VAL(op); 2642 if (jmpDest < loc) { 2643 // Loop of some kind. Can safely ignore, the worst that will happen 2644 // is that we understate the true minimum length 2645 currentLen = forwardedLength.elementAti(loc+1); 2646 2647 } else { 2648 // Forward jump. Propagate the current min length to the target loc of the jump. 2649 U_ASSERT(jmpDest <= end+1); 2650 if (forwardedLength.elementAti(jmpDest) > currentLen) { 2651 forwardedLength.setElementAt(currentLen, jmpDest); 2652 } 2653 } 2654 } 2655 atStart = FALSE; 2656 break; 2657 2658 case URX_JMP_SAV: 2659 case URX_JMP_SAV_X: 2660 // Combo of state save to the next loc, + jmp backwards. 2661 // Net effect on min. length computation is nothing. 2662 atStart = FALSE; 2663 break; 2664 2665 case URX_BACKTRACK: 2666 // Fails are kind of like a branch, except that the min length was 2667 // propagated already, by the state save. 2668 currentLen = forwardedLength.elementAti(loc+1); 2669 atStart = FALSE; 2670 break; 2671 2672 2673 case URX_STATE_SAVE: 2674 { 2675 // State Save, for forward jumps, propagate the current minimum. 2676 // of the state save. 2677 int32_t jmpDest = URX_VAL(op); 2678 if (jmpDest > loc) { 2679 if (currentLen < forwardedLength.elementAti(jmpDest)) { 2680 forwardedLength.setElementAt(currentLen, jmpDest); 2681 } 2682 } 2683 } 2684 atStart = FALSE; 2685 break; 2686 2687 2688 2689 2690 case URX_STRING: 2691 { 2692 loc++; 2693 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2694 int32_t stringLen = URX_VAL(stringLenOp); 2695 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); 2696 U_ASSERT(stringLenOp >= 2); 2697 if (currentLen == 0) { 2698 // Add the starting character of this string to the set of possible starting 2699 // characters for this pattern. 2700 int32_t stringStartIdx = URX_VAL(op); 2701 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); 2702 fRXPat->fInitialChars->add(c); 2703 2704 // Remember this string. After the entire pattern has been checked, 2705 // if nothing else is identified that can start a match, we'll use it. 2706 numInitialStrings++; 2707 fRXPat->fInitialStringIdx = stringStartIdx; 2708 fRXPat->fInitialStringLen = stringLen; 2709 } 2710 2711 currentLen += stringLen; 2712 atStart = FALSE; 2713 } 2714 break; 2715 2716 case URX_STRING_I: 2717 { 2718 // Case-insensitive string. Unlike exact-match strings, we won't 2719 // attempt a string search for possible match positions. But we 2720 // do update the set of possible starting characters. 2721 loc++; 2722 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2723 int32_t stringLen = URX_VAL(stringLenOp); 2724 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); 2725 U_ASSERT(stringLenOp >= 2); 2726 if (currentLen == 0) { 2727 // Add the starting character of this string to the set of possible starting 2728 // characters for this pattern. 2729 int32_t stringStartIdx = URX_VAL(op); 2730 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); 2731 UnicodeSet s(c, c); 2732 s.closeOver(USET_CASE_INSENSITIVE); 2733 fRXPat->fInitialChars->addAll(s); 2734 numInitialStrings += 2; // Matching on an initial string not possible. 2735 } 2736 currentLen += stringLen; 2737 atStart = FALSE; 2738 } 2739 break; 2740 2741 case URX_CTR_INIT: 2742 case URX_CTR_INIT_NG: 2743 { 2744 // Loop Init Ops. These don't change the min length, but they are 4 word ops 2745 // so location must be updated accordingly. 2746 // Loop Init Ops. 2747 // If the min loop count == 0 2748 // move loc forwards to the end of the loop, skipping over the body. 2749 // If the min count is > 0, 2750 // continue normal processing of the body of the loop. 2751 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1); 2752 loopEndLoc = URX_VAL(loopEndLoc); 2753 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2); 2754 if (minLoopCount == 0) { 2755 // Min Loop Count of 0, treat like a forward branch and 2756 // move the current minimum length up to the target 2757 // (end of loop) location. 2758 U_ASSERT(loopEndLoc <= end+1); 2759 if (forwardedLength.elementAti(loopEndLoc) > currentLen) { 2760 forwardedLength.setElementAt(currentLen, loopEndLoc); 2761 } 2762 } 2763 loc+=3; // Skips over operands of CTR_INIT 2764 } 2765 atStart = FALSE; 2766 break; 2767 2768 2769 case URX_CTR_LOOP: 2770 case URX_CTR_LOOP_NG: 2771 // Loop ops. 2772 // The jump is conditional, backwards only. 2773 atStart = FALSE; 2774 break; 2775 2776 case URX_LOOP_C: 2777 // More loop ops. These state-save to themselves. 2778 // don't change the minimum match 2779 atStart = FALSE; 2780 break; 2781 2782 2783 case URX_LA_START: 2784 case URX_LB_START: 2785 { 2786 // Look-around. Scan forward until the matching look-ahead end, 2787 // without processing the look-around block. This is overly pessimistic. 2788 2789 // Keep track of the nesting depth of look-around blocks. Boilerplate code for 2790 // lookahead contains two LA_END instructions, so count goes up by two 2791 // for each LA_START. 2792 int32_t depth = (opType == URX_LA_START? 2: 1); 2793 for (;;) { 2794 loc++; 2795 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2796 if (URX_TYPE(op) == URX_LA_START) { 2797 depth+=2; 2798 } 2799 if (URX_TYPE(op) == URX_LB_START) { 2800 depth++; 2801 } 2802 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) { 2803 depth--; 2804 if (depth == 0) { 2805 break; 2806 } 2807 } 2808 if (URX_TYPE(op) == URX_STATE_SAVE) { 2809 // Need this because neg lookahead blocks will FAIL to outside 2810 // of the block. 2811 int32_t jmpDest = URX_VAL(op); 2812 if (jmpDest > loc) { 2813 if (currentLen < forwardedLength.elementAti(jmpDest)) { 2814 forwardedLength.setElementAt(currentLen, jmpDest); 2815 } 2816 } 2817 } 2818 U_ASSERT(loc <= end); 2819 } 2820 } 2821 break; 2822 2823 case URX_LA_END: 2824 case URX_LB_CONT: 2825 case URX_LB_END: 2826 case URX_LBN_CONT: 2827 case URX_LBN_END: 2828 U_ASSERT(FALSE); // Shouldn't get here. These ops should be 2829 // consumed by the scan in URX_LA_START and LB_START 2830 2831 break; 2832 2833 default: 2834 U_ASSERT(FALSE); 2835 } 2836 2837 } 2838 2839 2840 // We have finished walking through the ops. Check whether some forward jump 2841 // propagated a shorter length to location end+1. 2842 if (forwardedLength.elementAti(end+1) < currentLen) { 2843 currentLen = forwardedLength.elementAti(end+1); 2844 } 2845 2846 2847 fRXPat->fInitialChars8->init(fRXPat->fInitialChars); 2848 2849 2850 // Sort out what we should check for when looking for candidate match start positions. 2851 // In order of preference, 2852 // 1. Start of input text buffer. 2853 // 2. A literal string. 2854 // 3. Start of line in multi-line mode. 2855 // 4. A single literal character. 2856 // 5. A character from a set of characters. 2857 // 2858 if (fRXPat->fStartType == START_START) { 2859 // Match only at the start of an input text string. 2860 // start type is already set. We're done. 2861 } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) { 2862 // Match beginning only with a literal string. 2863 UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx); 2864 U_ASSERT(fRXPat->fInitialChars->contains(c)); 2865 fRXPat->fStartType = START_STRING; 2866 fRXPat->fInitialChar = c; 2867 } else if (fRXPat->fStartType == START_LINE) { 2868 // Match at start of line in Multi-Line mode. 2869 // Nothing to do here; everything is already set. 2870 } else if (fRXPat->fMinMatchLen == 0) { 2871 // Zero length match possible. We could start anywhere. 2872 fRXPat->fStartType = START_NO_INFO; 2873 } else if (fRXPat->fInitialChars->size() == 1) { 2874 // All matches begin with the same char. 2875 fRXPat->fStartType = START_CHAR; 2876 fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0); 2877 U_ASSERT(fRXPat->fInitialChar != (UChar32)-1); 2878 } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE && 2879 fRXPat->fMinMatchLen > 0) { 2880 // Matches start with a set of character smaller than the set of all chars. 2881 fRXPat->fStartType = START_SET; 2882 } else { 2883 // Matches can start with anything 2884 fRXPat->fStartType = START_NO_INFO; 2885 } 2886 2887 return; 2888 } 2889 2890 2891 2892 //------------------------------------------------------------------------------ 2893 // 2894 // minMatchLength Calculate the length of the shortest string that could 2895 // match the specified pattern. 2896 // Length is in 16 bit code units, not code points. 2897 // 2898 // The calculated length may not be exact. The returned 2899 // value may be shorter than the actual minimum; it must 2900 // never be longer. 2901 // 2902 // start and end are the range of p-code operations to be 2903 // examined. The endpoints are included in the range. 2904 // 2905 //------------------------------------------------------------------------------ 2906 int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { 2907 if (U_FAILURE(*fStatus)) { 2908 return 0; 2909 } 2910 2911 U_ASSERT(start <= end); 2912 U_ASSERT(end < fRXPat->fCompiledPat->size()); 2913 2914 2915 int32_t loc; 2916 int32_t op; 2917 int32_t opType; 2918 int32_t currentLen = 0; 2919 2920 2921 // forwardedLength is a vector holding minimum-match-length values that 2922 // are propagated forward in the pattern by JMP or STATE_SAVE operations. 2923 // It must be one longer than the pattern being checked because some ops 2924 // will jmp to a end-of-block+1 location from within a block, and we must 2925 // count those when checking the block. 2926 UVector32 forwardedLength(end+2, *fStatus); 2927 forwardedLength.setSize(end+2); 2928 for (loc=start; loc<=end+1; loc++) { 2929 forwardedLength.setElementAt(INT32_MAX, loc); 2930 } 2931 2932 for (loc = start; loc<=end; loc++) { 2933 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2934 opType = URX_TYPE(op); 2935 2936 // The loop is advancing linearly through the pattern. 2937 // If the op we are now at was the destination of a branch in the pattern, 2938 // and that path has a shorter minimum length than the current accumulated value, 2939 // replace the current accumulated value. 2940 // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some 2941 // no-match-possible cases. 2942 if (forwardedLength.elementAti(loc) < currentLen) { 2943 currentLen = forwardedLength.elementAti(loc); 2944 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 2945 } 2946 2947 switch (opType) { 2948 // Ops that don't change the total length matched 2949 case URX_RESERVED_OP: 2950 case URX_END: 2951 case URX_STRING_LEN: 2952 case URX_NOP: 2953 case URX_START_CAPTURE: 2954 case URX_END_CAPTURE: 2955 case URX_BACKSLASH_B: 2956 case URX_BACKSLASH_BU: 2957 case URX_BACKSLASH_G: 2958 case URX_BACKSLASH_Z: 2959 case URX_CARET: 2960 case URX_DOLLAR: 2961 case URX_DOLLAR_M: 2962 case URX_DOLLAR_D: 2963 case URX_DOLLAR_MD: 2964 case URX_RELOC_OPRND: 2965 case URX_STO_INP_LOC: 2966 case URX_CARET_M: 2967 case URX_CARET_M_UNIX: 2968 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 2969 case URX_BACKREF_I: 2970 2971 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 2972 case URX_LD_SP: 2973 2974 case URX_JMP_SAV: 2975 case URX_JMP_SAV_X: 2976 break; 2977 2978 2979 // Ops that match a minimum of one character (one or two 16 bit code units.) 2980 // 2981 case URX_ONECHAR: 2982 case URX_STATIC_SETREF: 2983 case URX_STAT_SETREF_N: 2984 case URX_SETREF: 2985 case URX_BACKSLASH_D: 2986 case URX_ONECHAR_I: 2987 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 2988 case URX_DOTANY_ALL: // . matches one or two. 2989 case URX_DOTANY: 2990 case URX_DOTANY_UNIX: 2991 currentLen++; 2992 break; 2993 2994 2995 case URX_JMPX: 2996 loc++; // URX_JMPX has an extra operand, ignored here, 2997 // otherwise processed identically to URX_JMP. 2998 case URX_JMP: 2999 { 3000 int32_t jmpDest = URX_VAL(op); 3001 if (jmpDest < loc) { 3002 // Loop of some kind. Can safely ignore, the worst that will happen 3003 // is that we understate the true minimum length 3004 currentLen = forwardedLength.elementAti(loc+1); 3005 } else { 3006 // Forward jump. Propagate the current min length to the target loc of the jump. 3007 U_ASSERT(jmpDest <= end+1); 3008 if (forwardedLength.elementAti(jmpDest) > currentLen) { 3009 forwardedLength.setElementAt(currentLen, jmpDest); 3010 } 3011 } 3012 } 3013 break; 3014 3015 case URX_BACKTRACK: 3016 { 3017 // Back-tracks are kind of like a branch, except that the min length was 3018 // propagated already, by the state save. 3019 currentLen = forwardedLength.elementAti(loc+1); 3020 } 3021 break; 3022 3023 3024 case URX_STATE_SAVE: 3025 { 3026 // State Save, for forward jumps, propagate the current minimum. 3027 // of the state save. 3028 int32_t jmpDest = URX_VAL(op); 3029 if (jmpDest > loc) { 3030 if (currentLen < forwardedLength.elementAti(jmpDest)) { 3031 forwardedLength.setElementAt(currentLen, jmpDest); 3032 } 3033 } 3034 } 3035 break; 3036 3037 3038 case URX_STRING: 3039 case URX_STRING_I: 3040 { 3041 loc++; 3042 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3043 currentLen += URX_VAL(stringLenOp); 3044 } 3045 break; 3046 3047 3048 case URX_CTR_INIT: 3049 case URX_CTR_INIT_NG: 3050 { 3051 // Loop Init Ops. 3052 // If the min loop count == 0 3053 // move loc forwards to the end of the loop, skipping over the body. 3054 // If the min count is > 0, 3055 // continue normal processing of the body of the loop. 3056 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1); 3057 loopEndLoc = URX_VAL(loopEndLoc); 3058 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2); 3059 if (minLoopCount == 0) { 3060 loc = loopEndLoc; 3061 } else { 3062 loc+=3; // Skips over operands of CTR_INIT 3063 } 3064 } 3065 break; 3066 3067 3068 case URX_CTR_LOOP: 3069 case URX_CTR_LOOP_NG: 3070 // Loop ops. 3071 // The jump is conditional, backwards only. 3072 break; 3073 3074 case URX_LOOP_SR_I: 3075 case URX_LOOP_DOT_I: 3076 case URX_LOOP_C: 3077 // More loop ops. These state-save to themselves. 3078 // don't change the minimum match - could match nothing at all. 3079 break; 3080 3081 3082 case URX_LA_START: 3083 case URX_LB_START: 3084 { 3085 // Look-around. Scan forward until the matching look-ahead end, 3086 // without processing the look-around block. This is overly pessimistic for look-ahead, 3087 // it assumes that the look-ahead match might be zero-length. 3088 // TODO: Positive lookahead could recursively do the block, then continue 3089 // with the longer of the block or the value coming in. Ticket 6060 3090 int32_t depth = (opType == URX_LA_START? 2: 1);; 3091 for (;;) { 3092 loc++; 3093 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3094 if (URX_TYPE(op) == URX_LA_START) { 3095 // The boilerplate for look-ahead includes two LA_END insturctions, 3096 // Depth will be decremented by each one when it is seen. 3097 depth += 2; 3098 } 3099 if (URX_TYPE(op) == URX_LB_START) { 3100 depth++; 3101 } 3102 if (URX_TYPE(op) == URX_LA_END) { 3103 depth--; 3104 if (depth == 0) { 3105 break; 3106 } 3107 } 3108 if (URX_TYPE(op)==URX_LBN_END) { 3109 depth--; 3110 if (depth == 0) { 3111 break; 3112 } 3113 } 3114 if (URX_TYPE(op) == URX_STATE_SAVE) { 3115 // Need this because neg lookahead blocks will FAIL to outside 3116 // of the block. 3117 int32_t jmpDest = URX_VAL(op); 3118 if (jmpDest > loc) { 3119 if (currentLen < forwardedLength.elementAti(jmpDest)) { 3120 forwardedLength.setElementAt(currentLen, jmpDest); 3121 } 3122 } 3123 } 3124 U_ASSERT(loc <= end); 3125 } 3126 } 3127 break; 3128 3129 case URX_LA_END: 3130 case URX_LB_CONT: 3131 case URX_LB_END: 3132 case URX_LBN_CONT: 3133 case URX_LBN_END: 3134 // Only come here if the matching URX_LA_START or URX_LB_START was not in the 3135 // range being sized, which happens when measuring size of look-behind blocks. 3136 break; 3137 3138 default: 3139 U_ASSERT(FALSE); 3140 } 3141 3142 } 3143 3144 // We have finished walking through the ops. Check whether some forward jump 3145 // propagated a shorter length to location end+1. 3146 if (forwardedLength.elementAti(end+1) < currentLen) { 3147 currentLen = forwardedLength.elementAti(end+1); 3148 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 3149 } 3150 3151 return currentLen; 3152 } 3153 3154 3155 3156 //------------------------------------------------------------------------------ 3157 // 3158 // maxMatchLength Calculate the length of the longest string that could 3159 // match the specified pattern. 3160 // Length is in 16 bit code units, not code points. 3161 // 3162 // The calculated length may not be exact. The returned 3163 // value may be longer than the actual maximum; it must 3164 // never be shorter. 3165 // 3166 //------------------------------------------------------------------------------ 3167 int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { 3168 if (U_FAILURE(*fStatus)) { 3169 return 0; 3170 } 3171 U_ASSERT(start <= end); 3172 U_ASSERT(end < fRXPat->fCompiledPat->size()); 3173 3174 3175 int32_t loc; 3176 int32_t op; 3177 int32_t opType; 3178 int32_t currentLen = 0; 3179 UVector32 forwardedLength(end+1, *fStatus); 3180 forwardedLength.setSize(end+1); 3181 3182 for (loc=start; loc<=end; loc++) { 3183 forwardedLength.setElementAt(0, loc); 3184 } 3185 3186 for (loc = start; loc<=end; loc++) { 3187 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3188 opType = URX_TYPE(op); 3189 3190 // The loop is advancing linearly through the pattern. 3191 // If the op we are now at was the destination of a branch in the pattern, 3192 // and that path has a longer maximum length than the current accumulated value, 3193 // replace the current accumulated value. 3194 if (forwardedLength.elementAti(loc) > currentLen) { 3195 currentLen = forwardedLength.elementAti(loc); 3196 } 3197 3198 switch (opType) { 3199 // Ops that don't change the total length matched 3200 case URX_RESERVED_OP: 3201 case URX_END: 3202 case URX_STRING_LEN: 3203 case URX_NOP: 3204 case URX_START_CAPTURE: 3205 case URX_END_CAPTURE: 3206 case URX_BACKSLASH_B: 3207 case URX_BACKSLASH_BU: 3208 case URX_BACKSLASH_G: 3209 case URX_BACKSLASH_Z: 3210 case URX_CARET: 3211 case URX_DOLLAR: 3212 case URX_DOLLAR_M: 3213 case URX_DOLLAR_D: 3214 case URX_DOLLAR_MD: 3215 case URX_RELOC_OPRND: 3216 case URX_STO_INP_LOC: 3217 case URX_CARET_M: 3218 case URX_CARET_M_UNIX: 3219 3220 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 3221 case URX_LD_SP: 3222 3223 case URX_LB_END: 3224 case URX_LB_CONT: 3225 case URX_LBN_CONT: 3226 case URX_LBN_END: 3227 break; 3228 3229 3230 // Ops that increase that cause an unbounded increase in the length 3231 // of a matched string, or that increase it a hard to characterize way. 3232 // Call the max length unbounded, and stop further checking. 3233 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 3234 case URX_BACKREF_I: 3235 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 3236 currentLen = INT32_MAX; 3237 break; 3238 3239 3240 // Ops that match a max of one character (possibly two 16 bit code units.) 3241 // 3242 case URX_STATIC_SETREF: 3243 case URX_STAT_SETREF_N: 3244 case URX_SETREF: 3245 case URX_BACKSLASH_D: 3246 case URX_ONECHAR_I: 3247 case URX_DOTANY_ALL: 3248 case URX_DOTANY: 3249 case URX_DOTANY_UNIX: 3250 currentLen+=2; 3251 break; 3252 3253 // Single literal character. Increase current max length by one or two, 3254 // depending on whether the char is in the supplementary range. 3255 case URX_ONECHAR: 3256 currentLen++; 3257 if (URX_VAL(op) > 0x10000) { 3258 currentLen++; 3259 } 3260 break; 3261 3262 // Jumps. 3263 // 3264 case URX_JMP: 3265 case URX_JMPX: 3266 case URX_JMP_SAV: 3267 case URX_JMP_SAV_X: 3268 { 3269 int32_t jmpDest = URX_VAL(op); 3270 if (jmpDest < loc) { 3271 // Loop of some kind. Max match length is unbounded. 3272 currentLen = INT32_MAX; 3273 } else { 3274 // Forward jump. Propagate the current min length to the target loc of the jump. 3275 if (forwardedLength.elementAti(jmpDest) < currentLen) { 3276 forwardedLength.setElementAt(currentLen, jmpDest); 3277 } 3278 currentLen = 0; 3279 } 3280 } 3281 break; 3282 3283 case URX_BACKTRACK: 3284 // back-tracks are kind of like a branch, except that the max length was 3285 // propagated already, by the state save. 3286 currentLen = forwardedLength.elementAti(loc+1); 3287 break; 3288 3289 3290 case URX_STATE_SAVE: 3291 { 3292 // State Save, for forward jumps, propagate the current minimum. 3293 // of the state save. 3294 // For backwards jumps, they create a loop, maximum 3295 // match length is unbounded. 3296 int32_t jmpDest = URX_VAL(op); 3297 if (jmpDest > loc) { 3298 if (currentLen > forwardedLength.elementAti(jmpDest)) { 3299 forwardedLength.setElementAt(currentLen, jmpDest); 3300 } 3301 } else { 3302 currentLen = INT32_MAX; 3303 } 3304 } 3305 break; 3306 3307 3308 3309 3310 case URX_STRING: 3311 case URX_STRING_I: 3312 { 3313 loc++; 3314 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3315 currentLen += URX_VAL(stringLenOp); 3316 } 3317 break; 3318 3319 3320 case URX_CTR_INIT: 3321 case URX_CTR_INIT_NG: 3322 case URX_CTR_LOOP: 3323 case URX_CTR_LOOP_NG: 3324 case URX_LOOP_SR_I: 3325 case URX_LOOP_DOT_I: 3326 case URX_LOOP_C: 3327 // For anything to do with loops, make the match length unbounded. 3328 // Note: INIT instructions are multi-word. Can ignore because 3329 // INT32_MAX length will stop the per-instruction loop. 3330 currentLen = INT32_MAX; 3331 break; 3332 3333 3334 3335 case URX_LA_START: 3336 case URX_LA_END: 3337 // Look-ahead. Just ignore, treat the look-ahead block as if 3338 // it were normal pattern. Gives a too-long match length, 3339 // but good enough for now. 3340 break; 3341 3342 // End of look-ahead ops should always be consumed by the processing at 3343 // the URX_LA_START op. 3344 // U_ASSERT(FALSE); 3345 // break; 3346 3347 case URX_LB_START: 3348 { 3349 // Look-behind. Scan forward until the matching look-around end, 3350 // without processing the look-behind block. 3351 int32_t depth = 0; 3352 for (;;) { 3353 loc++; 3354 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3355 if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) { 3356 depth++; 3357 } 3358 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) { 3359 if (depth == 0) { 3360 break; 3361 } 3362 depth--; 3363 } 3364 U_ASSERT(loc < end); 3365 } 3366 } 3367 break; 3368 3369 default: 3370 U_ASSERT(FALSE); 3371 } 3372 3373 3374 if (currentLen == INT32_MAX) { 3375 // The maximum length is unbounded. 3376 // Stop further processing of the pattern. 3377 break; 3378 } 3379 3380 } 3381 return currentLen; 3382 3383 } 3384 3385 3386 //------------------------------------------------------------------------------ 3387 // 3388 // stripNOPs Remove any NOP operations from the compiled pattern code. 3389 // Extra NOPs are inserted for some constructs during the initial 3390 // code generation to provide locations that may be patched later. 3391 // Many end up unneeded, and are removed by this function. 3392 // 3393 // In order to minimize the number of passes through the pattern, 3394 // back-reference fixup is also performed here (adjusting 3395 // back-reference operands to point to the correct frame offsets). 3396 // 3397 // In addition, case-insensitive character and string literals are 3398 // now case-folded here, rather than when first parsed or at match 3399 // time. 3400 // 3401 //------------------------------------------------------------------------------ 3402 void RegexCompile::stripNOPs() { 3403 3404 if (U_FAILURE(*fStatus)) { 3405 return; 3406 } 3407 3408 int32_t end = fRXPat->fCompiledPat->size(); 3409 UVector32 deltas(end, *fStatus); 3410 3411 // Make a first pass over the code, computing the amount that things 3412 // will be offset at each location in the original code. 3413 int32_t loc; 3414 int32_t d = 0; 3415 for (loc=0; loc<end; loc++) { 3416 deltas.addElement(d, *fStatus); 3417 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3418 if (URX_TYPE(op) == URX_NOP) { 3419 d++; 3420 } 3421 } 3422 3423 UnicodeString caseStringBuffer; 3424 int32_t stringDelta = 0; 3425 3426 // Make a second pass over the code, removing the NOPs by moving following 3427 // code up, and patching operands that refer to code locations that 3428 // are being moved. The array of offsets from the first step is used 3429 // to compute the new operand values. 3430 int32_t src; 3431 int32_t dst = 0; 3432 for (src=0; src<end; src++) { 3433 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src); 3434 int32_t opType = URX_TYPE(op); 3435 switch (opType) { 3436 case URX_NOP: 3437 break; 3438 3439 case URX_STATE_SAVE: 3440 case URX_JMP: 3441 case URX_CTR_LOOP: 3442 case URX_CTR_LOOP_NG: 3443 case URX_RELOC_OPRND: 3444 case URX_JMPX: 3445 case URX_JMP_SAV: 3446 case URX_JMP_SAV_X: 3447 // These are instructions with operands that refer to code locations. 3448 { 3449 int32_t operandAddress = URX_VAL(op); 3450 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size()); 3451 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress); 3452 op = URX_BUILD(opType, fixedOperandAddress); 3453 fRXPat->fCompiledPat->setElementAt(op, dst); 3454 dst++; 3455 break; 3456 } 3457 3458 case URX_ONECHAR_I: 3459 { 3460 UChar32 c = URX_VAL(op); 3461 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { 3462 // We have a cased character to fold 3463 c = u_foldCase(c, U_FOLD_CASE_DEFAULT); 3464 op = URX_BUILD(URX_ONECHAR_I, c); 3465 } 3466 3467 fRXPat->fCompiledPat->setElementAt(op, dst); 3468 dst++; 3469 break; 3470 } 3471 case URX_STRING_I: 3472 { 3473 op = URX_BUILD(URX_STRING_I, URX_VAL(op)+stringDelta); 3474 3475 src++; 3476 int32_t lengthOp = (int32_t)fRXPat->fCompiledPat->elementAti(src); 3477 3478 caseStringBuffer.setTo(fRXPat->fLiteralText, URX_VAL(op), URX_VAL(lengthOp)); 3479 caseStringBuffer.foldCase(U_FOLD_CASE_DEFAULT); 3480 3481 int32_t newLen = caseStringBuffer.length(); 3482 if (newLen <= URX_VAL(lengthOp)) { 3483 // don't shift if we don't have to, take the tiny memory hit of a smaller string 3484 fRXPat->fLiteralText.replace(URX_VAL(op), newLen, caseStringBuffer); 3485 } else { 3486 // shift other strings over...at least UnicodeString handles this for us! 3487 fRXPat->fLiteralText.replace(URX_VAL(op), URX_VAL(lengthOp), caseStringBuffer); 3488 stringDelta += newLen - URX_VAL(lengthOp); 3489 } 3490 lengthOp = URX_BUILD(URX_STRING_LEN, newLen); 3491 3492 fRXPat->fCompiledPat->setElementAt(op, dst); 3493 fRXPat->fCompiledPat->setElementAt(lengthOp, dst+1); 3494 dst += 2; 3495 break; 3496 } 3497 case URX_BACKREF: 3498 case URX_BACKREF_I: 3499 { 3500 int32_t where = URX_VAL(op); 3501 if (where > fRXPat->fGroupMap->size()) { 3502 error(U_REGEX_INVALID_BACK_REF); 3503 break; 3504 } 3505 where = fRXPat->fGroupMap->elementAti(where-1); 3506 op = URX_BUILD(opType, where); 3507 fRXPat->fCompiledPat->setElementAt(op, dst); 3508 dst++; 3509 3510 fRXPat->fNeedsAltInput = TRUE; 3511 break; 3512 } 3513 case URX_STRING: 3514 op = URX_BUILD(URX_STRING, URX_VAL(op)+stringDelta); 3515 // continue 3516 case URX_RESERVED_OP: 3517 case URX_RESERVED_OP_N: 3518 case URX_BACKTRACK: 3519 case URX_END: 3520 case URX_ONECHAR: 3521 case URX_STRING_LEN: 3522 case URX_START_CAPTURE: 3523 case URX_END_CAPTURE: 3524 case URX_STATIC_SETREF: 3525 case URX_STAT_SETREF_N: 3526 case URX_SETREF: 3527 case URX_DOTANY: 3528 case URX_FAIL: 3529 case URX_BACKSLASH_B: 3530 case URX_BACKSLASH_BU: 3531 case URX_BACKSLASH_G: 3532 case URX_BACKSLASH_X: 3533 case URX_BACKSLASH_Z: 3534 case URX_DOTANY_ALL: 3535 case URX_BACKSLASH_D: 3536 case URX_CARET: 3537 case URX_DOLLAR: 3538 case URX_CTR_INIT: 3539 case URX_CTR_INIT_NG: 3540 case URX_DOTANY_UNIX: 3541 case URX_STO_SP: 3542 case URX_LD_SP: 3543 case URX_STO_INP_LOC: 3544 case URX_LA_START: 3545 case URX_LA_END: 3546 case URX_DOLLAR_M: 3547 case URX_CARET_M: 3548 case URX_CARET_M_UNIX: 3549 case URX_LB_START: 3550 case URX_LB_CONT: 3551 case URX_LB_END: 3552 case URX_LBN_CONT: 3553 case URX_LBN_END: 3554 case URX_LOOP_SR_I: 3555 case URX_LOOP_DOT_I: 3556 case URX_LOOP_C: 3557 case URX_DOLLAR_D: 3558 case URX_DOLLAR_MD: 3559 // These instructions are unaltered by the relocation. 3560 fRXPat->fCompiledPat->setElementAt(op, dst); 3561 dst++; 3562 break; 3563 3564 default: 3565 // Some op is unaccounted for. 3566 U_ASSERT(FALSE); 3567 error(U_REGEX_INTERNAL_ERROR); 3568 } 3569 } 3570 3571 fRXPat->fCompiledPat->setSize(dst); 3572 } 3573 3574 3575 3576 3577 //------------------------------------------------------------------------------ 3578 // 3579 // Error Report a rule parse error. 3580 // Only report it if no previous error has been recorded. 3581 // 3582 //------------------------------------------------------------------------------ 3583 void RegexCompile::error(UErrorCode e) { 3584 if (U_SUCCESS(*fStatus)) { 3585 *fStatus = e; 3586 // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public 3587 // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are 3588 // int64_t. If the values of the latter are out of range for the former, 3589 // set them to the appropriate "field not supported" values. 3590 if (fLineNum > 0x7FFFFFFF) { 3591 fParseErr->line = 0; 3592 fParseErr->offset = -1; 3593 } else if (fCharNum > 0x7FFFFFFF) { 3594 fParseErr->line = (int32_t)fLineNum; 3595 fParseErr->offset = -1; 3596 } else { 3597 fParseErr->line = (int32_t)fLineNum; 3598 fParseErr->offset = (int32_t)fCharNum; 3599 } 3600 3601 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context 3602 3603 // Fill in the context. 3604 // Note: extractBetween() pins supplied indicies to the string bounds. 3605 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext)); 3606 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext)); 3607 utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status); 3608 utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status); 3609 } 3610 } 3611 3612 3613 // 3614 // Assorted Unicode character constants. 3615 // Numeric because there is no portable way to enter them as literals. 3616 // (Think EBCDIC). 3617 // 3618 static const UChar chCR = 0x0d; // New lines, for terminating comments. 3619 static const UChar chLF = 0x0a; // Line Feed 3620 static const UChar chPound = 0x23; // '#', introduces a comment. 3621 static const UChar chDigit0 = 0x30; // '0' 3622 static const UChar chDigit7 = 0x37; // '9' 3623 static const UChar chColon = 0x3A; // ':' 3624 static const UChar chE = 0x45; // 'E' 3625 static const UChar chQ = 0x51; // 'Q' 3626 static const UChar chN = 0x4E; // 'N' 3627 static const UChar chP = 0x50; // 'P' 3628 static const UChar chBackSlash = 0x5c; // '\' introduces a char escape 3629 static const UChar chLBracket = 0x5b; // '[' 3630 static const UChar chRBracket = 0x5d; // ']' 3631 static const UChar chUp = 0x5e; // '^' 3632 static const UChar chLowerP = 0x70; 3633 static const UChar chLBrace = 0x7b; // '{' 3634 static const UChar chRBrace = 0x7d; // '}' 3635 static const UChar chNEL = 0x85; // NEL newline variant 3636 static const UChar chLS = 0x2028; // Unicode Line Separator 3637 3638 3639 //------------------------------------------------------------------------------ 3640 // 3641 // nextCharLL Low Level Next Char from the regex pattern. 3642 // Get a char from the string, keep track of input position 3643 // for error reporting. 3644 // 3645 //------------------------------------------------------------------------------ 3646 UChar32 RegexCompile::nextCharLL() { 3647 UChar32 ch; 3648 3649 if (fPeekChar != -1) { 3650 ch = fPeekChar; 3651 fPeekChar = -1; 3652 return ch; 3653 } 3654 3655 // assume we're already in the right place 3656 ch = UTEXT_NEXT32(fRXPat->fPattern); 3657 if (ch == U_SENTINEL) { 3658 return ch; 3659 } 3660 3661 if (ch == chCR || 3662 ch == chNEL || 3663 ch == chLS || 3664 (ch == chLF && fLastChar != chCR)) { 3665 // Character is starting a new line. Bump up the line number, and 3666 // reset the column to 0. 3667 fLineNum++; 3668 fCharNum=0; 3669 } 3670 else { 3671 // Character is not starting a new line. Except in the case of a 3672 // LF following a CR, increment the column position. 3673 if (ch != chLF) { 3674 fCharNum++; 3675 } 3676 } 3677 fLastChar = ch; 3678 return ch; 3679 } 3680 3681 //------------------------------------------------------------------------------ 3682 // 3683 // peekCharLL Low Level Character Scanning, sneak a peek at the next 3684 // character without actually getting it. 3685 // 3686 //------------------------------------------------------------------------------ 3687 UChar32 RegexCompile::peekCharLL() { 3688 if (fPeekChar == -1) { 3689 fPeekChar = nextCharLL(); 3690 } 3691 return fPeekChar; 3692 } 3693 3694 3695 //------------------------------------------------------------------------------ 3696 // 3697 // nextChar for pattern scanning. At this level, we handle stripping 3698 // out comments and processing some backslash character escapes. 3699 // The rest of the pattern grammar is handled at the next level up. 3700 // 3701 //------------------------------------------------------------------------------ 3702 void RegexCompile::nextChar(RegexPatternChar &c) { 3703 3704 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 3705 c.fChar = nextCharLL(); 3706 c.fQuoted = FALSE; 3707 3708 if (fQuoteMode) { 3709 c.fQuoted = TRUE; 3710 if ((c.fChar==chBackSlash && peekCharLL()==chE) || c.fChar == (UChar32)-1) { 3711 fQuoteMode = FALSE; // Exit quote mode, 3712 nextCharLL(); // discard the E 3713 nextChar(c); // recurse to get the real next char 3714 } 3715 } 3716 else if (fInBackslashQuote) { 3717 // The current character immediately follows a '\' 3718 // Don't check for any further escapes, just return it as-is. 3719 // Don't set c.fQuoted, because that would prevent the state machine from 3720 // dispatching on the character. 3721 fInBackslashQuote = FALSE; 3722 } 3723 else 3724 { 3725 // We are not in a \Q quoted region \E of the source. 3726 // 3727 if (fModeFlags & UREGEX_COMMENTS) { 3728 // 3729 // We are in free-spacing and comments mode. 3730 // Scan through any white space and comments, until we 3731 // reach a significant character or the end of inut. 3732 for (;;) { 3733 if (c.fChar == (UChar32)-1) { 3734 break; // End of Input 3735 } 3736 if (c.fChar == chPound && fEOLComments == TRUE) { 3737 // Start of a comment. Consume the rest of it, until EOF or a new line 3738 for (;;) { 3739 c.fChar = nextCharLL(); 3740 if (c.fChar == (UChar32)-1 || // EOF 3741 c.fChar == chCR || 3742 c.fChar == chLF || 3743 c.fChar == chNEL || 3744 c.fChar == chLS) { 3745 break; 3746 } 3747 } 3748 } 3749 // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061. 3750 if (uprv_isRuleWhiteSpace(c.fChar) == FALSE) { 3751 break; 3752 } 3753 c.fChar = nextCharLL(); 3754 } 3755 } 3756 3757 // 3758 // check for backslash escaped characters. 3759 // 3760 if (c.fChar == chBackSlash) { 3761 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 3762 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) { 3763 // 3764 // A '\' sequence that is handled by ICU's standard unescapeAt function. 3765 // Includes \uxxxx, \n, \r, many others. 3766 // Return the single equivalent character. 3767 // 3768 nextCharLL(); // get & discard the peeked char. 3769 c.fQuoted = TRUE; 3770 3771 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) { 3772 int32_t endIndex = (int32_t)pos; 3773 c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents); 3774 3775 if (endIndex == pos) { 3776 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 3777 } 3778 fCharNum += endIndex - pos; 3779 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex); 3780 } else { 3781 int32_t offset = 0; 3782 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern); 3783 3784 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos); 3785 c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); 3786 3787 if (offset == 0) { 3788 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 3789 } else if (context.lastOffset == offset) { 3790 UTEXT_PREVIOUS32(fRXPat->fPattern); 3791 } else if (context.lastOffset != offset-1) { 3792 utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1); 3793 } 3794 fCharNum += offset; 3795 } 3796 } 3797 else if (peekCharLL() == chDigit0) { 3798 // Octal Escape, using Java Regexp Conventions 3799 // which are \0 followed by 1-3 octal digits. 3800 // Different from ICU Unescape handling of Octal, which does not 3801 // require the leading 0. 3802 // Java also has the convention of only consuming 2 octal digits if 3803 // the three digit number would be > 0xff 3804 // 3805 c.fChar = 0; 3806 nextCharLL(); // Consume the initial 0. 3807 int index; 3808 for (index=0; index<3; index++) { 3809 int32_t ch = peekCharLL(); 3810 if (ch<chDigit0 || ch>chDigit7) { 3811 if (index==0) { 3812 // \0 is not followed by any octal digits. 3813 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 3814 } 3815 break; 3816 } 3817 c.fChar <<= 3; 3818 c.fChar += ch&7; 3819 if (c.fChar <= 255) { 3820 nextCharLL(); 3821 } else { 3822 // The last digit made the number too big. Forget we saw it. 3823 c.fChar >>= 3; 3824 } 3825 } 3826 c.fQuoted = TRUE; 3827 } 3828 else if (peekCharLL() == chQ) { 3829 // "\Q" enter quote mode, which will continue until "\E" 3830 fQuoteMode = TRUE; 3831 nextCharLL(); // discard the 'Q'. 3832 nextChar(c); // recurse to get the real next char. 3833 } 3834 else 3835 { 3836 // We are in a '\' escape that will be handled by the state table scanner. 3837 // Just return the backslash, but remember that the following char is to 3838 // be taken literally. 3839 fInBackslashQuote = TRUE; 3840 } 3841 } 3842 } 3843 3844 // re-enable # to end-of-line comments, in case they were disabled. 3845 // They are disabled by the parser upon seeing '(?', but this lasts for 3846 // the fetching of the next character only. 3847 fEOLComments = TRUE; 3848 3849 // putc(c.fChar, stdout); 3850 } 3851 3852 3853 3854 //------------------------------------------------------------------------------ 3855 // 3856 // scanNamedChar 3857 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. 3858 // 3859 // The scan position will be at the 'N'. On return 3860 // the scan position should be just after the '}' 3861 // 3862 // Return the UChar32 3863 // 3864 //------------------------------------------------------------------------------ 3865 UChar32 RegexCompile::scanNamedChar() { 3866 if (U_FAILURE(*fStatus)) { 3867 return 0; 3868 } 3869 3870 nextChar(fC); 3871 if (fC.fChar != chLBrace) { 3872 error(U_REGEX_PROPERTY_SYNTAX); 3873 return 0; 3874 } 3875 3876 UnicodeString charName; 3877 for (;;) { 3878 nextChar(fC); 3879 if (fC.fChar == chRBrace) { 3880 break; 3881 } 3882 if (fC.fChar == -1) { 3883 error(U_REGEX_PROPERTY_SYNTAX); 3884 return 0; 3885 } 3886 charName.append(fC.fChar); 3887 } 3888 3889 char name[100]; 3890 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) || 3891 (uint32_t)charName.length()>=sizeof(name)) { 3892 // All Unicode character names have only invariant characters. 3893 // The API to get a character, given a name, accepts only char *, forcing us to convert, 3894 // which requires this error check 3895 error(U_REGEX_PROPERTY_SYNTAX); 3896 return 0; 3897 } 3898 charName.extract(0, charName.length(), name, sizeof(name), US_INV); 3899 3900 UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus); 3901 if (U_FAILURE(*fStatus)) { 3902 error(U_REGEX_PROPERTY_SYNTAX); 3903 } 3904 3905 nextChar(fC); // Continue overall regex pattern processing with char after the '}' 3906 return theChar; 3907 } 3908 3909 //------------------------------------------------------------------------------ 3910 // 3911 // scanProp Construct a UnicodeSet from the text at the current scan 3912 // position, which will be of the form \p{whaterver} 3913 // 3914 // The scan position will be at the 'p' or 'P'. On return 3915 // the scan position should be just after the '}' 3916 // 3917 // Return a UnicodeSet, constructed from the \P pattern, 3918 // or NULL if the pattern is invalid. 3919 // 3920 //------------------------------------------------------------------------------ 3921 UnicodeSet *RegexCompile::scanProp() { 3922 UnicodeSet *uset = NULL; 3923 3924 if (U_FAILURE(*fStatus)) { 3925 return NULL; 3926 } 3927 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP); 3928 UBool negated = (fC.fChar == chP); 3929 3930 UnicodeString propertyName; 3931 nextChar(fC); 3932 if (fC.fChar != chLBrace) { 3933 error(U_REGEX_PROPERTY_SYNTAX); 3934 return NULL; 3935 } 3936 for (;;) { 3937 nextChar(fC); 3938 if (fC.fChar == chRBrace) { 3939 break; 3940 } 3941 if (fC.fChar == -1) { 3942 // Hit the end of the input string without finding the closing '}' 3943 error(U_REGEX_PROPERTY_SYNTAX); 3944 return NULL; 3945 } 3946 propertyName.append(fC.fChar); 3947 } 3948 uset = createSetForProperty(propertyName, negated); 3949 nextChar(fC); // Move input scan to position following the closing '}' 3950 return uset; 3951 } 3952 3953 //------------------------------------------------------------------------------ 3954 // 3955 // scanPosixProp Construct a UnicodeSet from the text at the current scan 3956 // position, which is expected be of the form [:property expression:] 3957 // 3958 // The scan position will be at the opening ':'. On return 3959 // the scan position must be on the closing ']' 3960 // 3961 // Return a UnicodeSet constructed from the pattern, 3962 // or NULL if this is not a valid POSIX-style set expression. 3963 // If not a property expression, restore the initial scan position 3964 // (to the opening ':') 3965 // 3966 // Note: the opening '[:' is not sufficient to guarantee that 3967 // this is a [:property:] expression. 3968 // [:'+=,] is a perfectly good ordinary set expression that 3969 // happens to include ':' as one of its characters. 3970 // 3971 //------------------------------------------------------------------------------ 3972 UnicodeSet *RegexCompile::scanPosixProp() { 3973 UnicodeSet *uset = NULL; 3974 3975 if (U_FAILURE(*fStatus)) { 3976 return NULL; 3977 } 3978 3979 U_ASSERT(fC.fChar == chColon); 3980 3981 // Save the scanner state. 3982 // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062 3983 int64_t savedScanIndex = fScanIndex; 3984 int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 3985 UBool savedQuoteMode = fQuoteMode; 3986 UBool savedInBackslashQuote = fInBackslashQuote; 3987 UBool savedEOLComments = fEOLComments; 3988 int64_t savedLineNum = fLineNum; 3989 int64_t savedCharNum = fCharNum; 3990 UChar32 savedLastChar = fLastChar; 3991 UChar32 savedPeekChar = fPeekChar; 3992 RegexPatternChar savedfC = fC; 3993 3994 // Scan for a closing ]. A little tricky because there are some perverse 3995 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression, 3996 // ending on the second closing ]. 3997 3998 UnicodeString propName; 3999 UBool negated = FALSE; 4000 4001 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:] 4002 nextChar(fC); 4003 if (fC.fChar == chUp) { 4004 negated = TRUE; 4005 nextChar(fC); 4006 } 4007 4008 // Scan for the closing ":]", collecting the property name along the way. 4009 UBool sawPropSetTerminator = FALSE; 4010 for (;;) { 4011 propName.append(fC.fChar); 4012 nextChar(fC); 4013 if (fC.fQuoted || fC.fChar == -1) { 4014 // Escaped characters or end of input - either says this isn't a [:Property:] 4015 break; 4016 } 4017 if (fC.fChar == chColon) { 4018 nextChar(fC); 4019 if (fC.fChar == chRBracket) { 4020 sawPropSetTerminator = TRUE; 4021 } 4022 break; 4023 } 4024 } 4025 4026 if (sawPropSetTerminator) { 4027 uset = createSetForProperty(propName, negated); 4028 } 4029 else 4030 { 4031 // No closing ":]". 4032 // Restore the original scan position. 4033 // The main scanner will retry the input as a normal set expression, 4034 // not a [:Property:] expression. 4035 fScanIndex = savedScanIndex; 4036 fQuoteMode = savedQuoteMode; 4037 fInBackslashQuote = savedInBackslashQuote; 4038 fEOLComments = savedEOLComments; 4039 fLineNum = savedLineNum; 4040 fCharNum = savedCharNum; 4041 fLastChar = savedLastChar; 4042 fPeekChar = savedPeekChar; 4043 fC = savedfC; 4044 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex); 4045 } 4046 return uset; 4047 } 4048 4049 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) { 4050 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f); 4051 addCategory(set, U_GC_CF_MASK, ec); 4052 } 4053 4054 // 4055 // Create a Unicode Set from a Unicode Property expression. 4056 // This is common code underlying both \p{...} ane [:...:] expressions. 4057 // Includes trying the Java "properties" that aren't supported as 4058 // normal ICU UnicodeSet properties 4059 // 4060 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{" 4061 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{" 4062 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) { 4063 UnicodeString setExpr; 4064 UnicodeSet *set; 4065 uint32_t usetFlags = 0; 4066 4067 if (U_FAILURE(*fStatus)) { 4068 return NULL; 4069 } 4070 4071 // 4072 // First try the property as we received it 4073 // 4074 if (negated) { 4075 setExpr.append(negSetPrefix, -1); 4076 } else { 4077 setExpr.append(posSetPrefix, -1); 4078 } 4079 setExpr.append(propName); 4080 setExpr.append(chRBrace); 4081 setExpr.append(chRBracket); 4082 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 4083 usetFlags |= USET_CASE_INSENSITIVE; 4084 } 4085 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); 4086 if (U_SUCCESS(*fStatus)) { 4087 return set; 4088 } 4089 delete set; 4090 set = NULL; 4091 4092 // 4093 // The property as it was didn't work. 4094 4095 // Do [:word:]. It is not recognized as a property by UnicodeSet. "word" not standard POSIX 4096 // or standard Java, but many other regular expression packages do recognize it. 4097 4098 if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) { 4099 *fStatus = U_ZERO_ERROR; 4100 set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])); 4101 if (set == NULL) { 4102 *fStatus = U_MEMORY_ALLOCATION_ERROR; 4103 return set; 4104 } 4105 if (negated) { 4106 set->complement(); 4107 } 4108 return set; 4109 } 4110 4111 4112 // Do Java fixes - 4113 // InGreek -> InGreek or Coptic, that being the official Unicode name for that block. 4114 // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols. 4115 // 4116 // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols" 4117 // is accepted by Java. The property part of the name is compared 4118 // case-insenstively. The spaces must be exactly as shown, either 4119 // all there, or all omitted, with exactly one at each position 4120 // if they are present. From checking against JDK 1.6 4121 // 4122 // This code should be removed when ICU properties support the Java compatibility names 4123 // (ICU 4.0?) 4124 // 4125 UnicodeString mPropName = propName; 4126 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) { 4127 mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic"); 4128 } 4129 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 || 4130 mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) { 4131 mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols"); 4132 } 4133 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { 4134 mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint"); 4135 } 4136 4137 // See if the property looks like a Java "InBlockName", which 4138 // we will recast as "Block=BlockName" 4139 // 4140 static const UChar IN[] = {0x49, 0x6E, 0}; // "In" 4141 static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // "Block=" 4142 if (mPropName.startsWith(IN, 2) && propName.length()>=3) { 4143 setExpr.truncate(4); // Leaves "[\p{", or "[\P{" 4144 setExpr.append(BLOCK, -1); 4145 setExpr.append(UnicodeString(mPropName, 2)); // Property with the leading "In" removed. 4146 setExpr.append(chRBrace); 4147 setExpr.append(chRBracket); 4148 *fStatus = U_ZERO_ERROR; 4149 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); 4150 if (U_SUCCESS(*fStatus)) { 4151 return set; 4152 } 4153 delete set; 4154 set = NULL; 4155 } 4156 4157 if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) || 4158 propName.compare(UNICODE_STRING_SIMPLE("all")) == 0) 4159 { 4160 UErrorCode localStatus = U_ZERO_ERROR; 4161 //setExpr.remove(); 4162 set = new UnicodeSet(); 4163 // 4164 // Try the various Java specific properties. 4165 // These all begin with "java" 4166 // 4167 if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) { 4168 addCategory(set, U_GC_CN_MASK, localStatus); 4169 set->complement(); 4170 } 4171 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) { 4172 addCategory(set, U_GC_ND_MASK, localStatus); 4173 } 4174 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) { 4175 addIdentifierIgnorable(set, localStatus); 4176 } 4177 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) { 4178 set->add(0, 0x1F).add(0x7F, 0x9F); 4179 } 4180 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) { 4181 addCategory(set, U_GC_L_MASK, localStatus); 4182 addCategory(set, U_GC_SC_MASK, localStatus); 4183 addCategory(set, U_GC_PC_MASK, localStatus); 4184 addCategory(set, U_GC_ND_MASK, localStatus); 4185 addCategory(set, U_GC_NL_MASK, localStatus); 4186 addCategory(set, U_GC_MC_MASK, localStatus); 4187 addCategory(set, U_GC_MN_MASK, localStatus); 4188 addIdentifierIgnorable(set, localStatus); 4189 } 4190 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) { 4191 addCategory(set, U_GC_L_MASK, localStatus); 4192 addCategory(set, U_GC_NL_MASK, localStatus); 4193 addCategory(set, U_GC_SC_MASK, localStatus); 4194 addCategory(set, U_GC_PC_MASK, localStatus); 4195 } 4196 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) { 4197 addCategory(set, U_GC_L_MASK, localStatus); 4198 } 4199 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) { 4200 addCategory(set, U_GC_L_MASK, localStatus); 4201 addCategory(set, U_GC_ND_MASK, localStatus); 4202 } 4203 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) { 4204 addCategory(set, U_GC_LL_MASK, localStatus); 4205 } 4206 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) { 4207 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus); 4208 } 4209 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) { 4210 addCategory(set, U_GC_Z_MASK, localStatus); 4211 } 4212 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) { 4213 set->add(0x10000, UnicodeSet::MAX_VALUE); 4214 } 4215 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) { 4216 addCategory(set, U_GC_LT_MASK, localStatus); 4217 } 4218 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) { 4219 addCategory(set, U_GC_L_MASK, localStatus); 4220 addCategory(set, U_GC_NL_MASK, localStatus); 4221 } 4222 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) { 4223 addCategory(set, U_GC_L_MASK, localStatus); 4224 addCategory(set, U_GC_PC_MASK, localStatus); 4225 addCategory(set, U_GC_ND_MASK, localStatus); 4226 addCategory(set, U_GC_NL_MASK, localStatus); 4227 addCategory(set, U_GC_MC_MASK, localStatus); 4228 addCategory(set, U_GC_MN_MASK, localStatus); 4229 addIdentifierIgnorable(set, localStatus); 4230 } 4231 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) { 4232 addCategory(set, U_GC_LU_MASK, localStatus); 4233 } 4234 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) { 4235 set->add(0, UnicodeSet::MAX_VALUE); 4236 } 4237 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) { 4238 addCategory(set, U_GC_Z_MASK, localStatus); 4239 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f)); 4240 set->add(9, 0x0d).add(0x1c, 0x1f); 4241 } 4242 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { 4243 set->add(0, UnicodeSet::MAX_VALUE); 4244 } 4245 4246 if (U_SUCCESS(localStatus) && !set->isEmpty()) { 4247 *fStatus = U_ZERO_ERROR; 4248 if (usetFlags & USET_CASE_INSENSITIVE) { 4249 set->closeOver(USET_CASE_INSENSITIVE); 4250 } 4251 if (negated) { 4252 set->complement(); 4253 } 4254 return set; 4255 } 4256 delete set; 4257 set = NULL; 4258 } 4259 error(*fStatus); 4260 return NULL; 4261 } 4262 4263 4264 4265 // 4266 // SetEval Part of the evaluation of [set expressions]. 4267 // Perform any pending (stacked) operations with precedence 4268 // equal or greater to that of the next operator encountered 4269 // in the expression. 4270 // 4271 void RegexCompile::setEval(int32_t nextOp) { 4272 UnicodeSet *rightOperand = NULL; 4273 UnicodeSet *leftOperand = NULL; 4274 for (;;) { 4275 U_ASSERT(fSetOpStack.empty()==FALSE); 4276 int32_t pendingSetOperation = fSetOpStack.peeki(); 4277 if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) { 4278 break; 4279 } 4280 fSetOpStack.popi(); 4281 U_ASSERT(fSetStack.empty() == FALSE); 4282 rightOperand = (UnicodeSet *)fSetStack.peek(); 4283 switch (pendingSetOperation) { 4284 case setNegation: 4285 rightOperand->complement(); 4286 break; 4287 case setCaseClose: 4288 // TODO: need a simple close function. Ticket 6065 4289 rightOperand->closeOver(USET_CASE_INSENSITIVE); 4290 rightOperand->removeAllStrings(); 4291 break; 4292 case setDifference1: 4293 case setDifference2: 4294 fSetStack.pop(); 4295 leftOperand = (UnicodeSet *)fSetStack.peek(); 4296 leftOperand->removeAll(*rightOperand); 4297 delete rightOperand; 4298 break; 4299 case setIntersection1: 4300 case setIntersection2: 4301 fSetStack.pop(); 4302 leftOperand = (UnicodeSet *)fSetStack.peek(); 4303 leftOperand->retainAll(*rightOperand); 4304 delete rightOperand; 4305 break; 4306 case setUnion: 4307 fSetStack.pop(); 4308 leftOperand = (UnicodeSet *)fSetStack.peek(); 4309 leftOperand->addAll(*rightOperand); 4310 delete rightOperand; 4311 break; 4312 default: 4313 U_ASSERT(FALSE); 4314 break; 4315 } 4316 } 4317 } 4318 4319 void RegexCompile::setPushOp(int32_t op) { 4320 setEval(op); 4321 fSetOpStack.push(op, *fStatus); 4322 fSetStack.push(new UnicodeSet(), *fStatus); 4323 } 4324 4325 U_NAMESPACE_END 4326 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 4327 4328