1 // Copyright 2006 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Regular expression parser. 6 7 // The parser is a simple precedence-based parser with a 8 // manual stack. The parsing work is done by the methods 9 // of the ParseState class. The Regexp::Parse function is 10 // essentially just a lexer that calls the ParseState method 11 // for each token. 12 13 // The parser recognizes POSIX extended regular expressions 14 // excluding backreferences, collating elements, and collating 15 // classes. It also allows the empty string as a regular expression 16 // and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W. 17 // See regexp.h for rationale. 18 19 #include <ctype.h> 20 #include "util/util.h" 21 #include "re2/regexp.h" 22 #include "re2/stringpiece.h" 23 #include "re2/unicode_casefold.h" 24 #include "re2/unicode_groups.h" 25 26 namespace re2 { 27 28 // Regular expression parse state. 29 // The list of parsed regexps so far is maintained as a vector of 30 // Regexp pointers called the stack. Left parenthesis and vertical 31 // bar markers are also placed on the stack, as Regexps with 32 // non-standard opcodes. 33 // Scanning a left parenthesis causes the parser to push a left parenthesis 34 // marker on the stack. 35 // Scanning a vertical bar causes the parser to pop the stack until it finds a 36 // vertical bar or left parenthesis marker (not popping the marker), 37 // concatenate all the popped results, and push them back on 38 // the stack (DoConcatenation). 39 // Scanning a right parenthesis causes the parser to act as though it 40 // has seen a vertical bar, which then leaves the top of the stack in the 41 // form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar. 42 // The parser pops all this off the stack and creates an alternation of the 43 // regexps (DoAlternation). 44 45 class Regexp::ParseState { 46 public: 47 ParseState(ParseFlags flags, const StringPiece& whole_regexp, 48 RegexpStatus* status); 49 ~ParseState(); 50 51 ParseFlags flags() { return flags_; } 52 int rune_max() { return rune_max_; } 53 54 // Parse methods. All public methods return a bool saying 55 // whether parsing should continue. If a method returns 56 // false, it has set fields in *status_, and the parser 57 // should return NULL. 58 59 // Pushes the given regular expression onto the stack. 60 // Could check for too much memory used here. 61 bool PushRegexp(Regexp* re); 62 63 // Pushes the literal rune r onto the stack. 64 bool PushLiteral(Rune r); 65 66 // Pushes a regexp with the given op (and no args) onto the stack. 67 bool PushSimpleOp(RegexpOp op); 68 69 // Pushes a ^ onto the stack. 70 bool PushCarat(); 71 72 // Pushes a \b (word == true) or \B (word == false) onto the stack. 73 bool PushWordBoundary(bool word); 74 75 // Pushes a $ onto the stack. 76 bool PushDollar(); 77 78 // Pushes a . onto the stack 79 bool PushDot(); 80 81 // Pushes a repeat operator regexp onto the stack. 82 // A valid argument for the operator must already be on the stack. 83 // s is the name of the operator, for use in error messages. 84 bool PushRepeatOp(RegexpOp op, const StringPiece& s, bool nongreedy); 85 86 // Pushes a repetition regexp onto the stack. 87 // A valid argument for the operator must already be on the stack. 88 bool PushRepetition(int min, int max, const StringPiece& s, bool nongreedy); 89 90 // Checks whether a particular regexp op is a marker. 91 bool IsMarker(RegexpOp op); 92 93 // Processes a left parenthesis in the input. 94 // Pushes a marker onto the stack. 95 bool DoLeftParen(const StringPiece& name); 96 bool DoLeftParenNoCapture(); 97 98 // Processes a vertical bar in the input. 99 bool DoVerticalBar(); 100 101 // Processes a right parenthesis in the input. 102 bool DoRightParen(); 103 104 // Processes the end of input, returning the final regexp. 105 Regexp* DoFinish(); 106 107 // Finishes the regexp if necessary, preparing it for use 108 // in a more complicated expression. 109 // If it is a CharClassBuilder, converts into a CharClass. 110 Regexp* FinishRegexp(Regexp*); 111 112 // These routines don't manipulate the parse stack 113 // directly, but they do need to look at flags_. 114 // ParseCharClass also manipulates the internals of Regexp 115 // while creating *out_re. 116 117 // Parse a character class into *out_re. 118 // Removes parsed text from s. 119 bool ParseCharClass(StringPiece* s, Regexp** out_re, 120 RegexpStatus* status); 121 122 // Parse a character class character into *rp. 123 // Removes parsed text from s. 124 bool ParseCCCharacter(StringPiece* s, Rune *rp, 125 const StringPiece& whole_class, 126 RegexpStatus* status); 127 128 // Parse a character class range into rr. 129 // Removes parsed text from s. 130 bool ParseCCRange(StringPiece* s, RuneRange* rr, 131 const StringPiece& whole_class, 132 RegexpStatus* status); 133 134 // Parse a Perl flag set or non-capturing group from s. 135 bool ParsePerlFlags(StringPiece* s); 136 137 138 // Finishes the current concatenation, 139 // collapsing it into a single regexp on the stack. 140 void DoConcatenation(); 141 142 // Finishes the current alternation, 143 // collapsing it to a single regexp on the stack. 144 void DoAlternation(); 145 146 // Generalized DoAlternation/DoConcatenation. 147 void DoCollapse(RegexpOp op); 148 149 // Maybe concatenate Literals into LiteralString. 150 bool MaybeConcatString(int r, ParseFlags flags); 151 152 private: 153 ParseFlags flags_; 154 StringPiece whole_regexp_; 155 RegexpStatus* status_; 156 Regexp* stacktop_; 157 int ncap_; // number of capturing parens seen 158 int rune_max_; // maximum char value for this encoding 159 160 DISALLOW_EVIL_CONSTRUCTORS(ParseState); 161 }; 162 163 // Pseudo-operators - only on parse stack. 164 const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1); 165 const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2); 166 167 Regexp::ParseState::ParseState(ParseFlags flags, 168 const StringPiece& whole_regexp, 169 RegexpStatus* status) 170 : flags_(flags), whole_regexp_(whole_regexp), 171 status_(status), stacktop_(NULL), ncap_(0) { 172 if (flags_ & Latin1) 173 rune_max_ = 0xFF; 174 else 175 rune_max_ = Runemax; 176 } 177 178 // Cleans up by freeing all the regexps on the stack. 179 Regexp::ParseState::~ParseState() { 180 Regexp* next; 181 for (Regexp* re = stacktop_; re != NULL; re = next) { 182 next = re->down_; 183 re->down_ = NULL; 184 if (re->op() == kLeftParen) 185 delete re->name_; 186 re->Decref(); 187 } 188 } 189 190 // Finishes the regexp if necessary, preparing it for use in 191 // a more complex expression. 192 // If it is a CharClassBuilder, converts into a CharClass. 193 Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) { 194 if (re == NULL) 195 return NULL; 196 re->down_ = NULL; 197 198 if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) { 199 CharClassBuilder* ccb = re->ccb_; 200 re->ccb_ = NULL; 201 re->cc_ = ccb->GetCharClass(); 202 delete ccb; 203 } 204 205 return re; 206 } 207 208 // Pushes the given regular expression onto the stack. 209 // Could check for too much memory used here. 210 bool Regexp::ParseState::PushRegexp(Regexp* re) { 211 MaybeConcatString(-1, NoParseFlags); 212 213 // Special case: a character class of one character is just 214 // a literal. This is a common idiom for escaping 215 // single characters (e.g., [.] instead of \.), and some 216 // analysis does better with fewer character classes. 217 // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding. 218 if (re->op_ == kRegexpCharClass) { 219 if (re->ccb_->size() == 1) { 220 Rune r = re->ccb_->begin()->lo; 221 re->Decref(); 222 re = new Regexp(kRegexpLiteral, flags_); 223 re->rune_ = r; 224 } else if (re->ccb_->size() == 2) { 225 Rune r = re->ccb_->begin()->lo; 226 if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) { 227 re->Decref(); 228 re = new Regexp(kRegexpLiteral, flags_ | FoldCase); 229 re->rune_ = r + 'a' - 'A'; 230 } 231 } 232 } 233 234 if (!IsMarker(re->op())) 235 re->simple_ = re->ComputeSimple(); 236 re->down_ = stacktop_; 237 stacktop_ = re; 238 return true; 239 } 240 241 // Searches the case folding tables and returns the CaseFold* that contains r. 242 // If there isn't one, returns the CaseFold* with smallest f->lo bigger than r. 243 // If there isn't one, returns NULL. 244 CaseFold* LookupCaseFold(CaseFold *f, int n, Rune r) { 245 CaseFold* ef = f + n; 246 247 // Binary search for entry containing r. 248 while (n > 0) { 249 int m = n/2; 250 if (f[m].lo <= r && r <= f[m].hi) 251 return &f[m]; 252 if (r < f[m].lo) { 253 n = m; 254 } else { 255 f += m+1; 256 n -= m+1; 257 } 258 } 259 260 // There is no entry that contains r, but f points 261 // where it would have been. Unless f points at 262 // the end of the array, it points at the next entry 263 // after r. 264 if (f < ef) 265 return f; 266 267 // No entry contains r; no entry contains runes > r. 268 return NULL; 269 } 270 271 // Returns the result of applying the fold f to the rune r. 272 Rune ApplyFold(CaseFold *f, Rune r) { 273 switch (f->delta) { 274 default: 275 return r + f->delta; 276 277 case EvenOddSkip: // even <-> odd but only applies to every other 278 if ((r - f->lo) % 2) 279 return r; 280 // fall through 281 case EvenOdd: // even <-> odd 282 if (r%2 == 0) 283 return r + 1; 284 return r - 1; 285 286 case OddEvenSkip: // odd <-> even but only applies to every other 287 if ((r - f->lo) % 2) 288 return r; 289 // fall through 290 case OddEven: // odd <-> even 291 if (r%2 == 1) 292 return r + 1; 293 return r - 1; 294 } 295 } 296 297 // Returns the next Rune in r's folding cycle (see unicode_casefold.h). 298 // Examples: 299 // CycleFoldRune('A') = 'a' 300 // CycleFoldRune('a') = 'A' 301 // 302 // CycleFoldRune('K') = 'k' 303 // CycleFoldRune('k') = 0x212A (Kelvin) 304 // CycleFoldRune(0x212A) = 'K' 305 // 306 // CycleFoldRune('?') = '?' 307 Rune CycleFoldRune(Rune r) { 308 CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r); 309 if (f == NULL || r < f->lo) 310 return r; 311 return ApplyFold(f, r); 312 } 313 314 // Add lo-hi to the class, along with their fold-equivalent characters. 315 // If lo-hi is already in the class, assume that the fold-equivalent 316 // chars are there too, so there's no work to do. 317 static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) { 318 // AddFoldedRange calls itself recursively for each rune in the fold cycle. 319 // Most folding cycles are small: there aren't any bigger than four in the 320 // current Unicode tables. make_unicode_casefold.py checks that 321 // the cycles are not too long, and we double-check here using depth. 322 if (depth > 10) { 323 LOG(DFATAL) << "AddFoldedRange recurses too much."; 324 return; 325 } 326 327 if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done 328 return; 329 330 while (lo <= hi) { 331 CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo); 332 if (f == NULL) // lo has no fold, nor does anything above lo 333 break; 334 if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo 335 lo = f->lo; 336 continue; 337 } 338 339 // Add in the result of folding the range lo - f->hi 340 // and that range's fold, recursively. 341 Rune lo1 = lo; 342 Rune hi1 = min<Rune>(hi, f->hi); 343 switch (f->delta) { 344 default: 345 lo1 += f->delta; 346 hi1 += f->delta; 347 break; 348 case EvenOdd: 349 if (lo1%2 == 1) 350 lo1--; 351 if (hi1%2 == 0) 352 hi1++; 353 break; 354 case OddEven: 355 if (lo1%2 == 0) 356 lo1--; 357 if (hi1%2 == 1) 358 hi1++; 359 break; 360 } 361 AddFoldedRange(cc, lo1, hi1, depth+1); 362 363 // Pick up where this fold left off. 364 lo = f->hi + 1; 365 } 366 } 367 368 // Pushes the literal rune r onto the stack. 369 bool Regexp::ParseState::PushLiteral(Rune r) { 370 // Do case folding if needed. 371 if ((flags_ & FoldCase) && CycleFoldRune(r) != r) { 372 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); 373 re->ccb_ = new CharClassBuilder; 374 Rune r1 = r; 375 do { 376 if (!(flags_ & NeverNL) || r != '\n') { 377 re->ccb_->AddRange(r, r); 378 } 379 r = CycleFoldRune(r); 380 } while (r != r1); 381 re->ccb_->RemoveAbove(rune_max_); 382 return PushRegexp(re); 383 } 384 385 // Exclude newline if applicable. 386 if ((flags_ & NeverNL) && r == '\n') 387 return PushRegexp(new Regexp(kRegexpNoMatch, flags_)); 388 389 // No fancy stuff worked. Ordinary literal. 390 if (MaybeConcatString(r, flags_)) 391 return true; 392 393 Regexp* re = new Regexp(kRegexpLiteral, flags_); 394 re->rune_ = r; 395 return PushRegexp(re); 396 } 397 398 // Pushes a ^ onto the stack. 399 bool Regexp::ParseState::PushCarat() { 400 if (flags_ & OneLine) { 401 return PushSimpleOp(kRegexpBeginText); 402 } 403 return PushSimpleOp(kRegexpBeginLine); 404 } 405 406 // Pushes a \b or \B onto the stack. 407 bool Regexp::ParseState::PushWordBoundary(bool word) { 408 if (word) 409 return PushSimpleOp(kRegexpWordBoundary); 410 return PushSimpleOp(kRegexpNoWordBoundary); 411 } 412 413 // Pushes a $ onto the stack. 414 bool Regexp::ParseState::PushDollar() { 415 if (flags_ & OneLine) { 416 // Clumsy marker so that MimicsPCRE() can tell whether 417 // this kRegexpEndText was a $ and not a \z. 418 Regexp::ParseFlags oflags = flags_; 419 flags_ = flags_ | WasDollar; 420 bool ret = PushSimpleOp(kRegexpEndText); 421 flags_ = oflags; 422 return ret; 423 } 424 return PushSimpleOp(kRegexpEndLine); 425 } 426 427 // Pushes a . onto the stack. 428 bool Regexp::ParseState::PushDot() { 429 if ((flags_ & DotNL) && !(flags_ & NeverNL)) 430 return PushSimpleOp(kRegexpAnyChar); 431 // Rewrite . into [^\n] 432 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); 433 re->ccb_ = new CharClassBuilder; 434 re->ccb_->AddRange(0, '\n' - 1); 435 re->ccb_->AddRange('\n' + 1, rune_max_); 436 return PushRegexp(re); 437 } 438 439 // Pushes a regexp with the given op (and no args) onto the stack. 440 bool Regexp::ParseState::PushSimpleOp(RegexpOp op) { 441 Regexp* re = new Regexp(op, flags_); 442 return PushRegexp(re); 443 } 444 445 // Pushes a repeat operator regexp onto the stack. 446 // A valid argument for the operator must already be on the stack. 447 // The char c is the name of the operator, for use in error messages. 448 bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s, 449 bool nongreedy) { 450 if (stacktop_ == NULL || IsMarker(stacktop_->op())) { 451 status_->set_code(kRegexpRepeatArgument); 452 status_->set_error_arg(s); 453 return false; 454 } 455 Regexp::ParseFlags fl = flags_; 456 if (nongreedy) 457 fl = fl ^ NonGreedy; 458 Regexp* re = new Regexp(op, fl); 459 re->AllocSub(1); 460 re->down_ = stacktop_->down_; 461 re->sub()[0] = FinishRegexp(stacktop_); 462 re->simple_ = re->ComputeSimple(); 463 stacktop_ = re; 464 return true; 465 } 466 467 // Pushes a repetition regexp onto the stack. 468 // A valid argument for the operator must already be on the stack. 469 bool Regexp::ParseState::PushRepetition(int min, int max, 470 const StringPiece& s, 471 bool nongreedy) { 472 if ((max != -1 && max < min) || min > 1000 || max > 1000) { 473 status_->set_code(kRegexpRepeatSize); 474 status_->set_error_arg(s); 475 return false; 476 } 477 if (stacktop_ == NULL || IsMarker(stacktop_->op())) { 478 status_->set_code(kRegexpRepeatArgument); 479 status_->set_error_arg(s); 480 return false; 481 } 482 Regexp::ParseFlags fl = flags_; 483 if (nongreedy) 484 fl = fl ^ NonGreedy; 485 Regexp* re = new Regexp(kRegexpRepeat, fl); 486 re->min_ = min; 487 re->max_ = max; 488 re->AllocSub(1); 489 re->down_ = stacktop_->down_; 490 re->sub()[0] = FinishRegexp(stacktop_); 491 re->simple_ = re->ComputeSimple(); 492 493 stacktop_ = re; 494 return true; 495 } 496 497 // Checks whether a particular regexp op is a marker. 498 bool Regexp::ParseState::IsMarker(RegexpOp op) { 499 return op >= kLeftParen; 500 } 501 502 // Processes a left parenthesis in the input. 503 // Pushes a marker onto the stack. 504 bool Regexp::ParseState::DoLeftParen(const StringPiece& name) { 505 Regexp* re = new Regexp(kLeftParen, flags_); 506 re->cap_ = ++ncap_; 507 if (name.data() != NULL) 508 re->name_ = new string(name.as_string()); 509 return PushRegexp(re); 510 } 511 512 // Pushes a non-capturing marker onto the stack. 513 bool Regexp::ParseState::DoLeftParenNoCapture() { 514 Regexp* re = new Regexp(kLeftParen, flags_); 515 re->cap_ = -1; 516 return PushRegexp(re); 517 } 518 519 // Adds r to cc, along with r's upper case if foldascii is set. 520 static void AddLiteral(CharClassBuilder* cc, Rune r, bool foldascii) { 521 cc->AddRange(r, r); 522 if (foldascii && 'a' <= r && r <= 'z') 523 cc->AddRange(r + 'A' - 'a', r + 'A' - 'a'); 524 } 525 526 // Processes a vertical bar in the input. 527 bool Regexp::ParseState::DoVerticalBar() { 528 MaybeConcatString(-1, NoParseFlags); 529 DoConcatenation(); 530 531 // Below the vertical bar is a list to alternate. 532 // Above the vertical bar is a list to concatenate. 533 // We just did the concatenation, so either swap 534 // the result below the vertical bar or push a new 535 // vertical bar on the stack. 536 Regexp* r1; 537 Regexp* r2; 538 if ((r1 = stacktop_) != NULL && 539 (r2 = stacktop_->down_) != NULL && 540 r2->op() == kVerticalBar) { 541 // If above and below vertical bar are literal or char class, 542 // can merge into a single char class. 543 Regexp* r3; 544 if ((r1->op() == kRegexpLiteral || 545 r1->op() == kRegexpCharClass || 546 r1->op() == kRegexpAnyChar) && 547 (r3 = r2->down_) != NULL) { 548 Rune rune; 549 switch (r3->op()) { 550 case kRegexpLiteral: // convert to char class 551 rune = r3->rune_; 552 r3->op_ = kRegexpCharClass; 553 r3->cc_ = NULL; 554 r3->ccb_ = new CharClassBuilder; 555 AddLiteral(r3->ccb_, rune, r3->parse_flags_ & Regexp::FoldCase); 556 // fall through 557 case kRegexpCharClass: 558 if (r1->op() == kRegexpLiteral) 559 AddLiteral(r3->ccb_, r1->rune_, 560 r1->parse_flags_ & Regexp::FoldCase); 561 else if (r1->op() == kRegexpCharClass) 562 r3->ccb_->AddCharClass(r1->ccb_); 563 if (r1->op() == kRegexpAnyChar || r3->ccb_->full()) { 564 delete r3->ccb_; 565 r3->ccb_ = NULL; 566 r3->op_ = kRegexpAnyChar; 567 } 568 // fall through 569 case kRegexpAnyChar: 570 // pop r1 571 stacktop_ = r2; 572 r1->Decref(); 573 return true; 574 default: 575 break; 576 } 577 } 578 579 // Swap r1 below vertical bar (r2). 580 r1->down_ = r2->down_; 581 r2->down_ = r1; 582 stacktop_ = r2; 583 return true; 584 } 585 return PushSimpleOp(kVerticalBar); 586 } 587 588 // Processes a right parenthesis in the input. 589 bool Regexp::ParseState::DoRightParen() { 590 // Finish the current concatenation and alternation. 591 DoAlternation(); 592 593 // The stack should be: LeftParen regexp 594 // Remove the LeftParen, leaving the regexp, 595 // parenthesized. 596 Regexp* r1; 597 Regexp* r2; 598 if ((r1 = stacktop_) == NULL || 599 (r2 = r1->down_) == NULL || 600 r2->op() != kLeftParen) { 601 status_->set_code(kRegexpMissingParen); 602 status_->set_error_arg(whole_regexp_); 603 return false; 604 } 605 606 // Pop off r1, r2. Will Decref or reuse below. 607 stacktop_ = r2->down_; 608 609 // Restore flags from when paren opened. 610 Regexp* re = r2; 611 flags_ = re->parse_flags(); 612 613 // Rewrite LeftParen as capture if needed. 614 if (re->cap_ > 0) { 615 re->op_ = kRegexpCapture; 616 // re->cap_ is already set 617 re->AllocSub(1); 618 re->sub()[0] = FinishRegexp(r1); 619 re->simple_ = re->ComputeSimple(); 620 } else { 621 re->Decref(); 622 re = r1; 623 } 624 return PushRegexp(re); 625 } 626 627 // Processes the end of input, returning the final regexp. 628 Regexp* Regexp::ParseState::DoFinish() { 629 DoAlternation(); 630 Regexp* re = stacktop_; 631 if (re != NULL && re->down_ != NULL) { 632 status_->set_code(kRegexpMissingParen); 633 status_->set_error_arg(whole_regexp_); 634 return NULL; 635 } 636 stacktop_ = NULL; 637 return FinishRegexp(re); 638 } 639 640 // Returns the leading regexp that re starts with. 641 // The returned Regexp* points into a piece of re, 642 // so it must not be used after the caller calls re->Decref(). 643 Regexp* Regexp::LeadingRegexp(Regexp* re) { 644 if (re->op() == kRegexpEmptyMatch) 645 return NULL; 646 if (re->op() == kRegexpConcat && re->nsub() >= 2) { 647 Regexp** sub = re->sub(); 648 if (sub[0]->op() == kRegexpEmptyMatch) 649 return NULL; 650 return sub[0]; 651 } 652 return re; 653 } 654 655 // Removes LeadingRegexp(re) from re and returns what's left. 656 // Consumes the reference to re and may edit it in place. 657 // If caller wants to hold on to LeadingRegexp(re), 658 // must have already Incref'ed it. 659 Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) { 660 if (re->op() == kRegexpEmptyMatch) 661 return re; 662 if (re->op() == kRegexpConcat && re->nsub() >= 2) { 663 Regexp** sub = re->sub(); 664 if (sub[0]->op() == kRegexpEmptyMatch) 665 return re; 666 sub[0]->Decref(); 667 sub[0] = NULL; 668 if (re->nsub() == 2) { 669 // Collapse concatenation to single regexp. 670 Regexp* nre = sub[1]; 671 sub[1] = NULL; 672 re->Decref(); 673 return nre; 674 } 675 // 3 or more -> 2 or more. 676 re->nsub_--; 677 memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]); 678 return re; 679 } 680 Regexp::ParseFlags pf = re->parse_flags(); 681 re->Decref(); 682 return new Regexp(kRegexpEmptyMatch, pf); 683 } 684 685 // Returns the leading string that re starts with. 686 // The returned Rune* points into a piece of re, 687 // so it must not be used after the caller calls re->Decref(). 688 Rune* Regexp::LeadingString(Regexp* re, int *nrune, 689 Regexp::ParseFlags *flags) { 690 while (re->op() == kRegexpConcat && re->nsub() > 0) 691 re = re->sub()[0]; 692 693 *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase); 694 695 if (re->op() == kRegexpLiteral) { 696 *nrune = 1; 697 return &re->rune_; 698 } 699 700 if (re->op() == kRegexpLiteralString) { 701 *nrune = re->nrunes_; 702 return re->runes_; 703 } 704 705 *nrune = 0; 706 return NULL; 707 } 708 709 // Removes the first n leading runes from the beginning of re. 710 // Edits re in place. 711 void Regexp::RemoveLeadingString(Regexp* re, int n) { 712 // Chase down concats to find first string. 713 // For regexps generated by parser, nested concats are 714 // flattened except when doing so would overflow the 16-bit 715 // limit on the size of a concatenation, so we should never 716 // see more than two here. 717 Regexp* stk[4]; 718 int d = 0; 719 while (re->op() == kRegexpConcat) { 720 if (d < arraysize(stk)) 721 stk[d++] = re; 722 re = re->sub()[0]; 723 } 724 725 // Remove leading string from re. 726 if (re->op() == kRegexpLiteral) { 727 re->rune_ = 0; 728 re->op_ = kRegexpEmptyMatch; 729 } else if (re->op() == kRegexpLiteralString) { 730 if (n >= re->nrunes_) { 731 delete[] re->runes_; 732 re->runes_ = NULL; 733 re->nrunes_ = 0; 734 re->op_ = kRegexpEmptyMatch; 735 } else if (n == re->nrunes_ - 1) { 736 Rune rune = re->runes_[re->nrunes_ - 1]; 737 delete[] re->runes_; 738 re->runes_ = NULL; 739 re->nrunes_ = 0; 740 re->rune_ = rune; 741 re->op_ = kRegexpLiteral; 742 } else { 743 re->nrunes_ -= n; 744 memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]); 745 } 746 } 747 748 // If re is now empty, concatenations might simplify too. 749 while (d-- > 0) { 750 re = stk[d]; 751 Regexp** sub = re->sub(); 752 if (sub[0]->op() == kRegexpEmptyMatch) { 753 sub[0]->Decref(); 754 sub[0] = NULL; 755 // Delete first element of concat. 756 switch (re->nsub()) { 757 case 0: 758 case 1: 759 // Impossible. 760 LOG(DFATAL) << "Concat of " << re->nsub(); 761 re->submany_ = NULL; 762 re->op_ = kRegexpEmptyMatch; 763 break; 764 765 case 2: { 766 // Replace re with sub[1]. 767 Regexp* old = sub[1]; 768 sub[1] = NULL; 769 re->Swap(old); 770 old->Decref(); 771 break; 772 } 773 774 default: 775 // Slide down. 776 re->nsub_--; 777 memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]); 778 break; 779 } 780 } 781 } 782 } 783 784 // Factors common prefixes from alternation. 785 // For example, 786 // ABC|ABD|AEF|BCX|BCY 787 // simplifies to 788 // A(B(C|D)|EF)|BC(X|Y) 789 // which the normal parse state routines will further simplify to 790 // A(B[CD]|EF)|BC[XY] 791 // 792 // Rewrites sub to contain simplified list to alternate and returns 793 // the new length of sub. Adjusts reference counts accordingly 794 // (incoming sub[i] decremented, outgoing sub[i] incremented). 795 796 // It's too much of a pain to write this code with an explicit stack, 797 // so instead we let the caller specify a maximum depth and 798 // don't simplify beyond that. There are around 15 words of local 799 // variables and parameters in the frame, so allowing 8 levels 800 // on a 64-bit machine is still less than a kilobyte of stack and 801 // probably enough benefit for practical uses. 802 const int kFactorAlternationMaxDepth = 8; 803 804 int Regexp::FactorAlternation( 805 Regexp** sub, int n, 806 Regexp::ParseFlags altflags) { 807 return FactorAlternationRecursive(sub, n, altflags, 808 kFactorAlternationMaxDepth); 809 } 810 811 int Regexp::FactorAlternationRecursive( 812 Regexp** sub, int n, 813 Regexp::ParseFlags altflags, 814 int maxdepth) { 815 816 if (maxdepth <= 0) 817 return n; 818 819 // Round 1: Factor out common literal prefixes. 820 Rune *rune = NULL; 821 int nrune = 0; 822 Regexp::ParseFlags runeflags = Regexp::NoParseFlags; 823 int start = 0; 824 int out = 0; 825 for (int i = 0; i <= n; i++) { 826 // Invariant: what was in sub[0:start] has been Decref'ed 827 // and that space has been reused for sub[0:out] (out <= start). 828 // 829 // Invariant: sub[start:i] consists of regexps that all begin 830 // with the string rune[0:nrune]. 831 832 Rune* rune_i = NULL; 833 int nrune_i = 0; 834 Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags; 835 if (i < n) { 836 rune_i = LeadingString(sub[i], &nrune_i, &runeflags_i); 837 if (runeflags_i == runeflags) { 838 int same = 0; 839 while (same < nrune && same < nrune_i && rune[same] == rune_i[same]) 840 same++; 841 if (same > 0) { 842 // Matches at least one rune in current range. Keep going around. 843 nrune = same; 844 continue; 845 } 846 } 847 } 848 849 // Found end of a run with common leading literal string: 850 // sub[start:i] all begin with rune[0:nrune] but sub[i] 851 // does not even begin with rune[0]. 852 // 853 // Factor out common string and append factored expression to sub[0:out]. 854 if (i == start) { 855 // Nothing to do - first iteration. 856 } else if (i == start+1) { 857 // Just one: don't bother factoring. 858 sub[out++] = sub[start]; 859 } else { 860 // Construct factored form: prefix(suffix1|suffix2|...) 861 Regexp* x[2]; // x[0] = prefix, x[1] = suffix1|suffix2|... 862 x[0] = LiteralString(rune, nrune, runeflags); 863 for (int j = start; j < i; j++) 864 RemoveLeadingString(sub[j], nrune); 865 int nn = FactorAlternationRecursive(sub + start, i - start, altflags, 866 maxdepth - 1); 867 x[1] = AlternateNoFactor(sub + start, nn, altflags); 868 sub[out++] = Concat(x, 2, altflags); 869 } 870 871 // Prepare for next round (if there is one). 872 if (i < n) { 873 start = i; 874 rune = rune_i; 875 nrune = nrune_i; 876 runeflags = runeflags_i; 877 } 878 } 879 n = out; 880 881 // Round 2: Factor out common complex prefixes, 882 // just the first piece of each concatenation, 883 // whatever it is. This is good enough a lot of the time. 884 start = 0; 885 out = 0; 886 Regexp* first = NULL; 887 for (int i = 0; i <= n; i++) { 888 // Invariant: what was in sub[0:start] has been Decref'ed 889 // and that space has been reused for sub[0:out] (out <= start). 890 // 891 // Invariant: sub[start:i] consists of regexps that all begin with first. 892 893 Regexp* first_i = NULL; 894 if (i < n) { 895 first_i = LeadingRegexp(sub[i]); 896 if (first != NULL && Regexp::Equal(first, first_i)) { 897 continue; 898 } 899 } 900 901 // Found end of a run with common leading regexp: 902 // sub[start:i] all begin with first but sub[i] does not. 903 // 904 // Factor out common regexp and append factored expression to sub[0:out]. 905 if (i == start) { 906 // Nothing to do - first iteration. 907 } else if (i == start+1) { 908 // Just one: don't bother factoring. 909 sub[out++] = sub[start]; 910 } else { 911 // Construct factored form: prefix(suffix1|suffix2|...) 912 Regexp* x[2]; // x[0] = prefix, x[1] = suffix1|suffix2|... 913 x[0] = first->Incref(); 914 for (int j = start; j < i; j++) 915 sub[j] = RemoveLeadingRegexp(sub[j]); 916 int nn = FactorAlternationRecursive(sub + start, i - start, altflags, 917 maxdepth - 1); 918 x[1] = AlternateNoFactor(sub + start, nn, altflags); 919 sub[out++] = Concat(x, 2, altflags); 920 } 921 922 // Prepare for next round (if there is one). 923 if (i < n) { 924 start = i; 925 first = first_i; 926 } 927 } 928 n = out; 929 930 // Round 3: Collapse runs of single literals into character classes. 931 start = 0; 932 out = 0; 933 for (int i = 0; i <= n; i++) { 934 // Invariant: what was in sub[0:start] has been Decref'ed 935 // and that space has been reused for sub[0:out] (out <= start). 936 // 937 // Invariant: sub[start:i] consists of regexps that are either 938 // literal runes or character classes. 939 940 if (i < n && 941 (sub[i]->op() == kRegexpLiteral || 942 sub[i]->op() == kRegexpCharClass)) 943 continue; 944 945 // sub[i] is not a char or char class; 946 // emit char class for sub[start:i]... 947 if (i == start) { 948 // Nothing to do. 949 } else if (i == start+1) { 950 sub[out++] = sub[start]; 951 } else { 952 // Make new char class. 953 CharClassBuilder ccb; 954 for (int j = start; j < i; j++) { 955 Regexp* re = sub[j]; 956 if (re->op() == kRegexpCharClass) { 957 CharClass* cc = re->cc(); 958 for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it) 959 ccb.AddRange(it->lo, it->hi); 960 } else if (re->op() == kRegexpLiteral) { 961 ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags()); 962 } else { 963 LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " " 964 << re->ToString(); 965 } 966 re->Decref(); 967 } 968 sub[out++] = NewCharClass(ccb.GetCharClass(), altflags); 969 } 970 971 // ... and then emit sub[i]. 972 if (i < n) 973 sub[out++] = sub[i]; 974 start = i+1; 975 } 976 n = out; 977 978 // Round 4: Collapse runs of empty matches into single empty match. 979 start = 0; 980 out = 0; 981 for (int i = 0; i < n; i++) { 982 if (i + 1 < n && 983 sub[i]->op() == kRegexpEmptyMatch && 984 sub[i+1]->op() == kRegexpEmptyMatch) { 985 sub[i]->Decref(); 986 continue; 987 } 988 sub[out++] = sub[i]; 989 } 990 n = out; 991 992 return n; 993 } 994 995 // Collapse the regexps on top of the stack, down to the 996 // first marker, into a new op node (op == kRegexpAlternate 997 // or op == kRegexpConcat). 998 void Regexp::ParseState::DoCollapse(RegexpOp op) { 999 // Scan backward to marker, counting children of composite. 1000 int n = 0; 1001 Regexp* next = NULL; 1002 Regexp* sub; 1003 for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { 1004 next = sub->down_; 1005 if (sub->op_ == op) 1006 n += sub->nsub_; 1007 else 1008 n++; 1009 } 1010 1011 // If there's just one child, leave it alone. 1012 // (Concat of one thing is that one thing; alternate of one thing is same.) 1013 if (stacktop_ != NULL && stacktop_->down_ == next) 1014 return; 1015 1016 // Construct op (alternation or concatenation), flattening op of op. 1017 Regexp** subs = new Regexp*[n]; 1018 next = NULL; 1019 int i = n; 1020 for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { 1021 next = sub->down_; 1022 if (sub->op_ == op) { 1023 Regexp** sub_subs = sub->sub(); 1024 for (int k = sub->nsub_ - 1; k >= 0; k--) 1025 subs[--i] = sub_subs[k]->Incref(); 1026 sub->Decref(); 1027 } else { 1028 subs[--i] = FinishRegexp(sub); 1029 } 1030 } 1031 1032 Regexp* re = ConcatOrAlternate(op, subs, n, flags_, true); 1033 delete[] subs; 1034 re->simple_ = re->ComputeSimple(); 1035 re->down_ = next; 1036 stacktop_ = re; 1037 } 1038 1039 // Finishes the current concatenation, 1040 // collapsing it into a single regexp on the stack. 1041 void Regexp::ParseState::DoConcatenation() { 1042 Regexp* r1 = stacktop_; 1043 if (r1 == NULL || IsMarker(r1->op())) { 1044 // empty concatenation is special case 1045 Regexp* re = new Regexp(kRegexpEmptyMatch, flags_); 1046 PushRegexp(re); 1047 } 1048 DoCollapse(kRegexpConcat); 1049 } 1050 1051 // Finishes the current alternation, 1052 // collapsing it to a single regexp on the stack. 1053 void Regexp::ParseState::DoAlternation() { 1054 DoVerticalBar(); 1055 // Now stack top is kVerticalBar. 1056 Regexp* r1 = stacktop_; 1057 stacktop_ = r1->down_; 1058 r1->Decref(); 1059 DoCollapse(kRegexpAlternate); 1060 } 1061 1062 // Incremental conversion of concatenated literals into strings. 1063 // If top two elements on stack are both literal or string, 1064 // collapse into single string. 1065 // Don't walk down the stack -- the parser calls this frequently 1066 // enough that below the bottom two is known to be collapsed. 1067 // Only called when another regexp is about to be pushed 1068 // on the stack, so that the topmost literal is not being considered. 1069 // (Otherwise ab* would turn into (ab)*.) 1070 // If r >= 0, consider pushing a literal r on the stack. 1071 // Return whether that happened. 1072 bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { 1073 Regexp* re1; 1074 Regexp* re2; 1075 if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL) 1076 return false; 1077 1078 if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString) 1079 return false; 1080 if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString) 1081 return false; 1082 if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase)) 1083 return false; 1084 1085 if (re2->op_ == kRegexpLiteral) { 1086 // convert into string 1087 Rune rune = re2->rune_; 1088 re2->op_ = kRegexpLiteralString; 1089 re2->nrunes_ = 0; 1090 re2->runes_ = NULL; 1091 re2->AddRuneToString(rune); 1092 } 1093 1094 // push re1 into re2. 1095 if (re1->op_ == kRegexpLiteral) { 1096 re2->AddRuneToString(re1->rune_); 1097 } else { 1098 for (int i = 0; i < re1->nrunes_; i++) 1099 re2->AddRuneToString(re1->runes_[i]); 1100 re1->nrunes_ = 0; 1101 delete[] re1->runes_; 1102 re1->runes_ = NULL; 1103 } 1104 1105 // reuse re1 if possible 1106 if (r >= 0) { 1107 re1->op_ = kRegexpLiteral; 1108 re1->rune_ = r; 1109 re1->parse_flags_ = flags; 1110 return true; 1111 } 1112 1113 stacktop_ = re2; 1114 re1->Decref(); 1115 return false; 1116 } 1117 1118 // Lexing routines. 1119 1120 // Parses a decimal integer, storing it in *n. 1121 // Sets *s to span the remainder of the string. 1122 // Sets *out_re to the regexp for the class. 1123 static bool ParseInteger(StringPiece* s, int* np) { 1124 if (s->size() == 0 || !isdigit((*s)[0] & 0xFF)) 1125 return false; 1126 // Disallow leading zeros. 1127 if (s->size() >= 2 && (*s)[0] == '0' && isdigit((*s)[1] & 0xFF)) 1128 return false; 1129 int n = 0; 1130 int c; 1131 while (s->size() > 0 && isdigit(c = (*s)[0] & 0xFF)) { 1132 // Avoid overflow. 1133 if (n >= 100000000) 1134 return false; 1135 n = n*10 + c - '0'; 1136 s->remove_prefix(1); // digit 1137 } 1138 *np = n; 1139 return true; 1140 } 1141 1142 // Parses a repetition suffix like {1,2} or {2} or {2,}. 1143 // Sets *s to span the remainder of the string on success. 1144 // Sets *lo and *hi to the given range. 1145 // In the case of {2,}, the high number is unbounded; 1146 // sets *hi to -1 to signify this. 1147 // {,2} is NOT a valid suffix. 1148 // The Maybe in the name signifies that the regexp parse 1149 // doesn't fail even if ParseRepetition does, so the StringPiece 1150 // s must NOT be edited unless MaybeParseRepetition returns true. 1151 static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) { 1152 StringPiece s = *sp; 1153 if (s.size() == 0 || s[0] != '{') 1154 return false; 1155 s.remove_prefix(1); // '{' 1156 if (!ParseInteger(&s, lo)) 1157 return false; 1158 if (s.size() == 0) 1159 return false; 1160 if (s[0] == ',') { 1161 s.remove_prefix(1); // ',' 1162 if (s.size() == 0) 1163 return false; 1164 if (s[0] == '}') { 1165 // {2,} means at least 2 1166 *hi = -1; 1167 } else { 1168 // {2,4} means 2, 3, or 4. 1169 if (!ParseInteger(&s, hi)) 1170 return false; 1171 } 1172 } else { 1173 // {2} means exactly two 1174 *hi = *lo; 1175 } 1176 if (s.size() == 0 || s[0] != '}') 1177 return false; 1178 s.remove_prefix(1); // '}' 1179 *sp = s; 1180 return true; 1181 } 1182 1183 // Removes the next Rune from the StringPiece and stores it in *r. 1184 // Returns number of bytes removed from sp. 1185 // Behaves as though there is a terminating NUL at the end of sp. 1186 // Argument order is backwards from usual Google style 1187 // but consistent with chartorune. 1188 static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) { 1189 int n; 1190 if (fullrune(sp->data(), sp->size())) { 1191 n = chartorune(r, sp->data()); 1192 if (!(n == 1 && *r == Runeerror)) { // no decoding error 1193 sp->remove_prefix(n); 1194 return n; 1195 } 1196 } 1197 1198 status->set_code(kRegexpBadUTF8); 1199 status->set_error_arg(NULL); 1200 return -1; 1201 } 1202 1203 // Return whether name is valid UTF-8. 1204 // If not, set status to kRegexpBadUTF8. 1205 static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) { 1206 StringPiece t = s; 1207 Rune r; 1208 while (t.size() > 0) { 1209 if (StringPieceToRune(&r, &t, status) < 0) 1210 return false; 1211 } 1212 return true; 1213 } 1214 1215 // Is c a hex digit? 1216 static int IsHex(int c) { 1217 return ('0' <= c && c <= '9') || 1218 ('A' <= c && c <= 'F') || 1219 ('a' <= c && c <= 'f'); 1220 } 1221 1222 // Convert hex digit to value. 1223 static int UnHex(int c) { 1224 if ('0' <= c && c <= '9') 1225 return c - '0'; 1226 if ('A' <= c && c <= 'F') 1227 return c - 'A' + 10; 1228 if ('a' <= c && c <= 'f') 1229 return c - 'a' + 10; 1230 LOG(DFATAL) << "Bad hex digit " << c; 1231 return 0; 1232 } 1233 1234 // Parse an escape sequence (e.g., \n, \{). 1235 // Sets *s to span the remainder of the string. 1236 // Sets *rp to the named character. 1237 static bool ParseEscape(StringPiece* s, Rune* rp, 1238 RegexpStatus* status, int rune_max) { 1239 const char* begin = s->begin(); 1240 if (s->size() < 1 || (*s)[0] != '\\') { 1241 // Should not happen - caller always checks. 1242 status->set_code(kRegexpInternalError); 1243 status->set_error_arg(NULL); 1244 return false; 1245 } 1246 if (s->size() < 2) { 1247 status->set_code(kRegexpTrailingBackslash); 1248 status->set_error_arg(NULL); 1249 return false; 1250 } 1251 Rune c, c1; 1252 s->remove_prefix(1); // backslash 1253 if (StringPieceToRune(&c, s, status) < 0) 1254 return false; 1255 int code; 1256 switch (c) { 1257 default: 1258 if (c < Runeself && !isalpha(c) && !isdigit(c)) { 1259 // Escaped non-word characters are always themselves. 1260 // PCRE is not quite so rigorous: it accepts things like 1261 // \q, but we don't. We once rejected \_, but too many 1262 // programs and people insist on using it, so allow \_. 1263 *rp = c; 1264 return true; 1265 } 1266 goto BadEscape; 1267 1268 // Octal escapes. 1269 case '1': 1270 case '2': 1271 case '3': 1272 case '4': 1273 case '5': 1274 case '6': 1275 case '7': 1276 // Single non-zero octal digit is a backreference; not supported. 1277 if (s->size() == 0 || (*s)[0] < '0' || (*s)[0] > '7') 1278 goto BadEscape; 1279 // fall through 1280 case '0': 1281 // consume up to three octal digits; already have one. 1282 code = c - '0'; 1283 if (s->size() > 0 && '0' <= (c = (*s)[0]) && c <= '7') { 1284 code = code * 8 + c - '0'; 1285 s->remove_prefix(1); // digit 1286 if (s->size() > 0) { 1287 c = (*s)[0]; 1288 if ('0' <= c && c <= '7') { 1289 code = code * 8 + c - '0'; 1290 s->remove_prefix(1); // digit 1291 } 1292 } 1293 } 1294 *rp = code; 1295 return true; 1296 1297 // Hexadecimal escapes 1298 case 'x': 1299 if (s->size() == 0) 1300 goto BadEscape; 1301 if (StringPieceToRune(&c, s, status) < 0) 1302 return false; 1303 if (c == '{') { 1304 // Any number of digits in braces. 1305 // Update n as we consume the string, so that 1306 // the whole thing gets shown in the error message. 1307 // Perl accepts any text at all; it ignores all text 1308 // after the first non-hex digit. We require only hex digits, 1309 // and at least one. 1310 if (StringPieceToRune(&c, s, status) < 0) 1311 return false; 1312 int nhex = 0; 1313 code = 0; 1314 while (IsHex(c)) { 1315 nhex++; 1316 code = code * 16 + UnHex(c); 1317 if (code > rune_max) 1318 goto BadEscape; 1319 if (s->size() == 0) 1320 goto BadEscape; 1321 if (StringPieceToRune(&c, s, status) < 0) 1322 return false; 1323 } 1324 if (c != '}' || nhex == 0) 1325 goto BadEscape; 1326 *rp = code; 1327 return true; 1328 } 1329 // Easy case: two hex digits. 1330 if (s->size() == 0) 1331 goto BadEscape; 1332 if (StringPieceToRune(&c1, s, status) < 0) 1333 return false; 1334 if (!IsHex(c) || !IsHex(c1)) 1335 goto BadEscape; 1336 *rp = UnHex(c) * 16 + UnHex(c1); 1337 return true; 1338 1339 // C escapes. 1340 case 'n': 1341 *rp = '\n'; 1342 return true; 1343 case 'r': 1344 *rp = '\r'; 1345 return true; 1346 case 't': 1347 *rp = '\t'; 1348 return true; 1349 1350 // Less common C escapes. 1351 case 'a': 1352 *rp = '\a'; 1353 return true; 1354 case 'f': 1355 *rp = '\f'; 1356 return true; 1357 case 'v': 1358 *rp = '\v'; 1359 return true; 1360 1361 // This code is disabled to avoid misparsing 1362 // the Perl word-boundary \b as a backspace 1363 // when in POSIX regexp mode. Surprisingly, 1364 // in Perl, \b means word-boundary but [\b] 1365 // means backspace. We don't support that: 1366 // if you want a backspace embed a literal 1367 // backspace character or use \x08. 1368 // 1369 // case 'b': 1370 // *rp = '\b'; 1371 // return true; 1372 } 1373 1374 LOG(DFATAL) << "Not reached in ParseEscape."; 1375 1376 BadEscape: 1377 // Unrecognized escape sequence. 1378 status->set_code(kRegexpBadEscape); 1379 status->set_error_arg(StringPiece(begin, s->data() - begin)); 1380 return false; 1381 } 1382 1383 // Add a range to the character class, but exclude newline if asked. 1384 // Also handle case folding. 1385 void CharClassBuilder::AddRangeFlags( 1386 Rune lo, Rune hi, Regexp::ParseFlags parse_flags) { 1387 1388 // Take out \n if the flags say so. 1389 bool cutnl = !(parse_flags & Regexp::ClassNL) || 1390 (parse_flags & Regexp::NeverNL); 1391 if (cutnl && lo <= '\n' && '\n' <= hi) { 1392 if (lo < '\n') 1393 AddRangeFlags(lo, '\n' - 1, parse_flags); 1394 if (hi > '\n') 1395 AddRangeFlags('\n' + 1, hi, parse_flags); 1396 return; 1397 } 1398 1399 // If folding case, add fold-equivalent characters too. 1400 if (parse_flags & Regexp::FoldCase) 1401 AddFoldedRange(this, lo, hi, 0); 1402 else 1403 AddRange(lo, hi); 1404 } 1405 1406 // Look for a group with the given name. 1407 static UGroup* LookupGroup(const StringPiece& name, 1408 UGroup *groups, int ngroups) { 1409 // Simple name lookup. 1410 for (int i = 0; i < ngroups; i++) 1411 if (StringPiece(groups[i].name) == name) 1412 return &groups[i]; 1413 return NULL; 1414 } 1415 1416 // Fake UGroup containing all Runes 1417 static URange16 any16[] = { { 0, 65535 } }; 1418 static URange32 any32[] = { { 65536, Runemax } }; 1419 static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 }; 1420 1421 // Look for a POSIX group with the given name (e.g., "[:^alpha:]") 1422 static UGroup* LookupPosixGroup(const StringPiece& name) { 1423 return LookupGroup(name, posix_groups, num_posix_groups); 1424 } 1425 1426 static UGroup* LookupPerlGroup(const StringPiece& name) { 1427 return LookupGroup(name, perl_groups, num_perl_groups); 1428 } 1429 1430 // Look for a Unicode group with the given name (e.g., "Han") 1431 static UGroup* LookupUnicodeGroup(const StringPiece& name) { 1432 // Special case: "Any" means any. 1433 if (name == StringPiece("Any")) 1434 return &anygroup; 1435 return LookupGroup(name, unicode_groups, num_unicode_groups); 1436 } 1437 1438 // Add a UGroup or its negation to the character class. 1439 static void AddUGroup(CharClassBuilder *cc, UGroup *g, int sign, 1440 Regexp::ParseFlags parse_flags) { 1441 if (sign == +1) { 1442 for (int i = 0; i < g->nr16; i++) { 1443 cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags); 1444 } 1445 for (int i = 0; i < g->nr32; i++) { 1446 cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags); 1447 } 1448 } else { 1449 if (parse_flags & Regexp::FoldCase) { 1450 // Normally adding a case-folded group means 1451 // adding all the extra fold-equivalent runes too. 1452 // But if we're adding the negation of the group, 1453 // we have to exclude all the runes that are fold-equivalent 1454 // to what's already missing. Too hard, so do in two steps. 1455 CharClassBuilder ccb1; 1456 AddUGroup(&ccb1, g, +1, parse_flags); 1457 ccb1.Negate(); 1458 cc->AddCharClass(&ccb1); 1459 return; 1460 } 1461 int next = 0; 1462 for (int i = 0; i < g->nr16; i++) { 1463 if (next < g->r16[i].lo) 1464 cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags); 1465 next = g->r16[i].hi + 1; 1466 } 1467 for (int i = 0; i < g->nr32; i++) { 1468 if (next < g->r32[i].lo) 1469 cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags); 1470 next = g->r32[i].hi + 1; 1471 } 1472 if (next <= Runemax) 1473 cc->AddRangeFlags(next, Runemax, parse_flags); 1474 } 1475 } 1476 1477 // Maybe parse a Perl character class escape sequence. 1478 // Only recognizes the Perl character classes (\d \s \w \D \S \W), 1479 // not the Perl empty-string classes (\b \B \A \Z \z). 1480 // On success, sets *s to span the remainder of the string 1481 // and returns the corresponding UGroup. 1482 // The StringPiece must *NOT* be edited unless the call succeeds. 1483 UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) { 1484 if (!(parse_flags & Regexp::PerlClasses)) 1485 return NULL; 1486 if (s->size() < 2 || (*s)[0] != '\\') 1487 return NULL; 1488 // Could use StringPieceToRune, but there aren't 1489 // any non-ASCII Perl group names. 1490 StringPiece name(s->begin(), 2); 1491 UGroup *g = LookupPerlGroup(name); 1492 if (g == NULL) 1493 return NULL; 1494 s->remove_prefix(name.size()); 1495 return g; 1496 } 1497 1498 enum ParseStatus { 1499 kParseOk, // Did some parsing. 1500 kParseError, // Found an error. 1501 kParseNothing, // Decided not to parse. 1502 }; 1503 1504 // Maybe parses a Unicode character group like \p{Han} or \P{Han} 1505 // (the latter is a negated group). 1506 ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, 1507 CharClassBuilder *cc, 1508 RegexpStatus* status) { 1509 // Decide whether to parse. 1510 if (!(parse_flags & Regexp::UnicodeGroups)) 1511 return kParseNothing; 1512 if (s->size() < 2 || (*s)[0] != '\\') 1513 return kParseNothing; 1514 Rune c = (*s)[1]; 1515 if (c != 'p' && c != 'P') 1516 return kParseNothing; 1517 1518 // Committed to parse. Results: 1519 int sign = +1; // -1 = negated char class 1520 if (c == 'P') 1521 sign = -1; 1522 StringPiece seq = *s; // \p{Han} or \pL 1523 StringPiece name; // Han or L 1524 s->remove_prefix(2); // '\\', 'p' 1525 1526 if (!StringPieceToRune(&c, s, status)) 1527 return kParseError; 1528 if (c != '{') { 1529 // Name is the bit of string we just skipped over for c. 1530 const char* p = seq.begin() + 2; 1531 name = StringPiece(p, s->begin() - p); 1532 } else { 1533 // Name is in braces. Look for closing } 1534 int end = s->find('}', 0); 1535 if (end == s->npos) { 1536 if (!IsValidUTF8(seq, status)) 1537 return kParseError; 1538 status->set_code(kRegexpBadCharRange); 1539 status->set_error_arg(seq); 1540 return kParseError; 1541 } 1542 name = StringPiece(s->begin(), end); // without '}' 1543 s->remove_prefix(end + 1); // with '}' 1544 if (!IsValidUTF8(name, status)) 1545 return kParseError; 1546 } 1547 1548 // Chop seq where s now begins. 1549 seq = StringPiece(seq.begin(), s->begin() - seq.begin()); 1550 1551 // Look up group 1552 if (name.size() > 0 && name[0] == '^') { 1553 sign = -sign; 1554 name.remove_prefix(1); // '^' 1555 } 1556 UGroup *g = LookupUnicodeGroup(name); 1557 if (g == NULL) { 1558 status->set_code(kRegexpBadCharRange); 1559 status->set_error_arg(seq); 1560 return kParseError; 1561 } 1562 1563 AddUGroup(cc, g, sign, parse_flags); 1564 return kParseOk; 1565 } 1566 1567 // Parses a character class name like [:alnum:]. 1568 // Sets *s to span the remainder of the string. 1569 // Adds the ranges corresponding to the class to ranges. 1570 static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags, 1571 CharClassBuilder *cc, 1572 RegexpStatus* status) { 1573 // Check begins with [: 1574 const char* p = s->data(); 1575 const char* ep = s->data() + s->size(); 1576 if (ep - p < 2 || p[0] != '[' || p[1] != ':') 1577 return kParseNothing; 1578 1579 // Look for closing :]. 1580 const char* q; 1581 for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++) 1582 ; 1583 1584 // If no closing :], then ignore. 1585 if (q > ep-2) 1586 return kParseNothing; 1587 1588 // Got it. Check that it's valid. 1589 q += 2; 1590 StringPiece name(p, q-p); 1591 1592 UGroup *g = LookupPosixGroup(name); 1593 if (g == NULL) { 1594 status->set_code(kRegexpBadCharRange); 1595 status->set_error_arg(name); 1596 return kParseError; 1597 } 1598 1599 s->remove_prefix(name.size()); 1600 AddUGroup(cc, g, g->sign, parse_flags); 1601 return kParseOk; 1602 } 1603 1604 // Parses a character inside a character class. 1605 // There are fewer special characters here than in the rest of the regexp. 1606 // Sets *s to span the remainder of the string. 1607 // Sets *rp to the character. 1608 bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp, 1609 const StringPiece& whole_class, 1610 RegexpStatus* status) { 1611 if (s->size() == 0) { 1612 status->set_code(kRegexpMissingBracket); 1613 status->set_error_arg(whole_class); 1614 return false; 1615 } 1616 1617 // Allow regular escape sequences even though 1618 // many need not be escaped in this context. 1619 if (s->size() >= 1 && (*s)[0] == '\\') 1620 return ParseEscape(s, rp, status, rune_max_); 1621 1622 // Otherwise take the next rune. 1623 return StringPieceToRune(rp, s, status) >= 0; 1624 } 1625 1626 // Parses a character class character, or, if the character 1627 // is followed by a hyphen, parses a character class range. 1628 // For single characters, rr->lo == rr->hi. 1629 // Sets *s to span the remainder of the string. 1630 // Sets *rp to the character. 1631 bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr, 1632 const StringPiece& whole_class, 1633 RegexpStatus* status) { 1634 StringPiece os = *s; 1635 if (!ParseCCCharacter(s, &rr->lo, whole_class, status)) 1636 return false; 1637 // [a-] means (a|-), so check for final ]. 1638 if (s->size() >= 2 && (*s)[0] == '-' && (*s)[1] != ']') { 1639 s->remove_prefix(1); // '-' 1640 if (!ParseCCCharacter(s, &rr->hi, whole_class, status)) 1641 return false; 1642 if (rr->hi < rr->lo) { 1643 status->set_code(kRegexpBadCharRange); 1644 status->set_error_arg(StringPiece(os.data(), s->data() - os.data())); 1645 return false; 1646 } 1647 } else { 1648 rr->hi = rr->lo; 1649 } 1650 return true; 1651 } 1652 1653 // Parses a possibly-negated character class expression like [^abx-z[:digit:]]. 1654 // Sets *s to span the remainder of the string. 1655 // Sets *out_re to the regexp for the class. 1656 bool Regexp::ParseState::ParseCharClass(StringPiece* s, 1657 Regexp** out_re, 1658 RegexpStatus* status) { 1659 StringPiece whole_class = *s; 1660 if (s->size() == 0 || (*s)[0] != '[') { 1661 // Caller checked this. 1662 status->set_code(kRegexpInternalError); 1663 status->set_error_arg(NULL); 1664 return false; 1665 } 1666 bool negated = false; 1667 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); 1668 re->ccb_ = new CharClassBuilder; 1669 s->remove_prefix(1); // '[' 1670 if (s->size() > 0 && (*s)[0] == '^') { 1671 s->remove_prefix(1); // '^' 1672 negated = true; 1673 if (!(flags_ & ClassNL) || (flags_ & NeverNL)) { 1674 // If NL can't match implicitly, then pretend 1675 // negated classes include a leading \n. 1676 re->ccb_->AddRange('\n', '\n'); 1677 } 1678 } 1679 bool first = true; // ] is okay as first char in class 1680 while (s->size() > 0 && ((*s)[0] != ']' || first)) { 1681 // - is only okay unescaped as first or last in class. 1682 // Except that Perl allows - anywhere. 1683 if ((*s)[0] == '-' && !first && !(flags_&PerlX) && 1684 (s->size() == 1 || (*s)[1] != ']')) { 1685 StringPiece t = *s; 1686 t.remove_prefix(1); // '-' 1687 Rune r; 1688 int n = StringPieceToRune(&r, &t, status); 1689 if (n < 0) { 1690 re->Decref(); 1691 return false; 1692 } 1693 status->set_code(kRegexpBadCharRange); 1694 status->set_error_arg(StringPiece(s->data(), 1+n)); 1695 re->Decref(); 1696 return false; 1697 } 1698 first = false; 1699 1700 // Look for [:alnum:] etc. 1701 if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') { 1702 switch (ParseCCName(s, flags_, re->ccb_, status)) { 1703 case kParseOk: 1704 continue; 1705 case kParseError: 1706 re->Decref(); 1707 return false; 1708 case kParseNothing: 1709 break; 1710 } 1711 } 1712 1713 // Look for Unicode character group like \p{Han} 1714 if (s->size() > 2 && 1715 (*s)[0] == '\\' && 1716 ((*s)[1] == 'p' || (*s)[1] == 'P')) { 1717 switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) { 1718 case kParseOk: 1719 continue; 1720 case kParseError: 1721 re->Decref(); 1722 return false; 1723 case kParseNothing: 1724 break; 1725 } 1726 } 1727 1728 // Look for Perl character class symbols (extension). 1729 UGroup *g = MaybeParsePerlCCEscape(s, flags_); 1730 if (g != NULL) { 1731 AddUGroup(re->ccb_, g, g->sign, flags_); 1732 continue; 1733 } 1734 1735 // Otherwise assume single character or simple range. 1736 RuneRange rr; 1737 if (!ParseCCRange(s, &rr, whole_class, status)) { 1738 re->Decref(); 1739 return false; 1740 } 1741 // AddRangeFlags is usually called in response to a class like 1742 // \p{Foo} or [[:foo:]]; for those, it filters \n out unless 1743 // Regexp::ClassNL is set. In an explicit range or singleton 1744 // like we just parsed, we do not filter \n out, so set ClassNL 1745 // in the flags. 1746 re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL); 1747 } 1748 if (s->size() == 0) { 1749 status->set_code(kRegexpMissingBracket); 1750 status->set_error_arg(whole_class); 1751 re->Decref(); 1752 return false; 1753 } 1754 s->remove_prefix(1); // ']' 1755 1756 if (negated) 1757 re->ccb_->Negate(); 1758 re->ccb_->RemoveAbove(rune_max_); 1759 1760 *out_re = re; 1761 return true; 1762 } 1763 1764 // Is this a valid capture name? [A-Za-z0-9_]+ 1765 // PCRE limits names to 32 bytes. 1766 // Python rejects names starting with digits. 1767 // We don't enforce either of those. 1768 static bool IsValidCaptureName(const StringPiece& name) { 1769 if (name.size() == 0) 1770 return false; 1771 for (int i = 0; i < name.size(); i++) { 1772 int c = name[i]; 1773 if (('0' <= c && c <= '9') || 1774 ('a' <= c && c <= 'z') || 1775 ('A' <= c && c <= 'Z') || 1776 c == '_') 1777 continue; 1778 return false; 1779 } 1780 return true; 1781 } 1782 1783 // Parses a Perl flag setting or non-capturing group or both, 1784 // like (?i) or (?: or (?i:. Removes from s, updates parse state. 1785 // The caller must check that s begins with "(?". 1786 // Returns true on success. If the Perl flag is not 1787 // well-formed or not supported, sets status_ and returns false. 1788 bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { 1789 StringPiece t = *s; 1790 1791 // Caller is supposed to check this. 1792 if (!(flags_ & PerlX) || t.size() < 2 || t[0] != '(' || t[1] != '?') { 1793 LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags"; 1794 status_->set_code(kRegexpInternalError); 1795 return false; 1796 } 1797 1798 t.remove_prefix(2); // "(?" 1799 1800 // Check for named captures, first introduced in Python's regexp library. 1801 // As usual, there are three slightly different syntaxes: 1802 // 1803 // (?P<name>expr) the original, introduced by Python 1804 // (?<name>expr) the .NET alteration, adopted by Perl 5.10 1805 // (?'name'expr) another .NET alteration, adopted by Perl 5.10 1806 // 1807 // Perl 5.10 gave in and implemented the Python version too, 1808 // but they claim that the last two are the preferred forms. 1809 // PCRE and languages based on it (specifically, PHP and Ruby) 1810 // support all three as well. EcmaScript 4 uses only the Python form. 1811 // 1812 // In both the open source world (via Code Search) and the 1813 // Google source tree, (?P<expr>name) is the dominant form, 1814 // so that's the one we implement. One is enough. 1815 if (t.size() > 2 && t[0] == 'P' && t[1] == '<') { 1816 // Pull out name. 1817 int end = t.find('>', 2); 1818 if (end == t.npos) { 1819 if (!IsValidUTF8(*s, status_)) 1820 return false; 1821 status_->set_code(kRegexpBadNamedCapture); 1822 status_->set_error_arg(*s); 1823 return false; 1824 } 1825 1826 // t is "P<name>...", t[end] == '>' 1827 StringPiece capture(t.begin()-2, end+3); // "(?P<name>" 1828 StringPiece name(t.begin()+2, end-2); // "name" 1829 if (!IsValidUTF8(name, status_)) 1830 return false; 1831 if (!IsValidCaptureName(name)) { 1832 status_->set_code(kRegexpBadNamedCapture); 1833 status_->set_error_arg(capture); 1834 return false; 1835 } 1836 1837 if (!DoLeftParen(name)) { 1838 // DoLeftParen's failure set status_. 1839 return false; 1840 } 1841 1842 s->remove_prefix(capture.end() - s->begin()); 1843 return true; 1844 } 1845 1846 bool negated = false; 1847 bool sawflags = false; 1848 int nflags = flags_; 1849 Rune c; 1850 for (bool done = false; !done; ) { 1851 if (t.size() == 0) 1852 goto BadPerlOp; 1853 if (StringPieceToRune(&c, &t, status_) < 0) 1854 return false; 1855 switch (c) { 1856 default: 1857 goto BadPerlOp; 1858 1859 // Parse flags. 1860 case 'i': 1861 sawflags = true; 1862 if (negated) 1863 nflags &= ~FoldCase; 1864 else 1865 nflags |= FoldCase; 1866 break; 1867 1868 case 'm': // opposite of our OneLine 1869 sawflags = true; 1870 if (negated) 1871 nflags |= OneLine; 1872 else 1873 nflags &= ~OneLine; 1874 break; 1875 1876 case 's': 1877 sawflags = true; 1878 if (negated) 1879 nflags &= ~DotNL; 1880 else 1881 nflags |= DotNL; 1882 break; 1883 1884 case 'U': 1885 sawflags = true; 1886 if (negated) 1887 nflags &= ~NonGreedy; 1888 else 1889 nflags |= NonGreedy; 1890 break; 1891 1892 // Negation 1893 case '-': 1894 if (negated) 1895 goto BadPerlOp; 1896 negated = true; 1897 sawflags = false; 1898 break; 1899 1900 // Open new group. 1901 case ':': 1902 if (!DoLeftParenNoCapture()) { 1903 // DoLeftParenNoCapture's failure set status_. 1904 return false; 1905 } 1906 done = true; 1907 break; 1908 1909 // Finish flags. 1910 case ')': 1911 done = true; 1912 break; 1913 } 1914 } 1915 1916 if (negated && !sawflags) 1917 goto BadPerlOp; 1918 1919 flags_ = static_cast<Regexp::ParseFlags>(nflags); 1920 *s = t; 1921 return true; 1922 1923 BadPerlOp: 1924 status_->set_code(kRegexpBadPerlOp); 1925 status_->set_error_arg(StringPiece(s->begin(), t.begin() - s->begin())); 1926 return false; 1927 } 1928 1929 // Converts latin1 (assumed to be encoded as Latin1 bytes) 1930 // into UTF8 encoding in string. 1931 // Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is 1932 // deprecated and because it rejects code points 0x80-0x9F. 1933 void ConvertLatin1ToUTF8(const StringPiece& latin1, string* utf) { 1934 char buf[UTFmax]; 1935 1936 utf->clear(); 1937 for (int i = 0; i < latin1.size(); i++) { 1938 Rune r = latin1[i] & 0xFF; 1939 int n = runetochar(buf, &r); 1940 utf->append(buf, n); 1941 } 1942 } 1943 1944 // Parses the regular expression given by s, 1945 // returning the corresponding Regexp tree. 1946 // The caller must Decref the return value when done with it. 1947 // Returns NULL on error. 1948 Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, 1949 RegexpStatus* status) { 1950 // Make status non-NULL (easier on everyone else). 1951 RegexpStatus xstatus; 1952 if (status == NULL) 1953 status = &xstatus; 1954 1955 ParseState ps(global_flags, s, status); 1956 StringPiece t = s; 1957 1958 // Convert regexp to UTF-8 (easier on the rest of the parser). 1959 if (global_flags & Latin1) { 1960 string* tmp = new string; 1961 ConvertLatin1ToUTF8(t, tmp); 1962 status->set_tmp(tmp); 1963 t = *tmp; 1964 } 1965 1966 if (global_flags & Literal) { 1967 // Special parse loop for literal string. 1968 while (t.size() > 0) { 1969 Rune r; 1970 if (StringPieceToRune(&r, &t, status) < 0) 1971 return NULL; 1972 if (!ps.PushLiteral(r)) 1973 return NULL; 1974 } 1975 return ps.DoFinish(); 1976 } 1977 1978 StringPiece lastunary = NULL; 1979 while (t.size() > 0) { 1980 StringPiece isunary = NULL; 1981 switch (t[0]) { 1982 default: { 1983 Rune r; 1984 if (StringPieceToRune(&r, &t, status) < 0) 1985 return NULL; 1986 if (!ps.PushLiteral(r)) 1987 return NULL; 1988 break; 1989 } 1990 1991 case '(': 1992 // "(?" introduces Perl escape. 1993 if ((ps.flags() & PerlX) && (t.size() >= 2 && t[1] == '?')) { 1994 // Flag changes and non-capturing groups. 1995 if (!ps.ParsePerlFlags(&t)) 1996 return NULL; 1997 break; 1998 } 1999 if (!ps.DoLeftParen(NULL)) 2000 return NULL; 2001 t.remove_prefix(1); // '(' 2002 break; 2003 2004 case '|': 2005 if (!ps.DoVerticalBar()) 2006 return NULL; 2007 t.remove_prefix(1); // '|' 2008 break; 2009 2010 case ')': 2011 if (!ps.DoRightParen()) 2012 return NULL; 2013 t.remove_prefix(1); // ')' 2014 break; 2015 2016 case '^': // Beginning of line. 2017 if (!ps.PushCarat()) 2018 return NULL; 2019 t.remove_prefix(1); // '^' 2020 break; 2021 2022 case '$': // End of line. 2023 if (!ps.PushDollar()) 2024 return NULL; 2025 t.remove_prefix(1); // '$' 2026 break; 2027 2028 case '.': // Any character (possibly except newline). 2029 if (!ps.PushDot()) 2030 return NULL; 2031 t.remove_prefix(1); // '.' 2032 break; 2033 2034 case '[': { // Character class. 2035 Regexp* re; 2036 if (!ps.ParseCharClass(&t, &re, status)) 2037 return NULL; 2038 if (!ps.PushRegexp(re)) 2039 return NULL; 2040 break; 2041 } 2042 2043 case '*': { // Zero or more. 2044 RegexpOp op; 2045 op = kRegexpStar; 2046 goto Rep; 2047 case '+': // One or more. 2048 op = kRegexpPlus; 2049 goto Rep; 2050 case '?': // Zero or one. 2051 op = kRegexpQuest; 2052 goto Rep; 2053 Rep: 2054 StringPiece opstr = t; 2055 bool nongreedy = false; 2056 t.remove_prefix(1); // '*' or '+' or '?' 2057 if (ps.flags() & PerlX) { 2058 if (t.size() > 0 && t[0] == '?') { 2059 nongreedy = true; 2060 t.remove_prefix(1); // '?' 2061 } 2062 if (lastunary.size() > 0) { 2063 // In Perl it is not allowed to stack repetition operators: 2064 // a** is a syntax error, not a double-star. 2065 // (and a++ means something else entirely, which we don't support!) 2066 status->set_code(kRegexpRepeatOp); 2067 status->set_error_arg(StringPiece(lastunary.begin(), 2068 t.begin() - lastunary.begin())); 2069 return NULL; 2070 } 2071 } 2072 opstr.set(opstr.data(), t.data() - opstr.data()); 2073 if (!ps.PushRepeatOp(op, opstr, nongreedy)) 2074 return NULL; 2075 isunary = opstr; 2076 break; 2077 } 2078 2079 case '{': { // Counted repetition. 2080 int lo, hi; 2081 StringPiece opstr = t; 2082 if (!MaybeParseRepetition(&t, &lo, &hi)) { 2083 // Treat like a literal. 2084 if (!ps.PushLiteral('{')) 2085 return NULL; 2086 t.remove_prefix(1); // '{' 2087 break; 2088 } 2089 bool nongreedy = false; 2090 if (ps.flags() & PerlX) { 2091 if (t.size() > 0 && t[0] == '?') { 2092 nongreedy = true; 2093 t.remove_prefix(1); // '?' 2094 } 2095 if (lastunary.size() > 0) { 2096 // Not allowed to stack repetition operators. 2097 status->set_code(kRegexpRepeatOp); 2098 status->set_error_arg(StringPiece(lastunary.begin(), 2099 t.begin() - lastunary.begin())); 2100 return NULL; 2101 } 2102 } 2103 opstr.set(opstr.data(), t.data() - opstr.data()); 2104 if (!ps.PushRepetition(lo, hi, opstr, nongreedy)) 2105 return NULL; 2106 isunary = opstr; 2107 break; 2108 } 2109 2110 case '\\': { // Escaped character or Perl sequence. 2111 // \b and \B: word boundary or not 2112 if ((ps.flags() & Regexp::PerlB) && 2113 t.size() >= 2 && (t[1] == 'b' || t[1] == 'B')) { 2114 if (!ps.PushWordBoundary(t[1] == 'b')) 2115 return NULL; 2116 t.remove_prefix(2); // '\\', 'b' 2117 break; 2118 } 2119 2120 if ((ps.flags() & Regexp::PerlX) && t.size() >= 2) { 2121 if (t[1] == 'A') { 2122 if (!ps.PushSimpleOp(kRegexpBeginText)) 2123 return NULL; 2124 t.remove_prefix(2); // '\\', 'A' 2125 break; 2126 } 2127 if (t[1] == 'z') { 2128 if (!ps.PushSimpleOp(kRegexpEndText)) 2129 return NULL; 2130 t.remove_prefix(2); // '\\', 'z' 2131 break; 2132 } 2133 // Do not recognize \Z, because this library can't 2134 // implement the exact Perl/PCRE semantics. 2135 // (This library treats "(?-m)$" as \z, even though 2136 // in Perl and PCRE it is equivalent to \Z.) 2137 2138 if (t[1] == 'C') { // \C: any byte [sic] 2139 if (!ps.PushSimpleOp(kRegexpAnyByte)) 2140 return NULL; 2141 t.remove_prefix(2); // '\\', 'C' 2142 break; 2143 } 2144 2145 if (t[1] == 'Q') { // \Q ... \E: the ... is always literals 2146 t.remove_prefix(2); // '\\', 'Q' 2147 while (t.size() > 0) { 2148 if (t.size() >= 2 && t[0] == '\\' && t[1] == 'E') { 2149 t.remove_prefix(2); // '\\', 'E' 2150 break; 2151 } 2152 Rune r; 2153 if (StringPieceToRune(&r, &t, status) < 0) 2154 return NULL; 2155 if (!ps.PushLiteral(r)) 2156 return NULL; 2157 } 2158 break; 2159 } 2160 } 2161 2162 if (t.size() >= 2 && (t[1] == 'p' || t[1] == 'P')) { 2163 Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); 2164 re->ccb_ = new CharClassBuilder; 2165 switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) { 2166 case kParseOk: 2167 if (!ps.PushRegexp(re)) 2168 return NULL; 2169 goto Break2; 2170 case kParseError: 2171 re->Decref(); 2172 return NULL; 2173 case kParseNothing: 2174 re->Decref(); 2175 break; 2176 } 2177 } 2178 2179 UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags()); 2180 if (g != NULL) { 2181 Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); 2182 re->ccb_ = new CharClassBuilder; 2183 AddUGroup(re->ccb_, g, g->sign, ps.flags()); 2184 if (!ps.PushRegexp(re)) 2185 return NULL; 2186 break; 2187 } 2188 2189 Rune r; 2190 if (!ParseEscape(&t, &r, status, ps.rune_max())) 2191 return NULL; 2192 if (!ps.PushLiteral(r)) 2193 return NULL; 2194 break; 2195 } 2196 } 2197 Break2: 2198 lastunary = isunary; 2199 } 2200 return ps.DoFinish(); 2201 } 2202 2203 } // namespace re2 2204