1 /* 2 * Copyright (C) 2008 Apple Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #include "config.h" 27 #include "WRECParser.h" 28 29 #if ENABLE(WREC) 30 31 #include "CharacterClassConstructor.h" 32 #include "WRECFunctors.h" 33 34 using namespace WTF; 35 36 namespace JSC { namespace WREC { 37 38 // These error messages match the error messages used by PCRE. 39 const char* Parser::QuantifierOutOfOrder = "numbers out of order in {} quantifier"; 40 const char* Parser::QuantifierWithoutAtom = "nothing to repeat"; 41 const char* Parser::ParenthesesUnmatched = "unmatched parentheses"; 42 const char* Parser::ParenthesesTypeInvalid = "unrecognized character after (?"; 43 const char* Parser::ParenthesesNotSupported = ""; // Not a user-visible syntax error -- just signals a syntax that WREC doesn't support yet. 44 const char* Parser::CharacterClassUnmatched = "missing terminating ] for character class"; 45 const char* Parser::CharacterClassOutOfOrder = "range out of order in character class"; 46 const char* Parser::EscapeUnterminated = "\\ at end of pattern"; 47 48 class PatternCharacterSequence { 49 typedef Generator::JumpList JumpList; 50 51 public: 52 PatternCharacterSequence(Generator& generator, JumpList& failures) 53 : m_generator(generator) 54 , m_failures(failures) 55 { 56 } 57 58 size_t size() { return m_sequence.size(); } 59 60 void append(int ch) 61 { 62 m_sequence.append(ch); 63 } 64 65 void flush() 66 { 67 if (!m_sequence.size()) 68 return; 69 70 m_generator.generatePatternCharacterSequence(m_failures, m_sequence.begin(), m_sequence.size()); 71 m_sequence.clear(); 72 } 73 74 void flush(const Quantifier& quantifier) 75 { 76 if (!m_sequence.size()) 77 return; 78 79 m_generator.generatePatternCharacterSequence(m_failures, m_sequence.begin(), m_sequence.size() - 1); 80 81 switch (quantifier.type) { 82 case Quantifier::None: 83 case Quantifier::Error: 84 ASSERT_NOT_REACHED(); 85 break; 86 87 case Quantifier::Greedy: { 88 GeneratePatternCharacterFunctor functor(m_sequence.last()); 89 m_generator.generateGreedyQuantifier(m_failures, functor, quantifier.min, quantifier.max); 90 break; 91 } 92 93 case Quantifier::NonGreedy: { 94 GeneratePatternCharacterFunctor functor(m_sequence.last()); 95 m_generator.generateNonGreedyQuantifier(m_failures, functor, quantifier.min, quantifier.max); 96 break; 97 } 98 } 99 100 m_sequence.clear(); 101 } 102 103 private: 104 Generator& m_generator; 105 JumpList& m_failures; 106 Vector<int, 8> m_sequence; 107 }; 108 109 ALWAYS_INLINE Quantifier Parser::consumeGreedyQuantifier() 110 { 111 switch (peek()) { 112 case '?': 113 consume(); 114 return Quantifier(Quantifier::Greedy, 0, 1); 115 116 case '*': 117 consume(); 118 return Quantifier(Quantifier::Greedy, 0); 119 120 case '+': 121 consume(); 122 return Quantifier(Quantifier::Greedy, 1); 123 124 case '{': { 125 SavedState state(*this); 126 consume(); 127 128 // Accept: {n}, {n,}, {n,m}. 129 // Reject: {n,m} where n > m. 130 // Ignore: Anything else, such as {n, m}. 131 132 if (!peekIsDigit()) { 133 state.restore(); 134 return Quantifier(); 135 } 136 137 unsigned min = consumeNumber(); 138 unsigned max = min; 139 140 if (peek() == ',') { 141 consume(); 142 max = peekIsDigit() ? consumeNumber() : Quantifier::Infinity; 143 } 144 145 if (peek() != '}') { 146 state.restore(); 147 return Quantifier(); 148 } 149 consume(); 150 151 if (min > max) { 152 setError(QuantifierOutOfOrder); 153 return Quantifier(Quantifier::Error); 154 } 155 156 return Quantifier(Quantifier::Greedy, min, max); 157 } 158 159 default: 160 return Quantifier(); // No quantifier. 161 } 162 } 163 164 Quantifier Parser::consumeQuantifier() 165 { 166 Quantifier q = consumeGreedyQuantifier(); 167 168 if ((q.type == Quantifier::Greedy) && (peek() == '?')) { 169 consume(); 170 q.type = Quantifier::NonGreedy; 171 } 172 173 return q; 174 } 175 176 bool Parser::parseCharacterClassQuantifier(JumpList& failures, const CharacterClass& charClass, bool invert) 177 { 178 Quantifier q = consumeQuantifier(); 179 180 switch (q.type) { 181 case Quantifier::None: { 182 m_generator.generateCharacterClass(failures, charClass, invert); 183 break; 184 } 185 186 case Quantifier::Greedy: { 187 GenerateCharacterClassFunctor functor(&charClass, invert); 188 m_generator.generateGreedyQuantifier(failures, functor, q.min, q.max); 189 break; 190 } 191 192 case Quantifier::NonGreedy: { 193 GenerateCharacterClassFunctor functor(&charClass, invert); 194 m_generator.generateNonGreedyQuantifier(failures, functor, q.min, q.max); 195 break; 196 } 197 198 case Quantifier::Error: 199 return false; 200 } 201 202 return true; 203 } 204 205 bool Parser::parseBackreferenceQuantifier(JumpList& failures, unsigned subpatternId) 206 { 207 Quantifier q = consumeQuantifier(); 208 209 switch (q.type) { 210 case Quantifier::None: { 211 m_generator.generateBackreference(failures, subpatternId); 212 break; 213 } 214 215 case Quantifier::Greedy: 216 case Quantifier::NonGreedy: 217 m_generator.generateBackreferenceQuantifier(failures, q.type, subpatternId, q.min, q.max); 218 return true; 219 220 case Quantifier::Error: 221 return false; 222 } 223 224 return true; 225 } 226 227 bool Parser::parseParentheses(JumpList& failures) 228 { 229 ParenthesesType type = consumeParenthesesType(); 230 231 // FIXME: WREC originally failed to backtrack correctly in cases such as 232 // "c".match(/(.*)c/). Now, most parentheses handling is disabled. For 233 // unsupported parentheses, we fall back on PCRE. 234 235 switch (type) { 236 case Generator::Assertion: { 237 m_generator.generateParenthesesAssertion(failures); 238 239 if (consume() != ')') { 240 setError(ParenthesesUnmatched); 241 return false; 242 } 243 244 Quantifier quantifier = consumeQuantifier(); 245 if (quantifier.type != Quantifier::None && quantifier.min == 0) { 246 setError(ParenthesesNotSupported); 247 return false; 248 } 249 250 return true; 251 } 252 case Generator::InvertedAssertion: { 253 m_generator.generateParenthesesInvertedAssertion(failures); 254 255 if (consume() != ')') { 256 setError(ParenthesesUnmatched); 257 return false; 258 } 259 260 Quantifier quantifier = consumeQuantifier(); 261 if (quantifier.type != Quantifier::None && quantifier.min == 0) { 262 setError(ParenthesesNotSupported); 263 return false; 264 } 265 266 return true; 267 } 268 default: 269 setError(ParenthesesNotSupported); 270 return false; 271 } 272 } 273 274 bool Parser::parseCharacterClass(JumpList& failures) 275 { 276 bool invert = false; 277 if (peek() == '^') { 278 consume(); 279 invert = true; 280 } 281 282 CharacterClassConstructor constructor(m_ignoreCase); 283 284 int ch; 285 while ((ch = peek()) != ']') { 286 switch (ch) { 287 case EndOfPattern: 288 setError(CharacterClassUnmatched); 289 return false; 290 291 case '\\': { 292 consume(); 293 Escape escape = consumeEscape(true); 294 295 switch (escape.type()) { 296 case Escape::PatternCharacter: { 297 int character = PatternCharacterEscape::cast(escape).character(); 298 if (character == '-') 299 constructor.flushBeforeEscapedHyphen(); 300 constructor.put(character); 301 break; 302 } 303 case Escape::CharacterClass: { 304 const CharacterClassEscape& characterClassEscape = CharacterClassEscape::cast(escape); 305 ASSERT(!characterClassEscape.invert()); 306 constructor.append(characterClassEscape.characterClass()); 307 break; 308 } 309 case Escape::Error: 310 return false; 311 case Escape::Backreference: 312 case Escape::WordBoundaryAssertion: { 313 ASSERT_NOT_REACHED(); 314 break; 315 } 316 } 317 break; 318 } 319 320 default: 321 consume(); 322 constructor.put(ch); 323 } 324 } 325 consume(); 326 327 // lazily catch reversed ranges ([z-a])in character classes 328 if (constructor.isUpsideDown()) { 329 setError(CharacterClassOutOfOrder); 330 return false; 331 } 332 333 constructor.flush(); 334 CharacterClass charClass = constructor.charClass(); 335 return parseCharacterClassQuantifier(failures, charClass, invert); 336 } 337 338 bool Parser::parseNonCharacterEscape(JumpList& failures, const Escape& escape) 339 { 340 switch (escape.type()) { 341 case Escape::PatternCharacter: 342 ASSERT_NOT_REACHED(); 343 return false; 344 345 case Escape::CharacterClass: 346 return parseCharacterClassQuantifier(failures, CharacterClassEscape::cast(escape).characterClass(), CharacterClassEscape::cast(escape).invert()); 347 348 case Escape::Backreference: 349 return parseBackreferenceQuantifier(failures, BackreferenceEscape::cast(escape).subpatternId()); 350 351 case Escape::WordBoundaryAssertion: 352 m_generator.generateAssertionWordBoundary(failures, WordBoundaryAssertionEscape::cast(escape).invert()); 353 return true; 354 355 case Escape::Error: 356 return false; 357 } 358 359 ASSERT_NOT_REACHED(); 360 return false; 361 } 362 363 Escape Parser::consumeEscape(bool inCharacterClass) 364 { 365 switch (peek()) { 366 case EndOfPattern: 367 setError(EscapeUnterminated); 368 return Escape(Escape::Error); 369 370 // Assertions 371 case 'b': 372 consume(); 373 if (inCharacterClass) 374 return PatternCharacterEscape('\b'); 375 return WordBoundaryAssertionEscape(false); // do not invert 376 case 'B': 377 consume(); 378 if (inCharacterClass) 379 return PatternCharacterEscape('B'); 380 return WordBoundaryAssertionEscape(true); // invert 381 382 // CharacterClassEscape 383 case 'd': 384 consume(); 385 return CharacterClassEscape(CharacterClass::digits(), false); 386 case 's': 387 consume(); 388 return CharacterClassEscape(CharacterClass::spaces(), false); 389 case 'w': 390 consume(); 391 return CharacterClassEscape(CharacterClass::wordchar(), false); 392 case 'D': 393 consume(); 394 return inCharacterClass 395 ? CharacterClassEscape(CharacterClass::nondigits(), false) 396 : CharacterClassEscape(CharacterClass::digits(), true); 397 case 'S': 398 consume(); 399 return inCharacterClass 400 ? CharacterClassEscape(CharacterClass::nonspaces(), false) 401 : CharacterClassEscape(CharacterClass::spaces(), true); 402 case 'W': 403 consume(); 404 return inCharacterClass 405 ? CharacterClassEscape(CharacterClass::nonwordchar(), false) 406 : CharacterClassEscape(CharacterClass::wordchar(), true); 407 408 // DecimalEscape 409 case '1': 410 case '2': 411 case '3': 412 case '4': 413 case '5': 414 case '6': 415 case '7': 416 case '8': 417 case '9': { 418 if (peekDigit() > m_numSubpatterns || inCharacterClass) { 419 // To match Firefox, we parse an invalid backreference in the range [1-7] 420 // as an octal escape. 421 return peekDigit() > 7 ? PatternCharacterEscape('\\') : PatternCharacterEscape(consumeOctal()); 422 } 423 424 int value = 0; 425 do { 426 unsigned newValue = value * 10 + peekDigit(); 427 if (newValue > m_numSubpatterns) 428 break; 429 value = newValue; 430 consume(); 431 } while (peekIsDigit()); 432 433 return BackreferenceEscape(value); 434 } 435 436 // Octal escape 437 case '0': 438 consume(); 439 return PatternCharacterEscape(consumeOctal()); 440 441 // ControlEscape 442 case 'f': 443 consume(); 444 return PatternCharacterEscape('\f'); 445 case 'n': 446 consume(); 447 return PatternCharacterEscape('\n'); 448 case 'r': 449 consume(); 450 return PatternCharacterEscape('\r'); 451 case 't': 452 consume(); 453 return PatternCharacterEscape('\t'); 454 case 'v': 455 consume(); 456 return PatternCharacterEscape('\v'); 457 458 // ControlLetter 459 case 'c': { 460 SavedState state(*this); 461 consume(); 462 463 int control = consume(); 464 // To match Firefox, inside a character class, we also accept numbers 465 // and '_' as control characters. 466 if ((!inCharacterClass && !isASCIIAlpha(control)) || (!isASCIIAlphanumeric(control) && control != '_')) { 467 state.restore(); 468 return PatternCharacterEscape('\\'); 469 } 470 return PatternCharacterEscape(control & 31); 471 } 472 473 // HexEscape 474 case 'x': { 475 consume(); 476 477 SavedState state(*this); 478 int x = consumeHex(2); 479 if (x == -1) { 480 state.restore(); 481 return PatternCharacterEscape('x'); 482 } 483 return PatternCharacterEscape(x); 484 } 485 486 // UnicodeEscape 487 case 'u': { 488 consume(); 489 490 SavedState state(*this); 491 int x = consumeHex(4); 492 if (x == -1) { 493 state.restore(); 494 return PatternCharacterEscape('u'); 495 } 496 return PatternCharacterEscape(x); 497 } 498 499 // IdentityEscape 500 default: 501 return PatternCharacterEscape(consume()); 502 } 503 } 504 505 void Parser::parseAlternative(JumpList& failures) 506 { 507 PatternCharacterSequence sequence(m_generator, failures); 508 509 while (1) { 510 switch (peek()) { 511 case EndOfPattern: 512 case '|': 513 case ')': 514 sequence.flush(); 515 return; 516 517 case '*': 518 case '+': 519 case '?': 520 case '{': { 521 Quantifier q = consumeQuantifier(); 522 523 if (q.type == Quantifier::None) { 524 sequence.append(consume()); 525 continue; 526 } 527 528 if (q.type == Quantifier::Error) 529 return; 530 531 if (!sequence.size()) { 532 setError(QuantifierWithoutAtom); 533 return; 534 } 535 536 sequence.flush(q); 537 continue; 538 } 539 540 case '^': 541 consume(); 542 543 sequence.flush(); 544 m_generator.generateAssertionBOL(failures); 545 continue; 546 547 case '$': 548 consume(); 549 550 sequence.flush(); 551 m_generator.generateAssertionEOL(failures); 552 continue; 553 554 case '.': 555 consume(); 556 557 sequence.flush(); 558 if (!parseCharacterClassQuantifier(failures, CharacterClass::newline(), true)) 559 return; 560 continue; 561 562 case '[': 563 consume(); 564 565 sequence.flush(); 566 if (!parseCharacterClass(failures)) 567 return; 568 continue; 569 570 case '(': 571 consume(); 572 573 sequence.flush(); 574 if (!parseParentheses(failures)) 575 return; 576 continue; 577 578 case '\\': { 579 consume(); 580 581 Escape escape = consumeEscape(false); 582 if (escape.type() == Escape::PatternCharacter) { 583 sequence.append(PatternCharacterEscape::cast(escape).character()); 584 continue; 585 } 586 587 sequence.flush(); 588 if (!parseNonCharacterEscape(failures, escape)) 589 return; 590 continue; 591 } 592 593 default: 594 sequence.append(consume()); 595 continue; 596 } 597 } 598 } 599 600 /* 601 TOS holds index. 602 */ 603 void Parser::parseDisjunction(JumpList& failures) 604 { 605 parseAlternative(failures); 606 if (peek() != '|') 607 return; 608 609 JumpList successes; 610 do { 611 consume(); 612 m_generator.terminateAlternative(successes, failures); 613 parseAlternative(failures); 614 } while (peek() == '|'); 615 616 m_generator.terminateDisjunction(successes); 617 } 618 619 Generator::ParenthesesType Parser::consumeParenthesesType() 620 { 621 if (peek() != '?') 622 return Generator::Capturing; 623 consume(); 624 625 switch (consume()) { 626 case ':': 627 return Generator::NonCapturing; 628 629 case '=': 630 return Generator::Assertion; 631 632 case '!': 633 return Generator::InvertedAssertion; 634 635 default: 636 setError(ParenthesesTypeInvalid); 637 return Generator::Error; 638 } 639 } 640 641 } } // namespace JSC::WREC 642 643 #endif // ENABLE(WREC) 644