// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Features shared by parsing and pre-parsing scanners.
29 30 #ifndef V8_SCANNER_BASE_H_ 31 #define V8_SCANNER_BASE_H_ 32 33 #include "globals.h" 34 #include "checks.h" 35 #include "allocation.h" 36 #include "token.h" 37 #include "unicode-inl.h" 38 #include "char-predicates.h" 39 #include "utils.h" 40 #include "list-inl.h" 41 42 namespace v8 { 43 namespace internal { 44 45 // Returns the value (0 .. 15) of a hexadecimal character c. 46 // If c is not a legal hexadecimal character, returns a value < 0. 47 inline int HexValue(uc32 c) { 48 c -= '0'; 49 if (static_cast<unsigned>(c) <= 9) return c; 50 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36. 51 if (static_cast<unsigned>(c) <= 5) return c + 10; 52 return -1; 53 } 54 55 56 // --------------------------------------------------------------------- 57 // Buffered stream of characters, using an internal UC16 buffer. 58 59 class UC16CharacterStream { 60 public: 61 UC16CharacterStream() : pos_(0) { } 62 virtual ~UC16CharacterStream() { } 63 64 // Returns and advances past the next UC16 character in the input 65 // stream. If there are no more characters, it returns a negative 66 // value. 67 inline uc32 Advance() { 68 if (buffer_cursor_ < buffer_end_ || ReadBlock()) { 69 pos_++; 70 return static_cast<uc32>(*(buffer_cursor_++)); 71 } 72 // Note: currently the following increment is necessary to avoid a 73 // parser problem! The scanner treats the final kEndOfInput as 74 // a character with a position, and does math relative to that 75 // position. 76 pos_++; 77 78 return kEndOfInput; 79 } 80 81 // Return the current position in the character stream. 82 // Starts at zero. 83 inline unsigned pos() const { return pos_; } 84 85 // Skips forward past the next character_count UC16 characters 86 // in the input, or until the end of input if that comes sooner. 87 // Returns the number of characters actually skipped. 
If less 88 // than character_count, 89 inline unsigned SeekForward(unsigned character_count) { 90 unsigned buffered_chars = 91 static_cast<unsigned>(buffer_end_ - buffer_cursor_); 92 if (character_count <= buffered_chars) { 93 buffer_cursor_ += character_count; 94 pos_ += character_count; 95 return character_count; 96 } 97 return SlowSeekForward(character_count); 98 } 99 100 // Pushes back the most recently read UC16 character (or negative 101 // value if at end of input), i.e., the value returned by the most recent 102 // call to Advance. 103 // Must not be used right after calling SeekForward. 104 virtual void PushBack(int32_t character) = 0; 105 106 protected: 107 static const uc32 kEndOfInput = -1; 108 109 // Ensures that the buffer_cursor_ points to the character at 110 // position pos_ of the input, if possible. If the position 111 // is at or after the end of the input, return false. If there 112 // are more characters available, return true. 113 virtual bool ReadBlock() = 0; 114 virtual unsigned SlowSeekForward(unsigned character_count) = 0; 115 116 const uc16* buffer_cursor_; 117 const uc16* buffer_end_; 118 unsigned pos_; 119 }; 120 121 122 class UnicodeCache { 123 // --------------------------------------------------------------------- 124 // Caching predicates used by scanners. 
125 public: 126 UnicodeCache() {} 127 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; 128 129 StaticResource<Utf8Decoder>* utf8_decoder() { 130 return &utf8_decoder_; 131 } 132 133 bool IsIdentifierStart(unibrow::uchar c) { return kIsIdentifierStart.get(c); } 134 bool IsIdentifierPart(unibrow::uchar c) { return kIsIdentifierPart.get(c); } 135 bool IsLineTerminator(unibrow::uchar c) { return kIsLineTerminator.get(c); } 136 bool IsWhiteSpace(unibrow::uchar c) { return kIsWhiteSpace.get(c); } 137 138 private: 139 140 unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart; 141 unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart; 142 unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; 143 unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; 144 StaticResource<Utf8Decoder> utf8_decoder_; 145 146 DISALLOW_COPY_AND_ASSIGN(UnicodeCache); 147 }; 148 149 150 // ---------------------------------------------------------------------------- 151 // LiteralBuffer - Collector of chars of literals. 
152 153 class LiteralBuffer { 154 public: 155 LiteralBuffer() : is_ascii_(true), position_(0), backing_store_() { } 156 157 ~LiteralBuffer() { 158 if (backing_store_.length() > 0) { 159 backing_store_.Dispose(); 160 } 161 } 162 163 inline void AddChar(uc16 character) { 164 if (position_ >= backing_store_.length()) ExpandBuffer(); 165 if (is_ascii_) { 166 if (character < kMaxAsciiCharCodeU) { 167 backing_store_[position_] = static_cast<byte>(character); 168 position_ += kASCIISize; 169 return; 170 } 171 ConvertToUC16(); 172 } 173 *reinterpret_cast<uc16*>(&backing_store_[position_]) = character; 174 position_ += kUC16Size; 175 } 176 177 bool is_ascii() { return is_ascii_; } 178 179 Vector<const uc16> uc16_literal() { 180 ASSERT(!is_ascii_); 181 ASSERT((position_ & 0x1) == 0); 182 return Vector<const uc16>( 183 reinterpret_cast<const uc16*>(backing_store_.start()), 184 position_ >> 1); 185 } 186 187 Vector<const char> ascii_literal() { 188 ASSERT(is_ascii_); 189 return Vector<const char>( 190 reinterpret_cast<const char*>(backing_store_.start()), 191 position_); 192 } 193 194 int length() { 195 return is_ascii_ ? 
position_ : (position_ >> 1); 196 } 197 198 void Reset() { 199 position_ = 0; 200 is_ascii_ = true; 201 } 202 private: 203 static const int kInitialCapacity = 16; 204 static const int kGrowthFactory = 4; 205 static const int kMinConversionSlack = 256; 206 static const int kMaxGrowth = 1 * MB; 207 inline int NewCapacity(int min_capacity) { 208 int capacity = Max(min_capacity, backing_store_.length()); 209 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth); 210 return new_capacity; 211 } 212 213 void ExpandBuffer() { 214 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity)); 215 memcpy(new_store.start(), backing_store_.start(), position_); 216 backing_store_.Dispose(); 217 backing_store_ = new_store; 218 } 219 220 void ConvertToUC16() { 221 ASSERT(is_ascii_); 222 Vector<byte> new_store; 223 int new_content_size = position_ * kUC16Size; 224 if (new_content_size >= backing_store_.length()) { 225 // Ensure room for all currently read characters as UC16 as well 226 // as the character about to be stored. 227 new_store = Vector<byte>::New(NewCapacity(new_content_size)); 228 } else { 229 new_store = backing_store_; 230 } 231 char* src = reinterpret_cast<char*>(backing_store_.start()); 232 uc16* dst = reinterpret_cast<uc16*>(new_store.start()); 233 for (int i = position_ - 1; i >= 0; i--) { 234 dst[i] = src[i]; 235 } 236 if (new_store.start() != backing_store_.start()) { 237 backing_store_.Dispose(); 238 backing_store_ = new_store; 239 } 240 position_ = new_content_size; 241 is_ascii_ = false; 242 } 243 244 bool is_ascii_; 245 int position_; 246 Vector<byte> backing_store_; 247 248 DISALLOW_COPY_AND_ASSIGN(LiteralBuffer); 249 }; 250 251 252 // ---------------------------------------------------------------------------- 253 // Scanner base-class. 254 255 // Generic functionality used by both JSON and JavaScript scanners. 256 class Scanner { 257 public: 258 // -1 is outside of the range of any real source code. 
259 static const int kNoOctalLocation = -1; 260 261 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; 262 263 class LiteralScope { 264 public: 265 explicit LiteralScope(Scanner* self); 266 ~LiteralScope(); 267 void Complete(); 268 269 private: 270 Scanner* scanner_; 271 bool complete_; 272 }; 273 274 explicit Scanner(UnicodeCache* scanner_contants); 275 276 // Returns the current token again. 277 Token::Value current_token() { return current_.token; } 278 279 // One token look-ahead (past the token returned by Next()). 280 Token::Value peek() const { return next_.token; } 281 282 struct Location { 283 Location(int b, int e) : beg_pos(b), end_pos(e) { } 284 Location() : beg_pos(0), end_pos(0) { } 285 286 bool IsValid() const { 287 return beg_pos >= 0 && end_pos >= beg_pos; 288 } 289 290 int beg_pos; 291 int end_pos; 292 }; 293 294 static Location NoLocation() { 295 return Location(-1, -1); 296 } 297 298 // Returns the location information for the current token 299 // (the token returned by Next()). 300 Location location() const { return current_.location; } 301 Location peek_location() const { return next_.location; } 302 303 // Returns the location of the last seen octal literal 304 int octal_position() const { return octal_pos_; } 305 void clear_octal_position() { octal_pos_ = -1; } 306 307 // Returns the literal string, if any, for the current token (the 308 // token returned by Next()). The string is 0-terminated and in 309 // UTF-8 format; they may contain 0-characters. Literal strings are 310 // collected for identifiers, strings, and numbers. 311 // These functions only give the correct result if the literal 312 // was scanned between calls to StartLiteral() and TerminateLiteral(). 
313 bool is_literal_ascii() { 314 ASSERT_NOT_NULL(current_.literal_chars); 315 return current_.literal_chars->is_ascii(); 316 } 317 Vector<const char> literal_ascii_string() { 318 ASSERT_NOT_NULL(current_.literal_chars); 319 return current_.literal_chars->ascii_literal(); 320 } 321 Vector<const uc16> literal_uc16_string() { 322 ASSERT_NOT_NULL(current_.literal_chars); 323 return current_.literal_chars->uc16_literal(); 324 } 325 int literal_length() const { 326 ASSERT_NOT_NULL(current_.literal_chars); 327 return current_.literal_chars->length(); 328 } 329 330 // Returns the literal string for the next token (the token that 331 // would be returned if Next() were called). 332 bool is_next_literal_ascii() { 333 ASSERT_NOT_NULL(next_.literal_chars); 334 return next_.literal_chars->is_ascii(); 335 } 336 Vector<const char> next_literal_ascii_string() { 337 ASSERT_NOT_NULL(next_.literal_chars); 338 return next_.literal_chars->ascii_literal(); 339 } 340 Vector<const uc16> next_literal_uc16_string() { 341 ASSERT_NOT_NULL(next_.literal_chars); 342 return next_.literal_chars->uc16_literal(); 343 } 344 int next_literal_length() const { 345 ASSERT_NOT_NULL(next_.literal_chars); 346 return next_.literal_chars->length(); 347 } 348 349 static const int kCharacterLookaheadBufferSize = 1; 350 351 protected: 352 // The current and look-ahead token. 353 struct TokenDesc { 354 Token::Value token; 355 Location location; 356 LiteralBuffer* literal_chars; 357 }; 358 359 // Call this after setting source_ to the input. 360 void Init() { 361 // Set c0_ (one character ahead) 362 ASSERT(kCharacterLookaheadBufferSize == 1); 363 Advance(); 364 // Initialize current_ to not refer to a literal. 365 current_.literal_chars = NULL; 366 } 367 368 // Literal buffer support 369 inline void StartLiteral() { 370 LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ? 
371 &literal_buffer2_ : &literal_buffer1_; 372 free_buffer->Reset(); 373 next_.literal_chars = free_buffer; 374 } 375 376 inline void AddLiteralChar(uc32 c) { 377 ASSERT_NOT_NULL(next_.literal_chars); 378 next_.literal_chars->AddChar(c); 379 } 380 381 // Complete scanning of a literal. 382 inline void TerminateLiteral() { 383 // Does nothing in the current implementation. 384 } 385 386 // Stops scanning of a literal and drop the collected characters, 387 // e.g., due to an encountered error. 388 inline void DropLiteral() { 389 next_.literal_chars = NULL; 390 } 391 392 inline void AddLiteralCharAdvance() { 393 AddLiteralChar(c0_); 394 Advance(); 395 } 396 397 // Low-level scanning support. 398 void Advance() { c0_ = source_->Advance(); } 399 void PushBack(uc32 ch) { 400 source_->PushBack(c0_); 401 c0_ = ch; 402 } 403 404 inline Token::Value Select(Token::Value tok) { 405 Advance(); 406 return tok; 407 } 408 409 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) { 410 Advance(); 411 if (c0_ == next) { 412 Advance(); 413 return then; 414 } else { 415 return else_; 416 } 417 } 418 419 uc32 ScanHexEscape(uc32 c, int length); 420 421 // Scans octal escape sequence. Also accepts "\0" decimal escape sequence. 422 uc32 ScanOctalEscape(uc32 c, int length); 423 424 // Return the current source position. 425 int source_pos() { 426 return source_->pos() - kCharacterLookaheadBufferSize; 427 } 428 429 UnicodeCache* unicode_cache_; 430 431 // Buffers collecting literal strings, numbers, etc. 432 LiteralBuffer literal_buffer1_; 433 LiteralBuffer literal_buffer2_; 434 435 TokenDesc current_; // desc for current token (as returned by Next()) 436 TokenDesc next_; // desc for next token (one token look-ahead) 437 438 // Input stream. Must be initialized to an UC16CharacterStream. 439 UC16CharacterStream* source_; 440 441 // Start position of the octal literal last scanned. 
442 int octal_pos_; 443 444 // One Unicode character look-ahead; c0_ < 0 at the end of the input. 445 uc32 c0_; 446 }; 447 448 // ---------------------------------------------------------------------------- 449 // JavaScriptScanner - base logic for JavaScript scanning. 450 451 class JavaScriptScanner : public Scanner { 452 public: 453 // A LiteralScope that disables recording of some types of JavaScript 454 // literals. If the scanner is configured to not record the specific 455 // type of literal, the scope will not call StartLiteral. 456 class LiteralScope { 457 public: 458 explicit LiteralScope(JavaScriptScanner* self) 459 : scanner_(self), complete_(false) { 460 scanner_->StartLiteral(); 461 } 462 ~LiteralScope() { 463 if (!complete_) scanner_->DropLiteral(); 464 } 465 void Complete() { 466 scanner_->TerminateLiteral(); 467 complete_ = true; 468 } 469 470 private: 471 JavaScriptScanner* scanner_; 472 bool complete_; 473 }; 474 475 explicit JavaScriptScanner(UnicodeCache* scanner_contants); 476 477 // Returns the next token. 478 Token::Value Next(); 479 480 // Returns true if there was a line terminator before the peek'ed token. 481 bool has_line_terminator_before_next() const { 482 return has_line_terminator_before_next_; 483 } 484 485 // Scans the input as a regular expression pattern, previous 486 // character(s) must be /(=). Returns true if a pattern is scanned. 487 bool ScanRegExpPattern(bool seen_equal); 488 // Returns true if regexp flags are scanned (always since flags can 489 // be empty). 490 bool ScanRegExpFlags(); 491 492 // Tells whether the buffer contains an identifier (no escapes). 493 // Used for checking if a property name is an identifier. 494 static bool IsIdentifier(unibrow::CharacterStream* buffer); 495 496 // Seek forward to the given position. 
This operation does not 497 // work in general, for instance when there are pushed back 498 // characters, but works for seeking forward until simple delimiter 499 // tokens, which is what it is used for. 500 void SeekForward(int pos); 501 502 protected: 503 bool SkipWhiteSpace(); 504 Token::Value SkipSingleLineComment(); 505 Token::Value SkipMultiLineComment(); 506 507 // Scans a single JavaScript token. 508 void Scan(); 509 510 void ScanDecimalDigits(); 511 Token::Value ScanNumber(bool seen_period); 512 Token::Value ScanIdentifierOrKeyword(); 513 Token::Value ScanIdentifierSuffix(LiteralScope* literal); 514 515 void ScanEscape(); 516 Token::Value ScanString(); 517 518 // Scans a possible HTML comment -- begins with '<!'. 519 Token::Value ScanHtmlComment(); 520 521 // Decodes a unicode escape-sequence which is part of an identifier. 522 // If the escape sequence cannot be decoded the result is kBadChar. 523 uc32 ScanIdentifierUnicodeEscape(); 524 525 bool has_line_terminator_before_next_; 526 }; 527 528 529 // ---------------------------------------------------------------------------- 530 // Keyword matching state machine. 531 532 class KeywordMatcher { 533 // Incrementally recognize keywords. 534 // 535 // Recognized keywords: 536 // break case catch const* continue debugger* default delete do else 537 // finally false for function if in instanceof native* new null 538 // return switch this throw true try typeof var void while with 539 // 540 // *: Actually "future reserved keywords". These are the only ones we 541 // recognize, the remaining are allowed as identifiers. 542 // In ES5 strict mode, we should disallow all reserved keywords. 
543 public: 544 KeywordMatcher() 545 : state_(INITIAL), 546 token_(Token::IDENTIFIER), 547 keyword_(NULL), 548 counter_(0), 549 keyword_token_(Token::ILLEGAL) {} 550 551 Token::Value token() { return token_; } 552 553 inline bool AddChar(unibrow::uchar input) { 554 if (state_ != UNMATCHABLE) { 555 Step(input); 556 } 557 return state_ != UNMATCHABLE; 558 } 559 560 void Fail() { 561 token_ = Token::IDENTIFIER; 562 state_ = UNMATCHABLE; 563 } 564 565 private: 566 enum State { 567 UNMATCHABLE, 568 INITIAL, 569 KEYWORD_PREFIX, 570 KEYWORD_MATCHED, 571 C, 572 CA, 573 CO, 574 CON, 575 D, 576 DE, 577 E, 578 EX, 579 F, 580 I, 581 IM, 582 IMP, 583 IN, 584 N, 585 P, 586 PR, 587 S, 588 T, 589 TH, 590 TR, 591 V, 592 W 593 }; 594 595 struct FirstState { 596 const char* keyword; 597 State state; 598 Token::Value token; 599 }; 600 601 // Range of possible first characters of a keyword. 602 static const unsigned int kFirstCharRangeMin = 'b'; 603 static const unsigned int kFirstCharRangeMax = 'y'; 604 static const unsigned int kFirstCharRangeLength = 605 kFirstCharRangeMax - kFirstCharRangeMin + 1; 606 // State map for first keyword character range. 607 static FirstState first_states_[kFirstCharRangeLength]; 608 609 // If input equals keyword's character at position, continue matching keyword 610 // from that position. 611 inline bool MatchKeywordStart(unibrow::uchar input, 612 const char* keyword, 613 int position, 614 Token::Value token_if_match) { 615 if (input != static_cast<unibrow::uchar>(keyword[position])) { 616 return false; 617 } 618 state_ = KEYWORD_PREFIX; 619 this->keyword_ = keyword; 620 this->counter_ = position + 1; 621 this->keyword_token_ = token_if_match; 622 return true; 623 } 624 625 // If input equals match character, transition to new state and return true. 
626 inline bool MatchState(unibrow::uchar input, char match, State new_state) { 627 if (input != static_cast<unibrow::uchar>(match)) { 628 return false; 629 } 630 state_ = new_state; 631 return true; 632 } 633 634 inline bool MatchKeyword(unibrow::uchar input, 635 char match, 636 State new_state, 637 Token::Value keyword_token) { 638 if (input != static_cast<unibrow::uchar>(match)) { 639 return false; 640 } 641 state_ = new_state; 642 token_ = keyword_token; 643 return true; 644 } 645 646 void Step(unibrow::uchar input); 647 648 // Current state. 649 State state_; 650 // Token for currently added characters. 651 Token::Value token_; 652 653 // Matching a specific keyword string (there is only one possible valid 654 // keyword with the current prefix). 655 const char* keyword_; 656 int counter_; 657 Token::Value keyword_token_; 658 }; 659 660 661 } } // namespace v8::internal 662 663 #endif // V8_SCANNER_BASE_H_ 664