1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Features shared by parsing and pre-parsing scanners. 6 7 #ifndef V8_SCANNER_H_ 8 #define V8_SCANNER_H_ 9 10 #include "src/allocation.h" 11 #include "src/char-predicates.h" 12 #include "src/checks.h" 13 #include "src/globals.h" 14 #include "src/hashmap.h" 15 #include "src/list.h" 16 #include "src/token.h" 17 #include "src/unicode-inl.h" 18 #include "src/utils.h" 19 20 namespace v8 { 21 namespace internal { 22 23 24 class ParserRecorder; 25 26 27 // Returns the value (0 .. 15) of a hexadecimal character c. 28 // If c is not a legal hexadecimal character, returns a value < 0. 29 inline int HexValue(uc32 c) { 30 c -= '0'; 31 if (static_cast<unsigned>(c) <= 9) return c; 32 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36. 33 if (static_cast<unsigned>(c) <= 5) return c + 10; 34 return -1; 35 } 36 37 38 // --------------------------------------------------------------------- 39 // Buffered stream of UTF-16 code units, using an internal UTF-16 buffer. 40 // A code unit is a 16 bit value representing either a 16 bit code point 41 // or one part of a surrogate pair that make a single 21 bit code point. 42 43 class Utf16CharacterStream { 44 public: 45 Utf16CharacterStream() : pos_(0) { } 46 virtual ~Utf16CharacterStream() { } 47 48 // Returns and advances past the next UTF-16 code unit in the input 49 // stream. If there are no more code units, it returns a negative 50 // value. 51 inline uc32 Advance() { 52 if (buffer_cursor_ < buffer_end_ || ReadBlock()) { 53 pos_++; 54 return static_cast<uc32>(*(buffer_cursor_++)); 55 } 56 // Note: currently the following increment is necessary to avoid a 57 // parser problem! The scanner treats the final kEndOfInput as 58 // a code unit with a position, and does math relative to that 59 // position. 60 pos_++; 61 62 return kEndOfInput; 63 } 64 65 // Return the current position in the code unit stream. 66 // Starts at zero. 67 inline unsigned pos() const { return pos_; } 68 69 // Skips forward past the next code_unit_count UTF-16 code units 70 // in the input, or until the end of input if that comes sooner. 71 // Returns the number of code units actually skipped. If less 72 // than code_unit_count, 73 inline unsigned SeekForward(unsigned code_unit_count) { 74 unsigned buffered_chars = 75 static_cast<unsigned>(buffer_end_ - buffer_cursor_); 76 if (code_unit_count <= buffered_chars) { 77 buffer_cursor_ += code_unit_count; 78 pos_ += code_unit_count; 79 return code_unit_count; 80 } 81 return SlowSeekForward(code_unit_count); 82 } 83 84 // Pushes back the most recently read UTF-16 code unit (or negative 85 // value if at end of input), i.e., the value returned by the most recent 86 // call to Advance. 87 // Must not be used right after calling SeekForward. 88 virtual void PushBack(int32_t code_unit) = 0; 89 90 protected: 91 static const uc32 kEndOfInput = -1; 92 93 // Ensures that the buffer_cursor_ points to the code_unit at 94 // position pos_ of the input, if possible. If the position 95 // is at or after the end of the input, return false. If there 96 // are more code_units available, return true. 97 virtual bool ReadBlock() = 0; 98 virtual unsigned SlowSeekForward(unsigned code_unit_count) = 0; 99 100 const uint16_t* buffer_cursor_; 101 const uint16_t* buffer_end_; 102 unsigned pos_; 103 }; 104 105 106 // --------------------------------------------------------------------- 107 // Caching predicates used by scanners. 108 109 class UnicodeCache { 110 public: 111 UnicodeCache() {} 112 typedef unibrow::Utf8Decoder<512> Utf8Decoder; 113 114 StaticResource<Utf8Decoder>* utf8_decoder() { 115 return &utf8_decoder_; 116 } 117 118 bool IsIdentifierStart(unibrow::uchar c) { return kIsIdentifierStart.get(c); } 119 bool IsIdentifierPart(unibrow::uchar c) { return kIsIdentifierPart.get(c); } 120 bool IsLineTerminator(unibrow::uchar c) { return kIsLineTerminator.get(c); } 121 bool IsWhiteSpace(unibrow::uchar c) { return kIsWhiteSpace.get(c); } 122 bool IsWhiteSpaceOrLineTerminator(unibrow::uchar c) { 123 return kIsWhiteSpaceOrLineTerminator.get(c); 124 } 125 126 private: 127 unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart; 128 unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart; 129 unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; 130 unibrow::Predicate<WhiteSpace, 128> kIsWhiteSpace; 131 unibrow::Predicate<WhiteSpaceOrLineTerminator, 128> 132 kIsWhiteSpaceOrLineTerminator; 133 StaticResource<Utf8Decoder> utf8_decoder_; 134 135 DISALLOW_COPY_AND_ASSIGN(UnicodeCache); 136 }; 137 138 139 // --------------------------------------------------------------------- 140 // DuplicateFinder discovers duplicate symbols. 141 142 class DuplicateFinder { 143 public: 144 explicit DuplicateFinder(UnicodeCache* constants) 145 : unicode_constants_(constants), 146 backing_store_(16), 147 map_(&Match) { } 148 149 int AddOneByteSymbol(Vector<const uint8_t> key, int value); 150 int AddTwoByteSymbol(Vector<const uint16_t> key, int value); 151 // Add a a number literal by converting it (if necessary) 152 // to the string that ToString(ToNumber(literal)) would generate. 153 // and then adding that string with AddAsciiSymbol. 154 // This string is the actual value used as key in an object literal, 155 // and the one that must be different from the other keys. 156 int AddNumber(Vector<const uint8_t> key, int value); 157 158 private: 159 int AddSymbol(Vector<const uint8_t> key, bool is_one_byte, int value); 160 // Backs up the key and its length in the backing store. 161 // The backup is stored with a base 127 encoding of the 162 // length (plus a bit saying whether the string is one byte), 163 // followed by the bytes of the key. 164 uint8_t* BackupKey(Vector<const uint8_t> key, bool is_one_byte); 165 166 // Compare two encoded keys (both pointing into the backing store) 167 // for having the same base-127 encoded lengths and ASCII-ness, 168 // and then having the same 'length' bytes following. 169 static bool Match(void* first, void* second); 170 // Creates a hash from a sequence of bytes. 171 static uint32_t Hash(Vector<const uint8_t> key, bool is_one_byte); 172 // Checks whether a string containing a JS number is its canonical 173 // form. 174 static bool IsNumberCanonical(Vector<const uint8_t> key); 175 176 // Size of buffer. Sufficient for using it to call DoubleToCString in 177 // from conversions.h. 178 static const int kBufferSize = 100; 179 180 UnicodeCache* unicode_constants_; 181 // Backing store used to store strings used as hashmap keys. 182 SequenceCollector<unsigned char> backing_store_; 183 HashMap map_; 184 // Buffer used for string->number->canonical string conversions. 185 char number_buffer_[kBufferSize]; 186 }; 187 188 189 // ---------------------------------------------------------------------------- 190 // LiteralBuffer - Collector of chars of literals. 191 192 class LiteralBuffer { 193 public: 194 LiteralBuffer() : is_one_byte_(true), position_(0), backing_store_() { } 195 196 ~LiteralBuffer() { 197 if (backing_store_.length() > 0) { 198 backing_store_.Dispose(); 199 } 200 } 201 202 INLINE(void AddChar(uint32_t code_unit)) { 203 if (position_ >= backing_store_.length()) ExpandBuffer(); 204 if (is_one_byte_) { 205 if (code_unit <= unibrow::Latin1::kMaxChar) { 206 backing_store_[position_] = static_cast<byte>(code_unit); 207 position_ += kOneByteSize; 208 return; 209 } 210 ConvertToTwoByte(); 211 } 212 ASSERT(code_unit < 0x10000u); 213 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit; 214 position_ += kUC16Size; 215 } 216 217 bool is_one_byte() { return is_one_byte_; } 218 219 bool is_contextual_keyword(Vector<const char> keyword) { 220 return is_one_byte() && keyword.length() == position_ && 221 (memcmp(keyword.start(), backing_store_.start(), position_) == 0); 222 } 223 224 Vector<const uint16_t> two_byte_literal() { 225 ASSERT(!is_one_byte_); 226 ASSERT((position_ & 0x1) == 0); 227 return Vector<const uint16_t>( 228 reinterpret_cast<const uint16_t*>(backing_store_.start()), 229 position_ >> 1); 230 } 231 232 Vector<const uint8_t> one_byte_literal() { 233 ASSERT(is_one_byte_); 234 return Vector<const uint8_t>( 235 reinterpret_cast<const uint8_t*>(backing_store_.start()), 236 position_); 237 } 238 239 int length() { 240 return is_one_byte_ ? position_ : (position_ >> 1); 241 } 242 243 void Reset() { 244 position_ = 0; 245 is_one_byte_ = true; 246 } 247 248 private: 249 static const int kInitialCapacity = 16; 250 static const int kGrowthFactory = 4; 251 static const int kMinConversionSlack = 256; 252 static const int kMaxGrowth = 1 * MB; 253 inline int NewCapacity(int min_capacity) { 254 int capacity = Max(min_capacity, backing_store_.length()); 255 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth); 256 return new_capacity; 257 } 258 259 void ExpandBuffer() { 260 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity)); 261 MemCopy(new_store.start(), backing_store_.start(), position_); 262 backing_store_.Dispose(); 263 backing_store_ = new_store; 264 } 265 266 void ConvertToTwoByte() { 267 ASSERT(is_one_byte_); 268 Vector<byte> new_store; 269 int new_content_size = position_ * kUC16Size; 270 if (new_content_size >= backing_store_.length()) { 271 // Ensure room for all currently read code units as UC16 as well 272 // as the code unit about to be stored. 273 new_store = Vector<byte>::New(NewCapacity(new_content_size)); 274 } else { 275 new_store = backing_store_; 276 } 277 uint8_t* src = backing_store_.start(); 278 uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start()); 279 for (int i = position_ - 1; i >= 0; i--) { 280 dst[i] = src[i]; 281 } 282 if (new_store.start() != backing_store_.start()) { 283 backing_store_.Dispose(); 284 backing_store_ = new_store; 285 } 286 position_ = new_content_size; 287 is_one_byte_ = false; 288 } 289 290 bool is_one_byte_; 291 int position_; 292 Vector<byte> backing_store_; 293 294 DISALLOW_COPY_AND_ASSIGN(LiteralBuffer); 295 }; 296 297 298 // ---------------------------------------------------------------------------- 299 // JavaScript Scanner. 300 301 class Scanner { 302 public: 303 // Scoped helper for literal recording. Automatically drops the literal 304 // if aborting the scanning before it's complete. 305 class LiteralScope { 306 public: 307 explicit LiteralScope(Scanner* self) 308 : scanner_(self), complete_(false) { 309 scanner_->StartLiteral(); 310 } 311 ~LiteralScope() { 312 if (!complete_) scanner_->DropLiteral(); 313 } 314 void Complete() { 315 scanner_->TerminateLiteral(); 316 complete_ = true; 317 } 318 319 private: 320 Scanner* scanner_; 321 bool complete_; 322 }; 323 324 // Representation of an interval of source positions. 325 struct Location { 326 Location(int b, int e) : beg_pos(b), end_pos(e) { } 327 Location() : beg_pos(0), end_pos(0) { } 328 329 bool IsValid() const { 330 return beg_pos >= 0 && end_pos >= beg_pos; 331 } 332 333 static Location invalid() { return Location(-1, -1); } 334 335 int beg_pos; 336 int end_pos; 337 }; 338 339 // -1 is outside of the range of any real source code. 340 static const int kNoOctalLocation = -1; 341 342 explicit Scanner(UnicodeCache* scanner_contants); 343 344 void Initialize(Utf16CharacterStream* source); 345 346 // Returns the next token and advances input. 347 Token::Value Next(); 348 // Returns the current token again. 349 Token::Value current_token() { return current_.token; } 350 // Returns the location information for the current token 351 // (the token last returned by Next()). 352 Location location() const { return current_.location; } 353 354 // Similar functions for the upcoming token. 355 356 // One token look-ahead (past the token returned by Next()). 357 Token::Value peek() const { return next_.token; } 358 359 Location peek_location() const { return next_.location; } 360 361 bool literal_contains_escapes() const { 362 Location location = current_.location; 363 int source_length = (location.end_pos - location.beg_pos); 364 if (current_.token == Token::STRING) { 365 // Subtract delimiters. 366 source_length -= 2; 367 } 368 return current_.literal_chars->length() != source_length; 369 } 370 bool is_literal_contextual_keyword(Vector<const char> keyword) { 371 ASSERT_NOT_NULL(current_.literal_chars); 372 return current_.literal_chars->is_contextual_keyword(keyword); 373 } 374 bool is_next_contextual_keyword(Vector<const char> keyword) { 375 ASSERT_NOT_NULL(next_.literal_chars); 376 return next_.literal_chars->is_contextual_keyword(keyword); 377 } 378 379 Handle<String> AllocateNextLiteralString(Isolate* isolate, 380 PretenureFlag tenured); 381 Handle<String> AllocateInternalizedString(Isolate* isolate); 382 383 double DoubleValue(); 384 bool UnescapedLiteralMatches(const char* data, int length) { 385 if (is_literal_one_byte() && 386 literal_length() == length && 387 !literal_contains_escapes()) { 388 const char* token = 389 reinterpret_cast<const char*>(literal_one_byte_string().start()); 390 return !strncmp(token, data, length); 391 } 392 return false; 393 } 394 void IsGetOrSet(bool* is_get, bool* is_set) { 395 if (is_literal_one_byte() && 396 literal_length() == 3 && 397 !literal_contains_escapes()) { 398 const char* token = 399 reinterpret_cast<const char*>(literal_one_byte_string().start()); 400 *is_get = strncmp(token, "get", 3) == 0; 401 *is_set = !*is_get && strncmp(token, "set", 3) == 0; 402 } 403 } 404 405 int FindNumber(DuplicateFinder* finder, int value); 406 int FindSymbol(DuplicateFinder* finder, int value); 407 408 UnicodeCache* unicode_cache() { return unicode_cache_; } 409 410 // Returns the location of the last seen octal literal. 411 Location octal_position() const { return octal_pos_; } 412 void clear_octal_position() { octal_pos_ = Location::invalid(); } 413 414 // Seek forward to the given position. This operation does not 415 // work in general, for instance when there are pushed back 416 // characters, but works for seeking forward until simple delimiter 417 // tokens, which is what it is used for. 418 void SeekForward(int pos); 419 420 bool HarmonyScoping() const { 421 return harmony_scoping_; 422 } 423 void SetHarmonyScoping(bool scoping) { 424 harmony_scoping_ = scoping; 425 } 426 bool HarmonyModules() const { 427 return harmony_modules_; 428 } 429 void SetHarmonyModules(bool modules) { 430 harmony_modules_ = modules; 431 } 432 bool HarmonyNumericLiterals() const { 433 return harmony_numeric_literals_; 434 } 435 void SetHarmonyNumericLiterals(bool numeric_literals) { 436 harmony_numeric_literals_ = numeric_literals; 437 } 438 439 // Returns true if there was a line terminator before the peek'ed token, 440 // possibly inside a multi-line comment. 441 bool HasAnyLineTerminatorBeforeNext() const { 442 return has_line_terminator_before_next_ || 443 has_multiline_comment_before_next_; 444 } 445 446 // Scans the input as a regular expression pattern, previous 447 // character(s) must be /(=). Returns true if a pattern is scanned. 448 bool ScanRegExpPattern(bool seen_equal); 449 // Returns true if regexp flags are scanned (always since flags can 450 // be empty). 451 bool ScanRegExpFlags(); 452 453 private: 454 // The current and look-ahead token. 455 struct TokenDesc { 456 Token::Value token; 457 Location location; 458 LiteralBuffer* literal_chars; 459 }; 460 461 static const int kCharacterLookaheadBufferSize = 1; 462 463 // Scans octal escape sequence. Also accepts "\0" decimal escape sequence. 464 uc32 ScanOctalEscape(uc32 c, int length); 465 466 // Call this after setting source_ to the input. 467 void Init() { 468 // Set c0_ (one character ahead) 469 STATIC_ASSERT(kCharacterLookaheadBufferSize == 1); 470 Advance(); 471 // Initialize current_ to not refer to a literal. 472 current_.literal_chars = NULL; 473 } 474 475 // Literal buffer support 476 inline void StartLiteral() { 477 LiteralBuffer* free_buffer = (current_.literal_chars == &literal_buffer1_) ? 478 &literal_buffer2_ : &literal_buffer1_; 479 free_buffer->Reset(); 480 next_.literal_chars = free_buffer; 481 } 482 483 INLINE(void AddLiteralChar(uc32 c)) { 484 ASSERT_NOT_NULL(next_.literal_chars); 485 next_.literal_chars->AddChar(c); 486 } 487 488 // Complete scanning of a literal. 489 inline void TerminateLiteral() { 490 // Does nothing in the current implementation. 491 } 492 493 // Stops scanning of a literal and drop the collected characters, 494 // e.g., due to an encountered error. 495 inline void DropLiteral() { 496 next_.literal_chars = NULL; 497 } 498 499 inline void AddLiteralCharAdvance() { 500 AddLiteralChar(c0_); 501 Advance(); 502 } 503 504 // Low-level scanning support. 505 void Advance() { c0_ = source_->Advance(); } 506 void PushBack(uc32 ch) { 507 source_->PushBack(c0_); 508 c0_ = ch; 509 } 510 511 inline Token::Value Select(Token::Value tok) { 512 Advance(); 513 return tok; 514 } 515 516 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) { 517 Advance(); 518 if (c0_ == next) { 519 Advance(); 520 return then; 521 } else { 522 return else_; 523 } 524 } 525 526 // Returns the literal string, if any, for the current token (the 527 // token last returned by Next()). The string is 0-terminated. 528 // Literal strings are collected for identifiers, strings, and 529 // numbers. 530 // These functions only give the correct result if the literal 531 // was scanned between calls to StartLiteral() and TerminateLiteral(). 532 Vector<const uint8_t> literal_one_byte_string() { 533 ASSERT_NOT_NULL(current_.literal_chars); 534 return current_.literal_chars->one_byte_literal(); 535 } 536 Vector<const uint16_t> literal_two_byte_string() { 537 ASSERT_NOT_NULL(current_.literal_chars); 538 return current_.literal_chars->two_byte_literal(); 539 } 540 bool is_literal_one_byte() { 541 ASSERT_NOT_NULL(current_.literal_chars); 542 return current_.literal_chars->is_one_byte(); 543 } 544 int literal_length() const { 545 ASSERT_NOT_NULL(current_.literal_chars); 546 return current_.literal_chars->length(); 547 } 548 // Returns the literal string for the next token (the token that 549 // would be returned if Next() were called). 550 Vector<const uint8_t> next_literal_one_byte_string() { 551 ASSERT_NOT_NULL(next_.literal_chars); 552 return next_.literal_chars->one_byte_literal(); 553 } 554 Vector<const uint16_t> next_literal_two_byte_string() { 555 ASSERT_NOT_NULL(next_.literal_chars); 556 return next_.literal_chars->two_byte_literal(); 557 } 558 bool is_next_literal_one_byte() { 559 ASSERT_NOT_NULL(next_.literal_chars); 560 return next_.literal_chars->is_one_byte(); 561 } 562 int next_literal_length() const { 563 ASSERT_NOT_NULL(next_.literal_chars); 564 return next_.literal_chars->length(); 565 } 566 567 uc32 ScanHexNumber(int expected_length); 568 569 // Scans a single JavaScript token. 570 void Scan(); 571 572 bool SkipWhiteSpace(); 573 Token::Value SkipSingleLineComment(); 574 Token::Value SkipMultiLineComment(); 575 // Scans a possible HTML comment -- begins with '<!'. 576 Token::Value ScanHtmlComment(); 577 578 void ScanDecimalDigits(); 579 Token::Value ScanNumber(bool seen_period); 580 Token::Value ScanIdentifierOrKeyword(); 581 Token::Value ScanIdentifierSuffix(LiteralScope* literal); 582 583 Token::Value ScanString(); 584 585 // Scans an escape-sequence which is part of a string and adds the 586 // decoded character to the current literal. Returns true if a pattern 587 // is scanned. 588 bool ScanEscape(); 589 // Decodes a Unicode escape-sequence which is part of an identifier. 590 // If the escape sequence cannot be decoded the result is kBadChar. 591 uc32 ScanIdentifierUnicodeEscape(); 592 // Scans a Unicode escape-sequence and adds its characters, 593 // uninterpreted, to the current literal. Used for parsing RegExp 594 // flags. 595 bool ScanLiteralUnicodeEscape(); 596 597 // Return the current source position. 598 int source_pos() { 599 return source_->pos() - kCharacterLookaheadBufferSize; 600 } 601 602 UnicodeCache* unicode_cache_; 603 604 // Buffers collecting literal strings, numbers, etc. 605 LiteralBuffer literal_buffer1_; 606 LiteralBuffer literal_buffer2_; 607 608 TokenDesc current_; // desc for current token (as returned by Next()) 609 TokenDesc next_; // desc for next token (one token look-ahead) 610 611 // Input stream. Must be initialized to an Utf16CharacterStream. 612 Utf16CharacterStream* source_; 613 614 615 // Start position of the octal literal last scanned. 616 Location octal_pos_; 617 618 // One Unicode character look-ahead; c0_ < 0 at the end of the input. 619 uc32 c0_; 620 621 // Whether there is a line terminator whitespace character after 622 // the current token, and before the next. Does not count newlines 623 // inside multiline comments. 624 bool has_line_terminator_before_next_; 625 // Whether there is a multi-line comment that contains a 626 // line-terminator after the current token, and before the next. 627 bool has_multiline_comment_before_next_; 628 // Whether we scan 'let' as a keyword for harmony block-scoped let bindings. 629 bool harmony_scoping_; 630 // Whether we scan 'module', 'import', 'export' as keywords. 631 bool harmony_modules_; 632 // Whether we scan 0o777 and 0b111 as numbers. 633 bool harmony_numeric_literals_; 634 }; 635 636 } } // namespace v8::internal 637 638 #endif // V8_SCANNER_H_ 639