1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "src/parsing/scanner-character-streams.h" 6 7 #include "include/v8.h" 8 #include "src/counters.h" 9 #include "src/globals.h" 10 #include "src/handles.h" 11 #include "src/objects-inl.h" 12 #include "src/parsing/scanner.h" 13 #include "src/unicode-inl.h" 14 15 namespace v8 { 16 namespace internal { 17 18 namespace { 19 const unibrow::uchar kUtf8Bom = 0xfeff; 20 } // namespace 21 22 // ---------------------------------------------------------------------------- 23 // BufferedUtf16CharacterStreams 24 // 25 // A buffered character stream based on a random access character 26 // source (ReadBlock can be called with pos() pointing to any position, 27 // even positions before the current). 28 class BufferedUtf16CharacterStream : public Utf16CharacterStream { 29 public: 30 BufferedUtf16CharacterStream(); 31 32 protected: 33 static const size_t kBufferSize = 512; 34 35 bool ReadBlock() override; 36 37 // FillBuffer should read up to kBufferSize characters at position and store 38 // them into buffer_[0..]. It returns the number of characters stored. 39 virtual size_t FillBuffer(size_t position) = 0; 40 41 // Fixed sized buffer that this class reads from. 42 // The base class' buffer_start_ should always point to buffer_. 43 uc16 buffer_[kBufferSize]; 44 }; 45 46 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream() 47 : Utf16CharacterStream(buffer_, buffer_, buffer_, 0) {} 48 49 bool BufferedUtf16CharacterStream::ReadBlock() { 50 DCHECK_EQ(buffer_start_, buffer_); 51 52 size_t position = pos(); 53 buffer_pos_ = position; 54 buffer_cursor_ = buffer_; 55 buffer_end_ = buffer_ + FillBuffer(position); 56 DCHECK_EQ(pos(), position); 57 DCHECK_LE(buffer_end_, buffer_start_ + kBufferSize); 58 return buffer_cursor_ < buffer_end_; 59 } 60 61 // ---------------------------------------------------------------------------- 62 // GenericStringUtf16CharacterStream. 63 // 64 // A stream w/ a data source being a (flattened) Handle<String>. 65 66 class GenericStringUtf16CharacterStream : public BufferedUtf16CharacterStream { 67 public: 68 GenericStringUtf16CharacterStream(Handle<String> data, size_t start_position, 69 size_t end_position); 70 71 protected: 72 size_t FillBuffer(size_t position) override; 73 74 Handle<String> string_; 75 size_t length_; 76 }; 77 78 GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream( 79 Handle<String> data, size_t start_position, size_t end_position) 80 : string_(data), length_(end_position) { 81 DCHECK_GE(end_position, start_position); 82 DCHECK_GE(static_cast<size_t>(string_->length()), 83 end_position - start_position); 84 buffer_pos_ = start_position; 85 } 86 87 size_t GenericStringUtf16CharacterStream::FillBuffer(size_t from_pos) { 88 if (from_pos >= length_) return 0; 89 90 size_t length = i::Min(kBufferSize, length_ - from_pos); 91 String::WriteToFlat<uc16>(*string_, buffer_, static_cast<int>(from_pos), 92 static_cast<int>(from_pos + length)); 93 return length; 94 } 95 96 // ---------------------------------------------------------------------------- 97 // ExternalTwoByteStringUtf16CharacterStream. 98 // 99 // A stream whose data source is a Handle<ExternalTwoByteString>. It avoids 100 // all data copying. 101 102 class ExternalTwoByteStringUtf16CharacterStream : public Utf16CharacterStream { 103 public: 104 ExternalTwoByteStringUtf16CharacterStream(Handle<ExternalTwoByteString> data, 105 size_t start_position, 106 size_t end_position); 107 108 private: 109 bool ReadBlock() override; 110 111 const uc16* raw_data_; // Pointer to the actual array of characters. 112 size_t start_pos_; 113 size_t end_pos_; 114 }; 115 116 ExternalTwoByteStringUtf16CharacterStream:: 117 ExternalTwoByteStringUtf16CharacterStream( 118 Handle<ExternalTwoByteString> data, size_t start_position, 119 size_t end_position) 120 : raw_data_(data->GetTwoByteData(static_cast<int>(start_position))), 121 start_pos_(start_position), 122 end_pos_(end_position) { 123 buffer_start_ = raw_data_; 124 buffer_cursor_ = raw_data_; 125 buffer_end_ = raw_data_ + (end_pos_ - start_pos_); 126 buffer_pos_ = start_pos_; 127 } 128 129 bool ExternalTwoByteStringUtf16CharacterStream::ReadBlock() { 130 size_t position = pos(); 131 bool have_data = start_pos_ <= position && position < end_pos_; 132 if (have_data) { 133 buffer_pos_ = start_pos_; 134 buffer_cursor_ = raw_data_ + (position - start_pos_), 135 buffer_end_ = raw_data_ + (end_pos_ - start_pos_); 136 } else { 137 buffer_pos_ = position; 138 buffer_cursor_ = raw_data_; 139 buffer_end_ = raw_data_; 140 } 141 return have_data; 142 } 143 144 // ---------------------------------------------------------------------------- 145 // ExternalOneByteStringUtf16CharacterStream 146 // 147 // A stream whose data source is a Handle<ExternalOneByteString>. 148 149 class ExternalOneByteStringUtf16CharacterStream 150 : public BufferedUtf16CharacterStream { 151 public: 152 ExternalOneByteStringUtf16CharacterStream(Handle<ExternalOneByteString> data, 153 size_t start_position, 154 size_t end_position); 155 156 // For testing: 157 ExternalOneByteStringUtf16CharacterStream(const char* data, size_t length); 158 159 protected: 160 size_t FillBuffer(size_t position) override; 161 162 const uint8_t* raw_data_; // Pointer to the actual array of characters. 163 size_t length_; 164 }; 165 166 ExternalOneByteStringUtf16CharacterStream:: 167 ExternalOneByteStringUtf16CharacterStream( 168 Handle<ExternalOneByteString> data, size_t start_position, 169 size_t end_position) 170 : raw_data_(data->GetChars()), length_(end_position) { 171 DCHECK(end_position >= start_position); 172 buffer_pos_ = start_position; 173 } 174 175 ExternalOneByteStringUtf16CharacterStream:: 176 ExternalOneByteStringUtf16CharacterStream(const char* data, size_t length) 177 : raw_data_(reinterpret_cast<const uint8_t*>(data)), length_(length) {} 178 179 size_t ExternalOneByteStringUtf16CharacterStream::FillBuffer(size_t from_pos) { 180 if (from_pos >= length_) return 0; 181 182 size_t length = Min(kBufferSize, length_ - from_pos); 183 i::CopyCharsUnsigned(buffer_, raw_data_ + from_pos, length); 184 return length; 185 } 186 187 // ---------------------------------------------------------------------------- 188 // Utf8ExternalStreamingStream - chunked streaming of Utf-8 data. 189 // 190 // This implementation is fairly complex, since data arrives in chunks which 191 // may 'cut' arbitrarily into utf-8 characters. Also, seeking to a given 192 // character position is tricky because the byte position cannot be dericed 193 // from the character position. 194 195 class Utf8ExternalStreamingStream : public BufferedUtf16CharacterStream { 196 public: 197 Utf8ExternalStreamingStream( 198 ScriptCompiler::ExternalSourceStream* source_stream, 199 RuntimeCallStats* stats) 200 : current_({0, {0, 0, unibrow::Utf8::Utf8IncrementalBuffer(0)}}), 201 source_stream_(source_stream), 202 stats_(stats) {} 203 ~Utf8ExternalStreamingStream() override { 204 for (size_t i = 0; i < chunks_.size(); i++) delete[] chunks_[i].data; 205 } 206 207 protected: 208 size_t FillBuffer(size_t position) override; 209 210 private: 211 // A position within the data stream. It stores: 212 // - The 'physical' position (# of bytes in the stream), 213 // - the 'logical' position (# of ucs-2 characters, also within the stream), 214 // - a possibly incomplete utf-8 char at the current 'physical' position. 215 struct StreamPosition { 216 size_t bytes; 217 size_t chars; 218 unibrow::Utf8::Utf8IncrementalBuffer incomplete_char; 219 }; 220 221 // Position contains a StreamPosition and the index of the chunk the position 222 // points into. (The chunk_no could be derived from pos, but that'd be 223 // an expensive search through all chunks.) 224 struct Position { 225 size_t chunk_no; 226 StreamPosition pos; 227 }; 228 229 // A chunk in the list of chunks, containing: 230 // - The chunk data (data pointer and length), and 231 // - the position at the first byte of the chunk. 232 struct Chunk { 233 const uint8_t* data; 234 size_t length; 235 StreamPosition start; 236 }; 237 238 // Within the current chunk, skip forward from current_ towards position. 239 bool SkipToPosition(size_t position); 240 // Within the current chunk, fill the buffer_ (while it has capacity). 241 void FillBufferFromCurrentChunk(); 242 // Fetch a new chunk (assuming current_ is at the end of the current data). 243 bool FetchChunk(); 244 // Search through the chunks and set current_ to point to the given position. 245 // (This call is potentially expensive.) 246 void SearchPosition(size_t position); 247 248 std::vector<Chunk> chunks_; 249 Position current_; 250 ScriptCompiler::ExternalSourceStream* source_stream_; 251 RuntimeCallStats* stats_; 252 }; 253 254 bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) { 255 DCHECK_LE(current_.pos.chars, position); // We can only skip forward. 256 257 // Already there? Then return immediately. 258 if (current_.pos.chars == position) return true; 259 260 const Chunk& chunk = chunks_[current_.chunk_no]; 261 DCHECK(current_.pos.bytes >= chunk.start.bytes); 262 263 unibrow::Utf8::Utf8IncrementalBuffer incomplete_char = 264 chunk.start.incomplete_char; 265 size_t it = current_.pos.bytes - chunk.start.bytes; 266 size_t chars = chunk.start.chars; 267 while (it < chunk.length && chars < position) { 268 unibrow::uchar t = 269 unibrow::Utf8::ValueOfIncremental(chunk.data[it], &incomplete_char); 270 if (t == kUtf8Bom && current_.pos.chars == 0) { 271 // BOM detected at beginning of the stream. Don't copy it. 272 } else if (t != unibrow::Utf8::kIncomplete) { 273 chars++; 274 if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++; 275 } 276 it++; 277 } 278 279 current_.pos.bytes += it; 280 current_.pos.chars = chars; 281 current_.pos.incomplete_char = incomplete_char; 282 current_.chunk_no += (it == chunk.length); 283 284 return current_.pos.chars == position; 285 } 286 287 void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() { 288 DCHECK_LT(current_.chunk_no, chunks_.size()); 289 DCHECK_EQ(buffer_start_, buffer_cursor_); 290 DCHECK_LT(buffer_end_ + 1, buffer_start_ + kBufferSize); 291 292 const Chunk& chunk = chunks_[current_.chunk_no]; 293 294 // The buffer_ is writable, but buffer_*_ members are const. So we get a 295 // non-const pointer into buffer that points to the same char as buffer_end_. 296 uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_); 297 DCHECK_EQ(cursor, buffer_end_); 298 299 // If the current chunk is the last (empty) chunk we'll have to process 300 // any left-over, partial characters. 301 if (chunk.length == 0) { 302 unibrow::uchar t = 303 unibrow::Utf8::ValueOfIncrementalFinish(¤t_.pos.incomplete_char); 304 if (t != unibrow::Utf8::kBufferEmpty) { 305 DCHECK(t < unibrow::Utf16::kMaxNonSurrogateCharCode); 306 *cursor = static_cast<uc16>(t); 307 buffer_end_++; 308 current_.pos.chars++; 309 } 310 return; 311 } 312 313 unibrow::Utf8::Utf8IncrementalBuffer incomplete_char = 314 current_.pos.incomplete_char; 315 size_t it; 316 for (it = current_.pos.bytes - chunk.start.bytes; 317 it < chunk.length && cursor + 1 < buffer_start_ + kBufferSize; it++) { 318 unibrow::uchar t = 319 unibrow::Utf8::ValueOfIncremental(chunk.data[it], &incomplete_char); 320 if (t == unibrow::Utf8::kIncomplete) continue; 321 if (V8_LIKELY(t < kUtf8Bom)) { 322 *(cursor++) = static_cast<uc16>(t); // The by most frequent case. 323 } else if (t == kUtf8Bom && current_.pos.bytes + it == 2) { 324 // BOM detected at beginning of the stream. Don't copy it. 325 } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) { 326 *(cursor++) = static_cast<uc16>(t); 327 } else { 328 *(cursor++) = unibrow::Utf16::LeadSurrogate(t); 329 *(cursor++) = unibrow::Utf16::TrailSurrogate(t); 330 } 331 } 332 333 current_.pos.bytes = chunk.start.bytes + it; 334 current_.pos.chars += (cursor - buffer_end_); 335 current_.pos.incomplete_char = incomplete_char; 336 current_.chunk_no += (it == chunk.length); 337 338 buffer_end_ = cursor; 339 } 340 341 bool Utf8ExternalStreamingStream::FetchChunk() { 342 RuntimeCallTimerScope scope(stats_, &RuntimeCallStats::GetMoreDataCallback); 343 DCHECK_EQ(current_.chunk_no, chunks_.size()); 344 DCHECK(chunks_.empty() || chunks_.back().length != 0); 345 346 const uint8_t* chunk = nullptr; 347 size_t length = source_stream_->GetMoreData(&chunk); 348 chunks_.push_back({chunk, length, current_.pos}); 349 return length > 0; 350 } 351 352 void Utf8ExternalStreamingStream::SearchPosition(size_t position) { 353 // If current_ already points to the right position, we're done. 354 // 355 // This is expected to be the common case, since we typically call 356 // FillBuffer right after the current buffer. 357 if (current_.pos.chars == position) return; 358 359 // No chunks. Fetch at least one, so we can assume !chunks_.empty() below. 360 if (chunks_.empty()) { 361 DCHECK_EQ(current_.chunk_no, 0u); 362 DCHECK_EQ(current_.pos.bytes, 0u); 363 DCHECK_EQ(current_.pos.chars, 0u); 364 FetchChunk(); 365 } 366 367 // Search for the last chunk whose start position is less or equal to 368 // position. 369 size_t chunk_no = chunks_.size() - 1; 370 while (chunk_no > 0 && chunks_[chunk_no].start.chars > position) { 371 chunk_no--; 372 } 373 374 // Did we find the terminating (zero-length) chunk? Then we're seeking 375 // behind the end of the data, and position does not exist. 376 // Set current_ to point to the terminating chunk. 377 if (chunks_[chunk_no].length == 0) { 378 current_ = {chunk_no, chunks_[chunk_no].start}; 379 return; 380 } 381 382 // Did we find the non-last chunk? Then our position must be within chunk_no. 383 if (chunk_no + 1 < chunks_.size()) { 384 // Fancy-pants optimization for ASCII chunks within a utf-8 stream. 385 // (Many web sites declare utf-8 encoding, but use only (or almost only) the 386 // ASCII subset for their JavaScript sources. We can exploit this, by 387 // checking whether the # bytes in a chunk are equal to the # chars, and if 388 // so avoid the expensive SkipToPosition.) 389 bool ascii_only_chunk = 390 (chunks_[chunk_no + 1].start.bytes - chunks_[chunk_no].start.bytes) == 391 (chunks_[chunk_no + 1].start.chars - chunks_[chunk_no].start.chars); 392 if (ascii_only_chunk) { 393 size_t skip = position - chunks_[chunk_no].start.chars; 394 current_ = {chunk_no, 395 {chunks_[chunk_no].start.bytes + skip, 396 chunks_[chunk_no].start.chars + skip, 397 unibrow::Utf8::Utf8IncrementalBuffer(0)}}; 398 } else { 399 current_ = {chunk_no, chunks_[chunk_no].start}; 400 SkipToPosition(position); 401 } 402 403 // Since position was within the chunk, SkipToPosition should have found 404 // something. 405 DCHECK_EQ(position, current_.pos.chars); 406 return; 407 } 408 409 // What's left: We're in the last, non-terminating chunk. Our position 410 // may be in the chunk, but it may also be in 'future' chunks, which we'll 411 // have to obtain. 412 DCHECK_EQ(chunk_no, chunks_.size() - 1); 413 current_ = {chunk_no, chunks_[chunk_no].start}; 414 bool have_more_data = true; 415 bool found = SkipToPosition(position); 416 while (have_more_data && !found) { 417 DCHECK_EQ(current_.chunk_no, chunks_.size()); 418 have_more_data = FetchChunk(); 419 found = have_more_data && SkipToPosition(position); 420 } 421 422 // We'll return with a postion != the desired position only if we're out 423 // of data. In that case, we'll point to the terminating chunk. 424 DCHECK_EQ(found, current_.pos.chars == position); 425 DCHECK_EQ(have_more_data, chunks_.back().length != 0); 426 DCHECK_IMPLIES(!found, !have_more_data); 427 DCHECK_IMPLIES(!found, current_.chunk_no == chunks_.size() - 1); 428 } 429 430 size_t Utf8ExternalStreamingStream::FillBuffer(size_t position) { 431 buffer_cursor_ = buffer_; 432 buffer_end_ = buffer_; 433 434 SearchPosition(position); 435 bool out_of_data = current_.chunk_no != chunks_.size() && 436 chunks_[current_.chunk_no].length == 0; 437 if (out_of_data) return 0; 438 439 // Fill the buffer, until we have at least one char (or are out of data). 440 // (The embedder might give us 1-byte blocks within a utf-8 char, so we 441 // can't guarantee progress with one chunk. Thus we iterate.) 442 while (!out_of_data && buffer_cursor_ == buffer_end_) { 443 // At end of current data, but there might be more? Then fetch it. 444 if (current_.chunk_no == chunks_.size()) { 445 out_of_data = !FetchChunk(); 446 } 447 FillBufferFromCurrentChunk(); 448 } 449 450 DCHECK_EQ(current_.pos.chars - position, 451 static_cast<size_t>(buffer_end_ - buffer_cursor_)); 452 return buffer_end_ - buffer_cursor_; 453 } 454 455 // ---------------------------------------------------------------------------- 456 // Chunks - helper for One- + TwoByteExternalStreamingStream 457 namespace { 458 459 struct Chunk { 460 const uint8_t* data; 461 size_t byte_length; 462 size_t byte_pos; 463 }; 464 465 typedef std::vector<struct Chunk> Chunks; 466 467 void DeleteChunks(Chunks& chunks) { 468 for (size_t i = 0; i < chunks.size(); i++) delete[] chunks[i].data; 469 } 470 471 // Return the chunk index for the chunk containing position. 472 // If position is behind the end of the stream, the index of the last, 473 // zero-length chunk is returned. 474 size_t FindChunk(Chunks& chunks, ScriptCompiler::ExternalSourceStream* source, 475 size_t position, RuntimeCallStats* stats) { 476 size_t end_pos = 477 chunks.empty() ? 0 : (chunks.back().byte_pos + chunks.back().byte_length); 478 479 // Get more data if needed. We usually won't enter the loop body. 480 bool out_of_data = !chunks.empty() && chunks.back().byte_length == 0; 481 { 482 RuntimeCallTimerScope scope(stats, &RuntimeCallStats::GetMoreDataCallback); 483 while (!out_of_data && end_pos <= position + 1) { 484 const uint8_t* chunk = nullptr; 485 size_t len = source->GetMoreData(&chunk); 486 487 chunks.push_back({chunk, len, end_pos}); 488 end_pos += len; 489 out_of_data = (len == 0); 490 } 491 } 492 493 // Here, we should always have at least one chunk, and we either have the 494 // chunk we were looking for, or we're out of data. Also, out_of_data and 495 // end_pos are current (and designate whether we have exhausted the stream, 496 // and the length of data received so far, respectively). 497 DCHECK(!chunks.empty()); 498 DCHECK_EQ(end_pos, chunks.back().byte_pos + chunks.back().byte_length); 499 DCHECK_EQ(out_of_data, chunks.back().byte_length == 0); 500 DCHECK(position < end_pos || out_of_data); 501 502 // Edge case: position is behind the end of stream: Return the last (length 0) 503 // chunk to indicate the end of the stream. 504 if (position >= end_pos) { 505 DCHECK(out_of_data); 506 return chunks.size() - 1; 507 } 508 509 // We almost always 'stream', meaning we want data from the last chunk, so 510 // let's look at chunks back-to-front. 511 size_t chunk_no = chunks.size() - 1; 512 while (chunks[chunk_no].byte_pos > position) { 513 DCHECK_NE(chunk_no, 0u); 514 chunk_no--; 515 } 516 DCHECK_LE(chunks[chunk_no].byte_pos, position); 517 DCHECK_LT(position, chunks[chunk_no].byte_pos + chunks[chunk_no].byte_length); 518 return chunk_no; 519 } 520 521 } // anonymous namespace 522 523 // ---------------------------------------------------------------------------- 524 // OneByteExternalStreamingStream 525 // 526 // A stream of latin-1 encoded, chunked data. 527 528 class OneByteExternalStreamingStream : public BufferedUtf16CharacterStream { 529 public: 530 explicit OneByteExternalStreamingStream( 531 ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats) 532 : source_(source), stats_(stats) {} 533 ~OneByteExternalStreamingStream() override { DeleteChunks(chunks_); } 534 535 protected: 536 size_t FillBuffer(size_t position) override; 537 538 private: 539 Chunks chunks_; 540 ScriptCompiler::ExternalSourceStream* source_; 541 RuntimeCallStats* stats_; 542 }; 543 544 size_t OneByteExternalStreamingStream::FillBuffer(size_t position) { 545 const Chunk& chunk = chunks_[FindChunk(chunks_, source_, position, stats_)]; 546 if (chunk.byte_length == 0) return 0; 547 548 size_t start_pos = position - chunk.byte_pos; 549 size_t len = i::Min(kBufferSize, chunk.byte_length - start_pos); 550 i::CopyCharsUnsigned(buffer_, chunk.data + start_pos, len); 551 return len; 552 } 553 554 #if !(V8_TARGET_ARCH_MIPS || V8_TARGET_ARCH_MIPS64) 555 // ---------------------------------------------------------------------------- 556 // TwoByteExternalStreamingStream 557 // 558 // A stream of ucs-2 data, delivered in chunks. Chunks may be 'cut' into the 559 // middle of characters (or even contain only one byte), which adds a bit 560 // of complexity. This stream avoid all data copying, except for characters 561 // that cross chunk boundaries. 562 563 class TwoByteExternalStreamingStream : public Utf16CharacterStream { 564 public: 565 explicit TwoByteExternalStreamingStream( 566 ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats); 567 ~TwoByteExternalStreamingStream() override; 568 569 protected: 570 bool ReadBlock() override; 571 572 Chunks chunks_; 573 ScriptCompiler::ExternalSourceStream* source_; 574 RuntimeCallStats* stats_; 575 uc16 one_char_buffer_; 576 }; 577 578 TwoByteExternalStreamingStream::TwoByteExternalStreamingStream( 579 ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats) 580 : Utf16CharacterStream(&one_char_buffer_, &one_char_buffer_, 581 &one_char_buffer_, 0), 582 source_(source), 583 stats_(stats), 584 one_char_buffer_(0) {} 585 586 TwoByteExternalStreamingStream::~TwoByteExternalStreamingStream() { 587 DeleteChunks(chunks_); 588 } 589 590 bool TwoByteExternalStreamingStream::ReadBlock() { 591 size_t position = pos(); 592 593 // We'll search for the 2nd byte of our character, to make sure we 594 // have enough data for at least one character. 595 size_t chunk_no = FindChunk(chunks_, source_, 2 * position + 1, stats_); 596 597 // Out of data? Return 0. 598 if (chunks_[chunk_no].byte_length == 0) { 599 buffer_cursor_ = buffer_start_; 600 buffer_end_ = buffer_start_; 601 return false; 602 } 603 604 Chunk& current = chunks_[chunk_no]; 605 606 // Annoying edge case: Chunks may not be 2-byte aligned, meaning that a 607 // character may be split between the previous and the current chunk. 608 // If we find such a lonely byte at the beginning of the chunk, we'll use 609 // one_char_buffer_ to hold the full character. 610 bool lonely_byte = (chunks_[chunk_no].byte_pos == (2 * position + 1)); 611 if (lonely_byte) { 612 DCHECK_NE(chunk_no, 0u); 613 Chunk& previous_chunk = chunks_[chunk_no - 1]; 614 #ifdef V8_TARGET_BIG_ENDIAN 615 uc16 character = current.data[0] | 616 previous_chunk.data[previous_chunk.byte_length - 1] << 8; 617 #else 618 uc16 character = previous_chunk.data[previous_chunk.byte_length - 1] | 619 current.data[0] << 8; 620 #endif 621 622 one_char_buffer_ = character; 623 buffer_pos_ = position; 624 buffer_start_ = &one_char_buffer_; 625 buffer_cursor_ = &one_char_buffer_; 626 buffer_end_ = &one_char_buffer_ + 1; 627 return true; 628 } 629 630 // Common case: character is in current chunk. 631 DCHECK_LE(current.byte_pos, 2 * position); 632 DCHECK_LT(2 * position + 1, current.byte_pos + current.byte_length); 633 634 // Determine # of full ucs-2 chars in stream, and whether we started on an odd 635 // byte boundary. 636 bool odd_start = (current.byte_pos % 2) == 1; 637 size_t number_chars = (current.byte_length - odd_start) / 2; 638 639 // Point the buffer_*_ members into the current chunk and set buffer_cursor_ 640 // to point to position. Be careful when converting the byte positions (in 641 // Chunk) to the ucs-2 character positions (in buffer_*_ members). 642 buffer_start_ = reinterpret_cast<const uint16_t*>(current.data + odd_start); 643 buffer_end_ = buffer_start_ + number_chars; 644 buffer_pos_ = (current.byte_pos + odd_start) / 2; 645 buffer_cursor_ = buffer_start_ + (position - buffer_pos_); 646 DCHECK_EQ(position, pos()); 647 return true; 648 } 649 650 #else 651 652 // ---------------------------------------------------------------------------- 653 // TwoByteExternalBufferedStream 654 // 655 // This class is made specifically to address unaligned access to 16-bit data 656 // in MIPS and ARM architectures. It replaces class 657 // TwoByteExternalStreamingStream which in some cases does have unaligned 658 // accesse to 16-bit data 659 660 class TwoByteExternalBufferedStream : public Utf16CharacterStream { 661 public: 662 explicit TwoByteExternalBufferedStream( 663 ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats); 664 ~TwoByteExternalBufferedStream(); 665 666 protected: 667 static const size_t kBufferSize = 512; 668 669 bool ReadBlock() override; 670 671 // FillBuffer should read up to kBufferSize characters at position and store 672 // them into buffer_[0..]. It returns the number of characters stored. 673 size_t FillBuffer(size_t position, size_t chunk_no); 674 675 // Fixed sized buffer that this class reads from. 676 // The base class' buffer_start_ should always point to buffer_. 677 uc16 buffer_[kBufferSize]; 678 679 Chunks chunks_; 680 ScriptCompiler::ExternalSourceStream* source_; 681 RuntimeCallStats* stats_; 682 }; 683 684 TwoByteExternalBufferedStream::TwoByteExternalBufferedStream( 685 ScriptCompiler::ExternalSourceStream* source, RuntimeCallStats* stats) 686 : Utf16CharacterStream(buffer_, buffer_, buffer_, 0), 687 source_(source), 688 stats_(stats) {} 689 690 TwoByteExternalBufferedStream::~TwoByteExternalBufferedStream() { 691 DeleteChunks(chunks_); 692 } 693 694 bool TwoByteExternalBufferedStream::ReadBlock() { 695 size_t position = pos(); 696 // Find chunk in which the position belongs 697 size_t chunk_no = FindChunk(chunks_, source_, 2 * position + 1, stats_); 698 699 // Out of data? Return 0. 700 if (chunks_[chunk_no].byte_length == 0) { 701 buffer_cursor_ = buffer_start_; 702 buffer_end_ = buffer_start_; 703 return false; 704 } 705 706 Chunk& current = chunks_[chunk_no]; 707 708 bool odd_start = current.byte_pos % 2; 709 // Common case: character is in current chunk. 710 DCHECK_LE(current.byte_pos, 2 * position + odd_start); 711 DCHECK_LT(2 * position + 1, current.byte_pos + current.byte_length); 712 713 // If character starts on odd address copy text in buffer so there is always 714 // aligned access to characters. This is important on MIPS and ARM 715 // architectures. Otherwise read characters from memory directly. 716 if (!odd_start) { 717 buffer_start_ = reinterpret_cast<const uint16_t*>(current.data); 718 size_t number_chars = current.byte_length / 2; 719 buffer_end_ = buffer_start_ + number_chars; 720 buffer_pos_ = current.byte_pos / 2; 721 buffer_cursor_ = buffer_start_ + (position - buffer_pos_); 722 DCHECK_EQ(position, pos()); 723 return true; 724 } else { 725 buffer_start_ = buffer_; 726 buffer_pos_ = position; 727 buffer_cursor_ = buffer_; 728 buffer_end_ = buffer_ + FillBuffer(position, chunk_no); 729 DCHECK_EQ(pos(), position); 730 DCHECK_LE(buffer_end_, buffer_start_ + kBufferSize); 731 return buffer_cursor_ < buffer_end_; 732 } 733 } 734 735 size_t TwoByteExternalBufferedStream::FillBuffer(size_t position, 736 size_t chunk_no) { 737 DCHECK_EQ(chunks_[chunk_no].byte_pos % 2, 1u); 738 bool odd_start = true; 739 // Align buffer_pos_ to the size of the buffer. 740 { 741 size_t new_pos = position / kBufferSize * kBufferSize; 742 if (new_pos != position) { 743 chunk_no = FindChunk(chunks_, source_, 2 * new_pos + 1, stats_); 744 buffer_pos_ = new_pos; 745 buffer_cursor_ = buffer_start_ + (position - buffer_pos_); 746 position = new_pos; 747 odd_start = chunks_[chunk_no].byte_pos % 2; 748 } 749 } 750 751 Chunk* current = &chunks_[chunk_no]; 752 753 // Annoying edge case: Chunks may not be 2-byte aligned, meaning that a 754 // character may be split between the previous and the current chunk. 755 // If we find such a lonely byte at the beginning of the chunk, we'll copy 756 // it to the first byte in buffer_. 757 size_t totalLength = 0; 758 bool lonely_byte = (current->byte_pos == (2 * position + 1)); 759 if (lonely_byte) { 760 DCHECK_NE(chunk_no, 0u); 761 Chunk& previous_chunk = chunks_[chunk_no - 1]; 762 *reinterpret_cast<uint8_t*>(buffer_) = 763 previous_chunk.data[previous_chunk.byte_length - 1]; 764 totalLength++; 765 } 766 767 // Common case: character is in current chunk. 768 DCHECK_LE(current->byte_pos, 2 * position + odd_start); 769 DCHECK_LT(2 * position + 1, current->byte_pos + current->byte_length); 770 771 // Copy characters from current chunk starting from chunk_pos to the end of 772 // buffer or chunk. 773 size_t chunk_pos = position - current->byte_pos / 2; 774 size_t start_offset = odd_start && chunk_pos != 0; 775 size_t bytes_to_move = 776 i::Min(2 * kBufferSize - lonely_byte, 777 current->byte_length - 2 * chunk_pos + start_offset); 778 i::MemMove(reinterpret_cast<uint8_t*>(buffer_) + lonely_byte, 779 current->data + 2 * chunk_pos - start_offset, bytes_to_move); 780 781 // Fill up the rest of the buffer if there is space and data left. 782 totalLength += bytes_to_move; 783 position = (current->byte_pos + current->byte_length) / 2; 784 if (position - buffer_pos_ < kBufferSize) { 785 chunk_no = FindChunk(chunks_, source_, 2 * position + 1, stats_); 786 current = &chunks_[chunk_no]; 787 odd_start = current->byte_pos % 2; 788 bytes_to_move = i::Min(2 * kBufferSize - totalLength, current->byte_length); 789 while (bytes_to_move) { 790 // Common case: character is in current chunk. 791 DCHECK_LE(current->byte_pos, 2 * position + odd_start); 792 DCHECK_LT(2 * position + 1, current->byte_pos + current->byte_length); 793 794 i::MemMove(reinterpret_cast<uint8_t*>(buffer_) + totalLength, 795 current->data, bytes_to_move); 796 totalLength += bytes_to_move; 797 position = (current->byte_pos + current->byte_length) / 2; 798 chunk_no = FindChunk(chunks_, source_, 2 * position + 1, stats_); 799 current = &chunks_[chunk_no]; 800 odd_start = current->byte_pos % 2; 801 bytes_to_move = 802 i::Min(2 * kBufferSize - totalLength, current->byte_length); 803 } 804 } 805 return totalLength / 2; 806 } 807 #endif 808 809 // ---------------------------------------------------------------------------- 810 // ScannerStream: Create stream instances. 811 812 Utf16CharacterStream* ScannerStream::For(Handle<String> data) { 813 return ScannerStream::For(data, 0, data->length()); 814 } 815 816 Utf16CharacterStream* ScannerStream::For(Handle<String> data, int start_pos, 817 int end_pos) { 818 DCHECK(start_pos >= 0); 819 DCHECK(end_pos <= data->length()); 820 if (data->IsExternalOneByteString()) { 821 return new ExternalOneByteStringUtf16CharacterStream( 822 Handle<ExternalOneByteString>::cast(data), start_pos, end_pos); 823 } else if (data->IsExternalTwoByteString()) { 824 return new ExternalTwoByteStringUtf16CharacterStream( 825 Handle<ExternalTwoByteString>::cast(data), start_pos, end_pos); 826 } else { 827 // TODO(vogelheim): Maybe call data.Flatten() first? 828 return new GenericStringUtf16CharacterStream(data, start_pos, end_pos); 829 } 830 } 831 832 std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting( 833 const char* data) { 834 return ScannerStream::ForTesting(data, strlen(data)); 835 } 836 837 std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting( 838 const char* data, size_t length) { 839 return std::unique_ptr<Utf16CharacterStream>( 840 new ExternalOneByteStringUtf16CharacterStream(data, length)); 841 } 842 843 Utf16CharacterStream* ScannerStream::For( 844 ScriptCompiler::ExternalSourceStream* source_stream, 845 v8::ScriptCompiler::StreamedSource::Encoding encoding, 846 RuntimeCallStats* stats) { 847 switch (encoding) { 848 case v8::ScriptCompiler::StreamedSource::TWO_BYTE: 849 #if !(V8_TARGET_ARCH_MIPS || V8_TARGET_ARCH_MIPS64) 850 return new TwoByteExternalStreamingStream(source_stream, stats); 851 #else 852 return new TwoByteExternalBufferedStream(source_stream, stats); 853 #endif 854 case v8::ScriptCompiler::StreamedSource::ONE_BYTE: 855 return new OneByteExternalStreamingStream(source_stream, stats); 856 case v8::ScriptCompiler::StreamedSource::UTF8: 857 return new Utf8ExternalStreamingStream(source_stream, stats); 858 } 859 UNREACHABLE(); 860 return nullptr; 861 } 862 863 } // namespace internal 864 } // namespace v8 865