1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "src/v8.h" 6 7 #include "src/scanner-character-streams.h" 8 9 #include "include/v8.h" 10 #include "src/handles.h" 11 #include "src/unicode-inl.h" 12 13 namespace v8 { 14 namespace internal { 15 16 namespace { 17 18 unsigned CopyCharsHelper(uint16_t* dest, unsigned length, const uint8_t* src, 19 unsigned* src_pos, unsigned src_length, 20 ScriptCompiler::StreamedSource::Encoding encoding) { 21 if (encoding == ScriptCompiler::StreamedSource::UTF8) { 22 return v8::internal::Utf8ToUtf16CharacterStream::CopyChars( 23 dest, length, src, src_pos, src_length); 24 } 25 26 unsigned to_fill = length; 27 if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos; 28 29 if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) { 30 v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill); 31 } else { 32 DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE); 33 v8::internal::CopyChars<uint16_t, uint16_t>( 34 dest, reinterpret_cast<const uint16_t*>(src + *src_pos), to_fill); 35 } 36 *src_pos += to_fill; 37 return to_fill; 38 } 39 40 } // namespace 41 42 43 // ---------------------------------------------------------------------------- 44 // BufferedUtf16CharacterStreams 45 46 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream() 47 : Utf16CharacterStream(), 48 pushback_limit_(NULL) { 49 // Initialize buffer as being empty. First read will fill the buffer. 50 buffer_cursor_ = buffer_; 51 buffer_end_ = buffer_; 52 } 53 54 55 BufferedUtf16CharacterStream::~BufferedUtf16CharacterStream() { } 56 57 void BufferedUtf16CharacterStream::PushBack(uc32 character) { 58 if (character == kEndOfInput) { 59 pos_--; 60 return; 61 } 62 if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) { 63 // buffer_ is writable, buffer_cursor_ is const pointer. 64 buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character); 65 pos_--; 66 return; 67 } 68 SlowPushBack(static_cast<uc16>(character)); 69 } 70 71 72 void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) { 73 // In pushback mode, the end of the buffer contains pushback, 74 // and the start of the buffer (from buffer start to pushback_limit_) 75 // contains valid data that comes just after the pushback. 76 // We NULL the pushback_limit_ if pushing all the way back to the 77 // start of the buffer. 78 79 if (pushback_limit_ == NULL) { 80 // Enter pushback mode. 81 pushback_limit_ = buffer_end_; 82 buffer_end_ = buffer_ + kBufferSize; 83 buffer_cursor_ = buffer_end_; 84 } 85 // Ensure that there is room for at least one pushback. 86 DCHECK(buffer_cursor_ > buffer_); 87 DCHECK(pos_ > 0); 88 buffer_[--buffer_cursor_ - buffer_] = character; 89 if (buffer_cursor_ == buffer_) { 90 pushback_limit_ = NULL; 91 } else if (buffer_cursor_ < pushback_limit_) { 92 pushback_limit_ = buffer_cursor_; 93 } 94 pos_--; 95 } 96 97 98 bool BufferedUtf16CharacterStream::ReadBlock() { 99 buffer_cursor_ = buffer_; 100 if (pushback_limit_ != NULL) { 101 // Leave pushback mode. 102 buffer_end_ = pushback_limit_; 103 pushback_limit_ = NULL; 104 // If there were any valid characters left at the 105 // start of the buffer, use those. 106 if (buffer_cursor_ < buffer_end_) return true; 107 // Otherwise read a new block. 108 } 109 unsigned length = FillBuffer(pos_); 110 buffer_end_ = buffer_ + length; 111 return length > 0; 112 } 113 114 115 unsigned BufferedUtf16CharacterStream::SlowSeekForward(unsigned delta) { 116 // Leave pushback mode (i.e., ignore that there might be valid data 117 // in the buffer before the pushback_limit_ point). 118 pushback_limit_ = NULL; 119 return BufferSeekForward(delta); 120 } 121 122 123 // ---------------------------------------------------------------------------- 124 // GenericStringUtf16CharacterStream 125 126 127 GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream( 128 Handle<String> data, 129 unsigned start_position, 130 unsigned end_position) 131 : string_(data), 132 length_(end_position) { 133 DCHECK(end_position >= start_position); 134 pos_ = start_position; 135 } 136 137 138 GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { } 139 140 141 unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) { 142 unsigned old_pos = pos_; 143 pos_ = Min(pos_ + delta, length_); 144 ReadBlock(); 145 return pos_ - old_pos; 146 } 147 148 149 unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos) { 150 if (from_pos >= length_) return 0; 151 unsigned length = kBufferSize; 152 if (from_pos + length > length_) { 153 length = length_ - from_pos; 154 } 155 String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length); 156 return length; 157 } 158 159 160 // ---------------------------------------------------------------------------- 161 // Utf8ToUtf16CharacterStream 162 Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data, 163 unsigned length) 164 : BufferedUtf16CharacterStream(), 165 raw_data_(data), 166 raw_data_length_(length), 167 raw_data_pos_(0), 168 raw_character_position_(0) { 169 ReadBlock(); 170 } 171 172 173 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { } 174 175 176 unsigned Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, unsigned length, 177 const byte* src, 178 unsigned* src_pos, 179 unsigned src_length) { 180 static const unibrow::uchar kMaxUtf16Character = 0xffff; 181 unsigned i = 0; 182 // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer 183 // one character early (in the normal case), because we need to have at least 184 // two free spaces in the buffer to be sure that the next character will fit. 185 while (i < length - 1) { 186 if (*src_pos == src_length) break; 187 unibrow::uchar c = src[*src_pos]; 188 if (c <= unibrow::Utf8::kMaxOneByteChar) { 189 *src_pos = *src_pos + 1; 190 } else { 191 c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos, 192 src_pos); 193 } 194 if (c > kMaxUtf16Character) { 195 dest[i++] = unibrow::Utf16::LeadSurrogate(c); 196 dest[i++] = unibrow::Utf16::TrailSurrogate(c); 197 } else { 198 dest[i++] = static_cast<uc16>(c); 199 } 200 } 201 return i; 202 } 203 204 205 unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) { 206 unsigned old_pos = pos_; 207 unsigned target_pos = pos_ + delta; 208 SetRawPosition(target_pos); 209 pos_ = raw_character_position_; 210 ReadBlock(); 211 return pos_ - old_pos; 212 } 213 214 215 unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position) { 216 SetRawPosition(char_position); 217 if (raw_character_position_ != char_position) { 218 // char_position was not a valid position in the stream (hit the end 219 // while spooling to it). 220 return 0u; 221 } 222 unsigned i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_, 223 raw_data_length_); 224 raw_character_position_ = char_position + i; 225 return i; 226 } 227 228 229 static const byte kUtf8MultiByteMask = 0xC0; 230 static const byte kUtf8MultiByteCharFollower = 0x80; 231 232 233 #ifdef DEBUG 234 static const byte kUtf8MultiByteCharStart = 0xC0; 235 static bool IsUtf8MultiCharacterStart(byte first_byte) { 236 return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart; 237 } 238 #endif 239 240 241 static bool IsUtf8MultiCharacterFollower(byte later_byte) { 242 return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower; 243 } 244 245 246 // Move the cursor back to point at the preceding UTF-8 character start 247 // in the buffer. 248 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) { 249 byte character = buffer[--*cursor]; 250 if (character > unibrow::Utf8::kMaxOneByteChar) { 251 DCHECK(IsUtf8MultiCharacterFollower(character)); 252 // Last byte of a multi-byte character encoding. Step backwards until 253 // pointing to the first byte of the encoding, recognized by having the 254 // top two bits set. 255 while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { } 256 DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor])); 257 } 258 } 259 260 261 // Move the cursor forward to point at the next following UTF-8 character start 262 // in the buffer. 263 static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) { 264 byte character = buffer[(*cursor)++]; 265 if (character > unibrow::Utf8::kMaxOneByteChar) { 266 // First character of a multi-byte character encoding. 267 // The number of most-significant one-bits determines the length of the 268 // encoding: 269 // 110..... - (0xCx, 0xDx) one additional byte (minimum). 270 // 1110.... - (0xEx) two additional bytes. 271 // 11110... - (0xFx) three additional bytes (maximum). 272 DCHECK(IsUtf8MultiCharacterStart(character)); 273 // Additional bytes is: 274 // 1 if value in range 0xC0 .. 0xDF. 275 // 2 if value in range 0xE0 .. 0xEF. 276 // 3 if value in range 0xF0 .. 0xF7. 277 // Encode that in a single value. 278 unsigned additional_bytes = 279 ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03; 280 *cursor += additional_bytes; 281 DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes])); 282 } 283 } 284 285 286 // This can't set a raw position between two surrogate pairs, since there 287 // is no position in the UTF8 stream that corresponds to that. This assumes 288 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence. If 289 // it is illegally coded as two 3 byte sequences then there is no problem here. 290 void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) { 291 if (raw_character_position_ > target_position) { 292 // Spool backwards in utf8 buffer. 293 do { 294 int old_pos = raw_data_pos_; 295 Utf8CharacterBack(raw_data_, &raw_data_pos_); 296 raw_character_position_--; 297 DCHECK(old_pos - raw_data_pos_ <= 4); 298 // Step back over both code units for surrogate pairs. 299 if (old_pos - raw_data_pos_ == 4) raw_character_position_--; 300 } while (raw_character_position_ > target_position); 301 // No surrogate pair splitting. 302 DCHECK(raw_character_position_ == target_position); 303 return; 304 } 305 // Spool forwards in the utf8 buffer. 306 while (raw_character_position_ < target_position) { 307 if (raw_data_pos_ == raw_data_length_) return; 308 int old_pos = raw_data_pos_; 309 Utf8CharacterForward(raw_data_, &raw_data_pos_); 310 raw_character_position_++; 311 DCHECK(raw_data_pos_ - old_pos <= 4); 312 if (raw_data_pos_ - old_pos == 4) raw_character_position_++; 313 } 314 // No surrogate pair splitting. 315 DCHECK(raw_character_position_ == target_position); 316 } 317 318 319 unsigned ExternalStreamingStream::FillBuffer(unsigned position) { 320 // Ignore "position" which is the position in the decoded data. Instead, 321 // ExternalStreamingStream keeps track of the position in the raw data. 322 unsigned data_in_buffer = 0; 323 // Note that the UTF-8 decoder might not be able to fill the buffer 324 // completely; it will typically leave the last character empty (see 325 // Utf8ToUtf16CharacterStream::CopyChars). 326 while (data_in_buffer < kBufferSize - 1) { 327 if (current_data_ == NULL) { 328 // GetSomeData will wait until the embedder has enough data. Here's an 329 // interface between the API which uses size_t (which is the correct type 330 // here) and the internal parts which use unsigned. TODO(marja): make the 331 // internal parts use size_t too. 332 current_data_length_ = 333 static_cast<unsigned>(source_stream_->GetMoreData(¤t_data_)); 334 current_data_offset_ = 0; 335 bool data_ends = current_data_length_ == 0; 336 337 // A caveat: a data chunk might end with bytes from an incomplete UTF-8 338 // character (the rest of the bytes will be in the next chunk). 339 if (encoding_ == ScriptCompiler::StreamedSource::UTF8) { 340 HandleUtf8SplitCharacters(&data_in_buffer); 341 if (!data_ends && current_data_offset_ == current_data_length_) { 342 // The data stream didn't end, but we used all the data in the 343 // chunk. This will only happen when the chunk was really small. We 344 // don't handle the case where a UTF-8 character is split over several 345 // chunks; in that case V8 won't crash, but it will be a parse error. 346 delete[] current_data_; 347 current_data_ = NULL; 348 current_data_length_ = 0; 349 current_data_offset_ = 0; 350 continue; // Request a new chunk. 351 } 352 } 353 354 // Did the data stream end? 355 if (data_ends) { 356 DCHECK(utf8_split_char_buffer_length_ == 0); 357 return data_in_buffer; 358 } 359 } 360 361 // Fill the buffer from current_data_. 362 unsigned new_offset = 0; 363 unsigned new_chars_in_buffer = 364 CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer, 365 current_data_ + current_data_offset_, &new_offset, 366 current_data_length_ - current_data_offset_, encoding_); 367 data_in_buffer += new_chars_in_buffer; 368 current_data_offset_ += new_offset; 369 DCHECK(data_in_buffer <= kBufferSize); 370 371 // Did we use all the data in the data chunk? 372 if (current_data_offset_ == current_data_length_) { 373 delete[] current_data_; 374 current_data_ = NULL; 375 current_data_length_ = 0; 376 current_data_offset_ = 0; 377 } 378 } 379 return data_in_buffer; 380 } 381 382 void ExternalStreamingStream::HandleUtf8SplitCharacters( 383 unsigned* data_in_buffer) { 384 // First check if we have leftover data from the last chunk. 385 unibrow::uchar c; 386 if (utf8_split_char_buffer_length_ > 0) { 387 // Move the bytes which are part of the split character (which started in 388 // the previous chunk) into utf8_split_char_buffer_. 389 while (current_data_offset_ < current_data_length_ && 390 utf8_split_char_buffer_length_ < 4 && 391 (c = current_data_[current_data_offset_]) > 392 unibrow::Utf8::kMaxOneByteChar) { 393 utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c; 394 ++utf8_split_char_buffer_length_; 395 ++current_data_offset_; 396 } 397 398 // Convert the data in utf8_split_char_buffer_. 399 unsigned new_offset = 0; 400 unsigned new_chars_in_buffer = 401 CopyCharsHelper(buffer_ + *data_in_buffer, 402 kBufferSize - *data_in_buffer, utf8_split_char_buffer_, 403 &new_offset, utf8_split_char_buffer_length_, encoding_); 404 *data_in_buffer += new_chars_in_buffer; 405 // Make sure we used all the data. 406 DCHECK(new_offset == utf8_split_char_buffer_length_); 407 DCHECK(*data_in_buffer <= kBufferSize); 408 409 utf8_split_char_buffer_length_ = 0; 410 } 411 412 // Move bytes which are part of an incomplete character from the end of the 413 // current chunk to utf8_split_char_buffer_. They will be converted when the 414 // next data chunk arrives. Note that all valid UTF-8 characters are at most 4 415 // bytes long, but if the data is invalid, we can have character values bigger 416 // than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes. 417 while (current_data_length_ > current_data_offset_ && 418 (c = current_data_[current_data_length_ - 1]) > 419 unibrow::Utf8::kMaxOneByteChar && 420 utf8_split_char_buffer_length_ < 4) { 421 --current_data_length_; 422 ++utf8_split_char_buffer_length_; 423 } 424 CHECK(utf8_split_char_buffer_length_ <= 4); 425 for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) { 426 utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i]; 427 } 428 } 429 430 431 // ---------------------------------------------------------------------------- 432 // ExternalTwoByteStringUtf16CharacterStream 433 434 ExternalTwoByteStringUtf16CharacterStream:: 435 ~ExternalTwoByteStringUtf16CharacterStream() { } 436 437 438 ExternalTwoByteStringUtf16CharacterStream 439 ::ExternalTwoByteStringUtf16CharacterStream( 440 Handle<ExternalTwoByteString> data, 441 int start_position, 442 int end_position) 443 : Utf16CharacterStream(), 444 source_(data), 445 raw_data_(data->GetTwoByteData(start_position)) { 446 buffer_cursor_ = raw_data_, 447 buffer_end_ = raw_data_ + (end_position - start_position); 448 pos_ = start_position; 449 } 450 451 } } // namespace v8::internal 452