Home | History | Annotate | Download | only in src
      1 // Copyright 2011 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "src/v8.h"
      6 
      7 #include "src/scanner-character-streams.h"
      8 
      9 #include "include/v8.h"
     10 #include "src/handles.h"
     11 #include "src/unicode-inl.h"
     12 
     13 namespace v8 {
     14 namespace internal {
     15 
     16 namespace {
     17 
     18 unsigned CopyCharsHelper(uint16_t* dest, unsigned length, const uint8_t* src,
     19                          unsigned* src_pos, unsigned src_length,
     20                          ScriptCompiler::StreamedSource::Encoding encoding) {
     21   if (encoding == ScriptCompiler::StreamedSource::UTF8) {
     22     return v8::internal::Utf8ToUtf16CharacterStream::CopyChars(
     23         dest, length, src, src_pos, src_length);
     24   }
     25 
     26   unsigned to_fill = length;
     27   if (to_fill > src_length - *src_pos) to_fill = src_length - *src_pos;
     28 
     29   if (encoding == ScriptCompiler::StreamedSource::ONE_BYTE) {
     30     v8::internal::CopyChars<uint8_t, uint16_t>(dest, src + *src_pos, to_fill);
     31   } else {
     32     DCHECK(encoding == ScriptCompiler::StreamedSource::TWO_BYTE);
     33     v8::internal::CopyChars<uint16_t, uint16_t>(
     34         dest, reinterpret_cast<const uint16_t*>(src + *src_pos), to_fill);
     35   }
     36   *src_pos += to_fill;
     37   return to_fill;
     38 }
     39 
     40 }  // namespace
     41 
     42 
     43 // ----------------------------------------------------------------------------
     44 // BufferedUtf16CharacterStreams
     45 
     46 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
     47     : Utf16CharacterStream(),
     48       pushback_limit_(NULL) {
     49   // Initialize buffer as being empty. First read will fill the buffer.
     50   buffer_cursor_ = buffer_;
     51   buffer_end_ = buffer_;
     52 }
     53 
     54 
     55 BufferedUtf16CharacterStream::~BufferedUtf16CharacterStream() { }
     56 
     57 void BufferedUtf16CharacterStream::PushBack(uc32 character) {
     58   if (character == kEndOfInput) {
     59     pos_--;
     60     return;
     61   }
     62   if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) {
     63     // buffer_ is writable, buffer_cursor_ is const pointer.
     64     buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character);
     65     pos_--;
     66     return;
     67   }
     68   SlowPushBack(static_cast<uc16>(character));
     69 }
     70 
     71 
     72 void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) {
     73   // In pushback mode, the end of the buffer contains pushback,
     74   // and the start of the buffer (from buffer start to pushback_limit_)
     75   // contains valid data that comes just after the pushback.
     76   // We NULL the pushback_limit_ if pushing all the way back to the
     77   // start of the buffer.
     78 
     79   if (pushback_limit_ == NULL) {
     80     // Enter pushback mode.
     81     pushback_limit_ = buffer_end_;
     82     buffer_end_ = buffer_ + kBufferSize;
     83     buffer_cursor_ = buffer_end_;
     84   }
     85   // Ensure that there is room for at least one pushback.
     86   DCHECK(buffer_cursor_ > buffer_);
     87   DCHECK(pos_ > 0);
     88   buffer_[--buffer_cursor_ - buffer_] = character;
     89   if (buffer_cursor_ == buffer_) {
     90     pushback_limit_ = NULL;
     91   } else if (buffer_cursor_ < pushback_limit_) {
     92     pushback_limit_ = buffer_cursor_;
     93   }
     94   pos_--;
     95 }
     96 
     97 
     98 bool BufferedUtf16CharacterStream::ReadBlock() {
     99   buffer_cursor_ = buffer_;
    100   if (pushback_limit_ != NULL) {
    101     // Leave pushback mode.
    102     buffer_end_ = pushback_limit_;
    103     pushback_limit_ = NULL;
    104     // If there were any valid characters left at the
    105     // start of the buffer, use those.
    106     if (buffer_cursor_ < buffer_end_) return true;
    107     // Otherwise read a new block.
    108   }
    109   unsigned length = FillBuffer(pos_);
    110   buffer_end_ = buffer_ + length;
    111   return length > 0;
    112 }
    113 
    114 
    115 unsigned BufferedUtf16CharacterStream::SlowSeekForward(unsigned delta) {
    116   // Leave pushback mode (i.e., ignore that there might be valid data
    117   // in the buffer before the pushback_limit_ point).
    118   pushback_limit_ = NULL;
    119   return BufferSeekForward(delta);
    120 }
    121 
    122 
    123 // ----------------------------------------------------------------------------
    124 // GenericStringUtf16CharacterStream
    125 
    126 
    127 GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream(
    128     Handle<String> data,
    129     unsigned start_position,
    130     unsigned end_position)
    131     : string_(data),
    132       length_(end_position) {
    133   DCHECK(end_position >= start_position);
    134   pos_ = start_position;
    135 }
    136 
    137 
    138 GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { }
    139 
    140 
    141 unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) {
    142   unsigned old_pos = pos_;
    143   pos_ = Min(pos_ + delta, length_);
    144   ReadBlock();
    145   return pos_ - old_pos;
    146 }
    147 
    148 
    149 unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos) {
    150   if (from_pos >= length_) return 0;
    151   unsigned length = kBufferSize;
    152   if (from_pos + length > length_) {
    153     length = length_ - from_pos;
    154   }
    155   String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length);
    156   return length;
    157 }
    158 
    159 
    160 // ----------------------------------------------------------------------------
    161 // Utf8ToUtf16CharacterStream
    162 Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data,
    163                                                        unsigned length)
    164     : BufferedUtf16CharacterStream(),
    165       raw_data_(data),
    166       raw_data_length_(length),
    167       raw_data_pos_(0),
    168       raw_character_position_(0) {
    169   ReadBlock();
    170 }
    171 
    172 
    173 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { }
    174 
    175 
    176 unsigned Utf8ToUtf16CharacterStream::CopyChars(uint16_t* dest, unsigned length,
    177                                                const byte* src,
    178                                                unsigned* src_pos,
    179                                                unsigned src_length) {
    180   static const unibrow::uchar kMaxUtf16Character = 0xffff;
    181   unsigned i = 0;
    182   // Because of the UTF-16 lead and trail surrogates, we stop filling the buffer
    183   // one character early (in the normal case), because we need to have at least
    184   // two free spaces in the buffer to be sure that the next character will fit.
    185   while (i < length - 1) {
    186     if (*src_pos == src_length) break;
    187     unibrow::uchar c = src[*src_pos];
    188     if (c <= unibrow::Utf8::kMaxOneByteChar) {
    189       *src_pos = *src_pos + 1;
    190     } else {
    191       c = unibrow::Utf8::CalculateValue(src + *src_pos, src_length - *src_pos,
    192                                         src_pos);
    193     }
    194     if (c > kMaxUtf16Character) {
    195       dest[i++] = unibrow::Utf16::LeadSurrogate(c);
    196       dest[i++] = unibrow::Utf16::TrailSurrogate(c);
    197     } else {
    198       dest[i++] = static_cast<uc16>(c);
    199     }
    200   }
    201   return i;
    202 }
    203 
    204 
    205 unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) {
    206   unsigned old_pos = pos_;
    207   unsigned target_pos = pos_ + delta;
    208   SetRawPosition(target_pos);
    209   pos_ = raw_character_position_;
    210   ReadBlock();
    211   return pos_ - old_pos;
    212 }
    213 
    214 
    215 unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position) {
    216   SetRawPosition(char_position);
    217   if (raw_character_position_ != char_position) {
    218     // char_position was not a valid position in the stream (hit the end
    219     // while spooling to it).
    220     return 0u;
    221   }
    222   unsigned i = CopyChars(buffer_, kBufferSize, raw_data_, &raw_data_pos_,
    223                          raw_data_length_);
    224   raw_character_position_ = char_position + i;
    225   return i;
    226 }
    227 
    228 
    229 static const byte kUtf8MultiByteMask = 0xC0;
    230 static const byte kUtf8MultiByteCharFollower = 0x80;
    231 
    232 
    233 #ifdef DEBUG
    234 static const byte kUtf8MultiByteCharStart = 0xC0;
    235 static bool IsUtf8MultiCharacterStart(byte first_byte) {
    236   return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart;
    237 }
    238 #endif
    239 
    240 
    241 static bool IsUtf8MultiCharacterFollower(byte later_byte) {
    242   return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower;
    243 }
    244 
    245 
    246 // Move the cursor back to point at the preceding UTF-8 character start
    247 // in the buffer.
    248 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) {
    249   byte character = buffer[--*cursor];
    250   if (character > unibrow::Utf8::kMaxOneByteChar) {
    251     DCHECK(IsUtf8MultiCharacterFollower(character));
    252     // Last byte of a multi-byte character encoding. Step backwards until
    253     // pointing to the first byte of the encoding, recognized by having the
    254     // top two bits set.
    255     while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { }
    256     DCHECK(IsUtf8MultiCharacterStart(buffer[*cursor]));
    257   }
    258 }
    259 
    260 
    261 // Move the cursor forward to point at the next following UTF-8 character start
    262 // in the buffer.
    263 static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) {
    264   byte character = buffer[(*cursor)++];
    265   if (character > unibrow::Utf8::kMaxOneByteChar) {
    266     // First character of a multi-byte character encoding.
    267     // The number of most-significant one-bits determines the length of the
    268     // encoding:
    269     //  110..... - (0xCx, 0xDx) one additional byte (minimum).
    270     //  1110.... - (0xEx) two additional bytes.
    271     //  11110... - (0xFx) three additional bytes (maximum).
    272     DCHECK(IsUtf8MultiCharacterStart(character));
    273     // Additional bytes is:
    274     // 1 if value in range 0xC0 .. 0xDF.
    275     // 2 if value in range 0xE0 .. 0xEF.
    276     // 3 if value in range 0xF0 .. 0xF7.
    277     // Encode that in a single value.
    278     unsigned additional_bytes =
    279         ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03;
    280     *cursor += additional_bytes;
    281     DCHECK(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes]));
    282   }
    283 }
    284 
    285 
    286 // This can't set a raw position between two surrogate pairs, since there
    287 // is no position in the UTF8 stream that corresponds to that.  This assumes
    288 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence.  If
    289 // it is illegally coded as two 3 byte sequences then there is no problem here.
    290 void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) {
    291   if (raw_character_position_ > target_position) {
    292     // Spool backwards in utf8 buffer.
    293     do {
    294       int old_pos = raw_data_pos_;
    295       Utf8CharacterBack(raw_data_, &raw_data_pos_);
    296       raw_character_position_--;
    297       DCHECK(old_pos - raw_data_pos_ <= 4);
    298       // Step back over both code units for surrogate pairs.
    299       if (old_pos - raw_data_pos_ == 4) raw_character_position_--;
    300     } while (raw_character_position_ > target_position);
    301     // No surrogate pair splitting.
    302     DCHECK(raw_character_position_ == target_position);
    303     return;
    304   }
    305   // Spool forwards in the utf8 buffer.
    306   while (raw_character_position_ < target_position) {
    307     if (raw_data_pos_ == raw_data_length_) return;
    308     int old_pos = raw_data_pos_;
    309     Utf8CharacterForward(raw_data_, &raw_data_pos_);
    310     raw_character_position_++;
    311     DCHECK(raw_data_pos_ - old_pos <= 4);
    312     if (raw_data_pos_ - old_pos == 4) raw_character_position_++;
    313   }
    314   // No surrogate pair splitting.
    315   DCHECK(raw_character_position_ == target_position);
    316 }
    317 
    318 
    319 unsigned ExternalStreamingStream::FillBuffer(unsigned position) {
    320   // Ignore "position" which is the position in the decoded data. Instead,
    321   // ExternalStreamingStream keeps track of the position in the raw data.
    322   unsigned data_in_buffer = 0;
    323   // Note that the UTF-8 decoder might not be able to fill the buffer
    324   // completely; it will typically leave the last character empty (see
    325   // Utf8ToUtf16CharacterStream::CopyChars).
    326   while (data_in_buffer < kBufferSize - 1) {
    327     if (current_data_ == NULL) {
    328       // GetSomeData will wait until the embedder has enough data. Here's an
    329       // interface between the API which uses size_t (which is the correct type
    330       // here) and the internal parts which use unsigned. TODO(marja): make the
    331       // internal parts use size_t too.
    332       current_data_length_ =
    333           static_cast<unsigned>(source_stream_->GetMoreData(&current_data_));
    334       current_data_offset_ = 0;
    335       bool data_ends = current_data_length_ == 0;
    336 
    337       // A caveat: a data chunk might end with bytes from an incomplete UTF-8
    338       // character (the rest of the bytes will be in the next chunk).
    339       if (encoding_ == ScriptCompiler::StreamedSource::UTF8) {
    340         HandleUtf8SplitCharacters(&data_in_buffer);
    341         if (!data_ends && current_data_offset_ == current_data_length_) {
    342           // The data stream didn't end, but we used all the data in the
    343           // chunk. This will only happen when the chunk was really small. We
    344           // don't handle the case where a UTF-8 character is split over several
    345           // chunks; in that case V8 won't crash, but it will be a parse error.
    346           delete[] current_data_;
    347           current_data_ = NULL;
    348           current_data_length_ = 0;
    349           current_data_offset_ = 0;
    350           continue;  // Request a new chunk.
    351         }
    352       }
    353 
    354       // Did the data stream end?
    355       if (data_ends) {
    356         DCHECK(utf8_split_char_buffer_length_ == 0);
    357         return data_in_buffer;
    358       }
    359     }
    360 
    361     // Fill the buffer from current_data_.
    362     unsigned new_offset = 0;
    363     unsigned new_chars_in_buffer =
    364         CopyCharsHelper(buffer_ + data_in_buffer, kBufferSize - data_in_buffer,
    365                         current_data_ + current_data_offset_, &new_offset,
    366                         current_data_length_ - current_data_offset_, encoding_);
    367     data_in_buffer += new_chars_in_buffer;
    368     current_data_offset_ += new_offset;
    369     DCHECK(data_in_buffer <= kBufferSize);
    370 
    371     // Did we use all the data in the data chunk?
    372     if (current_data_offset_ == current_data_length_) {
    373       delete[] current_data_;
    374       current_data_ = NULL;
    375       current_data_length_ = 0;
    376       current_data_offset_ = 0;
    377     }
    378   }
    379   return data_in_buffer;
    380 }
    381 
    382 void ExternalStreamingStream::HandleUtf8SplitCharacters(
    383     unsigned* data_in_buffer) {
    384   // First check if we have leftover data from the last chunk.
    385   unibrow::uchar c;
    386   if (utf8_split_char_buffer_length_ > 0) {
    387     // Move the bytes which are part of the split character (which started in
    388     // the previous chunk) into utf8_split_char_buffer_.
    389     while (current_data_offset_ < current_data_length_ &&
    390            utf8_split_char_buffer_length_ < 4 &&
    391            (c = current_data_[current_data_offset_]) >
    392                unibrow::Utf8::kMaxOneByteChar) {
    393       utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c;
    394       ++utf8_split_char_buffer_length_;
    395       ++current_data_offset_;
    396     }
    397 
    398     // Convert the data in utf8_split_char_buffer_.
    399     unsigned new_offset = 0;
    400     unsigned new_chars_in_buffer =
    401         CopyCharsHelper(buffer_ + *data_in_buffer,
    402                         kBufferSize - *data_in_buffer, utf8_split_char_buffer_,
    403                         &new_offset, utf8_split_char_buffer_length_, encoding_);
    404     *data_in_buffer += new_chars_in_buffer;
    405     // Make sure we used all the data.
    406     DCHECK(new_offset == utf8_split_char_buffer_length_);
    407     DCHECK(*data_in_buffer <= kBufferSize);
    408 
    409     utf8_split_char_buffer_length_ = 0;
    410   }
    411 
    412   // Move bytes which are part of an incomplete character from the end of the
    413   // current chunk to utf8_split_char_buffer_. They will be converted when the
    414   // next data chunk arrives. Note that all valid UTF-8 characters are at most 4
    415   // bytes long, but if the data is invalid, we can have character values bigger
    416   // than unibrow::Utf8::kMaxOneByteChar for more than 4 consecutive bytes.
    417   while (current_data_length_ > current_data_offset_ &&
    418          (c = current_data_[current_data_length_ - 1]) >
    419              unibrow::Utf8::kMaxOneByteChar &&
    420          utf8_split_char_buffer_length_ < 4) {
    421     --current_data_length_;
    422     ++utf8_split_char_buffer_length_;
    423   }
    424   CHECK(utf8_split_char_buffer_length_ <= 4);
    425   for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) {
    426     utf8_split_char_buffer_[i] = current_data_[current_data_length_ + i];
    427   }
    428 }
    429 
    430 
    431 // ----------------------------------------------------------------------------
    432 // ExternalTwoByteStringUtf16CharacterStream
    433 
    434 ExternalTwoByteStringUtf16CharacterStream::
    435     ~ExternalTwoByteStringUtf16CharacterStream() { }
    436 
    437 
    438 ExternalTwoByteStringUtf16CharacterStream
    439     ::ExternalTwoByteStringUtf16CharacterStream(
    440         Handle<ExternalTwoByteString> data,
    441         int start_position,
    442         int end_position)
    443     : Utf16CharacterStream(),
    444       source_(data),
    445       raw_data_(data->GetTwoByteData(start_position)) {
    446   buffer_cursor_ = raw_data_,
    447   buffer_end_ = raw_data_ + (end_position - start_position);
    448   pos_ = start_position;
    449 }
    450 
    451 } }  // namespace v8::internal
    452