1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "src/v8.h" 6 7 #include "src/scanner-character-streams.h" 8 9 #include "src/handles.h" 10 #include "src/unicode-inl.h" 11 12 namespace v8 { 13 namespace internal { 14 15 // ---------------------------------------------------------------------------- 16 // BufferedUtf16CharacterStreams 17 18 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream() 19 : Utf16CharacterStream(), 20 pushback_limit_(NULL) { 21 // Initialize buffer as being empty. First read will fill the buffer. 22 buffer_cursor_ = buffer_; 23 buffer_end_ = buffer_; 24 } 25 26 27 BufferedUtf16CharacterStream::~BufferedUtf16CharacterStream() { } 28 29 void BufferedUtf16CharacterStream::PushBack(uc32 character) { 30 if (character == kEndOfInput) { 31 pos_--; 32 return; 33 } 34 if (pushback_limit_ == NULL && buffer_cursor_ > buffer_) { 35 // buffer_ is writable, buffer_cursor_ is const pointer. 36 buffer_[--buffer_cursor_ - buffer_] = static_cast<uc16>(character); 37 pos_--; 38 return; 39 } 40 SlowPushBack(static_cast<uc16>(character)); 41 } 42 43 44 void BufferedUtf16CharacterStream::SlowPushBack(uc16 character) { 45 // In pushback mode, the end of the buffer contains pushback, 46 // and the start of the buffer (from buffer start to pushback_limit_) 47 // contains valid data that comes just after the pushback. 48 // We NULL the pushback_limit_ if pushing all the way back to the 49 // start of the buffer. 50 51 if (pushback_limit_ == NULL) { 52 // Enter pushback mode. 53 pushback_limit_ = buffer_end_; 54 buffer_end_ = buffer_ + kBufferSize; 55 buffer_cursor_ = buffer_end_; 56 } 57 // Ensure that there is room for at least one pushback. 58 ASSERT(buffer_cursor_ > buffer_); 59 ASSERT(pos_ > 0); 60 buffer_[--buffer_cursor_ - buffer_] = character; 61 if (buffer_cursor_ == buffer_) { 62 pushback_limit_ = NULL; 63 } else if (buffer_cursor_ < pushback_limit_) { 64 pushback_limit_ = buffer_cursor_; 65 } 66 pos_--; 67 } 68 69 70 bool BufferedUtf16CharacterStream::ReadBlock() { 71 buffer_cursor_ = buffer_; 72 if (pushback_limit_ != NULL) { 73 // Leave pushback mode. 74 buffer_end_ = pushback_limit_; 75 pushback_limit_ = NULL; 76 // If there were any valid characters left at the 77 // start of the buffer, use those. 78 if (buffer_cursor_ < buffer_end_) return true; 79 // Otherwise read a new block. 80 } 81 unsigned length = FillBuffer(pos_, kBufferSize); 82 buffer_end_ = buffer_ + length; 83 return length > 0; 84 } 85 86 87 unsigned BufferedUtf16CharacterStream::SlowSeekForward(unsigned delta) { 88 // Leave pushback mode (i.e., ignore that there might be valid data 89 // in the buffer before the pushback_limit_ point). 90 pushback_limit_ = NULL; 91 return BufferSeekForward(delta); 92 } 93 94 95 // ---------------------------------------------------------------------------- 96 // GenericStringUtf16CharacterStream 97 98 99 GenericStringUtf16CharacterStream::GenericStringUtf16CharacterStream( 100 Handle<String> data, 101 unsigned start_position, 102 unsigned end_position) 103 : string_(data), 104 length_(end_position) { 105 ASSERT(end_position >= start_position); 106 pos_ = start_position; 107 } 108 109 110 GenericStringUtf16CharacterStream::~GenericStringUtf16CharacterStream() { } 111 112 113 unsigned GenericStringUtf16CharacterStream::BufferSeekForward(unsigned delta) { 114 unsigned old_pos = pos_; 115 pos_ = Min(pos_ + delta, length_); 116 ReadBlock(); 117 return pos_ - old_pos; 118 } 119 120 121 unsigned GenericStringUtf16CharacterStream::FillBuffer(unsigned from_pos, 122 unsigned length) { 123 if (from_pos >= length_) return 0; 124 if (from_pos + length > length_) { 125 length = length_ - from_pos; 126 } 127 String::WriteToFlat<uc16>(*string_, buffer_, from_pos, from_pos + length); 128 return length; 129 } 130 131 132 // ---------------------------------------------------------------------------- 133 // Utf8ToUtf16CharacterStream 134 Utf8ToUtf16CharacterStream::Utf8ToUtf16CharacterStream(const byte* data, 135 unsigned length) 136 : BufferedUtf16CharacterStream(), 137 raw_data_(data), 138 raw_data_length_(length), 139 raw_data_pos_(0), 140 raw_character_position_(0) { 141 ReadBlock(); 142 } 143 144 145 Utf8ToUtf16CharacterStream::~Utf8ToUtf16CharacterStream() { } 146 147 148 unsigned Utf8ToUtf16CharacterStream::BufferSeekForward(unsigned delta) { 149 unsigned old_pos = pos_; 150 unsigned target_pos = pos_ + delta; 151 SetRawPosition(target_pos); 152 pos_ = raw_character_position_; 153 ReadBlock(); 154 return pos_ - old_pos; 155 } 156 157 158 unsigned Utf8ToUtf16CharacterStream::FillBuffer(unsigned char_position, 159 unsigned length) { 160 static const unibrow::uchar kMaxUtf16Character = 0xffff; 161 SetRawPosition(char_position); 162 if (raw_character_position_ != char_position) { 163 // char_position was not a valid position in the stream (hit the end 164 // while spooling to it). 165 return 0u; 166 } 167 unsigned i = 0; 168 while (i < length - 1) { 169 if (raw_data_pos_ == raw_data_length_) break; 170 unibrow::uchar c = raw_data_[raw_data_pos_]; 171 if (c <= unibrow::Utf8::kMaxOneByteChar) { 172 raw_data_pos_++; 173 } else { 174 c = unibrow::Utf8::CalculateValue(raw_data_ + raw_data_pos_, 175 raw_data_length_ - raw_data_pos_, 176 &raw_data_pos_); 177 } 178 if (c > kMaxUtf16Character) { 179 buffer_[i++] = unibrow::Utf16::LeadSurrogate(c); 180 buffer_[i++] = unibrow::Utf16::TrailSurrogate(c); 181 } else { 182 buffer_[i++] = static_cast<uc16>(c); 183 } 184 } 185 raw_character_position_ = char_position + i; 186 return i; 187 } 188 189 190 static const byte kUtf8MultiByteMask = 0xC0; 191 static const byte kUtf8MultiByteCharFollower = 0x80; 192 193 194 #ifdef DEBUG 195 static const byte kUtf8MultiByteCharStart = 0xC0; 196 static bool IsUtf8MultiCharacterStart(byte first_byte) { 197 return (first_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharStart; 198 } 199 #endif 200 201 202 static bool IsUtf8MultiCharacterFollower(byte later_byte) { 203 return (later_byte & kUtf8MultiByteMask) == kUtf8MultiByteCharFollower; 204 } 205 206 207 // Move the cursor back to point at the preceding UTF-8 character start 208 // in the buffer. 209 static inline void Utf8CharacterBack(const byte* buffer, unsigned* cursor) { 210 byte character = buffer[--*cursor]; 211 if (character > unibrow::Utf8::kMaxOneByteChar) { 212 ASSERT(IsUtf8MultiCharacterFollower(character)); 213 // Last byte of a multi-byte character encoding. Step backwards until 214 // pointing to the first byte of the encoding, recognized by having the 215 // top two bits set. 216 while (IsUtf8MultiCharacterFollower(buffer[--*cursor])) { } 217 ASSERT(IsUtf8MultiCharacterStart(buffer[*cursor])); 218 } 219 } 220 221 222 // Move the cursor forward to point at the next following UTF-8 character start 223 // in the buffer. 224 static inline void Utf8CharacterForward(const byte* buffer, unsigned* cursor) { 225 byte character = buffer[(*cursor)++]; 226 if (character > unibrow::Utf8::kMaxOneByteChar) { 227 // First character of a multi-byte character encoding. 228 // The number of most-significant one-bits determines the length of the 229 // encoding: 230 // 110..... - (0xCx, 0xDx) one additional byte (minimum). 231 // 1110.... - (0xEx) two additional bytes. 232 // 11110... - (0xFx) three additional bytes (maximum). 233 ASSERT(IsUtf8MultiCharacterStart(character)); 234 // Additional bytes is: 235 // 1 if value in range 0xC0 .. 0xDF. 236 // 2 if value in range 0xE0 .. 0xEF. 237 // 3 if value in range 0xF0 .. 0xF7. 238 // Encode that in a single value. 239 unsigned additional_bytes = 240 ((0x3211u) >> (((character - 0xC0) >> 2) & 0xC)) & 0x03; 241 *cursor += additional_bytes; 242 ASSERT(!IsUtf8MultiCharacterFollower(buffer[1 + additional_bytes])); 243 } 244 } 245 246 247 // This can't set a raw position between two surrogate pairs, since there 248 // is no position in the UTF8 stream that corresponds to that. This assumes 249 // that the surrogate pair is correctly coded as a 4 byte UTF-8 sequence. If 250 // it is illegally coded as two 3 byte sequences then there is no problem here. 251 void Utf8ToUtf16CharacterStream::SetRawPosition(unsigned target_position) { 252 if (raw_character_position_ > target_position) { 253 // Spool backwards in utf8 buffer. 254 do { 255 int old_pos = raw_data_pos_; 256 Utf8CharacterBack(raw_data_, &raw_data_pos_); 257 raw_character_position_--; 258 ASSERT(old_pos - raw_data_pos_ <= 4); 259 // Step back over both code units for surrogate pairs. 260 if (old_pos - raw_data_pos_ == 4) raw_character_position_--; 261 } while (raw_character_position_ > target_position); 262 // No surrogate pair splitting. 263 ASSERT(raw_character_position_ == target_position); 264 return; 265 } 266 // Spool forwards in the utf8 buffer. 267 while (raw_character_position_ < target_position) { 268 if (raw_data_pos_ == raw_data_length_) return; 269 int old_pos = raw_data_pos_; 270 Utf8CharacterForward(raw_data_, &raw_data_pos_); 271 raw_character_position_++; 272 ASSERT(raw_data_pos_ - old_pos <= 4); 273 if (raw_data_pos_ - old_pos == 4) raw_character_position_++; 274 } 275 // No surrogate pair splitting. 276 ASSERT(raw_character_position_ == target_position); 277 } 278 279 280 // ---------------------------------------------------------------------------- 281 // ExternalTwoByteStringUtf16CharacterStream 282 283 ExternalTwoByteStringUtf16CharacterStream:: 284 ~ExternalTwoByteStringUtf16CharacterStream() { } 285 286 287 ExternalTwoByteStringUtf16CharacterStream 288 ::ExternalTwoByteStringUtf16CharacterStream( 289 Handle<ExternalTwoByteString> data, 290 int start_position, 291 int end_position) 292 : Utf16CharacterStream(), 293 source_(data), 294 raw_data_(data->GetTwoByteData(start_position)) { 295 buffer_cursor_ = raw_data_, 296 buffer_end_ = raw_data_ + (end_position - start_position); 297 pos_ = start_position; 298 } 299 300 } } // namespace v8::internal 301