1 // Copyright 2016 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "src/inspector/string-16.h" 6 7 #include <algorithm> 8 #include <cctype> 9 #include <cstdlib> 10 #include <cstring> 11 #include <limits> 12 #include <string> 13 14 #include "src/base/platform/platform.h" 15 #include "src/conversions.h" 16 17 namespace v8_inspector { 18 19 namespace { 20 21 bool isASCII(UChar c) { return !(c & ~0x7F); } 22 23 bool isSpaceOrNewLine(UChar c) { 24 return isASCII(c) && c <= ' ' && (c == ' ' || (c <= 0xD && c >= 0x9)); 25 } 26 27 int charactersToInteger(const UChar* characters, size_t length, 28 bool* ok = nullptr) { 29 std::vector<char> buffer; 30 buffer.reserve(length + 1); 31 for (size_t i = 0; i < length; ++i) { 32 if (!isASCII(characters[i])) { 33 if (ok) *ok = false; 34 return 0; 35 } 36 buffer.push_back(static_cast<char>(characters[i])); 37 } 38 buffer.push_back('\0'); 39 40 char* endptr; 41 int64_t result = 42 static_cast<int64_t>(std::strtol(buffer.data(), &endptr, 10)); 43 if (ok) { 44 *ok = !(*endptr) && result <= std::numeric_limits<int>::max() && 45 result >= std::numeric_limits<int>::min(); 46 } 47 return static_cast<int>(result); 48 } 49 50 const UChar replacementCharacter = 0xFFFD; 51 using UChar32 = uint32_t; 52 53 inline int inlineUTF8SequenceLengthNonASCII(char b0) { 54 if ((b0 & 0xC0) != 0xC0) return 0; 55 if ((b0 & 0xE0) == 0xC0) return 2; 56 if ((b0 & 0xF0) == 0xE0) return 3; 57 if ((b0 & 0xF8) == 0xF0) return 4; 58 return 0; 59 } 60 61 inline int inlineUTF8SequenceLength(char b0) { 62 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); 63 } 64 65 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed 66 // into the first byte, depending on how many bytes follow. There are 67 // as many entries in this table as there are UTF-8 sequence types. 68 // (I.e., one byte sequence, two byte... etc.). 
Remember that sequences 69 // for *legal* UTF-8 will be 4 or fewer bytes total. 70 static const unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 71 0xF0, 0xF8, 0xFC}; 72 73 typedef enum { 74 conversionOK, // conversion successful 75 sourceExhausted, // partial character in source, but hit end 76 targetExhausted, // insuff. room in target for conversion 77 sourceIllegal // source sequence is illegal/malformed 78 } ConversionResult; 79 80 ConversionResult convertUTF16ToUTF8(const UChar** sourceStart, 81 const UChar* sourceEnd, char** targetStart, 82 char* targetEnd, bool strict) { 83 ConversionResult result = conversionOK; 84 const UChar* source = *sourceStart; 85 char* target = *targetStart; 86 while (source < sourceEnd) { 87 UChar32 ch; 88 uint32_t bytesToWrite = 0; 89 const UChar32 byteMask = 0xBF; 90 const UChar32 byteMark = 0x80; 91 const UChar* oldSource = 92 source; // In case we have to back up because of target overflow. 93 ch = static_cast<uint16_t>(*source++); 94 // If we have a surrogate pair, convert to UChar32 first. 95 if (ch >= 0xD800 && ch <= 0xDBFF) { 96 // If the 16 bits following the high surrogate are in the source buffer... 97 if (source < sourceEnd) { 98 UChar32 ch2 = static_cast<uint16_t>(*source); 99 // If it's a low surrogate, convert to UChar32. 100 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 101 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; 102 ++source; 103 } else if (strict) { // it's an unpaired high surrogate 104 --source; // return to the illegal value itself 105 result = sourceIllegal; 106 break; 107 } 108 } else { // We don't have the 16 bits following the high surrogate. 
109 --source; // return to the high surrogate 110 result = sourceExhausted; 111 break; 112 } 113 } else if (strict) { 114 // UTF-16 surrogate values are illegal in UTF-32 115 if (ch >= 0xDC00 && ch <= 0xDFFF) { 116 --source; // return to the illegal value itself 117 result = sourceIllegal; 118 break; 119 } 120 } 121 // Figure out how many bytes the result will require 122 if (ch < (UChar32)0x80) { 123 bytesToWrite = 1; 124 } else if (ch < (UChar32)0x800) { 125 bytesToWrite = 2; 126 } else if (ch < (UChar32)0x10000) { 127 bytesToWrite = 3; 128 } else if (ch < (UChar32)0x110000) { 129 bytesToWrite = 4; 130 } else { 131 bytesToWrite = 3; 132 ch = replacementCharacter; 133 } 134 135 target += bytesToWrite; 136 if (target > targetEnd) { 137 source = oldSource; // Back up source pointer! 138 target -= bytesToWrite; 139 result = targetExhausted; 140 break; 141 } 142 switch (bytesToWrite) { // note: everything falls through. 143 case 4: 144 *--target = static_cast<char>((ch | byteMark) & byteMask); 145 ch >>= 6; 146 case 3: 147 *--target = static_cast<char>((ch | byteMark) & byteMask); 148 ch >>= 6; 149 case 2: 150 *--target = static_cast<char>((ch | byteMark) & byteMask); 151 ch >>= 6; 152 case 1: 153 *--target = static_cast<char>(ch | firstByteMark[bytesToWrite]); 154 } 155 target += bytesToWrite; 156 } 157 *sourceStart = source; 158 *targetStart = target; 159 return result; 160 } 161 162 /** 163 * Is this code point a BMP code point (U+0000..U+ffff)? 164 * @param c 32-bit code point 165 * @return TRUE or FALSE 166 * @stable ICU 2.8 167 */ 168 #define U_IS_BMP(c) ((uint32_t)(c) <= 0xffff) 169 170 /** 171 * Is this code point a supplementary code point (U+10000..U+10ffff)? 172 * @param c 32-bit code point 173 * @return TRUE or FALSE 174 * @stable ICU 2.8 175 */ 176 #define U_IS_SUPPLEMENTARY(c) ((uint32_t)((c)-0x10000) <= 0xfffff) 177 178 /** 179 * Is this code point a surrogate (U+d800..U+dfff)? 
/**
 * Is this code point a surrogate (U+d800..U+dfff)?
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_SURROGATE(c) (((c)&0xfffff800) == 0xd800)

/**
 * Get the lead surrogate (0xd800..0xdbff) for a
 * supplementary code point (0x10000..0x10ffff).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define U16_LEAD(supplementary) (UChar)(((supplementary) >> 10) + 0xd7c0)

/**
 * Get the trail surrogate (0xdc00..0xdfff) for a
 * supplementary code point (0x10000..0x10ffff).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff) | 0xdc00)

// Returns true when the |length| bytes starting at |source| form a legal
// UTF-8 sequence.
//
// This must be called with the length pre-determined by the first byte.
// If presented with a length > 4 (or < 1), this returns false. The Unicode
// definition of UTF-8 goes up to 4-byte sequences.
//
// The bytes are examined from the last one backwards: trailing bytes must be
// in 0x80..0xBF, the second byte is additionally range-checked against the
// lead byte (0xE0, 0xED, 0xF0 and 0xF4 have narrower ranges), and finally the
// lead byte itself must not be in 0x80..0xC1 nor above 0xF4.
static bool isLegalUTF8(const unsigned char* source, int length) {
  unsigned char a;
  // Starts one past the last byte of the sequence and walks backwards.
  const unsigned char* srcptr = source + length;
  switch (length) {
    default:
      return false;
    // Everything else falls through when "true"...
    case 4:
      if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
    case 3:
      if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
    case 2:
      // |a| is now the second byte of the sequence; its lower bound depends
      // on the lead byte and is checked in the inner switch.
      if ((a = (*--srcptr)) > 0xBF) return false;

      // no fall-through in this inner switch
      switch (*source) {
        case 0xE0:
          if (a < 0xA0) return false;
          break;
        case 0xED:
          if (a > 0x9F) return false;
          break;
        case 0xF0:
          if (a < 0x90) return false;
          break;
        case 0xF4:
          if (a > 0x8F) return false;
          break;
        default:
          if (a < 0x80) return false;
      }

    case 1:
      // A lead byte in 0x80..0xC1 is never legal.
      if (*source >= 0x80 && *source < 0xC2) return false;
  }
  if (*source > 0xF4) return false;
  return true;
}

// Magic values subtracted from a buffer value during UTF8 conversion.
// This table contains as many values as there might be trailing bytes
// in a UTF-8 sequence. It is indexed by (sequence length - 1).
static const UChar32 offsetsFromUTF8[6] = {0x00000000UL,
                                           0x00003080UL,
                                           0x000E2080UL,
                                           0x03C82080UL,
                                           static_cast<UChar32>(0xFA082080UL),
                                           static_cast<UChar32>(0x82082080UL)};

// Reads |length| bytes from |sequence|, advancing |sequence| past them, and
// returns the accumulated value minus offsetsFromUTF8[length - 1].
// |length| must be in 1..6: the table is indexed with length - 1 and no range
// check is performed here.
static inline UChar32 readUTF8Sequence(const char*& sequence, size_t length) {
  UChar32 character = 0;

  // The cases all fall through.
  switch (length) {
    case 6:
      character += static_cast<unsigned char>(*sequence++);
      character <<= 6;
    case 5:
      character += static_cast<unsigned char>(*sequence++);
      character <<= 6;
    case 4:
      character += static_cast<unsigned char>(*sequence++);
      character <<= 6;
    case 3:
      character += static_cast<unsigned char>(*sequence++);
      character <<= 6;
    case 2:
      character += static_cast<unsigned char>(*sequence++);
      character <<= 6;
    case 1:
      character += static_cast<unsigned char>(*sequence++);
  }

  return character - offsetsFromUTF8[length - 1];
}

// Converts the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-16, writing
// code units into [*targetStart, targetEnd).
//
// On return |*sourceStart| points at the first byte that was not converted
// and |*targetStart| points one past the last code unit written. If
// |sourceAllASCII| is non-null it receives whether every code unit written
// was <= 0x7f.
//
// Returns:
//   conversionOK     - the whole source range was converted.
//   sourceExhausted  - the source ends in the middle of a sequence.
//   targetExhausted  - the next character does not fit in the target; the
//                      source pointer is rewound to the start of it.
//   sourceIllegal    - a sequence rejected by isLegalUTF8(), or, only when
//                      |strict|, a sequence decoding to a surrogate or to a
//                      value outside the BMP and supplementary ranges.
//
// When |strict| is false such values are written as the replacement
// character instead.
ConversionResult convertUTF8ToUTF16(const char** sourceStart,
                                    const char* sourceEnd, UChar** targetStart,
                                    UChar* targetEnd, bool* sourceAllASCII,
                                    bool strict) {
  ConversionResult result = conversionOK;
  const char* source = *sourceStart;
  UChar* target = *targetStart;
  // Bitwise OR of everything written so far; used for the all-ASCII report.
  UChar orAllData = 0;
  while (source < sourceEnd) {
    // A length of 0 (invalid lead byte) passes the size check below and is
    // then rejected by isLegalUTF8().
    int utf8SequenceLength = inlineUTF8SequenceLength(*source);
    if (sourceEnd - source < utf8SequenceLength) {
      result = sourceExhausted;
      break;
    }
    // Do this check whether lenient or strict
    if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source),
                     utf8SequenceLength)) {
      result = sourceIllegal;
      break;
    }

    // Advances |source| past the sequence.
    UChar32 character = readUTF8Sequence(source, utf8SequenceLength);

    if (target >= targetEnd) {
      source -= utf8SequenceLength;  // Back up source pointer!
      result = targetExhausted;
      break;
    }

    if (U_IS_BMP(character)) {
      // UTF-16 surrogate values are illegal in UTF-32
      if (U_IS_SURROGATE(character)) {
        if (strict) {
          source -= utf8SequenceLength;  // return to the illegal value itself
          result = sourceIllegal;
          break;
        }
        *target++ = replacementCharacter;
        orAllData |= replacementCharacter;
      } else {
        *target++ = static_cast<UChar>(character);  // normal case
        orAllData |= character;
      }
    } else if (U_IS_SUPPLEMENTARY(character)) {
      // |character| is in the range 0x10000 - 0x10FFFF and needs two code
      // units (a surrogate pair) in the target.
      if (target + 1 >= targetEnd) {
        source -= utf8SequenceLength;  // Back up source pointer!
        result = targetExhausted;
        break;
      }
      *target++ = U16_LEAD(character);
      *target++ = U16_TRAIL(character);
      orAllData = 0xffff;
    } else {
      if (strict) {
        source -= utf8SequenceLength;  // return to the start
        result = sourceIllegal;
        break;  // Bail out; shouldn't continue
      } else {
        *target++ = replacementCharacter;
        orAllData |= replacementCharacter;
      }
    }
  }
  *sourceStart = source;
  *targetStart = target;

  if (sourceAllASCII) *sourceAllASCII = !(orAllData & ~0x7f);

  return result;
}

// Helper to write a three-byte UTF-8 code point to the buffer, caller must
// check room is available.
357 static inline void putUTF8Triple(char*& buffer, UChar ch) { 358 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); 359 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); 360 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); 361 } 362 363 } // namespace 364 365 // static 366 String16 String16::fromInteger(int number) { 367 char arr[50]; 368 v8::internal::Vector<char> buffer(arr, arraysize(arr)); 369 return String16(IntToCString(number, buffer)); 370 } 371 372 // static 373 String16 String16::fromInteger(size_t number) { 374 const size_t kBufferSize = 50; 375 char buffer[kBufferSize]; 376 #if !defined(_WIN32) && !defined(_WIN64) 377 v8::base::OS::SNPrintF(buffer, kBufferSize, "%zu", number); 378 #else 379 v8::base::OS::SNPrintF(buffer, kBufferSize, "%Iu", number); 380 #endif 381 return String16(buffer); 382 } 383 384 // static 385 String16 String16::fromDouble(double number) { 386 char arr[50]; 387 v8::internal::Vector<char> buffer(arr, arraysize(arr)); 388 return String16(DoubleToCString(number, buffer)); 389 } 390 391 // static 392 String16 String16::fromDouble(double number, int precision) { 393 std::unique_ptr<char[]> str( 394 v8::internal::DoubleToPrecisionCString(number, precision)); 395 return String16(str.get()); 396 } 397 398 int String16::toInteger(bool* ok) const { 399 return charactersToInteger(characters16(), length(), ok); 400 } 401 402 String16 String16::stripWhiteSpace() const { 403 if (!length()) return String16(); 404 405 size_t start = 0; 406 size_t end = length() - 1; 407 408 // skip white space from start 409 while (start <= end && isSpaceOrNewLine(characters16()[start])) ++start; 410 411 // only white space 412 if (start > end) return String16(); 413 414 // skip white space from end 415 while (end && isSpaceOrNewLine(characters16()[end])) --end; 416 417 if (!start && end == length() - 1) return *this; 418 return String16(characters16() + start, end + 1 - start); 419 } 420 421 String16Builder::String16Builder() {} 422 423 void 
String16Builder::append(const String16& s) { 424 m_buffer.insert(m_buffer.end(), s.characters16(), 425 s.characters16() + s.length()); 426 } 427 428 void String16Builder::append(UChar c) { m_buffer.push_back(c); } 429 430 void String16Builder::append(char c) { 431 UChar u = c; 432 m_buffer.push_back(u); 433 } 434 435 void String16Builder::append(const UChar* characters, size_t length) { 436 m_buffer.insert(m_buffer.end(), characters, characters + length); 437 } 438 439 void String16Builder::append(const char* characters, size_t length) { 440 m_buffer.insert(m_buffer.end(), characters, characters + length); 441 } 442 443 void String16Builder::appendNumber(int number) { 444 const int kBufferSize = 11; 445 char buffer[kBufferSize]; 446 int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%d", number); 447 DCHECK_GT(kBufferSize, chars); 448 m_buffer.insert(m_buffer.end(), buffer, buffer + chars); 449 } 450 451 void String16Builder::appendNumber(size_t number) { 452 const int kBufferSize = 20; 453 char buffer[kBufferSize]; 454 #if !defined(_WIN32) && !defined(_WIN64) 455 int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%zu", number); 456 #else 457 int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%Iu", number); 458 #endif 459 DCHECK_GT(kBufferSize, chars); 460 m_buffer.insert(m_buffer.end(), buffer, buffer + chars); 461 } 462 463 String16 String16Builder::toString() { 464 return String16(m_buffer.data(), m_buffer.size()); 465 } 466 467 void String16Builder::reserveCapacity(size_t capacity) { 468 m_buffer.reserve(capacity); 469 } 470 471 String16 String16::fromUTF8(const char* stringStart, size_t length) { 472 if (!stringStart || !length) return String16(); 473 474 std::vector<UChar> buffer(length); 475 UChar* bufferStart = buffer.data(); 476 477 UChar* bufferCurrent = bufferStart; 478 const char* stringCurrent = stringStart; 479 if (convertUTF8ToUTF16(&stringCurrent, stringStart + length, &bufferCurrent, 480 bufferCurrent + buffer.size(), 0, 481 true) 
!= conversionOK) 482 return String16(); 483 484 size_t utf16Length = bufferCurrent - bufferStart; 485 return String16(bufferStart, utf16Length); 486 } 487 488 std::string String16::utf8() const { 489 size_t length = this->length(); 490 491 if (!length) return std::string(""); 492 493 // Allocate a buffer big enough to hold all the characters 494 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). 495 // Optimization ideas, if we find this function is hot: 496 // * We could speculatively create a CStringBuffer to contain 'length' 497 // characters, and resize if necessary (i.e. if the buffer contains 498 // non-ascii characters). (Alternatively, scan the buffer first for 499 // ascii characters, so we know this will be sufficient). 500 // * We could allocate a CStringBuffer with an appropriate size to 501 // have a good chance of being able to write the string into the 502 // buffer without reallocing (say, 1.5 x length). 503 if (length > std::numeric_limits<unsigned>::max() / 3) return std::string(); 504 std::vector<char> bufferVector(length * 3); 505 char* buffer = bufferVector.data(); 506 const UChar* characters = m_impl.data(); 507 508 ConversionResult result = 509 convertUTF16ToUTF8(&characters, characters + length, &buffer, 510 buffer + bufferVector.size(), false); 511 DCHECK( 512 result != 513 targetExhausted); // (length * 3) should be sufficient for any conversion 514 515 // Only produced from strict conversion. 516 DCHECK(result != sourceIllegal); 517 518 // Check for an unconverted high surrogate. 519 if (result == sourceExhausted) { 520 // This should be one unpaired high surrogate. Treat it the same 521 // was as an unpaired high surrogate would have been handled in 522 // the middle of a string with non-strict conversion - which is 523 // to say, simply encode it to UTF-8. 
524 DCHECK((characters + 1) == (m_impl.data() + length)); 525 DCHECK((*characters >= 0xD800) && (*characters <= 0xDBFF)); 526 // There should be room left, since one UChar hasn't been 527 // converted. 528 DCHECK((buffer + 3) <= (buffer + bufferVector.size())); 529 putUTF8Triple(buffer, *characters); 530 } 531 532 return std::string(bufferVector.data(), buffer - bufferVector.data()); 533 } 534 535 } // namespace v8_inspector 536