1 // Copyright 2016 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "src/inspector/string-16.h" 6 7 #include <algorithm> 8 #include <cctype> 9 #include <cstdlib> 10 #include <cstring> 11 #include <iomanip> 12 #include <limits> 13 #include <locale> 14 #include <sstream> 15 #include <string> 16 17 #include "src/base/platform/platform.h" 18 #include "src/inspector/protocol-platform.h" 19 20 namespace v8_inspector { 21 22 namespace { 23 24 bool isASCII(UChar c) { return !(c & ~0x7F); } 25 26 bool isSpaceOrNewLine(UChar c) { 27 return isASCII(c) && c <= ' ' && (c == ' ' || (c <= 0xD && c >= 0x9)); 28 } 29 30 int charactersToInteger(const UChar* characters, size_t length, 31 bool* ok = nullptr) { 32 std::vector<char> buffer; 33 buffer.reserve(length + 1); 34 for (size_t i = 0; i < length; ++i) { 35 if (!isASCII(characters[i])) { 36 if (ok) *ok = false; 37 return 0; 38 } 39 buffer.push_back(static_cast<char>(characters[i])); 40 } 41 buffer.push_back('\0'); 42 43 char* endptr; 44 int64_t result = 45 static_cast<int64_t>(std::strtol(buffer.data(), &endptr, 10)); 46 if (ok) { 47 *ok = !(*endptr) && result <= std::numeric_limits<int>::max() && 48 result >= std::numeric_limits<int>::min(); 49 } 50 return static_cast<int>(result); 51 } 52 53 const UChar replacementCharacter = 0xFFFD; 54 using UChar32 = uint32_t; 55 56 inline int inlineUTF8SequenceLengthNonASCII(char b0) { 57 if ((b0 & 0xC0) != 0xC0) return 0; 58 if ((b0 & 0xE0) == 0xC0) return 2; 59 if ((b0 & 0xF0) == 0xE0) return 3; 60 if ((b0 & 0xF8) == 0xF0) return 4; 61 return 0; 62 } 63 64 inline int inlineUTF8SequenceLength(char b0) { 65 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0); 66 } 67 68 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed 69 // into the first byte, depending on how many bytes follow. There are 70 // as many entries in this table as there are UTF-8 sequence types. 71 // (I.e., one byte sequence, two byte... etc.). Remember that sequences 72 // for *legal* UTF-8 will be 4 or fewer bytes total. 73 static const unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 74 0xF0, 0xF8, 0xFC}; 75 76 typedef enum { 77 conversionOK, // conversion successful 78 sourceExhausted, // partial character in source, but hit end 79 targetExhausted, // insuff. room in target for conversion 80 sourceIllegal // source sequence is illegal/malformed 81 } ConversionResult; 82 83 ConversionResult convertUTF16ToUTF8(const UChar** sourceStart, 84 const UChar* sourceEnd, char** targetStart, 85 char* targetEnd, bool strict) { 86 ConversionResult result = conversionOK; 87 const UChar* source = *sourceStart; 88 char* target = *targetStart; 89 while (source < sourceEnd) { 90 UChar32 ch; 91 uint32_t bytesToWrite = 0; 92 const UChar32 byteMask = 0xBF; 93 const UChar32 byteMark = 0x80; 94 const UChar* oldSource = 95 source; // In case we have to back up because of target overflow. 96 ch = static_cast<uint16_t>(*source++); 97 // If we have a surrogate pair, convert to UChar32 first. 98 if (ch >= 0xD800 && ch <= 0xDBFF) { 99 // If the 16 bits following the high surrogate are in the source buffer... 100 if (source < sourceEnd) { 101 UChar32 ch2 = static_cast<uint16_t>(*source); 102 // If it's a low surrogate, convert to UChar32. 103 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 104 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000; 105 ++source; 106 } else if (strict) { // it's an unpaired high surrogate 107 --source; // return to the illegal value itself 108 result = sourceIllegal; 109 break; 110 } 111 } else { // We don't have the 16 bits following the high surrogate. 112 --source; // return to the high surrogate 113 result = sourceExhausted; 114 break; 115 } 116 } else if (strict) { 117 // UTF-16 surrogate values are illegal in UTF-32 118 if (ch >= 0xDC00 && ch <= 0xDFFF) { 119 --source; // return to the illegal value itself 120 result = sourceIllegal; 121 break; 122 } 123 } 124 // Figure out how many bytes the result will require 125 if (ch < (UChar32)0x80) { 126 bytesToWrite = 1; 127 } else if (ch < (UChar32)0x800) { 128 bytesToWrite = 2; 129 } else if (ch < (UChar32)0x10000) { 130 bytesToWrite = 3; 131 } else if (ch < (UChar32)0x110000) { 132 bytesToWrite = 4; 133 } else { 134 bytesToWrite = 3; 135 ch = replacementCharacter; 136 } 137 138 target += bytesToWrite; 139 if (target > targetEnd) { 140 source = oldSource; // Back up source pointer! 141 target -= bytesToWrite; 142 result = targetExhausted; 143 break; 144 } 145 switch (bytesToWrite) { // note: everything falls through. 146 case 4: 147 *--target = static_cast<char>((ch | byteMark) & byteMask); 148 ch >>= 6; 149 case 3: 150 *--target = static_cast<char>((ch | byteMark) & byteMask); 151 ch >>= 6; 152 case 2: 153 *--target = static_cast<char>((ch | byteMark) & byteMask); 154 ch >>= 6; 155 case 1: 156 *--target = static_cast<char>(ch | firstByteMark[bytesToWrite]); 157 } 158 target += bytesToWrite; 159 } 160 *sourceStart = source; 161 *targetStart = target; 162 return result; 163 } 164 165 /** 166 * Is this code point a BMP code point (U+0000..U+ffff)? 167 * @param c 32-bit code point 168 * @return TRUE or FALSE 169 * @stable ICU 2.8 170 */ 171 #define U_IS_BMP(c) ((uint32_t)(c) <= 0xffff) 172 173 /** 174 * Is this code point a supplementary code point (U+10000..U+10ffff)? 175 * @param c 32-bit code point 176 * @return TRUE or FALSE 177 * @stable ICU 2.8 178 */ 179 #define U_IS_SUPPLEMENTARY(c) ((uint32_t)((c)-0x10000) <= 0xfffff) 180 181 /** 182 * Is this code point a surrogate (U+d800..U+dfff)? 183 * @param c 32-bit code point 184 * @return TRUE or FALSE 185 * @stable ICU 2.4 186 */ 187 #define U_IS_SURROGATE(c) (((c)&0xfffff800) == 0xd800) 188 189 /** 190 * Get the lead surrogate (0xd800..0xdbff) for a 191 * supplementary code point (0x10000..0x10ffff). 192 * @param supplementary 32-bit code point (U+10000..U+10ffff) 193 * @return lead surrogate (U+d800..U+dbff) for supplementary 194 * @stable ICU 2.4 195 */ 196 #define U16_LEAD(supplementary) (UChar)(((supplementary) >> 10) + 0xd7c0) 197 198 /** 199 * Get the trail surrogate (0xdc00..0xdfff) for a 200 * supplementary code point (0x10000..0x10ffff). 201 * @param supplementary 32-bit code point (U+10000..U+10ffff) 202 * @return trail surrogate (U+dc00..U+dfff) for supplementary 203 * @stable ICU 2.4 204 */ 205 #define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff) | 0xdc00) 206 207 // This must be called with the length pre-determined by the first byte. 208 // If presented with a length > 4, this returns false. The Unicode 209 // definition of UTF-8 goes up to 4-byte sequences. 210 static bool isLegalUTF8(const unsigned char* source, int length) { 211 unsigned char a; 212 const unsigned char* srcptr = source + length; 213 switch (length) { 214 default: 215 return false; 216 // Everything else falls through when "true"... 217 case 4: 218 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 219 case 3: 220 if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 221 case 2: 222 if ((a = (*--srcptr)) > 0xBF) return false; 223 224 // no fall-through in this inner switch 225 switch (*source) { 226 case 0xE0: 227 if (a < 0xA0) return false; 228 break; 229 case 0xED: 230 if (a > 0x9F) return false; 231 break; 232 case 0xF0: 233 if (a < 0x90) return false; 234 break; 235 case 0xF4: 236 if (a > 0x8F) return false; 237 break; 238 default: 239 if (a < 0x80) return false; 240 } 241 242 case 1: 243 if (*source >= 0x80 && *source < 0xC2) return false; 244 } 245 if (*source > 0xF4) return false; 246 return true; 247 } 248 249 // Magic values subtracted from a buffer value during UTF8 conversion. 250 // This table contains as many values as there might be trailing bytes 251 // in a UTF-8 sequence. 252 static const UChar32 offsetsFromUTF8[6] = {0x00000000UL, 253 0x00003080UL, 254 0x000E2080UL, 255 0x03C82080UL, 256 static_cast<UChar32>(0xFA082080UL), 257 static_cast<UChar32>(0x82082080UL)}; 258 259 static inline UChar32 readUTF8Sequence(const char*& sequence, size_t length) { 260 UChar32 character = 0; 261 262 // The cases all fall through. 263 switch (length) { 264 case 6: 265 character += static_cast<unsigned char>(*sequence++); 266 character <<= 6; 267 case 5: 268 character += static_cast<unsigned char>(*sequence++); 269 character <<= 6; 270 case 4: 271 character += static_cast<unsigned char>(*sequence++); 272 character <<= 6; 273 case 3: 274 character += static_cast<unsigned char>(*sequence++); 275 character <<= 6; 276 case 2: 277 character += static_cast<unsigned char>(*sequence++); 278 character <<= 6; 279 case 1: 280 character += static_cast<unsigned char>(*sequence++); 281 } 282 283 return character - offsetsFromUTF8[length - 1]; 284 } 285 286 ConversionResult convertUTF8ToUTF16(const char** sourceStart, 287 const char* sourceEnd, UChar** targetStart, 288 UChar* targetEnd, bool* sourceAllASCII, 289 bool strict) { 290 ConversionResult result = conversionOK; 291 const char* source = *sourceStart; 292 UChar* target = *targetStart; 293 UChar orAllData = 0; 294 while (source < sourceEnd) { 295 int utf8SequenceLength = inlineUTF8SequenceLength(*source); 296 if (sourceEnd - source < utf8SequenceLength) { 297 result = sourceExhausted; 298 break; 299 } 300 // Do this check whether lenient or strict 301 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), 302 utf8SequenceLength)) { 303 result = sourceIllegal; 304 break; 305 } 306 307 UChar32 character = readUTF8Sequence(source, utf8SequenceLength); 308 309 if (target >= targetEnd) { 310 source -= utf8SequenceLength; // Back up source pointer! 311 result = targetExhausted; 312 break; 313 } 314 315 if (U_IS_BMP(character)) { 316 // UTF-16 surrogate values are illegal in UTF-32 317 if (U_IS_SURROGATE(character)) { 318 if (strict) { 319 source -= utf8SequenceLength; // return to the illegal value itself 320 result = sourceIllegal; 321 break; 322 } 323 *target++ = replacementCharacter; 324 orAllData |= replacementCharacter; 325 } else { 326 *target++ = static_cast<UChar>(character); // normal case 327 orAllData |= character; 328 } 329 } else if (U_IS_SUPPLEMENTARY(character)) { 330 // target is a character in range 0xFFFF - 0x10FFFF 331 if (target + 1 >= targetEnd) { 332 source -= utf8SequenceLength; // Back up source pointer! 333 result = targetExhausted; 334 break; 335 } 336 *target++ = U16_LEAD(character); 337 *target++ = U16_TRAIL(character); 338 orAllData = 0xffff; 339 } else { 340 if (strict) { 341 source -= utf8SequenceLength; // return to the start 342 result = sourceIllegal; 343 break; // Bail out; shouldn't continue 344 } else { 345 *target++ = replacementCharacter; 346 orAllData |= replacementCharacter; 347 } 348 } 349 } 350 *sourceStart = source; 351 *targetStart = target; 352 353 if (sourceAllASCII) *sourceAllASCII = !(orAllData & ~0x7f); 354 355 return result; 356 } 357 358 // Helper to write a three-byte UTF-8 code point to the buffer, caller must 359 // check room is available. 360 static inline void putUTF8Triple(char*& buffer, UChar ch) { 361 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); 362 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); 363 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); 364 } 365 366 } // namespace 367 368 // static 369 String16 String16::fromInteger(int number) { 370 const size_t kBufferSize = 50; 371 char buffer[kBufferSize]; 372 v8::base::OS::SNPrintF(buffer, kBufferSize, "%d", number); 373 return String16(buffer); 374 } 375 376 // static 377 String16 String16::fromInteger(size_t number) { 378 const size_t kBufferSize = 50; 379 char buffer[kBufferSize]; 380 #if !defined(_WIN32) && !defined(_WIN64) 381 v8::base::OS::SNPrintF(buffer, kBufferSize, "%zu", number); 382 #else 383 v8::base::OS::SNPrintF(buffer, kBufferSize, "%Iu", number); 384 #endif 385 return String16(buffer); 386 } 387 388 // static 389 String16 String16::fromDouble(double number) { 390 std::ostringstream s; 391 s.imbue(std::locale("C")); 392 s << std::fixed << std::setprecision(std::numeric_limits<double>::digits10) 393 << number; 394 return String16(s.str().c_str()); 395 } 396 397 // static 398 String16 String16::fromDouble(double number, int precision) { 399 std::ostringstream s; 400 s.imbue(std::locale("C")); 401 s << std::fixed << std::setprecision(precision) << number; 402 return String16(s.str().c_str()); 403 } 404 405 int String16::toInteger(bool* ok) const { 406 return charactersToInteger(characters16(), length(), ok); 407 } 408 409 String16 String16::stripWhiteSpace() const { 410 if (!length()) return String16(); 411 412 size_t start = 0; 413 size_t end = length() - 1; 414 415 // skip white space from start 416 while (start <= end && isSpaceOrNewLine(characters16()[start])) ++start; 417 418 // only white space 419 if (start > end) return String16(); 420 421 // skip white space from end 422 while (end && isSpaceOrNewLine(characters16()[end])) --end; 423 424 if (!start && end == length() - 1) return *this; 425 return String16(characters16() + start, end + 1 - start); 426 } 427 428 String16Builder::String16Builder() {} 429 430 void String16Builder::append(const String16& s) { 431 m_buffer.insert(m_buffer.end(), s.characters16(), 432 s.characters16() + s.length()); 433 } 434 435 void String16Builder::append(UChar c) { m_buffer.push_back(c); } 436 437 void String16Builder::append(char c) { 438 UChar u = c; 439 m_buffer.push_back(u); 440 } 441 442 void String16Builder::append(const UChar* characters, size_t length) { 443 m_buffer.insert(m_buffer.end(), characters, characters + length); 444 } 445 446 void String16Builder::append(const char* characters, size_t length) { 447 m_buffer.insert(m_buffer.end(), characters, characters + length); 448 } 449 450 void String16Builder::appendNumber(int number) { 451 const int kBufferSize = 11; 452 char buffer[kBufferSize]; 453 int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%d", number); 454 DCHECK_GT(kBufferSize, chars); 455 m_buffer.insert(m_buffer.end(), buffer, buffer + chars); 456 } 457 458 void String16Builder::appendNumber(size_t number) { 459 const int kBufferSize = 20; 460 char buffer[kBufferSize]; 461 #if !defined(_WIN32) && !defined(_WIN64) 462 int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%zu", number); 463 #else 464 int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%Iu", number); 465 #endif 466 DCHECK_GT(kBufferSize, chars); 467 m_buffer.insert(m_buffer.end(), buffer, buffer + chars); 468 } 469 470 String16 String16Builder::toString() { 471 return String16(m_buffer.data(), m_buffer.size()); 472 } 473 474 void String16Builder::reserveCapacity(size_t capacity) { 475 m_buffer.reserve(capacity); 476 } 477 478 String16 String16::fromUTF8(const char* stringStart, size_t length) { 479 if (!stringStart || !length) return String16(); 480 481 std::vector<UChar> buffer(length); 482 UChar* bufferStart = buffer.data(); 483 484 UChar* bufferCurrent = bufferStart; 485 const char* stringCurrent = stringStart; 486 if (convertUTF8ToUTF16(&stringCurrent, stringStart + length, &bufferCurrent, 487 bufferCurrent + buffer.size(), 0, 488 true) != conversionOK) 489 return String16(); 490 491 size_t utf16Length = bufferCurrent - bufferStart; 492 return String16(bufferStart, utf16Length); 493 } 494 495 std::string String16::utf8() const { 496 size_t length = this->length(); 497 498 if (!length) return std::string(""); 499 500 // Allocate a buffer big enough to hold all the characters 501 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). 502 // Optimization ideas, if we find this function is hot: 503 // * We could speculatively create a CStringBuffer to contain 'length' 504 // characters, and resize if necessary (i.e. if the buffer contains 505 // non-ascii characters). (Alternatively, scan the buffer first for 506 // ascii characters, so we know this will be sufficient). 507 // * We could allocate a CStringBuffer with an appropriate size to 508 // have a good chance of being able to write the string into the 509 // buffer without reallocing (say, 1.5 x length). 510 if (length > std::numeric_limits<unsigned>::max() / 3) return std::string(); 511 std::vector<char> bufferVector(length * 3); 512 char* buffer = bufferVector.data(); 513 const UChar* characters = m_impl.data(); 514 515 ConversionResult result = 516 convertUTF16ToUTF8(&characters, characters + length, &buffer, 517 buffer + bufferVector.size(), false); 518 DCHECK( 519 result != 520 targetExhausted); // (length * 3) should be sufficient for any conversion 521 522 // Only produced from strict conversion. 523 DCHECK(result != sourceIllegal); 524 525 // Check for an unconverted high surrogate. 526 if (result == sourceExhausted) { 527 // This should be one unpaired high surrogate. Treat it the same 528 // was as an unpaired high surrogate would have been handled in 529 // the middle of a string with non-strict conversion - which is 530 // to say, simply encode it to UTF-8. 531 DCHECK((characters + 1) == (m_impl.data() + length)); 532 DCHECK((*characters >= 0xD800) && (*characters <= 0xDBFF)); 533 // There should be room left, since one UChar hasn't been 534 // converted. 535 DCHECK((buffer + 3) <= (buffer + bufferVector.size())); 536 putUTF8Triple(buffer, *characters); 537 } 538 539 return std::string(bufferVector.data(), buffer - bufferVector.data()); 540 } 541 542 } // namespace v8_inspector 543