Home | History | Annotate | Download | only in inspector
      1 // Copyright 2016 the V8 project authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "src/inspector/string-16.h"
      6 
      7 #include <algorithm>
      8 #include <cctype>
      9 #include <cstdlib>
     10 #include <cstring>
     11 #include <limits>
     12 #include <string>
     13 
     14 #include "src/base/platform/platform.h"
     15 #include "src/conversions.h"
     16 
     17 namespace v8_inspector {
     18 
     19 namespace {
     20 
     21 bool isASCII(UChar c) { return !(c & ~0x7F); }
     22 
     23 bool isSpaceOrNewLine(UChar c) {
     24   return isASCII(c) && c <= ' ' && (c == ' ' || (c <= 0xD && c >= 0x9));
     25 }
     26 
     27 int charactersToInteger(const UChar* characters, size_t length,
     28                         bool* ok = nullptr) {
     29   std::vector<char> buffer;
     30   buffer.reserve(length + 1);
     31   for (size_t i = 0; i < length; ++i) {
     32     if (!isASCII(characters[i])) {
     33       if (ok) *ok = false;
     34       return 0;
     35     }
     36     buffer.push_back(static_cast<char>(characters[i]));
     37   }
     38   buffer.push_back('\0');
     39 
     40   char* endptr;
     41   int64_t result =
     42       static_cast<int64_t>(std::strtol(buffer.data(), &endptr, 10));
     43   if (ok) {
     44     *ok = !(*endptr) && result <= std::numeric_limits<int>::max() &&
     45           result >= std::numeric_limits<int>::min();
     46   }
     47   return static_cast<int>(result);
     48 }
     49 
     50 const UChar replacementCharacter = 0xFFFD;
     51 using UChar32 = uint32_t;
     52 
     53 inline int inlineUTF8SequenceLengthNonASCII(char b0) {
     54   if ((b0 & 0xC0) != 0xC0) return 0;
     55   if ((b0 & 0xE0) == 0xC0) return 2;
     56   if ((b0 & 0xF0) == 0xE0) return 3;
     57   if ((b0 & 0xF8) == 0xF0) return 4;
     58   return 0;
     59 }
     60 
     61 inline int inlineUTF8SequenceLength(char b0) {
     62   return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
     63 }
     64 
     65 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
     66 // into the first byte, depending on how many bytes follow.  There are
     67 // as many entries in this table as there are UTF-8 sequence types.
     68 // (I.e., one byte sequence, two byte... etc.). Remember that sequences
     69 // for *legal* UTF-8 will be 4 or fewer bytes total.
     70 static const unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0,
     71                                                0xF0, 0xF8, 0xFC};
     72 
     73 typedef enum {
     74   conversionOK,     // conversion successful
     75   sourceExhausted,  // partial character in source, but hit end
     76   targetExhausted,  // insuff. room in target for conversion
     77   sourceIllegal     // source sequence is illegal/malformed
     78 } ConversionResult;
     79 
     80 ConversionResult convertUTF16ToUTF8(const UChar** sourceStart,
     81                                     const UChar* sourceEnd, char** targetStart,
     82                                     char* targetEnd, bool strict) {
     83   ConversionResult result = conversionOK;
     84   const UChar* source = *sourceStart;
     85   char* target = *targetStart;
     86   while (source < sourceEnd) {
     87     UChar32 ch;
     88     uint32_t bytesToWrite = 0;
     89     const UChar32 byteMask = 0xBF;
     90     const UChar32 byteMark = 0x80;
     91     const UChar* oldSource =
     92         source;  // In case we have to back up because of target overflow.
     93     ch = static_cast<uint16_t>(*source++);
     94     // If we have a surrogate pair, convert to UChar32 first.
     95     if (ch >= 0xD800 && ch <= 0xDBFF) {
     96       // If the 16 bits following the high surrogate are in the source buffer...
     97       if (source < sourceEnd) {
     98         UChar32 ch2 = static_cast<uint16_t>(*source);
     99         // If it's a low surrogate, convert to UChar32.
    100         if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
    101           ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
    102           ++source;
    103         } else if (strict) {  // it's an unpaired high surrogate
    104           --source;           // return to the illegal value itself
    105           result = sourceIllegal;
    106           break;
    107         }
    108       } else {     // We don't have the 16 bits following the high surrogate.
    109         --source;  // return to the high surrogate
    110         result = sourceExhausted;
    111         break;
    112       }
    113     } else if (strict) {
    114       // UTF-16 surrogate values are illegal in UTF-32
    115       if (ch >= 0xDC00 && ch <= 0xDFFF) {
    116         --source;  // return to the illegal value itself
    117         result = sourceIllegal;
    118         break;
    119       }
    120     }
    121     // Figure out how many bytes the result will require
    122     if (ch < (UChar32)0x80) {
    123       bytesToWrite = 1;
    124     } else if (ch < (UChar32)0x800) {
    125       bytesToWrite = 2;
    126     } else if (ch < (UChar32)0x10000) {
    127       bytesToWrite = 3;
    128     } else if (ch < (UChar32)0x110000) {
    129       bytesToWrite = 4;
    130     } else {
    131       bytesToWrite = 3;
    132       ch = replacementCharacter;
    133     }
    134 
    135     target += bytesToWrite;
    136     if (target > targetEnd) {
    137       source = oldSource;  // Back up source pointer!
    138       target -= bytesToWrite;
    139       result = targetExhausted;
    140       break;
    141     }
    142     switch (bytesToWrite) {  // note: everything falls through.
    143       case 4:
    144         *--target = static_cast<char>((ch | byteMark) & byteMask);
    145         ch >>= 6;
    146       case 3:
    147         *--target = static_cast<char>((ch | byteMark) & byteMask);
    148         ch >>= 6;
    149       case 2:
    150         *--target = static_cast<char>((ch | byteMark) & byteMask);
    151         ch >>= 6;
    152       case 1:
    153         *--target = static_cast<char>(ch | firstByteMark[bytesToWrite]);
    154     }
    155     target += bytesToWrite;
    156   }
    157   *sourceStart = source;
    158   *targetStart = target;
    159   return result;
    160 }
    161 
    162 /**
    163  * Is this code point a BMP code point (U+0000..U+ffff)?
    164  * @param c 32-bit code point
    165  * @return TRUE or FALSE
    166  * @stable ICU 2.8
    167  */
    168 #define U_IS_BMP(c) ((uint32_t)(c) <= 0xffff)
    169 
    170 /**
    171  * Is this code point a supplementary code point (U+10000..U+10ffff)?
    172  * @param c 32-bit code point
    173  * @return TRUE or FALSE
    174  * @stable ICU 2.8
    175  */
    176 #define U_IS_SUPPLEMENTARY(c) ((uint32_t)((c)-0x10000) <= 0xfffff)
    177 
    178 /**
    179  * Is this code point a surrogate (U+d800..U+dfff)?
    180  * @param c 32-bit code point
    181  * @return TRUE or FALSE
    182  * @stable ICU 2.4
    183  */
    184 #define U_IS_SURROGATE(c) (((c)&0xfffff800) == 0xd800)
    185 
    186 /**
    187  * Get the lead surrogate (0xd800..0xdbff) for a
    188  * supplementary code point (0x10000..0x10ffff).
    189  * @param supplementary 32-bit code point (U+10000..U+10ffff)
    190  * @return lead surrogate (U+d800..U+dbff) for supplementary
    191  * @stable ICU 2.4
    192  */
    193 #define U16_LEAD(supplementary) (UChar)(((supplementary) >> 10) + 0xd7c0)
    194 
    195 /**
    196  * Get the trail surrogate (0xdc00..0xdfff) for a
    197  * supplementary code point (0x10000..0x10ffff).
    198  * @param supplementary 32-bit code point (U+10000..U+10ffff)
    199  * @return trail surrogate (U+dc00..U+dfff) for supplementary
    200  * @stable ICU 2.4
    201  */
    202 #define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff) | 0xdc00)
    203 
    204 // This must be called with the length pre-determined by the first byte.
    205 // If presented with a length > 4, this returns false.  The Unicode
    206 // definition of UTF-8 goes up to 4-byte sequences.
    207 static bool isLegalUTF8(const unsigned char* source, int length) {
    208   unsigned char a;
    209   const unsigned char* srcptr = source + length;
    210   switch (length) {
    211     default:
    212       return false;
    213     // Everything else falls through when "true"...
    214     case 4:
    215       if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
    216     case 3:
    217       if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
    218     case 2:
    219       if ((a = (*--srcptr)) > 0xBF) return false;
    220 
    221       // no fall-through in this inner switch
    222       switch (*source) {
    223         case 0xE0:
    224           if (a < 0xA0) return false;
    225           break;
    226         case 0xED:
    227           if (a > 0x9F) return false;
    228           break;
    229         case 0xF0:
    230           if (a < 0x90) return false;
    231           break;
    232         case 0xF4:
    233           if (a > 0x8F) return false;
    234           break;
    235         default:
    236           if (a < 0x80) return false;
    237       }
    238 
    239     case 1:
    240       if (*source >= 0x80 && *source < 0xC2) return false;
    241   }
    242   if (*source > 0xF4) return false;
    243   return true;
    244 }
    245 
    246 // Magic values subtracted from a buffer value during UTF8 conversion.
    247 // This table contains as many values as there might be trailing bytes
    248 // in a UTF-8 sequence.
    249 static const UChar32 offsetsFromUTF8[6] = {0x00000000UL,
    250                                            0x00003080UL,
    251                                            0x000E2080UL,
    252                                            0x03C82080UL,
    253                                            static_cast<UChar32>(0xFA082080UL),
    254                                            static_cast<UChar32>(0x82082080UL)};
    255 
    256 static inline UChar32 readUTF8Sequence(const char*& sequence, size_t length) {
    257   UChar32 character = 0;
    258 
    259   // The cases all fall through.
    260   switch (length) {
    261     case 6:
    262       character += static_cast<unsigned char>(*sequence++);
    263       character <<= 6;
    264     case 5:
    265       character += static_cast<unsigned char>(*sequence++);
    266       character <<= 6;
    267     case 4:
    268       character += static_cast<unsigned char>(*sequence++);
    269       character <<= 6;
    270     case 3:
    271       character += static_cast<unsigned char>(*sequence++);
    272       character <<= 6;
    273     case 2:
    274       character += static_cast<unsigned char>(*sequence++);
    275       character <<= 6;
    276     case 1:
    277       character += static_cast<unsigned char>(*sequence++);
    278   }
    279 
    280   return character - offsetsFromUTF8[length - 1];
    281 }
    282 
    283 ConversionResult convertUTF8ToUTF16(const char** sourceStart,
    284                                     const char* sourceEnd, UChar** targetStart,
    285                                     UChar* targetEnd, bool* sourceAllASCII,
    286                                     bool strict) {
    287   ConversionResult result = conversionOK;
    288   const char* source = *sourceStart;
    289   UChar* target = *targetStart;
    290   UChar orAllData = 0;
    291   while (source < sourceEnd) {
    292     int utf8SequenceLength = inlineUTF8SequenceLength(*source);
    293     if (sourceEnd - source < utf8SequenceLength) {
    294       result = sourceExhausted;
    295       break;
    296     }
    297     // Do this check whether lenient or strict
    298     if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source),
    299                      utf8SequenceLength)) {
    300       result = sourceIllegal;
    301       break;
    302     }
    303 
    304     UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
    305 
    306     if (target >= targetEnd) {
    307       source -= utf8SequenceLength;  // Back up source pointer!
    308       result = targetExhausted;
    309       break;
    310     }
    311 
    312     if (U_IS_BMP(character)) {
    313       // UTF-16 surrogate values are illegal in UTF-32
    314       if (U_IS_SURROGATE(character)) {
    315         if (strict) {
    316           source -= utf8SequenceLength;  // return to the illegal value itself
    317           result = sourceIllegal;
    318           break;
    319         }
    320         *target++ = replacementCharacter;
    321         orAllData |= replacementCharacter;
    322       } else {
    323         *target++ = static_cast<UChar>(character);  // normal case
    324         orAllData |= character;
    325       }
    326     } else if (U_IS_SUPPLEMENTARY(character)) {
    327       // target is a character in range 0xFFFF - 0x10FFFF
    328       if (target + 1 >= targetEnd) {
    329         source -= utf8SequenceLength;  // Back up source pointer!
    330         result = targetExhausted;
    331         break;
    332       }
    333       *target++ = U16_LEAD(character);
    334       *target++ = U16_TRAIL(character);
    335       orAllData = 0xffff;
    336     } else {
    337       if (strict) {
    338         source -= utf8SequenceLength;  // return to the start
    339         result = sourceIllegal;
    340         break;  // Bail out; shouldn't continue
    341       } else {
    342         *target++ = replacementCharacter;
    343         orAllData |= replacementCharacter;
    344       }
    345     }
    346   }
    347   *sourceStart = source;
    348   *targetStart = target;
    349 
    350   if (sourceAllASCII) *sourceAllASCII = !(orAllData & ~0x7f);
    351 
    352   return result;
    353 }
    354 
    355 // Helper to write a three-byte UTF-8 code point to the buffer, caller must
    356 // check room is available.
    357 static inline void putUTF8Triple(char*& buffer, UChar ch) {
    358   *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
    359   *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
    360   *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
    361 }
    362 
    363 }  // namespace
    364 
    365 // static
    366 String16 String16::fromInteger(int number) {
    367   char arr[50];
    368   v8::internal::Vector<char> buffer(arr, arraysize(arr));
    369   return String16(IntToCString(number, buffer));
    370 }
    371 
    372 // static
    373 String16 String16::fromInteger(size_t number) {
    374   const size_t kBufferSize = 50;
    375   char buffer[kBufferSize];
    376 #if !defined(_WIN32) && !defined(_WIN64)
    377   v8::base::OS::SNPrintF(buffer, kBufferSize, "%zu", number);
    378 #else
    379   v8::base::OS::SNPrintF(buffer, kBufferSize, "%Iu", number);
    380 #endif
    381   return String16(buffer);
    382 }
    383 
    384 // static
    385 String16 String16::fromDouble(double number) {
    386   char arr[50];
    387   v8::internal::Vector<char> buffer(arr, arraysize(arr));
    388   return String16(DoubleToCString(number, buffer));
    389 }
    390 
    391 // static
    392 String16 String16::fromDouble(double number, int precision) {
    393   std::unique_ptr<char[]> str(
    394       v8::internal::DoubleToPrecisionCString(number, precision));
    395   return String16(str.get());
    396 }
    397 
    398 int String16::toInteger(bool* ok) const {
    399   return charactersToInteger(characters16(), length(), ok);
    400 }
    401 
    402 String16 String16::stripWhiteSpace() const {
    403   if (!length()) return String16();
    404 
    405   size_t start = 0;
    406   size_t end = length() - 1;
    407 
    408   // skip white space from start
    409   while (start <= end && isSpaceOrNewLine(characters16()[start])) ++start;
    410 
    411   // only white space
    412   if (start > end) return String16();
    413 
    414   // skip white space from end
    415   while (end && isSpaceOrNewLine(characters16()[end])) --end;
    416 
    417   if (!start && end == length() - 1) return *this;
    418   return String16(characters16() + start, end + 1 - start);
    419 }
    420 
    421 String16Builder::String16Builder() {}
    422 
    423 void String16Builder::append(const String16& s) {
    424   m_buffer.insert(m_buffer.end(), s.characters16(),
    425                   s.characters16() + s.length());
    426 }
    427 
    428 void String16Builder::append(UChar c) { m_buffer.push_back(c); }
    429 
    430 void String16Builder::append(char c) {
    431   UChar u = c;
    432   m_buffer.push_back(u);
    433 }
    434 
    435 void String16Builder::append(const UChar* characters, size_t length) {
    436   m_buffer.insert(m_buffer.end(), characters, characters + length);
    437 }
    438 
    439 void String16Builder::append(const char* characters, size_t length) {
    440   m_buffer.insert(m_buffer.end(), characters, characters + length);
    441 }
    442 
    443 void String16Builder::appendNumber(int number) {
    444   const int kBufferSize = 11;
    445   char buffer[kBufferSize];
    446   int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%d", number);
    447   DCHECK_GT(kBufferSize, chars);
    448   m_buffer.insert(m_buffer.end(), buffer, buffer + chars);
    449 }
    450 
    451 void String16Builder::appendNumber(size_t number) {
    452   const int kBufferSize = 20;
    453   char buffer[kBufferSize];
    454 #if !defined(_WIN32) && !defined(_WIN64)
    455   int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%zu", number);
    456 #else
    457   int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%Iu", number);
    458 #endif
    459   DCHECK_GT(kBufferSize, chars);
    460   m_buffer.insert(m_buffer.end(), buffer, buffer + chars);
    461 }
    462 
    463 String16 String16Builder::toString() {
    464   return String16(m_buffer.data(), m_buffer.size());
    465 }
    466 
    467 void String16Builder::reserveCapacity(size_t capacity) {
    468   m_buffer.reserve(capacity);
    469 }
    470 
    471 String16 String16::fromUTF8(const char* stringStart, size_t length) {
    472   if (!stringStart || !length) return String16();
    473 
    474   std::vector<UChar> buffer(length);
    475   UChar* bufferStart = buffer.data();
    476 
    477   UChar* bufferCurrent = bufferStart;
    478   const char* stringCurrent = stringStart;
    479   if (convertUTF8ToUTF16(&stringCurrent, stringStart + length, &bufferCurrent,
    480                          bufferCurrent + buffer.size(), 0,
    481                          true) != conversionOK)
    482     return String16();
    483 
    484   size_t utf16Length = bufferCurrent - bufferStart;
    485   return String16(bufferStart, utf16Length);
    486 }
    487 
    488 std::string String16::utf8() const {
    489   size_t length = this->length();
    490 
    491   if (!length) return std::string("");
    492 
    493   // Allocate a buffer big enough to hold all the characters
    494   // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
    495   // Optimization ideas, if we find this function is hot:
    496   //  * We could speculatively create a CStringBuffer to contain 'length'
    497   //    characters, and resize if necessary (i.e. if the buffer contains
    498   //    non-ascii characters). (Alternatively, scan the buffer first for
    499   //    ascii characters, so we know this will be sufficient).
    500   //  * We could allocate a CStringBuffer with an appropriate size to
    501   //    have a good chance of being able to write the string into the
    502   //    buffer without reallocing (say, 1.5 x length).
    503   if (length > std::numeric_limits<unsigned>::max() / 3) return std::string();
    504   std::vector<char> bufferVector(length * 3);
    505   char* buffer = bufferVector.data();
    506   const UChar* characters = m_impl.data();
    507 
    508   ConversionResult result =
    509       convertUTF16ToUTF8(&characters, characters + length, &buffer,
    510                          buffer + bufferVector.size(), false);
    511   DCHECK(
    512       result !=
    513       targetExhausted);  // (length * 3) should be sufficient for any conversion
    514 
    515   // Only produced from strict conversion.
    516   DCHECK(result != sourceIllegal);
    517 
    518   // Check for an unconverted high surrogate.
    519   if (result == sourceExhausted) {
    520     // This should be one unpaired high surrogate. Treat it the same
    521     // was as an unpaired high surrogate would have been handled in
    522     // the middle of a string with non-strict conversion - which is
    523     // to say, simply encode it to UTF-8.
    524     DCHECK((characters + 1) == (m_impl.data() + length));
    525     DCHECK((*characters >= 0xD800) && (*characters <= 0xDBFF));
    526     // There should be room left, since one UChar hasn't been
    527     // converted.
    528     DCHECK((buffer + 3) <= (buffer + bufferVector.size()));
    529     putUTF8Triple(buffer, *characters);
    530   }
    531 
    532   return std::string(bufferVector.data(), buffer - bufferVector.data());
    533 }
    534 
    535 }  // namespace v8_inspector
    536