Home | History | Annotate | Download | only in strings
      1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "base/strings/utf_string_conversion_utils.h"
      6 
      7 #include "base/third_party/icu/icu_utf.h"
      8 
      9 namespace base {
     10 
     11 // ReadUnicodeCharacter --------------------------------------------------------
     12 
     13 bool ReadUnicodeCharacter(const char* src,
     14                           int32_t src_len,
     15                           int32_t* char_index,
     16                           uint32_t* code_point_out) {
     17   // U8_NEXT expects to be able to use -1 to signal an error, so we must
     18   // use a signed type for code_point.  But this function returns false
     19   // on error anyway, so code_point_out is unsigned.
     20   int32_t code_point;
     21   CBU8_NEXT(src, *char_index, src_len, code_point);
     22   *code_point_out = static_cast<uint32_t>(code_point);
     23 
     24   // The ICU macro above moves to the next char, we want to point to the last
     25   // char consumed.
     26   (*char_index)--;
     27 
     28   // Validate the decoded value.
     29   return IsValidCodepoint(code_point);
     30 }
     31 
     32 bool ReadUnicodeCharacter(const char16* src,
     33                           int32_t src_len,
     34                           int32_t* char_index,
     35                           uint32_t* code_point) {
     36   if (CBU16_IS_SURROGATE(src[*char_index])) {
     37     if (!CBU16_IS_SURROGATE_LEAD(src[*char_index]) ||
     38         *char_index + 1 >= src_len ||
     39         !CBU16_IS_TRAIL(src[*char_index + 1])) {
     40       // Invalid surrogate pair.
     41       return false;
     42     }
     43 
     44     // Valid surrogate pair.
     45     *code_point = CBU16_GET_SUPPLEMENTARY(src[*char_index],
     46                                           src[*char_index + 1]);
     47     (*char_index)++;
     48   } else {
     49     // Not a surrogate, just one 16-bit word.
     50     *code_point = src[*char_index];
     51   }
     52 
     53   return IsValidCodepoint(*code_point);
     54 }
     55 
     56 #if defined(WCHAR_T_IS_UTF32)
     57 bool ReadUnicodeCharacter(const wchar_t* src,
     58                           int32_t /* src_len */,
     59                           int32_t* char_index,
     60                           uint32_t* code_point) {
     61   // Conversion is easy since the source is 32-bit.
     62   *code_point = src[*char_index];
     63 
     64   // Validate the value.
     65   return IsValidCodepoint(*code_point);
     66 }
     67 #endif  // defined(WCHAR_T_IS_UTF32)
     68 
     69 // WriteUnicodeCharacter -------------------------------------------------------
     70 
     71 size_t WriteUnicodeCharacter(uint32_t code_point, std::string* output) {
     72   if (code_point <= 0x7f) {
     73     // Fast path the common case of one byte.
     74     output->push_back(static_cast<char>(code_point));
     75     return 1;
     76   }
     77 
     78 
     79   // CBU8_APPEND_UNSAFE can append up to 4 bytes.
     80   size_t char_offset = output->length();
     81   size_t original_char_offset = char_offset;
     82   output->resize(char_offset + CBU8_MAX_LENGTH);
     83 
     84   CBU8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
     85 
     86   // CBU8_APPEND_UNSAFE will advance our pointer past the inserted character, so
     87   // it will represent the new length of the string.
     88   output->resize(char_offset);
     89   return char_offset - original_char_offset;
     90 }
     91 
     92 size_t WriteUnicodeCharacter(uint32_t code_point, string16* output) {
     93   if (CBU16_LENGTH(code_point) == 1) {
     94     // Thie code point is in the Basic Multilingual Plane (BMP).
     95     output->push_back(static_cast<char16>(code_point));
     96     return 1;
     97   }
     98   // Non-BMP characters use a double-character encoding.
     99   size_t char_offset = output->length();
    100   output->resize(char_offset + CBU16_MAX_LENGTH);
    101   CBU16_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
    102   return CBU16_MAX_LENGTH;
    103 }
    104 
    105 // Generalized Unicode converter -----------------------------------------------
    106 
    107 template<typename CHAR>
    108 void PrepareForUTF8Output(const CHAR* src,
    109                           size_t src_len,
    110                           std::string* output) {
    111   output->clear();
    112   if (src_len == 0)
    113     return;
    114   if (src[0] < 0x80) {
    115     // Assume that the entire input will be ASCII.
    116     output->reserve(src_len);
    117   } else {
    118     // Assume that the entire input is non-ASCII and will have 3 bytes per char.
    119     output->reserve(src_len * 3);
    120   }
    121 }
    122 
    123 // Instantiate versions we know callers will need.
    124 template void PrepareForUTF8Output(const wchar_t*, size_t, std::string*);
    125 template void PrepareForUTF8Output(const char16*, size_t, std::string*);
    126 
    127 template<typename STRING>
    128 void PrepareForUTF16Or32Output(const char* src,
    129                                size_t src_len,
    130                                STRING* output) {
    131   output->clear();
    132   if (src_len == 0)
    133     return;
    134   if (static_cast<unsigned char>(src[0]) < 0x80) {
    135     // Assume the input is all ASCII, which means 1:1 correspondence.
    136     output->reserve(src_len);
    137   } else {
    138     // Otherwise assume that the UTF-8 sequences will have 2 bytes for each
    139     // character.
    140     output->reserve(src_len / 2);
    141   }
    142 }
    143 
    144 // Instantiate versions we know callers will need.
    145 template void PrepareForUTF16Or32Output(const char*, size_t, std::wstring*);
    146 template void PrepareForUTF16Or32Output(const char*, size_t, string16*);
    147 
    148 }  // namespace base
    149