1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "base/utf_offset_string_conversions.h" 6 7 #include <algorithm> 8 9 #include "base/scoped_ptr.h" 10 #include "base/string_piece.h" 11 #include "base/utf_string_conversion_utils.h" 12 13 using base::PrepareForUTF16Or32Output; 14 using base::ReadUnicodeCharacter; 15 using base::WriteUnicodeCharacter; 16 17 // Generalized Unicode converter ----------------------------------------------- 18 19 // Converts the given source Unicode character type to the given destination 20 // Unicode character type as a STL string. The given input buffer and size 21 // determine the source, and the given output STL string will be replaced by 22 // the result. 23 template<typename SRC_CHAR> 24 bool ConvertUnicode(const SRC_CHAR* src, 25 size_t src_len, 26 std::wstring* output, 27 std::vector<size_t>* offsets_for_adjustment) { 28 if (offsets_for_adjustment) { 29 std::for_each(offsets_for_adjustment->begin(), 30 offsets_for_adjustment->end(), 31 LimitOffset<std::wstring>(src_len)); 32 } 33 34 // ICU requires 32-bit numbers. 35 bool success = true; 36 AdjustOffset::Adjustments adjustments; 37 int32 src_len32 = static_cast<int32>(src_len); 38 for (int32 i = 0; i < src_len32; i++) { 39 uint32 code_point; 40 size_t original_i = i; 41 size_t chars_written = 0; 42 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { 43 chars_written = WriteUnicodeCharacter(code_point, output); 44 } else { 45 chars_written = WriteUnicodeCharacter(0xFFFD, output); 46 success = false; 47 } 48 if (offsets_for_adjustment) { 49 // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last 50 // character read, not after it (so that incrementing it in the loop 51 // increment will place it at the right location), so we need to account 52 // for that in determining the amount that was read. 53 adjustments.push_back(AdjustOffset::Adjustment( 54 original_i, i - original_i + 1, chars_written)); 55 } 56 } 57 58 // Make offset adjustment. 59 if (offsets_for_adjustment && !adjustments.empty()) { 60 std::for_each(offsets_for_adjustment->begin(), 61 offsets_for_adjustment->end(), 62 AdjustOffset(adjustments)); 63 } 64 65 return success; 66 } 67 68 // UTF-8 <-> Wide -------------------------------------------------------------- 69 70 bool UTF8ToWideAndAdjustOffset(const char* src, 71 size_t src_len, 72 std::wstring* output, 73 size_t* offset_for_adjustment) { 74 std::vector<size_t> offsets; 75 if (offset_for_adjustment) 76 offsets.push_back(*offset_for_adjustment); 77 PrepareForUTF16Or32Output(src, src_len, output); 78 bool ret = ConvertUnicode(src, src_len, output, &offsets); 79 if (offset_for_adjustment) 80 *offset_for_adjustment = offsets[0]; 81 return ret; 82 } 83 84 bool UTF8ToWideAndAdjustOffsets(const char* src, 85 size_t src_len, 86 std::wstring* output, 87 std::vector<size_t>* offsets_for_adjustment) { 88 PrepareForUTF16Or32Output(src, src_len, output); 89 return ConvertUnicode(src, src_len, output, offsets_for_adjustment); 90 } 91 92 std::wstring UTF8ToWideAndAdjustOffset(const base::StringPiece& utf8, 93 size_t* offset_for_adjustment) { 94 std::vector<size_t> offsets; 95 if (offset_for_adjustment) 96 offsets.push_back(*offset_for_adjustment); 97 std::wstring result; 98 UTF8ToWideAndAdjustOffsets(utf8.data(), utf8.length(), &result, 99 &offsets); 100 if (offset_for_adjustment) 101 *offset_for_adjustment = offsets[0]; 102 return result; 103 } 104 105 std::wstring UTF8ToWideAndAdjustOffsets(const base::StringPiece& utf8, 106 std::vector<size_t>* 107 offsets_for_adjustment) { 108 std::wstring result; 109 UTF8ToWideAndAdjustOffsets(utf8.data(), utf8.length(), &result, 110 offsets_for_adjustment); 111 return result; 112 } 113 114 // UTF-16 <-> Wide ------------------------------------------------------------- 115 116 #if defined(WCHAR_T_IS_UTF16) 117 118 // When wide == UTF-16, then conversions are a NOP. 119 bool UTF16ToWideAndAdjustOffset(const char16* src, 120 size_t src_len, 121 std::wstring* output, 122 size_t* offset_for_adjustment) { 123 output->assign(src, src_len); 124 if (offset_for_adjustment && (*offset_for_adjustment >= src_len)) 125 *offset_for_adjustment = std::wstring::npos; 126 return true; 127 } 128 129 bool UTF16ToWideAndAdjustOffsets(const char16* src, 130 size_t src_len, 131 std::wstring* output, 132 std::vector<size_t>* offsets_for_adjustment) { 133 output->assign(src, src_len); 134 if (offsets_for_adjustment) { 135 std::for_each(offsets_for_adjustment->begin(), 136 offsets_for_adjustment->end(), 137 LimitOffset<std::wstring>(src_len)); 138 } 139 return true; 140 } 141 142 std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, 143 size_t* offset_for_adjustment) { 144 if (offset_for_adjustment && (*offset_for_adjustment >= utf16.length())) 145 *offset_for_adjustment = std::wstring::npos; 146 return utf16; 147 } 148 149 std::wstring UTF16ToWideAndAdjustOffsets( 150 const string16& utf16, 151 std::vector<size_t>* offsets_for_adjustment) { 152 if (offsets_for_adjustment) { 153 std::for_each(offsets_for_adjustment->begin(), 154 offsets_for_adjustment->end(), 155 LimitOffset<std::wstring>(utf16.length())); 156 } 157 return utf16; 158 } 159 160 #elif defined(WCHAR_T_IS_UTF32) 161 162 bool UTF16ToWideAndAdjustOffset(const char16* src, 163 size_t src_len, 164 std::wstring* output, 165 size_t* offset_for_adjustment) { 166 std::vector<size_t> offsets; 167 if (offset_for_adjustment) 168 offsets.push_back(*offset_for_adjustment); 169 output->clear(); 170 // Assume that normally we won't have any non-BMP characters so the counts 171 // will be the same. 172 output->reserve(src_len); 173 bool ret = ConvertUnicode(src, src_len, output, &offsets); 174 if (offset_for_adjustment) 175 *offset_for_adjustment = offsets[0]; 176 return ret; 177 } 178 179 bool UTF16ToWideAndAdjustOffsets(const char16* src, 180 size_t src_len, 181 std::wstring* output, 182 std::vector<size_t>* offsets_for_adjustment) { 183 output->clear(); 184 // Assume that normally we won't have any non-BMP characters so the counts 185 // will be the same. 186 output->reserve(src_len); 187 return ConvertUnicode(src, src_len, output, offsets_for_adjustment); 188 } 189 190 std::wstring UTF16ToWideAndAdjustOffset(const string16& utf16, 191 size_t* offset_for_adjustment) { 192 std::vector<size_t> offsets; 193 if (offset_for_adjustment) 194 offsets.push_back(*offset_for_adjustment); 195 std::wstring result; 196 UTF16ToWideAndAdjustOffsets(utf16.data(), utf16.length(), &result, 197 &offsets); 198 if (offset_for_adjustment) 199 *offset_for_adjustment = offsets[0]; 200 return result; 201 } 202 203 std::wstring UTF16ToWideAndAdjustOffsets( 204 const string16& utf16, 205 std::vector<size_t>* offsets_for_adjustment) { 206 std::wstring result; 207 UTF16ToWideAndAdjustOffsets(utf16.data(), utf16.length(), &result, 208 offsets_for_adjustment); 209 return result; 210 } 211 212 #endif // defined(WCHAR_T_IS_UTF32) 213 214 AdjustOffset::Adjustment::Adjustment(size_t location, 215 size_t old_length, 216 size_t new_length) 217 : location(location), 218 old_length(old_length), 219 new_length(new_length) {} 220 221 AdjustOffset::AdjustOffset(const Adjustments& adjustments) 222 : adjustments_(adjustments) {} 223 224 void AdjustOffset::operator()(size_t& offset) { 225 if (offset == std::wstring::npos) 226 return; 227 size_t adjustment = 0; 228 for (Adjustments::const_iterator i = adjustments_.begin(); 229 i != adjustments_.end(); ++i) { 230 size_t location = i->location; 231 if (offset == location && i->new_length == 0) { 232 offset = std::wstring::npos; 233 return; 234 } 235 if (offset <= location) 236 break; 237 if (offset < (location + i->old_length)) { 238 offset = std::wstring::npos; 239 return; 240 } 241 adjustment += (i->old_length - i->new_length); 242 } 243 offset -= adjustment; 244 } 245