1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "base/strings/utf_offset_string_conversions.h" 6 7 #include <algorithm> 8 9 #include "base/logging.h" 10 #include "base/memory/scoped_ptr.h" 11 #include "base/strings/string_piece.h" 12 #include "base/strings/utf_string_conversion_utils.h" 13 14 namespace base { 15 16 OffsetAdjuster::Adjustment::Adjustment(size_t original_offset, 17 size_t original_length, 18 size_t output_length) 19 : original_offset(original_offset), 20 original_length(original_length), 21 output_length(output_length) { 22 } 23 24 // static 25 void OffsetAdjuster::AdjustOffsets( 26 const Adjustments& adjustments, 27 std::vector<size_t>* offsets_for_adjustment) { 28 if (!offsets_for_adjustment || adjustments.empty()) 29 return; 30 for (std::vector<size_t>::iterator i(offsets_for_adjustment->begin()); 31 i != offsets_for_adjustment->end(); ++i) 32 AdjustOffset(adjustments, &(*i)); 33 } 34 35 // static 36 void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments, 37 size_t* offset) { 38 if (*offset == string16::npos) 39 return; 40 int adjustment = 0; 41 for (Adjustments::const_iterator i = adjustments.begin(); 42 i != adjustments.end(); ++i) { 43 if (*offset <= i->original_offset) 44 break; 45 if (*offset < (i->original_offset + i->original_length)) { 46 *offset = string16::npos; 47 return; 48 } 49 adjustment += static_cast<int>(i->original_length - i->output_length); 50 } 51 *offset -= adjustment; 52 } 53 54 // static 55 void OffsetAdjuster::UnadjustOffsets( 56 const Adjustments& adjustments, 57 std::vector<size_t>* offsets_for_unadjustment) { 58 if (!offsets_for_unadjustment || adjustments.empty()) 59 return; 60 for (std::vector<size_t>::iterator i(offsets_for_unadjustment->begin()); 61 i != offsets_for_unadjustment->end(); ++i) 62 UnadjustOffset(adjustments, &(*i)); 63 } 64 65 // static 66 void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments, 67 size_t* offset) { 68 if (*offset == string16::npos) 69 return; 70 int adjustment = 0; 71 for (Adjustments::const_iterator i = adjustments.begin(); 72 i != adjustments.end(); ++i) { 73 if (*offset + adjustment <= i->original_offset) 74 break; 75 adjustment += static_cast<int>(i->original_length - i->output_length); 76 if ((*offset + adjustment) < 77 (i->original_offset + i->original_length)) { 78 *offset = string16::npos; 79 return; 80 } 81 } 82 *offset += adjustment; 83 } 84 85 // static 86 void OffsetAdjuster::MergeSequentialAdjustments( 87 const Adjustments& first_adjustments, 88 Adjustments* adjustments_on_adjusted_string) { 89 Adjustments::iterator adjusted_iter = adjustments_on_adjusted_string->begin(); 90 Adjustments::const_iterator first_iter = first_adjustments.begin(); 91 // Simultaneously iterate over all |adjustments_on_adjusted_string| and 92 // |first_adjustments|, adding adjustments to or correcting the adjustments 93 // in |adjustments_on_adjusted_string| as we go. |shift| keeps track of the 94 // current number of characters collapsed by |first_adjustments| up to this 95 // point. |currently_collapsing| keeps track of the number of characters 96 // collapsed by |first_adjustments| into the current |adjusted_iter|'s 97 // length. These are characters that will change |shift| as soon as we're 98 // done processing the current |adjusted_iter|; they are not yet reflected in 99 // |shift|. 100 size_t shift = 0; 101 size_t currently_collapsing = 0; 102 while (adjusted_iter != adjustments_on_adjusted_string->end()) { 103 if ((first_iter == first_adjustments.end()) || 104 ((adjusted_iter->original_offset + shift + 105 adjusted_iter->original_length) <= first_iter->original_offset)) { 106 // Entire |adjusted_iter| (accounting for its shift and including its 107 // whole original length) comes before |first_iter|. 108 // 109 // Correct the offset at |adjusted_iter| and move onto the next 110 // adjustment that needs revising. 111 adjusted_iter->original_offset += shift; 112 shift += currently_collapsing; 113 currently_collapsing = 0; 114 ++adjusted_iter; 115 } else if ((adjusted_iter->original_offset + shift) > 116 first_iter->original_offset) { 117 // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|). 118 119 // It's not possible for the adjustments to overlap. (It shouldn't 120 // be possible that we have an |adjusted_iter->original_offset| that, 121 // when adjusted by the computed |shift|, is in the middle of 122 // |first_iter|'s output's length. After all, that would mean the 123 // current adjustment_on_adjusted_string somehow points to an offset 124 // that was supposed to have been eliminated by the first set of 125 // adjustments.) 126 DCHECK_LE(first_iter->original_offset + first_iter->output_length, 127 adjusted_iter->original_offset + shift); 128 129 // Add the |first_adjustment_iter| to the full set of adjustments while 130 // making sure |adjusted_iter| continues pointing to the same element. 131 // We do this by inserting the |first_adjustment_iter| right before 132 // |adjusted_iter|, then incrementing |adjusted_iter| so it points to 133 // the following element. 134 shift += first_iter->original_length - first_iter->output_length; 135 adjusted_iter = adjustments_on_adjusted_string->insert( 136 adjusted_iter, *first_iter); 137 ++adjusted_iter; 138 ++first_iter; 139 } else { 140 // The first adjustment adjusted something that then got further adjusted 141 // by the second set of adjustments. In other words, |first_iter| points 142 // to something in the range covered by |adjusted_iter|'s length (after 143 // accounting for |shift|). Precisely, 144 // adjusted_iter->original_offset + shift 145 // <= 146 // first_iter->original_offset 147 // <= 148 // adjusted_iter->original_offset + shift + 149 // adjusted_iter->original_length 150 151 // Modify the current |adjusted_iter| to include whatever collapsing 152 // happened in |first_iter|, then advance to the next |first_adjustments| 153 // because we dealt with the current one. 154 const int collapse = static_cast<int>(first_iter->original_length) - 155 static_cast<int>(first_iter->output_length); 156 // This function does not know how to deal with a string that expands and 157 // then gets modified, only strings that collapse and then get modified. 158 DCHECK_GT(collapse, 0); 159 adjusted_iter->original_length += collapse; 160 currently_collapsing += collapse; 161 ++first_iter; 162 } 163 } 164 DCHECK_EQ(0u, currently_collapsing); 165 if (first_iter != first_adjustments.end()) { 166 // Only first adjustments are left. These do not need to be modified. 167 // (Their offsets are already correct with respect to the original string.) 168 // Append them all. 169 DCHECK(adjusted_iter == adjustments_on_adjusted_string->end()); 170 adjustments_on_adjusted_string->insert( 171 adjustments_on_adjusted_string->end(), first_iter, 172 first_adjustments.end()); 173 } 174 } 175 176 // Converts the given source Unicode character type to the given destination 177 // Unicode character type as a STL string. The given input buffer and size 178 // determine the source, and the given output STL string will be replaced by 179 // the result. If non-NULL, |adjustments| is set to reflect the all the 180 // alterations to the string that are not one-character-to-one-character. 181 // It will always be sorted by increasing offset. 182 template<typename SrcChar, typename DestStdString> 183 bool ConvertUnicode(const SrcChar* src, 184 size_t src_len, 185 DestStdString* output, 186 OffsetAdjuster::Adjustments* adjustments) { 187 if (adjustments) 188 adjustments->clear(); 189 // ICU requires 32-bit numbers. 190 bool success = true; 191 int32 src_len32 = static_cast<int32>(src_len); 192 for (int32 i = 0; i < src_len32; i++) { 193 uint32 code_point; 194 size_t original_i = i; 195 size_t chars_written = 0; 196 if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { 197 chars_written = WriteUnicodeCharacter(code_point, output); 198 } else { 199 chars_written = WriteUnicodeCharacter(0xFFFD, output); 200 success = false; 201 } 202 203 // Only bother writing an adjustment if this modification changed the 204 // length of this character. 205 // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last 206 // character read, not after it (so that incrementing it in the loop 207 // increment will place it at the right location), so we need to account 208 // for that in determining the amount that was read. 209 if (adjustments && ((i - original_i + 1) != chars_written)) { 210 adjustments->push_back(OffsetAdjuster::Adjustment( 211 original_i, i - original_i + 1, chars_written)); 212 } 213 } 214 return success; 215 } 216 217 bool UTF8ToUTF16WithAdjustments( 218 const char* src, 219 size_t src_len, 220 string16* output, 221 base::OffsetAdjuster::Adjustments* adjustments) { 222 PrepareForUTF16Or32Output(src, src_len, output); 223 return ConvertUnicode(src, src_len, output, adjustments); 224 } 225 226 string16 UTF8ToUTF16WithAdjustments( 227 const base::StringPiece& utf8, 228 base::OffsetAdjuster::Adjustments* adjustments) { 229 string16 result; 230 UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments); 231 return result; 232 } 233 234 string16 UTF8ToUTF16AndAdjustOffsets( 235 const base::StringPiece& utf8, 236 std::vector<size_t>* offsets_for_adjustment) { 237 std::for_each(offsets_for_adjustment->begin(), 238 offsets_for_adjustment->end(), 239 LimitOffset<base::StringPiece>(utf8.length())); 240 OffsetAdjuster::Adjustments adjustments; 241 string16 result = UTF8ToUTF16WithAdjustments(utf8, &adjustments); 242 OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); 243 return result; 244 } 245 246 std::string UTF16ToUTF8AndAdjustOffsets( 247 const base::StringPiece16& utf16, 248 std::vector<size_t>* offsets_for_adjustment) { 249 std::for_each(offsets_for_adjustment->begin(), 250 offsets_for_adjustment->end(), 251 LimitOffset<base::StringPiece16>(utf16.length())); 252 std::string result; 253 PrepareForUTF8Output(utf16.data(), utf16.length(), &result); 254 OffsetAdjuster::Adjustments adjustments; 255 ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments); 256 OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); 257 return result; 258 } 259 260 } // namespace base 261