Home | History | Annotate | Download | only in strings
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "base/strings/utf_offset_string_conversions.h"
      6 
      7 #include <algorithm>
      8 
      9 #include "base/logging.h"
     10 #include "base/memory/scoped_ptr.h"
     11 #include "base/strings/string_piece.h"
     12 #include "base/strings/utf_string_conversion_utils.h"
     13 
     14 namespace base {
     15 
// Builds a record of one change to a string that was not one-to-one:
// |original_length| characters starting at |original_offset| in the original
// string correspond to |output_length| characters in the output.  The
// constructor only stores the three values; it performs no validation.
//
// Each parameter shares its name with the member it initializes.  In the
// initializer list the name outside the parentheses is the member and the
// name inside is the parameter.
OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
                                       size_t original_length,
                                       size_t output_length)
    : original_offset(original_offset),
      original_length(original_length),
      output_length(output_length) {
}
     23 
     24 // static
     25 void OffsetAdjuster::AdjustOffsets(
     26     const Adjustments& adjustments,
     27     std::vector<size_t>* offsets_for_adjustment) {
     28   if (!offsets_for_adjustment || adjustments.empty())
     29     return;
     30   for (std::vector<size_t>::iterator i(offsets_for_adjustment->begin());
     31        i != offsets_for_adjustment->end(); ++i)
     32     AdjustOffset(adjustments, &(*i));
     33 }
     34 
     35 // static
     36 void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments,
     37                                   size_t* offset) {
     38   if (*offset == string16::npos)
     39     return;
     40   int adjustment = 0;
     41   for (Adjustments::const_iterator i = adjustments.begin();
     42        i != adjustments.end(); ++i) {
     43     if (*offset <= i->original_offset)
     44       break;
     45     if (*offset < (i->original_offset + i->original_length)) {
     46       *offset = string16::npos;
     47       return;
     48     }
     49     adjustment += static_cast<int>(i->original_length - i->output_length);
     50   }
     51   *offset -= adjustment;
     52 }
     53 
     54 // static
     55 void OffsetAdjuster::UnadjustOffsets(
     56     const Adjustments& adjustments,
     57     std::vector<size_t>* offsets_for_unadjustment) {
     58   if (!offsets_for_unadjustment || adjustments.empty())
     59     return;
     60   for (std::vector<size_t>::iterator i(offsets_for_unadjustment->begin());
     61        i != offsets_for_unadjustment->end(); ++i)
     62     UnadjustOffset(adjustments, &(*i));
     63 }
     64 
     65 // static
     66 void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments,
     67                                     size_t* offset) {
     68   if (*offset == string16::npos)
     69     return;
     70   int adjustment = 0;
     71   for (Adjustments::const_iterator i = adjustments.begin();
     72        i != adjustments.end(); ++i) {
     73     if (*offset + adjustment <= i->original_offset)
     74       break;
     75     adjustment += static_cast<int>(i->original_length - i->output_length);
     76     if ((*offset + adjustment) <
     77         (i->original_offset + i->original_length)) {
     78       *offset = string16::npos;
     79       return;
     80     }
     81   }
     82   *offset += adjustment;
     83 }
     84 
     85 // static
     86 void OffsetAdjuster::MergeSequentialAdjustments(
     87     const Adjustments& first_adjustments,
     88     Adjustments* adjustments_on_adjusted_string) {
     89   Adjustments::iterator adjusted_iter = adjustments_on_adjusted_string->begin();
     90   Adjustments::const_iterator first_iter = first_adjustments.begin();
     91   // Simultaneously iterate over all |adjustments_on_adjusted_string| and
     92   // |first_adjustments|, adding adjustments to or correcting the adjustments
     93   // in |adjustments_on_adjusted_string| as we go.  |shift| keeps track of the
     94   // current number of characters collapsed by |first_adjustments| up to this
     95   // point.  |currently_collapsing| keeps track of the number of characters
     96   // collapsed by |first_adjustments| into the current |adjusted_iter|'s
     97   // length.  These are characters that will change |shift| as soon as we're
     98   // done processing the current |adjusted_iter|; they are not yet reflected in
     99   // |shift|.
    100   size_t shift = 0;
    101   size_t currently_collapsing = 0;
    102   while (adjusted_iter != adjustments_on_adjusted_string->end()) {
    103     if ((first_iter == first_adjustments.end()) ||
    104         ((adjusted_iter->original_offset + shift +
    105           adjusted_iter->original_length) <= first_iter->original_offset)) {
    106       // Entire |adjusted_iter| (accounting for its shift and including its
    107       // whole original length) comes before |first_iter|.
    108       //
    109       // Correct the offset at |adjusted_iter| and move onto the next
    110       // adjustment that needs revising.
    111       adjusted_iter->original_offset += shift;
    112       shift += currently_collapsing;
    113       currently_collapsing = 0;
    114       ++adjusted_iter;
    115     } else if ((adjusted_iter->original_offset + shift) >
    116                first_iter->original_offset) {
    117       // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|).
    118 
    119       // It's not possible for the adjustments to overlap.  (It shouldn't
    120       // be possible that we have an |adjusted_iter->original_offset| that,
    121       // when adjusted by the computed |shift|, is in the middle of
    122       // |first_iter|'s output's length.  After all, that would mean the
    123       // current adjustment_on_adjusted_string somehow points to an offset
    124       // that was supposed to have been eliminated by the first set of
    125       // adjustments.)
    126       DCHECK_LE(first_iter->original_offset + first_iter->output_length,
    127                 adjusted_iter->original_offset + shift);
    128 
    129       // Add the |first_adjustment_iter| to the full set of adjustments while
    130       // making sure |adjusted_iter| continues pointing to the same element.
    131       // We do this by inserting the |first_adjustment_iter| right before
    132       // |adjusted_iter|, then incrementing |adjusted_iter| so it points to
    133       // the following element.
    134       shift += first_iter->original_length - first_iter->output_length;
    135       adjusted_iter = adjustments_on_adjusted_string->insert(
    136           adjusted_iter, *first_iter);
    137       ++adjusted_iter;
    138       ++first_iter;
    139     } else {
    140       // The first adjustment adjusted something that then got further adjusted
    141       // by the second set of adjustments.  In other words, |first_iter| points
    142       // to something in the range covered by |adjusted_iter|'s length (after
    143       // accounting for |shift|).  Precisely,
    144       //   adjusted_iter->original_offset + shift
    145       //   <=
    146       //   first_iter->original_offset
    147       //   <=
    148       //   adjusted_iter->original_offset + shift +
    149       //       adjusted_iter->original_length
    150 
    151       // Modify the current |adjusted_iter| to include whatever collapsing
    152       // happened in |first_iter|, then advance to the next |first_adjustments|
    153       // because we dealt with the current one.
    154       const int collapse = static_cast<int>(first_iter->original_length) -
    155           static_cast<int>(first_iter->output_length);
    156       // This function does not know how to deal with a string that expands and
    157       // then gets modified, only strings that collapse and then get modified.
    158       DCHECK_GT(collapse, 0);
    159       adjusted_iter->original_length += collapse;
    160       currently_collapsing += collapse;
    161       ++first_iter;
    162     }
    163   }
    164   DCHECK_EQ(0u, currently_collapsing);
    165   if (first_iter != first_adjustments.end()) {
    166     // Only first adjustments are left.  These do not need to be modified.
    167     // (Their offsets are already correct with respect to the original string.)
    168     // Append them all.
    169     DCHECK(adjusted_iter == adjustments_on_adjusted_string->end());
    170     adjustments_on_adjusted_string->insert(
    171         adjustments_on_adjusted_string->end(), first_iter,
    172         first_adjustments.end());
    173   }
    174 }
    175 
    176 // Converts the given source Unicode character type to the given destination
    177 // Unicode character type as a STL string. The given input buffer and size
    178 // determine the source, and the given output STL string will be replaced by
    179 // the result.  If non-NULL, |adjustments| is set to reflect the all the
    180 // alterations to the string that are not one-character-to-one-character.
    181 // It will always be sorted by increasing offset.
    182 template<typename SrcChar, typename DestStdString>
    183 bool ConvertUnicode(const SrcChar* src,
    184                     size_t src_len,
    185                     DestStdString* output,
    186                     OffsetAdjuster::Adjustments* adjustments) {
    187   if (adjustments)
    188     adjustments->clear();
    189   // ICU requires 32-bit numbers.
    190   bool success = true;
    191   int32 src_len32 = static_cast<int32>(src_len);
    192   for (int32 i = 0; i < src_len32; i++) {
    193     uint32 code_point;
    194     size_t original_i = i;
    195     size_t chars_written = 0;
    196     if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
    197       chars_written = WriteUnicodeCharacter(code_point, output);
    198     } else {
    199       chars_written = WriteUnicodeCharacter(0xFFFD, output);
    200       success = false;
    201     }
    202 
    203     // Only bother writing an adjustment if this modification changed the
    204     // length of this character.
    205     // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
    206     // character read, not after it (so that incrementing it in the loop
    207     // increment will place it at the right location), so we need to account
    208     // for that in determining the amount that was read.
    209     if (adjustments && ((i - original_i + 1) != chars_written)) {
    210       adjustments->push_back(OffsetAdjuster::Adjustment(
    211           original_i, i - original_i + 1, chars_written));
    212     }
    213   }
    214   return success;
    215 }
    216 
    217 bool UTF8ToUTF16WithAdjustments(
    218     const char* src,
    219     size_t src_len,
    220     string16* output,
    221     base::OffsetAdjuster::Adjustments* adjustments) {
    222   PrepareForUTF16Or32Output(src, src_len, output);
    223   return ConvertUnicode(src, src_len, output, adjustments);
    224 }
    225 
    226 string16 UTF8ToUTF16WithAdjustments(
    227     const base::StringPiece& utf8,
    228     base::OffsetAdjuster::Adjustments* adjustments) {
    229   string16 result;
    230   UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments);
    231   return result;
    232 }
    233 
    234 string16 UTF8ToUTF16AndAdjustOffsets(
    235     const base::StringPiece& utf8,
    236     std::vector<size_t>* offsets_for_adjustment) {
    237   std::for_each(offsets_for_adjustment->begin(),
    238                 offsets_for_adjustment->end(),
    239                 LimitOffset<base::StringPiece>(utf8.length()));
    240   OffsetAdjuster::Adjustments adjustments;
    241   string16 result = UTF8ToUTF16WithAdjustments(utf8, &adjustments);
    242   OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
    243   return result;
    244 }
    245 
    246 std::string UTF16ToUTF8AndAdjustOffsets(
    247     const base::StringPiece16& utf16,
    248     std::vector<size_t>* offsets_for_adjustment) {
    249   std::for_each(offsets_for_adjustment->begin(),
    250                 offsets_for_adjustment->end(),
    251                 LimitOffset<base::StringPiece16>(utf16.length()));
    252   std::string result;
    253   PrepareForUTF8Output(utf16.data(), utf16.length(), &result);
    254   OffsetAdjuster::Adjustments adjustments;
    255   ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments);
    256   OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
    257   return result;
    258 }
    259 
    260 }  // namespace base
    261