Home | History | Annotate | Download | only in strings
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include <algorithm>
      6 
      7 #include "base/logging.h"
      8 #include "base/strings/string_piece.h"
      9 #include "base/strings/utf_offset_string_conversions.h"
     10 #include "testing/gtest/include/gtest/gtest.h"
     11 
     12 namespace base {
     13 
     14 namespace {
     15 
     16 static const size_t kNpos = string16::npos;
     17 
     18 }  // namespace
     19 
     20 TEST(UTFOffsetStringConversionsTest, AdjustOffset) {
     21   struct UTF8ToUTF16Case {
     22     const char* utf8;
     23     size_t input_offset;
     24     size_t output_offset;
     25   } utf8_to_utf16_cases[] = {
     26     {"", 0, 0},
     27     {"", kNpos, kNpos},
     28     {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, kNpos},
     29     {"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1},
     30     {"\xed\xb0\x80z", 3, 1},
     31     {"A\xF0\x90\x8C\x80z", 1, 1},
     32     {"A\xF0\x90\x8C\x80z", 2, kNpos},
     33     {"A\xF0\x90\x8C\x80z", 5, 3},
     34     {"A\xF0\x90\x8C\x80z", 6, 4},
     35     {"A\xF0\x90\x8C\x80z", kNpos, kNpos},
     36   };
     37   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(utf8_to_utf16_cases); ++i) {
     38     const size_t offset = utf8_to_utf16_cases[i].input_offset;
     39     std::vector<size_t> offsets;
     40     offsets.push_back(offset);
     41     UTF8ToUTF16AndAdjustOffsets(utf8_to_utf16_cases[i].utf8, &offsets);
     42     EXPECT_EQ(utf8_to_utf16_cases[i].output_offset, offsets[0]);
     43   }
     44 
     45   struct UTF16ToUTF8Case {
     46     char16 utf16[10];
     47     size_t input_offset;
     48     size_t output_offset;
     49   } utf16_to_utf8_cases[] = {
     50       {{}, 0, 0},
     51       // Converted to 3-byte utf-8 sequences
     52       {{0x5909, 0x63DB}, 3, kNpos},
     53       {{0x5909, 0x63DB}, 2, 6},
     54       {{0x5909, 0x63DB}, 1, 3},
     55       {{0x5909, 0x63DB}, 0, 0},
     56       // Converted to 2-byte utf-8 sequences
     57       {{'A', 0x00bc, 0x00be, 'z'}, 1, 1},
     58       {{'A', 0x00bc, 0x00be, 'z'}, 2, 3},
     59       {{'A', 0x00bc, 0x00be, 'z'}, 3, 5},
     60       {{'A', 0x00bc, 0x00be, 'z'}, 4, 6},
     61       // Surrogate pair
     62       {{'A', 0xd800, 0xdf00, 'z'}, 1, 1},
     63       {{'A', 0xd800, 0xdf00, 'z'}, 2, kNpos},
     64       {{'A', 0xd800, 0xdf00, 'z'}, 3, 5},
     65       {{'A', 0xd800, 0xdf00, 'z'}, 4, 6},
     66   };
     67   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(utf16_to_utf8_cases); ++i) {
     68     size_t offset = utf16_to_utf8_cases[i].input_offset;
     69     std::vector<size_t> offsets;
     70     offsets.push_back(offset);
     71     UTF16ToUTF8AndAdjustOffsets(utf16_to_utf8_cases[i].utf16, &offsets);
     72     EXPECT_EQ(utf16_to_utf8_cases[i].output_offset, offsets[0]) << i;
     73   }
     74 }
     75 
     76 TEST(UTFOffsetStringConversionsTest, LimitOffsets) {
     77   const size_t kLimit = 10;
     78   const size_t kItems = 20;
     79   std::vector<size_t> size_ts;
     80   for (size_t t = 0; t < kItems; ++t)
     81     size_ts.push_back(t);
     82   std::for_each(size_ts.begin(), size_ts.end(),
     83                 LimitOffset<string16>(kLimit));
     84   size_t unlimited_count = 0;
     85   for (std::vector<size_t>::iterator ti = size_ts.begin(); ti != size_ts.end();
     86        ++ti) {
     87     if (*ti != kNpos)
     88       ++unlimited_count;
     89   }
     90   EXPECT_EQ(11U, unlimited_count);
     91 
     92   // Reverse the values in the vector and try again.
     93   size_ts.clear();
     94   for (size_t t = kItems; t > 0; --t)
     95     size_ts.push_back(t - 1);
     96   std::for_each(size_ts.begin(), size_ts.end(),
     97                 LimitOffset<string16>(kLimit));
     98   unlimited_count = 0;
     99   for (std::vector<size_t>::iterator ti = size_ts.begin(); ti != size_ts.end();
    100        ++ti) {
    101     if (*ti != kNpos)
    102       ++unlimited_count;
    103   }
    104   EXPECT_EQ(11U, unlimited_count);
    105 }
    106 
    107 TEST(UTFOffsetStringConversionsTest, AdjustOffsets) {
    108   // Imagine we have strings as shown in the following cases where the
    109   // X's represent encoded characters.
    110   // 1: abcXXXdef ==> abcXdef
    111   {
    112     std::vector<size_t> offsets;
    113     for (size_t t = 0; t <= 9; ++t)
    114       offsets.push_back(t);
    115     OffsetAdjuster::Adjustments adjustments;
    116     adjustments.push_back(OffsetAdjuster::Adjustment(3, 3, 1));
    117     OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
    118     size_t expected_1[] = {0, 1, 2, 3, kNpos, kNpos, 4, 5, 6, 7};
    119     EXPECT_EQ(offsets.size(), arraysize(expected_1));
    120     for (size_t i = 0; i < arraysize(expected_1); ++i)
    121       EXPECT_EQ(expected_1[i], offsets[i]);
    122   }
    123 
    124   // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
    125   {
    126     std::vector<size_t> offsets;
    127     for (size_t t = 0; t <= 23; ++t)
    128       offsets.push_back(t);
    129     OffsetAdjuster::Adjustments adjustments;
    130     adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 1));
    131     adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 2));
    132     adjustments.push_back(OffsetAdjuster::Adjustment(10, 7, 4));
    133     adjustments.push_back(OffsetAdjuster::Adjustment(20, 3, 1));
    134     OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
    135     size_t expected_2[] = {
    136       0, kNpos, kNpos, 1, 2, kNpos, kNpos, kNpos, 4, 5, 6, kNpos, kNpos, kNpos,
    137       kNpos, kNpos, kNpos, 10, 11, 12, 13, kNpos, kNpos, 14
    138     };
    139     EXPECT_EQ(offsets.size(), arraysize(expected_2));
    140     for (size_t i = 0; i < arraysize(expected_2); ++i)
    141       EXPECT_EQ(expected_2[i], offsets[i]);
    142   }
    143 
    144   // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
    145   {
    146     std::vector<size_t> offsets;
    147     for (size_t t = 0; t <= 17; ++t)
    148       offsets.push_back(t);
    149     OffsetAdjuster::Adjustments adjustments;
    150     adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 0));
    151     adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 4));
    152     adjustments.push_back(OffsetAdjuster::Adjustment(11, 3, 3));
    153     adjustments.push_back(OffsetAdjuster::Adjustment(15, 2, 0));
    154     OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
    155     size_t expected_3[] = {
    156       0, kNpos, kNpos, 0, 1, kNpos, kNpos, kNpos, 5, 6, 7, 8, kNpos, kNpos, 11,
    157       12, kNpos, 12
    158     };
    159     EXPECT_EQ(offsets.size(), arraysize(expected_3));
    160     for (size_t i = 0; i < arraysize(expected_3); ++i)
    161       EXPECT_EQ(expected_3[i], offsets[i]);
    162   }
    163 }
    164 
    165 TEST(UTFOffsetStringConversionsTest, UnadjustOffsets) {
    166   // Imagine we have strings as shown in the following cases where the
    167   // X's represent encoded characters.
    168   // 1: abcXXXdef ==> abcXdef
    169   {
    170     std::vector<size_t> offsets;
    171     for (size_t t = 0; t <= 7; ++t)
    172       offsets.push_back(t);
    173     OffsetAdjuster::Adjustments adjustments;
    174     adjustments.push_back(OffsetAdjuster::Adjustment(3, 3, 1));
    175     OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
    176     size_t expected_1[] = {0, 1, 2, 3, 6, 7, 8, 9};
    177     EXPECT_EQ(offsets.size(), arraysize(expected_1));
    178     for (size_t i = 0; i < arraysize(expected_1); ++i)
    179       EXPECT_EQ(expected_1[i], offsets[i]);
    180   }
    181 
    182   // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
    183   {
    184     std::vector<size_t> offsets;
    185     for (size_t t = 0; t <= 14; ++t)
    186       offsets.push_back(t);
    187     OffsetAdjuster::Adjustments adjustments;
    188     adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 1));
    189     adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 2));
    190     adjustments.push_back(OffsetAdjuster::Adjustment(10, 7, 4));
    191     adjustments.push_back(OffsetAdjuster::Adjustment(20, 3, 1));
    192     OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
    193     size_t expected_2[] = {
    194       0, 3, 4, kNpos, 8, 9, 10, kNpos, kNpos, kNpos, 17, 18, 19, 20, 23
    195     };
    196     EXPECT_EQ(offsets.size(), arraysize(expected_2));
    197     for (size_t i = 0; i < arraysize(expected_2); ++i)
    198       EXPECT_EQ(expected_2[i], offsets[i]);
    199   }
    200 
    201   // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
    202   {
    203     std::vector<size_t> offsets;
    204     for (size_t t = 0; t <= 12; ++t)
    205       offsets.push_back(t);
    206     OffsetAdjuster::Adjustments adjustments;
    207     adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 0));
    208     adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 4));
    209     adjustments.push_back(OffsetAdjuster::Adjustment(11, 3, 3));
    210     adjustments.push_back(OffsetAdjuster::Adjustment(15, 2, 0));
    211     OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
    212     size_t expected_3[] = {
    213       0,  // this could just as easily be 3
    214       4, kNpos, kNpos, kNpos, 8, 9, 10, 11, kNpos, kNpos, 14,
    215       15  // this could just as easily be 17
    216     };
    217     EXPECT_EQ(offsets.size(), arraysize(expected_3));
    218     for (size_t i = 0; i < arraysize(expected_3); ++i)
    219       EXPECT_EQ(expected_3[i], offsets[i]);
    220   }
    221 }
    222 
    223 // MergeSequentialAdjustments is used by net/base/escape.{h,cc} and
    224 // net/base/net_util.{h,cc}.  The two tests EscapeTest.AdjustOffset and
    225 // NetUtilTest.FormatUrlWithOffsets test its behavior extensively.  This
    226 // is simply a short, additional test.
    227 TEST(UTFOffsetStringConversionsTest, MergeSequentialAdjustments) {
    228   // Pretend the input string is "abcdefghijklmnopqrstuvwxyz".
    229 
    230   // Set up |first_adjustments| to
    231   // - remove the leading "a"
    232   // - combine the "bc" into one character (call it ".")
    233   // - remove the "f"
    234   // - remove the "tuv"
    235   // The resulting string should be ".deghijklmnopqrswxyz".
    236   OffsetAdjuster::Adjustments first_adjustments;
    237   first_adjustments.push_back(OffsetAdjuster::Adjustment(0, 1, 0));
    238   first_adjustments.push_back(OffsetAdjuster::Adjustment(1, 2, 1));
    239   first_adjustments.push_back(OffsetAdjuster::Adjustment(5, 1, 0));
    240   first_adjustments.push_back(OffsetAdjuster::Adjustment(19, 3, 0));
    241 
    242   // Set up |adjustments_on_adjusted_string| to
    243   // - combine the "." character that replaced "bc" with "d" into one character
    244   //   (call it "?")
    245   // - remove the "egh"
    246   // - expand the "i" into two characters (call them "12")
    247   // - combine the "jkl" into one character (call it "@")
    248   // - expand the "z" into two characters (call it "34")
    249   // The resulting string should be "?12@mnopqrswxy34".
    250   OffsetAdjuster::Adjustments adjustments_on_adjusted_string;
    251   adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
    252       0, 2, 1));
    253   adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
    254       2, 3, 0));
    255   adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
    256       5, 1, 2));
    257   adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
    258       6, 3, 1));
    259   adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
    260       19, 1, 2));
    261 
    262   // Now merge the adjustments and check the results.
    263   OffsetAdjuster::MergeSequentialAdjustments(first_adjustments,
    264                                              &adjustments_on_adjusted_string);
    265   // The merged adjustments should look like
    266   // - combine abcd into "?"
    267   //   - note: it's also reasonable for the Merge function to instead produce
    268   //     two adjustments instead of this, one to remove a and another to
    269   //     combine bcd into "?".  This test verifies the current behavior.
    270   // - remove efgh
    271   // - expand i into "12"
    272   // - combine jkl into "@"
    273   // - remove tuv
    274   // - expand z into "34"
    275   ASSERT_EQ(6u, adjustments_on_adjusted_string.size());
    276   EXPECT_EQ(0u, adjustments_on_adjusted_string[0].original_offset);
    277   EXPECT_EQ(4u, adjustments_on_adjusted_string[0].original_length);
    278   EXPECT_EQ(1u, adjustments_on_adjusted_string[0].output_length);
    279   EXPECT_EQ(4u, adjustments_on_adjusted_string[1].original_offset);
    280   EXPECT_EQ(4u, adjustments_on_adjusted_string[1].original_length);
    281   EXPECT_EQ(0u, adjustments_on_adjusted_string[1].output_length);
    282   EXPECT_EQ(8u, adjustments_on_adjusted_string[2].original_offset);
    283   EXPECT_EQ(1u, adjustments_on_adjusted_string[2].original_length);
    284   EXPECT_EQ(2u, adjustments_on_adjusted_string[2].output_length);
    285   EXPECT_EQ(9u, adjustments_on_adjusted_string[3].original_offset);
    286   EXPECT_EQ(3u, adjustments_on_adjusted_string[3].original_length);
    287   EXPECT_EQ(1u, adjustments_on_adjusted_string[3].output_length);
    288   EXPECT_EQ(19u, adjustments_on_adjusted_string[4].original_offset);
    289   EXPECT_EQ(3u, adjustments_on_adjusted_string[4].original_length);
    290   EXPECT_EQ(0u, adjustments_on_adjusted_string[4].output_length);
    291   EXPECT_EQ(25u, adjustments_on_adjusted_string[5].original_offset);
    292   EXPECT_EQ(1u, adjustments_on_adjusted_string[5].original_length);
    293   EXPECT_EQ(2u, adjustments_on_adjusted_string[5].output_length);
    294 }
    295 
    296 }  // namespace base
    297