Home | History | Annotate | Download | only in i18n
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "base/i18n/break_iterator.h"
      6 
      7 #include "base/strings/string_piece.h"
      8 #include "base/strings/stringprintf.h"
      9 #include "base/strings/utf_string_conversions.h"
     10 #include "testing/gtest/include/gtest/gtest.h"
     11 
     12 namespace base {
     13 namespace i18n {
     14 
     15 TEST(BreakIteratorTest, BreakWordEmpty) {
     16   string16 empty;
     17   BreakIterator iter(empty, BreakIterator::BREAK_WORD);
     18   ASSERT_TRUE(iter.Init());
     19   EXPECT_FALSE(iter.Advance());
     20   EXPECT_FALSE(iter.IsWord());
     21   EXPECT_FALSE(iter.Advance());  // Test unexpected advance after end.
     22   EXPECT_FALSE(iter.IsWord());
     23 }
     24 
     25 TEST(BreakIteratorTest, BreakWord) {
     26   string16 space(UTF8ToUTF16(" "));
     27   string16 str(UTF8ToUTF16(" foo bar! \npouet boom"));
     28   BreakIterator iter(str, BreakIterator::BREAK_WORD);
     29   ASSERT_TRUE(iter.Init());
     30   EXPECT_TRUE(iter.Advance());
     31   EXPECT_FALSE(iter.IsWord());
     32   EXPECT_EQ(space, iter.GetString());
     33   EXPECT_TRUE(iter.Advance());
     34   EXPECT_TRUE(iter.IsWord());
     35   EXPECT_EQ(UTF8ToUTF16("foo"), iter.GetString());
     36   EXPECT_TRUE(iter.Advance());
     37   EXPECT_FALSE(iter.IsWord());
     38   EXPECT_EQ(space, iter.GetString());
     39   EXPECT_TRUE(iter.Advance());
     40   EXPECT_TRUE(iter.IsWord());
     41   EXPECT_EQ(UTF8ToUTF16("bar"), iter.GetString());
     42   EXPECT_TRUE(iter.Advance());
     43   EXPECT_FALSE(iter.IsWord());
     44   EXPECT_EQ(UTF8ToUTF16("!"), iter.GetString());
     45   EXPECT_TRUE(iter.Advance());
     46   EXPECT_FALSE(iter.IsWord());
     47   EXPECT_EQ(space, iter.GetString());
     48   EXPECT_TRUE(iter.Advance());
     49   EXPECT_FALSE(iter.IsWord());
     50   EXPECT_EQ(UTF8ToUTF16("\n"), iter.GetString());
     51   EXPECT_TRUE(iter.Advance());
     52   EXPECT_TRUE(iter.IsWord());
     53   EXPECT_EQ(UTF8ToUTF16("pouet"), iter.GetString());
     54   EXPECT_TRUE(iter.Advance());
     55   EXPECT_FALSE(iter.IsWord());
     56   EXPECT_EQ(space, iter.GetString());
     57   EXPECT_TRUE(iter.Advance());
     58   EXPECT_TRUE(iter.IsWord());
     59   EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetString());
     60   EXPECT_FALSE(iter.Advance());
     61   EXPECT_FALSE(iter.IsWord());
     62   EXPECT_FALSE(iter.Advance());  // Test unexpected advance after end.
     63   EXPECT_FALSE(iter.IsWord());
     64 }
     65 
     66 TEST(BreakIteratorTest, BreakWide16) {
     67   // Two greek words separated by space.
     68   const string16 str(WideToUTF16(
     69       L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
     70       L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2"));
     71   const string16 word1(str.substr(0, 10));
     72   const string16 word2(str.substr(11, 5));
     73   BreakIterator iter(str, BreakIterator::BREAK_WORD);
     74   ASSERT_TRUE(iter.Init());
     75   EXPECT_TRUE(iter.Advance());
     76   EXPECT_TRUE(iter.IsWord());
     77   EXPECT_EQ(word1, iter.GetString());
     78   EXPECT_TRUE(iter.Advance());
     79   EXPECT_FALSE(iter.IsWord());
     80   EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString());
     81   EXPECT_TRUE(iter.Advance());
     82   EXPECT_TRUE(iter.IsWord());
     83   EXPECT_EQ(word2, iter.GetString());
     84   EXPECT_FALSE(iter.Advance());
     85   EXPECT_FALSE(iter.IsWord());
     86   EXPECT_FALSE(iter.Advance());  // Test unexpected advance after end.
     87   EXPECT_FALSE(iter.IsWord());
     88 }
     89 
     90 TEST(BreakIteratorTest, BreakWide32) {
     91   // U+1D49C MATHEMATICAL SCRIPT CAPITAL A
     92   const char* very_wide_char = "\xF0\x9D\x92\x9C";
     93   const string16 str(
     94       UTF8ToUTF16(base::StringPrintf("%s a", very_wide_char)));
     95   const string16 very_wide_word(str.substr(0, 2));
     96 
     97   BreakIterator iter(str, BreakIterator::BREAK_WORD);
     98   ASSERT_TRUE(iter.Init());
     99   EXPECT_TRUE(iter.Advance());
    100   EXPECT_TRUE(iter.IsWord());
    101   EXPECT_EQ(very_wide_word, iter.GetString());
    102   EXPECT_TRUE(iter.Advance());
    103   EXPECT_FALSE(iter.IsWord());
    104   EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString());
    105   EXPECT_TRUE(iter.Advance());
    106   EXPECT_TRUE(iter.IsWord());
    107   EXPECT_EQ(UTF8ToUTF16("a"), iter.GetString());
    108   EXPECT_FALSE(iter.Advance());
    109   EXPECT_FALSE(iter.IsWord());
    110   EXPECT_FALSE(iter.Advance());  // Test unexpected advance after end.
    111   EXPECT_FALSE(iter.IsWord());
    112 }
    113 
    114 TEST(BreakIteratorTest, BreakSpaceEmpty) {
    115   string16 empty;
    116   BreakIterator iter(empty, BreakIterator::BREAK_SPACE);
    117   ASSERT_TRUE(iter.Init());
    118   EXPECT_FALSE(iter.Advance());
    119   EXPECT_FALSE(iter.IsWord());
    120   EXPECT_FALSE(iter.Advance());  // Test unexpected advance after end.
    121   EXPECT_FALSE(iter.IsWord());
    122 }
    123 
    124 TEST(BreakIteratorTest, BreakSpace) {
    125   string16 str(UTF8ToUTF16(" foo bar! \npouet boom"));
    126   BreakIterator iter(str, BreakIterator::BREAK_SPACE);
    127   ASSERT_TRUE(iter.Init());
    128   EXPECT_TRUE(iter.Advance());
    129   EXPECT_FALSE(iter.IsWord());
    130   EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString());
    131   EXPECT_TRUE(iter.Advance());
    132   EXPECT_FALSE(iter.IsWord());
    133   EXPECT_EQ(UTF8ToUTF16("foo "), iter.GetString());
    134   EXPECT_TRUE(iter.Advance());
    135   EXPECT_FALSE(iter.IsWord());
    136   EXPECT_EQ(UTF8ToUTF16("bar! \n"), iter.GetString());
    137   EXPECT_TRUE(iter.Advance());
    138   EXPECT_FALSE(iter.IsWord());
    139   EXPECT_EQ(UTF8ToUTF16("pouet "), iter.GetString());
    140   EXPECT_TRUE(iter.Advance());
    141   EXPECT_FALSE(iter.IsWord());
    142   EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetString());
    143   EXPECT_FALSE(iter.Advance());
    144   EXPECT_FALSE(iter.IsWord());
    145   EXPECT_FALSE(iter.Advance());  // Test unexpected advance after end.
    146   EXPECT_FALSE(iter.IsWord());
    147 }
    148 
    149 TEST(BreakIteratorTest, BreakSpaceSP) {
    150   string16 str(UTF8ToUTF16(" foo bar! \npouet boom "));
    151   BreakIterator iter(str, BreakIterator::BREAK_SPACE);
    152   ASSERT_TRUE(iter.Init());
    153   EXPECT_TRUE(iter.Advance());
    154   EXPECT_FALSE(iter.IsWord());
    155   EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString());
    156   EXPECT_TRUE(iter.Advance());
    157   EXPECT_FALSE(iter.IsWord());
    158   EXPECT_EQ(UTF8ToUTF16("foo "), iter.GetString());
    159   EXPECT_TRUE(iter.Advance());
    160   EXPECT_FALSE(iter.IsWord());
    161   EXPECT_EQ(UTF8ToUTF16("bar! \n"), iter.GetString());
    162   EXPECT_TRUE(iter.Advance());
    163   EXPECT_FALSE(iter.IsWord());
    164   EXPECT_EQ(UTF8ToUTF16("pouet "), iter.GetString());
    165   EXPECT_TRUE(iter.Advance());
    166   EXPECT_FALSE(iter.IsWord());
    167   EXPECT_EQ(UTF8ToUTF16("boom "), iter.GetString());
    168   EXPECT_FALSE(iter.Advance());
    169   EXPECT_FALSE(iter.IsWord());
    170   EXPECT_FALSE(iter.Advance());  // Test unexpected advance after end.
    171   EXPECT_FALSE(iter.IsWord());
    172 }
    173 
    174 TEST(BreakIteratorTest, BreakSpacekWide16) {
    175   // Two Greek words.
    176   const string16 str(WideToUTF16(
    177       L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
    178       L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2"));
    179   const string16 word1(str.substr(0, 11));
    180   const string16 word2(str.substr(11, 5));
    181   BreakIterator iter(str, BreakIterator::BREAK_SPACE);
    182   ASSERT_TRUE(iter.Init());
    183   EXPECT_TRUE(iter.Advance());
    184   EXPECT_FALSE(iter.IsWord());
    185   EXPECT_EQ(word1, iter.GetString());
    186   EXPECT_TRUE(iter.Advance());
    187   EXPECT_FALSE(iter.IsWord());
    188   EXPECT_EQ(word2, iter.GetString());
    189   EXPECT_FALSE(iter.Advance());
    190   EXPECT_FALSE(iter.IsWord());
    191   EXPECT_FALSE(iter.Advance());  // Test unexpected advance after end.
    192   EXPECT_FALSE(iter.IsWord());
    193 }
    194 
    195 TEST(BreakIteratorTest, BreakSpaceWide32) {
    196   // U+1D49C MATHEMATICAL SCRIPT CAPITAL A
    197   const char* very_wide_char = "\xF0\x9D\x92\x9C";
    198   const string16 str(
    199       UTF8ToUTF16(base::StringPrintf("%s a", very_wide_char)));
    200   const string16 very_wide_word(str.substr(0, 3));
    201 
    202   BreakIterator iter(str, BreakIterator::BREAK_SPACE);
    203   ASSERT_TRUE(iter.Init());
    204   EXPECT_TRUE(iter.Advance());
    205   EXPECT_FALSE(iter.IsWord());
    206   EXPECT_EQ(very_wide_word, iter.GetString());
    207   EXPECT_TRUE(iter.Advance());
    208   EXPECT_FALSE(iter.IsWord());
    209   EXPECT_EQ(UTF8ToUTF16("a"), iter.GetString());
    210   EXPECT_FALSE(iter.Advance());
    211   EXPECT_FALSE(iter.IsWord());
    212   EXPECT_FALSE(iter.Advance());  // Test unexpected advance after end.
    213   EXPECT_FALSE(iter.IsWord());
    214 }
    215 
    216 TEST(BreakIteratorTest, BreakLineEmpty) {
    217   string16 empty;
    218   BreakIterator iter(empty, BreakIterator::BREAK_NEWLINE);
    219   ASSERT_TRUE(iter.Init());
    220   EXPECT_FALSE(iter.Advance());
    221   EXPECT_FALSE(iter.IsWord());
    222   EXPECT_FALSE(iter.Advance());   // Test unexpected advance after end.
    223   EXPECT_FALSE(iter.IsWord());
    224 }
    225 
    226 TEST(BreakIteratorTest, BreakLine) {
    227   string16 nl(UTF8ToUTF16("\n"));
    228   string16 str(UTF8ToUTF16("\nfoo bar!\n\npouet boom"));
    229   BreakIterator iter(str, BreakIterator::BREAK_NEWLINE);
    230   ASSERT_TRUE(iter.Init());
    231   EXPECT_TRUE(iter.Advance());
    232   EXPECT_FALSE(iter.IsWord());
    233   EXPECT_EQ(nl, iter.GetString());
    234   EXPECT_TRUE(iter.Advance());
    235   EXPECT_FALSE(iter.IsWord());
    236   EXPECT_EQ(UTF8ToUTF16("foo bar!\n"), iter.GetString());
    237   EXPECT_TRUE(iter.Advance());
    238   EXPECT_FALSE(iter.IsWord());
    239   EXPECT_EQ(nl, iter.GetString());
    240   EXPECT_TRUE(iter.Advance());
    241   EXPECT_FALSE(iter.IsWord());
    242   EXPECT_EQ(UTF8ToUTF16("pouet boom"), iter.GetString());
    243   EXPECT_FALSE(iter.Advance());
    244   EXPECT_FALSE(iter.IsWord());
    245   EXPECT_FALSE(iter.Advance());   // Test unexpected advance after end.
    246   EXPECT_FALSE(iter.IsWord());
    247 }
    248 
    249 TEST(BreakIteratorTest, BreakLineNL) {
    250   string16 nl(UTF8ToUTF16("\n"));
    251   string16 str(UTF8ToUTF16("\nfoo bar!\n\npouet boom\n"));
    252   BreakIterator iter(str, BreakIterator::BREAK_NEWLINE);
    253   ASSERT_TRUE(iter.Init());
    254   EXPECT_TRUE(iter.Advance());
    255   EXPECT_FALSE(iter.IsWord());
    256   EXPECT_EQ(nl, iter.GetString());
    257   EXPECT_TRUE(iter.Advance());
    258   EXPECT_FALSE(iter.IsWord());
    259   EXPECT_EQ(UTF8ToUTF16("foo bar!\n"), iter.GetString());
    260   EXPECT_TRUE(iter.Advance());
    261   EXPECT_FALSE(iter.IsWord());
    262   EXPECT_EQ(nl, iter.GetString());
    263   EXPECT_TRUE(iter.Advance());
    264   EXPECT_FALSE(iter.IsWord());
    265   EXPECT_EQ(UTF8ToUTF16("pouet boom\n"), iter.GetString());
    266   EXPECT_FALSE(iter.Advance());
    267   EXPECT_FALSE(iter.IsWord());
    268   EXPECT_FALSE(iter.Advance());   // Test unexpected advance after end.
    269   EXPECT_FALSE(iter.IsWord());
    270 }
    271 
    272 TEST(BreakIteratorTest, BreakLineWide16) {
    273   // Two Greek words separated by newline.
    274   const string16 str(WideToUTF16(
    275       L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
    276       L"\x03bf\x03c2\x000a\x0399\x03c3\x03c4\x03cc\x03c2"));
    277   const string16 line1(str.substr(0, 11));
    278   const string16 line2(str.substr(11, 5));
    279   BreakIterator iter(str, BreakIterator::BREAK_NEWLINE);
    280   ASSERT_TRUE(iter.Init());
    281   EXPECT_TRUE(iter.Advance());
    282   EXPECT_FALSE(iter.IsWord());
    283   EXPECT_EQ(line1, iter.GetString());
    284   EXPECT_TRUE(iter.Advance());
    285   EXPECT_FALSE(iter.IsWord());
    286   EXPECT_EQ(line2, iter.GetString());
    287   EXPECT_FALSE(iter.Advance());
    288   EXPECT_FALSE(iter.IsWord());
    289   EXPECT_FALSE(iter.Advance());   // Test unexpected advance after end.
    290   EXPECT_FALSE(iter.IsWord());
    291 }
    292 
    293 TEST(BreakIteratorTest, BreakLineWide32) {
    294   // U+1D49C MATHEMATICAL SCRIPT CAPITAL A
    295   const char* very_wide_char = "\xF0\x9D\x92\x9C";
    296   const string16 str(
    297       UTF8ToUTF16(base::StringPrintf("%s\na", very_wide_char)));
    298   const string16 very_wide_line(str.substr(0, 3));
    299   BreakIterator iter(str, BreakIterator::BREAK_NEWLINE);
    300   ASSERT_TRUE(iter.Init());
    301   EXPECT_TRUE(iter.Advance());
    302   EXPECT_FALSE(iter.IsWord());
    303   EXPECT_EQ(very_wide_line, iter.GetString());
    304   EXPECT_TRUE(iter.Advance());
    305   EXPECT_FALSE(iter.IsWord());
    306   EXPECT_EQ(UTF8ToUTF16("a"), iter.GetString());
    307   EXPECT_FALSE(iter.Advance());
    308   EXPECT_FALSE(iter.IsWord());
    309   EXPECT_FALSE(iter.Advance());   // Test unexpected advance after end.
    310   EXPECT_FALSE(iter.IsWord());
    311 }
    312 
    313 TEST(BreakIteratorTest, BreakCharacter) {
    314   static const wchar_t* kCharacters[] = {
    315     // An English word consisting of four ASCII characters.
    316     L"w", L"o", L"r", L"d", L" ",
    317     // A Hindi word (which means "Hindi") consisting of three Devanagari
    318     // characters.
    319     L"\x0939\x093F", L"\x0928\x094D", L"\x0926\x0940", L" ",
    320     // A Thai word (which means "feel") consisting of three Thai characters.
    321     L"\x0E23\x0E39\x0E49", L"\x0E2A\x0E36", L"\x0E01", L" ",
    322   };
    323   std::vector<string16> characters;
    324   string16 text;
    325   for (size_t i = 0; i < arraysize(kCharacters); ++i) {
    326     characters.push_back(WideToUTF16(kCharacters[i]));
    327     text.append(characters.back());
    328   }
    329   BreakIterator iter(text, BreakIterator::BREAK_CHARACTER);
    330   ASSERT_TRUE(iter.Init());
    331   for (size_t i = 0; i < arraysize(kCharacters); ++i) {
    332     EXPECT_TRUE(iter.Advance());
    333     EXPECT_EQ(characters[i], iter.GetString());
    334   }
    335 }
    336 
    337 }  // namespace i18n
    338 }  // namespace base
    339