1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "base/i18n/break_iterator.h" 6 7 #include "base/strings/string_piece.h" 8 #include "base/strings/stringprintf.h" 9 #include "base/strings/utf_string_conversions.h" 10 #include "testing/gtest/include/gtest/gtest.h" 11 12 namespace base { 13 namespace i18n { 14 15 TEST(BreakIteratorTest, BreakWordEmpty) { 16 string16 empty; 17 BreakIterator iter(empty, BreakIterator::BREAK_WORD); 18 ASSERT_TRUE(iter.Init()); 19 EXPECT_FALSE(iter.Advance()); 20 EXPECT_FALSE(iter.IsWord()); 21 EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. 22 EXPECT_FALSE(iter.IsWord()); 23 } 24 25 TEST(BreakIteratorTest, BreakWord) { 26 string16 space(UTF8ToUTF16(" ")); 27 string16 str(UTF8ToUTF16(" foo bar! \npouet boom")); 28 BreakIterator iter(str, BreakIterator::BREAK_WORD); 29 ASSERT_TRUE(iter.Init()); 30 EXPECT_TRUE(iter.Advance()); 31 EXPECT_FALSE(iter.IsWord()); 32 EXPECT_EQ(space, iter.GetString()); 33 EXPECT_TRUE(iter.Advance()); 34 EXPECT_TRUE(iter.IsWord()); 35 EXPECT_EQ(UTF8ToUTF16("foo"), iter.GetString()); 36 EXPECT_TRUE(iter.Advance()); 37 EXPECT_FALSE(iter.IsWord()); 38 EXPECT_EQ(space, iter.GetString()); 39 EXPECT_TRUE(iter.Advance()); 40 EXPECT_TRUE(iter.IsWord()); 41 EXPECT_EQ(UTF8ToUTF16("bar"), iter.GetString()); 42 EXPECT_TRUE(iter.Advance()); 43 EXPECT_FALSE(iter.IsWord()); 44 EXPECT_EQ(UTF8ToUTF16("!"), iter.GetString()); 45 EXPECT_TRUE(iter.Advance()); 46 EXPECT_FALSE(iter.IsWord()); 47 EXPECT_EQ(space, iter.GetString()); 48 EXPECT_TRUE(iter.Advance()); 49 EXPECT_FALSE(iter.IsWord()); 50 EXPECT_EQ(UTF8ToUTF16("\n"), iter.GetString()); 51 EXPECT_TRUE(iter.Advance()); 52 EXPECT_TRUE(iter.IsWord()); 53 EXPECT_EQ(UTF8ToUTF16("pouet"), iter.GetString()); 54 EXPECT_TRUE(iter.Advance()); 55 EXPECT_FALSE(iter.IsWord()); 56 EXPECT_EQ(space, iter.GetString()); 57 EXPECT_TRUE(iter.Advance()); 58 EXPECT_TRUE(iter.IsWord()); 59 EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetString()); 60 EXPECT_FALSE(iter.Advance()); 61 EXPECT_FALSE(iter.IsWord()); 62 EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. 63 EXPECT_FALSE(iter.IsWord()); 64 } 65 66 TEST(BreakIteratorTest, BreakWide16) { 67 // Two greek words separated by space. 68 const string16 str(WideToUTF16( 69 L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" 70 L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2")); 71 const string16 word1(str.substr(0, 10)); 72 const string16 word2(str.substr(11, 5)); 73 BreakIterator iter(str, BreakIterator::BREAK_WORD); 74 ASSERT_TRUE(iter.Init()); 75 EXPECT_TRUE(iter.Advance()); 76 EXPECT_TRUE(iter.IsWord()); 77 EXPECT_EQ(word1, iter.GetString()); 78 EXPECT_TRUE(iter.Advance()); 79 EXPECT_FALSE(iter.IsWord()); 80 EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString()); 81 EXPECT_TRUE(iter.Advance()); 82 EXPECT_TRUE(iter.IsWord()); 83 EXPECT_EQ(word2, iter.GetString()); 84 EXPECT_FALSE(iter.Advance()); 85 EXPECT_FALSE(iter.IsWord()); 86 EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. 87 EXPECT_FALSE(iter.IsWord()); 88 } 89 90 TEST(BreakIteratorTest, BreakWide32) { 91 // U+1D49C MATHEMATICAL SCRIPT CAPITAL A 92 const char* very_wide_char = "\xF0\x9D\x92\x9C"; 93 const string16 str( 94 UTF8ToUTF16(base::StringPrintf("%s a", very_wide_char))); 95 const string16 very_wide_word(str.substr(0, 2)); 96 97 BreakIterator iter(str, BreakIterator::BREAK_WORD); 98 ASSERT_TRUE(iter.Init()); 99 EXPECT_TRUE(iter.Advance()); 100 EXPECT_TRUE(iter.IsWord()); 101 EXPECT_EQ(very_wide_word, iter.GetString()); 102 EXPECT_TRUE(iter.Advance()); 103 EXPECT_FALSE(iter.IsWord()); 104 EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString()); 105 EXPECT_TRUE(iter.Advance()); 106 EXPECT_TRUE(iter.IsWord()); 107 EXPECT_EQ(UTF8ToUTF16("a"), iter.GetString()); 108 EXPECT_FALSE(iter.Advance()); 109 EXPECT_FALSE(iter.IsWord()); 110 EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. 111 EXPECT_FALSE(iter.IsWord()); 112 } 113 114 TEST(BreakIteratorTest, BreakSpaceEmpty) { 115 string16 empty; 116 BreakIterator iter(empty, BreakIterator::BREAK_SPACE); 117 ASSERT_TRUE(iter.Init()); 118 EXPECT_FALSE(iter.Advance()); 119 EXPECT_FALSE(iter.IsWord()); 120 EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. 121 EXPECT_FALSE(iter.IsWord()); 122 } 123 124 TEST(BreakIteratorTest, BreakSpace) { 125 string16 str(UTF8ToUTF16(" foo bar! \npouet boom")); 126 BreakIterator iter(str, BreakIterator::BREAK_SPACE); 127 ASSERT_TRUE(iter.Init()); 128 EXPECT_TRUE(iter.Advance()); 129 EXPECT_FALSE(iter.IsWord()); 130 EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString()); 131 EXPECT_TRUE(iter.Advance()); 132 EXPECT_FALSE(iter.IsWord()); 133 EXPECT_EQ(UTF8ToUTF16("foo "), iter.GetString()); 134 EXPECT_TRUE(iter.Advance()); 135 EXPECT_FALSE(iter.IsWord()); 136 EXPECT_EQ(UTF8ToUTF16("bar! \n"), iter.GetString()); 137 EXPECT_TRUE(iter.Advance()); 138 EXPECT_FALSE(iter.IsWord()); 139 EXPECT_EQ(UTF8ToUTF16("pouet "), iter.GetString()); 140 EXPECT_TRUE(iter.Advance()); 141 EXPECT_FALSE(iter.IsWord()); 142 EXPECT_EQ(UTF8ToUTF16("boom"), iter.GetString()); 143 EXPECT_FALSE(iter.Advance()); 144 EXPECT_FALSE(iter.IsWord()); 145 EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. 146 EXPECT_FALSE(iter.IsWord()); 147 } 148 149 TEST(BreakIteratorTest, BreakSpaceSP) { 150 string16 str(UTF8ToUTF16(" foo bar! \npouet boom ")); 151 BreakIterator iter(str, BreakIterator::BREAK_SPACE); 152 ASSERT_TRUE(iter.Init()); 153 EXPECT_TRUE(iter.Advance()); 154 EXPECT_FALSE(iter.IsWord()); 155 EXPECT_EQ(UTF8ToUTF16(" "), iter.GetString()); 156 EXPECT_TRUE(iter.Advance()); 157 EXPECT_FALSE(iter.IsWord()); 158 EXPECT_EQ(UTF8ToUTF16("foo "), iter.GetString()); 159 EXPECT_TRUE(iter.Advance()); 160 EXPECT_FALSE(iter.IsWord()); 161 EXPECT_EQ(UTF8ToUTF16("bar! \n"), iter.GetString()); 162 EXPECT_TRUE(iter.Advance()); 163 EXPECT_FALSE(iter.IsWord()); 164 EXPECT_EQ(UTF8ToUTF16("pouet "), iter.GetString()); 165 EXPECT_TRUE(iter.Advance()); 166 EXPECT_FALSE(iter.IsWord()); 167 EXPECT_EQ(UTF8ToUTF16("boom "), iter.GetString()); 168 EXPECT_FALSE(iter.Advance()); 169 EXPECT_FALSE(iter.IsWord()); 170 EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. 171 EXPECT_FALSE(iter.IsWord()); 172 } 173 174 TEST(BreakIteratorTest, BreakSpacekWide16) { 175 // Two Greek words. 176 const string16 str(WideToUTF16( 177 L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" 178 L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2")); 179 const string16 word1(str.substr(0, 11)); 180 const string16 word2(str.substr(11, 5)); 181 BreakIterator iter(str, BreakIterator::BREAK_SPACE); 182 ASSERT_TRUE(iter.Init()); 183 EXPECT_TRUE(iter.Advance()); 184 EXPECT_FALSE(iter.IsWord()); 185 EXPECT_EQ(word1, iter.GetString()); 186 EXPECT_TRUE(iter.Advance()); 187 EXPECT_FALSE(iter.IsWord()); 188 EXPECT_EQ(word2, iter.GetString()); 189 EXPECT_FALSE(iter.Advance()); 190 EXPECT_FALSE(iter.IsWord()); 191 EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. 192 EXPECT_FALSE(iter.IsWord()); 193 } 194 195 TEST(BreakIteratorTest, BreakSpaceWide32) { 196 // U+1D49C MATHEMATICAL SCRIPT CAPITAL A 197 const char* very_wide_char = "\xF0\x9D\x92\x9C"; 198 const string16 str( 199 UTF8ToUTF16(base::StringPrintf("%s a", very_wide_char))); 200 const string16 very_wide_word(str.substr(0, 3)); 201 202 BreakIterator iter(str, BreakIterator::BREAK_SPACE); 203 ASSERT_TRUE(iter.Init()); 204 EXPECT_TRUE(iter.Advance()); 205 EXPECT_FALSE(iter.IsWord()); 206 EXPECT_EQ(very_wide_word, iter.GetString()); 207 EXPECT_TRUE(iter.Advance()); 208 EXPECT_FALSE(iter.IsWord()); 209 EXPECT_EQ(UTF8ToUTF16("a"), iter.GetString()); 210 EXPECT_FALSE(iter.Advance()); 211 EXPECT_FALSE(iter.IsWord()); 212 EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. 213 EXPECT_FALSE(iter.IsWord()); 214 } 215 216 TEST(BreakIteratorTest, BreakLineEmpty) { 217 string16 empty; 218 BreakIterator iter(empty, BreakIterator::BREAK_NEWLINE); 219 ASSERT_TRUE(iter.Init()); 220 EXPECT_FALSE(iter.Advance()); 221 EXPECT_FALSE(iter.IsWord()); 222 EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. 223 EXPECT_FALSE(iter.IsWord()); 224 } 225 226 TEST(BreakIteratorTest, BreakLine) { 227 string16 nl(UTF8ToUTF16("\n")); 228 string16 str(UTF8ToUTF16("\nfoo bar!\n\npouet boom")); 229 BreakIterator iter(str, BreakIterator::BREAK_NEWLINE); 230 ASSERT_TRUE(iter.Init()); 231 EXPECT_TRUE(iter.Advance()); 232 EXPECT_FALSE(iter.IsWord()); 233 EXPECT_EQ(nl, iter.GetString()); 234 EXPECT_TRUE(iter.Advance()); 235 EXPECT_FALSE(iter.IsWord()); 236 EXPECT_EQ(UTF8ToUTF16("foo bar!\n"), iter.GetString()); 237 EXPECT_TRUE(iter.Advance()); 238 EXPECT_FALSE(iter.IsWord()); 239 EXPECT_EQ(nl, iter.GetString()); 240 EXPECT_TRUE(iter.Advance()); 241 EXPECT_FALSE(iter.IsWord()); 242 EXPECT_EQ(UTF8ToUTF16("pouet boom"), iter.GetString()); 243 EXPECT_FALSE(iter.Advance()); 244 EXPECT_FALSE(iter.IsWord()); 245 EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. 246 EXPECT_FALSE(iter.IsWord()); 247 } 248 249 TEST(BreakIteratorTest, BreakLineNL) { 250 string16 nl(UTF8ToUTF16("\n")); 251 string16 str(UTF8ToUTF16("\nfoo bar!\n\npouet boom\n")); 252 BreakIterator iter(str, BreakIterator::BREAK_NEWLINE); 253 ASSERT_TRUE(iter.Init()); 254 EXPECT_TRUE(iter.Advance()); 255 EXPECT_FALSE(iter.IsWord()); 256 EXPECT_EQ(nl, iter.GetString()); 257 EXPECT_TRUE(iter.Advance()); 258 EXPECT_FALSE(iter.IsWord()); 259 EXPECT_EQ(UTF8ToUTF16("foo bar!\n"), iter.GetString()); 260 EXPECT_TRUE(iter.Advance()); 261 EXPECT_FALSE(iter.IsWord()); 262 EXPECT_EQ(nl, iter.GetString()); 263 EXPECT_TRUE(iter.Advance()); 264 EXPECT_FALSE(iter.IsWord()); 265 EXPECT_EQ(UTF8ToUTF16("pouet boom\n"), iter.GetString()); 266 EXPECT_FALSE(iter.Advance()); 267 EXPECT_FALSE(iter.IsWord()); 268 EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. 269 EXPECT_FALSE(iter.IsWord()); 270 } 271 272 TEST(BreakIteratorTest, BreakLineWide16) { 273 // Two Greek words separated by newline. 274 const string16 str(WideToUTF16( 275 L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" 276 L"\x03bf\x03c2\x000a\x0399\x03c3\x03c4\x03cc\x03c2")); 277 const string16 line1(str.substr(0, 11)); 278 const string16 line2(str.substr(11, 5)); 279 BreakIterator iter(str, BreakIterator::BREAK_NEWLINE); 280 ASSERT_TRUE(iter.Init()); 281 EXPECT_TRUE(iter.Advance()); 282 EXPECT_FALSE(iter.IsWord()); 283 EXPECT_EQ(line1, iter.GetString()); 284 EXPECT_TRUE(iter.Advance()); 285 EXPECT_FALSE(iter.IsWord()); 286 EXPECT_EQ(line2, iter.GetString()); 287 EXPECT_FALSE(iter.Advance()); 288 EXPECT_FALSE(iter.IsWord()); 289 EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. 290 EXPECT_FALSE(iter.IsWord()); 291 } 292 293 TEST(BreakIteratorTest, BreakLineWide32) { 294 // U+1D49C MATHEMATICAL SCRIPT CAPITAL A 295 const char* very_wide_char = "\xF0\x9D\x92\x9C"; 296 const string16 str( 297 UTF8ToUTF16(base::StringPrintf("%s\na", very_wide_char))); 298 const string16 very_wide_line(str.substr(0, 3)); 299 BreakIterator iter(str, BreakIterator::BREAK_NEWLINE); 300 ASSERT_TRUE(iter.Init()); 301 EXPECT_TRUE(iter.Advance()); 302 EXPECT_FALSE(iter.IsWord()); 303 EXPECT_EQ(very_wide_line, iter.GetString()); 304 EXPECT_TRUE(iter.Advance()); 305 EXPECT_FALSE(iter.IsWord()); 306 EXPECT_EQ(UTF8ToUTF16("a"), iter.GetString()); 307 EXPECT_FALSE(iter.Advance()); 308 EXPECT_FALSE(iter.IsWord()); 309 EXPECT_FALSE(iter.Advance()); // Test unexpected advance after end. 310 EXPECT_FALSE(iter.IsWord()); 311 } 312 313 TEST(BreakIteratorTest, BreakCharacter) { 314 static const wchar_t* kCharacters[] = { 315 // An English word consisting of four ASCII characters. 316 L"w", L"o", L"r", L"d", L" ", 317 // A Hindi word (which means "Hindi") consisting of three Devanagari 318 // characters. 319 L"\x0939\x093F", L"\x0928\x094D", L"\x0926\x0940", L" ", 320 // A Thai word (which means "feel") consisting of three Thai characters. 321 L"\x0E23\x0E39\x0E49", L"\x0E2A\x0E36", L"\x0E01", L" ", 322 }; 323 std::vector<string16> characters; 324 string16 text; 325 for (size_t i = 0; i < arraysize(kCharacters); ++i) { 326 characters.push_back(WideToUTF16(kCharacters[i])); 327 text.append(characters.back()); 328 } 329 BreakIterator iter(text, BreakIterator::BREAK_CHARACTER); 330 ASSERT_TRUE(iter.Init()); 331 for (size_t i = 0; i < arraysize(kCharacters); ++i) { 332 EXPECT_TRUE(iter.Advance()); 333 EXPECT_EQ(characters[i], iter.GetString()); 334 } 335 } 336 337 } // namespace i18n 338 } // namespace base 339