1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include <string> 6 #include <vector> 7 8 #include "base/format_macros.h" 9 #include "base/strings/string_split.h" 10 #include "base/strings/stringprintf.h" 11 #include "base/strings/utf_string_conversions.h" 12 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" 13 #include "testing/gtest/include/gtest/gtest.h" 14 15 namespace { 16 17 struct TestCase { 18 const char* language; 19 bool allow_contraction; 20 const wchar_t* expected_words; 21 }; 22 23 } // namespace 24 25 // Tests whether or not our SpellcheckWordIterator can extract only words used 26 // by the specified language from a multi-language text. 27 TEST(SpellcheckWordIteratorTest, SplitWord) { 28 // An input text. This text includes words of several languages. (Some words 29 // are not separated with whitespace characters.) Our SpellcheckWordIterator 30 // should extract only the words used by the specified language from this text 31 // and normalize them so our spell-checker can check their spellings. 32 const wchar_t kTestText[] = 33 // Graphic characters 34 L"!@#$%^&*()" 35 // Latin (including a contraction character and a ligature). 36 L"hello:hello a\xFB03x" 37 // Greek 38 L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5" 39 // Cyrillic 40 L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432" 41 L"\x0443\x0439\x0442\x0435" 42 // Hebrew (including niqquds) 43 L"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd " 44 // Hebrew words with U+0027 and U+05F3 45 L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 " 46 // Hebrew words with U+0022 and U+05F4 47 L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc " 48 // Hebrew words enclosed with ASCII quotes. 49 L"\"\x05e6\x05d4\x0022\x05dc\" '\x05e9\x05c1\x05b8\x05dc\x05d5'" 50 // Arabic (including vowel marks) 51 L"\x0627\x064e\x0644\x0633\x064e\x0651\x0644\x0627" 52 L"\x0645\x064f\x0020\x0639\x064e\x0644\x064e\x064a" 53 L"\x0652\x0643\x064f\x0645\x0652" 54 // Hindi 55 L"\x0930\x093E\x091C\x0927\x093E\x0928" 56 // Thai 57 L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04" 58 L"\x0e23\x0e31\x0e1a" 59 // Hiraganas 60 L"\x3053\x3093\x306B\x3061\x306F" 61 // CJKV ideographs 62 L"\x4F60\x597D" 63 // Hangul Syllables 64 L"\xC548\xB155\xD558\xC138\xC694" 65 // Full-width latin : Hello 66 L"\xFF28\xFF45\xFF4C\xFF4C\xFF4F " 67 L"e.g.,"; 68 69 // The languages and expected results used in this test. 70 static const TestCase kTestCases[] = { 71 { 72 // English (keep contraction words) 73 "en-US", true, L"hello:hello affix Hello e.g" 74 }, { 75 // English (split contraction words) 76 "en-US", false, L"hello hello affix Hello e g" 77 }, { 78 // Greek 79 "el-GR", true, 80 L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5" 81 }, { 82 // Russian 83 "ru-RU", true, 84 L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432" 85 L"\x0443\x0439\x0442\x0435" 86 }, { 87 // Hebrew 88 "he-IL", true, 89 L"\x05e9\x05dc\x05d5\x05dd " 90 L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 " 91 L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc " 92 L"\x05e6\x05d4\x0022\x05dc \x05e9\x05dc\x05d5" 93 }, { 94 // Arabic 95 "ar", true, 96 L"\x0627\x0644\x0633\x0644\x0627\x0645\x0020\x0639" 97 L"\x0644\x064a\x0643\x0645" 98 }, { 99 // Hindi 100 "hi-IN", true, 101 L"\x0930\x093E\x091C\x0927\x093E\x0928" 102 }, { 103 // Thai 104 "th-TH", true, 105 L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04" 106 L"\x0e23\x0e31\x0e1a" 107 }, { 108 // Korean 109 "ko-KR", true, 110 L"\x110b\x1161\x11ab\x1102\x1167\x11bc\x1112\x1161" 111 L"\x1109\x1166\x110b\x116d" 112 }, 113 }; 114 115 for (size_t i = 0; i < arraysize(kTestCases); ++i) { 116 SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i, 117 kTestCases[i].language)); 118 119 SpellcheckCharAttribute attributes; 120 attributes.SetDefaultLanguage(kTestCases[i].language); 121 122 string16 input(WideToUTF16(kTestText)); 123 SpellcheckWordIterator iterator; 124 EXPECT_TRUE(iterator.Initialize(&attributes, 125 kTestCases[i].allow_contraction)); 126 EXPECT_TRUE(iterator.SetText(input.c_str(), input.length())); 127 128 std::vector<string16> expected_words; 129 base::SplitString( 130 WideToUTF16(kTestCases[i].expected_words), ' ', &expected_words); 131 132 string16 actual_word; 133 int actual_start, actual_end; 134 size_t index = 0; 135 while (iterator.GetNextWord(&actual_word, &actual_start, &actual_end)) { 136 EXPECT_TRUE(index < expected_words.size()); 137 if (index < expected_words.size()) 138 EXPECT_EQ(expected_words[index], actual_word); 139 ++index; 140 } 141 } 142 } 143 144 // Tests whether our SpellcheckWordIterator extracts an empty word without 145 // getting stuck in an infinite loop when inputting a Khmer text. (This is a 146 // regression test for Issue 46278.) 147 TEST(SpellcheckWordIteratorTest, RuleSetConsistency) { 148 SpellcheckCharAttribute attributes; 149 attributes.SetDefaultLanguage("en-US"); 150 151 const wchar_t kTestText[] = L"\x1791\x17c1\x002e"; 152 string16 input(WideToUTF16(kTestText)); 153 154 SpellcheckWordIterator iterator; 155 EXPECT_TRUE(iterator.Initialize(&attributes, true)); 156 EXPECT_TRUE(iterator.SetText(input.c_str(), input.length())); 157 158 // When SpellcheckWordIterator uses an inconsistent ICU ruleset, the following 159 // iterator.GetNextWord() call gets stuck in an infinite loop. Therefore, this 160 // test succeeds if this call returns without timeouts. 161 string16 actual_word; 162 int actual_start, actual_end; 163 EXPECT_FALSE(iterator.GetNextWord(&actual_word, &actual_start, &actual_end)); 164 EXPECT_EQ(0, actual_start); 165 EXPECT_EQ(0, actual_end); 166 } 167 168 // Vertify our SpellcheckWordIterator can treat ASCII numbers as word characters 169 // on LTR languages. On the other hand, it should not treat ASCII numbers as 170 // word characters on RTL languages because they change the text direction from 171 // RTL to LTR. 172 TEST(SpellcheckWordIteratorTest, TreatNumbersAsWordCharacters) { 173 // A set of a language, a dummy word, and a text direction used in this test. 174 // For each language, this test splits a dummy word, which consists of ASCII 175 // numbers and an alphabet of the language, into words. When ASCII numbers are 176 // treated as word characters, the split word becomes equal to the dummy word. 177 // Otherwise, the split word does not include ASCII numbers. 178 static const struct { 179 const char* language; 180 const wchar_t* text; 181 bool left_to_right; 182 } kTestCases[] = { 183 { 184 // English 185 "en-US", L"0123456789" L"a", true, 186 }, { 187 // Greek 188 "el-GR", L"0123456789" L"\x03B1", true, 189 }, { 190 // Russian 191 "ru-RU", L"0123456789" L"\x0430", true, 192 }, { 193 // Hebrew 194 "he-IL", L"0123456789" L"\x05D0", false, 195 }, { 196 // Arabic 197 "ar", L"0123456789" L"\x0627", false, 198 }, { 199 // Hindi 200 "hi-IN", L"0123456789" L"\x0905", true, 201 }, { 202 // Thai 203 "th-TH", L"0123456789" L"\x0e01", true, 204 }, { 205 // Korean 206 "ko-KR", L"0123456789" L"\x1100\x1161", true, 207 }, 208 }; 209 210 for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kTestCases); ++i) { 211 SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i, 212 kTestCases[i].language)); 213 214 SpellcheckCharAttribute attributes; 215 attributes.SetDefaultLanguage(kTestCases[i].language); 216 217 string16 input_word(WideToUTF16(kTestCases[i].text)); 218 SpellcheckWordIterator iterator; 219 EXPECT_TRUE(iterator.Initialize(&attributes, true)); 220 EXPECT_TRUE(iterator.SetText(input_word.c_str(), input_word.length())); 221 222 string16 actual_word; 223 int actual_start, actual_end; 224 EXPECT_TRUE(iterator.GetNextWord(&actual_word, &actual_start, &actual_end)); 225 if (kTestCases[i].left_to_right) 226 EXPECT_EQ(input_word, actual_word); 227 else 228 EXPECT_NE(input_word, actual_word); 229 } 230 } 231 232 TEST(SpellcheckWordIteratorTest, Initialization) { 233 // Test initialization works when a default language is set. 234 { 235 SpellcheckCharAttribute attributes; 236 attributes.SetDefaultLanguage("en-US"); 237 238 SpellcheckWordIterator iterator; 239 EXPECT_TRUE(iterator.Initialize(&attributes, true)); 240 } 241 242 // Test initialization fails when no default language is set. 243 { 244 SpellcheckCharAttribute attributes; 245 246 SpellcheckWordIterator iterator; 247 EXPECT_FALSE(iterator.Initialize(&attributes, true)); 248 } 249 } 250