Home | History | Annotate | Download | only in spellchecker
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include <string>
      6 #include <vector>
      7 
      8 #include "base/format_macros.h"
      9 #include "base/strings/string_split.h"
     10 #include "base/strings/stringprintf.h"
     11 #include "base/strings/utf_string_conversions.h"
     12 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h"
     13 #include "testing/gtest/include/gtest/gtest.h"
     14 
     15 namespace {
     16 
     17 struct TestCase {
     18     const char* language;
     19     bool allow_contraction;
     20     const wchar_t* expected_words;
     21 };
     22 
     23 }  // namespace
     24 
     25 // Tests whether or not our SpellcheckWordIterator can extract only words used
     26 // by the specified language from a multi-language text.
     27 TEST(SpellcheckWordIteratorTest, SplitWord) {
     28   // An input text. This text includes words of several languages. (Some words
     29   // are not separated with whitespace characters.) Our SpellcheckWordIterator
     30   // should extract only the words used by the specified language from this text
     31   // and normalize them so our spell-checker can check their spellings.
     32   const wchar_t kTestText[] =
     33       // Graphic characters
     34       L"!@#$%^&*()"
     35       // Latin (including a contraction character and a ligature).
     36       L"hello:hello a\xFB03x"
     37       // Greek
     38       L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
     39       // Cyrillic
     40       L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
     41       L"\x0443\x0439\x0442\x0435"
     42       // Hebrew (including niqquds)
     43       L"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd "
     44       // Hebrew words with U+0027 and U+05F3
     45       L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
     46       // Hebrew words with U+0022 and U+05F4
     47       L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
     48       // Hebrew words enclosed with ASCII quotes.
     49       L"\"\x05e6\x05d4\x0022\x05dc\" '\x05e9\x05c1\x05b8\x05dc\x05d5'"
     50       // Arabic (including vowel marks)
     51       L"\x0627\x064e\x0644\x0633\x064e\x0651\x0644\x0627"
     52       L"\x0645\x064f\x0020\x0639\x064e\x0644\x064e\x064a"
     53       L"\x0652\x0643\x064f\x0645\x0652"
     54       // Hindi
     55       L"\x0930\x093E\x091C\x0927\x093E\x0928"
     56       // Thai
     57       L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
     58       L"\x0e23\x0e31\x0e1a"
     59       // Hiraganas
     60       L"\x3053\x3093\x306B\x3061\x306F"
     61       // CJKV ideographs
     62       L"\x4F60\x597D"
     63       // Hangul Syllables
     64       L"\xC548\xB155\xD558\xC138\xC694"
     65       // Full-width latin : Hello
     66       L"\xFF28\xFF45\xFF4C\xFF4C\xFF4F "
     67       L"e.g.,";
     68 
     69   // The languages and expected results used in this test.
     70   static const TestCase kTestCases[] = {
     71     {
     72       // English (keep contraction words)
     73       "en-US", true, L"hello:hello affix Hello e.g"
     74     }, {
     75       // English (split contraction words)
     76       "en-US", false, L"hello hello affix Hello e g"
     77     }, {
     78       // Greek
     79       "el-GR", true,
     80       L"\x03B3\x03B5\x03B9\x03AC\x0020\x03C3\x03BF\x03C5"
     81     }, {
     82       // Russian
     83       "ru-RU", true,
     84       L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
     85       L"\x0443\x0439\x0442\x0435"
     86     }, {
     87       // Hebrew
     88       "he-IL", true,
     89       L"\x05e9\x05dc\x05d5\x05dd "
     90       L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
     91       L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
     92       L"\x05e6\x05d4\x0022\x05dc \x05e9\x05dc\x05d5"
     93     }, {
     94       // Arabic
     95       "ar", true,
     96       L"\x0627\x0644\x0633\x0644\x0627\x0645\x0020\x0639"
     97       L"\x0644\x064a\x0643\x0645"
     98     }, {
     99       // Hindi
    100       "hi-IN", true,
    101       L"\x0930\x093E\x091C\x0927\x093E\x0928"
    102     }, {
    103       // Thai
    104       "th-TH", true,
    105       L"\x0e2a\x0e27\x0e31\x0e2a\x0e14\x0e35\x0020\x0e04"
    106       L"\x0e23\x0e31\x0e1a"
    107     }, {
    108       // Korean
    109       "ko-KR", true,
    110       L"\x110b\x1161\x11ab\x1102\x1167\x11bc\x1112\x1161"
    111       L"\x1109\x1166\x110b\x116d"
    112     },
    113   };
    114 
    115   for (size_t i = 0; i < arraysize(kTestCases); ++i) {
    116     SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i,
    117                                     kTestCases[i].language));
    118 
    119     SpellcheckCharAttribute attributes;
    120     attributes.SetDefaultLanguage(kTestCases[i].language);
    121 
    122     string16 input(WideToUTF16(kTestText));
    123     SpellcheckWordIterator iterator;
    124     EXPECT_TRUE(iterator.Initialize(&attributes,
    125                                     kTestCases[i].allow_contraction));
    126     EXPECT_TRUE(iterator.SetText(input.c_str(), input.length()));
    127 
    128     std::vector<string16> expected_words;
    129     base::SplitString(
    130         WideToUTF16(kTestCases[i].expected_words), ' ', &expected_words);
    131 
    132     string16 actual_word;
    133     int actual_start, actual_end;
    134     size_t index = 0;
    135     while (iterator.GetNextWord(&actual_word, &actual_start, &actual_end)) {
    136       EXPECT_TRUE(index < expected_words.size());
    137       if (index < expected_words.size())
    138         EXPECT_EQ(expected_words[index], actual_word);
    139       ++index;
    140     }
    141   }
    142 }
    143 
    144 // Tests whether our SpellcheckWordIterator extracts an empty word without
    145 // getting stuck in an infinite loop when inputting a Khmer text. (This is a
    146 // regression test for Issue 46278.)
    147 TEST(SpellcheckWordIteratorTest, RuleSetConsistency) {
    148   SpellcheckCharAttribute attributes;
    149   attributes.SetDefaultLanguage("en-US");
    150 
    151   const wchar_t kTestText[] = L"\x1791\x17c1\x002e";
    152   string16 input(WideToUTF16(kTestText));
    153 
    154   SpellcheckWordIterator iterator;
    155   EXPECT_TRUE(iterator.Initialize(&attributes, true));
    156   EXPECT_TRUE(iterator.SetText(input.c_str(), input.length()));
    157 
    158   // When SpellcheckWordIterator uses an inconsistent ICU ruleset, the following
    159   // iterator.GetNextWord() call gets stuck in an infinite loop. Therefore, this
    160   // test succeeds if this call returns without timeouts.
    161   string16 actual_word;
    162   int actual_start, actual_end;
    163   EXPECT_FALSE(iterator.GetNextWord(&actual_word, &actual_start, &actual_end));
    164   EXPECT_EQ(0, actual_start);
    165   EXPECT_EQ(0, actual_end);
    166 }
    167 
    168 // Vertify our SpellcheckWordIterator can treat ASCII numbers as word characters
    169 // on LTR languages. On the other hand, it should not treat ASCII numbers as
    170 // word characters on RTL languages because they change the text direction from
    171 // RTL to LTR.
    172 TEST(SpellcheckWordIteratorTest, TreatNumbersAsWordCharacters) {
    173   // A set of a language, a dummy word, and a text direction used in this test.
    174   // For each language, this test splits a dummy word, which consists of ASCII
    175   // numbers and an alphabet of the language, into words. When ASCII numbers are
    176   // treated as word characters, the split word becomes equal to the dummy word.
    177   // Otherwise, the split word does not include ASCII numbers.
    178   static const struct {
    179     const char* language;
    180     const wchar_t* text;
    181     bool left_to_right;
    182   } kTestCases[] = {
    183     {
    184       // English
    185       "en-US", L"0123456789" L"a", true,
    186     }, {
    187       // Greek
    188       "el-GR", L"0123456789" L"\x03B1", true,
    189     }, {
    190       // Russian
    191       "ru-RU", L"0123456789" L"\x0430", true,
    192     }, {
    193       // Hebrew
    194       "he-IL", L"0123456789" L"\x05D0", false,
    195     }, {
    196       // Arabic
    197       "ar",  L"0123456789" L"\x0627", false,
    198     }, {
    199       // Hindi
    200       "hi-IN", L"0123456789" L"\x0905", true,
    201     }, {
    202       // Thai
    203       "th-TH", L"0123456789" L"\x0e01", true,
    204     }, {
    205       // Korean
    206       "ko-KR", L"0123456789" L"\x1100\x1161", true,
    207     },
    208   };
    209 
    210   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kTestCases); ++i) {
    211     SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i,
    212                                     kTestCases[i].language));
    213 
    214     SpellcheckCharAttribute attributes;
    215     attributes.SetDefaultLanguage(kTestCases[i].language);
    216 
    217     string16 input_word(WideToUTF16(kTestCases[i].text));
    218     SpellcheckWordIterator iterator;
    219     EXPECT_TRUE(iterator.Initialize(&attributes, true));
    220     EXPECT_TRUE(iterator.SetText(input_word.c_str(), input_word.length()));
    221 
    222     string16 actual_word;
    223     int actual_start, actual_end;
    224     EXPECT_TRUE(iterator.GetNextWord(&actual_word, &actual_start, &actual_end));
    225     if (kTestCases[i].left_to_right)
    226       EXPECT_EQ(input_word, actual_word);
    227     else
    228       EXPECT_NE(input_word, actual_word);
    229   }
    230 }
    231 
    232 TEST(SpellcheckWordIteratorTest, Initialization) {
    233   // Test initialization works when a default language is set.
    234   {
    235     SpellcheckCharAttribute attributes;
    236     attributes.SetDefaultLanguage("en-US");
    237 
    238     SpellcheckWordIterator iterator;
    239     EXPECT_TRUE(iterator.Initialize(&attributes, true));
    240   }
    241 
    242   // Test initialization fails when no default language is set.
    243   {
    244     SpellcheckCharAttribute attributes;
    245 
    246     SpellcheckWordIterator iterator;
    247     EXPECT_FALSE(iterator.Initialize(&attributes, true));
    248   }
    249 }
    250