Home | History | Annotate | Download | only in convert_dict
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include <map>
      6 #include <string>
      7 
      8 #include "base/file_util.h"
      9 #include "base/format_macros.h"
     10 #include "base/i18n/icu_string_conversions.h"
     11 #include "base/strings/stringprintf.h"
     12 #include "base/strings/utf_string_conversions.h"
     13 #include "chrome/tools/convert_dict/aff_reader.h"
     14 #include "chrome/tools/convert_dict/dic_reader.h"
     15 #include "testing/gtest/include/gtest/gtest.h"
     16 #include "third_party/hunspell/google/bdict_reader.h"
     17 #include "third_party/hunspell/google/bdict_writer.h"
     18 
     19 namespace {
     20 
     21 // Compares the given word list with the serialized trie to make sure they
     22 // are the same.
     23 // (This function is copied from "chrome/tools/convert_dict/convert_dict.cc").
     24 bool VerifyWords(const convert_dict::DicReader::WordList& org_words,
     25                  const std::string& serialized) {
     26   hunspell::BDictReader reader;
     27   EXPECT_TRUE(
     28       reader.Init(reinterpret_cast<const unsigned char*>(serialized.data()),
     29       serialized.size()));
     30 
     31   hunspell::WordIterator iter = reader.GetAllWordIterator();
     32 
     33   int affix_ids[hunspell::BDict::MAX_AFFIXES_PER_WORD];
     34 
     35   static const int kBufSize = 128;
     36   char buf[kBufSize];
     37   for (size_t i = 0; i < org_words.size(); i++) {
     38     SCOPED_TRACE(base::StringPrintf(
     39         "org_words[%" PRIuS "]: %s", i, org_words[i].first.c_str()));
     40 
     41     int affix_matches = iter.Advance(buf, kBufSize, affix_ids);
     42     EXPECT_NE(0, affix_matches);
     43     EXPECT_EQ(org_words[i].first, std::string(buf));
     44     EXPECT_EQ(affix_matches, static_cast<int>(org_words[i].second.size()));
     45 
     46     // Check the individual affix indices.
     47     for (size_t affix_index = 0; affix_index < org_words[i].second.size();
     48          affix_index++) {
     49       EXPECT_EQ(affix_ids[affix_index], org_words[i].second[affix_index]);
     50     }
     51   }
     52 
     53   return true;
     54 }
     55 
     56 // Implements the test process used by ConvertDictTest.
     57 // This function encapsulates all complicated operations used by
     58 // ConvertDictTest so we can conceal them from the tests themselves.
     59 // This function consists of the following parts:
     60 // * Creates a dummy affix file and a dictionary file.
     61 // * Reads the dummy files.
     62 // * Creates bdict data.
     63 // * Verify the bdict data.
     64 void RunDictionaryTest(const char* codepage,
     65                        const std::map<string16, bool>& word_list) {
     66   // Create an affix data and a dictionary data.
     67   std::string aff_data(base::StringPrintf("SET %s\n", codepage));
     68 
     69   std::string dic_data(base::StringPrintf("%" PRIuS "\n", word_list.size()));
     70   for (std::map<string16, bool>::const_iterator it = word_list.begin();
     71        it != word_list.end(); ++it) {
     72     std::string encoded_word;
     73     EXPECT_TRUE(UTF16ToCodepage(it->first,
     74                                 codepage,
     75                                 base::OnStringConversionError::FAIL,
     76                                 &encoded_word));
     77     dic_data += encoded_word;
     78     dic_data += "\n";
     79   }
     80 
     81   // Create a temporary affix file and a dictionary file from the test data.
     82   base::FilePath aff_file;
     83   file_util::CreateTemporaryFile(&aff_file);
     84   file_util::WriteFile(aff_file, aff_data.c_str(), aff_data.length());
     85 
     86   base::FilePath dic_file;
     87   file_util::CreateTemporaryFile(&dic_file);
     88   file_util::WriteFile(dic_file, dic_data.c_str(), dic_data.length());
     89 
     90   {
     91     // Read the above affix file with AffReader and read the dictionary file
     92     // with DicReader, respectively.
     93     convert_dict::AffReader aff_reader(aff_file);
     94     EXPECT_TRUE(aff_reader.Read());
     95 
     96     convert_dict::DicReader dic_reader(dic_file);
     97     EXPECT_TRUE(dic_reader.Read(&aff_reader));
     98 
     99     // Verify this DicReader includes all the input words.
    100     EXPECT_EQ(word_list.size(), dic_reader.words().size());
    101     for (size_t i = 0; i < dic_reader.words().size(); ++i) {
    102       SCOPED_TRACE(base::StringPrintf("dic_reader.words()[%" PRIuS "]: %s",
    103                                       i, dic_reader.words()[i].first.c_str()));
    104       string16 word(UTF8ToUTF16(dic_reader.words()[i].first));
    105       EXPECT_TRUE(word_list.find(word) != word_list.end());
    106     }
    107 
    108     // Create BDICT data and verify it.
    109     hunspell::BDictWriter writer;
    110     writer.SetComment(aff_reader.comments());
    111     writer.SetAffixRules(aff_reader.affix_rules());
    112     writer.SetAffixGroups(aff_reader.GetAffixGroups());
    113     writer.SetReplacements(aff_reader.replacements());
    114     writer.SetOtherCommands(aff_reader.other_commands());
    115     writer.SetWords(dic_reader.words());
    116 
    117     std::string bdict_data = writer.GetBDict();
    118     VerifyWords(dic_reader.words(), bdict_data);
    119     EXPECT_TRUE(hunspell::BDict::Verify(bdict_data.data(), bdict_data.size()));
    120 
    121     // Trim the end of this BDICT and verify our verifier tells these trimmed
    122     // BDICTs are corrupted.
    123     for (size_t i = 1; i < bdict_data.size(); ++i) {
    124       SCOPED_TRACE(base::StringPrintf("i = %" PRIuS, i));
    125       EXPECT_FALSE(hunspell::BDict::Verify(bdict_data.data(),
    126                                            bdict_data.size() - i));
    127     }
    128   }
    129 
    130   // Deletes the temporary files.
    131   // We need to delete them after the above AffReader and DicReader are deleted
    132   // since they close the input files in their destructors.
    133   base::DeleteFile(aff_file, false);
    134   base::DeleteFile(dic_file, false);
    135 }
    136 
    137 }  // namespace
    138 
    139 // Tests whether or not our DicReader can read all the input English words
    140 TEST(ConvertDictTest, English) {
    141   const char kCodepage[] = "UTF-8";
    142   const wchar_t* kWords[] = {
    143     L"I",
    144     L"he",
    145     L"she",
    146     L"it",
    147     L"we",
    148     L"you",
    149     L"they",
    150   };
    151 
    152   std::map<string16, bool> word_list;
    153   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kWords); ++i)
    154     word_list.insert(std::make_pair<string16, bool>(WideToUTF16(kWords[i]),
    155                                                     true));
    156 
    157   RunDictionaryTest(kCodepage, word_list);
    158 }
    159 
    160 // Tests whether or not our DicReader can read all the input Russian words.
    161 TEST(ConvertDictTest, Russian) {
    162   const char kCodepage[] = "KOI8-R";
    163   const wchar_t* kWords[] = {
    164     L"\x044f",
    165     L"\x0442\x044b",
    166     L"\x043e\x043d",
    167     L"\x043e\x043d\x0430",
    168     L"\x043e\x043d\x043e",
    169     L"\x043c\x044b",
    170     L"\x0432\x044b",
    171     L"\x043e\x043d\x0438",
    172   };
    173 
    174   std::map<string16, bool> word_list;
    175   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kWords); ++i)
    176     word_list.insert(std::make_pair<string16, bool>(WideToUTF16(kWords[i]),
    177                                                     true));
    178 
    179   RunDictionaryTest(kCodepage, word_list);
    180 }
    181 
    182 // Tests whether or not our DicReader can read all the input Hungarian words.
    183 TEST(ConvertDictTest, Hungarian) {
    184   const char kCodepage[] = "ISO8859-2";
    185   const wchar_t* kWords[] = {
    186     L"\x00e9\x006e",
    187     L"\x0074\x0065",
    188     L"\x0151",
    189     L"\x00f6\x006e",
    190     L"\x006d\x0061\x0067\x0061",
    191     L"\x006d\x0069",
    192     L"\x0074\x0069",
    193     L"\x0151\x006b",
    194     L"\x00f6\x006e\x00f6\x006b",
    195     L"\x006d\x0061\x0067\x0075\x006b",
    196   };
    197 
    198   std::map<string16, bool> word_list;
    199   for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kWords); ++i)
    200     word_list.insert(std::make_pair<string16, bool>(WideToUTF16(kWords[i]),
    201                                                     true));
    202 
    203   RunDictionaryTest(kCodepage, word_list);
    204 }
    205