1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // This tool converts Hunspell .aff/.dic pairs to a combined binary dictionary 6 // format (.bdic). This format is more compact, and can be more efficiently 7 // read by the client application. 8 // 9 // We do this conversion manually before publishing dictionary files. It is not 10 // part of any build process. 11 // 12 // See PrintHelp() below for usage. 13 14 #include <stdio.h> 15 16 #include "base/at_exit.h" 17 #include "base/file_util.h" 18 #include "base/files/file_path.h" 19 #include "base/i18n/icu_util.h" 20 #include "base/logging.h" 21 #include "base/process/memory.h" 22 #include "base/strings/string_util.h" 23 #include "chrome/tools/convert_dict/aff_reader.h" 24 #include "chrome/tools/convert_dict/dic_reader.h" 25 #include "third_party/hunspell/google/bdict_reader.h" 26 #include "third_party/hunspell/google/bdict_writer.h" 27 28 namespace { 29 30 // Compares the given word list with the serialized trie to make sure they 31 // are the same. 32 bool VerifyWords(const convert_dict::DicReader::WordList& org_words, 33 const std::string& serialized) { 34 hunspell::BDictReader reader; 35 if (!reader.Init(reinterpret_cast<const unsigned char*>(serialized.data()), 36 serialized.size())) { 37 printf("BDict is invalid\n"); 38 return false; 39 } 40 hunspell::WordIterator iter = reader.GetAllWordIterator(); 41 42 int affix_ids[hunspell::BDict::MAX_AFFIXES_PER_WORD]; 43 44 static const int buf_size = 128; 45 char buf[buf_size]; 46 for (size_t i = 0; i < org_words.size(); i++) { 47 int affix_matches = iter.Advance(buf, buf_size, affix_ids); 48 if (affix_matches == 0) { 49 printf("Found the end before we expected\n"); 50 return false; 51 } 52 53 if (org_words[i].first != buf) { 54 printf("Word doesn't match, word #%s\n", buf); 55 return false; 56 } 57 58 if (affix_matches != static_cast<int>(org_words[i].second.size())) { 59 printf("Different number of affix indices, word #%s\n", buf); 60 return false; 61 } 62 63 // Check the individual affix indices. 64 for (size_t affix_index = 0; affix_index < org_words[i].second.size(); 65 affix_index++) { 66 if (affix_ids[affix_index] != org_words[i].second[affix_index]) { 67 printf("Index doesn't match, word #%s\n", buf); 68 return false; 69 } 70 } 71 } 72 73 return true; 74 } 75 76 int PrintHelp() { 77 printf("Usage: convert_dict <dicfile base name>\n\n"); 78 printf("Example:\n"); 79 printf(" convert_dict en-US\nwill read en-US.dic, en-US.dic_delta, and " 80 "en-US.aff from the current directory and generate en-US.bdic\n\n"); 81 return 1; 82 } 83 84 } // namespace 85 86 #if defined(OS_WIN) 87 int wmain(int argc, wchar_t* argv[]) { 88 #else 89 int main(int argc, char* argv[]) { 90 #endif 91 base::EnableTerminationOnHeapCorruption(); 92 if (argc != 2) 93 return PrintHelp(); 94 95 base::AtExitManager exit_manager; 96 icu_util::Initialize(); 97 98 base::FilePath file_base = base::FilePath(argv[1]); 99 100 base::FilePath aff_path = 101 file_base.ReplaceExtension(FILE_PATH_LITERAL(".aff")); 102 printf("Reading %" PRFilePath " ...\n", aff_path.value().c_str()); 103 convert_dict::AffReader aff_reader(aff_path); 104 if (!aff_reader.Read()) { 105 printf("Unable to read the aff file.\n"); 106 return 1; 107 } 108 109 base::FilePath dic_path = 110 file_base.ReplaceExtension(FILE_PATH_LITERAL(".dic")); 111 printf("Reading %" PRFilePath " ...\n", dic_path.value().c_str()); 112 // DicReader will also read the .dic_delta file. 113 convert_dict::DicReader dic_reader(dic_path); 114 if (!dic_reader.Read(&aff_reader)) { 115 printf("Unable to read the dic file.\n"); 116 return 1; 117 } 118 119 hunspell::BDictWriter writer; 120 writer.SetComment(aff_reader.comments()); 121 writer.SetAffixRules(aff_reader.affix_rules()); 122 writer.SetAffixGroups(aff_reader.GetAffixGroups()); 123 writer.SetReplacements(aff_reader.replacements()); 124 writer.SetOtherCommands(aff_reader.other_commands()); 125 writer.SetWords(dic_reader.words()); 126 127 printf("Serializing...\n"); 128 std::string serialized = writer.GetBDict(); 129 130 printf("Verifying...\n"); 131 if (!VerifyWords(dic_reader.words(), serialized)) { 132 printf("ERROR converting, the dictionary does not check out OK."); 133 return 1; 134 } 135 136 base::FilePath out_path = 137 file_base.ReplaceExtension(FILE_PATH_LITERAL(".bdic")); 138 printf("Writing %" PRFilePath " ...\n", out_path.value().c_str()); 139 FILE* out_file = file_util::OpenFile(out_path, "wb"); 140 if (!out_file) { 141 printf("ERROR writing file\n"); 142 return 1; 143 } 144 size_t written = fwrite(&serialized[0], 1, serialized.size(), out_file); 145 CHECK(written == serialized.size()); 146 file_util::CloseFile(out_file); 147 148 return 0; 149 } 150