1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/tools/convert_dict/dic_reader.h" 6 7 #include <algorithm> 8 #include <set> 9 10 #include "base/file_util.h" 11 #include "base/strings/string_util.h" 12 #include "chrome/tools/convert_dict/aff_reader.h" 13 #include "chrome/tools/convert_dict/hunspell_reader.h" 14 15 namespace convert_dict { 16 17 namespace { 18 19 // Maps each unique word to the unique affix group IDs associated with it. 20 typedef std::map<std::string, std::set<int> > WordSet; 21 22 void SplitDicLine(const std::string& line, std::vector<std::string>* output) { 23 // We split the line on a slash not preceded by a backslash. A slash at the 24 // beginning of the line is not a separator either. 25 size_t slash_index = line.size(); 26 for (size_t i = 0; i < line.size(); i++) { 27 if (line[i] == '/' && i > 0 && line[i - 1] != '\\') { 28 slash_index = i; 29 break; 30 } 31 } 32 33 output->clear(); 34 35 // Everything before the slash index is the first term. We also need to 36 // convert all escaped slashes ("\/" sequences) to regular slashes. 37 std::string word = line.substr(0, slash_index); 38 ReplaceSubstringsAfterOffset(&word, 0, "\\/", "/"); 39 output->push_back(word); 40 41 // Everything (if anything) after the slash is the second. 42 if (slash_index < line.size() - 1) 43 output->push_back(line.substr(slash_index + 1)); 44 } 45 46 // This function reads words from a .dic file, or a .dic_delta file. Note that 47 // we read 'all' the words in the file, irrespective of the word count given 48 // in the first non empty line of a .dic file. Also note that, for a .dic_delta 49 // file, the first line actually does _not_ have the number of words. In order 50 // to control this, we use the |file_has_word_count_in_the_first_line| 51 // parameter to tell this method whether the first non empty line in the file 52 // contains the number of words or not. If it does, skip the first line. If it 53 // does not, then the first line contains a word. 54 bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader, 55 const char* file_type, const char* encoding, 56 bool file_has_word_count_in_the_first_line) { 57 int line_number = 0; 58 while (!feof(file)) { 59 std::string line = ReadLine(file); 60 line_number++; 61 StripComment(&line); 62 if (line.empty()) 63 continue; 64 65 if (file_has_word_count_in_the_first_line) { 66 // Skip the first nonempty line, this is the line count. We don't bother 67 // with it and just read all the lines. 68 file_has_word_count_in_the_first_line = false; 69 continue; 70 } 71 72 std::vector<std::string> split; 73 SplitDicLine(line, &split); 74 if (split.empty() || split.size() > 2) { 75 printf("Line %d has extra slashes in the %s file\n", line_number, 76 file_type); 77 return false; 78 } 79 80 // The first part is the word, the second (optional) part is the affix. We 81 // always use UTF-8 as the encoding to simplify life. 82 std::string utf8word; 83 std::string encoding_string(encoding); 84 if (encoding_string == "UTF-8") { 85 utf8word = split[0]; 86 } else if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) { 87 printf("Unable to convert line %d from %s to UTF-8 in the %s file\n", 88 line_number, encoding, file_type); 89 return false; 90 } 91 92 // We always convert the affix to an index. 0 means no affix. 93 int affix_index = 0; 94 if (split.size() == 2) { 95 // Got a rule, which is the stuff after the slash. The line may also have 96 // an optional term separated by a tab. This is the morphological 97 // description. We don't care about this (it is used in the tests to 98 // generate a nice dump), so we remove it. 99 size_t split1_tab_offset = split[1].find('\t'); 100 if (split1_tab_offset != std::string::npos) 101 split[1] = split[1].substr(0, split1_tab_offset); 102 103 if (aff_reader->has_indexed_affixes()) 104 affix_index = atoi(split[1].c_str()); 105 else 106 affix_index = aff_reader->GetAFIndexForAFString(split[1]); 107 } 108 109 // Discard the morphological description if it is attached to the first 110 // token. (It is attached to the first token if a word doesn't have affix 111 // rules.) 112 size_t word_tab_offset = utf8word.find('\t'); 113 if (word_tab_offset != std::string::npos) 114 utf8word = utf8word.substr(0, word_tab_offset); 115 116 WordSet::iterator found = word_set->find(utf8word); 117 std::set<int> affix_vector; 118 affix_vector.insert(affix_index); 119 120 if (found == word_set->end()) 121 word_set->insert(std::make_pair(utf8word, affix_vector)); 122 else 123 found->second.insert(affix_index); 124 } 125 126 return true; 127 } 128 129 } // namespace 130 131 DicReader::DicReader(const base::FilePath& path) { 132 file_ = file_util::OpenFile(path, "r"); 133 134 base::FilePath additional_path = 135 path.ReplaceExtension(FILE_PATH_LITERAL("dic_delta")); 136 additional_words_file_ = file_util::OpenFile(additional_path, "r"); 137 138 if (additional_words_file_) 139 printf("Reading %" PRFilePath " ...\n", additional_path.value().c_str()); 140 else 141 printf("%" PRFilePath " not found.\n", additional_path.value().c_str()); 142 } 143 144 DicReader::~DicReader() { 145 if (file_) 146 file_util::CloseFile(file_); 147 if (additional_words_file_) 148 file_util::CloseFile(additional_words_file_); 149 } 150 151 bool DicReader::Read(AffReader* aff_reader) { 152 if (!file_) 153 return false; 154 155 WordSet word_set; 156 157 // Add words from the dic file to the word set. 158 // Note that the first line is the word count in the file. 159 if (!PopulateWordSet(&word_set, file_, aff_reader, "dic", 160 aff_reader->encoding(), true)) 161 return false; 162 163 // Add words from the .dic_delta file to the word set, if it exists. 164 // The first line is the first word to add. Word count line is not present. 165 // NOTE: These additional words should be encoded as UTF-8. 166 if (additional_words_file_ != NULL) { 167 PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta", 168 "UTF-8", false); 169 } 170 // Make sure the words are sorted, they may be unsorted in the input. 171 for (WordSet::iterator word = word_set.begin(); word != word_set.end(); 172 ++word) { 173 std::vector<int> affixes; 174 for (std::set<int>::iterator aff = word->second.begin(); 175 aff != word->second.end(); ++aff) 176 affixes.push_back(*aff); 177 178 // Double check that the affixes are sorted. This isn't strictly necessary 179 // but it's nice for the file to have a fixed layout. 180 std::sort(affixes.begin(), affixes.end()); 181 std::reverse(affixes.begin(), affixes.end()); 182 words_.push_back(std::make_pair(word->first, affixes)); 183 } 184 185 // Double-check that the words are sorted. 186 std::sort(words_.begin(), words_.end()); 187 return true; 188 } 189 190 } // namespace convert_dict 191