1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "chrome/tools/convert_dict/aff_reader.h" 6 7 #include <algorithm> 8 9 #include "base/file_util.h" 10 #include "base/i18n/icu_string_conversions.h" 11 #include "base/strings/string_split.h" 12 #include "base/strings/stringprintf.h" 13 #include "base/strings/utf_string_conversions.h" 14 #include "chrome/tools/convert_dict/hunspell_reader.h" 15 16 namespace convert_dict { 17 18 namespace { 19 20 // Returns true if the given line begins with the given case-sensitive 21 // NULL-terminated ASCII string. 22 bool StringBeginsWith(const std::string& str, const char* with) { 23 size_t cur = 0; 24 while (cur < str.size() && with[cur] != 0) { 25 if (str[cur] != with[cur]) 26 return false; 27 cur++; 28 } 29 return with[cur] == 0; 30 } 31 32 // Collapses runs of spaces to only one space. 33 void CollapseDuplicateSpaces(std::string* str) { 34 int prev_space = false; 35 for (size_t i = 0; i < str->length(); i++) { 36 if ((*str)[i] == ' ') { 37 if (prev_space) { 38 str->erase(str->begin() + i); 39 i--; 40 } 41 prev_space = true; 42 } else { 43 prev_space = false; 44 } 45 } 46 } 47 48 // Print an error message and terminate execution 49 void Panic(const char* fmt, ...) { 50 va_list ap; 51 printf("ERROR: "); 52 va_start(ap, fmt); 53 vprintf(fmt, ap); 54 va_end(ap); 55 printf("\n"); 56 exit(1); 57 } 58 59 } // namespace 60 61 AffReader::AffReader(const base::FilePath& path) 62 : has_indexed_affixes_(false) { 63 file_ = base::OpenFile(path, "r"); 64 65 // Default to Latin1 in case the file doesn't specify it. 66 encoding_ = "ISO8859-1"; 67 } 68 69 AffReader::~AffReader() { 70 if (file_) 71 base::CloseFile(file_); 72 } 73 74 bool AffReader::Read() { 75 if (!file_) 76 return false; 77 78 // TODO(brettw) handle byte order mark. 79 80 bool got_command = false; 81 bool got_first_af = false; 82 bool got_first_rep = false; 83 84 has_indexed_affixes_ = false; 85 86 while (!feof(file_)) { 87 std::string line = ReadLine(file_); 88 89 // Save comment lines before any commands. 90 if (!got_command && !line.empty() && line[0] == '#') { 91 intro_comment_.append(line); 92 intro_comment_.push_back('\n'); 93 continue; 94 } 95 96 StripComment(&line); 97 if (line.empty()) 98 continue; 99 got_command = true; 100 101 if (StringBeginsWith(line, "SET ")) { 102 // Character set encoding. 103 encoding_ = line.substr(4); 104 TrimLine(&encoding_); 105 } else if (StringBeginsWith(line, "AF ")) { 106 // Affix. The first one is the number of ones following which we don't 107 // bother with. 108 has_indexed_affixes_ = true; 109 if (got_first_af) { 110 std::string group(line.substr(3)); 111 AddAffixGroup(&group); 112 } else { 113 got_first_af = true; 114 } 115 } else if (StringBeginsWith(line, "SFX ") || 116 StringBeginsWith(line, "PFX ")) { 117 AddAffix(&line); 118 } else if (StringBeginsWith(line, "REP ")) { 119 // The first rep line is the number of ones following which we don't 120 // bother with. 121 if (got_first_rep) { 122 std::string replacement(line.substr(4)); 123 AddReplacement(&replacement); 124 } else { 125 got_first_rep = true; 126 } 127 } else if (StringBeginsWith(line, "TRY ") || 128 StringBeginsWith(line, "MAP ")) { 129 HandleEncodedCommand(line); 130 } else if (StringBeginsWith(line, "IGNORE ")) { 131 Panic("We don't support the IGNORE command yet. This would change how " 132 "we would insert things in our lookup table."); 133 } else if (StringBeginsWith(line, "COMPLEXPREFIXES ")) { 134 Panic("We don't support the COMPLEXPREFIXES command yet. This would " 135 "mean we have to insert words backwards as well (I think)"); 136 } else { 137 // All other commands get stored in the other commands list. 138 HandleRawCommand(line); 139 } 140 } 141 142 return true; 143 } 144 145 bool AffReader::EncodingToUTF8(const std::string& encoded, 146 std::string* utf8) const { 147 std::wstring wide_word; 148 if (!base::CodepageToWide(encoded, encoding(), 149 base::OnStringConversionError::FAIL, &wide_word)) 150 return false; 151 *utf8 = WideToUTF8(wide_word); 152 return true; 153 } 154 155 int AffReader::GetAFIndexForAFString(const std::string& af_string) { 156 std::map<std::string, int>::iterator found = affix_groups_.find(af_string); 157 if (found != affix_groups_.end()) 158 return found->second; 159 std::string my_string(af_string); 160 return AddAffixGroup(&my_string); 161 } 162 163 // We convert the data from our map to an indexed list, and also prefix each 164 // line with "AF" for the parser to read later. 165 std::vector<std::string> AffReader::GetAffixGroups() const { 166 int max_id = 0; 167 for (std::map<std::string, int>::const_iterator i = affix_groups_.begin(); 168 i != affix_groups_.end(); ++i) { 169 if (i->second > max_id) 170 max_id = i->second; 171 } 172 173 std::vector<std::string> ret; 174 175 ret.resize(max_id); 176 for (std::map<std::string, int>::const_iterator i = affix_groups_.begin(); 177 i != affix_groups_.end(); ++i) { 178 // Convert the indices into 1-based. 179 ret[i->second - 1] = std::string("AF ") + i->first; 180 } 181 182 return ret; 183 } 184 185 int AffReader::AddAffixGroup(std::string* rule) { 186 TrimLine(rule); 187 188 // We use the 1-based index of the rule. This matches the way Hunspell 189 // refers to the numbers. 190 int affix_id = static_cast<int>(affix_groups_.size()) + 1; 191 affix_groups_.insert(std::make_pair(*rule, affix_id)); 192 return affix_id; 193 } 194 195 void AffReader::AddAffix(std::string* rule) { 196 TrimLine(rule); 197 CollapseDuplicateSpaces(rule); 198 199 // These lines have two forms: 200 // AFX D Y 4 <- First line, lists how many affixes for "D" there are. 201 // AFX D 0 d e <- Following lines. 202 // We want to ensure the two last groups on the last line are encoded in 203 // UTF-8, and we want to make sure that the affix identifier "D" is *not* 204 // encoded, since that's basically an 8-bit identifier. 205 206 // Count to the third space. Everything after that will be re-encoded. This 207 // will re-encode the number on the first line, but that will be a NOP. If 208 // there are not that many groups, we won't reencode it, but pass it through. 209 int found_spaces = 0; 210 std::string token; 211 for (size_t i = 0; i < rule->length(); i++) { 212 if ((*rule)[i] == ' ') { 213 found_spaces++; 214 if (found_spaces == 3) { 215 size_t part_start = i; 216 std::string part; 217 if (token[0] != 'Y' && token[0] != 'N') { 218 // This token represents a stripping prefix or suffix, which is 219 // either a length or a string to be replaced. 220 // We also reencode them to UTF-8. 221 part_start = i - token.length(); 222 } 223 part = rule->substr(part_start); // From here to end. 224 225 if (part.find('-') != std::string::npos) { 226 // This rule has a morph rule used by old Hungarian dictionaries. 227 // When a line has a morph rule, its format becomes as listed below. 228 // AFX D 0 d e - M 229 // To make hunspell work more happily, replace this morph rule with 230 // a compound flag as listed below. 231 // AFX D 0 d/M e 232 std::vector<std::string> tokens; 233 base::SplitString(part, ' ', &tokens); 234 if (tokens.size() >= 5) { 235 part = base::StringPrintf("%s %s/%s %s", 236 tokens[0].c_str(), 237 tokens[1].c_str(), 238 tokens[4].c_str(), 239 tokens[2].c_str()); 240 } 241 } 242 243 size_t slash_index = part.find('/'); 244 if (slash_index != std::string::npos && !has_indexed_affixes()) { 245 // This can also have a rule string associated with it following a 246 // slash. For example: 247 // PFX P 0 foo/Y . 248 // The "Y" is a flag. For example, the aff file might have a line: 249 // COMPOUNDFLAG Y 250 // so that means that this prefix would be a compound one. 251 // 252 // It expects these rules to use the same alias rules as the .dic 253 // file. We've forced it to use aliases, which is a numerical index 254 // instead of these character flags, and this needs to be consistent. 255 256 std::string before_flags = part.substr(0, slash_index + 1); 257 258 // After the slash are both the flags, then whitespace, then the part 259 // that tells us what to strip. 260 std::vector<std::string> after_slash; 261 base::SplitString(part.substr(slash_index + 1), ' ', &after_slash); 262 if (after_slash.size() == 0) { 263 Panic("Found 0 terms after slash in affix rule '%s', " 264 "but need at least 2.", 265 part.c_str()); 266 } 267 if (after_slash.size() == 1) { 268 printf("WARNING: Found 1 term after slash in affix rule '%s', " 269 "but expected at least 2. Adding '.'.\n", 270 part.c_str()); 271 after_slash.push_back("."); 272 } 273 // Note that we may get a third term here which is the morphological 274 // description of this rule. This happens in the tests only, so we can 275 // just ignore it. 276 277 part = base::StringPrintf("%s%d %s", 278 before_flags.c_str(), 279 GetAFIndexForAFString(after_slash[0]), 280 after_slash[1].c_str()); 281 } 282 283 // Reencode from here 284 std::string reencoded; 285 if (!EncodingToUTF8(part, &reencoded)) 286 Panic("Cannot encode affix rule part '%s' to utf8.", part.c_str()); 287 288 *rule = rule->substr(0, part_start) + reencoded; 289 break; 290 } 291 token.clear(); 292 } else { 293 token.push_back((*rule)[i]); 294 } 295 } 296 297 affix_rules_.push_back(*rule); 298 } 299 300 void AffReader::AddReplacement(std::string* rule) { 301 TrimLine(rule); 302 CollapseDuplicateSpaces(rule); 303 304 std::string utf8rule; 305 if (!EncodingToUTF8(*rule, &utf8rule)) 306 Panic("Cannot encode replacement rule '%s' to utf8.", rule->c_str()); 307 308 // The first space separates key and value. 309 size_t space_index = utf8rule.find(' '); 310 if (space_index == std::string::npos) 311 Panic("Did not find a space in '%s'.", utf8rule.c_str()); 312 313 std::vector<std::string> split; 314 split.push_back(utf8rule.substr(0, space_index)); 315 split.push_back(utf8rule.substr(space_index + 1)); 316 317 // Underscores are used to represent spaces in most aff files 318 // (since the line is parsed on spaces). 319 std::replace(split[0].begin(), split[0].end(), '_', ' '); 320 std::replace(split[1].begin(), split[1].end(), '_', ' '); 321 322 replacements_.push_back(std::make_pair(split[0], split[1])); 323 } 324 325 void AffReader::HandleRawCommand(const std::string& line) { 326 other_commands_.push_back(line); 327 } 328 329 void AffReader::HandleEncodedCommand(const std::string& line) { 330 std::string utf8; 331 if (!EncodingToUTF8(line, &utf8)) 332 Panic("Cannot encode command '%s' to utf8.", line.c_str()); 333 other_commands_.push_back(utf8); 334 } 335 336 } // namespace convert_dict 337