Home | History | Annotate | Download | only in convert_dict
      1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/tools/convert_dict/aff_reader.h"
      6 
      7 #include <algorithm>
      8 
      9 #include "base/files/file_util.h"
     10 #include "base/i18n/icu_string_conversions.h"
     11 #include "base/strings/string_split.h"
     12 #include "base/strings/stringprintf.h"
     13 #include "base/strings/utf_string_conversions.h"
     14 #include "chrome/tools/convert_dict/hunspell_reader.h"
     15 
     16 namespace convert_dict {
     17 
     18 namespace {
     19 
     20 // Returns true if the given line begins with the given case-sensitive
     21 // NULL-terminated ASCII string.
     22 bool StringBeginsWith(const std::string& str, const char* with) {
     23   size_t cur = 0;
     24   while (cur < str.size() && with[cur] != 0) {
     25     if (str[cur] != with[cur])
     26       return false;
     27     cur++;
     28   }
     29   return with[cur] == 0;
     30 }
     31 
     32 // Collapses runs of spaces to only one space.
     33 void CollapseDuplicateSpaces(std::string* str) {
     34   int prev_space = false;
     35   for (size_t i = 0; i < str->length(); i++) {
     36     if ((*str)[i] == ' ') {
     37       if (prev_space) {
     38         str->erase(str->begin() + i);
     39         i--;
     40       }
     41       prev_space = true;
     42     } else {
     43       prev_space = false;
     44     }
     45   }
     46 }
     47 
     48 // Print an error message and terminate execution
     49 void Panic(const char* fmt, ...) {
     50   va_list ap;
     51   printf("ERROR: ");
     52   va_start(ap, fmt);
     53   vprintf(fmt, ap);
     54   va_end(ap);
     55   printf("\n");
     56   exit(1);
     57 }
     58 
     59 }  // namespace
     60 
     61 AffReader::AffReader(const base::FilePath& path)
     62     : has_indexed_affixes_(false) {
     63   file_ = base::OpenFile(path, "r");
     64 
     65   // Default to Latin1 in case the file doesn't specify it.
     66   encoding_ = "ISO8859-1";
     67 }
     68 
     69 AffReader::~AffReader() {
     70   if (file_)
     71     base::CloseFile(file_);
     72 }
     73 
     74 bool AffReader::Read() {
     75   if (!file_)
     76     return false;
     77 
     78   // TODO(brettw) handle byte order mark.
     79 
     80   bool got_command = false;
     81   bool got_first_af = false;
     82   bool got_first_rep = false;
     83 
     84   has_indexed_affixes_ = false;
     85 
     86   while (!feof(file_)) {
     87     std::string line = ReadLine(file_);
     88 
     89     // Save comment lines before any commands.
     90     if (!got_command && !line.empty() && line[0] == '#') {
     91       intro_comment_.append(line);
     92       intro_comment_.push_back('\n');
     93       continue;
     94     }
     95 
     96     StripComment(&line);
     97     if (line.empty())
     98       continue;
     99     got_command = true;
    100 
    101     if (StringBeginsWith(line, "SET ")) {
    102       // Character set encoding.
    103       encoding_ = line.substr(4);
    104       TrimLine(&encoding_);
    105     } else if (StringBeginsWith(line, "AF ")) {
    106       // Affix. The first one is the number of ones following which we don't
    107       // bother with.
    108       has_indexed_affixes_ = true;
    109       if (got_first_af) {
    110         std::string group(line.substr(3));
    111         AddAffixGroup(&group);
    112       } else {
    113         got_first_af = true;
    114       }
    115     } else if (StringBeginsWith(line, "SFX ") ||
    116                StringBeginsWith(line, "PFX ")) {
    117       AddAffix(&line);
    118     } else if (StringBeginsWith(line, "REP ")) {
    119       // The first rep line is the number of ones following which we don't
    120       // bother with.
    121       if (got_first_rep) {
    122         std::string replacement(line.substr(4));
    123         AddReplacement(&replacement);
    124       } else {
    125         got_first_rep = true;
    126       }
    127     } else if (StringBeginsWith(line, "TRY ") ||
    128                StringBeginsWith(line, "MAP ")) {
    129       HandleEncodedCommand(line);
    130     } else if (StringBeginsWith(line, "IGNORE ")) {
    131       Panic("We don't support the IGNORE command yet. This would change how "
    132         "we would insert things in our lookup table.");
    133     } else if (StringBeginsWith(line, "COMPLEXPREFIXES ")) {
    134       Panic("We don't support the COMPLEXPREFIXES command yet. This would "
    135         "mean we have to insert words backwards as well (I think)");
    136     } else {
    137       // All other commands get stored in the other commands list.
    138       HandleRawCommand(line);
    139     }
    140   }
    141 
    142   return true;
    143 }
    144 
    145 bool AffReader::EncodingToUTF8(const std::string& encoded,
    146                                std::string* utf8) const {
    147   std::wstring wide_word;
    148   if (!base::CodepageToWide(encoded, encoding(),
    149                             base::OnStringConversionError::FAIL, &wide_word))
    150     return false;
    151   *utf8 = base::WideToUTF8(wide_word);
    152   return true;
    153 }
    154 
    155 int AffReader::GetAFIndexForAFString(const std::string& af_string) {
    156   std::map<std::string, int>::iterator found = affix_groups_.find(af_string);
    157   if (found != affix_groups_.end())
    158     return found->second;
    159   std::string my_string(af_string);
    160   return AddAffixGroup(&my_string);
    161 }
    162 
    163 // We convert the data from our map to an indexed list, and also prefix each
    164 // line with "AF" for the parser to read later.
    165 std::vector<std::string> AffReader::GetAffixGroups() const {
    166   int max_id = 0;
    167   for (std::map<std::string, int>::const_iterator i = affix_groups_.begin();
    168        i != affix_groups_.end(); ++i) {
    169     if (i->second > max_id)
    170       max_id = i->second;
    171   }
    172 
    173   std::vector<std::string> ret;
    174 
    175   ret.resize(max_id);
    176   for (std::map<std::string, int>::const_iterator i = affix_groups_.begin();
    177        i != affix_groups_.end(); ++i) {
    178     // Convert the indices into 1-based.
    179     ret[i->second - 1] = std::string("AF ") + i->first;
    180   }
    181 
    182   return ret;
    183 }
    184 
    185 int AffReader::AddAffixGroup(std::string* rule) {
    186   TrimLine(rule);
    187 
    188   // We use the 1-based index of the rule. This matches the way Hunspell
    189   // refers to the numbers.
    190   int affix_id = static_cast<int>(affix_groups_.size()) + 1;
    191   affix_groups_.insert(std::make_pair(*rule, affix_id));
    192   return affix_id;
    193 }
    194 
    195 void AffReader::AddAffix(std::string* rule) {
    196   TrimLine(rule);
    197   CollapseDuplicateSpaces(rule);
    198 
    199   // These lines have two forms:
    200   //   AFX D Y 4       <- First line, lists how many affixes for "D" there are.
    201   //   AFX D   0 d e   <- Following lines.
    202   // We want to ensure the two last groups on the last line are encoded in
    203   // UTF-8, and we want to make sure that the affix identifier "D" is *not*
    204   // encoded, since that's basically an 8-bit identifier.
    205 
    206   // Count to the third space. Everything after that will be re-encoded. This
    207   // will re-encode the number on the first line, but that will be a NOP. If
    208   // there are not that many groups, we won't reencode it, but pass it through.
    209   int found_spaces = 0;
    210   std::string token;
    211   for (size_t i = 0; i < rule->length(); i++) {
    212     if ((*rule)[i] == ' ') {
    213       found_spaces++;
    214       if (found_spaces == 3) {
    215         size_t part_start = i;
    216         std::string part;
    217         if (token[0] != 'Y' && token[0] != 'N') {
    218           // This token represents a stripping prefix or suffix, which is
    219           // either a length or a string to be replaced.
    220           // We also reencode them to UTF-8.
    221           part_start = i - token.length();
    222         }
    223         part = rule->substr(part_start);  // From here to end.
    224 
    225         if (part.find('-') != std::string::npos) {
    226           // This rule has a morph rule used by old Hungarian dictionaries.
    227           // When a line has a morph rule, its format becomes as listed below.
    228           //   AFX D   0 d e - M
    229           // To make hunspell work more happily, replace this morph rule with
    230           // a compound flag as listed below.
    231           //   AFX D   0 d/M e
    232           std::vector<std::string> tokens;
    233           base::SplitString(part, ' ', &tokens);
    234           if (tokens.size() >= 5) {
    235             part = base::StringPrintf("%s %s/%s %s",
    236                                       tokens[0].c_str(),
    237                                       tokens[1].c_str(),
    238                                       tokens[4].c_str(),
    239                                       tokens[2].c_str());
    240           }
    241         }
    242 
    243         size_t slash_index = part.find('/');
    244         if (slash_index != std::string::npos && !has_indexed_affixes()) {
    245           // This can also have a rule string associated with it following a
    246           // slash. For example:
    247           //    PFX P   0 foo/Y  .
    248           // The "Y" is a flag. For example, the aff file might have a line:
    249           //    COMPOUNDFLAG Y
    250           // so that means that this prefix would be a compound one.
    251           //
    252           // It expects these rules to use the same alias rules as the .dic
    253           // file. We've forced it to use aliases, which is a numerical index
    254           // instead of these character flags, and this needs to be consistent.
    255 
    256           std::string before_flags = part.substr(0, slash_index + 1);
    257 
    258           // After the slash are both the flags, then whitespace, then the part
    259           // that tells us what to strip.
    260           std::vector<std::string> after_slash;
    261           base::SplitString(part.substr(slash_index + 1), ' ', &after_slash);
    262           if (after_slash.size() == 0) {
    263             Panic("Found 0 terms after slash in affix rule '%s', "
    264                       "but need at least 2.",
    265                    part.c_str());
    266           }
    267           if (after_slash.size() == 1) {
    268             printf("WARNING: Found 1 term after slash in affix rule '%s', "
    269                       "but expected at least 2. Adding '.'.\n",
    270                    part.c_str());
    271             after_slash.push_back(".");
    272           }
    273           // Note that we may get a third term here which is the morphological
    274           // description of this rule. This happens in the tests only, so we can
    275           // just ignore it.
    276 
    277           part = base::StringPrintf("%s%d %s",
    278                                     before_flags.c_str(),
    279                                     GetAFIndexForAFString(after_slash[0]),
    280                                     after_slash[1].c_str());
    281         }
    282 
    283         // Reencode from here
    284         std::string reencoded;
    285         if (!EncodingToUTF8(part, &reencoded))
    286           Panic("Cannot encode affix rule part '%s' to utf8.", part.c_str());
    287 
    288         *rule = rule->substr(0, part_start) + reencoded;
    289         break;
    290       }
    291       token.clear();
    292     } else {
    293       token.push_back((*rule)[i]);
    294     }
    295   }
    296 
    297   affix_rules_.push_back(*rule);
    298 }
    299 
    300 void AffReader::AddReplacement(std::string* rule) {
    301   TrimLine(rule);
    302   CollapseDuplicateSpaces(rule);
    303 
    304   std::string utf8rule;
    305   if (!EncodingToUTF8(*rule, &utf8rule))
    306     Panic("Cannot encode replacement rule '%s' to utf8.", rule->c_str());
    307 
    308   // The first space separates key and value.
    309   size_t space_index = utf8rule.find(' ');
    310   if (space_index == std::string::npos)
    311     Panic("Did not find a space in '%s'.", utf8rule.c_str());
    312 
    313   std::vector<std::string> split;
    314   split.push_back(utf8rule.substr(0, space_index));
    315   split.push_back(utf8rule.substr(space_index + 1));
    316 
    317   // Underscores are used to represent spaces in most aff files
    318   // (since the line is parsed on spaces).
    319   std::replace(split[0].begin(), split[0].end(), '_', ' ');
    320   std::replace(split[1].begin(), split[1].end(), '_', ' ');
    321 
    322   replacements_.push_back(std::make_pair(split[0], split[1]));
    323 }
    324 
    325 void AffReader::HandleRawCommand(const std::string& line) {
    326   other_commands_.push_back(line);
    327 }
    328 
    329 void AffReader::HandleEncodedCommand(const std::string& line) {
    330   std::string utf8;
    331   if (!EncodingToUTF8(line, &utf8))
    332     Panic("Cannot encode command '%s' to utf8.", line.c_str());
    333   other_commands_.push_back(utf8);
    334 }
    335 
    336 }  // namespace convert_dict
    337