Home | History | Annotate | Download | only in convert_dict
      1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "chrome/tools/convert_dict/dic_reader.h"
      6 
      7 #include <algorithm>
      8 #include <set>
      9 
     10 #include "base/files/file_util.h"
     11 #include "base/strings/string_util.h"
     12 #include "chrome/tools/convert_dict/aff_reader.h"
     13 #include "chrome/tools/convert_dict/hunspell_reader.h"
     14 
     15 namespace convert_dict {
     16 
     17 namespace {
     18 
     19 // Maps each unique word to the unique affix group IDs associated with it.
     20 typedef std::map<std::string, std::set<int> > WordSet;
     21 
     22 void SplitDicLine(const std::string& line, std::vector<std::string>* output) {
     23   // We split the line on a slash not preceded by a backslash. A slash at the
     24   // beginning of the line is not a separator either.
     25   size_t slash_index = line.size();
     26   for (size_t i = 0; i < line.size(); i++) {
     27     if (line[i] == '/' && i > 0 && line[i - 1] != '\\') {
     28       slash_index = i;
     29       break;
     30     }
     31   }
     32 
     33   output->clear();
     34 
     35   // Everything before the slash index is the first term. We also need to
     36   // convert all escaped slashes ("\/" sequences) to regular slashes.
     37   std::string word = line.substr(0, slash_index);
     38   ReplaceSubstringsAfterOffset(&word, 0, "\\/", "/");
     39   output->push_back(word);
     40 
     41   // Everything (if anything) after the slash is the second.
     42   if (slash_index < line.size() - 1)
     43     output->push_back(line.substr(slash_index + 1));
     44 }
     45 
     46 // This function reads words from a .dic file, or a .dic_delta file. Note that
     47 // we read 'all' the words in the file, irrespective of the word count given
     48 // in the first non empty line of a .dic file. Also note that, for a .dic_delta
     49 // file, the first line actually does _not_ have the number of words. In order
     50 // to control this, we use the |file_has_word_count_in_the_first_line|
     51 // parameter to tell this method whether the first non empty line in the file
     52 // contains the number of words or not. If it does, skip the first line. If it
     53 // does not, then the first line contains a word.
     54 bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader,
     55                      const char* file_type, const char* encoding,
     56                      bool file_has_word_count_in_the_first_line) {
     57   int line_number = 0;
     58   while (!feof(file)) {
     59     std::string line = ReadLine(file);
     60     line_number++;
     61     StripComment(&line);
     62     if (line.empty())
     63       continue;
     64 
     65     if (file_has_word_count_in_the_first_line) {
     66       // Skip the first nonempty line, this is the line count. We don't bother
     67       // with it and just read all the lines.
     68       file_has_word_count_in_the_first_line = false;
     69       continue;
     70     }
     71 
     72     std::vector<std::string> split;
     73     SplitDicLine(line, &split);
     74     if (split.empty() || split.size() > 2) {
     75       printf("Line %d has extra slashes in the %s file\n", line_number,
     76              file_type);
     77       return false;
     78     }
     79 
     80     // The first part is the word, the second (optional) part is the affix. We
     81     // always use UTF-8 as the encoding to simplify life.
     82     std::string utf8word;
     83     std::string encoding_string(encoding);
     84     if (encoding_string == "UTF-8") {
     85       utf8word = split[0];
     86     } else if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) {
     87       printf("Unable to convert line %d from %s to UTF-8 in the %s file\n",
     88              line_number, encoding, file_type);
     89       return false;
     90     }
     91 
     92     // We always convert the affix to an index. 0 means no affix.
     93     int affix_index = 0;
     94     if (split.size() == 2) {
     95       // Got a rule, which is the stuff after the slash. The line may also have
     96       // an optional term separated by a tab. This is the morphological
     97       // description. We don't care about this (it is used in the tests to
     98       // generate a nice dump), so we remove it.
     99       size_t split1_tab_offset = split[1].find('\t');
    100       if (split1_tab_offset != std::string::npos)
    101         split[1] = split[1].substr(0, split1_tab_offset);
    102 
    103       if (aff_reader->has_indexed_affixes())
    104         affix_index = atoi(split[1].c_str());
    105       else
    106         affix_index = aff_reader->GetAFIndexForAFString(split[1]);
    107     }
    108 
    109     // Discard the morphological description if it is attached to the first
    110     // token. (It is attached to the first token if a word doesn't have affix
    111     // rules.)
    112     size_t word_tab_offset = utf8word.find('\t');
    113     if (word_tab_offset != std::string::npos)
    114       utf8word = utf8word.substr(0, word_tab_offset);
    115 
    116     WordSet::iterator found = word_set->find(utf8word);
    117     std::set<int> affix_vector;
    118     affix_vector.insert(affix_index);
    119 
    120     if (found == word_set->end())
    121       word_set->insert(std::make_pair(utf8word, affix_vector));
    122     else
    123       found->second.insert(affix_index);
    124   }
    125 
    126   return true;
    127 }
    128 
    129 }  // namespace
    130 
    131 DicReader::DicReader(const base::FilePath& path) {
    132   file_ = base::OpenFile(path, "r");
    133 
    134   base::FilePath additional_path =
    135       path.ReplaceExtension(FILE_PATH_LITERAL("dic_delta"));
    136   additional_words_file_ = base::OpenFile(additional_path, "r");
    137 
    138   if (additional_words_file_)
    139     printf("Reading %" PRFilePath " ...\n", additional_path.value().c_str());
    140   else
    141     printf("%" PRFilePath " not found.\n", additional_path.value().c_str());
    142 }
    143 
    144 DicReader::~DicReader() {
    145   if (file_)
    146     base::CloseFile(file_);
    147   if (additional_words_file_)
    148     base::CloseFile(additional_words_file_);
    149 }
    150 
    151 bool DicReader::Read(AffReader* aff_reader) {
    152   if (!file_)
    153     return false;
    154 
    155   WordSet word_set;
    156 
    157   // Add words from the dic file to the word set.
    158   // Note that the first line is the word count in the file.
    159   if (!PopulateWordSet(&word_set, file_, aff_reader, "dic",
    160                        aff_reader->encoding(), true))
    161     return false;
    162 
    163   // Add words from the .dic_delta file to the word set, if it exists.
    164   // The first line is the first word to add. Word count line is not present.
    165   // NOTE: These additional words should be encoded as UTF-8.
    166   if (additional_words_file_ != NULL) {
    167     PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta",
    168                     "UTF-8", false);
    169   }
    170   // Make sure the words are sorted, they may be unsorted in the input.
    171   for (WordSet::iterator word = word_set.begin(); word != word_set.end();
    172        ++word) {
    173     std::vector<int> affixes;
    174     for (std::set<int>::iterator aff = word->second.begin();
    175          aff != word->second.end(); ++aff)
    176       affixes.push_back(*aff);
    177 
    178     // Double check that the affixes are sorted. This isn't strictly necessary
    179     // but it's nice for the file to have a fixed layout.
    180     std::sort(affixes.begin(), affixes.end());
    181     std::reverse(affixes.begin(), affixes.end());
    182     words_.push_back(std::make_pair(word->first, affixes));
    183   }
    184 
    185   // Double-check that the words are sorted.
    186   std::sort(words_.begin(), words_.end());
    187   return true;
    188 }
    189 
    190 }  // namespace convert_dict
    191