tools/convert_dict/aff_reader.cc

// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/tools/convert_dict/aff_reader.h"

#include <algorithm>

#include "base/file_util.h"
#include "base/i18n/icu_string_conversions.h"
#include "base/strings/string_split.h"
#include "base/strings/stringprintf.h"
#include "base/strings/utf_string_conversions.h"
#include "chrome/tools/convert_dict/hunspell_reader.h"

namespace convert_dict {

namespace {

// Case-sensitive prefix test: returns true when |str| starts with the
// NULL-terminated ASCII string |with|. An empty |with| matches any string.
bool StringBeginsWith(const std::string& str, const char* with) {
  const size_t prefix_length = std::char_traits<char>::length(with);

  // A string shorter than the prefix can never begin with it.
  if (prefix_length > str.size())
    return false;

  return str.compare(0, prefix_length, with, prefix_length) == 0;
}

// Collapses every run of consecutive ' ' characters in |str| down to a single
// space, in place. Only the space character is affected; tabs and other
// whitespace are left untouched.
void CollapseDuplicateSpaces(std::string* str) {
  bool prev_space = false;

  // Compact the string in a single pass: |write_pos| trails the read index
  // and marks where the next kept character goes. This avoids the repeated
  // erase() calls that made the previous implementation quadratic.
  size_t write_pos = 0;
  for (size_t read_pos = 0; read_pos < str->length(); ++read_pos) {
    const char current = (*str)[read_pos];
    const bool is_space = (current == ' ');
    if (is_space && prev_space)
      continue;  // Drop the second and later spaces of a run.
    (*str)[write_pos++] = current;
    prev_space = is_space;
  }

  // Everything past |write_pos| is leftover from the dropped characters.
  str->resize(write_pos);
}

// Reports a fatal error and terminates: writes "ERROR: ", the printf-style
// formatted message and a trailing newline to stdout, then exits the process
// with status 1.
void Panic(const char* fmt, ...) {
  fputs("ERROR: ", stdout);

  va_list args;
  va_start(args, fmt);
  vfprintf(stdout, fmt, args);
  va_end(args);

  fputc('\n', stdout);
  exit(1);
}

}  // namespace

AffReader::AffReader(const base::FilePath& path)
    : has_indexed_affixes_(false) {
  // Latin1 is assumed until a "SET" command in the file overrides it.
  encoding_ = "ISO8859-1";

  // The result is not checked here; Read() returns false when |file_| is
  // null.
  file_ = base::OpenFile(path, "r");
}

AffReader::~AffReader() {
  // Nothing to release when there is no open file handle.
  if (!file_)
    return;
  base::CloseFile(file_);
}

// Reads the .aff file line by line and dispatches each command to the
// matching handler. Comment lines that appear before the first command are
// collected into |intro_comment_|. Returns false when there is no open file
// or when reading from it fails; returns true once end of file is reached.
// Unsupported commands (IGNORE, COMPLEXPREFIXES) terminate the program via
// Panic().
bool AffReader::Read() {
  if (!file_)
    return false;

  // TODO(brettw) handle byte order mark.

  bool got_command = false;
  bool got_first_af = false;
  bool got_first_rep = false;

  has_indexed_affixes_ = false;

  while (!feof(file_)) {
    std::string line = ReadLine(file_);

    // The loop condition only looks at feof(), which stays false after a read
    // error (for example when the path names a directory). Bail out here so
    // such a stream cannot keep this loop running forever.
    if (ferror(file_))
      return false;

    // Save comment lines before any commands.
    if (!got_command && !line.empty() && line[0] == '#') {
      intro_comment_.append(line);
      intro_comment_.push_back('\n');
      continue;
    }

    StripComment(&line);
    if (line.empty())
      continue;
    got_command = true;

    if (StringBeginsWith(line, "SET ")) {
      // Character set encoding.
      encoding_ = line.substr(4);
      TrimLine(&encoding_);
    } else if (StringBeginsWith(line, "AF ")) {
      // Affix. The first one is the number of ones following which we don't
      // bother with.
      has_indexed_affixes_ = true;
      if (got_first_af) {
        std::string group(line.substr(3));
        AddAffixGroup(&group);
      } else {
        got_first_af = true;
      }
    } else if (StringBeginsWith(line, "SFX ") ||
               StringBeginsWith(line, "PFX ")) {
      AddAffix(&line);
    } else if (StringBeginsWith(line, "REP ")) {
      // The first rep line is the number of ones following which we don't
      // bother with.
      if (got_first_rep) {
        std::string replacement(line.substr(4));
        AddReplacement(&replacement);
      } else {
        got_first_rep = true;
      }
    } else if (StringBeginsWith(line, "TRY ") ||
               StringBeginsWith(line, "MAP ")) {
      // These commands carry text in the file's encoding, so they are
      // converted to UTF-8 before being stored.
      HandleEncodedCommand(line);
    } else if (StringBeginsWith(line, "IGNORE ")) {
      Panic("We don't support the IGNORE command yet. This would change how "
        "we would insert things in our lookup table.");
    } else if (StringBeginsWith(line, "COMPLEXPREFIXES ")) {
      // NOTE(review): this only matches when the keyword is followed by a
      // space. If StripComment() trims trailing whitespace, a bare
      // "COMPLEXPREFIXES" line falls through to HandleRawCommand() instead --
      // confirm whether that is intended.
      Panic("We don't support the COMPLEXPREFIXES command yet. This would "
        "mean we have to insert words backwards as well (I think)");
    } else {
      // All other commands get stored in the other commands list.
      HandleRawCommand(line);
    }
  }

  return true;
}

// Converts |encoded| from the encoding reported by encoding() into UTF-8 and
// stores the result in |utf8|. Returns false, leaving |utf8| untouched, when
// the code page conversion fails.
bool AffReader::EncodingToUTF8(const std::string& encoded,
                               std::string* utf8) const {
  std::wstring wide;
  const bool converted = base::CodepageToWide(
      encoded, encoding(), base::OnStringConversionError::FAIL, &wide);
  if (converted)
    *utf8 = WideToUTF8(wide);
  return converted;
}

// Returns the id already associated with |af_string|, or registers it as a
// new affix group through AddAffixGroup() and returns that result.
int AffReader::GetAFIndexForAFString(const std::string& af_string) {
  std::map<std::string, int>::iterator existing =
      affix_groups_.find(af_string);
  if (existing == affix_groups_.end()) {
    // AddAffixGroup() modifies its argument, so hand it a private copy.
    std::string group_copy(af_string);
    return AddAffixGroup(&group_copy);
  }
  return existing->second;
}

// Flattens the rule -> id map into a list indexed by id, prefixing every rule
// with "AF " so the parser can read the lines back later. The group with
// 1-based id N is stored at position N - 1 of the returned list.
std::vector<std::string> AffReader::GetAffixGroups() const {
  typedef std::map<std::string, int>::const_iterator GroupIterator;

  std::vector<std::string> groups;
  for (GroupIterator it = affix_groups_.begin(); it != affix_groups_.end();
       ++it) {
    const int id = it->second;

    // Grow the list whenever a larger id shows up; it ends up sized to the
    // largest id in the map.
    if (id > static_cast<int>(groups.size()))
      groups.resize(id);

    // Ids are 1-based while the list is 0-based.
    groups[id - 1] = "AF " + it->first;
  }

  return groups;
}

// Registers |rule| (trimmed in place by TrimLine()) as an affix group and
// returns its 1-based id. If the trimmed rule is already registered, the id
// it was originally given is returned and the map is left unchanged.
int AffReader::AddAffixGroup(std::string* rule) {
  TrimLine(rule);

  // We use the 1-based index of the rule. This matches the way Hunspell
  // refers to the numbers.
  const int next_id = static_cast<int>(affix_groups_.size()) + 1;

  // insert() does nothing when the key is already present. Returning the
  // stored value rather than |next_id| keeps the result consistent with the
  // map; |next_id| would otherwise be handed out again to the next new group.
  std::pair<std::map<std::string, int>::iterator, bool> inserted =
      affix_groups_.insert(std::make_pair(*rule, next_id));
  return inserted.first->second;
}

// Normalizes one "PFX"/"SFX" line and appends it to |affix_rules_|.
//
// |rule| is trimmed and has runs of spaces collapsed in place. If the line
// contains at least three spaces, the tail of the line is rewritten: an old
// style morph rule ("- M") is folded into a "/M" flag, character flags after
// a slash are replaced with a numeric affix-group index (only when the file
// does not use indexed affixes itself), and the result is re-encoded to
// UTF-8. Lines with fewer than three spaces are stored as they are after
// trimming and collapsing. Calls Panic() when the tail cannot be converted
// to UTF-8.
void AffReader::AddAffix(std::string* rule) {
  TrimLine(rule);
  CollapseDuplicateSpaces(rule);

  // These lines have two forms:
  //   AFX D Y 4       <- First line, lists how many affixes for "D" there are.
  //   AFX D   0 d e   <- Following lines.
  // We want to ensure the two last groups on the last line are encoded in
  // UTF-8, and we want to make sure that the affix identifier "D" is *not*
  // encoded, since that's basically an 8-bit identifier.

  // Count to the third space. Everything after that will be re-encoded. This
  // will re-encode the number on the first line, but that will be a NOP. If
  // there are not that many groups, we won't reencode it, but pass it through.
  int found_spaces = 0;
  // Characters seen since the previous space, i.e. the current field.
  std::string token;
  for (size_t i = 0; i < rule->length(); i++) {
    if ((*rule)[i] == ' ') {
      found_spaces++;
      if (found_spaces == 3) {
        // At this point |token| holds the third field of the line.
        size_t part_start = i;
        std::string part;
        if (token[0] != 'Y' && token[0] != 'N') {
          // This token represents a stripping prefix or suffix, which is
          // either a length or a string to be replaced.
          // We also reencode them to UTF-8.
          part_start = i - token.length();
        }
        part = rule->substr(part_start);  // From here to end.

        if (part.find('-') != std::string::npos) {
          // This rule has a morph rule used by old Hungarian dictionaries.
          // When a line has a morph rule, its format becomes as listed below.
          //   AFX D   0 d e - M
          // To make hunspell work more happily, replace this morph rule with
          // a compound flag as listed below.
          //   AFX D   0 d/M e
          // Note that the check above fires on any '-' in |part|; the rewrite
          // itself only happens when there are at least five fields.
          std::vector<std::string> tokens;
          base::SplitString(part, ' ', &tokens);
          if (tokens.size() >= 5) {
            part = base::StringPrintf("%s %s/%s %s",
                                      tokens[0].c_str(),
                                      tokens[1].c_str(),
                                      tokens[4].c_str(),
                                      tokens[2].c_str());
          }
        }

        size_t slash_index = part.find('/');
        if (slash_index != std::string::npos && !has_indexed_affixes()) {
          // This can also have a rule string associated with it following a
          // slash. For example:
          //    PFX P   0 foo/Y  .
          // The "Y" is a flag. For example, the aff file might have a line:
          //    COMPOUNDFLAG Y
          // so that means that this prefix would be a compound one.
          //
          // It expects these rules to use the same alias rules as the .dic
          // file. We've forced it to use aliases, which is a numerical index
          // instead of these character flags, and this needs to be consistent.

          // Everything up to and including the slash is kept verbatim.
          std::string before_flags = part.substr(0, slash_index + 1);

          // After the slash are both the flags, then whitespace, then the part
          // that tells us what to strip.
          std::vector<std::string> after_slash;
          base::SplitString(part.substr(slash_index + 1), ' ', &after_slash);
          if (after_slash.size() == 0) {
            Panic("Found 0 terms after slash in affix rule '%s', "
                      "but need at least 2.",
                   part.c_str());
          }
          if (after_slash.size() == 1) {
            printf("WARNING: Found 1 term after slash in affix rule '%s', "
                      "but expected at least 2. Adding '.'.\n",
                   part.c_str());
            after_slash.push_back(".");
          }
          // Note that we may get a third term here which is the morphological
          // description of this rule. This happens in the tests only, so we can
          // just ignore it.

          // Replace the character flags with the numeric index of their affix
          // group (registering a new group if the flags were not seen before).
          part = base::StringPrintf("%s%d %s",
                                    before_flags.c_str(),
                                    GetAFIndexForAFString(after_slash[0]),
                                    after_slash[1].c_str());
        }

        // Reencode from here
        std::string reencoded;
        if (!EncodingToUTF8(part, &reencoded))
          Panic("Cannot encode affix rule part '%s' to utf8.", part.c_str());

        // Keep the untouched head of the line and append the rewritten tail.
        *rule = rule->substr(0, part_start) + reencoded;
        break;
      }
      token.clear();
    } else {
      token.push_back((*rule)[i]);
    }
  }

  affix_rules_.push_back(*rule);
}

// Parses the payload of one "REP" line ("<from> <to>") and appends the pair
// to |replacements_|. |rule| is trimmed and has runs of spaces collapsed in
// place, then converted to UTF-8. Calls Panic() if the conversion fails or
// the line contains no space.
void AffReader::AddReplacement(std::string* rule) {
  TrimLine(rule);
  CollapseDuplicateSpaces(rule);

  std::string converted;
  if (!EncodingToUTF8(*rule, &converted))
    Panic("Cannot encode replacement rule '%s' to utf8.", rule->c_str());

  // Key and value are separated by the first space.
  const size_t separator = converted.find(' ');
  if (separator == std::string::npos)
    Panic("Did not find a space in '%s'.", converted.c_str());

  std::string from(converted, 0, separator);
  std::string to(converted, separator + 1);

  // Most aff files write spaces as underscores, because the line itself is
  // split on spaces. Turn them back into real spaces.
  std::replace(from.begin(), from.end(), '_', ' ');
  std::replace(to.begin(), to.end(), '_', ' ');

  replacements_.push_back(std::make_pair(from, to));
}

// Stores |line| unchanged in |other_commands_|. Unlike
// HandleEncodedCommand(), no encoding conversion is applied.
void AffReader::HandleRawCommand(const std::string& line) {
  other_commands_.push_back(line);
}

// Converts |line| from the file's encoding to UTF-8 and stores the result in
// |other_commands_|. Calls Panic() when the conversion fails.
void AffReader::HandleEncodedCommand(const std::string& line) {
  std::string converted;
  const bool ok = EncodingToUTF8(line, &converted);
  if (!ok)
    Panic("Cannot encode command '%s' to utf8.", line.c_str());
  other_commands_.push_back(converted);
}

}  // namespace convert_dict