tools/tld_cleanup/tld_cleanup_util.cc

// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "net/tools/tld_cleanup/tld_cleanup_util.h"

#include "base/file_util.h"
#include "base/logging.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "url/gurl.h"
#include "url/url_parse.h"

namespace {

const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

const int kExceptionRule = 1;
const int kWildcardRule = 2;
const int kPrivateRule = 4;
}

namespace net {
namespace tld_cleanup {

// Writes the list of domain rules contained in the 'rules' set to the
// 'outfile', with each rule terminated by a LF.  The file must already have
// been created with write access.
bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
  std::string data;
  data.append("%{\n"
              "// Copyright 2012 The Chromium Authors. All rights reserved.\n"
              "// Use of this source code is governed by a BSD-style license "
              "that can be\n"
              "// found in the LICENSE file.\n\n"
              "// This file is generated by net/tools/tld_cleanup/.\n"
              "// DO NOT MANUALLY EDIT!\n"
              "%}\n"
              "struct DomainRule {\n"
              "  int name_offset;\n"
              "  int type;  // flags: 1: exception, 2: wildcard, 4: private\n"
              "};\n"
              "%%\n");

  for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {
    data.append(i->first);
    data.append(", ");
    int type = 0;
    if (i->second.exception) {
      type = kExceptionRule;
    } else if (i->second.wildcard) {
      type = kWildcardRule;
    }
    if (i->second.is_private) {
      type += kPrivateRule;
    }
    data.append(base::IntToString(type));
    data.append("\n");
  }

  data.append("%%\n");

  int written = file_util::WriteFile(outfile,
                                     data.data(),
                                     static_cast<int>(data.size()));

  return written == static_cast<int>(data.size());
}

// Adjusts the rule to a standard form: removes single extraneous dots and
// canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
// valid; logs a warning and returns kWarning if it is probably invalid; and
// logs an error and returns kError if the rule is (almost) certainly invalid.
NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {
  NormalizeResult result = kSuccess;

  // Strip single leading and trailing dots.
  if (domain->at(0) == '.')
    domain->erase(0, 1);
  if (domain->empty()) {
    LOG(WARNING) << "Ignoring empty rule";
    return kWarning;
  }
  if (domain->at(domain->size() - 1) == '.')
    domain->erase(domain->size() - 1, 1);
  if (domain->empty()) {
    LOG(WARNING) << "Ignoring empty rule";
    return kWarning;
  }

  // Allow single leading '*.' or '!', saved here so it's not canonicalized.
  size_t start_offset = 0;
  if (domain->at(0) == '!') {
    domain->erase(0, 1);
    rule->exception = true;
  } else if (domain->find("*.") == 0) {
    domain->erase(0, 2);
    rule->wildcard = true;
  }
  if (domain->empty()) {
    LOG(WARNING) << "Ignoring empty rule";
    return kWarning;
  }

  // Warn about additional '*.' or '!'.
  if (domain->find("*.", start_offset) != std::string::npos ||
      domain->find('!', start_offset) != std::string::npos) {
    LOG(WARNING) << "Keeping probably invalid rule: " << *domain;
    result = kWarning;
  }

  // Make a GURL and normalize it, then get the host back out.
  std::string url = "http://";
  url.append(*domain);
  GURL gurl(url);
  const std::string& spec = gurl.possibly_invalid_spec();
  url_parse::Component host = gurl.parsed_for_possibly_invalid_spec().host;
  if (host.len < 0) {
    LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain;
    return kError;
  }
  if (!gurl.is_valid()) {
    LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;
    result = kWarning;
  }
  domain->assign(spec.substr(host.begin, host.len));

  return result;
}

NormalizeResult NormalizeDataToRuleMap(const std::string data,
                                       RuleMap* rules) {
  CHECK(rules);
  // We do a lot of string assignment during parsing, but simplicity is more
  // important than performance here.
  std::string domain;
  NormalizeResult result = kSuccess;
  size_t line_start = 0;
  size_t line_end = 0;
  bool is_private = false;
  RuleMap extra_rules;
  int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;
  int end_private_length = arraysize(kEndPrivateDomainsComment) - 1;
  while (line_start < data.size()) {
    if (line_start + begin_private_length < data.size() &&
        !data.compare(line_start, begin_private_length,
                      kBeginPrivateDomainsComment)) {
      is_private = true;
      line_end = line_start + begin_private_length;
    } else if (line_start + end_private_length < data.size() &&
        !data.compare(line_start, end_private_length,
                      kEndPrivateDomainsComment)) {
      is_private = false;
      line_end = line_start + end_private_length;
    } else if (line_start + 1 < data.size() &&
        data[line_start] == '/' &&
        data[line_start + 1] == '/') {
      // Skip comments.
      line_end = data.find_first_of("\r\n", line_start);
      if (line_end == std::string::npos)
        line_end = data.size();
    } else {
      // Truncate at first whitespace.
      line_end = data.find_first_of("\r\n \t", line_start);
      if (line_end == std::string::npos)
        line_end = data.size();
      domain.assign(data.data(), line_start, line_end - line_start);

      Rule rule;
      rule.wildcard = false;
      rule.exception = false;
      rule.is_private = is_private;
      NormalizeResult new_result = NormalizeRule(&domain, &rule);
      if (new_result != kError) {
        // Check the existing rules to make sure we don't have an exception and
        // wildcard for the same rule, or that the same domain is listed as both
        // private and not private. If we did, we'd have to update our
        // parsing code to handle this case.
        CHECK(rules->find(domain) == rules->end());

        (*rules)[domain] = rule;
        // Add true TLD for multi-level rules.  We don't add them right now, in
        // case there's an exception or wild card that either exists or might be
        // added in a later iteration.  In those cases, there's no need to add
        // it and it would just slow down parsing the data.
        size_t tld_start = domain.find_last_of('.');
        if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
          std::string extra_rule_domain = domain.substr(tld_start + 1);
          RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
          Rule extra_rule;
          extra_rule.exception = false;
          extra_rule.wildcard = false;
          if (iter == extra_rules.end()) {
            extra_rule.is_private = is_private;
          } else {
            // A rule already exists, so we ensure that if any of the entries is
            // not private the result should be that the entry is not private.
            // An example is .au which is not listed as a real TLD, but only
            // lists second-level domains such as com.au. Subdomains of .au
            // (eg. blogspot.com.au) are also listed in the private section,
            // which is processed later, so this ensures that the real TLD
            // (eg. .au) is listed as public.
            extra_rule.is_private = is_private && iter->second.is_private;
          }
          extra_rules[extra_rule_domain] = extra_rule;
        }
      }
      result = std::max(result, new_result);
    }

    // Find beginning of next non-empty line.
    line_start = data.find_first_of("\r\n", line_end);
    if (line_start == std::string::npos)
      line_start = data.size();
    line_start = data.find_first_not_of("\r\n", line_start);
    if (line_start == std::string::npos)
      line_start = data.size();
  }

  for (RuleMap::const_iterator iter = extra_rules.begin();
       iter != extra_rules.end();
       ++iter) {
    if (rules->find(iter->first) == rules->end()) {
      (*rules)[iter->first] = iter->second;
    }
  }

  return result;
}

NormalizeResult NormalizeFile(const base::FilePath& in_filename,
                              const base::FilePath& out_filename) {
  RuleMap rules;
  std::string data;
  if (!file_util::ReadFileToString(in_filename, &data)) {
    LOG(ERROR) << "Unable to read file";
    // We return success since we've already reported the error.
    return kSuccess;
  }

  NormalizeResult result = NormalizeDataToRuleMap(data, &rules);

  if (!WriteRules(rules, out_filename)) {
    LOG(ERROR) << "Error(s) writing output file";
    result = kError;
  }

  return result;
}


}  // namespace tld_cleanup
}  // namespace net