Home | History | Annotate | Download | only in tld_cleanup
      1 // Copyright 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "net/tools/tld_cleanup/tld_cleanup_util.h"
      6 
      7 #include "base/files/file_util.h"
      8 #include "base/logging.h"
      9 #include "base/strings/string_number_conversions.h"
     10 #include "base/strings/string_util.h"
     11 #include "url/gurl.h"
     12 #include "url/url_parse.h"
     13 
     14 namespace {
     15 
     16 const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
     17 const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";
     18 
     19 const int kExceptionRule = 1;
     20 const int kWildcardRule = 2;
     21 const int kPrivateRule = 4;
     22 }
     23 
     24 namespace net {
     25 namespace tld_cleanup {
     26 
     27 // Writes the list of domain rules contained in the 'rules' set to the
     28 // 'outfile', with each rule terminated by a LF.  The file must already have
     29 // been created with write access.
     30 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
     31   std::string data;
     32   data.append("%{\n"
     33               "// Copyright 2012 The Chromium Authors. All rights reserved.\n"
     34               "// Use of this source code is governed by a BSD-style license "
     35               "that can be\n"
     36               "// found in the LICENSE file.\n\n"
     37               "// This file is generated by net/tools/tld_cleanup/.\n"
     38               "// DO NOT MANUALLY EDIT!\n"
     39               "%}\n"
     40               "struct DomainRule {\n"
     41               "  int name_offset;\n"
     42               "  int type;  // flags: 1: exception, 2: wildcard, 4: private\n"
     43               "};\n"
     44               "%%\n");
     45 
     46   for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {
     47     data.append(i->first);
     48     data.append(", ");
     49     int type = 0;
     50     if (i->second.exception) {
     51       type = kExceptionRule;
     52     } else if (i->second.wildcard) {
     53       type = kWildcardRule;
     54     }
     55     if (i->second.is_private) {
     56       type += kPrivateRule;
     57     }
     58     data.append(base::IntToString(type));
     59     data.append("\n");
     60   }
     61 
     62   data.append("%%\n");
     63 
     64   int written = base::WriteFile(outfile,
     65                                      data.data(),
     66                                      static_cast<int>(data.size()));
     67 
     68   return written == static_cast<int>(data.size());
     69 }
     70 
     71 // Adjusts the rule to a standard form: removes single extraneous dots and
     72 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
     73 // valid; logs a warning and returns kWarning if it is probably invalid; and
     74 // logs an error and returns kError if the rule is (almost) certainly invalid.
     75 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {
     76   NormalizeResult result = kSuccess;
     77 
     78   // Strip single leading and trailing dots.
     79   if (domain->at(0) == '.')
     80     domain->erase(0, 1);
     81   if (domain->empty()) {
     82     LOG(WARNING) << "Ignoring empty rule";
     83     return kWarning;
     84   }
     85   if (domain->at(domain->size() - 1) == '.')
     86     domain->erase(domain->size() - 1, 1);
     87   if (domain->empty()) {
     88     LOG(WARNING) << "Ignoring empty rule";
     89     return kWarning;
     90   }
     91 
     92   // Allow single leading '*.' or '!', saved here so it's not canonicalized.
     93   size_t start_offset = 0;
     94   if (domain->at(0) == '!') {
     95     domain->erase(0, 1);
     96     rule->exception = true;
     97   } else if (domain->find("*.") == 0) {
     98     domain->erase(0, 2);
     99     rule->wildcard = true;
    100   }
    101   if (domain->empty()) {
    102     LOG(WARNING) << "Ignoring empty rule";
    103     return kWarning;
    104   }
    105 
    106   // Warn about additional '*.' or '!'.
    107   if (domain->find("*.", start_offset) != std::string::npos ||
    108       domain->find('!', start_offset) != std::string::npos) {
    109     LOG(WARNING) << "Keeping probably invalid rule: " << *domain;
    110     result = kWarning;
    111   }
    112 
    113   // Make a GURL and normalize it, then get the host back out.
    114   std::string url = "http://";
    115   url.append(*domain);
    116   GURL gurl(url);
    117   const std::string& spec = gurl.possibly_invalid_spec();
    118   url::Component host = gurl.parsed_for_possibly_invalid_spec().host;
    119   if (host.len < 0) {
    120     LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain;
    121     return kError;
    122   }
    123   if (!gurl.is_valid()) {
    124     LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;
    125     result = kWarning;
    126   }
    127   domain->assign(spec.substr(host.begin, host.len));
    128 
    129   return result;
    130 }
    131 
    132 NormalizeResult NormalizeDataToRuleMap(const std::string data,
    133                                        RuleMap* rules) {
    134   CHECK(rules);
    135   // We do a lot of string assignment during parsing, but simplicity is more
    136   // important than performance here.
    137   std::string domain;
    138   NormalizeResult result = kSuccess;
    139   size_t line_start = 0;
    140   size_t line_end = 0;
    141   bool is_private = false;
    142   RuleMap extra_rules;
    143   int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;
    144   int end_private_length = arraysize(kEndPrivateDomainsComment) - 1;
    145   while (line_start < data.size()) {
    146     if (line_start + begin_private_length < data.size() &&
    147         !data.compare(line_start, begin_private_length,
    148                       kBeginPrivateDomainsComment)) {
    149       is_private = true;
    150       line_end = line_start + begin_private_length;
    151     } else if (line_start + end_private_length < data.size() &&
    152         !data.compare(line_start, end_private_length,
    153                       kEndPrivateDomainsComment)) {
    154       is_private = false;
    155       line_end = line_start + end_private_length;
    156     } else if (line_start + 1 < data.size() &&
    157         data[line_start] == '/' &&
    158         data[line_start + 1] == '/') {
    159       // Skip comments.
    160       line_end = data.find_first_of("\r\n", line_start);
    161       if (line_end == std::string::npos)
    162         line_end = data.size();
    163     } else {
    164       // Truncate at first whitespace.
    165       line_end = data.find_first_of("\r\n \t", line_start);
    166       if (line_end == std::string::npos)
    167         line_end = data.size();
    168       domain.assign(data.data(), line_start, line_end - line_start);
    169 
    170       Rule rule;
    171       rule.wildcard = false;
    172       rule.exception = false;
    173       rule.is_private = is_private;
    174       NormalizeResult new_result = NormalizeRule(&domain, &rule);
    175       if (new_result != kError) {
    176         // Check the existing rules to make sure we don't have an exception and
    177         // wildcard for the same rule, or that the same domain is listed as both
    178         // private and not private. If we did, we'd have to update our
    179         // parsing code to handle this case.
    180         CHECK(rules->find(domain) == rules->end())
    181             << "Duplicate rule found for " << domain;
    182 
    183         (*rules)[domain] = rule;
    184         // Add true TLD for multi-level rules.  We don't add them right now, in
    185         // case there's an exception or wild card that either exists or might be
    186         // added in a later iteration.  In those cases, there's no need to add
    187         // it and it would just slow down parsing the data.
    188         size_t tld_start = domain.find_last_of('.');
    189         if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
    190           std::string extra_rule_domain = domain.substr(tld_start + 1);
    191           RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
    192           Rule extra_rule;
    193           extra_rule.exception = false;
    194           extra_rule.wildcard = false;
    195           if (iter == extra_rules.end()) {
    196             extra_rule.is_private = is_private;
    197           } else {
    198             // A rule already exists, so we ensure that if any of the entries is
    199             // not private the result should be that the entry is not private.
    200             // An example is .au which is not listed as a real TLD, but only
    201             // lists second-level domains such as com.au. Subdomains of .au
    202             // (eg. blogspot.com.au) are also listed in the private section,
    203             // which is processed later, so this ensures that the real TLD
    204             // (eg. .au) is listed as public.
    205             extra_rule.is_private = is_private && iter->second.is_private;
    206           }
    207           extra_rules[extra_rule_domain] = extra_rule;
    208         }
    209       }
    210       result = std::max(result, new_result);
    211     }
    212 
    213     // Find beginning of next non-empty line.
    214     line_start = data.find_first_of("\r\n", line_end);
    215     if (line_start == std::string::npos)
    216       line_start = data.size();
    217     line_start = data.find_first_not_of("\r\n", line_start);
    218     if (line_start == std::string::npos)
    219       line_start = data.size();
    220   }
    221 
    222   for (RuleMap::const_iterator iter = extra_rules.begin();
    223        iter != extra_rules.end();
    224        ++iter) {
    225     if (rules->find(iter->first) == rules->end()) {
    226       (*rules)[iter->first] = iter->second;
    227     }
    228   }
    229 
    230   return result;
    231 }
    232 
    233 NormalizeResult NormalizeFile(const base::FilePath& in_filename,
    234                               const base::FilePath& out_filename) {
    235   RuleMap rules;
    236   std::string data;
    237   if (!base::ReadFileToString(in_filename, &data)) {
    238     LOG(ERROR) << "Unable to read file";
    239     // We return success since we've already reported the error.
    240     return kSuccess;
    241   }
    242 
    243   NormalizeResult result = NormalizeDataToRuleMap(data, &rules);
    244 
    245   if (!WriteRules(rules, out_filename)) {
    246     LOG(ERROR) << "Error(s) writing output file";
    247     result = kError;
    248   }
    249 
    250   return result;
    251 }
    252 
    253 
    254 }  // namespace tld_cleanup
    255 }  // namespace net
    256