Home | History | Annotate | Download | only in tld_cleanup
      1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // This command-line program converts an effective-TLD data file in UTF-8 from
      6 // the format provided by Mozilla to the format expected by Chrome.  This
      7 // program generates an intermediate file which is then used by gperf to
      8 // generate a perfect hash map.  The benefit of this approach is that no time is
      9 // spent on program initialization to generate the map of this data.
     10 //
     11 // Running this program finds "effective_tld_names.dat" in the expected location
     12 // in the source checkout and generates "effective_tld_names.gperf" next to it.
     13 //
     14 // Any errors or warnings from this program are recorded in tld_cleanup.log.
     15 //
     16 // In particular, it
     17 //  * Strips blank lines and comments, as well as notes for individual rules.
     18 //  * Strips a single leading and/or trailing dot from each rule, if present.
     19 //  * Logs a warning if a rule contains '!' or '*.' other than at the beginning
     20 //    of the rule.  (This also catches multiple ! or *. at the start of a rule.)
     21 //  * Logs a warning if GURL reports a rule as invalid, but keeps the rule.
     22 //  * Canonicalizes each rule's domain by converting it to a GURL and back.
     23 //  * Adds explicit rules for true TLDs found in any rule.
     24 
     25 #include <map>
     26 #include <set>
     27 #include <string>
     28 
     29 #include "base/at_exit.h"
     30 #include "base/command_line.h"
     31 #include "base/file_util.h"
     32 #include "base/i18n/icu_util.h"
     33 #include "base/logging.h"
     34 #include "base/file_path.h"
     35 #include "base/file_util.h"
     36 #include "base/path_service.h"
     37 #include "base/process_util.h"
     38 #include "base/string_util.h"
     39 #include "googleurl/src/gurl.h"
     40 #include "googleurl/src/url_parse.h"
     41 
     42 namespace {
     43 struct Rule {
     44   bool exception;
     45   bool wildcard;
     46 };
     47 
     48 typedef std::map<std::string, Rule> RuleMap;
     49 typedef std::set<std::string> RuleSet;
     50 }
     51 
     52 // Writes the list of domain rules contained in the 'rules' set to the
     53 // 'outfile', with each rule terminated by a LF.  The file must already have
     54 // been created with write access.
     55 bool WriteRules(const RuleMap& rules, FilePath outfile) {
     56   std::string data;
     57   data.append(
     58       "%{\n"
     59       "// Copyright (c) 2009 The Chromium Authors. All rights reserved.\n"
     60       "// Use of this source code is governed by a BSD-style license that\n"
     61       "// can be found in the LICENSE file.\n\n"
     62       "// This file is generated by net/tools/tld_cleanup/.\n"
     63       "// DO NOT MANUALLY EDIT!\n"
     64       "%}\n"
     65       "struct DomainRule {\n"
     66       "  const char *name;\n"
     67       "  int type;  // 1: exception, 2: wildcard\n"
     68       "};\n"
     69       "%%\n"
     70   );
     71 
     72   for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {
     73     data.append(i->first);
     74     data.append(", ");
     75     if (i->second.exception) {
     76       data.append("1");
     77     } else if (i->second.wildcard) {
     78       data.append("2");
     79     } else {
     80       data.append("0");
     81     }
     82     data.append("\n");
     83   }
     84 
     85   data.append("%%\n");
     86 
     87   int written = file_util::WriteFile(outfile, data.data(), data.size());
     88 
     89   return written == static_cast<int>(data.size());
     90 }
     91 
     92 // These result codes should be in increasing order of severity.
     93 typedef enum {
     94   kSuccess,
     95   kWarning,
     96   kError,
     97 } NormalizeResult;
     98 
     99 // Adjusts the rule to a standard form: removes single extraneous dots and
    100 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
    101 // valid; logs a warning and returns kWarning if it is probably invalid; and
    102 // logs an error and returns kError if the rule is (almost) certainly invalid.
    103 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {
    104   NormalizeResult result = kSuccess;
    105 
    106   // Strip single leading and trailing dots.
    107   if (domain->at(0) == '.')
    108     domain->erase(0, 1);
    109   if (domain->empty()) {
    110     LOG(WARNING) << "Ignoring empty rule";
    111     return kWarning;
    112   }
    113   if (domain->at(domain->size() - 1) == '.')
    114     domain->erase(domain->size() - 1, 1);
    115   if (domain->empty()) {
    116     LOG(WARNING) << "Ignoring empty rule";
    117     return kWarning;
    118   }
    119 
    120   // Allow single leading '*.' or '!', saved here so it's not canonicalized.
    121   size_t start_offset = 0;
    122   if (domain->at(0) == '!') {
    123     domain->erase(0, 1);
    124     rule->exception = true;
    125   } else if (domain->find("*.") == 0) {
    126     domain->erase(0, 2);
    127     rule->wildcard = true;
    128   }
    129   if (domain->empty()) {
    130     LOG(WARNING) << "Ignoring empty rule";
    131     return kWarning;
    132   }
    133 
    134   // Warn about additional '*.' or '!'.
    135   if (domain->find("*.", start_offset) != std::string::npos ||
    136       domain->find('!', start_offset) != std::string::npos) {
    137     LOG(WARNING) << "Keeping probably invalid rule: " << *domain;
    138     result = kWarning;
    139   }
    140 
    141   // Make a GURL and normalize it, then get the host back out.
    142   std::string url = "http://";
    143   url.append(*domain);
    144   GURL gurl(url);
    145   const std::string& spec = gurl.possibly_invalid_spec();
    146   url_parse::Component host = gurl.parsed_for_possibly_invalid_spec().host;
    147   if (host.len < 0) {
    148     LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain;
    149     return kError;
    150   }
    151   if (!gurl.is_valid()) {
    152     LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;
    153     result = kWarning;
    154   }
    155   domain->assign(spec.substr(host.begin, host.len));
    156 
    157   return result;
    158 }
    159 
    160 // Loads the file described by 'in_filename', converts it to the desired format
    161 // (see the file comments above), and saves it into 'out_filename'.  Returns
    162 // the most severe of the result codes encountered when normalizing the rules.
    163 NormalizeResult NormalizeFile(const FilePath& in_filename,
    164                               const FilePath& out_filename) {
    165   std::string data;
    166   if (!file_util::ReadFileToString(in_filename, &data)) {
    167     LOG(ERROR) << "Unable to read file";
    168     // We return success since we've already reported the error.
    169     return kSuccess;
    170   }
    171 
    172   // We do a lot of string assignment during parsing, but simplicity is more
    173   // important than performance here.
    174   std::string domain;
    175   NormalizeResult result = kSuccess;
    176   size_t line_start = 0;
    177   size_t line_end = 0;
    178   RuleMap rules;
    179   RuleSet extra_rules;
    180   while (line_start < data.size()) {
    181     // Skip comments.
    182     if (line_start + 1 < data.size() &&
    183         data[line_start] == '/' &&
    184         data[line_start + 1] == '/') {
    185       line_end = data.find_first_of("\r\n", line_start);
    186       if (line_end == std::string::npos)
    187         line_end = data.size();
    188     } else {
    189       // Truncate at first whitespace.
    190       line_end = data.find_first_of("\r\n \t", line_start);
    191       if (line_end == std::string::npos)
    192         line_end = data.size();
    193       domain.assign(data.data(), line_start, line_end - line_start);
    194 
    195       Rule rule;
    196       rule.wildcard = false;
    197       rule.exception = false;
    198       NormalizeResult new_result = NormalizeRule(&domain, &rule);
    199       if (new_result != kError) {
    200         // Check the existing rules to make sure we don't have an exception and
    201         // wildcard for the same rule.  If we did, we'd have to update our
    202         // parsing code to handle this case.
    203         CHECK(rules.find(domain) == rules.end());
    204 
    205         rules[domain] = rule;
    206         // Add true TLD for multi-level rules.  We don't add them right now, in
    207         // case there's an exception or wild card that either exists or might be
    208         // added in a later iteration.  In those cases, there's no need to add
    209         // it and it would just slow down parsing the data.
    210         size_t tld_start = domain.find_last_of('.');
    211         if (tld_start != std::string::npos && tld_start + 1 < domain.size())
    212           extra_rules.insert(domain.substr(tld_start + 1));
    213       }
    214       result = std::max(result, new_result);
    215     }
    216 
    217     // Find beginning of next non-empty line.
    218     line_start = data.find_first_of("\r\n", line_end);
    219     if (line_start == std::string::npos)
    220       line_start = data.size();
    221     line_start = data.find_first_not_of("\r\n", line_start);
    222     if (line_start == std::string::npos)
    223       line_start = data.size();
    224   }
    225 
    226   for (RuleSet::const_iterator iter = extra_rules.begin();
    227        iter != extra_rules.end();
    228        ++iter) {
    229     if (rules.find(*iter) == rules.end()) {
    230       Rule rule;
    231       rule.exception = false;
    232       rule.wildcard = false;
    233       rules[*iter] = rule;
    234     }
    235   }
    236 
    237   if (!WriteRules(rules, out_filename)) {
    238     LOG(ERROR) << "Error(s) writing output file";
    239     result = kError;
    240   }
    241 
    242   return result;
    243 }
    244 
    245 int main(int argc, const char* argv[]) {
    246   base::EnableTerminationOnHeapCorruption();
    247   if (argc != 1) {
    248     fprintf(stderr, "Normalizes and verifies UTF-8 TLD data files\n");
    249     fprintf(stderr, "Usage: %s\n", argv[0]);
    250     return 1;
    251   }
    252 
    253   // Manages the destruction of singletons.
    254   base::AtExitManager exit_manager;
    255 
    256   // Only use OutputDebugString in debug mode.
    257 #ifdef NDEBUG
    258   logging::LoggingDestination destination = logging::LOG_ONLY_TO_FILE;
    259 #else
    260   logging::LoggingDestination destination =
    261       logging::LOG_TO_BOTH_FILE_AND_SYSTEM_DEBUG_LOG;
    262 #endif
    263 
    264   CommandLine::Init(argc, argv);
    265 
    266   FilePath log_filename;
    267   PathService::Get(base::DIR_EXE, &log_filename);
    268   log_filename = log_filename.AppendASCII("tld_cleanup.log");
    269   logging::InitLogging(
    270       log_filename.value().c_str(),
    271       destination,
    272       logging::LOCK_LOG_FILE,
    273       logging::DELETE_OLD_LOG_FILE,
    274       logging::DISABLE_DCHECK_FOR_NON_OFFICIAL_RELEASE_BUILDS);
    275 
    276   icu_util::Initialize();
    277 
    278   FilePath input_file;
    279   PathService::Get(base::DIR_SOURCE_ROOT, &input_file);
    280   input_file = input_file.Append(FILE_PATH_LITERAL("net"))
    281                          .Append(FILE_PATH_LITERAL("base"))
    282                          .Append(FILE_PATH_LITERAL("effective_tld_names.dat"));
    283   FilePath output_file;
    284   PathService::Get(base::DIR_SOURCE_ROOT, &output_file);
    285   output_file = output_file.Append(FILE_PATH_LITERAL("net"))
    286                            .Append(FILE_PATH_LITERAL("base"))
    287                            .Append(FILE_PATH_LITERAL(
    288                                "effective_tld_names.gperf"));
    289   NormalizeResult result = NormalizeFile(input_file, output_file);
    290   if (result != kSuccess) {
    291     fprintf(stderr,
    292             "Errors or warnings processing file.  See log in tld_cleanup.log.");
    293   }
    294 
    295   if (result == kError)
    296     return 1;
    297   return 0;
    298 }
    299