1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "net/tools/tld_cleanup/tld_cleanup_util.h" 6 7 #include "base/file_util.h" 8 #include "base/logging.h" 9 #include "base/strings/string_number_conversions.h" 10 #include "base/strings/string_util.h" 11 #include "url/gurl.h" 12 #include "url/url_parse.h" 13 14 namespace { 15 16 const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS==="; 17 const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS==="; 18 19 const int kExceptionRule = 1; 20 const int kWildcardRule = 2; 21 const int kPrivateRule = 4; 22 } 23 24 namespace net { 25 namespace tld_cleanup { 26 27 // Writes the list of domain rules contained in the 'rules' set to the 28 // 'outfile', with each rule terminated by a LF. The file must already have 29 // been created with write access. 30 bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) { 31 std::string data; 32 data.append("%{\n" 33 "// Copyright 2012 The Chromium Authors. All rights reserved.\n" 34 "// Use of this source code is governed by a BSD-style license " 35 "that can be\n" 36 "// found in the LICENSE file.\n\n" 37 "// This file is generated by net/tools/tld_cleanup/.\n" 38 "// DO NOT MANUALLY EDIT!\n" 39 "%}\n" 40 "struct DomainRule {\n" 41 " int name_offset;\n" 42 " int type; // flags: 1: exception, 2: wildcard, 4: private\n" 43 "};\n" 44 "%%\n"); 45 46 for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) { 47 data.append(i->first); 48 data.append(", "); 49 int type = 0; 50 if (i->second.exception) { 51 type = kExceptionRule; 52 } else if (i->second.wildcard) { 53 type = kWildcardRule; 54 } 55 if (i->second.is_private) { 56 type += kPrivateRule; 57 } 58 data.append(base::IntToString(type)); 59 data.append("\n"); 60 } 61 62 data.append("%%\n"); 63 64 int written = base::WriteFile(outfile, 65 data.data(), 66 static_cast<int>(data.size())); 67 68 return written == static_cast<int>(data.size()); 69 } 70 71 // Adjusts the rule to a standard form: removes single extraneous dots and 72 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as 73 // valid; logs a warning and returns kWarning if it is probably invalid; and 74 // logs an error and returns kError if the rule is (almost) certainly invalid. 75 NormalizeResult NormalizeRule(std::string* domain, Rule* rule) { 76 NormalizeResult result = kSuccess; 77 78 // Strip single leading and trailing dots. 79 if (domain->at(0) == '.') 80 domain->erase(0, 1); 81 if (domain->empty()) { 82 LOG(WARNING) << "Ignoring empty rule"; 83 return kWarning; 84 } 85 if (domain->at(domain->size() - 1) == '.') 86 domain->erase(domain->size() - 1, 1); 87 if (domain->empty()) { 88 LOG(WARNING) << "Ignoring empty rule"; 89 return kWarning; 90 } 91 92 // Allow single leading '*.' or '!', saved here so it's not canonicalized. 93 size_t start_offset = 0; 94 if (domain->at(0) == '!') { 95 domain->erase(0, 1); 96 rule->exception = true; 97 } else if (domain->find("*.") == 0) { 98 domain->erase(0, 2); 99 rule->wildcard = true; 100 } 101 if (domain->empty()) { 102 LOG(WARNING) << "Ignoring empty rule"; 103 return kWarning; 104 } 105 106 // Warn about additional '*.' or '!'. 107 if (domain->find("*.", start_offset) != std::string::npos || 108 domain->find('!', start_offset) != std::string::npos) { 109 LOG(WARNING) << "Keeping probably invalid rule: " << *domain; 110 result = kWarning; 111 } 112 113 // Make a GURL and normalize it, then get the host back out. 114 std::string url = "http://"; 115 url.append(*domain); 116 GURL gurl(url); 117 const std::string& spec = gurl.possibly_invalid_spec(); 118 url::Component host = gurl.parsed_for_possibly_invalid_spec().host; 119 if (host.len < 0) { 120 LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain; 121 return kError; 122 } 123 if (!gurl.is_valid()) { 124 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain; 125 result = kWarning; 126 } 127 domain->assign(spec.substr(host.begin, host.len)); 128 129 return result; 130 } 131 132 NormalizeResult NormalizeDataToRuleMap(const std::string data, 133 RuleMap* rules) { 134 CHECK(rules); 135 // We do a lot of string assignment during parsing, but simplicity is more 136 // important than performance here. 137 std::string domain; 138 NormalizeResult result = kSuccess; 139 size_t line_start = 0; 140 size_t line_end = 0; 141 bool is_private = false; 142 RuleMap extra_rules; 143 int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1; 144 int end_private_length = arraysize(kEndPrivateDomainsComment) - 1; 145 while (line_start < data.size()) { 146 if (line_start + begin_private_length < data.size() && 147 !data.compare(line_start, begin_private_length, 148 kBeginPrivateDomainsComment)) { 149 is_private = true; 150 line_end = line_start + begin_private_length; 151 } else if (line_start + end_private_length < data.size() && 152 !data.compare(line_start, end_private_length, 153 kEndPrivateDomainsComment)) { 154 is_private = false; 155 line_end = line_start + end_private_length; 156 } else if (line_start + 1 < data.size() && 157 data[line_start] == '/' && 158 data[line_start + 1] == '/') { 159 // Skip comments. 160 line_end = data.find_first_of("\r\n", line_start); 161 if (line_end == std::string::npos) 162 line_end = data.size(); 163 } else { 164 // Truncate at first whitespace. 165 line_end = data.find_first_of("\r\n \t", line_start); 166 if (line_end == std::string::npos) 167 line_end = data.size(); 168 domain.assign(data.data(), line_start, line_end - line_start); 169 170 Rule rule; 171 rule.wildcard = false; 172 rule.exception = false; 173 rule.is_private = is_private; 174 NormalizeResult new_result = NormalizeRule(&domain, &rule); 175 if (new_result != kError) { 176 // Check the existing rules to make sure we don't have an exception and 177 // wildcard for the same rule, or that the same domain is listed as both 178 // private and not private. If we did, we'd have to update our 179 // parsing code to handle this case. 180 CHECK(rules->find(domain) == rules->end()) 181 << "Duplicate rule found for " << domain; 182 183 (*rules)[domain] = rule; 184 // Add true TLD for multi-level rules. We don't add them right now, in 185 // case there's an exception or wild card that either exists or might be 186 // added in a later iteration. In those cases, there's no need to add 187 // it and it would just slow down parsing the data. 188 size_t tld_start = domain.find_last_of('.'); 189 if (tld_start != std::string::npos && tld_start + 1 < domain.size()) { 190 std::string extra_rule_domain = domain.substr(tld_start + 1); 191 RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain); 192 Rule extra_rule; 193 extra_rule.exception = false; 194 extra_rule.wildcard = false; 195 if (iter == extra_rules.end()) { 196 extra_rule.is_private = is_private; 197 } else { 198 // A rule already exists, so we ensure that if any of the entries is 199 // not private the result should be that the entry is not private. 200 // An example is .au which is not listed as a real TLD, but only 201 // lists second-level domains such as com.au. Subdomains of .au 202 // (eg. blogspot.com.au) are also listed in the private section, 203 // which is processed later, so this ensures that the real TLD 204 // (eg. .au) is listed as public. 205 extra_rule.is_private = is_private && iter->second.is_private; 206 } 207 extra_rules[extra_rule_domain] = extra_rule; 208 } 209 } 210 result = std::max(result, new_result); 211 } 212 213 // Find beginning of next non-empty line. 214 line_start = data.find_first_of("\r\n", line_end); 215 if (line_start == std::string::npos) 216 line_start = data.size(); 217 line_start = data.find_first_not_of("\r\n", line_start); 218 if (line_start == std::string::npos) 219 line_start = data.size(); 220 } 221 222 for (RuleMap::const_iterator iter = extra_rules.begin(); 223 iter != extra_rules.end(); 224 ++iter) { 225 if (rules->find(iter->first) == rules->end()) { 226 (*rules)[iter->first] = iter->second; 227 } 228 } 229 230 return result; 231 } 232 233 NormalizeResult NormalizeFile(const base::FilePath& in_filename, 234 const base::FilePath& out_filename) { 235 RuleMap rules; 236 std::string data; 237 if (!base::ReadFileToString(in_filename, &data)) { 238 LOG(ERROR) << "Unable to read file"; 239 // We return success since we've already reported the error. 240 return kSuccess; 241 } 242 243 NormalizeResult result = NormalizeDataToRuleMap(data, &rules); 244 245 if (!WriteRules(rules, out_filename)) { 246 LOG(ERROR) << "Error(s) writing output file"; 247 result = kError; 248 } 249 250 return result; 251 } 252 253 254 } // namespace tld_cleanup 255 } // namespace net 256