Home | History | Annotate | Download | only in registry_controlled_domains
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // NB: Modelled after Mozilla's code (originally written by Pamela Greene,
      6 // later modified by others), but almost entirely rewritten for Chrome.
      7 //   (netwerk/dns/src/nsEffectiveTLDService.cpp)
      8 /* ***** BEGIN LICENSE BLOCK *****
      9  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
     10  *
     11  * The contents of this file are subject to the Mozilla Public License Version
     12  * 1.1 (the "License"); you may not use this file except in compliance with
     13  * the License. You may obtain a copy of the License at
     14  * http://www.mozilla.org/MPL/
     15  *
     16  * Software distributed under the License is distributed on an "AS IS" basis,
     17  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
     18  * for the specific language governing rights and limitations under the
     19  * License.
     20  *
     21  * The Original Code is Mozilla Effective-TLD Service
     22  *
     23  * The Initial Developer of the Original Code is
     24  * Google Inc.
     25  * Portions created by the Initial Developer are Copyright (C) 2006
     26  * the Initial Developer. All Rights Reserved.
     27  *
     28  * Contributor(s):
     29  *   Pamela Greene <pamg.bugs (at) gmail.com> (original author)
     30  *   Daniel Witte <dwitte (at) stanford.edu>
     31  *
     32  * Alternatively, the contents of this file may be used under the terms of
     33  * either the GNU General Public License Version 2 or later (the "GPL"), or
     34  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
     35  * in which case the provisions of the GPL or the LGPL are applicable instead
     36  * of those above. If you wish to allow use of your version of this file only
     37  * under the terms of either the GPL or the LGPL, and not to allow others to
     38  * use your version of this file under the terms of the MPL, indicate your
     39  * decision by deleting the provisions above and replace them with the notice
     40  * and other provisions required by the GPL or the LGPL. If you do not delete
     41  * the provisions above, a recipient may use your version of this file under
     42  * the terms of any one of the MPL, the GPL or the LGPL.
     43  *
     44  * ***** END LICENSE BLOCK ***** */
     45 
     46 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
     47 
     48 #include "base/logging.h"
     49 #include "base/strings/string_util.h"
     50 #include "base/strings/utf_string_conversions.h"
     51 #include "net/base/net_module.h"
     52 #include "net/base/net_util.h"
     53 #include "url/gurl.h"
     54 #include "url/url_parse.h"
     55 
     56 namespace net {
     57 namespace registry_controlled_domains {
     58 
     59 namespace {
     60 #include "net/base/registry_controlled_domains/effective_tld_names-inc.cc"
     61 
     62 // See make_dafsa.py for documentation of the generated dafsa byte array.
     63 
     64 const unsigned char* g_graph = kDafsa;
     65 size_t g_graph_length = sizeof(kDafsa);
     66 
     67 const int kNotFound = -1;
     68 const int kExceptionRule = 1;
     69 const int kWildcardRule = 2;
     70 const int kPrivateRule = 4;
     71 
     72 // Read next offset from pos.
     73 // Returns true if an offset could be read, false otherwise.
     74 bool GetNextOffset(const unsigned char** pos, const unsigned char* end,
     75                    const unsigned char** offset) {
     76   if (*pos == end)
     77     return false;
     78 
     79   // When reading an offset the byte array must always contain at least
     80   // three more bytes to consume. First the offset to read, then a node
     81   // to skip over and finally a destination node. No object can be smaller
     82   // than one byte.
     83   CHECK_LT(*pos + 2, end);
     84   size_t bytes_consumed;
     85   switch (**pos & 0x60) {
     86     case 0x60:  // Read three byte offset
     87       *offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2];
     88       bytes_consumed = 3;
     89       break;
     90     case 0x40:  // Read two byte offset
     91       *offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1];
     92       bytes_consumed = 2;
     93       break;
     94     default:
     95       *offset += (*pos)[0] & 0x3F;
     96       bytes_consumed = 1;
     97   }
     98   if ((**pos & 0x80) != 0) {
     99     *pos = end;
    100   } else {
    101     *pos += bytes_consumed;
    102   }
    103   return true;
    104 }
    105 
    106 // Check if byte at offset is last in label.
    107 bool IsEOL(const unsigned char* offset, const unsigned char* end) {
    108   CHECK_LT(offset, end);
    109   return (*offset & 0x80) != 0;
    110 }
    111 
    112 // Check if byte at offset matches first character in key.
    113 // This version matches characters not last in label.
    114 bool IsMatch(const unsigned char* offset, const unsigned char* end,
    115              const char* key) {
    116   CHECK_LT(offset, end);
    117   return *offset == *key;
    118 }
    119 
    120 // Check if byte at offset matches first character in key.
    121 // This version matches characters last in label.
    122 bool IsEndCharMatch(const unsigned char* offset, const unsigned char* end,
    123                     const char* key) {
    124   CHECK_LT(offset, end);
    125   return *offset == (*key | 0x80);
    126 }
    127 
    128 // Read return value at offset.
    129 // Returns true if a return value could be read, false otherwise.
    130 bool GetReturnValue(const unsigned char* offset, const unsigned char* end,
    131                     int* return_value) {
    132   CHECK_LT(offset, end);
    133   if ((*offset & 0xE0) == 0x80) {
    134     *return_value = *offset & 0x0F;
    135     return true;
    136   }
    137   return false;
    138 }
    139 
    140 // Lookup a domain key in a byte array generated by make_dafsa.py.
    141 // The rule type is returned if key is found, otherwise kNotFound is returned.
    142 int LookupString(const unsigned char* graph, size_t length, const char* key,
    143                  size_t key_length) {
    144   const unsigned char* pos = graph;
    145   const unsigned char* end = graph + length;
    146   const unsigned char* offset = pos;
    147   const char* key_end = key + key_length;
    148   while (GetNextOffset(&pos, end, &offset)) {
    149     //   char <char>+ end_char offsets
    150     //   char <char>+ return value
    151     //   char end_char offsets
    152     //   char return value
    153     //   end_char offsets
    154     //   return_value
    155     bool did_consume = false;
    156     if (key != key_end && !IsEOL(offset, end)) {
    157       // Leading <char> is not a match. Don't dive into this child
    158       if (!IsMatch(offset, end, key))
    159         continue;
    160       did_consume = true;
    161       ++offset;
    162       ++key;
    163       // Possible matches at this point:
    164       // <char>+ end_char offsets
    165       // <char>+ return value
    166       // end_char offsets
    167       // return value
    168       // Remove all remaining <char> nodes possible
    169       while (!IsEOL(offset, end) && key != key_end) {
    170         if (!IsMatch(offset, end, key))
    171           return kNotFound;
    172         ++key;
    173         ++offset;
    174       }
    175     }
    176     // Possible matches at this point:
    177     // end_char offsets
    178     // return_value
    179     // If one or more <char> elements were consumed, a failure
    180     // to match is terminal. Otherwise, try the next node.
    181     if (key == key_end) {
    182       int return_value;
    183       if (GetReturnValue(offset, end, &return_value))
    184         return return_value;
    185       // The DAFSA guarantees that if the first char is a match, all
    186       // remaining char elements MUST match if the key is truly present.
    187       if (did_consume)
    188         return kNotFound;
    189       continue;
    190     }
    191     if (!IsEndCharMatch(offset, end, key)) {
    192       if (did_consume)
    193         return kNotFound;  // Unexpected
    194       continue;
    195     }
    196     ++key;
    197     pos = ++offset;  // Dive into child
    198   }
    199   return kNotFound;  // No match
    200 }
    201 
    202 size_t GetRegistryLengthImpl(
    203     const std::string& host,
    204     UnknownRegistryFilter unknown_filter,
    205     PrivateRegistryFilter private_filter) {
    206   DCHECK(!host.empty());
    207 
    208   // Skip leading dots.
    209   const size_t host_check_begin = host.find_first_not_of('.');
    210   if (host_check_begin == std::string::npos)
    211     return 0;  // Host is only dots.
    212 
    213   // A single trailing dot isn't relevant in this determination, but does need
    214   // to be included in the final returned length.
    215   size_t host_check_len = host.length();
    216   if (host[host_check_len - 1] == '.') {
    217     --host_check_len;
    218     DCHECK(host_check_len > 0);  // If this weren't true, the host would be ".",
    219                                  // and we'd have already returned above.
    220     if (host[host_check_len - 1] == '.')
    221       return 0;  // Multiple trailing dots.
    222   }
    223 
    224   // Walk up the domain tree, most specific to least specific,
    225   // looking for matches at each level.
    226   size_t prev_start = std::string::npos;
    227   size_t curr_start = host_check_begin;
    228   size_t next_dot = host.find('.', curr_start);
    229   if (next_dot >= host_check_len)  // Catches std::string::npos as well.
    230     return 0;  // This can't have a registry + domain.
    231   while (1) {
    232     const char* domain_str = host.data() + curr_start;
    233     size_t domain_length = host_check_len - curr_start;
    234     int type = LookupString(g_graph, g_graph_length, domain_str, domain_length);
    235     bool do_check =
    236         type != kNotFound && (!(type & kPrivateRule) ||
    237                               private_filter == INCLUDE_PRIVATE_REGISTRIES);
    238 
    239     // If the apparent match is a private registry and we're not including
    240     // those, it can't be an actual match.
    241     if (do_check) {
    242       // Exception rules override wildcard rules when the domain is an exact
    243       // match, but wildcards take precedence when there's a subdomain.
    244       if (type & kWildcardRule && (prev_start != std::string::npos)) {
    245         // If prev_start == host_check_begin, then the host is the registry
    246         // itself, so return 0.
    247         return (prev_start == host_check_begin) ? 0
    248                                                 : (host.length() - prev_start);
    249       }
    250 
    251       if (type & kExceptionRule) {
    252         if (next_dot == std::string::npos) {
    253           // If we get here, we had an exception rule with no dots (e.g.
    254           // "!foo").  This would only be valid if we had a corresponding
    255           // wildcard rule, which would have to be "*".  But we explicitly
    256           // disallow that case, so this kind of rule is invalid.
    257           NOTREACHED() << "Invalid exception rule";
    258           return 0;
    259         }
    260         return host.length() - next_dot - 1;
    261       }
    262 
    263       // If curr_start == host_check_begin, then the host is the registry
    264       // itself, so return 0.
    265       return (curr_start == host_check_begin) ? 0
    266                                               : (host.length() - curr_start);
    267     }
    268 
    269     if (next_dot >= host_check_len)  // Catches std::string::npos as well.
    270       break;
    271 
    272     prev_start = curr_start;
    273     curr_start = next_dot + 1;
    274     next_dot = host.find('.', curr_start);
    275   }
    276 
    277   // No rule found in the registry.  curr_start now points to the first
    278   // character of the last subcomponent of the host, so if we allow unknown
    279   // registries, return the length of this subcomponent.
    280   return unknown_filter == INCLUDE_UNKNOWN_REGISTRIES ?
    281       (host.length() - curr_start) : 0;
    282 }
    283 
    284 std::string GetDomainAndRegistryImpl(
    285     const std::string& host, PrivateRegistryFilter private_filter) {
    286   DCHECK(!host.empty());
    287 
    288   // Find the length of the registry for this host.
    289   const size_t registry_length =
    290       GetRegistryLengthImpl(host, INCLUDE_UNKNOWN_REGISTRIES, private_filter);
    291   if ((registry_length == std::string::npos) || (registry_length == 0))
    292     return std::string();  // No registry.
    293   // The "2" in this next line is 1 for the dot, plus a 1-char minimum preceding
    294   // subcomponent length.
    295   DCHECK(host.length() >= 2);
    296   if (registry_length > (host.length() - 2)) {
    297     NOTREACHED() <<
    298         "Host does not have at least one subcomponent before registry!";
    299     return std::string();
    300   }
    301 
    302   // Move past the dot preceding the registry, and search for the next previous
    303   // dot.  Return the host from after that dot, or the whole host when there is
    304   // no dot.
    305   const size_t dot = host.rfind('.', host.length() - registry_length - 2);
    306   if (dot == std::string::npos)
    307     return host;
    308   return host.substr(dot + 1);
    309 }
    310 
    311 }  // namespace
    312 
    313 std::string GetDomainAndRegistry(
    314     const GURL& gurl,
    315     PrivateRegistryFilter filter) {
    316   const url::Component host = gurl.parsed_for_possibly_invalid_spec().host;
    317   if ((host.len <= 0) || gurl.HostIsIPAddress())
    318     return std::string();
    319   return GetDomainAndRegistryImpl(std::string(
    320       gurl.possibly_invalid_spec().data() + host.begin, host.len), filter);
    321 }
    322 
    323 std::string GetDomainAndRegistry(
    324     const std::string& host,
    325     PrivateRegistryFilter filter) {
    326   url::CanonHostInfo host_info;
    327   const std::string canon_host(CanonicalizeHost(host, &host_info));
    328   if (canon_host.empty() || host_info.IsIPAddress())
    329     return std::string();
    330   return GetDomainAndRegistryImpl(canon_host, filter);
    331 }
    332 
    333 bool SameDomainOrHost(
    334     const GURL& gurl1,
    335     const GURL& gurl2,
    336     PrivateRegistryFilter filter) {
    337   // See if both URLs have a known domain + registry, and those values are the
    338   // same.
    339   const std::string domain1(GetDomainAndRegistry(gurl1, filter));
    340   const std::string domain2(GetDomainAndRegistry(gurl2, filter));
    341   if (!domain1.empty() || !domain2.empty())
    342     return domain1 == domain2;
    343 
    344   // No domains.  See if the hosts are identical.
    345   const url::Component host1 = gurl1.parsed_for_possibly_invalid_spec().host;
    346   const url::Component host2 = gurl2.parsed_for_possibly_invalid_spec().host;
    347   if ((host1.len <= 0) || (host1.len != host2.len))
    348     return false;
    349   return !strncmp(gurl1.possibly_invalid_spec().data() + host1.begin,
    350                   gurl2.possibly_invalid_spec().data() + host2.begin,
    351                   host1.len);
    352 }
    353 
    354 size_t GetRegistryLength(
    355     const GURL& gurl,
    356     UnknownRegistryFilter unknown_filter,
    357     PrivateRegistryFilter private_filter) {
    358   const url::Component host = gurl.parsed_for_possibly_invalid_spec().host;
    359   if (host.len <= 0)
    360     return std::string::npos;
    361   if (gurl.HostIsIPAddress())
    362     return 0;
    363   return GetRegistryLengthImpl(
    364       std::string(gurl.possibly_invalid_spec().data() + host.begin, host.len),
    365       unknown_filter,
    366       private_filter);
    367 }
    368 
    369 size_t GetRegistryLength(
    370     const std::string& host,
    371     UnknownRegistryFilter unknown_filter,
    372     PrivateRegistryFilter private_filter) {
    373   url::CanonHostInfo host_info;
    374   const std::string canon_host(CanonicalizeHost(host, &host_info));
    375   if (canon_host.empty())
    376     return std::string::npos;
    377   if (host_info.IsIPAddress())
    378     return 0;
    379   return GetRegistryLengthImpl(canon_host, unknown_filter, private_filter);
    380 }
    381 
    382 void SetFindDomainGraph() {
    383   g_graph = kDafsa;
    384   g_graph_length = sizeof(kDafsa);
    385 }
    386 
    387 void SetFindDomainGraph(const unsigned char* domains, size_t length) {
    388   CHECK(domains);
    389   CHECK_NE(length, 0u);
    390   g_graph = domains;
    391   g_graph_length = length;
    392 }
    393 
    394 }  // namespace registry_controlled_domains
    395 }  // namespace net
    396