Home | History | Annotate | Download | only in net
      1 /*
      2  * Copyright (C) 2009 The Guava Authors
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.common.net;
     18 
     19 import static com.google.common.base.Preconditions.checkArgument;
     20 import static com.google.common.base.Preconditions.checkNotNull;
     21 import static com.google.common.base.Preconditions.checkState;
     22 
     23 import com.google.common.annotations.Beta;
     24 import com.google.common.annotations.GwtCompatible;
     25 import com.google.common.base.Ascii;
     26 import com.google.common.base.CharMatcher;
     27 import com.google.common.base.Joiner;
     28 import com.google.common.base.Objects;
     29 import com.google.common.base.Splitter;
     30 import com.google.common.collect.ImmutableList;
     31 
     32 import java.util.List;
     33 
     34 import javax.annotation.Nullable;
     35 
     36 /**
     37  * An immutable well-formed internet domain name, such as {@code com} or {@code
     38  * foo.co.uk}. Only syntactic analysis is performed; no DNS lookups or other
     39  * network interactions take place. Thus there is no guarantee that the domain
     40  * actually exists on the internet.
     41  *
     42  * <p>One common use of this class is to determine whether a given string is
     43  * likely to represent an addressable domain on the web -- that is, for a
     44  * candidate string {@code "xxx"}, might browsing to {@code "http://xxx/"}
     45  * result in a webpage being displayed? In the past, this test was frequently
     46  * done by determining whether the domain ended with a {@linkplain
     47  * #isPublicSuffix() public suffix} but was not itself a public suffix. However,
     48  * this test is no longer accurate. There are many domains which are both public
     49  * suffixes and addressable as hosts; {@code "uk.com"} is one example. As a
     50  * result, the only useful test to determine if a domain is a plausible web host
     51  * is {@link #hasPublicSuffix()}. This will return {@code true} for many domains
 * which (currently) are not hosts, such as {@code "com"}, but given that any
     53  * public suffix may become a host without warning, it is better to err on the
     54  * side of permissiveness and thus avoid spurious rejection of valid sites.
     55  *
     56  * <p>During construction, names are normalized in two ways:
     57  * <ol>
     58  * <li>ASCII uppercase characters are converted to lowercase.
     59  * <li>Unicode dot separators other than the ASCII period ({@code '.'}) are
     60  * converted to the ASCII period.
     61  * </ol>
     62  * The normalized values will be returned from {@link #name()} and
     63  * {@link #parts()}, and will be reflected in the result of
     64  * {@link #equals(Object)}.
     65  *
 * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">
 * Internationalized domain names</a> such as {@code 网络.cn} are supported, as
 * are the equivalent <a
 * href="http://en.wikipedia.org/wiki/Internationalized_domain_name">IDNA
 * Punycode-encoded</a> versions.
     71  *
     72  * @author Craig Berry
     73  * @since 5.0
     74  */
     75 @Beta
     76 @GwtCompatible(emulated = true)
     77 public final class InternetDomainName {
     78 
     79   private static final CharMatcher DOTS_MATCHER =
     80       CharMatcher.anyOf(".\u3002\uFF0E\uFF61");
     81   private static final Splitter DOT_SPLITTER = Splitter.on('.');
     82   private static final Joiner DOT_JOINER = Joiner.on('.');
     83 
     84   /**
     85    * Value of {@link #publicSuffixIndex} which indicates that no public suffix
     86    * was found.
     87    */
     88   private static final int NO_PUBLIC_SUFFIX_FOUND = -1;
     89 
     90   private static final String DOT_REGEX = "\\.";
     91 
     92   /**
     93    * Maximum parts (labels) in a domain name. This value arises from
     94    * the 255-octet limit described in
     95    * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11 with
     96    * the fact that the encoding of each part occupies at least two bytes
     97    * (dot plus label externally, length byte plus label internally). Thus, if
     98    * all labels have the minimum size of one byte, 127 of them will fit.
     99    */
    100   private static final int MAX_PARTS = 127;
    101 
    102   /**
    103    * Maximum length of a full domain name, including separators, and
    104    * leaving room for the root label. See
    105    * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
    106    */
    107   private static final int MAX_LENGTH = 253;
    108 
    109   /**
    110    * Maximum size of a single part of a domain name. See
    111    * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
    112    */
    113   private static final int MAX_DOMAIN_PART_LENGTH = 63;
    114 
    115   /**
    116    * The full domain name, converted to lower case.
    117    */
    118   private final String name;
    119 
    120   /**
    121    * The parts of the domain name, converted to lower case.
    122    */
    123   private final ImmutableList<String> parts;
    124 
    125   /**
    126    * The index in the {@link #parts()} list at which the public suffix begins.
    127    * For example, for the domain name {@code www.google.co.uk}, the value would
    128    * be 2 (the index of the {@code co} part). The value is negative
    129    * (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was
    130    * found.
    131    */
    132   private final int publicSuffixIndex;
    133 
    134   /**
    135    * Constructor used to implement {@link #from(String)}, and from subclasses.
    136    */
    137   InternetDomainName(String name) {
    138     // Normalize:
    139     // * ASCII characters to lowercase
    140     // * All dot-like characters to '.'
    141     // * Strip trailing '.'
    142 
    143     name = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.'));
    144 
    145     if (name.endsWith(".")) {
    146       name = name.substring(0, name.length() - 1);
    147     }
    148 
    149     checkArgument(name.length() <= MAX_LENGTH, "Domain name too long: '%s':", name);
    150     this.name = name;
    151 
    152     this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name));
    153     checkArgument(parts.size() <= MAX_PARTS, "Domain has too many parts: '%s'", name);
    154     checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name);
    155 
    156     this.publicSuffixIndex = findPublicSuffix();
    157   }
    158 
    159   /**
    160    * Returns the index of the leftmost part of the public suffix, or -1 if not
    161    * found. Note that the value defined as the "public suffix" may not be a
    162    * public suffix according to {@link #isPublicSuffix()} if the domain ends
    163    * with an excluded domain pattern such as {@code "nhs.uk"}.
    164    */
    165   private int findPublicSuffix() {
    166     final int partsSize = parts.size();
    167 
    168     for (int i = 0; i < partsSize; i++) {
    169       String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize));
    170 
    171       if (TldPatterns.EXACT.contains(ancestorName)) {
    172         return i;
    173       }
    174 
    175       // Excluded domains (e.g. !nhs.uk) use the next highest
    176       // domain as the effective public suffix (e.g. uk).
    177 
    178       if (TldPatterns.EXCLUDED.contains(ancestorName)) {
    179         return i + 1;
    180       }
    181 
    182       if (matchesWildcardPublicSuffix(ancestorName)) {
    183         return i;
    184       }
    185     }
    186 
    187     return NO_PUBLIC_SUFFIX_FOUND;
    188   }
    189 
    190   /**
    191    * A deprecated synonym for {@link #from(String)}.
    192    *
    193    * @param domain A domain name (not IP address)
    194    * @throws IllegalArgumentException if {@code name} is not syntactically valid
    195    *     according to {@link #isValidLenient}
    196    * @since 8.0 (previously named {@code from})
    197    * @deprecated Use {@link #from(String)}
    198    */
    199   @Deprecated
    200   public static InternetDomainName fromLenient(String domain) {
    201     return from(domain);
    202   }
    203 
    204   /**
    205    * Returns an instance of {@link InternetDomainName} after lenient
    206    * validation.  Specifically, validation against <a
    207    * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
    208    * ("Internationalizing Domain Names in Applications") is skipped, while
    209    * validation against <a
    210    * href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a> is relaxed in
    211    * the following ways:
    212    * <ul>
    213    * <li>Any part containing non-ASCII characters is considered valid.
    214    * <li>Underscores ('_') are permitted wherever dashes ('-') are permitted.
    215    * <li>Parts other than the final part may start with a digit.
    216    * </ul>
    217    *
    218    *
    219    * @param domain A domain name (not IP address)
    220    * @throws IllegalArgumentException if {@code name} is not syntactically valid
    221    *     according to {@link #isValid}
    222    * @since 10.0 (previously named {@code fromLenient})
    223    */
    224   public static InternetDomainName from(String domain) {
    225     return new InternetDomainName(checkNotNull(domain));
    226   }
    227 
    228   /**
    229    * Validation method used by {@from} to ensure that the domain name is
    230    * syntactically valid according to RFC 1035.
    231    *
    232    * @return Is the domain name syntactically valid?
    233    */
    234   private static boolean validateSyntax(List<String> parts) {
    235     final int lastIndex = parts.size() - 1;
    236 
    237     // Validate the last part specially, as it has different syntax rules.
    238 
    239     if (!validatePart(parts.get(lastIndex), true)) {
    240       return false;
    241     }
    242 
    243     for (int i = 0; i < lastIndex; i++) {
    244       String part = parts.get(i);
    245       if (!validatePart(part, false)) {
    246         return false;
    247       }
    248     }
    249 
    250     return true;
    251   }
    252 
    253   private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");
    254 
    255   private static final CharMatcher PART_CHAR_MATCHER =
    256       CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER);
    257 
    258   /**
    259    * Helper method for {@link #validateSyntax(List)}. Validates that one part of
    260    * a domain name is valid.
    261    *
    262    * @param part The domain name part to be validated
    263    * @param isFinalPart Is this the final (rightmost) domain part?
    264    * @return Whether the part is valid
    265    */
    266   private static boolean validatePart(String part, boolean isFinalPart) {
    267 
    268     // These tests could be collapsed into one big boolean expression, but
    269     // they have been left as independent tests for clarity.
    270 
    271     if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {
    272       return false;
    273     }
    274 
    275     /*
    276      * GWT claims to support java.lang.Character's char-classification methods,
    277      * but it actually only works for ASCII. So for now, assume any non-ASCII
    278      * characters are valid. The only place this seems to be documented is here:
    279      * http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html
    280      *
    281      * <p>ASCII characters in the part are expected to be valid per RFC 1035,
    282      * with underscore also being allowed due to widespread practice.
    283      */
    284 
    285     String asciiChars = CharMatcher.ASCII.retainFrom(part);
    286 
    287     if (!PART_CHAR_MATCHER.matchesAllOf(asciiChars)) {
    288       return false;
    289     }
    290 
    291     // No initial or final dashes or underscores.
    292 
    293     if (DASH_MATCHER.matches(part.charAt(0))
    294         || DASH_MATCHER.matches(part.charAt(part.length() - 1))) {
    295       return false;
    296     }
    297 
    298     /*
    299      * Note that we allow (in contravention of a strict interpretation of the
    300      * relevant RFCs) domain parts other than the last may begin with a digit
    301      * (for example, "3com.com"). It's important to disallow an initial digit in
    302      * the last part; it's the only thing that stops an IPv4 numeric address
    303      * like 127.0.0.1 from looking like a valid domain name.
    304      */
    305 
    306     if (isFinalPart && CharMatcher.DIGIT.matches(part.charAt(0))) {
    307       return false;
    308     }
    309 
    310     return true;
    311   }
    312 
    313   /**
    314    * Returns the domain name, normalized to all lower case.
    315    */
    316   public String name() {
    317     return name;
    318   }
    319 
    320   /**
    321    * Returns the individual components of this domain name, normalized to all
    322    * lower case. For example, for the domain name {@code mail.google.com}, this
    323    * method returns the list {@code ["mail", "google", "com"]}.
    324    */
    325   public ImmutableList<String> parts() {
    326     return parts;
    327   }
    328 
    329   /**
    330    * Indicates whether this domain name represents a <i>public suffix</i>, as
    331    * defined by the Mozilla Foundation's
    332    * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public
    333    * suffix is one under which Internet users can directly register names, such
    334    * as {@code com}, {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain
    335    * names that are <i>not</i> public suffixes include {@code google}, {@code
    336    * google.com} and {@code foo.co.uk}.
    337    *
    338    * @return {@code true} if this domain name appears exactly on the public
    339    *     suffix list
    340    * @since 6.0
    341    */
    342   public boolean isPublicSuffix() {
    343     return publicSuffixIndex == 0;
    344   }
    345 
    346   /**
    347    * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
    348    * public suffix}, including if it is a public suffix itself. For example,
    349    * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
    350    * {@code com}, but not for {@code google} or {@code google.foo}. This is
    351    * the recommended method for determining whether a domain is potentially an
    352    * addressable host.
    353    *
    354    * @since 6.0
    355    */
    356   public boolean hasPublicSuffix() {
    357     return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND;
    358   }
    359 
    360   /**
    361    * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the
    362    * domain name, or {@code null} if no public suffix is present.
    363    *
    364    * @since 6.0
    365    */
    366   public InternetDomainName publicSuffix() {
    367     return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null;
    368   }
    369 
    370   /**
    371    * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
    372    * public suffix}, while not being a public suffix itself. For example,
    373    * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
    374    * {@code bar.ca.us}, but not for {@code google}, {@code com}, or {@code
    375    * google.foo}.
    376    *
    377    * <p><b>Warning:</b> a {@code false} result from this method does not imply
    378    * that the domain does not represent an addressable host, as many public
    379    * suffixes are also addressable hosts. Use {@link #hasPublicSuffix()} for
    380    * that test.
    381    *
    382    * <p>This method can be used to determine whether it will probably be
    383    * possible to set cookies on the domain, though even that depends on
    384    * individual browsers' implementations of cookie controls. See
    385    * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
    386    *
    387    * @since 6.0
    388    */
    389   public boolean isUnderPublicSuffix() {
    390     return publicSuffixIndex > 0;
    391   }
    392 
    393   /**
    394    * Indicates whether this domain name is composed of exactly one subdomain
    395    * component followed by a {@linkplain #isPublicSuffix() public suffix}. For
    396    * example, returns {@code true} for {@code google.com} and {@code foo.co.uk},
    397    * but not for {@code www.google.com} or {@code co.uk}.
    398    *
    399    * <p><b>Warning:</b> A {@code true} result from this method does not imply
    400    * that the domain is at the highest level which is addressable as a host, as
    401    * many public suffixes are also addressable hosts. For example, the domain
    402    * {@code bar.uk.com} has a public suffix of {@code uk.com}, so it would
    403    * return {@code true} from this method. But {@code uk.com} is itself an
    404    * addressable host.
    405    *
    406    * <p>This method can be used to determine whether a domain is probably the
    407    * highest level for which cookies may be set, though even that depends on
    408    * individual browsers' implementations of cookie controls. See
    409    * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
    410    *
    411    * @since 6.0
    412    */
    413   public boolean isTopPrivateDomain() {
    414     return publicSuffixIndex == 1;
    415   }
    416 
    417   /**
    418    * Returns the portion of this domain name that is one level beneath the
    419    * public suffix. For example, for {@code x.adwords.google.co.uk} it returns
    420    * {@code google.co.uk}, since {@code co.uk} is a public suffix.
    421    *
    422    * <p>If {@link #isTopPrivateDomain()} is true, the current domain name
    423    * instance is returned.
    424    *
    425    * <p>This method should not be used to determine the topmost parent domain
    426    * which is addressable as a host, as many public suffixes are also
    427    * addressable hosts. For example, the domain {@code foo.bar.uk.com} has
    428    * a public suffix of {@code uk.com}, so it would return {@code bar.uk.com}
    429    * from this method. But {@code uk.com} is itself an addressable host.
    430    *
    431    * <p>This method can be used to determine the probable highest level parent
    432    * domain for which cookies may be set, though even that depends on individual
    433    * browsers' implementations of cookie controls.
    434    *
    435    * @throws IllegalStateException if this domain does not end with a
    436    *     public suffix
    437    * @since 6.0
    438    */
    439   public InternetDomainName topPrivateDomain() {
    440     if (isTopPrivateDomain()) {
    441       return this;
    442     }
    443     checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name);
    444     return ancestor(publicSuffixIndex - 1);
    445   }
    446 
    447   /**
    448    * Indicates whether this domain is composed of two or more parts.
    449    */
    450   public boolean hasParent() {
    451     return parts.size() > 1;
    452   }
    453 
    454   /**
    455    * Returns an {@code InternetDomainName} that is the immediate ancestor of
    456    * this one; that is, the current domain with the leftmost part removed. For
    457    * example, the parent of {@code www.google.com} is {@code google.com}.
    458    *
    459    * @throws IllegalStateException if the domain has no parent, as determined
    460    *     by {@link #hasParent}
    461    */
    462   public InternetDomainName parent() {
    463     checkState(hasParent(), "Domain '%s' has no parent", name);
    464     return ancestor(1);
    465   }
    466 
    467   /**
    468    * Returns the ancestor of the current domain at the given number of levels
    469    * "higher" (rightward) in the subdomain list. The number of levels must be
    470    * non-negative, and less than {@code N-1}, where {@code N} is the number of
    471    * parts in the domain.
    472    *
    473    * <p>TODO: Reasonable candidate for addition to public API.
    474    */
    475   private InternetDomainName ancestor(int levels) {
    476     return from(DOT_JOINER.join(parts.subList(levels, parts.size())));
    477   }
    478 
    479   /**
    480    * Creates and returns a new {@code InternetDomainName} by prepending the
    481    * argument and a dot to the current name. For example, {@code
    482    * InternetDomainName.from("foo.com").child("www.bar")} returns a new
    483    * {@code InternetDomainName} with the value {@code www.bar.foo.com}. Only
    484    * lenient validation is performed, as described {@link #from(String) here}.
    485    *
    486    * @throws NullPointerException if leftParts is null
    487    * @throws IllegalArgumentException if the resulting name is not valid
    488    */
    489   public InternetDomainName child(String leftParts) {
    490     return from(checkNotNull(leftParts) + "." + name);
    491   }
    492 
    493   /**
    494    * A deprecated synonym for {@link #isValid(String)}.
    495    *
    496    * @since 8.0 (previously named {@code isValid})
    497    * @deprecated Use {@link #isValid(String)} instead
    498    */
    499   @Deprecated
    500   public static boolean isValidLenient(String name) {
    501     return isValid(name);
    502   }
    503 
    504   /**
    505    * Indicates whether the argument is a syntactically valid domain name using
    506    * lenient validation. Specifically, validation against <a
    507    * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
    508    * ("Internationalizing Domain Names in Applications") is skipped.
    509    *
    510    * <p>The following two code snippets are equivalent:
    511    *
    512    * <pre>   {@code
    513    *
    514    *   domainName = InternetDomainName.isValid(name)
    515    *       ? InternetDomainName.from(name)
    516    *       : DEFAULT_DOMAIN;
    517    *   }</pre>
    518    *
    519    * <pre>   {@code
    520    *
    521    *   try {
    522    *     domainName = InternetDomainName.from(name);
    523    *   } catch (IllegalArgumentException e) {
    524    *     domainName = DEFAULT_DOMAIN;
    525    *   }}</pre>
    526    *
    527    * @since 8.0 (previously named {@code isValidLenient})
    528    */
    529   public static boolean isValid(String name) {
    530     try {
    531       from(name);
    532       return true;
    533     } catch (IllegalArgumentException e) {
    534       return false;
    535     }
    536   }
    537 
    538   /**
    539    * Does the domain name match one of the "wildcard" patterns (e.g.
    540    * {@code "*.ar"})?
    541    */
    542   private static boolean matchesWildcardPublicSuffix(String domain) {
    543     final String[] pieces = domain.split(DOT_REGEX, 2);
    544     return pieces.length == 2 && TldPatterns.UNDER.contains(pieces[1]);
    545   }
    546 
    547   // TODO: specify this to return the same as name(); remove name()
    548   @Override
    549   public String toString() {
    550     return Objects.toStringHelper(this).add("name", name).toString();
    551   }
    552 
    553   /**
    554    * Equality testing is based on the text supplied by the caller,
    555    * after normalization as described in the class documentation. For
    556    * example, a non-ASCII Unicode domain name and the Punycode version
    557    * of the same domain name would not be considered equal.
    558    *
    559    */
    560   @Override
    561   public boolean equals(@Nullable Object object) {
    562     if (object == this) {
    563       return true;
    564     }
    565 
    566     if (object instanceof InternetDomainName) {
    567       InternetDomainName that = (InternetDomainName) object;
    568       return this.name.equals(that.name);
    569     }
    570 
    571     return false;
    572   }
    573 
    574   @Override
    575   public int hashCode() {
    576     return name.hashCode();
    577   }
    578 }
    579