Home | History | Annotate | Download | only in net
      1 /*
      2  * Copyright (C) 2014 The Android Open Source Project
      3  * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
      4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
      5  *
      6  * This code is free software; you can redistribute it and/or modify it
      7  * under the terms of the GNU General Public License version 2 only, as
      8  * published by the Free Software Foundation.  Oracle designates this
      9  * particular file as subject to the "Classpath" exception as provided
     10  * by Oracle in the LICENSE file that accompanied this code.
     11  *
     12  * This code is distributed in the hope that it will be useful, but WITHOUT
     13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
     14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     15  * version 2 for more details (a copy is included in the LICENSE file that
     16  * accompanied this code).
     17  *
     18  * You should have received a copy of the GNU General Public License version
     19  * 2 along with this work; if not, write to the Free Software Foundation,
     20  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
     21  *
     22  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
     23  * or visit www.oracle.com if you need additional information or have any
     24  * questions.
     25  */
     26 package java.net;
     27 
     28 import android.icu.text.IDNA;
     29 
     30 /**
     31  * Provides methods to convert internationalized domain names (IDNs) between
     32  * a normal Unicode representation and an ASCII Compatible Encoding (ACE) representation.
     33  * Internationalized domain names can use characters from the entire range of
     34  * Unicode, while traditional domain names are restricted to ASCII characters.
     35  * ACE is an encoding of Unicode strings that uses only ASCII characters and
     36  * can be used with software (such as the Domain Name System) that only
     37  * understands traditional domain names.
     38  *
     39  * <p>Internationalized domain names are defined in <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
 * RFC 3490 defines two operations: ToASCII and ToUnicode. These two operations employ
 * the <a href="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a> algorithm, which is a
 * profile of <a href="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a>, and the
 * <a href="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a> algorithm to convert
 * domain name strings back and forth.
     45  *
     46  * <p>The behavior of aforementioned conversion process can be adjusted by various flags:
     47  *   <ul>
     48  *     <li>If the ALLOW_UNASSIGNED flag is used, the domain name string to be converted
     49  *         can contain code points that are unassigned in Unicode 3.2, which is the
     50  *         Unicode version on which IDN conversion is based. If the flag is not used,
     51  *         the presence of such unassigned code points is treated as an error.
     52  *     <li>If the USE_STD3_ASCII_RULES flag is used, ASCII strings are checked against <a href="http://www.ietf.org/rfc/rfc1122.txt">RFC 1122</a> and <a href="http://www.ietf.org/rfc/rfc1123.txt">RFC 1123</a>.
     53  *         It is an error if they don't meet the requirements.
     54  *   </ul>
     55  * These flags can be logically OR'ed together.
     56  *
 * <p>Security considerations are important with respect to internationalized
 * domain name support. For example, English domain names may be <i>homographed</i>
 * - maliciously misspelled by substitution of non-Latin letters.
 * <a href="http://www.unicode.org/reports/tr36/">Unicode Technical Report #36</a>
 * discusses security issues of IDN support as well as possible solutions.
 * Applications are responsible for taking adequate security measures when using
 * internationalized domain names.
     64  *
     65  * @author Edward Wang
     66  * @since 1.6
     67  *
     68  */
     69 public final class IDN {
    /**
     * Flag to allow processing of unassigned code points.
     *
     * <p>When set, the string being converted may contain code points that are
     * unassigned in Unicode 3.2; when not set, the presence of such code points
     * is treated as an error. May be OR'ed with {@link #USE_STD3_ASCII_RULES}.
     */
    public static final int ALLOW_UNASSIGNED = 0x01;

    /**
     * Flag to turn on the check against STD-3 ASCII rules.
     *
     * <p>When set, ASCII strings are checked against RFC 1122 and RFC 1123, and
     * it is an error if they do not meet the requirements. May be OR'ed with
     * {@link #ALLOW_UNASSIGNED}.
     */
    public static final int USE_STD3_ASCII_RULES = 0x02;
     79 
     80 
     81     /**
     82      * Translates a string from Unicode to ASCII Compatible Encoding (ACE),
     83      * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
     84      *
     85      * <p>ToASCII operation can fail. ToASCII fails if any step of it fails.
     86      * If ToASCII operation fails, an IllegalArgumentException will be thrown.
     87      * In this case, the input string should not be used in an internationalized domain name.
     88      *
     89      * <p> A label is an individual part of a domain name. The original ToASCII operation,
     90      * as defined in RFC 3490, only operates on a single label. This method can handle
     91      * both label and entire domain name, by assuming that labels in a domain name are
     92      * always separated by dots. The following characters are recognized as dots:
     93      * &#0092;u002E (full stop), &#0092;u3002 (ideographic full stop), &#0092;uFF0E (fullwidth full stop),
     * and &#0092;uFF61 (halfwidth ideographic full stop). If dots are
     * used as label separators, this method also changes all of them to &#0092;u002E (full stop)
     * in the translated output string.
     97      *
     98      * @param input     the string to be processed
     99      * @param flag      process flag; can be 0 or any logical OR of possible flags
    100      *
    101      * @return          the translated {@code String}
    102      *
    103      * @throws IllegalArgumentException   if the input string doesn't conform to RFC 3490 specification
    104      */
    105     public static String toASCII(String input, int flag) {
    106         // BEGIN Android-changed: Use ICU4J implementation
    107         try {
    108             return IDNA.convertIDNToASCII(input, flag).toString();
    109         } catch (android.icu.text.StringPrepParseException e) {
    110             throw new IllegalArgumentException("Invalid input to toASCII: " + input, e);
    111         }
    112         // END Android-changed: Use ICU4J implementation
    113     }
    114 
    115 
    116     /**
    117      * Translates a string from Unicode to ASCII Compatible Encoding (ACE),
    118      * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
    119      *
    120      * <p> This convenience method works as if by invoking the
    121      * two-argument counterpart as follows:
    122      * <blockquote>
    123      * {@link #toASCII(String, int) toASCII}(input,&nbsp;0);
    124      * </blockquote>
    125      *
    126      * @param input     the string to be processed
    127      *
    128      * @return          the translated {@code String}
    129      *
    130      * @throws IllegalArgumentException   if the input string doesn't conform to RFC 3490 specification
    131      */
    132     public static String toASCII(String input) {
    133         return toASCII(input, 0);
    134     }
    135 
    136 
    137     /**
    138      * Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
    139      * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
    140      *
    141      * <p>ToUnicode never fails. In case of any error, the input string is returned unmodified.
    142      *
    143      * <p> A label is an individual part of a domain name. The original ToUnicode operation,
    144      * as defined in RFC 3490, only operates on a single label. This method can handle
    145      * both label and entire domain name, by assuming that labels in a domain name are
    146      * always separated by dots. The following characters are recognized as dots:
    147      * &#0092;u002E (full stop), &#0092;u3002 (ideographic full stop), &#0092;uFF0E (fullwidth full stop),
    148      * and &#0092;uFF61 (halfwidth ideographic full stop).
    149      *
    150      * @param input     the string to be processed
    151      * @param flag      process flag; can be 0 or any logical OR of possible flags
    152      *
    153      * @return          the translated {@code String}
    154      */
    155     public static String toUnicode(String input, int flag) {
    156         // BEGIN Android-changed: Use ICU4J implementation
    157         try {
    158             // ICU only translates separators to ASCII for toASCII.
    159             // Java expects the translation for toUnicode too.
    160             return convertFullStop(IDNA.convertIDNToUnicode(input, flag)).toString();
    161         } catch (android.icu.text.StringPrepParseException e) {
    162             // The RI documentation explicitly states that if the conversion was unsuccessful
    163             // the original string is returned.
    164             return input;
    165         }
    166         // END Android-changed: Use ICU4J implementation
    167     }
    168 
    169     // BEGIN Android-added: Use ICU4J implementation
    170     private static boolean isLabelSeperator(char c) {
    171         return (c == '\u3002' || c == '\uff0e' || c == '\uff61');
    172     }
    173 
    174     private static StringBuffer convertFullStop(StringBuffer input) {
    175         for (int i = 0; i < input.length(); i++) {
    176             if (isLabelSeperator(input.charAt(i))) {
    177                 input.setCharAt(i, '.');
    178             }
    179         }
    180         return input;
    181     }
    182     // END Android-added: Use ICU4J implementation
    183 
    184     /**
    185      * Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
    186      * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
    187      *
    188      * <p> This convenience method works as if by invoking the
    189      * two-argument counterpart as follows:
    190      * <blockquote>
    191      * {@link #toUnicode(String, int) toUnicode}(input,&nbsp;0);
    192      * </blockquote>
    193      *
    194      * @param input     the string to be processed
    195      *
    196      * @return          the translated {@code String}
    197      */
    198     public static String toUnicode(String input) {
    199         return toUnicode(input, 0);
    200     }
    201 
    202 
    203     /* ---------------- Private members -------------- */
    204 
    205     // Android-removed: Private helper methods, unused because we use ICU.
    206     /*
    207     // ACE Prefix is "xn--"
    208     private static final String ACE_PREFIX = "xn--";
    209     private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length();
    210 
    211     private static final int MAX_LABEL_LENGTH   = 63;
    212 
    213     // single instance of nameprep
    214     private static StringPrep namePrep = null;
    215 
    216     static {
    217         InputStream stream = null;
    218 
    219         try {
    220             final String IDN_PROFILE = "uidna.spp";
    221             if (System.getSecurityManager() != null) {
    222                 stream = AccessController.doPrivileged(new PrivilegedAction<InputStream>() {
    223                     public InputStream run() {
    224                         return StringPrep.class.getResourceAsStream(IDN_PROFILE);
    225                     }
    226                 });
    227             } else {
    228                 stream = StringPrep.class.getResourceAsStream(IDN_PROFILE);
    229             }
    230 
    231             namePrep = new StringPrep(stream);
    232             stream.close();
    233         } catch (IOException e) {
    234             // should never reach here
    235             assert false;
    236         }
    237     }
    238     */
    239 
    240     /* ---------------- Private operations -------------- */
    241 
    242 
    // Private constructor: suppresses the default zero-argument constructor so
    // that this class, which exposes only static methods, cannot be instantiated.
    private IDN() {
    }
    247 
    248     // Android-removed: Private helper methods, unused because we use ICU.
    249     /*
    250     //
    251     // toASCII operation; should only apply to a single label
    252     //
    253     private static String toASCIIInternal(String label, int flag)
    254     {
    255         // step 1
        // Check if the string contains code points outside the ASCII range 0..0x7F.
    257         boolean isASCII  = isAllASCII(label);
    258         StringBuffer dest;
    259 
    260         // step 2
    261         // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here
    262         if (!isASCII) {
    263             UCharacterIterator iter = UCharacterIterator.getInstance(label);
    264             try {
    265                 dest = namePrep.prepare(iter, flag);
    266             } catch (java.text.ParseException e) {
    267                 throw new IllegalArgumentException(e);
    268             }
    269         } else {
    270             dest = new StringBuffer(label);
    271         }
    272 
    273         // step 8, move forward to check the smallest number of the code points
    274         // the length must be inside 1..63
    275         if (dest.length() == 0) {
    276             throw new IllegalArgumentException(
    277                         "Empty label is not a legal name");
    278         }
    279 
    280         // step 3
    281         // Verify the absence of non-LDH ASCII code points
    282         //   0..0x2c, 0x2e..0x2f, 0x3a..0x40, 0x5b..0x60, 0x7b..0x7f
    283         // Verify the absence of leading and trailing hyphen
    284         boolean useSTD3ASCIIRules = ((flag & USE_STD3_ASCII_RULES) != 0);
    285         if (useSTD3ASCIIRules) {
    286             for (int i = 0; i < dest.length(); i++) {
    287                 int c = dest.charAt(i);
    288                 if (isNonLDHAsciiCodePoint(c)) {
    289                     throw new IllegalArgumentException(
    290                         "Contains non-LDH ASCII characters");
    291                 }
    292             }
    293 
    294             if (dest.charAt(0) == '-' ||
    295                 dest.charAt(dest.length() - 1) == '-') {
    296 
    297                 throw new IllegalArgumentException(
    298                         "Has leading or trailing hyphen");
    299             }
    300         }
    301 
    302         if (!isASCII) {
    303             // step 4
    304             // If all code points are inside 0..0x7f, skip to step 8
    305             if (!isAllASCII(dest.toString())) {
    306                 // step 5
    307                 // verify the sequence does not begin with ACE prefix
    308                 if(!startsWithACEPrefix(dest)){
    309 
    310                     // step 6
    311                     // encode the sequence with punycode
    312                     try {
    313                         dest = Punycode.encode(dest, null);
    314                     } catch (java.text.ParseException e) {
    315                         throw new IllegalArgumentException(e);
    316                     }
    317 
    318                     dest = toASCIILower(dest);
    319 
    320                     // step 7
    321                     // prepend the ACE prefix
    322                     dest.insert(0, ACE_PREFIX);
    323                 } else {
    324                     throw new IllegalArgumentException("The input starts with the ACE Prefix");
    325                 }
    326 
    327             }
    328         }
    329 
    330         // step 8
    331         // the length must be inside 1..63
    332         if (dest.length() > MAX_LABEL_LENGTH) {
    333             throw new IllegalArgumentException("The label in the input is too long");
    334         }
    335 
    336         return dest.toString();
    337     }
    338 
    339     //
    340     // toUnicode operation; should only apply to a single label
    341     //
    342     private static String toUnicodeInternal(String label, int flag) {
    343         boolean[] caseFlags = null;
    344         StringBuffer dest;
    345 
    346         // step 1
    347         // find out if all the codepoints in input are ASCII
    348         boolean isASCII = isAllASCII(label);
    349 
    350         if(!isASCII){
    351             // step 2
    352             // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here
    353             try {
    354                 UCharacterIterator iter = UCharacterIterator.getInstance(label);
    355                 dest = namePrep.prepare(iter, flag);
    356             } catch (Exception e) {
    357                 // toUnicode never fails; if any step fails, return the input string
    358                 return label;
    359             }
    360         } else {
    361             dest = new StringBuffer(label);
    362         }
    363 
    364         // step 3
    365         // verify ACE Prefix
    366         if(startsWithACEPrefix(dest)) {
    367 
    368             // step 4
    369             // Remove the ACE Prefix
    370             String temp = dest.substring(ACE_PREFIX_LENGTH, dest.length());
    371 
    372             try {
    373                 // step 5
    374                 // Decode using punycode
    375                 StringBuffer decodeOut = Punycode.decode(new StringBuffer(temp), null);
    376 
    377                 // step 6
    378                 // Apply toASCII
    379                 String toASCIIOut = toASCII(decodeOut.toString(), flag);
    380 
    381                 // step 7
    382                 // verify
    383                 if (toASCIIOut.equalsIgnoreCase(dest.toString())) {
    384                     // step 8
    385                     // return output of step 5
    386                     return decodeOut.toString();
    387                 }
    388             } catch (Exception ignored) {
    389                 // no-op
    390             }
    391         }
    392 
    393         // just return the input
    394         return label;
    395     }
    396 
    397 
    398     //
    399     // LDH stands for "letter/digit/hyphen", with characters restricted to the
    400     // 26-letter Latin alphabet <A-Z a-z>, the digits <0-9>, and the hyphen
    401     // <->.
    // Non LDH refers to characters in the ASCII range, but which are not
    // letters, digits or the hyphen.
    404     //
    405     // non-LDH = 0..0x2C, 0x2E..0x2F, 0x3A..0x40, 0x5B..0x60, 0x7B..0x7F
    406     //
    407     private static boolean isNonLDHAsciiCodePoint(int ch){
    408         return (0x0000 <= ch && ch <= 0x002C) ||
    409                (0x002E <= ch && ch <= 0x002F) ||
    410                (0x003A <= ch && ch <= 0x0040) ||
    411                (0x005B <= ch && ch <= 0x0060) ||
    412                (0x007B <= ch && ch <= 0x007F);
    413     }
    414 
    415     //
    416     // search dots in a string and return the index of that character;
    417     // or if there is no dots, return the length of input string
    418     // dots might be: \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),
    419     // and \uFF61 (halfwidth ideographic full stop).
    420     //
    421     private static int searchDots(String s, int start) {
    422         int i;
    423         for (i = start; i < s.length(); i++) {
    424             if (isLabelSeparator(s.charAt(i))) {
    425                 break;
    426             }
    427         }
    428 
    429         return i;
    430     }
    431 
    432     //
    433     // to check if a string is a root label, ".".
    434     //
    435     private static boolean isRootLabel(String s) {
    436         return (s.length() == 1 && isLabelSeparator(s.charAt(0)));
    437     }
    438 
    439     //
    440     // to check if a character is a label separator, i.e. a dot character.
    441     //
    442     private static boolean isLabelSeparator(char c) {
    443         return (c == '.' || c == '\u3002' || c == '\uFF0E' || c == '\uFF61');
    444     }
    445 
    446     //
    447     // to check if a string only contains US-ASCII code point
    448     //
    449     private static boolean isAllASCII(String input) {
    450         boolean isASCII = true;
    451         for (int i = 0; i < input.length(); i++) {
    452             int c = input.charAt(i);
    453             if (c > 0x7F) {
    454                 isASCII = false;
    455                 break;
    456             }
    457         }
    458         return isASCII;
    459     }
    460 
    461     //
    462     // to check if a string starts with ACE-prefix
    463     //
    464     private static boolean startsWithACEPrefix(StringBuffer input){
    465         boolean startsWithPrefix = true;
    466 
    467         if(input.length() < ACE_PREFIX_LENGTH){
    468             return false;
    469         }
    470         for(int i = 0; i < ACE_PREFIX_LENGTH; i++){
    471             if(toASCIILower(input.charAt(i)) != ACE_PREFIX.charAt(i)){
    472                 startsWithPrefix = false;
    473             }
    474         }
    475         return startsWithPrefix;
    476     }
    477 
    478     private static char toASCIILower(char ch){
    479         if('A' <= ch && ch <= 'Z'){
    480             return (char)(ch + 'a' - 'A');
    481         }
    482         return ch;
    483     }
    484 
    485     private static StringBuffer toASCIILower(StringBuffer input){
    486         StringBuffer dest = new StringBuffer();
    487         for(int i = 0; i < input.length();i++){
    488             dest.append(toASCIILower(input.charAt(i)));
    489         }
    490         return dest;
    491     }
    492     */
    493 }
    494