Home | History | Annotate | Download | only in net
      1 /*
      2  * Copyright (C) 2014 The Android Open Source Project
      3  * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
      4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
      5  *
      6  * This code is free software; you can redistribute it and/or modify it
      7  * under the terms of the GNU General Public License version 2 only, as
      8  * published by the Free Software Foundation.  Oracle designates this
      9  * particular file as subject to the "Classpath" exception as provided
     10  * by Oracle in the LICENSE file that accompanied this code.
     11  *
     12  * This code is distributed in the hope that it will be useful, but WITHOUT
     13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
     14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     15  * version 2 for more details (a copy is included in the LICENSE file that
     16  * accompanied this code).
     17  *
     18  * You should have received a copy of the GNU General Public License version
     19  * 2 along with this work; if not, write to the Free Software Foundation,
     20  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
     21  *
     22  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
     23  * or visit www.oracle.com if you need additional information or have any
     24  * questions.
     25  */
     26 package java.net;
     27 
     28 import android.icu.text.IDNA;
     29 
     30 /**
     31  * Provides methods to convert internationalized domain names (IDNs) between
     32  * a normal Unicode representation and an ASCII Compatible Encoding (ACE) representation.
     33  * Internationalized domain names can use characters from the entire range of
     34  * Unicode, while traditional domain names are restricted to ASCII characters.
     35  * ACE is an encoding of Unicode strings that uses only ASCII characters and
     36  * can be used with software (such as the Domain Name System) that only
     37  * understands traditional domain names.
     38  *
     39  * <p>Internationalized domain names are defined in <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
     40  * RFC 3490 defines two operations: ToASCII and ToUnicode. These 2 operations employ
     41  * <a href="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a> algorithm, which is a
     42  * profile of <a href="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a>, and
     43  * <a href="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a> algorithm to convert
     44  * domain name string back and forth.
     45  *
     46  * <p>The behavior of aforementioned conversion process can be adjusted by various flags:
     47  *   <ul>
     48  *     <li>If the ALLOW_UNASSIGNED flag is used, the domain name string to be converted
     49  *         can contain code points that are unassigned in Unicode 3.2, which is the
     50  *         Unicode version on which IDN conversion is based. If the flag is not used,
     51  *         the presence of such unassigned code points is treated as an error.
     52  *     <li>If the USE_STD3_ASCII_RULES flag is used, ASCII strings are checked against <a href="http://www.ietf.org/rfc/rfc1122.txt">RFC 1122</a> and <a href="http://www.ietf.org/rfc/rfc1123.txt">RFC 1123</a>.
     53  *         It is an error if they don't meet the requirements.
     54  *   </ul>
     55  * These flags can be logically OR'ed together.
     56  *
     57  * <p>The security consideration is important with respect to internationalization
     58  * domain name support. For example, English domain names may be <i>homographed</i>
     59  * - maliciously misspelled by substitution of non-Latin letters.
     60  * <a href="http://www.unicode.org/reports/tr36/">Unicode Technical Report #36</a>
     61  * discusses security issues of IDN support as well as possible solutions.
     62  * Applications are responsible for taking adequate security measures when using
     63  * international domain names.
     64  *
     65  * @author Edward Wang
     66  * @since 1.6
     67  *
     68  */
     69 public final class IDN {
     70     /**
     71      * Flag to allow processing of unassigned code points
     72      */
     73     public static final int ALLOW_UNASSIGNED = 0x01;
     74 
     75     /**
     76      * Flag to turn on the check against STD-3 ASCII rules
     77      */
     78     public static final int USE_STD3_ASCII_RULES = 0x02;
     79 
     80     private IDN() {
     81     }
     82 
     83 
     84     /**
     85      * Translates a string from Unicode to ASCII Compatible Encoding (ACE),
     86      * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
     87      *
     88      * <p>ToASCII operation can fail. ToASCII fails if any step of it fails.
     89      * If ToASCII operation fails, an IllegalArgumentException will be thrown.
     90      * In this case, the input string should not be used in an internationalized domain name.
     91      *
     92      * <p> A label is an individual part of a domain name. The original ToASCII operation,
     93      * as defined in RFC 3490, only operates on a single label. This method can handle
     94      * both label and entire domain name, by assuming that labels in a domain name are
     95      * always separated by dots. The following characters are recognized as dots:
     96      * &#0092;u002E (full stop), &#0092;u3002 (ideographic full stop), &#0092;uFF0E (fullwidth full stop),
     97      * and &#0092;uFF61 (halfwidth ideographic full stop). if dots are
     98      * used as label separators, this method also changes all of them to &#0092;u002E (full stop)
     99      * in output translated string.
    100      *
    101      * @param input     the string to be processed
    102      * @param flag      process flag; can be 0 or any logical OR of possible flags
    103      *
    104      * @return the translated <tt>String</tt>
    105      *
    106      * @throws IllegalArgumentException   if the input string doesn't conform to RFC 3490 specification
    107      */
    108     public static String toASCII(String input, int flag) {
    109         try {
    110             return IDNA.convertIDNToASCII(input, flag).toString();
    111         } catch (android.icu.text.StringPrepParseException e) {
    112             throw new IllegalArgumentException("Invalid input to toASCII: " + input, e);
    113         }
    114     }
    115 
    116 
    117     /**
    118      * Translates a string from Unicode to ASCII Compatible Encoding (ACE),
    119      * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
    120      *
    121      * <p> This convenience method works as if by invoking the
    122      * two-argument counterpart as follows:
    123      * <blockquote><tt>
    124      * {@link #toASCII(String, int) toASCII}(input,&nbsp;0);
    125      * </tt></blockquote>
    126      *
    127      * @param input     the string to be processed
    128      *
    129      * @return the translated <tt>String</tt>
    130      *
    131      * @throws IllegalArgumentException   if the input string doesn't conform to RFC 3490 specification
    132      */
    133     public static String toASCII(String input) {
    134         return toASCII(input, 0);
    135     }
    136 
    137 
    138     /**
    139      * Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
    140      * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
    141      *
    142      * <p>ToUnicode never fails. In case of any error, the input string is returned unmodified.
    143      *
    144      * <p> A label is an individual part of a domain name. The original ToUnicode operation,
    145      * as defined in RFC 3490, only operates on a single label. This method can handle
    146      * both label and entire domain name, by assuming that labels in a domain name are
    147      * always separated by dots. The following characters are recognized as dots:
    148      * &#0092;u002E (full stop), &#0092;u3002 (ideographic full stop), &#0092;uFF0E (fullwidth full stop),
    149      * and &#0092;uFF61 (halfwidth ideographic full stop).
    150      *
    151      * @param input     the string to be processed
    152      * @param flag      process flag; can be 0 or any logical OR of possible flags
    153      *
    154      * @return the translated <tt>String</tt>
    155      */
    156     public static String toUnicode(String input, int flag) {
    157         try {
    158             // ICU only translates separators to ASCII for toASCII.
    159             // Java expects the translation for toUnicode too.
    160             return convertFullStop(IDNA.convertIDNToUnicode(input, flag)).toString();
    161         } catch (android.icu.text.StringPrepParseException e) {
    162             // The RI documentation explicitly states that if the conversion was unsuccessful
    163             // the original string is returned.
    164             return input;
    165         }
    166     }
    167 
    168     private static boolean isLabelSeperator(char c) {
    169         return (c == '\u3002' || c == '\uff0e' || c == '\uff61');
    170     }
    171 
    172     private static StringBuffer convertFullStop(StringBuffer input) {
    173         for (int i = 0; i < input.length(); i++) {
    174             if (isLabelSeperator(input.charAt(i))) {
    175                 input.setCharAt(i, '.');
    176             }
    177         }
    178         return input;
    179     }
    180 
    181 
    182     /**
    183      * Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
    184      * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
    185      *
    186      * <p> This convenience method works as if by invoking the
    187      * two-argument counterpart as follows:
    188      * <blockquote><tt>
    189      * {@link #toUnicode(String, int) toUnicode}(input,&nbsp;0);
    190      * </tt></blockquote>
    191      *
    192      * @param input     the string to be processed
    193      *
    194      * @return the translated <tt>String</tt>
    195      */
    196     public static String toUnicode(String input) {
    197         return toUnicode(input, 0);
    198     }
    199 }
    200