Home | History | Annotate | Download | only in impl
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2003-2010, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 */
      9 package com.ibm.icu.impl;
     10 
     11 import com.ibm.icu.text.IDNA;
     12 import com.ibm.icu.text.StringPrep;
     13 import com.ibm.icu.text.StringPrepParseException;
     14 import com.ibm.icu.text.UCharacterIterator;
     15 
     16 /**
     17  * IDNA2003 implementation code, moved out of com.ibm.icu.text.IDNA.java
     18  * while extending that class to support IDNA2008/UTS #46 as well.
     19  * @author Ram Viswanadha
     20  */
     21 public final class IDNA2003 {
     22     /* IDNA ACE Prefix is "xn--" */
     23     private static char[] ACE_PREFIX                = new char[]{ 0x0078,0x006E,0x002d,0x002d } ;
     24     //private static final int ACE_PREFIX_LENGTH      = ACE_PREFIX.length;
     25 
     26     private static final int MAX_LABEL_LENGTH       = 63;
     27     private static final int HYPHEN                 = 0x002D;
     28     private static final int CAPITAL_A              = 0x0041;
     29     private static final int CAPITAL_Z              = 0x005A;
     30     private static final int LOWER_CASE_DELTA       = 0x0020;
     31     private static final int FULL_STOP              = 0x002E;
     32     private static final int MAX_DOMAIN_NAME_LENGTH = 255;
     33 
     34     // The NamePrep profile object
     35     private static final StringPrep namePrep = StringPrep.getInstance(StringPrep.RFC3491_NAMEPREP);
     36 
     37     private static boolean startsWithPrefix(StringBuffer src){
     38         boolean startsWithPrefix = true;
     39 
     40         if(src.length() < ACE_PREFIX.length){
     41             return false;
     42         }
     43         for(int i=0; i<ACE_PREFIX.length;i++){
     44             if(toASCIILower(src.charAt(i)) != ACE_PREFIX[i]){
     45                 startsWithPrefix = false;
     46             }
     47         }
     48         return startsWithPrefix;
     49     }
     50 
     51     private static char toASCIILower(char ch){
     52         if(CAPITAL_A <= ch && ch <= CAPITAL_Z){
     53             return (char)(ch + LOWER_CASE_DELTA);
     54         }
     55         return ch;
     56     }
     57 
     58     private static StringBuffer toASCIILower(CharSequence src){
     59         StringBuffer dest = new StringBuffer();
     60         for(int i=0; i<src.length();i++){
     61             dest.append(toASCIILower(src.charAt(i)));
     62         }
     63         return dest;
     64     }
     65 
     66     private static int compareCaseInsensitiveASCII(StringBuffer s1, StringBuffer s2){
     67         char c1,c2;
     68         int rc;
     69         for(int i =0;/* no condition */;i++) {
     70             /* If we reach the ends of both strings then they match */
     71             if(i == s1.length()) {
     72                 return 0;
     73             }
     74 
     75             c1 = s1.charAt(i);
     76             c2 = s2.charAt(i);
     77 
     78             /* Case-insensitive comparison */
     79             if(c1!=c2) {
     80                 rc=toASCIILower(c1)-toASCIILower(c2);
     81                 if(rc!=0) {
     82                     return rc;
     83                 }
     84             }
     85         }
     86     }
     87 
     88     private static int getSeparatorIndex(char[] src,int start, int limit){
     89         for(; start<limit;start++){
     90             if(isLabelSeparator(src[start])){
     91                 return start;
     92             }
     93         }
     94         // we have not found the separator just return length
     95         return start;
     96     }
     97 
     98     /*
     99     private static int getSeparatorIndex(UCharacterIterator iter){
    100         int currentIndex = iter.getIndex();
    101         int separatorIndex = 0;
    102         int ch;
    103         while((ch=iter.next())!= UCharacterIterator.DONE){
    104             if(isLabelSeparator(ch)){
    105                 separatorIndex = iter.getIndex();
    106                 iter.setIndex(currentIndex);
    107                 return separatorIndex;
    108             }
    109         }
    110         // reset index
    111         iter.setIndex(currentIndex);
    112         // we have not found the separator just return the length
    113 
    114     }
    115     */
    116 
    117 
    118     private static boolean isLDHChar(int ch){
    119         // high runner case
    120         if(ch>0x007A){
    121             return false;
    122         }
    123         //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
    124         if( (ch==0x002D) ||
    125             (0x0030 <= ch && ch <= 0x0039) ||
    126             (0x0041 <= ch && ch <= 0x005A) ||
    127             (0x0061 <= ch && ch <= 0x007A)
    128           ){
    129             return true;
    130         }
    131         return false;
    132     }
    133 
    134     /**
    135      * Ascertain if the given code point is a label separator as
    136      * defined by the IDNA RFC
    137      *
    138      * @param ch The code point to be ascertained
    139      * @return true if the char is a label separator
    140      * @stable ICU 2.8
    141      */
    142     private static boolean isLabelSeparator(int ch){
    143         switch(ch){
    144             case 0x002e:
    145             case 0x3002:
    146             case 0xFF0E:
    147             case 0xFF61:
    148                 return true;
    149             default:
    150                 return false;
    151         }
    152     }
    153 
    154     public static StringBuffer convertToASCII(UCharacterIterator src, int options)
    155             throws StringPrepParseException{
    156 
    157         boolean[] caseFlags = null;
    158 
    159         // the source contains all ascii codepoints
    160         boolean srcIsASCII  = true;
    161         // assume the source contains all LDH codepoints
    162         boolean srcIsLDH = true;
    163 
    164         //get the options
    165         boolean useSTD3ASCIIRules = ((options & IDNA.USE_STD3_RULES) != 0);
    166         int ch;
    167         // step 1
    168         while((ch = src.next())!= UCharacterIterator.DONE){
    169             if(ch> 0x7f){
    170                 srcIsASCII = false;
    171             }
    172         }
    173         int failPos = -1;
    174         src.setToStart();
    175         StringBuffer processOut = null;
    176         // step 2 is performed only if the source contains non ASCII
    177         if(!srcIsASCII){
    178             // step 2
    179             processOut = namePrep.prepare(src, options);
    180         }else{
    181             processOut = new StringBuffer(src.getText());
    182         }
    183         int poLen = processOut.length();
    184 
    185         if(poLen==0){
    186             throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
    187         }
    188         StringBuffer dest = new StringBuffer();
    189 
    190         // reset the variable to verify if output of prepare is ASCII or not
    191         srcIsASCII = true;
    192 
    193         // step 3 & 4
    194         for(int j=0;j<poLen;j++ ){
    195             ch=processOut.charAt(j);
    196             if(ch > 0x7F){
    197                 srcIsASCII = false;
    198             }else if(isLDHChar(ch)==false){
    199                 // here we do not assemble surrogates
    200                 // since we know that LDH code points
    201                 // are in the ASCII range only
    202                 srcIsLDH = false;
    203                 failPos = j;
    204             }
    205         }
    206 
    207         if(useSTD3ASCIIRules == true){
    208             // verify 3a and 3b
    209             if( srcIsLDH == false /* source contains some non-LDH characters */
    210                 || processOut.charAt(0) ==  HYPHEN
    211                 || processOut.charAt(processOut.length()-1) == HYPHEN){
    212 
    213                 /* populate the parseError struct */
    214                 if(srcIsLDH==false){
    215                      throw new StringPrepParseException( "The input does not conform to the STD 3 ASCII rules",
    216                                               StringPrepParseException.STD3_ASCII_RULES_ERROR,
    217                                               processOut.toString(),
    218                                              (failPos>0) ? (failPos-1) : failPos);
    219                 }else if(processOut.charAt(0) == HYPHEN){
    220                     throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
    221                                               StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),0);
    222 
    223                 }else{
    224                      throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
    225                                               StringPrepParseException.STD3_ASCII_RULES_ERROR,
    226                                               processOut.toString(),
    227                                               (poLen>0) ? poLen-1 : poLen);
    228 
    229                 }
    230             }
    231         }
    232         if(srcIsASCII){
    233             dest =  processOut;
    234         }else{
    235             // step 5 : verify the sequence does not begin with ACE prefix
    236             if(!startsWithPrefix(processOut)){
    237 
    238                 //step 6: encode the sequence with punycode
    239                 caseFlags = new boolean[poLen];
    240 
    241                 StringBuilder punyout = Punycode.encode(processOut,caseFlags);
    242 
    243                 // convert all codepoints to lower case ASCII
    244                 StringBuffer lowerOut = toASCIILower(punyout);
    245 
    246                 //Step 7: prepend the ACE prefix
    247                 dest.append(ACE_PREFIX,0,ACE_PREFIX.length);
    248                 //Step 6: copy the contents in b2 into dest
    249                 dest.append(lowerOut);
    250             }else{
    251 
    252                 throw new StringPrepParseException("The input does not start with the ACE Prefix.",
    253                                          StringPrepParseException.ACE_PREFIX_ERROR,processOut.toString(),0);
    254             }
    255         }
    256         if(dest.length() > MAX_LABEL_LENGTH){
    257             throw new StringPrepParseException("The labels in the input are too long. Length > 63.",
    258                                      StringPrepParseException.LABEL_TOO_LONG_ERROR,dest.toString(),0);
    259         }
    260         return dest;
    261     }
    262 
    263     public static StringBuffer convertIDNToASCII(String src,int options)
    264             throws StringPrepParseException{
    265 
    266         char[] srcArr = src.toCharArray();
    267         StringBuffer result = new StringBuffer();
    268         int sepIndex=0;
    269         int oldSepIndex=0;
    270         for(;;){
    271             sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
    272             String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
    273             //make sure this is not a root label separator.
    274             if(!(label.length()==0 && sepIndex==srcArr.length)){
    275                 UCharacterIterator iter = UCharacterIterator.getInstance(label);
    276                 result.append(convertToASCII(iter,options));
    277             }
    278             if(sepIndex==srcArr.length){
    279                 break;
    280             }
    281 
    282             // increment the sepIndex to skip past the separator
    283             sepIndex++;
    284             oldSepIndex = sepIndex;
    285             result.append((char)FULL_STOP);
    286         }
    287         if(result.length() > MAX_DOMAIN_NAME_LENGTH){
    288             throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
    289         }
    290         return result;
    291     }
    292 
    293     public static StringBuffer convertToUnicode(UCharacterIterator src, int options)
    294             throws StringPrepParseException{
    295 
    296         boolean[] caseFlags = null;
    297 
    298         // the source contains all ascii codepoints
    299         boolean srcIsASCII  = true;
    300         // assume the source contains all LDH codepoints
    301         //boolean srcIsLDH = true;
    302 
    303         //get the options
    304         //boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0);
    305 
    306         //int failPos = -1;
    307         int ch;
    308         int saveIndex = src.getIndex();
    309         // step 1: find out if all the codepoints in src are ASCII
    310         while((ch=src.next())!= UCharacterIterator.DONE){
    311             if(ch>0x7F){
    312                 srcIsASCII = false;
    313             }/*else if((srcIsLDH = isLDHChar(ch))==false){
    314                 failPos = src.getIndex();
    315             }*/
    316         }
    317         StringBuffer processOut;
    318 
    319         if(srcIsASCII == false){
    320             try {
    321                 // step 2: process the string
    322                 src.setIndex(saveIndex);
    323                 processOut = namePrep.prepare(src,options);
    324             } catch (StringPrepParseException ex) {
    325                 return new StringBuffer(src.getText());
    326             }
    327 
    328         }else{
    329             //just point to source
    330             processOut = new StringBuffer(src.getText());
    331         }
    332         // TODO:
    333         // The RFC states that
    334         // <quote>
    335         // ToUnicode never fails. If any step fails, then the original input
    336         // is returned immediately in that step.
    337         // </quote>
    338 
    339         //step 3: verify ACE Prefix
    340         if(startsWithPrefix(processOut)){
    341             StringBuffer decodeOut = null;
    342 
    343             //step 4: Remove the ACE Prefix
    344             String temp = processOut.substring(ACE_PREFIX.length,processOut.length());
    345 
    346             //step 5: Decode using punycode
    347             try {
    348                 decodeOut = new StringBuffer(Punycode.decode(temp,caseFlags));
    349             } catch (StringPrepParseException e) {
    350                 decodeOut = null;
    351             }
    352 
    353             //step 6:Apply toASCII
    354             if (decodeOut != null) {
    355                 StringBuffer toASCIIOut = convertToASCII(UCharacterIterator.getInstance(decodeOut), options);
    356 
    357                 //step 7: verify
    358                 if(compareCaseInsensitiveASCII(processOut, toASCIIOut) !=0){
    359 //                    throw new StringPrepParseException("The verification step prescribed by the RFC 3491 failed",
    360 //                                             StringPrepParseException.VERIFICATION_ERROR);
    361                     decodeOut = null;
    362                 }
    363             }
    364 
    365             //step 8: return output of step 5
    366              if (decodeOut != null) {
    367                  return decodeOut;
    368              }
    369         }
    370 
    371 //        }else{
    372 //            // verify that STD3 ASCII rules are satisfied
    373 //            if(useSTD3ASCIIRules == true){
    374 //                if( srcIsLDH == false /* source contains some non-LDH characters */
    375 //                    || processOut.charAt(0) ==  HYPHEN
    376 //                    || processOut.charAt(processOut.length()-1) == HYPHEN){
    377 //
    378 //                    if(srcIsLDH==false){
    379 //                        throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
    380 //                                                 StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),
    381 //                                                 (failPos>0) ? (failPos-1) : failPos);
    382 //                    }else if(processOut.charAt(0) == HYPHEN){
    383 //                        throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
    384 //                                                 StringPrepParseException.STD3_ASCII_RULES_ERROR,
    385 //                                                 processOut.toString(),0);
    386 //
    387 //                    }else{
    388 //                        throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
    389 //                                                 StringPrepParseException.STD3_ASCII_RULES_ERROR,
    390 //                                                 processOut.toString(),
    391 //                                                 processOut.length());
    392 //
    393 //                    }
    394 //                }
    395 //            }
    396 //            // just return the source
    397 //            return new StringBuffer(src.getText());
    398 //        }
    399 
    400         return new StringBuffer(src.getText());
    401     }
    402 
    403     public static StringBuffer convertIDNToUnicode(String src, int options)
    404             throws StringPrepParseException{
    405 
    406         char[] srcArr = src.toCharArray();
    407         StringBuffer result = new StringBuffer();
    408         int sepIndex=0;
    409         int oldSepIndex=0;
    410         for(;;){
    411             sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
    412             String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
    413             if(label.length()==0 && sepIndex!=srcArr.length ){
    414                 throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
    415             }
    416             UCharacterIterator iter = UCharacterIterator.getInstance(label);
    417             result.append(convertToUnicode(iter,options));
    418             if(sepIndex==srcArr.length){
    419                 break;
    420             }
    421             // Unlike the ToASCII operation we don't normalize the label separators
    422             result.append(srcArr[sepIndex]);
    423             // increment the sepIndex to skip past the separator
    424             sepIndex++;
    425             oldSepIndex =sepIndex;
    426         }
    427         if(result.length() > MAX_DOMAIN_NAME_LENGTH){
    428             throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
    429         }
    430         return result;
    431     }
    432 
    433     public static int compare(String s1, String s2, int options) throws StringPrepParseException{
    434         StringBuffer s1Out = convertIDNToASCII(s1, options);
    435         StringBuffer s2Out = convertIDNToASCII(s2, options);
    436         return compareCaseInsensitiveASCII(s1Out,s2Out);
    437     }
    438 }
    439