Home | History | Annotate | Download | only in impl
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 2003-2014, International Business Machines Corporation and
      6  * others. All Rights Reserved.
      7  *******************************************************************************
      8  */
      9 package com.ibm.icu.impl;
     10 
     11 import com.ibm.icu.lang.UCharacter;
     12 import com.ibm.icu.text.StringPrepParseException;
     13 import com.ibm.icu.text.UTF16;
     14 
     15 /**
     16  * Ported code from ICU punycode.c
     17  * @author ram
     18  */
     19 public final class Punycode {
     20 
     21     /* Punycode parameters for Bootstring */
     22     private static final int BASE           = 36;
     23     private static final int TMIN           = 1;
     24     private static final int TMAX           = 26;
     25     private static final int SKEW           = 38;
     26     private static final int DAMP           = 700;
     27     private static final int INITIAL_BIAS   = 72;
     28     private static final int INITIAL_N      = 0x80;
     29 
     30     /* "Basic" Unicode/ASCII code points */
     31     private static final char HYPHEN        = 0x2d;
     32     private static final char DELIMITER     = HYPHEN;
     33 
     34     private static final int ZERO           = 0x30;
     35     //private static final int NINE           = 0x39;
     36 
     37     private static final int SMALL_A        = 0x61;
     38     private static final int SMALL_Z        = 0x7a;
     39 
     40     private static final int CAPITAL_A      = 0x41;
     41     private static final int CAPITAL_Z      = 0x5a;
     42 
     43     private static int adaptBias(int delta, int length, boolean firstTime){
     44         if(firstTime){
     45             delta /=DAMP;
     46         }else{
     47             delta /=  2;
     48         }
     49         delta += delta/length;
     50 
     51         int count=0;
     52         for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
     53             delta/=(BASE-TMIN);
     54         }
     55 
     56         return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
     57     }
     58 
     59     /**
     60      * basicToDigit[] contains the numeric value of a basic code
     61      * point (for use in representing integers) in the range 0 to
     62      * BASE-1, or -1 if b is does not represent a value.
     63      */
     64     static final int[]    basicToDigit= new int[]{
     65         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
     66         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
     67 
     68         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
     69         26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
     70 
     71         -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
     72         15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
     73 
     74         -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
     75         15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
     76 
     77         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
     78         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
     79 
     80         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
     81         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
     82 
     83         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
     84         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
     85 
     86         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
     87         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
     88     };
     89 
     90     ///CLOVER:OFF
     91     private static char asciiCaseMap(char b, boolean uppercase) {
     92         if(uppercase) {
     93             if(SMALL_A<=b && b<=SMALL_Z) {
     94                 b-=(SMALL_A-CAPITAL_A);
     95             }
     96         } else {
     97             if(CAPITAL_A<=b && b<=CAPITAL_Z) {
     98                 b+=(SMALL_A-CAPITAL_A);
     99             }
    100         }
    101         return b;
    102     }
    103     ///CLOVER:ON
    104     /**
    105      * digitToBasic() returns the basic code point whose value
    106      * (when used for representing integers) is d, which must be in the
    107      * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
    108      * nonzero, in which case the uppercase form is used.
    109      */
    110     private static char digitToBasic(int digit, boolean uppercase) {
    111         /*  0..25 map to ASCII a..z or A..Z */
    112         /* 26..35 map to ASCII 0..9         */
    113         if(digit<26) {
    114             if(uppercase) {
    115                 return (char)(CAPITAL_A+digit);
    116             } else {
    117                 return (char)(SMALL_A+digit);
    118             }
    119         } else {
    120             return (char)((ZERO-26)+digit);
    121         }
    122     }
    123     /**
    124      * Converts Unicode to Punycode.
    125      * The input string must not contain single, unpaired surrogates.
    126      * The output will be represented as an array of ASCII code points.
    127      *
    128      * @param src The source of the String Buffer passed.
    129      * @param caseFlags The boolean array of case flags.
    130      * @return An array of ASCII code points.
    131      */
    132     public static StringBuilder encode(CharSequence src, boolean[] caseFlags) throws StringPrepParseException{
    133         int n, delta, handledCPCount, basicLength, bias, j, m, q, k, t, srcCPCount;
    134         char c, c2;
    135         int srcLength = src.length();
    136         int[] cpBuffer = new int[srcLength];
    137         StringBuilder dest = new StringBuilder(srcLength);
    138         /*
    139          * Handle the basic code points and
    140          * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
    141          */
    142         srcCPCount=0;
    143 
    144         for(j=0; j<srcLength; ++j) {
    145             c=src.charAt(j);
    146             if(isBasic(c)) {
    147                 cpBuffer[srcCPCount++]=0;
    148                 dest.append(caseFlags!=null ? asciiCaseMap(c, caseFlags[j]) : c);
    149             } else {
    150                 n=((caseFlags!=null && caseFlags[j])? 1 : 0)<<31L;
    151                 if(!UTF16.isSurrogate(c)) {
    152                     n|=c;
    153                 } else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) {
    154                     ++j;
    155 
    156                     n|=UCharacter.getCodePoint(c, c2);
    157                 } else {
    158                     /* error: unmatched surrogate */
    159                     throw new StringPrepParseException("Illegal char found",StringPrepParseException.ILLEGAL_CHAR_FOUND);
    160                 }
    161                 cpBuffer[srcCPCount++]=n;
    162             }
    163         }
    164 
    165         /* Finish the basic string - if it is not empty - with a delimiter. */
    166         basicLength=dest.length();
    167         if(basicLength>0) {
    168             dest.append(DELIMITER);
    169         }
    170 
    171         /*
    172          * handledCPCount is the number of code points that have been handled
    173          * basicLength is the number of basic code points
    174          * destLength is the number of chars that have been output
    175          */
    176 
    177         /* Initialize the state: */
    178         n=INITIAL_N;
    179         delta=0;
    180         bias=INITIAL_BIAS;
    181 
    182         /* Main encoding loop: */
    183         for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
    184             /*
    185              * All non-basic code points < n have been handled already.
    186              * Find the next larger one:
    187              */
    188             for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
    189                 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
    190                 if(n<=q && q<m) {
    191                     m=q;
    192                 }
    193             }
    194 
    195             /*
    196              * Increase delta enough to advance the decoder's
    197              * <n,i> state to <m,0>, but guard against overflow:
    198              */
    199             if(m-n>(0x7fffffff-delta)/(handledCPCount+1)) {
    200                 throw new IllegalStateException("Internal program error");
    201             }
    202             delta+=(m-n)*(handledCPCount+1);
    203             n=m;
    204 
    205             /* Encode a sequence of same code points n */
    206             for(j=0; j<srcCPCount; ++j) {
    207                 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
    208                 if(q<n) {
    209                     ++delta;
    210                 } else if(q==n) {
    211                     /* Represent delta as a generalized variable-length integer: */
    212                     for(q=delta, k=BASE; /* no condition */; k+=BASE) {
    213 
    214                         /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
    215 
    216                         t=k-bias;
    217                         if(t<TMIN) {
    218                             t=TMIN;
    219                         } else if(t>TMAX) {
    220                             t=TMAX;
    221                         }
    222                         */
    223 
    224                         t=k-bias;
    225                         if(t<TMIN) {
    226                             t=TMIN;
    227                         } else if(k>=(bias+TMAX)) {
    228                             t=TMAX;
    229                         }
    230 
    231                         if(q<t) {
    232                             break;
    233                         }
    234 
    235                         dest.append(digitToBasic(t+(q-t)%(BASE-t), false));
    236                         q=(q-t)/(BASE-t);
    237                     }
    238 
    239                     dest.append(digitToBasic(q, (cpBuffer[j]<0)));
    240                     bias=adaptBias(delta, handledCPCount+1,(handledCPCount==basicLength));
    241                     delta=0;
    242                     ++handledCPCount;
    243                 }
    244             }
    245 
    246             ++delta;
    247             ++n;
    248         }
    249 
    250         return dest;
    251     }
    252 
    253     private static boolean isBasic(int ch){
    254         return (ch < INITIAL_N);
    255     }
    256     ///CLOVER:OFF
    257     private static boolean isBasicUpperCase(int ch){
    258         return( CAPITAL_A<=ch && ch >= CAPITAL_Z);
    259     }
    260     ///CLOVER:ON
    261     private static boolean isSurrogate(int ch){
    262         return (((ch)&0xfffff800)==0xd800);
    263     }
    264     /**
    265      * Converts Punycode to Unicode.
    266      * The Unicode string will be at most as long as the Punycode string.
    267      *
    268      * @param src The source of the string buffer being passed.
    269      * @param caseFlags The array of boolean case flags.
    270      * @return StringBuilder string.
    271      */
    272     public static StringBuilder decode(CharSequence src, boolean[] caseFlags)
    273                                throws StringPrepParseException{
    274         int srcLength = src.length();
    275         StringBuilder dest = new StringBuilder(src.length());
    276         int n, i, bias, basicLength, j, in, oldi, w, k, digit, t,
    277                 destCPCount, firstSupplementaryIndex, cpLength;
    278         char b;
    279 
    280         /*
    281          * Handle the basic code points:
    282          * Let basicLength be the number of input code points
    283          * before the last delimiter, or 0 if there is none,
    284          * then copy the first basicLength code points to the output.
    285          *
    286          * The following loop iterates backward.
    287          */
    288         for(j=srcLength; j>0;) {
    289             if(src.charAt(--j)==DELIMITER) {
    290                 break;
    291             }
    292         }
    293         basicLength=destCPCount=j;
    294 
    295         for(j=0; j<basicLength; ++j) {
    296             b=src.charAt(j);
    297             if(!isBasic(b)) {
    298                 throw new StringPrepParseException("Illegal char found", StringPrepParseException.INVALID_CHAR_FOUND);
    299             }
    300             dest.append(b);
    301 
    302             if(caseFlags!=null && j<caseFlags.length) {
    303                 caseFlags[j]=isBasicUpperCase(b);
    304             }
    305         }
    306 
    307         /* Initialize the state: */
    308         n=INITIAL_N;
    309         i=0;
    310         bias=INITIAL_BIAS;
    311         firstSupplementaryIndex=1000000000;
    312 
    313         /*
    314          * Main decoding loop:
    315          * Start just after the last delimiter if any
    316          * basic code points were copied; start at the beginning otherwise.
    317          */
    318         for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
    319             /*
    320              * in is the index of the next character to be consumed, and
    321              * destCPCount is the number of code points in the output array.
    322              *
    323              * Decode a generalized variable-length integer into delta,
    324              * which gets added to i.  The overflow checking is easier
    325              * if we increase i as we go, then subtract off its starting
    326              * value at the end to obtain delta.
    327              */
    328             for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
    329                 if(in>=srcLength) {
    330                     throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
    331                 }
    332 
    333                 digit=basicToDigit[src.charAt(in++) & 0xFF];
    334                 if(digit<0) {
    335                     throw new StringPrepParseException("Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND);
    336                 }
    337                 if(digit>(0x7fffffff-i)/w) {
    338                     /* integer overflow */
    339                     throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
    340                 }
    341 
    342                 i+=digit*w;
    343                 t=k-bias;
    344                 if(t<TMIN) {
    345                     t=TMIN;
    346                 } else if(k>=(bias+TMAX)) {
    347                     t=TMAX;
    348                 }
    349                 if(digit<t) {
    350                     break;
    351                 }
    352 
    353                 if(w>0x7fffffff/(BASE-t)) {
    354                     /* integer overflow */
    355                     throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
    356                 }
    357                 w*=BASE-t;
    358             }
    359 
    360             /*
    361              * Modification from sample code:
    362              * Increments destCPCount here,
    363              * where needed instead of in for() loop tail.
    364              */
    365             ++destCPCount;
    366             bias=adaptBias(i-oldi, destCPCount, (oldi==0));
    367 
    368             /*
    369              * i was supposed to wrap around from (incremented) destCPCount to 0,
    370              * incrementing n each time, so we'll fix that now:
    371              */
    372             if(i/destCPCount>(0x7fffffff-n)) {
    373                 /* integer overflow */
    374                 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
    375             }
    376 
    377             n+=i/destCPCount;
    378             i%=destCPCount;
    379             /* not needed for Punycode: */
    380             /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
    381 
    382             if(n>0x10ffff || isSurrogate(n)) {
    383                 /* Unicode code point overflow */
    384                 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
    385             }
    386 
    387             /* Insert n at position i of the output: */
    388             cpLength=Character.charCount(n);
    389             int codeUnitIndex;
    390 
    391             /*
    392              * Handle indexes when supplementary code points are present.
    393              *
    394              * In almost all cases, there will be only BMP code points before i
    395              * and even in the entire string.
    396              * This is handled with the same efficiency as with UTF-32.
    397              *
    398              * Only the rare cases with supplementary code points are handled
    399              * more slowly - but not too bad since this is an insertion anyway.
    400              */
    401             if(i<=firstSupplementaryIndex) {
    402                 codeUnitIndex=i;
    403                 if(cpLength>1) {
    404                     firstSupplementaryIndex=codeUnitIndex;
    405                 } else {
    406                     ++firstSupplementaryIndex;
    407                 }
    408             } else {
    409                 codeUnitIndex=dest.offsetByCodePoints(firstSupplementaryIndex, i-firstSupplementaryIndex);
    410             }
    411 
    412             /* use the UChar index codeUnitIndex instead of the code point index i */
    413             if(caseFlags!=null && (dest.length()+cpLength)<=caseFlags.length) {
    414                 if(codeUnitIndex<dest.length()) {
    415                     System.arraycopy(caseFlags, codeUnitIndex,
    416                                      caseFlags, codeUnitIndex+cpLength,
    417                                      dest.length()-codeUnitIndex);
    418                 }
    419                 /* Case of last character determines uppercase flag: */
    420                 caseFlags[codeUnitIndex]=isBasicUpperCase(src.charAt(in-1));
    421                 if(cpLength==2) {
    422                     caseFlags[codeUnitIndex+1]=false;
    423                 }
    424             }
    425             if(cpLength==1) {
    426                 /* BMP, insert one code unit */
    427                 dest.insert(codeUnitIndex, (char)n);
    428             } else {
    429                 /* supplementary character, insert two code units */
    430                 dest.insert(codeUnitIndex, UTF16.getLeadSurrogate(n));
    431                 dest.insert(codeUnitIndex+1, UTF16.getTrailSurrogate(n));
    432             }
    433             ++i;
    434         }
    435         return dest;
    436     }
    437 }
    438