Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2002-2011, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  punycode.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2002jan31
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 
     18 /* This ICU code derived from: */
     19 /*
     20 punycode.c 0.4.0 (2001-Nov-17-Sat)
     21 http://www.cs.berkeley.edu/~amc/idn/
     22 Adam M. Costello
     23 http://www.nicemice.net/amc/
     24 
     25 Disclaimer and license
     26 
     27     Regarding this entire document or any portion of it (including
     28     the pseudocode and C code), the author makes no guarantees and
     29     is not responsible for any damage resulting from its use.  The
     30     author grants irrevocable permission to anyone to use, modify,
     31     and distribute it in any way that does not diminish the rights
     32     of anyone else to use, modify, and distribute it, provided that
     33     redistributed derivative works do not contain misleading author or
     34     version information.  Derivative works need not be licensed under
     35     similar terms.
     36 */
     37 /*
     38  * ICU modifications:
     39  * - ICU data types and coding conventions
     40  * - ICU string buffer handling with implicit source lengths
     41  *   and destination preflighting
     42  * - UTF-16 handling
     43  */
     44 
     45 #include "unicode/utypes.h"
     46 
     47 #if !UCONFIG_NO_IDNA
     48 
     49 #include "unicode/ustring.h"
     50 #include "unicode/utf.h"
     51 #include "unicode/utf16.h"
     52 #include "ustr_imp.h"
     53 #include "cstring.h"
     54 #include "cmemory.h"
     55 #include "punycode.h"
     56 #include "uassert.h"
     57 
     58 
     59 /* Punycode ----------------------------------------------------------------- */
     60 
     61 /* Punycode parameters for Bootstring */
     62 #define BASE            36
     63 #define TMIN            1
     64 #define TMAX            26
     65 #define SKEW            38
     66 #define DAMP            700
     67 #define INITIAL_BIAS    72
     68 #define INITIAL_N       0x80
     69 
     70 /* "Basic" Unicode/ASCII code points */
     71 #define _HYPHEN         0X2d
     72 #define DELIMITER       _HYPHEN
     73 
     74 #define _ZERO_          0X30
     75 #define _NINE           0x39
     76 
     77 #define _SMALL_A        0X61
     78 #define _SMALL_Z        0X7a
     79 
     80 #define _CAPITAL_A      0X41
     81 #define _CAPITAL_Z      0X5a
     82 
     83 #define IS_BASIC(c) ((c)<0x80)
     84 #define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
     85 
     86 /**
     87  * digitToBasic() returns the basic code point whose value
     88  * (when used for representing integers) is d, which must be in the
     89  * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
     90  * nonzero, in which case the uppercase form is used.
     91  */
     92 static inline char
     93 digitToBasic(int32_t digit, UBool uppercase) {
     94     /*  0..25 map to ASCII a..z or A..Z */
     95     /* 26..35 map to ASCII 0..9         */
     96     if(digit<26) {
     97         if(uppercase) {
     98             return (char)(_CAPITAL_A+digit);
     99         } else {
    100             return (char)(_SMALL_A+digit);
    101         }
    102     } else {
    103         return (char)((_ZERO_-26)+digit);
    104     }
    105 }
    106 
    107 /**
    108  * basicToDigit[] contains the numeric value of a basic code
    109  * point (for use in representing integers) in the range 0 to
    110  * BASE-1, or -1 if b is does not represent a value.
    111  */
    112 static const int8_t
    113 basicToDigit[256]={
    114     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    115     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    116 
    117     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    118     26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
    119 
    120     -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    121     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
    122 
    123     -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    124     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
    125 
    126     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    127     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    128 
    129     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    130     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    131 
    132     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    133     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    134 
    135     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    136     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
    137 };
    138 
    139 static inline char
    140 asciiCaseMap(char b, UBool uppercase) {
    141     if(uppercase) {
    142         if(_SMALL_A<=b && b<=_SMALL_Z) {
    143             b-=(_SMALL_A-_CAPITAL_A);
    144         }
    145     } else {
    146         if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
    147             b+=(_SMALL_A-_CAPITAL_A);
    148         }
    149     }
    150     return b;
    151 }
    152 
    153 /* Punycode-specific Bootstring code ---------------------------------------- */
    154 
    155 /*
    156  * The following code omits the {parts} of the pseudo-algorithm in the spec
    157  * that are not used with the Punycode parameter set.
    158  */
    159 
    160 /* Bias adaptation function. */
    161 static int32_t
    162 adaptBias(int32_t delta, int32_t length, UBool firstTime) {
    163     int32_t count;
    164 
    165     if(firstTime) {
    166         delta/=DAMP;
    167     } else {
    168         delta/=2;
    169     }
    170 
    171     delta+=delta/length;
    172     for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
    173         delta/=(BASE-TMIN);
    174     }
    175 
    176     return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
    177 }
    178 
    179 #define MAX_CP_COUNT    200
    180 
    181 U_CFUNC int32_t
    182 u_strToPunycode(const UChar *src, int32_t srcLength,
    183                 UChar *dest, int32_t destCapacity,
    184                 const UBool *caseFlags,
    185                 UErrorCode *pErrorCode) {
    186 
    187     int32_t cpBuffer[MAX_CP_COUNT];
    188     int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
    189     UChar c, c2;
    190 
    191     /* argument checking */
    192     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    193         return 0;
    194     }
    195 
    196     if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
    197         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    198         return 0;
    199     }
    200 
    201     /*
    202      * Handle the basic code points and
    203      * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
    204      */
    205     srcCPCount=destLength=0;
    206     if(srcLength==-1) {
    207         /* NUL-terminated input */
    208         for(j=0; /* no condition */; ++j) {
    209             if((c=src[j])==0) {
    210                 break;
    211             }
    212             if(srcCPCount==MAX_CP_COUNT) {
    213                 /* too many input code points */
    214                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    215                 return 0;
    216             }
    217             if(IS_BASIC(c)) {
    218                 cpBuffer[srcCPCount++]=0;
    219                 if(destLength<destCapacity) {
    220                     dest[destLength]=
    221                         caseFlags!=NULL ?
    222                             asciiCaseMap((char)c, caseFlags[j]) :
    223                             (char)c;
    224                 }
    225                 ++destLength;
    226             } else {
    227                 n=(caseFlags!=NULL && caseFlags[j])<<31L;
    228                 if(U16_IS_SINGLE(c)) {
    229                     n|=c;
    230                 } else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
    231                     ++j;
    232                     n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
    233                 } else {
    234                     /* error: unmatched surrogate */
    235                     *pErrorCode=U_INVALID_CHAR_FOUND;
    236                     return 0;
    237                 }
    238                 cpBuffer[srcCPCount++]=n;
    239             }
    240         }
    241     } else {
    242         /* length-specified input */
    243         for(j=0; j<srcLength; ++j) {
    244             if(srcCPCount==MAX_CP_COUNT) {
    245                 /* too many input code points */
    246                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    247                 return 0;
    248             }
    249             c=src[j];
    250             if(IS_BASIC(c)) {
    251                 cpBuffer[srcCPCount++]=0;
    252                 if(destLength<destCapacity) {
    253                     dest[destLength]=
    254                         caseFlags!=NULL ?
    255                             asciiCaseMap((char)c, caseFlags[j]) :
    256                             (char)c;
    257                 }
    258                 ++destLength;
    259             } else {
    260                 n=(caseFlags!=NULL && caseFlags[j])<<31L;
    261                 if(U16_IS_SINGLE(c)) {
    262                     n|=c;
    263                 } else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
    264                     ++j;
    265                     n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
    266                 } else {
    267                     /* error: unmatched surrogate */
    268                     *pErrorCode=U_INVALID_CHAR_FOUND;
    269                     return 0;
    270                 }
    271                 cpBuffer[srcCPCount++]=n;
    272             }
    273         }
    274     }
    275 
    276     /* Finish the basic string - if it is not empty - with a delimiter. */
    277     basicLength=destLength;
    278     if(basicLength>0) {
    279         if(destLength<destCapacity) {
    280             dest[destLength]=DELIMITER;
    281         }
    282         ++destLength;
    283     }
    284 
    285     /*
    286      * handledCPCount is the number of code points that have been handled
    287      * basicLength is the number of basic code points
    288      * destLength is the number of chars that have been output
    289      */
    290 
    291     /* Initialize the state: */
    292     n=INITIAL_N;
    293     delta=0;
    294     bias=INITIAL_BIAS;
    295 
    296     /* Main encoding loop: */
    297     for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
    298         /*
    299          * All non-basic code points < n have been handled already.
    300          * Find the next larger one:
    301          */
    302         for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
    303             q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
    304             if(n<=q && q<m) {
    305                 m=q;
    306             }
    307         }
    308 
    309         /*
    310          * Increase delta enough to advance the decoder's
    311          * <n,i> state to <m,0>, but guard against overflow:
    312          */
    313         if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
    314             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
    315             return 0;
    316         }
    317         delta+=(m-n)*(handledCPCount+1);
    318         n=m;
    319 
    320         /* Encode a sequence of same code points n */
    321         for(j=0; j<srcCPCount; ++j) {
    322             q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
    323             if(q<n) {
    324                 ++delta;
    325             } else if(q==n) {
    326                 /* Represent delta as a generalized variable-length integer: */
    327                 for(q=delta, k=BASE; /* no condition */; k+=BASE) {
    328 
    329                     /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
    330 
    331                     t=k-bias;
    332                     if(t<TMIN) {
    333                         t=TMIN;
    334                     } else if(t>TMAX) {
    335                         t=TMAX;
    336                     }
    337                     */
    338 
    339                     t=k-bias;
    340                     if(t<TMIN) {
    341                         t=TMIN;
    342                     } else if(k>=(bias+TMAX)) {
    343                         t=TMAX;
    344                     }
    345 
    346                     if(q<t) {
    347                         break;
    348                     }
    349 
    350                     if(destLength<destCapacity) {
    351                         dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
    352                     }
    353                     ++destLength;
    354                     q=(q-t)/(BASE-t);
    355                 }
    356 
    357                 if(destLength<destCapacity) {
    358                     dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
    359                 }
    360                 ++destLength;
    361                 bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
    362                 delta=0;
    363                 ++handledCPCount;
    364             }
    365         }
    366 
    367         ++delta;
    368         ++n;
    369     }
    370 
    371     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
    372 }
    373 
    374 U_CFUNC int32_t
    375 u_strFromPunycode(const UChar *src, int32_t srcLength,
    376                   UChar *dest, int32_t destCapacity,
    377                   UBool *caseFlags,
    378                   UErrorCode *pErrorCode) {
    379     int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
    380             destCPCount, firstSupplementaryIndex, cpLength;
    381     UChar b;
    382 
    383     /* argument checking */
    384     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    385         return 0;
    386     }
    387 
    388     if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
    389         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    390         return 0;
    391     }
    392 
    393     if(srcLength==-1) {
    394         srcLength=u_strlen(src);
    395     }
    396 
    397     /*
    398      * Handle the basic code points:
    399      * Let basicLength be the number of input code points
    400      * before the last delimiter, or 0 if there is none,
    401      * then copy the first basicLength code points to the output.
    402      *
    403      * The two following loops iterate backward.
    404      */
    405     for(j=srcLength; j>0;) {
    406         if(src[--j]==DELIMITER) {
    407             break;
    408         }
    409     }
    410     destLength=basicLength=destCPCount=j;
    411     U_ASSERT(destLength>=0);
    412 
    413     while(j>0) {
    414         b=src[--j];
    415         if(!IS_BASIC(b)) {
    416             *pErrorCode=U_INVALID_CHAR_FOUND;
    417             return 0;
    418         }
    419 
    420         if(j<destCapacity) {
    421             dest[j]=(UChar)b;
    422 
    423             if(caseFlags!=NULL) {
    424                 caseFlags[j]=IS_BASIC_UPPERCASE(b);
    425             }
    426         }
    427     }
    428 
    429     /* Initialize the state: */
    430     n=INITIAL_N;
    431     i=0;
    432     bias=INITIAL_BIAS;
    433     firstSupplementaryIndex=1000000000;
    434 
    435     /*
    436      * Main decoding loop:
    437      * Start just after the last delimiter if any
    438      * basic code points were copied; start at the beginning otherwise.
    439      */
    440     for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
    441         /*
    442          * in is the index of the next character to be consumed, and
    443          * destCPCount is the number of code points in the output array.
    444          *
    445          * Decode a generalized variable-length integer into delta,
    446          * which gets added to i.  The overflow checking is easier
    447          * if we increase i as we go, then subtract off its starting
    448          * value at the end to obtain delta.
    449          */
    450         for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
    451             if(in>=srcLength) {
    452                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    453                 return 0;
    454             }
    455 
    456             digit=basicToDigit[(uint8_t)src[in++]];
    457             if(digit<0) {
    458                 *pErrorCode=U_INVALID_CHAR_FOUND;
    459                 return 0;
    460             }
    461             if(digit>(0x7fffffff-i)/w) {
    462                 /* integer overflow */
    463                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    464                 return 0;
    465             }
    466 
    467             i+=digit*w;
    468             /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
    469             t=k-bias;
    470             if(t<TMIN) {
    471                 t=TMIN;
    472             } else if(t>TMAX) {
    473                 t=TMAX;
    474             }
    475             */
    476             t=k-bias;
    477             if(t<TMIN) {
    478                 t=TMIN;
    479             } else if(k>=(bias+TMAX)) {
    480                 t=TMAX;
    481             }
    482             if(digit<t) {
    483                 break;
    484             }
    485 
    486             if(w>0x7fffffff/(BASE-t)) {
    487                 /* integer overflow */
    488                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    489                 return 0;
    490             }
    491             w*=BASE-t;
    492         }
    493 
    494         /*
    495          * Modification from sample code:
    496          * Increments destCPCount here,
    497          * where needed instead of in for() loop tail.
    498          */
    499         ++destCPCount;
    500         bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
    501 
    502         /*
    503          * i was supposed to wrap around from (incremented) destCPCount to 0,
    504          * incrementing n each time, so we'll fix that now:
    505          */
    506         if(i/destCPCount>(0x7fffffff-n)) {
    507             /* integer overflow */
    508             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    509             return 0;
    510         }
    511 
    512         n+=i/destCPCount;
    513         i%=destCPCount;
    514         /* not needed for Punycode: */
    515         /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
    516 
    517         if(n>0x10ffff || U_IS_SURROGATE(n)) {
    518             /* Unicode code point overflow */
    519             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    520             return 0;
    521         }
    522 
    523         /* Insert n at position i of the output: */
    524         cpLength=U16_LENGTH(n);
    525         if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
    526             int32_t codeUnitIndex;
    527 
    528             /*
    529              * Handle indexes when supplementary code points are present.
    530              *
    531              * In almost all cases, there will be only BMP code points before i
    532              * and even in the entire string.
    533              * This is handled with the same efficiency as with UTF-32.
    534              *
    535              * Only the rare cases with supplementary code points are handled
    536              * more slowly - but not too bad since this is an insertion anyway.
    537              */
    538             if(i<=firstSupplementaryIndex) {
    539                 codeUnitIndex=i;
    540                 if(cpLength>1) {
    541                     firstSupplementaryIndex=codeUnitIndex;
    542                 } else {
    543                     ++firstSupplementaryIndex;
    544                 }
    545             } else {
    546                 codeUnitIndex=firstSupplementaryIndex;
    547                 U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
    548             }
    549 
    550             /* use the UChar index codeUnitIndex instead of the code point index i */
    551             if(codeUnitIndex<destLength) {
    552                 uprv_memmove(dest+codeUnitIndex+cpLength,
    553                              dest+codeUnitIndex,
    554                              (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
    555                 if(caseFlags!=NULL) {
    556                     uprv_memmove(caseFlags+codeUnitIndex+cpLength,
    557                                  caseFlags+codeUnitIndex,
    558                                  destLength-codeUnitIndex);
    559                 }
    560             }
    561             if(cpLength==1) {
    562                 /* BMP, insert one code unit */
    563                 dest[codeUnitIndex]=(UChar)n;
    564             } else {
    565                 /* supplementary character, insert two code units */
    566                 dest[codeUnitIndex]=U16_LEAD(n);
    567                 dest[codeUnitIndex+1]=U16_TRAIL(n);
    568             }
    569             if(caseFlags!=NULL) {
    570                 /* Case of last character determines uppercase flag: */
    571                 caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
    572                 if(cpLength==2) {
    573                     caseFlags[codeUnitIndex+1]=FALSE;
    574                 }
    575             }
    576         }
    577         destLength+=cpLength;
    578         U_ASSERT(destLength>=0);
    579         ++i;
    580     }
    581 
    582     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
    583 }
    584 
    585 /* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
    586 
    587 #endif /* #if !UCONFIG_NO_IDNA */
    588