Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2002-2010, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  punycode.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2002jan31
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 
     18 /* This ICU code derived from: */
     19 /*
     20 punycode.c 0.4.0 (2001-Nov-17-Sat)
     21 http://www.cs.berkeley.edu/~amc/idn/
     22 Adam M. Costello
     23 http://www.nicemice.net/amc/
     24 
     25 Disclaimer and license
     26 
     27     Regarding this entire document or any portion of it (including
     28     the pseudocode and C code), the author makes no guarantees and
     29     is not responsible for any damage resulting from its use.  The
     30     author grants irrevocable permission to anyone to use, modify,
     31     and distribute it in any way that does not diminish the rights
     32     of anyone else to use, modify, and distribute it, provided that
     33     redistributed derivative works do not contain misleading author or
     34     version information.  Derivative works need not be licensed under
     35     similar terms.
     36 */
     37 /*
     38  * ICU modifications:
     39  * - ICU data types and coding conventions
     40  * - ICU string buffer handling with implicit source lengths
     41  *   and destination preflighting
     42  * - UTF-16 handling
     43  */
     44 
     45 #include "unicode/utypes.h"
     46 
     47 #if !UCONFIG_NO_IDNA
     48 
     49 #include "ustr_imp.h"
     50 #include "cstring.h"
     51 #include "cmemory.h"
     52 #include "punycode.h"
     53 #include "unicode/ustring.h"
     54 
     55 
     56 /* Punycode ----------------------------------------------------------------- */
     57 
     58 /* Punycode parameters for Bootstring */
     59 #define BASE            36
     60 #define TMIN            1
     61 #define TMAX            26
     62 #define SKEW            38
     63 #define DAMP            700
     64 #define INITIAL_BIAS    72
     65 #define INITIAL_N       0x80
     66 
     67 /* "Basic" Unicode/ASCII code points */
     68 #define _HYPHEN         0X2d
     69 #define DELIMITER       _HYPHEN
     70 
     71 #define _ZERO_          0X30
     72 #define _NINE           0x39
     73 
     74 #define _SMALL_A        0X61
     75 #define _SMALL_Z        0X7a
     76 
     77 #define _CAPITAL_A      0X41
     78 #define _CAPITAL_Z      0X5a
     79 
     80 #define IS_BASIC(c) ((c)<0x80)
     81 #define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
     82 
     83 /**
     84  * digitToBasic() returns the basic code point whose value
     85  * (when used for representing integers) is d, which must be in the
     86  * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
     87  * nonzero, in which case the uppercase form is used.
     88  */
     89 static U_INLINE char
     90 digitToBasic(int32_t digit, UBool uppercase) {
     91     /*  0..25 map to ASCII a..z or A..Z */
     92     /* 26..35 map to ASCII 0..9         */
     93     if(digit<26) {
     94         if(uppercase) {
     95             return (char)(_CAPITAL_A+digit);
     96         } else {
     97             return (char)(_SMALL_A+digit);
     98         }
     99     } else {
    100         return (char)((_ZERO_-26)+digit);
    101     }
    102 }
    103 
    104 /**
    105  * basicToDigit[] contains the numeric value of a basic code
    106  * point (for use in representing integers) in the range 0 to
    107  * BASE-1, or -1 if b is does not represent a value.
    108  */
    109 static const int8_t
    110 basicToDigit[256]={
    111     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    112     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    113 
    114     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    115     26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
    116 
    117     -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    118     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
    119 
    120     -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    121     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
    122 
    123     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    124     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    125 
    126     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    127     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    128 
    129     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    130     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    131 
    132     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    133     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
    134 };
    135 
    136 static U_INLINE char
    137 asciiCaseMap(char b, UBool uppercase) {
    138     if(uppercase) {
    139         if(_SMALL_A<=b && b<=_SMALL_Z) {
    140             b-=(_SMALL_A-_CAPITAL_A);
    141         }
    142     } else {
    143         if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
    144             b+=(_SMALL_A-_CAPITAL_A);
    145         }
    146     }
    147     return b;
    148 }
    149 
    150 /* Punycode-specific Bootstring code ---------------------------------------- */
    151 
    152 /*
    153  * The following code omits the {parts} of the pseudo-algorithm in the spec
    154  * that are not used with the Punycode parameter set.
    155  */
    156 
    157 /* Bias adaptation function. */
    158 static int32_t
    159 adaptBias(int32_t delta, int32_t length, UBool firstTime) {
    160     int32_t count;
    161 
    162     if(firstTime) {
    163         delta/=DAMP;
    164     } else {
    165         delta/=2;
    166     }
    167 
    168     delta+=delta/length;
    169     for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
    170         delta/=(BASE-TMIN);
    171     }
    172 
    173     return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
    174 }
    175 
    176 #define MAX_CP_COUNT    200
    177 
    178 U_CFUNC int32_t
    179 u_strToPunycode(const UChar *src, int32_t srcLength,
    180                 UChar *dest, int32_t destCapacity,
    181                 const UBool *caseFlags,
    182                 UErrorCode *pErrorCode) {
    183 
    184     int32_t cpBuffer[MAX_CP_COUNT];
    185     int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
    186     UChar c, c2;
    187 
    188     /* argument checking */
    189     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    190         return 0;
    191     }
    192 
    193     if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
    194         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    195         return 0;
    196     }
    197 
    198     /*
    199      * Handle the basic code points and
    200      * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
    201      */
    202     srcCPCount=destLength=0;
    203     if(srcLength==-1) {
    204         /* NUL-terminated input */
    205         for(j=0; /* no condition */; ++j) {
    206             if((c=src[j])==0) {
    207                 break;
    208             }
    209             if(srcCPCount==MAX_CP_COUNT) {
    210                 /* too many input code points */
    211                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    212                 return 0;
    213             }
    214             if(IS_BASIC(c)) {
    215                 cpBuffer[srcCPCount++]=0;
    216                 if(destLength<destCapacity) {
    217                     dest[destLength]=
    218                         caseFlags!=NULL ?
    219                             asciiCaseMap((char)c, caseFlags[j]) :
    220                             (char)c;
    221                 }
    222                 ++destLength;
    223             } else {
    224                 n=(caseFlags!=NULL && caseFlags[j])<<31L;
    225                 if(UTF_IS_SINGLE(c)) {
    226                     n|=c;
    227                 } else if(UTF_IS_LEAD(c) && UTF_IS_TRAIL(c2=src[j+1])) {
    228                     ++j;
    229                     n|=(int32_t)UTF16_GET_PAIR_VALUE(c, c2);
    230                 } else {
    231                     /* error: unmatched surrogate */
    232                     *pErrorCode=U_INVALID_CHAR_FOUND;
    233                     return 0;
    234                 }
    235                 cpBuffer[srcCPCount++]=n;
    236             }
    237         }
    238     } else {
    239         /* length-specified input */
    240         for(j=0; j<srcLength; ++j) {
    241             if(srcCPCount==MAX_CP_COUNT) {
    242                 /* too many input code points */
    243                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    244                 return 0;
    245             }
    246             c=src[j];
    247             if(IS_BASIC(c)) {
    248                 cpBuffer[srcCPCount++]=0;
    249                 if(destLength<destCapacity) {
    250                     dest[destLength]=
    251                         caseFlags!=NULL ?
    252                             asciiCaseMap((char)c, caseFlags[j]) :
    253                             (char)c;
    254                 }
    255                 ++destLength;
    256             } else {
    257                 n=(caseFlags!=NULL && caseFlags[j])<<31L;
    258                 if(UTF_IS_SINGLE(c)) {
    259                     n|=c;
    260                 } else if(UTF_IS_LEAD(c) && (j+1)<srcLength && UTF_IS_TRAIL(c2=src[j+1])) {
    261                     ++j;
    262                     n|=(int32_t)UTF16_GET_PAIR_VALUE(c, c2);
    263                 } else {
    264                     /* error: unmatched surrogate */
    265                     *pErrorCode=U_INVALID_CHAR_FOUND;
    266                     return 0;
    267                 }
    268                 cpBuffer[srcCPCount++]=n;
    269             }
    270         }
    271     }
    272 
    273     /* Finish the basic string - if it is not empty - with a delimiter. */
    274     basicLength=destLength;
    275     if(basicLength>0) {
    276         if(destLength<destCapacity) {
    277             dest[destLength]=DELIMITER;
    278         }
    279         ++destLength;
    280     }
    281 
    282     /*
    283      * handledCPCount is the number of code points that have been handled
    284      * basicLength is the number of basic code points
    285      * destLength is the number of chars that have been output
    286      */
    287 
    288     /* Initialize the state: */
    289     n=INITIAL_N;
    290     delta=0;
    291     bias=INITIAL_BIAS;
    292 
    293     /* Main encoding loop: */
    294     for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
    295         /*
    296          * All non-basic code points < n have been handled already.
    297          * Find the next larger one:
    298          */
    299         for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
    300             q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
    301             if(n<=q && q<m) {
    302                 m=q;
    303             }
    304         }
    305 
    306         /*
    307          * Increase delta enough to advance the decoder's
    308          * <n,i> state to <m,0>, but guard against overflow:
    309          */
    310         if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
    311             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
    312             return 0;
    313         }
    314         delta+=(m-n)*(handledCPCount+1);
    315         n=m;
    316 
    317         /* Encode a sequence of same code points n */
    318         for(j=0; j<srcCPCount; ++j) {
    319             q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
    320             if(q<n) {
    321                 ++delta;
    322             } else if(q==n) {
    323                 /* Represent delta as a generalized variable-length integer: */
    324                 for(q=delta, k=BASE; /* no condition */; k+=BASE) {
    325 
    326                     /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
    327 
    328                     t=k-bias;
    329                     if(t<TMIN) {
    330                         t=TMIN;
    331                     } else if(t>TMAX) {
    332                         t=TMAX;
    333                     }
    334                     */
    335 
    336                     t=k-bias;
    337                     if(t<TMIN) {
    338                         t=TMIN;
    339                     } else if(k>=(bias+TMAX)) {
    340                         t=TMAX;
    341                     }
    342 
    343                     if(q<t) {
    344                         break;
    345                     }
    346 
    347                     if(destLength<destCapacity) {
    348                         dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
    349                     }
    350                     ++destLength;
    351                     q=(q-t)/(BASE-t);
    352                 }
    353 
    354                 if(destLength<destCapacity) {
    355                     dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
    356                 }
    357                 ++destLength;
    358                 bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
    359                 delta=0;
    360                 ++handledCPCount;
    361             }
    362         }
    363 
    364         ++delta;
    365         ++n;
    366     }
    367 
    368     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
    369 }
    370 
    371 U_CFUNC int32_t
    372 u_strFromPunycode(const UChar *src, int32_t srcLength,
    373                   UChar *dest, int32_t destCapacity,
    374                   UBool *caseFlags,
    375                   UErrorCode *pErrorCode) {
    376     int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
    377             destCPCount, firstSupplementaryIndex, cpLength;
    378     UChar b;
    379 
    380     /* argument checking */
    381     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    382         return 0;
    383     }
    384 
    385     if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
    386         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    387         return 0;
    388     }
    389 
    390     if(srcLength==-1) {
    391         srcLength=u_strlen(src);
    392     }
    393 
    394     /*
    395      * Handle the basic code points:
    396      * Let basicLength be the number of input code points
    397      * before the last delimiter, or 0 if there is none,
    398      * then copy the first basicLength code points to the output.
    399      *
    400      * The two following loops iterate backward.
    401      */
    402     for(j=srcLength; j>0;) {
    403         if(src[--j]==DELIMITER) {
    404             break;
    405         }
    406     }
    407     destLength=basicLength=destCPCount=j;
    408 
    409     while(j>0) {
    410         b=src[--j];
    411         if(!IS_BASIC(b)) {
    412             *pErrorCode=U_INVALID_CHAR_FOUND;
    413             return 0;
    414         }
    415 
    416         if(j<destCapacity) {
    417             dest[j]=(UChar)b;
    418 
    419             if(caseFlags!=NULL) {
    420                 caseFlags[j]=IS_BASIC_UPPERCASE(b);
    421             }
    422         }
    423     }
    424 
    425     /* Initialize the state: */
    426     n=INITIAL_N;
    427     i=0;
    428     bias=INITIAL_BIAS;
    429     firstSupplementaryIndex=1000000000;
    430 
    431     /*
    432      * Main decoding loop:
    433      * Start just after the last delimiter if any
    434      * basic code points were copied; start at the beginning otherwise.
    435      */
    436     for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
    437         /*
    438          * in is the index of the next character to be consumed, and
    439          * destCPCount is the number of code points in the output array.
    440          *
    441          * Decode a generalized variable-length integer into delta,
    442          * which gets added to i.  The overflow checking is easier
    443          * if we increase i as we go, then subtract off its starting
    444          * value at the end to obtain delta.
    445          */
    446         for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
    447             if(in>=srcLength) {
    448                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    449                 return 0;
    450             }
    451 
    452             digit=basicToDigit[(uint8_t)src[in++]];
    453             if(digit<0) {
    454                 *pErrorCode=U_INVALID_CHAR_FOUND;
    455                 return 0;
    456             }
    457             if(digit>(0x7fffffff-i)/w) {
    458                 /* integer overflow */
    459                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    460                 return 0;
    461             }
    462 
    463             i+=digit*w;
    464             /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
    465             t=k-bias;
    466             if(t<TMIN) {
    467                 t=TMIN;
    468             } else if(t>TMAX) {
    469                 t=TMAX;
    470             }
    471             */
    472             t=k-bias;
    473             if(t<TMIN) {
    474                 t=TMIN;
    475             } else if(k>=(bias+TMAX)) {
    476                 t=TMAX;
    477             }
    478             if(digit<t) {
    479                 break;
    480             }
    481 
    482             if(w>0x7fffffff/(BASE-t)) {
    483                 /* integer overflow */
    484                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    485                 return 0;
    486             }
    487             w*=BASE-t;
    488         }
    489 
    490         /*
    491          * Modification from sample code:
    492          * Increments destCPCount here,
    493          * where needed instead of in for() loop tail.
    494          */
    495         ++destCPCount;
    496         bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
    497 
    498         /*
    499          * i was supposed to wrap around from (incremented) destCPCount to 0,
    500          * incrementing n each time, so we'll fix that now:
    501          */
    502         if(i/destCPCount>(0x7fffffff-n)) {
    503             /* integer overflow */
    504             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    505             return 0;
    506         }
    507 
    508         n+=i/destCPCount;
    509         i%=destCPCount;
    510         /* not needed for Punycode: */
    511         /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
    512 
    513         if(n>0x10ffff || UTF_IS_SURROGATE(n)) {
    514             /* Unicode code point overflow */
    515             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    516             return 0;
    517         }
    518 
    519         /* Insert n at position i of the output: */
    520         cpLength=UTF_CHAR_LENGTH(n);
    521         if((destLength+cpLength)<=destCapacity) {
    522             int32_t codeUnitIndex;
    523 
    524             /*
    525              * Handle indexes when supplementary code points are present.
    526              *
    527              * In almost all cases, there will be only BMP code points before i
    528              * and even in the entire string.
    529              * This is handled with the same efficiency as with UTF-32.
    530              *
    531              * Only the rare cases with supplementary code points are handled
    532              * more slowly - but not too bad since this is an insertion anyway.
    533              */
    534             if(i<=firstSupplementaryIndex) {
    535                 codeUnitIndex=i;
    536                 if(cpLength>1) {
    537                     firstSupplementaryIndex=codeUnitIndex;
    538                 } else {
    539                     ++firstSupplementaryIndex;
    540                 }
    541             } else {
    542                 codeUnitIndex=firstSupplementaryIndex;
    543                 UTF_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
    544             }
    545 
    546             /* use the UChar index codeUnitIndex instead of the code point index i */
    547             if(codeUnitIndex<destLength) {
    548                 uprv_memmove(dest+codeUnitIndex+cpLength,
    549                              dest+codeUnitIndex,
    550                              (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
    551                 if(caseFlags!=NULL) {
    552                     uprv_memmove(caseFlags+codeUnitIndex+cpLength,
    553                                  caseFlags+codeUnitIndex,
    554                                  destLength-codeUnitIndex);
    555                 }
    556             }
    557             if(cpLength==1) {
    558                 /* BMP, insert one code unit */
    559                 dest[codeUnitIndex]=(UChar)n;
    560             } else {
    561                 /* supplementary character, insert two code units */
    562                 dest[codeUnitIndex]=UTF16_LEAD(n);
    563                 dest[codeUnitIndex+1]=UTF16_TRAIL(n);
    564             }
    565             if(caseFlags!=NULL) {
    566                 /* Case of last character determines uppercase flag: */
    567                 caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
    568                 if(cpLength==2) {
    569                     caseFlags[codeUnitIndex+1]=FALSE;
    570                 }
    571             }
    572         }
    573         destLength+=cpLength;
    574         ++i;
    575     }
    576 
    577     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
    578 }
    579 
    580 /* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
    581 
    582 #endif /* #if !UCONFIG_NO_IDNA */
    583