Home | History | Annotate | Download | only in common
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2002-2011, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  punycode.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2002jan31
     16 *   created by: Markus W. Scherer
     17 */
     18 
     19 
     20 /* This ICU code derived from: */
     21 /*
     22 punycode.c 0.4.0 (2001-Nov-17-Sat)
     23 http://www.cs.berkeley.edu/~amc/idn/
     24 Adam M. Costello
     25 http://www.nicemice.net/amc/
     26 
     27 Disclaimer and license
     28 
     29     Regarding this entire document or any portion of it (including
     30     the pseudocode and C code), the author makes no guarantees and
     31     is not responsible for any damage resulting from its use.  The
     32     author grants irrevocable permission to anyone to use, modify,
     33     and distribute it in any way that does not diminish the rights
     34     of anyone else to use, modify, and distribute it, provided that
     35     redistributed derivative works do not contain misleading author or
     36     version information.  Derivative works need not be licensed under
     37     similar terms.
     38 */
     39 /*
     40  * ICU modifications:
     41  * - ICU data types and coding conventions
     42  * - ICU string buffer handling with implicit source lengths
     43  *   and destination preflighting
     44  * - UTF-16 handling
     45  */
     46 
     47 #include "unicode/utypes.h"
     48 
     49 #if !UCONFIG_NO_IDNA
     50 
     51 #include "unicode/ustring.h"
     52 #include "unicode/utf.h"
     53 #include "unicode/utf16.h"
     54 #include "ustr_imp.h"
     55 #include "cstring.h"
     56 #include "cmemory.h"
     57 #include "punycode.h"
     58 #include "uassert.h"
     59 
     60 
     61 /* Punycode ----------------------------------------------------------------- */
     62 
     63 /* Punycode parameters for Bootstring */
     64 #define BASE            36
     65 #define TMIN            1
     66 #define TMAX            26
     67 #define SKEW            38
     68 #define DAMP            700
     69 #define INITIAL_BIAS    72
     70 #define INITIAL_N       0x80
     71 
     72 /* "Basic" Unicode/ASCII code points */
     73 #define _HYPHEN         0X2d
     74 #define DELIMITER       _HYPHEN
     75 
     76 #define _ZERO_          0X30
     77 #define _NINE           0x39
     78 
     79 #define _SMALL_A        0X61
     80 #define _SMALL_Z        0X7a
     81 
     82 #define _CAPITAL_A      0X41
     83 #define _CAPITAL_Z      0X5a
     84 
     85 #define IS_BASIC(c) ((c)<0x80)
     86 #define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
     87 
     88 /**
     89  * digitToBasic() returns the basic code point whose value
     90  * (when used for representing integers) is d, which must be in the
     91  * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
     92  * nonzero, in which case the uppercase form is used.
     93  */
     94 static inline char
     95 digitToBasic(int32_t digit, UBool uppercase) {
     96     /*  0..25 map to ASCII a..z or A..Z */
     97     /* 26..35 map to ASCII 0..9         */
     98     if(digit<26) {
     99         if(uppercase) {
    100             return (char)(_CAPITAL_A+digit);
    101         } else {
    102             return (char)(_SMALL_A+digit);
    103         }
    104     } else {
    105         return (char)((_ZERO_-26)+digit);
    106     }
    107 }
    108 
    109 /**
    110  * basicToDigit[] contains the numeric value of a basic code
    111  * point (for use in representing integers) in the range 0 to
    112  * BASE-1, or -1 if b is does not represent a value.
    113  */
    114 static const int8_t
    115 basicToDigit[256]={
    116     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    117     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    118 
    119     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    120     26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
    121 
    122     -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    123     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
    124 
    125     -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    126     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
    127 
    128     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    129     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    130 
    131     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    132     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    133 
    134     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    135     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    136 
    137     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    138     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
    139 };
    140 
    141 static inline char
    142 asciiCaseMap(char b, UBool uppercase) {
    143     if(uppercase) {
    144         if(_SMALL_A<=b && b<=_SMALL_Z) {
    145             b-=(_SMALL_A-_CAPITAL_A);
    146         }
    147     } else {
    148         if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
    149             b+=(_SMALL_A-_CAPITAL_A);
    150         }
    151     }
    152     return b;
    153 }
    154 
    155 /* Punycode-specific Bootstring code ---------------------------------------- */
    156 
    157 /*
    158  * The following code omits the {parts} of the pseudo-algorithm in the spec
    159  * that are not used with the Punycode parameter set.
    160  */
    161 
    162 /* Bias adaptation function. */
    163 static int32_t
    164 adaptBias(int32_t delta, int32_t length, UBool firstTime) {
    165     int32_t count;
    166 
    167     if(firstTime) {
    168         delta/=DAMP;
    169     } else {
    170         delta/=2;
    171     }
    172 
    173     delta+=delta/length;
    174     for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
    175         delta/=(BASE-TMIN);
    176     }
    177 
    178     return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
    179 }
    180 
    181 #define MAX_CP_COUNT    200
    182 
    183 U_CFUNC int32_t
    184 u_strToPunycode(const UChar *src, int32_t srcLength,
    185                 UChar *dest, int32_t destCapacity,
    186                 const UBool *caseFlags,
    187                 UErrorCode *pErrorCode) {
    188 
    189     int32_t cpBuffer[MAX_CP_COUNT];
    190     int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
    191     UChar c, c2;
    192 
    193     /* argument checking */
    194     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    195         return 0;
    196     }
    197 
    198     if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
    199         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    200         return 0;
    201     }
    202 
    203     /*
    204      * Handle the basic code points and
    205      * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
    206      */
    207     srcCPCount=destLength=0;
    208     if(srcLength==-1) {
    209         /* NUL-terminated input */
    210         for(j=0; /* no condition */; ++j) {
    211             if((c=src[j])==0) {
    212                 break;
    213             }
    214             if(srcCPCount==MAX_CP_COUNT) {
    215                 /* too many input code points */
    216                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    217                 return 0;
    218             }
    219             if(IS_BASIC(c)) {
    220                 cpBuffer[srcCPCount++]=0;
    221                 if(destLength<destCapacity) {
    222                     dest[destLength]=
    223                         caseFlags!=NULL ?
    224                             asciiCaseMap((char)c, caseFlags[j]) :
    225                             (char)c;
    226                 }
    227                 ++destLength;
    228             } else {
    229                 n=(caseFlags!=NULL && caseFlags[j])<<31L;
    230                 if(U16_IS_SINGLE(c)) {
    231                     n|=c;
    232                 } else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
    233                     ++j;
    234                     n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
    235                 } else {
    236                     /* error: unmatched surrogate */
    237                     *pErrorCode=U_INVALID_CHAR_FOUND;
    238                     return 0;
    239                 }
    240                 cpBuffer[srcCPCount++]=n;
    241             }
    242         }
    243     } else {
    244         /* length-specified input */
    245         for(j=0; j<srcLength; ++j) {
    246             if(srcCPCount==MAX_CP_COUNT) {
    247                 /* too many input code points */
    248                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    249                 return 0;
    250             }
    251             c=src[j];
    252             if(IS_BASIC(c)) {
    253                 cpBuffer[srcCPCount++]=0;
    254                 if(destLength<destCapacity) {
    255                     dest[destLength]=
    256                         caseFlags!=NULL ?
    257                             asciiCaseMap((char)c, caseFlags[j]) :
    258                             (char)c;
    259                 }
    260                 ++destLength;
    261             } else {
    262                 n=(caseFlags!=NULL && caseFlags[j])<<31L;
    263                 if(U16_IS_SINGLE(c)) {
    264                     n|=c;
    265                 } else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
    266                     ++j;
    267                     n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
    268                 } else {
    269                     /* error: unmatched surrogate */
    270                     *pErrorCode=U_INVALID_CHAR_FOUND;
    271                     return 0;
    272                 }
    273                 cpBuffer[srcCPCount++]=n;
    274             }
    275         }
    276     }
    277 
    278     /* Finish the basic string - if it is not empty - with a delimiter. */
    279     basicLength=destLength;
    280     if(basicLength>0) {
    281         if(destLength<destCapacity) {
    282             dest[destLength]=DELIMITER;
    283         }
    284         ++destLength;
    285     }
    286 
    287     /*
    288      * handledCPCount is the number of code points that have been handled
    289      * basicLength is the number of basic code points
    290      * destLength is the number of chars that have been output
    291      */
    292 
    293     /* Initialize the state: */
    294     n=INITIAL_N;
    295     delta=0;
    296     bias=INITIAL_BIAS;
    297 
    298     /* Main encoding loop: */
    299     for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
    300         /*
    301          * All non-basic code points < n have been handled already.
    302          * Find the next larger one:
    303          */
    304         for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
    305             q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
    306             if(n<=q && q<m) {
    307                 m=q;
    308             }
    309         }
    310 
    311         /*
    312          * Increase delta enough to advance the decoder's
    313          * <n,i> state to <m,0>, but guard against overflow:
    314          */
    315         if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
    316             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
    317             return 0;
    318         }
    319         delta+=(m-n)*(handledCPCount+1);
    320         n=m;
    321 
    322         /* Encode a sequence of same code points n */
    323         for(j=0; j<srcCPCount; ++j) {
    324             q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
    325             if(q<n) {
    326                 ++delta;
    327             } else if(q==n) {
    328                 /* Represent delta as a generalized variable-length integer: */
    329                 for(q=delta, k=BASE; /* no condition */; k+=BASE) {
    330 
    331                     /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
    332 
    333                     t=k-bias;
    334                     if(t<TMIN) {
    335                         t=TMIN;
    336                     } else if(t>TMAX) {
    337                         t=TMAX;
    338                     }
    339                     */
    340 
    341                     t=k-bias;
    342                     if(t<TMIN) {
    343                         t=TMIN;
    344                     } else if(k>=(bias+TMAX)) {
    345                         t=TMAX;
    346                     }
    347 
    348                     if(q<t) {
    349                         break;
    350                     }
    351 
    352                     if(destLength<destCapacity) {
    353                         dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
    354                     }
    355                     ++destLength;
    356                     q=(q-t)/(BASE-t);
    357                 }
    358 
    359                 if(destLength<destCapacity) {
    360                     dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
    361                 }
    362                 ++destLength;
    363                 bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
    364                 delta=0;
    365                 ++handledCPCount;
    366             }
    367         }
    368 
    369         ++delta;
    370         ++n;
    371     }
    372 
    373     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
    374 }
    375 
    376 U_CFUNC int32_t
    377 u_strFromPunycode(const UChar *src, int32_t srcLength,
    378                   UChar *dest, int32_t destCapacity,
    379                   UBool *caseFlags,
    380                   UErrorCode *pErrorCode) {
    381     int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
    382             destCPCount, firstSupplementaryIndex, cpLength;
    383     UChar b;
    384 
    385     /* argument checking */
    386     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
    387         return 0;
    388     }
    389 
    390     if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
    391         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    392         return 0;
    393     }
    394 
    395     if(srcLength==-1) {
    396         srcLength=u_strlen(src);
    397     }
    398 
    399     /*
    400      * Handle the basic code points:
    401      * Let basicLength be the number of input code points
    402      * before the last delimiter, or 0 if there is none,
    403      * then copy the first basicLength code points to the output.
    404      *
    405      * The two following loops iterate backward.
    406      */
    407     for(j=srcLength; j>0;) {
    408         if(src[--j]==DELIMITER) {
    409             break;
    410         }
    411     }
    412     destLength=basicLength=destCPCount=j;
    413     U_ASSERT(destLength>=0);
    414 
    415     while(j>0) {
    416         b=src[--j];
    417         if(!IS_BASIC(b)) {
    418             *pErrorCode=U_INVALID_CHAR_FOUND;
    419             return 0;
    420         }
    421 
    422         if(j<destCapacity) {
    423             dest[j]=(UChar)b;
    424 
    425             if(caseFlags!=NULL) {
    426                 caseFlags[j]=IS_BASIC_UPPERCASE(b);
    427             }
    428         }
    429     }
    430 
    431     /* Initialize the state: */
    432     n=INITIAL_N;
    433     i=0;
    434     bias=INITIAL_BIAS;
    435     firstSupplementaryIndex=1000000000;
    436 
    437     /*
    438      * Main decoding loop:
    439      * Start just after the last delimiter if any
    440      * basic code points were copied; start at the beginning otherwise.
    441      */
    442     for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
    443         /*
    444          * in is the index of the next character to be consumed, and
    445          * destCPCount is the number of code points in the output array.
    446          *
    447          * Decode a generalized variable-length integer into delta,
    448          * which gets added to i.  The overflow checking is easier
    449          * if we increase i as we go, then subtract off its starting
    450          * value at the end to obtain delta.
    451          */
    452         for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
    453             if(in>=srcLength) {
    454                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    455                 return 0;
    456             }
    457 
    458             digit=basicToDigit[(uint8_t)src[in++]];
    459             if(digit<0) {
    460                 *pErrorCode=U_INVALID_CHAR_FOUND;
    461                 return 0;
    462             }
    463             if(digit>(0x7fffffff-i)/w) {
    464                 /* integer overflow */
    465                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    466                 return 0;
    467             }
    468 
    469             i+=digit*w;
    470             /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
    471             t=k-bias;
    472             if(t<TMIN) {
    473                 t=TMIN;
    474             } else if(t>TMAX) {
    475                 t=TMAX;
    476             }
    477             */
    478             t=k-bias;
    479             if(t<TMIN) {
    480                 t=TMIN;
    481             } else if(k>=(bias+TMAX)) {
    482                 t=TMAX;
    483             }
    484             if(digit<t) {
    485                 break;
    486             }
    487 
    488             if(w>0x7fffffff/(BASE-t)) {
    489                 /* integer overflow */
    490                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    491                 return 0;
    492             }
    493             w*=BASE-t;
    494         }
    495 
    496         /*
    497          * Modification from sample code:
    498          * Increments destCPCount here,
    499          * where needed instead of in for() loop tail.
    500          */
    501         ++destCPCount;
    502         bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
    503 
    504         /*
    505          * i was supposed to wrap around from (incremented) destCPCount to 0,
    506          * incrementing n each time, so we'll fix that now:
    507          */
    508         if(i/destCPCount>(0x7fffffff-n)) {
    509             /* integer overflow */
    510             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    511             return 0;
    512         }
    513 
    514         n+=i/destCPCount;
    515         i%=destCPCount;
    516         /* not needed for Punycode: */
    517         /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
    518 
    519         if(n>0x10ffff || U_IS_SURROGATE(n)) {
    520             /* Unicode code point overflow */
    521             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    522             return 0;
    523         }
    524 
    525         /* Insert n at position i of the output: */
    526         cpLength=U16_LENGTH(n);
    527         if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
    528             int32_t codeUnitIndex;
    529 
    530             /*
    531              * Handle indexes when supplementary code points are present.
    532              *
    533              * In almost all cases, there will be only BMP code points before i
    534              * and even in the entire string.
    535              * This is handled with the same efficiency as with UTF-32.
    536              *
    537              * Only the rare cases with supplementary code points are handled
    538              * more slowly - but not too bad since this is an insertion anyway.
    539              */
    540             if(i<=firstSupplementaryIndex) {
    541                 codeUnitIndex=i;
    542                 if(cpLength>1) {
    543                     firstSupplementaryIndex=codeUnitIndex;
    544                 } else {
    545                     ++firstSupplementaryIndex;
    546                 }
    547             } else {
    548                 codeUnitIndex=firstSupplementaryIndex;
    549                 U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
    550             }
    551 
    552             /* use the UChar index codeUnitIndex instead of the code point index i */
    553             if(codeUnitIndex<destLength) {
    554                 uprv_memmove(dest+codeUnitIndex+cpLength,
    555                              dest+codeUnitIndex,
    556                              (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
    557                 if(caseFlags!=NULL) {
    558                     uprv_memmove(caseFlags+codeUnitIndex+cpLength,
    559                                  caseFlags+codeUnitIndex,
    560                                  destLength-codeUnitIndex);
    561                 }
    562             }
    563             if(cpLength==1) {
    564                 /* BMP, insert one code unit */
    565                 dest[codeUnitIndex]=(UChar)n;
    566             } else {
    567                 /* supplementary character, insert two code units */
    568                 dest[codeUnitIndex]=U16_LEAD(n);
    569                 dest[codeUnitIndex+1]=U16_TRAIL(n);
    570             }
    571             if(caseFlags!=NULL) {
    572                 /* Case of last character determines uppercase flag: */
    573                 caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
    574                 if(cpLength==2) {
    575                     caseFlags[codeUnitIndex+1]=FALSE;
    576                 }
    577             }
    578         }
    579         destLength+=cpLength;
    580         U_ASSERT(destLength>=0);
    581         ++i;
    582     }
    583 
    584     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
    585 }
    586 
    587 /* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
    588 
    589 #endif /* #if !UCONFIG_NO_IDNA */
    590