1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2002-2011, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: punycode.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002jan31 14 * created by: Markus W. Scherer 15 */ 16 17 18 /* This ICU code derived from: */ 19 /* 20 punycode.c 0.4.0 (2001-Nov-17-Sat) 21 http://www.cs.berkeley.edu/~amc/idn/ 22 Adam M. Costello 23 http://www.nicemice.net/amc/ 24 25 Disclaimer and license 26 27 Regarding this entire document or any portion of it (including 28 the pseudocode and C code), the author makes no guarantees and 29 is not responsible for any damage resulting from its use. The 30 author grants irrevocable permission to anyone to use, modify, 31 and distribute it in any way that does not diminish the rights 32 of anyone else to use, modify, and distribute it, provided that 33 redistributed derivative works do not contain misleading author or 34 version information. Derivative works need not be licensed under 35 similar terms. 36 */ 37 /* 38 * ICU modifications: 39 * - ICU data types and coding conventions 40 * - ICU string buffer handling with implicit source lengths 41 * and destination preflighting 42 * - UTF-16 handling 43 */ 44 45 #include "unicode/utypes.h" 46 47 #if !UCONFIG_NO_IDNA 48 49 #include "unicode/ustring.h" 50 #include "unicode/utf.h" 51 #include "unicode/utf16.h" 52 #include "ustr_imp.h" 53 #include "cstring.h" 54 #include "cmemory.h" 55 #include "punycode.h" 56 #include "uassert.h" 57 58 59 /* Punycode ----------------------------------------------------------------- */ 60 61 /* Punycode parameters for Bootstring */ 62 #define BASE 36 63 #define TMIN 1 64 #define TMAX 26 65 #define SKEW 38 66 #define DAMP 700 67 #define INITIAL_BIAS 72 68 #define INITIAL_N 0x80 69 70 /* "Basic" Unicode/ASCII code points */ 71 #define _HYPHEN 0X2d 72 #define DELIMITER _HYPHEN 73 74 #define _ZERO_ 0X30 75 #define _NINE 0x39 76 77 #define _SMALL_A 0X61 78 #define _SMALL_Z 0X7a 79 80 #define _CAPITAL_A 0X41 81 #define _CAPITAL_Z 0X5a 82 83 #define IS_BASIC(c) ((c)<0x80) 84 #define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z) 85 86 /** 87 * digitToBasic() returns the basic code point whose value 88 * (when used for representing integers) is d, which must be in the 89 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is 90 * nonzero, in which case the uppercase form is used. 91 */ 92 static inline char 93 digitToBasic(int32_t digit, UBool uppercase) { 94 /* 0..25 map to ASCII a..z or A..Z */ 95 /* 26..35 map to ASCII 0..9 */ 96 if(digit<26) { 97 if(uppercase) { 98 return (char)(_CAPITAL_A+digit); 99 } else { 100 return (char)(_SMALL_A+digit); 101 } 102 } else { 103 return (char)((_ZERO_-26)+digit); 104 } 105 } 106 107 /** 108 * basicToDigit[] contains the numeric value of a basic code 109 * point (for use in representing integers) in the range 0 to 110 * BASE-1, or -1 if b is does not represent a value. 111 */ 112 static const int8_t 113 basicToDigit[256]={ 114 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 115 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 116 117 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 118 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, 119 120 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 121 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, 122 123 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 124 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, 125 126 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 127 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 128 129 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 130 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 131 132 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 133 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 134 135 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 136 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 137 }; 138 139 static inline char 140 asciiCaseMap(char b, UBool uppercase) { 141 if(uppercase) { 142 if(_SMALL_A<=b && b<=_SMALL_Z) { 143 b-=(_SMALL_A-_CAPITAL_A); 144 } 145 } else { 146 if(_CAPITAL_A<=b && b<=_CAPITAL_Z) { 147 b+=(_SMALL_A-_CAPITAL_A); 148 } 149 } 150 return b; 151 } 152 153 /* Punycode-specific Bootstring code ---------------------------------------- */ 154 155 /* 156 * The following code omits the {parts} of the pseudo-algorithm in the spec 157 * that are not used with the Punycode parameter set. 158 */ 159 160 /* Bias adaptation function. */ 161 static int32_t 162 adaptBias(int32_t delta, int32_t length, UBool firstTime) { 163 int32_t count; 164 165 if(firstTime) { 166 delta/=DAMP; 167 } else { 168 delta/=2; 169 } 170 171 delta+=delta/length; 172 for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) { 173 delta/=(BASE-TMIN); 174 } 175 176 return count+(((BASE-TMIN+1)*delta)/(delta+SKEW)); 177 } 178 179 #define MAX_CP_COUNT 200 180 181 U_CFUNC int32_t 182 u_strToPunycode(const UChar *src, int32_t srcLength, 183 UChar *dest, int32_t destCapacity, 184 const UBool *caseFlags, 185 UErrorCode *pErrorCode) { 186 187 int32_t cpBuffer[MAX_CP_COUNT]; 188 int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount; 189 UChar c, c2; 190 191 /* argument checking */ 192 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 193 return 0; 194 } 195 196 if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) { 197 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 198 return 0; 199 } 200 201 /* 202 * Handle the basic code points and 203 * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit): 204 */ 205 srcCPCount=destLength=0; 206 if(srcLength==-1) { 207 /* NUL-terminated input */ 208 for(j=0; /* no condition */; ++j) { 209 if((c=src[j])==0) { 210 break; 211 } 212 if(srcCPCount==MAX_CP_COUNT) { 213 /* too many input code points */ 214 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 215 return 0; 216 } 217 if(IS_BASIC(c)) { 218 cpBuffer[srcCPCount++]=0; 219 if(destLength<destCapacity) { 220 dest[destLength]= 221 caseFlags!=NULL ? 222 asciiCaseMap((char)c, caseFlags[j]) : 223 (char)c; 224 } 225 ++destLength; 226 } else { 227 n=(caseFlags!=NULL && caseFlags[j])<<31L; 228 if(U16_IS_SINGLE(c)) { 229 n|=c; 230 } else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) { 231 ++j; 232 n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2); 233 } else { 234 /* error: unmatched surrogate */ 235 *pErrorCode=U_INVALID_CHAR_FOUND; 236 return 0; 237 } 238 cpBuffer[srcCPCount++]=n; 239 } 240 } 241 } else { 242 /* length-specified input */ 243 for(j=0; j<srcLength; ++j) { 244 if(srcCPCount==MAX_CP_COUNT) { 245 /* too many input code points */ 246 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 247 return 0; 248 } 249 c=src[j]; 250 if(IS_BASIC(c)) { 251 cpBuffer[srcCPCount++]=0; 252 if(destLength<destCapacity) { 253 dest[destLength]= 254 caseFlags!=NULL ? 255 asciiCaseMap((char)c, caseFlags[j]) : 256 (char)c; 257 } 258 ++destLength; 259 } else { 260 n=(caseFlags!=NULL && caseFlags[j])<<31L; 261 if(U16_IS_SINGLE(c)) { 262 n|=c; 263 } else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) { 264 ++j; 265 n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2); 266 } else { 267 /* error: unmatched surrogate */ 268 *pErrorCode=U_INVALID_CHAR_FOUND; 269 return 0; 270 } 271 cpBuffer[srcCPCount++]=n; 272 } 273 } 274 } 275 276 /* Finish the basic string - if it is not empty - with a delimiter. */ 277 basicLength=destLength; 278 if(basicLength>0) { 279 if(destLength<destCapacity) { 280 dest[destLength]=DELIMITER; 281 } 282 ++destLength; 283 } 284 285 /* 286 * handledCPCount is the number of code points that have been handled 287 * basicLength is the number of basic code points 288 * destLength is the number of chars that have been output 289 */ 290 291 /* Initialize the state: */ 292 n=INITIAL_N; 293 delta=0; 294 bias=INITIAL_BIAS; 295 296 /* Main encoding loop: */ 297 for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) { 298 /* 299 * All non-basic code points < n have been handled already. 300 * Find the next larger one: 301 */ 302 for(m=0x7fffffff, j=0; j<srcCPCount; ++j) { 303 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */ 304 if(n<=q && q<m) { 305 m=q; 306 } 307 } 308 309 /* 310 * Increase delta enough to advance the decoder's 311 * <n,i> state to <m,0>, but guard against overflow: 312 */ 313 if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) { 314 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 315 return 0; 316 } 317 delta+=(m-n)*(handledCPCount+1); 318 n=m; 319 320 /* Encode a sequence of same code points n */ 321 for(j=0; j<srcCPCount; ++j) { 322 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */ 323 if(q<n) { 324 ++delta; 325 } else if(q==n) { 326 /* Represent delta as a generalized variable-length integer: */ 327 for(q=delta, k=BASE; /* no condition */; k+=BASE) { 328 329 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt 330 331 t=k-bias; 332 if(t<TMIN) { 333 t=TMIN; 334 } else if(t>TMAX) { 335 t=TMAX; 336 } 337 */ 338 339 t=k-bias; 340 if(t<TMIN) { 341 t=TMIN; 342 } else if(k>=(bias+TMAX)) { 343 t=TMAX; 344 } 345 346 if(q<t) { 347 break; 348 } 349 350 if(destLength<destCapacity) { 351 dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0); 352 } 353 ++destLength; 354 q=(q-t)/(BASE-t); 355 } 356 357 if(destLength<destCapacity) { 358 dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0)); 359 } 360 ++destLength; 361 bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength)); 362 delta=0; 363 ++handledCPCount; 364 } 365 } 366 367 ++delta; 368 ++n; 369 } 370 371 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode); 372 } 373 374 U_CFUNC int32_t 375 u_strFromPunycode(const UChar *src, int32_t srcLength, 376 UChar *dest, int32_t destCapacity, 377 UBool *caseFlags, 378 UErrorCode *pErrorCode) { 379 int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t, 380 destCPCount, firstSupplementaryIndex, cpLength; 381 UChar b; 382 383 /* argument checking */ 384 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 385 return 0; 386 } 387 388 if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) { 389 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 390 return 0; 391 } 392 393 if(srcLength==-1) { 394 srcLength=u_strlen(src); 395 } 396 397 /* 398 * Handle the basic code points: 399 * Let basicLength be the number of input code points 400 * before the last delimiter, or 0 if there is none, 401 * then copy the first basicLength code points to the output. 402 * 403 * The two following loops iterate backward. 404 */ 405 for(j=srcLength; j>0;) { 406 if(src[--j]==DELIMITER) { 407 break; 408 } 409 } 410 destLength=basicLength=destCPCount=j; 411 U_ASSERT(destLength>=0); 412 413 while(j>0) { 414 b=src[--j]; 415 if(!IS_BASIC(b)) { 416 *pErrorCode=U_INVALID_CHAR_FOUND; 417 return 0; 418 } 419 420 if(j<destCapacity) { 421 dest[j]=(UChar)b; 422 423 if(caseFlags!=NULL) { 424 caseFlags[j]=IS_BASIC_UPPERCASE(b); 425 } 426 } 427 } 428 429 /* Initialize the state: */ 430 n=INITIAL_N; 431 i=0; 432 bias=INITIAL_BIAS; 433 firstSupplementaryIndex=1000000000; 434 435 /* 436 * Main decoding loop: 437 * Start just after the last delimiter if any 438 * basic code points were copied; start at the beginning otherwise. 439 */ 440 for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) { 441 /* 442 * in is the index of the next character to be consumed, and 443 * destCPCount is the number of code points in the output array. 444 * 445 * Decode a generalized variable-length integer into delta, 446 * which gets added to i. The overflow checking is easier 447 * if we increase i as we go, then subtract off its starting 448 * value at the end to obtain delta. 449 */ 450 for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) { 451 if(in>=srcLength) { 452 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 453 return 0; 454 } 455 456 digit=basicToDigit[(uint8_t)src[in++]]; 457 if(digit<0) { 458 *pErrorCode=U_INVALID_CHAR_FOUND; 459 return 0; 460 } 461 if(digit>(0x7fffffff-i)/w) { 462 /* integer overflow */ 463 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 464 return 0; 465 } 466 467 i+=digit*w; 468 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt 469 t=k-bias; 470 if(t<TMIN) { 471 t=TMIN; 472 } else if(t>TMAX) { 473 t=TMAX; 474 } 475 */ 476 t=k-bias; 477 if(t<TMIN) { 478 t=TMIN; 479 } else if(k>=(bias+TMAX)) { 480 t=TMAX; 481 } 482 if(digit<t) { 483 break; 484 } 485 486 if(w>0x7fffffff/(BASE-t)) { 487 /* integer overflow */ 488 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 489 return 0; 490 } 491 w*=BASE-t; 492 } 493 494 /* 495 * Modification from sample code: 496 * Increments destCPCount here, 497 * where needed instead of in for() loop tail. 498 */ 499 ++destCPCount; 500 bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0)); 501 502 /* 503 * i was supposed to wrap around from (incremented) destCPCount to 0, 504 * incrementing n each time, so we'll fix that now: 505 */ 506 if(i/destCPCount>(0x7fffffff-n)) { 507 /* integer overflow */ 508 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 509 return 0; 510 } 511 512 n+=i/destCPCount; 513 i%=destCPCount; 514 /* not needed for Punycode: */ 515 /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */ 516 517 if(n>0x10ffff || U_IS_SURROGATE(n)) { 518 /* Unicode code point overflow */ 519 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 520 return 0; 521 } 522 523 /* Insert n at position i of the output: */ 524 cpLength=U16_LENGTH(n); 525 if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) { 526 int32_t codeUnitIndex; 527 528 /* 529 * Handle indexes when supplementary code points are present. 530 * 531 * In almost all cases, there will be only BMP code points before i 532 * and even in the entire string. 533 * This is handled with the same efficiency as with UTF-32. 534 * 535 * Only the rare cases with supplementary code points are handled 536 * more slowly - but not too bad since this is an insertion anyway. 537 */ 538 if(i<=firstSupplementaryIndex) { 539 codeUnitIndex=i; 540 if(cpLength>1) { 541 firstSupplementaryIndex=codeUnitIndex; 542 } else { 543 ++firstSupplementaryIndex; 544 } 545 } else { 546 codeUnitIndex=firstSupplementaryIndex; 547 U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex); 548 } 549 550 /* use the UChar index codeUnitIndex instead of the code point index i */ 551 if(codeUnitIndex<destLength) { 552 uprv_memmove(dest+codeUnitIndex+cpLength, 553 dest+codeUnitIndex, 554 (destLength-codeUnitIndex)*U_SIZEOF_UCHAR); 555 if(caseFlags!=NULL) { 556 uprv_memmove(caseFlags+codeUnitIndex+cpLength, 557 caseFlags+codeUnitIndex, 558 destLength-codeUnitIndex); 559 } 560 } 561 if(cpLength==1) { 562 /* BMP, insert one code unit */ 563 dest[codeUnitIndex]=(UChar)n; 564 } else { 565 /* supplementary character, insert two code units */ 566 dest[codeUnitIndex]=U16_LEAD(n); 567 dest[codeUnitIndex+1]=U16_TRAIL(n); 568 } 569 if(caseFlags!=NULL) { 570 /* Case of last character determines uppercase flag: */ 571 caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]); 572 if(cpLength==2) { 573 caseFlags[codeUnitIndex+1]=FALSE; 574 } 575 } 576 } 577 destLength+=cpLength; 578 U_ASSERT(destLength>=0); 579 ++i; 580 } 581 582 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode); 583 } 584 585 /* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */ 586 587 #endif /* #if !UCONFIG_NO_IDNA */ 588