1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2002-2003, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: punycode.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002jan31 14 * created by: Markus W. Scherer 15 */ 16 17 18 /* This ICU code derived from: */ 19 /* 20 punycode.c 0.4.0 (2001-Nov-17-Sat) 21 http://www.cs.berkeley.edu/~amc/idn/ 22 Adam M. Costello 23 http://www.nicemice.net/amc/ 24 25 Disclaimer and license 26 27 Regarding this entire document or any portion of it (including 28 the pseudocode and C code), the author makes no guarantees and 29 is not responsible for any damage resulting from its use. The 30 author grants irrevocable permission to anyone to use, modify, 31 and distribute it in any way that does not diminish the rights 32 of anyone else to use, modify, and distribute it, provided that 33 redistributed derivative works do not contain misleading author or 34 version information. Derivative works need not be licensed under 35 similar terms. 36 */ 37 /* 38 * ICU modifications: 39 * - ICU data types and coding conventions 40 * - ICU string buffer handling with implicit source lengths 41 * and destination preflighting 42 * - UTF-16 handling 43 */ 44 45 #include "unicode/utypes.h" 46 47 #if !UCONFIG_NO_IDNA 48 49 #include "ustr_imp.h" 50 #include "cstring.h" 51 #include "cmemory.h" 52 #include "punycode.h" 53 #include "unicode/ustring.h" 54 55 56 /* Punycode ----------------------------------------------------------------- */ 57 58 /* Punycode parameters for Bootstring */ 59 #define BASE 36 60 #define TMIN 1 61 #define TMAX 26 62 #define SKEW 38 63 #define DAMP 700 64 #define INITIAL_BIAS 72 65 #define INITIAL_N 0x80 66 67 /* "Basic" Unicode/ASCII code points */ 68 #define _HYPHEN 0X2d 69 #define DELIMITER _HYPHEN 70 71 #define _ZERO_ 0X30 72 #define _NINE 0x39 73 74 #define _SMALL_A 0X61 75 #define _SMALL_Z 0X7a 76 77 #define _CAPITAL_A 0X41 78 #define _CAPITAL_Z 0X5a 79 80 #define IS_BASIC(c) ((c)<0x80) 81 #define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z) 82 83 /** 84 * digitToBasic() returns the basic code point whose value 85 * (when used for representing integers) is d, which must be in the 86 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is 87 * nonzero, in which case the uppercase form is used. 88 */ 89 static U_INLINE char 90 digitToBasic(int32_t digit, UBool uppercase) { 91 /* 0..25 map to ASCII a..z or A..Z */ 92 /* 26..35 map to ASCII 0..9 */ 93 if(digit<26) { 94 if(uppercase) { 95 return (char)(_CAPITAL_A+digit); 96 } else { 97 return (char)(_SMALL_A+digit); 98 } 99 } else { 100 return (char)((_ZERO_-26)+digit); 101 } 102 } 103 104 /** 105 * basicToDigit[] contains the numeric value of a basic code 106 * point (for use in representing integers) in the range 0 to 107 * BASE-1, or -1 if b is does not represent a value. 108 */ 109 static const int8_t 110 basicToDigit[256]={ 111 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 112 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 113 114 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 115 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, 116 117 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 118 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, 119 120 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 121 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, 122 123 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 124 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 125 126 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 127 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 128 129 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 130 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 131 132 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 133 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 134 }; 135 136 static U_INLINE char 137 asciiCaseMap(char b, UBool uppercase) { 138 if(uppercase) { 139 if(_SMALL_A<=b && b<=_SMALL_Z) { 140 b-=(_SMALL_A-_CAPITAL_A); 141 } 142 } else { 143 if(_CAPITAL_A<=b && b<=_CAPITAL_Z) { 144 b+=(_SMALL_A-_CAPITAL_A); 145 } 146 } 147 return b; 148 } 149 150 /* Punycode-specific Bootstring code ---------------------------------------- */ 151 152 /* 153 * The following code omits the {parts} of the pseudo-algorithm in the spec 154 * that are not used with the Punycode parameter set. 155 */ 156 157 /* Bias adaptation function. */ 158 static int32_t 159 adaptBias(int32_t delta, int32_t length, UBool firstTime) { 160 int32_t count; 161 162 if(firstTime) { 163 delta/=DAMP; 164 } else { 165 delta/=2; 166 } 167 168 delta+=delta/length; 169 for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) { 170 delta/=(BASE-TMIN); 171 } 172 173 return count+(((BASE-TMIN+1)*delta)/(delta+SKEW)); 174 } 175 176 #define MAX_CP_COUNT 200 177 178 U_CFUNC int32_t 179 u_strToPunycode(const UChar *src, int32_t srcLength, 180 UChar *dest, int32_t destCapacity, 181 const UBool *caseFlags, 182 UErrorCode *pErrorCode) { 183 184 int32_t cpBuffer[MAX_CP_COUNT]; 185 int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount; 186 UChar c, c2; 187 188 /* argument checking */ 189 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 190 return 0; 191 } 192 193 if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) { 194 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 195 return 0; 196 } 197 198 /* 199 * Handle the basic code points and 200 * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit): 201 */ 202 srcCPCount=destLength=0; 203 if(srcLength==-1) { 204 /* NUL-terminated input */ 205 for(j=0; /* no condition */; ++j) { 206 if((c=src[j])==0) { 207 break; 208 } 209 if(srcCPCount==MAX_CP_COUNT) { 210 /* too many input code points */ 211 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 212 return 0; 213 } 214 if(IS_BASIC(c)) { 215 cpBuffer[srcCPCount++]=0; 216 if(destLength<destCapacity) { 217 dest[destLength]= 218 caseFlags!=NULL ? 219 asciiCaseMap((char)c, caseFlags[j]) : 220 (char)c; 221 } 222 ++destLength; 223 } else { 224 n=(caseFlags!=NULL && caseFlags[j])<<31L; 225 if(UTF_IS_SINGLE(c)) { 226 n|=c; 227 } else if(UTF_IS_LEAD(c) && UTF_IS_TRAIL(c2=src[j+1])) { 228 ++j; 229 n|=(int32_t)UTF16_GET_PAIR_VALUE(c, c2); 230 } else { 231 /* error: unmatched surrogate */ 232 *pErrorCode=U_INVALID_CHAR_FOUND; 233 return 0; 234 } 235 cpBuffer[srcCPCount++]=n; 236 } 237 } 238 } else { 239 /* length-specified input */ 240 for(j=0; j<srcLength; ++j) { 241 if(srcCPCount==MAX_CP_COUNT) { 242 /* too many input code points */ 243 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 244 return 0; 245 } 246 c=src[j]; 247 if(IS_BASIC(c)) { 248 if(destLength<destCapacity) { 249 cpBuffer[srcCPCount++]=0; 250 dest[destLength]= 251 caseFlags!=NULL ? 252 asciiCaseMap((char)c, caseFlags[j]) : 253 (char)c; 254 } 255 ++destLength; 256 } else { 257 n=(caseFlags!=NULL && caseFlags[j])<<31L; 258 if(UTF_IS_SINGLE(c)) { 259 n|=c; 260 } else if(UTF_IS_LEAD(c) && (j+1)<srcLength && UTF_IS_TRAIL(c2=src[j+1])) { 261 ++j; 262 n|=(int32_t)UTF16_GET_PAIR_VALUE(c, c2); 263 } else { 264 /* error: unmatched surrogate */ 265 *pErrorCode=U_INVALID_CHAR_FOUND; 266 return 0; 267 } 268 cpBuffer[srcCPCount++]=n; 269 } 270 } 271 } 272 273 /* Finish the basic string - if it is not empty - with a delimiter. */ 274 basicLength=destLength; 275 if(basicLength>0) { 276 if(destLength<destCapacity) { 277 dest[destLength]=DELIMITER; 278 } 279 ++destLength; 280 } 281 282 /* 283 * handledCPCount is the number of code points that have been handled 284 * basicLength is the number of basic code points 285 * destLength is the number of chars that have been output 286 */ 287 288 /* Initialize the state: */ 289 n=INITIAL_N; 290 delta=0; 291 bias=INITIAL_BIAS; 292 293 /* Main encoding loop: */ 294 for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) { 295 /* 296 * All non-basic code points < n have been handled already. 297 * Find the next larger one: 298 */ 299 for(m=0x7fffffff, j=0; j<srcCPCount; ++j) { 300 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */ 301 if(n<=q && q<m) { 302 m=q; 303 } 304 } 305 306 /* 307 * Increase delta enough to advance the decoder's 308 * <n,i> state to <m,0>, but guard against overflow: 309 */ 310 if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) { 311 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 312 return 0; 313 } 314 delta+=(m-n)*(handledCPCount+1); 315 n=m; 316 317 /* Encode a sequence of same code points n */ 318 for(j=0; j<srcCPCount; ++j) { 319 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */ 320 if(q<n) { 321 ++delta; 322 } else if(q==n) { 323 /* Represent delta as a generalized variable-length integer: */ 324 for(q=delta, k=BASE; /* no condition */; k+=BASE) { 325 326 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt 327 328 t=k-bias; 329 if(t<TMIN) { 330 t=TMIN; 331 } else if(t>TMAX) { 332 t=TMAX; 333 } 334 */ 335 336 t=k-bias; 337 if(t<TMIN) { 338 t=TMIN; 339 } else if(k>=(bias+TMAX)) { 340 t=TMAX; 341 } 342 343 if(q<t) { 344 break; 345 } 346 347 if(destLength<destCapacity) { 348 dest[destLength++]=digitToBasic(t+(q-t)%(BASE-t), 0); 349 } 350 q=(q-t)/(BASE-t); 351 } 352 353 if(destLength<destCapacity) { 354 dest[destLength++]=digitToBasic(q, (UBool)(cpBuffer[j]<0)); 355 } 356 bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength)); 357 delta=0; 358 ++handledCPCount; 359 } 360 } 361 362 ++delta; 363 ++n; 364 } 365 366 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode); 367 } 368 369 U_CFUNC int32_t 370 u_strFromPunycode(const UChar *src, int32_t srcLength, 371 UChar *dest, int32_t destCapacity, 372 UBool *caseFlags, 373 UErrorCode *pErrorCode) { 374 int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t, 375 destCPCount, firstSupplementaryIndex, cpLength; 376 UChar b; 377 378 /* argument checking */ 379 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 380 return 0; 381 } 382 383 if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) { 384 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 385 return 0; 386 } 387 388 if(srcLength==-1) { 389 srcLength=u_strlen(src); 390 } 391 392 /* 393 * Handle the basic code points: 394 * Let basicLength be the number of input code points 395 * before the last delimiter, or 0 if there is none, 396 * then copy the first basicLength code points to the output. 397 * 398 * The two following loops iterate backward. 399 */ 400 for(j=srcLength; j>0;) { 401 if(src[--j]==DELIMITER) { 402 break; 403 } 404 } 405 destLength=basicLength=destCPCount=j; 406 407 while(j>0) { 408 b=src[--j]; 409 if(!IS_BASIC(b)) { 410 *pErrorCode=U_INVALID_CHAR_FOUND; 411 return 0; 412 } 413 414 if(j<destCapacity) { 415 dest[j]=(UChar)b; 416 417 if(caseFlags!=NULL) { 418 caseFlags[j]=IS_BASIC_UPPERCASE(b); 419 } 420 } 421 } 422 423 /* Initialize the state: */ 424 n=INITIAL_N; 425 i=0; 426 bias=INITIAL_BIAS; 427 firstSupplementaryIndex=1000000000; 428 429 /* 430 * Main decoding loop: 431 * Start just after the last delimiter if any 432 * basic code points were copied; start at the beginning otherwise. 433 */ 434 for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) { 435 /* 436 * in is the index of the next character to be consumed, and 437 * destCPCount is the number of code points in the output array. 438 * 439 * Decode a generalized variable-length integer into delta, 440 * which gets added to i. The overflow checking is easier 441 * if we increase i as we go, then subtract off its starting 442 * value at the end to obtain delta. 443 */ 444 for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) { 445 if(in>=srcLength) { 446 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 447 return 0; 448 } 449 450 digit=basicToDigit[(uint8_t)src[in++]]; 451 if(digit<0) { 452 *pErrorCode=U_INVALID_CHAR_FOUND; 453 return 0; 454 } 455 if(digit>(0x7fffffff-i)/w) { 456 /* integer overflow */ 457 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 458 return 0; 459 } 460 461 i+=digit*w; 462 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt 463 t=k-bias; 464 if(t<TMIN) { 465 t=TMIN; 466 } else if(t>TMAX) { 467 t=TMAX; 468 } 469 */ 470 t=k-bias; 471 if(t<TMIN) { 472 t=TMIN; 473 } else if(k>=(bias+TMAX)) { 474 t=TMAX; 475 } 476 if(digit<t) { 477 break; 478 } 479 480 if(w>0x7fffffff/(BASE-t)) { 481 /* integer overflow */ 482 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 483 return 0; 484 } 485 w*=BASE-t; 486 } 487 488 /* 489 * Modification from sample code: 490 * Increments destCPCount here, 491 * where needed instead of in for() loop tail. 492 */ 493 ++destCPCount; 494 bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0)); 495 496 /* 497 * i was supposed to wrap around from (incremented) destCPCount to 0, 498 * incrementing n each time, so we'll fix that now: 499 */ 500 if(i/destCPCount>(0x7fffffff-n)) { 501 /* integer overflow */ 502 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 503 return 0; 504 } 505 506 n+=i/destCPCount; 507 i%=destCPCount; 508 /* not needed for Punycode: */ 509 /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */ 510 511 if(n>0x10ffff || UTF_IS_SURROGATE(n)) { 512 /* Unicode code point overflow */ 513 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 514 return 0; 515 } 516 517 /* Insert n at position i of the output: */ 518 cpLength=UTF_CHAR_LENGTH(n); 519 if((destLength+cpLength)<destCapacity) { 520 int32_t codeUnitIndex; 521 522 /* 523 * Handle indexes when supplementary code points are present. 524 * 525 * In almost all cases, there will be only BMP code points before i 526 * and even in the entire string. 527 * This is handled with the same efficiency as with UTF-32. 528 * 529 * Only the rare cases with supplementary code points are handled 530 * more slowly - but not too bad since this is an insertion anyway. 531 */ 532 if(i<=firstSupplementaryIndex) { 533 codeUnitIndex=i; 534 if(cpLength>1) { 535 firstSupplementaryIndex=codeUnitIndex; 536 } else { 537 ++firstSupplementaryIndex; 538 } 539 } else { 540 codeUnitIndex=firstSupplementaryIndex; 541 UTF_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex); 542 } 543 544 /* use the UChar index codeUnitIndex instead of the code point index i */ 545 if(codeUnitIndex<destLength) { 546 uprv_memmove(dest+codeUnitIndex+cpLength, 547 dest+codeUnitIndex, 548 (destLength-codeUnitIndex)*U_SIZEOF_UCHAR); 549 if(caseFlags!=NULL) { 550 uprv_memmove(caseFlags+codeUnitIndex+cpLength, 551 caseFlags+codeUnitIndex, 552 destLength-codeUnitIndex); 553 } 554 } 555 if(cpLength==1) { 556 /* BMP, insert one code unit */ 557 dest[codeUnitIndex]=(UChar)n; 558 } else { 559 /* supplementary character, insert two code units */ 560 dest[codeUnitIndex]=UTF16_LEAD(n); 561 dest[codeUnitIndex+1]=UTF16_TRAIL(n); 562 } 563 if(caseFlags!=NULL) { 564 /* Case of last character determines uppercase flag: */ 565 caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]); 566 if(cpLength==2) { 567 caseFlags[codeUnitIndex+1]=FALSE; 568 } 569 } 570 } 571 destLength+=cpLength; 572 ++i; 573 } 574 575 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode); 576 } 577 578 /* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */ 579 580 #endif /* #if !UCONFIG_NO_IDNA */ 581