1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2002-2011, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: punycode.cpp 11 * encoding: US-ASCII 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2002jan31 16 * created by: Markus W. Scherer 17 */ 18 19 20 /* This ICU code derived from: */ 21 /* 22 punycode.c 0.4.0 (2001-Nov-17-Sat) 23 http://www.cs.berkeley.edu/~amc/idn/ 24 Adam M. Costello 25 http://www.nicemice.net/amc/ 26 27 Disclaimer and license 28 29 Regarding this entire document or any portion of it (including 30 the pseudocode and C code), the author makes no guarantees and 31 is not responsible for any damage resulting from its use. The 32 author grants irrevocable permission to anyone to use, modify, 33 and distribute it in any way that does not diminish the rights 34 of anyone else to use, modify, and distribute it, provided that 35 redistributed derivative works do not contain misleading author or 36 version information. Derivative works need not be licensed under 37 similar terms. 38 */ 39 /* 40 * ICU modifications: 41 * - ICU data types and coding conventions 42 * - ICU string buffer handling with implicit source lengths 43 * and destination preflighting 44 * - UTF-16 handling 45 */ 46 47 #include "unicode/utypes.h" 48 49 #if !UCONFIG_NO_IDNA 50 51 #include "unicode/ustring.h" 52 #include "unicode/utf.h" 53 #include "unicode/utf16.h" 54 #include "ustr_imp.h" 55 #include "cstring.h" 56 #include "cmemory.h" 57 #include "punycode.h" 58 #include "uassert.h" 59 60 61 /* Punycode ----------------------------------------------------------------- */ 62 63 /* Punycode parameters for Bootstring */ 64 #define BASE 36 65 #define TMIN 1 66 #define TMAX 26 67 #define SKEW 38 68 #define DAMP 700 69 #define INITIAL_BIAS 72 70 #define INITIAL_N 0x80 71 72 /* "Basic" Unicode/ASCII code points */ 73 #define _HYPHEN 0X2d 74 #define DELIMITER _HYPHEN 75 76 #define _ZERO_ 0X30 77 #define _NINE 0x39 78 79 #define _SMALL_A 0X61 80 #define _SMALL_Z 0X7a 81 82 #define _CAPITAL_A 0X41 83 #define _CAPITAL_Z 0X5a 84 85 #define IS_BASIC(c) ((c)<0x80) 86 #define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z) 87 88 /** 89 * digitToBasic() returns the basic code point whose value 90 * (when used for representing integers) is d, which must be in the 91 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is 92 * nonzero, in which case the uppercase form is used. 93 */ 94 static inline char 95 digitToBasic(int32_t digit, UBool uppercase) { 96 /* 0..25 map to ASCII a..z or A..Z */ 97 /* 26..35 map to ASCII 0..9 */ 98 if(digit<26) { 99 if(uppercase) { 100 return (char)(_CAPITAL_A+digit); 101 } else { 102 return (char)(_SMALL_A+digit); 103 } 104 } else { 105 return (char)((_ZERO_-26)+digit); 106 } 107 } 108 109 /** 110 * basicToDigit[] contains the numeric value of a basic code 111 * point (for use in representing integers) in the range 0 to 112 * BASE-1, or -1 if b is does not represent a value. 113 */ 114 static const int8_t 115 basicToDigit[256]={ 116 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 117 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 118 119 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 120 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, 121 122 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 123 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, 124 125 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 126 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, 127 128 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 129 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 130 131 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 132 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 133 134 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 135 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 136 137 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 138 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 139 }; 140 141 static inline char 142 asciiCaseMap(char b, UBool uppercase) { 143 if(uppercase) { 144 if(_SMALL_A<=b && b<=_SMALL_Z) { 145 b-=(_SMALL_A-_CAPITAL_A); 146 } 147 } else { 148 if(_CAPITAL_A<=b && b<=_CAPITAL_Z) { 149 b+=(_SMALL_A-_CAPITAL_A); 150 } 151 } 152 return b; 153 } 154 155 /* Punycode-specific Bootstring code ---------------------------------------- */ 156 157 /* 158 * The following code omits the {parts} of the pseudo-algorithm in the spec 159 * that are not used with the Punycode parameter set. 160 */ 161 162 /* Bias adaptation function. */ 163 static int32_t 164 adaptBias(int32_t delta, int32_t length, UBool firstTime) { 165 int32_t count; 166 167 if(firstTime) { 168 delta/=DAMP; 169 } else { 170 delta/=2; 171 } 172 173 delta+=delta/length; 174 for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) { 175 delta/=(BASE-TMIN); 176 } 177 178 return count+(((BASE-TMIN+1)*delta)/(delta+SKEW)); 179 } 180 181 #define MAX_CP_COUNT 200 182 183 U_CFUNC int32_t 184 u_strToPunycode(const UChar *src, int32_t srcLength, 185 UChar *dest, int32_t destCapacity, 186 const UBool *caseFlags, 187 UErrorCode *pErrorCode) { 188 189 int32_t cpBuffer[MAX_CP_COUNT]; 190 int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount; 191 UChar c, c2; 192 193 /* argument checking */ 194 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 195 return 0; 196 } 197 198 if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) { 199 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 200 return 0; 201 } 202 203 /* 204 * Handle the basic code points and 205 * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit): 206 */ 207 srcCPCount=destLength=0; 208 if(srcLength==-1) { 209 /* NUL-terminated input */ 210 for(j=0; /* no condition */; ++j) { 211 if((c=src[j])==0) { 212 break; 213 } 214 if(srcCPCount==MAX_CP_COUNT) { 215 /* too many input code points */ 216 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 217 return 0; 218 } 219 if(IS_BASIC(c)) { 220 cpBuffer[srcCPCount++]=0; 221 if(destLength<destCapacity) { 222 dest[destLength]= 223 caseFlags!=NULL ? 224 asciiCaseMap((char)c, caseFlags[j]) : 225 (char)c; 226 } 227 ++destLength; 228 } else { 229 n=(caseFlags!=NULL && caseFlags[j])<<31L; 230 if(U16_IS_SINGLE(c)) { 231 n|=c; 232 } else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) { 233 ++j; 234 n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2); 235 } else { 236 /* error: unmatched surrogate */ 237 *pErrorCode=U_INVALID_CHAR_FOUND; 238 return 0; 239 } 240 cpBuffer[srcCPCount++]=n; 241 } 242 } 243 } else { 244 /* length-specified input */ 245 for(j=0; j<srcLength; ++j) { 246 if(srcCPCount==MAX_CP_COUNT) { 247 /* too many input code points */ 248 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 249 return 0; 250 } 251 c=src[j]; 252 if(IS_BASIC(c)) { 253 cpBuffer[srcCPCount++]=0; 254 if(destLength<destCapacity) { 255 dest[destLength]= 256 caseFlags!=NULL ? 257 asciiCaseMap((char)c, caseFlags[j]) : 258 (char)c; 259 } 260 ++destLength; 261 } else { 262 n=(caseFlags!=NULL && caseFlags[j])<<31L; 263 if(U16_IS_SINGLE(c)) { 264 n|=c; 265 } else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) { 266 ++j; 267 n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2); 268 } else { 269 /* error: unmatched surrogate */ 270 *pErrorCode=U_INVALID_CHAR_FOUND; 271 return 0; 272 } 273 cpBuffer[srcCPCount++]=n; 274 } 275 } 276 } 277 278 /* Finish the basic string - if it is not empty - with a delimiter. */ 279 basicLength=destLength; 280 if(basicLength>0) { 281 if(destLength<destCapacity) { 282 dest[destLength]=DELIMITER; 283 } 284 ++destLength; 285 } 286 287 /* 288 * handledCPCount is the number of code points that have been handled 289 * basicLength is the number of basic code points 290 * destLength is the number of chars that have been output 291 */ 292 293 /* Initialize the state: */ 294 n=INITIAL_N; 295 delta=0; 296 bias=INITIAL_BIAS; 297 298 /* Main encoding loop: */ 299 for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) { 300 /* 301 * All non-basic code points < n have been handled already. 302 * Find the next larger one: 303 */ 304 for(m=0x7fffffff, j=0; j<srcCPCount; ++j) { 305 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */ 306 if(n<=q && q<m) { 307 m=q; 308 } 309 } 310 311 /* 312 * Increase delta enough to advance the decoder's 313 * <n,i> state to <m,0>, but guard against overflow: 314 */ 315 if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) { 316 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 317 return 0; 318 } 319 delta+=(m-n)*(handledCPCount+1); 320 n=m; 321 322 /* Encode a sequence of same code points n */ 323 for(j=0; j<srcCPCount; ++j) { 324 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */ 325 if(q<n) { 326 ++delta; 327 } else if(q==n) { 328 /* Represent delta as a generalized variable-length integer: */ 329 for(q=delta, k=BASE; /* no condition */; k+=BASE) { 330 331 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt 332 333 t=k-bias; 334 if(t<TMIN) { 335 t=TMIN; 336 } else if(t>TMAX) { 337 t=TMAX; 338 } 339 */ 340 341 t=k-bias; 342 if(t<TMIN) { 343 t=TMIN; 344 } else if(k>=(bias+TMAX)) { 345 t=TMAX; 346 } 347 348 if(q<t) { 349 break; 350 } 351 352 if(destLength<destCapacity) { 353 dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0); 354 } 355 ++destLength; 356 q=(q-t)/(BASE-t); 357 } 358 359 if(destLength<destCapacity) { 360 dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0)); 361 } 362 ++destLength; 363 bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength)); 364 delta=0; 365 ++handledCPCount; 366 } 367 } 368 369 ++delta; 370 ++n; 371 } 372 373 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode); 374 } 375 376 U_CFUNC int32_t 377 u_strFromPunycode(const UChar *src, int32_t srcLength, 378 UChar *dest, int32_t destCapacity, 379 UBool *caseFlags, 380 UErrorCode *pErrorCode) { 381 int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t, 382 destCPCount, firstSupplementaryIndex, cpLength; 383 UChar b; 384 385 /* argument checking */ 386 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 387 return 0; 388 } 389 390 if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) { 391 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 392 return 0; 393 } 394 395 if(srcLength==-1) { 396 srcLength=u_strlen(src); 397 } 398 399 /* 400 * Handle the basic code points: 401 * Let basicLength be the number of input code points 402 * before the last delimiter, or 0 if there is none, 403 * then copy the first basicLength code points to the output. 404 * 405 * The two following loops iterate backward. 406 */ 407 for(j=srcLength; j>0;) { 408 if(src[--j]==DELIMITER) { 409 break; 410 } 411 } 412 destLength=basicLength=destCPCount=j; 413 U_ASSERT(destLength>=0); 414 415 while(j>0) { 416 b=src[--j]; 417 if(!IS_BASIC(b)) { 418 *pErrorCode=U_INVALID_CHAR_FOUND; 419 return 0; 420 } 421 422 if(j<destCapacity) { 423 dest[j]=(UChar)b; 424 425 if(caseFlags!=NULL) { 426 caseFlags[j]=IS_BASIC_UPPERCASE(b); 427 } 428 } 429 } 430 431 /* Initialize the state: */ 432 n=INITIAL_N; 433 i=0; 434 bias=INITIAL_BIAS; 435 firstSupplementaryIndex=1000000000; 436 437 /* 438 * Main decoding loop: 439 * Start just after the last delimiter if any 440 * basic code points were copied; start at the beginning otherwise. 441 */ 442 for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) { 443 /* 444 * in is the index of the next character to be consumed, and 445 * destCPCount is the number of code points in the output array. 446 * 447 * Decode a generalized variable-length integer into delta, 448 * which gets added to i. The overflow checking is easier 449 * if we increase i as we go, then subtract off its starting 450 * value at the end to obtain delta. 451 */ 452 for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) { 453 if(in>=srcLength) { 454 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 455 return 0; 456 } 457 458 digit=basicToDigit[(uint8_t)src[in++]]; 459 if(digit<0) { 460 *pErrorCode=U_INVALID_CHAR_FOUND; 461 return 0; 462 } 463 if(digit>(0x7fffffff-i)/w) { 464 /* integer overflow */ 465 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 466 return 0; 467 } 468 469 i+=digit*w; 470 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt 471 t=k-bias; 472 if(t<TMIN) { 473 t=TMIN; 474 } else if(t>TMAX) { 475 t=TMAX; 476 } 477 */ 478 t=k-bias; 479 if(t<TMIN) { 480 t=TMIN; 481 } else if(k>=(bias+TMAX)) { 482 t=TMAX; 483 } 484 if(digit<t) { 485 break; 486 } 487 488 if(w>0x7fffffff/(BASE-t)) { 489 /* integer overflow */ 490 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 491 return 0; 492 } 493 w*=BASE-t; 494 } 495 496 /* 497 * Modification from sample code: 498 * Increments destCPCount here, 499 * where needed instead of in for() loop tail. 500 */ 501 ++destCPCount; 502 bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0)); 503 504 /* 505 * i was supposed to wrap around from (incremented) destCPCount to 0, 506 * incrementing n each time, so we'll fix that now: 507 */ 508 if(i/destCPCount>(0x7fffffff-n)) { 509 /* integer overflow */ 510 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 511 return 0; 512 } 513 514 n+=i/destCPCount; 515 i%=destCPCount; 516 /* not needed for Punycode: */ 517 /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */ 518 519 if(n>0x10ffff || U_IS_SURROGATE(n)) { 520 /* Unicode code point overflow */ 521 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 522 return 0; 523 } 524 525 /* Insert n at position i of the output: */ 526 cpLength=U16_LENGTH(n); 527 if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) { 528 int32_t codeUnitIndex; 529 530 /* 531 * Handle indexes when supplementary code points are present. 532 * 533 * In almost all cases, there will be only BMP code points before i 534 * and even in the entire string. 535 * This is handled with the same efficiency as with UTF-32. 536 * 537 * Only the rare cases with supplementary code points are handled 538 * more slowly - but not too bad since this is an insertion anyway. 539 */ 540 if(i<=firstSupplementaryIndex) { 541 codeUnitIndex=i; 542 if(cpLength>1) { 543 firstSupplementaryIndex=codeUnitIndex; 544 } else { 545 ++firstSupplementaryIndex; 546 } 547 } else { 548 codeUnitIndex=firstSupplementaryIndex; 549 U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex); 550 } 551 552 /* use the UChar index codeUnitIndex instead of the code point index i */ 553 if(codeUnitIndex<destLength) { 554 uprv_memmove(dest+codeUnitIndex+cpLength, 555 dest+codeUnitIndex, 556 (destLength-codeUnitIndex)*U_SIZEOF_UCHAR); 557 if(caseFlags!=NULL) { 558 uprv_memmove(caseFlags+codeUnitIndex+cpLength, 559 caseFlags+codeUnitIndex, 560 destLength-codeUnitIndex); 561 } 562 } 563 if(cpLength==1) { 564 /* BMP, insert one code unit */ 565 dest[codeUnitIndex]=(UChar)n; 566 } else { 567 /* supplementary character, insert two code units */ 568 dest[codeUnitIndex]=U16_LEAD(n); 569 dest[codeUnitIndex+1]=U16_TRAIL(n); 570 } 571 if(caseFlags!=NULL) { 572 /* Case of last character determines uppercase flag: */ 573 caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]); 574 if(cpLength==2) { 575 caseFlags[codeUnitIndex+1]=FALSE; 576 } 577 } 578 } 579 destLength+=cpLength; 580 U_ASSERT(destLength>=0); 581 ++i; 582 } 583 584 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode); 585 } 586 587 /* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */ 588 589 #endif /* #if !UCONFIG_NO_IDNA */ 590