1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2010-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * file name: uts46.cpp 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2010mar09 14 * created by: Markus W. Scherer 15 */ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_IDNA 20 21 #include "unicode/idna.h" 22 #include "unicode/normalizer2.h" 23 #include "unicode/uscript.h" 24 #include "unicode/ustring.h" 25 #include "unicode/utf16.h" 26 #include "cmemory.h" 27 #include "cstring.h" 28 #include "punycode.h" 29 #include "ubidi_props.h" 30 #include "ustr_imp.h" 31 32 // Note about tests for UIDNA_ERROR_DOMAIN_NAME_TOO_LONG: 33 // 34 // The domain name length limit is 255 octets in an internal DNS representation 35 // where the last ("root") label is the empty label 36 // represented by length byte 0 alone. 37 // In a conventional string, this translates to 253 characters, or 254 38 // if there is a trailing dot for the root label. 39 40 U_NAMESPACE_BEGIN 41 42 // Severe errors which usually result in a U+FFFD replacement character in the result string. 43 const uint32_t severeErrors= 44 UIDNA_ERROR_LEADING_COMBINING_MARK| 45 UIDNA_ERROR_DISALLOWED| 46 UIDNA_ERROR_PUNYCODE| 47 UIDNA_ERROR_LABEL_HAS_DOT| 48 UIDNA_ERROR_INVALID_ACE_LABEL; 49 50 static inline UBool 51 isASCIIString(const UnicodeString &dest) { 52 const UChar *s=dest.getBuffer(); 53 const UChar *limit=s+dest.length(); 54 while(s<limit) { 55 if(*s++>0x7f) { 56 return FALSE; 57 } 58 } 59 return TRUE; 60 } 61 62 static UBool 63 isASCIIOkBiDi(const UChar *s, int32_t length); 64 65 static UBool 66 isASCIIOkBiDi(const char *s, int32_t length); 67 68 // IDNA class default implementations -------------------------------------- *** 69 70 IDNA::~IDNA() {} 71 72 void 73 IDNA::labelToASCII_UTF8(StringPiece label, ByteSink &dest, 74 IDNAInfo &info, UErrorCode &errorCode) const { 75 if(U_SUCCESS(errorCode)) { 76 UnicodeString destString; 77 labelToASCII(UnicodeString::fromUTF8(label), destString, 78 info, errorCode).toUTF8(dest); 79 } 80 } 81 82 void 83 IDNA::labelToUnicodeUTF8(StringPiece label, ByteSink &dest, 84 IDNAInfo &info, UErrorCode &errorCode) const { 85 if(U_SUCCESS(errorCode)) { 86 UnicodeString destString; 87 labelToUnicode(UnicodeString::fromUTF8(label), destString, 88 info, errorCode).toUTF8(dest); 89 } 90 } 91 92 void 93 IDNA::nameToASCII_UTF8(StringPiece name, ByteSink &dest, 94 IDNAInfo &info, UErrorCode &errorCode) const { 95 if(U_SUCCESS(errorCode)) { 96 UnicodeString destString; 97 nameToASCII(UnicodeString::fromUTF8(name), destString, 98 info, errorCode).toUTF8(dest); 99 } 100 } 101 102 void 103 IDNA::nameToUnicodeUTF8(StringPiece name, ByteSink &dest, 104 IDNAInfo &info, UErrorCode &errorCode) const { 105 if(U_SUCCESS(errorCode)) { 106 UnicodeString destString; 107 nameToUnicode(UnicodeString::fromUTF8(name), destString, 108 info, errorCode).toUTF8(dest); 109 } 110 } 111 112 // UTS46 class declaration ------------------------------------------------- *** 113 114 class UTS46 : public IDNA { 115 public: 116 UTS46(uint32_t options, UErrorCode &errorCode); 117 virtual ~UTS46(); 118 119 virtual UnicodeString & 120 labelToASCII(const UnicodeString &label, UnicodeString &dest, 121 IDNAInfo &info, UErrorCode &errorCode) const; 122 123 virtual UnicodeString & 124 labelToUnicode(const UnicodeString &label, UnicodeString &dest, 125 IDNAInfo &info, UErrorCode &errorCode) const; 126 127 virtual UnicodeString & 128 nameToASCII(const UnicodeString &name, UnicodeString &dest, 129 IDNAInfo &info, UErrorCode &errorCode) const; 130 131 virtual UnicodeString & 132 nameToUnicode(const UnicodeString &name, UnicodeString &dest, 133 IDNAInfo &info, UErrorCode &errorCode) const; 134 135 virtual void 136 labelToASCII_UTF8(StringPiece label, ByteSink &dest, 137 IDNAInfo &info, UErrorCode &errorCode) const; 138 139 virtual void 140 labelToUnicodeUTF8(StringPiece label, ByteSink &dest, 141 IDNAInfo &info, UErrorCode &errorCode) const; 142 143 virtual void 144 nameToASCII_UTF8(StringPiece name, ByteSink &dest, 145 IDNAInfo &info, UErrorCode &errorCode) const; 146 147 virtual void 148 nameToUnicodeUTF8(StringPiece name, ByteSink &dest, 149 IDNAInfo &info, UErrorCode &errorCode) const; 150 151 private: 152 UnicodeString & 153 process(const UnicodeString &src, 154 UBool isLabel, UBool toASCII, 155 UnicodeString &dest, 156 IDNAInfo &info, UErrorCode &errorCode) const; 157 158 void 159 processUTF8(StringPiece src, 160 UBool isLabel, UBool toASCII, 161 ByteSink &dest, 162 IDNAInfo &info, UErrorCode &errorCode) const; 163 164 UnicodeString & 165 processUnicode(const UnicodeString &src, 166 int32_t labelStart, int32_t mappingStart, 167 UBool isLabel, UBool toASCII, 168 UnicodeString &dest, 169 IDNAInfo &info, UErrorCode &errorCode) const; 170 171 // returns the new dest.length() 172 int32_t 173 mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart, 174 UErrorCode &errorCode) const; 175 176 // returns the new label length 177 int32_t 178 processLabel(UnicodeString &dest, 179 int32_t labelStart, int32_t labelLength, 180 UBool toASCII, 181 IDNAInfo &info, UErrorCode &errorCode) const; 182 int32_t 183 markBadACELabel(UnicodeString &dest, 184 int32_t labelStart, int32_t labelLength, 185 UBool toASCII, IDNAInfo &info, UErrorCode &errorCode) const; 186 187 void 188 checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) const; 189 190 UBool 191 isLabelOkContextJ(const UChar *label, int32_t labelLength) const; 192 193 void 194 checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const; 195 196 const Normalizer2 &uts46Norm2; // uts46.nrm 197 uint32_t options; 198 }; 199 200 IDNA * 201 IDNA::createUTS46Instance(uint32_t options, UErrorCode &errorCode) { 202 if(U_SUCCESS(errorCode)) { 203 IDNA *idna=new UTS46(options, errorCode); 204 if(idna==NULL) { 205 errorCode=U_MEMORY_ALLOCATION_ERROR; 206 } else if(U_FAILURE(errorCode)) { 207 delete idna; 208 idna=NULL; 209 } 210 return idna; 211 } else { 212 return NULL; 213 } 214 } 215 216 // UTS46 implementation ---------------------------------------------------- *** 217 218 UTS46::UTS46(uint32_t opt, UErrorCode &errorCode) 219 : uts46Norm2(*Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, errorCode)), 220 options(opt) {} 221 222 UTS46::~UTS46() {} 223 224 UnicodeString & 225 UTS46::labelToASCII(const UnicodeString &label, UnicodeString &dest, 226 IDNAInfo &info, UErrorCode &errorCode) const { 227 return process(label, TRUE, TRUE, dest, info, errorCode); 228 } 229 230 UnicodeString & 231 UTS46::labelToUnicode(const UnicodeString &label, UnicodeString &dest, 232 IDNAInfo &info, UErrorCode &errorCode) const { 233 return process(label, TRUE, FALSE, dest, info, errorCode); 234 } 235 236 UnicodeString & 237 UTS46::nameToASCII(const UnicodeString &name, UnicodeString &dest, 238 IDNAInfo &info, UErrorCode &errorCode) const { 239 process(name, FALSE, TRUE, dest, info, errorCode); 240 if( dest.length()>=254 && (info.errors&UIDNA_ERROR_DOMAIN_NAME_TOO_LONG)==0 && 241 isASCIIString(dest) && 242 (dest.length()>254 || dest[253]!=0x2e) 243 ) { 244 info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; 245 } 246 return dest; 247 } 248 249 UnicodeString & 250 UTS46::nameToUnicode(const UnicodeString &name, UnicodeString &dest, 251 IDNAInfo &info, UErrorCode &errorCode) const { 252 return process(name, FALSE, FALSE, dest, info, errorCode); 253 } 254 255 void 256 UTS46::labelToASCII_UTF8(StringPiece label, ByteSink &dest, 257 IDNAInfo &info, UErrorCode &errorCode) const { 258 processUTF8(label, TRUE, TRUE, dest, info, errorCode); 259 } 260 261 void 262 UTS46::labelToUnicodeUTF8(StringPiece label, ByteSink &dest, 263 IDNAInfo &info, UErrorCode &errorCode) const { 264 processUTF8(label, TRUE, FALSE, dest, info, errorCode); 265 } 266 267 void 268 UTS46::nameToASCII_UTF8(StringPiece name, ByteSink &dest, 269 IDNAInfo &info, UErrorCode &errorCode) const { 270 processUTF8(name, FALSE, TRUE, dest, info, errorCode); 271 } 272 273 void 274 UTS46::nameToUnicodeUTF8(StringPiece name, ByteSink &dest, 275 IDNAInfo &info, UErrorCode &errorCode) const { 276 processUTF8(name, FALSE, FALSE, dest, info, errorCode); 277 } 278 279 // UTS #46 data for ASCII characters. 280 // The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase 281 // and passes through all other ASCII characters. 282 // If UIDNA_USE_STD3_RULES is set, then non-LDH characters are disallowed 283 // using this data. 284 // The ASCII fastpath also uses this data. 285 // Values: -1=disallowed 0==valid 1==mapped (lowercase) 286 static const int8_t asciiData[128]={ 287 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 288 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 289 // 002D..002E; valid # HYPHEN-MINUS..FULL STOP 290 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1, 291 // 0030..0039; valid # DIGIT ZERO..DIGIT NINE 292 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, 293 // 0041..005A; mapped # LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z 294 -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 295 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, 296 // 0061..007A; valid # LATIN SMALL LETTER A..LATIN SMALL LETTER Z 297 -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 298 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1 299 }; 300 301 UnicodeString & 302 UTS46::process(const UnicodeString &src, 303 UBool isLabel, UBool toASCII, 304 UnicodeString &dest, 305 IDNAInfo &info, UErrorCode &errorCode) const { 306 // uts46Norm2.normalize() would do all of this error checking and setup, 307 // but with the ASCII fastpath we do not always call it, and do not 308 // call it first. 309 if(U_FAILURE(errorCode)) { 310 dest.setToBogus(); 311 return dest; 312 } 313 const UChar *srcArray=src.getBuffer(); 314 if(&dest==&src || srcArray==NULL) { 315 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 316 dest.setToBogus(); 317 return dest; 318 } 319 // Arguments are fine, reset output values. 320 dest.remove(); 321 info.reset(); 322 int32_t srcLength=src.length(); 323 if(srcLength==0) { 324 info.errors|=UIDNA_ERROR_EMPTY_LABEL; 325 return dest; 326 } 327 UChar *destArray=dest.getBuffer(srcLength); 328 if(destArray==NULL) { 329 errorCode=U_MEMORY_ALLOCATION_ERROR; 330 return dest; 331 } 332 // ASCII fastpath 333 UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0; 334 int32_t labelStart=0; 335 int32_t i; 336 for(i=0;; ++i) { 337 if(i==srcLength) { 338 if(toASCII) { 339 if((i-labelStart)>63) { 340 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; 341 } 342 // There is a trailing dot if labelStart==i. 343 if(!isLabel && i>=254 && (i>254 || labelStart<i)) { 344 info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; 345 } 346 } 347 info.errors|=info.labelErrors; 348 dest.releaseBuffer(i); 349 return dest; 350 } 351 UChar c=srcArray[i]; 352 if(c>0x7f) { 353 break; 354 } 355 int cData=asciiData[c]; 356 if(cData>0) { 357 destArray[i]=c+0x20; // Lowercase an uppercase ASCII letter. 358 } else if(cData<0 && disallowNonLDHDot) { 359 break; // Replacing with U+FFFD can be complicated for toASCII. 360 } else { 361 destArray[i]=c; 362 if(c==0x2d) { // hyphen 363 if(i==(labelStart+3) && srcArray[i-1]==0x2d) { 364 // "??--..." is Punycode or forbidden. 365 ++i; // '-' was copied to dest already 366 break; 367 } 368 if(i==labelStart) { 369 // label starts with "-" 370 info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN; 371 } 372 if((i+1)==srcLength || srcArray[i+1]==0x2e) { 373 // label ends with "-" 374 info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN; 375 } 376 } else if(c==0x2e) { // dot 377 if(isLabel) { 378 // Replacing with U+FFFD can be complicated for toASCII. 379 ++i; // '.' was copied to dest already 380 break; 381 } 382 if(i==labelStart) { 383 info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL; 384 } 385 if(toASCII && (i-labelStart)>63) { 386 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; 387 } 388 info.errors|=info.labelErrors; 389 info.labelErrors=0; 390 labelStart=i+1; 391 } 392 } 393 } 394 info.errors|=info.labelErrors; 395 dest.releaseBuffer(i); 396 processUnicode(src, labelStart, i, isLabel, toASCII, dest, info, errorCode); 397 if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 && 398 (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(dest.getBuffer(), labelStart))) 399 ) { 400 info.errors|=UIDNA_ERROR_BIDI; 401 } 402 return dest; 403 } 404 405 void 406 UTS46::processUTF8(StringPiece src, 407 UBool isLabel, UBool toASCII, 408 ByteSink &dest, 409 IDNAInfo &info, UErrorCode &errorCode) const { 410 if(U_FAILURE(errorCode)) { 411 return; 412 } 413 const char *srcArray=src.data(); 414 int32_t srcLength=src.length(); 415 if(srcArray==NULL && srcLength!=0) { 416 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 417 return; 418 } 419 // Arguments are fine, reset output values. 420 info.reset(); 421 if(srcLength==0) { 422 info.errors|=UIDNA_ERROR_EMPTY_LABEL; 423 dest.Flush(); 424 return; 425 } 426 UnicodeString destString; 427 int32_t labelStart=0; 428 if(srcLength<=256) { // length of stackArray[] 429 // ASCII fastpath 430 char stackArray[256]; 431 int32_t destCapacity; 432 char *destArray=dest.GetAppendBuffer(srcLength, srcLength+20, 433 stackArray, UPRV_LENGTHOF(stackArray), &destCapacity); 434 UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0; 435 int32_t i; 436 for(i=0;; ++i) { 437 if(i==srcLength) { 438 if(toASCII) { 439 if((i-labelStart)>63) { 440 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; 441 } 442 // There is a trailing dot if labelStart==i. 443 if(!isLabel && i>=254 && (i>254 || labelStart<i)) { 444 info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; 445 } 446 } 447 info.errors|=info.labelErrors; 448 dest.Append(destArray, i); 449 dest.Flush(); 450 return; 451 } 452 char c=srcArray[i]; 453 if((int8_t)c<0) { // (uint8_t)c>0x7f 454 break; 455 } 456 int cData=asciiData[(int)c]; // Cast: gcc warns about indexing with a char. 457 if(cData>0) { 458 destArray[i]=c+0x20; // Lowercase an uppercase ASCII letter. 459 } else if(cData<0 && disallowNonLDHDot) { 460 break; // Replacing with U+FFFD can be complicated for toASCII. 461 } else { 462 destArray[i]=c; 463 if(c==0x2d) { // hyphen 464 if(i==(labelStart+3) && srcArray[i-1]==0x2d) { 465 // "??--..." is Punycode or forbidden. 466 break; 467 } 468 if(i==labelStart) { 469 // label starts with "-" 470 info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN; 471 } 472 if((i+1)==srcLength || srcArray[i+1]==0x2e) { 473 // label ends with "-" 474 info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN; 475 } 476 } else if(c==0x2e) { // dot 477 if(isLabel) { 478 break; // Replacing with U+FFFD can be complicated for toASCII. 479 } 480 if(i==labelStart) { 481 info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL; 482 } 483 if(toASCII && (i-labelStart)>63) { 484 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; 485 } 486 info.errors|=info.labelErrors; 487 info.labelErrors=0; 488 labelStart=i+1; 489 } 490 } 491 } 492 info.errors|=info.labelErrors; 493 // Convert the processed ASCII prefix of the current label to UTF-16. 494 int32_t mappingStart=i-labelStart; 495 destString=UnicodeString::fromUTF8(StringPiece(destArray+labelStart, mappingStart)); 496 // Output the previous ASCII labels and process the rest of src in UTF-16. 497 dest.Append(destArray, labelStart); 498 processUnicode(UnicodeString::fromUTF8(StringPiece(src, labelStart)), 0, mappingStart, 499 isLabel, toASCII, 500 destString, info, errorCode); 501 } else { 502 // src is too long for the ASCII fastpath implementation. 503 processUnicode(UnicodeString::fromUTF8(src), 0, 0, 504 isLabel, toASCII, 505 destString, info, errorCode); 506 } 507 destString.toUTF8(dest); // calls dest.Flush() 508 if(toASCII && !isLabel) { 509 // length==labelStart==254 means that there is a trailing dot (ok) and 510 // destString is empty (do not index at 253-labelStart). 511 int32_t length=labelStart+destString.length(); 512 if( length>=254 && isASCIIString(destString) && 513 (length>254 || 514 (labelStart<254 && destString[253-labelStart]!=0x2e)) 515 ) { 516 info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; 517 } 518 } 519 if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 && 520 (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(srcArray, labelStart))) 521 ) { 522 info.errors|=UIDNA_ERROR_BIDI; 523 } 524 } 525 526 UnicodeString & 527 UTS46::processUnicode(const UnicodeString &src, 528 int32_t labelStart, int32_t mappingStart, 529 UBool isLabel, UBool toASCII, 530 UnicodeString &dest, 531 IDNAInfo &info, UErrorCode &errorCode) const { 532 if(mappingStart==0) { 533 uts46Norm2.normalize(src, dest, errorCode); 534 } else { 535 uts46Norm2.normalizeSecondAndAppend(dest, src.tempSubString(mappingStart), errorCode); 536 } 537 if(U_FAILURE(errorCode)) { 538 return dest; 539 } 540 UBool doMapDevChars= 541 toASCII ? (options&UIDNA_NONTRANSITIONAL_TO_ASCII)==0 : 542 (options&UIDNA_NONTRANSITIONAL_TO_UNICODE)==0; 543 const UChar *destArray=dest.getBuffer(); 544 int32_t destLength=dest.length(); 545 int32_t labelLimit=labelStart; 546 while(labelLimit<destLength) { 547 UChar c=destArray[labelLimit]; 548 if(c==0x2e && !isLabel) { 549 int32_t labelLength=labelLimit-labelStart; 550 int32_t newLength=processLabel(dest, labelStart, labelLength, 551 toASCII, info, errorCode); 552 info.errors|=info.labelErrors; 553 info.labelErrors=0; 554 if(U_FAILURE(errorCode)) { 555 return dest; 556 } 557 destArray=dest.getBuffer(); 558 destLength+=newLength-labelLength; 559 labelLimit=labelStart+=newLength+1; 560 continue; 561 } else if(c<0xdf) { 562 // pass 563 } else if(c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) { 564 info.isTransDiff=TRUE; 565 if(doMapDevChars) { 566 destLength=mapDevChars(dest, labelStart, labelLimit, errorCode); 567 if(U_FAILURE(errorCode)) { 568 return dest; 569 } 570 destArray=dest.getBuffer(); 571 // All deviation characters have been mapped, no need to check for them again. 572 doMapDevChars=FALSE; 573 // Do not increment labelLimit in case c was removed. 574 continue; 575 } 576 } else if(U16_IS_SURROGATE(c)) { 577 if(U16_IS_SURROGATE_LEAD(c) ? 578 (labelLimit+1)==destLength || !U16_IS_TRAIL(destArray[labelLimit+1]) : 579 labelLimit==labelStart || !U16_IS_LEAD(destArray[labelLimit-1])) { 580 // Map an unpaired surrogate to U+FFFD before normalization so that when 581 // that removes characters we do not turn two unpaired ones into a pair. 582 info.labelErrors|=UIDNA_ERROR_DISALLOWED; 583 dest.setCharAt(labelLimit, 0xfffd); 584 destArray=dest.getBuffer(); 585 } 586 } 587 ++labelLimit; 588 } 589 // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok) 590 // but not an empty label elsewhere nor a completely empty domain name. 591 // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0. 592 if(0==labelStart || labelStart<labelLimit) { 593 processLabel(dest, labelStart, labelLimit-labelStart, 594 toASCII, info, errorCode); 595 info.errors|=info.labelErrors; 596 } 597 return dest; 598 } 599 600 int32_t 601 UTS46::mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart, 602 UErrorCode &errorCode) const { 603 if(U_FAILURE(errorCode)) { 604 return 0; 605 } 606 int32_t length=dest.length(); 607 UChar *s=dest.getBuffer(dest[mappingStart]==0xdf ? length+1 : length); 608 if(s==NULL) { 609 errorCode=U_MEMORY_ALLOCATION_ERROR; 610 return length; 611 } 612 int32_t capacity=dest.getCapacity(); 613 UBool didMapDevChars=FALSE; 614 int32_t readIndex=mappingStart, writeIndex=mappingStart; 615 do { 616 UChar c=s[readIndex++]; 617 switch(c) { 618 case 0xdf: 619 // Map sharp s to ss. 620 didMapDevChars=TRUE; 621 s[writeIndex++]=0x73; // Replace sharp s with first s. 622 // Insert second s and account for possible buffer reallocation. 623 if(writeIndex==readIndex) { 624 if(length==capacity) { 625 dest.releaseBuffer(length); 626 s=dest.getBuffer(length+1); 627 if(s==NULL) { 628 errorCode=U_MEMORY_ALLOCATION_ERROR; 629 return length; 630 } 631 capacity=dest.getCapacity(); 632 } 633 u_memmove(s+writeIndex+1, s+writeIndex, length-writeIndex); 634 ++readIndex; 635 } 636 s[writeIndex++]=0x73; 637 ++length; 638 break; 639 case 0x3c2: // Map final sigma to nonfinal sigma. 640 didMapDevChars=TRUE; 641 s[writeIndex++]=0x3c3; 642 break; 643 case 0x200c: // Ignore/remove ZWNJ. 644 case 0x200d: // Ignore/remove ZWJ. 645 didMapDevChars=TRUE; 646 --length; 647 break; 648 default: 649 // Only really necessary if writeIndex was different from readIndex. 650 s[writeIndex++]=c; 651 break; 652 } 653 } while(writeIndex<length); 654 dest.releaseBuffer(length); 655 if(didMapDevChars) { 656 // Mapping deviation characters might have resulted in an un-NFC string. 657 // We could use either the NFC or the UTS #46 normalizer. 658 // By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file. 659 UnicodeString normalized; 660 uts46Norm2.normalize(dest.tempSubString(labelStart), normalized, errorCode); 661 if(U_SUCCESS(errorCode)) { 662 dest.replace(labelStart, 0x7fffffff, normalized); 663 if(dest.isBogus()) { 664 errorCode=U_MEMORY_ALLOCATION_ERROR; 665 } 666 return dest.length(); 667 } 668 } 669 return length; 670 } 671 672 // Some non-ASCII characters are equivalent to sequences with 673 // non-LDH ASCII characters. To find them: 674 // grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt) 675 static inline UBool 676 isNonASCIIDisallowedSTD3Valid(UChar32 c) { 677 return c==0x2260 || c==0x226E || c==0x226F; 678 } 679 680 // Replace the label in dest with the label string, if the label was modified. 681 // If &label==&dest then the label was modified in-place and labelLength 682 // is the new label length, different from label.length(). 683 // If &label!=&dest then labelLength==label.length(). 684 // Returns labelLength (= the new label length). 685 static int32_t 686 replaceLabel(UnicodeString &dest, int32_t destLabelStart, int32_t destLabelLength, 687 const UnicodeString &label, int32_t labelLength, UErrorCode &errorCode) { 688 if(U_FAILURE(errorCode)) { 689 return 0; 690 } 691 if(&label!=&dest) { 692 dest.replace(destLabelStart, destLabelLength, label); 693 if(dest.isBogus()) { 694 errorCode=U_MEMORY_ALLOCATION_ERROR; 695 return 0; 696 } 697 } 698 return labelLength; 699 } 700 701 int32_t 702 UTS46::processLabel(UnicodeString &dest, 703 int32_t labelStart, int32_t labelLength, 704 UBool toASCII, 705 IDNAInfo &info, UErrorCode &errorCode) const { 706 if(U_FAILURE(errorCode)) { 707 return 0; 708 } 709 UnicodeString fromPunycode; 710 UnicodeString *labelString; 711 const UChar *label=dest.getBuffer()+labelStart; 712 int32_t destLabelStart=labelStart; 713 int32_t destLabelLength=labelLength; 714 UBool wasPunycode; 715 if(labelLength>=4 && label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && label[3]==0x2d) { 716 // Label starts with "xn--", try to un-Punycode it. 717 wasPunycode=TRUE; 718 UChar *unicodeBuffer=fromPunycode.getBuffer(-1); // capacity==-1: most labels should fit 719 if(unicodeBuffer==NULL) { 720 // Should never occur if we used capacity==-1 which uses the internal buffer. 721 errorCode=U_MEMORY_ALLOCATION_ERROR; 722 return labelLength; 723 } 724 UErrorCode punycodeErrorCode=U_ZERO_ERROR; 725 int32_t unicodeLength=u_strFromPunycode(label+4, labelLength-4, 726 unicodeBuffer, fromPunycode.getCapacity(), 727 NULL, &punycodeErrorCode); 728 if(punycodeErrorCode==U_BUFFER_OVERFLOW_ERROR) { 729 fromPunycode.releaseBuffer(0); 730 unicodeBuffer=fromPunycode.getBuffer(unicodeLength); 731 if(unicodeBuffer==NULL) { 732 errorCode=U_MEMORY_ALLOCATION_ERROR; 733 return labelLength; 734 } 735 punycodeErrorCode=U_ZERO_ERROR; 736 unicodeLength=u_strFromPunycode(label+4, labelLength-4, 737 unicodeBuffer, fromPunycode.getCapacity(), 738 NULL, &punycodeErrorCode); 739 } 740 fromPunycode.releaseBuffer(unicodeLength); 741 if(U_FAILURE(punycodeErrorCode)) { 742 info.labelErrors|=UIDNA_ERROR_PUNYCODE; 743 return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode); 744 } 745 // Check for NFC, and for characters that are not 746 // valid or deviation characters according to the normalizer. 747 // If there is something wrong, then the string will change. 748 // Note that the normalizer passes through non-LDH ASCII and deviation characters. 749 // Deviation characters are ok in Punycode even in transitional processing. 750 // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES 751 // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too. 752 UBool isValid=uts46Norm2.isNormalized(fromPunycode, errorCode); 753 if(U_FAILURE(errorCode)) { 754 return labelLength; 755 } 756 if(!isValid) { 757 info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL; 758 return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode); 759 } 760 labelString=&fromPunycode; 761 label=fromPunycode.getBuffer(); 762 labelStart=0; 763 labelLength=fromPunycode.length(); 764 } else { 765 wasPunycode=FALSE; 766 labelString=&dest; 767 } 768 // Validity check 769 if(labelLength==0) { 770 info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL; 771 return replaceLabel(dest, destLabelStart, destLabelLength, 772 *labelString, labelLength, errorCode); 773 } 774 // labelLength>0 775 if(labelLength>=4 && label[2]==0x2d && label[3]==0x2d) { 776 // label starts with "??--" 777 info.labelErrors|=UIDNA_ERROR_HYPHEN_3_4; 778 } 779 if(label[0]==0x2d) { 780 // label starts with "-" 781 info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN; 782 } 783 if(label[labelLength-1]==0x2d) { 784 // label ends with "-" 785 info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN; 786 } 787 // If the label was not a Punycode label, then it was the result of 788 // mapping, normalization and label segmentation. 789 // If the label was in Punycode, then we mapped it again above 790 // and checked its validity. 791 // Now we handle the STD3 restriction to LDH characters (if set) 792 // and we look for U+FFFD which indicates disallowed characters 793 // in a non-Punycode label or U+FFFD itself in a Punycode label. 794 // We also check for dots which can come from the input to a single-label function. 795 // Ok to cast away const because we own the UnicodeString. 796 UChar *s=(UChar *)label; 797 const UChar *limit=label+labelLength; 798 UChar oredChars=0; 799 // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed. 800 UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0; 801 do { 802 UChar c=*s; 803 if(c<=0x7f) { 804 if(c==0x2e) { 805 info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT; 806 *s=0xfffd; 807 } else if(disallowNonLDHDot && asciiData[c]<0) { 808 info.labelErrors|=UIDNA_ERROR_DISALLOWED; 809 *s=0xfffd; 810 } 811 } else { 812 oredChars|=c; 813 if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) { 814 info.labelErrors|=UIDNA_ERROR_DISALLOWED; 815 *s=0xfffd; 816 } else if(c==0xfffd) { 817 info.labelErrors|=UIDNA_ERROR_DISALLOWED; 818 } 819 } 820 ++s; 821 } while(s<limit); 822 // Check for a leading combining mark after other validity checks 823 // so that we don't report UIDNA_ERROR_DISALLOWED for the U+FFFD from here. 824 UChar32 c; 825 int32_t cpLength=0; 826 // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD. 827 U16_NEXT_UNSAFE(label, cpLength, c); 828 if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) { 829 info.labelErrors|=UIDNA_ERROR_LEADING_COMBINING_MARK; 830 labelString->replace(labelStart, cpLength, (UChar)0xfffd); 831 label=labelString->getBuffer()+labelStart; 832 labelLength+=1-cpLength; 833 if(labelString==&dest) { 834 destLabelLength=labelLength; 835 } 836 } 837 if((info.labelErrors&severeErrors)==0) { 838 // Do contextual checks only if we do not have U+FFFD from a severe error 839 // because U+FFFD can make these checks fail. 840 if((options&UIDNA_CHECK_BIDI)!=0 && (!info.isBiDi || info.isOkBiDi)) { 841 checkLabelBiDi(label, labelLength, info); 842 } 843 if( (options&UIDNA_CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c && 844 !isLabelOkContextJ(label, labelLength) 845 ) { 846 info.labelErrors|=UIDNA_ERROR_CONTEXTJ; 847 } 848 if((options&UIDNA_CHECK_CONTEXTO)!=0 && oredChars>=0xb7) { 849 checkLabelContextO(label, labelLength, info); 850 } 851 if(toASCII) { 852 if(wasPunycode) { 853 // Leave a Punycode label unchanged if it has no severe errors. 854 if(destLabelLength>63) { 855 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; 856 } 857 return destLabelLength; 858 } else if(oredChars>=0x80) { 859 // Contains non-ASCII characters. 860 UnicodeString punycode; 861 UChar *buffer=punycode.getBuffer(63); // 63==maximum DNS label length 862 if(buffer==NULL) { 863 errorCode=U_MEMORY_ALLOCATION_ERROR; 864 return destLabelLength; 865 } 866 buffer[0]=0x78; // Write "xn--". 867 buffer[1]=0x6e; 868 buffer[2]=0x2d; 869 buffer[3]=0x2d; 870 int32_t punycodeLength=u_strToPunycode(label, labelLength, 871 buffer+4, punycode.getCapacity()-4, 872 NULL, &errorCode); 873 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 874 errorCode=U_ZERO_ERROR; 875 punycode.releaseBuffer(4); 876 buffer=punycode.getBuffer(4+punycodeLength); 877 if(buffer==NULL) { 878 errorCode=U_MEMORY_ALLOCATION_ERROR; 879 return destLabelLength; 880 } 881 punycodeLength=u_strToPunycode(label, labelLength, 882 buffer+4, punycode.getCapacity()-4, 883 NULL, &errorCode); 884 } 885 punycodeLength+=4; 886 punycode.releaseBuffer(punycodeLength); 887 if(U_FAILURE(errorCode)) { 888 return destLabelLength; 889 } 890 if(punycodeLength>63) { 891 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; 892 } 893 return replaceLabel(dest, destLabelStart, destLabelLength, 894 punycode, punycodeLength, errorCode); 895 } else { 896 // all-ASCII label 897 if(labelLength>63) { 898 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; 899 } 900 } 901 } 902 } else { 903 // If a Punycode label has severe errors, 904 // then leave it but make sure it does not look valid. 905 if(wasPunycode) { 906 info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL; 907 return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info, errorCode); 908 } 909 } 910 return replaceLabel(dest, destLabelStart, destLabelLength, 911 *labelString, labelLength, errorCode); 912 } 913 914 // Make sure an ACE label does not look valid. 915 // Append U+FFFD if the label has only LDH characters. 916 // If UIDNA_USE_STD3_RULES, also replace disallowed ASCII characters with U+FFFD. 917 int32_t 918 UTS46::markBadACELabel(UnicodeString &dest, 919 int32_t labelStart, int32_t labelLength, 920 UBool toASCII, IDNAInfo &info, UErrorCode &errorCode) const { 921 if(U_FAILURE(errorCode)) { 922 return 0; 923 } 924 UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0; 925 UBool isASCII=TRUE; 926 UBool onlyLDH=TRUE; 927 const UChar *label=dest.getBuffer()+labelStart; 928 // Ok to cast away const because we own the UnicodeString. 929 UChar *s=(UChar *)label+4; // After the initial "xn--". 930 const UChar *limit=label+labelLength; 931 do { 932 UChar c=*s; 933 if(c<=0x7f) { 934 if(c==0x2e) { 935 info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT; 936 *s=0xfffd; 937 isASCII=onlyLDH=FALSE; 938 } else if(asciiData[c]<0) { 939 onlyLDH=FALSE; 940 if(disallowNonLDHDot) { 941 *s=0xfffd; 942 isASCII=FALSE; 943 } 944 } 945 } else { 946 isASCII=onlyLDH=FALSE; 947 } 948 } while(++s<limit); 949 if(onlyLDH) { 950 dest.insert(labelStart+labelLength, (UChar)0xfffd); 951 if(dest.isBogus()) { 952 errorCode=U_MEMORY_ALLOCATION_ERROR; 953 return 0; 954 } 955 ++labelLength; 956 } else { 957 if(toASCII && isASCII && labelLength>63) { 958 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; 959 } 960 } 961 return labelLength; 962 } 963 964 const uint32_t L_MASK=U_MASK(U_LEFT_TO_RIGHT); 965 const uint32_t R_AL_MASK=U_MASK(U_RIGHT_TO_LEFT)|U_MASK(U_RIGHT_TO_LEFT_ARABIC); 966 const uint32_t L_R_AL_MASK=L_MASK|R_AL_MASK; 967 968 const uint32_t R_AL_AN_MASK=R_AL_MASK|U_MASK(U_ARABIC_NUMBER); 969 970 const uint32_t EN_AN_MASK=U_MASK(U_EUROPEAN_NUMBER)|U_MASK(U_ARABIC_NUMBER); 971 const uint32_t R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK; 972 const uint32_t L_EN_MASK=L_MASK|U_MASK(U_EUROPEAN_NUMBER); 973 974 const uint32_t ES_CS_ET_ON_BN_NSM_MASK= 975 U_MASK(U_EUROPEAN_NUMBER_SEPARATOR)| 976 U_MASK(U_COMMON_NUMBER_SEPARATOR)| 977 U_MASK(U_EUROPEAN_NUMBER_TERMINATOR)| 978 U_MASK(U_OTHER_NEUTRAL)| 979 U_MASK(U_BOUNDARY_NEUTRAL)| 980 U_MASK(U_DIR_NON_SPACING_MARK); 981 const uint32_t L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK; 982 const uint32_t R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK; 983 984 // We scan the whole label and check both for whether it contains RTL characters 985 // and whether it passes the BiDi Rule. 986 // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find 987 // that a domain name is a BiDi domain name (has an RTL label) only after 988 // processing several earlier labels. 989 void 990 UTS46::checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) const { 991 // IDNA2008 BiDi rule 992 // Get the directionality of the first character. 993 UChar32 c; 994 int32_t i=0; 995 U16_NEXT_UNSAFE(label, i, c); 996 uint32_t firstMask=U_MASK(u_charDirection(c)); 997 // 1. The first character must be a character with BIDI property L, R 998 // or AL. If it has the R or AL property, it is an RTL label; if it 999 // has the L property, it is an LTR label. 1000 if((firstMask&~L_R_AL_MASK)!=0) { 1001 info.isOkBiDi=FALSE; 1002 } 1003 // Get the directionality of the last non-NSM character. 1004 uint32_t lastMask; 1005 for(;;) { 1006 if(i>=labelLength) { 1007 lastMask=firstMask; 1008 break; 1009 } 1010 U16_PREV_UNSAFE(label, labelLength, c); 1011 UCharDirection dir=u_charDirection(c); 1012 if(dir!=U_DIR_NON_SPACING_MARK) { 1013 lastMask=U_MASK(dir); 1014 break; 1015 } 1016 } 1017 // 3. In an RTL label, the end of the label must be a character with 1018 // BIDI property R, AL, EN or AN, followed by zero or more 1019 // characters with BIDI property NSM. 1020 // 6. In an LTR label, the end of the label must be a character with 1021 // BIDI property L or EN, followed by zero or more characters with 1022 // BIDI property NSM. 1023 if( (firstMask&L_MASK)!=0 ? 1024 (lastMask&~L_EN_MASK)!=0 : 1025 (lastMask&~R_AL_EN_AN_MASK)!=0 1026 ) { 1027 info.isOkBiDi=FALSE; 1028 } 1029 // Add the directionalities of the intervening characters. 1030 uint32_t mask=firstMask|lastMask; 1031 while(i<labelLength) { 1032 U16_NEXT_UNSAFE(label, i, c); 1033 mask|=U_MASK(u_charDirection(c)); 1034 } 1035 if(firstMask&L_MASK) { 1036 // 5. In an LTR label, only characters with the BIDI properties L, EN, 1037 // ES, CS, ET, ON, BN and NSM are allowed. 1038 if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) { 1039 info.isOkBiDi=FALSE; 1040 } 1041 } else { 1042 // 2. In an RTL label, only characters with the BIDI properties R, AL, 1043 // AN, EN, ES, CS, ET, ON, BN and NSM are allowed. 1044 if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) { 1045 info.isOkBiDi=FALSE; 1046 } 1047 // 4. In an RTL label, if an EN is present, no AN may be present, and 1048 // vice versa. 1049 if((mask&EN_AN_MASK)==EN_AN_MASK) { 1050 info.isOkBiDi=FALSE; 1051 } 1052 } 1053 // An RTL label is a label that contains at least one character of type 1054 // R, AL or AN. [...] 1055 // A "BIDI domain name" is a domain name that contains at least one RTL 1056 // label. [...] 1057 // The following rule, consisting of six conditions, applies to labels 1058 // in BIDI domain names. 1059 if((mask&R_AL_AN_MASK)!=0) { 1060 info.isBiDi=TRUE; 1061 } 1062 } 1063 1064 // Special code for the ASCII prefix of a BiDi domain name. 1065 // The ASCII prefix is all-LTR. 1066 1067 // IDNA2008 BiDi rule, parts relevant to ASCII labels: 1068 // 1. The first character must be a character with BIDI property L [...] 1069 // 5. In an LTR label, only characters with the BIDI properties L, EN, 1070 // ES, CS, ET, ON, BN and NSM are allowed. 1071 // 6. In an LTR label, the end of the label must be a character with 1072 // BIDI property L or EN [...] 1073 1074 // UTF-16 version, called for mapped ASCII prefix. 1075 // Cannot contain uppercase A-Z. 1076 // s[length-1] must be the trailing dot. 1077 static UBool 1078 isASCIIOkBiDi(const UChar *s, int32_t length) { 1079 int32_t labelStart=0; 1080 for(int32_t i=0; i<length; ++i) { 1081 UChar c=s[i]; 1082 if(c==0x2e) { // dot 1083 if(i>labelStart) { 1084 c=s[i-1]; 1085 if(!(0x61<=c && c<=0x7a) && !(0x30<=c && c<=0x39)) { 1086 // Last character in the label is not an L or EN. 1087 return FALSE; 1088 } 1089 } 1090 labelStart=i+1; 1091 } else if(i==labelStart) { 1092 if(!(0x61<=c && c<=0x7a)) { 1093 // First character in the label is not an L. 1094 return FALSE; 1095 } 1096 } else { 1097 if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { 1098 // Intermediate character in the label is a B, S or WS. 1099 return FALSE; 1100 } 1101 } 1102 } 1103 return TRUE; 1104 } 1105 1106 // UTF-8 version, called for source ASCII prefix. 1107 // Can contain uppercase A-Z. 1108 // s[length-1] must be the trailing dot. 1109 static UBool 1110 isASCIIOkBiDi(const char *s, int32_t length) { 1111 int32_t labelStart=0; 1112 for(int32_t i=0; i<length; ++i) { 1113 char c=s[i]; 1114 if(c==0x2e) { // dot 1115 if(i>labelStart) { 1116 c=s[i-1]; 1117 if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a) && !(0x30<=c && c<=0x39)) { 1118 // Last character in the label is not an L or EN. 1119 return FALSE; 1120 } 1121 } 1122 labelStart=i+1; 1123 } else if(i==labelStart) { 1124 if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a)) { 1125 // First character in the label is not an L. 1126 return FALSE; 1127 } 1128 } else { 1129 if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { 1130 // Intermediate character in the label is a B, S or WS. 1131 return FALSE; 1132 } 1133 } 1134 } 1135 return TRUE; 1136 } 1137 1138 UBool 1139 UTS46::isLabelOkContextJ(const UChar *label, int32_t labelLength) const { 1140 // [IDNA2008-Tables] 1141 // 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER 1142 for(int32_t i=0; i<labelLength; ++i) { 1143 if(label[i]==0x200c) { 1144 // Appendix A.1. ZERO WIDTH NON-JOINER 1145 // Rule Set: 1146 // False; 1147 // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; 1148 // If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C 1149 // (Joining_Type:T)*(Joining_Type:{R,D})) Then True; 1150 if(i==0) { 1151 return FALSE; 1152 } 1153 UChar32 c; 1154 int32_t j=i; 1155 U16_PREV_UNSAFE(label, j, c); 1156 if(uts46Norm2.getCombiningClass(c)==9) { 1157 continue; 1158 } 1159 // check precontext (Joining_Type:{L,D})(Joining_Type:T)* 1160 for(;;) { 1161 UJoiningType type=ubidi_getJoiningType(c); 1162 if(type==U_JT_TRANSPARENT) { 1163 if(j==0) { 1164 return FALSE; 1165 } 1166 U16_PREV_UNSAFE(label, j, c); 1167 } else if(type==U_JT_LEFT_JOINING || type==U_JT_DUAL_JOINING) { 1168 break; // precontext fulfilled 1169 } else { 1170 return FALSE; 1171 } 1172 } 1173 // check postcontext (Joining_Type:T)*(Joining_Type:{R,D}) 1174 for(j=i+1;;) { 1175 if(j==labelLength) { 1176 return FALSE; 1177 } 1178 U16_NEXT_UNSAFE(label, j, c); 1179 UJoiningType type=ubidi_getJoiningType(c); 1180 if(type==U_JT_TRANSPARENT) { 1181 // just skip this character 1182 } else if(type==U_JT_RIGHT_JOINING || type==U_JT_DUAL_JOINING) { 1183 break; // postcontext fulfilled 1184 } else { 1185 return FALSE; 1186 } 1187 } 1188 } else if(label[i]==0x200d) { 1189 // Appendix A.2. ZERO WIDTH JOINER (U+200D) 1190 // Rule Set: 1191 // False; 1192 // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; 1193 if(i==0) { 1194 return FALSE; 1195 } 1196 UChar32 c; 1197 int32_t j=i; 1198 U16_PREV_UNSAFE(label, j, c); 1199 if(uts46Norm2.getCombiningClass(c)!=9) { 1200 return FALSE; 1201 } 1202 } 1203 } 1204 return TRUE; 1205 } 1206 1207 void 1208 UTS46::checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const { 1209 int32_t labelEnd=labelLength-1; // inclusive 1210 int32_t arabicDigits=0; // -1 for 066x, +1 for 06Fx 1211 for(int32_t i=0; i<=labelEnd; ++i) { 1212 UChar32 c=label[i]; 1213 if(c<0xb7) { 1214 // ASCII fastpath 1215 } else if(c<=0x6f9) { 1216 if(c==0xb7) { 1217 // Appendix A.3. MIDDLE DOT (U+00B7) 1218 // Rule Set: 1219 // False; 1220 // If Before(cp) .eq. U+006C And 1221 // After(cp) .eq. U+006C Then True; 1222 if(!(0<i && label[i-1]==0x6c && 1223 i<labelEnd && label[i+1]==0x6c)) { 1224 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; 1225 } 1226 } else if(c==0x375) { 1227 // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375) 1228 // Rule Set: 1229 // False; 1230 // If Script(After(cp)) .eq. Greek Then True; 1231 UScriptCode script=USCRIPT_INVALID_CODE; 1232 if(i<labelEnd) { 1233 UErrorCode errorCode=U_ZERO_ERROR; 1234 int32_t j=i+1; 1235 U16_NEXT(label, j, labelLength, c); 1236 script=uscript_getScript(c, &errorCode); 1237 } 1238 if(script!=USCRIPT_GREEK) { 1239 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; 1240 } 1241 } else if(c==0x5f3 || c==0x5f4) { 1242 // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3) 1243 // Rule Set: 1244 // False; 1245 // If Script(Before(cp)) .eq. Hebrew Then True; 1246 // 1247 // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4) 1248 // Rule Set: 1249 // False; 1250 // If Script(Before(cp)) .eq. Hebrew Then True; 1251 UScriptCode script=USCRIPT_INVALID_CODE; 1252 if(0<i) { 1253 UErrorCode errorCode=U_ZERO_ERROR; 1254 int32_t j=i; 1255 U16_PREV(label, 0, j, c); 1256 script=uscript_getScript(c, &errorCode); 1257 } 1258 if(script!=USCRIPT_HEBREW) { 1259 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; 1260 } 1261 } else if(0x660<=c /* && c<=0x6f9 */) { 1262 // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669) 1263 // Rule Set: 1264 // True; 1265 // For All Characters: 1266 // If cp .in. 06F0..06F9 Then False; 1267 // End For; 1268 // 1269 // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9) 1270 // Rule Set: 1271 // True; 1272 // For All Characters: 1273 // If cp .in. 0660..0669 Then False; 1274 // End For; 1275 if(c<=0x669) { 1276 if(arabicDigits>0) { 1277 info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS; 1278 } 1279 arabicDigits=-1; 1280 } else if(0x6f0<=c) { 1281 if(arabicDigits<0) { 1282 info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS; 1283 } 1284 arabicDigits=1; 1285 } 1286 } 1287 } else if(c==0x30fb) { 1288 // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB) 1289 // Rule Set: 1290 // False; 1291 // For All Characters: 1292 // If Script(cp) .in. {Hiragana, Katakana, Han} Then True; 1293 // End For; 1294 UErrorCode errorCode=U_ZERO_ERROR; 1295 for(int j=0;;) { 1296 if(j>labelEnd) { 1297 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; 1298 break; 1299 } 1300 U16_NEXT(label, j, labelLength, c); 1301 UScriptCode script=uscript_getScript(c, &errorCode); 1302 if(script==USCRIPT_HIRAGANA || script==USCRIPT_KATAKANA || script==USCRIPT_HAN) { 1303 break; 1304 } 1305 } 1306 } 1307 } 1308 } 1309 1310 U_NAMESPACE_END 1311 1312 // C API ------------------------------------------------------------------- *** 1313 1314 U_NAMESPACE_USE 1315 1316 U_CAPI UIDNA * U_EXPORT2 1317 uidna_openUTS46(uint32_t options, UErrorCode *pErrorCode) { 1318 return reinterpret_cast<UIDNA *>(IDNA::createUTS46Instance(options, *pErrorCode)); 1319 } 1320 1321 U_CAPI void U_EXPORT2 1322 uidna_close(UIDNA *idna) { 1323 delete reinterpret_cast<IDNA *>(idna); 1324 } 1325 1326 static UBool 1327 checkArgs(const void *label, int32_t length, 1328 void *dest, int32_t capacity, 1329 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1330 if(U_FAILURE(*pErrorCode)) { 1331 return FALSE; 1332 } 1333 // sizeof(UIDNAInfo)=16 in the first API version. 1334 if(pInfo==NULL || pInfo->size<16) { 1335 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1336 return FALSE; 1337 } 1338 if( (label==NULL ? length!=0 : length<-1) || 1339 (dest==NULL ? capacity!=0 : capacity<0) || 1340 (dest==label && label!=NULL) 1341 ) { 1342 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1343 return FALSE; 1344 } 1345 // Set all *pInfo bytes to 0 except for the size field itself. 1346 uprv_memset(&pInfo->size+1, 0, pInfo->size-sizeof(pInfo->size)); 1347 return TRUE; 1348 } 1349 1350 static void 1351 idnaInfoToStruct(IDNAInfo &info, UIDNAInfo *pInfo) { 1352 pInfo->isTransitionalDifferent=info.isTransitionalDifferent(); 1353 pInfo->errors=info.getErrors(); 1354 } 1355 1356 U_CAPI int32_t U_EXPORT2 1357 uidna_labelToASCII(const UIDNA *idna, 1358 const UChar *label, int32_t length, 1359 UChar *dest, int32_t capacity, 1360 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1361 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { 1362 return 0; 1363 } 1364 UnicodeString src((UBool)(length<0), label, length); 1365 UnicodeString destString(dest, 0, capacity); 1366 IDNAInfo info; 1367 reinterpret_cast<const IDNA *>(idna)->labelToASCII(src, destString, info, *pErrorCode); 1368 idnaInfoToStruct(info, pInfo); 1369 return destString.extract(dest, capacity, *pErrorCode); 1370 } 1371 1372 U_CAPI int32_t U_EXPORT2 1373 uidna_labelToUnicode(const UIDNA *idna, 1374 const UChar *label, int32_t length, 1375 UChar *dest, int32_t capacity, 1376 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1377 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { 1378 return 0; 1379 } 1380 UnicodeString src((UBool)(length<0), label, length); 1381 UnicodeString destString(dest, 0, capacity); 1382 IDNAInfo info; 1383 reinterpret_cast<const IDNA *>(idna)->labelToUnicode(src, destString, info, *pErrorCode); 1384 idnaInfoToStruct(info, pInfo); 1385 return destString.extract(dest, capacity, *pErrorCode); 1386 } 1387 1388 U_CAPI int32_t U_EXPORT2 1389 uidna_nameToASCII(const UIDNA *idna, 1390 const UChar *name, int32_t length, 1391 UChar *dest, int32_t capacity, 1392 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1393 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { 1394 return 0; 1395 } 1396 UnicodeString src((UBool)(length<0), name, length); 1397 UnicodeString destString(dest, 0, capacity); 1398 IDNAInfo info; 1399 reinterpret_cast<const IDNA *>(idna)->nameToASCII(src, destString, info, *pErrorCode); 1400 idnaInfoToStruct(info, pInfo); 1401 return destString.extract(dest, capacity, *pErrorCode); 1402 } 1403 1404 U_CAPI int32_t U_EXPORT2 1405 uidna_nameToUnicode(const UIDNA *idna, 1406 const UChar *name, int32_t length, 1407 UChar *dest, int32_t capacity, 1408 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1409 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { 1410 return 0; 1411 } 1412 UnicodeString src((UBool)(length<0), name, length); 1413 UnicodeString destString(dest, 0, capacity); 1414 IDNAInfo info; 1415 reinterpret_cast<const IDNA *>(idna)->nameToUnicode(src, destString, info, *pErrorCode); 1416 idnaInfoToStruct(info, pInfo); 1417 return destString.extract(dest, capacity, *pErrorCode); 1418 } 1419 1420 U_CAPI int32_t U_EXPORT2 1421 uidna_labelToASCII_UTF8(const UIDNA *idna, 1422 const char *label, int32_t length, 1423 char *dest, int32_t capacity, 1424 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1425 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { 1426 return 0; 1427 } 1428 StringPiece src(label, length<0 ? static_cast<int32_t>(uprv_strlen(label)) : length); 1429 CheckedArrayByteSink sink(dest, capacity); 1430 IDNAInfo info; 1431 reinterpret_cast<const IDNA *>(idna)->labelToASCII_UTF8(src, sink, info, *pErrorCode); 1432 idnaInfoToStruct(info, pInfo); 1433 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); 1434 } 1435 1436 U_CAPI int32_t U_EXPORT2 1437 uidna_labelToUnicodeUTF8(const UIDNA *idna, 1438 const char *label, int32_t length, 1439 char *dest, int32_t capacity, 1440 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1441 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { 1442 return 0; 1443 } 1444 StringPiece src(label, length<0 ? static_cast<int32_t>(uprv_strlen(label)) : length); 1445 CheckedArrayByteSink sink(dest, capacity); 1446 IDNAInfo info; 1447 reinterpret_cast<const IDNA *>(idna)->labelToUnicodeUTF8(src, sink, info, *pErrorCode); 1448 idnaInfoToStruct(info, pInfo); 1449 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); 1450 } 1451 1452 U_CAPI int32_t U_EXPORT2 1453 uidna_nameToASCII_UTF8(const UIDNA *idna, 1454 const char *name, int32_t length, 1455 char *dest, int32_t capacity, 1456 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1457 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { 1458 return 0; 1459 } 1460 StringPiece src(name, length<0 ? static_cast<int32_t>(uprv_strlen(name)) : length); 1461 CheckedArrayByteSink sink(dest, capacity); 1462 IDNAInfo info; 1463 reinterpret_cast<const IDNA *>(idna)->nameToASCII_UTF8(src, sink, info, *pErrorCode); 1464 idnaInfoToStruct(info, pInfo); 1465 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); 1466 } 1467 1468 U_CAPI int32_t U_EXPORT2 1469 uidna_nameToUnicodeUTF8(const UIDNA *idna, 1470 const char *name, int32_t length, 1471 char *dest, int32_t capacity, 1472 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1473 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { 1474 return 0; 1475 } 1476 StringPiece src(name, length<0 ? static_cast<int32_t>(uprv_strlen(name)) : length); 1477 CheckedArrayByteSink sink(dest, capacity); 1478 IDNAInfo info; 1479 reinterpret_cast<const IDNA *>(idna)->nameToUnicodeUTF8(src, sink, info, *pErrorCode); 1480 idnaInfoToStruct(info, pInfo); 1481 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); 1482 } 1483 1484 #endif // UCONFIG_NO_IDNA 1485