1 /* 2 ******************************************************************************* 3 * Copyright (C) 2010-2012, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: uts46.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2010mar09 12 * created by: Markus W. Scherer 13 */ 14 15 #include "unicode/utypes.h" 16 17 #if !UCONFIG_NO_IDNA 18 19 #include "unicode/idna.h" 20 #include "unicode/normalizer2.h" 21 #include "unicode/uscript.h" 22 #include "unicode/ustring.h" 23 #include "unicode/utf16.h" 24 #include "cmemory.h" 25 #include "cstring.h" 26 #include "punycode.h" 27 #include "ubidi_props.h" 28 #include "ustr_imp.h" 29 30 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 31 32 // Note about tests for UIDNA_ERROR_DOMAIN_NAME_TOO_LONG: 33 // 34 // The domain name length limit is 255 octets in an internal DNS representation 35 // where the last ("root") label is the empty label 36 // represented by length byte 0 alone. 37 // In a conventional string, this translates to 253 characters, or 254 38 // if there is a trailing dot for the root label. 39 40 U_NAMESPACE_BEGIN 41 42 // Severe errors which usually result in a U+FFFD replacement character in the result string. 
43 const uint32_t severeErrors= 44 UIDNA_ERROR_LEADING_COMBINING_MARK| 45 UIDNA_ERROR_DISALLOWED| 46 UIDNA_ERROR_PUNYCODE| 47 UIDNA_ERROR_LABEL_HAS_DOT| 48 UIDNA_ERROR_INVALID_ACE_LABEL; 49 50 static inline UBool 51 isASCIIString(const UnicodeString &dest) { 52 const UChar *s=dest.getBuffer(); 53 const UChar *limit=s+dest.length(); 54 while(s<limit) { 55 if(*s++>0x7f) { 56 return FALSE; 57 } 58 } 59 return TRUE; 60 } 61 62 static UBool 63 isASCIIOkBiDi(const UChar *s, int32_t length); 64 65 static UBool 66 isASCIIOkBiDi(const char *s, int32_t length); 67 68 // IDNA class default implementations -------------------------------------- *** 69 70 IDNA::~IDNA() {} 71 72 void 73 IDNA::labelToASCII_UTF8(const StringPiece &label, ByteSink &dest, 74 IDNAInfo &info, UErrorCode &errorCode) const { 75 if(U_SUCCESS(errorCode)) { 76 UnicodeString destString; 77 labelToASCII(UnicodeString::fromUTF8(label), destString, 78 info, errorCode).toUTF8(dest); 79 } 80 } 81 82 void 83 IDNA::labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest, 84 IDNAInfo &info, UErrorCode &errorCode) const { 85 if(U_SUCCESS(errorCode)) { 86 UnicodeString destString; 87 labelToUnicode(UnicodeString::fromUTF8(label), destString, 88 info, errorCode).toUTF8(dest); 89 } 90 } 91 92 void 93 IDNA::nameToASCII_UTF8(const StringPiece &name, ByteSink &dest, 94 IDNAInfo &info, UErrorCode &errorCode) const { 95 if(U_SUCCESS(errorCode)) { 96 UnicodeString destString; 97 nameToASCII(UnicodeString::fromUTF8(name), destString, 98 info, errorCode).toUTF8(dest); 99 } 100 } 101 102 void 103 IDNA::nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest, 104 IDNAInfo &info, UErrorCode &errorCode) const { 105 if(U_SUCCESS(errorCode)) { 106 UnicodeString destString; 107 nameToUnicode(UnicodeString::fromUTF8(name), destString, 108 info, errorCode).toUTF8(dest); 109 } 110 } 111 112 UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(IDNA) 113 114 // UTS46 class declaration ------------------------------------------------- *** 115 116 
class UTS46 : public IDNA { 117 public: 118 UTS46(uint32_t options, UErrorCode &errorCode); 119 virtual ~UTS46(); 120 121 virtual UnicodeString & 122 labelToASCII(const UnicodeString &label, UnicodeString &dest, 123 IDNAInfo &info, UErrorCode &errorCode) const; 124 125 virtual UnicodeString & 126 labelToUnicode(const UnicodeString &label, UnicodeString &dest, 127 IDNAInfo &info, UErrorCode &errorCode) const; 128 129 virtual UnicodeString & 130 nameToASCII(const UnicodeString &name, UnicodeString &dest, 131 IDNAInfo &info, UErrorCode &errorCode) const; 132 133 virtual UnicodeString & 134 nameToUnicode(const UnicodeString &name, UnicodeString &dest, 135 IDNAInfo &info, UErrorCode &errorCode) const; 136 137 virtual void 138 labelToASCII_UTF8(const StringPiece &label, ByteSink &dest, 139 IDNAInfo &info, UErrorCode &errorCode) const; 140 141 virtual void 142 labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest, 143 IDNAInfo &info, UErrorCode &errorCode) const; 144 145 virtual void 146 nameToASCII_UTF8(const StringPiece &name, ByteSink &dest, 147 IDNAInfo &info, UErrorCode &errorCode) const; 148 149 virtual void 150 nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest, 151 IDNAInfo &info, UErrorCode &errorCode) const; 152 153 private: 154 UnicodeString & 155 process(const UnicodeString &src, 156 UBool isLabel, UBool toASCII, 157 UnicodeString &dest, 158 IDNAInfo &info, UErrorCode &errorCode) const; 159 160 void 161 processUTF8(const StringPiece &src, 162 UBool isLabel, UBool toASCII, 163 ByteSink &dest, 164 IDNAInfo &info, UErrorCode &errorCode) const; 165 166 UnicodeString & 167 processUnicode(const UnicodeString &src, 168 int32_t labelStart, int32_t mappingStart, 169 UBool isLabel, UBool toASCII, 170 UnicodeString &dest, 171 IDNAInfo &info, UErrorCode &errorCode) const; 172 173 // returns the new dest.length() 174 int32_t 175 mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart, 176 UErrorCode &errorCode) const; 177 178 // returns the 
new label length 179 int32_t 180 processLabel(UnicodeString &dest, 181 int32_t labelStart, int32_t labelLength, 182 UBool toASCII, 183 IDNAInfo &info, UErrorCode &errorCode) const; 184 int32_t 185 markBadACELabel(UnicodeString &dest, 186 int32_t labelStart, int32_t labelLength, 187 UBool toASCII, IDNAInfo &info) const; 188 189 void 190 checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) const; 191 192 UBool 193 isLabelOkContextJ(const UChar *label, int32_t labelLength) const; 194 195 void 196 checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const; 197 198 const Normalizer2 &uts46Norm2; // uts46.nrm 199 uint32_t options; 200 }; 201 202 IDNA * 203 IDNA::createUTS46Instance(uint32_t options, UErrorCode &errorCode) { 204 if(U_SUCCESS(errorCode)) { 205 IDNA *idna=new UTS46(options, errorCode); 206 if(idna==NULL) { 207 errorCode=U_MEMORY_ALLOCATION_ERROR; 208 } else if(U_FAILURE(errorCode)) { 209 delete idna; 210 idna=NULL; 211 } 212 return idna; 213 } else { 214 return NULL; 215 } 216 } 217 218 // UTS46 implementation ---------------------------------------------------- *** 219 220 UTS46::UTS46(uint32_t opt, UErrorCode &errorCode) 221 : uts46Norm2(*Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, errorCode)), 222 options(opt) {} 223 224 UTS46::~UTS46() {} 225 226 UnicodeString & 227 UTS46::labelToASCII(const UnicodeString &label, UnicodeString &dest, 228 IDNAInfo &info, UErrorCode &errorCode) const { 229 return process(label, TRUE, TRUE, dest, info, errorCode); 230 } 231 232 UnicodeString & 233 UTS46::labelToUnicode(const UnicodeString &label, UnicodeString &dest, 234 IDNAInfo &info, UErrorCode &errorCode) const { 235 return process(label, TRUE, FALSE, dest, info, errorCode); 236 } 237 238 UnicodeString & 239 UTS46::nameToASCII(const UnicodeString &name, UnicodeString &dest, 240 IDNAInfo &info, UErrorCode &errorCode) const { 241 process(name, FALSE, TRUE, dest, info, errorCode); 242 if( dest.length()>=254 && 
(info.errors&UIDNA_ERROR_DOMAIN_NAME_TOO_LONG)==0 && 243 isASCIIString(dest) && 244 (dest.length()>254 || dest[253]!=0x2e) 245 ) { 246 info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; 247 } 248 return dest; 249 } 250 251 UnicodeString & 252 UTS46::nameToUnicode(const UnicodeString &name, UnicodeString &dest, 253 IDNAInfo &info, UErrorCode &errorCode) const { 254 return process(name, FALSE, FALSE, dest, info, errorCode); 255 } 256 257 void 258 UTS46::labelToASCII_UTF8(const StringPiece &label, ByteSink &dest, 259 IDNAInfo &info, UErrorCode &errorCode) const { 260 processUTF8(label, TRUE, TRUE, dest, info, errorCode); 261 } 262 263 void 264 UTS46::labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest, 265 IDNAInfo &info, UErrorCode &errorCode) const { 266 processUTF8(label, TRUE, FALSE, dest, info, errorCode); 267 } 268 269 void 270 UTS46::nameToASCII_UTF8(const StringPiece &name, ByteSink &dest, 271 IDNAInfo &info, UErrorCode &errorCode) const { 272 processUTF8(name, FALSE, TRUE, dest, info, errorCode); 273 } 274 275 void 276 UTS46::nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest, 277 IDNAInfo &info, UErrorCode &errorCode) const { 278 processUTF8(name, FALSE, FALSE, dest, info, errorCode); 279 } 280 281 // UTS #46 data for ASCII characters. 282 // The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase 283 // and passes through all other ASCII characters. 284 // If UIDNA_USE_STD3_RULES is set, then non-LDH characters are disallowed 285 // using this data. 286 // The ASCII fastpath also uses this data. 
// Values: -1=disallowed  0==valid  1==mapped (lowercase)
// Indexed by code point; callers only index it with values 0..0x7f.
static const int8_t asciiData[128]={
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 002D..002E; valid  #  HYPHEN-MINUS..FULL STOP
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0, -1,
    // 0030..0039; valid  #  DIGIT ZERO..DIGIT NINE
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
    // 0041..005A; mapped  #  LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
    -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1,
    // 0061..007A; valid  #  LATIN SMALL LETTER A..LATIN SMALL LETTER Z
    -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1
};

// UTF-16 worker behind labelToASCII(), labelToUnicode(), nameToASCII()
// and nameToUnicode().
//
// Copies a leading run of ASCII from src into dest (lowercasing A-Z) while
// tracking label boundaries and hyphen/length errors. If the whole input is
// handled this way, returns directly. Otherwise stops at the first character
// that needs full processing and hands the rest to processUnicode(). The
// loop stops at a non-ASCII character, a disallowed ASCII character when
// UIDNA_USE_STD3_RULES is set, the second hyphen of "??--", or a dot when
// isLabel is TRUE.
//
// src      input string; must not be the same object as dest
// isLabel  TRUE if src is a single label rather than a dot-separated name
// toASCII  TRUE to additionally apply the empty-label and length checks
// dest     receives the result; set to bogus on argument or prior error
// info     reset, then filled with error bits and BiDi flags
// Sets errorCode to U_ILLEGAL_ARGUMENT_ERROR if &dest==&src or
// src.getBuffer() returns NULL, and to U_MEMORY_ALLOCATION_ERROR if dest
// cannot provide a buffer of srcLength.
UnicodeString &
UTS46::process(const UnicodeString &src,
               UBool isLabel, UBool toASCII,
               UnicodeString &dest,
               IDNAInfo &info, UErrorCode &errorCode) const {
    // uts46Norm2.normalize() would do all of this error checking and setup,
    // but with the ASCII fastpath we do not always call it, and do not
    // call it first.
    if(U_FAILURE(errorCode)) {
        dest.setToBogus();
        return dest;
    }
    const UChar *srcArray=src.getBuffer();
    if(&dest==&src || srcArray==NULL) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        dest.setToBogus();
        return dest;
    }
    // Arguments are fine, reset output values.
    dest.remove();
    info.reset();
    int32_t srcLength=src.length();
    if(srcLength==0) {
        if(toASCII) {
            info.errors|=UIDNA_ERROR_EMPTY_LABEL;
        }
        return dest;
    }
    UChar *destArray=dest.getBuffer(srcLength);
    if(destArray==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return dest;
    }
    // ASCII fastpath
    UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
    int32_t labelStart=0;
    int32_t i;
    for(i=0;; ++i) {
        if(i==srcLength) {
            // Reached the end of src entirely on the fastpath.
            if(toASCII) {
                if((i-labelStart)>63) {
                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                }
                // There is a trailing dot if labelStart==i.
                if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
                    info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
                }
            }
            info.errors|=info.labelErrors;
            dest.releaseBuffer(i);
            return dest;
        }
        UChar c=srcArray[i];
        if(c>0x7f) {
            break;
        }
        int cData=asciiData[c];
        if(cData>0) {
            destArray[i]=c+0x20;  // Lowercase an uppercase ASCII letter.
        } else if(cData<0 && disallowNonLDHDot) {
            break;  // Replacing with U+FFFD can be complicated for toASCII.
        } else {
            destArray[i]=c;
            if(c==0x2d) {  // hyphen
                if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
                    // "??--..." is Punycode or forbidden.
                    ++i;  // '-' was copied to dest already
                    break;
                }
                if(i==labelStart) {
                    // label starts with "-"
                    info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
                }
                if((i+1)==srcLength || srcArray[i+1]==0x2e) {
                    // label ends with "-"
                    info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
                }
            } else if(c==0x2e) {  // dot
                if(isLabel) {
                    // Replacing with U+FFFD can be complicated for toASCII.
                    ++i;  // '.' was copied to dest already
                    break;
                }
                if(toASCII) {
                    // Permit an empty label at the end but not elsewhere.
                    if(i==labelStart && i<(srcLength-1)) {
                        info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
                    } else if((i-labelStart)>63) {
                        info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                    }
                }
                // Label finished: fold its errors into the overall result
                // and start collecting for the next label.
                info.errors|=info.labelErrors;
                info.labelErrors=0;
                labelStart=i+1;
            }
        }
    }
    info.errors|=info.labelErrors;
    // dest now holds the first i code units; processUnicode() appends the
    // normalized remainder of src starting at index i.
    dest.releaseBuffer(i);
    processUnicode(src, labelStart, i, isLabel, toASCII, dest, info, errorCode);
    // The labels before labelStart were only seen by the fastpath, so they
    // get their BiDi check here via isASCIIOkBiDi().
    if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
        (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(dest.getBuffer(), labelStart)))
    ) {
        info.errors|=UIDNA_ERROR_BIDI;
    }
    return dest;
}

// UTF-8 worker behind the four *UTF8 member functions.
//
// For inputs of at most 256 bytes it runs the same ASCII fastpath as
// process(), writing into a buffer obtained from dest.GetAppendBuffer().
// If the whole input is handled this way it is appended to dest directly.
// Otherwise the completed ASCII labels are appended to dest, and the
// current label plus everything after it is converted to UTF-16 and passed
// to processUnicode(). Longer inputs are converted to UTF-16 as a whole.
//
// src      input bytes; data()==NULL is only accepted with length 0
// isLabel  TRUE if src is a single label rather than a dot-separated name
// toASCII  TRUE to additionally apply the empty-label and length checks
// dest     sink that receives the UTF-8 result
// info     reset, then filled with error bits and BiDi flags
// Returns without touching info or dest if errorCode indicates a failure
// on entry; sets U_ILLEGAL_ARGUMENT_ERROR for a NULL non-empty src.
void
UTS46::processUTF8(const StringPiece &src,
                   UBool isLabel, UBool toASCII,
                   ByteSink &dest,
                   IDNAInfo &info, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return;
    }
    const char *srcArray=src.data();
    int32_t srcLength=src.length();
    if(srcArray==NULL && srcLength!=0) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    // Arguments are fine, reset output values.
    info.reset();
    if(srcLength==0) {
        if(toASCII) {
            info.errors|=UIDNA_ERROR_EMPTY_LABEL;
        }
        dest.Flush();
        return;
    }
    UnicodeString destString;
    int32_t labelStart=0;
    if(srcLength<=256) {  // length of stackArray[]
        // ASCII fastpath
        char stackArray[256];
        int32_t destCapacity;
        char *destArray=dest.GetAppendBuffer(srcLength, srcLength+20,
                                             stackArray, LENGTHOF(stackArray), &destCapacity);
        UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
        int32_t i;
        for(i=0;; ++i) {
            if(i==srcLength) {
                // Reached the end of src entirely on the fastpath.
                if(toASCII) {
                    if((i-labelStart)>63) {
                        info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                    }
                    // There is a trailing dot if labelStart==i.
                    if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
                        info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
                    }
                }
                info.errors|=info.labelErrors;
                dest.Append(destArray, i);
                dest.Flush();
                return;
            }
            char c=srcArray[i];
            if((int8_t)c<0) {  // (uint8_t)c>0x7f
                break;
            }
            int cData=asciiData[(int)c];  // Cast: gcc warns about indexing with a char.
            if(cData>0) {
                destArray[i]=c+0x20;  // Lowercase an uppercase ASCII letter.
            } else if(cData<0 && disallowNonLDHDot) {
                break;  // Replacing with U+FFFD can be complicated for toASCII.
            } else {
                destArray[i]=c;
                if(c==0x2d) {  // hyphen
                    if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
                        // "??--..." is Punycode or forbidden.
                        // Unlike process(), i is not advanced here: the
                        // hyphen stays outside the fastpath prefix and is
                        // picked up again from src by processUnicode().
                        break;
                    }
                    if(i==labelStart) {
                        // label starts with "-"
                        info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
                    }
                    if((i+1)==srcLength || srcArray[i+1]==0x2e) {
                        // label ends with "-"
                        info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
                    }
                } else if(c==0x2e) {  // dot
                    if(isLabel) {
                        break;  // Replacing with U+FFFD can be complicated for toASCII.
                    }
                    if(toASCII) {
                        // Permit an empty label at the end but not elsewhere.
                        if(i==labelStart && i<(srcLength-1)) {
                            info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
                        } else if((i-labelStart)>63) {
                            info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                        }
                    }
                    // Label finished: fold its errors into the overall
                    // result and start collecting for the next label.
                    info.errors|=info.labelErrors;
                    info.labelErrors=0;
                    labelStart=i+1;
                }
            }
        }
        info.errors|=info.labelErrors;
        // Convert the processed ASCII prefix of the current label to UTF-16.
        int32_t mappingStart=i-labelStart;
        destString=UnicodeString::fromUTF8(StringPiece(destArray+labelStart, mappingStart));
        // Output the previous ASCII labels and process the rest of src in UTF-16.
        dest.Append(destArray, labelStart);
        processUnicode(UnicodeString::fromUTF8(StringPiece(src, labelStart)), 0, mappingStart,
                       isLabel, toASCII,
                       destString, info, errorCode);
    } else {
        // src is too long for the ASCII fastpath implementation.
        processUnicode(UnicodeString::fromUTF8(src), 0, 0,
                       isLabel, toASCII,
                       destString, info, errorCode);
    }
    destString.toUTF8(dest);  // calls dest.Flush()
    if(toASCII && !isLabel) {
        // length==labelStart==254 means that there is a trailing dot (ok) and
        // destString is empty (do not index at 253-labelStart).
        int32_t length=labelStart+destString.length();
        if( length>=254 && isASCIIString(destString) &&
            (length>254 ||
             (labelStart<254 && destString[253-labelStart]!=0x2e))
        ) {
            info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
        }
    }
    // The labels before labelStart were only seen by the fastpath, so they
    // get their BiDi check here, on the source bytes.
    if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
        (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(srcArray, labelStart)))
    ) {
        info.errors|=UIDNA_ERROR_BIDI;
    }
}

// Normalizes the not-yet-processed part of src into dest and then walks
// dest label by label, calling processLabel() for each one.
//
// src           input string
// labelStart    index in dest at which the current label starts
// mappingStart  index in src from which to normalize; if 0, dest is
//               overwritten with the normalized src, otherwise the
//               normalized tail of src is appended to the existing dest
// isLabel       TRUE if dots do not separate labels
// toASCII       selects which NONTRANSITIONAL option bit applies and is
//               passed on to processLabel()
// Deviation characters (U+00DF, U+03C2, U+200C, U+200D) set
// info.isTransDiff. They are mapped via mapDevChars() at most once, and
// only when the applicable NONTRANSITIONAL option bit is not set.
// Returns dest; returns early if errorCode indicates a failure.
UnicodeString &
UTS46::processUnicode(const UnicodeString &src,
                      int32_t labelStart, int32_t mappingStart,
                      UBool isLabel, UBool toASCII,
                      UnicodeString &dest,
                      IDNAInfo &info, UErrorCode &errorCode) const {
    if(mappingStart==0) {
        uts46Norm2.normalize(src, dest, errorCode);
    } else {
        uts46Norm2.normalizeSecondAndAppend(dest, src.tempSubString(mappingStart), errorCode);
    }
    if(U_FAILURE(errorCode)) {
        return dest;
    }
    UBool doMapDevChars=
        toASCII ? (options&UIDNA_NONTRANSITIONAL_TO_ASCII)==0 :
                  (options&UIDNA_NONTRANSITIONAL_TO_UNICODE)==0;
    const UChar *destArray=dest.getBuffer();
    int32_t destLength=dest.length();
    int32_t labelLimit=labelStart;
    while(labelLimit<destLength) {
        UChar c=destArray[labelLimit];
        if(c==0x2e && !isLabel) {
            int32_t labelLength=labelLimit-labelStart;
            int32_t newLength=processLabel(dest, labelStart, labelLength,
                                           toASCII, info, errorCode);
            info.errors|=info.labelErrors;
            info.labelErrors=0;
            if(U_FAILURE(errorCode)) {
                return dest;
            }
            // processLabel() may have changed dest: refetch the buffer and
            // adjust the indexes by the change in label length.
            destArray=dest.getBuffer();
            destLength+=newLength-labelLength;
            labelLimit=labelStart+=newLength+1;
        } else if(0xdf<=c && c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) {
            info.isTransDiff=TRUE;
            if(doMapDevChars) {
                destLength=mapDevChars(dest, labelStart, labelLimit, errorCode);
                if(U_FAILURE(errorCode)) {
                    return dest;
                }
                destArray=dest.getBuffer();
                // Do not increment labelLimit in case c was removed.
                // All deviation characters have been mapped, no need to check for them again.
                doMapDevChars=FALSE;
            } else {
                ++labelLimit;
            }
        } else {
            ++labelLimit;
        }
    }
    // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok)
    // but not an empty label elsewhere nor a completely empty domain name.
    // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0.
    if(0==labelStart || labelStart<labelLimit) {
        processLabel(dest, labelStart, labelLimit-labelStart,
                     toASCII, info, errorCode);
        info.errors|=info.labelErrors;
    }
    return dest;
}

// Maps the deviation characters in dest from index mappingStart to the end:
// U+00DF becomes "ss", U+03C2 becomes U+03C3, U+200C and U+200D are removed.
// If anything was mapped, the text from labelStart to the end is normalized
// again with uts46Norm2 and replaced in dest.
//
// dest          string to modify in place
// labelStart    index from which to re-normalize after mapping
// mappingStart  index of the first deviation character
// Returns the new dest.length(). Sets U_MEMORY_ALLOCATION_ERROR if dest
// cannot provide a writable buffer.
int32_t
UTS46::mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
                   UErrorCode &errorCode) const {
    int32_t length=dest.length();
    // Ask for one extra unit up front if the first mapping is a sharp s,
    // which grows the string.
    UChar *s=dest.getBuffer(dest[mappingStart]==0xdf ? length+1 : length);
    if(s==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return length;
    }
    int32_t capacity=dest.getCapacity();
    UBool didMapDevChars=FALSE;
    int32_t readIndex=mappingStart, writeIndex=mappingStart;
    do {
        UChar c=s[readIndex++];
        switch(c) {
        case 0xdf:
            // Map sharp s to ss.
            didMapDevChars=TRUE;
            s[writeIndex++]=0x73;  // Replace sharp s with first s.
            // Insert second s and account for possible buffer reallocation.
            if(writeIndex==readIndex) {
                // No gap left by earlier removals: make room by shifting
                // the rest of the text up by one unit.
                if(length==capacity) {
                    dest.releaseBuffer(length);
                    s=dest.getBuffer(length+1);
                    if(s==NULL) {
                        errorCode=U_MEMORY_ALLOCATION_ERROR;
                        return length;
                    }
                    capacity=dest.getCapacity();
                }
                u_memmove(s+writeIndex+1, s+writeIndex, length-writeIndex);
                ++readIndex;
            }
            s[writeIndex++]=0x73;
            ++length;
            break;
        case 0x3c2:  // Map final sigma to nonfinal sigma.
            didMapDevChars=TRUE;
            s[writeIndex++]=0x3c3;
            break;
        case 0x200c:  // Ignore/remove ZWNJ.
        case 0x200d:  // Ignore/remove ZWJ.
            didMapDevChars=TRUE;
            --length;
            break;
        default:
            // Only really necessary if writeIndex was different from readIndex.
            s[writeIndex++]=c;
            break;
        }
    } while(writeIndex<length);
    dest.releaseBuffer(length);
    if(didMapDevChars) {
        // Mapping deviation characters might have resulted in an un-NFC string.
        // We could use either the NFC or the UTS #46 normalizer.
        // By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file.
        UnicodeString normalized;
        uts46Norm2.normalize(dest.tempSubString(labelStart), normalized, errorCode);
        if(U_SUCCESS(errorCode)) {
            dest.replace(labelStart, 0x7fffffff, normalized);
            return dest.length();
        }
    }
    return length;
}

// Some non-ASCII characters are equivalent to sequences with
// non-LDH ASCII characters.
// To find them:
// grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt)
// Returns TRUE for U+2260, U+226E and U+226F.
static inline UBool
isNonASCIIDisallowedSTD3Valid(UChar32 c) {
    return c==0x2260 || c==0x226E || c==0x226F;
}

// Replace the label in dest with the label string, if the label was modified.
// If &label==&dest then the label was modified in-place and labelLength
// is the new label length, different from label.length().
// If &label!=&dest then labelLength==label.length().
// Returns labelLength (= the new label length).
static int32_t
replaceLabel(UnicodeString &dest, int32_t destLabelStart, int32_t destLabelLength,
             const UnicodeString &label, int32_t labelLength) {
    if(&label!=&dest) {
        dest.replace(destLabelStart, destLabelLength, label);
    }
    return labelLength;
}

// Validates one label of dest and, for toASCII, converts it to Punycode
// if it contains non-ASCII characters.
//
// A label starting with "xn--" is first decoded from Punycode; the decoded
// text is then validated, and for toASCII the original ACE label is left in
// dest if no severe error was found. Characters that fail validation are
// overwritten with U+FFFD.
//
// dest         string containing the label; modified in place
// labelStart   index of the label in dest
// labelLength  length of the label in dest
// toASCII      TRUE to apply length checks and Punycode encoding
// info         error bits are ORed into info.labelErrors; the BiDi check
//              updates info.isBiDi and info.isOkBiDi
// Returns the new length of the label in dest.
// Sets U_MEMORY_ALLOCATION_ERROR if a temporary buffer cannot be obtained;
// failures from the normalizer and from u_strToPunycode() are passed on in
// errorCode.
int32_t
UTS46::processLabel(UnicodeString &dest,
                    int32_t labelStart, int32_t labelLength,
                    UBool toASCII,
                    IDNAInfo &info, UErrorCode &errorCode) const {
    UnicodeString fromPunycode;
    UnicodeString *labelString;
    const UChar *label=dest.getBuffer()+labelStart;
    // Remember where the label lives in dest; labelStart/labelLength are
    // redirected to fromPunycode below if the label was an ACE label.
    int32_t destLabelStart=labelStart;
    int32_t destLabelLength=labelLength;
    UBool wasPunycode;
    if(labelLength>=4 && label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && label[3]==0x2d) {
        // Label starts with "xn--", try to un-Punycode it.
        wasPunycode=TRUE;
        UChar *unicodeBuffer=fromPunycode.getBuffer(-1);  // capacity==-1: most labels should fit
        if(unicodeBuffer==NULL) {
            // Should never occur if we used capacity==-1 which uses the internal buffer.
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return labelLength;
        }
        // Uses a separate error code so that a Punycode failure is reported
        // as a label error in info rather than through errorCode.
        UErrorCode punycodeErrorCode=U_ZERO_ERROR;
        int32_t unicodeLength=u_strFromPunycode(label+4, labelLength-4,
                                                unicodeBuffer, fromPunycode.getCapacity(),
                                                NULL, &punycodeErrorCode);
        if(punycodeErrorCode==U_BUFFER_OVERFLOW_ERROR) {
            // Retry once with a buffer of the reported length.
            fromPunycode.releaseBuffer(0);
            unicodeBuffer=fromPunycode.getBuffer(unicodeLength);
            if(unicodeBuffer==NULL) {
                errorCode=U_MEMORY_ALLOCATION_ERROR;
                return labelLength;
            }
            punycodeErrorCode=U_ZERO_ERROR;
            unicodeLength=u_strFromPunycode(label+4, labelLength-4,
                                            unicodeBuffer, fromPunycode.getCapacity(),
                                            NULL, &punycodeErrorCode);
        }
        fromPunycode.releaseBuffer(unicodeLength);
        if(U_FAILURE(punycodeErrorCode)) {
            info.labelErrors|=UIDNA_ERROR_PUNYCODE;
            return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
        }
        // Check for NFC, and for characters that are not
        // valid or deviation characters according to the normalizer.
        // If there is something wrong, then the string will change.
        // Note that the normalizer passes through non-LDH ASCII and deviation characters.
        // Deviation characters are ok in Punycode even in transitional processing.
        // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
        // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
        UBool isValid=uts46Norm2.isNormalized(fromPunycode, errorCode);
        if(U_FAILURE(errorCode)) {
            return labelLength;
        }
        if(!isValid) {
            info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
            return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
        }
        // From here on, validate the decoded label instead of the ACE form.
        labelString=&fromPunycode;
        label=fromPunycode.getBuffer();
        labelStart=0;
        labelLength=fromPunycode.length();
    } else {
        wasPunycode=FALSE;
        labelString=&dest;
    }
    // Validity check
    if(labelLength==0) {
        if(toASCII) {
            info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
        }
        return replaceLabel(dest, destLabelStart, destLabelLength, *labelString, labelLength);
    }
    // labelLength>0
    if(labelLength>=4 && label[2]==0x2d && label[3]==0x2d) {
        // label starts with "??--"
        info.labelErrors|=UIDNA_ERROR_HYPHEN_3_4;
    }
    if(label[0]==0x2d) {
        // label starts with "-"
        info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
    }
    if(label[labelLength-1]==0x2d) {
        // label ends with "-"
        info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
    }
    // If the label was not a Punycode label, then it was the result of
    // mapping, normalization and label segmentation.
    // If the label was in Punycode, then we mapped it again above
    // and checked its validity.
    // Now we handle the STD3 restriction to LDH characters (if set)
    // and we look for U+FFFD which indicates disallowed characters
    // in a non-Punycode label or U+FFFD itself in a Punycode label.
    // We also check for dots which can come from the input to a single-label function.
    // Ok to cast away const because we own the UnicodeString.
    UChar *s=(UChar *)label;
    const UChar *limit=label+labelLength;
    // Bitwise OR of all non-ASCII code units in the label; used below to
    // skip the contextual checks and Punycode encoding when not needed.
    UChar oredChars=0;
    // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed.
    UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
    do {
        UChar c=*s;
        if(c<=0x7f) {
            if(c==0x2e) {
                info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT;
                *s=0xfffd;
            } else if(disallowNonLDHDot && asciiData[c]<0) {
                info.labelErrors|=UIDNA_ERROR_DISALLOWED;
                *s=0xfffd;
            }
        } else {
            oredChars|=c;
            if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) {
                info.labelErrors|=UIDNA_ERROR_DISALLOWED;
                *s=0xfffd;
            } else if(c==0xfffd) {
                info.labelErrors|=UIDNA_ERROR_DISALLOWED;
            }
        }
        ++s;
    } while(s<limit);
    // Check for a leading combining mark after other validity checks
    // so that we don't report UIDNA_ERROR_DISALLOWED for the U+FFFD from here.
    UChar32 c;
    int32_t cpLength=0;
    // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD.
    U16_NEXT_UNSAFE(label, cpLength, c);
    if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) {
        info.labelErrors|=UIDNA_ERROR_LEADING_COMBINING_MARK;
        labelString->replace(labelStart, cpLength, (UChar)0xfffd);
        // replace() may have changed the buffer: refetch the label pointer
        // and account for a supplementary mark shrinking to one unit.
        label=labelString->getBuffer()+labelStart;
        labelLength+=1-cpLength;
        if(labelString==&dest) {
            destLabelLength=labelLength;
        }
    }
    if((info.labelErrors&severeErrors)==0) {
        // Do contextual checks only if we do not have U+FFFD from a severe error
        // because U+FFFD can make these checks fail.
        if((options&UIDNA_CHECK_BIDI)!=0 && (!info.isBiDi || info.isOkBiDi)) {
            checkLabelBiDi(label, labelLength, info);
        }
        if( (options&UIDNA_CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c &&
            !isLabelOkContextJ(label, labelLength)
        ) {
            info.labelErrors|=UIDNA_ERROR_CONTEXTJ;
        }
        if((options&UIDNA_CHECK_CONTEXTO)!=0 && oredChars>=0xb7) {
            checkLabelContextO(label, labelLength, info);
        }
        if(toASCII) {
            if(wasPunycode) {
                // Leave a Punycode label unchanged if it has no severe errors.
                if(destLabelLength>63) {
                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                }
                return destLabelLength;
            } else if(oredChars>=0x80) {
                // Contains non-ASCII characters.
                UnicodeString punycode;
                UChar *buffer=punycode.getBuffer(63);  // 63==maximum DNS label length
                if(buffer==NULL) {
                    errorCode=U_MEMORY_ALLOCATION_ERROR;
                    return destLabelLength;
                }
                buffer[0]=0x78;  // Write "xn--".
                buffer[1]=0x6e;
                buffer[2]=0x2d;
                buffer[3]=0x2d;
                int32_t punycodeLength=u_strToPunycode(label, labelLength,
                                                      buffer+4, punycode.getCapacity()-4,
                                                      NULL, &errorCode);
                if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
                    // Retry once with a buffer of the reported length,
                    // keeping the "xn--" prefix already written.
                    errorCode=U_ZERO_ERROR;
                    punycode.releaseBuffer(4);
                    buffer=punycode.getBuffer(4+punycodeLength);
                    if(buffer==NULL) {
                        errorCode=U_MEMORY_ALLOCATION_ERROR;
                        return destLabelLength;
                    }
                    punycodeLength=u_strToPunycode(label, labelLength,
                                                  buffer+4, punycode.getCapacity()-4,
                                                  NULL, &errorCode);
                }
                punycodeLength+=4;
                punycode.releaseBuffer(punycodeLength);
                if(U_FAILURE(errorCode)) {
                    return destLabelLength;
                }
                if(punycodeLength>63) {
                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                }
                return replaceLabel(dest, destLabelStart, destLabelLength,
                                    punycode, punycodeLength);
            } else {
                // all-ASCII label
                if(labelLength>63) {
                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                }
            }
        }
    } else {
        // If a Punycode label has severe errors,
        // then leave it but make sure it does not look valid.
        if(wasPunycode) {
            info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
            return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info);
        }
    }
    return replaceLabel(dest, destLabelStart, destLabelLength, *labelString, labelLength);
}

// Make sure an ACE label does not look valid.
// Append U+FFFD if the label has only LDH characters.
899 // If UIDNA_USE_STD3_RULES, also replace disallowed ASCII characters with U+FFFD. 900 int32_t 901 UTS46::markBadACELabel(UnicodeString &dest, 902 int32_t labelStart, int32_t labelLength, 903 UBool toASCII, IDNAInfo &info) const { 904 UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0; 905 UBool isASCII=TRUE; 906 UBool onlyLDH=TRUE; 907 const UChar *label=dest.getBuffer()+labelStart; 908 // Ok to cast away const because we own the UnicodeString. 909 UChar *s=(UChar *)label+4; // After the initial "xn--". 910 const UChar *limit=label+labelLength; 911 do { 912 UChar c=*s; 913 if(c<=0x7f) { 914 if(c==0x2e) { 915 info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT; 916 *s=0xfffd; 917 isASCII=onlyLDH=FALSE; 918 } else if(asciiData[c]<0) { 919 onlyLDH=FALSE; 920 if(disallowNonLDHDot) { 921 *s=0xfffd; 922 isASCII=FALSE; 923 } 924 } 925 } else { 926 isASCII=onlyLDH=FALSE; 927 } 928 } while(++s<limit); 929 if(onlyLDH) { 930 dest.insert(labelStart+labelLength, (UChar)0xfffd); 931 ++labelLength; 932 } else { 933 if(toASCII && isASCII && labelLength>63) { 934 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; 935 } 936 } 937 return labelLength; 938 } 939 940 const uint32_t L_MASK=U_MASK(U_LEFT_TO_RIGHT); 941 const uint32_t R_AL_MASK=U_MASK(U_RIGHT_TO_LEFT)|U_MASK(U_RIGHT_TO_LEFT_ARABIC); 942 const uint32_t L_R_AL_MASK=L_MASK|R_AL_MASK; 943 944 const uint32_t R_AL_AN_MASK=R_AL_MASK|U_MASK(U_ARABIC_NUMBER); 945 946 const uint32_t EN_AN_MASK=U_MASK(U_EUROPEAN_NUMBER)|U_MASK(U_ARABIC_NUMBER); 947 const uint32_t R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK; 948 const uint32_t L_EN_MASK=L_MASK|U_MASK(U_EUROPEAN_NUMBER); 949 950 const uint32_t ES_CS_ET_ON_BN_NSM_MASK= 951 U_MASK(U_EUROPEAN_NUMBER_SEPARATOR)| 952 U_MASK(U_COMMON_NUMBER_SEPARATOR)| 953 U_MASK(U_EUROPEAN_NUMBER_TERMINATOR)| 954 U_MASK(U_OTHER_NEUTRAL)| 955 U_MASK(U_BOUNDARY_NEUTRAL)| 956 U_MASK(U_DIR_NON_SPACING_MARK); 957 const uint32_t L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK; 958 const uint32_t 
R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK; 959 960 // We scan the whole label and check both for whether it contains RTL characters 961 // and whether it passes the BiDi Rule. 962 // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find 963 // that a domain name is a BiDi domain name (has an RTL label) only after 964 // processing several earlier labels. 965 void 966 UTS46::checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) const { 967 // IDNA2008 BiDi rule 968 // Get the directionality of the first character. 969 UChar32 c; 970 int32_t i=0; 971 U16_NEXT_UNSAFE(label, i, c); 972 uint32_t firstMask=U_MASK(u_charDirection(c)); 973 // 1. The first character must be a character with BIDI property L, R 974 // or AL. If it has the R or AL property, it is an RTL label; if it 975 // has the L property, it is an LTR label. 976 if((firstMask&~L_R_AL_MASK)!=0) { 977 info.isOkBiDi=FALSE; 978 } 979 // Get the directionality of the last non-NSM character. 980 uint32_t lastMask; 981 for(;;) { 982 if(i>=labelLength) { 983 lastMask=firstMask; 984 break; 985 } 986 U16_PREV_UNSAFE(label, labelLength, c); 987 UCharDirection dir=u_charDirection(c); 988 if(dir!=U_DIR_NON_SPACING_MARK) { 989 lastMask=U_MASK(dir); 990 break; 991 } 992 } 993 // 3. In an RTL label, the end of the label must be a character with 994 // BIDI property R, AL, EN or AN, followed by zero or more 995 // characters with BIDI property NSM. 996 // 6. In an LTR label, the end of the label must be a character with 997 // BIDI property L or EN, followed by zero or more characters with 998 // BIDI property NSM. 999 if( (firstMask&L_MASK)!=0 ? 1000 (lastMask&~L_EN_MASK)!=0 : 1001 (lastMask&~R_AL_EN_AN_MASK)!=0 1002 ) { 1003 info.isOkBiDi=FALSE; 1004 } 1005 // Get the directionalities of the intervening characters. 
1006 uint32_t mask=0; 1007 while(i<labelLength) { 1008 U16_NEXT_UNSAFE(label, i, c); 1009 mask|=U_MASK(u_charDirection(c)); 1010 } 1011 if(firstMask&L_MASK) { 1012 // 5. In an LTR label, only characters with the BIDI properties L, EN, 1013 // ES, CS, ET, ON, BN and NSM are allowed. 1014 if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) { 1015 info.isOkBiDi=FALSE; 1016 } 1017 } else { 1018 // 2. In an RTL label, only characters with the BIDI properties R, AL, 1019 // AN, EN, ES, CS, ET, ON, BN and NSM are allowed. 1020 if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) { 1021 info.isOkBiDi=FALSE; 1022 } 1023 // 4. In an RTL label, if an EN is present, no AN may be present, and 1024 // vice versa. 1025 if((mask&EN_AN_MASK)==EN_AN_MASK) { 1026 info.isOkBiDi=FALSE; 1027 } 1028 } 1029 // An RTL label is a label that contains at least one character of type 1030 // R, AL or AN. [...] 1031 // A "BIDI domain name" is a domain name that contains at least one RTL 1032 // label. [...] 1033 // The following rule, consisting of six conditions, applies to labels 1034 // in BIDI domain names. 1035 if(((firstMask|mask|lastMask)&R_AL_AN_MASK)!=0) { 1036 info.isBiDi=TRUE; 1037 } 1038 } 1039 1040 // Special code for the ASCII prefix of a BiDi domain name. 1041 // The ASCII prefix is all-LTR. 1042 1043 // IDNA2008 BiDi rule, parts relevant to ASCII labels: 1044 // 1. The first character must be a character with BIDI property L [...] 1045 // 5. In an LTR label, only characters with the BIDI properties L, EN, 1046 // ES, CS, ET, ON, BN and NSM are allowed. 1047 // 6. In an LTR label, the end of the label must be a character with 1048 // BIDI property L or EN [...] 1049 1050 // UTF-16 version, called for mapped ASCII prefix. 1051 // Cannot contain uppercase A-Z. 1052 // s[length-1] must be the trailing dot. 
1053 static UBool 1054 isASCIIOkBiDi(const UChar *s, int32_t length) { 1055 int32_t labelStart=0; 1056 for(int32_t i=0; i<length; ++i) { 1057 UChar c=s[i]; 1058 if(c==0x2e) { // dot 1059 if(i>labelStart) { 1060 c=s[i-1]; 1061 if(!(0x61<=c && c<=0x7a) && !(0x30<=c && c<=0x39)) { 1062 // Last character in the label is not an L or EN. 1063 return FALSE; 1064 } 1065 } 1066 labelStart=i+1; 1067 } else if(i==labelStart) { 1068 if(!(0x61<=c && c<=0x7a)) { 1069 // First character in the label is not an L. 1070 return FALSE; 1071 } 1072 } else { 1073 if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { 1074 // Intermediate character in the label is a B, S or WS. 1075 return FALSE; 1076 } 1077 } 1078 } 1079 return TRUE; 1080 } 1081 1082 // UTF-8 version, called for source ASCII prefix. 1083 // Can contain uppercase A-Z. 1084 // s[length-1] must be the trailing dot. 1085 static UBool 1086 isASCIIOkBiDi(const char *s, int32_t length) { 1087 int32_t labelStart=0; 1088 for(int32_t i=0; i<length; ++i) { 1089 char c=s[i]; 1090 if(c==0x2e) { // dot 1091 if(i>labelStart) { 1092 c=s[i-1]; 1093 if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a) && !(0x30<=c && c<=0x39)) { 1094 // Last character in the label is not an L or EN. 1095 return FALSE; 1096 } 1097 } 1098 labelStart=i+1; 1099 } else if(i==labelStart) { 1100 if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a)) { 1101 // First character in the label is not an L. 1102 return FALSE; 1103 } 1104 } else { 1105 if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { 1106 // Intermediate character in the label is a B, S or WS. 1107 return FALSE; 1108 } 1109 } 1110 } 1111 return TRUE; 1112 } 1113 1114 UBool 1115 UTS46::isLabelOkContextJ(const UChar *label, int32_t labelLength) const { 1116 const UBiDiProps *bdp=ubidi_getSingleton(); 1117 // [IDNA2008-Tables] 1118 // 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER 1119 for(int32_t i=0; i<labelLength; ++i) { 1120 if(label[i]==0x200c) { 1121 // Appendix A.1. 
ZERO WIDTH NON-JOINER 1122 // Rule Set: 1123 // False; 1124 // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; 1125 // If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C 1126 // (Joining_Type:T)*(Joining_Type:{R,D})) Then True; 1127 if(i==0) { 1128 return FALSE; 1129 } 1130 UChar32 c; 1131 int32_t j=i; 1132 U16_PREV_UNSAFE(label, j, c); 1133 if(uts46Norm2.getCombiningClass(c)==9) { 1134 continue; 1135 } 1136 // check precontext (Joining_Type:{L,D})(Joining_Type:T)* 1137 for(;;) { 1138 UJoiningType type=ubidi_getJoiningType(bdp, c); 1139 if(type==U_JT_TRANSPARENT) { 1140 if(j==0) { 1141 return FALSE; 1142 } 1143 U16_PREV_UNSAFE(label, j, c); 1144 } else if(type==U_JT_LEFT_JOINING || type==U_JT_DUAL_JOINING) { 1145 break; // precontext fulfilled 1146 } else { 1147 return FALSE; 1148 } 1149 } 1150 // check postcontext (Joining_Type:T)*(Joining_Type:{R,D}) 1151 for(j=i+1;;) { 1152 if(j==labelLength) { 1153 return FALSE; 1154 } 1155 U16_NEXT_UNSAFE(label, j, c); 1156 UJoiningType type=ubidi_getJoiningType(bdp, c); 1157 if(type==U_JT_TRANSPARENT) { 1158 // just skip this character 1159 } else if(type==U_JT_RIGHT_JOINING || type==U_JT_DUAL_JOINING) { 1160 break; // postcontext fulfilled 1161 } else { 1162 return FALSE; 1163 } 1164 } 1165 } else if(label[i]==0x200d) { 1166 // Appendix A.2. ZERO WIDTH JOINER (U+200D) 1167 // Rule Set: 1168 // False; 1169 // If Canonical_Combining_Class(Before(cp)) .eq. 
Virama Then True; 1170 if(i==0) { 1171 return FALSE; 1172 } 1173 UChar32 c; 1174 int32_t j=i; 1175 U16_PREV_UNSAFE(label, j, c); 1176 if(uts46Norm2.getCombiningClass(c)!=9) { 1177 return FALSE; 1178 } 1179 } 1180 } 1181 return TRUE; 1182 } 1183 1184 void 1185 UTS46::checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const { 1186 int32_t labelEnd=labelLength-1; // inclusive 1187 int32_t arabicDigits=0; // -1 for 066x, +1 for 06Fx 1188 for(int32_t i=0; i<=labelEnd; ++i) { 1189 UChar32 c=label[i]; 1190 if(c<0xb7) { 1191 // ASCII fastpath 1192 } else if(c<=0x6f9) { 1193 if(c==0xb7) { 1194 // Appendix A.3. MIDDLE DOT (U+00B7) 1195 // Rule Set: 1196 // False; 1197 // If Before(cp) .eq. U+006C And 1198 // After(cp) .eq. U+006C Then True; 1199 if(!(0<i && label[i-1]==0x6c && 1200 i<labelEnd && label[i+1]==0x6c)) { 1201 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; 1202 } 1203 } else if(c==0x375) { 1204 // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375) 1205 // Rule Set: 1206 // False; 1207 // If Script(After(cp)) .eq. Greek Then True; 1208 UScriptCode script=USCRIPT_INVALID_CODE; 1209 if(i<labelEnd) { 1210 UErrorCode errorCode=U_ZERO_ERROR; 1211 int32_t j=i+1; 1212 U16_NEXT(label, j, labelLength, c); 1213 script=uscript_getScript(c, &errorCode); 1214 } 1215 if(script!=USCRIPT_GREEK) { 1216 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; 1217 } 1218 } else if(c==0x5f3 || c==0x5f4) { 1219 // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3) 1220 // Rule Set: 1221 // False; 1222 // If Script(Before(cp)) .eq. Hebrew Then True; 1223 // 1224 // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4) 1225 // Rule Set: 1226 // False; 1227 // If Script(Before(cp)) .eq. 
Hebrew Then True; 1228 UScriptCode script=USCRIPT_INVALID_CODE; 1229 if(0<i) { 1230 UErrorCode errorCode=U_ZERO_ERROR; 1231 int32_t j=i; 1232 U16_PREV(label, 0, j, c); 1233 script=uscript_getScript(c, &errorCode); 1234 } 1235 if(script!=USCRIPT_HEBREW) { 1236 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; 1237 } 1238 } else if(0x660<=c /* && c<=0x6f9 */) { 1239 // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669) 1240 // Rule Set: 1241 // True; 1242 // For All Characters: 1243 // If cp .in. 06F0..06F9 Then False; 1244 // End For; 1245 // 1246 // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9) 1247 // Rule Set: 1248 // True; 1249 // For All Characters: 1250 // If cp .in. 0660..0669 Then False; 1251 // End For; 1252 if(c<=0x669) { 1253 if(arabicDigits>0) { 1254 info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS; 1255 } 1256 arabicDigits=-1; 1257 } else if(0x6f0<=c) { 1258 if(arabicDigits<0) { 1259 info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS; 1260 } 1261 arabicDigits=1; 1262 } 1263 } 1264 } else if(c==0x30fb) { 1265 // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB) 1266 // Rule Set: 1267 // False; 1268 // For All Characters: 1269 // If Script(cp) .in. 
{Hiragana, Katakana, Han} Then True; 1270 // End For; 1271 UErrorCode errorCode=U_ZERO_ERROR; 1272 for(int j=0;;) { 1273 if(j>labelEnd) { 1274 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; 1275 break; 1276 } 1277 U16_NEXT(label, j, labelLength, c); 1278 UScriptCode script=uscript_getScript(c, &errorCode); 1279 if(script==USCRIPT_HIRAGANA || script==USCRIPT_KATAKANA || script==USCRIPT_HAN) { 1280 break; 1281 } 1282 } 1283 } 1284 } 1285 } 1286 1287 U_NAMESPACE_END 1288 1289 // C API ------------------------------------------------------------------- *** 1290 1291 U_NAMESPACE_USE 1292 1293 U_CAPI UIDNA * U_EXPORT2 1294 uidna_openUTS46(uint32_t options, UErrorCode *pErrorCode) { 1295 return reinterpret_cast<UIDNA *>(IDNA::createUTS46Instance(options, *pErrorCode)); 1296 } 1297 1298 U_CAPI void U_EXPORT2 1299 uidna_close(UIDNA *idna) { 1300 delete reinterpret_cast<IDNA *>(idna); 1301 } 1302 1303 static UBool 1304 checkArgs(const void *label, int32_t length, 1305 void *dest, int32_t capacity, 1306 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1307 if(U_FAILURE(*pErrorCode)) { 1308 return FALSE; 1309 } 1310 // sizeof(UIDNAInfo)=16 in the first API version. 1311 if(pInfo==NULL || pInfo->size<16) { 1312 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1313 return FALSE; 1314 } 1315 if( (label==NULL ? length!=0 : length<-1) || 1316 (dest==NULL ? capacity!=0 : capacity<0) || 1317 (dest==label && label!=NULL) 1318 ) { 1319 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1320 return FALSE; 1321 } 1322 // Set all *pInfo bytes to 0 except for the size field itself. 
1323 uprv_memset(&pInfo->size+1, 0, pInfo->size-sizeof(pInfo->size)); 1324 return TRUE; 1325 } 1326 1327 static void 1328 idnaInfoToStruct(IDNAInfo &info, UIDNAInfo *pInfo) { 1329 pInfo->isTransitionalDifferent=info.isTransitionalDifferent(); 1330 pInfo->errors=info.getErrors(); 1331 } 1332 1333 U_CAPI int32_t U_EXPORT2 1334 uidna_labelToASCII(const UIDNA *idna, 1335 const UChar *label, int32_t length, 1336 UChar *dest, int32_t capacity, 1337 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1338 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { 1339 return 0; 1340 } 1341 UnicodeString src((UBool)(length<0), label, length); 1342 UnicodeString destString(dest, 0, capacity); 1343 IDNAInfo info; 1344 reinterpret_cast<const IDNA *>(idna)->labelToASCII(src, destString, info, *pErrorCode); 1345 idnaInfoToStruct(info, pInfo); 1346 return destString.extract(dest, capacity, *pErrorCode); 1347 } 1348 1349 U_CAPI int32_t U_EXPORT2 1350 uidna_labelToUnicode(const UIDNA *idna, 1351 const UChar *label, int32_t length, 1352 UChar *dest, int32_t capacity, 1353 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1354 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { 1355 return 0; 1356 } 1357 UnicodeString src((UBool)(length<0), label, length); 1358 UnicodeString destString(dest, 0, capacity); 1359 IDNAInfo info; 1360 reinterpret_cast<const IDNA *>(idna)->labelToUnicode(src, destString, info, *pErrorCode); 1361 idnaInfoToStruct(info, pInfo); 1362 return destString.extract(dest, capacity, *pErrorCode); 1363 } 1364 1365 U_CAPI int32_t U_EXPORT2 1366 uidna_nameToASCII(const UIDNA *idna, 1367 const UChar *name, int32_t length, 1368 UChar *dest, int32_t capacity, 1369 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1370 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { 1371 return 0; 1372 } 1373 UnicodeString src((UBool)(length<0), name, length); 1374 UnicodeString destString(dest, 0, capacity); 1375 IDNAInfo info; 1376 reinterpret_cast<const IDNA 
*>(idna)->nameToASCII(src, destString, info, *pErrorCode); 1377 idnaInfoToStruct(info, pInfo); 1378 return destString.extract(dest, capacity, *pErrorCode); 1379 } 1380 1381 U_CAPI int32_t U_EXPORT2 1382 uidna_nameToUnicode(const UIDNA *idna, 1383 const UChar *name, int32_t length, 1384 UChar *dest, int32_t capacity, 1385 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1386 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { 1387 return 0; 1388 } 1389 UnicodeString src((UBool)(length<0), name, length); 1390 UnicodeString destString(dest, 0, capacity); 1391 IDNAInfo info; 1392 reinterpret_cast<const IDNA *>(idna)->nameToUnicode(src, destString, info, *pErrorCode); 1393 idnaInfoToStruct(info, pInfo); 1394 return destString.extract(dest, capacity, *pErrorCode); 1395 } 1396 1397 U_CAPI int32_t U_EXPORT2 1398 uidna_labelToASCII_UTF8(const UIDNA *idna, 1399 const char *label, int32_t length, 1400 char *dest, int32_t capacity, 1401 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1402 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { 1403 return 0; 1404 } 1405 StringPiece src(label, length<0 ? uprv_strlen(label) : length); 1406 CheckedArrayByteSink sink(dest, capacity); 1407 IDNAInfo info; 1408 reinterpret_cast<const IDNA *>(idna)->labelToASCII_UTF8(src, sink, info, *pErrorCode); 1409 idnaInfoToStruct(info, pInfo); 1410 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); 1411 } 1412 1413 U_CAPI int32_t U_EXPORT2 1414 uidna_labelToUnicodeUTF8(const UIDNA *idna, 1415 const char *label, int32_t length, 1416 char *dest, int32_t capacity, 1417 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1418 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { 1419 return 0; 1420 } 1421 StringPiece src(label, length<0 ? 
uprv_strlen(label) : length); 1422 CheckedArrayByteSink sink(dest, capacity); 1423 IDNAInfo info; 1424 reinterpret_cast<const IDNA *>(idna)->labelToUnicodeUTF8(src, sink, info, *pErrorCode); 1425 idnaInfoToStruct(info, pInfo); 1426 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); 1427 } 1428 1429 U_CAPI int32_t U_EXPORT2 1430 uidna_nameToASCII_UTF8(const UIDNA *idna, 1431 const char *name, int32_t length, 1432 char *dest, int32_t capacity, 1433 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1434 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { 1435 return 0; 1436 } 1437 StringPiece src(name, length<0 ? uprv_strlen(name) : length); 1438 CheckedArrayByteSink sink(dest, capacity); 1439 IDNAInfo info; 1440 reinterpret_cast<const IDNA *>(idna)->nameToASCII_UTF8(src, sink, info, *pErrorCode); 1441 idnaInfoToStruct(info, pInfo); 1442 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); 1443 } 1444 1445 U_CAPI int32_t U_EXPORT2 1446 uidna_nameToUnicodeUTF8(const UIDNA *idna, 1447 const char *name, int32_t length, 1448 char *dest, int32_t capacity, 1449 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1450 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { 1451 return 0; 1452 } 1453 StringPiece src(name, length<0 ? uprv_strlen(name) : length); 1454 CheckedArrayByteSink sink(dest, capacity); 1455 IDNAInfo info; 1456 reinterpret_cast<const IDNA *>(idna)->nameToUnicodeUTF8(src, sink, info, *pErrorCode); 1457 idnaInfoToStruct(info, pInfo); 1458 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); 1459 } 1460 1461 #endif // UCONFIG_NO_IDNA 1462