1 /* 2 ******************************************************************************* 3 * Copyright (C) 2010-2012, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: uts46.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2010mar09 12 * created by: Markus W. Scherer 13 */ 14 15 #include "unicode/utypes.h" 16 17 #if !UCONFIG_NO_IDNA 18 19 #include "unicode/idna.h" 20 #include "unicode/normalizer2.h" 21 #include "unicode/uscript.h" 22 #include "unicode/ustring.h" 23 #include "unicode/utf16.h" 24 #include "cmemory.h" 25 #include "cstring.h" 26 #include "punycode.h" 27 #include "ubidi_props.h" 28 #include "ustr_imp.h" 29 30 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 31 32 // Note about tests for UIDNA_ERROR_DOMAIN_NAME_TOO_LONG: 33 // 34 // The domain name length limit is 255 octets in an internal DNS representation 35 // where the last ("root") label is the empty label 36 // represented by length byte 0 alone. 37 // In a conventional string, this translates to 253 characters, or 254 38 // if there is a trailing dot for the root label. 39 40 U_NAMESPACE_BEGIN 41 42 // Severe errors which usually result in a U+FFFD replacement character in the result string. 
43 const uint32_t severeErrors= 44 UIDNA_ERROR_LEADING_COMBINING_MARK| 45 UIDNA_ERROR_DISALLOWED| 46 UIDNA_ERROR_PUNYCODE| 47 UIDNA_ERROR_LABEL_HAS_DOT| 48 UIDNA_ERROR_INVALID_ACE_LABEL; 49 50 static inline UBool 51 isASCIIString(const UnicodeString &dest) { 52 const UChar *s=dest.getBuffer(); 53 const UChar *limit=s+dest.length(); 54 while(s<limit) { 55 if(*s++>0x7f) { 56 return FALSE; 57 } 58 } 59 return TRUE; 60 } 61 62 static UBool 63 isASCIIOkBiDi(const UChar *s, int32_t length); 64 65 static UBool 66 isASCIIOkBiDi(const char *s, int32_t length); 67 68 // IDNA class default implementations -------------------------------------- *** 69 70 IDNA::~IDNA() {} 71 72 void 73 IDNA::labelToASCII_UTF8(const StringPiece &label, ByteSink &dest, 74 IDNAInfo &info, UErrorCode &errorCode) const { 75 if(U_SUCCESS(errorCode)) { 76 UnicodeString destString; 77 labelToASCII(UnicodeString::fromUTF8(label), destString, 78 info, errorCode).toUTF8(dest); 79 } 80 } 81 82 void 83 IDNA::labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest, 84 IDNAInfo &info, UErrorCode &errorCode) const { 85 if(U_SUCCESS(errorCode)) { 86 UnicodeString destString; 87 labelToUnicode(UnicodeString::fromUTF8(label), destString, 88 info, errorCode).toUTF8(dest); 89 } 90 } 91 92 void 93 IDNA::nameToASCII_UTF8(const StringPiece &name, ByteSink &dest, 94 IDNAInfo &info, UErrorCode &errorCode) const { 95 if(U_SUCCESS(errorCode)) { 96 UnicodeString destString; 97 nameToASCII(UnicodeString::fromUTF8(name), destString, 98 info, errorCode).toUTF8(dest); 99 } 100 } 101 102 void 103 IDNA::nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest, 104 IDNAInfo &info, UErrorCode &errorCode) const { 105 if(U_SUCCESS(errorCode)) { 106 UnicodeString destString; 107 nameToUnicode(UnicodeString::fromUTF8(name), destString, 108 info, errorCode).toUTF8(dest); 109 } 110 } 111 112 // UTS46 class declaration ------------------------------------------------- *** 113 114 class UTS46 : public IDNA { 115 public: 116 
UTS46(uint32_t options, UErrorCode &errorCode); 117 virtual ~UTS46(); 118 119 virtual UnicodeString & 120 labelToASCII(const UnicodeString &label, UnicodeString &dest, 121 IDNAInfo &info, UErrorCode &errorCode) const; 122 123 virtual UnicodeString & 124 labelToUnicode(const UnicodeString &label, UnicodeString &dest, 125 IDNAInfo &info, UErrorCode &errorCode) const; 126 127 virtual UnicodeString & 128 nameToASCII(const UnicodeString &name, UnicodeString &dest, 129 IDNAInfo &info, UErrorCode &errorCode) const; 130 131 virtual UnicodeString & 132 nameToUnicode(const UnicodeString &name, UnicodeString &dest, 133 IDNAInfo &info, UErrorCode &errorCode) const; 134 135 virtual void 136 labelToASCII_UTF8(const StringPiece &label, ByteSink &dest, 137 IDNAInfo &info, UErrorCode &errorCode) const; 138 139 virtual void 140 labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest, 141 IDNAInfo &info, UErrorCode &errorCode) const; 142 143 virtual void 144 nameToASCII_UTF8(const StringPiece &name, ByteSink &dest, 145 IDNAInfo &info, UErrorCode &errorCode) const; 146 147 virtual void 148 nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest, 149 IDNAInfo &info, UErrorCode &errorCode) const; 150 151 private: 152 UnicodeString & 153 process(const UnicodeString &src, 154 UBool isLabel, UBool toASCII, 155 UnicodeString &dest, 156 IDNAInfo &info, UErrorCode &errorCode) const; 157 158 void 159 processUTF8(const StringPiece &src, 160 UBool isLabel, UBool toASCII, 161 ByteSink &dest, 162 IDNAInfo &info, UErrorCode &errorCode) const; 163 164 UnicodeString & 165 processUnicode(const UnicodeString &src, 166 int32_t labelStart, int32_t mappingStart, 167 UBool isLabel, UBool toASCII, 168 UnicodeString &dest, 169 IDNAInfo &info, UErrorCode &errorCode) const; 170 171 // returns the new dest.length() 172 int32_t 173 mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart, 174 UErrorCode &errorCode) const; 175 176 // returns the new label length 177 int32_t 178 
processLabel(UnicodeString &dest, 179 int32_t labelStart, int32_t labelLength, 180 UBool toASCII, 181 IDNAInfo &info, UErrorCode &errorCode) const; 182 int32_t 183 markBadACELabel(UnicodeString &dest, 184 int32_t labelStart, int32_t labelLength, 185 UBool toASCII, IDNAInfo &info) const; 186 187 void 188 checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) const; 189 190 UBool 191 isLabelOkContextJ(const UChar *label, int32_t labelLength) const; 192 193 void 194 checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const; 195 196 const Normalizer2 &uts46Norm2; // uts46.nrm 197 uint32_t options; 198 }; 199 200 IDNA * 201 IDNA::createUTS46Instance(uint32_t options, UErrorCode &errorCode) { 202 if(U_SUCCESS(errorCode)) { 203 IDNA *idna=new UTS46(options, errorCode); 204 if(idna==NULL) { 205 errorCode=U_MEMORY_ALLOCATION_ERROR; 206 } else if(U_FAILURE(errorCode)) { 207 delete idna; 208 idna=NULL; 209 } 210 return idna; 211 } else { 212 return NULL; 213 } 214 } 215 216 // UTS46 implementation ---------------------------------------------------- *** 217 218 UTS46::UTS46(uint32_t opt, UErrorCode &errorCode) 219 : uts46Norm2(*Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, errorCode)), 220 options(opt) {} 221 222 UTS46::~UTS46() {} 223 224 UnicodeString & 225 UTS46::labelToASCII(const UnicodeString &label, UnicodeString &dest, 226 IDNAInfo &info, UErrorCode &errorCode) const { 227 return process(label, TRUE, TRUE, dest, info, errorCode); 228 } 229 230 UnicodeString & 231 UTS46::labelToUnicode(const UnicodeString &label, UnicodeString &dest, 232 IDNAInfo &info, UErrorCode &errorCode) const { 233 return process(label, TRUE, FALSE, dest, info, errorCode); 234 } 235 236 UnicodeString & 237 UTS46::nameToASCII(const UnicodeString &name, UnicodeString &dest, 238 IDNAInfo &info, UErrorCode &errorCode) const { 239 process(name, FALSE, TRUE, dest, info, errorCode); 240 if( dest.length()>=254 && 
(info.errors&UIDNA_ERROR_DOMAIN_NAME_TOO_LONG)==0 && 241 isASCIIString(dest) && 242 (dest.length()>254 || dest[253]!=0x2e) 243 ) { 244 info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; 245 } 246 return dest; 247 } 248 249 UnicodeString & 250 UTS46::nameToUnicode(const UnicodeString &name, UnicodeString &dest, 251 IDNAInfo &info, UErrorCode &errorCode) const { 252 return process(name, FALSE, FALSE, dest, info, errorCode); 253 } 254 255 void 256 UTS46::labelToASCII_UTF8(const StringPiece &label, ByteSink &dest, 257 IDNAInfo &info, UErrorCode &errorCode) const { 258 processUTF8(label, TRUE, TRUE, dest, info, errorCode); 259 } 260 261 void 262 UTS46::labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest, 263 IDNAInfo &info, UErrorCode &errorCode) const { 264 processUTF8(label, TRUE, FALSE, dest, info, errorCode); 265 } 266 267 void 268 UTS46::nameToASCII_UTF8(const StringPiece &name, ByteSink &dest, 269 IDNAInfo &info, UErrorCode &errorCode) const { 270 processUTF8(name, FALSE, TRUE, dest, info, errorCode); 271 } 272 273 void 274 UTS46::nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest, 275 IDNAInfo &info, UErrorCode &errorCode) const { 276 processUTF8(name, FALSE, FALSE, dest, info, errorCode); 277 } 278 279 // UTS #46 data for ASCII characters. 280 // The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase 281 // and passes through all other ASCII characters. 282 // If UIDNA_USE_STD3_RULES is set, then non-LDH characters are disallowed 283 // using this data. 284 // The ASCII fastpath also uses this data. 
// Values: -1=disallowed  0==valid  1==mapped (lowercase)
// The table is indexed by the ASCII code point (0..0x7f).
static const int8_t asciiData[128]={
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 002D..002E; valid  #  HYPHEN-MINUS..FULL STOP
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0, -1,
    // 0030..0039; valid  #  DIGIT ZERO..DIGIT NINE
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
    // 0041..005A; mapped  #  LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
    -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1,
    // 0061..007A; valid  #  LATIN SMALL LETTER A..LATIN SMALL LETTER Z
    -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1
};

// Common UTF-16 implementation behind the four public UTF-16 functions.
//
// src:       input label or domain name; must not be the same object as dest
//            and must have a buffer, otherwise U_ILLEGAL_ARGUMENT_ERROR is set
//            and dest is set to bogus.
// isLabel:   TRUE if src is a single label; a dot then ends the ASCII fastpath
//            and is left to processUnicode().
// toASCII:   TRUE to enable the length and empty-label checks of the fastpath;
//            also passed on to processUnicode().
// dest:      receives the result; it is cleared first.
// info:      reset first; receives the accumulated error flags.
// errorCode: if already a failure, dest is set to bogus and nothing else is done.
//
// The function first copies leading ASCII characters into dest (lowercasing
// A-Z) while tracking label boundaries. If the whole input is handled that way
// then it returns immediately. Otherwise the remainder, starting at the index
// where the fastpath stopped, is handed to processUnicode(), followed by a
// BiDi check of the ASCII-only labels that preceded the current label.
// Returns dest.
UnicodeString &
UTS46::process(const UnicodeString &src,
               UBool isLabel, UBool toASCII,
               UnicodeString &dest,
               IDNAInfo &info, UErrorCode &errorCode) const {
    // uts46Norm2.normalize() would do all of this error checking and setup,
    // but with the ASCII fastpath we do not always call it, and do not
    // call it first.
    if(U_FAILURE(errorCode)) {
        dest.setToBogus();
        return dest;
    }
    const UChar *srcArray=src.getBuffer();
    if(&dest==&src || srcArray==NULL) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        dest.setToBogus();
        return dest;
    }
    // Arguments are fine, reset output values.
    dest.remove();
    info.reset();
    int32_t srcLength=src.length();
    if(srcLength==0) {
        if(toASCII) {
            info.errors|=UIDNA_ERROR_EMPTY_LABEL;
        }
        return dest;
    }
    UChar *destArray=dest.getBuffer(srcLength);
    if(destArray==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return dest;
    }
    // ASCII fastpath
    UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
    int32_t labelStart=0;
    int32_t i;
    for(i=0;; ++i) {
        if(i==srcLength) {
            // The whole input was handled by the fastpath.
            if(toASCII) {
                if((i-labelStart)>63) {
                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                }
                // There is a trailing dot if labelStart==i.
                if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
                    info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
                }
            }
            info.errors|=info.labelErrors;
            dest.releaseBuffer(i);
            return dest;
        }
        UChar c=srcArray[i];
        if(c>0x7f) {
            break;
        }
        int cData=asciiData[c];
        if(cData>0) {
            destArray[i]=c+0x20;  // Lowercase an uppercase ASCII letter.
        } else if(cData<0 && disallowNonLDHDot) {
            break;  // Replacing with U+FFFD can be complicated for toASCII.
        } else {
            destArray[i]=c;
            if(c==0x2d) {  // hyphen
                if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
                    // "??--..." is Punycode or forbidden.
                    ++i;  // '-' was copied to dest already
                    break;
                }
                if(i==labelStart) {
                    // label starts with "-"
                    info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
                }
                if((i+1)==srcLength || srcArray[i+1]==0x2e) {
                    // label ends with "-"
                    info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
                }
            } else if(c==0x2e) {  // dot
                if(isLabel) {
                    // Replacing with U+FFFD can be complicated for toASCII.
                    ++i;  // '.' was copied to dest already
                    break;
                }
                if(toASCII) {
                    // Permit an empty label at the end but not elsewhere.
                    if(i==labelStart && i<(srcLength-1)) {
                        info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
                    } else if((i-labelStart)>63) {
                        info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                    }
                }
                // The label is complete: fold its errors into the overall
                // errors and start the next label after the dot.
                info.errors|=info.labelErrors;
                info.labelErrors=0;
                labelStart=i+1;
            }
        }
    }
    // The fastpath stopped at index i; dest holds the first i code units.
    info.errors|=info.labelErrors;
    dest.releaseBuffer(i);
    processUnicode(src, labelStart, i, isLabel, toASCII, dest, info, errorCode);
    if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
        (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(dest.getBuffer(), labelStart)))
    ) {
        info.errors|=UIDNA_ERROR_BIDI;
    }
    return dest;
}

// Common UTF-8 implementation behind the four public UTF-8 functions.
//
// src:       input label or domain name in UTF-8; a NULL data pointer with a
//            non-zero length sets U_ILLEGAL_ARGUMENT_ERROR.
// isLabel, toASCII: same meaning as in process().
// dest:      sink that receives the UTF-8 result.
// info:      reset first; receives the accumulated error flags.
// errorCode: if already a failure, nothing is done.
//
// Inputs of at most 256 bytes go through an ASCII fastpath that mirrors the
// one in process(). Completed all-ASCII labels are appended to dest directly;
// the current label and everything after it are converted to UTF-16 and
// handled by processUnicode(). Longer inputs are converted to UTF-16 as a whole.
void
UTS46::processUTF8(const StringPiece &src,
                   UBool isLabel, UBool toASCII,
                   ByteSink &dest,
                   IDNAInfo &info, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return;
    }
    const char *srcArray=src.data();
    int32_t srcLength=src.length();
    if(srcArray==NULL && srcLength!=0) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    // Arguments are fine, reset output values.
    info.reset();
    if(srcLength==0) {
        if(toASCII) {
            info.errors|=UIDNA_ERROR_EMPTY_LABEL;
        }
        dest.Flush();
        return;
    }
    UnicodeString destString;
    int32_t labelStart=0;
    if(srcLength<=256) {  // length of stackArray[]
        // ASCII fastpath
        char stackArray[256];
        int32_t destCapacity;
        char *destArray=dest.GetAppendBuffer(srcLength, srcLength+20,
                                             stackArray, LENGTHOF(stackArray), &destCapacity);
        UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
        int32_t i;
        for(i=0;; ++i) {
            if(i==srcLength) {
                // The whole input was handled by the fastpath.
                if(toASCII) {
                    if((i-labelStart)>63) {
                        info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                    }
                    // There is a trailing dot if labelStart==i.
                    if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
                        info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
                    }
                }
                info.errors|=info.labelErrors;
                dest.Append(destArray, i);
                dest.Flush();
                return;
            }
            char c=srcArray[i];
            if((int8_t)c<0) {  // (uint8_t)c>0x7f
                break;
            }
            int cData=asciiData[(int)c];  // Cast: gcc warns about indexing with a char.
            if(cData>0) {
                destArray[i]=c+0x20;  // Lowercase an uppercase ASCII letter.
            } else if(cData<0 && disallowNonLDHDot) {
                break;  // Replacing with U+FFFD can be complicated for toASCII.
            } else {
                destArray[i]=c;
                if(c==0x2d) {  // hyphen
                    if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
                        // "??--..." is Punycode or forbidden.
                        break;
                    }
                    if(i==labelStart) {
                        // label starts with "-"
                        info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
                    }
                    if((i+1)==srcLength || srcArray[i+1]==0x2e) {
                        // label ends with "-"
                        info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
                    }
                } else if(c==0x2e) {  // dot
                    if(isLabel) {
                        break;  // Replacing with U+FFFD can be complicated for toASCII.
                    }
                    if(toASCII) {
                        // Permit an empty label at the end but not elsewhere.
                        if(i==labelStart && i<(srcLength-1)) {
                            info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
                        } else if((i-labelStart)>63) {
                            info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                        }
                    }
                    // The label is complete: fold its errors into the overall
                    // errors and start the next label after the dot.
                    info.errors|=info.labelErrors;
                    info.labelErrors=0;
                    labelStart=i+1;
                }
            }
        }
        info.errors|=info.labelErrors;
        // Convert the processed ASCII prefix of the current label to UTF-16.
        int32_t mappingStart=i-labelStart;
        destString=UnicodeString::fromUTF8(StringPiece(destArray+labelStart, mappingStart));
        // Output the previous ASCII labels and process the rest of src in UTF-16.
        dest.Append(destArray, labelStart);
        processUnicode(UnicodeString::fromUTF8(StringPiece(src, labelStart)), 0, mappingStart,
                       isLabel, toASCII,
                       destString, info, errorCode);
    } else {
        // src is too long for the ASCII fastpath implementation.
        processUnicode(UnicodeString::fromUTF8(src), 0, 0,
                       isLabel, toASCII,
                       destString, info, errorCode);
    }
    destString.toUTF8(dest);  // calls dest.Flush()
    if(toASCII && !isLabel) {
        // length==labelStart==254 means that there is a trailing dot (ok) and
        // destString is empty (do not index at 253-labelStart).
        int32_t length=labelStart+destString.length();
        if( length>=254 && isASCIIString(destString) &&
            (length>254 ||
             (labelStart<254 && destString[253-labelStart]!=0x2e))
        ) {
            info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
        }
    }
    if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
        (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(srcArray, labelStart)))
    ) {
        info.errors|=UIDNA_ERROR_BIDI;
    }
}

// Normalizes/maps the not-yet-processed part of src into dest, then splits
// dest into labels and runs processLabel() on each of them.
//
// src:          the complete input string.
// labelStart:   index in dest where the current (unfinished) label starts.
// mappingStart: if 0, all of src is normalized into dest; otherwise the
//               substring of src from mappingStart is normalized and appended
//               to what dest already contains.
// isLabel:      if TRUE, dots do not separate labels.
// toASCII:      selects which "nontransitional" option bit decides whether
//               deviation characters are mapped; also passed to processLabel().
// Returns dest. Returns early, without further processing, once errorCode
// indicates a failure.
UnicodeString &
UTS46::processUnicode(const UnicodeString &src,
                      int32_t labelStart, int32_t mappingStart,
                      UBool isLabel, UBool toASCII,
                      UnicodeString &dest,
                      IDNAInfo &info, UErrorCode &errorCode) const {
    if(mappingStart==0) {
        uts46Norm2.normalize(src, dest, errorCode);
    } else {
        uts46Norm2.normalizeSecondAndAppend(dest, src.tempSubString(mappingStart), errorCode);
    }
    if(U_FAILURE(errorCode)) {
        return dest;
    }
    UBool doMapDevChars=
        toASCII ? (options&UIDNA_NONTRANSITIONAL_TO_ASCII)==0 :
                  (options&UIDNA_NONTRANSITIONAL_TO_UNICODE)==0;
    const UChar *destArray=dest.getBuffer();
    int32_t destLength=dest.length();
    int32_t labelLimit=labelStart;
    while(labelLimit<destLength) {
        UChar c=destArray[labelLimit];
        if(c==0x2e && !isLabel) {
            int32_t labelLength=labelLimit-labelStart;
            int32_t newLength=processLabel(dest, labelStart, labelLength,
                                            toASCII, info, errorCode);
            info.errors|=info.labelErrors;
            info.labelErrors=0;
            if(U_FAILURE(errorCode)) {
                return dest;
            }
            // processLabel() may have modified dest: fetch the buffer again
            // and adjust the indexes by the change in label length.
            destArray=dest.getBuffer();
            destLength+=newLength-labelLength;
            labelLimit=labelStart+=newLength+1;
        } else if(0xdf<=c && c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) {
            // c is one of the characters handled by mapDevChars():
            // U+00DF, U+03C2, U+200C or U+200D.
            info.isTransDiff=TRUE;
            if(doMapDevChars) {
                destLength=mapDevChars(dest, labelStart, labelLimit, errorCode);
                if(U_FAILURE(errorCode)) {
                    return dest;
                }
                destArray=dest.getBuffer();
                // Do not increment labelLimit in case c was removed.
                // All deviation characters have been mapped, no need to check for them again.
                doMapDevChars=FALSE;
            } else {
                ++labelLimit;
            }
        } else {
            ++labelLimit;
        }
    }
    // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok)
    // but not an empty label elsewhere nor a completely empty domain name.
    // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0.
    if(0==labelStart || labelStart<labelLimit) {
        processLabel(dest, labelStart, labelLimit-labelStart,
                      toASCII, info, errorCode);
        info.errors|=info.labelErrors;
    }
    return dest;
}

// Maps the deviation characters in dest from index mappingStart to the end:
// U+00DF becomes "ss", U+03C2 becomes U+03C3, and U+200C/U+200D are removed.
// If anything was mapped, the part of dest from labelStart is normalized again.
//
// Returns the new length of dest. If a buffer cannot be obtained,
// U_MEMORY_ALLOCATION_ERROR is set and the current length is returned.
int32_t
UTS46::mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
                   UErrorCode &errorCode) const {
    int32_t length=dest.length();
    // Ask for one more code unit up front if the first character to map
    // is a sharp s, which grows the string.
    UChar *s=dest.getBuffer(dest[mappingStart]==0xdf ? length+1 : length);
    if(s==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return length;
    }
    int32_t capacity=dest.getCapacity();
    UBool didMapDevChars=FALSE;
    int32_t readIndex=mappingStart, writeIndex=mappingStart;
    do {
        UChar c=s[readIndex++];
        switch(c) {
        case 0xdf:
            // Map sharp s to ss.
            didMapDevChars=TRUE;
            s[writeIndex++]=0x73;  // Replace sharp s with first s.
            // Insert second s and account for possible buffer reallocation.
            if(writeIndex==readIndex) {
                // No gap left by earlier removals: make room by shifting the tail.
                if(length==capacity) {
                    dest.releaseBuffer(length);
                    s=dest.getBuffer(length+1);
                    if(s==NULL) {
                        errorCode=U_MEMORY_ALLOCATION_ERROR;
                        return length;
                    }
                    capacity=dest.getCapacity();
                }
                u_memmove(s+writeIndex+1, s+writeIndex, length-writeIndex);
                ++readIndex;
            }
            s[writeIndex++]=0x73;
            ++length;
            break;
        case 0x3c2:  // Map final sigma to nonfinal sigma.
            didMapDevChars=TRUE;
            s[writeIndex++]=0x3c3;
            break;
        case 0x200c:  // Ignore/remove ZWNJ.
        case 0x200d:  // Ignore/remove ZWJ.
            didMapDevChars=TRUE;
            --length;
            break;
        default:
            // Only really necessary if writeIndex was different from readIndex.
            s[writeIndex++]=c;
            break;
        }
    } while(writeIndex<length);
    dest.releaseBuffer(length);
    if(didMapDevChars) {
        // Mapping deviation characters might have resulted in an un-NFC string.
        // We could use either the NFC or the UTS #46 normalizer.
        // By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file.
        UnicodeString normalized;
        uts46Norm2.normalize(dest.tempSubString(labelStart), normalized, errorCode);
        if(U_SUCCESS(errorCode)) {
            dest.replace(labelStart, 0x7fffffff, normalized);
            return dest.length();
        }
    }
    return length;
}

// Some non-ASCII characters are equivalent to sequences with
// non-LDH ASCII characters.
// To find them:
// grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt)
// Returns TRUE for U+2260, U+226E and U+226F.
static inline UBool
isNonASCIIDisallowedSTD3Valid(UChar32 c) {
    return c==0x2260 || c==0x226E || c==0x226F;
}

// Replace the label in dest with the label string, if the label was modified.
// If &label==&dest then the label was modified in-place and labelLength
// is the new label length, different from label.length().
// If &label!=&dest then labelLength==label.length().
// Returns labelLength (= the new label length).
static int32_t
replaceLabel(UnicodeString &dest, int32_t destLabelStart, int32_t destLabelLength,
             const UnicodeString &label, int32_t labelLength) {
    if(&label!=&dest) {
        dest.replace(destLabelStart, destLabelLength, label);
    }
    return labelLength;
}

// Validates one label of dest and, for toASCII, converts it to Punycode
// if it contains non-ASCII characters.
//
// dest:        string that contains the label; may be modified.
// labelStart:  index of the first code unit of the label in dest.
// labelLength: number of code units in the label.
// toASCII:     TRUE to enable the empty-label and length checks and the
//              conversion to Punycode.
// info:        problems are recorded in info.labelErrors; the BiDi check may
//              also update info.isBiDi and info.isOkBiDi.
// errorCode:   set for allocation, normalizer and Punycode conversion failures.
//
// A label that starts with "xn--" is decoded from Punycode first, and the
// checks are applied to the decoded form. If decoding fails, or the decoded
// string does not pass the normalizer, the label is handed to
// markBadACELabel().
// Returns the new length of the label in dest.
int32_t
UTS46::processLabel(UnicodeString &dest,
                    int32_t labelStart, int32_t labelLength,
                    UBool toASCII,
                    IDNAInfo &info, UErrorCode &errorCode) const {
    UnicodeString fromPunycode;
    UnicodeString *labelString;
    const UChar *label=dest.getBuffer()+labelStart;
    // Remember where the label is in dest: labelStart and labelLength are
    // changed below to refer to fromPunycode if the label is in Punycode.
    int32_t destLabelStart=labelStart;
    int32_t destLabelLength=labelLength;
    UBool wasPunycode;
    if(labelLength>=4 && label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && label[3]==0x2d) {
        // Label starts with "xn--", try to un-Punycode it.
        wasPunycode=TRUE;
        UChar *unicodeBuffer=fromPunycode.getBuffer(-1);  // capacity==-1: most labels should fit
        if(unicodeBuffer==NULL) {
            // Should never occur if we used capacity==-1 which uses the internal buffer.
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return labelLength;
        }
        UErrorCode punycodeErrorCode=U_ZERO_ERROR;
        int32_t unicodeLength=u_strFromPunycode(label+4, labelLength-4,
                                                unicodeBuffer, fromPunycode.getCapacity(),
                                                NULL, &punycodeErrorCode);
        if(punycodeErrorCode==U_BUFFER_OVERFLOW_ERROR) {
            // Try again with a buffer of the length that the first call returned.
            fromPunycode.releaseBuffer(0);
            unicodeBuffer=fromPunycode.getBuffer(unicodeLength);
            if(unicodeBuffer==NULL) {
                errorCode=U_MEMORY_ALLOCATION_ERROR;
                return labelLength;
            }
            punycodeErrorCode=U_ZERO_ERROR;
            unicodeLength=u_strFromPunycode(label+4, labelLength-4,
                                            unicodeBuffer, fromPunycode.getCapacity(),
                                            NULL, &punycodeErrorCode);
        }
        fromPunycode.releaseBuffer(unicodeLength);
        if(U_FAILURE(punycodeErrorCode)) {
            info.labelErrors|=UIDNA_ERROR_PUNYCODE;
            return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
        }
        // Check for NFC, and for characters that are not
        // valid or deviation characters according to the normalizer.
        // If there is something wrong, then the string will change.
        // Note that the normalizer passes through non-LDH ASCII and deviation characters.
        // Deviation characters are ok in Punycode even in transitional processing.
        // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
        // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
        UBool isValid=uts46Norm2.isNormalized(fromPunycode, errorCode);
        if(U_FAILURE(errorCode)) {
            return labelLength;
        }
        if(!isValid) {
            info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
            return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
        }
        // From here on, work on the decoded label.
        labelString=&fromPunycode;
        label=fromPunycode.getBuffer();
        labelStart=0;
        labelLength=fromPunycode.length();
    } else {
        wasPunycode=FALSE;
        labelString=&dest;
    }
    // Validity check
    if(labelLength==0) {
        if(toASCII) {
            info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
        }
        return replaceLabel(dest, destLabelStart, destLabelLength, *labelString, labelLength);
    }
    // labelLength>0
    if(labelLength>=4 && label[2]==0x2d && label[3]==0x2d) {
        // label starts with "??--"
        info.labelErrors|=UIDNA_ERROR_HYPHEN_3_4;
    }
    if(label[0]==0x2d) {
        // label starts with "-"
        info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
    }
    if(label[labelLength-1]==0x2d) {
        // label ends with "-"
        info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
    }
    // If the label was not a Punycode label, then it was the result of
    // mapping, normalization and label segmentation.
    // If the label was in Punycode, then we mapped it again above
    // and checked its validity.
    // Now we handle the STD3 restriction to LDH characters (if set)
    // and we look for U+FFFD which indicates disallowed characters
    // in a non-Punycode label or U+FFFD itself in a Punycode label.
    // We also check for dots which can come from the input to a single-label function.
    // Ok to cast away const because we own the UnicodeString.
    UChar *s=(UChar *)label;
    const UChar *limit=label+labelLength;
    // Bitwise OR of all non-ASCII code units of the label; used further below
    // to skip checks that can only apply if certain characters are present.
    UChar oredChars=0;
    // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed.
    UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
    do {
        UChar c=*s;
        if(c<=0x7f) {
            if(c==0x2e) {
                info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT;
                *s=0xfffd;
            } else if(disallowNonLDHDot && asciiData[c]<0) {
                info.labelErrors|=UIDNA_ERROR_DISALLOWED;
                *s=0xfffd;
            }
        } else {
            oredChars|=c;
            if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) {
                info.labelErrors|=UIDNA_ERROR_DISALLOWED;
                *s=0xfffd;
            } else if(c==0xfffd) {
                info.labelErrors|=UIDNA_ERROR_DISALLOWED;
            }
        }
        ++s;
    } while(s<limit);
    // Check for a leading combining mark after other validity checks
    // so that we don't report UIDNA_ERROR_DISALLOWED for the U+FFFD from here.
    UChar32 c;
    int32_t cpLength=0;
    // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD.
    U16_NEXT_UNSAFE(label, cpLength, c);
    if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) {
        info.labelErrors|=UIDNA_ERROR_LEADING_COMBINING_MARK;
        // Replace the leading mark with U+FFFD, then fetch the buffer again
        // and adjust the length for the replaced code point.
        labelString->replace(labelStart, cpLength, (UChar)0xfffd);
        label=labelString->getBuffer()+labelStart;
        labelLength+=1-cpLength;
        if(labelString==&dest) {
            destLabelLength=labelLength;
        }
    }
    if((info.labelErrors&severeErrors)==0) {
        // Do contextual checks only if we do not have U+FFFD from a severe error
        // because U+FFFD can make these checks fail.
        if((options&UIDNA_CHECK_BIDI)!=0 && (!info.isBiDi || info.isOkBiDi)) {
            checkLabelBiDi(label, labelLength, info);
        }
        if( (options&UIDNA_CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c &&
            !isLabelOkContextJ(label, labelLength)
        ) {
            info.labelErrors|=UIDNA_ERROR_CONTEXTJ;
        }
        if((options&UIDNA_CHECK_CONTEXTO)!=0 && oredChars>=0xb7) {
            checkLabelContextO(label, labelLength, info);
        }
        if(toASCII) {
            if(wasPunycode) {
                // Leave a Punycode label unchanged if it has no severe errors.
                if(destLabelLength>63) {
                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                }
                return destLabelLength;
            } else if(oredChars>=0x80) {
                // Contains non-ASCII characters.
                UnicodeString punycode;
                UChar *buffer=punycode.getBuffer(63);  // 63==maximum DNS label length
                if(buffer==NULL) {
                    errorCode=U_MEMORY_ALLOCATION_ERROR;
                    return destLabelLength;
                }
                buffer[0]=0x78;  // Write "xn--".
                buffer[1]=0x6e;
                buffer[2]=0x2d;
                buffer[3]=0x2d;
                int32_t punycodeLength=u_strToPunycode(label, labelLength,
                                                      buffer+4, punycode.getCapacity()-4,
                                                      NULL, &errorCode);
                if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
                    // Try again with a buffer that has room for "xn--" plus
                    // the length that the first call returned.
                    errorCode=U_ZERO_ERROR;
                    punycode.releaseBuffer(4);
                    buffer=punycode.getBuffer(4+punycodeLength);
                    if(buffer==NULL) {
                        errorCode=U_MEMORY_ALLOCATION_ERROR;
                        return destLabelLength;
                    }
                    punycodeLength=u_strToPunycode(label, labelLength,
                                                  buffer+4, punycode.getCapacity()-4,
                                                  NULL, &errorCode);
                }
                punycodeLength+=4;
                punycode.releaseBuffer(punycodeLength);
                if(U_FAILURE(errorCode)) {
                    return destLabelLength;
                }
                if(punycodeLength>63) {
                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                }
                return replaceLabel(dest, destLabelStart, destLabelLength,
                                    punycode, punycodeLength);
            } else {
                // all-ASCII label
                if(labelLength>63) {
                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                }
            }
        }
    } else {
        // If a Punycode label has severe errors,
        // then leave it but make sure it does not look valid.
        if(wasPunycode) {
            info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
            return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info);
        }
    }
    return replaceLabel(dest, destLabelStart, destLabelLength, *labelString, labelLength);
}

// Make sure an ACE label does not look valid.
// Append U+FFFD if the label has only LDH characters.
897 // If UIDNA_USE_STD3_RULES, also replace disallowed ASCII characters with U+FFFD. 898 int32_t 899 UTS46::markBadACELabel(UnicodeString &dest, 900 int32_t labelStart, int32_t labelLength, 901 UBool toASCII, IDNAInfo &info) const { 902 UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0; 903 UBool isASCII=TRUE; 904 UBool onlyLDH=TRUE; 905 const UChar *label=dest.getBuffer()+labelStart; 906 // Ok to cast away const because we own the UnicodeString. 907 UChar *s=(UChar *)label+4; // After the initial "xn--". 908 const UChar *limit=label+labelLength; 909 do { 910 UChar c=*s; 911 if(c<=0x7f) { 912 if(c==0x2e) { 913 info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT; 914 *s=0xfffd; 915 isASCII=onlyLDH=FALSE; 916 } else if(asciiData[c]<0) { 917 onlyLDH=FALSE; 918 if(disallowNonLDHDot) { 919 *s=0xfffd; 920 isASCII=FALSE; 921 } 922 } 923 } else { 924 isASCII=onlyLDH=FALSE; 925 } 926 } while(++s<limit); 927 if(onlyLDH) { 928 dest.insert(labelStart+labelLength, (UChar)0xfffd); 929 ++labelLength; 930 } else { 931 if(toASCII && isASCII && labelLength>63) { 932 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; 933 } 934 } 935 return labelLength; 936 } 937 938 const uint32_t L_MASK=U_MASK(U_LEFT_TO_RIGHT); 939 const uint32_t R_AL_MASK=U_MASK(U_RIGHT_TO_LEFT)|U_MASK(U_RIGHT_TO_LEFT_ARABIC); 940 const uint32_t L_R_AL_MASK=L_MASK|R_AL_MASK; 941 942 const uint32_t R_AL_AN_MASK=R_AL_MASK|U_MASK(U_ARABIC_NUMBER); 943 944 const uint32_t EN_AN_MASK=U_MASK(U_EUROPEAN_NUMBER)|U_MASK(U_ARABIC_NUMBER); 945 const uint32_t R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK; 946 const uint32_t L_EN_MASK=L_MASK|U_MASK(U_EUROPEAN_NUMBER); 947 948 const uint32_t ES_CS_ET_ON_BN_NSM_MASK= 949 U_MASK(U_EUROPEAN_NUMBER_SEPARATOR)| 950 U_MASK(U_COMMON_NUMBER_SEPARATOR)| 951 U_MASK(U_EUROPEAN_NUMBER_TERMINATOR)| 952 U_MASK(U_OTHER_NEUTRAL)| 953 U_MASK(U_BOUNDARY_NEUTRAL)| 954 U_MASK(U_DIR_NON_SPACING_MARK); 955 const uint32_t L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK; 956 const uint32_t 
R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK; 957 958 // We scan the whole label and check both for whether it contains RTL characters 959 // and whether it passes the BiDi Rule. 960 // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find 961 // that a domain name is a BiDi domain name (has an RTL label) only after 962 // processing several earlier labels. 963 void 964 UTS46::checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) const { 965 // IDNA2008 BiDi rule 966 // Get the directionality of the first character. 967 UChar32 c; 968 int32_t i=0; 969 U16_NEXT_UNSAFE(label, i, c); 970 uint32_t firstMask=U_MASK(u_charDirection(c)); 971 // 1. The first character must be a character with BIDI property L, R 972 // or AL. If it has the R or AL property, it is an RTL label; if it 973 // has the L property, it is an LTR label. 974 if((firstMask&~L_R_AL_MASK)!=0) { 975 info.isOkBiDi=FALSE; 976 } 977 // Get the directionality of the last non-NSM character. 978 uint32_t lastMask; 979 for(;;) { 980 if(i>=labelLength) { 981 lastMask=firstMask; 982 break; 983 } 984 U16_PREV_UNSAFE(label, labelLength, c); 985 UCharDirection dir=u_charDirection(c); 986 if(dir!=U_DIR_NON_SPACING_MARK) { 987 lastMask=U_MASK(dir); 988 break; 989 } 990 } 991 // 3. In an RTL label, the end of the label must be a character with 992 // BIDI property R, AL, EN or AN, followed by zero or more 993 // characters with BIDI property NSM. 994 // 6. In an LTR label, the end of the label must be a character with 995 // BIDI property L or EN, followed by zero or more characters with 996 // BIDI property NSM. 997 if( (firstMask&L_MASK)!=0 ? 998 (lastMask&~L_EN_MASK)!=0 : 999 (lastMask&~R_AL_EN_AN_MASK)!=0 1000 ) { 1001 info.isOkBiDi=FALSE; 1002 } 1003 // Get the directionalities of the intervening characters. 
1004 uint32_t mask=0; 1005 while(i<labelLength) { 1006 U16_NEXT_UNSAFE(label, i, c); 1007 mask|=U_MASK(u_charDirection(c)); 1008 } 1009 if(firstMask&L_MASK) { 1010 // 5. In an LTR label, only characters with the BIDI properties L, EN, 1011 // ES, CS, ET, ON, BN and NSM are allowed. 1012 if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) { 1013 info.isOkBiDi=FALSE; 1014 } 1015 } else { 1016 // 2. In an RTL label, only characters with the BIDI properties R, AL, 1017 // AN, EN, ES, CS, ET, ON, BN and NSM are allowed. 1018 if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) { 1019 info.isOkBiDi=FALSE; 1020 } 1021 // 4. In an RTL label, if an EN is present, no AN may be present, and 1022 // vice versa. 1023 if((mask&EN_AN_MASK)==EN_AN_MASK) { 1024 info.isOkBiDi=FALSE; 1025 } 1026 } 1027 // An RTL label is a label that contains at least one character of type 1028 // R, AL or AN. [...] 1029 // A "BIDI domain name" is a domain name that contains at least one RTL 1030 // label. [...] 1031 // The following rule, consisting of six conditions, applies to labels 1032 // in BIDI domain names. 1033 if(((firstMask|mask|lastMask)&R_AL_AN_MASK)!=0) { 1034 info.isBiDi=TRUE; 1035 } 1036 } 1037 1038 // Special code for the ASCII prefix of a BiDi domain name. 1039 // The ASCII prefix is all-LTR. 1040 1041 // IDNA2008 BiDi rule, parts relevant to ASCII labels: 1042 // 1. The first character must be a character with BIDI property L [...] 1043 // 5. In an LTR label, only characters with the BIDI properties L, EN, 1044 // ES, CS, ET, ON, BN and NSM are allowed. 1045 // 6. In an LTR label, the end of the label must be a character with 1046 // BIDI property L or EN [...] 1047 1048 // UTF-16 version, called for mapped ASCII prefix. 1049 // Cannot contain uppercase A-Z. 1050 // s[length-1] must be the trailing dot. 
1051 static UBool 1052 isASCIIOkBiDi(const UChar *s, int32_t length) { 1053 int32_t labelStart=0; 1054 for(int32_t i=0; i<length; ++i) { 1055 UChar c=s[i]; 1056 if(c==0x2e) { // dot 1057 if(i>labelStart) { 1058 c=s[i-1]; 1059 if(!(0x61<=c && c<=0x7a) && !(0x30<=c && c<=0x39)) { 1060 // Last character in the label is not an L or EN. 1061 return FALSE; 1062 } 1063 } 1064 labelStart=i+1; 1065 } else if(i==labelStart) { 1066 if(!(0x61<=c && c<=0x7a)) { 1067 // First character in the label is not an L. 1068 return FALSE; 1069 } 1070 } else { 1071 if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { 1072 // Intermediate character in the label is a B, S or WS. 1073 return FALSE; 1074 } 1075 } 1076 } 1077 return TRUE; 1078 } 1079 1080 // UTF-8 version, called for source ASCII prefix. 1081 // Can contain uppercase A-Z. 1082 // s[length-1] must be the trailing dot. 1083 static UBool 1084 isASCIIOkBiDi(const char *s, int32_t length) { 1085 int32_t labelStart=0; 1086 for(int32_t i=0; i<length; ++i) { 1087 char c=s[i]; 1088 if(c==0x2e) { // dot 1089 if(i>labelStart) { 1090 c=s[i-1]; 1091 if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a) && !(0x30<=c && c<=0x39)) { 1092 // Last character in the label is not an L or EN. 1093 return FALSE; 1094 } 1095 } 1096 labelStart=i+1; 1097 } else if(i==labelStart) { 1098 if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a)) { 1099 // First character in the label is not an L. 1100 return FALSE; 1101 } 1102 } else { 1103 if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { 1104 // Intermediate character in the label is a B, S or WS. 1105 return FALSE; 1106 } 1107 } 1108 } 1109 return TRUE; 1110 } 1111 1112 UBool 1113 UTS46::isLabelOkContextJ(const UChar *label, int32_t labelLength) const { 1114 const UBiDiProps *bdp=ubidi_getSingleton(); 1115 // [IDNA2008-Tables] 1116 // 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER 1117 for(int32_t i=0; i<labelLength; ++i) { 1118 if(label[i]==0x200c) { 1119 // Appendix A.1. 
ZERO WIDTH NON-JOINER 1120 // Rule Set: 1121 // False; 1122 // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; 1123 // If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C 1124 // (Joining_Type:T)*(Joining_Type:{R,D})) Then True; 1125 if(i==0) { 1126 return FALSE; 1127 } 1128 UChar32 c; 1129 int32_t j=i; 1130 U16_PREV_UNSAFE(label, j, c); 1131 if(uts46Norm2.getCombiningClass(c)==9) { 1132 continue; 1133 } 1134 // check precontext (Joining_Type:{L,D})(Joining_Type:T)* 1135 for(;;) { 1136 UJoiningType type=ubidi_getJoiningType(bdp, c); 1137 if(type==U_JT_TRANSPARENT) { 1138 if(j==0) { 1139 return FALSE; 1140 } 1141 U16_PREV_UNSAFE(label, j, c); 1142 } else if(type==U_JT_LEFT_JOINING || type==U_JT_DUAL_JOINING) { 1143 break; // precontext fulfilled 1144 } else { 1145 return FALSE; 1146 } 1147 } 1148 // check postcontext (Joining_Type:T)*(Joining_Type:{R,D}) 1149 for(j=i+1;;) { 1150 if(j==labelLength) { 1151 return FALSE; 1152 } 1153 U16_NEXT_UNSAFE(label, j, c); 1154 UJoiningType type=ubidi_getJoiningType(bdp, c); 1155 if(type==U_JT_TRANSPARENT) { 1156 // just skip this character 1157 } else if(type==U_JT_RIGHT_JOINING || type==U_JT_DUAL_JOINING) { 1158 break; // postcontext fulfilled 1159 } else { 1160 return FALSE; 1161 } 1162 } 1163 } else if(label[i]==0x200d) { 1164 // Appendix A.2. ZERO WIDTH JOINER (U+200D) 1165 // Rule Set: 1166 // False; 1167 // If Canonical_Combining_Class(Before(cp)) .eq. 
Virama Then True; 1168 if(i==0) { 1169 return FALSE; 1170 } 1171 UChar32 c; 1172 int32_t j=i; 1173 U16_PREV_UNSAFE(label, j, c); 1174 if(uts46Norm2.getCombiningClass(c)!=9) { 1175 return FALSE; 1176 } 1177 } 1178 } 1179 return TRUE; 1180 } 1181 1182 void 1183 UTS46::checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const { 1184 int32_t labelEnd=labelLength-1; // inclusive 1185 int32_t arabicDigits=0; // -1 for 066x, +1 for 06Fx 1186 for(int32_t i=0; i<=labelEnd; ++i) { 1187 UChar32 c=label[i]; 1188 if(c<0xb7) { 1189 // ASCII fastpath 1190 } else if(c<=0x6f9) { 1191 if(c==0xb7) { 1192 // Appendix A.3. MIDDLE DOT (U+00B7) 1193 // Rule Set: 1194 // False; 1195 // If Before(cp) .eq. U+006C And 1196 // After(cp) .eq. U+006C Then True; 1197 if(!(0<i && label[i-1]==0x6c && 1198 i<labelEnd && label[i+1]==0x6c)) { 1199 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; 1200 } 1201 } else if(c==0x375) { 1202 // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375) 1203 // Rule Set: 1204 // False; 1205 // If Script(After(cp)) .eq. Greek Then True; 1206 UScriptCode script=USCRIPT_INVALID_CODE; 1207 if(i<labelEnd) { 1208 UErrorCode errorCode=U_ZERO_ERROR; 1209 int32_t j=i+1; 1210 U16_NEXT(label, j, labelLength, c); 1211 script=uscript_getScript(c, &errorCode); 1212 } 1213 if(script!=USCRIPT_GREEK) { 1214 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; 1215 } 1216 } else if(c==0x5f3 || c==0x5f4) { 1217 // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3) 1218 // Rule Set: 1219 // False; 1220 // If Script(Before(cp)) .eq. Hebrew Then True; 1221 // 1222 // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4) 1223 // Rule Set: 1224 // False; 1225 // If Script(Before(cp)) .eq. 
Hebrew Then True; 1226 UScriptCode script=USCRIPT_INVALID_CODE; 1227 if(0<i) { 1228 UErrorCode errorCode=U_ZERO_ERROR; 1229 int32_t j=i; 1230 U16_PREV(label, 0, j, c); 1231 script=uscript_getScript(c, &errorCode); 1232 } 1233 if(script!=USCRIPT_HEBREW) { 1234 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; 1235 } 1236 } else if(0x660<=c /* && c<=0x6f9 */) { 1237 // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669) 1238 // Rule Set: 1239 // True; 1240 // For All Characters: 1241 // If cp .in. 06F0..06F9 Then False; 1242 // End For; 1243 // 1244 // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9) 1245 // Rule Set: 1246 // True; 1247 // For All Characters: 1248 // If cp .in. 0660..0669 Then False; 1249 // End For; 1250 if(c<=0x669) { 1251 if(arabicDigits>0) { 1252 info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS; 1253 } 1254 arabicDigits=-1; 1255 } else if(0x6f0<=c) { 1256 if(arabicDigits<0) { 1257 info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS; 1258 } 1259 arabicDigits=1; 1260 } 1261 } 1262 } else if(c==0x30fb) { 1263 // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB) 1264 // Rule Set: 1265 // False; 1266 // For All Characters: 1267 // If Script(cp) .in. 
{Hiragana, Katakana, Han} Then True; 1268 // End For; 1269 UErrorCode errorCode=U_ZERO_ERROR; 1270 for(int j=0;;) { 1271 if(j>labelEnd) { 1272 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; 1273 break; 1274 } 1275 U16_NEXT(label, j, labelLength, c); 1276 UScriptCode script=uscript_getScript(c, &errorCode); 1277 if(script==USCRIPT_HIRAGANA || script==USCRIPT_KATAKANA || script==USCRIPT_HAN) { 1278 break; 1279 } 1280 } 1281 } 1282 } 1283 } 1284 1285 U_NAMESPACE_END 1286 1287 // C API ------------------------------------------------------------------- *** 1288 1289 U_NAMESPACE_USE 1290 1291 U_CAPI UIDNA * U_EXPORT2 1292 uidna_openUTS46(uint32_t options, UErrorCode *pErrorCode) { 1293 return reinterpret_cast<UIDNA *>(IDNA::createUTS46Instance(options, *pErrorCode)); 1294 } 1295 1296 U_CAPI void U_EXPORT2 1297 uidna_close(UIDNA *idna) { 1298 delete reinterpret_cast<IDNA *>(idna); 1299 } 1300 1301 static UBool 1302 checkArgs(const void *label, int32_t length, 1303 void *dest, int32_t capacity, 1304 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1305 if(U_FAILURE(*pErrorCode)) { 1306 return FALSE; 1307 } 1308 // sizeof(UIDNAInfo)=16 in the first API version. 1309 if(pInfo==NULL || pInfo->size<16) { 1310 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1311 return FALSE; 1312 } 1313 if( (label==NULL ? length!=0 : length<-1) || 1314 (dest==NULL ? capacity!=0 : capacity<0) || 1315 (dest==label && label!=NULL) 1316 ) { 1317 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1318 return FALSE; 1319 } 1320 // Set all *pInfo bytes to 0 except for the size field itself. 
1321 uprv_memset(&pInfo->size+1, 0, pInfo->size-sizeof(pInfo->size)); 1322 return TRUE; 1323 } 1324 1325 static void 1326 idnaInfoToStruct(IDNAInfo &info, UIDNAInfo *pInfo) { 1327 pInfo->isTransitionalDifferent=info.isTransitionalDifferent(); 1328 pInfo->errors=info.getErrors(); 1329 } 1330 1331 U_CAPI int32_t U_EXPORT2 1332 uidna_labelToASCII(const UIDNA *idna, 1333 const UChar *label, int32_t length, 1334 UChar *dest, int32_t capacity, 1335 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1336 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { 1337 return 0; 1338 } 1339 UnicodeString src((UBool)(length<0), label, length); 1340 UnicodeString destString(dest, 0, capacity); 1341 IDNAInfo info; 1342 reinterpret_cast<const IDNA *>(idna)->labelToASCII(src, destString, info, *pErrorCode); 1343 idnaInfoToStruct(info, pInfo); 1344 return destString.extract(dest, capacity, *pErrorCode); 1345 } 1346 1347 U_CAPI int32_t U_EXPORT2 1348 uidna_labelToUnicode(const UIDNA *idna, 1349 const UChar *label, int32_t length, 1350 UChar *dest, int32_t capacity, 1351 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1352 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { 1353 return 0; 1354 } 1355 UnicodeString src((UBool)(length<0), label, length); 1356 UnicodeString destString(dest, 0, capacity); 1357 IDNAInfo info; 1358 reinterpret_cast<const IDNA *>(idna)->labelToUnicode(src, destString, info, *pErrorCode); 1359 idnaInfoToStruct(info, pInfo); 1360 return destString.extract(dest, capacity, *pErrorCode); 1361 } 1362 1363 U_CAPI int32_t U_EXPORT2 1364 uidna_nameToASCII(const UIDNA *idna, 1365 const UChar *name, int32_t length, 1366 UChar *dest, int32_t capacity, 1367 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1368 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { 1369 return 0; 1370 } 1371 UnicodeString src((UBool)(length<0), name, length); 1372 UnicodeString destString(dest, 0, capacity); 1373 IDNAInfo info; 1374 reinterpret_cast<const IDNA 
*>(idna)->nameToASCII(src, destString, info, *pErrorCode); 1375 idnaInfoToStruct(info, pInfo); 1376 return destString.extract(dest, capacity, *pErrorCode); 1377 } 1378 1379 U_CAPI int32_t U_EXPORT2 1380 uidna_nameToUnicode(const UIDNA *idna, 1381 const UChar *name, int32_t length, 1382 UChar *dest, int32_t capacity, 1383 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1384 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { 1385 return 0; 1386 } 1387 UnicodeString src((UBool)(length<0), name, length); 1388 UnicodeString destString(dest, 0, capacity); 1389 IDNAInfo info; 1390 reinterpret_cast<const IDNA *>(idna)->nameToUnicode(src, destString, info, *pErrorCode); 1391 idnaInfoToStruct(info, pInfo); 1392 return destString.extract(dest, capacity, *pErrorCode); 1393 } 1394 1395 U_CAPI int32_t U_EXPORT2 1396 uidna_labelToASCII_UTF8(const UIDNA *idna, 1397 const char *label, int32_t length, 1398 char *dest, int32_t capacity, 1399 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1400 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { 1401 return 0; 1402 } 1403 StringPiece src(label, length<0 ? uprv_strlen(label) : length); 1404 CheckedArrayByteSink sink(dest, capacity); 1405 IDNAInfo info; 1406 reinterpret_cast<const IDNA *>(idna)->labelToASCII_UTF8(src, sink, info, *pErrorCode); 1407 idnaInfoToStruct(info, pInfo); 1408 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); 1409 } 1410 1411 U_CAPI int32_t U_EXPORT2 1412 uidna_labelToUnicodeUTF8(const UIDNA *idna, 1413 const char *label, int32_t length, 1414 char *dest, int32_t capacity, 1415 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1416 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { 1417 return 0; 1418 } 1419 StringPiece src(label, length<0 ? 
uprv_strlen(label) : length); 1420 CheckedArrayByteSink sink(dest, capacity); 1421 IDNAInfo info; 1422 reinterpret_cast<const IDNA *>(idna)->labelToUnicodeUTF8(src, sink, info, *pErrorCode); 1423 idnaInfoToStruct(info, pInfo); 1424 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); 1425 } 1426 1427 U_CAPI int32_t U_EXPORT2 1428 uidna_nameToASCII_UTF8(const UIDNA *idna, 1429 const char *name, int32_t length, 1430 char *dest, int32_t capacity, 1431 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1432 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { 1433 return 0; 1434 } 1435 StringPiece src(name, length<0 ? uprv_strlen(name) : length); 1436 CheckedArrayByteSink sink(dest, capacity); 1437 IDNAInfo info; 1438 reinterpret_cast<const IDNA *>(idna)->nameToASCII_UTF8(src, sink, info, *pErrorCode); 1439 idnaInfoToStruct(info, pInfo); 1440 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); 1441 } 1442 1443 U_CAPI int32_t U_EXPORT2 1444 uidna_nameToUnicodeUTF8(const UIDNA *idna, 1445 const char *name, int32_t length, 1446 char *dest, int32_t capacity, 1447 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1448 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { 1449 return 0; 1450 } 1451 StringPiece src(name, length<0 ? uprv_strlen(name) : length); 1452 CheckedArrayByteSink sink(dest, capacity); 1453 IDNAInfo info; 1454 reinterpret_cast<const IDNA *>(idna)->nameToUnicodeUTF8(src, sink, info, *pErrorCode); 1455 idnaInfoToStruct(info, pInfo); 1456 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); 1457 } 1458 1459 #endif // UCONFIG_NO_IDNA 1460