1 /* 2 ******************************************************************************* 3 * Copyright (C) 2010-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: uts46.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2010mar09 12 * created by: Markus W. Scherer 13 */ 14 15 #include "unicode/utypes.h" 16 17 #if !UCONFIG_NO_IDNA 18 19 #include "unicode/idna.h" 20 #include "unicode/normalizer2.h" 21 #include "unicode/uscript.h" 22 #include "unicode/ustring.h" 23 #include "unicode/utf16.h" 24 #include "cmemory.h" 25 #include "cstring.h" 26 #include "punycode.h" 27 #include "ubidi_props.h" 28 #include "ustr_imp.h" 29 30 // Note about tests for UIDNA_ERROR_DOMAIN_NAME_TOO_LONG: 31 // 32 // The domain name length limit is 255 octets in an internal DNS representation 33 // where the last ("root") label is the empty label 34 // represented by length byte 0 alone. 35 // In a conventional string, this translates to 253 characters, or 254 36 // if there is a trailing dot for the root label. 37 38 U_NAMESPACE_BEGIN 39 40 // Severe errors which usually result in a U+FFFD replacement character in the result string. 
41 const uint32_t severeErrors= 42 UIDNA_ERROR_LEADING_COMBINING_MARK| 43 UIDNA_ERROR_DISALLOWED| 44 UIDNA_ERROR_PUNYCODE| 45 UIDNA_ERROR_LABEL_HAS_DOT| 46 UIDNA_ERROR_INVALID_ACE_LABEL; 47 48 static inline UBool 49 isASCIIString(const UnicodeString &dest) { 50 const UChar *s=dest.getBuffer(); 51 const UChar *limit=s+dest.length(); 52 while(s<limit) { 53 if(*s++>0x7f) { 54 return FALSE; 55 } 56 } 57 return TRUE; 58 } 59 60 static UBool 61 isASCIIOkBiDi(const UChar *s, int32_t length); 62 63 static UBool 64 isASCIIOkBiDi(const char *s, int32_t length); 65 66 // IDNA class default implementations -------------------------------------- *** 67 68 IDNA::~IDNA() {} 69 70 void 71 IDNA::labelToASCII_UTF8(const StringPiece &label, ByteSink &dest, 72 IDNAInfo &info, UErrorCode &errorCode) const { 73 if(U_SUCCESS(errorCode)) { 74 UnicodeString destString; 75 labelToASCII(UnicodeString::fromUTF8(label), destString, 76 info, errorCode).toUTF8(dest); 77 } 78 } 79 80 void 81 IDNA::labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest, 82 IDNAInfo &info, UErrorCode &errorCode) const { 83 if(U_SUCCESS(errorCode)) { 84 UnicodeString destString; 85 labelToUnicode(UnicodeString::fromUTF8(label), destString, 86 info, errorCode).toUTF8(dest); 87 } 88 } 89 90 void 91 IDNA::nameToASCII_UTF8(const StringPiece &name, ByteSink &dest, 92 IDNAInfo &info, UErrorCode &errorCode) const { 93 if(U_SUCCESS(errorCode)) { 94 UnicodeString destString; 95 nameToASCII(UnicodeString::fromUTF8(name), destString, 96 info, errorCode).toUTF8(dest); 97 } 98 } 99 100 void 101 IDNA::nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest, 102 IDNAInfo &info, UErrorCode &errorCode) const { 103 if(U_SUCCESS(errorCode)) { 104 UnicodeString destString; 105 nameToUnicode(UnicodeString::fromUTF8(name), destString, 106 info, errorCode).toUTF8(dest); 107 } 108 } 109 110 // UTS46 class declaration ------------------------------------------------- *** 111 112 class UTS46 : public IDNA { 113 public: 114 
UTS46(uint32_t options, UErrorCode &errorCode); 115 virtual ~UTS46(); 116 117 virtual UnicodeString & 118 labelToASCII(const UnicodeString &label, UnicodeString &dest, 119 IDNAInfo &info, UErrorCode &errorCode) const; 120 121 virtual UnicodeString & 122 labelToUnicode(const UnicodeString &label, UnicodeString &dest, 123 IDNAInfo &info, UErrorCode &errorCode) const; 124 125 virtual UnicodeString & 126 nameToASCII(const UnicodeString &name, UnicodeString &dest, 127 IDNAInfo &info, UErrorCode &errorCode) const; 128 129 virtual UnicodeString & 130 nameToUnicode(const UnicodeString &name, UnicodeString &dest, 131 IDNAInfo &info, UErrorCode &errorCode) const; 132 133 virtual void 134 labelToASCII_UTF8(const StringPiece &label, ByteSink &dest, 135 IDNAInfo &info, UErrorCode &errorCode) const; 136 137 virtual void 138 labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest, 139 IDNAInfo &info, UErrorCode &errorCode) const; 140 141 virtual void 142 nameToASCII_UTF8(const StringPiece &name, ByteSink &dest, 143 IDNAInfo &info, UErrorCode &errorCode) const; 144 145 virtual void 146 nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest, 147 IDNAInfo &info, UErrorCode &errorCode) const; 148 149 private: 150 UnicodeString & 151 process(const UnicodeString &src, 152 UBool isLabel, UBool toASCII, 153 UnicodeString &dest, 154 IDNAInfo &info, UErrorCode &errorCode) const; 155 156 void 157 processUTF8(const StringPiece &src, 158 UBool isLabel, UBool toASCII, 159 ByteSink &dest, 160 IDNAInfo &info, UErrorCode &errorCode) const; 161 162 UnicodeString & 163 processUnicode(const UnicodeString &src, 164 int32_t labelStart, int32_t mappingStart, 165 UBool isLabel, UBool toASCII, 166 UnicodeString &dest, 167 IDNAInfo &info, UErrorCode &errorCode) const; 168 169 // returns the new dest.length() 170 int32_t 171 mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart, 172 UErrorCode &errorCode) const; 173 174 // returns the new label length 175 int32_t 176 
processLabel(UnicodeString &dest, 177 int32_t labelStart, int32_t labelLength, 178 UBool toASCII, 179 IDNAInfo &info, UErrorCode &errorCode) const; 180 int32_t 181 markBadACELabel(UnicodeString &dest, 182 int32_t labelStart, int32_t labelLength, 183 UBool toASCII, IDNAInfo &info, UErrorCode &errorCode) const; 184 185 void 186 checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) const; 187 188 UBool 189 isLabelOkContextJ(const UChar *label, int32_t labelLength) const; 190 191 void 192 checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const; 193 194 const Normalizer2 &uts46Norm2; // uts46.nrm 195 uint32_t options; 196 }; 197 198 IDNA * 199 IDNA::createUTS46Instance(uint32_t options, UErrorCode &errorCode) { 200 if(U_SUCCESS(errorCode)) { 201 IDNA *idna=new UTS46(options, errorCode); 202 if(idna==NULL) { 203 errorCode=U_MEMORY_ALLOCATION_ERROR; 204 } else if(U_FAILURE(errorCode)) { 205 delete idna; 206 idna=NULL; 207 } 208 return idna; 209 } else { 210 return NULL; 211 } 212 } 213 214 // UTS46 implementation ---------------------------------------------------- *** 215 216 UTS46::UTS46(uint32_t opt, UErrorCode &errorCode) 217 : uts46Norm2(*Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, errorCode)), 218 options(opt) {} 219 220 UTS46::~UTS46() {} 221 222 UnicodeString & 223 UTS46::labelToASCII(const UnicodeString &label, UnicodeString &dest, 224 IDNAInfo &info, UErrorCode &errorCode) const { 225 return process(label, TRUE, TRUE, dest, info, errorCode); 226 } 227 228 UnicodeString & 229 UTS46::labelToUnicode(const UnicodeString &label, UnicodeString &dest, 230 IDNAInfo &info, UErrorCode &errorCode) const { 231 return process(label, TRUE, FALSE, dest, info, errorCode); 232 } 233 234 UnicodeString & 235 UTS46::nameToASCII(const UnicodeString &name, UnicodeString &dest, 236 IDNAInfo &info, UErrorCode &errorCode) const { 237 process(name, FALSE, TRUE, dest, info, errorCode); 238 if( dest.length()>=254 && 
(info.errors&UIDNA_ERROR_DOMAIN_NAME_TOO_LONG)==0 && 239 isASCIIString(dest) && 240 (dest.length()>254 || dest[253]!=0x2e) 241 ) { 242 info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; 243 } 244 return dest; 245 } 246 247 UnicodeString & 248 UTS46::nameToUnicode(const UnicodeString &name, UnicodeString &dest, 249 IDNAInfo &info, UErrorCode &errorCode) const { 250 return process(name, FALSE, FALSE, dest, info, errorCode); 251 } 252 253 void 254 UTS46::labelToASCII_UTF8(const StringPiece &label, ByteSink &dest, 255 IDNAInfo &info, UErrorCode &errorCode) const { 256 processUTF8(label, TRUE, TRUE, dest, info, errorCode); 257 } 258 259 void 260 UTS46::labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest, 261 IDNAInfo &info, UErrorCode &errorCode) const { 262 processUTF8(label, TRUE, FALSE, dest, info, errorCode); 263 } 264 265 void 266 UTS46::nameToASCII_UTF8(const StringPiece &name, ByteSink &dest, 267 IDNAInfo &info, UErrorCode &errorCode) const { 268 processUTF8(name, FALSE, TRUE, dest, info, errorCode); 269 } 270 271 void 272 UTS46::nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest, 273 IDNAInfo &info, UErrorCode &errorCode) const { 274 processUTF8(name, FALSE, FALSE, dest, info, errorCode); 275 } 276 277 // UTS #46 data for ASCII characters. 278 // The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase 279 // and passes through all other ASCII characters. 280 // If UIDNA_USE_STD3_RULES is set, then non-LDH characters are disallowed 281 // using this data. 282 // The ASCII fastpath also uses this data. 
// Values: -1=disallowed  0==valid  1==mapped (lowercase)
// Indexed by the ASCII code point (0..0x7f).
static const int8_t asciiData[128]={
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 002D..002E; valid  #  HYPHEN-MINUS..FULL STOP
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0, -1,
    // 0030..0039; valid  #  DIGIT ZERO..DIGIT NINE
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
    // 0041..005A; mapped  #  LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
    -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1,
    // 0061..007A; valid  #  LATIN SMALL LETTER A..LATIN SMALL LETTER Z
    -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1
};

// UTF-16 driver for labelToASCII/labelToUnicode/nameToASCII/nameToUnicode.
//
// src:       input label (isLabel) or domain name (!isLabel); must not be dest.
// dest:      receives the result; set to bogus if errorCode is already a
//            failure or the arguments are rejected.
// info:      reset here, then accumulates the error flags.
// errorCode: set to U_ILLEGAL_ARGUMENT_ERROR for dest==src or a NULL source
//            buffer, U_MEMORY_ALLOCATION_ERROR if dest cannot be sized.
// Returns dest.
//
// A leading run of ASCII is copied (and lowercased) straight into dest.
// The rest goes to processUnicode() as soon as the loop meets a non-ASCII
// code unit, an STD3-disallowed ASCII character, a "??--" label prefix,
// or (single-label mode) a dot.
UnicodeString &
UTS46::process(const UnicodeString &src,
               UBool isLabel, UBool toASCII,
               UnicodeString &dest,
               IDNAInfo &info, UErrorCode &errorCode) const {
    // uts46Norm2.normalize() would do all of this error checking and setup,
    // but with the ASCII fastpath we do not always call it, and do not
    // call it first.
    if(U_FAILURE(errorCode)) {
        dest.setToBogus();
        return dest;
    }
    const UChar *srcArray=src.getBuffer();
    if(&dest==&src || srcArray==NULL) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        dest.setToBogus();
        return dest;
    }
    // Arguments are fine, reset output values.
    dest.remove();
    info.reset();
    int32_t srcLength=src.length();
    if(srcLength==0) {
        info.errors|=UIDNA_ERROR_EMPTY_LABEL;
        return dest;
    }
    UChar *destArray=dest.getBuffer(srcLength);
    if(destArray==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return dest;
    }
    // ASCII fastpath
    UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
    int32_t labelStart=0;
    int32_t i;
    for(i=0;; ++i) {
        if(i==srcLength) {
            // The whole input was handled by the fastpath: finish up and return.
            if(toASCII) {
                if((i-labelStart)>63) {
                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                }
                // There is a trailing dot if labelStart==i.
                if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
                    info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
                }
            }
            info.errors|=info.labelErrors;
            dest.releaseBuffer(i);
            return dest;
        }
        UChar c=srcArray[i];
        if(c>0x7f) {
            break;
        }
        int cData=asciiData[c];
        if(cData>0) {
            destArray[i]=c+0x20;  // Lowercase an uppercase ASCII letter.
        } else if(cData<0 && disallowNonLDHDot) {
            break;  // Replacing with U+FFFD can be complicated for toASCII.
        } else {
            destArray[i]=c;
            if(c==0x2d) {  // hyphen
                if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
                    // "??--..." is Punycode or forbidden.
                    ++i;  // '-' was copied to dest already
                    break;
                }
                if(i==labelStart) {
                    // label starts with "-"
                    info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
                }
                if((i+1)==srcLength || srcArray[i+1]==0x2e) {
                    // label ends with "-"
                    info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
                }
            } else if(c==0x2e) {  // dot
                if(isLabel) {
                    // Replacing with U+FFFD can be complicated for toASCII.
                    ++i;  // '.' was copied to dest already
                    break;
                }
                if(i==labelStart) {
                    info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
                }
                if(toASCII && (i-labelStart)>63) {
                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                }
                // Label finished: fold its errors into the total and start the next one.
                info.errors|=info.labelErrors;
                info.labelErrors=0;
                labelStart=i+1;
            }
        }
    }
    // Slow path: dest holds the first i already-mapped units; processUnicode()
    // maps the rest of src and processes labels from labelStart on.
    info.errors|=info.labelErrors;
    dest.releaseBuffer(i);
    processUnicode(src, labelStart, i, isLabel, toASCII, dest, info, errorCode);
    // Labels before labelStart were only seen by the fastpath, so their BiDi
    // check is done here with isASCIIOkBiDi() if the name turned out to be BiDi.
    if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
        (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(dest.getBuffer(), labelStart)))
    ) {
        info.errors|=UIDNA_ERROR_BIDI;
    }
    return dest;
}

// UTF-8 driver for the four *UTF8 entry points.
//
// src:       input label (isLabel) or domain name (!isLabel) in UTF-8.
// dest:      sink that receives the result in UTF-8.
// info:      reset here, then accumulates the error flags.
// errorCode: set to U_ILLEGAL_ARGUMENT_ERROR if src has a NULL pointer with a
//            non-zero length; nothing is done if it is already a failure.
//
// Inputs of up to 256 bytes go through the same ASCII fastpath as process().
// Whatever the fastpath does not finish, and any longer input, is converted
// to UTF-16 and handled by processUnicode().
void
UTS46::processUTF8(const StringPiece &src,
                   UBool isLabel, UBool toASCII,
                   ByteSink &dest,
                   IDNAInfo &info, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return;
    }
    const char *srcArray=src.data();
    int32_t srcLength=src.length();
    if(srcArray==NULL && srcLength!=0) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    // Arguments are fine, reset output values.
    info.reset();
    if(srcLength==0) {
        info.errors|=UIDNA_ERROR_EMPTY_LABEL;
        dest.Flush();
        return;
    }
    UnicodeString destString;
    int32_t labelStart=0;
    if(srcLength<=256) {  // length of stackArray[]
        // ASCII fastpath
        char stackArray[256];
        int32_t destCapacity;
        char *destArray=dest.GetAppendBuffer(srcLength, srcLength+20,
                                             stackArray, UPRV_LENGTHOF(stackArray), &destCapacity);
        UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
        int32_t i;
        for(i=0;; ++i) {
            if(i==srcLength) {
                // The whole input was handled by the fastpath: output it and return.
                if(toASCII) {
                    if((i-labelStart)>63) {
                        info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                    }
                    // There is a trailing dot if labelStart==i.
                    if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
                        info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
                    }
                }
                info.errors|=info.labelErrors;
                dest.Append(destArray, i);
                dest.Flush();
                return;
            }
            char c=srcArray[i];
            if((int8_t)c<0) {  // (uint8_t)c>0x7f
                break;
            }
            int cData=asciiData[(int)c];  // Cast: gcc warns about indexing with a char.
            if(cData>0) {
                destArray[i]=c+0x20;  // Lowercase an uppercase ASCII letter.
            } else if(cData<0 && disallowNonLDHDot) {
                break;  // Replacing with U+FFFD can be complicated for toASCII.
            } else {
                destArray[i]=c;
                if(c==0x2d) {  // hyphen
                    if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
                        // "??--..." is Punycode or forbidden.
                        break;
                    }
                    if(i==labelStart) {
                        // label starts with "-"
                        info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
                    }
                    if((i+1)==srcLength || srcArray[i+1]==0x2e) {
                        // label ends with "-"
                        info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
                    }
                } else if(c==0x2e) {  // dot
                    if(isLabel) {
                        break;  // Replacing with U+FFFD can be complicated for toASCII.
                    }
                    if(i==labelStart) {
                        info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
                    }
                    if(toASCII && (i-labelStart)>63) {
                        info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                    }
                    // Label finished: fold its errors into the total and start the next one.
                    info.errors|=info.labelErrors;
                    info.labelErrors=0;
                    labelStart=i+1;
                }
            }
        }
        info.errors|=info.labelErrors;
        // Convert the processed ASCII prefix of the current label to UTF-16.
        int32_t mappingStart=i-labelStart;
        destString=UnicodeString::fromUTF8(StringPiece(destArray+labelStart, mappingStart));
        // Output the previous ASCII labels and process the rest of src in UTF-16.
        dest.Append(destArray, labelStart);
        processUnicode(UnicodeString::fromUTF8(StringPiece(src, labelStart)), 0, mappingStart,
                       isLabel, toASCII,
                       destString, info, errorCode);
    } else {
        // src is too long for the ASCII fastpath implementation.
        processUnicode(UnicodeString::fromUTF8(src), 0, 0,
                       isLabel, toASCII,
                       destString, info, errorCode);
    }
    destString.toUTF8(dest);  // calls dest.Flush()
    if(toASCII && !isLabel) {
        // length==labelStart==254 means that there is a trailing dot (ok) and
        // destString is empty (do not index at 253-labelStart).
        int32_t length=labelStart+destString.length();
        if( length>=254 && isASCIIString(destString) &&
            (length>254 ||
                (labelStart<254 && destString[253-labelStart]!=0x2e))
        ) {
            info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
        }
    }
    // The first labelStart bytes of src were only seen by the fastpath, so their
    // BiDi check is done here with isASCIIOkBiDi() if the name turned out to be BiDi.
    if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
        (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(srcArray, labelStart)))
    ) {
        info.errors|=UIDNA_ERROR_BIDI;
    }
}

// Maps/normalizes src from index mappingStart into dest, then splits dest into
// labels at dots (unless isLabel) and runs processLabel() on each of them.
//
// labelStart:   index in dest where the first label to process begins.
// mappingStart: index in src of the first unit that still needs mapping;
//               if 0, dest is overwritten, otherwise the mapped text is appended.
// Returns dest; returns early, with dest as it is at that point, on failure.
UnicodeString &
UTS46::processUnicode(const UnicodeString &src,
                      int32_t labelStart, int32_t mappingStart,
                      UBool isLabel, UBool toASCII,
                      UnicodeString &dest,
                      IDNAInfo &info, UErrorCode &errorCode) const {
    if(mappingStart==0) {
        uts46Norm2.normalize(src, dest, errorCode);
    } else {
        uts46Norm2.normalizeSecondAndAppend(dest, src.tempSubString(mappingStart), errorCode);
    }
    if(U_FAILURE(errorCode)) {
        return dest;
    }
    // Deviation characters are mapped unless the matching NONTRANSITIONAL option is set.
    UBool doMapDevChars=
        toASCII ? (options&UIDNA_NONTRANSITIONAL_TO_ASCII)==0 :
                  (options&UIDNA_NONTRANSITIONAL_TO_UNICODE)==0;
    const UChar *destArray=dest.getBuffer();
    int32_t destLength=dest.length();
    int32_t labelLimit=labelStart;
    while(labelLimit<destLength) {
        UChar c=destArray[labelLimit];
        if(c==0x2e && !isLabel) {
            int32_t labelLength=labelLimit-labelStart;
            int32_t newLength=processLabel(dest, labelStart, labelLength,
                                            toASCII, info, errorCode);
            info.errors|=info.labelErrors;
            info.labelErrors=0;
            if(U_FAILURE(errorCode)) {
                return dest;
            }
            // processLabel() may have changed dest: refetch the buffer and
            // adjust the indexes by the change in label length.
            destArray=dest.getBuffer();
            destLength+=newLength-labelLength;
            labelLimit=labelStart+=newLength+1;
        } else if(0xdf<=c && c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) {
            // One of the four deviation characters: U+00DF, U+03C2, U+200C, U+200D.
            info.isTransDiff=TRUE;
            if(doMapDevChars) {
                destLength=mapDevChars(dest, labelStart, labelLimit, errorCode);
                if(U_FAILURE(errorCode)) {
                    return dest;
                }
                destArray=dest.getBuffer();
                // Do not increment labelLimit in case c was removed.
                // All deviation characters have been mapped, no need to check for them again.
                doMapDevChars=FALSE;
            } else {
                ++labelLimit;
            }
        } else {
            ++labelLimit;
        }
    }
    // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok)
    // but not an empty label elsewhere nor a completely empty domain name.
    // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0.
    if(0==labelStart || labelStart<labelLimit) {
        processLabel(dest, labelStart, labelLimit-labelStart,
                      toASCII, info, errorCode);
        info.errors|=info.labelErrors;
    }
    return dest;
}

// Maps the deviation characters in dest from index mappingStart to the end:
// U+00DF becomes "ss", U+03C2 becomes U+03C3, U+200C and U+200D are removed.
// If anything was mapped, the text from labelStart on is normalized again.
// Returns the new dest.length(); returns 0 if errorCode is already a failure.
int32_t
UTS46::mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
                   UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    int32_t length=dest.length();
    // Ask for one extra unit up front if the first character is a sharp s.
    UChar *s=dest.getBuffer(dest[mappingStart]==0xdf ? length+1 : length);
    if(s==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return length;
    }
    int32_t capacity=dest.getCapacity();
    UBool didMapDevChars=FALSE;
    int32_t readIndex=mappingStart, writeIndex=mappingStart;
    do {
        UChar c=s[readIndex++];
        switch(c) {
        case 0xdf:
            // Map sharp s to ss.
            didMapDevChars=TRUE;
            s[writeIndex++]=0x73;  // Replace sharp s with first s.
            // Insert second s and account for possible buffer reallocation.
            if(writeIndex==readIndex) {
                // No gap left by earlier removals: shift the tail right by one unit.
                if(length==capacity) {
                    dest.releaseBuffer(length);
                    s=dest.getBuffer(length+1);
                    if(s==NULL) {
                        errorCode=U_MEMORY_ALLOCATION_ERROR;
                        return length;
                    }
                    capacity=dest.getCapacity();
                }
                u_memmove(s+writeIndex+1, s+writeIndex, length-writeIndex);
                ++readIndex;
            }
            s[writeIndex++]=0x73;
            ++length;
            break;
        case 0x3c2:  // Map final sigma to nonfinal sigma.
            didMapDevChars=TRUE;
            s[writeIndex++]=0x3c3;
            break;
        case 0x200c:  // Ignore/remove ZWNJ.
        case 0x200d:  // Ignore/remove ZWJ.
            didMapDevChars=TRUE;
            --length;
            break;
        default:
            // Only really necessary if writeIndex was different from readIndex.
            s[writeIndex++]=c;
            break;
        }
    } while(writeIndex<length);
    dest.releaseBuffer(length);
    if(didMapDevChars) {
        // Mapping deviation characters might have resulted in an un-NFC string.
        // We could use either the NFC or the UTS #46 normalizer.
        // By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file.
        UnicodeString normalized;
        uts46Norm2.normalize(dest.tempSubString(labelStart), normalized, errorCode);
        if(U_SUCCESS(errorCode)) {
            dest.replace(labelStart, 0x7fffffff, normalized);
            if(dest.isBogus()) {
                errorCode=U_MEMORY_ALLOCATION_ERROR;
            }
            return dest.length();
        }
    }
    return length;
}

// Some non-ASCII characters are equivalent to sequences with
// non-LDH ASCII characters.
// To find them:
// grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt)
// Returns TRUE for U+2260, U+226E and U+226F.
static inline UBool
isNonASCIIDisallowedSTD3Valid(UChar32 c) {
    return c==0x2260 || c==0x226E || c==0x226F;
}

// Replace the label in dest with the label string, if the label was modified.
// If &label==&dest then the label was modified in-place and labelLength
// is the new label length, different from label.length().
// If &label!=&dest then labelLength==label.length().
// Returns labelLength (= the new label length).
// Returns 0 if errorCode is already a failure or dest becomes bogus
// (the latter sets U_MEMORY_ALLOCATION_ERROR).
static int32_t
replaceLabel(UnicodeString &dest, int32_t destLabelStart, int32_t destLabelLength,
             const UnicodeString &label, int32_t labelLength, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    if(&label!=&dest) {
        dest.replace(destLabelStart, destLabelLength, label);
        if(dest.isBogus()) {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
    }
    return labelLength;
}

// Validates one label of dest and, for toASCII, converts it to Punycode if it
// contains non-ASCII characters.
//
// dest:        string that contains the label; may be modified in place.
// labelStart:  index of the label in dest.
// labelLength: length of the label in dest.
// info:        label-level error flags are OR-ed into info.labelErrors.
// Returns the new length of the label in dest; returns 0 if errorCode is
// already a failure.
//
// A label that starts with "xn--" is decoded first and the checks then run on
// the decoded text. If such a label is bad it stays in dest but is altered by
// markBadACELabel(); if it is good it stays in dest unchanged for toASCII.
int32_t
UTS46::processLabel(UnicodeString &dest,
                    int32_t labelStart, int32_t labelLength,
                    UBool toASCII,
                    IDNAInfo &info, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    UnicodeString fromPunycode;
    UnicodeString *labelString;
    const UChar *label=dest.getBuffer()+labelStart;
    // Remember where the label sits in dest: labelStart/labelLength are
    // rebound below to the decoded string when the label was Punycode.
    int32_t destLabelStart=labelStart;
    int32_t destLabelLength=labelLength;
    UBool wasPunycode;
    if(labelLength>=4 && label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && label[3]==0x2d) {
        // Label starts with "xn--", try to un-Punycode it.
        wasPunycode=TRUE;
        UChar *unicodeBuffer=fromPunycode.getBuffer(-1);  // capacity==-1: most labels should fit
        if(unicodeBuffer==NULL) {
            // Should never occur if we used capacity==-1 which uses the internal buffer.
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return labelLength;
        }
        // Use a separate error code so that a Punycode failure becomes a label
        // error flag rather than a failure of the whole operation.
        UErrorCode punycodeErrorCode=U_ZERO_ERROR;
        int32_t unicodeLength=u_strFromPunycode(label+4, labelLength-4,
                                                unicodeBuffer, fromPunycode.getCapacity(),
                                                NULL, &punycodeErrorCode);
        if(punycodeErrorCode==U_BUFFER_OVERFLOW_ERROR) {
            // Retry once with a buffer of the reported size.
            fromPunycode.releaseBuffer(0);
            unicodeBuffer=fromPunycode.getBuffer(unicodeLength);
            if(unicodeBuffer==NULL) {
                errorCode=U_MEMORY_ALLOCATION_ERROR;
                return labelLength;
            }
            punycodeErrorCode=U_ZERO_ERROR;
            unicodeLength=u_strFromPunycode(label+4, labelLength-4,
                                            unicodeBuffer, fromPunycode.getCapacity(),
                                            NULL, &punycodeErrorCode);
        }
        fromPunycode.releaseBuffer(unicodeLength);
        if(U_FAILURE(punycodeErrorCode)) {
            info.labelErrors|=UIDNA_ERROR_PUNYCODE;
            return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
        }
        // Check for NFC, and for characters that are not
        // valid or deviation characters according to the normalizer.
        // If there is something wrong, then the string will change.
        // Note that the normalizer passes through non-LDH ASCII and deviation characters.
        // Deviation characters are ok in Punycode even in transitional processing.
        // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
        // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
        UBool isValid=uts46Norm2.isNormalized(fromPunycode, errorCode);
        if(U_FAILURE(errorCode)) {
            return labelLength;
        }
        if(!isValid) {
            info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
            return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
        }
        // From here on the checks operate on the decoded label.
        labelString=&fromPunycode;
        label=fromPunycode.getBuffer();
        labelStart=0;
        labelLength=fromPunycode.length();
    } else {
        wasPunycode=FALSE;
        labelString=&dest;
    }
    // Validity check
    if(labelLength==0) {
        info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
        return replaceLabel(dest, destLabelStart, destLabelLength,
                            *labelString, labelLength, errorCode);
    }
    // labelLength>0
    if(labelLength>=4 && label[2]==0x2d && label[3]==0x2d) {
        // label starts with "??--"
        info.labelErrors|=UIDNA_ERROR_HYPHEN_3_4;
    }
    if(label[0]==0x2d) {
        // label starts with "-"
        info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
    }
    if(label[labelLength-1]==0x2d) {
        // label ends with "-"
        info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
    }
    // If the label was not a Punycode label, then it was the result of
    // mapping, normalization and label segmentation.
    // If the label was in Punycode, then we mapped it again above
    // and checked its validity.
    // Now we handle the STD3 restriction to LDH characters (if set)
    // and we look for U+FFFD which indicates disallowed characters
    // in a non-Punycode label or U+FFFD itself in a Punycode label.
    // We also check for dots which can come from the input to a single-label function.
    // Ok to cast away const because we own the UnicodeString.
    UChar *s=(UChar *)label;
    const UChar *limit=label+labelLength;
    // Bitwise OR of all non-ASCII code units seen; used below to skip checks
    // that cannot apply to this label.
    UChar oredChars=0;
    // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed.
    UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
    do {
        UChar c=*s;
        if(c<=0x7f) {
            if(c==0x2e) {
                info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT;
                *s=0xfffd;
            } else if(disallowNonLDHDot && asciiData[c]<0) {
                info.labelErrors|=UIDNA_ERROR_DISALLOWED;
                *s=0xfffd;
            }
        } else {
            oredChars|=c;
            if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) {
                info.labelErrors|=UIDNA_ERROR_DISALLOWED;
                *s=0xfffd;
            } else if(c==0xfffd) {
                info.labelErrors|=UIDNA_ERROR_DISALLOWED;
            }
        }
        ++s;
    } while(s<limit);
    // Check for a leading combining mark after other validity checks
    // so that we don't report UIDNA_ERROR_DISALLOWED for the U+FFFD from here.
    UChar32 c;
    int32_t cpLength=0;
    // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD.
    U16_NEXT_UNSAFE(label, cpLength, c);
    if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) {
        info.labelErrors|=UIDNA_ERROR_LEADING_COMBINING_MARK;
        labelString->replace(labelStart, cpLength, (UChar)0xfffd);
        // replace() may have changed the buffer, so fetch the pointer again.
        label=labelString->getBuffer()+labelStart;
        labelLength+=1-cpLength;
        if(labelString==&dest) {
            destLabelLength=labelLength;
        }
    }
    if((info.labelErrors&severeErrors)==0) {
        // Do contextual checks only if we do not have U+FFFD from a severe error
        // because U+FFFD can make these checks fail.
        if((options&UIDNA_CHECK_BIDI)!=0 && (!info.isBiDi || info.isOkBiDi)) {
            checkLabelBiDi(label, labelLength, info);
        }
        if( (options&UIDNA_CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c &&
            !isLabelOkContextJ(label, labelLength)
        ) {
            info.labelErrors|=UIDNA_ERROR_CONTEXTJ;
        }
        if((options&UIDNA_CHECK_CONTEXTO)!=0 && oredChars>=0xb7) {
            checkLabelContextO(label, labelLength, info);
        }
        if(toASCII) {
            if(wasPunycode) {
                // Leave a Punycode label unchanged if it has no severe errors.
                if(destLabelLength>63) {
                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                }
                return destLabelLength;
            } else if(oredChars>=0x80) {
                // Contains non-ASCII characters.
                UnicodeString punycode;
                UChar *buffer=punycode.getBuffer(63);  // 63==maximum DNS label length
                if(buffer==NULL) {
                    errorCode=U_MEMORY_ALLOCATION_ERROR;
                    return destLabelLength;
                }
                buffer[0]=0x78;  // Write "xn--".
                buffer[1]=0x6e;
                buffer[2]=0x2d;
                buffer[3]=0x2d;
                int32_t punycodeLength=u_strToPunycode(label, labelLength,
                                                      buffer+4, punycode.getCapacity()-4,
                                                      NULL, &errorCode);
                if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
                    // Retry once with a buffer of the reported size; the "xn--"
                    // prefix is kept by releasing the buffer with length 4.
                    errorCode=U_ZERO_ERROR;
                    punycode.releaseBuffer(4);
                    buffer=punycode.getBuffer(4+punycodeLength);
                    if(buffer==NULL) {
                        errorCode=U_MEMORY_ALLOCATION_ERROR;
                        return destLabelLength;
                    }
                    punycodeLength=u_strToPunycode(label, labelLength,
                                                  buffer+4, punycode.getCapacity()-4,
                                                  NULL, &errorCode);
                }
                punycodeLength+=4;
                punycode.releaseBuffer(punycodeLength);
                if(U_FAILURE(errorCode)) {
                    return destLabelLength;
                }
                if(punycodeLength>63) {
                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                }
                return replaceLabel(dest, destLabelStart, destLabelLength,
                                    punycode, punycodeLength, errorCode);
            } else {
                // all-ASCII label
                if(labelLength>63) {
                    info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
                }
            }
        }
    } else {
        // If a Punycode label has severe errors,
        // then leave it but make sure it does not look valid.
        if(wasPunycode) {
            info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
            return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info, errorCode);
        }
    }
    return replaceLabel(dest, destLabelStart, destLabelLength,
                        *labelString, labelLength, errorCode);
}

// Make sure an ACE label does not look valid.
// Append U+FFFD if the label has only LDH characters.
// If UIDNA_USE_STD3_RULES, also replace disallowed ASCII characters with U+FFFD.
// Dots inside the label are always replaced with U+FFFD.
// The scan starts after the first four units of the label (the "xn--" prefix).
// Returns the new label length; returns 0 if errorCode is already a failure
// or dest becomes bogus.
int32_t
UTS46::markBadACELabel(UnicodeString &dest,
                       int32_t labelStart, int32_t labelLength,
                       UBool toASCII, IDNAInfo &info, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
    UBool isASCII=TRUE;
    UBool onlyLDH=TRUE;
    const UChar *label=dest.getBuffer()+labelStart;
    // Ok to cast away const because we own the UnicodeString.
    UChar *s=(UChar *)label+4;  // After the initial "xn--".
    const UChar *limit=label+labelLength;
    do {
        UChar c=*s;
        if(c<=0x7f) {
            if(c==0x2e) {
                info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT;
                *s=0xfffd;
                isASCII=onlyLDH=FALSE;
            } else if(asciiData[c]<0) {
                onlyLDH=FALSE;
                if(disallowNonLDHDot) {
                    *s=0xfffd;
                    isASCII=FALSE;
                }
            }
        } else {
            isASCII=onlyLDH=FALSE;
        }
    } while(++s<limit);
    if(onlyLDH) {
        // The label would still look like a valid ACE label: append U+FFFD.
        dest.insert(labelStart+labelLength, (UChar)0xfffd);
        if(dest.isBogus()) {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        ++labelLength;
    } else {
        if(toASCII && isASCII && labelLength>63) {
            info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
        }
    }
    return labelLength;
}

// Bit masks over UCharDirection values, built with U_MASK(), for the
// BiDi rule checks in checkLabelBiDi().
const uint32_t L_MASK=U_MASK(U_LEFT_TO_RIGHT);
const uint32_t R_AL_MASK=U_MASK(U_RIGHT_TO_LEFT)|U_MASK(U_RIGHT_TO_LEFT_ARABIC);
const uint32_t L_R_AL_MASK=L_MASK|R_AL_MASK;

const uint32_t R_AL_AN_MASK=R_AL_MASK|U_MASK(U_ARABIC_NUMBER);

const uint32_t EN_AN_MASK=U_MASK(U_EUROPEAN_NUMBER)|U_MASK(U_ARABIC_NUMBER);
const uint32_t R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK;
const uint32_t L_EN_MASK=L_MASK|U_MASK(U_EUROPEAN_NUMBER);

const uint32_t ES_CS_ET_ON_BN_NSM_MASK=
    U_MASK(U_EUROPEAN_NUMBER_SEPARATOR)|
    U_MASK(U_COMMON_NUMBER_SEPARATOR)|
    U_MASK(U_EUROPEAN_NUMBER_TERMINATOR)|
    U_MASK(U_OTHER_NEUTRAL)|
    U_MASK(U_BOUNDARY_NEUTRAL)|
    U_MASK(U_DIR_NON_SPACING_MARK);
const uint32_t L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
const uint32_t R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK;

// We scan the whole label and check both for whether it contains RTL characters
// and whether it passes the BiDi Rule.
// In a BiDi domain name, all labels must pass the BiDi Rule, but we might find
// that a domain name is a BiDi domain name (has an RTL label) only after
// processing several earlier labels.
//
// Results are reported through info: isOkBiDi is set to FALSE when a condition
// of the rule is violated, isBiDi is set to TRUE when the label contains a
// character of type R, AL or AN. Neither flag is ever reset here.
void
UTS46::checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) const {
    // IDNA2008 BiDi rule
    // Get the directionality of the first character.
    UChar32 c;
    int32_t i=0;
    U16_NEXT_UNSAFE(label, i, c);
    uint32_t firstMask=U_MASK(u_charDirection(c));
    // 1. The first character must be a character with BIDI property L, R
    // or AL.  If it has the R or AL property, it is an RTL label; if it
    // has the L property, it is an LTR label.
    if((firstMask&~L_R_AL_MASK)!=0) {
        info.isOkBiDi=FALSE;
    }
    // Get the directionality of the last non-NSM character.
    // U16_PREV_UNSAFE moves labelLength backwards, so afterwards [i, labelLength)
    // covers only the characters between the first and that last one.
    uint32_t lastMask;
    for(;;) {
        if(i>=labelLength) {
            lastMask=firstMask;
            break;
        }
        U16_PREV_UNSAFE(label, labelLength, c);
        UCharDirection dir=u_charDirection(c);
        if(dir!=U_DIR_NON_SPACING_MARK) {
            lastMask=U_MASK(dir);
            break;
        }
    }
    // 3. In an RTL label, the end of the label must be a character with
    // BIDI property R, AL, EN or AN, followed by zero or more
    // characters with BIDI property NSM.
    // 6. In an LTR label, the end of the label must be a character with
    // BIDI property L or EN, followed by zero or more characters with
    // BIDI property NSM.
    if( (firstMask&L_MASK)!=0 ?
            (lastMask&~L_EN_MASK)!=0 :
            (lastMask&~R_AL_EN_AN_MASK)!=0
    ) {
        info.isOkBiDi=FALSE;
    }
    // Get the directionalities of the intervening characters.
    uint32_t mask=0;
    while(i<labelLength) {
        U16_NEXT_UNSAFE(label, i, c);
        mask|=U_MASK(u_charDirection(c));
    }
    if(firstMask&L_MASK) {
        // 5. In an LTR label, only characters with the BIDI properties L, EN,
        // ES, CS, ET, ON, BN and NSM are allowed.
        if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
            info.isOkBiDi=FALSE;
        }
    } else {
        // 2. In an RTL label, only characters with the BIDI properties R, AL,
        // AN, EN, ES, CS, ET, ON, BN and NSM are allowed.
        if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
            info.isOkBiDi=FALSE;
        }
        // 4. In an RTL label, if an EN is present, no AN may be present, and
        // vice versa.
        if((mask&EN_AN_MASK)==EN_AN_MASK) {
            info.isOkBiDi=FALSE;
        }
    }
    // An RTL label is a label that contains at least one character of type
    // R, AL or AN. [...]
    // A "BIDI domain name" is a domain name that contains at least one RTL
    // label. [...]
    // The following rule, consisting of six conditions, applies to labels
    // in BIDI domain names.
    if(((firstMask|mask|lastMask)&R_AL_AN_MASK)!=0) {
        info.isBiDi=TRUE;
    }
}

// Special code for the ASCII prefix of a BiDi domain name.
// The ASCII prefix is all-LTR.

// IDNA2008 BiDi rule, parts relevant to ASCII labels:
// 1. The first character must be a character with BIDI property L [...]
// 5. In an LTR label, only characters with the BIDI properties L, EN,
// ES, CS, ET, ON, BN and NSM are allowed.
// 6. In an LTR label, the end of the label must be a character with
// BIDI property L or EN [...]

// UTF-16 version, called for mapped ASCII prefix.
// Cannot contain uppercase A-Z.
// s[length-1] must be the trailing dot.
1064 static UBool 1065 isASCIIOkBiDi(const UChar *s, int32_t length) { 1066 int32_t labelStart=0; 1067 for(int32_t i=0; i<length; ++i) { 1068 UChar c=s[i]; 1069 if(c==0x2e) { // dot 1070 if(i>labelStart) { 1071 c=s[i-1]; 1072 if(!(0x61<=c && c<=0x7a) && !(0x30<=c && c<=0x39)) { 1073 // Last character in the label is not an L or EN. 1074 return FALSE; 1075 } 1076 } 1077 labelStart=i+1; 1078 } else if(i==labelStart) { 1079 if(!(0x61<=c && c<=0x7a)) { 1080 // First character in the label is not an L. 1081 return FALSE; 1082 } 1083 } else { 1084 if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { 1085 // Intermediate character in the label is a B, S or WS. 1086 return FALSE; 1087 } 1088 } 1089 } 1090 return TRUE; 1091 } 1092 1093 // UTF-8 version, called for source ASCII prefix. 1094 // Can contain uppercase A-Z. 1095 // s[length-1] must be the trailing dot. 1096 static UBool 1097 isASCIIOkBiDi(const char *s, int32_t length) { 1098 int32_t labelStart=0; 1099 for(int32_t i=0; i<length; ++i) { 1100 char c=s[i]; 1101 if(c==0x2e) { // dot 1102 if(i>labelStart) { 1103 c=s[i-1]; 1104 if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a) && !(0x30<=c && c<=0x39)) { 1105 // Last character in the label is not an L or EN. 1106 return FALSE; 1107 } 1108 } 1109 labelStart=i+1; 1110 } else if(i==labelStart) { 1111 if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a)) { 1112 // First character in the label is not an L. 1113 return FALSE; 1114 } 1115 } else { 1116 if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { 1117 // Intermediate character in the label is a B, S or WS. 1118 return FALSE; 1119 } 1120 } 1121 } 1122 return TRUE; 1123 } 1124 1125 UBool 1126 UTS46::isLabelOkContextJ(const UChar *label, int32_t labelLength) const { 1127 const UBiDiProps *bdp=ubidi_getSingleton(); 1128 // [IDNA2008-Tables] 1129 // 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER 1130 for(int32_t i=0; i<labelLength; ++i) { 1131 if(label[i]==0x200c) { 1132 // Appendix A.1. 
ZERO WIDTH NON-JOINER 1133 // Rule Set: 1134 // False; 1135 // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; 1136 // If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C 1137 // (Joining_Type:T)*(Joining_Type:{R,D})) Then True; 1138 if(i==0) { 1139 return FALSE; 1140 } 1141 UChar32 c; 1142 int32_t j=i; 1143 U16_PREV_UNSAFE(label, j, c); 1144 if(uts46Norm2.getCombiningClass(c)==9) { 1145 continue; 1146 } 1147 // check precontext (Joining_Type:{L,D})(Joining_Type:T)* 1148 for(;;) { 1149 UJoiningType type=ubidi_getJoiningType(bdp, c); 1150 if(type==U_JT_TRANSPARENT) { 1151 if(j==0) { 1152 return FALSE; 1153 } 1154 U16_PREV_UNSAFE(label, j, c); 1155 } else if(type==U_JT_LEFT_JOINING || type==U_JT_DUAL_JOINING) { 1156 break; // precontext fulfilled 1157 } else { 1158 return FALSE; 1159 } 1160 } 1161 // check postcontext (Joining_Type:T)*(Joining_Type:{R,D}) 1162 for(j=i+1;;) { 1163 if(j==labelLength) { 1164 return FALSE; 1165 } 1166 U16_NEXT_UNSAFE(label, j, c); 1167 UJoiningType type=ubidi_getJoiningType(bdp, c); 1168 if(type==U_JT_TRANSPARENT) { 1169 // just skip this character 1170 } else if(type==U_JT_RIGHT_JOINING || type==U_JT_DUAL_JOINING) { 1171 break; // postcontext fulfilled 1172 } else { 1173 return FALSE; 1174 } 1175 } 1176 } else if(label[i]==0x200d) { 1177 // Appendix A.2. ZERO WIDTH JOINER (U+200D) 1178 // Rule Set: 1179 // False; 1180 // If Canonical_Combining_Class(Before(cp)) .eq. 
Virama Then True; 1181 if(i==0) { 1182 return FALSE; 1183 } 1184 UChar32 c; 1185 int32_t j=i; 1186 U16_PREV_UNSAFE(label, j, c); 1187 if(uts46Norm2.getCombiningClass(c)!=9) { 1188 return FALSE; 1189 } 1190 } 1191 } 1192 return TRUE; 1193 } 1194 1195 void 1196 UTS46::checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const { 1197 int32_t labelEnd=labelLength-1; // inclusive 1198 int32_t arabicDigits=0; // -1 for 066x, +1 for 06Fx 1199 for(int32_t i=0; i<=labelEnd; ++i) { 1200 UChar32 c=label[i]; 1201 if(c<0xb7) { 1202 // ASCII fastpath 1203 } else if(c<=0x6f9) { 1204 if(c==0xb7) { 1205 // Appendix A.3. MIDDLE DOT (U+00B7) 1206 // Rule Set: 1207 // False; 1208 // If Before(cp) .eq. U+006C And 1209 // After(cp) .eq. U+006C Then True; 1210 if(!(0<i && label[i-1]==0x6c && 1211 i<labelEnd && label[i+1]==0x6c)) { 1212 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; 1213 } 1214 } else if(c==0x375) { 1215 // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375) 1216 // Rule Set: 1217 // False; 1218 // If Script(After(cp)) .eq. Greek Then True; 1219 UScriptCode script=USCRIPT_INVALID_CODE; 1220 if(i<labelEnd) { 1221 UErrorCode errorCode=U_ZERO_ERROR; 1222 int32_t j=i+1; 1223 U16_NEXT(label, j, labelLength, c); 1224 script=uscript_getScript(c, &errorCode); 1225 } 1226 if(script!=USCRIPT_GREEK) { 1227 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; 1228 } 1229 } else if(c==0x5f3 || c==0x5f4) { 1230 // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3) 1231 // Rule Set: 1232 // False; 1233 // If Script(Before(cp)) .eq. Hebrew Then True; 1234 // 1235 // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4) 1236 // Rule Set: 1237 // False; 1238 // If Script(Before(cp)) .eq. 
Hebrew Then True; 1239 UScriptCode script=USCRIPT_INVALID_CODE; 1240 if(0<i) { 1241 UErrorCode errorCode=U_ZERO_ERROR; 1242 int32_t j=i; 1243 U16_PREV(label, 0, j, c); 1244 script=uscript_getScript(c, &errorCode); 1245 } 1246 if(script!=USCRIPT_HEBREW) { 1247 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; 1248 } 1249 } else if(0x660<=c /* && c<=0x6f9 */) { 1250 // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669) 1251 // Rule Set: 1252 // True; 1253 // For All Characters: 1254 // If cp .in. 06F0..06F9 Then False; 1255 // End For; 1256 // 1257 // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9) 1258 // Rule Set: 1259 // True; 1260 // For All Characters: 1261 // If cp .in. 0660..0669 Then False; 1262 // End For; 1263 if(c<=0x669) { 1264 if(arabicDigits>0) { 1265 info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS; 1266 } 1267 arabicDigits=-1; 1268 } else if(0x6f0<=c) { 1269 if(arabicDigits<0) { 1270 info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS; 1271 } 1272 arabicDigits=1; 1273 } 1274 } 1275 } else if(c==0x30fb) { 1276 // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB) 1277 // Rule Set: 1278 // False; 1279 // For All Characters: 1280 // If Script(cp) .in. 
{Hiragana, Katakana, Han} Then True; 1281 // End For; 1282 UErrorCode errorCode=U_ZERO_ERROR; 1283 for(int j=0;;) { 1284 if(j>labelEnd) { 1285 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION; 1286 break; 1287 } 1288 U16_NEXT(label, j, labelLength, c); 1289 UScriptCode script=uscript_getScript(c, &errorCode); 1290 if(script==USCRIPT_HIRAGANA || script==USCRIPT_KATAKANA || script==USCRIPT_HAN) { 1291 break; 1292 } 1293 } 1294 } 1295 } 1296 } 1297 1298 U_NAMESPACE_END 1299 1300 // C API ------------------------------------------------------------------- *** 1301 1302 U_NAMESPACE_USE 1303 1304 U_CAPI UIDNA * U_EXPORT2 1305 uidna_openUTS46(uint32_t options, UErrorCode *pErrorCode) { 1306 return reinterpret_cast<UIDNA *>(IDNA::createUTS46Instance(options, *pErrorCode)); 1307 } 1308 1309 U_CAPI void U_EXPORT2 1310 uidna_close(UIDNA *idna) { 1311 delete reinterpret_cast<IDNA *>(idna); 1312 } 1313 1314 static UBool 1315 checkArgs(const void *label, int32_t length, 1316 void *dest, int32_t capacity, 1317 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1318 if(U_FAILURE(*pErrorCode)) { 1319 return FALSE; 1320 } 1321 // sizeof(UIDNAInfo)=16 in the first API version. 1322 if(pInfo==NULL || pInfo->size<16) { 1323 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1324 return FALSE; 1325 } 1326 if( (label==NULL ? length!=0 : length<-1) || 1327 (dest==NULL ? capacity!=0 : capacity<0) || 1328 (dest==label && label!=NULL) 1329 ) { 1330 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1331 return FALSE; 1332 } 1333 // Set all *pInfo bytes to 0 except for the size field itself. 
1334 uprv_memset(&pInfo->size+1, 0, pInfo->size-sizeof(pInfo->size)); 1335 return TRUE; 1336 } 1337 1338 static void 1339 idnaInfoToStruct(IDNAInfo &info, UIDNAInfo *pInfo) { 1340 pInfo->isTransitionalDifferent=info.isTransitionalDifferent(); 1341 pInfo->errors=info.getErrors(); 1342 } 1343 1344 U_CAPI int32_t U_EXPORT2 1345 uidna_labelToASCII(const UIDNA *idna, 1346 const UChar *label, int32_t length, 1347 UChar *dest, int32_t capacity, 1348 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1349 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { 1350 return 0; 1351 } 1352 UnicodeString src((UBool)(length<0), label, length); 1353 UnicodeString destString(dest, 0, capacity); 1354 IDNAInfo info; 1355 reinterpret_cast<const IDNA *>(idna)->labelToASCII(src, destString, info, *pErrorCode); 1356 idnaInfoToStruct(info, pInfo); 1357 return destString.extract(dest, capacity, *pErrorCode); 1358 } 1359 1360 U_CAPI int32_t U_EXPORT2 1361 uidna_labelToUnicode(const UIDNA *idna, 1362 const UChar *label, int32_t length, 1363 UChar *dest, int32_t capacity, 1364 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1365 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { 1366 return 0; 1367 } 1368 UnicodeString src((UBool)(length<0), label, length); 1369 UnicodeString destString(dest, 0, capacity); 1370 IDNAInfo info; 1371 reinterpret_cast<const IDNA *>(idna)->labelToUnicode(src, destString, info, *pErrorCode); 1372 idnaInfoToStruct(info, pInfo); 1373 return destString.extract(dest, capacity, *pErrorCode); 1374 } 1375 1376 U_CAPI int32_t U_EXPORT2 1377 uidna_nameToASCII(const UIDNA *idna, 1378 const UChar *name, int32_t length, 1379 UChar *dest, int32_t capacity, 1380 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1381 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { 1382 return 0; 1383 } 1384 UnicodeString src((UBool)(length<0), name, length); 1385 UnicodeString destString(dest, 0, capacity); 1386 IDNAInfo info; 1387 reinterpret_cast<const IDNA 
*>(idna)->nameToASCII(src, destString, info, *pErrorCode); 1388 idnaInfoToStruct(info, pInfo); 1389 return destString.extract(dest, capacity, *pErrorCode); 1390 } 1391 1392 U_CAPI int32_t U_EXPORT2 1393 uidna_nameToUnicode(const UIDNA *idna, 1394 const UChar *name, int32_t length, 1395 UChar *dest, int32_t capacity, 1396 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1397 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { 1398 return 0; 1399 } 1400 UnicodeString src((UBool)(length<0), name, length); 1401 UnicodeString destString(dest, 0, capacity); 1402 IDNAInfo info; 1403 reinterpret_cast<const IDNA *>(idna)->nameToUnicode(src, destString, info, *pErrorCode); 1404 idnaInfoToStruct(info, pInfo); 1405 return destString.extract(dest, capacity, *pErrorCode); 1406 } 1407 1408 U_CAPI int32_t U_EXPORT2 1409 uidna_labelToASCII_UTF8(const UIDNA *idna, 1410 const char *label, int32_t length, 1411 char *dest, int32_t capacity, 1412 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1413 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { 1414 return 0; 1415 } 1416 StringPiece src(label, length<0 ? uprv_strlen(label) : length); 1417 CheckedArrayByteSink sink(dest, capacity); 1418 IDNAInfo info; 1419 reinterpret_cast<const IDNA *>(idna)->labelToASCII_UTF8(src, sink, info, *pErrorCode); 1420 idnaInfoToStruct(info, pInfo); 1421 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); 1422 } 1423 1424 U_CAPI int32_t U_EXPORT2 1425 uidna_labelToUnicodeUTF8(const UIDNA *idna, 1426 const char *label, int32_t length, 1427 char *dest, int32_t capacity, 1428 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1429 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { 1430 return 0; 1431 } 1432 StringPiece src(label, length<0 ? 
uprv_strlen(label) : length); 1433 CheckedArrayByteSink sink(dest, capacity); 1434 IDNAInfo info; 1435 reinterpret_cast<const IDNA *>(idna)->labelToUnicodeUTF8(src, sink, info, *pErrorCode); 1436 idnaInfoToStruct(info, pInfo); 1437 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); 1438 } 1439 1440 U_CAPI int32_t U_EXPORT2 1441 uidna_nameToASCII_UTF8(const UIDNA *idna, 1442 const char *name, int32_t length, 1443 char *dest, int32_t capacity, 1444 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1445 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { 1446 return 0; 1447 } 1448 StringPiece src(name, length<0 ? uprv_strlen(name) : length); 1449 CheckedArrayByteSink sink(dest, capacity); 1450 IDNAInfo info; 1451 reinterpret_cast<const IDNA *>(idna)->nameToASCII_UTF8(src, sink, info, *pErrorCode); 1452 idnaInfoToStruct(info, pInfo); 1453 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); 1454 } 1455 1456 U_CAPI int32_t U_EXPORT2 1457 uidna_nameToUnicodeUTF8(const UIDNA *idna, 1458 const char *name, int32_t length, 1459 char *dest, int32_t capacity, 1460 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { 1461 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { 1462 return 0; 1463 } 1464 StringPiece src(name, length<0 ? uprv_strlen(name) : length); 1465 CheckedArrayByteSink sink(dest, capacity); 1466 IDNAInfo info; 1467 reinterpret_cast<const IDNA *>(idna)->nameToUnicodeUTF8(src, sink, info, *pErrorCode); 1468 idnaInfoToStruct(info, pInfo); 1469 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode); 1470 } 1471 1472 #endif // UCONFIG_NO_IDNA 1473