1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2010-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 package android.icu.impl; 11 12 import java.util.EnumSet; 13 14 import android.icu.lang.UCharacter; 15 import android.icu.lang.UCharacterCategory; 16 import android.icu.lang.UCharacterDirection; 17 import android.icu.lang.UScript; 18 import android.icu.text.IDNA; 19 import android.icu.text.Normalizer2; 20 import android.icu.text.StringPrepParseException; 21 import android.icu.util.ICUException; 22 23 // Note about tests for IDNA.Error.DOMAIN_NAME_TOO_LONG: 24 // 25 // The domain name length limit is 255 octets in an internal DNS representation 26 // where the last ("root") label is the empty label 27 // represented by length byte 0 alone. 28 // In a conventional string, this translates to 253 characters, or 254 29 // if there is a trailing dot for the root label. 30 31 /** 32 * UTS #46 (IDNA2008) implementation. 33 * @author Markus Scherer 34 * @hide Only a subset of ICU is exposed in Android 35 */ 36 public final class UTS46 extends IDNA { 37 public UTS46(int options) { 38 this.options=options; 39 } 40 41 @Override 42 public StringBuilder labelToASCII(CharSequence label, StringBuilder dest, Info info) { 43 return process(label, true, true, dest, info); 44 } 45 46 @Override 47 public StringBuilder labelToUnicode(CharSequence label, StringBuilder dest, Info info) { 48 return process(label, true, false, dest, info); 49 } 50 51 @Override 52 public StringBuilder nameToASCII(CharSequence name, StringBuilder dest, Info info) { 53 process(name, false, true, dest, info); 54 if( dest.length()>=254 && !info.getErrors().contains(Error.DOMAIN_NAME_TOO_LONG) && 55 isASCIIString(dest) && 56 (dest.length()>254 || dest.charAt(253)!='.') 57 ) { 58 addError(info, Error.DOMAIN_NAME_TOO_LONG); 59 } 60 return dest; 61 } 62 63 @Override 64 public StringBuilder nameToUnicode(CharSequence name, StringBuilder dest, Info info) { 65 return process(name, false, false, dest, info); 66 } 67 68 private static final Normalizer2 uts46Norm2= 69 Normalizer2.getInstance(null, "uts46", Normalizer2.Mode.COMPOSE); // uts46.nrm 70 final int options; 71 72 // Severe errors which usually result in a U+FFFD replacement character in the result string. 73 private static final EnumSet<Error> severeErrors=EnumSet.of( 74 Error.LEADING_COMBINING_MARK, 75 Error.DISALLOWED, 76 Error.PUNYCODE, 77 Error.LABEL_HAS_DOT, 78 Error.INVALID_ACE_LABEL); 79 80 private static boolean 81 isASCIIString(CharSequence dest) { 82 int length=dest.length(); 83 for(int i=0; i<length; ++i) { 84 if(dest.charAt(i)>0x7f) { 85 return false; 86 } 87 } 88 return true; 89 } 90 91 // UTS #46 data for ASCII characters. 92 // The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase 93 // and passes through all other ASCII characters. 94 // If USE_STD3_RULES is set, then non-LDH characters are disallowed 95 // using this data. 96 // The ASCII fastpath also uses this data. 97 // Values: -1=disallowed 0==valid 1==mapped (lowercase) 98 private static final byte asciiData[]={ 99 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 100 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 101 // 002D..002E; valid # HYPHEN-MINUS..FULL STOP 102 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1, 103 // 0030..0039; valid # DIGIT ZERO..DIGIT NINE 104 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, 105 // 0041..005A; mapped # LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z 106 -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 107 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, 108 // 0061..007A; valid # LATIN SMALL LETTER A..LATIN SMALL LETTER Z 109 -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 110 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1 111 }; 112 113 private StringBuilder 114 process(CharSequence src, 115 boolean isLabel, boolean toASCII, 116 StringBuilder dest, 117 Info info) { 118 // uts46Norm2.normalize() would do all of this error checking and setup, 119 // but with the ASCII fastpath we do not always call it, and do not 120 // call it first. 121 if(dest==src) { 122 throw new IllegalArgumentException(); 123 } 124 // Arguments are fine, reset output values. 125 dest.delete(0, 0x7fffffff); 126 resetInfo(info); 127 int srcLength=src.length(); 128 if(srcLength==0) { 129 addError(info, Error.EMPTY_LABEL); 130 return dest; 131 } 132 // ASCII fastpath 133 boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0; 134 int labelStart=0; 135 int i; 136 for(i=0;; ++i) { 137 if(i==srcLength) { 138 if(toASCII) { 139 if((i-labelStart)>63) { 140 addLabelError(info, Error.LABEL_TOO_LONG); 141 } 142 // There is a trailing dot if labelStart==i. 143 if(!isLabel && i>=254 && (i>254 || labelStart<i)) { 144 addError(info, Error.DOMAIN_NAME_TOO_LONG); 145 } 146 } 147 promoteAndResetLabelErrors(info); 148 return dest; 149 } 150 char c=src.charAt(i); 151 if(c>0x7f) { 152 break; 153 } 154 int cData=asciiData[c]; 155 if(cData>0) { 156 dest.append((char)(c+0x20)); // Lowercase an uppercase ASCII letter. 157 } else if(cData<0 && disallowNonLDHDot) { 158 break; // Replacing with U+FFFD can be complicated for toASCII. 159 } else { 160 dest.append(c); 161 if(c=='-') { // hyphen 162 if(i==(labelStart+3) && src.charAt(i-1)=='-') { 163 // "??--..." is Punycode or forbidden. 164 ++i; // '-' was copied to dest already 165 break; 166 } 167 if(i==labelStart) { 168 // label starts with "-" 169 addLabelError(info, Error.LEADING_HYPHEN); 170 } 171 if((i+1)==srcLength || src.charAt(i+1)=='.') { 172 // label ends with "-" 173 addLabelError(info, Error.TRAILING_HYPHEN); 174 } 175 } else if(c=='.') { // dot 176 if(isLabel) { 177 // Replacing with U+FFFD can be complicated for toASCII. 178 ++i; // '.' was copied to dest already 179 break; 180 } 181 if(i==labelStart) { 182 addLabelError(info, Error.EMPTY_LABEL); 183 } 184 if(toASCII && (i-labelStart)>63) { 185 addLabelError(info, Error.LABEL_TOO_LONG); 186 } 187 promoteAndResetLabelErrors(info); 188 labelStart=i+1; 189 } 190 } 191 } 192 promoteAndResetLabelErrors(info); 193 processUnicode(src, labelStart, i, isLabel, toASCII, dest, info); 194 if( isBiDi(info) && !hasCertainErrors(info, severeErrors) && 195 (!isOkBiDi(info) || (labelStart>0 && !isASCIIOkBiDi(dest, labelStart))) 196 ) { 197 addError(info, Error.BIDI); 198 } 199 return dest; 200 } 201 202 private StringBuilder 203 processUnicode(CharSequence src, 204 int labelStart, int mappingStart, 205 boolean isLabel, boolean toASCII, 206 StringBuilder dest, 207 Info info) { 208 if(mappingStart==0) { 209 uts46Norm2.normalize(src, dest); 210 } else { 211 uts46Norm2.normalizeSecondAndAppend(dest, src.subSequence(mappingStart, src.length())); 212 } 213 boolean doMapDevChars= 214 toASCII ? (options&NONTRANSITIONAL_TO_ASCII)==0 : 215 (options&NONTRANSITIONAL_TO_UNICODE)==0; 216 int destLength=dest.length(); 217 int labelLimit=labelStart; 218 while(labelLimit<destLength) { 219 char c=dest.charAt(labelLimit); 220 if(c=='.' && !isLabel) { 221 int labelLength=labelLimit-labelStart; 222 int newLength=processLabel(dest, labelStart, labelLength, 223 toASCII, info); 224 promoteAndResetLabelErrors(info); 225 destLength+=newLength-labelLength; 226 labelLimit=labelStart+=newLength+1; 227 } else if(0xdf<=c && c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) { 228 setTransitionalDifferent(info); 229 if(doMapDevChars) { 230 destLength=mapDevChars(dest, labelStart, labelLimit); 231 // Do not increment labelLimit in case c was removed. 232 // All deviation characters have been mapped, no need to check for them again. 233 doMapDevChars=false; 234 } else { 235 ++labelLimit; 236 } 237 } else { 238 ++labelLimit; 239 } 240 } 241 // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok) 242 // but not an empty label elsewhere nor a completely empty domain name. 243 // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0. 244 if(0==labelStart || labelStart<labelLimit) { 245 processLabel(dest, labelStart, labelLimit-labelStart, toASCII, info); 246 promoteAndResetLabelErrors(info); 247 } 248 return dest; 249 } 250 251 // returns the new dest.length() 252 private int 253 mapDevChars(StringBuilder dest, int labelStart, int mappingStart) { 254 int length=dest.length(); 255 boolean didMapDevChars=false; 256 for(int i=mappingStart; i<length;) { 257 char c=dest.charAt(i); 258 switch(c) { 259 case 0xdf: 260 // Map sharp s to ss. 261 didMapDevChars=true; 262 dest.setCharAt(i++, 's'); 263 dest.insert(i++, 's'); 264 ++length; 265 break; 266 case 0x3c2: // Map final sigma to nonfinal sigma. 267 didMapDevChars=true; 268 dest.setCharAt(i++, '\u03c3'); 269 break; 270 case 0x200c: // Ignore/remove ZWNJ. 271 case 0x200d: // Ignore/remove ZWJ. 272 didMapDevChars=true; 273 dest.delete(i, i+1); 274 --length; 275 break; 276 default: 277 ++i; 278 break; 279 } 280 } 281 if(didMapDevChars) { 282 // Mapping deviation characters might have resulted in an un-NFC string. 283 // We could use either the NFC or the UTS #46 normalizer. 284 // By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file. 285 String normalized=uts46Norm2.normalize(dest.subSequence(labelStart, dest.length())); 286 dest.replace(labelStart, 0x7fffffff, normalized); 287 return dest.length(); 288 } 289 return length; 290 } 291 // Some non-ASCII characters are equivalent to sequences with 292 // non-LDH ASCII characters. To find them: 293 // grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt) 294 private static boolean 295 isNonASCIIDisallowedSTD3Valid(int c) { 296 return c==0x2260 || c==0x226E || c==0x226F; 297 } 298 299 300 // Replace the label in dest with the label string, if the label was modified. 301 // If label==dest then the label was modified in-place and labelLength 302 // is the new label length, different from label.length(). 303 // If label!=dest then labelLength==label.length(). 304 // Returns labelLength (= the new label length). 305 private static int 306 replaceLabel(StringBuilder dest, int destLabelStart, int destLabelLength, 307 CharSequence label, int labelLength) { 308 if(label!=dest) { 309 dest.delete(destLabelStart, destLabelStart+destLabelLength).insert(destLabelStart, label); 310 // or dest.replace(destLabelStart, destLabelStart+destLabelLength, label.toString()); 311 // which would create a String rather than moving characters in the StringBuilder. 312 } 313 return labelLength; 314 } 315 316 // returns the new label length 317 private int 318 processLabel(StringBuilder dest, 319 int labelStart, int labelLength, 320 boolean toASCII, 321 Info info) { 322 StringBuilder fromPunycode; 323 StringBuilder labelString; 324 int destLabelStart=labelStart; 325 int destLabelLength=labelLength; 326 boolean wasPunycode; 327 if( labelLength>=4 && 328 dest.charAt(labelStart)=='x' && dest.charAt(labelStart+1)=='n' && 329 dest.charAt(labelStart+2)=='-' && dest.charAt(labelStart+3)=='-' 330 ) { 331 // Label starts with "xn--", try to un-Punycode it. 332 wasPunycode=true; 333 try { 334 fromPunycode=Punycode.decode(dest.subSequence(labelStart+4, labelStart+labelLength), null); 335 } catch (StringPrepParseException e) { 336 addLabelError(info, Error.PUNYCODE); 337 return markBadACELabel(dest, labelStart, labelLength, toASCII, info); 338 } 339 // Check for NFC, and for characters that are not 340 // valid or deviation characters according to the normalizer. 341 // If there is something wrong, then the string will change. 342 // Note that the normalizer passes through non-LDH ASCII and deviation characters. 343 // Deviation characters are ok in Punycode even in transitional processing. 344 // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES 345 // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too. 346 boolean isValid=uts46Norm2.isNormalized(fromPunycode); 347 if(!isValid) { 348 addLabelError(info, Error.INVALID_ACE_LABEL); 349 return markBadACELabel(dest, labelStart, labelLength, toASCII, info); 350 } 351 labelString=fromPunycode; 352 labelStart=0; 353 labelLength=fromPunycode.length(); 354 } else { 355 wasPunycode=false; 356 labelString=dest; 357 } 358 // Validity check 359 if(labelLength==0) { 360 addLabelError(info, Error.EMPTY_LABEL); 361 return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength); 362 } 363 // labelLength>0 364 if(labelLength>=4 && labelString.charAt(labelStart+2)=='-' && labelString.charAt(labelStart+3)=='-') { 365 // label starts with "??--" 366 addLabelError(info, Error.HYPHEN_3_4); 367 } 368 if(labelString.charAt(labelStart)=='-') { 369 // label starts with "-" 370 addLabelError(info, Error.LEADING_HYPHEN); 371 } 372 if(labelString.charAt(labelStart+labelLength-1)=='-') { 373 // label ends with "-" 374 addLabelError(info, Error.TRAILING_HYPHEN); 375 } 376 // If the label was not a Punycode label, then it was the result of 377 // mapping, normalization and label segmentation. 378 // If the label was in Punycode, then we mapped it again above 379 // and checked its validity. 380 // Now we handle the STD3 restriction to LDH characters (if set) 381 // and we look for U+FFFD which indicates disallowed characters 382 // in a non-Punycode label or U+FFFD itself in a Punycode label. 383 // We also check for dots which can come from the input to a single-label function. 384 // Ok to cast away const because we own the UnicodeString. 385 int i=labelStart; 386 int limit=labelStart+labelLength; 387 char oredChars=0; 388 // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed. 389 boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0; 390 do { 391 char c=labelString.charAt(i); 392 if(c<=0x7f) { 393 if(c=='.') { 394 addLabelError(info, Error.LABEL_HAS_DOT); 395 labelString.setCharAt(i, '\ufffd'); 396 } else if(disallowNonLDHDot && asciiData[c]<0) { 397 addLabelError(info, Error.DISALLOWED); 398 labelString.setCharAt(i, '\ufffd'); 399 } 400 } else { 401 oredChars|=c; 402 if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) { 403 addLabelError(info, Error.DISALLOWED); 404 labelString.setCharAt(i, '\ufffd'); 405 } else if(c==0xfffd) { 406 addLabelError(info, Error.DISALLOWED); 407 } 408 } 409 ++i; 410 } while(i<limit); 411 // Check for a leading combining mark after other validity checks 412 // so that we don't report IDNA.Error.DISALLOWED for the U+FFFD from here. 413 int c; 414 // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD. 415 c=labelString.codePointAt(labelStart); 416 if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) { 417 addLabelError(info, Error.LEADING_COMBINING_MARK); 418 labelString.setCharAt(labelStart, '\ufffd'); 419 if(c>0xffff) { 420 // Remove c's trail surrogate. 421 labelString.deleteCharAt(labelStart+1); 422 --labelLength; 423 if(labelString==dest) { 424 --destLabelLength; 425 } 426 } 427 } 428 if(!hasCertainLabelErrors(info, severeErrors)) { 429 // Do contextual checks only if we do not have U+FFFD from a severe error 430 // because U+FFFD can make these checks fail. 431 if((options&CHECK_BIDI)!=0 && (!isBiDi(info) || isOkBiDi(info))) { 432 checkLabelBiDi(labelString, labelStart, labelLength, info); 433 } 434 if( (options&CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c && 435 !isLabelOkContextJ(labelString, labelStart, labelLength) 436 ) { 437 addLabelError(info, Error.CONTEXTJ); 438 } 439 if((options&CHECK_CONTEXTO)!=0 && oredChars>=0xb7) { 440 checkLabelContextO(labelString, labelStart, labelLength, info); 441 } 442 if(toASCII) { 443 if(wasPunycode) { 444 // Leave a Punycode label unchanged if it has no severe errors. 445 if(destLabelLength>63) { 446 addLabelError(info, Error.LABEL_TOO_LONG); 447 } 448 return destLabelLength; 449 } else if(oredChars>=0x80) { 450 // Contains non-ASCII characters. 451 StringBuilder punycode; 452 try { 453 punycode=Punycode.encode(labelString.subSequence(labelStart, labelStart+labelLength), null); 454 } catch (StringPrepParseException e) { 455 throw new ICUException(e); // unexpected 456 } 457 punycode.insert(0, "xn--"); 458 if(punycode.length()>63) { 459 addLabelError(info, Error.LABEL_TOO_LONG); 460 } 461 return replaceLabel(dest, destLabelStart, destLabelLength, 462 punycode, punycode.length()); 463 } else { 464 // all-ASCII label 465 if(labelLength>63) { 466 addLabelError(info, Error.LABEL_TOO_LONG); 467 } 468 } 469 } 470 } else { 471 // If a Punycode label has severe errors, 472 // then leave it but make sure it does not look valid. 473 if(wasPunycode) { 474 addLabelError(info, Error.INVALID_ACE_LABEL); 475 return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info); 476 } 477 } 478 return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength); 479 } 480 private int 481 markBadACELabel(StringBuilder dest, 482 int labelStart, int labelLength, 483 boolean toASCII, Info info) { 484 boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0; 485 boolean isASCII=true; 486 boolean onlyLDH=true; 487 int i=labelStart+4; // After the initial "xn--". 488 int limit=labelStart+labelLength; 489 do { 490 char c=dest.charAt(i); 491 if(c<=0x7f) { 492 if(c=='.') { 493 addLabelError(info, Error.LABEL_HAS_DOT); 494 dest.setCharAt(i, '\ufffd'); 495 isASCII=onlyLDH=false; 496 } else if(asciiData[c]<0) { 497 onlyLDH=false; 498 if(disallowNonLDHDot) { 499 dest.setCharAt(i, '\ufffd'); 500 isASCII=false; 501 } 502 } 503 } else { 504 isASCII=onlyLDH=false; 505 } 506 } while(++i<limit); 507 if(onlyLDH) { 508 dest.insert(labelStart+labelLength, '\ufffd'); 509 ++labelLength; 510 } else { 511 if(toASCII && isASCII && labelLength>63) { 512 addLabelError(info, Error.LABEL_TOO_LONG); 513 } 514 } 515 return labelLength; 516 } 517 518 private static final int L_MASK=U_MASK(UCharacterDirection.LEFT_TO_RIGHT); 519 private static final int R_AL_MASK= 520 U_MASK(UCharacterDirection.RIGHT_TO_LEFT)| 521 U_MASK(UCharacterDirection.RIGHT_TO_LEFT_ARABIC); 522 private static final int L_R_AL_MASK=L_MASK|R_AL_MASK; 523 524 private static final int R_AL_AN_MASK=R_AL_MASK|U_MASK(UCharacterDirection.ARABIC_NUMBER); 525 526 private static final int EN_AN_MASK= 527 U_MASK(UCharacterDirection.EUROPEAN_NUMBER)| 528 U_MASK(UCharacterDirection.ARABIC_NUMBER); 529 private static final int R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK; 530 private static final int L_EN_MASK=L_MASK|U_MASK(UCharacterDirection.EUROPEAN_NUMBER); 531 532 private static final int ES_CS_ET_ON_BN_NSM_MASK= 533 U_MASK(UCharacterDirection.EUROPEAN_NUMBER_SEPARATOR)| 534 U_MASK(UCharacterDirection.COMMON_NUMBER_SEPARATOR)| 535 U_MASK(UCharacterDirection.EUROPEAN_NUMBER_TERMINATOR)| 536 U_MASK(UCharacterDirection.OTHER_NEUTRAL)| 537 U_MASK(UCharacterDirection.BOUNDARY_NEUTRAL)| 538 U_MASK(UCharacterDirection.DIR_NON_SPACING_MARK); 539 private static final int L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK; 540 private static final int R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK; 541 542 // We scan the whole label and check both for whether it contains RTL characters 543 // and whether it passes the BiDi Rule. 544 // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find 545 // that a domain name is a BiDi domain name (has an RTL label) only after 546 // processing several earlier labels. 547 private void 548 checkLabelBiDi(CharSequence label, int labelStart, int labelLength, Info info) { 549 // IDNA2008 BiDi rule 550 // Get the directionality of the first character. 551 int c; 552 int i=labelStart; 553 c=Character.codePointAt(label, i); 554 i+=Character.charCount(c); 555 int firstMask=U_MASK(UBiDiProps.INSTANCE.getClass(c)); 556 // 1. The first character must be a character with BIDI property L, R 557 // or AL. If it has the R or AL property, it is an RTL label; if it 558 // has the L property, it is an LTR label. 559 if((firstMask&~L_R_AL_MASK)!=0) { 560 setNotOkBiDi(info); 561 } 562 // Get the directionality of the last non-NSM character. 563 int lastMask; 564 int labelLimit=labelStart+labelLength; 565 for(;;) { 566 if(i>=labelLimit) { 567 lastMask=firstMask; 568 break; 569 } 570 c=Character.codePointBefore(label, labelLimit); 571 labelLimit-=Character.charCount(c); 572 int dir=UBiDiProps.INSTANCE.getClass(c); 573 if(dir!=UCharacterDirection.DIR_NON_SPACING_MARK) { 574 lastMask=U_MASK(dir); 575 break; 576 } 577 } 578 // 3. In an RTL label, the end of the label must be a character with 579 // BIDI property R, AL, EN or AN, followed by zero or more 580 // characters with BIDI property NSM. 581 // 6. In an LTR label, the end of the label must be a character with 582 // BIDI property L or EN, followed by zero or more characters with 583 // BIDI property NSM. 584 if( (firstMask&L_MASK)!=0 ? 585 (lastMask&~L_EN_MASK)!=0 : 586 (lastMask&~R_AL_EN_AN_MASK)!=0 587 ) { 588 setNotOkBiDi(info); 589 } 590 // Add the directionalities of the intervening characters. 591 int mask=firstMask|lastMask; 592 while(i<labelLimit) { 593 c=Character.codePointAt(label, i); 594 i+=Character.charCount(c); 595 mask|=U_MASK(UBiDiProps.INSTANCE.getClass(c)); 596 } 597 if((firstMask&L_MASK)!=0) { 598 // 5. In an LTR label, only characters with the BIDI properties L, EN, 599 // ES, CS, ET, ON, BN and NSM are allowed. 600 if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) { 601 setNotOkBiDi(info); 602 } 603 } else { 604 // 2. In an RTL label, only characters with the BIDI properties R, AL, 605 // AN, EN, ES, CS, ET, ON, BN and NSM are allowed. 606 if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) { 607 setNotOkBiDi(info); 608 } 609 // 4. In an RTL label, if an EN is present, no AN may be present, and 610 // vice versa. 611 if((mask&EN_AN_MASK)==EN_AN_MASK) { 612 setNotOkBiDi(info); 613 } 614 } 615 // An RTL label is a label that contains at least one character of type 616 // R, AL or AN. [...] 617 // A "BIDI domain name" is a domain name that contains at least one RTL 618 // label. [...] 619 // The following rule, consisting of six conditions, applies to labels 620 // in BIDI domain names. 621 if((mask&R_AL_AN_MASK)!=0) { 622 setBiDi(info); 623 } 624 } 625 626 // Special code for the ASCII prefix of a BiDi domain name. 627 // The ASCII prefix is all-LTR. 628 629 // IDNA2008 BiDi rule, parts relevant to ASCII labels: 630 // 1. The first character must be a character with BIDI property L [...] 631 // 5. In an LTR label, only characters with the BIDI properties L, EN, 632 // ES, CS, ET, ON, BN and NSM are allowed. 633 // 6. In an LTR label, the end of the label must be a character with 634 // BIDI property L or EN [...] 635 636 // UTF-16 version, called for mapped ASCII prefix. 637 // Cannot contain uppercase A-Z. 638 // s[length-1] must be the trailing dot. 639 private static boolean 640 isASCIIOkBiDi(CharSequence s, int length) { 641 int labelStart=0; 642 for(int i=0; i<length; ++i) { 643 char c=s.charAt(i); 644 if(c=='.') { // dot 645 if(i>labelStart) { 646 c=s.charAt(i-1); 647 if(!('a'<=c && c<='z') && !('0'<=c && c<='9')) { 648 // Last character in the label is not an L or EN. 649 return false; 650 } 651 } 652 labelStart=i+1; 653 } else if(i==labelStart) { 654 if(!('a'<=c && c<='z')) { 655 // First character in the label is not an L. 656 return false; 657 } 658 } else { 659 if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { 660 // Intermediate character in the label is a B, S or WS. 661 return false; 662 } 663 } 664 } 665 return true; 666 } 667 668 private boolean 669 isLabelOkContextJ(CharSequence label, int labelStart, int labelLength) { 670 // [IDNA2008-Tables] 671 // 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER 672 int labelLimit=labelStart+labelLength; 673 for(int i=labelStart; i<labelLimit; ++i) { 674 if(label.charAt(i)==0x200c) { 675 // Appendix A.1. ZERO WIDTH NON-JOINER 676 // Rule Set: 677 // False; 678 // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; 679 // If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C 680 // (Joining_Type:T)*(Joining_Type:{R,D})) Then True; 681 if(i==labelStart) { 682 return false; 683 } 684 int c; 685 int j=i; 686 c=Character.codePointBefore(label, j); 687 j-=Character.charCount(c); 688 if(uts46Norm2.getCombiningClass(c)==9) { 689 continue; 690 } 691 // check precontext (Joining_Type:{L,D})(Joining_Type:T)* 692 for(;;) { 693 /* UJoiningType */ int type=UBiDiProps.INSTANCE.getJoiningType(c); 694 if(type==UCharacter.JoiningType.TRANSPARENT) { 695 if(j==0) { 696 return false; 697 } 698 c=Character.codePointBefore(label, j); 699 j-=Character.charCount(c); 700 } else if(type==UCharacter.JoiningType.LEFT_JOINING || type==UCharacter.JoiningType.DUAL_JOINING) { 701 break; // precontext fulfilled 702 } else { 703 return false; 704 } 705 } 706 // check postcontext (Joining_Type:T)*(Joining_Type:{R,D}) 707 for(j=i+1;;) { 708 if(j==labelLimit) { 709 return false; 710 } 711 c=Character.codePointAt(label, j); 712 j+=Character.charCount(c); 713 /* UJoiningType */ int type=UBiDiProps.INSTANCE.getJoiningType(c); 714 if(type==UCharacter.JoiningType.TRANSPARENT) { 715 // just skip this character 716 } else if(type==UCharacter.JoiningType.RIGHT_JOINING || type==UCharacter.JoiningType.DUAL_JOINING) { 717 break; // postcontext fulfilled 718 } else { 719 return false; 720 } 721 } 722 } else if(label.charAt(i)==0x200d) { 723 // Appendix A.2. ZERO WIDTH JOINER (U+200D) 724 // Rule Set: 725 // False; 726 // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; 727 if(i==labelStart) { 728 return false; 729 } 730 int c=Character.codePointBefore(label, i); 731 if(uts46Norm2.getCombiningClass(c)!=9) { 732 return false; 733 } 734 } 735 } 736 return true; 737 } 738 739 private void 740 checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info) { 741 int labelEnd=labelStart+labelLength-1; // inclusive 742 int arabicDigits=0; // -1 for 066x, +1 for 06Fx 743 for(int i=labelStart; i<=labelEnd; ++i) { 744 int c=label.charAt(i); 745 if(c<0xb7) { 746 // ASCII fastpath 747 } else if(c<=0x6f9) { 748 if(c==0xb7) { 749 // Appendix A.3. MIDDLE DOT (U+00B7) 750 // Rule Set: 751 // False; 752 // If Before(cp) .eq. U+006C And 753 // After(cp) .eq. U+006C Then True; 754 if(!(labelStart<i && label.charAt(i-1)=='l' && 755 i<labelEnd && label.charAt(i+1)=='l')) { 756 addLabelError(info, Error.CONTEXTO_PUNCTUATION); 757 } 758 } else if(c==0x375) { 759 // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375) 760 // Rule Set: 761 // False; 762 // If Script(After(cp)) .eq. Greek Then True; 763 if(!(i<labelEnd && 764 UScript.GREEK==UScript.getScript(Character.codePointAt(label, i+1)))) { 765 addLabelError(info, Error.CONTEXTO_PUNCTUATION); 766 } 767 } else if(c==0x5f3 || c==0x5f4) { 768 // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3) 769 // Rule Set: 770 // False; 771 // If Script(Before(cp)) .eq. Hebrew Then True; 772 // 773 // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4) 774 // Rule Set: 775 // False; 776 // If Script(Before(cp)) .eq. Hebrew Then True; 777 if(!(labelStart<i && 778 UScript.HEBREW==UScript.getScript(Character.codePointBefore(label, i)))) { 779 addLabelError(info, Error.CONTEXTO_PUNCTUATION); 780 } 781 } else if(0x660<=c /* && c<=0x6f9 */) { 782 // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669) 783 // Rule Set: 784 // True; 785 // For All Characters: 786 // If cp .in. 06F0..06F9 Then False; 787 // End For; 788 // 789 // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9) 790 // Rule Set: 791 // True; 792 // For All Characters: 793 // If cp .in. 0660..0669 Then False; 794 // End For; 795 if(c<=0x669) { 796 if(arabicDigits>0) { 797 addLabelError(info, Error.CONTEXTO_DIGITS); 798 } 799 arabicDigits=-1; 800 } else if(0x6f0<=c) { 801 if(arabicDigits<0) { 802 addLabelError(info, Error.CONTEXTO_DIGITS); 803 } 804 arabicDigits=1; 805 } 806 } 807 } else if(c==0x30fb) { 808 // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB) 809 // Rule Set: 810 // False; 811 // For All Characters: 812 // If Script(cp) .in. {Hiragana, Katakana, Han} Then True; 813 // End For; 814 for(int j=labelStart;; j+=Character.charCount(c)) { 815 if(j>labelEnd) { 816 addLabelError(info, Error.CONTEXTO_PUNCTUATION); 817 break; 818 } 819 c=Character.codePointAt(label, j); 820 int script=UScript.getScript(c); 821 if(script==UScript.HIRAGANA || script==UScript.KATAKANA || script==UScript.HAN) { 822 break; 823 } 824 } 825 } 826 } 827 } 828 829 // TODO: make public(?) -- in C, these are public in uchar.h 830 private static int U_MASK(int x) { 831 return 1<<x; 832 } 833 private static int U_GET_GC_MASK(int c) { 834 return (1<<UCharacter.getType(c)); 835 } 836 private static int U_GC_M_MASK= 837 U_MASK(UCharacterCategory.NON_SPACING_MARK)| 838 U_MASK(UCharacterCategory.ENCLOSING_MARK)| 839 U_MASK(UCharacterCategory.COMBINING_SPACING_MARK); 840 } 841