/* GENERATED SOURCE. DO NOT MODIFY. */
// 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 2010-2014, International Business Machines
 * Corporation and others.  All Rights Reserved.
 *******************************************************************************
 */
package android.icu.impl;

import java.util.EnumSet;

import android.icu.impl.Normalizer2Impl.UTF16Plus;
import android.icu.lang.UCharacter;
import android.icu.lang.UCharacterCategory;
import android.icu.lang.UCharacterDirection;
import android.icu.lang.UScript;
import android.icu.text.IDNA;
import android.icu.text.Normalizer2;
import android.icu.text.StringPrepParseException;
import android.icu.util.ICUException;

// Note about tests for IDNA.Error.DOMAIN_NAME_TOO_LONG:
//
// The domain name length limit is 255 octets in an internal DNS representation
// where the last ("root") label is the empty label
// represented by length byte 0 alone.
// In a conventional string, this translates to 253 characters, or 254
// if there is a trailing dot for the root label.

/**
 * UTS #46 (IDNA2008) implementation.
 * @author Markus Scherer
 * @hide Only a subset of ICU is exposed in Android
 */
public final class UTS46 extends IDNA {
    /**
     * Creates an instance that processes names and labels according to the given option bits.
     * The bits tested by this class are USE_STD3_RULES, CHECK_BIDI, CHECK_CONTEXTJ,
     * CHECK_CONTEXTO, NONTRANSITIONAL_TO_ASCII and NONTRANSITIONAL_TO_UNICODE.
     *
     * @param options bit set of processing options
     */
    public UTS46(int options) {
        this.options=options;
    }

    /**
     * Converts a single label to its ASCII form.
     * Delegates to {@link #process} with isLabel=true and toASCII=true.
     *
     * @param label the input label
     * @param dest destination builder; it is cleared first and must not be the same object as label
     * @param info receives the error and status information
     * @return dest
     */
    @Override
    public StringBuilder labelToASCII(CharSequence label, StringBuilder dest, Info info) {
        return process(label, true, true, dest, info);
    }

    /**
     * Converts a single label to its Unicode form.
     * Delegates to {@link #process} with isLabel=true and toASCII=false.
     *
     * @param label the input label
     * @param dest destination builder; it is cleared first and must not be the same object as label
     * @param info receives the error and status information
     * @return dest
     */
    @Override
    public StringBuilder labelToUnicode(CharSequence label, StringBuilder dest, Info info) {
        return process(label, true, false, dest, info);
    }

    /**
     * Converts a whole domain name to its ASCII form.
     * After processing, an all-ASCII result of 254 or more characters is flagged with
     * DOMAIN_NAME_TOO_LONG (if not flagged already) unless it is exactly 254 characters long
     * and ends with a dot.
     *
     * @param name the input domain name
     * @param dest destination builder; it is cleared first and must not be the same object as name
     * @param info receives the error and status information
     * @return dest
     */
    @Override
    public StringBuilder nameToASCII(CharSequence name, StringBuilder dest, Info info) {
        process(name, false, true, dest, info);
        if( dest.length()>=254 && !info.getErrors().contains(Error.DOMAIN_NAME_TOO_LONG) &&
            isASCIIString(dest) &&
            (dest.length()>254 || dest.charAt(253)!='.')
        ) {
            addError(info, Error.DOMAIN_NAME_TOO_LONG);
        }
        return dest;
    }

    /**
     * Converts a whole domain name to its Unicode form.
     * Delegates to {@link #process} with isLabel=false and toASCII=false.
     *
     * @param name the input domain name
     * @param dest destination builder; it is cleared first and must not be the same object as name
     * @param info receives the error and status information
     * @return dest
     */
    @Override
    public StringBuilder nameToUnicode(CharSequence name, StringBuilder dest, Info info) {
        return process(name, false, false, dest, info);
    }

    private static final Normalizer2 uts46Norm2=
        Normalizer2.getInstance(null, "uts46", Normalizer2.Mode.COMPOSE);  // uts46.nrm
    final int options;

    // Severe errors which usually result in a U+FFFD replacement character in the result string.
    private static final EnumSet<Error> severeErrors=EnumSet.of(
        Error.LEADING_COMBINING_MARK,
        Error.DISALLOWED,
        Error.PUNYCODE,
        Error.LABEL_HAS_DOT,
        Error.INVALID_ACE_LABEL);

    /**
     * Returns true if every char of dest is at most U+007F (also true for an empty sequence).
     */
    private static boolean
    isASCIIString(CharSequence dest) {
        int length=dest.length();
        for(int i=0; i<length; ++i) {
            if(dest.charAt(i)>0x7f) {
                return false;
            }
        }
        return true;
    }

    // UTS #46 data for ASCII characters.
    // The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase
    // and passes through all other ASCII characters.
    // If USE_STD3_RULES is set, then non-LDH characters are disallowed
    // using this data.
    // The ASCII fastpath also uses this data.
    // Values: -1=disallowed  0==valid  1==mapped (lowercase)
    private static final byte[] asciiData={
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        // 002D..002E; valid  #  HYPHEN-MINUS..FULL STOP
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0, -1,
        // 0030..0039; valid  #  DIGIT ZERO..DIGIT NINE
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
        // 0041..005A; mapped  #  LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
        -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1,
        // 0061..007A; valid  #  LATIN SMALL LETTER A..LATIN SMALL LETTER Z
        -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1
    };

    /**
     * Common implementation of the four public conversion methods.
     * Runs an ASCII fastpath over src, copying (and lowercasing) characters into dest and
     * checking hyphen/length/empty-label rules. At the first character that the fastpath
     * cannot handle (non-ASCII, a possible "??--" prefix, a dot in single-label mode, or a
     * non-LDH ASCII character under STD3 rules) it hands the remainder to
     * {@link #processUnicode}.
     *
     * @param src the input label or domain name
     * @param isLabel true if src is a single label (dots are not label separators)
     * @param toASCII true for ToASCII processing, false for ToUnicode
     * @param dest destination builder; cleared before use
     * @param info receives the error and status information; reset before use
     * @return dest
     * @throws IllegalArgumentException if dest is the same object as src
     */
    private StringBuilder
    process(CharSequence src,
            boolean isLabel, boolean toASCII,
            StringBuilder dest,
            Info info) {
        // uts46Norm2.normalize() would do all of this error checking and setup,
        // but with the ASCII fastpath we do not always call it, and do not
        // call it first.
        if(dest==src) {
            throw new IllegalArgumentException("dest must not be the same object as src");
        }
        // Arguments are fine, reset output values.
        dest.setLength(0);
        resetInfo(info);
        int srcLength=src.length();
        if(srcLength==0) {
            addError(info, Error.EMPTY_LABEL);
            return dest;
        }
        // ASCII fastpath
        boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
        int labelStart=0;
        int i;
        for(i=0;; ++i) {
            if(i==srcLength) {
                if(toASCII) {
                    if((i-labelStart)>63) {
                        addLabelError(info, Error.LABEL_TOO_LONG);
                    }
                    // There is a trailing dot if labelStart==i.
                    if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
                        addError(info, Error.DOMAIN_NAME_TOO_LONG);
                    }
                }
                promoteAndResetLabelErrors(info);
                return dest;
            }
            char c=src.charAt(i);
            if(c>0x7f) {
                break;
            }
            int cData=asciiData[c];
            if(cData>0) {
                dest.append((char)(c+0x20));  // Lowercase an uppercase ASCII letter.
            } else if(cData<0 && disallowNonLDHDot) {
                break;  // Replacing with U+FFFD can be complicated for toASCII.
            } else {
                dest.append(c);
                if(c=='-') {  // hyphen
                    if(i==(labelStart+3) && src.charAt(i-1)=='-') {
                        // "??--..." is Punycode or forbidden.
                        ++i;  // '-' was copied to dest already
                        break;
                    }
                    if(i==labelStart) {
                        // label starts with "-"
                        addLabelError(info, Error.LEADING_HYPHEN);
                    }
                    if((i+1)==srcLength || src.charAt(i+1)=='.') {
                        // label ends with "-"
                        addLabelError(info, Error.TRAILING_HYPHEN);
                    }
                } else if(c=='.') {  // dot
                    if(isLabel) {
                        // Replacing with U+FFFD can be complicated for toASCII.
                        ++i;  // '.' was copied to dest already
                        break;
                    }
                    if(i==labelStart) {
                        addLabelError(info, Error.EMPTY_LABEL);
                    }
                    if(toASCII && (i-labelStart)>63) {
                        addLabelError(info, Error.LABEL_TOO_LONG);
                    }
                    promoteAndResetLabelErrors(info);
                    labelStart=i+1;
                }
            }
        }
        promoteAndResetLabelErrors(info);
        processUnicode(src, labelStart, i, isLabel, toASCII, dest, info);
        if( isBiDi(info) && !hasCertainErrors(info, severeErrors) &&
            (!isOkBiDi(info) || (labelStart>0 && !isASCIIOkBiDi(dest, labelStart)))
        ) {
            addError(info, Error.BIDI);
        }
        return dest;
    }

    /**
     * Handles the part of the input that the ASCII fastpath in {@link #process} did not finish.
     * Normalizes src from mappingStart onward into dest (appending to what the fastpath already
     * copied), then walks dest from labelStart, splitting at dots (unless isLabel), noting and
     * optionally mapping deviation characters, replacing unpaired surrogates with U+FFFD,
     * and calling {@link #processLabel} for each label.
     *
     * @param src the original input
     * @param labelStart index in dest (and src) of the start of the first unprocessed label
     * @param mappingStart index in src where normalization starts
     * @param isLabel true if the input is a single label
     * @param toASCII true for ToASCII processing, false for ToUnicode
     * @param dest destination builder, already holding the fastpath output
     * @param info receives the error and status information
     * @return dest
     */
    private StringBuilder
    processUnicode(CharSequence src,
                   int labelStart, int mappingStart,
                   boolean isLabel, boolean toASCII,
                   StringBuilder dest,
                   Info info) {
        if(mappingStart==0) {
            uts46Norm2.normalize(src, dest);
        } else {
            uts46Norm2.normalizeSecondAndAppend(dest, src.subSequence(mappingStart, src.length()));
        }
        boolean doMapDevChars=
            toASCII ? (options&NONTRANSITIONAL_TO_ASCII)==0 :
                      (options&NONTRANSITIONAL_TO_UNICODE)==0;
        int destLength=dest.length();
        int labelLimit=labelStart;
        while(labelLimit<destLength) {
            char c=dest.charAt(labelLimit);
            if(c=='.' && !isLabel) {
                int labelLength=labelLimit-labelStart;
                int newLength=processLabel(dest, labelStart, labelLength,
                                            toASCII, info);
                promoteAndResetLabelErrors(info);
                destLength+=newLength-labelLength;
                labelLimit=labelStart+=newLength+1;
                continue;
            } else if(c<0xdf) {
                // pass
            } else if(c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) {
                setTransitionalDifferent(info);
                if(doMapDevChars) {
                    destLength=mapDevChars(dest, labelStart, labelLimit);
                    // All deviation characters have been mapped, no need to check for them again.
                    doMapDevChars=false;
                    // Do not increment labelLimit in case c was removed.
                    continue;
                }
            } else if(Character.isSurrogate(c)) {
                if(UTF16Plus.isSurrogateLead(c) ?
                        (labelLimit+1)==destLength ||
                            !Character.isLowSurrogate(dest.charAt(labelLimit+1)) :
                        labelLimit==labelStart ||
                            !Character.isHighSurrogate(dest.charAt(labelLimit-1))) {
                    // Map an unpaired surrogate to U+FFFD before normalization so that when
                    // that removes characters we do not turn two unpaired ones into a pair.
                    addLabelError(info, Error.DISALLOWED);
                    dest.setCharAt(labelLimit, '\ufffd');
                }
            }
            ++labelLimit;
        }
        // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok)
        // but not an empty label elsewhere nor a completely empty domain name.
        // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0.
        if(0==labelStart || labelStart<labelLimit) {
            processLabel(dest, labelStart, labelLimit-labelStart, toASCII, info);
            promoteAndResetLabelErrors(info);
        }
        return dest;
    }

    /**
     * Maps the deviation characters in dest from mappingStart to the end:
     * sharp s becomes "ss", final sigma becomes nonfinal sigma, ZWNJ and ZWJ are removed.
     * If anything was mapped, the text from labelStart to the end is normalized again.
     *
     * @param dest the builder to modify in place
     * @param labelStart start of the current label; renormalization starts here
     * @param mappingStart index of the first character to inspect
     * @return the new dest.length()
     */
    // returns the new dest.length()
    private int
    mapDevChars(StringBuilder dest, int labelStart, int mappingStart) {
        int length=dest.length();
        boolean didMapDevChars=false;
        for(int i=mappingStart; i<length;) {
            char c=dest.charAt(i);
            switch(c) {
            case 0xdf:
                // Map sharp s to ss.
                didMapDevChars=true;
                dest.setCharAt(i++, 's');
                dest.insert(i++, 's');
                ++length;
                break;
            case 0x3c2:  // Map final sigma to nonfinal sigma.
                didMapDevChars=true;
                dest.setCharAt(i++, '\u03c3');
                break;
            case 0x200c:  // Ignore/remove ZWNJ.
            case 0x200d:  // Ignore/remove ZWJ.
                didMapDevChars=true;
                dest.delete(i, i+1);
                --length;
                break;
            default:
                ++i;
                break;
            }
        }
        if(didMapDevChars) {
            // Mapping deviation characters might have resulted in an un-NFC string.
            // We could use either the NFC or the UTS #46 normalizer.
            // By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file.
            String normalized=uts46Norm2.normalize(dest.subSequence(labelStart, dest.length()));
            dest.replace(labelStart, 0x7fffffff, normalized);
            return dest.length();
        }
        return length;
    }

    // Some non-ASCII characters are equivalent to sequences with
    // non-LDH ASCII characters. To find them:
    // grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt)
    /**
     * Returns true for U+2260, U+226E and U+226F, the non-ASCII characters that are
     * treated as disallowed when STD3 rules are enforced.
     */
    private static boolean
    isNonASCIIDisallowedSTD3Valid(int c) {
        return c==0x2260 || c==0x226E || c==0x226F;
    }


    // Replace the label in dest with the label string, if the label was modified.
    // If label==dest then the label was modified in-place and labelLength
    // is the new label length, different from label.length().
    // If label!=dest then labelLength==label.length().
    // Returns labelLength (= the new label length).
    private static int
    replaceLabel(StringBuilder dest, int destLabelStart, int destLabelLength,
                 CharSequence label, int labelLength) {
        if(label!=dest) {
            dest.delete(destLabelStart, destLabelStart+destLabelLength).insert(destLabelStart, label);
            // or dest.replace(destLabelStart, destLabelStart+destLabelLength, label.toString());
            // which would create a String rather than moving characters in the StringBuilder.
        }
        return labelLength;
    }

    /**
     * Validates one label inside dest and, for ToASCII, converts it to Punycode if needed.
     * A label starting with "xn--" is first decoded from Punycode and checked with the
     * UTS #46 normalizer; then hyphen rules, STD3/disallowed characters, a leading
     * combining mark and (unless a severe error was found) the BiDi, CONTEXTJ and CONTEXTO
     * rules are checked. Errors are recorded as label errors on info.
     *
     * @param dest the builder holding the label; may be modified in place
     * @param labelStart index of the label's first char in dest
     * @param labelLength number of chars in the label
     * @param toASCII true for ToASCII processing, false for ToUnicode
     * @param info receives the label errors
     * @return the new label length in dest
     */
    // returns the new label length
    private int
    processLabel(StringBuilder dest,
                 int labelStart, int labelLength,
                 boolean toASCII,
                 Info info) {
        StringBuilder fromPunycode;
        StringBuilder labelString;
        int destLabelStart=labelStart;
        int destLabelLength=labelLength;
        boolean wasPunycode;
        if( labelLength>=4 &&
            dest.charAt(labelStart)=='x' && dest.charAt(labelStart+1)=='n' &&
            dest.charAt(labelStart+2)=='-' && dest.charAt(labelStart+3)=='-'
        ) {
            // Label starts with "xn--", try to un-Punycode it.
            wasPunycode=true;
            try {
                fromPunycode=Punycode.decode(dest.subSequence(labelStart+4, labelStart+labelLength), null);
            } catch (StringPrepParseException e) {
                addLabelError(info, Error.PUNYCODE);
                return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
            }
            // Check for NFC, and for characters that are not
            // valid or deviation characters according to the normalizer.
            // If there is something wrong, then the string will change.
            // Note that the normalizer passes through non-LDH ASCII and deviation characters.
            // Deviation characters are ok in Punycode even in transitional processing.
            // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
            // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
            boolean isValid=uts46Norm2.isNormalized(fromPunycode);
            if(!isValid) {
                addLabelError(info, Error.INVALID_ACE_LABEL);
                return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
            }
            labelString=fromPunycode;
            labelStart=0;
            labelLength=fromPunycode.length();
        } else {
            wasPunycode=false;
            labelString=dest;
        }
        // Validity check
        if(labelLength==0) {
            addLabelError(info, Error.EMPTY_LABEL);
            return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength);
        }
        // labelLength>0
        if(labelLength>=4 && labelString.charAt(labelStart+2)=='-' && labelString.charAt(labelStart+3)=='-') {
            // label starts with "??--"
            addLabelError(info, Error.HYPHEN_3_4);
        }
        if(labelString.charAt(labelStart)=='-') {
            // label starts with "-"
            addLabelError(info, Error.LEADING_HYPHEN);
        }
        if(labelString.charAt(labelStart+labelLength-1)=='-') {
            // label ends with "-"
            addLabelError(info, Error.TRAILING_HYPHEN);
        }
        // If the label was not a Punycode label, then it was the result of
        // mapping, normalization and label segmentation.
        // If the label was in Punycode, then we mapped it again above
        // and checked its validity.
        // Now we handle the STD3 restriction to LDH characters (if set)
        // and we look for U+FFFD which indicates disallowed characters
        // in a non-Punycode label or U+FFFD itself in a Punycode label.
        // We also check for dots which can come from the input to a single-label function.
        // Ok to cast away const because we own the UnicodeString.
        int i=labelStart;
        int limit=labelStart+labelLength;
        char oredChars=0;
        // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed.
        boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
        do {
            char c=labelString.charAt(i);
            if(c<=0x7f) {
                if(c=='.') {
                    addLabelError(info, Error.LABEL_HAS_DOT);
                    labelString.setCharAt(i, '\ufffd');
                } else if(disallowNonLDHDot && asciiData[c]<0) {
                    addLabelError(info, Error.DISALLOWED);
                    labelString.setCharAt(i, '\ufffd');
                }
            } else {
                oredChars|=c;
                if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) {
                    addLabelError(info, Error.DISALLOWED);
                    labelString.setCharAt(i, '\ufffd');
                } else if(c==0xfffd) {
                    addLabelError(info, Error.DISALLOWED);
                }
            }
            ++i;
        } while(i<limit);
        // Check for a leading combining mark after other validity checks
        // so that we don't report IDNA.Error.DISALLOWED for the U+FFFD from here.
        int c;
        // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD.
        c=labelString.codePointAt(labelStart);
        if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) {
            addLabelError(info, Error.LEADING_COMBINING_MARK);
            labelString.setCharAt(labelStart, '\ufffd');
            if(c>0xffff) {
                // Remove c's trail surrogate.
                labelString.deleteCharAt(labelStart+1);
                --labelLength;
                if(labelString==dest) {
                    --destLabelLength;
                }
            }
        }
        if(!hasCertainLabelErrors(info, severeErrors)) {
            // Do contextual checks only if we do not have U+FFFD from a severe error
            // because U+FFFD can make these checks fail.
            if((options&CHECK_BIDI)!=0 && (!isBiDi(info) || isOkBiDi(info))) {
                checkLabelBiDi(labelString, labelStart, labelLength, info);
            }
            if( (options&CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c &&
                !isLabelOkContextJ(labelString, labelStart, labelLength)
            ) {
                addLabelError(info, Error.CONTEXTJ);
            }
            if((options&CHECK_CONTEXTO)!=0 && oredChars>=0xb7) {
                checkLabelContextO(labelString, labelStart, labelLength, info);
            }
            if(toASCII) {
                if(wasPunycode) {
                    // Leave a Punycode label unchanged if it has no severe errors.
                    if(destLabelLength>63) {
                        addLabelError(info, Error.LABEL_TOO_LONG);
                    }
                    return destLabelLength;
                } else if(oredChars>=0x80) {
                    // Contains non-ASCII characters.
                    StringBuilder punycode;
                    try {
                        punycode=Punycode.encode(labelString.subSequence(labelStart, labelStart+labelLength), null);
                    } catch (StringPrepParseException e) {
                        throw new ICUException(e);  // unexpected
                    }
                    punycode.insert(0, "xn--");
                    if(punycode.length()>63) {
                        addLabelError(info, Error.LABEL_TOO_LONG);
                    }
                    return replaceLabel(dest, destLabelStart, destLabelLength,
                                        punycode, punycode.length());
                } else {
                    // all-ASCII label
                    if(labelLength>63) {
                        addLabelError(info, Error.LABEL_TOO_LONG);
                    }
                }
            }
        } else {
            // If a Punycode label has severe errors,
            // then leave it but make sure it does not look valid.
            if(wasPunycode) {
                addLabelError(info, Error.INVALID_ACE_LABEL);
                return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info);
            }
        }
        return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength);
    }

    /**
     * Makes sure that a bad "xn--" label in dest does not look like a valid one.
     * Scans the label after the "xn--" prefix: dots are replaced with U+FFFD (and flagged
     * LABEL_HAS_DOT), and other non-LDH ASCII characters are replaced with U+FFFD when
     * STD3 rules are enforced. If the label consists only of LDH characters, a U+FFFD is
     * inserted at its end; otherwise, for ToASCII, an all-ASCII label longer than 63 is
     * flagged LABEL_TOO_LONG.
     *
     * @param dest the builder holding the label; modified in place
     * @param labelStart index of the label's first char (the 'x' of "xn--")
     * @param labelLength number of chars in the label, at least 4
     * @param toASCII true for ToASCII processing
     * @param info receives the label errors
     * @return the (possibly incremented) label length
     */
    private int
    markBadACELabel(StringBuilder dest,
                    int labelStart, int labelLength,
                    boolean toASCII, Info info) {
        boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
        boolean isASCII=true;
        boolean onlyLDH=true;
        int i=labelStart+4;  // After the initial "xn--".
        int limit=labelStart+labelLength;
        do {
            char c=dest.charAt(i);
            if(c<=0x7f) {
                if(c=='.') {
                    addLabelError(info, Error.LABEL_HAS_DOT);
                    dest.setCharAt(i, '\ufffd');
                    isASCII=onlyLDH=false;
                } else if(asciiData[c]<0) {
                    onlyLDH=false;
                    if(disallowNonLDHDot) {
                        dest.setCharAt(i, '\ufffd');
                        isASCII=false;
                    }
                }
            } else {
                isASCII=onlyLDH=false;
            }
        } while(++i<limit);
        if(onlyLDH) {
            dest.insert(labelStart+labelLength, '\ufffd');
            ++labelLength;
        } else {
            if(toASCII && isASCII && labelLength>63) {
                addLabelError(info, Error.LABEL_TOO_LONG);
            }
        }
        return labelLength;
    }

    private static final int L_MASK=U_MASK(UCharacterDirection.LEFT_TO_RIGHT);
    private static final int R_AL_MASK=
        U_MASK(UCharacterDirection.RIGHT_TO_LEFT)|
        U_MASK(UCharacterDirection.RIGHT_TO_LEFT_ARABIC);
    private static final int L_R_AL_MASK=L_MASK|R_AL_MASK;

    private static final int R_AL_AN_MASK=R_AL_MASK|U_MASK(UCharacterDirection.ARABIC_NUMBER);

    private static final int EN_AN_MASK=
        U_MASK(UCharacterDirection.EUROPEAN_NUMBER)|
        U_MASK(UCharacterDirection.ARABIC_NUMBER);
    private static final int R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK;
    private static final int L_EN_MASK=L_MASK|U_MASK(UCharacterDirection.EUROPEAN_NUMBER);

    private static final int ES_CS_ET_ON_BN_NSM_MASK=
        U_MASK(UCharacterDirection.EUROPEAN_NUMBER_SEPARATOR)|
        U_MASK(UCharacterDirection.COMMON_NUMBER_SEPARATOR)|
        U_MASK(UCharacterDirection.EUROPEAN_NUMBER_TERMINATOR)|
        U_MASK(UCharacterDirection.OTHER_NEUTRAL)|
        U_MASK(UCharacterDirection.BOUNDARY_NEUTRAL)|
        U_MASK(UCharacterDirection.DIR_NON_SPACING_MARK);
    private static final int L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
    private static final int R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK;

    // We scan the whole label and check both for whether it contains RTL characters
    // and whether it passes the BiDi Rule.
    // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find
    // that a domain name is a BiDi domain name (has an RTL label) only after
    // processing several earlier labels.
    /**
     * Checks one non-empty label against the IDNA2008 BiDi rule.
     * Calls setNotOkBiDi(info) for each violated condition and setBiDi(info) if the label
     * contains a character of BiDi class R, AL or AN.
     *
     * @param label the text containing the label
     * @param labelStart index of the label's first char
     * @param labelLength number of chars in the label; must be greater than 0
     * @param info receives the BiDi status
     */
    private void
    checkLabelBiDi(CharSequence label, int labelStart, int labelLength, Info info) {
        // IDNA2008 BiDi rule
        // Get the directionality of the first character.
        int c;
        int i=labelStart;
        c=Character.codePointAt(label, i);
        i+=Character.charCount(c);
        int firstMask=U_MASK(UBiDiProps.INSTANCE.getClass(c));
        // 1. The first character must be a character with BIDI property L, R
        // or AL.  If it has the R or AL property, it is an RTL label; if it
        // has the L property, it is an LTR label.
        if((firstMask&~L_R_AL_MASK)!=0) {
            setNotOkBiDi(info);
        }
        // Get the directionality of the last non-NSM character.
        int lastMask;
        int labelLimit=labelStart+labelLength;
        for(;;) {
            if(i>=labelLimit) {
                lastMask=firstMask;
                break;
            }
            c=Character.codePointBefore(label, labelLimit);
            labelLimit-=Character.charCount(c);
            int dir=UBiDiProps.INSTANCE.getClass(c);
            if(dir!=UCharacterDirection.DIR_NON_SPACING_MARK) {
                lastMask=U_MASK(dir);
                break;
            }
        }
        // 3. In an RTL label, the end of the label must be a character with
        // BIDI property R, AL, EN or AN, followed by zero or more
        // characters with BIDI property NSM.
        // 6. In an LTR label, the end of the label must be a character with
        // BIDI property L or EN, followed by zero or more characters with
        // BIDI property NSM.
        if( (firstMask&L_MASK)!=0 ?
                (lastMask&~L_EN_MASK)!=0 :
                (lastMask&~R_AL_EN_AN_MASK)!=0
        ) {
            setNotOkBiDi(info);
        }
        // Add the directionalities of the intervening characters.
        int mask=firstMask|lastMask;
        while(i<labelLimit) {
            c=Character.codePointAt(label, i);
            i+=Character.charCount(c);
            mask|=U_MASK(UBiDiProps.INSTANCE.getClass(c));
        }
        if((firstMask&L_MASK)!=0) {
            // 5. In an LTR label, only characters with the BIDI properties L, EN,
            // ES, CS, ET, ON, BN and NSM are allowed.
            if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
                setNotOkBiDi(info);
            }
        } else {
            // 2. In an RTL label, only characters with the BIDI properties R, AL,
            // AN, EN, ES, CS, ET, ON, BN and NSM are allowed.
            if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
                setNotOkBiDi(info);
            }
            // 4. In an RTL label, if an EN is present, no AN may be present, and
            // vice versa.
            if((mask&EN_AN_MASK)==EN_AN_MASK) {
                setNotOkBiDi(info);
            }
        }
        // An RTL label is a label that contains at least one character of type
        // R, AL or AN. [...]
        // A "BIDI domain name" is a domain name that contains at least one RTL
        // label. [...]
        // The following rule, consisting of six conditions, applies to labels
        // in BIDI domain names.
        if((mask&R_AL_AN_MASK)!=0) {
            setBiDi(info);
        }
    }

    // Special code for the ASCII prefix of a BiDi domain name.
    // The ASCII prefix is all-LTR.

    // IDNA2008 BiDi rule, parts relevant to ASCII labels:
    // 1. The first character must be a character with BIDI property L [...]
    // 5. In an LTR label, only characters with the BIDI properties L, EN,
    // ES, CS, ET, ON, BN and NSM are allowed.
    // 6. In an LTR label, the end of the label must be a character with
    // BIDI property L or EN [...]

    // UTF-16 version, called for mapped ASCII prefix.
    // Cannot contain uppercase A-Z.
    // s[length-1] must be the trailing dot.
    /**
     * Checks the ASCII-only labels in s[0..length) against the parts of the BiDi rule
     * that apply to ASCII labels.
     *
     * @param s the text; the first length chars are the mapped ASCII prefix
     * @param length number of chars to check
     * @return false if a label does not start with a-z, does not end with a-z or 0-9,
     *         or contains one of the checked control/space characters in between
     */
    private static boolean
    isASCIIOkBiDi(CharSequence s, int length) {
        int labelStart=0;
        for(int i=0; i<length; ++i) {
            char c=s.charAt(i);
            if(c=='.') {  // dot
                if(i>labelStart) {
                    c=s.charAt(i-1);
                    if(!('a'<=c && c<='z') && !('0'<=c && c<='9')) {
                        // Last character in the label is not an L or EN.
                        return false;
                    }
                }
                labelStart=i+1;
            } else if(i==labelStart) {
                if(!('a'<=c && c<='z')) {
                    // First character in the label is not an L.
                    return false;
                }
            } else {
                if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {
                    // Intermediate character in the label is a B, S or WS.
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Checks the CONTEXTJ rules for every ZWNJ (U+200C) and ZWJ (U+200D) in the label.
     *
     * @param label the text containing the label
     * @param labelStart index of the label's first char
     * @param labelLength number of chars in the label
     * @return true if all ZWNJ/ZWJ occurrences in the label satisfy their context rule
     */
    private boolean
    isLabelOkContextJ(CharSequence label, int labelStart, int labelLength) {
        // [IDNA2008-Tables]
        // 200C..200D  ; CONTEXTJ    # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
        int labelLimit=labelStart+labelLength;
        for(int i=labelStart; i<labelLimit; ++i) {
            if(label.charAt(i)==0x200c) {
                // Appendix A.1. ZERO WIDTH NON-JOINER
                // Rule Set:
                //  False;
                //  If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
                //  If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C
                //     (Joining_Type:T)*(Joining_Type:{R,D})) Then True;
                if(i==labelStart) {
                    return false;
                }
                int c;
                int j=i;
                c=Character.codePointBefore(label, j);
                j-=Character.charCount(c);
                if(uts46Norm2.getCombiningClass(c)==9) {
                    continue;
                }
                // check precontext (Joining_Type:{L,D})(Joining_Type:T)*
                for(;;) {
                    /* UJoiningType */ int type=UBiDiProps.INSTANCE.getJoiningType(c);
                    if(type==UCharacter.JoiningType.TRANSPARENT) {
                        // Stop at the start of this label rather than at index 0,
                        // so that the scan never looks at text before the label.
                        if(j<=labelStart) {
                            return false;
                        }
                        c=Character.codePointBefore(label, j);
                        j-=Character.charCount(c);
                    } else if(type==UCharacter.JoiningType.LEFT_JOINING || type==UCharacter.JoiningType.DUAL_JOINING) {
                        break;  // precontext fulfilled
                    } else {
                        return false;
                    }
                }
                // check postcontext (Joining_Type:T)*(Joining_Type:{R,D})
                for(j=i+1;;) {
                    if(j==labelLimit) {
                        return false;
                    }
                    c=Character.codePointAt(label, j);
                    j+=Character.charCount(c);
                    /* UJoiningType */ int type=UBiDiProps.INSTANCE.getJoiningType(c);
                    if(type==UCharacter.JoiningType.TRANSPARENT) {
                        // just skip this character
                    } else if(type==UCharacter.JoiningType.RIGHT_JOINING || type==UCharacter.JoiningType.DUAL_JOINING) {
                        break;  // postcontext fulfilled
                    } else {
                        return false;
                    }
                }
            } else if(label.charAt(i)==0x200d) {
                // Appendix A.2. ZERO WIDTH JOINER (U+200D)
                // Rule Set:
                //  False;
                //  If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
                if(i==labelStart) {
                    return false;
                }
                int c=Character.codePointBefore(label, i);
                if(uts46Norm2.getCombiningClass(c)!=9) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Checks the CONTEXTO rules for the label and records CONTEXTO_PUNCTUATION or
     * CONTEXTO_DIGITS label errors on info for each violation found.
     *
     * @param label the text containing the label
     * @param labelStart index of the label's first char
     * @param labelLength number of chars in the label
     * @param info receives the label errors
     */
    private void
    checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info) {
        int labelEnd=labelStart+labelLength-1;  // inclusive
        int arabicDigits=0;  // -1 for 066x, +1 for 06Fx
        for(int i=labelStart; i<=labelEnd; ++i) {
            int c=label.charAt(i);
            if(c<0xb7) {
                // ASCII fastpath
            } else if(c<=0x6f9) {
                if(c==0xb7) {
                    // Appendix A.3. MIDDLE DOT (U+00B7)
                    // Rule Set:
                    //  False;
                    //  If Before(cp) .eq.  U+006C And
                    //     After(cp) .eq.  U+006C Then True;
                    if(!(labelStart<i && label.charAt(i-1)=='l' &&
                         i<labelEnd && label.charAt(i+1)=='l')) {
                        addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                    }
                } else if(c==0x375) {
                    // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
                    // Rule Set:
                    //  False;
                    //  If Script(After(cp)) .eq.  Greek Then True;
                    if(!(i<labelEnd &&
                         UScript.GREEK==UScript.getScript(Character.codePointAt(label, i+1)))) {
                        addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                    }
                } else if(c==0x5f3 || c==0x5f4) {
                    // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
                    // Rule Set:
                    //  False;
                    //  If Script(Before(cp)) .eq.  Hebrew Then True;
                    //
                    // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
                    // Rule Set:
                    //  False;
                    //  If Script(Before(cp)) .eq.  Hebrew Then True;
                    if(!(labelStart<i &&
                         UScript.HEBREW==UScript.getScript(Character.codePointBefore(label, i)))) {
                        addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                    }
                } else if(0x660<=c /* && c<=0x6f9 */) {
                    // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
                    // Rule Set:
                    //  True;
                    //  For All Characters:
                    //    If cp .in. 06F0..06F9 Then False;
                    //  End For;
                    //
                    // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
                    // Rule Set:
                    //  True;
                    //  For All Characters:
                    //    If cp .in. 0660..0669 Then False;
                    //  End For;
                    if(c<=0x669) {
                        if(arabicDigits>0) {
                            addLabelError(info, Error.CONTEXTO_DIGITS);
                        }
                        arabicDigits=-1;
                    } else if(0x6f0<=c) {
                        if(arabicDigits<0) {
                            addLabelError(info, Error.CONTEXTO_DIGITS);
                        }
                        arabicDigits=1;
                    }
                }
            } else if(c==0x30fb) {
                // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
                // Rule Set:
                //  False;
                //  For All Characters:
                //    If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
                //  End For;
                for(int j=labelStart;; j+=Character.charCount(c)) {
                    if(j>labelEnd) {
                        addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                        break;
                    }
                    c=Character.codePointAt(label, j);
                    int script=UScript.getScript(c);
                    if(script==UScript.HIRAGANA || script==UScript.KATAKANA || script==UScript.HAN) {
                        break;
                    }
                }
            }
        }
    }

    // TODO: make public(?) -- in C, these are public in uchar.h
    /** Returns a mask with only bit x set. */
    private static int U_MASK(int x) {
        return 1<<x;
    }
    /** Returns a mask with only the bit for the general category (UCharacter.getType) of c set. */
    private static int U_GET_GC_MASK(int c) {
        return (1<<UCharacter.getType(c));
    }
    // Mask of the three mark categories (Mn, Me, Mc); a constant, hence final.
    private static final int U_GC_M_MASK=
        U_MASK(UCharacterCategory.NON_SPACING_MARK)|
        U_MASK(UCharacterCategory.ENCLOSING_MARK)|
        U_MASK(UCharacterCategory.COMBINING_SPACING_MARK);
}