Home | History | Annotate | Download | only in impl
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5 *******************************************************************************
      6 * Copyright (C) 2010-2014, International Business Machines
      7 * Corporation and others.  All Rights Reserved.
      8 *******************************************************************************
      9 */
     10 package android.icu.impl;
     11 
     12 import java.util.EnumSet;
     13 
     14 import android.icu.lang.UCharacter;
     15 import android.icu.lang.UCharacterCategory;
     16 import android.icu.lang.UCharacterDirection;
     17 import android.icu.lang.UScript;
     18 import android.icu.text.IDNA;
     19 import android.icu.text.Normalizer2;
     20 import android.icu.text.StringPrepParseException;
     21 import android.icu.util.ICUException;
     22 
     23 // Note about tests for IDNA.Error.DOMAIN_NAME_TOO_LONG:
     24 //
     25 // The domain name length limit is 255 octets in an internal DNS representation
     26 // where the last ("root") label is the empty label
     27 // represented by length byte 0 alone.
     28 // In a conventional string, this translates to 253 characters, or 254
     29 // if there is a trailing dot for the root label.
     30 
     31 /**
     32  * UTS #46 (IDNA2008) implementation.
     33  * @author Markus Scherer
     34  * @hide Only a subset of ICU is exposed in Android
     35  */
     36 public final class UTS46 extends IDNA {
     37     public UTS46(int options) {
     38         this.options=options;
     39     }
     40 
     41     @Override
     42     public StringBuilder labelToASCII(CharSequence label, StringBuilder dest, Info info) {
     43         return process(label, true, true, dest, info);
     44     }
     45 
     46     @Override
     47     public StringBuilder labelToUnicode(CharSequence label, StringBuilder dest, Info info) {
     48         return process(label, true, false, dest, info);
     49     }
     50 
     51     @Override
     52     public StringBuilder nameToASCII(CharSequence name, StringBuilder dest, Info info) {
     53         process(name, false, true, dest, info);
     54         if( dest.length()>=254 && !info.getErrors().contains(Error.DOMAIN_NAME_TOO_LONG) &&
     55             isASCIIString(dest) &&
     56             (dest.length()>254 || dest.charAt(253)!='.')
     57         ) {
     58             addError(info, Error.DOMAIN_NAME_TOO_LONG);
     59         }
     60         return dest;
     61     }
     62 
     63     @Override
     64     public StringBuilder nameToUnicode(CharSequence name, StringBuilder dest, Info info) {
     65         return process(name, false, false, dest, info);
     66     }
     67 
    // Normalizer instance created from the "uts46" data in COMPOSE mode.
    // It is shared by all UTS46 instances and performs the UTS #46
    // mapping/normalization in processUnicode(), mapDevChars() and processLabel().
    private static final Normalizer2 uts46Norm2=
        Normalizer2.getInstance(null, "uts46", Normalizer2.Mode.COMPOSE);  // uts46.nrm
    // Option bits given to the constructor. The processing code tests them against
    // flags such as USE_STD3_RULES, CHECK_BIDI, CHECK_CONTEXTJ, CHECK_CONTEXTO,
    // NONTRANSITIONAL_TO_ASCII and NONTRANSITIONAL_TO_UNICODE.
    final int options;

    // Severe errors which usually result in a U+FFFD replacement character in the result string.
    // processLabel() skips its contextual checks for a label that has any of these,
    // and process() does not add a BIDI error when any of these is set.
    private static final EnumSet<Error> severeErrors=EnumSet.of(
        Error.LEADING_COMBINING_MARK,
        Error.DISALLOWED,
        Error.PUNYCODE,
        Error.LABEL_HAS_DOT,
        Error.INVALID_ACE_LABEL);
     79 
     80     private static boolean
     81     isASCIIString(CharSequence dest) {
     82         int length=dest.length();
     83         for(int i=0; i<length; ++i) {
     84             if(dest.charAt(i)>0x7f) {
     85                 return false;
     86             }
     87         }
     88         return true;
     89     }
     90 
    // UTS #46 data for ASCII characters, indexed by the character value 0..0x7F
    // (eight rows of sixteen entries).
    // The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase
    // and passes through all other ASCII characters.
    // If USE_STD3_RULES is set, then non-LDH characters are disallowed
    // using this data.
    // The ASCII fastpath also uses this data.
    // Values: -1=disallowed  0=valid  1=mapped (lowercase)
    private static final byte asciiData[]={
        // 0000..001F; disallowed
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        // 002D..002E; valid  #  HYPHEN-MINUS..FULL STOP
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0, -1,
        // 0030..0039; valid  #  DIGIT ZERO..DIGIT NINE
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
        // 0041..005A; mapped  #  LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
        -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1,
        // 0061..007A; valid  #  LATIN SMALL LETTER A..LATIN SMALL LETTER Z
        -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1
    };
    112 
    // Common implementation behind the four public conversion methods.
    //
    // Clears dest and resets info, then runs an ASCII fast path over src:
    // ASCII characters are copied to dest (uppercase letters are lowercased)
    // while simple per-label checks are recorded in info.
    // If the fast path reaches the end of src, dest is returned directly.
    // Otherwise the rest of the work is handed to processUnicode(), starting at
    // the current label (labelStart) and at the source index (i) where the
    // fast path stopped; a BiDi check on the overall result follows.
    //
    // src:     input label or domain name
    // isLabel: true for the single-label methods; a dot then ends the fast path
    // toASCII: true to apply the label/domain length limits in the fast path
    // dest:    output buffer, must not be the same object as src
    // info:    receives errors and flags
    // Returns dest.
    // Throws IllegalArgumentException if dest==src.
    private StringBuilder
    process(CharSequence src,
            boolean isLabel, boolean toASCII,
            StringBuilder dest,
            Info info) {
        // uts46Norm2.normalize() would do all of this error checking and setup,
        // but with the ASCII fastpath we do not always call it, and do not
        // call it first.
        if(dest==src) {
            throw new IllegalArgumentException();
        }
        // Arguments are fine, reset output values.
        dest.delete(0, 0x7fffffff);
        resetInfo(info);
        int srcLength=src.length();
        if(srcLength==0) {
            addError(info, Error.EMPTY_LABEL);
            return dest;
        }
        // ASCII fastpath
        boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
        // Index in src (and, while in the fast path, also in dest) where the current label starts.
        int labelStart=0;
        int i;
        for(i=0;; ++i) {
            if(i==srcLength) {
                // The whole input was handled by the fast path: finish the last label and return.
                if(toASCII) {
                    if((i-labelStart)>63) {
                        addLabelError(info, Error.LABEL_TOO_LONG);
                    }
                    // There is a trailing dot if labelStart==i.
                    if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
                        addError(info, Error.DOMAIN_NAME_TOO_LONG);
                    }
                }
                promoteAndResetLabelErrors(info);
                return dest;
            }
            char c=src.charAt(i);
            if(c>0x7f) {
                // Non-ASCII: leave the fast path; c has not been copied to dest.
                break;
            }
            int cData=asciiData[c];
            if(cData>0) {
                dest.append((char)(c+0x20));  // Lowercase an uppercase ASCII letter.
            } else if(cData<0 && disallowNonLDHDot) {
                break;  // Replacing with U+FFFD can be complicated for toASCII.
            } else {
                dest.append(c);
                if(c=='-') {  // hyphen
                    if(i==(labelStart+3) && src.charAt(i-1)=='-') {
                        // "??--..." is Punycode or forbidden.
                        ++i;  // '-' was copied to dest already
                        break;
                    }
                    if(i==labelStart) {
                        // label starts with "-"
                        addLabelError(info, Error.LEADING_HYPHEN);
                    }
                    if((i+1)==srcLength || src.charAt(i+1)=='.') {
                        // label ends with "-"
                        addLabelError(info, Error.TRAILING_HYPHEN);
                    }
                } else if(c=='.') {  // dot
                    if(isLabel) {
                        // Replacing with U+FFFD can be complicated for toASCII.
                        ++i;  // '.' was copied to dest already
                        break;
                    }
                    if(i==labelStart) {
                        addLabelError(info, Error.EMPTY_LABEL);
                    }
                    if(toASCII && (i-labelStart)>63) {
                        addLabelError(info, Error.LABEL_TOO_LONG);
                    }
                    // The label is complete: fold its errors into the overall result
                    // and start the next label after the dot.
                    promoteAndResetLabelErrors(info);
                    labelStart=i+1;
                }
            }
        }
        // The fast path stopped early. dest holds the first i characters;
        // processUnicode() maps the rest of src from index i and processes
        // the labels starting at labelStart.
        promoteAndResetLabelErrors(info);
        processUnicode(src, labelStart, i, isLabel, toASCII, dest, info);
        // The labels before labelStart were only handled by the fast path,
        // so they get the ASCII-only BiDi check here.
        if( isBiDi(info) && !hasCertainErrors(info, severeErrors) &&
            (!isOkBiDi(info) || (labelStart>0 && !isASCIIOkBiDi(dest, labelStart)))
        ) {
            addError(info, Error.BIDI);
        }
        return dest;
    }
    201 
    202     private StringBuilder
    203     processUnicode(CharSequence src,
    204                    int labelStart, int mappingStart,
    205                    boolean isLabel, boolean toASCII,
    206                    StringBuilder dest,
    207                    Info info) {
    208         if(mappingStart==0) {
    209             uts46Norm2.normalize(src, dest);
    210         } else {
    211             uts46Norm2.normalizeSecondAndAppend(dest, src.subSequence(mappingStart, src.length()));
    212         }
    213         boolean doMapDevChars=
    214             toASCII ? (options&NONTRANSITIONAL_TO_ASCII)==0 :
    215                       (options&NONTRANSITIONAL_TO_UNICODE)==0;
    216         int destLength=dest.length();
    217         int labelLimit=labelStart;
    218         while(labelLimit<destLength) {
    219             char c=dest.charAt(labelLimit);
    220             if(c=='.' && !isLabel) {
    221                 int labelLength=labelLimit-labelStart;
    222                 int newLength=processLabel(dest, labelStart, labelLength,
    223                                                 toASCII, info);
    224                 promoteAndResetLabelErrors(info);
    225                 destLength+=newLength-labelLength;
    226                 labelLimit=labelStart+=newLength+1;
    227             } else if(0xdf<=c && c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) {
    228                 setTransitionalDifferent(info);
    229                 if(doMapDevChars) {
    230                     destLength=mapDevChars(dest, labelStart, labelLimit);
    231                     // Do not increment labelLimit in case c was removed.
    232                     // All deviation characters have been mapped, no need to check for them again.
    233                     doMapDevChars=false;
    234                 } else {
    235                     ++labelLimit;
    236                 }
    237             } else {
    238                 ++labelLimit;
    239             }
    240         }
    241         // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok)
    242         // but not an empty label elsewhere nor a completely empty domain name.
    243         // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0.
    244         if(0==labelStart || labelStart<labelLimit) {
    245             processLabel(dest, labelStart, labelLimit-labelStart, toASCII, info);
    246             promoteAndResetLabelErrors(info);
    247         }
    248         return dest;
    249     }
    250 
    251     // returns the new dest.length()
    252     private int
    253     mapDevChars(StringBuilder dest, int labelStart, int mappingStart) {
    254         int length=dest.length();
    255         boolean didMapDevChars=false;
    256         for(int i=mappingStart; i<length;) {
    257             char c=dest.charAt(i);
    258             switch(c) {
    259             case 0xdf:
    260                 // Map sharp s to ss.
    261                 didMapDevChars=true;
    262                 dest.setCharAt(i++, 's');
    263                 dest.insert(i++, 's');
    264                 ++length;
    265                 break;
    266             case 0x3c2:  // Map final sigma to nonfinal sigma.
    267                 didMapDevChars=true;
    268                 dest.setCharAt(i++, '\u03c3');
    269                 break;
    270             case 0x200c:  // Ignore/remove ZWNJ.
    271             case 0x200d:  // Ignore/remove ZWJ.
    272                 didMapDevChars=true;
    273                 dest.delete(i, i+1);
    274                 --length;
    275                 break;
    276             default:
    277                 ++i;
    278                 break;
    279             }
    280         }
    281         if(didMapDevChars) {
    282             // Mapping deviation characters might have resulted in an un-NFC string.
    283             // We could use either the NFC or the UTS #46 normalizer.
    284             // By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file.
    285             String normalized=uts46Norm2.normalize(dest.subSequence(labelStart, dest.length()));
    286             dest.replace(labelStart, 0x7fffffff, normalized);
    287             return dest.length();
    288         }
    289         return length;
    290     }
    291     // Some non-ASCII characters are equivalent to sequences with
    292     // non-LDH ASCII characters. To find them:
    293     // grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt)
    294     private static boolean
    295     isNonASCIIDisallowedSTD3Valid(int c) {
    296         return c==0x2260 || c==0x226E || c==0x226F;
    297     }
    298 
    299 
    300     // Replace the label in dest with the label string, if the label was modified.
    301     // If label==dest then the label was modified in-place and labelLength
    302     // is the new label length, different from label.length().
    303     // If label!=dest then labelLength==label.length().
    304     // Returns labelLength (= the new label length).
    305     private static int
    306     replaceLabel(StringBuilder dest, int destLabelStart, int destLabelLength,
    307                  CharSequence label, int labelLength) {
    308         if(label!=dest) {
    309             dest.delete(destLabelStart, destLabelStart+destLabelLength).insert(destLabelStart, label);
    310             // or dest.replace(destLabelStart, destLabelStart+destLabelLength, label.toString());
    311             // which would create a String rather than moving characters in the StringBuilder.
    312         }
    313         return labelLength;
    314     }
    315 
    // Validates one label and, for toASCII, converts it to Punycode if needed.
    // The label occupies dest[labelStart, labelStart+labelLength).
    //
    // A label that starts with "xn--" is Punycode-decoded and checked with the
    // normalizer; the validity checks then run on the decoded string.
    // Problems are recorded as label errors in info, and offending characters
    // are replaced with U+FFFD.
    //
    // dest:        buffer that contains the label; may be modified
    // labelStart:  index of the label's first char in dest
    // labelLength: number of chars in the label
    // toASCII:     true for conversion to ASCII, false for conversion to Unicode
    // info:        receives label errors and flags
    // returns the new label length
    private int
    processLabel(StringBuilder dest,
                 int labelStart, int labelLength,
                 boolean toASCII,
                 Info info) {
        StringBuilder fromPunycode;
        // The string that the checks below read and modify:
        // either dest itself or the Punycode-decoded copy of the label.
        StringBuilder labelString;
        // Position and length of the label inside dest; labelStart/labelLength
        // are redirected to the decoded string for a Punycode label.
        int destLabelStart=labelStart;
        int destLabelLength=labelLength;
        boolean wasPunycode;
        if( labelLength>=4 &&
            dest.charAt(labelStart)=='x' && dest.charAt(labelStart+1)=='n' &&
            dest.charAt(labelStart+2)=='-' && dest.charAt(labelStart+3)=='-'
        ) {
            // Label starts with "xn--", try to un-Punycode it.
            wasPunycode=true;
            try {
                fromPunycode=Punycode.decode(dest.subSequence(labelStart+4, labelStart+labelLength), null);
            } catch (StringPrepParseException e) {
                addLabelError(info, Error.PUNYCODE);
                return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
            }
            // Check for NFC, and for characters that are not
            // valid or deviation characters according to the normalizer.
            // If there is something wrong, then the string will change.
            // Note that the normalizer passes through non-LDH ASCII and deviation characters.
            // Deviation characters are ok in Punycode even in transitional processing.
            // In the code further below, if we find non-LDH ASCII and we have USE_STD3_RULES
            // then we will set Error.INVALID_ACE_LABEL there too.
            boolean isValid=uts46Norm2.isNormalized(fromPunycode);
            if(!isValid) {
                addLabelError(info, Error.INVALID_ACE_LABEL);
                return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
            }
            labelString=fromPunycode;
            labelStart=0;
            labelLength=fromPunycode.length();
        } else {
            wasPunycode=false;
            labelString=dest;
        }
        // Validity check
        if(labelLength==0) {
            addLabelError(info, Error.EMPTY_LABEL);
            return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength);
        }
        // labelLength>0
        if(labelLength>=4 && labelString.charAt(labelStart+2)=='-' && labelString.charAt(labelStart+3)=='-') {
            // label starts with "??--"
            addLabelError(info, Error.HYPHEN_3_4);
        }
        if(labelString.charAt(labelStart)=='-') {
            // label starts with "-"
            addLabelError(info, Error.LEADING_HYPHEN);
        }
        if(labelString.charAt(labelStart+labelLength-1)=='-') {
            // label ends with "-"
            addLabelError(info, Error.TRAILING_HYPHEN);
        }
        // If the label was not a Punycode label, then it was the result of
        // mapping, normalization and label segmentation.
        // If the label was in Punycode, then we mapped it again above
        // and checked its validity.
        // Now we handle the STD3 restriction to LDH characters (if set)
        // and we look for U+FFFD which indicates disallowed characters
        // in a non-Punycode label or U+FFFD itself in a Punycode label.
        // We also check for dots which can come from the input to a single-label function.
        // labelString is modified in place: it is either dest or the decoded copy made above.
        int i=labelStart;
        int limit=labelStart+labelLength;
        // Bitwise OR of all non-ASCII chars in the label; used below as a cheap
        // filter for whether the contextual checks and Punycode encoding are needed.
        char oredChars=0;
        // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed.
        boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
        do {
            char c=labelString.charAt(i);
            if(c<=0x7f) {
                if(c=='.') {
                    addLabelError(info, Error.LABEL_HAS_DOT);
                    labelString.setCharAt(i, '\ufffd');
                } else if(disallowNonLDHDot && asciiData[c]<0) {
                    addLabelError(info, Error.DISALLOWED);
                    labelString.setCharAt(i, '\ufffd');
                }
            } else {
                oredChars|=c;
                if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) {
                    addLabelError(info, Error.DISALLOWED);
                    labelString.setCharAt(i, '\ufffd');
                } else if(c==0xfffd) {
                    addLabelError(info, Error.DISALLOWED);
                }
            }
            ++i;
        } while(i<limit);
        // Check for a leading combining mark after other validity checks
        // so that we don't report IDNA.Error.DISALLOWED for the U+FFFD from here.
        int c;
        // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD.
        c=labelString.codePointAt(labelStart);
        if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) {
            addLabelError(info, Error.LEADING_COMBINING_MARK);
            labelString.setCharAt(labelStart, '\ufffd');
            if(c>0xffff) {
                // Remove c's trail surrogate.
                labelString.deleteCharAt(labelStart+1);
                --labelLength;
                if(labelString==dest) {
                    // The label was edited directly in dest, so its length there shrank too.
                    --destLabelLength;
                }
            }
        }
        if(!hasCertainLabelErrors(info, severeErrors)) {
            // Do contextual checks only if we do not have U+FFFD from a severe error
            // because U+FFFD can make these checks fail.
            if((options&CHECK_BIDI)!=0 && (!isBiDi(info) || isOkBiDi(info))) {
                checkLabelBiDi(labelString, labelStart, labelLength, info);
            }
            // ZWNJ/ZWJ (U+200C/U+200D) can only be present if the OR of the chars has all bits of 0x200c.
            if( (options&CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c &&
                !isLabelOkContextJ(labelString, labelStart, labelLength)
            ) {
                addLabelError(info, Error.CONTEXTJ);
            }
            if((options&CHECK_CONTEXTO)!=0 && oredChars>=0xb7) {
                checkLabelContextO(labelString, labelStart, labelLength, info);
            }
            if(toASCII) {
                if(wasPunycode) {
                    // Leave a Punycode label unchanged if it has no severe errors.
                    if(destLabelLength>63) {
                        addLabelError(info, Error.LABEL_TOO_LONG);
                    }
                    return destLabelLength;
                } else if(oredChars>=0x80) {
                    // Contains non-ASCII characters.
                    StringBuilder punycode;
                    try {
                        punycode=Punycode.encode(labelString.subSequence(labelStart, labelStart+labelLength), null);
                    } catch (StringPrepParseException e) {
                        throw new ICUException(e);  // unexpected
                    }
                    punycode.insert(0, "xn--");
                    if(punycode.length()>63) {
                        addLabelError(info, Error.LABEL_TOO_LONG);
                    }
                    return replaceLabel(dest, destLabelStart, destLabelLength,
                                        punycode, punycode.length());
                } else {
                    // all-ASCII label
                    if(labelLength>63) {
                        addLabelError(info, Error.LABEL_TOO_LONG);
                    }
                }
            }
        } else {
            // If a Punycode label has severe errors,
            // then leave it but make sure it does not look valid.
            if(wasPunycode) {
                addLabelError(info, Error.INVALID_ACE_LABEL);
                return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info);
            }
        }
        // Write the (possibly decoded and modified) label back into dest.
        return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength);
    }
    480     private int
    481     markBadACELabel(StringBuilder dest,
    482                     int labelStart, int labelLength,
    483                     boolean toASCII, Info info) {
    484         boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
    485         boolean isASCII=true;
    486         boolean onlyLDH=true;
    487         int i=labelStart+4;  // After the initial "xn--".
    488         int limit=labelStart+labelLength;
    489         do {
    490             char c=dest.charAt(i);
    491             if(c<=0x7f) {
    492                 if(c=='.') {
    493                     addLabelError(info, Error.LABEL_HAS_DOT);
    494                     dest.setCharAt(i, '\ufffd');
    495                     isASCII=onlyLDH=false;
    496                 } else if(asciiData[c]<0) {
    497                     onlyLDH=false;
    498                     if(disallowNonLDHDot) {
    499                         dest.setCharAt(i, '\ufffd');
    500                         isASCII=false;
    501                     }
    502                 }
    503             } else {
    504                 isASCII=onlyLDH=false;
    505             }
    506         } while(++i<limit);
    507         if(onlyLDH) {
    508             dest.insert(labelStart+labelLength, '\ufffd');
    509             ++labelLength;
    510         } else {
    511             if(toASCII && isASCII && labelLength>63) {
    512                 addLabelError(info, Error.LABEL_TOO_LONG);
    513             }
    514         }
    515         return labelLength;
    516     }
    517 
    // Bit masks over BiDi classes (UCharacterDirection values), used by checkLabelBiDi().
    // Each name lists the classes whose bits are set in the mask.
    // NOTE(review): U_MASK() is defined outside this view; presumably it returns
    // the single-bit mask for its argument - confirm.

    // L
    private static final int L_MASK=U_MASK(UCharacterDirection.LEFT_TO_RIGHT);
    // R, AL
    private static final int R_AL_MASK=
        U_MASK(UCharacterDirection.RIGHT_TO_LEFT)|
        U_MASK(UCharacterDirection.RIGHT_TO_LEFT_ARABIC);
    private static final int L_R_AL_MASK=L_MASK|R_AL_MASK;

    // R, AL, AN: the classes that make a label an RTL label.
    private static final int R_AL_AN_MASK=R_AL_MASK|U_MASK(UCharacterDirection.ARABIC_NUMBER);

    // EN, AN
    private static final int EN_AN_MASK=
        U_MASK(UCharacterDirection.EUROPEAN_NUMBER)|
        U_MASK(UCharacterDirection.ARABIC_NUMBER);
    private static final int R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK;
    private static final int L_EN_MASK=L_MASK|U_MASK(UCharacterDirection.EUROPEAN_NUMBER);

    // ES, CS, ET, ON, BN, NSM: classes allowed inside both LTR and RTL labels.
    private static final int ES_CS_ET_ON_BN_NSM_MASK=
        U_MASK(UCharacterDirection.EUROPEAN_NUMBER_SEPARATOR)|
        U_MASK(UCharacterDirection.COMMON_NUMBER_SEPARATOR)|
        U_MASK(UCharacterDirection.EUROPEAN_NUMBER_TERMINATOR)|
        U_MASK(UCharacterDirection.OTHER_NEUTRAL)|
        U_MASK(UCharacterDirection.BOUNDARY_NEUTRAL)|
        U_MASK(UCharacterDirection.DIR_NON_SPACING_MARK);
    // All classes allowed in an LTR label.
    private static final int L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
    // All classes allowed in an RTL label.
    private static final int R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
    541 
    // Checks one label against the IDNA2008 BiDi Rule and records the result in info.
    //
    // We scan the whole label and check both for whether it contains RTL characters
    // and whether it passes the BiDi Rule.
    // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find
    // that a domain name is a BiDi domain name (has an RTL label) only after
    // processing several earlier labels.
    //
    // label:       the string that contains the label
    // labelStart:  index of the label's first char; the label must not be empty
    //              because its first code point is read unconditionally
    // labelLength: number of chars in the label
    // info:        setNotOkBiDi(info) is called when a condition of the rule fails;
    //              setBiDi(info) is called when the label contains an R, AL or AN character
    private void
    checkLabelBiDi(CharSequence label, int labelStart, int labelLength, Info info) {
        // IDNA2008 BiDi rule
        // Get the directionality of the first character.
        int c;
        int i=labelStart;
        c=Character.codePointAt(label, i);
        i+=Character.charCount(c);
        int firstMask=U_MASK(UBiDiProps.INSTANCE.getClass(c));
        // 1. The first character must be a character with BIDI property L, R
        // or AL.  If it has the R or AL property, it is an RTL label; if it
        // has the L property, it is an LTR label.
        if((firstMask&~L_R_AL_MASK)!=0) {
            setNotOkBiDi(info);
        }
        // Get the directionality of the last non-NSM character.
        // This scans backward from the end and shrinks labelLimit past the
        // trailing NSMs and the last non-NSM character.
        int lastMask;
        int labelLimit=labelStart+labelLength;
        for(;;) {
            if(i>=labelLimit) {
                // Only the first character is left (followed by NSMs, if anything).
                lastMask=firstMask;
                break;
            }
            c=Character.codePointBefore(label, labelLimit);
            labelLimit-=Character.charCount(c);
            int dir=UBiDiProps.INSTANCE.getClass(c);
            if(dir!=UCharacterDirection.DIR_NON_SPACING_MARK) {
                lastMask=U_MASK(dir);
                break;
            }
        }
        // 3. In an RTL label, the end of the label must be a character with
        // BIDI property R, AL, EN or AN, followed by zero or more
        // characters with BIDI property NSM.
        // 6. In an LTR label, the end of the label must be a character with
        // BIDI property L or EN, followed by zero or more characters with
        // BIDI property NSM.
        if( (firstMask&L_MASK)!=0 ?
                (lastMask&~L_EN_MASK)!=0 :
                (lastMask&~R_AL_EN_AN_MASK)!=0
        ) {
            setNotOkBiDi(info);
        }
        // Add the directionalities of the intervening characters.
        // i is just after the first character, labelLimit just before the last non-NSM one.
        int mask=firstMask|lastMask;
        while(i<labelLimit) {
            c=Character.codePointAt(label, i);
            i+=Character.charCount(c);
            mask|=U_MASK(UBiDiProps.INSTANCE.getClass(c));
        }
        if((firstMask&L_MASK)!=0) {
            // 5. In an LTR label, only characters with the BIDI properties L, EN,
            // ES, CS, ET, ON, BN and NSM are allowed.
            if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
                setNotOkBiDi(info);
            }
        } else {
            // 2. In an RTL label, only characters with the BIDI properties R, AL,
            // AN, EN, ES, CS, ET, ON, BN and NSM are allowed.
            if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
                setNotOkBiDi(info);
            }
            // 4. In an RTL label, if an EN is present, no AN may be present, and
            // vice versa.
            if((mask&EN_AN_MASK)==EN_AN_MASK) {
                setNotOkBiDi(info);
            }
        }
        // An RTL label is a label that contains at least one character of type
        // R, AL or AN. [...]
        // A "BIDI domain name" is a domain name that contains at least one RTL
        // label. [...]
        // The following rule, consisting of six conditions, applies to labels
        // in BIDI domain names.
        if((mask&R_AL_AN_MASK)!=0) {
            setBiDi(info);
        }
    }
    625 
    626     // Special code for the ASCII prefix of a BiDi domain name.
    627     // The ASCII prefix is all-LTR.
    628 
    629     // IDNA2008 BiDi rule, parts relevant to ASCII labels:
    630     // 1. The first character must be a character with BIDI property L [...]
    631     // 5. In an LTR label, only characters with the BIDI properties L, EN,
    632     // ES, CS, ET, ON, BN and NSM are allowed.
    633     // 6. In an LTR label, the end of the label must be a character with
    634     // BIDI property L or EN [...]
    635 
    636     // UTF-16 version, called for mapped ASCII prefix.
    637     // Cannot contain uppercase A-Z.
    638     // s[length-1] must be the trailing dot.
    639     private static boolean
    640     isASCIIOkBiDi(CharSequence s, int length) {
    641         int labelStart=0;
    642         for(int i=0; i<length; ++i) {
    643             char c=s.charAt(i);
    644             if(c=='.') {  // dot
    645                 if(i>labelStart) {
    646                     c=s.charAt(i-1);
    647                     if(!('a'<=c && c<='z') && !('0'<=c && c<='9')) {
    648                         // Last character in the label is not an L or EN.
    649                         return false;
    650                     }
    651                 }
    652                 labelStart=i+1;
    653             } else if(i==labelStart) {
    654                 if(!('a'<=c && c<='z')) {
    655                     // First character in the label is not an L.
    656                     return false;
    657                 }
    658             } else {
    659                 if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {
    660                     // Intermediate character in the label is a B, S or WS.
    661                     return false;
    662                 }
    663             }
    664         }
    665         return true;
    666     }
    667 
    668     private boolean
    669     isLabelOkContextJ(CharSequence label, int labelStart, int labelLength) {
    670         // [IDNA2008-Tables]
    671         // 200C..200D  ; CONTEXTJ    # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
    672         int labelLimit=labelStart+labelLength;
    673         for(int i=labelStart; i<labelLimit; ++i) {
    674             if(label.charAt(i)==0x200c) {
    675                 // Appendix A.1. ZERO WIDTH NON-JOINER
    676                 // Rule Set:
    677                 //  False;
    678                 //  If Canonical_Combining_Class(Before(cp)) .eq.  Virama Then True;
    679                 //  If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C
    680                 //     (Joining_Type:T)*(Joining_Type:{R,D})) Then True;
    681                 if(i==labelStart) {
    682                     return false;
    683                 }
    684                 int c;
    685                 int j=i;
    686                 c=Character.codePointBefore(label, j);
    687                 j-=Character.charCount(c);
    688                 if(uts46Norm2.getCombiningClass(c)==9) {
    689                     continue;
    690                 }
    691                 // check precontext (Joining_Type:{L,D})(Joining_Type:T)*
    692                 for(;;) {
    693                     /* UJoiningType */ int type=UBiDiProps.INSTANCE.getJoiningType(c);
    694                     if(type==UCharacter.JoiningType.TRANSPARENT) {
    695                         if(j==0) {
    696                             return false;
    697                         }
    698                         c=Character.codePointBefore(label, j);
    699                         j-=Character.charCount(c);
    700                     } else if(type==UCharacter.JoiningType.LEFT_JOINING || type==UCharacter.JoiningType.DUAL_JOINING) {
    701                         break;  // precontext fulfilled
    702                     } else {
    703                         return false;
    704                     }
    705                 }
    706                 // check postcontext (Joining_Type:T)*(Joining_Type:{R,D})
    707                 for(j=i+1;;) {
    708                     if(j==labelLimit) {
    709                         return false;
    710                     }
    711                     c=Character.codePointAt(label, j);
    712                     j+=Character.charCount(c);
    713                     /* UJoiningType */ int type=UBiDiProps.INSTANCE.getJoiningType(c);
    714                     if(type==UCharacter.JoiningType.TRANSPARENT) {
    715                         // just skip this character
    716                     } else if(type==UCharacter.JoiningType.RIGHT_JOINING || type==UCharacter.JoiningType.DUAL_JOINING) {
    717                         break;  // postcontext fulfilled
    718                     } else {
    719                         return false;
    720                     }
    721                 }
    722             } else if(label.charAt(i)==0x200d) {
    723                 // Appendix A.2. ZERO WIDTH JOINER (U+200D)
    724                 // Rule Set:
    725                 //  False;
    726                 //  If Canonical_Combining_Class(Before(cp)) .eq.  Virama Then True;
    727                 if(i==labelStart) {
    728                     return false;
    729                 }
    730                 int c=Character.codePointBefore(label, i);
    731                 if(uts46Norm2.getCombiningClass(c)!=9) {
    732                     return false;
    733                 }
    734             }
    735         }
    736         return true;
    737     }
    738 
    739     private void
    740     checkLabelContextO(CharSequence label, int labelStart, int labelLength, Info info) {
    741         int labelEnd=labelStart+labelLength-1;  // inclusive
    742         int arabicDigits=0;  // -1 for 066x, +1 for 06Fx
    743         for(int i=labelStart; i<=labelEnd; ++i) {
    744             int c=label.charAt(i);
    745             if(c<0xb7) {
    746                 // ASCII fastpath
    747             } else if(c<=0x6f9) {
    748                 if(c==0xb7) {
    749                     // Appendix A.3. MIDDLE DOT (U+00B7)
    750                     // Rule Set:
    751                     //  False;
    752                     //  If Before(cp) .eq.  U+006C And
    753                     //     After(cp) .eq.  U+006C Then True;
    754                     if(!(labelStart<i && label.charAt(i-1)=='l' &&
    755                          i<labelEnd && label.charAt(i+1)=='l')) {
    756                         addLabelError(info, Error.CONTEXTO_PUNCTUATION);
    757                     }
    758                 } else if(c==0x375) {
    759                     // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
    760                     // Rule Set:
    761                     //  False;
    762                     //  If Script(After(cp)) .eq.  Greek Then True;
    763                     if(!(i<labelEnd &&
    764                          UScript.GREEK==UScript.getScript(Character.codePointAt(label, i+1)))) {
    765                         addLabelError(info, Error.CONTEXTO_PUNCTUATION);
    766                     }
    767                 } else if(c==0x5f3 || c==0x5f4) {
    768                     // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
    769                     // Rule Set:
    770                     //  False;
    771                     //  If Script(Before(cp)) .eq.  Hebrew Then True;
    772                     //
    773                     // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
    774                     // Rule Set:
    775                     //  False;
    776                     //  If Script(Before(cp)) .eq.  Hebrew Then True;
    777                     if(!(labelStart<i &&
    778                          UScript.HEBREW==UScript.getScript(Character.codePointBefore(label, i)))) {
    779                         addLabelError(info, Error.CONTEXTO_PUNCTUATION);
    780                     }
    781                 } else if(0x660<=c /* && c<=0x6f9 */) {
    782                     // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
    783                     // Rule Set:
    784                     //  True;
    785                     //  For All Characters:
    786                     //    If cp .in. 06F0..06F9 Then False;
    787                     //  End For;
    788                     //
    789                     // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
    790                     // Rule Set:
    791                     //  True;
    792                     //  For All Characters:
    793                     //    If cp .in. 0660..0669 Then False;
    794                     //  End For;
    795                     if(c<=0x669) {
    796                         if(arabicDigits>0) {
    797                             addLabelError(info, Error.CONTEXTO_DIGITS);
    798                         }
    799                         arabicDigits=-1;
    800                     } else if(0x6f0<=c) {
    801                         if(arabicDigits<0) {
    802                             addLabelError(info, Error.CONTEXTO_DIGITS);
    803                         }
    804                         arabicDigits=1;
    805                     }
    806                 }
    807             } else if(c==0x30fb) {
    808                 // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
    809                 // Rule Set:
    810                 //  False;
    811                 //  For All Characters:
    812                 //    If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
    813                 //  End For;
    814                 for(int j=labelStart;; j+=Character.charCount(c)) {
    815                     if(j>labelEnd) {
    816                         addLabelError(info, Error.CONTEXTO_PUNCTUATION);
    817                         break;
    818                     }
    819                     c=Character.codePointAt(label, j);
    820                     int script=UScript.getScript(c);
    821                     if(script==UScript.HIRAGANA || script==UScript.KATAKANA || script==UScript.HAN) {
    822                         break;
    823                     }
    824                 }
    825             }
    826         }
    827     }
    828 
    829     // TODO: make public(?) -- in C, these are public in uchar.h
    830     private static int U_MASK(int x) {
    831         return 1<<x;
    832     }
    833     private static int U_GET_GC_MASK(int c) {
    834         return (1<<UCharacter.getType(c));
    835     }
    836     private static int U_GC_M_MASK=
    837         U_MASK(UCharacterCategory.NON_SPACING_MARK)|
    838         U_MASK(UCharacterCategory.ENCLOSING_MARK)|
    839         U_MASK(UCharacterCategory.COMBINING_SPACING_MARK);
    840 }
    841