Home | History | Annotate | Download | only in text
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5  *******************************************************************************
      6  * Copyright (C) 2003-2015, International Business Machines Corporation and
      7  * others. All Rights Reserved.
      8  *******************************************************************************
      9  */
     10 
     11 package android.icu.text;
     12 
     13 import java.io.IOException;
     14 import java.io.InputStream;
     15 import java.lang.ref.WeakReference;
     16 import java.nio.ByteBuffer;
     17 
     18 import android.icu.impl.CharTrie;
     19 import android.icu.impl.ICUBinary;
     20 import android.icu.impl.StringPrepDataReader;
     21 import android.icu.impl.UBiDiProps;
     22 import android.icu.lang.UCharacter;
     23 import android.icu.lang.UCharacterDirection;
     24 import android.icu.util.ICUUncheckedIOException;
     25 import android.icu.util.VersionInfo;
     26 
     27 /**
     28  * StringPrep API implements the StingPrep framework as described by
     29  * <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>.
     30  * StringPrep prepares Unicode strings for use in network protocols.
     31  * Profiles of StingPrep are set of rules and data according to which the
     32  * Unicode Strings are prepared. Each profiles contains tables which describe
     33  * how a code point should be treated. The tables are broadly classied into
     34  * <ul>
     35  *     <li> Unassigned Table: Contains code points that are unassigned
     36  *          in the Unicode Version supported by StringPrep. Currently
     37  *          RFC 3454 supports Unicode 3.2. </li>
     38  *     <li> Prohibited Table: Contains code points that are prohibted from
     39  *          the output of the StringPrep processing function. </li>
     40  *     <li> Mapping Table: Contains code ponts that are deleted from the output or case mapped. </li>
     41  * </ul>
     42  *
     43  * The procedure for preparing Unicode strings:
     44  * <ol>
     45  *      <li> Map: For each character in the input, check if it has a mapping
     46  *           and, if so, replace it with its mapping. </li>
     47  *      <li> Normalize: Possibly normalize the result of step 1 using Unicode
     48  *           normalization. </li>
     49  *      <li> Prohibit: Check for any characters that are not allowed in the
     50  *           output.  If any are found, return an error.</li>
     51  *      <li> Check bidi: Possibly check for right-to-left characters, and if
     52  *           any are found, make sure that the whole string satisfies the
     53  *           requirements for bidirectional strings.  If the string does not
     54  *           satisfy the requirements for bidirectional strings, return an
     55  *           error.  </li>
     56  * </ol>
     57  * @author Ram Viswanadha
     58  * @hide Only a subset of ICU is exposed in Android
     59  */
     60 public final class StringPrep {
     61     /**
     62      * Option to prohibit processing of unassigned code points in the input
     63      *
     64      * @see   #prepare
     65      */
     66     public static final int DEFAULT = 0x0000;
     67 
     68     /**
     69      * Option to allow processing of unassigned code points in the input
     70      *
     71      * @see   #prepare
     72      */
     73     public static final int ALLOW_UNASSIGNED = 0x0001;
     74 
     75     /**
     76      * Profile type: RFC3491 Nameprep
     77      * @see #getInstance(int)
     78      */
     79     public static final int RFC3491_NAMEPREP = 0;
     80 
     81     /**
     82      * Profile type: RFC3530 nfs4_cs_prep
     83      * @see #getInstance(int)
     84      */
     85     public static final int RFC3530_NFS4_CS_PREP = 1;
     86 
     87     /**
     88      * Profile type: RFC3530 nfs4_cs_prep with case insensitive option
     89      * @see #getInstance(int)
     90      */
     91     public static final int RFC3530_NFS4_CS_PREP_CI = 2;
     92 
     93     /**
     94      * Profile type: RFC3530 nfs4_cis_prep
     95      * @see #getInstance(int)
     96      */
     97     public static final int RFC3530_NFS4_CIS_PREP = 3;
     98 
     99     /**
    100      * Profile type: RFC3530 nfs4_mixed_prep for prefix
    101      * @see #getInstance(int)
    102      */
    103     public static final int RFC3530_NFS4_MIXED_PREP_PREFIX = 4;
    104 
    105     /**
    106      * Profile type: RFC3530 nfs4_mixed_prep for suffix
    107      * @see #getInstance(int)
    108      */
    109     public static final int RFC3530_NFS4_MIXED_PREP_SUFFIX = 5;
    110 
    111     /**
    112      * Profile type: RFC3722 iSCSI
    113      * @see #getInstance(int)
    114      */
    115     public static final int RFC3722_ISCSI = 6;
    116 
    117     /**
    118      * Profile type: RFC3920 XMPP Nodeprep
    119      * @see #getInstance(int)
    120      */
    121     public static final int RFC3920_NODEPREP = 7;
    122 
    123     /**
    124      * Profile type: RFC3920 XMPP Resourceprep
    125      * @see #getInstance(int)
    126      */
    127     public static final int RFC3920_RESOURCEPREP = 8;
    128 
    129     /**
    130      * Profile type: RFC4011 Policy MIB Stringprep
    131      * @see #getInstance(int)
    132      */
    133     public static final int RFC4011_MIB = 9;
    134 
    135     /**
    136      * Profile type: RFC4013 SASLprep
    137      * @see #getInstance(int)
    138      */
    139     public static final int RFC4013_SASLPREP = 10;
    140 
    141     /**
    142      * Profile type: RFC4505 trace
    143      * @see #getInstance(int)
    144      */
    145     public static final int RFC4505_TRACE = 11;
    146 
    147     /**
    148      * Profile type: RFC4518 LDAP
    149      * @see #getInstance(int)
    150      */
    151     public static final int RFC4518_LDAP = 12;
    152 
    153     /**
    154      * Profile type: RFC4518 LDAP for case ignore, numeric and stored prefix
    155      * matching rules
    156      * @see #getInstance(int)
    157      */
    158     public static final int RFC4518_LDAP_CI = 13;
    159 
    160     // Last available profile
    161     private static final int MAX_PROFILE = RFC4518_LDAP_CI;
    162 
    163     // Profile names must be aligned to profile type definitions
    164     private static final String[] PROFILE_NAMES = {
    165         "rfc3491",      /* RFC3491_NAMEPREP */
    166         "rfc3530cs",    /* RFC3530_NFS4_CS_PREP */
    167         "rfc3530csci",  /* RFC3530_NFS4_CS_PREP_CI */
    168         "rfc3491",      /* RFC3530_NSF4_CIS_PREP */
    169         "rfc3530mixp",  /* RFC3530_NSF4_MIXED_PREP_PREFIX */
    170         "rfc3491",      /* RFC3530_NSF4_MIXED_PREP_SUFFIX */
    171         "rfc3722",      /* RFC3722_ISCSI */
    172         "rfc3920node",  /* RFC3920_NODEPREP */
    173         "rfc3920res",   /* RFC3920_RESOURCEPREP */
    174         "rfc4011",      /* RFC4011_MIB */
    175         "rfc4013",      /* RFC4013_SASLPREP */
    176         "rfc4505",      /* RFC4505_TRACE */
    177         "rfc4518",      /* RFC4518_LDAP */
    178         "rfc4518ci",    /* RFC4518_LDAP_CI */
    179     };
    180 
    181     @SuppressWarnings({"unchecked", "rawtypes"})
    182     private static final WeakReference<StringPrep>[] CACHE = (WeakReference<StringPrep>[])new WeakReference[MAX_PROFILE+1];
    183 
    184     private static final int UNASSIGNED        = 0x0000;
    185     private static final int MAP               = 0x0001;
    186     private static final int PROHIBITED        = 0x0002;
    187     private static final int DELETE            = 0x0003;
    188     private static final int TYPE_LIMIT        = 0x0004;
    189 
    190     private static final int NORMALIZATION_ON  = 0x0001;
    191     private static final int CHECK_BIDI_ON     = 0x0002;
    192 
    193     private static final int TYPE_THRESHOLD       = 0xFFF0;
    194     private static final int MAX_INDEX_VALUE      = 0x3FBF;   /*16139*/
    195     //private static final int MAX_INDEX_TOP_LENGTH = 0x0003;
    196 
    197     /* indexes[] value names */
    198 //  private static final int INDEX_TRIE_SIZE                  =  0; /* number of bytes in normalization trie */
    199     private static final int INDEX_MAPPING_DATA_SIZE          =  1; /* The array that contains the mapping   */
    200     private static final int NORM_CORRECTNS_LAST_UNI_VERSION  =  2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */
    201     private static final int ONE_UCHAR_MAPPING_INDEX_START    =  3; /* The starting index of 1 UChar mapping index in the mapping data array */
    202     private static final int TWO_UCHARS_MAPPING_INDEX_START   =  4; /* The starting index of 2 UChars mapping index in the mapping data array */
    203     private static final int THREE_UCHARS_MAPPING_INDEX_START =  5;
    204     private static final int FOUR_UCHARS_MAPPING_INDEX_START  =  6;
    205     private static final int OPTIONS                          =  7; /* Bit set of options to turn on in the profile */
    206     private static final int INDEX_TOP                        = 16;                          /* changing this requires a new formatVersion */
    207 
    208 
    209     // CharTrie implmentation for reading the trie data
    210     private CharTrie sprepTrie;
    211     // Indexes read from the data file
    212     private int[] indexes;
    213     // mapping data read from the data file
    214     private char[] mappingData;
    215     // the version of Unicode supported by the data file
    216     private VersionInfo sprepUniVer;
    217     // the Unicode version of last entry in the
    218     // NormalizationCorrections.txt file if normalization
    219     // is turned on
    220     private VersionInfo normCorrVer;
    221     // Option to turn on Normalization
    222     private boolean doNFKC;
    223     // Option to turn on checking for BiDi rules
    224     private boolean checkBiDi;
    225     // bidi properties
    226     private UBiDiProps bdp;
    227 
    228     private char getCodePointValue(int ch){
    229         return sprepTrie.getCodePointValue(ch);
    230     }
    231 
    232     private static VersionInfo getVersionInfo(int comp){
    233         int micro = comp & 0xFF;
    234         int milli =(comp >> 8)  & 0xFF;
    235         int minor =(comp >> 16) & 0xFF;
    236         int major =(comp >> 24) & 0xFF;
    237         return VersionInfo.getInstance(major,minor,milli,micro);
    238     }
    239 
    240     private static VersionInfo getVersionInfo(byte[] version){
    241         if(version.length != 4){
    242             return null;
    243         }
    244         return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]);
    245     }
    246 
    247     /**
    248      * Creates an StringPrep object after reading the input stream.
    249      * The object does not hold a reference to the input steam, so the stream can be
    250      * closed after the method returns.
    251      *
    252      * @param inputStream The stream for reading the StringPrep profile binarySun
    253      * @throws IOException An exception occurs when I/O of the inputstream is invalid
    254      */
    255     public StringPrep(InputStream inputStream) throws IOException{
    256         // TODO: Add a public constructor that takes ByteBuffer directly.
    257         this(ICUBinary.getByteBufferFromInputStreamAndCloseStream(inputStream));
    258     }
    259 
    260     private StringPrep(ByteBuffer bytes) throws IOException {
    261         StringPrepDataReader reader = new StringPrepDataReader(bytes);
    262 
    263         // read the indexes
    264         indexes = reader.readIndexes(INDEX_TOP);
    265 
    266         sprepTrie = new CharTrie(bytes, null);
    267 
    268         //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes
    269         // load the rest of the data data and initialize the data members
    270         mappingData = reader.read(indexes[INDEX_MAPPING_DATA_SIZE]/2);
    271 
    272         // get the options
    273         doNFKC            = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
    274         checkBiDi         = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
    275         sprepUniVer   = getVersionInfo(reader.getUnicodeVersion());
    276         normCorrVer   = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
    277         VersionInfo normUniVer = UCharacter.getUnicodeVersion();
    278         if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
    279            normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
    280            ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
    281            ){
    282             throw new IOException("Normalization Correction version not supported");
    283         }
    284 
    285         if(checkBiDi) {
    286             bdp=UBiDiProps.INSTANCE;
    287         }
    288     }
    289 
    290     /**
    291      * Gets a StringPrep instance for the specified profile
    292      *
    293      * @param profile The profile passed to find the StringPrep instance.
    294      */
    295     public static StringPrep getInstance(int profile) {
    296         if (profile < 0 || profile > MAX_PROFILE) {
    297             throw new IllegalArgumentException("Bad profile type");
    298         }
    299 
    300         StringPrep instance = null;
    301 
    302         // A StringPrep instance is immutable.  We use a single instance
    303         // per type and store it in the internal cache.
    304         synchronized (CACHE) {
    305             WeakReference<StringPrep> ref = CACHE[profile];
    306             if (ref != null) {
    307                 instance = ref.get();
    308             }
    309 
    310             if (instance == null) {
    311                 ByteBuffer bytes = ICUBinary.getRequiredData(PROFILE_NAMES[profile] + ".spp");
    312                 if (bytes != null) {
    313                     try {
    314                         instance = new StringPrep(bytes);
    315                     } catch (IOException e) {
    316                         throw new ICUUncheckedIOException(e);
    317                     }
    318                 }
    319                 if (instance != null) {
    320                     CACHE[profile] = new WeakReference<StringPrep>(instance);
    321                 }
    322             }
    323         }
    324         return instance;
    325     }
    326 
    327     private static final class Values{
    328         boolean isIndex;
    329         int value;
    330         int type;
    331         public void reset(){
    332             isIndex = false;
    333             value = 0;
    334             type = -1;
    335         }
    336     }
    337 
    338     private static final void getValues(char trieWord,Values values){
    339         values.reset();
    340         if(trieWord == 0){
    341             /*
    342              * Initial value stored in the mapping table
    343              * just return TYPE_LIMIT .. so that
    344              * the source codepoint is copied to the destination
    345              */
    346             values.type = TYPE_LIMIT;
    347         }else if(trieWord >= TYPE_THRESHOLD){
    348             values.type = (trieWord - TYPE_THRESHOLD);
    349         }else{
    350             /* get the type */
    351             values.type = MAP;
    352             /* ascertain if the value is index or delta */
    353             if((trieWord & 0x02)>0){
    354                 values.isIndex = true;
    355                 values.value = trieWord  >> 2; //mask off the lower 2 bits and shift
    356 
    357             }else{
    358                 values.isIndex = false;
    359                 values.value = (trieWord<<16)>>16;
    360                 values.value =  (values.value >> 2);
    361 
    362             }
    363 
    364             if((trieWord>>2) == MAX_INDEX_VALUE){
    365                 values.type = DELETE;
    366                 values.isIndex = false;
    367                 values.value = 0;
    368             }
    369         }
    370     }
    371 
    372 
    373 
    374     private StringBuffer map( UCharacterIterator iter, int options)
    375                             throws StringPrepParseException{
    376 
    377         Values val = new Values();
    378         char result = 0;
    379         int ch  = UCharacterIterator.DONE;
    380         StringBuffer dest = new StringBuffer();
    381         boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0);
    382 
    383         while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
    384 
    385             result = getCodePointValue(ch);
    386             getValues(result,val);
    387 
    388             // check if the source codepoint is unassigned
    389             if(val.type == UNASSIGNED && allowUnassigned == false){
    390                  throw new StringPrepParseException("An unassigned code point was found in the input",
    391                                           StringPrepParseException.UNASSIGNED_ERROR,
    392                                           iter.getText(),iter.getIndex());
    393             }else if((val.type == MAP)){
    394                 int index, length;
    395 
    396                 if(val.isIndex){
    397                     index = val.value;
    398                     if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
    399                              index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){
    400                         length = 1;
    401                     }else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
    402                              index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){
    403                         length = 2;
    404                     }else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
    405                              index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){
    406                         length = 3;
    407                     }else{
    408                         length = mappingData[index++];
    409                     }
    410                     /* copy mapping to destination */
    411                     dest.append(mappingData,index,length);
    412                     continue;
    413 
    414                 }else{
    415                     ch -= val.value;
    416                 }
    417             }else if(val.type == DELETE){
    418                 // just consume the codepoint and contine
    419                 continue;
    420             }
    421             //copy the source into destination
    422             UTF16.append(dest,ch);
    423         }
    424 
    425         return dest;
    426     }
    427 
    428 
    429     private StringBuffer normalize(StringBuffer src){
    430         return new StringBuffer(
    431             Normalizer.normalize(
    432                 src.toString(),
    433                 Normalizer.NFKC,
    434                 Normalizer.UNICODE_3_2));
    435     }
    436     /*
    437     boolean isLabelSeparator(int ch){
    438         int result = getCodePointValue(ch);
    439         if( (result & 0x07)  == LABEL_SEPARATOR){
    440             return true;
    441         }
    442         return false;
    443     }
    444     */
    445      /*
    446        1) Map -- For each character in the input, check if it has a mapping
    447           and, if so, replace it with its mapping.
    448 
    449        2) Normalize -- Possibly normalize the result of step 1 using Unicode
    450           normalization.
    451 
    452        3) Prohibit -- Check for any characters that are not allowed in the
    453           output.  If any are found, return an error.
    454 
    455        4) Check bidi -- Possibly check for right-to-left characters, and if
    456           any are found, make sure that the whole string satisfies the
    457           requirements for bidirectional strings.  If the string does not
    458           satisfy the requirements for bidirectional strings, return an
    459           error.
    460           [Unicode3.2] defines several bidirectional categories; each character
    461            has one bidirectional category assigned to it.  For the purposes of
    462            the requirements below, an "RandALCat character" is a character that
    463            has Unicode bidirectional categories "R" or "AL"; an "LCat character"
    464            is a character that has Unicode bidirectional category "L".  Note
    465 
    466 
    467            that there are many characters which fall in neither of the above
    468            definitions; Latin digits (<U+0030> through <U+0039>) are examples of
    469            this because they have bidirectional category "EN".
    470 
    471            In any profile that specifies bidirectional character handling, all
    472            three of the following requirements MUST be met:
    473 
    474            1) The characters in section 5.8 MUST be prohibited.
    475 
    476            2) If a string contains any RandALCat character, the string MUST NOT
    477               contain any LCat character.
    478 
    479            3) If a string contains any RandALCat character, a RandALCat
    480               character MUST be the first character of the string, and a
    481               RandALCat character MUST be the last character of the string.
    482     */
    483     /**
    484      * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC),
    485      * checks for prohibited and BiDi characters in the order defined by RFC 3454
    486      * depending on the options specified in the profile.
    487      *
    488      * @param src           A UCharacterIterator object containing the source string
    489      * @param options       A bit set of options:
    490      *   <ul>
    491      *     <li>{@link #DEFAULT} Prohibit processing of unassigned code points in the input</li>
    492      *     <li>{@link #ALLOW_UNASSIGNED} Treat the unassigned code points are in the input
    493      *          as normal Unicode code points.</li>
    494      *   </ul>
    495      * @return StringBuffer A StringBuffer containing the output
    496      * @throws StringPrepParseException An exception occurs when parsing a string is invalid.
    497      */
    498     public StringBuffer prepare(UCharacterIterator src, int options)
    499                         throws StringPrepParseException{
    500 
    501         // map
    502         StringBuffer mapOut = map(src,options);
    503         StringBuffer normOut = mapOut;// initialize
    504 
    505         if(doNFKC){
    506             // normalize
    507             normOut = normalize(mapOut);
    508         }
    509 
    510         int ch;
    511         char result;
    512         UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
    513         Values val = new Values();
    514         int direction=UCharacterDirection.CHAR_DIRECTION_COUNT,
    515             firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT;
    516         int rtlPos=-1, ltrPos=-1;
    517         boolean rightToLeft=false, leftToRight=false;
    518 
    519         while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
    520             result = getCodePointValue(ch);
    521             getValues(result,val);
    522 
    523             if(val.type == PROHIBITED ){
    524                 throw new StringPrepParseException("A prohibited code point was found in the input",
    525                                          StringPrepParseException.PROHIBITED_ERROR,iter.getText(),val.value);
    526             }
    527 
    528             if(checkBiDi) {
    529                 direction = bdp.getClass(ch);
    530                 if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){
    531                     firstCharDir = direction;
    532                 }
    533                 if(direction == UCharacterDirection.LEFT_TO_RIGHT){
    534                     leftToRight = true;
    535                     ltrPos = iter.getIndex()-1;
    536                 }
    537                 if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){
    538                     rightToLeft = true;
    539                     rtlPos = iter.getIndex()-1;
    540                 }
    541             }
    542         }
    543         if(checkBiDi == true){
    544             // satisfy 2
    545             if( leftToRight == true && rightToLeft == true){
    546                 throw new StringPrepParseException("The input does not conform to the rules for BiDi code points.",
    547                                          StringPrepParseException.CHECK_BIDI_ERROR,iter.getText(),
    548                                          (rtlPos>ltrPos) ? rtlPos : ltrPos);
    549              }
    550 
    551             //satisfy 3
    552             if( rightToLeft == true &&
    553                 !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) &&
    554                 (direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))
    555               ){
    556                 throw new StringPrepParseException("The input does not conform to the rules for BiDi code points.",
    557                                          StringPrepParseException.CHECK_BIDI_ERROR,iter.getText(),
    558                                          (rtlPos>ltrPos) ? rtlPos : ltrPos);
    559             }
    560         }
    561         return normOut;
    562 
    563       }
    564 
    565     /**
    566      * Prepare the input String for use in applications with the given profile. This operation maps, normalizes(NFKC),
    567      * checks for prohibited and BiDi characters in the order defined by RFC 3454
    568      * depending on the options specified in the profile.
    569      *
    570      * @param src           A string
    571      * @param options       A bit set of options:
    572      *   <ul>
    573      *     <li>{@link #DEFAULT} Prohibit processing of unassigned code points in the input</li>
    574      *     <li>{@link #ALLOW_UNASSIGNED} Treat the unassigned code points are in the input
    575      *          as normal Unicode code points.</li>
    576      *   </ul>
    577      * @return String A String containing the output
    578      * @throws StringPrepParseException An exception when parsing or preparing a string is invalid.
    579      */
    580     public String prepare(String src, int options)
    581         throws StringPrepParseException{
    582         StringBuffer result = prepare(UCharacterIterator.getInstance(src), options);
    583         return result.toString();
    584     }
    585 }
    586