Home | History | Annotate | Download | only in impl
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5  *******************************************************************************
      6  * Copyright (C) 1996-2016, International Business Machines Corporation and
      7  * others. All Rights Reserved.
      8  *******************************************************************************
      9  */
     10 
     11 package android.icu.impl;
     12 
     13 import java.io.IOException;
     14 import java.nio.ByteBuffer;
     15 import java.util.Iterator;
     16 import java.util.MissingResourceException;
     17 
     18 import android.icu.lang.UCharacter;
     19 import android.icu.lang.UCharacter.HangulSyllableType;
     20 import android.icu.lang.UCharacter.NumericType;
     21 import android.icu.lang.UCharacterCategory;
     22 import android.icu.lang.UProperty;
     23 import android.icu.lang.UScript;
     24 import android.icu.text.Normalizer2;
     25 import android.icu.text.UTF16;
     26 import android.icu.text.UnicodeSet;
     27 import android.icu.util.ICUException;
     28 import android.icu.util.VersionInfo;
     29 
     30 /**
     31 * <p>Internal class used for Unicode character property database.</p>
      32 * <p>This class stores binary data read from uprops.icu.
     33 * It does not have the capability to parse the data into more high-level
     34 * information. It only returns bytes of information when required.</p>
     35 * <p>Due to the form most commonly used for retrieval, array of char is used
     36 * to store the binary data.</p>
     37 * <p>UCharacterPropertyDB also contains information on accessing indexes to
     38 * significant points in the binary data.</p>
      39 * <p>Responsibility for molding the binary data into a more meaningful form lies with
     40 * <a href=UCharacter.html>UCharacter</a>.</p>
     41 * @author Syn Wee Quek
     42 * @hide Only a subset of ICU is exposed in Android
     43 */
     44 
     45 public final class UCharacterProperty
     46 {
     47     // public data members -----------------------------------------------
     48 
    /*
     * Public singleton instance. It is declared here without an initializer,
     * so it is assigned elsewhere in the class (outside this section).
     */
    public static final UCharacterProperty INSTANCE;

    /**
    * Main properties trie; {@link #getProperty(int)} returns its value for a
    * code point.
    */
    public Trie2_16 m_trie_;
    /**
    * Unicode version
    */
    public VersionInfo m_unicodeVersion_;
    /**
    * Latin capital letter i with dot above (U+0130)
    */
    public static final char LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_ = 0x130;
    /**
    * Latin small letter dotless i (U+0131)
    */
    public static final char LATIN_SMALL_LETTER_DOTLESS_I_ = 0x131;
    /**
    * Latin lowercase i (U+0069)
    */
    public static final char LATIN_SMALL_LETTER_I_ = 0x69;
    /**
    * Character type mask: ANDed with the main property value to obtain the
    * general category (see {@link #getType(int)}).
    */
    public static final int TYPE_MASK = 0x1F;
     78 
    // uprops.h enum UPropertySource --------------------------------------- ***
    // These constants identify the data source backing a property; they are
    // the values returned by getSource(int) and by the getSource() methods of
    // the BinaryProperty/IntProperty table entries.

    /** No source, not a supported property. */
    public static final int SRC_NONE=0;
    /** From uchar.c/uprops.icu main trie */
    public static final int SRC_CHAR=1;
    /** From uchar.c/uprops.icu properties vectors trie */
    public static final int SRC_PROPSVEC=2;
    /** From unames.c/unames.icu */
    public static final int SRC_NAMES=3;
    /** From ucase.c/ucase.icu */
    public static final int SRC_CASE=4;
    /** From ubidi_props.c/ubidi.icu */
    public static final int SRC_BIDI=5;
    /** From uchar.c/uprops.icu main trie as well as properties vectors trie */
    public static final int SRC_CHAR_AND_PROPSVEC=6;
    /** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */
    public static final int SRC_CASE_AND_NORM=7;
    /** From normalizer2impl.cpp/nfc.nrm */
    public static final int SRC_NFC=8;
    /** From normalizer2impl.cpp/nfkc.nrm */
    public static final int SRC_NFKC=9;
    /** From normalizer2impl.cpp/nfkc_cf.nrm */
    public static final int SRC_NFKC_CF=10;
    /** From normalizer2impl.cpp/nfc.nrm canonical iterator data */
    public static final int SRC_NFC_CANON_ITER=11;
    /** One more than the highest UPropertySource (SRC_) constant. */
    public static final int SRC_COUNT=12;
    107 
    108     // public methods ----------------------------------------------------
    109 
    110     /**
    111     * Gets the main property value for code point ch.
    112     * @param ch code point whose property value is to be retrieved
    113     * @return property value of code point
    114     */
    115     public final int getProperty(int ch)
    116     {
    117         return m_trie_.get(ch);
    118     }
    119 
    120     /**
    121      * Gets the unicode additional properties.
    122      * Java version of C u_getUnicodeProperties().
    123      * @param codepoint codepoint whose additional properties is to be
    124      *                  retrieved
    125      * @param column The column index.
    126      * @return unicode properties
    127      */
    128     public int getAdditional(int codepoint, int column) {
    129         assert column >= 0;
    130         if (column >= m_additionalColumnsCount_) {
    131             return 0;
    132         }
    133         return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column];
    134     }
    135 
    // Bitwise AND of TYPE_MASK with a set of one-bit-per-category flags for
    // the five letter categories (Lu, Ll, Lt, Lm, Lo).
    // NOTE(review): TYPE_MASK is applied elsewhere to a category *value*
    // (see getType), whereas the right-hand side is a set of category *bits*;
    // combining the two looks unintentional, and no use of this constant is
    // visible in this section of the file -- confirm before relying on it.
    static final int MY_MASK = UCharacterProperty.TYPE_MASK
        & ((1<<UCharacterCategory.UPPERCASE_LETTER) |
            (1<<UCharacterCategory.LOWERCASE_LETTER) |
            (1<<UCharacterCategory.TITLECASE_LETTER) |
            (1<<UCharacterCategory.MODIFIER_LETTER) |
            (1<<UCharacterCategory.OTHER_LETTER));
    142 
    143 
    144        /**
    145      * <p>Get the "age" of the code point.</p>
    146      * <p>The "age" is the Unicode version when the code point was first
    147      * designated (as a non-character or for Private Use) or assigned a
    148      * character.</p>
    149      * <p>This can be useful to avoid emitting code points to receiving
    150      * processes that do not accept newer characters.</p>
    151      * <p>The data is from the UCD file DerivedAge.txt.</p>
    152      * <p>This API does not check the validity of the codepoint.</p>
    153      * @param codepoint The code point.
    154      * @return the Unicode version number
    155      */
    156     public VersionInfo getAge(int codepoint)
    157     {
    158         int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
    159         return VersionInfo.getInstance(
    160                            (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
    161                            version & LAST_NIBBLE_MASK_, 0, 0);
    162     }
    163 
    // Per-category masks produced by getMask() (defined outside this section)
    // for the general categories that isgraphPOSIX() excludes. They are
    // combined with | and tested with & against getMask(type).
    private static final int GC_CN_MASK = getMask(UCharacter.UNASSIGNED);
    private static final int GC_CC_MASK = getMask(UCharacter.CONTROL);
    private static final int GC_CS_MASK = getMask(UCharacter.SURROGATE);
    private static final int GC_ZS_MASK = getMask(UCharacter.SPACE_SEPARATOR);
    private static final int GC_ZL_MASK = getMask(UCharacter.LINE_SEPARATOR);
    private static final int GC_ZP_MASK = getMask(UCharacter.PARAGRAPH_SEPARATOR);
    /** Mask constant for multiple UCharCategory bits (Z Separators). */
    private static final int GC_Z_MASK = GC_ZS_MASK|GC_ZL_MASK|GC_ZP_MASK;
    172 
    173     /**
    174      * Checks if c is in
    175      * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
    176      * with space=\p{Whitespace} and Control=Cc.
    177      * Implements UCHAR_POSIX_GRAPH.
    178      * @hide draft / provisional / internal are hidden on Android
    179      */
    180     private static final boolean isgraphPOSIX(int c) {
    181         /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */
    182         /* comparing ==0 returns FALSE for the categories mentioned */
    183         return (getMask(UCharacter.getType(c))&
    184                 (GC_CC_MASK|GC_CS_MASK|GC_CN_MASK|GC_Z_MASK))
    185                ==0;
    186     }
    187 
    188     // binary properties --------------------------------------------------- ***
    189 
    190     private class BinaryProperty {
    191         int column;  // SRC_PROPSVEC column, or "source" if mask==0
    192         int mask;
    193         BinaryProperty(int column, int mask) {
    194             this.column=column;
    195             this.mask=mask;
    196         }
    197         BinaryProperty(int source) {
    198             this.column=source;
    199             this.mask=0;
    200         }
    201         final int getSource() {
    202             return mask==0 ? column : SRC_PROPSVEC;
    203         }
    204         boolean contains(int c) {
    205             // systematic, directly stored properties
    206             return (getAdditional(c, column)&mask)!=0;
    207         }
    208     }
    209 
    210     private class CaseBinaryProperty extends BinaryProperty {  // case mapping properties
    211         int which;
    212         CaseBinaryProperty(int which) {
    213             super(SRC_CASE);
    214             this.which=which;
    215         }
    216         @Override
    217         boolean contains(int c) {
    218             return UCaseProps.INSTANCE.hasBinaryProperty(c, which);
    219         }
    220     }
    221 
    222     private class NormInertBinaryProperty extends BinaryProperty {  // UCHAR_NF*_INERT properties
    223         int which;
    224         NormInertBinaryProperty(int source, int which) {
    225             super(source);
    226             this.which=which;
    227         }
    228         @Override
    229         boolean contains(int c) {
    230             return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_INERT).isInert(c);
    231         }
    232     }
    233 
    // Table of binary-property implementations. hasBinaryProperty(),
    // getIntPropertyValue() and getSource() index it directly with the
    // UProperty selector, so the order of the entries is significant.
    // Entries built with (column, mask) test a bit in the properties vectors;
    // the other entries name their data source and override contains().
    BinaryProperty[] binProps={
        /*
         * Binary-property implementations must be in order of corresponding UProperty,
         * and there must be exactly one entry per binary UProperty.
         */
        new BinaryProperty(1, (1<<ALPHABETIC_PROPERTY_)),
        new BinaryProperty(1, (1<<ASCII_HEX_DIGIT_PROPERTY_)),
        new BinaryProperty(SRC_BIDI) {  // UCHAR_BIDI_CONTROL
            @Override
            boolean contains(int c) {
                return UBiDiProps.INSTANCE.isBidiControl(c);
            }
        },
        new BinaryProperty(SRC_BIDI) {  // UCHAR_BIDI_MIRRORED
            @Override
            boolean contains(int c) {
                return UBiDiProps.INSTANCE.isMirrored(c);
            }
        },
        new BinaryProperty(1, (1<<DASH_PROPERTY_)),
        new BinaryProperty(1, (1<<DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_)),
        new BinaryProperty(1, (1<<DEPRECATED_PROPERTY_)),
        new BinaryProperty(1, (1<<DIACRITIC_PROPERTY_)),
        new BinaryProperty(1, (1<<EXTENDER_PROPERTY_)),
        new BinaryProperty(SRC_NFC) {  // UCHAR_FULL_COMPOSITION_EXCLUSION
            @Override
            boolean contains(int c) {
                // By definition, Full_Composition_Exclusion is the same as NFC_QC=No.
                Normalizer2Impl impl=Norm2AllModes.getNFCInstance().impl;
                return impl.isCompNo(impl.getNorm16(c));
            }
        },
        new BinaryProperty(1, (1<<GRAPHEME_BASE_PROPERTY_)),
        new BinaryProperty(1, (1<<GRAPHEME_EXTEND_PROPERTY_)),
        new BinaryProperty(1, (1<<GRAPHEME_LINK_PROPERTY_)),
        new BinaryProperty(1, (1<<HEX_DIGIT_PROPERTY_)),
        new BinaryProperty(1, (1<<HYPHEN_PROPERTY_)),
        new BinaryProperty(1, (1<<ID_CONTINUE_PROPERTY_)),
        new BinaryProperty(1, (1<<ID_START_PROPERTY_)),
        new BinaryProperty(1, (1<<IDEOGRAPHIC_PROPERTY_)),
        new BinaryProperty(1, (1<<IDS_BINARY_OPERATOR_PROPERTY_)),
        new BinaryProperty(1, (1<<IDS_TRINARY_OPERATOR_PROPERTY_)),
        new BinaryProperty(SRC_BIDI) {  // UCHAR_JOIN_CONTROL
            @Override
            boolean contains(int c) {
                return UBiDiProps.INSTANCE.isJoinControl(c);
            }
        },
        new BinaryProperty(1, (1<<LOGICAL_ORDER_EXCEPTION_PROPERTY_)),
        new CaseBinaryProperty(UProperty.LOWERCASE),
        new BinaryProperty(1, (1<<MATH_PROPERTY_)),
        new BinaryProperty(1, (1<<NONCHARACTER_CODE_POINT_PROPERTY_)),
        new BinaryProperty(1, (1<<QUOTATION_MARK_PROPERTY_)),
        new BinaryProperty(1, (1<<RADICAL_PROPERTY_)),
        new CaseBinaryProperty(UProperty.SOFT_DOTTED),
        new BinaryProperty(1, (1<<TERMINAL_PUNCTUATION_PROPERTY_)),
        new BinaryProperty(1, (1<<UNIFIED_IDEOGRAPH_PROPERTY_)),
        new CaseBinaryProperty(UProperty.UPPERCASE),
        new BinaryProperty(1, (1<<WHITE_SPACE_PROPERTY_)),
        new BinaryProperty(1, (1<<XID_CONTINUE_PROPERTY_)),
        new BinaryProperty(1, (1<<XID_START_PROPERTY_)),
        new CaseBinaryProperty(UProperty.CASE_SENSITIVE),
        new BinaryProperty(1, (1<<S_TERM_PROPERTY_)),
        new BinaryProperty(1, (1<<VARIATION_SELECTOR_PROPERTY_)),
        new NormInertBinaryProperty(SRC_NFC, UProperty.NFD_INERT),
        new NormInertBinaryProperty(SRC_NFKC, UProperty.NFKD_INERT),
        new NormInertBinaryProperty(SRC_NFC, UProperty.NFC_INERT),
        new NormInertBinaryProperty(SRC_NFKC, UProperty.NFKC_INERT),
        new BinaryProperty(SRC_NFC_CANON_ITER) {  // UCHAR_SEGMENT_STARTER
            @Override
            boolean contains(int c) {
                return Norm2AllModes.getNFCInstance().impl.
                    ensureCanonIterData().isCanonSegmentStarter(c);
            }
        },
        new BinaryProperty(1, (1<<PATTERN_SYNTAX)),
        new BinaryProperty(1, (1<<PATTERN_WHITE_SPACE)),
        new BinaryProperty(SRC_CHAR_AND_PROPSVEC) {  // UCHAR_POSIX_ALNUM
            @Override
            boolean contains(int c) {
                return UCharacter.isUAlphabetic(c) || UCharacter.isDigit(c);
            }
        },
        new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_BLANK
            @Override
            boolean contains(int c) {
                // "horizontal space"
                // Code points up to U+009F are decided by the literal test
                // below; only larger ones consult the general category.
                if(c<=0x9f) {
                    return c==9 || c==0x20; /* TAB or SPACE */
                } else {
                    /* Zs */
                    return UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR;
                }
            }
        },
        new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_GRAPH
            @Override
            boolean contains(int c) {
                return isgraphPOSIX(c);
            }
        },
        new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_PRINT
            @Override
            boolean contains(int c) {
                /*
                 * Checks if codepoint is in \p{graph}\p{blank} - \p{cntrl}.
                 *
                 * The only cntrl character in graph+blank is TAB (in blank).
                 * Here we implement (blank-TAB)=Zs instead of calling u_isblank().
                 */
                return (UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR) || isgraphPOSIX(c);
            }
        },
        new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_XDIGIT
            @Override
            boolean contains(int c) {
                /* check ASCII and Fullwidth ASCII a-fA-F */
                // First clause: 0x41..0x46 or 0x61..0x66.
                // Second clause: 0xff21..0xff26 or 0xff41..0xff46.
                if(
                    (c<=0x66 && c>=0x41 && (c<=0x46 || c>=0x61)) ||
                    (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41))
                ) {
                    return true;
                }
                // Everything else qualifies only as a decimal digit.
                return UCharacter.getType(c)==UCharacter.DECIMAL_DIGIT_NUMBER;
            }
        },
        new CaseBinaryProperty(UProperty.CASED),
        new CaseBinaryProperty(UProperty.CASE_IGNORABLE),
        new CaseBinaryProperty(UProperty.CHANGES_WHEN_LOWERCASED),
        new CaseBinaryProperty(UProperty.CHANGES_WHEN_UPPERCASED),
        new CaseBinaryProperty(UProperty.CHANGES_WHEN_TITLECASED),
        new BinaryProperty(SRC_CASE_AND_NORM) {  // UCHAR_CHANGES_WHEN_CASEFOLDED
            @Override
            boolean contains(int c) {
                String nfd=Norm2AllModes.getNFCInstance().impl.getDecomposition(c);
                if(nfd!=null) {
                    /* c has a decomposition */
                    c=nfd.codePointAt(0);
                    if(Character.charCount(c)!=nfd.length()) {
                        /* multiple code points */
                        c=-1;
                    }
                } else if(c<0) {
                    return false;  /* protect against bad input */
                }
                if(c>=0) {
                    /* single code point */
                    UCaseProps csp=UCaseProps.INSTANCE;
                    // NOTE(review): dummyStringBuilder looks like a shared
                    // static buffer on UCaseProps; if so, concurrent callers
                    // would race on it -- confirm against UCaseProps.
                    UCaseProps.dummyStringBuilder.setLength(0);
                    return csp.toFullFolding(c, UCaseProps.dummyStringBuilder,
                                             UCharacter.FOLD_CASE_DEFAULT)>=0;
                } else {
                    // Decomposition has several code points: fold the whole
                    // string and report whether folding changed it.
                    String folded=UCharacter.foldCase(nfd, true);
                    return !folded.equals(nfd);
                }
            }
        },
        new CaseBinaryProperty(UProperty.CHANGES_WHEN_CASEMAPPED),
        new BinaryProperty(SRC_NFKC_CF) {  // UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
            @Override
            boolean contains(int c) {
                Normalizer2Impl kcf=Norm2AllModes.getNFKC_CFInstance().impl;
                String src=UTF16.valueOf(c);
                StringBuilder dest=new StringBuilder();
                // Small destCapacity for NFKC_CF(c).
                Normalizer2Impl.ReorderingBuffer buffer=new Normalizer2Impl.ReorderingBuffer(kcf, dest, 5);
                kcf.compose(src, 0, src.length(), false, true, buffer);
                // The property holds when the composed output differs from c itself.
                return !Normalizer2Impl.UTF16Plus.equal(dest, src);
            }
        },
        new BinaryProperty(2, 1<<PROPS_2_EMOJI),
        new BinaryProperty(2, 1<<PROPS_2_EMOJI_PRESENTATION),
        new BinaryProperty(2, 1<<PROPS_2_EMOJI_MODIFIER),
        new BinaryProperty(2, 1<<PROPS_2_EMOJI_MODIFIER_BASE),
    };
    409 
    410     public boolean hasBinaryProperty(int c, int which) {
    411          if(which<UProperty.BINARY_START || UProperty.BINARY_LIMIT<=which) {
    412             // not a known binary property
    413             return false;
    414         } else {
    415             return binProps[which].contains(c);
    416         }
    417     }
    418 
    419     // int-value and enumerated properties --------------------------------- ***
    420 
    421     public int getType(int c) {
    422         return getProperty(c)&TYPE_MASK;
    423     }
    424 
    425     /*
    426      * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
    427      * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
    428      */
    // Indexed by the Grapheme_Cluster_Break value extracted in the
    // HANGUL_SYLLABLE_TYPE entry of intProps; each slot holds the
    // HangulSyllableType constant for that GCB value.
    private static final int /* UHangulSyllableType */ gcbToHst[]={
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_OTHER */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CONTROL */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CR */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_EXTEND */
        HangulSyllableType.LEADING_JAMO,     /* U_GCB_L */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_LF */
        HangulSyllableType.LV_SYLLABLE,      /* U_GCB_LV */
        HangulSyllableType.LVT_SYLLABLE,     /* U_GCB_LVT */
        HangulSyllableType.TRAILING_JAMO,    /* U_GCB_T */
        HangulSyllableType.VOWEL_JAMO        /* U_GCB_V */
        /*
         * Omit GCB values beyond what we need for hst.
         * The HANGUL_SYLLABLE_TYPE lookup compares the GCB value against
         * gcbToHst.length and returns NOT_APPLICABLE for anything past the end.
         */
    };
    445 
    446     private class IntProperty {
    447         int column;  // SRC_PROPSVEC column, or "source" if mask==0
    448         int mask;
    449         int shift;
    450         IntProperty(int column, int mask, int shift) {
    451             this.column=column;
    452             this.mask=mask;
    453             this.shift=shift;
    454         }
    455         IntProperty(int source) {
    456             this.column=source;
    457             this.mask=0;
    458         }
    459         final int getSource() {
    460             return mask==0 ? column : SRC_PROPSVEC;
    461         }
    462         int getValue(int c) {
    463             // systematic, directly stored properties
    464             return (getAdditional(c, column)&mask)>>>shift;
    465         }
    466         int getMaxValue(int which) {
    467             return (getMaxValues(column)&mask)>>>shift;
    468         }
    469     }
    470 
    471     private class BiDiIntProperty extends IntProperty {
    472         BiDiIntProperty() {
    473             super(SRC_BIDI);
    474         }
    475         @Override
    476         int getMaxValue(int which) {
    477             return UBiDiProps.INSTANCE.getMaxValue(which);
    478         }
    479     }
    480 
    481     private class CombiningClassIntProperty extends IntProperty {
    482         CombiningClassIntProperty(int source) {
    483             super(source);
    484         }
    485         @Override
    486         int getMaxValue(int which) {
    487             return 0xff;
    488         }
    489     }
    490 
    491     private class NormQuickCheckIntProperty extends IntProperty {  // UCHAR_NF*_QUICK_CHECK properties
    492         int which;
    493         int max;
    494         NormQuickCheckIntProperty(int source, int which, int max) {
    495             super(source);
    496             this.which=which;
    497             this.max=max;
    498         }
    499         @Override
    500         int getValue(int c) {
    501             return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_QUICK_CHECK).getQuickCheck(c);
    502         }
    503         @Override
    504         int getMaxValue(int which) {
    505             return max;
    506         }
    507     }
    508 
    // Table of int-property implementations. getIntPropertyValue(),
    // getIntPropertyMaxValue() and getSource() index it with
    // (which - UProperty.INT_START), so the order of the entries is
    // significant. Entries built with (column, mask, shift) read a bit field
    // from the properties vectors; the others name their data source and
    // override getValue() and/or getMaxValue().
    IntProperty intProps[]={
        new BiDiIntProperty() {  // BIDI_CLASS
            @Override
            int getValue(int c) {
                return UBiDiProps.INSTANCE.getClass(c);
            }
        },
        new IntProperty(0, BLOCK_MASK_, BLOCK_SHIFT_),  // block field of column 0
        new CombiningClassIntProperty(SRC_NFC) {  // CANONICAL_COMBINING_CLASS
            @Override
            int getValue(int c) {
                return Normalizer2.getNFDInstance().getCombiningClass(c);
            }
        },
        new IntProperty(2, DECOMPOSITION_TYPE_MASK_, 0),  // decomposition-type field of column 2
        new IntProperty(0, EAST_ASIAN_MASK_, EAST_ASIAN_SHIFT_),  // East Asian field of column 0
        new IntProperty(SRC_CHAR) {  // GENERAL_CATEGORY
            @Override
            int getValue(int c) {
                return getType(c);
            }
            @Override
            int getMaxValue(int which) {
                return UCharacterCategory.CHAR_CATEGORY_COUNT-1;
            }
        },
        new BiDiIntProperty() {  // JOINING_GROUP
            @Override
            int getValue(int c) {
                return UBiDiProps.INSTANCE.getJoiningGroup(c);
            }
        },
        new BiDiIntProperty() {  // JOINING_TYPE
            @Override
            int getValue(int c) {
                return UBiDiProps.INSTANCE.getJoiningType(c);
            }
        },
        new IntProperty(2, LB_MASK, LB_SHIFT),  // LINE_BREAK
        new IntProperty(SRC_CHAR) {  // NUMERIC_TYPE
            @Override
            int getValue(int c) {
                return ntvGetType(getNumericTypeValue(getProperty(c)));
            }
            @Override
            int getMaxValue(int which) {
                return NumericType.COUNT-1;
            }
        },
        // Script: the value comes from UScript.getScript(); the (column, mask,
        // shift) arguments are still used by the inherited getMaxValue().
        new IntProperty(0, SCRIPT_MASK_, 0) {
            @Override
            int getValue(int c) {
                return UScript.getScript(c);
            }
        },
        new IntProperty(SRC_PROPSVEC) {  // HANGUL_SYLLABLE_TYPE
            @Override
            int getValue(int c) {
                /* see comments on gcbToHst[] above */
                int gcb=(getAdditional(c, 2)&GCB_MASK)>>>GCB_SHIFT;
                if(gcb<gcbToHst.length) {
                    return gcbToHst[gcb];
                } else {
                    return HangulSyllableType.NOT_APPLICABLE;
                }
            }
            @Override
            int getMaxValue(int which) {
                return HangulSyllableType.COUNT-1;
            }
        },
        // max=1=YES -- these are never "maybe", only "no" or "yes"
        new NormQuickCheckIntProperty(SRC_NFC, UProperty.NFD_QUICK_CHECK, 1),
        new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKD_QUICK_CHECK, 1),
        // max=2=MAYBE
        new NormQuickCheckIntProperty(SRC_NFC, UProperty.NFC_QUICK_CHECK, 2),
        new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKC_QUICK_CHECK, 2),
        new CombiningClassIntProperty(SRC_NFC) {  // LEAD_CANONICAL_COMBINING_CLASS
            @Override
            int getValue(int c) {
                // Bits above the low byte of the FCD16 value.
                return Norm2AllModes.getNFCInstance().impl.getFCD16(c)>>8;
            }
        },
        new CombiningClassIntProperty(SRC_NFC) {  // TRAIL_CANONICAL_COMBINING_CLASS
            @Override
            int getValue(int c) {
                // Low byte of the FCD16 value.
                return Norm2AllModes.getNFCInstance().impl.getFCD16(c)&0xff;
            }
        },
        new IntProperty(2, GCB_MASK, GCB_SHIFT),  // GRAPHEME_CLUSTER_BREAK
        new IntProperty(2, SB_MASK, SB_SHIFT),  // SENTENCE_BREAK
        new IntProperty(2, WB_MASK, WB_SHIFT),  // WORD_BREAK
        new BiDiIntProperty() {  // BIDI_PAIRED_BRACKET_TYPE
            @Override
            int getValue(int c) {
                return UBiDiProps.INSTANCE.getPairedBracketType(c);
            }
        },
    };
    608 
    609     public int getIntPropertyValue(int c, int which) {
    610         if(which<UProperty.INT_START) {
    611             if(UProperty.BINARY_START<=which && which<UProperty.BINARY_LIMIT) {
    612                 return binProps[which].contains(c) ? 1 : 0;
    613             }
    614         } else if(which<UProperty.INT_LIMIT) {
    615             return intProps[which-UProperty.INT_START].getValue(c);
    616         } else if (which == UProperty.GENERAL_CATEGORY_MASK) {
    617             return getMask(getType(c));
    618         }
    619         return 0; // undefined
    620     }
    621 
    622     public int getIntPropertyMaxValue(int which) {
    623         if(which<UProperty.INT_START) {
    624             if(UProperty.BINARY_START<=which && which<UProperty.BINARY_LIMIT) {
    625                 return 1;  // maximum TRUE for all binary properties
    626             }
    627         } else if(which<UProperty.INT_LIMIT) {
    628             return intProps[which-UProperty.INT_START].getMaxValue(which);
    629         }
    630         return -1; // undefined
    631     }
    632 
    633     public final int getSource(int which) {
    634         if(which<UProperty.BINARY_START) {
    635             return SRC_NONE; /* undefined */
    636         } else if(which<UProperty.BINARY_LIMIT) {
    637             return binProps[which].getSource();
    638         } else if(which<UProperty.INT_START) {
    639             return SRC_NONE; /* undefined */
    640         } else if(which<UProperty.INT_LIMIT) {
    641             return intProps[which-UProperty.INT_START].getSource();
    642         } else if(which<UProperty.STRING_START) {
    643             switch(which) {
    644             case UProperty.GENERAL_CATEGORY_MASK:
    645             case UProperty.NUMERIC_VALUE:
    646                 return SRC_CHAR;
    647 
    648             default:
    649                 return SRC_NONE;
    650             }
    651         } else if(which<UProperty.STRING_LIMIT) {
    652             switch(which) {
    653             case UProperty.AGE:
    654                 return SRC_PROPSVEC;
    655 
    656             case UProperty.BIDI_MIRRORING_GLYPH:
    657                 return SRC_BIDI;
    658 
    659             case UProperty.CASE_FOLDING:
    660             case UProperty.LOWERCASE_MAPPING:
    661             case UProperty.SIMPLE_CASE_FOLDING:
    662             case UProperty.SIMPLE_LOWERCASE_MAPPING:
    663             case UProperty.SIMPLE_TITLECASE_MAPPING:
    664             case UProperty.SIMPLE_UPPERCASE_MAPPING:
    665             case UProperty.TITLECASE_MAPPING:
    666             case UProperty.UPPERCASE_MAPPING:
    667                 return SRC_CASE;
    668 
    669             case UProperty.ISO_COMMENT:
    670             case UProperty.NAME:
    671             case UProperty.UNICODE_1_NAME:
    672                 return SRC_NAMES;
    673 
    674             default:
    675                 return SRC_NONE;
    676             }
    677         } else {
    678             switch(which) {
    679             case UProperty.SCRIPT_EXTENSIONS:
    680                 return SRC_PROPSVEC;
    681             default:
    682                 return SRC_NONE; /* undefined */
    683             }
    684         }
    685     }
    686 
    687     /**
    688      * <p>
    689      * Unicode property names and property value names are compared
    690      * "loosely". Property[Value]Aliases.txt say:
    691      * <quote>
    692      *   "With loose matching of property names, the case distinctions,
    693      *    whitespace, and '_' are ignored."
    694      * </quote>
    695      * </p>
    696      * <p>
    697      * This function does just that, for ASCII (char *) name strings.
    698      * It is almost identical to ucnv_compareNames() but also ignores
    699      * ASCII White_Space characters (U+0009..U+000d).
    700      * </p>
    701      * @param name1 name to compare
    702      * @param name2 name to compare
    703      * @return 0 if names are equal, < 0 if name1 is less than name2 and > 0
    704      *         if name1 is greater than name2.
    705      */
    706     /* to be implemented in 2.4
    707      * public static int comparePropertyNames(String name1, String name2)
    708     {
    709         int result = 0;
    710         int i1 = 0;
    711         int i2 = 0;
    712         while (true) {
    713             char ch1 = 0;
    714             char ch2 = 0;
    715             // Ignore delimiters '-', '_', and ASCII White_Space
    716             if (i1 < name1.length()) {
    717                 ch1 = name1.charAt(i1 ++);
    718             }
    719             while (ch1 == '-' || ch1 == '_' || ch1 == ' ' || ch1 == '\t'
    720                    || ch1 == '\n' // synwee what is || ch1 == '\v'
    721                    || ch1 == '\f' || ch1=='\r') {
    722                 if (i1 < name1.length()) {
    723                     ch1 = name1.charAt(i1 ++);
    724                 }
    725                 else {
    726                     ch1 = 0;
    727                 }
    728             }
    729             if (i2 < name2.length()) {
    730                 ch2 = name2.charAt(i2 ++);
    731             }
    732             while (ch2 == '-' || ch2 == '_' || ch2 == ' ' || ch2 == '\t'
    733                    || ch2 == '\n' // synwee what is || ch1 == '\v'
    734                    || ch2 == '\f' || ch2=='\r') {
    735                 if (i2 < name2.length()) {
    736                     ch2 = name2.charAt(i2 ++);
    737                 }
    738                 else {
    739                     ch2 = 0;
    740                 }
    741             }
    742 
    743             // If we reach the ends of both strings then they match
    744             if (ch1 == 0 && ch2 == 0) {
    745                 return 0;
    746             }
    747 
    748             // Case-insensitive comparison
    749             if (ch1 != ch2) {
    750                 result = Character.toLowerCase(ch1)
    751                                                 - Character.toLowerCase(ch2);
    752                 if (result != 0) {
    753                     return result;
    754                 }
    755             }
    756         }
    757     }
    758     */
    759 
    760     /**
    761      * Get the the maximum values for some enum/int properties.
    762      * @return maximum values for the integer properties.
    763      */
    764     public int getMaxValues(int column)
    765     {
    766        // return m_maxBlockScriptValue_;
    767 
    768         switch(column) {
    769         case 0:
    770             return m_maxBlockScriptValue_;
    771         case 2:
    772             return m_maxJTGValue_;
    773         default:
    774             return 0;
    775         }
    776     }
    777 
    778     /**
    779      * Gets the type mask
    780      * @param type character type
    781      * @return mask
    782      */
    783     public static final int getMask(int type)
    784     {
    785         return 1 << type;
    786     }
    787 
    788 
    789     /**
    790      * Returns the digit values of characters like 'A' - 'Z', normal,
    791      * half-width and full-width. This method assumes that the other digit
    792      * characters are checked by the calling method.
    793      * @param ch character to test
    794      * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
    795      *         its corresponding digit will be returned.
    796      */
    797     public static int getEuropeanDigit(int ch) {
    798         if ((ch > 0x7a && ch < 0xff21)
    799             || ch < 0x41 || (ch > 0x5a && ch < 0x61)
    800             || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
    801             return -1;
    802         }
    803         if (ch <= 0x7a) {
    804             // ch >= 0x41 or ch < 0x61
    805             return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
    806         }
    807         // ch >= 0xff21
    808         if (ch <= 0xff3a) {
    809             return ch + 10 - 0xff21;
    810         }
    811         // ch >= 0xff41 && ch <= 0xff5a
    812         return ch + 10 - 0xff41;
    813     }
    814 
    815     public int digit(int c) {
    816         int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_;
    817         if(value<=9) {
    818             return value;
    819         } else {
    820             return -1;
    821         }
    822     }
    823 
    824     public int getNumericValue(int c) {
    825         // slightly pruned version of getUnicodeNumericValue(), plus getEuropeanDigit()
    826         int ntv = getNumericTypeValue(getProperty(c));
    827 
    828         if(ntv==NTV_NONE_) {
    829             return getEuropeanDigit(c);
    830         } else if(ntv<NTV_DIGIT_START_) {
    831             /* decimal digit */
    832             return ntv-NTV_DECIMAL_START_;
    833         } else if(ntv<NTV_NUMERIC_START_) {
    834             /* other digit */
    835             return ntv-NTV_DIGIT_START_;
    836         } else if(ntv<NTV_FRACTION_START_) {
    837             /* small integer */
    838             return ntv-NTV_NUMERIC_START_;
    839         } else if(ntv<NTV_LARGE_START_) {
    840             /* fraction */
    841             return -2;
    842         } else if(ntv<NTV_BASE60_START_) {
    843             /* large, single-significant-digit integer */
    844             int mant=(ntv>>5)-14;
    845             int exp=(ntv&0x1f)+2;
    846             if(exp<9 || (exp==9 && mant<=2)) {
    847                 int numValue=mant;
    848                 do {
    849                     numValue*=10;
    850                 } while(--exp>0);
    851                 return numValue;
    852             } else {
    853                 return -2;
    854             }
    855         } else if(ntv<NTV_FRACTION20_START_) {
    856             /* sexagesimal (base 60) integer */
    857             int numValue=(ntv>>2)-0xbf;
    858             int exp=(ntv&3)+1;
    859 
    860             switch(exp) {
    861             case 4:
    862                 numValue*=60*60*60*60;
    863                 break;
    864             case 3:
    865                 numValue*=60*60*60;
    866                 break;
    867             case 2:
    868                 numValue*=60*60;
    869                 break;
    870             case 1:
    871                 numValue*=60;
    872                 break;
    873             case 0:
    874             default:
    875                 break;
    876             }
    877 
    878             return numValue;
    879         } else if(ntv<NTV_RESERVED_START_) {
    880             // fraction-20 e.g. 3/80
    881             return -2;
    882         } else {
    883             /* reserved */
    884             return -2;
    885         }
    886     }
    887 
    888     public double getUnicodeNumericValue(int c) {
    889         // equivalent to c version double u_getNumericValue(UChar32 c)
    890         int ntv = getNumericTypeValue(getProperty(c));
    891 
    892         if(ntv==NTV_NONE_) {
    893             return UCharacter.NO_NUMERIC_VALUE;
    894         } else if(ntv<NTV_DIGIT_START_) {
    895             /* decimal digit */
    896             return ntv-NTV_DECIMAL_START_;
    897         } else if(ntv<NTV_NUMERIC_START_) {
    898             /* other digit */
    899             return ntv-NTV_DIGIT_START_;
    900         } else if(ntv<NTV_FRACTION_START_) {
    901             /* small integer */
    902             return ntv-NTV_NUMERIC_START_;
    903         } else if(ntv<NTV_LARGE_START_) {
    904             /* fraction */
    905             int numerator=(ntv>>4)-12;
    906             int denominator=(ntv&0xf)+1;
    907             return (double)numerator/denominator;
    908         } else if(ntv<NTV_BASE60_START_) {
    909             /* large, single-significant-digit integer */
    910             double numValue;
    911             int mant=(ntv>>5)-14;
    912             int exp=(ntv&0x1f)+2;
    913             numValue=mant;
    914 
    915             /* multiply by 10^exp without math.h */
    916             while(exp>=4) {
    917                 numValue*=10000.;
    918                 exp-=4;
    919             }
    920             switch(exp) {
    921             case 3:
    922                 numValue*=1000.;
    923                 break;
    924             case 2:
    925                 numValue*=100.;
    926                 break;
    927             case 1:
    928                 numValue*=10.;
    929                 break;
    930             case 0:
    931             default:
    932                 break;
    933             }
    934 
    935             return numValue;
    936         } else if(ntv<NTV_FRACTION20_START_) {
    937             /* sexagesimal (base 60) integer */
    938             int numValue=(ntv>>2)-0xbf;
    939             int exp=(ntv&3)+1;
    940 
    941             switch(exp) {
    942             case 4:
    943                 numValue*=60*60*60*60;
    944                 break;
    945             case 3:
    946                 numValue*=60*60*60;
    947                 break;
    948             case 2:
    949                 numValue*=60*60;
    950                 break;
    951             case 1:
    952                 numValue*=60;
    953                 break;
    954             case 0:
    955             default:
    956                 break;
    957             }
    958 
    959             return numValue;
    960         } else if(ntv<NTV_RESERVED_START_) {
    961             // fraction-20 e.g. 3/80
    962             int frac20=ntv-NTV_FRACTION20_START_;  // 0..0x17
    963             int numerator=2*(frac20&3)+1;
    964             int denominator=20<<(frac20>>2);
    965             return (double)numerator/denominator;
    966         } else {
    967             /* reserved */
    968             return UCharacter.NO_NUMERIC_VALUE;
    969         }
    970     }
    971 
    972     // protected variables -----------------------------------------------
    973 
    /**
     * Extra property trie.
     * Only created by the constructor when m_additionalColumnsCount_ > 0;
     * otherwise it is left unassigned.
     */
    Trie2_16 m_additionalTrie_;
    /**
     * Extra property vectors, 1st column for age and second for binary
     * properties.
     * Only read by the constructor when m_additionalColumnsCount_ > 0.
     */
    int m_additionalVectors_[];
    /**
     * Number of additional columns, read from the data file header.
     */
    int m_additionalColumnsCount_;
    /**
     * Maximum values for block, bits used as in vector word
     * 0.
     * Read from the data file header; returned by getMaxValues(0).
     */
    int m_maxBlockScriptValue_;
    /**
     * Maximum values, read from the data file header; returned by
     * getMaxValues(2).
     * NOTE(review): the previous comment here said "script ... vector word 0",
     * which looks copied from m_maxBlockScriptValue_; the name and the column
     * number suggest values from vector word 2 - confirm.
     */
     int m_maxJTGValue_;

    /**
     * Script_Extensions data.
     * Only assigned by the constructor when the data file contains a
     * non-empty Script_Extensions section.
     */
    public char[] m_scriptExtensions_;
   1002 
   1003     // private variables -------------------------------------------------
   1004 
   1005     /**
   1006     * Default name of the datafile
   1007     */
   1008     private static final String DATA_FILE_NAME_ = "uprops.icu";
   1009 
   1010     // property data constants -------------------------------------------------
   1011 
   1012     /**
   1013      * Numeric types and values in the main properties words.
   1014      */
   1015     private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;
   1016     private static final int getNumericTypeValue(int props) {
   1017         return props >> NUMERIC_TYPE_VALUE_SHIFT_;
   1018     }
    /*
     * Constants for the storage form of numeric types and values.
     * Each *_START_ constant is the first ntv of its range; a range ends just
     * before the next *_START_ constant. The decoders (getNumericValue,
     * getUnicodeNumericValue, ntvGetType) test the ranges in ascending order.
     */
    /** No numeric value. */
    private static final int NTV_NONE_ = 0;
    /** Decimal digits: nv=0..9 (nv = ntv - NTV_DECIMAL_START_) */
    private static final int NTV_DECIMAL_START_ = 1;
    /** Other digits: nv=0..9 (nv = ntv - NTV_DIGIT_START_) */
    private static final int NTV_DIGIT_START_ = 11;
    /** Small integers: nv=0..154 (nv = ntv - NTV_NUMERIC_START_) */
    private static final int NTV_NUMERIC_START_ = 21;
    /** Fractions: ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 */
    private static final int NTV_FRACTION_START_ = 0xb0;
    /**
     * Large integers:
     * ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33)
     * (only one significant decimal digit)
     */
    private static final int NTV_LARGE_START_ = 0x1e0;
    /**
     * Sexagesimal numbers:
     * ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4)
     */
    private static final int NTV_BASE60_START_=0x300;
    /**
     * Fraction-20 values:
     * frac20 = ntv-0x324 = 0..0x17 -> 1|3|5|7 / 20|40|80|160|320|640
     * numerator: num = 2*(frac20&3)+1
     * denominator: den = 20<<(frac20>>2)
     */
    private static final int NTV_FRACTION20_START_ = NTV_BASE60_START_ + 36;  // 0x300+9*4=0x324
    /** No numeric value (yet): ntv at or above this value is reserved. */
    private static final int NTV_RESERVED_START_ = NTV_FRACTION20_START_ + 24;  // 0x324+6*4=0x34c
   1050 
   1051     private static final int ntvGetType(int ntv) {
   1052         return
   1053             (ntv==NTV_NONE_) ? NumericType.NONE :
   1054             (ntv<NTV_DIGIT_START_) ?  NumericType.DECIMAL :
   1055             (ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT :
   1056             NumericType.NUMERIC;
   1057     }
   1058 
    /*
     * Properties in vector word 0
     * Bits
     * 31..24   DerivedAge version major/minor one nibble each
     * 23..22   3..1: Bits 7..0 = Script_Extensions index
     *             3: Script value from Script_Extensions
     *             2: Script=Inherited
     *             1: Script=Common
     *             0: Script=bits 7..0
     * 21..20   reserved
     * 19..17   East Asian Width
     * 16.. 8   UBlockCode
     *  7.. 0   UScriptCode
     */

    /**
     * Script_Extensions: mask includes Script
     * (bits 23..22 together with bits 7..0).
     */
    public static final int SCRIPT_X_MASK = 0x00c000ff;
    //private static final int SCRIPT_X_SHIFT = 22;
    /**
     * Integer properties mask for East Asian cell width (bits 19..17).
     * Equivalent to icu4c UPROPS_EA_MASK
     */
    private static final int EAST_ASIAN_MASK_ = 0x000e0000;
    /**
     * Integer properties shift for East Asian cell width.
     * Equivalent to icu4c UPROPS_EA_SHIFT
     */
    private static final int EAST_ASIAN_SHIFT_ = 17;
    /**
     * Integer properties mask for blocks (bits 16..8).
     * Equivalent to icu4c UPROPS_BLOCK_MASK
     */
    private static final int BLOCK_MASK_ = 0x0001ff00;
    /**
     * Integer properties shift for blocks.
     * Equivalent to icu4c UPROPS_BLOCK_SHIFT
     */
    private static final int BLOCK_SHIFT_ = 8;
    /**
     * Integer properties mask for scripts (bits 7..0, so no shift is needed).
     * Equivalent to icu4c UPROPS_SCRIPT_MASK
     */
    public static final int SCRIPT_MASK_ = 0x000000ff;

    /*
     * Values of bits 23..22, already in position (1<<22, 2<<22, 3<<22).
     * SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions.
     */
    public static final int SCRIPT_X_WITH_COMMON = 0x400000;
    public static final int SCRIPT_X_WITH_INHERITED = 0x800000;
    public static final int SCRIPT_X_WITH_OTHER = 0xc00000;
   1109 
    /**
     * Additional properties used in internal trie data
     */
    /*
     * Properties in vector word 1
     * Each bit encodes one binary property.
     * The following constants represent the bit number, use 1<<UPROPS_XYZ.
     * UPROPS_BINARY_1_TOP<=32!
     *
     * Keep this list of property enums in sync with
     * propListNames[] in icu/source/tools/genprops/props2.c!
     *
     * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
     *
     * The list below assigns bits 0..30; bit 31 has no constant here.
     */
    private static final int WHITE_SPACE_PROPERTY_ = 0;
    private static final int DASH_PROPERTY_ = 1;
    private static final int HYPHEN_PROPERTY_ = 2;
    private static final int QUOTATION_MARK_PROPERTY_ = 3;
    private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;
    private static final int MATH_PROPERTY_ = 5;
    private static final int HEX_DIGIT_PROPERTY_ = 6;
    private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;
    private static final int ALPHABETIC_PROPERTY_ = 8;
    private static final int IDEOGRAPHIC_PROPERTY_ = 9;
    private static final int DIACRITIC_PROPERTY_ = 10;
    private static final int EXTENDER_PROPERTY_ = 11;
    private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;
    private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;
    private static final int GRAPHEME_LINK_PROPERTY_ = 14;
    private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;
    private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16;
    private static final int RADICAL_PROPERTY_ = 17;
    private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;
    private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;
    private static final int DEPRECATED_PROPERTY_ = 20;
    private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;
    private static final int XID_START_PROPERTY_ = 22;
    private static final int XID_CONTINUE_PROPERTY_ = 23;
    private static final int ID_START_PROPERTY_    = 24;
    private static final int ID_CONTINUE_PROPERTY_ = 25;
    private static final int GRAPHEME_BASE_PROPERTY_ = 26;
    private static final int S_TERM_PROPERTY_ = 27;
    private static final int VARIATION_SELECTOR_PROPERTY_ = 28;
    private static final int PATTERN_SYNTAX = 29;                   /* new in ICU 3.4 and Unicode 4.1 */
    private static final int PATTERN_WHITE_SPACE = 30;
   1155 
    /*
     * Properties in vector word 2
     * Bits
     * 31..28   http://www.unicode.org/reports/tr51/#Emoji_Properties
     * 27..26   reserved
     * 25..20   Line Break
     * 19..15   Sentence Break
     * 14..10   Word Break
     *  9.. 5   Grapheme Cluster Break
     *  4.. 0   Decomposition Type
     */
    /* Bit numbers (not masks) of the four emoji flags in bits 31..28. */
    private static final int PROPS_2_EMOJI = 28;
    private static final int PROPS_2_EMOJI_PRESENTATION = 29;
    private static final int PROPS_2_EMOJI_MODIFIER = 30;
    private static final int PROPS_2_EMOJI_MODIFIER_BASE = 31;

    /* Line Break: bits 25..20 */
    private static final int LB_MASK          = 0x03f00000;
    private static final int LB_SHIFT         = 20;

    /* Sentence Break: bits 19..15 */
    private static final int SB_MASK          = 0x000f8000;
    private static final int SB_SHIFT         = 15;

    /* Word Break: bits 14..10 */
    private static final int WB_MASK          = 0x00007c00;
    private static final int WB_SHIFT         = 10;

    /* Grapheme Cluster Break: bits 9..5 */
    private static final int GCB_MASK         = 0x000003e0;
    private static final int GCB_SHIFT        = 5;

    /**
     * Integer properties mask for decomposition type (bits 4..0, no shift).
     * Equivalent to icu4c UPROPS_DT_MASK.
     */
    private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;

    /**
     * First nibble shift: number of bits to shift right to reach the upper
     * nibble of a byte.
     */
    private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
    /**
     * Second nibble mask: selects the lower nibble of a byte.
     */
    private static final int LAST_NIBBLE_MASK_ = 0xF;
    /**
     * Age value shift: the age occupies bits 31..24 of vector word 0.
     */
    private static final int AGE_SHIFT_ = 24;
   1202 
   1203 
   1204     // private constructors --------------------------------------------------
   1205 
    /**
     * Constructor. Loads the data file named by DATA_FILE_NAME_ and reads, in
     * file order: the header, the 16 index ints, the main properties trie,
     * and (when present) the additional-properties trie, the additional
     * property vectors and the Script_Extensions data.
     * The offsets read from the index are counted in 4-byte units, which is
     * why differences between them are multiplied by 4 to get byte counts.
     * @exception IOException thrown when data reading fails or data corrupted
     * @exception ICUException thrown when the binProps/intProps tables do not
     *            match the UProperty limits
     */
    private UCharacterProperty() throws IOException
    {
        // consistency check
        if(binProps.length!=UProperty.BINARY_LIMIT) {
            throw new ICUException("binProps.length!=UProperty.BINARY_LIMIT");
        }
        if(intProps.length!=(UProperty.INT_LIMIT-UProperty.INT_START)) {
            throw new ICUException("intProps.length!=(UProperty.INT_LIMIT-UProperty.INT_START)");
        }

        // jar access
        ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_);
        m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable());
        // Read or skip the 16 indexes.
        // The reads below are positional: each getInt() consumes the next index
        // in file order, so the statements must not be reordered.
        int propertyOffset = bytes.getInt();
        /* exceptionOffset = */ bytes.getInt();
        /* caseOffset = */ bytes.getInt();
        int additionalOffset = bytes.getInt();
        int additionalVectorsOffset = bytes.getInt();
        m_additionalColumnsCount_ = bytes.getInt();
        int scriptExtensionsOffset = bytes.getInt();
        int reservedOffset7 = bytes.getInt();
        /* reservedOffset8 = */ bytes.getInt();
        /* dataTopOffset = */ bytes.getInt();
        m_maxBlockScriptValue_ = bytes.getInt();
        m_maxJTGValue_ = bytes.getInt();
        // 12 of the 16 indexes were consumed above; skip the remaining 4 ints
        ICUBinary.skipBytes(bytes, (16 - 12) << 2);

        // read the main properties trie
        m_trie_ = Trie2_16.createFromSerialized(bytes);
        // the trie occupies the space between the 16 indexes and propertyOffset
        int expectedTrieLength = (propertyOffset - 16) * 4;
        int trieLength = m_trie_.getSerializedLength();
        if(trieLength > expectedTrieLength) {
            throw new IOException("uprops.icu: not enough bytes for main trie");
        }
        // skip padding after trie bytes
        ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);

        // skip unused intervening data structures
        ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4);

        // NOTE(review): when m_additionalColumnsCount_ is 0 this block is skipped
        // without advancing the buffer past the additional trie/vectors region,
        // so the Script_Extensions read below would start at additionalOffset.
        // Presumably real data always has additional columns - confirm.
        if(m_additionalColumnsCount_ > 0) {
            // reads the additional property block
            m_additionalTrie_ = Trie2_16.createFromSerialized(bytes);
            expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4;
            trieLength = m_additionalTrie_.getSerializedLength();
            if(trieLength > expectedTrieLength) {
                throw new IOException("uprops.icu: not enough bytes for additional-properties trie");
            }
            // skip padding after trie bytes
            ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);

            // additional properties
            // size is a count of ints (the offsets are in 4-byte units)
            int size = scriptExtensionsOffset - additionalVectorsOffset;
            m_additionalVectors_ = ICUBinary.getInts(bytes, size, 0);
        }

        // Script_Extensions
        // two 16-bit chars per 4-byte unit; m_scriptExtensions_ stays unassigned
        // when the section is empty
        int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;
        if(numChars > 0) {
            m_scriptExtensions_ = ICUBinary.getChars(bytes, numChars, 0);
        }
    }
   1273 
   1274     private static final class IsAcceptable implements ICUBinary.Authenticate {
   1275         // @Override when we switch to Java 6
   1276         @Override
   1277         public boolean isDataVersionAcceptable(byte version[]) {
   1278             return version[0] == 7;
   1279         }
   1280     }
   1281     private static final int DATA_FORMAT = 0x5550726F;  // "UPro"
   1282 
   1283     // private methods -------------------------------------------------------
   1284 
   1285     /*
   1286      * Compare additional properties to see if it has argument type
   1287      * @param property 32 bit properties
   1288      * @param type character type
   1289      * @return true if property has type
   1290      */
   1291     /*private boolean compareAdditionalType(int property, int type)
   1292     {
   1293         return (property & (1 << type)) != 0;
   1294     }*/
   1295 
    // property starts for UnicodeSet -------------------------------------- ***

    // Code points with hardcoded properties. addPropertyStarts() adds these
    // (and in most cases the code point just past them) to the inclusions set.
    // The commented-out entries are kept for reference only.
    private static final int TAB     = 0x0009;
    //private static final int LF      = 0x000a;
    //private static final int FF      = 0x000c;
    private static final int CR      = 0x000d;
    // ASCII letters: ends of the A-Z/a-z ranges and of the hex-digit subranges A-F/a-f
    private static final int U_A     = 0x0041;
    private static final int U_F     = 0x0046;
    private static final int U_Z     = 0x005a;
    private static final int U_a     = 0x0061;
    private static final int U_f     = 0x0066;
    private static final int U_z     = 0x007a;
    private static final int DEL     = 0x007f;
    private static final int NL      = 0x0085;
    private static final int NBSP    = 0x00a0;
    private static final int CGJ     = 0x034f;
    private static final int FIGURESP= 0x2007;
    private static final int HAIRSP  = 0x200a;
    //private static final int ZWNJ    = 0x200c;
    //private static final int ZWJ     = 0x200d;
    private static final int RLM     = 0x200f;
    private static final int NNBSP   = 0x202f;
    private static final int WJ      = 0x2060;
    private static final int INHSWAP = 0x206a;
    private static final int NOMDIG  = 0x206f;
    // full-width counterparts of the ASCII letter boundaries above
    private static final int U_FW_A  = 0xff21;
    private static final int U_FW_F  = 0xff26;
    private static final int U_FW_Z  = 0xff3a;
    private static final int U_FW_a  = 0xff41;
    private static final int U_FW_f  = 0xff46;
    private static final int U_FW_z  = 0xff5a;
    private static final int ZWNBSP  = 0xfeff;
   1328 
   1329     public UnicodeSet addPropertyStarts(UnicodeSet set) {
   1330         /* add the start code point of each same-value range of the main trie */
   1331         Iterator<Trie2.Range> trieIterator = m_trie_.iterator();
   1332         Trie2.Range range;
   1333         while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
   1334             set.add(range.startCodePoint);
   1335         }
   1336 
   1337         /* add code points with hardcoded properties, plus the ones following them */
   1338 
   1339         /* add for u_isblank() */
   1340         set.add(TAB);
   1341         set.add(TAB+1);
   1342 
   1343         /* add for IS_THAT_CONTROL_SPACE() */
   1344         set.add(CR+1); /* range TAB..CR */
   1345         set.add(0x1c);
   1346         set.add(0x1f+1);
   1347         set.add(NL);
   1348         set.add(NL+1);
   1349 
   1350         /* add for u_isIDIgnorable() what was not added above */
   1351         set.add(DEL); /* range DEL..NBSP-1, NBSP added below */
   1352         set.add(HAIRSP);
   1353         set.add(RLM+1);
   1354         set.add(INHSWAP);
   1355         set.add(NOMDIG+1);
   1356         set.add(ZWNBSP);
   1357         set.add(ZWNBSP+1);
   1358 
   1359         /* add no-break spaces for u_isWhitespace() what was not added above */
   1360         set.add(NBSP);
   1361         set.add(NBSP+1);
   1362         set.add(FIGURESP);
   1363         set.add(FIGURESP+1);
   1364         set.add(NNBSP);
   1365         set.add(NNBSP+1);
   1366 
   1367         /* add for u_charDigitValue() */
   1368         // TODO remove when UCharacter.getHanNumericValue() is changed to just return
   1369         // Unicode numeric values
   1370         set.add(0x3007);
   1371         set.add(0x3008);
   1372         set.add(0x4e00);
   1373         set.add(0x4e01);
   1374         set.add(0x4e8c);
   1375         set.add(0x4e8d);
   1376         set.add(0x4e09);
   1377         set.add(0x4e0a);
   1378         set.add(0x56db);
   1379         set.add(0x56dc);
   1380         set.add(0x4e94);
   1381         set.add(0x4e95);
   1382         set.add(0x516d);
   1383         set.add(0x516e);
   1384         set.add(0x4e03);
   1385         set.add(0x4e04);
   1386         set.add(0x516b);
   1387         set.add(0x516c);
   1388         set.add(0x4e5d);
   1389         set.add(0x4e5e);
   1390 
   1391         /* add for u_digit() */
   1392         set.add(U_a);
   1393         set.add(U_z+1);
   1394         set.add(U_A);
   1395         set.add(U_Z+1);
   1396         set.add(U_FW_a);
   1397         set.add(U_FW_z+1);
   1398         set.add(U_FW_A);
   1399         set.add(U_FW_Z+1);
   1400 
   1401         /* add for u_isxdigit() */
   1402         set.add(U_f+1);
   1403         set.add(U_F+1);
   1404         set.add(U_FW_f+1);
   1405         set.add(U_FW_F+1);
   1406 
   1407         /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */
   1408         set.add(WJ); /* range WJ..NOMDIG */
   1409         set.add(0xfff0);
   1410         set.add(0xfffb+1);
   1411         set.add(0xe0000);
   1412         set.add(0xe0fff+1);
   1413 
   1414         /* add for UCHAR_GRAPHEME_BASE and others */
   1415         set.add(CGJ);
   1416         set.add(CGJ+1);
   1417 
   1418         return set; // for chaining
   1419     }
   1420 
   1421     public void upropsvec_addPropertyStarts(UnicodeSet set) {
   1422         /* add the start code point of each same-value range of the properties vectors trie */
   1423         if(m_additionalColumnsCount_>0) {
   1424             /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
   1425             Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator();
   1426             Trie2.Range range;
   1427             while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
   1428                 set.add(range.startCodePoint);
   1429             }
   1430         }
   1431     }
   1432 
   1433     // Initializes INSTANCE. This static initializer block must be placed after
   1434     // other static member initialization.
   1435     static {
   1436         try {
   1437             INSTANCE = new UCharacterProperty();
   1438         } catch (IOException e) {
   1439             // MissingResourceException has no constructor taking a cause, so chain it via initCause().
   1440             throw (MissingResourceException) new MissingResourceException(e.getMessage(),"","").initCause(e);
   1441         }
   1442     }
   1443 
   1444 /*----------------------------------------------------------------
   1445  * Inclusions list
   1446  *----------------------------------------------------------------*/
   1447 
   1448     /*
   1449      * Return a set of characters for property enumeration.
   1450      * The set implicitly contains 0x110000 as well, which is one more than the highest
   1451      * Unicode code point.
   1452      *
   1453      * This set is used as an ordered list - its code points are ordered, and
   1454      * consecutive code points (in Unicode code point order) in the set define a range.
   1455      * For each two consecutive characters (start, limit) in the set,
   1456      * all of the UCD/normalization and related properties for
   1457      * all code points start..limit-1 are all the same,
   1458      * except for character names and ISO comments.
   1459      *
   1460      * All Unicode code points U+0000..U+10ffff are covered by these ranges.
   1461      * The ranges define a partition of the Unicode code space.
   1462      * ICU uses the inclusions set to enumerate properties for generating
   1463      * UnicodeSets containing all code points that have a certain property value.
   1464      *
   1465      * The Inclusion List is generated from the UCD. It is generated
   1466      * by enumerating the data tries, and code points for hardcoded properties
   1467      * are added as well.
   1468      *
   1469      * --------------------------------------------------------------------------
   1470      *
   1471      * The following are ideas for getting properties-unique code point ranges,
   1472      * with possible optimizations beyond the current implementation.
   1473      * These optimizations would require more code and be more fragile.
   1474      * The current implementation generates one single list (set) for all properties.
   1475      *
   1476      * To enumerate properties efficiently, one needs to know ranges of
   1477      * repetitive values, so that the value of only each start code point
   1478      * can be applied to the whole range.
   1479      * This information is in principle available in the uprops.icu/unorm.icu data.
   1480      *
   1481      * There are two obstacles:
   1482      *
   1483      * 1. Some properties are computed from multiple data structures,
   1484      *    making it necessary to get repetitive ranges by intersecting
   1485      *    ranges from multiple tries.
   1486      *
   1487      * 2. It is not economical to write code for getting repetitive ranges
   1488      *    that are precise for each of some 50 properties.
   1489      *
   1490      * Compromise ideas:
   1491      *
   1492      * - Get ranges per trie, not per individual property.
   1493      *   Each range contains the same values for a whole group of properties.
   1494      *   This would currently generate five range sets: two for the uprops.icu tries
   1495      *   and three for the unorm.icu tries.
   1496      *
   1497      * - Combine sets of ranges for multiple tries to get sufficient sets
   1498      *   for properties, e.g., the uprops.icu main and auxiliary tries
   1499      *   for all non-normalization properties.
   1500      *
   1501      * Ideas for representing ranges and combining them:
   1502      *
   1503      * - A UnicodeSet could hold just the start code points of ranges.
   1504      *   Multiple sets are easily combined by or-ing them together.
   1505      *
   1506      * - Alternatively, a UnicodeSet could hold each even-numbered range.
   1507      *   All ranges could be enumerated by using each start code point
   1508      *   (for the even-numbered ranges) as well as each limit (end+1) code point
   1509      *   (for the odd-numbered ranges).
   1510      *   It should be possible to combine two such sets by xor-ing them,
   1511      *   but no more than two.
   1512      *
   1513      * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays,
   1514      * but the first one is certainly simpler and applicable for combining more than
   1515      * two range sets.
   1516      *
   1517      * It is possible to combine all range sets for all uprops/unorm tries into one
   1518      * set that can be used for all properties.
   1519      * As an optimization, there could be less-combined range sets for certain
   1520      * groups of properties.
   1521      * The relationship of which less-combined range set to use for which property
   1522      * depends on the implementation of the properties and must be hardcoded
   1523      * - somewhat error-prone and higher maintenance but can be tested easily
   1524      * by building property sets "the simple way" in test code.
   1525      *
   1526      * ---
   1527      *
   1528      * Do not use a UnicodeSet pattern because that causes infinite recursion;
   1529      * UnicodeSet depends on the inclusions set.
   1530      *
   1531      * ---
   1532      *
   1533      * getInclusions() is commented out starting 2005-feb-12 because
   1534      * UnicodeSet now calls the uxyz_addPropertyStarts() directly,
   1535      * and only for the relevant property source.
   1536      */
   1537     /*
   1538     public UnicodeSet getInclusions() {
   1539         UnicodeSet set = new UnicodeSet();
   1540         NormalizerImpl.addPropertyStarts(set);
   1541         addPropertyStarts(set);
   1542         return set;
   1543     }
   1544     */
   1545 }
   1546