Home | History | Annotate | Download | only in impl
      1 /*
      2  *******************************************************************************
      3  * Copyright (C) 1996-2014, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  *******************************************************************************
      6  */
      7 
      8 package com.ibm.icu.impl;
      9 
     10 import java.io.IOException;
     11 import java.nio.ByteBuffer;
     12 import java.util.Iterator;
     13 import java.util.MissingResourceException;
     14 
     15 import com.ibm.icu.lang.UCharacter;
     16 import com.ibm.icu.lang.UCharacter.HangulSyllableType;
     17 import com.ibm.icu.lang.UCharacter.NumericType;
     18 import com.ibm.icu.lang.UCharacterCategory;
     19 import com.ibm.icu.lang.UProperty;
     20 import com.ibm.icu.lang.UScript;
     21 import com.ibm.icu.text.Normalizer2;
     22 import com.ibm.icu.text.UTF16;
     23 import com.ibm.icu.text.UnicodeSet;
     24 import com.ibm.icu.util.ICUException;
     25 import com.ibm.icu.util.VersionInfo;
     26 
     27 /**
     28 * <p>Internal class used for Unicode character property database.</p>
     29 * <p>This class stores binary data read from uprops.icu.
     30 * It does not have the capability to parse the data into more high-level
     31 * information. It only returns bytes of information when required.</p>
     32 * <p>Due to the form most commonly used for retrieval, array of char is used
     33 * to store the binary data.</p>
     34 * <p>UCharacterPropertyDB also contains information on accessing indexes to
     35 * significant points in the binary data.</p>
     36 * <p>Responsibility for molding the binary data into a more meaningful form lies with
     37 * <a href=UCharacter.html>UCharacter</a>.</p>
     38 * @author Syn Wee Quek
     39 * @since release 2.1, february 1st 2002
     40 */
     41 
     42 public final class UCharacterProperty
     43 {
     44     // public data members -----------------------------------------------
     45 
     46     /*
     47      * public singleton instance
     48      */
     49     public static final UCharacterProperty INSTANCE;
     50 
     51     /**
     52     * Trie data
     53     */
     54     public Trie2_16 m_trie_;
     55     /**
     56     * Unicode version
     57     */
     58     public VersionInfo m_unicodeVersion_;
     59     /**
     60     * Latin capital letter i with dot above
     61     */
     62     public static final char LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_ = 0x130;
     63     /**
     64     * Latin small letter i with dot above
     65     */
     66     public static final char LATIN_SMALL_LETTER_DOTLESS_I_ = 0x131;
     67     /**
     68     * Latin lowercase i
     69     */
     70     public static final char LATIN_SMALL_LETTER_I_ = 0x69;
     71     /**
     72     * Character type mask
     73     */
     74     public static final int TYPE_MASK = 0x1F;
     75 
     76     // uprops.h enum UPropertySource --------------------------------------- ***
     77 
     78     /** No source, not a supported property. */
     79     public static final int SRC_NONE=0;
     80     /** From uchar.c/uprops.icu main trie */
     81     public static final int SRC_CHAR=1;
     82     /** From uchar.c/uprops.icu properties vectors trie */
     83     public static final int SRC_PROPSVEC=2;
     84     /** From unames.c/unames.icu */
     85     public static final int SRC_NAMES=3;
     86     /** From ucase.c/ucase.icu */
     87     public static final int SRC_CASE=4;
     88     /** From ubidi_props.c/ubidi.icu */
     89     public static final int SRC_BIDI=5;
     90     /** From uchar.c/uprops.icu main trie as well as properties vectors trie */
     91     public static final int SRC_CHAR_AND_PROPSVEC=6;
     92     /** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */
     93     public static final int SRC_CASE_AND_NORM=7;
     94     /** From normalizer2impl.cpp/nfc.nrm */
     95     public static final int SRC_NFC=8;
     96     /** From normalizer2impl.cpp/nfkc.nrm */
     97     public static final int SRC_NFKC=9;
     98     /** From normalizer2impl.cpp/nfkc_cf.nrm */
     99     public static final int SRC_NFKC_CF=10;
    100     /** From normalizer2impl.cpp/nfc.nrm canonical iterator data */
    101     public static final int SRC_NFC_CANON_ITER=11;
    102     /** One more than the highest UPropertySource (SRC_) constant. */
    103     public static final int SRC_COUNT=12;
    104 
    105     // public methods ----------------------------------------------------
    106 
    107     /**
    108     * Gets the main property value for code point ch.
    109     * @param ch code point whose property value is to be retrieved
    110     * @return property value of code point
    111     */
    112     public final int getProperty(int ch)
    113     {
    114         return m_trie_.get(ch);
    115     }
    116 
    117     /**
    118      * Gets the unicode additional properties.
    119      * Java version of C u_getUnicodeProperties().
    120      * @param codepoint codepoint whose additional properties is to be
    121      *                  retrieved
    122      * @param column The column index.
    123      * @return unicode properties
    124      */
    125     public int getAdditional(int codepoint, int column) {
    126         assert column >= 0;
    127         if (column >= m_additionalColumnsCount_) {
    128             return 0;
    129         }
    130         return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column];
    131     }
    132 
    133     static final int MY_MASK = UCharacterProperty.TYPE_MASK
    134         & ((1<<UCharacterCategory.UPPERCASE_LETTER) |
    135             (1<<UCharacterCategory.LOWERCASE_LETTER) |
    136             (1<<UCharacterCategory.TITLECASE_LETTER) |
    137             (1<<UCharacterCategory.MODIFIER_LETTER) |
    138             (1<<UCharacterCategory.OTHER_LETTER));
    139 
    140 
    141        /**
    142      * <p>Get the "age" of the code point.</p>
    143      * <p>The "age" is the Unicode version when the code point was first
    144      * designated (as a non-character or for Private Use) or assigned a
    145      * character.</p>
    146      * <p>This can be useful to avoid emitting code points to receiving
    147      * processes that do not accept newer characters.</p>
    148      * <p>The data is from the UCD file DerivedAge.txt.</p>
    149      * <p>This API does not check the validity of the codepoint.</p>
    150      * @param codepoint The code point.
    151      * @return the Unicode version number
    152      */
    153     public VersionInfo getAge(int codepoint)
    154     {
    155         int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
    156         return VersionInfo.getInstance(
    157                            (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
    158                            version & LAST_NIBBLE_MASK_, 0, 0);
    159     }
    160 
    161     private static final int GC_CN_MASK = getMask(UCharacter.UNASSIGNED);
    162     private static final int GC_CC_MASK = getMask(UCharacter.CONTROL);
    163     private static final int GC_CS_MASK = getMask(UCharacter.SURROGATE);
    164     private static final int GC_ZS_MASK = getMask(UCharacter.SPACE_SEPARATOR);
    165     private static final int GC_ZL_MASK = getMask(UCharacter.LINE_SEPARATOR);
    166     private static final int GC_ZP_MASK = getMask(UCharacter.PARAGRAPH_SEPARATOR);
    167     /** Mask constant for multiple UCharCategory bits (Z Separators). */
    168     private static final int GC_Z_MASK = GC_ZS_MASK|GC_ZL_MASK|GC_ZP_MASK;
    169 
    170     /**
    171      * Checks if c is in
    172      * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
    173      * with space=\p{Whitespace} and Control=Cc.
    174      * Implements UCHAR_POSIX_GRAPH.
    175      * @internal
    176      */
    177     private static final boolean isgraphPOSIX(int c) {
    178         /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */
    179         /* comparing ==0 returns FALSE for the categories mentioned */
    180         return (getMask(UCharacter.getType(c))&
    181                 (GC_CC_MASK|GC_CS_MASK|GC_CN_MASK|GC_Z_MASK))
    182                ==0;
    183     }
    184 
    185     // binary properties --------------------------------------------------- ***
    186 
    187     private class BinaryProperty {
    188         int column;  // SRC_PROPSVEC column, or "source" if mask==0
    189         int mask;
    190         BinaryProperty(int column, int mask) {
    191             this.column=column;
    192             this.mask=mask;
    193         }
    194         BinaryProperty(int source) {
    195             this.column=source;
    196             this.mask=0;
    197         }
    198         final int getSource() {
    199             return mask==0 ? column : SRC_PROPSVEC;
    200         }
    201         boolean contains(int c) {
    202             // systematic, directly stored properties
    203             return (getAdditional(c, column)&mask)!=0;
    204         }
    205     }
    206 
    207     private class CaseBinaryProperty extends BinaryProperty {  // case mapping properties
    208         int which;
    209         CaseBinaryProperty(int which) {
    210             super(SRC_CASE);
    211             this.which=which;
    212         }
    213         boolean contains(int c) {
    214             return UCaseProps.INSTANCE.hasBinaryProperty(c, which);
    215         }
    216     }
    217 
    218     private class NormInertBinaryProperty extends BinaryProperty {  // UCHAR_NF*_INERT properties
    219         int which;
    220         NormInertBinaryProperty(int source, int which) {
    221             super(source);
    222             this.which=which;
    223         }
    224         boolean contains(int c) {
    225             return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_INERT).isInert(c);
    226         }
    227     }
    228 
    229     BinaryProperty[] binProps={
    230         /*
    231          * Binary-property implementations must be in order of corresponding UProperty,
    232          * and there must be exactly one entry per binary UProperty.
    233          */
    234         new BinaryProperty(1, (1<<ALPHABETIC_PROPERTY_)),
    235         new BinaryProperty(1, (1<<ASCII_HEX_DIGIT_PROPERTY_)),
    236         new BinaryProperty(SRC_BIDI) {  // UCHAR_BIDI_CONTROL
    237             boolean contains(int c) {
    238                 return UBiDiProps.INSTANCE.isBidiControl(c);
    239             }
    240         },
    241         new BinaryProperty(SRC_BIDI) {  // UCHAR_BIDI_MIRRORED
    242             boolean contains(int c) {
    243                 return UBiDiProps.INSTANCE.isMirrored(c);
    244             }
    245         },
    246         new BinaryProperty(1, (1<<DASH_PROPERTY_)),
    247         new BinaryProperty(1, (1<<DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_)),
    248         new BinaryProperty(1, (1<<DEPRECATED_PROPERTY_)),
    249         new BinaryProperty(1, (1<<DIACRITIC_PROPERTY_)),
    250         new BinaryProperty(1, (1<<EXTENDER_PROPERTY_)),
    251         new BinaryProperty(SRC_NFC) {  // UCHAR_FULL_COMPOSITION_EXCLUSION
    252             boolean contains(int c) {
    253                 // By definition, Full_Composition_Exclusion is the same as NFC_QC=No.
    254                 Normalizer2Impl impl=Norm2AllModes.getNFCInstance().impl;
    255                 return impl.isCompNo(impl.getNorm16(c));
    256             }
    257         },
    258         new BinaryProperty(1, (1<<GRAPHEME_BASE_PROPERTY_)),
    259         new BinaryProperty(1, (1<<GRAPHEME_EXTEND_PROPERTY_)),
    260         new BinaryProperty(1, (1<<GRAPHEME_LINK_PROPERTY_)),
    261         new BinaryProperty(1, (1<<HEX_DIGIT_PROPERTY_)),
    262         new BinaryProperty(1, (1<<HYPHEN_PROPERTY_)),
    263         new BinaryProperty(1, (1<<ID_CONTINUE_PROPERTY_)),
    264         new BinaryProperty(1, (1<<ID_START_PROPERTY_)),
    265         new BinaryProperty(1, (1<<IDEOGRAPHIC_PROPERTY_)),
    266         new BinaryProperty(1, (1<<IDS_BINARY_OPERATOR_PROPERTY_)),
    267         new BinaryProperty(1, (1<<IDS_TRINARY_OPERATOR_PROPERTY_)),
    268         new BinaryProperty(SRC_BIDI) {  // UCHAR_JOIN_CONTROL
    269             boolean contains(int c) {
    270                 return UBiDiProps.INSTANCE.isJoinControl(c);
    271             }
    272         },
    273         new BinaryProperty(1, (1<<LOGICAL_ORDER_EXCEPTION_PROPERTY_)),
    274         new CaseBinaryProperty(UProperty.LOWERCASE),
    275         new BinaryProperty(1, (1<<MATH_PROPERTY_)),
    276         new BinaryProperty(1, (1<<NONCHARACTER_CODE_POINT_PROPERTY_)),
    277         new BinaryProperty(1, (1<<QUOTATION_MARK_PROPERTY_)),
    278         new BinaryProperty(1, (1<<RADICAL_PROPERTY_)),
    279         new CaseBinaryProperty(UProperty.SOFT_DOTTED),
    280         new BinaryProperty(1, (1<<TERMINAL_PUNCTUATION_PROPERTY_)),
    281         new BinaryProperty(1, (1<<UNIFIED_IDEOGRAPH_PROPERTY_)),
    282         new CaseBinaryProperty(UProperty.UPPERCASE),
    283         new BinaryProperty(1, (1<<WHITE_SPACE_PROPERTY_)),
    284         new BinaryProperty(1, (1<<XID_CONTINUE_PROPERTY_)),
    285         new BinaryProperty(1, (1<<XID_START_PROPERTY_)),
    286         new CaseBinaryProperty(UProperty.CASE_SENSITIVE),
    287         new BinaryProperty(1, (1<<S_TERM_PROPERTY_)),
    288         new BinaryProperty(1, (1<<VARIATION_SELECTOR_PROPERTY_)),
    289         new NormInertBinaryProperty(SRC_NFC, UProperty.NFD_INERT),
    290         new NormInertBinaryProperty(SRC_NFKC, UProperty.NFKD_INERT),
    291         new NormInertBinaryProperty(SRC_NFC, UProperty.NFC_INERT),
    292         new NormInertBinaryProperty(SRC_NFKC, UProperty.NFKC_INERT),
    293         new BinaryProperty(SRC_NFC_CANON_ITER) {  // UCHAR_SEGMENT_STARTER
    294             boolean contains(int c) {
    295                 return Norm2AllModes.getNFCInstance().impl.
    296                     ensureCanonIterData().isCanonSegmentStarter(c);
    297             }
    298         },
    299         new BinaryProperty(1, (1<<PATTERN_SYNTAX)),
    300         new BinaryProperty(1, (1<<PATTERN_WHITE_SPACE)),
    301         new BinaryProperty(SRC_CHAR_AND_PROPSVEC) {  // UCHAR_POSIX_ALNUM
    302             boolean contains(int c) {
    303                 return UCharacter.isUAlphabetic(c) || UCharacter.isDigit(c);
    304             }
    305         },
    306         new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_BLANK
    307             boolean contains(int c) {
    308                 // "horizontal space"
    309                 if(c<=0x9f) {
    310                     return c==9 || c==0x20; /* TAB or SPACE */
    311                 } else {
    312                     /* Zs */
    313                     return UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR;
    314                 }
    315             }
    316         },
    317         new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_GRAPH
    318             boolean contains(int c) {
    319                 return isgraphPOSIX(c);
    320             }
    321         },
    322         new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_PRINT
    323             boolean contains(int c) {
    324                 /*
    325                  * Checks if codepoint is in \p{graph}\p{blank} - \p{cntrl}.
    326                  *
    327                  * The only cntrl character in graph+blank is TAB (in blank).
    328                  * Here we implement (blank-TAB)=Zs instead of calling u_isblank().
    329                  */
    330                 return (UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR) || isgraphPOSIX(c);
    331             }
    332         },
    333         new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_XDIGIT
    334             boolean contains(int c) {
    335                 /* check ASCII and Fullwidth ASCII a-fA-F */
    336                 if(
    337                     (c<=0x66 && c>=0x41 && (c<=0x46 || c>=0x61)) ||
    338                     (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41))
    339                 ) {
    340                     return true;
    341                 }
    342                 return UCharacter.getType(c)==UCharacter.DECIMAL_DIGIT_NUMBER;
    343             }
    344         },
    345         new CaseBinaryProperty(UProperty.CASED),
    346         new CaseBinaryProperty(UProperty.CASE_IGNORABLE),
    347         new CaseBinaryProperty(UProperty.CHANGES_WHEN_LOWERCASED),
    348         new CaseBinaryProperty(UProperty.CHANGES_WHEN_UPPERCASED),
    349         new CaseBinaryProperty(UProperty.CHANGES_WHEN_TITLECASED),
    350         new BinaryProperty(SRC_CASE_AND_NORM) {  // UCHAR_CHANGES_WHEN_CASEFOLDED
    351             boolean contains(int c) {
    352                 String nfd=Norm2AllModes.getNFCInstance().impl.getDecomposition(c);
    353                 if(nfd!=null) {
    354                     /* c has a decomposition */
    355                     c=nfd.codePointAt(0);
    356                     if(Character.charCount(c)!=nfd.length()) {
    357                         /* multiple code points */
    358                         c=-1;
    359                     }
    360                 } else if(c<0) {
    361                     return false;  /* protect against bad input */
    362                 }
    363                 if(c>=0) {
    364                     /* single code point */
    365                     UCaseProps csp=UCaseProps.INSTANCE;
    366                     UCaseProps.dummyStringBuilder.setLength(0);
    367                     return csp.toFullFolding(c, UCaseProps.dummyStringBuilder,
    368                                              UCharacter.FOLD_CASE_DEFAULT)>=0;
    369                 } else {
    370                     String folded=UCharacter.foldCase(nfd, true);
    371                     return !folded.equals(nfd);
    372                 }
    373             }
    374         },
    375         new CaseBinaryProperty(UProperty.CHANGES_WHEN_CASEMAPPED),
    376         new BinaryProperty(SRC_NFKC_CF) {  // UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
    377             boolean contains(int c) {
    378                 Normalizer2Impl kcf=Norm2AllModes.getNFKC_CFInstance().impl;
    379                 String src=UTF16.valueOf(c);
    380                 StringBuilder dest=new StringBuilder();
    381                 // Small destCapacity for NFKC_CF(c).
    382                 Normalizer2Impl.ReorderingBuffer buffer=new Normalizer2Impl.ReorderingBuffer(kcf, dest, 5);
    383                 kcf.compose(src, 0, src.length(), false, true, buffer);
    384                 return !Normalizer2Impl.UTF16Plus.equal(dest, src);
    385             }
    386         },
    387     };
    388 
    389     public boolean hasBinaryProperty(int c, int which) {
    390          if(which<UProperty.BINARY_START || UProperty.BINARY_LIMIT<=which) {
    391             // not a known binary property
    392             return false;
    393         } else {
    394             return binProps[which].contains(c);
    395         }
    396     }
    397 
    398     // int-value and enumerated properties --------------------------------- ***
    399 
    400     public int getType(int c) {
    401         return getProperty(c)&TYPE_MASK;
    402     }
    403 
    404     /*
    405      * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
    406      * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
    407      */
    408     private static final int /* UHangulSyllableType */ gcbToHst[]={
    409         HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_OTHER */
    410         HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CONTROL */
    411         HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CR */
    412         HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_EXTEND */
    413         HangulSyllableType.LEADING_JAMO,     /* U_GCB_L */
    414         HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_LF */
    415         HangulSyllableType.LV_SYLLABLE,      /* U_GCB_LV */
    416         HangulSyllableType.LVT_SYLLABLE,     /* U_GCB_LVT */
    417         HangulSyllableType.TRAILING_JAMO,    /* U_GCB_T */
    418         HangulSyllableType.VOWEL_JAMO        /* U_GCB_V */
    419         /*
    420          * Omit GCB values beyond what we need for hst.
    421          * The code below checks for the array length.
    422          */
    423     };
    424 
    425     private class IntProperty {
    426         int column;  // SRC_PROPSVEC column, or "source" if mask==0
    427         int mask;
    428         int shift;
    429         IntProperty(int column, int mask, int shift) {
    430             this.column=column;
    431             this.mask=mask;
    432             this.shift=shift;
    433         }
    434         IntProperty(int source) {
    435             this.column=source;
    436             this.mask=0;
    437         }
    438         final int getSource() {
    439             return mask==0 ? column : SRC_PROPSVEC;
    440         }
    441         int getValue(int c) {
    442             // systematic, directly stored properties
    443             return (getAdditional(c, column)&mask)>>>shift;
    444         }
    445         int getMaxValue(int which) {
    446             return (getMaxValues(column)&mask)>>>shift;
    447         }
    448     }
    449 
    450     private class BiDiIntProperty extends IntProperty {
    451         BiDiIntProperty() {
    452             super(SRC_BIDI);
    453         }
    454         int getMaxValue(int which) {
    455             return UBiDiProps.INSTANCE.getMaxValue(which);
    456         }
    457     }
    458 
    459     private class CombiningClassIntProperty extends IntProperty {
    460         CombiningClassIntProperty(int source) {
    461             super(source);
    462         }
    463         int getMaxValue(int which) {
    464             return 0xff;
    465         }
    466     }
    467 
    468     private class NormQuickCheckIntProperty extends IntProperty {  // UCHAR_NF*_QUICK_CHECK properties
    469         int which;
    470         int max;
    471         NormQuickCheckIntProperty(int source, int which, int max) {
    472             super(source);
    473             this.which=which;
    474             this.max=max;
    475         }
    476         int getValue(int c) {
    477             return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_QUICK_CHECK).getQuickCheck(c);
    478         }
    479         int getMaxValue(int which) {
    480             return max;
    481         }
    482     }
    483 
    484     IntProperty intProps[]={
    485         new BiDiIntProperty() {  // BIDI_CLASS
    486             int getValue(int c) {
    487                 return UBiDiProps.INSTANCE.getClass(c);
    488             }
    489         },
    490         new IntProperty(0, BLOCK_MASK_, BLOCK_SHIFT_),
    491         new CombiningClassIntProperty(SRC_NFC) {  // CANONICAL_COMBINING_CLASS
    492             int getValue(int c) {
    493                 return Normalizer2.getNFDInstance().getCombiningClass(c);
    494             }
    495         },
    496         new IntProperty(2, DECOMPOSITION_TYPE_MASK_, 0),
    497         new IntProperty(0, EAST_ASIAN_MASK_, EAST_ASIAN_SHIFT_),
    498         new IntProperty(SRC_CHAR) {  // GENERAL_CATEGORY
    499             int getValue(int c) {
    500                 return getType(c);
    501             }
    502             int getMaxValue(int which) {
    503                 return UCharacterCategory.CHAR_CATEGORY_COUNT-1;
    504             }
    505         },
    506         new BiDiIntProperty() {  // JOINING_GROUP
    507             int getValue(int c) {
    508                 return UBiDiProps.INSTANCE.getJoiningGroup(c);
    509             }
    510         },
    511         new BiDiIntProperty() {  // JOINING_TYPE
    512             int getValue(int c) {
    513                 return UBiDiProps.INSTANCE.getJoiningType(c);
    514             }
    515         },
    516         new IntProperty(2, LB_MASK, LB_SHIFT),  // LINE_BREAK
    517         new IntProperty(SRC_CHAR) {  // NUMERIC_TYPE
    518             int getValue(int c) {
    519                 return ntvGetType(getNumericTypeValue(getProperty(c)));
    520             }
    521             int getMaxValue(int which) {
    522                 return NumericType.COUNT-1;
    523             }
    524         },
    525         new IntProperty(0, SCRIPT_MASK_, 0) {
    526             int getValue(int c) {
    527                 return UScript.getScript(c);
    528             }
    529         },
    530         new IntProperty(SRC_PROPSVEC) {  // HANGUL_SYLLABLE_TYPE
    531             int getValue(int c) {
    532                 /* see comments on gcbToHst[] above */
    533                 int gcb=(getAdditional(c, 2)&GCB_MASK)>>>GCB_SHIFT;
    534                 if(gcb<gcbToHst.length) {
    535                     return gcbToHst[gcb];
    536                 } else {
    537                     return HangulSyllableType.NOT_APPLICABLE;
    538                 }
    539             }
    540             int getMaxValue(int which) {
    541                 return HangulSyllableType.COUNT-1;
    542             }
    543         },
    544         // max=1=YES -- these are never "maybe", only "no" or "yes"
    545         new NormQuickCheckIntProperty(SRC_NFC, UProperty.NFD_QUICK_CHECK, 1),
    546         new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKD_QUICK_CHECK, 1),
    547         // max=2=MAYBE
    548         new NormQuickCheckIntProperty(SRC_NFC, UProperty.NFC_QUICK_CHECK, 2),
    549         new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKC_QUICK_CHECK, 2),
    550         new CombiningClassIntProperty(SRC_NFC) {  // LEAD_CANONICAL_COMBINING_CLASS
    551             int getValue(int c) {
    552                 return Norm2AllModes.getNFCInstance().impl.getFCD16(c)>>8;
    553             }
    554         },
    555         new CombiningClassIntProperty(SRC_NFC) {  // TRAIL_CANONICAL_COMBINING_CLASS
    556             int getValue(int c) {
    557                 return Norm2AllModes.getNFCInstance().impl.getFCD16(c)&0xff;
    558             }
    559         },
    560         new IntProperty(2, GCB_MASK, GCB_SHIFT),  // GRAPHEME_CLUSTER_BREAK
    561         new IntProperty(2, SB_MASK, SB_SHIFT),  // SENTENCE_BREAK
    562         new IntProperty(2, WB_MASK, WB_SHIFT),  // WORD_BREAK
    563         new BiDiIntProperty() {  // BIDI_PAIRED_BRACKET_TYPE
    564             int getValue(int c) {
    565                 return UBiDiProps.INSTANCE.getPairedBracketType(c);
    566             }
    567         },
    568     };
    569 
    570     public int getIntPropertyValue(int c, int which) {
    571         if(which<UProperty.INT_START) {
    572             if(UProperty.BINARY_START<=which && which<UProperty.BINARY_LIMIT) {
    573                 return binProps[which].contains(c) ? 1 : 0;
    574             }
    575         } else if(which<UProperty.INT_LIMIT) {
    576             return intProps[which-UProperty.INT_START].getValue(c);
    577         } else if (which == UProperty.GENERAL_CATEGORY_MASK) {
    578             return getMask(getType(c));
    579         }
    580         return 0; // undefined
    581     }
    582 
    583     public int getIntPropertyMaxValue(int which) {
    584         if(which<UProperty.INT_START) {
    585             if(UProperty.BINARY_START<=which && which<UProperty.BINARY_LIMIT) {
    586                 return 1;  // maximum TRUE for all binary properties
    587             }
    588         } else if(which<UProperty.INT_LIMIT) {
    589             return intProps[which-UProperty.INT_START].getMaxValue(which);
    590         }
    591         return -1; // undefined
    592     }
    593 
    594     public final int getSource(int which) {
    595         if(which<UProperty.BINARY_START) {
    596             return SRC_NONE; /* undefined */
    597         } else if(which<UProperty.BINARY_LIMIT) {
    598             return binProps[which].getSource();
    599         } else if(which<UProperty.INT_START) {
    600             return SRC_NONE; /* undefined */
    601         } else if(which<UProperty.INT_LIMIT) {
    602             return intProps[which-UProperty.INT_START].getSource();
    603         } else if(which<UProperty.STRING_START) {
    604             switch(which) {
    605             case UProperty.GENERAL_CATEGORY_MASK:
    606             case UProperty.NUMERIC_VALUE:
    607                 return SRC_CHAR;
    608 
    609             default:
    610                 return SRC_NONE;
    611             }
    612         } else if(which<UProperty.STRING_LIMIT) {
    613             switch(which) {
    614             case UProperty.AGE:
    615                 return SRC_PROPSVEC;
    616 
    617             case UProperty.BIDI_MIRRORING_GLYPH:
    618                 return SRC_BIDI;
    619 
    620             case UProperty.CASE_FOLDING:
    621             case UProperty.LOWERCASE_MAPPING:
    622             case UProperty.SIMPLE_CASE_FOLDING:
    623             case UProperty.SIMPLE_LOWERCASE_MAPPING:
    624             case UProperty.SIMPLE_TITLECASE_MAPPING:
    625             case UProperty.SIMPLE_UPPERCASE_MAPPING:
    626             case UProperty.TITLECASE_MAPPING:
    627             case UProperty.UPPERCASE_MAPPING:
    628                 return SRC_CASE;
    629 
    630             case UProperty.ISO_COMMENT:
    631             case UProperty.NAME:
    632             case UProperty.UNICODE_1_NAME:
    633                 return SRC_NAMES;
    634 
    635             default:
    636                 return SRC_NONE;
    637             }
    638         } else {
    639             switch(which) {
    640             case UProperty.SCRIPT_EXTENSIONS:
    641                 return SRC_PROPSVEC;
    642             default:
    643                 return SRC_NONE; /* undefined */
    644             }
    645         }
    646     }
    647 
    648     /**
    649     * Forms a supplementary code point from the argument character<br>
    650     * Note this is for internal use hence no checks for the validity of the
    651     * surrogate characters are done
    652     * @param lead lead surrogate character
    653     * @param trail trailing surrogate character
    654     * @return code point of the supplementary character
    655     */
    656     public static int getRawSupplementary(char lead, char trail)
    657     {
    658         return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
    659     }
    660 
    661     /**
    662      * <p>
    663      * Unicode property names and property value names are compared
    664      * "loosely". Property[Value]Aliases.txt say:
    665      * <quote>
    666      *   "With loose matching of property names, the case distinctions,
    667      *    whitespace, and '_' are ignored."
    668      * </quote>
    669      * </p>
    670      * <p>
    671      * This function does just that, for ASCII (char *) name strings.
    672      * It is almost identical to ucnv_compareNames() but also ignores
    673      * ASCII White_Space characters (U+0009..U+000d).
    674      * </p>
    675      * @param name1 name to compare
    676      * @param name2 name to compare
    677      * @return 0 if names are equal, < 0 if name1 is less than name2 and > 0
    678      *         if name1 is greater than name2.
    679      */
    680     /* to be implemented in 2.4
    681      * public static int comparePropertyNames(String name1, String name2)
    682     {
    683         int result = 0;
    684         int i1 = 0;
    685         int i2 = 0;
    686         while (true) {
    687             char ch1 = 0;
    688             char ch2 = 0;
    689             // Ignore delimiters '-', '_', and ASCII White_Space
    690             if (i1 < name1.length()) {
    691                 ch1 = name1.charAt(i1 ++);
    692             }
    693             while (ch1 == '-' || ch1 == '_' || ch1 == ' ' || ch1 == '\t'
    694                    || ch1 == '\n' // synwee what is || ch1 == '\v'
    695                    || ch1 == '\f' || ch1=='\r') {
    696                 if (i1 < name1.length()) {
    697                     ch1 = name1.charAt(i1 ++);
    698                 }
    699                 else {
    700                     ch1 = 0;
    701                 }
    702             }
    703             if (i2 < name2.length()) {
    704                 ch2 = name2.charAt(i2 ++);
    705             }
    706             while (ch2 == '-' || ch2 == '_' || ch2 == ' ' || ch2 == '\t'
    707                    || ch2 == '\n' // synwee what is || ch1 == '\v'
    708                    || ch2 == '\f' || ch2=='\r') {
    709                 if (i2 < name2.length()) {
    710                     ch2 = name2.charAt(i2 ++);
    711                 }
    712                 else {
    713                     ch2 = 0;
    714                 }
    715             }
    716 
    717             // If we reach the ends of both strings then they match
    718             if (ch1 == 0 && ch2 == 0) {
    719                 return 0;
    720             }
    721 
    722             // Case-insensitive comparison
    723             if (ch1 != ch2) {
    724                 result = Character.toLowerCase(ch1)
    725                                                 - Character.toLowerCase(ch2);
    726                 if (result != 0) {
    727                     return result;
    728                 }
    729             }
    730         }
    731     }
    732     */
    733 
    734     /**
    735      * Get the the maximum values for some enum/int properties.
    736      * @return maximum values for the integer properties.
    737      */
    738     public int getMaxValues(int column)
    739     {
    740        // return m_maxBlockScriptValue_;
    741 
    742         switch(column) {
    743         case 0:
    744             return m_maxBlockScriptValue_;
    745         case 2:
    746             return m_maxJTGValue_;
    747         default:
    748             return 0;
    749         }
    750     }
    751 
    752     /**
    753      * Gets the type mask
    754      * @param type character type
    755      * @return mask
    756      */
    757     public static final int getMask(int type)
    758     {
    759         return 1 << type;
    760     }
    761 
    762 
    763     /**
    764      * Returns the digit values of characters like 'A' - 'Z', normal,
    765      * half-width and full-width. This method assumes that the other digit
    766      * characters are checked by the calling method.
    767      * @param ch character to test
    768      * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
    769      *         its corresponding digit will be returned.
    770      */
    771     public static int getEuropeanDigit(int ch) {
    772         if ((ch > 0x7a && ch < 0xff21)
    773             || ch < 0x41 || (ch > 0x5a && ch < 0x61)
    774             || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
    775             return -1;
    776         }
    777         if (ch <= 0x7a) {
    778             // ch >= 0x41 or ch < 0x61
    779             return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
    780         }
    781         // ch >= 0xff21
    782         if (ch <= 0xff3a) {
    783             return ch + 10 - 0xff21;
    784         }
    785         // ch >= 0xff41 && ch <= 0xff5a
    786         return ch + 10 - 0xff41;
    787     }
    788 
    789     public int digit(int c) {
    790         int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_;
    791         if(value<=9) {
    792             return value;
    793         } else {
    794             return -1;
    795         }
    796     }
    797 
    798     public int getNumericValue(int c) {
    799         // slightly pruned version of getUnicodeNumericValue(), plus getEuropeanDigit()
    800         int ntv = getNumericTypeValue(getProperty(c));
    801 
    802         if(ntv==NTV_NONE_) {
    803             return getEuropeanDigit(c);
    804         } else if(ntv<NTV_DIGIT_START_) {
    805             /* decimal digit */
    806             return ntv-NTV_DECIMAL_START_;
    807         } else if(ntv<NTV_NUMERIC_START_) {
    808             /* other digit */
    809             return ntv-NTV_DIGIT_START_;
    810         } else if(ntv<NTV_FRACTION_START_) {
    811             /* small integer */
    812             return ntv-NTV_NUMERIC_START_;
    813         } else if(ntv<NTV_LARGE_START_) {
    814             /* fraction */
    815             return -2;
    816         } else if(ntv<NTV_BASE60_START_) {
    817             /* large, single-significant-digit integer */
    818             int mant=(ntv>>5)-14;
    819             int exp=(ntv&0x1f)+2;
    820             if(exp<9 || (exp==9 && mant<=2)) {
    821                 int numValue=mant;
    822                 do {
    823                     numValue*=10;
    824                 } while(--exp>0);
    825                 return numValue;
    826             } else {
    827                 return -2;
    828             }
    829         } else if(ntv<NTV_RESERVED_START_) {
    830             /* sexagesimal (base 60) integer */
    831             int numValue=(ntv>>2)-0xbf;
    832             int exp=(ntv&3)+1;
    833 
    834             switch(exp) {
    835             case 4:
    836                 numValue*=60*60*60*60;
    837                 break;
    838             case 3:
    839                 numValue*=60*60*60;
    840                 break;
    841             case 2:
    842                 numValue*=60*60;
    843                 break;
    844             case 1:
    845                 numValue*=60;
    846                 break;
    847             case 0:
    848             default:
    849                 break;
    850             }
    851 
    852             return numValue;
    853         } else {
    854             /* reserved */
    855             return -2;
    856         }
    857     }
    858 
    859     public double getUnicodeNumericValue(int c) {
    860         // equivalent to c version double u_getNumericValue(UChar32 c)
    861         int ntv = getNumericTypeValue(getProperty(c));
    862 
    863         if(ntv==NTV_NONE_) {
    864             return UCharacter.NO_NUMERIC_VALUE;
    865         } else if(ntv<NTV_DIGIT_START_) {
    866             /* decimal digit */
    867             return ntv-NTV_DECIMAL_START_;
    868         } else if(ntv<NTV_NUMERIC_START_) {
    869             /* other digit */
    870             return ntv-NTV_DIGIT_START_;
    871         } else if(ntv<NTV_FRACTION_START_) {
    872             /* small integer */
    873             return ntv-NTV_NUMERIC_START_;
    874         } else if(ntv<NTV_LARGE_START_) {
    875             /* fraction */
    876             int numerator=(ntv>>4)-12;
    877             int denominator=(ntv&0xf)+1;
    878             return (double)numerator/denominator;
    879         } else if(ntv<NTV_BASE60_START_) {
    880             /* large, single-significant-digit integer */
    881             double numValue;
    882             int mant=(ntv>>5)-14;
    883             int exp=(ntv&0x1f)+2;
    884             numValue=mant;
    885 
    886             /* multiply by 10^exp without math.h */
    887             while(exp>=4) {
    888                 numValue*=10000.;
    889                 exp-=4;
    890             }
    891             switch(exp) {
    892             case 3:
    893                 numValue*=1000.;
    894                 break;
    895             case 2:
    896                 numValue*=100.;
    897                 break;
    898             case 1:
    899                 numValue*=10.;
    900                 break;
    901             case 0:
    902             default:
    903                 break;
    904             }
    905 
    906             return numValue;
    907         } else if(ntv<NTV_RESERVED_START_) {
    908             /* sexagesimal (base 60) integer */
    909             int numValue=(ntv>>2)-0xbf;
    910             int exp=(ntv&3)+1;
    911 
    912             switch(exp) {
    913             case 4:
    914                 numValue*=60*60*60*60;
    915                 break;
    916             case 3:
    917                 numValue*=60*60*60;
    918                 break;
    919             case 2:
    920                 numValue*=60*60;
    921                 break;
    922             case 1:
    923                 numValue*=60;
    924                 break;
    925             case 0:
    926             default:
    927                 break;
    928             }
    929 
    930             return numValue;
    931         } else {
    932             /* reserved */
    933             return UCharacter.NO_NUMERIC_VALUE;
    934         }
    935     }
    936 
    937     // protected variables -----------------------------------------------
    938 
    939     /**
    940      * Extra property trie
    941      */
    942     Trie2_16 m_additionalTrie_;
    943     /**
    944      * Extra property vectors, 1st column for age and second for binary
    945      * properties.
    946      */
    947     int m_additionalVectors_[];
    948     /**
    949      * Number of additional columns
    950      */
    951     int m_additionalColumnsCount_;
    952     /**
    953      * Maximum values for block, bits used as in vector word
    954      * 0
    955      */
    956     int m_maxBlockScriptValue_;
    /**
     * Maximum values word for column 2, as returned by getMaxValues(2).
     * NOTE(review): presumably uses the bit layout of vector word 2 - confirm.
     */
    961      int m_maxJTGValue_;
    962 
    963     /**
    964      * Script_Extensions data
    965      */
    966     public char[] m_scriptExtensions_;
    967 
    968     // private variables -------------------------------------------------
    969 
    970     /**
    971     * Default name of the datafile
    972     */
    973     private static final String DATA_FILE_NAME_ = "uprops.icu";
    974 
    975     /**
    976     * Shift value for lead surrogate to form a supplementary character.
    977     */
    978     private static final int LEAD_SURROGATE_SHIFT_ = 10;
    979     /**
    980     * Offset to add to combined surrogate pair to avoid masking.
    981     */
    982     private static final int SURROGATE_OFFSET_ =
    983                            UTF16.SUPPLEMENTARY_MIN_VALUE -
    984                            (UTF16.SURROGATE_MIN_VALUE <<
    985                            LEAD_SURROGATE_SHIFT_) -
    986                            UTF16.TRAIL_SURROGATE_MIN_VALUE;
    987 
    988 
    989     // property data constants -------------------------------------------------
    990 
    991     /**
    992      * Numeric types and values in the main properties words.
    993      */
    994     private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;
    995     private static final int getNumericTypeValue(int props) {
    996         return props >> NUMERIC_TYPE_VALUE_SHIFT_;
    997     }
    998     /* constants for the storage form of numeric types and values */
    999     /** No numeric value. */
   1000     private static final int NTV_NONE_ = 0;
   1001     /** Decimal digits: nv=0..9 */
   1002     private static final int NTV_DECIMAL_START_ = 1;
   1003     /** Other digits: nv=0..9 */
   1004     private static final int NTV_DIGIT_START_ = 11;
   1005     /** Small integers: nv=0..154 */
   1006     private static final int NTV_NUMERIC_START_ = 21;
   1007     /** Fractions: ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 */
   1008     private static final int NTV_FRACTION_START_ = 0xb0;
   1009     /**
   1010      * Large integers:
   1011      * ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33)
   1012      * (only one significant decimal digit)
   1013      */
   1014     private static final int NTV_LARGE_START_ = 0x1e0;
   1015     /**
   1016      * Sexagesimal numbers:
   1017      * ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4)
   1018      */
   1019     private static final int NTV_BASE60_START_=0x300;
   1020     /** No numeric value (yet). */
   1021     private static final int NTV_RESERVED_START_ = NTV_BASE60_START_ + 36;  // 0x300+9*4=0x324
   1022 
   1023     private static final int ntvGetType(int ntv) {
   1024         return
   1025             (ntv==NTV_NONE_) ? NumericType.NONE :
   1026             (ntv<NTV_DIGIT_START_) ?  NumericType.DECIMAL :
   1027             (ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT :
   1028             NumericType.NUMERIC;
   1029     }
   1030 
   1031     /*
   1032      * Properties in vector word 0
   1033      * Bits
   1034      * 31..24   DerivedAge version major/minor one nibble each
   1035      * 23..22   3..1: Bits 7..0 = Script_Extensions index
   1036      *             3: Script value from Script_Extensions
   1037      *             2: Script=Inherited
   1038      *             1: Script=Common
   1039      *             0: Script=bits 7..0
   1040      * 21..20   reserved
   1041      * 19..17   East Asian Width
   1042      * 16.. 8   UBlockCode
   1043      *  7.. 0   UScriptCode
   1044      */
   1045 
   1046     /**
   1047      * Script_Extensions: mask includes Script
   1048      */
   1049     public static final int SCRIPT_X_MASK = 0x00c000ff;
   1050     //private static final int SCRIPT_X_SHIFT = 22;
   1051     /**
   1052      * Integer properties mask and shift values for East Asian cell width.
   1053      * Equivalent to icu4c UPROPS_EA_MASK
   1054      */
   1055     private static final int EAST_ASIAN_MASK_ = 0x000e0000;
   1056     /**
   1057      * Integer properties mask and shift values for East Asian cell width.
   1058      * Equivalent to icu4c UPROPS_EA_SHIFT
   1059      */
   1060     private static final int EAST_ASIAN_SHIFT_ = 17;
   1061     /**
   1062      * Integer properties mask and shift values for blocks.
   1063      * Equivalent to icu4c UPROPS_BLOCK_MASK
   1064      */
   1065     private static final int BLOCK_MASK_ = 0x0001ff00;
   1066     /**
   1067      * Integer properties mask and shift values for blocks.
   1068      * Equivalent to icu4c UPROPS_BLOCK_SHIFT
   1069      */
   1070     private static final int BLOCK_SHIFT_ = 8;
   1071     /**
   1072      * Integer properties mask and shift values for scripts.
   1073      * Equivalent to icu4c UPROPS_SHIFT_MASK
   1074      */
   1075     public static final int SCRIPT_MASK_ = 0x000000ff;
   1076 
   1077     /* SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions. */
   1078     public static final int SCRIPT_X_WITH_COMMON = 0x400000;
   1079     public static final int SCRIPT_X_WITH_INHERITED = 0x800000;
   1080     public static final int SCRIPT_X_WITH_OTHER = 0xc00000;
   1081 
   1082     /**
   1083      * Additional properties used in internal trie data
   1084      */
   1085     /*
   1086      * Properties in vector word 1
   1087      * Each bit encodes one binary property.
   1088      * The following constants represent the bit number, use 1<<UPROPS_XYZ.
   1089      * UPROPS_BINARY_1_TOP<=32!
   1090      *
   1091      * Keep this list of property enums in sync with
   1092      * propListNames[] in icu/source/tools/genprops/props2.c!
   1093      *
   1094      * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
   1095      */
   1096     private static final int WHITE_SPACE_PROPERTY_ = 0;
   1097     private static final int DASH_PROPERTY_ = 1;
   1098     private static final int HYPHEN_PROPERTY_ = 2;
   1099     private static final int QUOTATION_MARK_PROPERTY_ = 3;
   1100     private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;
   1101     private static final int MATH_PROPERTY_ = 5;
   1102     private static final int HEX_DIGIT_PROPERTY_ = 6;
   1103     private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;
   1104     private static final int ALPHABETIC_PROPERTY_ = 8;
   1105     private static final int IDEOGRAPHIC_PROPERTY_ = 9;
   1106     private static final int DIACRITIC_PROPERTY_ = 10;
   1107     private static final int EXTENDER_PROPERTY_ = 11;
   1108     private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;
   1109     private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;
   1110     private static final int GRAPHEME_LINK_PROPERTY_ = 14;
   1111     private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;
   1112     private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16;
   1113     private static final int RADICAL_PROPERTY_ = 17;
   1114     private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;
   1115     private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;
   1116     private static final int DEPRECATED_PROPERTY_ = 20;
   1117     private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;
   1118     private static final int XID_START_PROPERTY_ = 22;
   1119     private static final int XID_CONTINUE_PROPERTY_ = 23;
   1120     private static final int ID_START_PROPERTY_    = 24;
   1121     private static final int ID_CONTINUE_PROPERTY_ = 25;
   1122     private static final int GRAPHEME_BASE_PROPERTY_ = 26;
   1123     private static final int S_TERM_PROPERTY_ = 27;
   1124     private static final int VARIATION_SELECTOR_PROPERTY_ = 28;
   1125     private static final int PATTERN_SYNTAX = 29;                   /* new in ICU 3.4 and Unicode 4.1 */
   1126     private static final int PATTERN_WHITE_SPACE = 30;
   1127 
   1128     /*
   1129      * Properties in vector word 2
   1130      * Bits
   1131      * 31..26   reserved
   1132      * 25..20   Line Break
   1133      * 19..15   Sentence Break
   1134      * 14..10   Word Break
   1135      *  9.. 5   Grapheme Cluster Break
   1136      *  4.. 0   Decomposition Type
   1137      */
   1138     private static final int LB_MASK          = 0x03f00000;
   1139     private static final int LB_SHIFT         = 20;
   1140 
   1141     private static final int SB_MASK          = 0x000f8000;
   1142     private static final int SB_SHIFT         = 15;
   1143 
   1144     private static final int WB_MASK          = 0x00007c00;
   1145     private static final int WB_SHIFT         = 10;
   1146 
   1147     private static final int GCB_MASK         = 0x000003e0;
   1148     private static final int GCB_SHIFT        = 5;
   1149 
   1150     /**
   1151      * Integer properties mask for decomposition type.
   1152      * Equivalent to icu4c UPROPS_DT_MASK.
   1153      */
   1154     private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;
   1155 
   1156     /**
   1157      * First nibble shift
   1158      */
   1159     private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
   1160     /**
   1161      * Second nibble mask
   1162      */
   1163     private static final int LAST_NIBBLE_MASK_ = 0xF;
   1164     /**
   1165      * Age value shift
   1166      */
   1167     private static final int AGE_SHIFT_ = 24;
   1168 
   1169 
   1170     // private constructors --------------------------------------------------
   1171 
   1172     /**
   1173      * Constructor
   1174      * @exception IOException thrown when data reading fails or data corrupted
   1175      */
   1176     private UCharacterProperty() throws IOException
   1177     {
   1178         // consistency check
   1179         if(binProps.length!=UProperty.BINARY_LIMIT) {
   1180             throw new ICUException("binProps.length!=UProperty.BINARY_LIMIT");
   1181         }
   1182         if(intProps.length!=(UProperty.INT_LIMIT-UProperty.INT_START)) {
   1183             throw new ICUException("intProps.length!=(UProperty.INT_LIMIT-UProperty.INT_START)");
   1184         }
   1185 
   1186         // jar access
   1187         ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_);
   1188         m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable());
   1189         // Read or skip the 16 indexes.
   1190         int propertyOffset = bytes.getInt();
   1191         /* exceptionOffset = */ bytes.getInt();
   1192         /* caseOffset = */ bytes.getInt();
   1193         int additionalOffset = bytes.getInt();
   1194         int additionalVectorsOffset = bytes.getInt();
   1195         m_additionalColumnsCount_ = bytes.getInt();
   1196         int scriptExtensionsOffset = bytes.getInt();
   1197         int reservedOffset7 = bytes.getInt();
   1198         /* reservedOffset8 = */ bytes.getInt();
   1199         /* dataTopOffset = */ bytes.getInt();
   1200         m_maxBlockScriptValue_ = bytes.getInt();
   1201         m_maxJTGValue_ = bytes.getInt();
   1202         ICUBinary.skipBytes(bytes, (16 - 12) << 2);
   1203 
   1204         // read the main properties trie
   1205         m_trie_ = Trie2_16.createFromSerialized(bytes);
   1206         int expectedTrieLength = (propertyOffset - 16) * 4;
   1207         int trieLength = m_trie_.getSerializedLength();
   1208         if(trieLength > expectedTrieLength) {
   1209             throw new IOException("uprops.icu: not enough bytes for main trie");
   1210         }
   1211         // skip padding after trie bytes
   1212         ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
   1213 
   1214         // skip unused intervening data structures
   1215         ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4);
   1216 
   1217         if(m_additionalColumnsCount_ > 0) {
   1218             // reads the additional property block
   1219             m_additionalTrie_ = Trie2_16.createFromSerialized(bytes);
   1220             expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4;
   1221             trieLength = m_additionalTrie_.getSerializedLength();
   1222             if(trieLength > expectedTrieLength) {
   1223                 throw new IOException("uprops.icu: not enough bytes for additional-properties trie");
   1224             }
   1225             // skip padding after trie bytes
   1226             ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
   1227 
   1228             // additional properties
   1229             int size = scriptExtensionsOffset - additionalVectorsOffset;
   1230             m_additionalVectors_ = new int[size];
   1231             for (int i = 0; i < size; i ++) {
   1232                 m_additionalVectors_[i] = bytes.getInt();
   1233             }
   1234         }
   1235 
   1236         // Script_Extensions
   1237         int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;
   1238         if(numChars > 0) {
   1239             m_scriptExtensions_ = new char[numChars];
   1240             for(int i = 0; i < numChars; ++i) {
   1241                 m_scriptExtensions_[i] = bytes.getChar();
   1242             }
   1243         }
   1244     }
   1245 
   1246     private static final class IsAcceptable implements ICUBinary.Authenticate {
   1247         // @Override when we switch to Java 6
   1248         public boolean isDataVersionAcceptable(byte version[]) {
   1249             return version[0] == 7;
   1250         }
   1251     }
   1252     private static final int DATA_FORMAT = 0x5550726F;  // "UPro"
   1253 
   1254     // private methods -------------------------------------------------------
   1255 
   1256     /*
   1257      * Compare additional properties to see if it has argument type
   1258      * @param property 32 bit properties
   1259      * @param type character type
   1260      * @return true if property has type
   1261      */
   1262     /*private boolean compareAdditionalType(int property, int type)
   1263     {
   1264         return (property & (1 << type)) != 0;
   1265     }*/
   1266 
   1267     // property starts for UnicodeSet -------------------------------------- ***
   1268 
   1269     private static final int TAB     = 0x0009;
   1270     //private static final int LF      = 0x000a;
   1271     //private static final int FF      = 0x000c;
   1272     private static final int CR      = 0x000d;
   1273     private static final int U_A     = 0x0041;
   1274     private static final int U_F     = 0x0046;
   1275     private static final int U_Z     = 0x005a;
   1276     private static final int U_a     = 0x0061;
   1277     private static final int U_f     = 0x0066;
   1278     private static final int U_z     = 0x007a;
   1279     private static final int DEL     = 0x007f;
   1280     private static final int NL      = 0x0085;
   1281     private static final int NBSP    = 0x00a0;
   1282     private static final int CGJ     = 0x034f;
   1283     private static final int FIGURESP= 0x2007;
   1284     private static final int HAIRSP  = 0x200a;
   1285     //private static final int ZWNJ    = 0x200c;
   1286     //private static final int ZWJ     = 0x200d;
   1287     private static final int RLM     = 0x200f;
   1288     private static final int NNBSP   = 0x202f;
   1289     private static final int WJ      = 0x2060;
   1290     private static final int INHSWAP = 0x206a;
   1291     private static final int NOMDIG  = 0x206f;
   1292     private static final int U_FW_A  = 0xff21;
   1293     private static final int U_FW_F  = 0xff26;
   1294     private static final int U_FW_Z  = 0xff3a;
   1295     private static final int U_FW_a  = 0xff41;
   1296     private static final int U_FW_f  = 0xff46;
   1297     private static final int U_FW_z  = 0xff5a;
   1298     private static final int ZWNBSP  = 0xfeff;
   1299 
   1300     public UnicodeSet addPropertyStarts(UnicodeSet set) {
   1301         /* add the start code point of each same-value range of the main trie */
   1302         Iterator<Trie2.Range> trieIterator = m_trie_.iterator();
   1303         Trie2.Range range;
   1304         while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
   1305             set.add(range.startCodePoint);
   1306         }
   1307 
   1308         /* add code points with hardcoded properties, plus the ones following them */
   1309 
   1310         /* add for u_isblank() */
   1311         set.add(TAB);
   1312         set.add(TAB+1);
   1313 
   1314         /* add for IS_THAT_CONTROL_SPACE() */
   1315         set.add(CR+1); /* range TAB..CR */
   1316         set.add(0x1c);
   1317         set.add(0x1f+1);
   1318         set.add(NL);
   1319         set.add(NL+1);
   1320 
   1321         /* add for u_isIDIgnorable() what was not added above */
   1322         set.add(DEL); /* range DEL..NBSP-1, NBSP added below */
   1323         set.add(HAIRSP);
   1324         set.add(RLM+1);
   1325         set.add(INHSWAP);
   1326         set.add(NOMDIG+1);
   1327         set.add(ZWNBSP);
   1328         set.add(ZWNBSP+1);
   1329 
   1330         /* add no-break spaces for u_isWhitespace() what was not added above */
   1331         set.add(NBSP);
   1332         set.add(NBSP+1);
   1333         set.add(FIGURESP);
   1334         set.add(FIGURESP+1);
   1335         set.add(NNBSP);
   1336         set.add(NNBSP+1);
   1337 
   1338         /* add for u_charDigitValue() */
   1339         // TODO remove when UCharacter.getHanNumericValue() is changed to just return
   1340         // Unicode numeric values
   1341         set.add(0x3007);
   1342         set.add(0x3008);
   1343         set.add(0x4e00);
   1344         set.add(0x4e01);
   1345         set.add(0x4e8c);
   1346         set.add(0x4e8d);
   1347         set.add(0x4e09);
   1348         set.add(0x4e0a);
   1349         set.add(0x56db);
   1350         set.add(0x56dc);
   1351         set.add(0x4e94);
   1352         set.add(0x4e95);
   1353         set.add(0x516d);
   1354         set.add(0x516e);
   1355         set.add(0x4e03);
   1356         set.add(0x4e04);
   1357         set.add(0x516b);
   1358         set.add(0x516c);
   1359         set.add(0x4e5d);
   1360         set.add(0x4e5e);
   1361 
   1362         /* add for u_digit() */
   1363         set.add(U_a);
   1364         set.add(U_z+1);
   1365         set.add(U_A);
   1366         set.add(U_Z+1);
   1367         set.add(U_FW_a);
   1368         set.add(U_FW_z+1);
   1369         set.add(U_FW_A);
   1370         set.add(U_FW_Z+1);
   1371 
   1372         /* add for u_isxdigit() */
   1373         set.add(U_f+1);
   1374         set.add(U_F+1);
   1375         set.add(U_FW_f+1);
   1376         set.add(U_FW_F+1);
   1377 
   1378         /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */
   1379         set.add(WJ); /* range WJ..NOMDIG */
   1380         set.add(0xfff0);
   1381         set.add(0xfffb+1);
   1382         set.add(0xe0000);
   1383         set.add(0xe0fff+1);
   1384 
   1385         /* add for UCHAR_GRAPHEME_BASE and others */
   1386         set.add(CGJ);
   1387         set.add(CGJ+1);
   1388 
   1389         return set; // for chaining
   1390     }
   1391 
   1392     public void upropsvec_addPropertyStarts(UnicodeSet set) {
   1393         /* add the start code point of each same-value range of the properties vectors trie */
   1394         if(m_additionalColumnsCount_>0) {
   1395             /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
   1396             Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator();
   1397             Trie2.Range range;
   1398             while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
   1399                 set.add(range.startCodePoint);
   1400             }
   1401         }
   1402     }
   1403 
   1404     // This static initializer block must be placed after
   1405     // other static member initialization
   1406     static {
   1407         try {
   1408             INSTANCE = new UCharacterProperty();
   1409         }
   1410         catch (IOException e) {
   1411             throw new MissingResourceException(e.getMessage(),"","");
   1412         }
   1413     }
   1414 
   1415 /*----------------------------------------------------------------
   1416  * Inclusions list
   1417  *----------------------------------------------------------------*/
   1418 
   1419     /*
   1420      * Return a set of characters for property enumeration.
   1421      * The set implicitly contains 0x110000 as well, which is one more than the highest
   1422      * Unicode code point.
   1423      *
   1424      * This set is used as an ordered list - its code points are ordered, and
   1425      * consecutive code points (in Unicode code point order) in the set define a range.
   1426      * For each two consecutive characters (start, limit) in the set,
   1427      * all of the UCD/normalization and related properties for
   1428      * all code points start..limit-1 are all the same,
   1429      * except for character names and ISO comments.
   1430      *
   1431      * All Unicode code points U+0000..U+10ffff are covered by these ranges.
   1432      * The ranges define a partition of the Unicode code space.
   1433      * ICU uses the inclusions set to enumerate properties for generating
   1434      * UnicodeSets containing all code points that have a certain property value.
   1435      *
   1436      * The Inclusion List is generated from the UCD. It is generated
   1437      * by enumerating the data tries, and code points for hardcoded properties
   1438      * are added as well.
   1439      *
   1440      * --------------------------------------------------------------------------
   1441      *
   1442      * The following are ideas for getting properties-unique code point ranges,
   1443      * with possible optimizations beyond the current implementation.
   1444      * These optimizations would require more code and be more fragile.
   1445      * The current implementation generates one single list (set) for all properties.
   1446      *
   1447      * To enumerate properties efficiently, one needs to know ranges of
   1448      * repetitive values, so that the value of only each start code point
   1449      * can be applied to the whole range.
   1450      * This information is in principle available in the uprops.icu/unorm.icu data.
   1451      *
   1452      * There are two obstacles:
   1453      *
   1454      * 1. Some properties are computed from multiple data structures,
   1455      *    making it necessary to get repetitive ranges by intersecting
   1456      *    ranges from multiple tries.
   1457      *
   1458      * 2. It is not economical to write code for getting repetitive ranges
   1459      *    that are precise for each of some 50 properties.
   1460      *
   1461      * Compromise ideas:
   1462      *
   1463      * - Get ranges per trie, not per individual property.
   1464      *   Each range contains the same values for a whole group of properties.
   1465      *   This would generate currently five range sets, two for uprops.icu tries
   1466      *   and three for unorm.icu tries.
   1467      *
   1468      * - Combine sets of ranges for multiple tries to get sufficient sets
   1469      *   for properties, e.g., the uprops.icu main and auxiliary tries
   1470      *   for all non-normalization properties.
   1471      *
   1472      * Ideas for representing ranges and combining them:
   1473      *
   1474      * - A UnicodeSet could hold just the start code points of ranges.
   1475      *   Multiple sets are easily combined by or-ing them together.
   1476      *
   1477      * - Alternatively, a UnicodeSet could hold each even-numbered range.
   1478      *   All ranges could be enumerated by using each start code point
   1479      *   (for the even-numbered ranges) as well as each limit (end+1) code point
   1480      *   (for the odd-numbered ranges).
   1481      *   It should be possible to combine two such sets by xor-ing them,
   1482      *   but no more than two.
   1483      *
   1484      * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays,
   1485      * but the first one is certainly simpler and applicable for combining more than
   1486      * two range sets.
   1487      *
   1488      * It is possible to combine all range sets for all uprops/unorm tries into one
   1489      * set that can be used for all properties.
   1490      * As an optimization, there could be less-combined range sets for certain
   1491      * groups of properties.
   1492      * The relationship of which less-combined range set to use for which property
   1493      * depends on the implementation of the properties and must be hardcoded
   1494      * - somewhat error-prone and higher maintenance but can be tested easily
   1495      * by building property sets "the simple way" in test code.
   1496      *
   1497      * ---
   1498      *
   1499      * Do not use a UnicodeSet pattern because that causes infinite recursion;
   1500      * UnicodeSet depends on the inclusions set.
   1501      *
   1502      * ---
   1503      *
   1504      * getInclusions() is commented out starting 2005-feb-12 because
   1505      * UnicodeSet now calls the uxyz_addPropertyStarts() directly,
   1506      * and only for the relevant property source.
   1507      */
   1508     /*
   1509     public UnicodeSet getInclusions() {
   1510         UnicodeSet set = new UnicodeSet();
   1511         NormalizerImpl.addPropertyStarts(set);
   1512         addPropertyStarts(set);
   1513         return set;
   1514     }
   1515     */
   1516 }
   1517