Home | History | Annotate | Download | only in impl
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 1996-2016, International Business Machines Corporation and
      6  * others. All Rights Reserved.
      7  *******************************************************************************
      8  */
      9 
     10 package com.ibm.icu.impl;
     11 
     12 import java.io.IOException;
     13 import java.nio.ByteBuffer;
     14 import java.util.Iterator;
     15 import java.util.MissingResourceException;
     16 
     17 import com.ibm.icu.lang.UCharacter;
     18 import com.ibm.icu.lang.UCharacter.HangulSyllableType;
     19 import com.ibm.icu.lang.UCharacter.NumericType;
     20 import com.ibm.icu.lang.UCharacterCategory;
     21 import com.ibm.icu.lang.UProperty;
     22 import com.ibm.icu.lang.UScript;
     23 import com.ibm.icu.text.Normalizer2;
     24 import com.ibm.icu.text.UTF16;
     25 import com.ibm.icu.text.UnicodeSet;
     26 import com.ibm.icu.util.ICUException;
     27 import com.ibm.icu.util.VersionInfo;
     28 
     29 /**
     30 * <p>Internal class used for Unicode character property database.</p>
     31 * <p>This class stores binary data read from uprops.icu.
     32 * It does not have the capability to parse the data into more high-level
     33 * information. It only returns bytes of information when required.</p>
     34 * <p>Due to the form most commonly used for retrieval, array of char is used
     35 * to store the binary data.</p>
     36 * <p>UCharacterPropertyDB also contains information on accessing indexes to
     37 * significant points in the binary data.</p>
     38 * <p>Responsibility for molding the binary data into a more meaningful form lies on
     39 * <a href=UCharacter.html>UCharacter</a>.</p>
     40 * @author Syn Wee Quek
     41 * @since release 2.1, february 1st 2002
     42 */
     43 
     44 public final class UCharacterProperty
     45 {
     46     // public data members -----------------------------------------------
     47 
    /*
     * Public singleton instance.
     * Declared as a blank static final, so it is assigned by a static
     * initializer elsewhere in this class.
     */
    public static final UCharacterProperty INSTANCE;

    /**
    * Main trie data; getProperty(int) returns m_trie_.get(ch).
    */
    public Trie2_16 m_trie_;
    /**
    * Unicode version
    */
    public VersionInfo m_unicodeVersion_;
    /**
    * Latin capital letter i with dot above (U+0130)
    */
    public static final char LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_ = 0x130;
    /**
    * Latin small letter dotless i (U+0131)
    */
    public static final char LATIN_SMALL_LETTER_DOTLESS_I_ = 0x131;
    /**
    * Latin small letter i (U+0069)
    */
    public static final char LATIN_SMALL_LETTER_I_ = 0x69;
    /**
    * Character type mask (the low five bits); getType(int) ANDs the main
    * property value with this mask.
    */
    public static final int TYPE_MASK = 0x1F;
     77 
    // uprops.h enum UPropertySource --------------------------------------- ***
    // These are the values returned by getSource(int). For table-driven
    // properties the value comes from the entry's getSource(): the stored
    // source when the entry has mask==0, otherwise SRC_PROPSVEC.

    /** No source, not a supported property. */
    public static final int SRC_NONE=0;
    /** From uchar.c/uprops.icu main trie */
    public static final int SRC_CHAR=1;
    /** From uchar.c/uprops.icu properties vectors trie */
    public static final int SRC_PROPSVEC=2;
    /** From unames.c/unames.icu */
    public static final int SRC_NAMES=3;
    /** From ucase.c/ucase.icu */
    public static final int SRC_CASE=4;
    /** From ubidi_props.c/ubidi.icu */
    public static final int SRC_BIDI=5;
    /** From uchar.c/uprops.icu main trie as well as properties vectors trie */
    public static final int SRC_CHAR_AND_PROPSVEC=6;
    /** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */
    public static final int SRC_CASE_AND_NORM=7;
    /** From normalizer2impl.cpp/nfc.nrm */
    public static final int SRC_NFC=8;
    /** From normalizer2impl.cpp/nfkc.nrm */
    public static final int SRC_NFKC=9;
    /** From normalizer2impl.cpp/nfkc_cf.nrm */
    public static final int SRC_NFKC_CF=10;
    /** From normalizer2impl.cpp/nfc.nrm canonical iterator data */
    public static final int SRC_NFC_CANON_ITER=11;
    /** One more than the highest UPropertySource (SRC_) constant. */
    public static final int SRC_COUNT=12;
    106 
    107     // public methods ----------------------------------------------------
    108 
    109     /**
    110     * Gets the main property value for code point ch.
    111     * @param ch code point whose property value is to be retrieved
    112     * @return property value of code point
    113     */
    114     public final int getProperty(int ch)
    115     {
    116         return m_trie_.get(ch);
    117     }
    118 
    119     /**
    120      * Gets the unicode additional properties.
    121      * Java version of C u_getUnicodeProperties().
    122      * @param codepoint codepoint whose additional properties is to be
    123      *                  retrieved
    124      * @param column The column index.
    125      * @return unicode properties
    126      */
    127     public int getAdditional(int codepoint, int column) {
    128         assert column >= 0;
    129         if (column >= m_additionalColumnsCount_) {
    130             return 0;
    131         }
    132         return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column];
    133     }
    134 
    /**
     * TYPE_MASK intersected with the union of the single-bit values
     * (1&lt;&lt;category) of the five letter categories: uppercase, lowercase,
     * titlecase, modifier and other letter.
     * NOTE(review): no use of this constant is visible in this part of the
     * file -- confirm whether it is still needed.
     */
    static final int MY_MASK = UCharacterProperty.TYPE_MASK
        & ((1<<UCharacterCategory.UPPERCASE_LETTER) |
            (1<<UCharacterCategory.LOWERCASE_LETTER) |
            (1<<UCharacterCategory.TITLECASE_LETTER) |
            (1<<UCharacterCategory.MODIFIER_LETTER) |
            (1<<UCharacterCategory.OTHER_LETTER));
    141 
    142 
    143        /**
    144      * <p>Get the "age" of the code point.</p>
    145      * <p>The "age" is the Unicode version when the code point was first
    146      * designated (as a non-character or for Private Use) or assigned a
    147      * character.</p>
    148      * <p>This can be useful to avoid emitting code points to receiving
    149      * processes that do not accept newer characters.</p>
    150      * <p>The data is from the UCD file DerivedAge.txt.</p>
    151      * <p>This API does not check the validity of the codepoint.</p>
    152      * @param codepoint The code point.
    153      * @return the Unicode version number
    154      */
    155     public VersionInfo getAge(int codepoint)
    156     {
    157         int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
    158         return VersionInfo.getInstance(
    159                            (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
    160                            version & LAST_NIBBLE_MASK_, 0, 0);
    161     }
    162 
    // General-category masks, each produced by getMask() (defined elsewhere in
    // this class) for one UCharacter category constant. isgraphPOSIX() ORs
    // them together to test a code point's category against several
    // categories at once.
    private static final int GC_CN_MASK = getMask(UCharacter.UNASSIGNED);
    private static final int GC_CC_MASK = getMask(UCharacter.CONTROL);
    private static final int GC_CS_MASK = getMask(UCharacter.SURROGATE);
    private static final int GC_ZS_MASK = getMask(UCharacter.SPACE_SEPARATOR);
    private static final int GC_ZL_MASK = getMask(UCharacter.LINE_SEPARATOR);
    private static final int GC_ZP_MASK = getMask(UCharacter.PARAGRAPH_SEPARATOR);
    /** Mask constant for multiple UCharCategory bits (Z Separators). */
    private static final int GC_Z_MASK = GC_ZS_MASK|GC_ZL_MASK|GC_ZP_MASK;
    171 
    172     /**
    173      * Checks if c is in
    174      * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
    175      * with space=\p{Whitespace} and Control=Cc.
    176      * Implements UCHAR_POSIX_GRAPH.
    177      * @internal
    178      */
    179     private static final boolean isgraphPOSIX(int c) {
    180         /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */
    181         /* comparing ==0 returns FALSE for the categories mentioned */
    182         return (getMask(UCharacter.getType(c))&
    183                 (GC_CC_MASK|GC_CS_MASK|GC_CN_MASK|GC_Z_MASK))
    184                ==0;
    185     }
    186 
    187     // binary properties --------------------------------------------------- ***
    188 
    189     private class BinaryProperty {
    190         int column;  // SRC_PROPSVEC column, or "source" if mask==0
    191         int mask;
    192         BinaryProperty(int column, int mask) {
    193             this.column=column;
    194             this.mask=mask;
    195         }
    196         BinaryProperty(int source) {
    197             this.column=source;
    198             this.mask=0;
    199         }
    200         final int getSource() {
    201             return mask==0 ? column : SRC_PROPSVEC;
    202         }
    203         boolean contains(int c) {
    204             // systematic, directly stored properties
    205             return (getAdditional(c, column)&mask)!=0;
    206         }
    207     }
    208 
    209     private class CaseBinaryProperty extends BinaryProperty {  // case mapping properties
    210         int which;
    211         CaseBinaryProperty(int which) {
    212             super(SRC_CASE);
    213             this.which=which;
    214         }
    215         @Override
    216         boolean contains(int c) {
    217             return UCaseProps.INSTANCE.hasBinaryProperty(c, which);
    218         }
    219     }
    220 
    221     private class NormInertBinaryProperty extends BinaryProperty {  // UCHAR_NF*_INERT properties
    222         int which;
    223         NormInertBinaryProperty(int source, int which) {
    224             super(source);
    225             this.which=which;
    226         }
    227         @Override
    228         boolean contains(int c) {
    229             return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_INERT).isInert(c);
    230         }
    231     }
    232 
    /**
     * Table of binary-property implementations. It is indexed directly by the
     * property selector: hasBinaryProperty(), getIntPropertyValue() and
     * getSource() all read binProps[which].
     * Two-argument entries read a bit from an additional-properties column
     * (column, mask); one-argument entries record only their source and
     * override contains().
     */
    BinaryProperty[] binProps={
        /*
         * Binary-property implementations must be in order of corresponding UProperty,
         * and there must be exactly one entry per binary UProperty.
         */
        new BinaryProperty(1, (1<<ALPHABETIC_PROPERTY_)),
        new BinaryProperty(1, (1<<ASCII_HEX_DIGIT_PROPERTY_)),
        new BinaryProperty(SRC_BIDI) {  // UCHAR_BIDI_CONTROL
            @Override
            boolean contains(int c) {
                return UBiDiProps.INSTANCE.isBidiControl(c);
            }
        },
        new BinaryProperty(SRC_BIDI) {  // UCHAR_BIDI_MIRRORED
            @Override
            boolean contains(int c) {
                return UBiDiProps.INSTANCE.isMirrored(c);
            }
        },
        new BinaryProperty(1, (1<<DASH_PROPERTY_)),
        new BinaryProperty(1, (1<<DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_)),
        new BinaryProperty(1, (1<<DEPRECATED_PROPERTY_)),
        new BinaryProperty(1, (1<<DIACRITIC_PROPERTY_)),
        new BinaryProperty(1, (1<<EXTENDER_PROPERTY_)),
        new BinaryProperty(SRC_NFC) {  // UCHAR_FULL_COMPOSITION_EXCLUSION
            @Override
            boolean contains(int c) {
                // By definition, Full_Composition_Exclusion is the same as NFC_QC=No.
                Normalizer2Impl impl=Norm2AllModes.getNFCInstance().impl;
                return impl.isCompNo(impl.getNorm16(c));
            }
        },
        new BinaryProperty(1, (1<<GRAPHEME_BASE_PROPERTY_)),
        new BinaryProperty(1, (1<<GRAPHEME_EXTEND_PROPERTY_)),
        new BinaryProperty(1, (1<<GRAPHEME_LINK_PROPERTY_)),
        new BinaryProperty(1, (1<<HEX_DIGIT_PROPERTY_)),
        new BinaryProperty(1, (1<<HYPHEN_PROPERTY_)),
        new BinaryProperty(1, (1<<ID_CONTINUE_PROPERTY_)),
        new BinaryProperty(1, (1<<ID_START_PROPERTY_)),
        new BinaryProperty(1, (1<<IDEOGRAPHIC_PROPERTY_)),
        new BinaryProperty(1, (1<<IDS_BINARY_OPERATOR_PROPERTY_)),
        new BinaryProperty(1, (1<<IDS_TRINARY_OPERATOR_PROPERTY_)),
        new BinaryProperty(SRC_BIDI) {  // UCHAR_JOIN_CONTROL
            @Override
            boolean contains(int c) {
                return UBiDiProps.INSTANCE.isJoinControl(c);
            }
        },
        new BinaryProperty(1, (1<<LOGICAL_ORDER_EXCEPTION_PROPERTY_)),
        new CaseBinaryProperty(UProperty.LOWERCASE),
        new BinaryProperty(1, (1<<MATH_PROPERTY_)),
        new BinaryProperty(1, (1<<NONCHARACTER_CODE_POINT_PROPERTY_)),
        new BinaryProperty(1, (1<<QUOTATION_MARK_PROPERTY_)),
        new BinaryProperty(1, (1<<RADICAL_PROPERTY_)),
        new CaseBinaryProperty(UProperty.SOFT_DOTTED),
        new BinaryProperty(1, (1<<TERMINAL_PUNCTUATION_PROPERTY_)),
        new BinaryProperty(1, (1<<UNIFIED_IDEOGRAPH_PROPERTY_)),
        new CaseBinaryProperty(UProperty.UPPERCASE),
        new BinaryProperty(1, (1<<WHITE_SPACE_PROPERTY_)),
        new BinaryProperty(1, (1<<XID_CONTINUE_PROPERTY_)),
        new BinaryProperty(1, (1<<XID_START_PROPERTY_)),
        new CaseBinaryProperty(UProperty.CASE_SENSITIVE),
        new BinaryProperty(1, (1<<S_TERM_PROPERTY_)),
        new BinaryProperty(1, (1<<VARIATION_SELECTOR_PROPERTY_)),
        new NormInertBinaryProperty(SRC_NFC, UProperty.NFD_INERT),
        new NormInertBinaryProperty(SRC_NFKC, UProperty.NFKD_INERT),
        new NormInertBinaryProperty(SRC_NFC, UProperty.NFC_INERT),
        new NormInertBinaryProperty(SRC_NFKC, UProperty.NFKC_INERT),
        new BinaryProperty(SRC_NFC_CANON_ITER) {  // UCHAR_SEGMENT_STARTER
            @Override
            boolean contains(int c) {
                return Norm2AllModes.getNFCInstance().impl.
                    ensureCanonIterData().isCanonSegmentStarter(c);
            }
        },
        new BinaryProperty(1, (1<<PATTERN_SYNTAX)),
        new BinaryProperty(1, (1<<PATTERN_WHITE_SPACE)),
        new BinaryProperty(SRC_CHAR_AND_PROPSVEC) {  // UCHAR_POSIX_ALNUM
            @Override
            boolean contains(int c) {
                return UCharacter.isUAlphabetic(c) || UCharacter.isDigit(c);
            }
        },
        new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_BLANK
            @Override
            boolean contains(int c) {
                // "horizontal space"
                if(c<=0x9f) {
                    return c==9 || c==0x20; /* TAB or SPACE */
                } else {
                    /* Zs */
                    return UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR;
                }
            }
        },
        new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_GRAPH
            @Override
            boolean contains(int c) {
                return isgraphPOSIX(c);
            }
        },
        new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_PRINT
            @Override
            boolean contains(int c) {
                /*
                 * Checks if codepoint is in \p{graph}\p{blank} - \p{cntrl}.
                 *
                 * The only cntrl character in graph+blank is TAB (in blank).
                 * Here we implement (blank-TAB)=Zs instead of calling u_isblank().
                 */
                return (UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR) || isgraphPOSIX(c);
            }
        },
        new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_XDIGIT
            @Override
            boolean contains(int c) {
                /* check ASCII and Fullwidth ASCII a-fA-F */
                if(
                    (c<=0x66 && c>=0x41 && (c<=0x46 || c>=0x61)) ||
                    (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41))
                ) {
                    return true;
                }
                // everything else counts only if it is a decimal digit
                return UCharacter.getType(c)==UCharacter.DECIMAL_DIGIT_NUMBER;
            }
        },
        new CaseBinaryProperty(UProperty.CASED),
        new CaseBinaryProperty(UProperty.CASE_IGNORABLE),
        new CaseBinaryProperty(UProperty.CHANGES_WHEN_LOWERCASED),
        new CaseBinaryProperty(UProperty.CHANGES_WHEN_UPPERCASED),
        new CaseBinaryProperty(UProperty.CHANGES_WHEN_TITLECASED),
        new BinaryProperty(SRC_CASE_AND_NORM) {  // UCHAR_CHANGES_WHEN_CASEFOLDED
            @Override
            boolean contains(int c) {
                String nfd=Norm2AllModes.getNFCInstance().impl.getDecomposition(c);
                if(nfd!=null) {
                    /* c has a decomposition */
                    c=nfd.codePointAt(0);
                    if(Character.charCount(c)!=nfd.length()) {
                        /* multiple code points */
                        c=-1;
                    }
                } else if(c<0) {
                    return false;  /* protect against bad input */
                }
                if(c>=0) {
                    /* single code point */
                    UCaseProps csp=UCaseProps.INSTANCE;
                    // The shared builder is cleared first; only the return
                    // value of toFullFolding() is inspected here.
                    UCaseProps.dummyStringBuilder.setLength(0);
                    return csp.toFullFolding(c, UCaseProps.dummyStringBuilder,
                                             UCharacter.FOLD_CASE_DEFAULT)>=0;
                } else {
                    // multi-code point decomposition: fold the whole string and compare
                    String folded=UCharacter.foldCase(nfd, true);
                    return !folded.equals(nfd);
                }
            }
        },
        new CaseBinaryProperty(UProperty.CHANGES_WHEN_CASEMAPPED),
        new BinaryProperty(SRC_NFKC_CF) {  // UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
            @Override
            boolean contains(int c) {
                Normalizer2Impl kcf=Norm2AllModes.getNFKC_CFInstance().impl;
                String src=UTF16.valueOf(c);
                StringBuilder dest=new StringBuilder();
                // Small destCapacity for NFKC_CF(c).
                Normalizer2Impl.ReorderingBuffer buffer=new Normalizer2Impl.ReorderingBuffer(kcf, dest, 5);
                kcf.compose(src, 0, src.length(), false, true, buffer);
                // the property holds when the composed result differs from the input
                return !Normalizer2Impl.UTF16Plus.equal(dest, src);
            }
        },
        new BinaryProperty(2, 1<<PROPS_2_EMOJI),
        new BinaryProperty(2, 1<<PROPS_2_EMOJI_PRESENTATION),
        new BinaryProperty(2, 1<<PROPS_2_EMOJI_MODIFIER),
        new BinaryProperty(2, 1<<PROPS_2_EMOJI_MODIFIER_BASE),
        new BinaryProperty(2, 1<<PROPS_2_EMOJI_COMPONENT),
        new BinaryProperty(SRC_PROPSVEC) {  // REGIONAL_INDICATOR
            // Property starts are a subset of lb=RI etc.
            @Override
            boolean contains(int c) {
                return 0x1F1E6<=c && c<=0x1F1FF;
            }
        },
        new BinaryProperty(1, 1<<PREPENDED_CONCATENATION_MARK),
    };
    417 
    418     public boolean hasBinaryProperty(int c, int which) {
    419          if(which<UProperty.BINARY_START || UProperty.BINARY_LIMIT<=which) {
    420             // not a known binary property
    421             return false;
    422         } else {
    423             return binProps[which].contains(c);
    424         }
    425     }
    426 
    427     // int-value and enumerated properties --------------------------------- ***
    428 
    429     public int getType(int c) {
    430         return getProperty(c)&TYPE_MASK;
    431     }
    432 
    /*
     * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
     * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
     *
     * The array is indexed by the Grapheme_Cluster_Break value; the
     * HANGUL_SYLLABLE_TYPE entry of intProps reads it and returns
     * NOT_APPLICABLE for any index at or beyond the array length.
     */
    private static final int /* UHangulSyllableType */ gcbToHst[]={
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_OTHER */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CONTROL */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CR */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_EXTEND */
        HangulSyllableType.LEADING_JAMO,     /* U_GCB_L */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_LF */
        HangulSyllableType.LV_SYLLABLE,      /* U_GCB_LV */
        HangulSyllableType.LVT_SYLLABLE,     /* U_GCB_LVT */
        HangulSyllableType.TRAILING_JAMO,    /* U_GCB_T */
        HangulSyllableType.VOWEL_JAMO        /* U_GCB_V */
        /*
         * Omit GCB values beyond what we need for hst.
         * The code below checks for the array length.
         */
    };
    453 
    454     private class IntProperty {
    455         int column;  // SRC_PROPSVEC column, or "source" if mask==0
    456         int mask;
    457         int shift;
    458         IntProperty(int column, int mask, int shift) {
    459             this.column=column;
    460             this.mask=mask;
    461             this.shift=shift;
    462         }
    463         IntProperty(int source) {
    464             this.column=source;
    465             this.mask=0;
    466         }
    467         final int getSource() {
    468             return mask==0 ? column : SRC_PROPSVEC;
    469         }
    470         int getValue(int c) {
    471             // systematic, directly stored properties
    472             return (getAdditional(c, column)&mask)>>>shift;
    473         }
    474         int getMaxValue(int which) {
    475             return (getMaxValues(column)&mask)>>>shift;
    476         }
    477     }
    478 
    479     private class BiDiIntProperty extends IntProperty {
    480         BiDiIntProperty() {
    481             super(SRC_BIDI);
    482         }
    483         @Override
    484         int getMaxValue(int which) {
    485             return UBiDiProps.INSTANCE.getMaxValue(which);
    486         }
    487     }
    488 
    489     private class CombiningClassIntProperty extends IntProperty {
    490         CombiningClassIntProperty(int source) {
    491             super(source);
    492         }
    493         @Override
    494         int getMaxValue(int which) {
    495             return 0xff;
    496         }
    497     }
    498 
    499     private class NormQuickCheckIntProperty extends IntProperty {  // UCHAR_NF*_QUICK_CHECK properties
    500         int which;
    501         int max;
    502         NormQuickCheckIntProperty(int source, int which, int max) {
    503             super(source);
    504             this.which=which;
    505             this.max=max;
    506         }
    507         @Override
    508         int getValue(int c) {
    509             return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_QUICK_CHECK).getQuickCheck(c);
    510         }
    511         @Override
    512         int getMaxValue(int which) {
    513             return max;
    514         }
    515     }
    516 
    /**
     * Table of int-property implementations. It is indexed by
     * which-UProperty.INT_START: getIntPropertyValue(),
     * getIntPropertyMaxValue() and getSource() all read
     * intProps[which-UProperty.INT_START], so the entry order must not change.
     * Three-argument entries extract a bit field (column, mask, shift) from
     * the additional properties; source-only entries override getValue().
     */
    IntProperty intProps[]={
        new BiDiIntProperty() {  // BIDI_CLASS
            @Override
            int getValue(int c) {
                return UBiDiProps.INSTANCE.getClass(c);
            }
        },
        new IntProperty(0, BLOCK_MASK_, BLOCK_SHIFT_),
        new CombiningClassIntProperty(SRC_NFC) {  // CANONICAL_COMBINING_CLASS
            @Override
            int getValue(int c) {
                return Normalizer2.getNFDInstance().getCombiningClass(c);
            }
        },
        new IntProperty(2, DECOMPOSITION_TYPE_MASK_, 0),
        new IntProperty(0, EAST_ASIAN_MASK_, EAST_ASIAN_SHIFT_),
        new IntProperty(SRC_CHAR) {  // GENERAL_CATEGORY
            @Override
            int getValue(int c) {
                return getType(c);
            }
            @Override
            int getMaxValue(int which) {
                return UCharacterCategory.CHAR_CATEGORY_COUNT-1;
            }
        },
        new BiDiIntProperty() {  // JOINING_GROUP
            @Override
            int getValue(int c) {
                return UBiDiProps.INSTANCE.getJoiningGroup(c);
            }
        },
        new BiDiIntProperty() {  // JOINING_TYPE
            @Override
            int getValue(int c) {
                return UBiDiProps.INSTANCE.getJoiningType(c);
            }
        },
        new IntProperty(2, LB_MASK, LB_SHIFT),  // LINE_BREAK
        new IntProperty(SRC_CHAR) {  // NUMERIC_TYPE
            @Override
            int getValue(int c) {
                return ntvGetType(getNumericTypeValue(getProperty(c)));
            }
            @Override
            int getMaxValue(int which) {
                return NumericType.COUNT-1;
            }
        },
        // Constructed with SCRIPT_MASK_ (used by the inherited getMaxValue),
        // but the value itself is delegated to UScript.getScript().
        new IntProperty(0, SCRIPT_MASK_, 0) {
            @Override
            int getValue(int c) {
                return UScript.getScript(c);
            }
        },
        new IntProperty(SRC_PROPSVEC) {  // HANGUL_SYLLABLE_TYPE
            @Override
            int getValue(int c) {
                /* see comments on gcbToHst[] above */
                int gcb=(getAdditional(c, 2)&GCB_MASK)>>>GCB_SHIFT;
                if(gcb<gcbToHst.length) {
                    return gcbToHst[gcb];
                } else {
                    return HangulSyllableType.NOT_APPLICABLE;
                }
            }
            @Override
            int getMaxValue(int which) {
                return HangulSyllableType.COUNT-1;
            }
        },
        // max=1=YES -- these are never "maybe", only "no" or "yes"
        new NormQuickCheckIntProperty(SRC_NFC, UProperty.NFD_QUICK_CHECK, 1),
        new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKD_QUICK_CHECK, 1),
        // max=2=MAYBE
        new NormQuickCheckIntProperty(SRC_NFC, UProperty.NFC_QUICK_CHECK, 2),
        new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKC_QUICK_CHECK, 2),
        new CombiningClassIntProperty(SRC_NFC) {  // LEAD_CANONICAL_COMBINING_CLASS
            @Override
            int getValue(int c) {
                // upper part of the FCD16 value
                return Norm2AllModes.getNFCInstance().impl.getFCD16(c)>>8;
            }
        },
        new CombiningClassIntProperty(SRC_NFC) {  // TRAIL_CANONICAL_COMBINING_CLASS
            @Override
            int getValue(int c) {
                // low 8 bits of the FCD16 value
                return Norm2AllModes.getNFCInstance().impl.getFCD16(c)&0xff;
            }
        },
        new IntProperty(2, GCB_MASK, GCB_SHIFT),  // GRAPHEME_CLUSTER_BREAK
        new IntProperty(2, SB_MASK, SB_SHIFT),  // SENTENCE_BREAK
        new IntProperty(2, WB_MASK, WB_SHIFT),  // WORD_BREAK
        new BiDiIntProperty() {  // BIDI_PAIRED_BRACKET_TYPE
            @Override
            int getValue(int c) {
                return UBiDiProps.INSTANCE.getPairedBracketType(c);
            }
        },
    };
    616 
    617     public int getIntPropertyValue(int c, int which) {
    618         if(which<UProperty.INT_START) {
    619             if(UProperty.BINARY_START<=which && which<UProperty.BINARY_LIMIT) {
    620                 return binProps[which].contains(c) ? 1 : 0;
    621             }
    622         } else if(which<UProperty.INT_LIMIT) {
    623             return intProps[which-UProperty.INT_START].getValue(c);
    624         } else if (which == UProperty.GENERAL_CATEGORY_MASK) {
    625             return getMask(getType(c));
    626         }
    627         return 0; // undefined
    628     }
    629 
    630     public int getIntPropertyMaxValue(int which) {
    631         if(which<UProperty.INT_START) {
    632             if(UProperty.BINARY_START<=which && which<UProperty.BINARY_LIMIT) {
    633                 return 1;  // maximum TRUE for all binary properties
    634             }
    635         } else if(which<UProperty.INT_LIMIT) {
    636             return intProps[which-UProperty.INT_START].getMaxValue(which);
    637         }
    638         return -1; // undefined
    639     }
    640 
    641     public final int getSource(int which) {
    642         if(which<UProperty.BINARY_START) {
    643             return SRC_NONE; /* undefined */
    644         } else if(which<UProperty.BINARY_LIMIT) {
    645             return binProps[which].getSource();
    646         } else if(which<UProperty.INT_START) {
    647             return SRC_NONE; /* undefined */
    648         } else if(which<UProperty.INT_LIMIT) {
    649             return intProps[which-UProperty.INT_START].getSource();
    650         } else if(which<UProperty.STRING_START) {
    651             switch(which) {
    652             case UProperty.GENERAL_CATEGORY_MASK:
    653             case UProperty.NUMERIC_VALUE:
    654                 return SRC_CHAR;
    655 
    656             default:
    657                 return SRC_NONE;
    658             }
    659         } else if(which<UProperty.STRING_LIMIT) {
    660             switch(which) {
    661             case UProperty.AGE:
    662                 return SRC_PROPSVEC;
    663 
    664             case UProperty.BIDI_MIRRORING_GLYPH:
    665                 return SRC_BIDI;
    666 
    667             case UProperty.CASE_FOLDING:
    668             case UProperty.LOWERCASE_MAPPING:
    669             case UProperty.SIMPLE_CASE_FOLDING:
    670             case UProperty.SIMPLE_LOWERCASE_MAPPING:
    671             case UProperty.SIMPLE_TITLECASE_MAPPING:
    672             case UProperty.SIMPLE_UPPERCASE_MAPPING:
    673             case UProperty.TITLECASE_MAPPING:
    674             case UProperty.UPPERCASE_MAPPING:
    675                 return SRC_CASE;
    676 
    677             case UProperty.ISO_COMMENT:
    678             case UProperty.NAME:
    679             case UProperty.UNICODE_1_NAME:
    680                 return SRC_NAMES;
    681 
    682             default:
    683                 return SRC_NONE;
    684             }
    685         } else {
    686             switch(which) {
    687             case UProperty.SCRIPT_EXTENSIONS:
    688                 return SRC_PROPSVEC;
    689             default:
    690                 return SRC_NONE; /* undefined */
    691             }
    692         }
    693     }
    694 
    695     /**
    696      * <p>
    697      * Unicode property names and property value names are compared
    698      * "loosely". Property[Value]Aliases.txt say:
    699      * <quote>
    700      *   "With loose matching of property names, the case distinctions,
    701      *    whitespace, and '_' are ignored."
    702      * </quote>
    703      * </p>
    704      * <p>
    705      * This function does just that, for ASCII (char *) name strings.
    706      * It is almost identical to ucnv_compareNames() but also ignores
    707      * ASCII White_Space characters (U+0009..U+000d).
    708      * </p>
    709      * @param name1 name to compare
    710      * @param name2 name to compare
    711      * @return 0 if names are equal, < 0 if name1 is less than name2 and > 0
    712      *         if name1 is greater than name2.
    713      */
    714     /* to be implemented in 2.4
    715      * public static int comparePropertyNames(String name1, String name2)
    716     {
    717         int result = 0;
    718         int i1 = 0;
    719         int i2 = 0;
    720         while (true) {
    721             char ch1 = 0;
    722             char ch2 = 0;
    723             // Ignore delimiters '-', '_', and ASCII White_Space
    724             if (i1 < name1.length()) {
    725                 ch1 = name1.charAt(i1 ++);
    726             }
    727             while (ch1 == '-' || ch1 == '_' || ch1 == ' ' || ch1 == '\t'
    728                    || ch1 == '\n' // synwee what is || ch1 == '\v'
    729                    || ch1 == '\f' || ch1=='\r') {
    730                 if (i1 < name1.length()) {
    731                     ch1 = name1.charAt(i1 ++);
    732                 }
    733                 else {
    734                     ch1 = 0;
    735                 }
    736             }
    737             if (i2 < name2.length()) {
    738                 ch2 = name2.charAt(i2 ++);
    739             }
    740             while (ch2 == '-' || ch2 == '_' || ch2 == ' ' || ch2 == '\t'
    741                    || ch2 == '\n' // synwee what is || ch1 == '\v'
    742                    || ch2 == '\f' || ch2=='\r') {
    743                 if (i2 < name2.length()) {
    744                     ch2 = name2.charAt(i2 ++);
    745                 }
    746                 else {
    747                     ch2 = 0;
    748                 }
    749             }
    750 
    751             // If we reach the ends of both strings then they match
    752             if (ch1 == 0 && ch2 == 0) {
    753                 return 0;
    754             }
    755 
    756             // Case-insensitive comparison
    757             if (ch1 != ch2) {
    758                 result = Character.toLowerCase(ch1)
    759                                                 - Character.toLowerCase(ch2);
    760                 if (result != 0) {
    761                     return result;
    762                 }
    763             }
    764         }
    765     }
    766     */
    767 
    768     /**
    769      * Get the the maximum values for some enum/int properties.
    770      * @return maximum values for the integer properties.
    771      */
    772     public int getMaxValues(int column)
    773     {
    774        // return m_maxBlockScriptValue_;
    775 
    776         switch(column) {
    777         case 0:
    778             return m_maxBlockScriptValue_;
    779         case 2:
    780             return m_maxJTGValue_;
    781         default:
    782             return 0;
    783         }
    784     }
    785 
    786     /**
    787      * Gets the type mask
    788      * @param type character type
    789      * @return mask
    790      */
    791     public static final int getMask(int type)
    792     {
    793         return 1 << type;
    794     }
    795 
    796 
    797     /**
    798      * Returns the digit values of characters like 'A' - 'Z', normal,
    799      * half-width and full-width. This method assumes that the other digit
    800      * characters are checked by the calling method.
    801      * @param ch character to test
    802      * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
    803      *         its corresponding digit will be returned.
    804      */
    805     public static int getEuropeanDigit(int ch) {
    806         if ((ch > 0x7a && ch < 0xff21)
    807             || ch < 0x41 || (ch > 0x5a && ch < 0x61)
    808             || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
    809             return -1;
    810         }
    811         if (ch <= 0x7a) {
    812             // ch >= 0x41 or ch < 0x61
    813             return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
    814         }
    815         // ch >= 0xff21
    816         if (ch <= 0xff3a) {
    817             return ch + 10 - 0xff21;
    818         }
    819         // ch >= 0xff41 && ch <= 0xff5a
    820         return ch + 10 - 0xff41;
    821     }
    822 
    823     public int digit(int c) {
    824         int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_;
    825         if(value<=9) {
    826             return value;
    827         } else {
    828             return -1;
    829         }
    830     }
    831 
    832     public int getNumericValue(int c) {
    833         // slightly pruned version of getUnicodeNumericValue(), plus getEuropeanDigit()
    834         int ntv = getNumericTypeValue(getProperty(c));
    835 
    836         if(ntv==NTV_NONE_) {
    837             return getEuropeanDigit(c);
    838         } else if(ntv<NTV_DIGIT_START_) {
    839             /* decimal digit */
    840             return ntv-NTV_DECIMAL_START_;
    841         } else if(ntv<NTV_NUMERIC_START_) {
    842             /* other digit */
    843             return ntv-NTV_DIGIT_START_;
    844         } else if(ntv<NTV_FRACTION_START_) {
    845             /* small integer */
    846             return ntv-NTV_NUMERIC_START_;
    847         } else if(ntv<NTV_LARGE_START_) {
    848             /* fraction */
    849             return -2;
    850         } else if(ntv<NTV_BASE60_START_) {
    851             /* large, single-significant-digit integer */
    852             int mant=(ntv>>5)-14;
    853             int exp=(ntv&0x1f)+2;
    854             if(exp<9 || (exp==9 && mant<=2)) {
    855                 int numValue=mant;
    856                 do {
    857                     numValue*=10;
    858                 } while(--exp>0);
    859                 return numValue;
    860             } else {
    861                 return -2;
    862             }
    863         } else if(ntv<NTV_FRACTION20_START_) {
    864             /* sexagesimal (base 60) integer */
    865             int numValue=(ntv>>2)-0xbf;
    866             int exp=(ntv&3)+1;
    867 
    868             switch(exp) {
    869             case 4:
    870                 numValue*=60*60*60*60;
    871                 break;
    872             case 3:
    873                 numValue*=60*60*60;
    874                 break;
    875             case 2:
    876                 numValue*=60*60;
    877                 break;
    878             case 1:
    879                 numValue*=60;
    880                 break;
    881             case 0:
    882             default:
    883                 break;
    884             }
    885 
    886             return numValue;
    887         } else if(ntv<NTV_RESERVED_START_) {
    888             // fraction-20 e.g. 3/80
    889             return -2;
    890         } else {
    891             /* reserved */
    892             return -2;
    893         }
    894     }
    895 
    896     public double getUnicodeNumericValue(int c) {
    897         // equivalent to c version double u_getNumericValue(UChar32 c)
    898         int ntv = getNumericTypeValue(getProperty(c));
    899 
    900         if(ntv==NTV_NONE_) {
    901             return UCharacter.NO_NUMERIC_VALUE;
    902         } else if(ntv<NTV_DIGIT_START_) {
    903             /* decimal digit */
    904             return ntv-NTV_DECIMAL_START_;
    905         } else if(ntv<NTV_NUMERIC_START_) {
    906             /* other digit */
    907             return ntv-NTV_DIGIT_START_;
    908         } else if(ntv<NTV_FRACTION_START_) {
    909             /* small integer */
    910             return ntv-NTV_NUMERIC_START_;
    911         } else if(ntv<NTV_LARGE_START_) {
    912             /* fraction */
    913             int numerator=(ntv>>4)-12;
    914             int denominator=(ntv&0xf)+1;
    915             return (double)numerator/denominator;
    916         } else if(ntv<NTV_BASE60_START_) {
    917             /* large, single-significant-digit integer */
    918             double numValue;
    919             int mant=(ntv>>5)-14;
    920             int exp=(ntv&0x1f)+2;
    921             numValue=mant;
    922 
    923             /* multiply by 10^exp without math.h */
    924             while(exp>=4) {
    925                 numValue*=10000.;
    926                 exp-=4;
    927             }
    928             switch(exp) {
    929             case 3:
    930                 numValue*=1000.;
    931                 break;
    932             case 2:
    933                 numValue*=100.;
    934                 break;
    935             case 1:
    936                 numValue*=10.;
    937                 break;
    938             case 0:
    939             default:
    940                 break;
    941             }
    942 
    943             return numValue;
    944         } else if(ntv<NTV_FRACTION20_START_) {
    945             /* sexagesimal (base 60) integer */
    946             int numValue=(ntv>>2)-0xbf;
    947             int exp=(ntv&3)+1;
    948 
    949             switch(exp) {
    950             case 4:
    951                 numValue*=60*60*60*60;
    952                 break;
    953             case 3:
    954                 numValue*=60*60*60;
    955                 break;
    956             case 2:
    957                 numValue*=60*60;
    958                 break;
    959             case 1:
    960                 numValue*=60;
    961                 break;
    962             case 0:
    963             default:
    964                 break;
    965             }
    966 
    967             return numValue;
    968         } else if(ntv<NTV_RESERVED_START_) {
    969             // fraction-20 e.g. 3/80
    970             int frac20=ntv-NTV_FRACTION20_START_;  // 0..0x17
    971             int numerator=2*(frac20&3)+1;
    972             int denominator=20<<(frac20>>2);
    973             return (double)numerator/denominator;
    974         } else {
    975             /* reserved */
    976             return UCharacter.NO_NUMERIC_VALUE;
    977         }
    978     }
    979 
    980     // protected variables -----------------------------------------------
    981 
    /**
     * Extra property trie.
     * The constructor only reads it when m_additionalColumnsCount_ > 0.
     */
    Trie2_16 m_additionalTrie_;
    /**
     * Extra property vectors, 1st column for age and second for binary
     * properties.
     * The constructor only reads them when m_additionalColumnsCount_ > 0.
     */
    int m_additionalVectors_[];
    /**
     * Number of additional columns, read from the data file header.
     */
    int m_additionalColumnsCount_;
    /**
     * Maximum values word read from the data file header and returned by
     * getMaxValues(0); bits used as in vector word 0.
     */
    int m_maxBlockScriptValue_;
    /**
     * Maximum values word read from the data file header and returned by
     * getMaxValues(2).
     * NOTE(review): presumably its bits are laid out as in vector word 2 - confirm.
     */
     int m_maxJTGValue_;

    /**
     * Script_Extensions data.
     * The constructor only assigns it when the data file contains
     * Script_Extensions chars.
     */
    public char[] m_scriptExtensions_;
   1010 
   1011     // private variables -------------------------------------------------
   1012 
    /**
    * Default name of the datafile; passed to ICUBinary.getRequiredData()
    * by the constructor.
    */
    private static final String DATA_FILE_NAME_ = "uprops.icu";

    // property data constants -------------------------------------------------

    /**
     * Numeric types and values in the main properties words.
     * Number of bits getNumericTypeValue() shifts a properties word right by
     * to obtain the numeric type-value.
     */
    private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;
   1024     private static final int getNumericTypeValue(int props) {
   1025         return props >> NUMERIC_TYPE_VALUE_SHIFT_;
   1026     }
    /*
     * constants for the storage form of numeric types and values
     *
     * Each *_START_ constant is the first type-value of its range; the range
     * ends just before the next constant. The numeric value accessors test
     * them in ascending order with "ntv < NEXT_START_".
     */
    /** No numeric value. */
    private static final int NTV_NONE_ = 0;
    /** Decimal digits: nv=0..9 */
    private static final int NTV_DECIMAL_START_ = 1;
    /** Other digits: nv=0..9 */
    private static final int NTV_DIGIT_START_ = 11;
    /** Small integers: nv=0..154 */
    private static final int NTV_NUMERIC_START_ = 21;
    /** Fractions: ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 */
    private static final int NTV_FRACTION_START_ = 0xb0;
    /**
     * Large integers:
     * ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33)
     * (only one significant decimal digit)
     */
    private static final int NTV_LARGE_START_ = 0x1e0;
    /**
     * Sexagesimal numbers:
     * ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4)
     */
    private static final int NTV_BASE60_START_=0x300;
    /**
     * Fraction-20 values:
     * frac20 = ntv-0x324 = 0..0x17 -> 1|3|5|7 / 20|40|80|160|320|640
     * numerator: num = 2*(frac20&3)+1
     * denominator: den = 20<<(frac20>>2)
     */
    private static final int NTV_FRACTION20_START_ = NTV_BASE60_START_ + 36;  // 0x300+9*4=0x324
    /** No numeric value (yet). */
    private static final int NTV_RESERVED_START_ = NTV_FRACTION20_START_ + 24;  // 0x324+6*4=0x34c
   1058 
   1059     private static final int ntvGetType(int ntv) {
   1060         return
   1061             (ntv==NTV_NONE_) ? NumericType.NONE :
   1062             (ntv<NTV_DIGIT_START_) ?  NumericType.DECIMAL :
   1063             (ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT :
   1064             NumericType.NUMERIC;
   1065     }
   1066 
    /*
     * Properties in vector word 0
     * Bits
     * 31..24   DerivedAge version major/minor one nibble each
     * 23..22   3..1: Bits 7..0 = Script_Extensions index
     *             3: Script value from Script_Extensions
     *             2: Script=Inherited
     *             1: Script=Common
     *             0: Script=bits 7..0
     * 21..20   reserved
     * 19..17   East Asian Width
     * 16.. 8   UBlockCode
     *  7.. 0   UScriptCode
     */

    /**
     * Script_Extensions: mask includes Script
     * (bits 23..22 together with bits 7..0 of vector word 0).
     */
    public static final int SCRIPT_X_MASK = 0x00c000ff;
    //private static final int SCRIPT_X_SHIFT = 22;
    /**
     * Integer properties mask for East Asian cell width (bits 19..17).
     * Equivalent to icu4c UPROPS_EA_MASK
     */
    private static final int EAST_ASIAN_MASK_ = 0x000e0000;
    /**
     * Integer properties shift value for East Asian cell width.
     * Equivalent to icu4c UPROPS_EA_SHIFT
     */
    private static final int EAST_ASIAN_SHIFT_ = 17;
    /**
     * Integer properties mask for blocks (bits 16..8).
     * Equivalent to icu4c UPROPS_BLOCK_MASK
     */
    private static final int BLOCK_MASK_ = 0x0001ff00;
    /**
     * Integer properties shift value for blocks.
     * Equivalent to icu4c UPROPS_BLOCK_SHIFT
     */
    private static final int BLOCK_SHIFT_ = 8;
    /**
     * Integer properties mask for scripts (bits 7..0); no shift is needed.
     * NOTE(review): the icu4c counterpart was given as UPROPS_SHIFT_MASK,
     * which looks like a typo for UPROPS_SCRIPT_MASK - confirm against icu4c.
     */
    public static final int SCRIPT_MASK_ = 0x000000ff;

    /*
     * SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions.
     * The three constants are the values 1, 2 and 3 of bits 23..22.
     */
    public static final int SCRIPT_X_WITH_COMMON = 0x400000;
    public static final int SCRIPT_X_WITH_INHERITED = 0x800000;
    public static final int SCRIPT_X_WITH_OTHER = 0xc00000;
   1117 
    /**
     * Additional properties used in internal trie data
     */
    /*
     * Properties in vector word 1
     * Each bit encodes one binary property.
     * The following constants represent the bit number, use 1<<UPROPS_XYZ.
     * UPROPS_BINARY_1_TOP<=32!
     * (All 32 bit numbers 0..31 are assigned below.)
     *
     * Keep this list of property enums in sync with
     * propListNames[] in icu/source/tools/genprops/props2.c!
     *
     * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
     */
    private static final int WHITE_SPACE_PROPERTY_ = 0;
    private static final int DASH_PROPERTY_ = 1;
    private static final int HYPHEN_PROPERTY_ = 2;
    private static final int QUOTATION_MARK_PROPERTY_ = 3;
    private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;
    private static final int MATH_PROPERTY_ = 5;
    private static final int HEX_DIGIT_PROPERTY_ = 6;
    private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;
    private static final int ALPHABETIC_PROPERTY_ = 8;
    private static final int IDEOGRAPHIC_PROPERTY_ = 9;
    private static final int DIACRITIC_PROPERTY_ = 10;
    private static final int EXTENDER_PROPERTY_ = 11;
    private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;
    private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;
    private static final int GRAPHEME_LINK_PROPERTY_ = 14;
    private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;
    private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16;
    private static final int RADICAL_PROPERTY_ = 17;
    private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;
    private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;
    private static final int DEPRECATED_PROPERTY_ = 20;
    private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;
    private static final int XID_START_PROPERTY_ = 22;
    private static final int XID_CONTINUE_PROPERTY_ = 23;
    private static final int ID_START_PROPERTY_    = 24;
    private static final int ID_CONTINUE_PROPERTY_ = 25;
    private static final int GRAPHEME_BASE_PROPERTY_ = 26;
    private static final int S_TERM_PROPERTY_ = 27;
    private static final int VARIATION_SELECTOR_PROPERTY_ = 28;
    private static final int PATTERN_SYNTAX = 29;                   /* new in ICU 3.4 and Unicode 4.1 */
    private static final int PATTERN_WHITE_SPACE = 30;
    private static final int PREPENDED_CONCATENATION_MARK = 31;     // new in ICU 60 and Unicode 10
   1164 
    /*
     * Properties in vector word 2
     * Bits
     * 31..27   http://www.unicode.org/reports/tr51/#Emoji_Properties
     *     26   reserved
     * 25..20   Line Break
     * 19..15   Sentence Break
     * 14..10   Word Break
     *  9.. 5   Grapheme Cluster Break
     *  4.. 0   Decomposition Type
     */
    /* Bit numbers (not masks) of the emoji properties in bits 31..27. */
    private static final int PROPS_2_EMOJI_COMPONENT = 27;
    private static final int PROPS_2_EMOJI = 28;
    private static final int PROPS_2_EMOJI_PRESENTATION = 29;
    private static final int PROPS_2_EMOJI_MODIFIER = 30;
    private static final int PROPS_2_EMOJI_MODIFIER_BASE = 31;

    /* Line Break: bits 25..20 */
    private static final int LB_MASK          = 0x03f00000;
    private static final int LB_SHIFT         = 20;

    /* Sentence Break: bits 19..15 */
    private static final int SB_MASK          = 0x000f8000;
    private static final int SB_SHIFT         = 15;

    /* Word Break: bits 14..10 */
    private static final int WB_MASK          = 0x00007c00;
    private static final int WB_SHIFT         = 10;

    /* Grapheme Cluster Break: bits 9..5 */
    private static final int GCB_MASK         = 0x000003e0;
    private static final int GCB_SHIFT        = 5;

    /**
     * Integer properties mask for decomposition type (bits 4..0, no shift).
     * Equivalent to icu4c UPROPS_DT_MASK.
     */
    private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;

    /**
     * First nibble shift
     */
    private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
    /**
     * Second nibble mask
     */
    private static final int LAST_NIBBLE_MASK_ = 0xF;
    /**
     * Age value shift; the age occupies bits 31..24 of vector word 0.
     */
    private static final int AGE_SHIFT_ = 24;
   1212 
   1213 
   1214     // private constructors --------------------------------------------------
   1215 
    /**
     * Constructor.
     * Loads the data file named by DATA_FILE_NAME_ and reads, in file order:
     * the header, the 16 index ints, the main properties trie and, when
     * additional columns are present, the additional trie and property
     * vectors, followed by the Script_Extensions data.
     * The reads are sequential on one ByteBuffer, so the statement order
     * below mirrors the file layout and must not be changed.
     * @exception IOException thrown when data reading fails or data corrupted
     */
    private UCharacterProperty() throws IOException
    {
        // consistency check
        // (the property tables must have exactly one entry per property)
        if(binProps.length!=UProperty.BINARY_LIMIT) {
            throw new ICUException("binProps.length!=UProperty.BINARY_LIMIT");
        }
        if(intProps.length!=(UProperty.INT_LIMIT-UProperty.INT_START)) {
            throw new ICUException("intProps.length!=(UProperty.INT_LIMIT-UProperty.INT_START)");
        }

        // jar access
        ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_);
        m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable());
        // Read or skip the 16 indexes.
        // The offsets are multiplied by 4 below, i.e. they count 32-bit units.
        int propertyOffset = bytes.getInt();
        /* exceptionOffset = */ bytes.getInt();
        /* caseOffset = */ bytes.getInt();
        int additionalOffset = bytes.getInt();
        int additionalVectorsOffset = bytes.getInt();
        m_additionalColumnsCount_ = bytes.getInt();
        int scriptExtensionsOffset = bytes.getInt();
        int reservedOffset7 = bytes.getInt();
        /* reservedOffset8 = */ bytes.getInt();
        /* dataTopOffset = */ bytes.getInt();
        m_maxBlockScriptValue_ = bytes.getInt();
        m_maxJTGValue_ = bytes.getInt();
        // 12 of the 16 indexes were consumed above; skip the other 4 ints
        ICUBinary.skipBytes(bytes, (16 - 12) << 2);

        // read the main properties trie
        m_trie_ = Trie2_16.createFromSerialized(bytes);
        // the trie starts right after the 16 indexes and ends at propertyOffset
        int expectedTrieLength = (propertyOffset - 16) * 4;
        int trieLength = m_trie_.getSerializedLength();
        if(trieLength > expectedTrieLength) {
            throw new IOException("uprops.icu: not enough bytes for main trie");
        }
        // skip padding after trie bytes
        ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);

        // skip unused intervening data structures
        ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4);

        if(m_additionalColumnsCount_ > 0) {
            // reads the additional property block
            m_additionalTrie_ = Trie2_16.createFromSerialized(bytes);
            expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4;
            trieLength = m_additionalTrie_.getSerializedLength();
            if(trieLength > expectedTrieLength) {
                throw new IOException("uprops.icu: not enough bytes for additional-properties trie");
            }
            // skip padding after trie bytes
            ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);

            // additional properties
            // (the vectors fill the space up to the Script_Extensions data)
            int size = scriptExtensionsOffset - additionalVectorsOffset;
            m_additionalVectors_ = ICUBinary.getInts(bytes, size, 0);
        }

        // Script_Extensions
        // (the offsets count 32-bit units, so each unit holds two chars)
        int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;
        if(numChars > 0) {
            m_scriptExtensions_ = ICUBinary.getChars(bytes, numChars, 0);
        }
    }
   1283 
   1284     private static final class IsAcceptable implements ICUBinary.Authenticate {
   1285         // @Override when we switch to Java 6
   1286         @Override
   1287         public boolean isDataVersionAcceptable(byte version[]) {
   1288             return version[0] == 7;
   1289         }
   1290     }
   1291     private static final int DATA_FORMAT = 0x5550726F;  // "UPro"
   1292 
   1293     // private methods -------------------------------------------------------
   1294 
   1295     /*
   1296      * Compare additional properties to see if it has argument type
   1297      * @param property 32 bit properties
   1298      * @param type character type
   1299      * @return true if property has type
   1300      */
   1301     /*private boolean compareAdditionalType(int property, int type)
   1302     {
   1303         return (property & (1 << type)) != 0;
   1304     }*/
   1305 
    // property starts for UnicodeSet -------------------------------------- ***

    // Code points with hardcoded properties. addPropertyStarts() adds them
    // (or the code point right after them) as range boundaries.
    private static final int TAB     = 0x0009;
    //private static final int LF      = 0x000a;
    //private static final int FF      = 0x000c;
    private static final int CR      = 0x000d;
    private static final int U_A     = 0x0041;
    private static final int U_F     = 0x0046;
    private static final int U_Z     = 0x005a;
    private static final int U_a     = 0x0061;
    private static final int U_f     = 0x0066;
    private static final int U_z     = 0x007a;
    private static final int DEL     = 0x007f;
    private static final int NL      = 0x0085;
    private static final int NBSP    = 0x00a0;
    private static final int CGJ     = 0x034f;
    private static final int FIGURESP= 0x2007;
    private static final int HAIRSP  = 0x200a;
    //private static final int ZWNJ    = 0x200c;
    //private static final int ZWJ     = 0x200d;
    private static final int RLM     = 0x200f;
    private static final int NNBSP   = 0x202f;
    private static final int WJ      = 0x2060;
    private static final int INHSWAP = 0x206a;
    private static final int NOMDIG  = 0x206f;
    // full-width forms of the Latin letters above
    private static final int U_FW_A  = 0xff21;
    private static final int U_FW_F  = 0xff26;
    private static final int U_FW_Z  = 0xff3a;
    private static final int U_FW_a  = 0xff41;
    private static final int U_FW_f  = 0xff46;
    private static final int U_FW_z  = 0xff5a;
    // note: listed out of numeric order (0xfeff < 0xff21)
    private static final int ZWNBSP  = 0xfeff;
   1338 
   1339     public UnicodeSet addPropertyStarts(UnicodeSet set) {
   1340         /* add the start code point of each same-value range of the main trie */
   1341         Iterator<Trie2.Range> trieIterator = m_trie_.iterator();
   1342         Trie2.Range range;
   1343         while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
   1344             set.add(range.startCodePoint);
   1345         }
   1346 
   1347         /* add code points with hardcoded properties, plus the ones following them */
   1348 
   1349         /* add for u_isblank() */
   1350         set.add(TAB);
   1351         set.add(TAB+1);
   1352 
   1353         /* add for IS_THAT_CONTROL_SPACE() */
   1354         set.add(CR+1); /* range TAB..CR */
   1355         set.add(0x1c);
   1356         set.add(0x1f+1);
   1357         set.add(NL);
   1358         set.add(NL+1);
   1359 
   1360         /* add for u_isIDIgnorable() what was not added above */
   1361         set.add(DEL); /* range DEL..NBSP-1, NBSP added below */
   1362         set.add(HAIRSP);
   1363         set.add(RLM+1);
   1364         set.add(INHSWAP);
   1365         set.add(NOMDIG+1);
   1366         set.add(ZWNBSP);
   1367         set.add(ZWNBSP+1);
   1368 
   1369         /* add no-break spaces for u_isWhitespace() what was not added above */
   1370         set.add(NBSP);
   1371         set.add(NBSP+1);
   1372         set.add(FIGURESP);
   1373         set.add(FIGURESP+1);
   1374         set.add(NNBSP);
   1375         set.add(NNBSP+1);
   1376 
   1377         /* add for u_charDigitValue() */
   1378         // TODO remove when UCharacter.getHanNumericValue() is changed to just return
   1379         // Unicode numeric values
   1380         set.add(0x3007);
   1381         set.add(0x3008);
   1382         set.add(0x4e00);
   1383         set.add(0x4e01);
   1384         set.add(0x4e8c);
   1385         set.add(0x4e8d);
   1386         set.add(0x4e09);
   1387         set.add(0x4e0a);
   1388         set.add(0x56db);
   1389         set.add(0x56dc);
   1390         set.add(0x4e94);
   1391         set.add(0x4e95);
   1392         set.add(0x516d);
   1393         set.add(0x516e);
   1394         set.add(0x4e03);
   1395         set.add(0x4e04);
   1396         set.add(0x516b);
   1397         set.add(0x516c);
   1398         set.add(0x4e5d);
   1399         set.add(0x4e5e);
   1400 
   1401         /* add for u_digit() */
   1402         set.add(U_a);
   1403         set.add(U_z+1);
   1404         set.add(U_A);
   1405         set.add(U_Z+1);
   1406         set.add(U_FW_a);
   1407         set.add(U_FW_z+1);
   1408         set.add(U_FW_A);
   1409         set.add(U_FW_Z+1);
   1410 
   1411         /* add for u_isxdigit() */
   1412         set.add(U_f+1);
   1413         set.add(U_F+1);
   1414         set.add(U_FW_f+1);
   1415         set.add(U_FW_F+1);
   1416 
   1417         /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */
   1418         set.add(WJ); /* range WJ..NOMDIG */
   1419         set.add(0xfff0);
   1420         set.add(0xfffb+1);
   1421         set.add(0xe0000);
   1422         set.add(0xe0fff+1);
   1423 
   1424         /* add for UCHAR_GRAPHEME_BASE and others */
   1425         set.add(CGJ);
   1426         set.add(CGJ+1);
   1427 
   1428         return set; // for chaining
   1429     }
   1430 
   1431     public void upropsvec_addPropertyStarts(UnicodeSet set) {
   1432         /* Adds to set the start code point of each same-value range of the properties vectors trie. */
   1433         /* Without additional columns that trie may not be there at all, so nothing is added then. */
   1434         if(m_additionalColumnsCount_<=0) { return; }
   1435         Iterator<Trie2.Range> ranges = m_additionalTrie_.iterator();
   1436         while(ranges.hasNext()) {
   1437             Trie2.Range current = ranges.next();
   1438             if(current.leadSurrogate) { break; } /* stop at the first range flagged leadSurrogate */
   1439             set.add(current.startCodePoint);
   1440         }
   1441     }
   1442 
   1443     // This static initializer block must be placed after
   1444     // other static member initialization; it assigns INSTANCE.
   1445     static {
   1446         try {
   1447             INSTANCE = new UCharacterProperty();
   1448         } catch (IOException e) {
   1449             // MissingResourceException has no cause-taking constructor: chain the IOException via initCause().
   1450             throw (MissingResourceException) new MissingResourceException(e.getMessage(),"","").initCause(e);
   1451         }
   1452     }
   1453 
   1454 /*----------------------------------------------------------------
   1455  * Inclusions list
   1456  *----------------------------------------------------------------*/
   1457 
   1458     /*
   1459      * Return a set of characters for property enumeration.
   1460      * The set implicitly contains 0x110000 as well, which is one more than the highest
   1461      * Unicode code point.
   1462      *
   1463      * This set is used as an ordered list - its code points are ordered, and
   1464      * consecutive code points (in Unicode code point order) in the set define a range.
   1465      * For each two consecutive characters (start, limit) in the set,
   1466      * all of the UCD/normalization and related properties for
   1467      * all code points start..limit-1 are all the same,
   1468      * except for character names and ISO comments.
   1469      *
   1470      * All Unicode code points U+0000..U+10ffff are covered by these ranges.
   1471      * The ranges define a partition of the Unicode code space.
   1472      * ICU uses the inclusions set to enumerate properties for generating
   1473      * UnicodeSets containing all code points that have a certain property value.
   1474      *
   1475      * The Inclusion List is generated from the UCD. It is generated
   1476      * by enumerating the data tries, and code points for hardcoded properties
   1477      * are added as well.
   1478      *
   1479      * --------------------------------------------------------------------------
   1480      *
   1481      * The following are ideas for getting properties-unique code point ranges,
   1482      * with possible optimizations beyond the current implementation.
   1483      * These optimizations would require more code and be more fragile.
   1484      * The current implementation generates one single list (set) for all properties.
   1485      *
   1486      * To enumerate properties efficiently, one needs to know ranges of
   1487      * repetitive values, so that the value of only each start code point
   1488      * can be applied to the whole range.
   1489      * This information is in principle available in the uprops.icu/unorm.icu data.
   1490      *
   1491      * There are two obstacles:
   1492      *
   1493      * 1. Some properties are computed from multiple data structures,
   1494      *    making it necessary to get repetitive ranges by intersecting
   1495      *    ranges from multiple tries.
   1496      *
   1497      * 2. It is not economical to write code for getting repetitive ranges
   1498      *    that are precise for each of some 50 properties.
   1499      *
   1500      * Compromise ideas:
   1501      *
   1502      * - Get ranges per trie, not per individual property.
   1503      *   Each range contains the same values for a whole group of properties.
   1504      *   This would generate currently five range sets, two for uprops.icu tries
   1505      *   and three for unorm.icu tries.
   1506      *
   1507      * - Combine sets of ranges for multiple tries to get sufficient sets
   1508      *   for properties, e.g., the uprops.icu main and auxiliary tries
   1509      *   for all non-normalization properties.
   1510      *
   1511      * Ideas for representing ranges and combining them:
   1512      *
   1513      * - A UnicodeSet could hold just the start code points of ranges.
   1514      *   Multiple sets are easily combined by or-ing them together.
   1515      *
   1516      * - Alternatively, a UnicodeSet could hold each even-numbered range.
   1517      *   All ranges could be enumerated by using each start code point
   1518      *   (for the even-numbered ranges) as well as each limit (end+1) code point
   1519      *   (for the odd-numbered ranges).
   1520      *   It should be possible to combine two such sets by xor-ing them,
   1521      *   but no more than two.
   1522      *
   1523      * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays,
   1524      * but the first one is certainly simpler and applicable for combining more than
   1525      * two range sets.
   1526      *
   1527      * It is possible to combine all range sets for all uprops/unorm tries into one
   1528      * set that can be used for all properties.
   1529      * As an optimization, there could be less-combined range sets for certain
   1530      * groups of properties.
   1531      * The relationship of which less-combined range set to use for which property
   1532      * depends on the implementation of the properties and must be hardcoded
   1533      * - somewhat error-prone and higher maintenance but can be tested easily
   1534      * by building property sets "the simple way" in test code.
   1535      *
   1536      * ---
   1537      *
   1538      * Do not use a UnicodeSet pattern because that causes infinite recursion;
   1539      * UnicodeSet depends on the inclusions set.
   1540      *
   1541      * ---
   1542      *
   1543      * getInclusions() is commented out starting 2005-feb-12 because
   1544      * UnicodeSet now calls the uxyz_addPropertyStarts() directly,
   1545      * and only for the relevant property source.
   1546      */
   1547     /*
   1548     public UnicodeSet getInclusions() {
   1549         UnicodeSet set = new UnicodeSet();
   1550         NormalizerImpl.addPropertyStarts(set);
   1551         addPropertyStarts(set);
   1552         return set;
   1553     }
   1554     */
   1555 }
   1556