Home | History | Annotate | Download | only in normalizer
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5  * Copyright (C) 1998-2007 International Business Machines Corporation and
      6  * Unicode, Inc. All Rights Reserved.<br>
      7  * The Unicode Consortium makes no expressed or implied warranty of any
      8  * kind, and assumes no liability for errors or omissions.
      9  * No liability is assumed for incidental and consequential damages
     10  * in connection with or arising out of the use of the information here.
     11  */
     12 package android.icu.dev.test.normalizer;
     13 
     14 import java.util.BitSet;
     15 
     16 import android.icu.dev.test.UTF16Util;
     17 import android.icu.testsharding.MainTestShard;
     18 
     19 /**
     20  * Accesses the Normalization Data used for Forms C and D.<br>
     21  * @author Mark Davis
     22  * Updates for supplementary code points:
     23  * Vladimir Weinstein & Markus Scherer
     24  */
     25 @MainTestShard
     26 public class NormalizerData {
     27 //    static final String copyright = "Copyright (C) 1998-2003 International Business Machines Corporation and Unicode, Inc.";
     28 
     29     /**
     30     * Constant for use in getPairwiseComposition
     31     */
     32     public static final int NOT_COMPOSITE = '\uFFFF';
     33 
     34     /**
     35     * Gets the combining class of a character from the
     36     * Unicode Character Database.
     37     * @param   ch      the source character
     38     * @return          value from 0 to 255
     39     */
     40     public int getCanonicalClass(int ch) {
     41         return canonicalClass.get(ch);
     42     }
     43 
     44     /**
     45     * Returns the composite of the two characters. If the two
     46     * characters don't combine, returns NOT_COMPOSITE.
     47     * @param   first   first character (e.g. 'c')
     48     * @param   second  second character (e.g. \u0327 cedilla)
     49     * @return          composite (e.g. \u00C7 c cedilla)
     50     */
     51     public int getPairwiseComposition(int first, int second) {
     52         return compose.get(((long)first << 32) | second);
     53     }
     54 
     55 
     56     /**
     57     * Gets recursive decomposition of a character from the
     58     * Unicode Character Database.
     59     * @param   canonical    If true
     60     *                  bit is on in this byte, then selects the recursive
     61     *                  canonical decomposition, otherwise selects
     62     *                  the recursive compatibility and canonical decomposition.
     63     * @param   ch      the source character
     64     * @param   buffer  buffer to be filled with the decomposition
     65     */
     66     public void getRecursiveDecomposition(boolean canonical, int ch, StringBuffer buffer) {
     67         String decomp = decompose.get(ch);
     68         if (decomp != null && !(canonical && isCompatibility.get(ch))) {
     69             for (int i = 0; i < decomp.length(); i+=UTF16Util.codePointLength(ch)) {
     70                 ch = UTF16Util.nextCodePoint(decomp, i);
     71                 getRecursiveDecomposition(canonical, ch, buffer);
     72             }
     73         } else {                    // if no decomp, append
     74             UTF16Util.appendCodePoint(buffer, ch);
     75         }
     76     }
     77 
     78     // =================================================
     79     //                   PRIVATES
     80     // =================================================
     81 
     82     /**
     83      * Only accessed by NormalizerBuilder.
     84      */
     85     NormalizerData(IntHashtable canonicalClass, IntStringHashtable decompose,
     86       LongHashtable compose, BitSet isCompatibility, BitSet isExcluded) {
     87         this.canonicalClass = canonicalClass;
     88         this.decompose = decompose;
     89         this.compose = compose;
     90         this.isCompatibility = isCompatibility;
     91         this.isExcluded = isExcluded;
     92     }
     93 
     94     /**
     95     * Just accessible for testing.
     96     */
     97     boolean getExcluded (char ch) {
     98         return isExcluded.get(ch);
     99     }
    100 
    101     /**
    102     * Just accessible for testing.
    103     */
    104     String getRawDecompositionMapping (char ch) {
    105         return decompose.get(ch);
    106     }
    107 
    108     /**
    109     * For now, just use IntHashtable
    110     * Two-stage tables would be used in an optimized implementation.
    111     */
    112     private IntHashtable canonicalClass;
    113 
    114     /**
    115     * The main data table maps chars to a 32-bit int.
    116     * It holds either a pair: top = first, bottom = second
    117     * or singleton: top = 0, bottom = single.
    118     * If there is no decomposition, the value is 0.
    119     * Two-stage tables would be used in an optimized implementation.
    120     * An optimization could also map chars to a small index, then use that
    121     * index in a small array of ints.
    122     */
    123     private IntStringHashtable decompose;
    124 
    125     /**
    126     * Maps from pairs of characters to single.
    127     * If there is no decomposition, the value is NOT_COMPOSITE.
    128     */
    129     private LongHashtable compose;
    130 
    131     /**
    132     * Tells whether decomposition is canonical or not.
    133     */
    134     private BitSet isCompatibility = new BitSet();
    135 
    136     /**
    137     * Tells whether character is script-excluded or not.
    138     * Used only while building, and for testing.
    139     */
    140 
    141     private BitSet isExcluded = new BitSet();
    142 }
    143