Home | History | Annotate | Download | only in coll
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5 *******************************************************************************
      6 * Copyright (C) 2010-2015, International Business Machines
      7 * Corporation and others.  All Rights Reserved.
      8 *******************************************************************************
      9 * CollationData.java, ported from collationdata.h/.cpp
     10 *
     11 * C++ version created on: 2010oct27
     12 * created by: Markus W. Scherer
     13 */
     14 
     15 package android.icu.impl.coll;
     16 
     17 import android.icu.impl.Normalizer2Impl;
     18 import android.icu.impl.Trie2_32;
     19 import android.icu.lang.UScript;
     20 import android.icu.text.Collator;
     21 import android.icu.text.UnicodeSet;
     22 import android.icu.util.ICUException;
     23 
     24 /**
     25  * Collation data container.
     26  * Immutable data created by a CollationDataBuilder, or loaded from a file,
     27  * or deserialized from API-provided binary data.
     28  *
     29  * Includes data for the collation base (root/default), aliased if this is not the base.
     30  * @hide Only a subset of ICU is exposed in Android
     31  */
     32 public final class CollationData {
     33     // Note: The ucadata.icu loader could discover the reserved ranges by setting an array
     34     // parallel with the ranges, and resetting ranges that are indexed.
     35     // The reordering builder code could clone the resulting template array.
     36     static final int REORDER_RESERVED_BEFORE_LATIN = Collator.ReorderCodes.FIRST + 14;
     37     static final int REORDER_RESERVED_AFTER_LATIN = Collator.ReorderCodes.FIRST + 15;
     38 
     39     static final int MAX_NUM_SPECIAL_REORDER_CODES = 8;
     40 
     41     CollationData(Normalizer2Impl nfc) {
     42         nfcImpl = nfc;
     43     }
     44 
     45     public int getCE32(int c) {
     46         return trie.get(c);
     47     }
     48 
     49     int getCE32FromSupplementary(int c) {
     50         return trie.get(c);  // TODO: port UTRIE2_GET32_FROM_SUPP(trie, c) to Java?
     51     }
     52 
     53     boolean isDigit(int c) {
     54         return c < 0x660 ? c <= 0x39 && 0x30 <= c :
     55                 Collation.hasCE32Tag(getCE32(c), Collation.DIGIT_TAG);
     56     }
     57 
     58     public boolean isUnsafeBackward(int c, boolean numeric) {
     59         return unsafeBackwardSet.contains(c) || (numeric && isDigit(c));
     60     }
     61 
     62     public boolean isCompressibleLeadByte(int b) {
     63         return compressibleBytes[b];
     64     }
     65 
     66     public boolean isCompressiblePrimary(long p) {
     67         return isCompressibleLeadByte((int)p >>> 24);
     68     }
     69 
     70     /**
     71      * Returns the CE32 from two contexts words.
     72      * Access to the defaultCE32 for contraction and prefix matching.
     73      */
     74     int getCE32FromContexts(int index) {
     75         return ((int)contexts.charAt(index) << 16) | contexts.charAt(index + 1);
     76     }
     77 
     78     /**
     79      * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
     80      * Requires that ce32 is special.
     81      */
     82     int getIndirectCE32(int ce32) {
     83         assert(Collation.isSpecialCE32(ce32));
     84         int tag = Collation.tagFromCE32(ce32);
     85         if(tag == Collation.DIGIT_TAG) {
     86             // Fetch the non-numeric-collation CE32.
     87             ce32 = ce32s[Collation.indexFromCE32(ce32)];
     88         } else if(tag == Collation.LEAD_SURROGATE_TAG) {
     89             ce32 = Collation.UNASSIGNED_CE32;
     90         } else if(tag == Collation.U0000_TAG) {
     91             // Fetch the normal ce32 for U+0000.
     92             ce32 = ce32s[0];
     93         }
     94         return ce32;
     95     }
     96 
     97     /**
     98      * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
     99      * if ce32 is special.
    100      */
    101     int getFinalCE32(int ce32) {
    102         if(Collation.isSpecialCE32(ce32)) {
    103             ce32 = getIndirectCE32(ce32);
    104         }
    105         return ce32;
    106     }
    107 
    108     /**
    109      * Computes a CE from c's ce32 which has the OFFSET_TAG.
    110      */
    111     long getCEFromOffsetCE32(int c, int ce32) {
    112         long dataCE = ces[Collation.indexFromCE32(ce32)];
    113         return Collation.makeCE(Collation.getThreeBytePrimaryForOffsetData(c, dataCE));
    114     }
    115 
    116     /**
    117      * Returns the single CE that c maps to.
    118      * Throws UnsupportedOperationException if c does not map to a single CE.
    119      */
    120     long getSingleCE(int c) {
    121         CollationData d;
    122         int ce32 = getCE32(c);
    123         if(ce32 == Collation.FALLBACK_CE32) {
    124             d = base;
    125             ce32 = base.getCE32(c);
    126         } else {
    127             d = this;
    128         }
    129         while(Collation.isSpecialCE32(ce32)) {
    130             switch(Collation.tagFromCE32(ce32)) {
    131             case Collation.LATIN_EXPANSION_TAG:
    132             case Collation.BUILDER_DATA_TAG:
    133             case Collation.PREFIX_TAG:
    134             case Collation.CONTRACTION_TAG:
    135             case Collation.HANGUL_TAG:
    136             case Collation.LEAD_SURROGATE_TAG:
    137                 throw new UnsupportedOperationException(String.format(
    138                         "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
    139                         c, ce32));
    140             case Collation.FALLBACK_TAG:
    141             case Collation.RESERVED_TAG_3:
    142                 throw new AssertionError(String.format(
    143                         "unexpected CE32 tag for U+%04X (CE32 0x%08x)", c, ce32));
    144             case Collation.LONG_PRIMARY_TAG:
    145                 return Collation.ceFromLongPrimaryCE32(ce32);
    146             case Collation.LONG_SECONDARY_TAG:
    147                 return Collation.ceFromLongSecondaryCE32(ce32);
    148             case Collation.EXPANSION32_TAG:
    149                 if(Collation.lengthFromCE32(ce32) == 1) {
    150                     ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
    151                     break;
    152                 } else {
    153                     throw new UnsupportedOperationException(String.format(
    154                             "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
    155                             c, ce32));
    156                 }
    157             case Collation.EXPANSION_TAG: {
    158                 if(Collation.lengthFromCE32(ce32) == 1) {
    159                     return d.ces[Collation.indexFromCE32(ce32)];
    160                 } else {
    161                     throw new UnsupportedOperationException(String.format(
    162                             "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
    163                             c, ce32));
    164                 }
    165             }
    166             case Collation.DIGIT_TAG:
    167                 // Fetch the non-numeric-collation CE32 and continue.
    168                 ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
    169                 break;
    170             case Collation.U0000_TAG:
    171                 assert(c == 0);
    172                 // Fetch the normal ce32 for U+0000 and continue.
    173                 ce32 = d.ce32s[0];
    174                 break;
    175             case Collation.OFFSET_TAG:
    176                 return d.getCEFromOffsetCE32(c, ce32);
    177             case Collation.IMPLICIT_TAG:
    178                 return Collation.unassignedCEFromCodePoint(c);
    179             }
    180         }
    181         return Collation.ceFromSimpleCE32(ce32);
    182     }
    183 
    184     /**
    185      * Returns the FCD16 value for code point c. c must be >= 0.
    186      */
    187     int getFCD16(int c) {
    188         return nfcImpl.getFCD16(c);
    189     }
    190 
    191     /**
    192      * Returns the first primary for the script's reordering group.
    193      * @return the primary with only the first primary lead byte of the group
    194      *         (not necessarily an actual root collator primary weight),
    195      *         or 0 if the script is unknown
    196      */
    197     long getFirstPrimaryForGroup(int script) {
    198         int index = getScriptIndex(script);
    199         return index == 0 ? 0 : (long)scriptStarts[index] << 16;
    200     }
    201 
    202     /**
    203      * Returns the last primary for the script's reordering group.
    204      * @return the last primary of the group
    205      *         (not an actual root collator primary weight),
    206      *         or 0 if the script is unknown
    207      */
    208     public long getLastPrimaryForGroup(int script) {
    209         int index = getScriptIndex(script);
    210         if(index == 0) {
    211             return 0;
    212         }
    213         long limit = scriptStarts[index + 1];
    214         return (limit << 16) - 1;
    215     }
    216 
    217     /**
    218      * Finds the reordering group which contains the primary weight.
    219      * @return the first script of the group, or -1 if the weight is beyond the last group
    220      */
    221     public int getGroupForPrimary(long p) {
    222         p >>= 16;
    223         if(p < scriptStarts[1] || scriptStarts[scriptStarts.length - 1] <= p) {
    224             return -1;
    225         }
    226         int index = 1;
    227         while(p >= scriptStarts[index + 1]) { ++index; }
    228         for(int i = 0; i < numScripts; ++i) {
    229             if(scriptsIndex[i] == index) {
    230                 return i;
    231             }
    232         }
    233         for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
    234             if(scriptsIndex[numScripts + i] == index) {
    235                 return Collator.ReorderCodes.FIRST + i;
    236             }
    237         }
    238         return -1;
    239     }
    240 
    241     private int getScriptIndex(int script) {
    242         if(script < 0) {
    243             return 0;
    244         } else if(script < numScripts) {
    245             return scriptsIndex[script];
    246         } else if(script < Collator.ReorderCodes.FIRST) {
    247             return 0;
    248         } else {
    249             script -= Collator.ReorderCodes.FIRST;
    250             if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
    251                 return scriptsIndex[numScripts + script];
    252             } else {
    253                 return 0;
    254             }
    255         }
    256     }
    257 
    258     public int[] getEquivalentScripts(int script) {
    259         int index = getScriptIndex(script);
    260         if(index == 0) { return EMPTY_INT_ARRAY; }
    261         if(script >= Collator.ReorderCodes.FIRST) {
    262             // Special groups have no aliases.
    263             return new int[] { script };
    264         }
    265 
    266         int length = 0;
    267         for(int i = 0; i < numScripts; ++i) {
    268             if(scriptsIndex[i] == index) {
    269                 ++length;
    270             }
    271         }
    272         int[] dest = new int[length];
    273         if(length == 1) {
    274             dest[0] = script;
    275             return dest;
    276         }
    277         length = 0;
    278         for(int i = 0; i < numScripts; ++i) {
    279             if(scriptsIndex[i] == index) {
    280                 dest[length++] = i;
    281             }
    282         }
    283         return dest;
    284     }
    285 
    286     /**
    287      * Writes the permutation of primary-weight ranges
    288      * for the given reordering of scripts and groups.
    289      * The caller checks for illegal arguments and
    290      * takes care of [DEFAULT] and memory allocation.
    291      *
    292      * <p>Each list element will be a (limit, offset) pair as described
    293      * for the CollationSettings.reorderRanges.
    294      * The list will be empty if no ranges are reordered.
    295      */
    296     void makeReorderRanges(int[] reorder, UVector32 ranges) {
    297         makeReorderRanges(reorder, false, ranges);
    298     }
    299 
    300     private void makeReorderRanges(int[] reorder, boolean latinMustMove, UVector32 ranges) {
    301         ranges.removeAllElements();
    302         int length = reorder.length;
    303         if(length == 0 || (length == 1 && reorder[0] == UScript.UNKNOWN)) {
    304             return;
    305         }
    306 
    307         // Maps each script-or-group range to a new lead byte.
    308         short[] table = new short[scriptStarts.length - 1];  // C++: uint8_t[]
    309 
    310         {
    311             // Set "don't care" values for reserved ranges.
    312             int index = scriptsIndex[
    313                     numScripts + REORDER_RESERVED_BEFORE_LATIN - Collator.ReorderCodes.FIRST];
    314             if(index != 0) {
    315                 table[index] = 0xff;
    316             }
    317             index = scriptsIndex[
    318                     numScripts + REORDER_RESERVED_AFTER_LATIN - Collator.ReorderCodes.FIRST];
    319             if(index != 0) {
    320                 table[index] = 0xff;
    321             }
    322         }
    323 
    324         // Never reorder special low and high primary lead bytes.
    325         assert(scriptStarts.length >= 2);
    326         assert(scriptStarts[0] == 0);
    327         int lowStart = scriptStarts[1];
    328         assert(lowStart == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8));
    329         int highLimit = scriptStarts[scriptStarts.length - 1];
    330         assert(highLimit == (Collation.TRAIL_WEIGHT_BYTE << 8));
    331 
    332         // Get the set of special reorder codes in the input list.
    333         // This supports a fixed number of special reorder codes;
    334         // it works for data with codes beyond Collator.ReorderCodes.LIMIT.
    335         int specials = 0;
    336         for(int i = 0; i < length; ++i) {
    337             int reorderCode = reorder[i] - Collator.ReorderCodes.FIRST;
    338             if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
    339                 specials |= 1 << reorderCode;
    340             }
    341         }
    342 
    343         // Start the reordering with the special low reorder codes that do not occur in the input.
    344         for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
    345             int index = scriptsIndex[numScripts + i];
    346             if(index != 0 && (specials & (1 << i)) == 0) {
    347                 lowStart = addLowScriptRange(table, index, lowStart);
    348             }
    349         }
    350 
    351         // Skip the reserved range before Latin if Latin is the first script,
    352         // so that we do not move it unnecessarily.
    353         int skippedReserved = 0;
    354         if(specials == 0 && reorder[0] == UScript.LATIN && !latinMustMove) {
    355             int index = scriptsIndex[UScript.LATIN];
    356             assert(index != 0);
    357             int start = scriptStarts[index];
    358             assert(lowStart <= start);
    359             skippedReserved = start - lowStart;
    360             lowStart = start;
    361         }
    362 
    363         // Reorder according to the input scripts, continuing from the bottom of the primary range.
    364         boolean hasReorderToEnd = false;
    365         for(int i = 0; i < length;) {
    366             int script = reorder[i++];
    367             if(script == UScript.UNKNOWN) {
    368                 // Put the remaining scripts at the top.
    369                 hasReorderToEnd = true;
    370                 while(i < length) {
    371                     script = reorder[--length];
    372                     if(script == UScript.UNKNOWN) {  // Must occur at most once.
    373                         throw new IllegalArgumentException(
    374                                 "setReorderCodes(): duplicate UScript.UNKNOWN");
    375                     }
    376                     if(script == Collator.ReorderCodes.DEFAULT) {
    377                         throw new IllegalArgumentException(
    378                                 "setReorderCodes(): UScript.DEFAULT together with other scripts");
    379                     }
    380                     int index = getScriptIndex(script);
    381                     if(index == 0) { continue; }
    382                     if(table[index] != 0) {  // Duplicate or equivalent script.
    383                         throw new IllegalArgumentException(
    384                                 "setReorderCodes(): duplicate or equivalent script " +
    385                                 scriptCodeString(script));
    386                     }
    387                     highLimit = addHighScriptRange(table, index, highLimit);
    388                 }
    389                 break;
    390             }
    391             if(script == Collator.ReorderCodes.DEFAULT) {
    392                 // The default code must be the only one in the list, and that is handled by the caller.
    393                 // Otherwise it must not be used.
    394                 throw new IllegalArgumentException(
    395                         "setReorderCodes(): UScript.DEFAULT together with other scripts");
    396             }
    397             int index = getScriptIndex(script);
    398             if(index == 0) { continue; }
    399             if(table[index] != 0) {  // Duplicate or equivalent script.
    400                 throw new IllegalArgumentException(
    401                         "setReorderCodes(): duplicate or equivalent script " +
    402                         scriptCodeString(script));
    403             }
    404             lowStart = addLowScriptRange(table, index, lowStart);
    405         }
    406 
    407         // Put all remaining scripts into the middle.
    408         for(int i = 1; i < scriptStarts.length - 1; ++i) {
    409             int leadByte = table[i];
    410             if(leadByte != 0) { continue; }
    411             int start = scriptStarts[i];
    412             if(!hasReorderToEnd && start > lowStart) {
    413                 // No need to move this script.
    414                 lowStart = start;
    415             }
    416             lowStart = addLowScriptRange(table, i, lowStart);
    417         }
    418         if(lowStart > highLimit) {
    419             if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
    420                 // Try not skipping the before-Latin reserved range.
    421                 makeReorderRanges(reorder, true, ranges);
    422                 return;
    423             }
    424             // We need more primary lead bytes than available, despite the reserved ranges.
    425             throw new ICUException(
    426                     "setReorderCodes(): reordering too many partial-primary-lead-byte scripts");
    427         }
    428 
    429         // Turn lead bytes into a list of (limit, offset) pairs.
    430         // Encode each pair in one list element:
    431         // Upper 16 bits = limit, lower 16 = signed lead byte offset.
    432         int offset = 0;
    433         for(int i = 1;; ++i) {
    434             int nextOffset = offset;
    435             while(i < scriptStarts.length - 1) {
    436                 int newLeadByte = table[i];
    437                 if(newLeadByte == 0xff) {
    438                     // "Don't care" lead byte for reserved range, continue with current offset.
    439                 } else {
    440                     nextOffset = newLeadByte - (scriptStarts[i] >> 8);
    441                     if(nextOffset != offset) { break; }
    442                 }
    443                 ++i;
    444             }
    445             if(offset != 0 || i < scriptStarts.length - 1) {
    446                 ranges.addElement(((int)scriptStarts[i] << 16) | (offset & 0xffff));
    447             }
    448             if(i == scriptStarts.length - 1) { break; }
    449             offset = nextOffset;
    450         }
    451     }
    452 
    453     private int addLowScriptRange(short[] table, int index, int lowStart) {
    454         int start = scriptStarts[index];
    455         if((start & 0xff) < (lowStart & 0xff)) {
    456             lowStart += 0x100;
    457         }
    458         table[index] = (short)(lowStart >> 8);
    459         int limit = scriptStarts[index + 1];
    460         lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
    461         return lowStart;
    462     }
    463 
    464     private int addHighScriptRange(short[] table, int index, int highLimit) {
    465         int limit = scriptStarts[index + 1];
    466         if((limit & 0xff) > (highLimit & 0xff)) {
    467             highLimit -= 0x100;
    468         }
    469         int start = scriptStarts[index];
    470         highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
    471         table[index] = (short)(highLimit >> 8);
    472         return highLimit;
    473     }
    474 
    475     private static String scriptCodeString(int script) {
    476         // Do not use the script name here: We do not want to depend on that data.
    477         return (script < Collator.ReorderCodes.FIRST) ?
    478                 Integer.toString(script) : "0x" + Integer.toHexString(script);
    479     }
    480 
    481     private static final int[] EMPTY_INT_ARRAY = new int[0];
    482 
    483     /** @see jamoCE32s */
    484     static final int JAMO_CE32S_LENGTH = 19 + 21 + 27;
    485 
    486     /** Main lookup trie. */
    487     Trie2_32 trie;
    488     /**
    489      * Array of CE32 values.
    490      * At index 0 there must be CE32(U+0000)
    491      * to support U+0000's special-tag for NUL-termination handling.
    492      */
    493     int[] ce32s;
    494     /** Array of CE values for expansions and OFFSET_TAG. */
    495     long[] ces;
    496     /** Array of prefix and contraction-suffix matching data. */
    497     String contexts;
    498     /** Base collation data, or null if this data itself is a base. */
    499     public CollationData base;
    500     /**
    501      * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
    502      * They are normally simple CE32s, rarely expansions.
    503      * For fast handling of HANGUL_TAG.
    504      */
    505     int[] jamoCE32s = new int[JAMO_CE32S_LENGTH];
    506     public Normalizer2Impl nfcImpl;
    507     /** The single-byte primary weight (xx000000) for numeric collation. */
    508     long numericPrimary = 0x12000000;
    509 
    510     /** 256 flags for which primary-weight lead bytes are compressible. */
    511     public boolean[] compressibleBytes;
    512     /**
    513      * Set of code points that are unsafe for starting string comparison after an identical prefix,
    514      * or in backwards CE iteration.
    515      */
    516     UnicodeSet unsafeBackwardSet;
    517 
    518     /**
    519      * Fast Latin table for common-Latin-text string comparisons.
    520      * Data structure see class CollationFastLatin.
    521      */
    522     public char[] fastLatinTable;
    523     /**
    524      * Header portion of the fastLatinTable.
    525      * In C++, these are one array, and the header is skipped for mapping characters.
    526      * In Java, two arrays work better.
    527      */
    528     char[] fastLatinTableHeader;
    529 
    530     /**
    531      * Data for scripts and reordering groups.
    532      * Uses include building a reordering permutation table and
    533      * providing script boundaries to AlphabeticIndex.
    534      */
    535     int numScripts;
    536     /**
    537      * The length of scriptsIndex is numScripts+16.
    538      * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
    539      * 16 special reorder codes (not all used) are mapped starting at numScripts.
    540      * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
    541      * There are special codes at the end for reorder-reserved primary ranges.
    542      *
    543      * <p>Multiple scripts may share a range and index, for example Hira & Kana.
    544      */
    545     char[] scriptsIndex;
    546     /**
    547      * Start primary weight (top 16 bits only) for a group/script/reserved range
    548      * indexed by scriptsIndex.
    549      * The first range (separators & terminators) and the last range (trailing weights)
    550      * are not reorderable, and no scriptsIndex entry points to them.
    551      */
    552     char[] scriptStarts;
    553 
    554     /**
    555      * Collation elements in the root collator.
    556      * Used by the CollationRootElements class. The data structure is described there.
    557      * null in a tailoring.
    558      */
    559     public long[] rootElements;
    560 }
    561