Home | History | Annotate | Download | only in coll
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2010-2015, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * CollationData.java, ported from collationdata.h/.cpp
      9 *
     10 * C++ version created on: 2010oct27
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 package com.ibm.icu.impl.coll;
     15 
     16 import com.ibm.icu.impl.Normalizer2Impl;
     17 import com.ibm.icu.impl.Trie2_32;
     18 import com.ibm.icu.lang.UScript;
     19 import com.ibm.icu.text.Collator;
     20 import com.ibm.icu.text.UnicodeSet;
     21 import com.ibm.icu.util.ICUException;
     22 
     23 /**
     24  * Collation data container.
     25  * Immutable data created by a CollationDataBuilder, or loaded from a file,
     26  * or deserialized from API-provided binary data.
     27  *
     28  * Includes data for the collation base (root/default), aliased if this is not the base.
     29  */
     30 public final class CollationData {
     31     // Note: The ucadata.icu loader could discover the reserved ranges by setting an array
     32     // parallel with the ranges, and resetting ranges that are indexed.
     33     // The reordering builder code could clone the resulting template array.
     34     static final int REORDER_RESERVED_BEFORE_LATIN = Collator.ReorderCodes.FIRST + 14;
     35     static final int REORDER_RESERVED_AFTER_LATIN = Collator.ReorderCodes.FIRST + 15;
     36 
     37     static final int MAX_NUM_SPECIAL_REORDER_CODES = 8;
     38 
     39     CollationData(Normalizer2Impl nfc) {
     40         nfcImpl = nfc;
     41     }
     42 
     43     public int getCE32(int c) {
     44         return trie.get(c);
     45     }
     46 
     47     int getCE32FromSupplementary(int c) {
     48         return trie.get(c);  // TODO: port UTRIE2_GET32_FROM_SUPP(trie, c) to Java?
     49     }
     50 
     51     boolean isDigit(int c) {
     52         return c < 0x660 ? c <= 0x39 && 0x30 <= c :
     53                 Collation.hasCE32Tag(getCE32(c), Collation.DIGIT_TAG);
     54     }
     55 
     56     public boolean isUnsafeBackward(int c, boolean numeric) {
     57         return unsafeBackwardSet.contains(c) || (numeric && isDigit(c));
     58     }
     59 
     60     public boolean isCompressibleLeadByte(int b) {
     61         return compressibleBytes[b];
     62     }
     63 
     64     public boolean isCompressiblePrimary(long p) {
     65         return isCompressibleLeadByte((int)p >>> 24);
     66     }
     67 
     68     /**
     69      * Returns the CE32 from two contexts words.
     70      * Access to the defaultCE32 for contraction and prefix matching.
     71      */
     72     int getCE32FromContexts(int index) {
     73         return ((int)contexts.charAt(index) << 16) | contexts.charAt(index + 1);
     74     }
     75 
     76     /**
     77      * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
     78      * Requires that ce32 is special.
     79      */
     80     int getIndirectCE32(int ce32) {
     81         assert(Collation.isSpecialCE32(ce32));
     82         int tag = Collation.tagFromCE32(ce32);
     83         if(tag == Collation.DIGIT_TAG) {
     84             // Fetch the non-numeric-collation CE32.
     85             ce32 = ce32s[Collation.indexFromCE32(ce32)];
     86         } else if(tag == Collation.LEAD_SURROGATE_TAG) {
     87             ce32 = Collation.UNASSIGNED_CE32;
     88         } else if(tag == Collation.U0000_TAG) {
     89             // Fetch the normal ce32 for U+0000.
     90             ce32 = ce32s[0];
     91         }
     92         return ce32;
     93     }
     94 
     95     /**
     96      * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
     97      * if ce32 is special.
     98      */
     99     int getFinalCE32(int ce32) {
    100         if(Collation.isSpecialCE32(ce32)) {
    101             ce32 = getIndirectCE32(ce32);
    102         }
    103         return ce32;
    104     }
    105 
    106     /**
    107      * Computes a CE from c's ce32 which has the OFFSET_TAG.
    108      */
    109     long getCEFromOffsetCE32(int c, int ce32) {
    110         long dataCE = ces[Collation.indexFromCE32(ce32)];
    111         return Collation.makeCE(Collation.getThreeBytePrimaryForOffsetData(c, dataCE));
    112     }
    113 
    114     /**
    115      * Returns the single CE that c maps to.
    116      * Throws UnsupportedOperationException if c does not map to a single CE.
    117      */
    118     long getSingleCE(int c) {
    119         CollationData d;
    120         int ce32 = getCE32(c);
    121         if(ce32 == Collation.FALLBACK_CE32) {
    122             d = base;
    123             ce32 = base.getCE32(c);
    124         } else {
    125             d = this;
    126         }
    127         while(Collation.isSpecialCE32(ce32)) {
    128             switch(Collation.tagFromCE32(ce32)) {
    129             case Collation.LATIN_EXPANSION_TAG:
    130             case Collation.BUILDER_DATA_TAG:
    131             case Collation.PREFIX_TAG:
    132             case Collation.CONTRACTION_TAG:
    133             case Collation.HANGUL_TAG:
    134             case Collation.LEAD_SURROGATE_TAG:
    135                 throw new UnsupportedOperationException(String.format(
    136                         "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
    137                         c, ce32));
    138             case Collation.FALLBACK_TAG:
    139             case Collation.RESERVED_TAG_3:
    140                 throw new AssertionError(String.format(
    141                         "unexpected CE32 tag for U+%04X (CE32 0x%08x)", c, ce32));
    142             case Collation.LONG_PRIMARY_TAG:
    143                 return Collation.ceFromLongPrimaryCE32(ce32);
    144             case Collation.LONG_SECONDARY_TAG:
    145                 return Collation.ceFromLongSecondaryCE32(ce32);
    146             case Collation.EXPANSION32_TAG:
    147                 if(Collation.lengthFromCE32(ce32) == 1) {
    148                     ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
    149                     break;
    150                 } else {
    151                     throw new UnsupportedOperationException(String.format(
    152                             "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
    153                             c, ce32));
    154                 }
    155             case Collation.EXPANSION_TAG: {
    156                 if(Collation.lengthFromCE32(ce32) == 1) {
    157                     return d.ces[Collation.indexFromCE32(ce32)];
    158                 } else {
    159                     throw new UnsupportedOperationException(String.format(
    160                             "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
    161                             c, ce32));
    162                 }
    163             }
    164             case Collation.DIGIT_TAG:
    165                 // Fetch the non-numeric-collation CE32 and continue.
    166                 ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
    167                 break;
    168             case Collation.U0000_TAG:
    169                 assert(c == 0);
    170                 // Fetch the normal ce32 for U+0000 and continue.
    171                 ce32 = d.ce32s[0];
    172                 break;
    173             case Collation.OFFSET_TAG:
    174                 return d.getCEFromOffsetCE32(c, ce32);
    175             case Collation.IMPLICIT_TAG:
    176                 return Collation.unassignedCEFromCodePoint(c);
    177             }
    178         }
    179         return Collation.ceFromSimpleCE32(ce32);
    180     }
    181 
    182     /**
    183      * Returns the FCD16 value for code point c. c must be >= 0.
    184      */
    185     int getFCD16(int c) {
    186         return nfcImpl.getFCD16(c);
    187     }
    188 
    189     /**
    190      * Returns the first primary for the script's reordering group.
    191      * @return the primary with only the first primary lead byte of the group
    192      *         (not necessarily an actual root collator primary weight),
    193      *         or 0 if the script is unknown
    194      */
    195     long getFirstPrimaryForGroup(int script) {
    196         int index = getScriptIndex(script);
    197         return index == 0 ? 0 : (long)scriptStarts[index] << 16;
    198     }
    199 
    200     /**
    201      * Returns the last primary for the script's reordering group.
    202      * @return the last primary of the group
    203      *         (not an actual root collator primary weight),
    204      *         or 0 if the script is unknown
    205      */
    206     public long getLastPrimaryForGroup(int script) {
    207         int index = getScriptIndex(script);
    208         if(index == 0) {
    209             return 0;
    210         }
    211         long limit = scriptStarts[index + 1];
    212         return (limit << 16) - 1;
    213     }
    214 
    215     /**
    216      * Finds the reordering group which contains the primary weight.
    217      * @return the first script of the group, or -1 if the weight is beyond the last group
    218      */
    219     public int getGroupForPrimary(long p) {
    220         p >>= 16;
    221         if(p < scriptStarts[1] || scriptStarts[scriptStarts.length - 1] <= p) {
    222             return -1;
    223         }
    224         int index = 1;
    225         while(p >= scriptStarts[index + 1]) { ++index; }
    226         for(int i = 0; i < numScripts; ++i) {
    227             if(scriptsIndex[i] == index) {
    228                 return i;
    229             }
    230         }
    231         for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
    232             if(scriptsIndex[numScripts + i] == index) {
    233                 return Collator.ReorderCodes.FIRST + i;
    234             }
    235         }
    236         return -1;
    237     }
    238 
    239     private int getScriptIndex(int script) {
    240         if(script < 0) {
    241             return 0;
    242         } else if(script < numScripts) {
    243             return scriptsIndex[script];
    244         } else if(script < Collator.ReorderCodes.FIRST) {
    245             return 0;
    246         } else {
    247             script -= Collator.ReorderCodes.FIRST;
    248             if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
    249                 return scriptsIndex[numScripts + script];
    250             } else {
    251                 return 0;
    252             }
    253         }
    254     }
    255 
    256     public int[] getEquivalentScripts(int script) {
    257         int index = getScriptIndex(script);
    258         if(index == 0) { return EMPTY_INT_ARRAY; }
    259         if(script >= Collator.ReorderCodes.FIRST) {
    260             // Special groups have no aliases.
    261             return new int[] { script };
    262         }
    263 
    264         int length = 0;
    265         for(int i = 0; i < numScripts; ++i) {
    266             if(scriptsIndex[i] == index) {
    267                 ++length;
    268             }
    269         }
    270         int[] dest = new int[length];
    271         if(length == 1) {
    272             dest[0] = script;
    273             return dest;
    274         }
    275         length = 0;
    276         for(int i = 0; i < numScripts; ++i) {
    277             if(scriptsIndex[i] == index) {
    278                 dest[length++] = i;
    279             }
    280         }
    281         return dest;
    282     }
    283 
    284     /**
    285      * Writes the permutation of primary-weight ranges
    286      * for the given reordering of scripts and groups.
    287      * The caller checks for illegal arguments and
    288      * takes care of [DEFAULT] and memory allocation.
    289      *
    290      * <p>Each list element will be a (limit, offset) pair as described
    291      * for the CollationSettings.reorderRanges.
    292      * The list will be empty if no ranges are reordered.
    293      */
    294     void makeReorderRanges(int[] reorder, UVector32 ranges) {
    295         makeReorderRanges(reorder, false, ranges);
    296     }
    297 
    298     private void makeReorderRanges(int[] reorder, boolean latinMustMove, UVector32 ranges) {
    299         ranges.removeAllElements();
    300         int length = reorder.length;
    301         if(length == 0 || (length == 1 && reorder[0] == UScript.UNKNOWN)) {
    302             return;
    303         }
    304 
    305         // Maps each script-or-group range to a new lead byte.
    306         short[] table = new short[scriptStarts.length - 1];  // C++: uint8_t[]
    307 
    308         {
    309             // Set "don't care" values for reserved ranges.
    310             int index = scriptsIndex[
    311                     numScripts + REORDER_RESERVED_BEFORE_LATIN - Collator.ReorderCodes.FIRST];
    312             if(index != 0) {
    313                 table[index] = 0xff;
    314             }
    315             index = scriptsIndex[
    316                     numScripts + REORDER_RESERVED_AFTER_LATIN - Collator.ReorderCodes.FIRST];
    317             if(index != 0) {
    318                 table[index] = 0xff;
    319             }
    320         }
    321 
    322         // Never reorder special low and high primary lead bytes.
    323         assert(scriptStarts.length >= 2);
    324         assert(scriptStarts[0] == 0);
    325         int lowStart = scriptStarts[1];
    326         assert(lowStart == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8));
    327         int highLimit = scriptStarts[scriptStarts.length - 1];
    328         assert(highLimit == (Collation.TRAIL_WEIGHT_BYTE << 8));
    329 
    330         // Get the set of special reorder codes in the input list.
    331         // This supports a fixed number of special reorder codes;
    332         // it works for data with codes beyond Collator.ReorderCodes.LIMIT.
    333         int specials = 0;
    334         for(int i = 0; i < length; ++i) {
    335             int reorderCode = reorder[i] - Collator.ReorderCodes.FIRST;
    336             if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
    337                 specials |= 1 << reorderCode;
    338             }
    339         }
    340 
    341         // Start the reordering with the special low reorder codes that do not occur in the input.
    342         for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
    343             int index = scriptsIndex[numScripts + i];
    344             if(index != 0 && (specials & (1 << i)) == 0) {
    345                 lowStart = addLowScriptRange(table, index, lowStart);
    346             }
    347         }
    348 
    349         // Skip the reserved range before Latin if Latin is the first script,
    350         // so that we do not move it unnecessarily.
    351         int skippedReserved = 0;
    352         if(specials == 0 && reorder[0] == UScript.LATIN && !latinMustMove) {
    353             int index = scriptsIndex[UScript.LATIN];
    354             assert(index != 0);
    355             int start = scriptStarts[index];
    356             assert(lowStart <= start);
    357             skippedReserved = start - lowStart;
    358             lowStart = start;
    359         }
    360 
    361         // Reorder according to the input scripts, continuing from the bottom of the primary range.
    362         boolean hasReorderToEnd = false;
    363         for(int i = 0; i < length;) {
    364             int script = reorder[i++];
    365             if(script == UScript.UNKNOWN) {
    366                 // Put the remaining scripts at the top.
    367                 hasReorderToEnd = true;
    368                 while(i < length) {
    369                     script = reorder[--length];
    370                     if(script == UScript.UNKNOWN) {  // Must occur at most once.
    371                         throw new IllegalArgumentException(
    372                                 "setReorderCodes(): duplicate UScript.UNKNOWN");
    373                     }
    374                     if(script == Collator.ReorderCodes.DEFAULT) {
    375                         throw new IllegalArgumentException(
    376                                 "setReorderCodes(): UScript.DEFAULT together with other scripts");
    377                     }
    378                     int index = getScriptIndex(script);
    379                     if(index == 0) { continue; }
    380                     if(table[index] != 0) {  // Duplicate or equivalent script.
    381                         throw new IllegalArgumentException(
    382                                 "setReorderCodes(): duplicate or equivalent script " +
    383                                 scriptCodeString(script));
    384                     }
    385                     highLimit = addHighScriptRange(table, index, highLimit);
    386                 }
    387                 break;
    388             }
    389             if(script == Collator.ReorderCodes.DEFAULT) {
    390                 // The default code must be the only one in the list, and that is handled by the caller.
    391                 // Otherwise it must not be used.
    392                 throw new IllegalArgumentException(
    393                         "setReorderCodes(): UScript.DEFAULT together with other scripts");
    394             }
    395             int index = getScriptIndex(script);
    396             if(index == 0) { continue; }
    397             if(table[index] != 0) {  // Duplicate or equivalent script.
    398                 throw new IllegalArgumentException(
    399                         "setReorderCodes(): duplicate or equivalent script " +
    400                         scriptCodeString(script));
    401             }
    402             lowStart = addLowScriptRange(table, index, lowStart);
    403         }
    404 
    405         // Put all remaining scripts into the middle.
    406         for(int i = 1; i < scriptStarts.length - 1; ++i) {
    407             int leadByte = table[i];
    408             if(leadByte != 0) { continue; }
    409             int start = scriptStarts[i];
    410             if(!hasReorderToEnd && start > lowStart) {
    411                 // No need to move this script.
    412                 lowStart = start;
    413             }
    414             lowStart = addLowScriptRange(table, i, lowStart);
    415         }
    416         if(lowStart > highLimit) {
    417             if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
    418                 // Try not skipping the before-Latin reserved range.
    419                 makeReorderRanges(reorder, true, ranges);
    420                 return;
    421             }
    422             // We need more primary lead bytes than available, despite the reserved ranges.
    423             throw new ICUException(
    424                     "setReorderCodes(): reordering too many partial-primary-lead-byte scripts");
    425         }
    426 
    427         // Turn lead bytes into a list of (limit, offset) pairs.
    428         // Encode each pair in one list element:
    429         // Upper 16 bits = limit, lower 16 = signed lead byte offset.
    430         int offset = 0;
    431         for(int i = 1;; ++i) {
    432             int nextOffset = offset;
    433             while(i < scriptStarts.length - 1) {
    434                 int newLeadByte = table[i];
    435                 if(newLeadByte == 0xff) {
    436                     // "Don't care" lead byte for reserved range, continue with current offset.
    437                 } else {
    438                     nextOffset = newLeadByte - (scriptStarts[i] >> 8);
    439                     if(nextOffset != offset) { break; }
    440                 }
    441                 ++i;
    442             }
    443             if(offset != 0 || i < scriptStarts.length - 1) {
    444                 ranges.addElement(((int)scriptStarts[i] << 16) | (offset & 0xffff));
    445             }
    446             if(i == scriptStarts.length - 1) { break; }
    447             offset = nextOffset;
    448         }
    449     }
    450 
    451     private int addLowScriptRange(short[] table, int index, int lowStart) {
    452         int start = scriptStarts[index];
    453         if((start & 0xff) < (lowStart & 0xff)) {
    454             lowStart += 0x100;
    455         }
    456         table[index] = (short)(lowStart >> 8);
    457         int limit = scriptStarts[index + 1];
    458         lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
    459         return lowStart;
    460     }
    461 
    462     private int addHighScriptRange(short[] table, int index, int highLimit) {
    463         int limit = scriptStarts[index + 1];
    464         if((limit & 0xff) > (highLimit & 0xff)) {
    465             highLimit -= 0x100;
    466         }
    467         int start = scriptStarts[index];
    468         highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
    469         table[index] = (short)(highLimit >> 8);
    470         return highLimit;
    471     }
    472 
    473     private static String scriptCodeString(int script) {
    474         // Do not use the script name here: We do not want to depend on that data.
    475         return (script < Collator.ReorderCodes.FIRST) ?
    476                 Integer.toString(script) : "0x" + Integer.toHexString(script);
    477     }
    478 
    479     private static final int[] EMPTY_INT_ARRAY = new int[0];
    480 
    481     /** @see jamoCE32s */
    482     static final int JAMO_CE32S_LENGTH = 19 + 21 + 27;
    483 
    484     /** Main lookup trie. */
    485     Trie2_32 trie;
    486     /**
    487      * Array of CE32 values.
    488      * At index 0 there must be CE32(U+0000)
    489      * to support U+0000's special-tag for NUL-termination handling.
    490      */
    491     int[] ce32s;
    492     /** Array of CE values for expansions and OFFSET_TAG. */
    493     long[] ces;
    494     /** Array of prefix and contraction-suffix matching data. */
    495     String contexts;
    496     /** Base collation data, or null if this data itself is a base. */
    497     public CollationData base;
    498     /**
    499      * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
    500      * They are normally simple CE32s, rarely expansions.
    501      * For fast handling of HANGUL_TAG.
    502      */
    503     int[] jamoCE32s = new int[JAMO_CE32S_LENGTH];
    504     public Normalizer2Impl nfcImpl;
    505     /** The single-byte primary weight (xx000000) for numeric collation. */
    506     long numericPrimary = 0x12000000;
    507 
    508     /** 256 flags for which primary-weight lead bytes are compressible. */
    509     public boolean[] compressibleBytes;
    510     /**
    511      * Set of code points that are unsafe for starting string comparison after an identical prefix,
    512      * or in backwards CE iteration.
    513      */
    514     UnicodeSet unsafeBackwardSet;
    515 
    516     /**
    517      * Fast Latin table for common-Latin-text string comparisons.
    518      * Data structure see class CollationFastLatin.
    519      */
    520     public char[] fastLatinTable;
    521     /**
    522      * Header portion of the fastLatinTable.
    523      * In C++, these are one array, and the header is skipped for mapping characters.
    524      * In Java, two arrays work better.
    525      */
    526     char[] fastLatinTableHeader;
    527 
    528     /**
    529      * Data for scripts and reordering groups.
    530      * Uses include building a reordering permutation table and
    531      * providing script boundaries to AlphabeticIndex.
    532      */
    533     int numScripts;
    534     /**
    535      * The length of scriptsIndex is numScripts+16.
    536      * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
    537      * 16 special reorder codes (not all used) are mapped starting at numScripts.
    538      * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
    539      * There are special codes at the end for reorder-reserved primary ranges.
    540      *
    541      * <p>Multiple scripts may share a range and index, for example Hira & Kana.
    542      */
    543     char[] scriptsIndex;
    544     /**
    545      * Start primary weight (top 16 bits only) for a group/script/reserved range
    546      * indexed by scriptsIndex.
    547      * The first range (separators & terminators) and the last range (trailing weights)
    548      * are not reorderable, and no scriptsIndex entry points to them.
    549      */
    550     char[] scriptStarts;
    551 
    552     /**
    553      * Collation elements in the root collator.
    554      * Used by the CollationRootElements class. The data structure is described there.
    555      * null in a tailoring.
    556      */
    557     public long[] rootElements;
    558 }
    559