Home | History | Annotate | Download | only in coll
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2013-2015, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * CollationDataReader.java, ported from collationdatareader.h/.cpp
      9 *
     10 * C++ version created on: 2013feb07
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 package com.ibm.icu.impl.coll;
     15 
     16 import java.io.IOException;
     17 import java.nio.ByteBuffer;
     18 import java.nio.CharBuffer;
     19 import java.util.Arrays;
     20 
     21 import com.ibm.icu.impl.ICUBinary;
     22 import com.ibm.icu.impl.Trie2_32;
     23 import com.ibm.icu.impl.USerializedSet;
     24 import com.ibm.icu.text.Collator;
     25 import com.ibm.icu.text.UnicodeSet;
     26 import com.ibm.icu.util.ICUException;
     27 
     28 /**
     29  * Collation binary data reader.
     30  */
     31 final class CollationDataReader /* all static */ {
     32     // The following constants are also copied into source/common/ucol_swp.cpp.
     33     // Keep them in sync!
     34     /**
     35      * Number of int indexes.
     36      *
     37      * Can be 2 if there are only options.
     38      * Can be 7 or 8 if there are only options and a script reordering.
     39      * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0.
     40      */
     41     static final int IX_INDEXES_LENGTH = 0;
     42     /**
     43      * Bits 31..24: numericPrimary, for numeric collation
     44      *      23..16: fast Latin format version (0 = no fast Latin table)
     45      *      15.. 0: options bit set
     46      */
     47     static final int IX_OPTIONS = 1;
     48     static final int IX_RESERVED2 = 2;
     49     static final int IX_RESERVED3 = 3;
     50 
     51     /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */
     52     static final int IX_JAMO_CE32S_START = 4;
     53 
     54     // Byte offsets from the start of the data, after the generic header.
     55     // The indexes[] are at byte offset 0, other data follows.
     56     // Each data item is aligned properly.
     57     // The data items should be in descending order of unit size,
     58     // to minimize the need for padding.
     59     // Each item's byte length is given by the difference between its offset and
     60     // the next index/offset value.
     61     /** Byte offset to int reorderCodes[]. */
     62     static final int IX_REORDER_CODES_OFFSET = 5;
     63     /**
     64      * Byte offset to uint8_t reorderTable[].
     65      * Empty table if <256 bytes (padding only).
     66      * Otherwise 256 bytes or more (with padding).
     67      */
     68     static final int IX_REORDER_TABLE_OFFSET = 6;
     69     /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */
     70     static final int IX_TRIE_OFFSET = 7;
     71 
     72     static final int IX_RESERVED8_OFFSET = 8;
     73     /** Byte offset to long ces[]. */
     74     static final int IX_CES_OFFSET = 9;
     75     static final int IX_RESERVED10_OFFSET = 10;
     76     /** Byte offset to int ce32s[]. */
     77     static final int IX_CE32S_OFFSET = 11;
     78 
     79     /** Byte offset to uint32_t rootElements[]. */
     80     static final int IX_ROOT_ELEMENTS_OFFSET = 12;
     81     /** Byte offset to UChar *contexts[]. */
     82     static final int IX_CONTEXTS_OFFSET = 13;
     83     /** Byte offset to char [] with serialized unsafeBackwardSet. */
     84     static final int IX_UNSAFE_BWD_OFFSET = 14;
     85     /** Byte offset to char fastLatinTable[]. */
     86     static final int IX_FAST_LATIN_TABLE_OFFSET = 15;
     87 
     88     /** Byte offset to char scripts[]. */
     89     static final int IX_SCRIPTS_OFFSET = 16;
     90     /**
     91      * Byte offset to boolean compressibleBytes[].
     92      * Empty table if <256 bytes (padding only).
     93      * Otherwise 256 bytes or more (with padding).
     94      */
     95     static final int IX_COMPRESSIBLE_BYTES_OFFSET = 17;
     96     static final int IX_RESERVED18_OFFSET = 18;
     97     static final int IX_TOTAL_SIZE = 19;
     98 
     99     static void read(CollationTailoring base, ByteBuffer inBytes,
    100                      CollationTailoring tailoring) throws IOException {
    101         tailoring.version = ICUBinary.readHeader(inBytes, DATA_FORMAT, IS_ACCEPTABLE);
    102         if(base != null && base.getUCAVersion() != tailoring.getUCAVersion()) {
    103             throw new ICUException("Tailoring UCA version differs from base data UCA version");
    104         }
    105 
    106         int inLength = inBytes.remaining();
    107         if(inLength < 8) {
    108             throw new ICUException("not enough bytes");
    109         }
    110         int indexesLength = inBytes.getInt();  // inIndexes[IX_INDEXES_LENGTH]
    111         if(indexesLength < 2 || inLength < indexesLength * 4) {
    112             throw new ICUException("not enough indexes");
    113         }
    114         int[] inIndexes = new int[IX_TOTAL_SIZE + 1];
    115         inIndexes[0] = indexesLength;
    116         for(int i = 1; i < indexesLength && i < inIndexes.length; ++i) {
    117             inIndexes[i] = inBytes.getInt();
    118         }
    119         for(int i = indexesLength; i < inIndexes.length; ++i) {
    120             inIndexes[i] = -1;
    121         }
    122         if(indexesLength > inIndexes.length) {
    123             ICUBinary.skipBytes(inBytes, (indexesLength - inIndexes.length) * 4);
    124         }
    125 
    126         // Assume that the tailoring data is in initial state,
    127         // with null pointers and 0 lengths.
    128 
    129         // Set pointers to non-empty data parts.
    130         // Do this in order of their byte offsets. (Should help porting to Java.)
    131 
    132         int index;  // one of the indexes[] slots
    133         int offset;  // byte offset for the index part
    134         int length;  // number of bytes in the index part
    135 
    136         if(indexesLength > IX_TOTAL_SIZE) {
    137             length = inIndexes[IX_TOTAL_SIZE];
    138         } else if(indexesLength > IX_REORDER_CODES_OFFSET) {
    139             length = inIndexes[indexesLength - 1];
    140         } else {
    141             length = 0;  // only indexes, and inLength was already checked for them
    142         }
    143         if(inLength < length) {
    144             throw new ICUException("not enough bytes");
    145         }
    146 
    147         CollationData baseData = base == null ? null : base.data;
    148         int[] reorderCodes;
    149         int reorderCodesLength;
    150         index = IX_REORDER_CODES_OFFSET;
    151         offset = inIndexes[index];
    152         length = inIndexes[index + 1] - offset;
    153         if(length >= 4) {
    154             if(baseData == null) {
    155                 // We assume for collation settings that
    156                 // the base data does not have a reordering.
    157                 throw new ICUException("Collation base data must not reorder scripts");
    158             }
    159             reorderCodesLength = length / 4;
    160             reorderCodes = ICUBinary.getInts(inBytes, reorderCodesLength, length & 3);
    161 
    162             // The reorderRanges (if any) are the trailing reorderCodes entries.
    163             // Split the array at the boundary.
    164             // Script or reorder codes do not exceed 16-bit values.
    165             // Range limits are stored in the upper 16 bits, and are never 0.
    166             int reorderRangesLength = 0;
    167             while(reorderRangesLength < reorderCodesLength &&
    168                     (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
    169                 ++reorderRangesLength;
    170             }
    171             assert(reorderRangesLength < reorderCodesLength);
    172             reorderCodesLength -= reorderRangesLength;
    173         } else {
    174             reorderCodes = new int[0];
    175             reorderCodesLength = 0;
    176             ICUBinary.skipBytes(inBytes, length);
    177         }
    178 
    179         // There should be a reorder table only if there are reorder codes.
    180         // However, when there are reorder codes the reorder table may be omitted to reduce
    181         // the data size.
    182         byte[] reorderTable = null;
    183         index = IX_REORDER_TABLE_OFFSET;
    184         offset = inIndexes[index];
    185         length = inIndexes[index + 1] - offset;
    186         if(length >= 256) {
    187             if(reorderCodesLength == 0) {
    188                 throw new ICUException("Reordering table without reordering codes");
    189             }
    190             reorderTable = new byte[256];
    191             inBytes.get(reorderTable);
    192             length -= 256;
    193         } else {
    194             // If we have reorder codes, then build the reorderTable at the end,
    195             // when the CollationData is otherwise complete.
    196         }
    197         ICUBinary.skipBytes(inBytes, length);
    198 
    199         if(baseData != null && baseData.numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000L)) {
    200             throw new ICUException("Tailoring numeric primary weight differs from base data");
    201         }
    202         CollationData data = null;  // Remains null if there are no mappings.
    203 
    204         index = IX_TRIE_OFFSET;
    205         offset = inIndexes[index];
    206         length = inIndexes[index + 1] - offset;
    207         if(length >= 8) {
    208             tailoring.ensureOwnedData();
    209             data = tailoring.ownedData;
    210             data.base = baseData;
    211             data.numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000L;
    212             data.trie = tailoring.trie = Trie2_32.createFromSerialized(inBytes);
    213             int trieLength = data.trie.getSerializedLength();
    214             if(trieLength > length) {
    215                 throw new ICUException("Not enough bytes for the mappings trie");  // No mappings.
    216             }
    217             length -= trieLength;
    218         } else if(baseData != null) {
    219             // Use the base data. Only the settings are tailored.
    220             tailoring.data = baseData;
    221         } else {
    222             throw new ICUException("Missing collation data mappings");  // No mappings.
    223         }
    224         ICUBinary.skipBytes(inBytes, length);
    225 
    226         index = IX_RESERVED8_OFFSET;
    227         offset = inIndexes[index];
    228         length = inIndexes[index + 1] - offset;
    229         ICUBinary.skipBytes(inBytes, length);
    230 
    231         index = IX_CES_OFFSET;
    232         offset = inIndexes[index];
    233         length = inIndexes[index + 1] - offset;
    234         if(length >= 8) {
    235             if(data == null) {
    236                 throw new ICUException("Tailored ces without tailored trie");
    237             }
    238             data.ces = ICUBinary.getLongs(inBytes, length / 8, length & 7);
    239         } else {
    240             ICUBinary.skipBytes(inBytes, length);
    241         }
    242 
    243         index = IX_RESERVED10_OFFSET;
    244         offset = inIndexes[index];
    245         length = inIndexes[index + 1] - offset;
    246         ICUBinary.skipBytes(inBytes, length);
    247 
    248         index = IX_CE32S_OFFSET;
    249         offset = inIndexes[index];
    250         length = inIndexes[index + 1] - offset;
    251         if(length >= 4) {
    252             if(data == null) {
    253                 throw new ICUException("Tailored ce32s without tailored trie");
    254             }
    255             data.ce32s = ICUBinary.getInts(inBytes, length / 4, length & 3);
    256         } else {
    257             ICUBinary.skipBytes(inBytes, length);
    258         }
    259 
    260         int jamoCE32sStart = inIndexes[IX_JAMO_CE32S_START];
    261         if(jamoCE32sStart >= 0) {
    262             if(data == null || data.ce32s == null) {
    263                 throw new ICUException("JamoCE32sStart index into non-existent ce32s[]");
    264             }
    265             data.jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
    266             System.arraycopy(data.ce32s, jamoCE32sStart, data.jamoCE32s, 0, CollationData.JAMO_CE32S_LENGTH);
    267         } else if(data == null) {
    268             // Nothing to do.
    269         } else if(baseData != null) {
    270             data.jamoCE32s = baseData.jamoCE32s;
    271         } else {
    272             throw new ICUException("Missing Jamo CE32s for Hangul processing");
    273         }
    274 
    275         index = IX_ROOT_ELEMENTS_OFFSET;
    276         offset = inIndexes[index];
    277         length = inIndexes[index + 1] - offset;
    278         if(length >= 4) {
    279             int rootElementsLength = length / 4;
    280             if(data == null) {
    281                 throw new ICUException("Root elements but no mappings");
    282             }
    283             if(rootElementsLength <= CollationRootElements.IX_SEC_TER_BOUNDARIES) {
    284                 throw new ICUException("Root elements array too short");
    285             }
    286             data.rootElements = new long[rootElementsLength];
    287             for(int i = 0; i < rootElementsLength; ++i) {
    288                 data.rootElements[i] = inBytes.getInt() & 0xffffffffL;  // unsigned int -> long
    289             }
    290             long commonSecTer = data.rootElements[CollationRootElements.IX_COMMON_SEC_AND_TER_CE];
    291             if(commonSecTer != Collation.COMMON_SEC_AND_TER_CE) {
    292                 throw new ICUException("Common sec/ter weights in base data differ from the hardcoded value");
    293             }
    294             long secTerBoundaries = data.rootElements[CollationRootElements.IX_SEC_TER_BOUNDARIES];
    295             if((secTerBoundaries >>> 24) < CollationKeys.SEC_COMMON_HIGH) {
    296                 // [fixed last secondary common byte] is too low,
    297                 // and secondary weights would collide with compressed common secondaries.
    298                 throw new ICUException("[fixed last secondary common byte] is too low");
    299             }
    300             length &= 3;
    301         }
    302         ICUBinary.skipBytes(inBytes, length);
    303 
    304         index = IX_CONTEXTS_OFFSET;
    305         offset = inIndexes[index];
    306         length = inIndexes[index + 1] - offset;
    307         if(length >= 2) {
    308             if(data == null) {
    309                 throw new ICUException("Tailored contexts without tailored trie");
    310             }
    311             data.contexts = ICUBinary.getString(inBytes, length / 2, length & 1);
    312         } else {
    313             ICUBinary.skipBytes(inBytes, length);
    314         }
    315 
    316         index = IX_UNSAFE_BWD_OFFSET;
    317         offset = inIndexes[index];
    318         length = inIndexes[index + 1] - offset;
    319         if(length >= 2) {
    320             if(data == null) {
    321                 throw new ICUException("Unsafe-backward-set but no mappings");
    322             }
    323             if(baseData == null) {
    324                 // Create the unsafe-backward set for the root collator.
    325                 // Include all non-zero combining marks and trail surrogates.
    326                 // We do this at load time, rather than at build time,
    327                 // to simplify Unicode version bootstrapping:
    328                 // The root data builder only needs the new FractionalUCA.txt data,
    329                 // but it need not be built with a version of ICU already updated to
    330                 // the corresponding new Unicode Character Database.
    331                 //
    332                 // The following is an optimized version of
    333                 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
    334                 // It is faster and requires fewer code dependencies.
    335                 tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff);  // trail surrogates
    336                 data.nfcImpl.addLcccChars(tailoring.unsafeBackwardSet);
    337             } else {
    338                 // Clone the root collator's set contents.
    339                 tailoring.unsafeBackwardSet = baseData.unsafeBackwardSet.cloneAsThawed();
    340             }
    341             // Add the ranges from the data file to the unsafe-backward set.
    342             USerializedSet sset = new USerializedSet();
    343             char[] unsafeData = ICUBinary.getChars(inBytes, length / 2, length & 1);
    344             length = 0;
    345             sset.getSet(unsafeData, 0);
    346             int count = sset.countRanges();
    347             int[] range = new int[2];
    348             for(int i = 0; i < count; ++i) {
    349                 sset.getRange(i, range);
    350                 tailoring.unsafeBackwardSet.add(range[0], range[1]);
    351             }
    352             // Mark each lead surrogate as "unsafe"
    353             // if any of its 1024 associated supplementary code points is "unsafe".
    354             int c = 0x10000;
    355             for(int lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
    356                 if(!tailoring.unsafeBackwardSet.containsNone(c, c + 0x3ff)) {
    357                     tailoring.unsafeBackwardSet.add(lead);
    358                 }
    359             }
    360             tailoring.unsafeBackwardSet.freeze();
    361             data.unsafeBackwardSet = tailoring.unsafeBackwardSet;
    362         } else if(data == null) {
    363             // Nothing to do.
    364         } else if(baseData != null) {
    365             // No tailoring-specific data: Alias the root collator's set.
    366             data.unsafeBackwardSet = baseData.unsafeBackwardSet;
    367         } else {
    368             throw new ICUException("Missing unsafe-backward-set");
    369         }
    370         ICUBinary.skipBytes(inBytes, length);
    371 
    372         // If the fast Latin format version is different,
    373         // or the version is set to 0 for "no fast Latin table",
    374         // then just always use the normal string comparison path.
    375         index = IX_FAST_LATIN_TABLE_OFFSET;
    376         offset = inIndexes[index];
    377         length = inIndexes[index + 1] - offset;
    378         if(data != null) {
    379             data.fastLatinTable = null;
    380             data.fastLatinTableHeader = null;
    381             if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin.VERSION) {
    382                 if(length >= 2) {
    383                     char header0 = inBytes.getChar();
    384                     int headerLength = header0 & 0xff;
    385                     data.fastLatinTableHeader = new char[headerLength];
    386                     data.fastLatinTableHeader[0] = header0;
    387                     for(int i = 1; i < headerLength; ++i) {
    388                         data.fastLatinTableHeader[i] = inBytes.getChar();
    389                     }
    390                     int tableLength = length / 2 - headerLength;
    391                     data.fastLatinTable = ICUBinary.getChars(inBytes, tableLength, length & 1);
    392                     length = 0;
    393                     if((header0 >> 8) != CollationFastLatin.VERSION) {
    394                         throw new ICUException("Fast-Latin table version differs from version in data header");
    395                     }
    396                 } else if(baseData != null) {
    397                     data.fastLatinTable = baseData.fastLatinTable;
    398                     data.fastLatinTableHeader = baseData.fastLatinTableHeader;
    399                 }
    400             }
    401         }
    402         ICUBinary.skipBytes(inBytes, length);
    403 
    404         index = IX_SCRIPTS_OFFSET;
    405         offset = inIndexes[index];
    406         length = inIndexes[index + 1] - offset;
    407         if(length >= 2) {
    408             if(data == null) {
    409                 throw new ICUException("Script order data but no mappings");
    410             }
    411             int scriptsLength = length / 2;
    412             CharBuffer inChars = inBytes.asCharBuffer();
    413             data.numScripts = inChars.get();
    414             // There must be enough entries for both arrays, including more than two range starts.
    415             int scriptStartsLength = scriptsLength - (1 + data.numScripts + 16);
    416             if(scriptStartsLength <= 2) {
    417                 throw new ICUException("Script order data too short");
    418             }
    419             inChars.get(data.scriptsIndex = new char[data.numScripts + 16]);
    420             inChars.get(data.scriptStarts = new char[scriptStartsLength]);
    421             if(!(data.scriptStarts[0] == 0 &&
    422                     data.scriptStarts[1] == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8) &&
    423                     data.scriptStarts[scriptStartsLength - 1] ==
    424                             (Collation.TRAIL_WEIGHT_BYTE << 8))) {
    425                 throw new ICUException("Script order data not valid");
    426             }
    427         } else if(data == null) {
    428             // Nothing to do.
    429         } else if(baseData != null) {
    430             data.numScripts = baseData.numScripts;
    431             data.scriptsIndex = baseData.scriptsIndex;
    432             data.scriptStarts = baseData.scriptStarts;
    433         }
    434         ICUBinary.skipBytes(inBytes, length);
    435 
    436         index = IX_COMPRESSIBLE_BYTES_OFFSET;
    437         offset = inIndexes[index];
    438         length = inIndexes[index + 1] - offset;
    439         if(length >= 256) {
    440             if(data == null) {
    441                 throw new ICUException("Data for compressible primary lead bytes but no mappings");
    442             }
    443             data.compressibleBytes = new boolean[256];
    444             for(int i = 0; i < 256; ++i) {
    445                 data.compressibleBytes[i] = inBytes.get() != 0;
    446             }
    447             length -= 256;
    448         } else if(data == null) {
    449             // Nothing to do.
    450         } else if(baseData != null) {
    451             data.compressibleBytes = baseData.compressibleBytes;
    452         } else {
    453             throw new ICUException("Missing data for compressible primary lead bytes");
    454         }
    455         ICUBinary.skipBytes(inBytes, length);
    456 
    457         index = IX_RESERVED18_OFFSET;
    458         offset = inIndexes[index];
    459         length = inIndexes[index + 1] - offset;
    460         ICUBinary.skipBytes(inBytes, length);
    461 
    462         CollationSettings ts = tailoring.settings.readOnly();
    463         int options = inIndexes[IX_OPTIONS] & 0xffff;
    464         char[] fastLatinPrimaries = new char[CollationFastLatin.LATIN_LIMIT];
    465         int fastLatinOptions = CollationFastLatin.getOptions(
    466                 tailoring.data, ts, fastLatinPrimaries);
    467         if(options == ts.options && ts.variableTop != 0 &&
    468                 Arrays.equals(reorderCodes, ts.reorderCodes) &&
    469                 fastLatinOptions == ts.fastLatinOptions &&
    470                 (fastLatinOptions < 0 ||
    471                         Arrays.equals(fastLatinPrimaries, ts.fastLatinPrimaries))) {
    472             return;
    473         }
    474 
    475         CollationSettings settings = tailoring.settings.copyOnWrite();
    476         settings.options = options;
    477         // Set variableTop from options and scripts data.
    478         settings.variableTop = tailoring.data.getLastPrimaryForGroup(
    479                 Collator.ReorderCodes.FIRST + settings.getMaxVariable());
    480         if(settings.variableTop == 0) {
    481             throw new ICUException("The maxVariable could not be mapped to a variableTop");
    482         }
    483 
    484         if(reorderCodesLength != 0) {
    485             settings.aliasReordering(baseData, reorderCodes, reorderCodesLength, reorderTable);
    486         }
    487 
    488         settings.fastLatinOptions = CollationFastLatin.getOptions(
    489             tailoring.data, settings,
    490             settings.fastLatinPrimaries);
    491     }
    492 
    493     private static final class IsAcceptable implements ICUBinary.Authenticate {
    494         @Override
    495         public boolean isDataVersionAcceptable(byte version[]) {
    496             return version[0] == 5;
    497         }
    498     }
    499     private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
    500     private static final int DATA_FORMAT = 0x55436f6c;  // "UCol"
    501 
    502     private CollationDataReader() {}  // no constructor
    503 }
    504 
    505 /*
    506  * Format of collation data (ucadata.icu, binary data in coll/ *.res files):
    507  * See ICU4C source/common/collationdatareader.h.
    508  */
    509