Home | History | Annotate | Download | only in coll
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
// © 2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5 *******************************************************************************
      6 * Copyright (C) 2013-2015, International Business Machines
      7 * Corporation and others.  All Rights Reserved.
      8 *******************************************************************************
      9 * CollationDataReader.java, ported from collationdatareader.h/.cpp
     10 *
     11 * C++ version created on: 2013feb07
     12 * created by: Markus W. Scherer
     13 */
     14 
     15 package android.icu.impl.coll;
     16 
     17 import java.io.IOException;
     18 import java.nio.ByteBuffer;
     19 import java.nio.CharBuffer;
     20 import java.util.Arrays;
     21 
     22 import android.icu.impl.ICUBinary;
     23 import android.icu.impl.Trie2_32;
     24 import android.icu.impl.USerializedSet;
     25 import android.icu.text.Collator;
     26 import android.icu.text.UnicodeSet;
     27 import android.icu.util.ICUException;
     28 
     29 /**
     30  * Collation binary data reader.
     31  */
     32 final class CollationDataReader /* all static */ {
     33     // The following constants are also copied into source/common/ucol_swp.cpp.
     34     // Keep them in sync!
     35     /**
     36      * Number of int indexes.
     37      *
     38      * Can be 2 if there are only options.
     39      * Can be 7 or 8 if there are only options and a script reordering.
     40      * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0.
     41      */
     42     static final int IX_INDEXES_LENGTH = 0;
     43     /**
     44      * Bits 31..24: numericPrimary, for numeric collation
     45      *      23..16: fast Latin format version (0 = no fast Latin table)
     46      *      15.. 0: options bit set
     47      */
     48     static final int IX_OPTIONS = 1;
     49     static final int IX_RESERVED2 = 2;
     50     static final int IX_RESERVED3 = 3;
     51 
     52     /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */
     53     static final int IX_JAMO_CE32S_START = 4;
     54 
     55     // Byte offsets from the start of the data, after the generic header.
     56     // The indexes[] are at byte offset 0, other data follows.
     57     // Each data item is aligned properly.
     58     // The data items should be in descending order of unit size,
     59     // to minimize the need for padding.
     60     // Each item's byte length is given by the difference between its offset and
     61     // the next index/offset value.
     62     /** Byte offset to int reorderCodes[]. */
     63     static final int IX_REORDER_CODES_OFFSET = 5;
     64     /**
     65      * Byte offset to uint8_t reorderTable[].
     66      * Empty table if <256 bytes (padding only).
     67      * Otherwise 256 bytes or more (with padding).
     68      */
     69     static final int IX_REORDER_TABLE_OFFSET = 6;
     70     /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */
     71     static final int IX_TRIE_OFFSET = 7;
     72 
     73     static final int IX_RESERVED8_OFFSET = 8;
     74     /** Byte offset to long ces[]. */
     75     static final int IX_CES_OFFSET = 9;
     76     static final int IX_RESERVED10_OFFSET = 10;
     77     /** Byte offset to int ce32s[]. */
     78     static final int IX_CE32S_OFFSET = 11;
     79 
     80     /** Byte offset to uint32_t rootElements[]. */
     81     static final int IX_ROOT_ELEMENTS_OFFSET = 12;
     82     /** Byte offset to UChar *contexts[]. */
     83     static final int IX_CONTEXTS_OFFSET = 13;
     84     /** Byte offset to char [] with serialized unsafeBackwardSet. */
     85     static final int IX_UNSAFE_BWD_OFFSET = 14;
     86     /** Byte offset to char fastLatinTable[]. */
     87     static final int IX_FAST_LATIN_TABLE_OFFSET = 15;
     88 
     89     /** Byte offset to char scripts[]. */
     90     static final int IX_SCRIPTS_OFFSET = 16;
     91     /**
     92      * Byte offset to boolean compressibleBytes[].
     93      * Empty table if <256 bytes (padding only).
     94      * Otherwise 256 bytes or more (with padding).
     95      */
     96     static final int IX_COMPRESSIBLE_BYTES_OFFSET = 17;
     97     static final int IX_RESERVED18_OFFSET = 18;
     98     static final int IX_TOTAL_SIZE = 19;
     99 
    100     static void read(CollationTailoring base, ByteBuffer inBytes,
    101                      CollationTailoring tailoring) throws IOException {
    102         tailoring.version = ICUBinary.readHeader(inBytes, DATA_FORMAT, IS_ACCEPTABLE);
    103         if(base != null && base.getUCAVersion() != tailoring.getUCAVersion()) {
    104             throw new ICUException("Tailoring UCA version differs from base data UCA version");
    105         }
    106 
    107         int inLength = inBytes.remaining();
    108         if(inLength < 8) {
    109             throw new ICUException("not enough bytes");
    110         }
    111         int indexesLength = inBytes.getInt();  // inIndexes[IX_INDEXES_LENGTH]
    112         if(indexesLength < 2 || inLength < indexesLength * 4) {
    113             throw new ICUException("not enough indexes");
    114         }
    115         int[] inIndexes = new int[IX_TOTAL_SIZE + 1];
    116         inIndexes[0] = indexesLength;
    117         for(int i = 1; i < indexesLength && i < inIndexes.length; ++i) {
    118             inIndexes[i] = inBytes.getInt();
    119         }
    120         for(int i = indexesLength; i < inIndexes.length; ++i) {
    121             inIndexes[i] = -1;
    122         }
    123         if(indexesLength > inIndexes.length) {
    124             ICUBinary.skipBytes(inBytes, (indexesLength - inIndexes.length) * 4);
    125         }
    126 
    127         // Assume that the tailoring data is in initial state,
    128         // with null pointers and 0 lengths.
    129 
    130         // Set pointers to non-empty data parts.
    131         // Do this in order of their byte offsets. (Should help porting to Java.)
    132 
    133         int index;  // one of the indexes[] slots
    134         int offset;  // byte offset for the index part
    135         int length;  // number of bytes in the index part
    136 
    137         if(indexesLength > IX_TOTAL_SIZE) {
    138             length = inIndexes[IX_TOTAL_SIZE];
    139         } else if(indexesLength > IX_REORDER_CODES_OFFSET) {
    140             length = inIndexes[indexesLength - 1];
    141         } else {
    142             length = 0;  // only indexes, and inLength was already checked for them
    143         }
    144         if(inLength < length) {
    145             throw new ICUException("not enough bytes");
    146         }
    147 
    148         CollationData baseData = base == null ? null : base.data;
    149         int[] reorderCodes;
    150         int reorderCodesLength;
    151         index = IX_REORDER_CODES_OFFSET;
    152         offset = inIndexes[index];
    153         length = inIndexes[index + 1] - offset;
    154         if(length >= 4) {
    155             if(baseData == null) {
    156                 // We assume for collation settings that
    157                 // the base data does not have a reordering.
    158                 throw new ICUException("Collation base data must not reorder scripts");
    159             }
    160             reorderCodesLength = length / 4;
    161             reorderCodes = ICUBinary.getInts(inBytes, reorderCodesLength, length & 3);
    162 
    163             // The reorderRanges (if any) are the trailing reorderCodes entries.
    164             // Split the array at the boundary.
    165             // Script or reorder codes do not exceed 16-bit values.
    166             // Range limits are stored in the upper 16 bits, and are never 0.
    167             int reorderRangesLength = 0;
    168             while(reorderRangesLength < reorderCodesLength &&
    169                     (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
    170                 ++reorderRangesLength;
    171             }
    172             assert(reorderRangesLength < reorderCodesLength);
    173             reorderCodesLength -= reorderRangesLength;
    174         } else {
    175             reorderCodes = new int[0];
    176             reorderCodesLength = 0;
    177             ICUBinary.skipBytes(inBytes, length);
    178         }
    179 
    180         // There should be a reorder table only if there are reorder codes.
    181         // However, when there are reorder codes the reorder table may be omitted to reduce
    182         // the data size.
    183         byte[] reorderTable = null;
    184         index = IX_REORDER_TABLE_OFFSET;
    185         offset = inIndexes[index];
    186         length = inIndexes[index + 1] - offset;
    187         if(length >= 256) {
    188             if(reorderCodesLength == 0) {
    189                 throw new ICUException("Reordering table without reordering codes");
    190             }
    191             reorderTable = new byte[256];
    192             inBytes.get(reorderTable);
    193             length -= 256;
    194         } else {
    195             // If we have reorder codes, then build the reorderTable at the end,
    196             // when the CollationData is otherwise complete.
    197         }
    198         ICUBinary.skipBytes(inBytes, length);
    199 
    200         if(baseData != null && baseData.numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000L)) {
    201             throw new ICUException("Tailoring numeric primary weight differs from base data");
    202         }
    203         CollationData data = null;  // Remains null if there are no mappings.
    204 
    205         index = IX_TRIE_OFFSET;
    206         offset = inIndexes[index];
    207         length = inIndexes[index + 1] - offset;
    208         if(length >= 8) {
    209             tailoring.ensureOwnedData();
    210             data = tailoring.ownedData;
    211             data.base = baseData;
    212             data.numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000L;
    213             data.trie = tailoring.trie = Trie2_32.createFromSerialized(inBytes);
    214             int trieLength = data.trie.getSerializedLength();
    215             if(trieLength > length) {
    216                 throw new ICUException("Not enough bytes for the mappings trie");  // No mappings.
    217             }
    218             length -= trieLength;
    219         } else if(baseData != null) {
    220             // Use the base data. Only the settings are tailored.
    221             tailoring.data = baseData;
    222         } else {
    223             throw new ICUException("Missing collation data mappings");  // No mappings.
    224         }
    225         ICUBinary.skipBytes(inBytes, length);
    226 
    227         index = IX_RESERVED8_OFFSET;
    228         offset = inIndexes[index];
    229         length = inIndexes[index + 1] - offset;
    230         ICUBinary.skipBytes(inBytes, length);
    231 
    232         index = IX_CES_OFFSET;
    233         offset = inIndexes[index];
    234         length = inIndexes[index + 1] - offset;
    235         if(length >= 8) {
    236             if(data == null) {
    237                 throw new ICUException("Tailored ces without tailored trie");
    238             }
    239             data.ces = ICUBinary.getLongs(inBytes, length / 8, length & 7);
    240         } else {
    241             ICUBinary.skipBytes(inBytes, length);
    242         }
    243 
    244         index = IX_RESERVED10_OFFSET;
    245         offset = inIndexes[index];
    246         length = inIndexes[index + 1] - offset;
    247         ICUBinary.skipBytes(inBytes, length);
    248 
    249         index = IX_CE32S_OFFSET;
    250         offset = inIndexes[index];
    251         length = inIndexes[index + 1] - offset;
    252         if(length >= 4) {
    253             if(data == null) {
    254                 throw new ICUException("Tailored ce32s without tailored trie");
    255             }
    256             data.ce32s = ICUBinary.getInts(inBytes, length / 4, length & 3);
    257         } else {
    258             ICUBinary.skipBytes(inBytes, length);
    259         }
    260 
    261         int jamoCE32sStart = inIndexes[IX_JAMO_CE32S_START];
    262         if(jamoCE32sStart >= 0) {
    263             if(data == null || data.ce32s == null) {
    264                 throw new ICUException("JamoCE32sStart index into non-existent ce32s[]");
    265             }
    266             data.jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
    267             System.arraycopy(data.ce32s, jamoCE32sStart, data.jamoCE32s, 0, CollationData.JAMO_CE32S_LENGTH);
    268         } else if(data == null) {
    269             // Nothing to do.
    270         } else if(baseData != null) {
    271             data.jamoCE32s = baseData.jamoCE32s;
    272         } else {
    273             throw new ICUException("Missing Jamo CE32s for Hangul processing");
    274         }
    275 
    276         index = IX_ROOT_ELEMENTS_OFFSET;
    277         offset = inIndexes[index];
    278         length = inIndexes[index + 1] - offset;
    279         if(length >= 4) {
    280             int rootElementsLength = length / 4;
    281             if(data == null) {
    282                 throw new ICUException("Root elements but no mappings");
    283             }
    284             if(rootElementsLength <= CollationRootElements.IX_SEC_TER_BOUNDARIES) {
    285                 throw new ICUException("Root elements array too short");
    286             }
    287             data.rootElements = new long[rootElementsLength];
    288             for(int i = 0; i < rootElementsLength; ++i) {
    289                 data.rootElements[i] = inBytes.getInt() & 0xffffffffL;  // unsigned int -> long
    290             }
    291             long commonSecTer = data.rootElements[CollationRootElements.IX_COMMON_SEC_AND_TER_CE];
    292             if(commonSecTer != Collation.COMMON_SEC_AND_TER_CE) {
    293                 throw new ICUException("Common sec/ter weights in base data differ from the hardcoded value");
    294             }
    295             long secTerBoundaries = data.rootElements[CollationRootElements.IX_SEC_TER_BOUNDARIES];
    296             if((secTerBoundaries >>> 24) < CollationKeys.SEC_COMMON_HIGH) {
    297                 // [fixed last secondary common byte] is too low,
    298                 // and secondary weights would collide with compressed common secondaries.
    299                 throw new ICUException("[fixed last secondary common byte] is too low");
    300             }
    301             length &= 3;
    302         }
    303         ICUBinary.skipBytes(inBytes, length);
    304 
    305         index = IX_CONTEXTS_OFFSET;
    306         offset = inIndexes[index];
    307         length = inIndexes[index + 1] - offset;
    308         if(length >= 2) {
    309             if(data == null) {
    310                 throw new ICUException("Tailored contexts without tailored trie");
    311             }
    312             data.contexts = ICUBinary.getString(inBytes, length / 2, length & 1);
    313         } else {
    314             ICUBinary.skipBytes(inBytes, length);
    315         }
    316 
    317         index = IX_UNSAFE_BWD_OFFSET;
    318         offset = inIndexes[index];
    319         length = inIndexes[index + 1] - offset;
    320         if(length >= 2) {
    321             if(data == null) {
    322                 throw new ICUException("Unsafe-backward-set but no mappings");
    323             }
    324             if(baseData == null) {
    325                 // Create the unsafe-backward set for the root collator.
    326                 // Include all non-zero combining marks and trail surrogates.
    327                 // We do this at load time, rather than at build time,
    328                 // to simplify Unicode version bootstrapping:
    329                 // The root data builder only needs the new FractionalUCA.txt data,
    330                 // but it need not be built with a version of ICU already updated to
    331                 // the corresponding new Unicode Character Database.
    332                 //
    333                 // The following is an optimized version of
    334                 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
    335                 // It is faster and requires fewer code dependencies.
    336                 tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff);  // trail surrogates
    337                 data.nfcImpl.addLcccChars(tailoring.unsafeBackwardSet);
    338             } else {
    339                 // Clone the root collator's set contents.
    340                 tailoring.unsafeBackwardSet = baseData.unsafeBackwardSet.cloneAsThawed();
    341             }
    342             // Add the ranges from the data file to the unsafe-backward set.
    343             USerializedSet sset = new USerializedSet();
    344             char[] unsafeData = ICUBinary.getChars(inBytes, length / 2, length & 1);
    345             length = 0;
    346             sset.getSet(unsafeData, 0);
    347             int count = sset.countRanges();
    348             int[] range = new int[2];
    349             for(int i = 0; i < count; ++i) {
    350                 sset.getRange(i, range);
    351                 tailoring.unsafeBackwardSet.add(range[0], range[1]);
    352             }
    353             // Mark each lead surrogate as "unsafe"
    354             // if any of its 1024 associated supplementary code points is "unsafe".
    355             int c = 0x10000;
    356             for(int lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
    357                 if(!tailoring.unsafeBackwardSet.containsNone(c, c + 0x3ff)) {
    358                     tailoring.unsafeBackwardSet.add(lead);
    359                 }
    360             }
    361             tailoring.unsafeBackwardSet.freeze();
    362             data.unsafeBackwardSet = tailoring.unsafeBackwardSet;
    363         } else if(data == null) {
    364             // Nothing to do.
    365         } else if(baseData != null) {
    366             // No tailoring-specific data: Alias the root collator's set.
    367             data.unsafeBackwardSet = baseData.unsafeBackwardSet;
    368         } else {
    369             throw new ICUException("Missing unsafe-backward-set");
    370         }
    371         ICUBinary.skipBytes(inBytes, length);
    372 
    373         // If the fast Latin format version is different,
    374         // or the version is set to 0 for "no fast Latin table",
    375         // then just always use the normal string comparison path.
    376         index = IX_FAST_LATIN_TABLE_OFFSET;
    377         offset = inIndexes[index];
    378         length = inIndexes[index + 1] - offset;
    379         if(data != null) {
    380             data.fastLatinTable = null;
    381             data.fastLatinTableHeader = null;
    382             if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin.VERSION) {
    383                 if(length >= 2) {
    384                     char header0 = inBytes.getChar();
    385                     int headerLength = header0 & 0xff;
    386                     data.fastLatinTableHeader = new char[headerLength];
    387                     data.fastLatinTableHeader[0] = header0;
    388                     for(int i = 1; i < headerLength; ++i) {
    389                         data.fastLatinTableHeader[i] = inBytes.getChar();
    390                     }
    391                     int tableLength = length / 2 - headerLength;
    392                     data.fastLatinTable = ICUBinary.getChars(inBytes, tableLength, length & 1);
    393                     length = 0;
    394                     if((header0 >> 8) != CollationFastLatin.VERSION) {
    395                         throw new ICUException("Fast-Latin table version differs from version in data header");
    396                     }
    397                 } else if(baseData != null) {
    398                     data.fastLatinTable = baseData.fastLatinTable;
    399                     data.fastLatinTableHeader = baseData.fastLatinTableHeader;
    400                 }
    401             }
    402         }
    403         ICUBinary.skipBytes(inBytes, length);
    404 
    405         index = IX_SCRIPTS_OFFSET;
    406         offset = inIndexes[index];
    407         length = inIndexes[index + 1] - offset;
    408         if(length >= 2) {
    409             if(data == null) {
    410                 throw new ICUException("Script order data but no mappings");
    411             }
    412             int scriptsLength = length / 2;
    413             CharBuffer inChars = inBytes.asCharBuffer();
    414             data.numScripts = inChars.get();
    415             // There must be enough entries for both arrays, including more than two range starts.
    416             int scriptStartsLength = scriptsLength - (1 + data.numScripts + 16);
    417             if(scriptStartsLength <= 2) {
    418                 throw new ICUException("Script order data too short");
    419             }
    420             inChars.get(data.scriptsIndex = new char[data.numScripts + 16]);
    421             inChars.get(data.scriptStarts = new char[scriptStartsLength]);
    422             if(!(data.scriptStarts[0] == 0 &&
    423                     data.scriptStarts[1] == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8) &&
    424                     data.scriptStarts[scriptStartsLength - 1] ==
    425                             (Collation.TRAIL_WEIGHT_BYTE << 8))) {
    426                 throw new ICUException("Script order data not valid");
    427             }
    428         } else if(data == null) {
    429             // Nothing to do.
    430         } else if(baseData != null) {
    431             data.numScripts = baseData.numScripts;
    432             data.scriptsIndex = baseData.scriptsIndex;
    433             data.scriptStarts = baseData.scriptStarts;
    434         }
    435         ICUBinary.skipBytes(inBytes, length);
    436 
    437         index = IX_COMPRESSIBLE_BYTES_OFFSET;
    438         offset = inIndexes[index];
    439         length = inIndexes[index + 1] - offset;
    440         if(length >= 256) {
    441             if(data == null) {
    442                 throw new ICUException("Data for compressible primary lead bytes but no mappings");
    443             }
    444             data.compressibleBytes = new boolean[256];
    445             for(int i = 0; i < 256; ++i) {
    446                 data.compressibleBytes[i] = inBytes.get() != 0;
    447             }
    448             length -= 256;
    449         } else if(data == null) {
    450             // Nothing to do.
    451         } else if(baseData != null) {
    452             data.compressibleBytes = baseData.compressibleBytes;
    453         } else {
    454             throw new ICUException("Missing data for compressible primary lead bytes");
    455         }
    456         ICUBinary.skipBytes(inBytes, length);
    457 
    458         index = IX_RESERVED18_OFFSET;
    459         offset = inIndexes[index];
    460         length = inIndexes[index + 1] - offset;
    461         ICUBinary.skipBytes(inBytes, length);
    462 
    463         CollationSettings ts = tailoring.settings.readOnly();
    464         int options = inIndexes[IX_OPTIONS] & 0xffff;
    465         char[] fastLatinPrimaries = new char[CollationFastLatin.LATIN_LIMIT];
    466         int fastLatinOptions = CollationFastLatin.getOptions(
    467                 tailoring.data, ts, fastLatinPrimaries);
    468         if(options == ts.options && ts.variableTop != 0 &&
    469                 Arrays.equals(reorderCodes, ts.reorderCodes) &&
    470                 fastLatinOptions == ts.fastLatinOptions &&
    471                 (fastLatinOptions < 0 ||
    472                         Arrays.equals(fastLatinPrimaries, ts.fastLatinPrimaries))) {
    473             return;
    474         }
    475 
    476         CollationSettings settings = tailoring.settings.copyOnWrite();
    477         settings.options = options;
    478         // Set variableTop from options and scripts data.
    479         settings.variableTop = tailoring.data.getLastPrimaryForGroup(
    480                 Collator.ReorderCodes.FIRST + settings.getMaxVariable());
    481         if(settings.variableTop == 0) {
    482             throw new ICUException("The maxVariable could not be mapped to a variableTop");
    483         }
    484 
    485         if(reorderCodesLength != 0) {
    486             settings.aliasReordering(baseData, reorderCodes, reorderCodesLength, reorderTable);
    487         }
    488 
    489         settings.fastLatinOptions = CollationFastLatin.getOptions(
    490             tailoring.data, settings,
    491             settings.fastLatinPrimaries);
    492     }
    493 
    494     private static final class IsAcceptable implements ICUBinary.Authenticate {
    495         @Override
    496         public boolean isDataVersionAcceptable(byte version[]) {
    497             return version[0] == 5;
    498         }
    499     }
    500     private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
    501     private static final int DATA_FORMAT = 0x55436f6c;  // "UCol"
    502 
    503     private CollationDataReader() {}  // no constructor
    504 }
    505 
    506 /*
    507  * Format of collation data (ucadata.icu, binary data in coll/ *.res files):
    508  * See ICU4C source/common/collationdatareader.h.
    509  */
    510