Home | History | Annotate | Download | only in charset
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 2006-2015, International Business Machines Corporation and
      6  * others. All Rights Reserved.
      7  *******************************************************************************
      8  */
      9 
     10 package com.ibm.icu.charset;
     11 
     12 import java.io.IOException;
     13 import java.nio.ByteBuffer;
     14 
     15 import com.ibm.icu.impl.ICUBinary;
     16 
     17 
     18 /* Format of cnvalias.icu -----------------------------------------------------
     19  *
     20  * cnvalias.icu is a binary, memory-mappable form of convrtrs.txt.
     21  * This binary form contains several tables. All indexes are to uint16_t
     22  * units, and not to the bytes (uint8_t units). Addressing everything on
     23  * 16-bit boundaries allows us to store more information with small index
     24  * numbers, which are also 16-bit in size. Most of the tables (except
     25  * the string table) consist of 16-bit numbers.
     26  *
     27  * First there is the size of the Table of Contents (TOC). The TOC
     28  * entries contain the size of each section. In order to find the offset
     29  * you just need to sum up the previous offsets.
     30  * The TOC length and entries are an array of uint32_t values.
     31  * The first section after the TOC starts immediately after the TOC.
     32  *
     33  * 1) This section contains a list of converters. This list contains indexes
     34  * into the string table for the converter name. The index of this list is
     35  * also used by other sections, which are mentioned later on.
     36  * This list is not sorted.
     37  *
     38  * 2) This section contains a list of tags. This list contains indexes
     39  * into the string table for the tag name. The index of this list is
     40  * also used by other sections, which are mentioned later on.
     41  * This list is in priority order of standards.
     42  *
     43  * 3) This section contains a list of sorted unique aliases. This
     44  * list contains indexes into the string table for the alias name. The
     45  * index of this list is also used by other sections, like the 4th section.
     46  * The index for the 3rd and 4th section is used to get the
     47  * alias -> converter name mapping. Section 3 and 4 form a two column table.
     48  *
     49  * 4) This section contains a list of mapped converter names. Consider this
     50  * as a table that maps the 3rd section to the 1st section. This list contains
     51  * indexes into the 1st section. The index of this list is the same index in
     52  * the 3rd section. There is also some extra information in the high bits of
     53  * each converter index in this table. Currently it's only used to say that
     54  * an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK
     55  * and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is
     56  * the predigested form of the 5th section so that an alias lookup can be fast.
     57  *
     58  * 5) This section contains a 2D array with indexes to the 6th section. This
     59  * section is the full form of all alias mappings. The column index is the
     60  * index into the converter list (column header). The row index is the index
     61  * to tag list (row header). This 2D array is the top part of a 3D array. The
     62  * third dimension is in the 6th section.
     63  *
     64  * 6) This is a blob of variable-length arrays. Each array starts with a size,
     65  * and is followed by indexes to alias names in the string table. This is
     66  * the third dimension to the section 5. No other section should be referencing
     67  * this section.
     68  *
     69  * 7) Reserved at this time (There is no information). This _usually_ has a
     70  * size of 0. Future versions may add more information here.
     71  *
     72  * 8) This is the string table. All strings are indexed on an even address.
     73  * There are two reasons for this. First many chip architectures locate strings
     74  * faster on even address boundaries. Second, since all indexes are 16-bit
     75  * numbers, this string table can be 128KB in size instead of 64KB when we
     76  * only have strings starting on an even address.
     77  *
     78  *
     79  * Here is the concept of section 5 and 6. It's a 3D cube. Each tag
     80  * has a unique alias among all converters. That same alias can
     81  * be mentioned in other standards on different converters,
     82  * but only one alias per tag can be unique.
     83  *
     84  *
     85  *              Converter Names (Usually in TR22 form)
     86  *           -------------------------------------------.
     87  *     T    /                                          /|
     88  *     a   /                                          / |
     89  *     g  /                                          /  |
     90  *     s /                                          /   |
     91  *      /                                          /    |
     92  *      ------------------------------------------/     |
     93  *    A |                                         |     |
     94  *    l |                                         |     |
     95  *    i |                                         |    /
     96  *    a |                                         |   /
     97  *    s |                                         |  /
     98  *    e |                                         | /
     99  *    s |                                         |/
    100  *      -------------------------------------------
    101  *
    102  *
    103  *
    104  * Here is what it really looks like. It's like swiss cheese.
    105  * There are holes. Some converters aren't recognized by
    106  * a standard, or they are really old converters that the
    107  * standard doesn't recognize anymore.
    108  *
    109  *              Converter Names (Usually in TR22 form)
    110  *           -------------------------------------------.
    111  *     T    /##########################################/|
    112  *     a   /     #            #                       /#
    113  *     g  /  #      ##     ##     ### # ### ### ### #/
    114  *     s / #             #####  ####        ##  ## #/#
    115  *      / ### # # ##  #  #   #          ### # #   #/##
    116  *      ------------------------------------------/# #
    117  *    A |### # # ##  #  #   #          ### # #   #|# #
    118  *    l |# # #    #     #               ## #     #|# #
    119  *    i |# # #    #     #                #       #|#
    120  *    a |#                                       #|#
    121  *    s |                                        #|#
    122  *    e
    123  *    s
    124  *
    125  */
    126 
    127 final class UConverterAliasDataReader implements ICUBinary.Authenticate {
    128 //    private final static boolean debug = ICUDebug.enabled("UConverterAliasDataReader");
    129 
    130    /**
    131     * <p>Protected constructor.</p>
    132     * @param bytes ICU uprop.dat file buffer
    133     * @exception IOException throw if data file fails authentication
    134     */
    135     protected UConverterAliasDataReader(ByteBuffer bytes)
    136                                         throws IOException{
    137         //if(debug) System.out.println("Bytes in buffer " + bytes.remaining());
    138 
    139         byteBuffer = bytes;
    140         /*unicodeVersion = */ICUBinary.readHeader(byteBuffer, DATA_FORMAT_ID, this);
    141 
    142         //if(debug) System.out.println("Bytes left in byteBuffer " + byteBuffer.remaining());
    143     }
    144 
    145     // protected methods -------------------------------------------------
    146 
    147     protected int[] readToc(int n)throws IOException
    148     {
    149         //Read the toc
    150         return ICUBinary.getInts(byteBuffer, n, 0);
    151     }
    152 
    153     @Override
    154     public boolean isDataVersionAcceptable(byte version[])
    155     {
    156         return version.length >= DATA_FORMAT_VERSION.length
    157             && version[0] == DATA_FORMAT_VERSION[0]
    158             && version[1] == DATA_FORMAT_VERSION[1]
    159             && version[2] == DATA_FORMAT_VERSION[2];
    160     }
    161 
    162     /*byte[] getUnicodeVersion(){
    163         return ICUBinary.getVersionByteArrayFromCompactInt(unicodeVersion);
    164     }*/
    165     // private data members -------------------------------------------------
    166 
    167 
    168     /**
    169     * ICU data file buffer
    170     */
    171     private ByteBuffer byteBuffer;
    172 
    173 //    private int unicodeVersion;
    174 
    175     /**
    176     * File format version that this class understands.
    177     * No guarantees are made if a older version is used
    178     * see store.c of gennorm for more information and values
    179     */
    180         // DATA_FORMAT_ID_ values taken from icu4c isAcceptable (ucnv_io.c)
    181     private static final int DATA_FORMAT_ID = 0x4376416c; // dataFormat="CvAl"
    182     private static final byte DATA_FORMAT_VERSION[] = {3, 0, 1};
    183 }
    184