Home | History | Annotate | Download | only in charset
      1 /*
      2  *******************************************************************************
      3  * Copyright (C) 2006-2015, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  *******************************************************************************
      6  */
      7 
      8 package com.ibm.icu.charset;
      9 
     10 import java.io.IOException;
     11 import java.nio.ByteBuffer;
     12 
     13 import com.ibm.icu.impl.ICUBinary;
     14 
     15 
     16 /* Format of cnvalias.icu -----------------------------------------------------
     17  *
     18  * cnvalias.icu is a binary, memory-mappable form of convrtrs.txt.
     19  * This binary form contains several tables. All indexes are to uint16_t
     20  * units, and not to the bytes (uint8_t units). Addressing everything on
     21  * 16-bit boundaries allows us to store more information with small index
     22  * numbers, which are also 16-bit in size. The majority of the table (except
     23  * the string table) are 16-bit numbers.
     24  *
     25  * First there is the size of the Table of Contents (TOC). The TOC
     26  * entries contain the size of each section. In order to find the offset
     27  * of a section you just need to sum up the sizes of the previous sections.
     28  * The TOC length and entries are an array of uint32_t values.
     29  * The first section after the TOC starts immediately after the TOC.
     30  *
     31  * 1) This section contains a list of converters. This list contains indexes
     32  * into the string table for the converter name. The index of this list is
     33  * also used by other sections, which are mentioned later on.
     34  * This list is not sorted.
     35  *
     36  * 2) This section contains a list of tags. This list contains indexes
     37  * into the string table for the tag name. The index of this list is
     38  * also used by other sections, which are mentioned later on.
     39  * This list is in priority order of standards.
     40  *
     41  * 3) This section contains a list of sorted unique aliases. This
     42  * list contains indexes into the string table for the alias name. The
     43  * index of this list is also used by other sections, like the 4th section.
     44  * The index for the 3rd and 4th section is used to get the
     45  * alias -> converter name mapping. Section 3 and 4 form a two column table.
     46  *
     47  * 4) This section contains a list of mapped converter names. Consider this
     48  * as a table that maps the 3rd section to the 1st section. This list contains
     49  * indexes into the 1st section. The index of this list is the same index in
     50  * the 3rd section. There is also some extra information in the high bits of
     51  * each converter index in this table. Currently it's only used to say that
     52  * an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK
     53  * and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is
     54  * the predigested form of the 5th section so that an alias lookup can be fast.
     55  *
     56  * 5) This section contains a 2D array with indexes to the 6th section. This
     57  * section is the full form of all alias mappings. The column index is the
     58  * index into the converter list (column header). The row index is the index
     59  * to the tag list (row header). This 2D array is the top part of a 3D array. The
     60  * third dimension is in the 6th section.
     61  *
     62  * 6) This is a blob of variable-length arrays. Each array starts with a size,
     63  * and is followed by indexes to alias names in the string table. This is
     64  * the third dimension to the section 5. No other section should be referencing
     65  * this section.
     66  *
     67  * 7) Reserved at this time (There is no information). This _usually_ has a
     68  * size of 0. Future versions may add more information here.
     69  *
     70  * 8) This is the string table. All strings are indexed on an even address.
     71  * There are two reasons for this. First many chip architectures locate strings
     72  * faster on even address boundaries. Second, since all indexes are 16-bit
     73  * numbers, this string table can be 128KB in size instead of 64KB when we
     74  * only have strings starting on an even address.
     75  *
     76  *
     77  * Here is the concept of section 5 and 6. It's a 3D cube. Each tag
     78  * has a unique alias among all converters. That same alias can
     79  * be mentioned in other standards on different converters,
     80  * but only one alias per tag can be unique.
     81  *
     82  *
     83  *              Converter Names (Usually in TR22 form)
     84  *           -------------------------------------------.
     85  *     T    /                                          /|
     86  *     a   /                                          / |
     87  *     g  /                                          /  |
     88  *     s /                                          /   |
     89  *      /                                          /    |
     90  *      ------------------------------------------/     |
     91  *    A |                                         |     |
     92  *    l |                                         |     |
     93  *    i |                                         |    /
     94  *    a |                                         |   /
     95  *    s |                                         |  /
     96  *    e |                                         | /
     97  *    s |                                         |/
     98  *      -------------------------------------------
     99  *
    100  *
    101  *
    102  * Here is what it really looks like. It's like swiss cheese.
    103  * There are holes. Some converters aren't recognized by
    104  * a standard, or they are really old converters that the
    105  * standard doesn't recognize anymore.
    106  *
    107  *              Converter Names (Usually in TR22 form)
    108  *           -------------------------------------------.
    109  *     T    /##########################################/|
    110  *     a   /     #            #                       /#
    111  *     g  /  #      ##     ##     ### # ### ### ### #/
    112  *     s / #             #####  ####        ##  ## #/#
    113  *      / ### # # ##  #  #   #          ### # #   #/##
    114  *      ------------------------------------------/# #
    115  *    A |### # # ##  #  #   #          ### # #   #|# #
    116  *    l |# # #    #     #               ## #     #|# #
    117  *    i |# # #    #     #                #       #|#
    118  *    a |#                                       #|#
    119  *    s |                                        #|#
    120  *    e
    121  *    s
    122  *
    123  */
    124 
    125 final class UConverterAliasDataReader implements ICUBinary.Authenticate {
    126 //    private final static boolean debug = ICUDebug.enabled("UConverterAliasDataReader");
    127 
    128    /**
    129     * <p>Protected constructor.</p>
    130     * @param bytes ICU uprop.dat file buffer
    131     * @exception IOException throw if data file fails authentication
    132     */
    133     protected UConverterAliasDataReader(ByteBuffer bytes)
    134                                         throws IOException{
    135         //if(debug) System.out.println("Bytes in buffer " + bytes.remaining());
    136 
    137         byteBuffer = bytes;
    138         /*unicodeVersion = */ICUBinary.readHeader(byteBuffer, DATA_FORMAT_ID, this);
    139 
    140         //if(debug) System.out.println("Bytes left in byteBuffer " + byteBuffer.remaining());
    141     }
    142 
    143     // protected methods -------------------------------------------------
    144 
    145     protected int[] readToc(int n)throws IOException
    146     {
    147         //Read the toc
    148         return ICUBinary.getInts(byteBuffer, n, 0);
    149     }
    150 
    151     public boolean isDataVersionAcceptable(byte version[])
    152     {
    153         return version.length >= DATA_FORMAT_VERSION.length
    154             && version[0] == DATA_FORMAT_VERSION[0]
    155             && version[1] == DATA_FORMAT_VERSION[1]
    156             && version[2] == DATA_FORMAT_VERSION[2];
    157     }
    158 
    159     /*byte[] getUnicodeVersion(){
    160         return ICUBinary.getVersionByteArrayFromCompactInt(unicodeVersion);
    161     }*/
    162     // private data members -------------------------------------------------
    163 
    164 
    165     /**
    166     * ICU data file buffer
    167     */
    168     private ByteBuffer byteBuffer;
    169 
    170 //    private int unicodeVersion;
    171 
    172     /**
    173     * File format version that this class understands.
    174     * No guarantees are made if a older version is used
    175     * see store.c of gennorm for more information and values
    176     */
    177         // DATA_FORMAT_ID_ values taken from icu4c isAcceptable (ucnv_io.c)
    178     private static final int DATA_FORMAT_ID = 0x4376416c; // dataFormat="CvAl"
    179     private static final byte DATA_FORMAT_VERSION[] = {3, 0, 1};
    180 }
    181