1 /* 2 ******************************************************************************* 3 * Copyright (C) 2006-2015, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 8 package com.ibm.icu.charset; 9 10 import java.io.IOException; 11 import java.nio.ByteBuffer; 12 13 import com.ibm.icu.impl.ICUBinary; 14 15 16 /* Format of cnvalias.icu ----------------------------------------------------- 17 * 18 * cnvalias.icu is a binary, memory-mappable form of convrtrs.txt. 19 * This binary form contains several tables. All indexes are to uint16_t 20 * units, and not to the bytes (uint8_t units). Addressing everything on 21 * 16-bit boundaries allows us to store more information with small index 22 * numbers, which are also 16-bit in size. The majority of the table (except 23 * the string table) are 16-bit numbers. 24 * 25 * First there is the size of the Table of Contents (TOC). The TOC 26 * entries contain the size of each section. In order to find the offset 27 * you just need to sum up the previous offsets. 28 * The TOC length and entries are an array of uint32_t values. 29 * The first section after the TOC starts immediately after the TOC. 30 * 31 * 1) This section contains a list of converters. This list contains indexes 32 * into the string table for the converter name. The index of this list is 33 * also used by other sections, which are mentioned later on. 34 * This list is not sorted. 35 * 36 * 2) This section contains a list of tags. This list contains indexes 37 * into the string table for the tag name. The index of this list is 38 * also used by other sections, which are mentioned later on. 39 * This list is in priority order of standards. 40 * 41 * 3) This section contains a list of sorted unique aliases. This 42 * list contains indexes into the string table for the alias name. 
The
 * index of this list is also used by other sections, like the 4th section.
 * The index for the 3rd and 4th section is used to get the
 * alias -> converter name mapping. Section 3 and 4 form a two column table.
 *
 * 4) This section contains a list of mapped converter names. Consider this
 * as a table that maps the 3rd section to the 1st section. This list contains
 * indexes into the 1st section. The index of this list is the same index in
 * the 3rd section. There is also some extra information in the high bits of
 * each converter index in this table. Currently it's only used to say that
 * an alias mapped to this converter is ambiguous. See UCNV_CONVERTER_INDEX_MASK
 * and UCNV_AMBIGUOUS_ALIAS_MAP_BIT for more information. This section is
 * the predigested form of the 5th section so that an alias lookup can be fast.
 *
 * 5) This section contains a 2D array with indexes to the 6th section. This
 * section is the full form of all alias mappings. The column index is the
 * index into the converter list (column header). The row index is the index
 * to the tag list (row header). This 2D array is the top part of a 3D array.
 * The third dimension is in the 6th section.
 *
 * 6) This is a blob of variable length arrays. Each array starts with a size,
 * and is followed by indexes to alias names in the string table. This is
 * the third dimension of section 5. No other section should be referencing
 * this section.
 *
 * 7) Reserved at this time (There is no information). This _usually_ has a
 * size of 0. Future versions may add more information here.
 *
 * 8) This is the string table. All strings are indexed on an even address.
 * There are two reasons for this. First, many chip architectures locate strings
 * faster on even address boundaries. Second, since all indexes are 16-bit
 * numbers, this string table can be 128KB in size instead of 64KB when we
 * only have strings starting on an even address.
 *
 *
 * Here is the concept of sections 5 and 6. It's a 3D cube. Each tag
 * has a unique alias among all converters. That same alias can
 * be mentioned in other standards on different converters,
 * but only one alias per tag can be unique.
 *
 *
 *               Converter Names (Usually in TR22 form)
 *          -------------------------------------------.
 *    T    /                                          /|
 *    a   /                                          / |
 *    g  /                                          /  |
 *    s /                                          /   |
 *     /                                          /    |
 *     ------------------------------------------/     |
 *   A |                                         |     |
 *   l |                                         |     |
 *   i |                                         |    /
 *   a |                                         |   /
 *   s |                                         |  /
 *   e |                                         | /
 *   s |                                         |/
 *     -------------------------------------------
 *
 *
 *
 * Here is what it really looks like. It's like Swiss cheese.
 * There are holes. Some converters aren't recognized by
 * a standard, or they are really old converters that the
 * standard doesn't recognize anymore.
 *
 *               Converter Names (Usually in TR22 form)
 *          -------------------------------------------.
*     T    /##########################################/|
*     a   /     #            #                       /#
*     g  /  #      ##     ##     ### # ### ### ### #/
*     s /         #             #####  #### ## ## #/#
*      / ### # #  ##  #  #   #          ### # #   #/##
*      ------------------------------------------/# #
*    A |### # #  ##  #  #   #          ### # #   #|# #
*    l |# # #    #     #               ## #     #|# #
*    i |# # #    #     #                #       #|#
*    a |#                                       #|#
*    s |                                        #|#
*    e
*    s
*
*/

/**
 * Reads the header and table of contents of a converter alias data buffer.
 *
 * <p>The instance registers itself as the {@link ICUBinary.Authenticate}
 * callback when the header is read, so the header check and the version
 * check both live in this class.</p>
 */
final class UConverterAliasDataReader implements ICUBinary.Authenticate {

    // constants ---------------------------------------------------------

    /**
     * Data format identifier passed to {@code ICUBinary.readHeader}:
     * the four bytes "CvAl". Per the original note, the value is taken
     * from icu4c isAcceptable (ucnv_io.c).
     */
    private static final int DATA_FORMAT_ID = 0x4376416c;

    /**
     * Leading version bytes that {@link #isDataVersionAcceptable(byte[])}
     * requires. The original note warns that no guarantees are made if an
     * older version is used.
     * NOTE(review): the original comment pointed at "store.c of gennorm" for
     * details, which looks copied from a different reader - confirm.
     */
    private static final byte[] DATA_FORMAT_VERSION = {3, 0, 1};

    // state -------------------------------------------------------------

    /** ICU data file buffer; assigned once in the constructor. */
    private final ByteBuffer byteBuffer;

    // construction ------------------------------------------------------

    /**
     * <p>Protected constructor. Stores the buffer and reads its header.</p>
     *
     * @param bytes ICU data file buffer
     *        (NOTE(review): the original Javadoc said "uprop.dat"; given
     *        the "CvAl" format id this is presumably cnvalias.icu - confirm)
     * @exception IOException thrown if the data file fails authentication
     */
    protected UConverterAliasDataReader(ByteBuffer bytes) throws IOException {
        byteBuffer = bytes;
        // The value returned by readHeader is intentionally not kept.
        ICUBinary.readHeader(bytes, DATA_FORMAT_ID, this);
    }

    // protected methods -------------------------------------------------

    /**
     * Reads the table of contents from the buffer.
     *
     * @param n count forwarded to {@code ICUBinary.getInts}
     * @return the int array produced by {@code ICUBinary.getInts}
     * @exception IOException declared for callers; see ICUBinary for when
     *            it is raised
     */
    protected int[] readToc(int n) throws IOException {
        final int[] toc = ICUBinary.getInts(byteBuffer, n, 0);
        return toc;
    }

    /**
     * Checks the leading bytes of a data version against
     * {@code DATA_FORMAT_VERSION}.
     *
     * @param version version bytes to check
     * @return true if {@code version} is at least as long as
     *         {@code DATA_FORMAT_VERSION} and matches it byte for byte over
     *         that length; false otherwise
     */
    public boolean isDataVersionAcceptable(byte[] version) {
        final int required = DATA_FORMAT_VERSION.length;
        if (version.length < required) {
            return false;
        }
        for (int i = 0; i < required; i++) {
            if (version[i] != DATA_FORMAT_VERSION[i]) {
                return false;
            }
        }
        return true;
    }
}