1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 1999-2009, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: store.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 1999dec11 14 * created by: Markus W. Scherer 15 * 16 * Store Unicode character properties efficiently for 17 * random access. 18 */ 19 20 #include <stdio.h> 21 #include "unicode/utypes.h" 22 #include "unicode/uchar.h" 23 #include "cmemory.h" 24 #include "cstring.h" 25 #include "utrie.h" 26 #include "unicode/udata.h" 27 #include "unewdata.h" 28 #include "writesrc.h" 29 #include "uprops.h" 30 #include "genprops.h" 31 32 #define DO_DEBUG_OUT 0 33 34 /* Unicode character properties file format ------------------------------------ 35 36 The file format prepared and written here contains several data 37 structures that store indexes or data. 38 39 Before the data contents described below, there are the headers required by 40 the udata API for loading ICU data. Especially, a UDataInfo structure 41 precedes the actual data. It contains platform properties values and the 42 file format version. 43 44 The following is a description of format version 6 . 45 46 Data contents: 47 48 The contents is a parsed, binary form of several Unicode character 49 database files, most prominently UnicodeData.txt. 50 51 Any Unicode code point from 0 to 0x10ffff can be looked up to get 52 the properties, if any, for that code point. This means that the input 53 to the lookup are 21-bit unsigned integers, with not all of the 54 21-bit range used. 55 56 It is assumed that client code keeps a uint32_t pointer 57 to the beginning of the data: 58 59 const uint32_t *p32; 60 61 Formally, the file contains the following structures: 62 63 const int32_t indexes[16] with values i0..i15: 64 65 i0 indicates the length of the main trie. 66 i0..i3 all have the same value in format versions 4.0 and higher; 67 the related props32[] and exceptions[] and uchars[] were used in format version 3 68 69 i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words 70 i1 exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words 71 i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings 72 73 i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties 74 i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors 75 i5 additionalVectorsColumns; -- number of 32-bit words per properties vector 76 77 i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table 78 i7..i9 reservedIndexes; -- reserved values; 0 for now 79 80 i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+) 81 i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2) 82 i12..i15 reservedIndexes; -- reserved values; 0 for now 83 84 PT serialized properties trie, see utrie.h (byte size: 4*(i0-16)) 85 86 P, E, and U are not used (empty) in format versions 4 and above 87 88 P const uint32_t props32[i1-i0]; 89 E const uint32_t exceptions[i2-i1]; 90 U const UChar uchars[2*(i3-i2)]; 91 92 AT serialized trie for additional properties (byte size: 4*(i4-i3)) 93 PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4]; 94 95 Trie lookup and properties: 96 97 In order to condense the data for the 21-bit code space, several properties of 98 the Unicode code assignment are exploited: 99 - The code space is sparse. 100 - There are several 10k of consecutive codes with the same properties. 101 - Characters and scripts are allocated in groups of 16 code points. 102 - Inside blocks for scripts the properties are often repetitive. 103 - The 21-bit space is not fully used for Unicode. 104 105 The lookup of properties for a given code point is done with a trie lookup, 106 using the UTrie implementation. 107 The trie lookup result is a 16-bit properties word. 108 109 With a given Unicode code point 110 111 UChar32 c; 112 113 and 0<=c<0x110000, the lookup is done like this: 114 115 uint16_t props; 116 UTRIE_GET16(trie, c, props); 117 118 Each 16-bit properties word contains: 119 120 0.. 4 general category 121 5 reserved 122 6..15 numeric type and value (ntv) 123 124 Encoding of numeric type and value in the 10-bit ntv field: 125 ntv type value 126 0 U_NT_NONE 0 127 1..10 U_NT_DECIMAL 0..9 128 11..20 U_NT_DIGIT 0..9 129 21..0x2ff U_NT_NUMERIC see below 130 0x300..0x3ff reserved 131 132 For U_NT_NUMERIC: 133 ntv value 134 21..0xaf integer 0..154 135 0xb0..0x1df fraction ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 136 0x1e0..0x2ff large int ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33) 137 (only one significant decimal digit) 138 139 --- Additional properties (new in format version 2.1) --- 140 141 The second trie for additional properties (AT) is also a UTrie with 16-bit data. 142 The data words consist of 32-bit unit indexes (not row indexes!) into the 143 table of unique properties vectors (PV). 144 Each vector contains a set of properties. 145 The width of a vector (number of uint32_t per row) may change 146 with the formatVersion, it is stored in i5. 147 148 Current properties: see icu/source/common/uprops.h 149 150 --- Changes in format version 3.1 --- 151 152 See i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT. 153 154 --- Changes in format version 3.2 --- 155 156 - The tries use linear Latin-1 ranges. 157 - The additional properties bits store full properties XYZ instead 158 of partial Other_XYZ, so that changes in the derivation formulas 159 need not be tracked in runtime library code. 160 - Joining Type and Line Break are also stored completely, so that uprops.c 161 needs no runtime formulas for enumerated properties either. 162 - Store the case-sensitive flag in the main properties word. 163 - i10 also contains U_LB_COUNT and U_EA_COUNT. 164 - i11 contains maxValues2 for vector word 2. 165 166 --- Changes in format version 4 --- 167 168 The format changes between version 3 and 4 because the properties related to 169 case mappings and bidi/shaping are pulled out into separate files 170 for modularization. 171 In order to reduce the need for code changes, some of the previous data 172 structures are omitted, rather than rearranging everything. 173 174 (The change to format version 4 is for ICU 3.4. The last CVS revision of 175 genprops/store.c for format version 3.2 is 1.48.) 176 177 The main trie's data is significantly simplified: 178 - The trie's 16-bit data word is used directly instead of as an index 179 into props32[]. 180 - The trie uses the default trie folding functions instead of custom ones. 181 - Numeric values are stored directly in the trie data word, with special 182 encodings. 183 - No more exception data (the data that needed it was pulled out, or, in the 184 case of numeric values, encoded differently). 185 - No more string data (pulled out - was for case mappings). 186 187 Also, some of the previously used properties vector bits are reserved again. 188 189 The indexes[] values for the omitted structures are still filled in 190 (indicating zero-length arrays) so that the swapper code remains unchanged. 191 192 --- Changes in format version 5 --- 193 194 Format version 5 became necessary because the bit field for script codes 195 overflowed. The changes are incompatible because 196 old code would have seen nonsensically low values for new, higher script codes. 197 198 Rearranged bit fields in the second trie (AT) and widened three (Script, Block, 199 Word_Break) by one bit each. 200 201 Modified bit fields in icu/source/common/uprops.h 202 203 --- Changes in format version 6 --- 204 205 Format version 6 became necessary because Unicode 5.2 adds fractions with 206 denominators 9, 10 and 16, and it was easier to redesign the encoding of numeric 207 types and values rather than add another variant to the previous format. 208 209 ----------------------------------------------------------------------------- */ 210 211 /* UDataInfo cf. udata.h */ 212 static UDataInfo dataInfo={ 213 sizeof(UDataInfo), 214 0, 215 216 U_IS_BIG_ENDIAN, 217 U_CHARSET_FAMILY, 218 U_SIZEOF_UCHAR, 219 0, 220 221 { 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */ 222 { 6, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ 223 { 5, 1, 0, 0 } /* dataVersion */ 224 }; 225 226 static UNewTrie *pTrie=NULL; 227 228 /* -------------------------------------------------------------------------- */ 229 230 extern void 231 setUnicodeVersion(const char *v) { 232 UVersionInfo version; 233 u_versionFromString(version, v); 234 uprv_memcpy(dataInfo.dataVersion, version, 4); 235 } 236 237 extern void 238 initStore() { 239 pTrie=utrie_open(NULL, NULL, 40000, 0, 0, TRUE); 240 if(pTrie==NULL) { 241 fprintf(stderr, "error: unable to create a UNewTrie\n"); 242 exit(U_MEMORY_ALLOCATION_ERROR); 243 } 244 245 initAdditionalProperties(); 246 } 247 248 extern void 249 exitStore() { 250 utrie_close(pTrie); 251 exitAdditionalProperties(); 252 } 253 254 /* store a character's properties ------------------------------------------- */ 255 256 extern uint32_t 257 makeProps(Props *p) { 258 uint32_t den; 259 int32_t type, value, exp, ntv; 260 261 /* encode numeric type & value */ 262 type=p->numericType; 263 value=p->numericValue; 264 den=p->denominator; 265 exp=p->exponent; 266 267 ntv=-1; /* the numeric type and value cannot be encoded if ntv remains -1 */ 268 switch(type) { 269 case U_NT_NONE: 270 if(value==0 && den==0 && exp==0) { 271 ntv=UPROPS_NTV_NONE; 272 } 273 break; 274 case U_NT_DECIMAL: 275 if(0<=value && value<=9 && den==0 && exp==0) { 276 ntv=UPROPS_NTV_DECIMAL_START+value; 277 } 278 break; 279 case U_NT_DIGIT: 280 if(0<=value && value<=9 && den==0 && exp==0) { 281 ntv=UPROPS_NTV_DIGIT_START+value; 282 } 283 break; 284 case U_NT_NUMERIC: 285 if(den==0) { 286 if(exp==2 && (value*100)<=UPROPS_NTV_MAX_SMALL_INT) { 287 /* small integer parsed like a large one */ 288 ntv=UPROPS_NTV_NUMERIC_START+value*100; 289 } else if(exp==0 && value>=0) { 290 if(value<=UPROPS_NTV_MAX_SMALL_INT) { 291 /* small integer */ 292 ntv=UPROPS_NTV_NUMERIC_START+value; 293 } else { 294 /* large integer parsed like a small one */ 295 /* split the value into mantissa and exponent, base 10 */ 296 int32_t mant=value; 297 while((mant%10)==0) { 298 mant/=10; 299 ++exp; 300 } 301 if(mant<=9) { 302 ntv=((mant+14)<<5)+(exp-2); 303 } 304 } 305 } else if(2<=exp && exp<=33 && 1<=value && value<=9) { 306 /* large, single-significant-digit integer */ 307 ntv=((value+14)<<5)+(exp-2); 308 } 309 } else if(exp==0) { 310 if(-1<=value && value<=17 && 1<=den && den<=16) { 311 /* fraction */ 312 ntv=((value+12)<<4)+(den-1); 313 } 314 } 315 default: 316 break; 317 } 318 if(ntv<0) { 319 fprintf(stderr, "genprops error: unable to encode numeric type %d & value %ld/%lu E%d\n", 320 (int)type, (long)value, (unsigned long)den, exp); 321 exit(U_ILLEGAL_ARGUMENT_ERROR); 322 } 323 324 /* encode the properties */ 325 return 326 (uint32_t)p->generalCategory | 327 (ntv<<UPROPS_NUMERIC_TYPE_VALUE_SHIFT); 328 } 329 330 extern void 331 addProps(uint32_t c, uint32_t x) { 332 if(!utrie_set32(pTrie, (UChar32)c, x)) { 333 fprintf(stderr, "error: too many entries for the properties trie\n"); 334 exit(U_BUFFER_OVERFLOW_ERROR); 335 } 336 } 337 338 extern uint32_t 339 getProps(uint32_t c) { 340 return utrie_get32(pTrie, (UChar32)c, NULL); 341 } 342 343 /* areas of same properties ------------------------------------------------- */ 344 345 extern void 346 repeatProps(uint32_t first, uint32_t last, uint32_t x) { 347 if(!utrie_setRange32(pTrie, (UChar32)first, (UChar32)(last+1), x, FALSE)) { 348 fprintf(stderr, "error: too many entries for the properties trie\n"); 349 exit(U_BUFFER_OVERFLOW_ERROR); 350 } 351 } 352 353 /* generate output data ----------------------------------------------------- */ 354 355 extern void 356 generateData(const char *dataDir, UBool csource) { 357 static int32_t indexes[UPROPS_INDEX_COUNT]={ 358 0, 0, 0, 0, 359 0, 0, 0, 0, 360 0, 0, 0, 0, 361 0, 0, 0, 0 362 }; 363 static uint8_t trieBlock[40000]; 364 static uint8_t additionalProps[120000]; 365 366 UNewDataMemory *pData; 367 UErrorCode errorCode=U_ZERO_ERROR; 368 uint32_t size = 0; 369 int32_t trieSize, additionalPropsSize, offset; 370 long dataLength; 371 372 trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode); 373 if(U_FAILURE(errorCode)) { 374 fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize); 375 exit(errorCode); 376 } 377 378 offset=sizeof(indexes)/4; /* uint32_t offset to the properties trie */ 379 380 /* round up trie size to 4-alignment */ 381 trieSize=(trieSize+3)&~3; 382 offset+=trieSize>>2; 383 indexes[UPROPS_PROPS32_INDEX]= /* set indexes to the same offsets for empty */ 384 indexes[UPROPS_EXCEPTIONS_INDEX]= /* structures from the old format version 3 */ 385 indexes[UPROPS_EXCEPTIONS_TOP_INDEX]= /* so that less runtime code has to be changed */ 386 indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset; 387 388 if(beVerbose) { 389 printf("trie size in bytes: %5u\n", (int)trieSize); 390 } 391 392 if(csource) { 393 /* write .c file for hardcoded data */ 394 UTrie trie={ NULL }; 395 UTrie2 *trie2; 396 FILE *f; 397 398 utrie_unserialize(&trie, trieBlock, trieSize, &errorCode); 399 if(U_FAILURE(errorCode)) { 400 fprintf( 401 stderr, 402 "genprops error: failed to utrie_unserialize(uprops.icu main trie) - %s\n", 403 u_errorName(errorCode)); 404 exit(errorCode); 405 } 406 407 /* use UTrie2 */ 408 trie2=utrie2_fromUTrie(&trie, 0, &errorCode); 409 if(U_FAILURE(errorCode)) { 410 fprintf( 411 stderr, 412 "genprops error: utrie2_fromUTrie() failed - %s\n", 413 u_errorName(errorCode)); 414 exit(errorCode); 415 } 416 { 417 /* delete lead surrogate code unit values */ 418 UChar lead; 419 trie2=utrie2_cloneAsThawed(trie2, &errorCode); 420 for(lead=0xd800; lead<0xdc00; ++lead) { 421 utrie2_set32ForLeadSurrogateCodeUnit(trie2, lead, trie2->initialValue, &errorCode); 422 } 423 utrie2_freeze(trie2, UTRIE2_16_VALUE_BITS, &errorCode); 424 if(U_FAILURE(errorCode)) { 425 fprintf( 426 stderr, 427 "genprops error: deleting lead surrogate code unit values failed - %s\n", 428 u_errorName(errorCode)); 429 exit(errorCode); 430 } 431 } 432 433 f=usrc_create(dataDir, "uchar_props_data.c"); 434 if(f!=NULL) { 435 /* unused 436 usrc_writeArray(f, 437 "static const UVersionInfo formatVersion={", 438 dataInfo.formatVersion, 8, 4, 439 "};\n\n"); 440 */ 441 usrc_writeArray(f, 442 "static const UVersionInfo dataVersion={", 443 dataInfo.dataVersion, 8, 4, 444 "};\n\n"); 445 usrc_writeUTrie2Arrays(f, 446 "static const uint16_t propsTrie_index[%ld]={\n", NULL, 447 trie2, 448 "\n};\n\n"); 449 usrc_writeUTrie2Struct(f, 450 "static const UTrie2 propsTrie={\n", 451 trie2, "propsTrie_index", NULL, 452 "};\n\n"); 453 454 additionalPropsSize=writeAdditionalData(f, additionalProps, sizeof(additionalProps), indexes); 455 size=4*offset+additionalPropsSize; /* total size of data */ 456 457 usrc_writeArray(f, 458 "static const int32_t indexes[UPROPS_INDEX_COUNT]={", 459 indexes, 32, UPROPS_INDEX_COUNT, 460 "};\n\n"); 461 fclose(f); 462 } 463 utrie2_close(trie2); 464 } else { 465 /* write the data */ 466 pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo, 467 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); 468 if(U_FAILURE(errorCode)) { 469 fprintf(stderr, "genprops: unable to create data memory, %s\n", u_errorName(errorCode)); 470 exit(errorCode); 471 } 472 473 additionalPropsSize=writeAdditionalData(NULL, additionalProps, sizeof(additionalProps), indexes); 474 size=4*offset+additionalPropsSize; /* total size of data */ 475 476 udata_writeBlock(pData, indexes, sizeof(indexes)); 477 udata_writeBlock(pData, trieBlock, trieSize); 478 udata_writeBlock(pData, additionalProps, additionalPropsSize); 479 480 /* finish up */ 481 dataLength=udata_finish(pData, &errorCode); 482 if(U_FAILURE(errorCode)) { 483 fprintf(stderr, "genprops: error %d writing the output file\n", errorCode); 484 exit(errorCode); 485 } 486 487 if(dataLength!=(long)size) { 488 fprintf(stderr, "genprops: data length %ld != calculated size %lu\n", 489 dataLength, (unsigned long)size); 490 exit(U_INTERNAL_PROGRAM_ERROR); 491 } 492 } 493 494 if(beVerbose) { 495 printf("data size: %6lu\n", (unsigned long)size); 496 } 497 } 498 499 /* 500 * Hey, Emacs, please set the following: 501 * 502 * Local Variables: 503 * indent-tabs-mode: nil 504 * End: 505 * 506 */ 507