1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 1999-2008, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: store.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 1999dec11 14 * created by: Markus W. Scherer 15 * 16 * Store Unicode character properties efficiently for 17 * random access. 18 */ 19 20 #include <stdio.h> 21 #include "unicode/utypes.h" 22 #include "unicode/uchar.h" 23 #include "cmemory.h" 24 #include "cstring.h" 25 #include "utrie.h" 26 #include "unicode/udata.h" 27 #include "unewdata.h" 28 #include "writesrc.h" 29 #include "uprops.h" 30 #include "genprops.h" 31 32 #define DO_DEBUG_OUT 0 33 34 /* Unicode character properties file format ------------------------------------ 35 36 The file format prepared and written here contains several data 37 structures that store indexes or data. 38 39 Before the data contents described below, there are the headers required by 40 the udata API for loading ICU data. Especially, a UDataInfo structure 41 precedes the actual data. It contains platform properties values and the 42 file format version. 43 44 The following is a description of format version 5 . 45 46 The format changes between version 3 and 4 because the properties related to 47 case mappings and bidi/shaping are pulled out into separate files 48 for modularization. 49 In order to reduce the need for code changes, some of the previous data 50 structures are omitted, rather than rearranging everything. 51 52 For details see "Changes in format version 4" below. 53 54 Format version 5 became necessary because the bit field for script codes 55 overflowed. Several bit fields got rearranged, and three (Script, Block, 56 Word_Break) got widened by one bit each. 57 58 Data contents: 59 60 The contents is a parsed, binary form of several Unicode character 61 database files, most prominently UnicodeData.txt. 62 63 Any Unicode code point from 0 to 0x10ffff can be looked up to get 64 the properties, if any, for that code point. This means that the input 65 to the lookup are 21-bit unsigned integers, with not all of the 66 21-bit range used. 67 68 It is assumed that client code keeps a uint32_t pointer 69 to the beginning of the data: 70 71 const uint32_t *p32; 72 73 Formally, the file contains the following structures: 74 75 const int32_t indexes[16] with values i0..i15: 76 77 i0 indicates the length of the main trie. 78 i0..i3 all have the same value in format version 4.0; 79 the related props32[] and exceptions[] and uchars[] were used in format version 3 80 81 i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words 82 i1 exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words 83 i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings 84 85 i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties 86 i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors 87 i5 additionalVectorsColumns; -- number of 32-bit words per properties vector 88 89 i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table 90 i7..i9 reservedIndexes; -- reserved values; 0 for now 91 92 i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+) 93 i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2) 94 i12..i15 reservedIndexes; -- reserved values; 0 for now 95 96 PT serialized properties trie, see utrie.h (byte size: 4*(i0-16)) 97 98 P, E, and U are not used (empty) in format version 4 99 100 P const uint32_t props32[i1-i0]; 101 E const uint32_t exceptions[i2-i1]; 102 U const UChar uchars[2*(i3-i2)]; 103 104 AT serialized trie for additional properties (byte size: 4*(i4-i3)) 105 PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4]; 106 107 Trie lookup and properties: 108 109 In order to condense the data for the 21-bit code space, several properties of 110 the Unicode code assignment are exploited: 111 - The code space is sparse. 112 - There are several 10k of consecutive codes with the same properties. 113 - Characters and scripts are allocated in groups of 16 code points. 114 - Inside blocks for scripts the properties are often repetitive. 115 - The 21-bit space is not fully used for Unicode. 116 117 The lookup of properties for a given code point is done with a trie lookup, 118 using the UTrie implementation. 119 The trie lookup result is a 16-bit properties word. 120 121 With a given Unicode code point 122 123 UChar32 c; 124 125 and 0<=c<0x110000, the lookup is done like this: 126 127 uint16_t props; 128 UTRIE_GET16(trie, c, props); 129 130 Each 16-bit properties word contains: 131 132 0.. 4 general category 133 5.. 7 numeric type 134 non-digit numbers are stored with multiple types and pseudo-types 135 in order to facilitate compact encoding: 136 0 no numeric value (0) 137 1 decimal digit value (0..9) 138 2 digit value (0..9) 139 3 (U_NT_NUMERIC) normal non-digit numeric value 0..0xff 140 4 (internal type UPROPS_NT_FRACTION) fraction 141 5 (internal type UPROPS_NT_LARGE) large number >0xff 142 6..7 reserved 143 144 when returning the numeric type from a public API, 145 internal types must be turned into U_NT_NUMERIC 146 147 8..15 numeric value 148 encoding of fractions and large numbers see below 149 150 Fractions: 151 // n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down) 152 int32_t num, den; 153 num=n>>3; // num=0..31 154 den=(n&7)+2; // den=2..9 155 if(num==0) { 156 num=-1; // num=-1 or 1..31 157 } 158 double result=(double)num/(double)den; 159 160 Large numbers: 161 // n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down) 162 int32_t m, e; 163 m=n>>4; // m=0..15 164 e=(n&0xf); 165 if(m==0) { 166 m=1; // for large powers of 10 167 e+=18; // e=18..33 168 } else { 169 e+=2; // e=2..17 170 } // m==10..15 are reserved 171 double result=(double)m*10^e; 172 173 --- Additional properties (new in format version 2.1) --- 174 175 The second trie for additional properties (AT) is also a UTrie with 16-bit data. 176 The data words consist of 32-bit unit indexes (not row indexes!) into the 177 table of unique properties vectors (PV). 178 Each vector contains a set of properties. 179 The width of a vector (number of uint32_t per row) may change 180 with the formatVersion, it is stored in i5. 181 182 Current properties: see icu/source/common/uprops.h 183 184 --- Changes in format version 3.1 --- 185 186 See i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT. 187 188 --- Changes in format version 3.2 --- 189 190 - The tries use linear Latin-1 ranges. 191 - The additional properties bits store full properties XYZ instead 192 of partial Other_XYZ, so that changes in the derivation formulas 193 need not be tracked in runtime library code. 194 - Joining Type and Line Break are also stored completely, so that uprops.c 195 needs no runtime formulas for enumerated properties either. 196 - Store the case-sensitive flag in the main properties word. 197 - i10 also contains U_LB_COUNT and U_EA_COUNT. 198 - i11 contains maxValues2 for vector word 2. 199 200 --- Changes in format version 4 --- 201 202 The format changes between version 3 and 4 because the properties related to 203 case mappings and bidi/shaping are pulled out into separate files 204 for modularization. 205 In order to reduce the need for code changes, some of the previous data 206 structures are omitted, rather than rearranging everything. 207 208 (The change to format version 4 is for ICU 3.4. The last CVS revision of 209 genprops/store.c for format version 3.2 is 1.48.) 210 211 The main trie's data is significantly simplified: 212 - The trie's 16-bit data word is used directly instead of as an index 213 into props32[]. 214 - The trie uses the default trie folding functions instead of custom ones. 215 - Numeric values are stored directly in the trie data word, with special 216 encodings. 217 - No more exception data (the data that needed it was pulled out, or, in the 218 case of numeric values, encoded differently). 219 - No more string data (pulled out - was for case mappings). 220 221 Also, some of the previously used properties vector bits are reserved again. 222 223 The indexes[] values for the omitted structures are still filled in 224 (indicating zero-length arrays) so that the swapper code remains unchanged. 225 226 --- Changes in format version 5 --- 227 228 Rearranged bit fields in the second trie (AT) because the script code field 229 overflowed. Old code would have seen nonsensically low values for new, higher 230 script codes. 231 Modified bit fields in icu/source/common/uprops.h 232 233 ----------------------------------------------------------------------------- */ 234 235 /* UDataInfo cf. udata.h */ 236 static UDataInfo dataInfo={ 237 sizeof(UDataInfo), 238 0, 239 240 U_IS_BIG_ENDIAN, 241 U_CHARSET_FAMILY, 242 U_SIZEOF_UCHAR, 243 0, 244 245 { 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */ 246 { 5, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ 247 { 5, 1, 0, 0 } /* dataVersion */ 248 }; 249 250 static UNewTrie *pTrie=NULL; 251 252 /* -------------------------------------------------------------------------- */ 253 254 extern void 255 setUnicodeVersion(const char *v) { 256 UVersionInfo version; 257 u_versionFromString(version, v); 258 uprv_memcpy(dataInfo.dataVersion, version, 4); 259 } 260 261 extern void 262 initStore() { 263 pTrie=utrie_open(NULL, NULL, 40000, 0, 0, TRUE); 264 if(pTrie==NULL) { 265 fprintf(stderr, "error: unable to create a UNewTrie\n"); 266 exit(U_MEMORY_ALLOCATION_ERROR); 267 } 268 269 initAdditionalProperties(); 270 } 271 272 extern void 273 exitStore() { 274 utrie_close(pTrie); 275 exitAdditionalProperties(); 276 } 277 278 static uint32_t printNumericTypeValueError(Props *p) { 279 fprintf(stderr, "genprops error: unable to encode numeric type & value %d %ld/%lu E%d\n", 280 (int)p->numericType, (long)p->numericValue, (unsigned long)p->denominator, p->exponent); 281 exit(U_ILLEGAL_ARGUMENT_ERROR); 282 return 0; 283 } 284 285 /* store a character's properties ------------------------------------------- */ 286 287 extern uint32_t 288 makeProps(Props *p) { 289 uint32_t den; 290 int32_t type, value, exp; 291 292 /* encode numeric type & value */ 293 type=p->numericType; 294 value=p->numericValue; 295 den=p->denominator; 296 exp=p->exponent; 297 298 if(den!=0) { 299 /* fraction */ 300 if( type!=U_NT_NUMERIC || 301 value<-1 || value==0 || value>UPROPS_FRACTION_MAX_NUM || 302 den<UPROPS_FRACTION_MIN_DEN || UPROPS_FRACTION_MAX_DEN<den || 303 exp!=0 304 ) { 305 return printNumericTypeValueError(p); 306 } 307 type=UPROPS_NT_FRACTION; 308 309 if(value==-1) { 310 value=0; 311 } 312 den-=UPROPS_FRACTION_DEN_OFFSET; 313 value=(value<<UPROPS_FRACTION_NUM_SHIFT)|den; 314 } else if(exp!=0) { 315 /* very large value */ 316 if( type!=U_NT_NUMERIC || 317 value<1 || 9<value || 318 exp<UPROPS_LARGE_MIN_EXP || UPROPS_LARGE_MAX_EXP_EXTRA<exp 319 ) { 320 return printNumericTypeValueError(p); 321 } 322 type=UPROPS_NT_LARGE; 323 324 if(exp<=UPROPS_LARGE_MAX_EXP) { 325 /* 1..9 * 10^(2..17) */ 326 exp-=UPROPS_LARGE_EXP_OFFSET; 327 } else { 328 /* 1 * 10^(18..33) */ 329 if(value!=1) { 330 return printNumericTypeValueError(p); 331 } 332 value=0; 333 exp-=UPROPS_LARGE_EXP_OFFSET_EXTRA; 334 } 335 value=(value<<UPROPS_LARGE_MANT_SHIFT)|exp; 336 } else if(value>UPROPS_MAX_SMALL_NUMBER) { 337 /* large value */ 338 if(type!=U_NT_NUMERIC) { 339 return printNumericTypeValueError(p); 340 } 341 type=UPROPS_NT_LARGE; 342 343 /* split the value into mantissa and exponent, base 10 */ 344 while((value%10)==0) { 345 value/=10; 346 ++exp; 347 } 348 if(value>9) { 349 return printNumericTypeValueError(p); 350 } 351 352 exp-=UPROPS_LARGE_EXP_OFFSET; 353 value=(value<<UPROPS_LARGE_MANT_SHIFT)|exp; 354 } else if(value<0) { 355 /* unable to encode negative values, other than fractions -1/x */ 356 return printNumericTypeValueError(p); 357 358 /* } else normal value=0..0xff { */ 359 } 360 361 /* encode the properties */ 362 return 363 (uint32_t)p->generalCategory | 364 ((uint32_t)type<<UPROPS_NUMERIC_TYPE_SHIFT) | 365 ((uint32_t)value<<UPROPS_NUMERIC_VALUE_SHIFT); 366 } 367 368 extern void 369 addProps(uint32_t c, uint32_t x) { 370 if(!utrie_set32(pTrie, (UChar32)c, x)) { 371 fprintf(stderr, "error: too many entries for the properties trie\n"); 372 exit(U_BUFFER_OVERFLOW_ERROR); 373 } 374 } 375 376 extern uint32_t 377 getProps(uint32_t c) { 378 return utrie_get32(pTrie, (UChar32)c, NULL); 379 } 380 381 /* areas of same properties ------------------------------------------------- */ 382 383 extern void 384 repeatProps(uint32_t first, uint32_t last, uint32_t x) { 385 if(!utrie_setRange32(pTrie, (UChar32)first, (UChar32)(last+1), x, FALSE)) { 386 fprintf(stderr, "error: too many entries for the properties trie\n"); 387 exit(U_BUFFER_OVERFLOW_ERROR); 388 } 389 } 390 391 /* generate output data ----------------------------------------------------- */ 392 393 extern void 394 generateData(const char *dataDir, UBool csource) { 395 static int32_t indexes[UPROPS_INDEX_COUNT]={ 396 0, 0, 0, 0, 397 0, 0, 0, 0, 398 0, 0, 0, 0, 399 0, 0, 0, 0 400 }; 401 static uint8_t trieBlock[40000]; 402 static uint8_t additionalProps[120000]; 403 404 UNewDataMemory *pData; 405 UErrorCode errorCode=U_ZERO_ERROR; 406 uint32_t size = 0; 407 int32_t trieSize, additionalPropsSize, offset; 408 long dataLength; 409 410 trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode); 411 if(U_FAILURE(errorCode)) { 412 fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize); 413 exit(errorCode); 414 } 415 416 offset=sizeof(indexes)/4; /* uint32_t offset to the properties trie */ 417 418 /* round up trie size to 4-alignment */ 419 trieSize=(trieSize+3)&~3; 420 offset+=trieSize>>2; 421 indexes[UPROPS_PROPS32_INDEX]= /* set indexes to the same offsets for empty */ 422 indexes[UPROPS_EXCEPTIONS_INDEX]= /* structures from the old format version 3 */ 423 indexes[UPROPS_EXCEPTIONS_TOP_INDEX]= /* so that less runtime code has to be changed */ 424 indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset; 425 426 if(beVerbose) { 427 printf("trie size in bytes: %5u\n", (int)trieSize); 428 } 429 430 if(csource) { 431 /* write .c file for hardcoded data */ 432 UTrie trie={ NULL }; 433 UTrie2 *trie2; 434 FILE *f; 435 436 utrie_unserialize(&trie, trieBlock, trieSize, &errorCode); 437 if(U_FAILURE(errorCode)) { 438 fprintf( 439 stderr, 440 "genprops error: failed to utrie_unserialize(uprops.icu main trie) - %s\n", 441 u_errorName(errorCode)); 442 exit(errorCode); 443 } 444 445 /* use UTrie2 */ 446 dataInfo.formatVersion[0]=6; 447 dataInfo.formatVersion[2]=0; 448 dataInfo.formatVersion[3]=0; 449 trie2=utrie2_fromUTrie(&trie, 0, &errorCode); 450 if(U_FAILURE(errorCode)) { 451 fprintf( 452 stderr, 453 "genprops error: utrie2_fromUTrie() failed - %s\n", 454 u_errorName(errorCode)); 455 exit(errorCode); 456 } 457 { 458 /* delete lead surrogate code unit values */ 459 UChar lead; 460 trie2=utrie2_cloneAsThawed(trie2, &errorCode); 461 for(lead=0xd800; lead<0xdc00; ++lead) { 462 utrie2_set32ForLeadSurrogateCodeUnit(trie2, lead, trie2->initialValue, &errorCode); 463 } 464 utrie2_freeze(trie2, UTRIE2_16_VALUE_BITS, &errorCode); 465 if(U_FAILURE(errorCode)) { 466 fprintf( 467 stderr, 468 "genprops error: deleting lead surrogate code unit values failed - %s\n", 469 u_errorName(errorCode)); 470 exit(errorCode); 471 } 472 } 473 474 f=usrc_create(dataDir, "uchar_props_data.c"); 475 if(f!=NULL) { 476 usrc_writeArray(f, 477 "static const UVersionInfo formatVersion={", 478 dataInfo.formatVersion, 8, 4, 479 "};\n\n"); 480 usrc_writeArray(f, 481 "static const UVersionInfo dataVersion={", 482 dataInfo.dataVersion, 8, 4, 483 "};\n\n"); 484 usrc_writeUTrie2Arrays(f, 485 "static const uint16_t propsTrie_index[%ld]={\n", NULL, 486 trie2, 487 "\n};\n\n"); 488 usrc_writeUTrie2Struct(f, 489 "static const UTrie2 propsTrie={\n", 490 trie2, "propsTrie_index", NULL, 491 "};\n\n"); 492 493 additionalPropsSize=writeAdditionalData(f, additionalProps, sizeof(additionalProps), indexes); 494 size=4*offset+additionalPropsSize; /* total size of data */ 495 496 usrc_writeArray(f, 497 "static const int32_t indexes[UPROPS_INDEX_COUNT]={", 498 indexes, 32, UPROPS_INDEX_COUNT, 499 "};\n\n"); 500 fclose(f); 501 } 502 utrie2_close(trie2); 503 } else { 504 /* write the data */ 505 pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo, 506 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); 507 if(U_FAILURE(errorCode)) { 508 fprintf(stderr, "genprops: unable to create data memory, %s\n", u_errorName(errorCode)); 509 exit(errorCode); 510 } 511 512 additionalPropsSize=writeAdditionalData(NULL, additionalProps, sizeof(additionalProps), indexes); 513 size=4*offset+additionalPropsSize; /* total size of data */ 514 515 udata_writeBlock(pData, indexes, sizeof(indexes)); 516 udata_writeBlock(pData, trieBlock, trieSize); 517 udata_writeBlock(pData, additionalProps, additionalPropsSize); 518 519 /* finish up */ 520 dataLength=udata_finish(pData, &errorCode); 521 if(U_FAILURE(errorCode)) { 522 fprintf(stderr, "genprops: error %d writing the output file\n", errorCode); 523 exit(errorCode); 524 } 525 526 if(dataLength!=(long)size) { 527 fprintf(stderr, "genprops: data length %ld != calculated size %lu\n", 528 dataLength, (unsigned long)size); 529 exit(U_INTERNAL_PROGRAM_ERROR); 530 } 531 } 532 533 if(beVerbose) { 534 printf("data size: %6lu\n", (unsigned long)size); 535 } 536 } 537 538 /* 539 * Hey, Emacs, please set the following: 540 * 541 * Local Variables: 542 * indent-tabs-mode: nil 543 * End: 544 * 545 */ 546