1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2004-2008, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: store.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2004dec30 14 * created by: Markus W. Scherer 15 * 16 * Store Unicode bidi/shaping properties efficiently for 17 * random access. 18 */ 19 20 #include <stdio.h> 21 #include <stdlib.h> 22 #include "unicode/utypes.h" 23 #include "unicode/uchar.h" 24 #include "cmemory.h" 25 #include "cstring.h" 26 #include "utrie.h" 27 #include "utrie2.h" 28 #include "uarrsort.h" 29 #include "unicode/udata.h" 30 #include "unewdata.h" 31 #include "propsvec.h" 32 #include "writesrc.h" 33 #include "ubidi_props.h" 34 #include "genbidi.h" 35 36 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) 37 38 /* Unicode bidi/shaping properties file format --------------------------------- 39 40 The file format prepared and written here contains several data 41 structures that store indexes or data. 42 43 Before the data contents described below, there are the headers required by 44 the udata API for loading ICU data. Especially, a UDataInfo structure 45 precedes the actual data. It contains platform properties values and the 46 file format version. 47 48 The following is a description of format version 1.0 . 49 50 The file contains the following structures: 51 52 const int32_t indexes[i0] with values i0, i1, ...: 53 (see UBIDI_IX_... constants for names of indexes) 54 55 i0 indexLength; -- length of indexes[] (UBIDI_IX_TOP) 56 i1 dataLength; -- length in bytes of the post-header data (incl. indexes[]) 57 i2 trieSize; -- size in bytes of the bidi/shaping properties trie 58 i3 mirrorLength; -- length in uint32_t of the bidi mirroring array 59 60 i4 jgStart; -- first code point with Joining_Group data 61 i5 jgLimit; -- limit code point for Joining_Group data 62 63 i6..i14 reservedIndexes; -- reserved values; 0 for now 64 65 i15 maxValues; -- maximum code values for enumerated properties 66 bits 23..16 contain the max value for Joining_Group, 67 otherwise the bits are used like enum fields in the trie word 68 69 Serialized trie, see utrie.h; 70 71 const uint32_t mirrors[mirrorLength]; 72 73 const uint8_t jgArray[i5-i4]; -- (i5-i4) is always a multiple of 4 74 75 Trie data word: 76 Bits 77 15..13 signed delta to bidi mirroring code point 78 (add delta to input code point) 79 0 no such code point (source maps to itself) 80 -3..-1, 1..3 delta 81 -4 look in mirrors table 82 12 is mirrored 83 11 Bidi_Control 84 10 Join_Control 85 9.. 8 reserved (set to 0) 86 7.. 5 Joining_Type 87 4.. 0 BiDi category 88 89 90 Mirrors: 91 Stores some of the bidi mirroring data, where each code point maps to 92 at most one other. 93 Most code points do not have a mirroring code point; most that do have a signed 94 delta stored in the trie data value. Only those where the delta does not fit 95 into the trie data are stored in this table. 96 97 Logically, this is a two-column table with source and mirror code points. 98 99 Physically, the table is compressed by taking advantage of the fact that each 100 mirror code point is also a source code point 101 (each of them is a mirror of the other). 102 Therefore, both logical columns contain the same set of code points, which needs 103 to be stored only once. 104 105 The table stores source code points, and also for each the index of its mirror 106 code point in the same table, in a simple array of uint32_t. 107 Bits 108 31..21 index to mirror code point (unsigned) 109 20.. 0 source code point 110 111 The table is sorted by source code points. 112 113 114 Joining_Group array: 115 The Joining_Group values do not fit into the 16-bit trie, but the data is also 116 limited to a small range of code points (Arabic and Syriac) and not 117 well compressible. 118 119 The start and limit code points for the range are stored in the indexes[] 120 array, and the jgArray[] stores a byte for each of these code points, 121 containing the Joining_Group value. 122 123 All code points outside of this range have No_Joining_Group (0). 124 125 ----------------------------------------------------------------------------- */ 126 127 /* UDataInfo cf. udata.h */ 128 static UDataInfo dataInfo={ 129 sizeof(UDataInfo), 130 0, 131 132 U_IS_BIG_ENDIAN, 133 U_CHARSET_FAMILY, 134 U_SIZEOF_UCHAR, 135 0, 136 137 /* dataFormat="BiDi" */ 138 { UBIDI_FMT_0, UBIDI_FMT_1, UBIDI_FMT_2, UBIDI_FMT_3 }, 139 { 1, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ 140 { 4, 0, 1, 0 } /* dataVersion */ 141 }; 142 143 /* exceptions values */ 144 static uint32_t mirrors[UBIDI_MAX_MIRROR_INDEX+1][2]; 145 static uint16_t mirrorTop=0; 146 147 /* -------------------------------------------------------------------------- */ 148 149 extern void 150 setUnicodeVersion(const char *v) { 151 UVersionInfo version; 152 u_versionFromString(version, v); 153 uprv_memcpy(dataInfo.dataVersion, version, 4); 154 } 155 156 /* bidi mirroring table ----------------------------------------------------- */ 157 158 extern void 159 addMirror(UChar32 src, UChar32 mirror) { 160 UErrorCode errorCode; 161 int32_t delta; 162 163 delta=mirror-src; 164 if(delta==0) { 165 return; /* mapping to self=no mapping */ 166 } 167 168 if(delta<UBIDI_MIN_MIRROR_DELTA || UBIDI_MAX_MIRROR_DELTA<delta) { 169 /* delta does not fit into the trie properties value, store in the mirrors[] table */ 170 if(mirrorTop==LENGTHOF(mirrors)) { 171 fprintf(stderr, "genbidi error: too many long-distance mirroring mappings\n"); 172 exit(U_BUFFER_OVERFLOW_ERROR); 173 } 174 175 /* possible: search the table so far and see if src is already listed */ 176 177 mirrors[mirrorTop][0]=(uint32_t)src; 178 mirrors[mirrorTop][1]=(uint32_t)mirror; 179 ++mirrorTop; 180 181 /* set an escape marker in src's properties */ 182 delta=UBIDI_ESC_MIRROR_DELTA; 183 } 184 185 errorCode=U_ZERO_ERROR; 186 upvec_setValue( 187 pv, src, src, 0, 188 (uint32_t)delta<<UBIDI_MIRROR_DELTA_SHIFT, (uint32_t)(-1)<<UBIDI_MIRROR_DELTA_SHIFT, 189 &errorCode); 190 if(U_FAILURE(errorCode)) { 191 fprintf(stderr, "genbidi error: unable to set mirroring delta, code: %s\n", 192 u_errorName(errorCode)); 193 exit(errorCode); 194 } 195 } 196 197 static int32_t U_CALLCONV 198 compareMirror(const void *context, const void *left, const void *right) { 199 UChar32 l, r; 200 201 l=UBIDI_GET_MIRROR_CODE_POINT(((const uint32_t *)left)[0]); 202 r=UBIDI_GET_MIRROR_CODE_POINT(((const uint32_t *)right)[0]); 203 return l-r; 204 } 205 206 static void 207 makeMirror() { 208 uint32_t *reducedMirror; 209 UErrorCode errorCode; 210 int32_t i, j, start, limit, step; 211 uint32_t c; 212 213 /* sort the mirroring table by source code points */ 214 errorCode=U_ZERO_ERROR; 215 uprv_sortArray(mirrors, mirrorTop, 8, 216 compareMirror, NULL, FALSE, &errorCode); 217 218 /* 219 * reduce the 2-column table to a single column 220 * by putting the index to the mirror entry into the source entry 221 * 222 * first: 223 * find each mirror code point in the source column and set each other's indexes 224 * 225 * second: 226 * reduce the table, combine the source code points with their indexes 227 * and store as a simple array of uint32_t 228 */ 229 for(i=0; i<mirrorTop; ++i) { 230 c=mirrors[i][1]; /* mirror code point */ 231 if(c>0x1fffff) { 232 continue; /* this entry already has an index */ 233 } 234 235 /* search for the mirror code point in the source column */ 236 if(c<mirrors[i][0]) { 237 /* search before i */ 238 start=i-1; 239 limit=-1; 240 step=-1; 241 } else { 242 start=i+1; 243 limit=mirrorTop; 244 step=1; 245 } 246 247 for(j=start;; j+=step) { 248 if(j==limit) { 249 fprintf(stderr, 250 "genbidi error: bidi mirror does not roundtrip - %04lx->%04lx->?\n", 251 (long)mirrors[i][0], (long)mirrors[i][1]); 252 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 253 } 254 if(c==mirrors[j][0]) { 255 /* 256 * found the mirror code point c in the source column, 257 * set both entries' indexes to each other 258 */ 259 if(UBIDI_GET_MIRROR_CODE_POINT(mirrors[i][0])!=UBIDI_GET_MIRROR_CODE_POINT(mirrors[j][1])) { 260 /* roundtrip check fails */ 261 fprintf(stderr, 262 "genbidi error: bidi mirrors do not roundtrip - %04lx->%04lx->%04lx\n", 263 (long)mirrors[i][0], (long)mirrors[i][1], (long)mirrors[j][1]); 264 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 265 } else { 266 mirrors[i][1]|=(uint32_t)j<<UBIDI_MIRROR_INDEX_SHIFT; 267 mirrors[j][1]|=(uint32_t)i<<UBIDI_MIRROR_INDEX_SHIFT; 268 } 269 break; 270 } 271 } 272 } 273 274 /* now the second step, the actual reduction of the table */ 275 reducedMirror=mirrors[0]; 276 for(i=0; i<mirrorTop; ++i) { 277 reducedMirror[i]=mirrors[i][0]|(mirrors[i][1]&~0x1fffff); 278 } 279 280 if(U_FAILURE(errorCode)) { 281 exit(errorCode); 282 } 283 } 284 285 /* generate output data ----------------------------------------------------- */ 286 287 extern void 288 generateData(const char *dataDir, UBool csource) { 289 static int32_t indexes[UBIDI_IX_TOP]={ 290 UBIDI_IX_TOP 291 }; 292 static uint8_t trieBlock[40000]; 293 static uint8_t jgArray[0x300]; /* at most for U+0600..U+08FF */ 294 295 const uint32_t *row; 296 UChar32 start, end, prev, jgStart; 297 int32_t i; 298 299 UNewDataMemory *pData; 300 UNewTrie *pTrie; 301 UErrorCode errorCode=U_ZERO_ERROR; 302 int32_t trieSize; 303 long dataLength; 304 305 makeMirror(); 306 307 pTrie=utrie_open(NULL, NULL, 20000, 0, 0, TRUE); 308 if(pTrie==NULL) { 309 fprintf(stderr, "genbidi error: unable to create a UNewTrie\n"); 310 exit(U_MEMORY_ALLOCATION_ERROR); 311 } 312 313 prev=jgStart=0; 314 for(i=0; (row=upvec_getRow(pv, i, &start, &end))!=NULL && start<UPVEC_FIRST_SPECIAL_CP; ++i) { 315 /* store most values from vector column 0 in the trie */ 316 if(!utrie_setRange32(pTrie, start, end+1, *row, TRUE)) { 317 fprintf(stderr, "genbidi error: unable to set trie value (overflow)\n"); 318 exit(U_BUFFER_OVERFLOW_ERROR); 319 } 320 321 /* store Joining_Group values from vector column 1 in a simple byte array */ 322 if(row[1]!=0) { 323 if(start<0x600 || 0x8ff<end) { 324 fprintf(stderr, "genbidi error: Joining_Group for out-of-range code points U+%04lx..U+%04lx\n", 325 (long)start, (long)end); 326 exit(U_ILLEGAL_ARGUMENT_ERROR); 327 } 328 329 if(prev==0) { 330 /* first code point with any value */ 331 prev=jgStart=start; 332 } else { 333 /* add No_Joining_Group for code points between prev and start */ 334 while(prev<start) { 335 jgArray[prev++ -jgStart]=0; 336 } 337 } 338 339 /* set Joining_Group value for start..end */ 340 while(prev<=end) { 341 jgArray[prev++ -jgStart]=(uint8_t)row[1]; 342 } 343 } 344 } 345 346 /* finish jgArray, pad to multiple of 4 */ 347 while((prev-jgStart)&3) { 348 jgArray[prev++ -jgStart]=0; 349 } 350 indexes[UBIDI_IX_JG_START]=jgStart; 351 indexes[UBIDI_IX_JG_LIMIT]=prev; 352 353 trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode); 354 if(U_FAILURE(errorCode)) { 355 fprintf(stderr, "genbidi error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize); 356 exit(errorCode); 357 } 358 359 indexes[UBIDI_IX_TRIE_SIZE]=trieSize; 360 indexes[UBIDI_IX_MIRROR_LENGTH]=mirrorTop; 361 indexes[UBIDI_IX_LENGTH]= 362 (int32_t)sizeof(indexes)+ 363 trieSize+ 364 4*mirrorTop+ 365 (prev-jgStart); 366 367 if(beVerbose) { 368 printf("trie size in bytes: %5d\n", (int)trieSize); 369 printf("size in bytes of mirroring table: %5d\n", (int)(4*mirrorTop)); 370 printf("length of Joining_Group array: %5d (U+%04x..U+%04x)\n", (int)(prev-jgStart), (int)jgStart, (int)(prev-1)); 371 printf("data size: %5d\n", (int)indexes[UBIDI_IX_LENGTH]); 372 } 373 374 indexes[UBIDI_MAX_VALUES_INDEX]= 375 ((int32_t)U_CHAR_DIRECTION_COUNT-1)| 376 (((int32_t)U_JT_COUNT-1)<<UBIDI_JT_SHIFT)| 377 (((int32_t)U_JG_COUNT-1)<<UBIDI_MAX_JG_SHIFT); 378 379 if(csource) { 380 /* write .c file for hardcoded data */ 381 UTrie trie={ NULL }; 382 UTrie2 *trie2; 383 FILE *f; 384 385 utrie_unserialize(&trie, trieBlock, trieSize, &errorCode); 386 if(U_FAILURE(errorCode)) { 387 fprintf( 388 stderr, 389 "genbidi error: failed to utrie_unserialize(ubidi.icu trie) - %s\n", 390 u_errorName(errorCode)); 391 exit(errorCode); 392 } 393 394 /* use UTrie2 */ 395 dataInfo.formatVersion[0]=2; 396 dataInfo.formatVersion[2]=0; 397 dataInfo.formatVersion[3]=0; 398 trie2=utrie2_fromUTrie(&trie, 0, &errorCode); 399 if(U_FAILURE(errorCode)) { 400 fprintf( 401 stderr, 402 "genbidi error: utrie2_fromUTrie() failed - %s\n", 403 u_errorName(errorCode)); 404 exit(errorCode); 405 } 406 { 407 /* delete lead surrogate code unit values */ 408 UChar lead; 409 trie2=utrie2_cloneAsThawed(trie2, &errorCode); 410 for(lead=0xd800; lead<0xdc00; ++lead) { 411 utrie2_set32ForLeadSurrogateCodeUnit(trie2, lead, trie2->initialValue, &errorCode); 412 } 413 utrie2_freeze(trie2, UTRIE2_16_VALUE_BITS, &errorCode); 414 if(U_FAILURE(errorCode)) { 415 fprintf( 416 stderr, 417 "genbidi error: deleting lead surrogate code unit values failed - %s\n", 418 u_errorName(errorCode)); 419 exit(errorCode); 420 } 421 } 422 423 f=usrc_create(dataDir, "ubidi_props_data.c"); 424 if(f!=NULL) { 425 usrc_writeArray(f, 426 "static const UVersionInfo ubidi_props_dataVersion={", 427 dataInfo.dataVersion, 8, 4, 428 "};\n\n"); 429 usrc_writeArray(f, 430 "static const int32_t ubidi_props_indexes[UBIDI_IX_TOP]={", 431 indexes, 32, UBIDI_IX_TOP, 432 "};\n\n"); 433 usrc_writeUTrie2Arrays(f, 434 "static const uint16_t ubidi_props_trieIndex[%ld]={\n", NULL, 435 trie2, 436 "\n};\n\n"); 437 usrc_writeArray(f, 438 "static const uint32_t ubidi_props_mirrors[%ld]={\n", 439 mirrors, 32, mirrorTop, 440 "\n};\n\n"); 441 usrc_writeArray(f, 442 "static const uint8_t ubidi_props_jgArray[%ld]={\n", 443 jgArray, 8, prev-jgStart, 444 "\n};\n\n"); 445 fputs( 446 "static const UBiDiProps ubidi_props_singleton={\n" 447 " NULL,\n" 448 " ubidi_props_indexes,\n" 449 " ubidi_props_mirrors,\n" 450 " ubidi_props_jgArray,\n", 451 f); 452 usrc_writeUTrie2Struct(f, 453 " {\n", 454 trie2, "ubidi_props_trieIndex", NULL, 455 " },\n"); 456 usrc_writeArray(f, " { ", dataInfo.formatVersion, 8, 4, " }\n"); 457 fputs("};\n", f); 458 fclose(f); 459 } 460 utrie2_close(trie2); 461 } else { 462 /* write the data */ 463 pData=udata_create(dataDir, UBIDI_DATA_TYPE, UBIDI_DATA_NAME, &dataInfo, 464 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); 465 if(U_FAILURE(errorCode)) { 466 fprintf(stderr, "genbidi: unable to create data memory, %s\n", u_errorName(errorCode)); 467 exit(errorCode); 468 } 469 470 udata_writeBlock(pData, indexes, sizeof(indexes)); 471 udata_writeBlock(pData, trieBlock, trieSize); 472 udata_writeBlock(pData, mirrors, 4*mirrorTop); 473 udata_writeBlock(pData, jgArray, prev-jgStart); 474 475 /* finish up */ 476 dataLength=udata_finish(pData, &errorCode); 477 if(U_FAILURE(errorCode)) { 478 fprintf(stderr, "genbidi: error %d writing the output file\n", errorCode); 479 exit(errorCode); 480 } 481 482 if(dataLength!=indexes[UBIDI_IX_LENGTH]) { 483 fprintf(stderr, "genbidi: data length %ld != calculated size %d\n", 484 dataLength, (int)indexes[UBIDI_IX_LENGTH]); 485 exit(U_INTERNAL_PROGRAM_ERROR); 486 } 487 } 488 489 utrie_close(pTrie); 490 upvec_close(pv); 491 } 492 493 /* 494 * Hey, Emacs, please set the following: 495 * 496 * Local Variables: 497 * indent-tabs-mode: nil 498 * End: 499 * 500 */ 501