1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2004-2008, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: store.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2004aug28 14 * created by: Markus W. Scherer 15 * 16 * Store Unicode case mapping properties efficiently for 17 * random access. 18 */ 19 20 #include <stdio.h> 21 #include <stdlib.h> 22 #include "unicode/utypes.h" 23 #include "unicode/uchar.h" 24 #include "unicode/ustring.h" 25 #include "cmemory.h" 26 #include "cstring.h" 27 #include "filestrm.h" 28 #include "utrie.h" 29 #include "utrie2.h" 30 #include "uarrsort.h" 31 #include "unicode/udata.h" 32 #include "unewdata.h" 33 #include "propsvec.h" 34 #include "writesrc.h" 35 #include "gencase.h" 36 37 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) 38 39 /* Unicode case mapping properties file format --------------------------------- 40 41 The file format prepared and written here contains several data 42 structures that store indexes or data. 43 44 Before the data contents described below, there are the headers required by 45 the udata API for loading ICU data. Especially, a UDataInfo structure 46 precedes the actual data. It contains platform properties values and the 47 file format version. 48 49 The following is a description of format version 1.1 . 50 51 Format version 1.1 adds data for case closure. 52 53 The file contains the following structures: 54 55 const int32_t indexes[i0] with values i0, i1, ...: 56 (see UCASE_IX_... constants for names of indexes) 57 58 i0 indexLength; -- length of indexes[] (UCASE_IX_TOP) 59 i1 dataLength; -- length in bytes of the post-header data (incl. indexes[]) 60 i2 trieSize; -- size in bytes of the case mapping properties trie 61 i3 exceptionsLength; -- length in uint16_t of the exceptions array 62 i4 unfoldLength; -- length in uint16_t of the reverse-folding array (new in format version 1.1) 63 64 i5..i14 reservedIndexes; -- reserved values; 0 for now 65 66 i15 maxFullLength; -- maximum length of a full case mapping/folding string 67 68 69 Serialized trie, see utrie.h; 70 71 const uint16_t exceptions[exceptionsLength]; 72 73 const UChar unfold[unfoldLength]; 74 75 76 Trie data word: 77 Bits 78 if(exception) { 79 15..4 unsigned exception index 80 } else { 81 if(not uncased) { 82 15..6 signed delta to simple case mapping code point 83 (add delta to input code point) 84 } else { 85 6 the code point is case-ignorable 86 (U+0307 is also case-ignorable but has an exception) 87 } 88 5..4 0 normal character with cc=0 89 1 soft-dotted character 90 2 cc=230 91 3 other cc 92 } 93 3 exception 94 2 case sensitive 95 1..0 0 uncased 96 1 lowercase 97 2 uppercase 98 3 titlecase 99 100 101 Exceptions: 102 A sub-array of the exceptions array is indexed by the exception index in a 103 trie word. 104 The sub-array consists of the following fields: 105 uint16_t excWord; 106 uint16_t optional values []; 107 UTF-16 strings for full (string) mappings for lowercase, case folding, uppercase, titlecase 108 109 excWord: (see UCASE_EXC_...) 110 Bits 111 15 conditional case folding 112 14 conditional special casing 113 13..12 same as non-exception trie data bits 5..4 114 moved here because the exception index needs more bits than the delta 115 0 normal character with cc=0 116 1 soft-dotted character 117 2 cc=230 118 3 other cc 119 11.. 9 reserved 120 8 if set, then for each optional-value slot there are 2 uint16_t values 121 (high and low parts of 32-bit values) 122 instead of single ones 123 7.. 0 bits for which optional value is present 124 125 Optional-value slots: 126 0 lowercase mapping (code point) 127 1 case folding (code point) 128 2 uppercase mapping (code point) 129 3 titlecase mapping (code point) 130 4 reserved 131 5 reserved 132 6 closure mappings (new in format version 1.1) 133 7 there is at least one full (string) case mapping 134 the length of each is encoded in a nibble of this optional value, 135 and the strings follow this optional value in the same order: 136 lower/fold/upper/title 137 138 The optional closure mappings value is used as follows: 139 Bits 0..3 contain the length of a string of code points for case closure. 140 The string immediately follows the full case mappings, or the closure value 141 slot if there are no full case mappings. 142 Bits 4..15 are reserved and could be used in the future to indicate the 143 number of strings for case closure. 144 Complete case closure for a code point is given by the union of all simple 145 and full case mappings and foldings, plus the case closure code points 146 (and potentially, in the future, case closure strings). 147 148 For space saving, some values are not stored. Lookups are as follows: 149 - If special casing is conditional, then no full lower/upper/title mapping 150 strings are stored. 151 - If case folding is conditional, then no simple or full case foldings are 152 stored. 153 - Fall back in this order: 154 full (string) mapping -- if full mappings are used 155 simple (code point) mapping of the same type 156 simple fold->simple lower 157 simple title->simple upper 158 finally, the original code point (no mapping) 159 160 This fallback order is strict: 161 In particular, the fallback from full case folding is to simple case folding, 162 not to full lowercase mapping. 163 164 Reverse case folding data ("unfold") array: (new in format version 1.1) 165 166 This array stores some miscellaneous values followed by a table. The data maps 167 back from multi-character strings to their original code points, for use 168 in case closure. 169 170 The table contains two columns of strings. 171 The string in the first column is the case folding of each of the code points 172 in the second column. The strings are terminated with NUL or by the end of the 173 column, whichever comes first. 174 175 The miscellaneous data takes up one pseudo-row and includes: 176 - number of rows 177 - number of UChars per row 178 - number of UChars in the left (folding string) column 179 180 The table is sorted by its first column. Values in the first column are unique. 181 182 ----------------------------------------------------------------------------- */ 183 184 /* UDataInfo cf. udata.h */ 185 static UDataInfo dataInfo={ 186 sizeof(UDataInfo), 187 0, 188 189 U_IS_BIG_ENDIAN, 190 U_CHARSET_FAMILY, 191 U_SIZEOF_UCHAR, 192 0, 193 194 /* dataFormat="cAsE" */ 195 { UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 }, 196 { 1, 1, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ 197 { 4, 0, 1, 0 } /* dataVersion */ 198 }; 199 200 enum { 201 /* maximum number of exceptions expected */ 202 MAX_EXC_COUNT=1000 203 }; 204 205 /* exceptions values */ 206 static uint16_t exceptions[UCASE_MAX_EXCEPTIONS+100]; 207 static uint16_t exceptionsTop=0; 208 static Props excProps[MAX_EXC_COUNT]; 209 static uint16_t exceptionsCount=0; 210 211 /* becomes indexes[UCASE_IX_MAX_FULL_LENGTH] */ 212 static int32_t maxFullLength=U16_MAX_LENGTH; 213 214 /* reverse case folding ("unfold") data */ 215 static UChar unfold[UGENCASE_UNFOLD_MAX_ROWS*UGENCASE_UNFOLD_WIDTH]={ 216 0, UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH, 0, 0 217 }; 218 static uint16_t unfoldRows=0; 219 static uint16_t unfoldTop=UGENCASE_UNFOLD_WIDTH; 220 221 /* Unicode versions --------------------------------------------------------- */ 222 223 static const UVersionInfo 224 unicodeVersions[]={ 225 { 1, 0, 0, 0 }, 226 { 1, 1, 0, 0 }, 227 { 2, 0, 0, 0 }, 228 { 3, 0, 0, 0 }, 229 { 3, 1, 0, 0 }, 230 { 3, 2, 0, 0 }, 231 { 4, 0, 0, 0 }, 232 { 4, 0, 1, 0 }, 233 { 4, 1, 0, 0 } 234 }; 235 236 int32_t ucdVersion=UNI_4_1; 237 238 static int32_t 239 findUnicodeVersion(const UVersionInfo version) { 240 int32_t i; 241 242 for(i=0; /* while(version>unicodeVersions[i]) {} */ 243 i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)>0; 244 ++i) {} 245 if(0<i && i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)<0) { 246 --i; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */ 247 } 248 return i; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */ 249 } 250 251 extern void 252 setUnicodeVersion(const char *v) { 253 UVersionInfo version; 254 u_versionFromString(version, v); 255 uprv_memcpy(dataInfo.dataVersion, version, 4); 256 ucdVersion=findUnicodeVersion(version); 257 } 258 259 /* -------------------------------------------------------------------------- */ 260 261 static void 262 addUnfolding(UChar32 c, const UChar *s, int32_t length) { 263 int32_t i; 264 265 if(length>UGENCASE_UNFOLD_STRING_WIDTH) { 266 fprintf(stderr, "gencase error: case folding too long (length=%ld>%d=UGENCASE_UNFOLD_STRING_WIDTH)\n", 267 (long)length, UGENCASE_UNFOLD_STRING_WIDTH); 268 exit(U_INTERNAL_PROGRAM_ERROR); 269 } 270 if(unfoldTop >= (LENGTHOF(unfold) - UGENCASE_UNFOLD_STRING_WIDTH)) { 271 fprintf(stderr, "gencase error: too many multi-character case foldings\n"); 272 exit(U_BUFFER_OVERFLOW_ERROR); 273 } 274 u_memset(unfold+unfoldTop, 0, UGENCASE_UNFOLD_WIDTH); 275 u_memcpy(unfold+unfoldTop, s, length); 276 277 i=unfoldTop+UGENCASE_UNFOLD_STRING_WIDTH; 278 U16_APPEND_UNSAFE(unfold, i, c); 279 280 ++unfoldRows; 281 unfoldTop+=UGENCASE_UNFOLD_WIDTH; 282 } 283 284 /* store a character's properties ------------------------------------------- */ 285 286 extern void 287 setProps(Props *p) { 288 UErrorCode errorCode; 289 uint32_t value, oldValue; 290 int32_t delta; 291 UBool isCaseIgnorable; 292 293 /* get the non-UnicodeData.txt properties */ 294 value=oldValue=upvec_getValue(pv, p->code, 0); 295 296 /* default: map to self */ 297 delta=0; 298 299 if(p->gc==U_TITLECASE_LETTER) { 300 /* the Titlecase property is read late, from UnicodeData.txt */ 301 value|=UCASE_TITLE; 302 } 303 304 if(p->upperCase!=0) { 305 /* uppercase mapping as delta if the character is lowercase */ 306 if((value&UCASE_TYPE_MASK)==UCASE_LOWER) { 307 delta=p->upperCase-p->code; 308 } else { 309 value|=UCASE_EXCEPTION; 310 } 311 } 312 if(p->lowerCase!=0) { 313 /* lowercase mapping as delta if the character is uppercase or titlecase */ 314 if((value&UCASE_TYPE_MASK)>=UCASE_UPPER) { 315 delta=p->lowerCase-p->code; 316 } else { 317 value|=UCASE_EXCEPTION; 318 } 319 } 320 if(p->upperCase!=p->titleCase) { 321 value|=UCASE_EXCEPTION; 322 } 323 if(p->closure[0]!=0) { 324 value|=UCASE_EXCEPTION; 325 } 326 if(p->specialCasing!=NULL) { 327 value|=UCASE_EXCEPTION; 328 } 329 if(p->caseFolding!=NULL) { 330 value|=UCASE_EXCEPTION; 331 } 332 333 if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) { 334 value|=UCASE_EXCEPTION; 335 } 336 337 if(p->cc!=0) { 338 if(value&UCASE_DOT_MASK) { 339 fprintf(stderr, "gencase: a soft-dotted character has cc!=0\n"); 340 exit(U_INTERNAL_PROGRAM_ERROR); 341 } 342 if(p->cc==230) { 343 value|=UCASE_ABOVE; 344 } else { 345 value|=UCASE_OTHER_ACCENT; 346 } 347 } 348 349 /* encode case-ignorable as delta==1 on uncased characters */ 350 isCaseIgnorable=FALSE; 351 if((value&UCASE_TYPE_MASK)==UCASE_NONE) { 352 if(ucdVersion>=UNI_4_1) { 353 /* 354 * Unicode 4.1 and up: (D47a) Word_Break=MidLetter or Mn, Me, Cf, Lm, Sk 355 * Unicode 5.1 and up: Word_Break=(MidLetter or MidNumLet) or Mn, Me, Cf, Lm, Sk 356 * The UGENCASE_IS_MID_LETTER_SHIFT bit is set for both WB=MidLetter and WB=MidNumLet. 357 */ 358 if( 359 (U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 || 360 (upvec_getValue(pv, p->code, 1)&U_MASK(UGENCASE_IS_MID_LETTER_SHIFT))!=0 361 ) { 362 isCaseIgnorable=TRUE; 363 } 364 } else { 365 /* before Unicode 4.1: Mn, Me, Cf, Lm, Sk or 0027 or 00AD or 2019 */ 366 if( 367 (U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 || 368 p->code==0x27 || p->code==0xad || p->code==0x2019 369 ) { 370 isCaseIgnorable=TRUE; 371 } 372 } 373 } 374 375 if(isCaseIgnorable && p->code!=0x307) { 376 /* 377 * We use one of the delta/exception bits, which works because we only 378 * store the case-ignorable flag for uncased characters. 379 * There is no delta for uncased characters (see checks above). 380 * If there is an exception for an uncased, case-ignorable character 381 * (although there should not be any case mappings if it's uncased) 382 * then we have a problem. 383 * There is one character which is case-ignorable but has an exception: 384 * U+0307 is uncased, Mn, has conditional special casing and 385 * is therefore handled in code instead. 386 */ 387 if(value&UCASE_EXCEPTION) { 388 fprintf(stderr, "gencase error: unable to encode case-ignorable for U+%04lx with exceptions\n", 389 (unsigned long)p->code); 390 exit(U_INTERNAL_PROGRAM_ERROR); 391 } 392 393 delta=1; 394 } 395 396 /* handle exceptions */ 397 if(value&UCASE_EXCEPTION) { 398 /* simply store exceptions for later processing and encoding */ 399 value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT; 400 uprv_memcpy(excProps+exceptionsCount, p, sizeof(*p)); 401 if(++exceptionsCount==MAX_EXC_COUNT) { 402 fprintf(stderr, "gencase: too many exceptions\n"); 403 exit(U_INDEX_OUTOFBOUNDS_ERROR); 404 } 405 } else { 406 /* store the simple case mapping delta */ 407 value|=((uint32_t)delta<<UCASE_DELTA_SHIFT)&UCASE_DELTA_MASK; 408 } 409 410 errorCode=U_ZERO_ERROR; 411 if(value!=oldValue) { 412 upvec_setValue(pv, p->code, p->code, 0, value, 0xffffffff, &errorCode); 413 if(U_FAILURE(errorCode)) { 414 fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n", 415 u_errorName(errorCode)); 416 exit(errorCode); 417 } 418 } 419 420 /* add the multi-character case folding to the "unfold" data */ 421 if(p->caseFolding!=NULL) { 422 int32_t length=p->caseFolding->full[0]; 423 if(length>1 && u_strHasMoreChar32Than(p->caseFolding->full+1, length, 1)) { 424 addUnfolding(p->code, p->caseFolding->full+1, length); 425 } 426 } 427 } 428 429 extern void 430 addCaseSensitive(UChar32 first, UChar32 last) { 431 UErrorCode errorCode=U_ZERO_ERROR; 432 upvec_setValue(pv, first, last, 0, UCASE_SENSITIVE, UCASE_SENSITIVE, &errorCode); 433 if(U_FAILURE(errorCode)) { 434 fprintf(stderr, "gencase error: unable to set UCASE_SENSITIVE, code: %s\n", 435 u_errorName(errorCode)); 436 exit(errorCode); 437 } 438 } 439 440 /* finalize reverse case folding ("unfold") data ---------------------------- */ 441 442 static int32_t U_CALLCONV 443 compareUnfold(const void *context, const void *left, const void *right) { 444 return u_memcmp((const UChar *)left, (const UChar *)right, UGENCASE_UNFOLD_WIDTH); 445 } 446 447 static void 448 makeUnfoldData() { 449 static const UChar 450 iDot[2]= { 0x69, 0x307 }; 451 452 UChar *p, *q; 453 int32_t i, j, k; 454 UErrorCode errorCode; 455 456 /* 457 * add a case folding that we missed because it's conditional: 458 * 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE 459 */ 460 addUnfolding(0x130, iDot, 2); 461 462 /* sort the data */ 463 errorCode=U_ZERO_ERROR; 464 uprv_sortArray(unfold+UGENCASE_UNFOLD_WIDTH, unfoldRows, UGENCASE_UNFOLD_WIDTH*2, 465 compareUnfold, NULL, FALSE, &errorCode); 466 467 /* make unique-string rows by merging adjacent ones' code point columns */ 468 469 /* make p point to row i-1 */ 470 p=(UChar *)unfold+UGENCASE_UNFOLD_WIDTH; 471 472 for(i=1; i<unfoldRows;) { 473 if(0==u_memcmp(p, p+UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH)) { 474 /* concatenate code point columns */ 475 q=p+UGENCASE_UNFOLD_STRING_WIDTH; 476 for(j=1; j<UGENCASE_UNFOLD_CP_WIDTH && q[j]!=0; ++j) {} 477 for(k=0; k<UGENCASE_UNFOLD_CP_WIDTH && q[UGENCASE_UNFOLD_WIDTH+k]!=0; ++j, ++k) { 478 q[j]=q[UGENCASE_UNFOLD_WIDTH+k]; 479 } 480 if(j>UGENCASE_UNFOLD_CP_WIDTH) { 481 fprintf(stderr, "gencase error: too many code points in unfold[]: %ld>%d=UGENCASE_UNFOLD_CP_WIDTH\n", 482 (long)j, UGENCASE_UNFOLD_CP_WIDTH); 483 exit(U_BUFFER_OVERFLOW_ERROR); 484 } 485 486 /* move following rows up one */ 487 --unfoldRows; 488 unfoldTop-=UGENCASE_UNFOLD_WIDTH; 489 u_memmove(p+UGENCASE_UNFOLD_WIDTH, p+UGENCASE_UNFOLD_WIDTH*2, (unfoldRows-i)*UGENCASE_UNFOLD_WIDTH); 490 } else { 491 p+=UGENCASE_UNFOLD_WIDTH; 492 ++i; 493 } 494 } 495 496 unfold[UCASE_UNFOLD_ROWS]=(UChar)unfoldRows; 497 498 if(beVerbose) { 499 puts("unfold data:"); 500 501 p=(UChar *)unfold; 502 for(i=0; i<unfoldRows; ++i) { 503 p+=UGENCASE_UNFOLD_WIDTH; 504 printf("[%2d] %04x %04x %04x <- %04x %04x\n", 505 (int)i, p[0], p[1], p[2], p[3], p[4]); 506 } 507 } 508 } 509 510 /* case closure ------------------------------------------------------------- */ 511 512 static void 513 addClosureMapping(UChar32 src, UChar32 dest) { 514 uint32_t value; 515 516 if(beVerbose) { 517 printf("add closure mapping U+%04lx->U+%04lx\n", 518 (unsigned long)src, (unsigned long)dest); 519 } 520 521 value=upvec_getValue(pv, src, 0); 522 if(value&UCASE_EXCEPTION) { 523 Props *p=excProps+(value>>UGENCASE_EXC_SHIFT); 524 int32_t i; 525 526 /* append dest to src's closure array */ 527 for(i=0;; ++i) { 528 if(i==LENGTHOF(p->closure)) { 529 fprintf(stderr, "closure[] overflow for U+%04lx->U+%04lx\n", 530 (unsigned long)src, (unsigned long)dest); 531 exit(U_BUFFER_OVERFLOW_ERROR); 532 } else if(p->closure[i]==dest) { 533 break; /* do not store duplicates */ 534 } else if(p->closure[i]==0) { 535 p->closure[i]=dest; 536 break; 537 } 538 } 539 } else { 540 Props p2={ 0 }; 541 UChar32 next; 542 UErrorCode errorCode; 543 544 /* 545 * decode value into p2 (enough for makeException() to work properly), 546 * add the closure mapping, 547 * and set the new exception for src 548 */ 549 p2.code=src; 550 p2.closure[0]=dest; 551 552 if((value&UCASE_TYPE_MASK)>UCASE_NONE) { 553 /* one simple case mapping, don't care which one */ 554 next=src+((int16_t)value>>UCASE_DELTA_SHIFT); 555 if(next!=src) { 556 if((value&UCASE_TYPE_MASK)==UCASE_LOWER) { 557 p2.upperCase=p2.titleCase=next; 558 } else { 559 p2.lowerCase=next; 560 } 561 } 562 } else if(value&UCASE_DELTA_MASK) { 563 fprintf(stderr, "gencase error: unable to add case closure exception to case-ignorable U+%04lx\n", 564 (unsigned long)src); 565 exit(U_INTERNAL_PROGRAM_ERROR); 566 } 567 568 value&=~(UGENCASE_EXC_MASK|UCASE_DELTA_MASK); /* remove previous simple mapping */ 569 value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT; 570 value|=UCASE_EXCEPTION; 571 uprv_memcpy(excProps+exceptionsCount, &p2, sizeof(p2)); 572 if(++exceptionsCount==MAX_EXC_COUNT) { 573 fprintf(stderr, "gencase: too many exceptions\n"); 574 exit(U_INDEX_OUTOFBOUNDS_ERROR); 575 } 576 577 errorCode=U_ZERO_ERROR; 578 upvec_setValue(pv, src, src, 0, value, 0xffffffff, &errorCode); 579 if(U_FAILURE(errorCode)) { 580 fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n", 581 u_errorName(errorCode)); 582 exit(errorCode); 583 } 584 } 585 } 586 587 /* 588 * Find missing case mapping relationships and add mappings for case closure. 589 * This function starts from an "original" code point and recursively 590 * finds its case mappings and the case mappings of where it maps to. 591 * 592 * The recursion depth is capped at 3 nested calls of this function. 593 * In each call, the current code point is c, and the function enumerates 594 * all of c's simple (single-code point) case mappings. 595 * prev is the code point that case-mapped to c. 596 * prev2 is the code point that case-mapped to prev. 597 * 598 * The initial function call has prev2<0, prev<0, and c==orig 599 * (marking no code points). 600 * It enumerates c's case mappings and recurses without further action. 601 * 602 * The second-level function call has prev2<0, prev==orig, and c is 603 * the destination code point of one of prev's case mappings. 604 * The function checks if any of c's case mappings go back to orig 605 * and adds a closure mapping if not. 606 * In other words, it turns a case mapping relationship of 607 * orig->c 608 * into 609 * orig<->c 610 * 611 * The third-level function call has prev2==orig, prev>=0, and c is 612 * the destination code point of one of prev's case mappings. 613 * (And prev is the destination of one of prev2's case mappings.) 614 * The function checks if any of c's case mappings go back to orig 615 * and adds a closure mapping if not. 616 * In other words, it turns case mapping relationships of 617 * orig->prev->c or orig->prev<->c 618 * into 619 * orig->prev->c->orig or orig->prev<->c->orig 620 * etc. 621 * (Graphically, this closes a triangle.) 622 * 623 * With repeated application on all code points until no more closure mappings 624 * are added, all case equivalence groups get complete mappings. 625 * That is, in each group of code points with case relationships 626 * each code point will in the end have some mapping to each other 627 * code point in the group. 628 * 629 * @return TRUE if a closure mapping was added 630 */ 631 static UBool 632 addClosure(UChar32 orig, UChar32 prev2, UChar32 prev, UChar32 c, uint32_t value) { 633 UChar32 next; 634 UBool someMappingsAdded=FALSE; 635 636 if(c!=orig) { 637 /* get the properties for c */ 638 value=upvec_getValue(pv, c, 0); 639 } 640 /* else if c==orig then c's value was passed in */ 641 642 if(value&UCASE_EXCEPTION) { 643 UChar32 set[32]; 644 int32_t i, count=0; 645 646 Props *p=excProps+(value>>UGENCASE_EXC_SHIFT); 647 648 /* 649 * marker for whether any of c's mappings goes to orig 650 * c==orig: prevent adding a closure mapping when getting orig's own, direct mappings 651 */ 652 UBool mapsToOrig=(UBool)(c==orig); 653 654 /* collect c's case mapping destinations in set[] */ 655 if((next=p->upperCase)!=0 && next!=c) { 656 set[count++]=next; 657 } 658 if((next=p->lowerCase)!=0 && next!=c) { 659 set[count++]=next; 660 } 661 if(p->upperCase!=(next=p->titleCase) && next!=c) { 662 set[count++]=next; 663 } 664 if(p->caseFolding!=NULL && (next=p->caseFolding->simple)!=0 && next!=c) { 665 set[count++]=next; 666 } 667 668 /* append c's current closure mappings to set[] */ 669 for(i=0; i<LENGTHOF(p->closure) && (next=p->closure[i])!=0; ++i) { 670 set[count++]=next; 671 } 672 673 /* process all code points to which c case-maps */ 674 for(i=0; i<count; ++i) { 675 next=set[i]; /* next!=c */ 676 677 if(next==orig) { 678 mapsToOrig=TRUE; /* remember that we map to orig */ 679 } else if(prev2<0 && next!=prev) { 680 /* 681 * recurse unless 682 * we have reached maximum depth (prev2>=0) or 683 * this is a mapping to one of the previous code points (orig, prev, c) 684 */ 685 someMappingsAdded|=addClosure(orig, prev, c, next, 0); 686 } 687 } 688 689 if(!mapsToOrig) { 690 addClosureMapping(c, orig); 691 return TRUE; 692 } 693 } else { 694 if((value&UCASE_TYPE_MASK)>UCASE_NONE) { 695 /* one simple case mapping, don't care which one */ 696 next=c+((int16_t)value>>UCASE_DELTA_SHIFT); 697 if(next!=c) { 698 /* 699 * recurse unless 700 * we have reached maximum depth (prev2>=0) or 701 * this is a mapping to one of the previous code points (orig, prev, c) 702 */ 703 if(prev2<0 && next!=orig && next!=prev) { 704 someMappingsAdded|=addClosure(orig, prev, c, next, 0); 705 } 706 707 if(c!=orig && next!=orig) { 708 /* c does not map to orig, add a closure mapping c->orig */ 709 addClosureMapping(c, orig); 710 return TRUE; 711 } 712 } 713 } 714 } 715 716 return someMappingsAdded; 717 } 718 719 extern void 720 makeCaseClosure() { 721 UChar *p; 722 uint32_t *row; 723 uint32_t value; 724 UChar32 start, end, c, c2; 725 int32_t i, j; 726 UBool someMappingsAdded; 727 728 /* 729 * finalize the "unfold" data because we need to use it to add closure mappings 730 * for situations like FB05->"st"<-FB06 731 * where we would otherwise miss the FB05<->FB06 relationship 732 */ 733 makeUnfoldData(); 734 735 /* use the "unfold" data to add mappings */ 736 737 /* p always points to the code points; this loop ignores the strings completely */ 738 p=unfold+UGENCASE_UNFOLD_WIDTH+UGENCASE_UNFOLD_STRING_WIDTH; 739 740 for(i=0; i<unfoldRows; p+=UGENCASE_UNFOLD_WIDTH, ++i) { 741 j=0; 742 U16_NEXT_UNSAFE(p, j, c); 743 while(j<UGENCASE_UNFOLD_CP_WIDTH && p[j]!=0) { 744 U16_NEXT_UNSAFE(p, j, c2); 745 addClosure(c, U_SENTINEL, c, c2, 0); 746 } 747 } 748 749 if(beVerbose) { 750 puts("---- ---- ---- ---- (done with closures from unfolding)"); 751 } 752 753 /* add further closure mappings from analyzing simple mappings */ 754 do { 755 someMappingsAdded=FALSE; 756 757 i=0; 758 while((row=upvec_getRow(pv, i, &start, &end))!=NULL && start<UPVEC_FIRST_SPECIAL_CP) { 759 value=*row; 760 if(value!=0) { 761 while(start<=end) { 762 if(addClosure(start, U_SENTINEL, U_SENTINEL, start, value)) { 763 someMappingsAdded=TRUE; 764 765 /* 766 * stop this loop because pv was changed and row is not valid any more 767 * skip all rows below the current start 768 */ 769 while((row=upvec_getRow(pv, i, NULL, &end))!=NULL && start>end) { 770 ++i; 771 } 772 row=NULL; /* signal to continue with outer loop, without further ++i */ 773 break; 774 } 775 ++start; 776 } 777 if(row==NULL) { 778 continue; /* see row=NULL above */ 779 } 780 } 781 ++i; 782 } 783 784 if(beVerbose && someMappingsAdded) { 785 puts("---- ---- ---- ----"); 786 } 787 } while(someMappingsAdded); 788 } 789 790 /* exceptions --------------------------------------------------------------- */ 791 792 /* get the string length from zero-terminated code points in a limited-length array */ 793 static int32_t 794 getLengthOfCodePoints(const UChar32 *s, int32_t maxLength) { 795 int32_t i, length; 796 797 for(i=length=0; i<maxLength && s[i]!=0; ++i) { 798 length+=U16_LENGTH(s[i]); 799 } 800 return length; 801 } 802 803 static UBool 804 fullMappingEqualsSimple(const UChar *s, UChar32 simple, UChar32 c) { 805 int32_t i, length; 806 UChar32 full; 807 808 length=*s++; 809 if(length==0 || length>U16_MAX_LENGTH) { 810 return FALSE; 811 } 812 i=0; 813 U16_NEXT(s, i, length, full); 814 815 if(simple==0) { 816 simple=c; /* UCD has no simple mapping if it's the same as the code point itself */ 817 } 818 return (UBool)(i==length && full==simple); 819 } 820 821 static uint16_t 822 makeException(uint32_t value, Props *p) { 823 uint32_t slots[8]; 824 uint32_t slotBits; 825 uint16_t excWord, excIndex, excTop, i, count, length, fullLengths; 826 UBool doubleSlots; 827 828 /* excIndex will be returned for storing in the trie word */ 829 excIndex=exceptionsTop; 830 if(excIndex>=UCASE_MAX_EXCEPTIONS) { 831 fprintf(stderr, "gencase error: too many exceptions words\n"); 832 exit(U_BUFFER_OVERFLOW_ERROR); 833 } 834 835 excTop=excIndex+1; /* +1 for excWord which will be stored at excIndex */ 836 837 /* copy and shift the soft-dotted bits */ 838 excWord=((uint16_t)value&UCASE_DOT_MASK)<<UCASE_EXC_DOT_SHIFT; 839 840 /* update maxFullLength */ 841 if(p->specialCasing!=NULL) { 842 length=p->specialCasing->lowerCase[0]; 843 if(length>maxFullLength) { 844 maxFullLength=length; 845 } 846 length=p->specialCasing->upperCase[0]; 847 if(length>maxFullLength) { 848 maxFullLength=length; 849 } 850 length=p->specialCasing->titleCase[0]; 851 if(length>maxFullLength) { 852 maxFullLength=length; 853 } 854 } 855 if(p->caseFolding!=NULL) { 856 length=p->caseFolding->full[0]; 857 if(length>maxFullLength) { 858 maxFullLength=length; 859 } 860 } 861 862 /* set the bits for conditional mappings */ 863 if(p->specialCasing!=NULL && p->specialCasing->isComplex) { 864 excWord|=UCASE_EXC_CONDITIONAL_SPECIAL; 865 p->specialCasing=NULL; 866 } 867 if(p->caseFolding!=NULL && p->caseFolding->simple==0 && p->caseFolding->full[0]==0) { 868 excWord|=UCASE_EXC_CONDITIONAL_FOLD; 869 p->caseFolding=NULL; 870 } 871 872 /* 873 * Note: 874 * UCD stores no simple mappings when they are the same as the code point itself. 875 * SpecialCasing and CaseFolding do store simple mappings even if they are 876 * the same as the code point itself. 877 * Comparisons between simple regular mappings and simple special/folding 878 * mappings need to compensate for the difference by comparing with the 879 * original code point if a simple UCD mapping is missing (0). 880 */ 881 882 /* remove redundant data */ 883 if(p->specialCasing!=NULL) { 884 /* do not store full mappings if they are the same as the simple ones */ 885 if(fullMappingEqualsSimple(p->specialCasing->lowerCase, p->lowerCase, p->code)) { 886 p->specialCasing->lowerCase[0]=0; 887 } 888 if(fullMappingEqualsSimple(p->specialCasing->upperCase, p->upperCase, p->code)) { 889 p->specialCasing->upperCase[0]=0; 890 } 891 if(fullMappingEqualsSimple(p->specialCasing->titleCase, p->titleCase, p->code)) { 892 p->specialCasing->titleCase[0]=0; 893 } 894 } 895 if( p->caseFolding!=NULL && 896 fullMappingEqualsSimple(p->caseFolding->full, p->caseFolding->simple, p->code) 897 ) { 898 p->caseFolding->full[0]=0; 899 } 900 901 /* write the optional slots */ 902 slotBits=0; 903 count=0; 904 905 if(p->lowerCase!=0) { 906 slots[count]=(uint32_t)p->lowerCase; 907 slotBits|=slots[count]; 908 ++count; 909 excWord|=U_MASK(UCASE_EXC_LOWER); 910 } 911 if( p->caseFolding!=NULL && 912 p->caseFolding->simple!=0 && 913 (p->lowerCase!=0 ? 914 p->caseFolding->simple!=p->lowerCase : 915 p->caseFolding->simple!=p->code) 916 ) { 917 slots[count]=(uint32_t)p->caseFolding->simple; 918 slotBits|=slots[count]; 919 ++count; 920 excWord|=U_MASK(UCASE_EXC_FOLD); 921 } 922 if(p->upperCase!=0) { 923 slots[count]=(uint32_t)p->upperCase; 924 slotBits|=slots[count]; 925 ++count; 926 excWord|=U_MASK(UCASE_EXC_UPPER); 927 } 928 if(p->upperCase!=p->titleCase) { 929 if(p->titleCase!=0) { 930 slots[count]=(uint32_t)p->titleCase; 931 } else { 932 slots[count]=(uint32_t)p->code; 933 } 934 slotBits|=slots[count]; 935 ++count; 936 excWord|=U_MASK(UCASE_EXC_TITLE); 937 } 938 939 /* length of case closure */ 940 if(p->closure[0]!=0) { 941 length=getLengthOfCodePoints(p->closure, LENGTHOF(p->closure)); 942 slots[count]=(uint32_t)length; /* must be 1..UCASE_CLOSURE_MAX_LENGTH */ 943 slotBits|=slots[count]; 944 ++count; 945 excWord|=U_MASK(UCASE_EXC_CLOSURE); 946 } 947 948 /* lengths of full case mapping strings, stored in the last slot */ 949 fullLengths=0; 950 if(p->specialCasing!=NULL) { 951 fullLengths=p->specialCasing->lowerCase[0]; 952 fullLengths|=p->specialCasing->upperCase[0]<<8; 953 fullLengths|=p->specialCasing->titleCase[0]<<12; 954 } 955 if(p->caseFolding!=NULL) { 956 fullLengths|=p->caseFolding->full[0]<<4; 957 } 958 if(fullLengths!=0) { 959 slots[count]=fullLengths; 960 slotBits|=slots[count]; 961 ++count; 962 excWord|=U_MASK(UCASE_EXC_FULL_MAPPINGS); 963 } 964 965 /* write slots */ 966 doubleSlots=(UBool)(slotBits>0xffff); 967 if(!doubleSlots) { 968 for(i=0; i<count; ++i) { 969 exceptions[excTop++]=(uint16_t)slots[i]; 970 } 971 } else { 972 excWord|=UCASE_EXC_DOUBLE_SLOTS; 973 for(i=0; i<count; ++i) { 974 exceptions[excTop++]=(uint16_t)(slots[i]>>16); 975 exceptions[excTop++]=(uint16_t)slots[i]; 976 } 977 } 978 979 /* write the full case mapping strings */ 980 if(p->specialCasing!=NULL) { 981 length=(uint16_t)p->specialCasing->lowerCase[0]; 982 u_memcpy((UChar *)exceptions+excTop, p->specialCasing->lowerCase+1, length); 983 excTop+=length; 984 } 985 if(p->caseFolding!=NULL) { 986 length=(uint16_t)p->caseFolding->full[0]; 987 u_memcpy((UChar *)exceptions+excTop, p->caseFolding->full+1, length); 988 excTop+=length; 989 } 990 if(p->specialCasing!=NULL) { 991 length=(uint16_t)p->specialCasing->upperCase[0]; 992 u_memcpy((UChar *)exceptions+excTop, p->specialCasing->upperCase+1, length); 993 excTop+=length; 994 995 length=(uint16_t)p->specialCasing->titleCase[0]; 996 u_memcpy((UChar *)exceptions+excTop, p->specialCasing->titleCase+1, length); 997 excTop+=length; 998 } 999 1000 /* write the closure data */ 1001 if(p->closure[0]!=0) { 1002 UChar32 c; 1003 1004 for(i=0; i<LENGTHOF(p->closure) && (c=p->closure[i])!=0; ++i) { 1005 U16_APPEND_UNSAFE((UChar *)exceptions, excTop, c); 1006 } 1007 } 1008 1009 exceptionsTop=excTop; 1010 1011 /* write the main exceptions word */ 1012 exceptions[excIndex]=excWord; 1013 1014 return excIndex; 1015 } 1016 1017 extern void 1018 makeExceptions() { 1019 uint32_t *row; 1020 uint32_t value; 1021 int32_t i; 1022 uint16_t excIndex; 1023 1024 i=0; 1025 while((row=upvec_getRow(pv, i, NULL, NULL))!=NULL) { 1026 value=*row; 1027 if(value&UCASE_EXCEPTION) { 1028 excIndex=makeException(value, excProps+(value>>UGENCASE_EXC_SHIFT)); 1029 *row=(value&~(UGENCASE_EXC_MASK|UCASE_EXC_MASK))|(excIndex<<UCASE_EXC_SHIFT); 1030 } 1031 ++i; 1032 } 1033 } 1034 1035 /* generate output data ----------------------------------------------------- */ 1036 1037 extern void 1038 generateData(const char *dataDir, UBool csource) { 1039 static int32_t indexes[UCASE_IX_TOP]={ 1040 UCASE_IX_TOP 1041 }; 1042 static uint8_t trieBlock[40000]; 1043 1044 const uint32_t *row; 1045 UChar32 start, end; 1046 int32_t i; 1047 1048 UNewDataMemory *pData; 1049 UNewTrie *pTrie; 1050 UErrorCode errorCode=U_ZERO_ERROR; 1051 int32_t trieSize; 1052 long dataLength; 1053 1054 pTrie=utrie_open(NULL, NULL, 20000, 0, 0, TRUE); 1055 if(pTrie==NULL) { 1056 fprintf(stderr, "gencase error: unable to create a UNewTrie\n"); 1057 exit(U_MEMORY_ALLOCATION_ERROR); 1058 } 1059 1060 for(i=0; (row=upvec_getRow(pv, i, &start, &end))!=NULL; ++i) { 1061 if(start<UPVEC_FIRST_SPECIAL_CP && !utrie_setRange32(pTrie, start, end+1, *row, TRUE)) { 1062 fprintf(stderr, "gencase error: unable to set trie value (overflow)\n"); 1063 exit(U_BUFFER_OVERFLOW_ERROR); 1064 } 1065 } 1066 1067 trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode); 1068 if(U_FAILURE(errorCode)) { 1069 fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize); 1070 exit(errorCode); 1071 } 1072 1073 indexes[UCASE_IX_EXC_LENGTH]=exceptionsTop; 1074 indexes[UCASE_IX_TRIE_SIZE]=trieSize; 1075 indexes[UCASE_IX_UNFOLD_LENGTH]=unfoldTop; 1076 indexes[UCASE_IX_LENGTH]=(int32_t)sizeof(indexes)+trieSize+2*exceptionsTop+2*unfoldTop; 1077 1078 indexes[UCASE_IX_MAX_FULL_LENGTH]=maxFullLength; 1079 1080 if(beVerbose) { 1081 printf("trie size in bytes: %5d\n", (int)trieSize); 1082 printf("number of code points with exceptions: %5d\n", exceptionsCount); 1083 printf("size in bytes of exceptions: %5d\n", 2*exceptionsTop); 1084 printf("size in bytes of reverse foldings: %5d\n", 2*unfoldTop); 1085 printf("data size: %5d\n", (int)indexes[UCASE_IX_LENGTH]); 1086 } 1087 1088 if(csource) { 1089 /* write .c file for hardcoded data */ 1090 UTrie trie={ NULL }; 1091 UTrie2 *trie2; 1092 FILE *f; 1093 1094 utrie_unserialize(&trie, trieBlock, trieSize, &errorCode); 1095 if(U_FAILURE(errorCode)) { 1096 fprintf( 1097 stderr, 1098 "gencase error: failed to utrie_unserialize(ucase.icu trie) - %s\n", 1099 u_errorName(errorCode)); 1100 exit(errorCode); 1101 } 1102 1103 /* use UTrie2 */ 1104 dataInfo.formatVersion[0]=2; 1105 dataInfo.formatVersion[2]=0; 1106 dataInfo.formatVersion[3]=0; 1107 trie2=utrie2_fromUTrie(&trie, 0, &errorCode); 1108 if(U_FAILURE(errorCode)) { 1109 fprintf( 1110 stderr, 1111 "gencase error: utrie2_fromUTrie() failed - %s\n", 1112 u_errorName(errorCode)); 1113 exit(errorCode); 1114 } 1115 { 1116 /* delete lead surrogate code unit values */ 1117 UChar lead; 1118 trie2=utrie2_cloneAsThawed(trie2, &errorCode); 1119 for(lead=0xd800; lead<0xdc00; ++lead) { 1120 utrie2_set32ForLeadSurrogateCodeUnit(trie2, lead, trie2->initialValue, &errorCode); 1121 } 1122 utrie2_freeze(trie2, UTRIE2_16_VALUE_BITS, &errorCode); 1123 if(U_FAILURE(errorCode)) { 1124 fprintf( 1125 stderr, 1126 "gencase error: deleting lead surrogate code unit values failed - %s\n", 1127 u_errorName(errorCode)); 1128 exit(errorCode); 1129 } 1130 } 1131 1132 f=usrc_create(dataDir, "ucase_props_data.c"); 1133 if(f!=NULL) { 1134 usrc_writeArray(f, 1135 "static const UVersionInfo ucase_props_dataVersion={", 1136 dataInfo.dataVersion, 8, 4, 1137 "};\n\n"); 1138 usrc_writeArray(f, 1139 "static const int32_t ucase_props_indexes[UCASE_IX_TOP]={", 1140 indexes, 32, UCASE_IX_TOP, 1141 "};\n\n"); 1142 usrc_writeUTrie2Arrays(f, 1143 "static const uint16_t ucase_props_trieIndex[%ld]={\n", NULL, 1144 trie2, 1145 "\n};\n\n"); 1146 usrc_writeArray(f, 1147 "static const uint16_t ucase_props_exceptions[%ld]={\n", 1148 exceptions, 16, exceptionsTop, 1149 "\n};\n\n"); 1150 usrc_writeArray(f, 1151 "static const uint16_t ucase_props_unfold[%ld]={\n", 1152 unfold, 16, unfoldTop, 1153 "\n};\n\n"); 1154 fputs( 1155 "static const UCaseProps ucase_props_singleton={\n" 1156 " NULL,\n" 1157 " ucase_props_indexes,\n" 1158 " ucase_props_exceptions,\n" 1159 " ucase_props_unfold,\n", 1160 f); 1161 usrc_writeUTrie2Struct(f, 1162 " {\n", 1163 trie2, "ucase_props_trieIndex", NULL, 1164 " },\n"); 1165 usrc_writeArray(f, " { ", dataInfo.formatVersion, 8, 4, " }\n"); 1166 fputs("};\n", f); 1167 fclose(f); 1168 } 1169 utrie2_close(trie2); 1170 } else { 1171 /* write the data */ 1172 pData=udata_create(dataDir, UCASE_DATA_TYPE, UCASE_DATA_NAME, &dataInfo, 1173 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); 1174 if(U_FAILURE(errorCode)) { 1175 fprintf(stderr, "gencase: unable to create data memory, %s\n", u_errorName(errorCode)); 1176 exit(errorCode); 1177 } 1178 1179 udata_writeBlock(pData, indexes, sizeof(indexes)); 1180 udata_writeBlock(pData, trieBlock, trieSize); 1181 udata_writeBlock(pData, exceptions, 2*exceptionsTop); 1182 udata_writeBlock(pData, unfold, 2*unfoldTop); 1183 1184 /* finish up */ 1185 dataLength=udata_finish(pData, &errorCode); 1186 if(U_FAILURE(errorCode)) { 1187 fprintf(stderr, "gencase: error %d writing the output file\n", errorCode); 1188 exit(errorCode); 1189 } 1190 1191 if(dataLength!=indexes[UCASE_IX_LENGTH]) { 1192 fprintf(stderr, "gencase: data length %ld != calculated size %d\n", 1193 dataLength, (int)indexes[UCASE_IX_LENGTH]); 1194 exit(U_INTERNAL_PROGRAM_ERROR); 1195 } 1196 } 1197 1198 utrie_close(pTrie); 1199 } 1200 1201 /* 1202 * Hey, Emacs, please set the following: 1203 * 1204 * Local Variables: 1205 * indent-tabs-mode: nil 1206 * End: 1207 * 1208 */ 1209