1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2004-2009, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: store.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2004aug28 14 * created by: Markus W. Scherer 15 * 16 * Store Unicode case mapping properties efficiently for 17 * random access. 18 */ 19 20 #include <stdio.h> 21 #include <stdlib.h> 22 #include "unicode/utypes.h" 23 #include "unicode/uchar.h" 24 #include "unicode/ustring.h" 25 #include "cmemory.h" 26 #include "cstring.h" 27 #include "filestrm.h" 28 #include "utrie.h" 29 #include "utrie2.h" 30 #include "uarrsort.h" 31 #include "unicode/udata.h" 32 #include "unewdata.h" 33 #include "propsvec.h" 34 #include "writesrc.h" 35 #include "gencase.h" 36 37 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) 38 39 /* Unicode case mapping properties file format --------------------------------- 40 41 The file format prepared and written here contains several data 42 structures that store indexes or data. 43 44 Before the data contents described below, there are the headers required by 45 the udata API for loading ICU data. Especially, a UDataInfo structure 46 precedes the actual data. It contains platform properties values and the 47 file format version. 48 49 The following is a description of format version 1.2 . 50 51 Format version 1.1 adds data for case closure. 52 53 Format version 1.2 adds an exception bit for case-ignorable. Needed because 54 the Cased and Case_Ignorable properties are not disjoint. 55 56 The file contains the following structures: 57 58 const int32_t indexes[i0] with values i0, i1, ...: 59 (see UCASE_IX_... constants for names of indexes) 60 61 i0 indexLength; -- length of indexes[] (UCASE_IX_TOP) 62 i1 dataLength; -- length in bytes of the post-header data (incl. indexes[]) 63 i2 trieSize; -- size in bytes of the case mapping properties trie 64 i3 exceptionsLength; -- length in uint16_t of the exceptions array 65 i4 unfoldLength; -- length in uint16_t of the reverse-folding array (new in format version 1.1) 66 67 i5..i14 reservedIndexes; -- reserved values; 0 for now 68 69 i15 maxFullLength; -- maximum length of a full case mapping/folding string 70 71 72 Serialized trie, see utrie.h; 73 74 const uint16_t exceptions[exceptionsLength]; 75 76 const UChar unfold[unfoldLength]; 77 78 79 Trie data word: 80 Bits 81 if(exception) { 82 15..4 unsigned exception index 83 } else { 84 if(not uncased) { 85 15..6 signed delta to simple case mapping code point 86 (add delta to input code point) 87 } else { 88 6 the code point is case-ignorable 89 (U+0307 is also case-ignorable but has an exception) 90 } 91 5..4 0 normal character with cc=0 92 1 soft-dotted character 93 2 cc=230 94 3 other cc 95 } 96 3 exception 97 2 case sensitive 98 1..0 0 uncased 99 1 lowercase 100 2 uppercase 101 3 titlecase 102 103 104 Exceptions: 105 A sub-array of the exceptions array is indexed by the exception index in a 106 trie word. 107 The sub-array consists of the following fields: 108 uint16_t excWord; 109 uint16_t optional values []; 110 UTF-16 strings for full (string) mappings for lowercase, case folding, uppercase, titlecase 111 112 excWord: (see UCASE_EXC_...) 113 Bits 114 15 conditional case folding 115 14 conditional special casing 116 13..12 same as non-exception trie data bits 5..4 117 moved here because the exception index needs more bits than the delta 118 0 normal character with cc=0 119 1 soft-dotted character 120 2 cc=230 121 3 other cc 122 11 case-ignorable (used when the character is cased or has another exception) 123 (new in formatVersion 1.2/ICU 4.4) 124 10.. 9 reserved 125 8 if set, then for each optional-value slot there are 2 uint16_t values 126 (high and low parts of 32-bit values) 127 instead of single ones 128 7.. 0 bits for which optional value is present 129 130 Optional-value slots: 131 0 lowercase mapping (code point) 132 1 case folding (code point) 133 2 uppercase mapping (code point) 134 3 titlecase mapping (code point) 135 4 reserved 136 5 reserved 137 6 closure mappings (new in format version 1.1) 138 7 there is at least one full (string) case mapping 139 the length of each is encoded in a nibble of this optional value, 140 and the strings follow this optional value in the same order: 141 lower/fold/upper/title 142 143 The optional closure mappings value is used as follows: 144 Bits 0..3 contain the length of a string of code points for case closure. 145 The string immediately follows the full case mappings, or the closure value 146 slot if there are no full case mappings. 147 Bits 4..15 are reserved and could be used in the future to indicate the 148 number of strings for case closure. 149 Complete case closure for a code point is given by the union of all simple 150 and full case mappings and foldings, plus the case closure code points 151 (and potentially, in the future, case closure strings). 152 153 For space saving, some values are not stored. Lookups are as follows: 154 - If special casing is conditional, then no full lower/upper/title mapping 155 strings are stored. 156 - If case folding is conditional, then no simple or full case foldings are 157 stored. 158 - Fall back in this order: 159 full (string) mapping -- if full mappings are used 160 simple (code point) mapping of the same type 161 simple fold->simple lower 162 simple title->simple upper 163 finally, the original code point (no mapping) 164 165 This fallback order is strict: 166 In particular, the fallback from full case folding is to simple case folding, 167 not to full lowercase mapping. 168 169 Reverse case folding data ("unfold") array: (new in format version 1.1) 170 171 This array stores some miscellaneous values followed by a table. The data maps 172 back from multi-character strings to their original code points, for use 173 in case closure. 174 175 The table contains two columns of strings. 176 The string in the first column is the case folding of each of the code points 177 in the second column. The strings are terminated with NUL or by the end of the 178 column, whichever comes first. 179 180 The miscellaneous data takes up one pseudo-row and includes: 181 - number of rows 182 - number of UChars per row 183 - number of UChars in the left (folding string) column 184 185 The table is sorted by its first column. Values in the first column are unique. 186 187 ----------------------------------------------------------------------------- */ 188 189 /* UDataInfo cf. udata.h */ 190 static UDataInfo dataInfo={ 191 sizeof(UDataInfo), 192 0, 193 194 U_IS_BIG_ENDIAN, 195 U_CHARSET_FAMILY, 196 U_SIZEOF_UCHAR, 197 0, 198 199 /* dataFormat="cAsE" */ 200 { UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 }, 201 { 1, 1, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ 202 { 4, 0, 1, 0 } /* dataVersion */ 203 }; 204 205 enum { 206 /* maximum number of exceptions expected */ 207 MAX_EXC_COUNT=1000 208 }; 209 210 /* exceptions values */ 211 static uint16_t exceptions[UCASE_MAX_EXCEPTIONS+100]; 212 static uint16_t exceptionsTop=0; 213 static Props excProps[MAX_EXC_COUNT]; 214 static uint16_t exceptionsCount=0; 215 216 /* becomes indexes[UCASE_IX_MAX_FULL_LENGTH] */ 217 static int32_t maxFullLength=U16_MAX_LENGTH; 218 219 /* reverse case folding ("unfold") data */ 220 static UChar unfold[UGENCASE_UNFOLD_MAX_ROWS*UGENCASE_UNFOLD_WIDTH]={ 221 0, UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH, 0, 0 222 }; 223 static uint16_t unfoldRows=0; 224 static uint16_t unfoldTop=UGENCASE_UNFOLD_WIDTH; 225 226 /* Unicode versions --------------------------------------------------------- */ 227 228 static const UVersionInfo 229 unicodeVersions[]={ 230 { 1, 0, 0, 0 }, 231 { 1, 1, 0, 0 }, 232 { 2, 0, 0, 0 }, 233 { 3, 0, 0, 0 }, 234 { 3, 1, 0, 0 }, 235 { 3, 2, 0, 0 }, 236 { 4, 0, 0, 0 }, 237 { 4, 0, 1, 0 }, 238 { 4, 1, 0, 0 }, 239 { 5, 1, 0, 0 }, 240 { 5, 2, 0, 0 }, 241 { 6, 0, 0, 0 } 242 }; 243 244 int32_t ucdVersion=UNI_4_1; 245 246 static int32_t 247 findUnicodeVersion(const UVersionInfo version) { 248 int32_t i; 249 250 for(i=0; /* while(version>unicodeVersions[i]) {} */ 251 i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)>0; 252 ++i) {} 253 if(0<i && i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)<0) { 254 --i; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */ 255 } 256 return i; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */ 257 } 258 259 extern void 260 setUnicodeVersion(const char *v) { 261 UVersionInfo version; 262 u_versionFromString(version, v); 263 uprv_memcpy(dataInfo.dataVersion, version, 4); 264 ucdVersion=findUnicodeVersion(version); 265 } 266 267 /* -------------------------------------------------------------------------- */ 268 269 static void 270 addUnfolding(UChar32 c, const UChar *s, int32_t length) { 271 int32_t i; 272 273 if(length>UGENCASE_UNFOLD_STRING_WIDTH) { 274 fprintf(stderr, "gencase error: case folding too long (length=%ld>%d=UGENCASE_UNFOLD_STRING_WIDTH)\n", 275 (long)length, UGENCASE_UNFOLD_STRING_WIDTH); 276 exit(U_INTERNAL_PROGRAM_ERROR); 277 } 278 if(unfoldTop >= (LENGTHOF(unfold) - UGENCASE_UNFOLD_STRING_WIDTH)) { 279 fprintf(stderr, "gencase error: too many multi-character case foldings\n"); 280 exit(U_BUFFER_OVERFLOW_ERROR); 281 } 282 u_memset(unfold+unfoldTop, 0, UGENCASE_UNFOLD_WIDTH); 283 u_memcpy(unfold+unfoldTop, s, length); 284 285 i=unfoldTop+UGENCASE_UNFOLD_STRING_WIDTH; 286 U16_APPEND_UNSAFE(unfold, i, c); 287 288 ++unfoldRows; 289 unfoldTop+=UGENCASE_UNFOLD_WIDTH; 290 } 291 292 /* store a character's properties ------------------------------------------- */ 293 294 extern void 295 setProps(Props *p) { 296 UErrorCode errorCode; 297 uint32_t value, oldValue; 298 int32_t delta; 299 300 /* get the non-UnicodeData.txt properties */ 301 value=oldValue=upvec_getValue(pv, p->code, 0); 302 303 /* default: map to self */ 304 delta=0; 305 306 if(p->gc==U_TITLECASE_LETTER) { 307 /* the Titlecase property is read late, from UnicodeData.txt */ 308 value|=UCASE_TITLE; 309 } 310 311 if(p->upperCase!=0) { 312 /* uppercase mapping as delta if the character is lowercase */ 313 if((value&UCASE_TYPE_MASK)==UCASE_LOWER) { 314 delta=p->upperCase-p->code; 315 } else { 316 value|=UCASE_EXCEPTION; 317 } 318 } 319 if(p->lowerCase!=0) { 320 /* lowercase mapping as delta if the character is uppercase or titlecase */ 321 if((value&UCASE_TYPE_MASK)>=UCASE_UPPER) { 322 delta=p->lowerCase-p->code; 323 } else { 324 value|=UCASE_EXCEPTION; 325 } 326 } 327 if(p->upperCase!=p->titleCase) { 328 value|=UCASE_EXCEPTION; 329 } 330 if(p->closure[0]!=0) { 331 value|=UCASE_EXCEPTION; 332 } 333 if(p->specialCasing!=NULL) { 334 value|=UCASE_EXCEPTION; 335 } 336 if(p->caseFolding!=NULL) { 337 value|=UCASE_EXCEPTION; 338 } 339 340 if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) { 341 value|=UCASE_EXCEPTION; 342 } 343 344 if(p->cc!=0) { 345 if(value&UCASE_DOT_MASK) { 346 fprintf(stderr, "gencase: a soft-dotted character has cc!=0\n"); 347 exit(U_INTERNAL_PROGRAM_ERROR); 348 } 349 if(p->cc==230) { 350 value|=UCASE_ABOVE; 351 } else { 352 value|=UCASE_OTHER_ACCENT; 353 } 354 } 355 356 /* 357 * Encode case-ignorable as delta==1 on uncased characters, 358 * and with an exception bit on cased characters and characters with another exception. 359 */ 360 if(ucdVersion>=UNI_4_1) { 361 /* 362 * Unicode 4.1 & 5.0: (D47a) Word_Break=MidLetter or Mn, Me, Cf, Lm, Sk 363 * Unicode 5.1: Word_Break=(MidLetter or MidNumLet) or Mn, Me, Cf, Lm, Sk 364 * The UGENCASE_IS_MID_LETTER_SHIFT bit is set for both WB=MidLetter and WB=MidNumLet. 365 * Unicode 5.2: The definition (Unicode Standard Definition D121) is unchanged, 366 * but now Case_Ignorable is a public property 367 * with its values listed in DerivedCoreProperties.txt. 368 * gencase.c parses those values as well, just in case the definition changes 369 * in the future. gencase.c sets the UGENCASE_IS_MID_LETTER_SHIFT bit 370 * for each Case_Ignorable entry. (It never resets that bit.) 371 */ 372 if( 373 (U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 || 374 (upvec_getValue(pv, p->code, 1)&U_MASK(UGENCASE_IS_MID_LETTER_SHIFT))!=0 375 ) { 376 p->isCaseIgnorable=TRUE; 377 } 378 } else { 379 /* before Unicode 4.1: Mn, Me, Cf, Lm, Sk or 0027 or 00AD or 2019 */ 380 if( 381 (U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 || 382 p->code==0x27 || p->code==0xad || p->code==0x2019 383 ) { 384 p->isCaseIgnorable=TRUE; 385 } 386 } 387 if(p->isCaseIgnorable) { 388 if((value&UCASE_TYPE_MASK)==UCASE_NONE) { 389 /* 390 * We use one of the delta/exception bits for 391 * the case-ignorable flag for uncased characters. 392 * There is no delta for uncased characters (see checks above). 393 */ 394 delta=1; 395 } else { 396 /* 397 * If the character is cased or has another exception, 398 * then we store the case-ignorable flag as an exception bit. 399 */ 400 value|=UCASE_EXCEPTION; 401 } 402 } 403 404 /* handle exceptions */ 405 if(value&UCASE_EXCEPTION) { 406 /* simply store exceptions for later processing and encoding */ 407 value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT; 408 uprv_memcpy(excProps+exceptionsCount, p, sizeof(*p)); 409 if(++exceptionsCount==MAX_EXC_COUNT) { 410 fprintf(stderr, "gencase: too many exceptions\n"); 411 exit(U_INDEX_OUTOFBOUNDS_ERROR); 412 } 413 } else { 414 /* store the simple case mapping delta */ 415 value|=((uint32_t)delta<<UCASE_DELTA_SHIFT)&UCASE_DELTA_MASK; 416 } 417 418 errorCode=U_ZERO_ERROR; 419 if(value!=oldValue) { 420 upvec_setValue(pv, p->code, p->code, 0, value, 0xffffffff, &errorCode); 421 if(U_FAILURE(errorCode)) { 422 fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n", 423 u_errorName(errorCode)); 424 exit(errorCode); 425 } 426 } 427 428 /* add the multi-character case folding to the "unfold" data */ 429 if(p->caseFolding!=NULL) { 430 int32_t length=p->caseFolding->full[0]; 431 if(length>1 && u_strHasMoreChar32Than(p->caseFolding->full+1, length, 1)) { 432 addUnfolding(p->code, p->caseFolding->full+1, length); 433 } 434 } 435 } 436 437 extern void 438 addCaseSensitive(UChar32 first, UChar32 last) { 439 UErrorCode errorCode=U_ZERO_ERROR; 440 upvec_setValue(pv, first, last, 0, UCASE_SENSITIVE, UCASE_SENSITIVE, &errorCode); 441 if(U_FAILURE(errorCode)) { 442 fprintf(stderr, "gencase error: unable to set UCASE_SENSITIVE, code: %s\n", 443 u_errorName(errorCode)); 444 exit(errorCode); 445 } 446 } 447 448 /* finalize reverse case folding ("unfold") data ---------------------------- */ 449 450 static int32_t U_CALLCONV 451 compareUnfold(const void *context, const void *left, const void *right) { 452 return u_memcmp((const UChar *)left, (const UChar *)right, UGENCASE_UNFOLD_WIDTH); 453 } 454 455 static void 456 makeUnfoldData() { 457 static const UChar 458 iDot[2]= { 0x69, 0x307 }; 459 460 UChar *p, *q; 461 int32_t i, j, k; 462 UErrorCode errorCode; 463 464 /* 465 * add a case folding that we missed because it's conditional: 466 * 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE 467 */ 468 addUnfolding(0x130, iDot, 2); 469 470 /* sort the data */ 471 errorCode=U_ZERO_ERROR; 472 uprv_sortArray(unfold+UGENCASE_UNFOLD_WIDTH, unfoldRows, UGENCASE_UNFOLD_WIDTH*2, 473 compareUnfold, NULL, FALSE, &errorCode); 474 475 /* make unique-string rows by merging adjacent ones' code point columns */ 476 477 /* make p point to row i-1 */ 478 p=(UChar *)unfold+UGENCASE_UNFOLD_WIDTH; 479 480 for(i=1; i<unfoldRows;) { 481 if(0==u_memcmp(p, p+UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH)) { 482 /* concatenate code point columns */ 483 q=p+UGENCASE_UNFOLD_STRING_WIDTH; 484 for(j=1; j<UGENCASE_UNFOLD_CP_WIDTH && q[j]!=0; ++j) {} 485 for(k=0; k<UGENCASE_UNFOLD_CP_WIDTH && q[UGENCASE_UNFOLD_WIDTH+k]!=0; ++j, ++k) { 486 q[j]=q[UGENCASE_UNFOLD_WIDTH+k]; 487 } 488 if(j>UGENCASE_UNFOLD_CP_WIDTH) { 489 fprintf(stderr, "gencase error: too many code points in unfold[]: %ld>%d=UGENCASE_UNFOLD_CP_WIDTH\n", 490 (long)j, UGENCASE_UNFOLD_CP_WIDTH); 491 exit(U_BUFFER_OVERFLOW_ERROR); 492 } 493 494 /* move following rows up one */ 495 --unfoldRows; 496 unfoldTop-=UGENCASE_UNFOLD_WIDTH; 497 u_memmove(p+UGENCASE_UNFOLD_WIDTH, p+UGENCASE_UNFOLD_WIDTH*2, (unfoldRows-i)*UGENCASE_UNFOLD_WIDTH); 498 } else { 499 p+=UGENCASE_UNFOLD_WIDTH; 500 ++i; 501 } 502 } 503 504 unfold[UCASE_UNFOLD_ROWS]=(UChar)unfoldRows; 505 506 if(beVerbose) { 507 puts("unfold data:"); 508 509 p=(UChar *)unfold; 510 for(i=0; i<unfoldRows; ++i) { 511 p+=UGENCASE_UNFOLD_WIDTH; 512 printf("[%2d] %04x %04x %04x <- %04x %04x\n", 513 (int)i, p[0], p[1], p[2], p[3], p[4]); 514 } 515 } 516 } 517 518 /* case closure ------------------------------------------------------------- */ 519 520 static void 521 addClosureMapping(UChar32 src, UChar32 dest) { 522 uint32_t value; 523 524 if(beVerbose) { 525 printf("add closure mapping U+%04lx->U+%04lx\n", 526 (unsigned long)src, (unsigned long)dest); 527 } 528 529 value=upvec_getValue(pv, src, 0); 530 if(value&UCASE_EXCEPTION) { 531 Props *p=excProps+(value>>UGENCASE_EXC_SHIFT); 532 int32_t i; 533 534 /* append dest to src's closure array */ 535 for(i=0;; ++i) { 536 if(i==LENGTHOF(p->closure)) { 537 fprintf(stderr, "closure[] overflow for U+%04lx->U+%04lx\n", 538 (unsigned long)src, (unsigned long)dest); 539 exit(U_BUFFER_OVERFLOW_ERROR); 540 } else if(p->closure[i]==dest) { 541 break; /* do not store duplicates */ 542 } else if(p->closure[i]==0) { 543 p->closure[i]=dest; 544 break; 545 } 546 } 547 } else { 548 Props p2={ 0 }; 549 UChar32 next; 550 UErrorCode errorCode; 551 552 /* 553 * decode value into p2 (enough for makeException() to work properly), 554 * add the closure mapping, 555 * and set the new exception for src 556 */ 557 p2.code=src; 558 p2.closure[0]=dest; 559 560 if((value&UCASE_TYPE_MASK)>UCASE_NONE) { 561 /* one simple case mapping, don't care which one */ 562 next=src+((int16_t)value>>UCASE_DELTA_SHIFT); 563 if(next!=src) { 564 if((value&UCASE_TYPE_MASK)==UCASE_LOWER) { 565 p2.upperCase=p2.titleCase=next; 566 } else { 567 p2.lowerCase=next; 568 } 569 } 570 } else if(value&UCASE_DELTA_MASK) { 571 fprintf(stderr, "gencase error: unable to add case closure exception to case-ignorable U+%04lx\n", 572 (unsigned long)src); 573 exit(U_INTERNAL_PROGRAM_ERROR); 574 } 575 576 value&=~(UGENCASE_EXC_MASK|UCASE_DELTA_MASK); /* remove previous simple mapping */ 577 value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT; 578 value|=UCASE_EXCEPTION; 579 uprv_memcpy(excProps+exceptionsCount, &p2, sizeof(p2)); 580 if(++exceptionsCount==MAX_EXC_COUNT) { 581 fprintf(stderr, "gencase: too many exceptions\n"); 582 exit(U_INDEX_OUTOFBOUNDS_ERROR); 583 } 584 585 errorCode=U_ZERO_ERROR; 586 upvec_setValue(pv, src, src, 0, value, 0xffffffff, &errorCode); 587 if(U_FAILURE(errorCode)) { 588 fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n", 589 u_errorName(errorCode)); 590 exit(errorCode); 591 } 592 } 593 } 594 595 /* 596 * Find missing case mapping relationships and add mappings for case closure. 597 * This function starts from an "original" code point and recursively 598 * finds its case mappings and the case mappings of where it maps to. 599 * 600 * The recursion depth is capped at 3 nested calls of this function. 601 * In each call, the current code point is c, and the function enumerates 602 * all of c's simple (single-code point) case mappings. 603 * prev is the code point that case-mapped to c. 604 * prev2 is the code point that case-mapped to prev. 605 * 606 * The initial function call has prev2<0, prev<0, and c==orig 607 * (marking no code points). 608 * It enumerates c's case mappings and recurses without further action. 609 * 610 * The second-level function call has prev2<0, prev==orig, and c is 611 * the destination code point of one of prev's case mappings. 612 * The function checks if any of c's case mappings go back to orig 613 * and adds a closure mapping if not. 614 * In other words, it turns a case mapping relationship of 615 * orig->c 616 * into 617 * orig<->c 618 * 619 * The third-level function call has prev2==orig, prev>=0, and c is 620 * the destination code point of one of prev's case mappings. 621 * (And prev is the destination of one of prev2's case mappings.) 622 * The function checks if any of c's case mappings go back to orig 623 * and adds a closure mapping if not. 624 * In other words, it turns case mapping relationships of 625 * orig->prev->c or orig->prev<->c 626 * into 627 * orig->prev->c->orig or orig->prev<->c->orig 628 * etc. 629 * (Graphically, this closes a triangle.) 630 * 631 * With repeated application on all code points until no more closure mappings 632 * are added, all case equivalence groups get complete mappings. 633 * That is, in each group of code points with case relationships 634 * each code point will in the end have some mapping to each other 635 * code point in the group. 636 * 637 * @return TRUE if a closure mapping was added 638 */ 639 static UBool 640 addClosure(UChar32 orig, UChar32 prev2, UChar32 prev, UChar32 c, uint32_t value) { 641 UChar32 next; 642 UBool someMappingsAdded=FALSE; 643 644 if(c!=orig) { 645 /* get the properties for c */ 646 value=upvec_getValue(pv, c, 0); 647 } 648 /* else if c==orig then c's value was passed in */ 649 650 if(value&UCASE_EXCEPTION) { 651 UChar32 set[32]; 652 int32_t i, count=0; 653 654 Props *p=excProps+(value>>UGENCASE_EXC_SHIFT); 655 656 /* 657 * marker for whether any of c's mappings goes to orig 658 * c==orig: prevent adding a closure mapping when getting orig's own, direct mappings 659 */ 660 UBool mapsToOrig=(UBool)(c==orig); 661 662 /* collect c's case mapping destinations in set[] */ 663 if((next=p->upperCase)!=0 && next!=c) { 664 set[count++]=next; 665 } 666 if((next=p->lowerCase)!=0 && next!=c) { 667 set[count++]=next; 668 } 669 if(p->upperCase!=(next=p->titleCase) && next!=c) { 670 set[count++]=next; 671 } 672 if(p->caseFolding!=NULL && (next=p->caseFolding->simple)!=0 && next!=c) { 673 set[count++]=next; 674 } 675 676 /* append c's current closure mappings to set[] */ 677 for(i=0; i<LENGTHOF(p->closure) && (next=p->closure[i])!=0; ++i) { 678 set[count++]=next; 679 } 680 681 /* process all code points to which c case-maps */ 682 for(i=0; i<count; ++i) { 683 next=set[i]; /* next!=c */ 684 685 if(next==orig) { 686 mapsToOrig=TRUE; /* remember that we map to orig */ 687 } else if(prev2<0 && next!=prev) { 688 /* 689 * recurse unless 690 * we have reached maximum depth (prev2>=0) or 691 * this is a mapping to one of the previous code points (orig, prev, c) 692 */ 693 someMappingsAdded|=addClosure(orig, prev, c, next, 0); 694 } 695 } 696 697 if(!mapsToOrig) { 698 addClosureMapping(c, orig); 699 return TRUE; 700 } 701 } else { 702 if((value&UCASE_TYPE_MASK)>UCASE_NONE) { 703 /* one simple case mapping, don't care which one */ 704 next=c+((int16_t)value>>UCASE_DELTA_SHIFT); 705 if(next!=c) { 706 /* 707 * recurse unless 708 * we have reached maximum depth (prev2>=0) or 709 * this is a mapping to one of the previous code points (orig, prev, c) 710 */ 711 if(prev2<0 && next!=orig && next!=prev) { 712 someMappingsAdded|=addClosure(orig, prev, c, next, 0); 713 } 714 715 if(c!=orig && next!=orig) { 716 /* c does not map to orig, add a closure mapping c->orig */ 717 addClosureMapping(c, orig); 718 return TRUE; 719 } 720 } 721 } 722 } 723 724 return someMappingsAdded; 725 } 726 727 extern void 728 makeCaseClosure() { 729 UChar *p; 730 uint32_t *row; 731 uint32_t value; 732 UChar32 start, end, c, c2; 733 int32_t i, j; 734 UBool someMappingsAdded; 735 736 /* 737 * finalize the "unfold" data because we need to use it to add closure mappings 738 * for situations like FB05->"st"<-FB06 739 * where we would otherwise miss the FB05<->FB06 relationship 740 */ 741 makeUnfoldData(); 742 743 /* use the "unfold" data to add mappings */ 744 745 /* p always points to the code points; this loop ignores the strings completely */ 746 p=unfold+UGENCASE_UNFOLD_WIDTH+UGENCASE_UNFOLD_STRING_WIDTH; 747 748 for(i=0; i<unfoldRows; p+=UGENCASE_UNFOLD_WIDTH, ++i) { 749 j=0; 750 U16_NEXT_UNSAFE(p, j, c); 751 while(j<UGENCASE_UNFOLD_CP_WIDTH && p[j]!=0) { 752 U16_NEXT_UNSAFE(p, j, c2); 753 addClosure(c, U_SENTINEL, c, c2, 0); 754 } 755 } 756 757 if(beVerbose) { 758 puts("---- ---- ---- ---- (done with closures from unfolding)"); 759 } 760 761 /* add further closure mappings from analyzing simple mappings */ 762 do { 763 someMappingsAdded=FALSE; 764 765 i=0; 766 while((row=upvec_getRow(pv, i, &start, &end))!=NULL && start<UPVEC_FIRST_SPECIAL_CP) { 767 value=*row; 768 if(value!=0) { 769 while(start<=end) { 770 if(addClosure(start, U_SENTINEL, U_SENTINEL, start, value)) { 771 someMappingsAdded=TRUE; 772 773 /* 774 * stop this loop because pv was changed and row is not valid any more 775 * skip all rows below the current start 776 */ 777 while((row=upvec_getRow(pv, i, NULL, &end))!=NULL && start>end) { 778 ++i; 779 } 780 row=NULL; /* signal to continue with outer loop, without further ++i */ 781 break; 782 } 783 ++start; 784 } 785 if(row==NULL) { 786 continue; /* see row=NULL above */ 787 } 788 } 789 ++i; 790 } 791 792 if(beVerbose && someMappingsAdded) { 793 puts("---- ---- ---- ----"); 794 } 795 } while(someMappingsAdded); 796 } 797 798 /* exceptions --------------------------------------------------------------- */ 799 800 /* get the string length from zero-terminated code points in a limited-length array */ 801 static int32_t 802 getLengthOfCodePoints(const UChar32 *s, int32_t maxLength) { 803 int32_t i, length; 804 805 for(i=length=0; i<maxLength && s[i]!=0; ++i) { 806 length+=U16_LENGTH(s[i]); 807 } 808 return length; 809 } 810 811 static UBool 812 fullMappingEqualsSimple(const UChar *s, UChar32 simple, UChar32 c) { 813 int32_t i, length; 814 UChar32 full; 815 816 length=*s++; 817 if(length==0 || length>U16_MAX_LENGTH) { 818 return FALSE; 819 } 820 i=0; 821 U16_NEXT(s, i, length, full); 822 823 if(simple==0) { 824 simple=c; /* UCD has no simple mapping if it's the same as the code point itself */ 825 } 826 return (UBool)(i==length && full==simple); 827 } 828 829 static uint16_t 830 makeException(uint32_t value, Props *p) { 831 uint32_t slots[8]; 832 uint32_t slotBits; 833 uint16_t excWord, i, count, length, fullLengths; 834 UBool doubleSlots; 835 836 /* exceptionsTop might be returned for storing in the trie word */ 837 if(exceptionsTop>=UCASE_MAX_EXCEPTIONS) { 838 fprintf(stderr, "gencase error: too many exceptions words\n"); 839 exit(U_BUFFER_OVERFLOW_ERROR); 840 } 841 842 /* copy and shift the soft-dotted bits */ 843 excWord=((uint16_t)value&UCASE_DOT_MASK)<<UCASE_EXC_DOT_SHIFT; 844 845 if(p->isCaseIgnorable) { 846 excWord|=UCASE_EXC_CASE_IGNORABLE; 847 } 848 849 /* update maxFullLength */ 850 if(p->specialCasing!=NULL) { 851 length=p->specialCasing->lowerCase[0]; 852 if(length>maxFullLength) { 853 maxFullLength=length; 854 } 855 length=p->specialCasing->upperCase[0]; 856 if(length>maxFullLength) { 857 maxFullLength=length; 858 } 859 length=p->specialCasing->titleCase[0]; 860 if(length>maxFullLength) { 861 maxFullLength=length; 862 } 863 } 864 if(p->caseFolding!=NULL) { 865 length=p->caseFolding->full[0]; 866 if(length>maxFullLength) { 867 maxFullLength=length; 868 } 869 } 870 871 /* set the bits for conditional mappings */ 872 if(p->specialCasing!=NULL && p->specialCasing->isComplex) { 873 excWord|=UCASE_EXC_CONDITIONAL_SPECIAL; 874 p->specialCasing=NULL; 875 } 876 if(p->caseFolding!=NULL && p->caseFolding->simple==0 && p->caseFolding->full[0]==0) { 877 excWord|=UCASE_EXC_CONDITIONAL_FOLD; 878 p->caseFolding=NULL; 879 } 880 881 /* 882 * Note: 883 * UCD stores no simple mappings when they are the same as the code point itself. 884 * SpecialCasing and CaseFolding do store simple mappings even if they are 885 * the same as the code point itself. 886 * Comparisons between simple regular mappings and simple special/folding 887 * mappings need to compensate for the difference by comparing with the 888 * original code point if a simple UCD mapping is missing (0). 889 */ 890 891 /* remove redundant data */ 892 if(p->specialCasing!=NULL) { 893 /* do not store full mappings if they are the same as the simple ones */ 894 if(fullMappingEqualsSimple(p->specialCasing->lowerCase, p->lowerCase, p->code)) { 895 p->specialCasing->lowerCase[0]=0; 896 } 897 if(fullMappingEqualsSimple(p->specialCasing->upperCase, p->upperCase, p->code)) { 898 p->specialCasing->upperCase[0]=0; 899 } 900 if(fullMappingEqualsSimple(p->specialCasing->titleCase, p->titleCase, p->code)) { 901 p->specialCasing->titleCase[0]=0; 902 } 903 } 904 if( p->caseFolding!=NULL && 905 fullMappingEqualsSimple(p->caseFolding->full, p->caseFolding->simple, p->code) 906 ) { 907 p->caseFolding->full[0]=0; 908 } 909 910 /* write the optional slots */ 911 slotBits=0; 912 count=0; 913 914 if(p->lowerCase!=0) { 915 slots[count]=(uint32_t)p->lowerCase; 916 slotBits|=slots[count]; 917 ++count; 918 excWord|=U_MASK(UCASE_EXC_LOWER); 919 } 920 if( p->caseFolding!=NULL && 921 p->caseFolding->simple!=0 && 922 (p->lowerCase!=0 ? 923 p->caseFolding->simple!=p->lowerCase : 924 p->caseFolding->simple!=p->code) 925 ) { 926 slots[count]=(uint32_t)p->caseFolding->simple; 927 slotBits|=slots[count]; 928 ++count; 929 excWord|=U_MASK(UCASE_EXC_FOLD); 930 } 931 if(p->upperCase!=0) { 932 slots[count]=(uint32_t)p->upperCase; 933 slotBits|=slots[count]; 934 ++count; 935 excWord|=U_MASK(UCASE_EXC_UPPER); 936 } 937 if(p->upperCase!=p->titleCase) { 938 if(p->titleCase!=0) { 939 slots[count]=(uint32_t)p->titleCase; 940 } else { 941 slots[count]=(uint32_t)p->code; 942 } 943 slotBits|=slots[count]; 944 ++count; 945 excWord|=U_MASK(UCASE_EXC_TITLE); 946 } 947 948 /* length of case closure */ 949 if(p->closure[0]!=0) { 950 length=getLengthOfCodePoints(p->closure, LENGTHOF(p->closure)); 951 slots[count]=(uint32_t)length; /* must be 1..UCASE_CLOSURE_MAX_LENGTH */ 952 slotBits|=slots[count]; 953 ++count; 954 excWord|=U_MASK(UCASE_EXC_CLOSURE); 955 } 956 957 /* lengths of full case mapping strings, stored in the last slot */ 958 fullLengths=0; 959 if(p->specialCasing!=NULL) { 960 fullLengths=p->specialCasing->lowerCase[0]; 961 fullLengths|=p->specialCasing->upperCase[0]<<8; 962 fullLengths|=p->specialCasing->titleCase[0]<<12; 963 } 964 if(p->caseFolding!=NULL) { 965 fullLengths|=p->caseFolding->full[0]<<4; 966 } 967 if(fullLengths!=0) { 968 slots[count]=fullLengths; 969 slotBits|=slots[count]; 970 ++count; 971 excWord|=U_MASK(UCASE_EXC_FULL_MAPPINGS); 972 } 973 974 if(count==0) { 975 /* No optional slots: Try to share excWord entries. */ 976 uint16_t excIndex; 977 for(excIndex=0; excIndex<exceptionsTop; ++excIndex) { 978 if(excWord==exceptions[excIndex]) { 979 return excIndex; 980 } 981 } 982 /* not found */ 983 ++exceptionsTop; 984 exceptions[excIndex]=excWord; 985 return excIndex; 986 } else { 987 /* write slots */ 988 uint16_t excIndex=exceptionsTop; 989 uint16_t excTop=excIndex+1; /* +1 for excWord which will be stored at excIndex */ 990 991 doubleSlots=(UBool)(slotBits>0xffff); 992 if(!doubleSlots) { 993 for(i=0; i<count; ++i) { 994 exceptions[excTop++]=(uint16_t)slots[i]; 995 } 996 } else { 997 excWord|=UCASE_EXC_DOUBLE_SLOTS; 998 for(i=0; i<count; ++i) { 999 exceptions[excTop++]=(uint16_t)(slots[i]>>16); 1000 exceptions[excTop++]=(uint16_t)slots[i]; 1001 } 1002 } 1003 1004 /* write the full case mapping strings */ 1005 if(p->specialCasing!=NULL) { 1006 length=(uint16_t)p->specialCasing->lowerCase[0]; 1007 u_memcpy((UChar *)exceptions+excTop, p->specialCasing->lowerCase+1, length); 1008 excTop+=length; 1009 } 1010 if(p->caseFolding!=NULL) { 1011 length=(uint16_t)p->caseFolding->full[0]; 1012 u_memcpy((UChar *)exceptions+excTop, p->caseFolding->full+1, length); 1013 excTop+=length; 1014 } 1015 if(p->specialCasing!=NULL) { 1016 length=(uint16_t)p->specialCasing->upperCase[0]; 1017 u_memcpy((UChar *)exceptions+excTop, p->specialCasing->upperCase+1, length); 1018 excTop+=length; 1019 1020 length=(uint16_t)p->specialCasing->titleCase[0]; 1021 u_memcpy((UChar *)exceptions+excTop, p->specialCasing->titleCase+1, length); 1022 excTop+=length; 1023 } 1024 1025 /* write the closure data */ 1026 if(p->closure[0]!=0) { 1027 UChar32 c; 1028 1029 for(i=0; i<LENGTHOF(p->closure) && (c=p->closure[i])!=0; ++i) { 1030 U16_APPEND_UNSAFE((UChar *)exceptions, excTop, c); 1031 } 1032 } 1033 1034 exceptionsTop=excTop; 1035 1036 /* write the main exceptions word */ 1037 exceptions[excIndex]=excWord; 1038 1039 return excIndex; 1040 } 1041 } 1042 1043 extern void 1044 makeExceptions() { 1045 uint32_t *row; 1046 uint32_t value; 1047 int32_t i; 1048 uint16_t excIndex; 1049 1050 i=0; 1051 while((row=upvec_getRow(pv, i, NULL, NULL))!=NULL) { 1052 value=*row; 1053 if(value&UCASE_EXCEPTION) { 1054 excIndex=makeException(value, excProps+(value>>UGENCASE_EXC_SHIFT)); 1055 *row=(value&~(UGENCASE_EXC_MASK|UCASE_EXC_MASK))|(excIndex<<UCASE_EXC_SHIFT); 1056 } 1057 ++i; 1058 } 1059 } 1060 1061 /* generate output data ----------------------------------------------------- */ 1062 1063 extern void 1064 generateData(const char *dataDir, UBool csource) { 1065 static int32_t indexes[UCASE_IX_TOP]={ 1066 UCASE_IX_TOP 1067 }; 1068 static uint8_t trieBlock[40000]; 1069 1070 const uint32_t *row; 1071 UChar32 start, end; 1072 int32_t i; 1073 1074 UNewDataMemory *pData; 1075 UNewTrie *pTrie; 1076 UErrorCode errorCode=U_ZERO_ERROR; 1077 int32_t trieSize; 1078 long dataLength; 1079 1080 pTrie=utrie_open(NULL, NULL, 20000, 0, 0, TRUE); 1081 if(pTrie==NULL) { 1082 fprintf(stderr, "gencase error: unable to create a UNewTrie\n"); 1083 exit(U_MEMORY_ALLOCATION_ERROR); 1084 } 1085 1086 for(i=0; (row=upvec_getRow(pv, i, &start, &end))!=NULL; ++i) { 1087 if(start<UPVEC_FIRST_SPECIAL_CP && !utrie_setRange32(pTrie, start, end+1, *row, TRUE)) { 1088 fprintf(stderr, "gencase error: unable to set trie value (overflow)\n"); 1089 exit(U_BUFFER_OVERFLOW_ERROR); 1090 } 1091 } 1092 1093 trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode); 1094 if(U_FAILURE(errorCode)) { 1095 fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize); 1096 exit(errorCode); 1097 } 1098 1099 indexes[UCASE_IX_EXC_LENGTH]=exceptionsTop; 1100 indexes[UCASE_IX_TRIE_SIZE]=trieSize; 1101 indexes[UCASE_IX_UNFOLD_LENGTH]=unfoldTop; 1102 indexes[UCASE_IX_LENGTH]=(int32_t)sizeof(indexes)+trieSize+2*exceptionsTop+2*unfoldTop; 1103 1104 indexes[UCASE_IX_MAX_FULL_LENGTH]=maxFullLength; 1105 1106 if(beVerbose) { 1107 printf("trie size in bytes: %5d\n", (int)trieSize); 1108 printf("number of code points with exceptions: %5d\n", exceptionsCount); 1109 printf("size in bytes of exceptions: %5d\n", 2*exceptionsTop); 1110 printf("size in bytes of reverse foldings: %5d\n", 2*unfoldTop); 1111 printf("data size: %5d\n", (int)indexes[UCASE_IX_LENGTH]); 1112 } 1113 1114 if(csource) { 1115 /* write .c file for hardcoded data */ 1116 UTrie trie={ NULL }; 1117 UTrie2 *trie2; 1118 FILE *f; 1119 1120 utrie_unserialize(&trie, trieBlock, trieSize, &errorCode); 1121 if(U_FAILURE(errorCode)) { 1122 fprintf( 1123 stderr, 1124 "gencase error: failed to utrie_unserialize(ucase.icu trie) - %s\n", 1125 u_errorName(errorCode)); 1126 exit(errorCode); 1127 } 1128 1129 /* use UTrie2 */ 1130 dataInfo.formatVersion[0]=2; 1131 dataInfo.formatVersion[2]=0; 1132 dataInfo.formatVersion[3]=0; 1133 trie2=utrie2_fromUTrie(&trie, 0, &errorCode); 1134 if(U_FAILURE(errorCode)) { 1135 fprintf( 1136 stderr, 1137 "gencase error: utrie2_fromUTrie() failed - %s\n", 1138 u_errorName(errorCode)); 1139 exit(errorCode); 1140 } 1141 { 1142 /* delete lead surrogate code unit values */ 1143 UChar lead; 1144 trie2=utrie2_cloneAsThawed(trie2, &errorCode); 1145 for(lead=0xd800; lead<0xdc00; ++lead) { 1146 utrie2_set32ForLeadSurrogateCodeUnit(trie2, lead, trie2->initialValue, &errorCode); 1147 } 1148 utrie2_freeze(trie2, UTRIE2_16_VALUE_BITS, &errorCode); 1149 if(U_FAILURE(errorCode)) { 1150 fprintf( 1151 stderr, 1152 "gencase error: deleting lead surrogate code unit values failed - %s\n", 1153 u_errorName(errorCode)); 1154 exit(errorCode); 1155 } 1156 } 1157 1158 f=usrc_create(dataDir, "ucase_props_data.c"); 1159 if(f!=NULL) { 1160 usrc_writeArray(f, 1161 "static const UVersionInfo ucase_props_dataVersion={", 1162 dataInfo.dataVersion, 8, 4, 1163 "};\n\n"); 1164 usrc_writeArray(f, 1165 "static const int32_t ucase_props_indexes[UCASE_IX_TOP]={", 1166 indexes, 32, UCASE_IX_TOP, 1167 "};\n\n"); 1168 usrc_writeUTrie2Arrays(f, 1169 "static const uint16_t ucase_props_trieIndex[%ld]={\n", NULL, 1170 trie2, 1171 "\n};\n\n"); 1172 usrc_writeArray(f, 1173 "static const uint16_t ucase_props_exceptions[%ld]={\n", 1174 exceptions, 16, exceptionsTop, 1175 "\n};\n\n"); 1176 usrc_writeArray(f, 1177 "static const uint16_t ucase_props_unfold[%ld]={\n", 1178 unfold, 16, unfoldTop, 1179 "\n};\n\n"); 1180 fputs( 1181 "static const UCaseProps ucase_props_singleton={\n" 1182 " NULL,\n" 1183 " ucase_props_indexes,\n" 1184 " ucase_props_exceptions,\n" 1185 " ucase_props_unfold,\n", 1186 f); 1187 usrc_writeUTrie2Struct(f, 1188 " {\n", 1189 trie2, "ucase_props_trieIndex", NULL, 1190 " },\n"); 1191 usrc_writeArray(f, " { ", dataInfo.formatVersion, 8, 4, " }\n"); 1192 fputs("};\n", f); 1193 fclose(f); 1194 } 1195 utrie2_close(trie2); 1196 } else { 1197 /* write the data */ 1198 pData=udata_create(dataDir, UCASE_DATA_TYPE, UCASE_DATA_NAME, &dataInfo, 1199 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); 1200 if(U_FAILURE(errorCode)) { 1201 fprintf(stderr, "gencase: unable to create data memory, %s\n", u_errorName(errorCode)); 1202 exit(errorCode); 1203 } 1204 1205 udata_writeBlock(pData, indexes, sizeof(indexes)); 1206 udata_writeBlock(pData, trieBlock, trieSize); 1207 udata_writeBlock(pData, exceptions, 2*exceptionsTop); 1208 udata_writeBlock(pData, unfold, 2*unfoldTop); 1209 1210 /* finish up */ 1211 dataLength=udata_finish(pData, &errorCode); 1212 if(U_FAILURE(errorCode)) { 1213 fprintf(stderr, "gencase: error %d writing the output file\n", errorCode); 1214 exit(errorCode); 1215 } 1216 1217 if(dataLength!=indexes[UCASE_IX_LENGTH]) { 1218 fprintf(stderr, "gencase: data length %ld != calculated size %d\n", 1219 dataLength, (int)indexes[UCASE_IX_LENGTH]); 1220 exit(U_INTERNAL_PROGRAM_ERROR); 1221 } 1222 } 1223 1224 utrie_close(pTrie); 1225 } 1226 1227 /* 1228 * Hey, Emacs, please set the following: 1229 * 1230 * Local Variables: 1231 * indent-tabs-mode: nil 1232 * End: 1233 * 1234 */ 1235