1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 1999-2008, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: store.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2001may25 14 * created by: Markus W. Scherer 15 * 16 * Store Unicode normalization data in a memory-mappable file. 17 */ 18 19 #include <stdio.h> 20 #include <stdlib.h> 21 #include "unicode/utypes.h" 22 #include "unicode/uchar.h" 23 #include "unicode/ustring.h" 24 #include "cmemory.h" 25 #include "cstring.h" 26 #include "filestrm.h" 27 #include "unicode/udata.h" 28 #include "utrie.h" 29 #include "utrie2.h" 30 #include "unicode/uset.h" 31 #include "toolutil.h" 32 #include "unewdata.h" 33 #include "writesrc.h" 34 #include "unormimp.h" 35 #include "gennorm.h" 36 37 #define DO_DEBUG_OUT 0 38 39 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 40 41 /* 42 * The new implementation of the normalization code loads its data from 43 * unorm.icu, which is generated with this gennorm tool. 44 * The format of that file is described in unormimp.h . 45 */ 46 47 /* file data ---------------------------------------------------------------- */ 48 49 #if UCONFIG_NO_NORMALIZATION 50 51 /* dummy UDataInfo cf. udata.h */ 52 static UDataInfo dataInfo = { 53 sizeof(UDataInfo), 54 0, 55 56 U_IS_BIG_ENDIAN, 57 U_CHARSET_FAMILY, 58 U_SIZEOF_UCHAR, 59 0, 60 61 { 0, 0, 0, 0 }, /* dummy dataFormat */ 62 { 0, 0, 0, 0 }, /* dummy formatVersion */ 63 { 0, 0, 0, 0 } /* dummy dataVersion */ 64 }; 65 66 #else 67 68 /* UDataInfo cf. udata.h */ 69 static UDataInfo dataInfo={ 70 sizeof(UDataInfo), 71 0, 72 73 U_IS_BIG_ENDIAN, 74 U_CHARSET_FAMILY, 75 U_SIZEOF_UCHAR, 76 0, 77 78 { 0x4e, 0x6f, 0x72, 0x6d }, /* dataFormat="Norm" */ 79 { 2, 3, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ 80 { 3, 2, 0, 0 } /* dataVersion (Unicode version) */ 81 }; 82 83 extern void 84 setUnicodeVersion(const char *v) { 85 UVersionInfo version; 86 u_versionFromString(version, v); 87 uprv_memcpy(dataInfo.dataVersion, version, 4); 88 } 89 90 static int32_t indexes[_NORM_INDEX_TOP]={ 0 }; 91 92 /* builder data ------------------------------------------------------------- */ 93 94 /* modularization flags, see gennorm.h (default to "store everything") */ 95 uint32_t gStoreFlags=0xffffffff; 96 97 typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm); 98 99 static UNewTrie 100 *normTrie, 101 *norm32Trie, 102 *fcdTrie, 103 *auxTrie; 104 105 static UToolMemory *normMem, *utf32Mem, *extraMem, *combiningTriplesMem; 106 107 static Norm *norms; 108 109 /* 110 * set a flag for each code point that was seen in decompositions - 111 * avoid to decompose ones that have not been used before 112 */ 113 static uint32_t haveSeenFlags[256]; 114 115 /* set of characters with NFD_QC=No (i.e., those with canonical decompositions) */ 116 static USet *nfdQCNoSet; 117 118 /* see addCombiningCP() for details */ 119 static uint32_t combiningCPs[2000]; 120 121 /* 122 * after processCombining() this contains for each code point in combiningCPs[] 123 * the runtime combining index 124 */ 125 static uint16_t combiningIndexes[2000]; 126 127 /* section limits for combiningCPs[], see addCombiningCP() */ 128 static uint16_t combineFwdTop=0, combineBothTop=0, combineBackTop=0; 129 130 /** 131 * Structure for a triple of code points, stored in combiningTriplesMem. 132 * The lead and trail code points combine into the the combined one, 133 * i.e., there is a canonical decomposition of combined-> <lead, trail>. 134 * 135 * Before processCombining() is called, leadIndex and trailIndex are 0. 136 * After processCombining(), they contain the indexes of the lead and trail 137 * code point in the combiningCPs[] array. 138 * They are then sorted by leadIndex, then trailIndex. 139 * They are not sorted by code points. 140 */ 141 typedef struct CombiningTriple { 142 uint16_t leadIndex, trailIndex; 143 uint32_t lead, trail, combined; 144 } CombiningTriple; 145 146 /* 15b in the combining index -> <=0x8000 uint16_t values in the combining table */ 147 static uint16_t combiningTable[0x8000]; 148 static uint16_t combiningTableTop=0; 149 150 #define _NORM_MAX_SET_SEARCH_TABLE_LENGTH 0x4000 151 static uint16_t canonStartSets[_NORM_MAX_CANON_SETS+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH 152 +10000]; /* +10000 for exclusion sets */ 153 static int32_t canonStartSetsTop=_NORM_SET_INDEX_TOP; 154 static int32_t canonSetsCount=0; 155 156 /* allocate and initialize a Norm unit */ 157 static Norm * 158 allocNorm() { 159 /* allocate Norm */ 160 Norm *p=(Norm *)utm_alloc(normMem); 161 /* 162 * The combiningIndex must not be initialized to 0 because 0 is the 163 * combiningIndex of the first forward-combining character. 164 */ 165 p->combiningIndex=0xffff; 166 return p; 167 } 168 169 extern void 170 init() { 171 uint16_t *p16; 172 173 normTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie)); 174 uprv_memset(normTrie, 0, sizeof(UNewTrie)); 175 norm32Trie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie)); 176 uprv_memset(norm32Trie, 0, sizeof(UNewTrie)); 177 fcdTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie)); 178 uprv_memset(fcdTrie, 0, sizeof(UNewTrie)); 179 auxTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie)); 180 uprv_memset(auxTrie, 0, sizeof(UNewTrie)); 181 182 /* initialize the two tries */ 183 if(NULL==utrie_open(normTrie, NULL, 30000, 0, 0, FALSE)) { 184 fprintf(stderr, "error: failed to initialize tries\n"); 185 exit(U_MEMORY_ALLOCATION_ERROR); 186 } 187 188 /* allocate Norm structures and reset the first one */ 189 normMem=utm_open("gennorm normalization structs", 20000, 20000, sizeof(Norm)); 190 norms=allocNorm(); 191 192 /* allocate UTF-32 string memory */ 193 utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 30000, 4); 194 195 /* reset all "have seen" flags */ 196 uprv_memset(haveSeenFlags, 0, sizeof(haveSeenFlags)); 197 198 /* open an empty set */ 199 nfdQCNoSet=uset_open(1, 0); 200 201 /* allocate extra data memory for UTF-16 decomposition strings and other values */ 202 extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, _NORM_EXTRA_INDEX_TOP, 2); 203 /* initialize the extraMem counter for the top of FNC strings */ 204 p16=(uint16_t *)utm_alloc(extraMem); 205 *p16=1; 206 207 /* allocate temporary memory for combining triples */ 208 combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, 0x4000, sizeof(CombiningTriple)); 209 210 /* set the minimum code points for no/maybe quick check values to the end of the BMP */ 211 indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=0xffff; 212 indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=0xffff; 213 indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=0xffff; 214 indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=0xffff; 215 216 /* preset the indexes portion of canonStartSets */ 217 uprv_memset(canonStartSets, 0, _NORM_SET_INDEX_TOP*2); 218 } 219 220 /* 221 * get or create a Norm unit; 222 * get or create the intermediate trie entries for it as well 223 */ 224 static Norm * 225 createNorm(uint32_t code) { 226 Norm *p; 227 uint32_t i; 228 229 i=utrie_get32(normTrie, (UChar32)code, NULL); 230 if(i!=0) { 231 p=norms+i; 232 } else { 233 /* allocate Norm */ 234 p=allocNorm(); 235 if(!utrie_set32(normTrie, (UChar32)code, (uint32_t)(p-norms))) { 236 fprintf(stderr, "error: too many normalization entries\n"); 237 exit(U_BUFFER_OVERFLOW_ERROR); 238 } 239 } 240 return p; 241 } 242 243 /* get an existing Norm unit */ 244 static Norm * 245 getNorm(uint32_t code) { 246 uint32_t i; 247 248 i=utrie_get32(normTrie, (UChar32)code, NULL); 249 if(i==0) { 250 return NULL; 251 } 252 return norms+i; 253 } 254 255 /* get the canonical combining class of a character */ 256 static uint8_t 257 getCCFromCP(uint32_t code) { 258 Norm *norm=getNorm(code); 259 if(norm==NULL) { 260 return 0; 261 } else { 262 return norm->udataCC; 263 } 264 } 265 266 /* 267 * enumerate all code points with their Norm structs and call a function for each 268 * return the number of code points with data 269 */ 270 static uint32_t 271 enumTrie(EnumTrieFn *fn, void *context) { 272 uint32_t count, i; 273 UChar32 code; 274 UBool isInBlockZero; 275 276 count=0; 277 for(code=0; code<=0x10ffff;) { 278 i=utrie_get32(normTrie, code, &isInBlockZero); 279 if(isInBlockZero) { 280 code+=UTRIE_DATA_BLOCK_LENGTH; 281 } else { 282 if(i!=0) { 283 fn(context, (uint32_t)code, norms+i); 284 ++count; 285 } 286 ++code; 287 } 288 } 289 return count; 290 } 291 292 static void 293 setHaveSeenString(const uint32_t *s, int32_t length) { 294 uint32_t c; 295 296 while(length>0) { 297 c=*s++; 298 haveSeenFlags[(c>>5)&0xff]|=(1<<(c&0x1f)); 299 --length; 300 } 301 } 302 303 #define HAVE_SEEN(c) (haveSeenFlags[((c)>>5)&0xff]&(1<<((c)&0x1f))) 304 305 /* handle combining data ---------------------------------------------------- */ 306 307 /* 308 * Insert an entry into combiningCPs[] for the new code point code with its flags. 309 * The flags indicate if code combines forward, backward, or both. 310 * 311 * combiningCPs[] contains three sections: 312 * 1. code points that combine forward 313 * 2. code points that combine forward and backward 314 * 3. code points that combine backward 315 * 316 * Search for code in the entire array. 317 * If it is found and already is in the right section (old flags==new flags) 318 * then we are done. 319 * If it is found but the flags are different, then remove it, 320 * union the old and new flags, and reinsert it into its correct section. 321 * If it is not found, then just insert it. 322 * 323 * Within each section, the code points are not sorted. 324 */ 325 static void 326 addCombiningCP(uint32_t code, uint8_t flags) { 327 uint32_t newEntry; 328 uint16_t i; 329 330 newEntry=code|((uint32_t)flags<<24); 331 332 /* search for this code point */ 333 for(i=0; i<combineBackTop; ++i) { 334 if(code==(combiningCPs[i]&0xffffff)) { 335 /* found it */ 336 if(newEntry==combiningCPs[i]) { 337 return; /* no change */ 338 } 339 340 /* combine the flags, remove the old entry from the old place, and insert the new one */ 341 newEntry|=combiningCPs[i]; 342 if(i!=--combineBackTop) { 343 uprv_memmove(combiningCPs+i, combiningCPs+i+1, (combineBackTop-i)*4); 344 } 345 if(i<combineBothTop) { 346 --combineBothTop; 347 } 348 if(i<combineFwdTop) { 349 --combineFwdTop; 350 } 351 break; 352 } 353 } 354 355 /* not found or modified, insert it */ 356 if(combineBackTop>=sizeof(combiningCPs)/4) { 357 fprintf(stderr, "error: gennorm combining code points - trying to use more than %ld units\n", 358 (long)(sizeof(combiningCPs)/4)); 359 exit(U_MEMORY_ALLOCATION_ERROR); 360 } 361 362 /* set i to the insertion point */ 363 flags=(uint8_t)(newEntry>>24); 364 if(flags==1) { 365 i=combineFwdTop++; 366 ++combineBothTop; 367 } else if(flags==3) { 368 i=combineBothTop++; 369 } else /* flags==2 */ { 370 i=combineBackTop; 371 } 372 373 /* move the following code points up one and insert newEntry at i */ 374 if(i<combineBackTop) { 375 uprv_memmove(combiningCPs+i+1, combiningCPs+i, (combineBackTop-i)*4); 376 } 377 combiningCPs[i]=newEntry; 378 379 /* finally increment the total counter */ 380 ++combineBackTop; 381 } 382 383 /** 384 * Find the index in combiningCPs[] where code point code is stored. 385 * @param code code point to look for 386 * @param isLead is code a forward combining code point? 387 * @return index in combiningCPs[] where code is stored 388 */ 389 static uint16_t 390 findCombiningCP(uint32_t code, UBool isLead) { 391 uint16_t i, limit; 392 393 if(isLead) { 394 i=0; 395 limit=combineBothTop; 396 } else { 397 i=combineFwdTop; 398 limit=combineBackTop; 399 } 400 401 /* search for this code point */ 402 for(; i<limit; ++i) { 403 if(code==(combiningCPs[i]&0xffffff)) { 404 /* found it */ 405 return i; 406 } 407 } 408 409 /* not found */ 410 return 0xffff; 411 } 412 413 static void 414 addCombiningTriple(uint32_t lead, uint32_t trail, uint32_t combined) { 415 CombiningTriple *triple; 416 417 if(DO_NOT_STORE(UGENNORM_STORE_COMPOSITION)) { 418 return; 419 } 420 421 /* 422 * set combiningFlags for the two code points 423 * do this after decomposition so that getNorm() above returns NULL 424 * if we do not have actual sub-decomposition data for the initial NFD here 425 */ 426 createNorm(lead)->combiningFlags|=1; /* combines forward */ 427 createNorm(trail)->combiningFlags|=2; /* combines backward */ 428 429 addCombiningCP(lead, 1); 430 addCombiningCP(trail, 2); 431 432 triple=(CombiningTriple *)utm_alloc(combiningTriplesMem); 433 triple->lead=lead; 434 triple->trail=trail; 435 triple->combined=combined; 436 } 437 438 static int 439 compareTriples(const void *l, const void *r) { 440 int diff; 441 diff=(int)((CombiningTriple *)l)->leadIndex- 442 (int)((CombiningTriple *)r)->leadIndex; 443 if(diff==0) { 444 diff=(int)((CombiningTriple *)l)->trailIndex- 445 (int)((CombiningTriple *)r)->trailIndex; 446 } 447 return diff; 448 } 449 450 static void 451 processCombining() { 452 CombiningTriple *triples; 453 uint16_t *p; 454 uint32_t combined; 455 uint16_t i, j, count, tableTop, finalIndex, combinesFwd; 456 457 triples=utm_getStart(combiningTriplesMem); 458 459 /* add lead and trail indexes to the triples for sorting */ 460 count=(uint16_t)utm_countItems(combiningTriplesMem); 461 for(i=0; i<count; ++i) { 462 /* findCombiningCP() must always find the code point */ 463 triples[i].leadIndex=findCombiningCP(triples[i].lead, TRUE); 464 triples[i].trailIndex=findCombiningCP(triples[i].trail, FALSE); 465 } 466 467 /* sort them by leadIndex, trailIndex */ 468 qsort(triples, count, sizeof(CombiningTriple), compareTriples); 469 470 /* calculate final combining indexes and store them in the Norm entries */ 471 tableTop=0; 472 j=0; /* triples counter */ 473 474 /* first, combining indexes of fwd/both characters are indexes into the combiningTable */ 475 for(i=0; i<combineBothTop; ++i) { 476 /* start a new table */ 477 478 /* assign combining index */ 479 createNorm(combiningCPs[i]&0xffffff)->combiningIndex=combiningIndexes[i]=tableTop; 480 481 /* calculate the length of the combining data for this lead code point in the combiningTable */ 482 while(j<count && i==triples[j].leadIndex) { 483 /* count 2 to 3 16-bit units per composition entry (back-index, code point) */ 484 combined=triples[j++].combined; 485 if(combined<=0x1fff) { 486 tableTop+=2; 487 } else { 488 tableTop+=3; 489 } 490 } 491 } 492 493 /* second, combining indexes of back-only characters are simply incremented from here to be unique */ 494 finalIndex=tableTop; 495 for(; i<combineBackTop; ++i) { 496 createNorm(combiningCPs[i]&0xffffff)->combiningIndex=combiningIndexes[i]=finalIndex++; 497 } 498 499 /* it must be finalIndex<=0x8000 because bit 15 is used in combiningTable as an end-for-this-lead marker */ 500 if(finalIndex>0x8000) { 501 fprintf(stderr, "error: gennorm combining table - trying to use %u units, more than the %ld units available\n", 502 tableTop, (long)(sizeof(combiningTable)/4)); 503 exit(U_MEMORY_ALLOCATION_ERROR); 504 } 505 506 combiningTableTop=tableTop; 507 508 /* store the combining data in the combiningTable, with the final indexes from above */ 509 p=combiningTable; 510 j=0; /* triples counter */ 511 512 /* 513 * this is essentially the same loop as above, but 514 * it writes the table data instead of calculating and setting the final indexes; 515 * it is necessary to have two passes so that all the final indexes are known before 516 * they are written into the table 517 */ 518 for(i=0; i<combineBothTop; ++i) { 519 /* start a new table */ 520 521 combined=0; /* avoid compiler warning */ 522 523 /* store the combining data for this lead code point in the combiningTable */ 524 while(j<count && i==triples[j].leadIndex) { 525 Norm *normPtr; 526 finalIndex=combiningIndexes[triples[j].trailIndex]; 527 combined=triples[j++].combined; 528 normPtr = getNorm(combined); 529 530 if (normPtr == NULL) { 531 fprintf(stderr, "error: processCombining did not get expected result. combined=%d\n", combined); 532 exit(U_INTERNAL_PROGRAM_ERROR); 533 } 534 535 /* is combined a starter? (i.e., cc==0 && combines forward) */ 536 combinesFwd=(uint16_t)((normPtr->combiningFlags&1)<<13); 537 538 *p++=finalIndex; 539 if(combined<=0x1fff) { 540 *p++=(uint16_t)(combinesFwd|combined); 541 } else if(combined<=0xffff) { 542 *p++=(uint16_t)(0x8000|combinesFwd); 543 *p++=(uint16_t)combined; 544 } else { 545 *p++=(uint16_t)(0xc000|combinesFwd|((combined-0x10000)>>10)); 546 *p++=(uint16_t)(0xdc00|(combined&0x3ff)); 547 } 548 } 549 550 /* set a marker on the last final trail index in this lead's table */ 551 if(combined<=0x1fff) { 552 *(p-2)|=0x8000; 553 } else { 554 *(p-3)|=0x8000; 555 } 556 } 557 558 /* post condition: tableTop==(p-combiningTable) */ 559 } 560 561 /* processing incoming normalization data ----------------------------------- */ 562 563 /* 564 * Decompose Hangul syllables algorithmically and fill a pseudo-Norm struct. 565 * c must be a Hangul syllable code point. 566 */ 567 static void 568 getHangulDecomposition(uint32_t c, Norm *pHangulNorm, uint32_t hangulBuffer[3]) { 569 /* Hangul syllable: decompose algorithmically */ 570 uint32_t c2; 571 uint8_t length; 572 573 uprv_memset(pHangulNorm, 0, sizeof(Norm)); 574 575 c-=HANGUL_BASE; 576 577 c2=c%JAMO_T_COUNT; 578 c/=JAMO_T_COUNT; 579 if(c2>0) { 580 hangulBuffer[2]=JAMO_T_BASE+c2; 581 length=3; 582 } else { 583 hangulBuffer[2]=0; 584 length=2; 585 } 586 587 hangulBuffer[1]=JAMO_V_BASE+c%JAMO_V_COUNT; 588 hangulBuffer[0]=JAMO_L_BASE+c/JAMO_V_COUNT; 589 590 pHangulNorm->nfd=hangulBuffer; 591 pHangulNorm->lenNFD=length; 592 if(DO_STORE(UGENNORM_STORE_COMPAT)) { 593 pHangulNorm->nfkd=hangulBuffer; 594 pHangulNorm->lenNFKD=length; 595 } 596 } 597 598 /* 599 * decompose the one decomposition further, may generate two decompositions 600 * apply all previous characters' decompositions to this one 601 */ 602 static void 603 decompStoreNewNF(uint32_t code, Norm *norm) { 604 uint32_t nfd[40], nfkd[40], hangulBuffer[3]; 605 Norm hangulNorm; 606 607 uint32_t *s32; 608 Norm *p; 609 uint32_t c; 610 int32_t i, length; 611 uint8_t lenNFD=0, lenNFKD=0; 612 UBool changedNFD=FALSE, changedNFKD=FALSE; 613 614 if((length=norm->lenNFD)!=0) { 615 /* always allocate the original string */ 616 changedNFD=TRUE; 617 s32=norm->nfd; 618 } else if((length=norm->lenNFKD)!=0) { 619 /* always allocate the original string */ 620 changedNFKD=TRUE; 621 s32=norm->nfkd; 622 } else { 623 /* no decomposition here, nothing to do */ 624 return; 625 } 626 627 /* decompose each code point */ 628 for(i=0; i<length; ++i) { 629 c=s32[i]; 630 p=getNorm(c); 631 if(p==NULL) { 632 if(HANGUL_BASE<=c && c<(HANGUL_BASE+HANGUL_COUNT)) { 633 getHangulDecomposition(c, &hangulNorm, hangulBuffer); 634 p=&hangulNorm; 635 } else { 636 /* no data, no decomposition */ 637 nfd[lenNFD++]=c; 638 nfkd[lenNFKD++]=c; 639 continue; 640 } 641 } 642 643 /* canonically decompose c */ 644 if(changedNFD) { 645 if(p->lenNFD!=0) { 646 uprv_memcpy(nfd+lenNFD, p->nfd, p->lenNFD*4); 647 lenNFD+=p->lenNFD; 648 } else { 649 nfd[lenNFD++]=c; 650 } 651 } 652 653 /* compatibility-decompose c */ 654 if(p->lenNFKD!=0) { 655 uprv_memcpy(nfkd+lenNFKD, p->nfkd, p->lenNFKD*4); 656 lenNFKD+=p->lenNFKD; 657 changedNFKD=TRUE; 658 } else if(p->lenNFD!=0) { 659 uprv_memcpy(nfkd+lenNFKD, p->nfd, p->lenNFD*4); 660 lenNFKD+=p->lenNFD; 661 /* 662 * not changedNFKD=TRUE; 663 * so that we do not store a new nfkd if there was no nfkd string before 664 * and we only see canonical decompositions 665 */ 666 } else { 667 nfkd[lenNFKD++]=c; 668 } 669 } 670 671 /* assume that norm->lenNFD==1 or ==2 */ 672 if(norm->lenNFD==2 && !(norm->combiningFlags&0x80)) { 673 addCombiningTriple(s32[0], s32[1], code); 674 } 675 676 if(changedNFD) { 677 if(lenNFD!=0) { 678 s32=utm_allocN(utf32Mem, lenNFD); 679 uprv_memcpy(s32, nfd, lenNFD*4); 680 } else { 681 s32=NULL; 682 } 683 norm->lenNFD=lenNFD; 684 norm->nfd=s32; 685 setHaveSeenString(nfd, lenNFD); 686 } 687 if(changedNFKD) { 688 if(lenNFKD!=0) { 689 s32=utm_allocN(utf32Mem, lenNFKD); 690 uprv_memcpy(s32, nfkd, lenNFKD*4); 691 } else { 692 s32=NULL; 693 } 694 norm->lenNFKD=lenNFKD; 695 norm->nfkd=s32; 696 setHaveSeenString(nfkd, lenNFKD); 697 } 698 } 699 700 typedef struct DecompSingle { 701 uint32_t c; 702 Norm *norm; 703 } DecompSingle; 704 705 /* 706 * apply this one character's decompositions (there is at least one!) to 707 * all previous characters' decompositions to decompose them further 708 */ 709 static void 710 decompWithSingleFn(void *context, uint32_t code, Norm *norm) { 711 uint32_t nfd[40], nfkd[40]; 712 uint32_t *s32; 713 DecompSingle *me=(DecompSingle *)context; 714 uint32_t c, myC; 715 int32_t i, length; 716 uint8_t lenNFD=0, lenNFKD=0, myLenNFD, myLenNFKD; 717 UBool changedNFD=FALSE, changedNFKD=FALSE; 718 719 /* get the new character's data */ 720 myC=me->c; 721 myLenNFD=me->norm->lenNFD; 722 myLenNFKD=me->norm->lenNFKD; 723 /* assume that myC has at least one decomposition */ 724 725 if((length=norm->lenNFD)!=0 && myLenNFD!=0) { 726 /* apply NFD(myC) to norm->nfd */ 727 s32=norm->nfd; 728 for(i=0; i<length; ++i) { 729 c=s32[i]; 730 if(c==myC) { 731 uprv_memcpy(nfd+lenNFD, me->norm->nfd, myLenNFD*4); 732 lenNFD+=myLenNFD; 733 changedNFD=TRUE; 734 } else { 735 nfd[lenNFD++]=c; 736 } 737 } 738 } 739 740 if((length=norm->lenNFKD)!=0) { 741 /* apply NFD(myC) and NFKD(myC) to norm->nfkd */ 742 s32=norm->nfkd; 743 for(i=0; i<length; ++i) { 744 c=s32[i]; 745 if(c==myC) { 746 if(myLenNFKD!=0) { 747 uprv_memcpy(nfkd+lenNFKD, me->norm->nfkd, myLenNFKD*4); 748 lenNFKD+=myLenNFKD; 749 } else /* assume myLenNFD!=0 */ { 750 uprv_memcpy(nfkd+lenNFKD, me->norm->nfd, myLenNFD*4); 751 lenNFKD+=myLenNFD; 752 } 753 changedNFKD=TRUE; 754 } else { 755 nfkd[lenNFKD++]=c; 756 } 757 } 758 } else if((length=norm->lenNFD)!=0 && myLenNFKD!=0) { 759 /* apply NFKD(myC) to norm->nfd, forming a new norm->nfkd */ 760 s32=norm->nfd; 761 for(i=0; i<length; ++i) { 762 c=s32[i]; 763 if(c==myC) { 764 uprv_memcpy(nfkd+lenNFKD, me->norm->nfkd, myLenNFKD*4); 765 lenNFKD+=myLenNFKD; 766 changedNFKD=TRUE; 767 } else { 768 nfkd[lenNFKD++]=c; 769 } 770 } 771 } 772 773 /* set the new decompositions, forget the old ones */ 774 if(changedNFD) { 775 if(lenNFD!=0) { 776 if(lenNFD>norm->lenNFD) { 777 s32=utm_allocN(utf32Mem, lenNFD); 778 } else { 779 s32=norm->nfd; 780 } 781 uprv_memcpy(s32, nfd, lenNFD*4); 782 } else { 783 s32=NULL; 784 } 785 norm->lenNFD=lenNFD; 786 norm->nfd=s32; 787 } 788 if(changedNFKD) { 789 if(lenNFKD!=0) { 790 if(lenNFKD>norm->lenNFKD) { 791 s32=utm_allocN(utf32Mem, lenNFKD); 792 } else { 793 s32=norm->nfkd; 794 } 795 uprv_memcpy(s32, nfkd, lenNFKD*4); 796 } else { 797 s32=NULL; 798 } 799 norm->lenNFKD=lenNFKD; 800 norm->nfkd=s32; 801 } 802 } 803 804 /* 805 * process the data for one code point listed in UnicodeData; 806 * UnicodeData itself never maps a code point to both NFD and NFKD 807 */ 808 extern void 809 storeNorm(uint32_t code, Norm *norm) { 810 DecompSingle decompSingle; 811 Norm *p; 812 813 if(DO_NOT_STORE(UGENNORM_STORE_COMPAT)) { 814 /* ignore compatibility decomposition */ 815 norm->lenNFKD=0; 816 } 817 818 /* copy existing derived normalization properties */ 819 p=createNorm(code); 820 norm->qcFlags=p->qcFlags; 821 norm->combiningFlags=p->combiningFlags; 822 norm->fncIndex=p->fncIndex; 823 824 /* process the decomposition if there is one here */ 825 if((norm->lenNFD|norm->lenNFKD)!=0) { 826 /* decompose this one decomposition further, may generate two decompositions */ 827 decompStoreNewNF(code, norm); 828 829 /* has this code point been used in previous decompositions? */ 830 if(HAVE_SEEN(code)) { 831 /* use this decomposition to decompose other decompositions further */ 832 decompSingle.c=code; 833 decompSingle.norm=norm; 834 enumTrie(decompWithSingleFn, &decompSingle); 835 } 836 } 837 838 /* store the data */ 839 uprv_memcpy(p, norm, sizeof(Norm)); 840 } 841 842 extern void 843 setQCFlags(uint32_t code, uint8_t qcFlags) { 844 if(DO_NOT_STORE(UGENNORM_STORE_COMPAT)) { 845 /* ignore compatibility decomposition: unset the KC/KD flags */ 846 qcFlags&=~(_NORM_QC_NFKC|_NORM_QC_NFKD); 847 848 /* set the KC/KD flags to the same values as the C/D flags */ 849 qcFlags|=qcFlags<<1; 850 } 851 if(DO_NOT_STORE(UGENNORM_STORE_COMPOSITION)) { 852 /* ignore composition data: unset the C/KC flags */ 853 qcFlags&=~(_NORM_QC_NFC|_NORM_QC_NFKC); 854 855 /* set the C/KC flags to the same values as the D/KD flags */ 856 qcFlags|=qcFlags>>2; 857 } 858 859 createNorm(code)->qcFlags|=qcFlags; 860 861 /* adjust the minimum code point for quick check no/maybe */ 862 if(code<0xffff) { 863 if((qcFlags&_NORM_QC_NFC) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]) { 864 indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=(uint16_t)code; 865 } 866 if((qcFlags&_NORM_QC_NFKC) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]) { 867 indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=(uint16_t)code; 868 } 869 if((qcFlags&_NORM_QC_NFD) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]) { 870 indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=(uint16_t)code; 871 } 872 if((qcFlags&_NORM_QC_NFKD) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]) { 873 indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=(uint16_t)code; 874 } 875 } 876 877 if(qcFlags&_NORM_QC_NFD) { 878 uset_add(nfdQCNoSet, (UChar32)code); 879 } 880 } 881 882 extern void 883 setCompositionExclusion(uint32_t code) { 884 if(DO_STORE(UGENNORM_STORE_COMPOSITION)) { 885 createNorm(code)->combiningFlags|=0x80; 886 } 887 } 888 889 static void 890 setHangulJamoSpecials() { 891 Norm *norm; 892 uint32_t c, hangul; 893 894 /* 895 * Hangul syllables are algorithmically decomposed into Jamos, 896 * and Jamos are algorithmically composed into Hangul syllables. 897 * The quick check flags are parsed, except for Hangul. 898 */ 899 900 /* set Jamo L specials */ 901 hangul=0xac00; 902 for(c=0x1100; c<=0x1112; ++c) { 903 norm=createNorm(c); 904 norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_L; 905 if(DO_STORE(UGENNORM_STORE_COMPOSITION)) { 906 norm->combiningFlags=1; 907 } 908 909 /* for each Jamo L create a set with its associated Hangul block */ 910 norm->canonStart=uset_open(hangul, hangul+21*28-1); 911 hangul+=21*28; 912 } 913 914 /* set Jamo V specials */ 915 for(c=0x1161; c<=0x1175; ++c) { 916 norm=createNorm(c); 917 norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_V; 918 if(DO_STORE(UGENNORM_STORE_COMPOSITION)) { 919 norm->combiningFlags=2; 920 } 921 norm->unsafeStart=TRUE; 922 } 923 924 /* set Jamo T specials */ 925 for(c=0x11a8; c<=0x11c2; ++c) { 926 norm=createNorm(c); 927 norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_T; 928 if(DO_STORE(UGENNORM_STORE_COMPOSITION)) { 929 norm->combiningFlags=2; 930 } 931 norm->unsafeStart=TRUE; 932 } 933 934 /* set Hangul specials, precompacted */ 935 norm=allocNorm(); 936 norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL; 937 if(DO_STORE(UGENNORM_STORE_COMPAT)) { 938 norm->qcFlags=_NORM_QC_NFD|_NORM_QC_NFKD; 939 } else { 940 norm->qcFlags=_NORM_QC_NFD; 941 } 942 943 if(!utrie_setRange32(normTrie, 0xac00, 0xd7a4, (uint32_t)(norm-norms), TRUE)) { 944 fprintf(stderr, "error: too many normalization entries (setting Hangul)\n"); 945 exit(U_BUFFER_OVERFLOW_ERROR); 946 } 947 } 948 949 /* 950 * set FC-NFKC-Closure string 951 * s contains the closure string; s[0]==length, s[1..length] is the actual string 952 * may modify s[0] 953 */ 954 U_CFUNC void 955 setFNC(uint32_t c, UChar *s) { 956 uint16_t *p; 957 int32_t length, i, count; 958 UChar first; 959 960 if( DO_NOT_STORE(UGENNORM_STORE_COMPAT) || 961 DO_NOT_STORE(UGENNORM_STORE_COMPOSITION) || 962 DO_NOT_STORE(UGENNORM_STORE_AUX) 963 ) { 964 return; 965 } 966 967 count=utm_countItems(extraMem); 968 length=s[0]; 969 first=s[1]; 970 971 /* try to overlay single-unit strings with existing ones */ 972 if(length==1 && first<0xff00) { 973 p=utm_getStart(extraMem); 974 for(i=1; i<count; ++i) { 975 if(first==p[i]) { 976 break; 977 } 978 } 979 } else { 980 i=count; 981 } 982 983 /* append the new string if it cannot be overlayed with an old one */ 984 if(i==count) { 985 if(count>_NORM_AUX_MAX_FNC) { 986 fprintf(stderr, "gennorm error: too many FNC strings\n"); 987 exit(U_INDEX_OUTOFBOUNDS_ERROR); 988 } 989 990 /* prepend 0xffxx with xx==length */ 991 s[0]=(uint16_t)(0xff00+length); 992 ++length; 993 p=(uint16_t *)utm_allocN(extraMem, length); 994 uprv_memcpy(p, s, length*2); 995 996 /* update the top index in extraMem[0] */ 997 count+=length; 998 ((uint16_t *)utm_getStart(extraMem))[0]=(uint16_t)count; 999 } 1000 1001 /* store the index to the string */ 1002 createNorm(c)->fncIndex=i; 1003 } 1004 1005 /* build runtime structures ------------------------------------------------- */ 1006 1007 /* canonically reorder a UTF-32 string; return { leadCC, trailCC } */ 1008 static uint16_t 1009 reorderString(uint32_t *s, int32_t length) { 1010 uint8_t ccs[40]; 1011 uint32_t c; 1012 int32_t i, j; 1013 uint8_t cc, prevCC; 1014 1015 if(length<=0) { 1016 return 0; 1017 } 1018 1019 for(i=0; i<length; ++i) { 1020 /* get the i-th code point and its combining class */ 1021 c=s[i]; 1022 cc=getCCFromCP(c); 1023 if(cc!=0 && i!=0) { 1024 /* it is a combining mark, see if it needs to be moved back */ 1025 j=i; 1026 do { 1027 prevCC=ccs[j-1]; 1028 if(prevCC<=cc) { 1029 break; /* found the right place */ 1030 } 1031 /* move the previous code point here and go back */ 1032 s[j]=s[j-1]; 1033 ccs[j]=prevCC; 1034 } while(--j!=0); 1035 s[j]=c; 1036 ccs[j]=cc; 1037 } else { 1038 /* just store the combining class */ 1039 ccs[i]=cc; 1040 } 1041 } 1042 1043 return (uint16_t)(((uint16_t)ccs[0]<<8)|ccs[length-1]); 1044 } 1045 1046 #if 0 1047 static UBool combineAndQC[64]={ 0 }; 1048 #endif 1049 1050 /* 1051 * canonically reorder the up to two decompositions 1052 * and store the leading and trailing combining classes accordingly 1053 * 1054 * also process canonical decompositions for canonical closure 1055 */ 1056 static void 1057 postParseFn(void *context, uint32_t code, Norm *norm) { 1058 int32_t length; 1059 1060 /* canonically order the NFD */ 1061 length=norm->lenNFD; 1062 if(length>0) { 1063 norm->canonBothCCs=reorderString(norm->nfd, length); 1064 } 1065 1066 /* canonically reorder the NFKD */ 1067 length=norm->lenNFKD; 1068 if(length>0) { 1069 norm->compatBothCCs=reorderString(norm->nfkd, length); 1070 } 1071 1072 /* verify that code has a decomposition if and only if the quick check flags say "no" on NF(K)D */ 1073 if((norm->lenNFD!=0) != ((norm->qcFlags&_NORM_QC_NFD)!=0)) { 1074 fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->qcFlags); 1075 } 1076 if(((norm->lenNFD|norm->lenNFKD)!=0) != ((norm->qcFlags&(_NORM_QC_NFD|_NORM_QC_NFKD))!=0)) { 1077 fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] NFKD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->lenNFKD, norm->qcFlags); 1078 } 1079 1080 /* see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */ 1081 #if 0 1082 combineAndQC[(norm->qcFlags&0x33)|((norm->combiningFlags&3)<<2)]=1; 1083 #endif 1084 1085 if(norm->combiningFlags&1) { 1086 if(norm->udataCC!=0) { 1087 /* illegal - data-derivable composition exclusion */ 1088 fprintf(stderr, "gennorm warning: U+%04lx combines forward but udataCC==%u\n", (long)code, norm->udataCC); 1089 } 1090 } 1091 if(norm->combiningFlags&2) { 1092 if((norm->qcFlags&0x11)==0) { 1093 fprintf(stderr, "gennorm warning: U+%04lx combines backward but qcNF?C==0\n", (long)code); 1094 } 1095 #if 0 1096 /* occurs sometimes, this one is ok (therefore #if 0) - still here for documentation */ 1097 if(norm->udataCC==0) { 1098 printf("U+%04lx combines backward but udataCC==0\n", (long)code); 1099 } 1100 #endif 1101 } 1102 if((norm->combiningFlags&3)==3 && beVerbose) { 1103 printf("U+%04lx combines both ways\n", (long)code); 1104 } 1105 1106 /* 1107 * process canonical decompositions for canonical closure 1108 * 1109 * in each canonical decomposition: 1110 * add the current character (code) to the set of canonical starters of its norm->nfd[0] 1111 * set the "unsafe starter" flag for each norm->nfd[1..] 1112 */ 1113 length=norm->lenNFD; 1114 if(length>0) { 1115 Norm *otherNorm; 1116 UChar32 c; 1117 int32_t i; 1118 1119 /* nfd[0].canonStart.add(code) */ 1120 c=norm->nfd[0]; 1121 otherNorm=createNorm(c); 1122 if(otherNorm->canonStart==NULL) { 1123 otherNorm->canonStart=uset_open(code, code); 1124 if(otherNorm->canonStart==NULL) { 1125 fprintf(stderr, "gennorm error: out of memory in uset_open()\n"); 1126 exit(U_MEMORY_ALLOCATION_ERROR); 1127 } 1128 } else { 1129 uset_add(otherNorm->canonStart, code); 1130 if(!uset_contains(otherNorm->canonStart, code)) { 1131 fprintf(stderr, "gennorm error: uset_add(setOf(U+%4x), U+%4x)\n", (int)c, (int)code); 1132 exit(U_INTERNAL_PROGRAM_ERROR); 1133 } 1134 } 1135 1136 /* for(i=1..length-1) nfd[i].unsafeStart=TRUE */ 1137 for(i=1; i<length; ++i) { 1138 createNorm(norm->nfd[i])->unsafeStart=TRUE; 1139 } 1140 } 1141 } 1142 1143 static uint32_t 1144 make32BitNorm(Norm *norm) { 1145 UChar extra[100]; 1146 const Norm *other; 1147 uint32_t word; 1148 int32_t i, length, beforeZero=0, count, start; 1149 1150 /* 1151 * Check for assumptions: 1152 * 1153 * Test that if a "true starter" (cc==0 && NF*C_YES) decomposes, 1154 * then the decomposition also begins with a true starter. 1155 */ 1156 if(norm->udataCC==0) { 1157 /* this is a starter */ 1158 if((norm->qcFlags&_NORM_QC_NFC)==0 && norm->lenNFD>0) { 1159 /* a "true" NFC starter with a canonical decomposition */ 1160 if( norm->canonBothCCs>=0x100 || /* lead cc!=0 or */ 1161 ((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFC)!=0) /* nfd[0] not NFC_YES */ 1162 ) { 1163 fprintf(stderr, 1164 "error: true NFC starter canonical decomposition[%u] does not begin\n" 1165 " with a true NFC starter: U+%04lx U+%04lx%s\n", 1166 norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1], 1167 norm->lenNFD<=2 ? "" : " ..."); 1168 exit(U_INVALID_TABLE_FILE); 1169 } 1170 } 1171 1172 if((norm->qcFlags&_NORM_QC_NFKC)==0) { 1173 if(norm->lenNFKD>0) { 1174 /* a "true" NFKC starter with a compatibility decomposition */ 1175 if( norm->compatBothCCs>=0x100 || /* lead cc!=0 or */ 1176 ((other=getNorm(norm->nfkd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfkd[0] not NFKC_YES */ 1177 ) { 1178 fprintf(stderr, 1179 "error: true NFKC starter compatibility decomposition[%u] does not begin\n" 1180 " with a true NFKC starter: U+%04lx U+%04lx%s\n", 1181 norm->lenNFKD, (long)norm->nfkd[0], (long)norm->nfkd[1], 1182 norm->lenNFKD<=2 ? "" : " ..."); 1183 exit(U_INVALID_TABLE_FILE); 1184 } 1185 } else if(norm->lenNFD>0) { 1186 /* a "true" NFKC starter with only a canonical decomposition */ 1187 if( norm->canonBothCCs>=0x100 || /* lead cc!=0 or */ 1188 ((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfd[0] not NFKC_YES */ 1189 ) { 1190 fprintf(stderr, 1191 "error: true NFKC starter canonical decomposition[%u] does not begin\n" 1192 " with a true NFKC starter: U+%04lx U+%04lx%s\n", 1193 norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1], 1194 norm->lenNFD<=2 ? "" : " ..."); 1195 exit(U_INVALID_TABLE_FILE); 1196 } 1197 } 1198 } 1199 } 1200 1201 /* reset the 32-bit word and set the quick check flags */ 1202 word=norm->qcFlags; 1203 1204 /* set the UnicodeData combining class */ 1205 word|=(uint32_t)norm->udataCC<<_NORM_CC_SHIFT; 1206 1207 /* set the combining flag and index */ 1208 if(norm->combiningFlags&3) { 1209 word|=(uint32_t)(norm->combiningFlags&3)<<6; 1210 } 1211 1212 /* set the combining index value into the extra data */ 1213 /* 0xffff: no combining index; 0..0x7fff: combining index */ 1214 if(norm->combiningIndex!=0xffff) { 1215 extra[0]=norm->combiningIndex; 1216 beforeZero=1; 1217 } 1218 1219 count=beforeZero; 1220 1221 /* write the decompositions */ 1222 if((norm->lenNFD|norm->lenNFKD)!=0) { 1223 extra[count++]=0; /* set the pieces when available, into extra[beforeZero] */ 1224 1225 length=norm->lenNFD; 1226 if(length>0) { 1227 if(norm->canonBothCCs!=0) { 1228 extra[beforeZero]|=0x80; 1229 extra[count++]=norm->canonBothCCs; 1230 } 1231 start=count; 1232 for(i=0; i<length; ++i) { 1233 UTF_APPEND_CHAR_UNSAFE(extra, count, norm->nfd[i]); 1234 } 1235 extra[beforeZero]|=(UChar)(count-start); /* set the decomp length as the number of UTF-16 code units */ 1236 } 1237 1238 length=norm->lenNFKD; 1239 if(length>0) { 1240 if(norm->compatBothCCs!=0) { 1241 extra[beforeZero]|=0x8000; 1242 extra[count++]=norm->compatBothCCs; 1243 } 1244 start=count; 1245 for(i=0; i<length; ++i) { 1246 UTF_APPEND_CHAR_UNSAFE(extra, count, norm->nfkd[i]); 1247 } 1248 extra[beforeZero]|=(UChar)((count-start)<<8); /* set the decomp length as the number of UTF-16 code units */ 1249 } 1250 } 1251 1252 /* allocate and copy the extra data */ 1253 if(count!=0) { 1254 UChar *p; 1255 1256 if(norm->specialTag!=0) { 1257 fprintf(stderr, "error: gennorm - illegal to have both extra data and a special tag (0x%x)\n", norm->specialTag); 1258 exit(U_ILLEGAL_ARGUMENT_ERROR); 1259 } 1260 1261 p=(UChar *)utm_allocN(extraMem, count); 1262 uprv_memcpy(p, extra, count*2); 1263 1264 /* set the extra index, offset by beforeZero */ 1265 word|=(uint32_t)(beforeZero+(p-(UChar *)utm_getStart(extraMem)))<<_NORM_EXTRA_SHIFT; 1266 } else if(norm->specialTag!=0) { 1267 /* set a special tag instead of an extra index */ 1268 word|=(uint32_t)norm->specialTag<<_NORM_EXTRA_SHIFT; 1269 } 1270 1271 return word; 1272 } 1273 1274 /* turn all Norm structs into corresponding 32-bit norm values */ 1275 static void 1276 makeAll32() { 1277 uint32_t *pNormData; 1278 uint32_t n; 1279 int32_t i, normLength, count; 1280 1281 count=(int32_t)utm_countItems(normMem); 1282 for(i=0; i<count; ++i) { 1283 norms[i].value32=make32BitNorm(norms+i); 1284 } 1285 1286 pNormData=utrie_getData(norm32Trie, &normLength); 1287 1288 count=0; /* count is now just used for debugging */ 1289 for(i=0; i<normLength; ++i) { 1290 n=pNormData[i]; 1291 if(0!=(pNormData[i]=norms[n].value32)) { 1292 ++count; 1293 } 1294 } 1295 } 1296 1297 /* 1298 * extract all Norm.canonBothCCs into the FCD table 1299 * set 32-bit values to use the common fold and compact functions 1300 */ 1301 static void 1302 makeFCD() { 1303 uint32_t *pFCDData; 1304 uint32_t n; 1305 int32_t i, count, fcdLength; 1306 uint16_t bothCCs; 1307 1308 count=utm_countItems(normMem); 1309 for(i=0; i<count; ++i) { 1310 bothCCs=norms[i].canonBothCCs; 1311 if(bothCCs==0) { 1312 /* if there are no decomposition cc's then use the udataCC twice */ 1313 bothCCs=norms[i].udataCC; 1314 bothCCs|=bothCCs<<8; 1315 } 1316 norms[i].value32=bothCCs; 1317 } 1318 1319 pFCDData=utrie_getData(fcdTrie, &fcdLength); 1320 1321 for(i=0; i<fcdLength; ++i) { 1322 n=pFCDData[i]; 1323 pFCDData[i]=norms[n].value32; 1324 } 1325 } 1326 1327 /** 1328 * If the given set contains exactly one character, then return it. 1329 * Otherwise return -1. 1330 */ 1331 static int32_t 1332 usetContainsOne(const USet* set) { 1333 if(uset_getItemCount(set)==1) { 1334 /* there is a single item (a single range) */ 1335 UChar32 start, end; 1336 UErrorCode ec=U_ZERO_ERROR; 1337 int32_t len=uset_getItem(set, 0, &start, &end, NULL, 0, &ec); 1338 if (len==0 && start==end) { /* a range (len==0) with a single code point */ 1339 return start; 1340 } 1341 } 1342 return -1; 1343 } 1344 1345 static void 1346 makeCanonSetFn(void *context, uint32_t code, Norm *norm) { 1347 if(norm->canonStart!=NULL && !uset_isEmpty(norm->canonStart)) { 1348 uint16_t *table; 1349 int32_t c, tableLength; 1350 UErrorCode errorCode=U_ZERO_ERROR; 1351 1352 /* does the set contain exactly one code point? */ 1353 c=usetContainsOne(norm->canonStart); 1354 1355 /* add an entry to the BMP or supplementary search table */ 1356 if(code<=0xffff) { 1357 table=canonStartSets+_NORM_MAX_CANON_SETS; 1358 tableLength=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]; 1359 1360 table[tableLength++]=(uint16_t)code; 1361 1362 if(c>=0 && c<=0xffff && (c&_NORM_CANON_SET_BMP_MASK)!=_NORM_CANON_SET_BMP_IS_INDEX) { 1363 /* single-code point BMP result for BMP code point */ 1364 table[tableLength++]=(uint16_t)c; 1365 } else { 1366 table[tableLength++]=(uint16_t)(_NORM_CANON_SET_BMP_IS_INDEX|canonStartSetsTop); 1367 c=-1; 1368 } 1369 canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]=(uint16_t)tableLength; 1370 } else { 1371 table=canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH; 1372 tableLength=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]; 1373 1374 table[tableLength++]=(uint16_t)(code>>16); 1375 table[tableLength++]=(uint16_t)code; 1376 1377 if(c>=0) { 1378 /* single-code point result for supplementary code point */ 1379 table[tableLength-2]|=(uint16_t)(0x8000|((c>>8)&0x1f00)); 1380 table[tableLength++]=(uint16_t)c; 1381 } else { 1382 table[tableLength++]=(uint16_t)canonStartSetsTop; 1383 } 1384 canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]=(uint16_t)tableLength; 1385 } 1386 1387 if(c<0) { 1388 /* write a USerializedSet */ 1389 ++canonSetsCount; 1390 canonStartSetsTop+= 1391 uset_serialize(norm->canonStart, 1392 canonStartSets+canonStartSetsTop, 1393 _NORM_MAX_CANON_SETS-canonStartSetsTop, 1394 &errorCode); 1395 } 1396 canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]=(uint16_t)canonStartSetsTop; 1397 1398 if(U_FAILURE(errorCode)) { 1399 fprintf(stderr, "gennorm error: uset_serialize()->%s (canonStartSetsTop=%d)\n", u_errorName(errorCode), (int)canonStartSetsTop); 1400 exit(errorCode); 1401 } 1402 if(tableLength>_NORM_MAX_SET_SEARCH_TABLE_LENGTH) { 1403 fprintf(stderr, "gennorm error: search table for canonical starter sets too long\n"); 1404 exit(U_INDEX_OUTOFBOUNDS_ERROR); 1405 } 1406 } 1407 } 1408 1409 /* for getSkippableFlags ---------------------------------------------------- */ 1410 1411 /* combine the lead and trail code points; return <0 if they do not combine */ 1412 static int32_t 1413 combine(uint32_t lead, uint32_t trail) { 1414 CombiningTriple *triples; 1415 uint32_t i, count; 1416 1417 /* search for all triples with c as lead code point */ 1418 triples=utm_getStart(combiningTriplesMem); 1419 count=utm_countItems(combiningTriplesMem); 1420 1421 /* triples are not sorted by code point but for each lead CP there is one contiguous block */ 1422 for(i=0; i<count && lead!=triples[i].lead; ++i) {} 1423 1424 /* check each triple for this code point */ 1425 for(; i<count && lead==triples[i].lead; ++i) { 1426 if(trail==triples[i].trail) { 1427 return (int32_t)triples[i].combined; 1428 } 1429 } 1430 1431 return -1; 1432 } 1433 1434 /* 1435 * Starting from the canonical decomposition s[0..length[ of a single code point, 1436 * is the code point c consumed in an NFC/FCC recomposition? 1437 * 1438 * No need to handle discontiguous composition because that would not consume some 1439 * intermediate character, so would not compose back to the original character. 1440 * See comments in canChangeWithFollowing(). 1441 * 1442 * No need to compose beyond where c canonically orders because if it is consumed 1443 * then the result differs from the original anyway. 1444 * 1445 * Possible optimization: 1446 * - Verify that there are no cases of the same combining mark stacking twice. 1447 * - return FALSE right away if c inserts after a copy of itself 1448 * without attempting to recompose; will happen because each mark in 1449 * the decomposition will be enumerated and passed in as c. 1450 * More complicated and fragile though than it is already. 1451 * 1452 * markus 2002nov04 1453 */ 1454 static UBool 1455 doesComposeConsume(const uint32_t *s, int32_t length, uint32_t c, uint8_t cc) { 1456 int32_t starter, i; 1457 1458 /* ignore trailing characters where cc<prevCC */ 1459 while(length>1 && cc<getCCFromCP(s[length-1])) { 1460 --length; 1461 } 1462 1463 /* start consuming/combining from the beginning */ 1464 starter=(int32_t)s[0]; 1465 for(i=1; i<length; ++i) { 1466 starter=combine((uint32_t)starter, s[i]); 1467 if(starter<0) { 1468 fprintf(stderr, "error: unable to consume normal decomposition in doesComposeConsume(<%04x, %04x, ...>[%d], U+%04x, %u)\n", 1469 (int)s[0], (int)s[1], (int)length, (int)c, cc); 1470 exit(U_INTERNAL_PROGRAM_ERROR); 1471 } 1472 } 1473 1474 /* try to combine/consume c, return TRUE if it is consumed */ 1475 return combine((uint32_t)starter, c)>=0; 1476 } 1477 1478 /* does the starter s[0] combine forward with another char that is below trailCC? */ 1479 static UBool 1480 canChangeWithFollowing(const uint32_t *s, int32_t length, uint8_t trailCC) { 1481 if(trailCC<=1) { 1482 /* no character will combine ahead of the trailing char of the decomposition */ 1483 return FALSE; 1484 } 1485 1486 /* 1487 * We are only checking skippable condition (f). 1488 * Therefore, the original character does not have quick check flag NFC_NO (c), 1489 * i.e., the decomposition recomposes completely back into the original code point. 1490 * So s[0] must be a true starter with cc==0 and 1491 * combining with following code points. 1492 * 1493 * Similarly, length==1 is not possible because that would be a singleton 1494 * decomposition which is marked with NFC_NO and does not pass (c). 1495 * 1496 * Only a character with cc<trailCC can change the composition. 1497 * Reason: A char with cc>=trailCC would order after decomposition s[], 1498 * composition would consume all of the decomposition, and here we know that 1499 * the original char passed check d), i.e., it does not combine forward, 1500 * therefore does not combine with anything after the decomposition is consumed. 1501 * 1502 * Now see if there is a character that 1503 * 1. combines backward 1504 * 2. has cc<trailCC 1505 * 3. is consumed in recomposition 1506 * 1507 * length==2 is simple: 1508 * 1509 * Characters that fulfill these conditions are exactly the ones that combine directly 1510 * with the starter c==s[0] because there is no intervening character after 1511 * reordering. 1512 * We can just enumerate all chars with which c combines (they all pass 1. and 3.) 1513 * and see if one has cc<trailCC (passes 2.). 1514 * 1515 * length>2 is a little harder: 1516 * 1517 * Since we will get different starters during recomposition, we need to 1518 * enumerate each backward-combining character (1.) 1519 * with cc<trailCC (2.) and 1520 * see if it gets consumed in recomposition. (3.) 1521 * No need to enumerate both-ways combining characters because they must have cc==0. 1522 */ 1523 if(length==2) { 1524 /* enumerate all chars that combine with this one and check their cc */ 1525 CombiningTriple *triples; 1526 uint32_t c, i, count; 1527 uint8_t cc; 1528 1529 /* search for all triples with c as lead code point */ 1530 triples=utm_getStart(combiningTriplesMem); 1531 count=utm_countItems(combiningTriplesMem); 1532 c=s[0]; 1533 1534 /* triples are not sorted by code point but for each lead CP there is one contiguous block */ 1535 for(i=0; i<count && c!=triples[i].lead; ++i) {} 1536 1537 /* check each triple for this code point */ 1538 for(; i<count && c==triples[i].lead; ++i) { 1539 cc=getCCFromCP(triples[i].trail); 1540 if(cc>0 && cc<trailCC) { 1541 /* this trail code point combines with c and has cc<trailCC */ 1542 return TRUE; 1543 } 1544 } 1545 } else { 1546 /* enumerate all chars that combine backward */ 1547 uint32_t c2; 1548 uint16_t i; 1549 uint8_t cc; 1550 1551 for(i=combineBothTop; i<combineBackTop; ++i) { 1552 c2=combiningCPs[i]&0xffffff; 1553 cc=getCCFromCP(c2); 1554 /* pass in length-1 because we already know that c2 will insert before the last character with trailCC */ 1555 if(cc>0 && cc<trailCC && doesComposeConsume(s, length-1, c2, cc)) { 1556 return TRUE; 1557 } 1558 } 1559 } 1560 1561 /* this decomposition is not modified by any appended character */ 1562 return FALSE; 1563 } 1564 1565 /* see unormimp.h for details on NF*C Skippable flags */ 1566 static uint32_t 1567 getSkippableFlags(const Norm *norm) { 1568 /* ignore NF*D skippable properties because they are covered by norm32, test at runtime */ 1569 1570 /* ignore Hangul, test those at runtime (LV Hangul are not skippable) */ 1571 if(norm->specialTag==_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL) { 1572 return 0; 1573 } 1574 1575 /* ### TODO check other data generation functions whether they should & do ignore Hangul/Jamo specials */ 1576 1577 /* 1578 * Note: 1579 * This function returns a non-zero flag only if (a)..(e) indicate skippable but (f) does not. 1580 * 1581 * This means that (a)..(e) must always be derived from the runtime norm32 value, 1582 * and (f) be checked from the auxTrie if the character is skippable per (a)..(e), 1583 * the form is NF*C and there is a canonical decomposition (NFD_NO). 1584 * 1585 * (a) unassigned code points get "not skippable"==false because they 1586 * don't have a Norm struct so they won't get here 1587 */ 1588 1589 /* (b) not skippable if cc!=0 */ 1590 if(norm->udataCC!=0) { 1591 return 0; /* non-zero flag for (f) only */ 1592 } 1593 1594 /* 1595 * not NFC_Skippable if 1596 * (c) quick check flag == NO or 1597 * (d) combines forward or 1598 * (e) combines back or 1599 * (f) can change if another character is added 1600 * 1601 * for (f): 1602 * For NF*C: Get corresponding decomposition, get its last starter (cc==0), 1603 * check its composition list, 1604 * see if any of the second code points in the list 1605 * has cc less than the trailCC of the decomposition. 1606 * 1607 * For FCC: Test at runtime if the decomposition has a trailCC>1 1608 * -> there are characters with cc==1, they would order before the trail char 1609 * and prevent contiguous combination with the trail char. 1610 */ 1611 if( (norm->qcFlags&(_NORM_QC_NFC&_NORM_QC_ANY_NO))!=0 || 1612 (norm->combiningFlags&3)!=0) { 1613 return 0; /* non-zero flag for (f) only */ 1614 } 1615 if(norm->lenNFD!=0 && canChangeWithFollowing(norm->nfd, norm->lenNFD, (uint8_t)norm->canonBothCCs)) { 1616 return _NORM_AUX_NFC_SKIP_F_MASK; 1617 } 1618 1619 return 0; /* skippable */ 1620 } 1621 1622 static void 1623 makeAux() { 1624 Norm *norm; 1625 uint32_t *pData; 1626 int32_t i, length; 1627 1628 pData=utrie_getData(auxTrie, &length); 1629 1630 for(i=0; i<length; ++i) { 1631 norm=norms+pData[i]; 1632 /* 1633 * 16-bit auxiliary normalization properties 1634 * see unormimp.h 1635 */ 1636 pData[i]= 1637 ((uint32_t)(norm->combiningFlags&0x80)<<(_NORM_AUX_COMP_EX_SHIFT-7))| 1638 (uint32_t)norm->fncIndex; 1639 1640 if(norm->unsafeStart || norm->udataCC!=0) { 1641 pData[i]|=_NORM_AUX_UNSAFE_MASK; 1642 } 1643 1644 pData[i]|=getSkippableFlags(norm); 1645 } 1646 } 1647 1648 /* folding value for normalization: just store the offset (16 bits) if there is any non-0 entry */ 1649 static uint32_t U_CALLCONV 1650 getFoldedNormValue(UNewTrie *trie, UChar32 start, int32_t offset) { 1651 uint32_t value, leadNorm32=0; 1652 UChar32 limit; 1653 UBool inBlockZero; 1654 1655 limit=start+0x400; 1656 while(start<limit) { 1657 value=utrie_get32(trie, start, &inBlockZero); 1658 if(inBlockZero) { 1659 start+=UTRIE_DATA_BLOCK_LENGTH; 1660 } else { 1661 if(value!=0) { 1662 leadNorm32|=value; 1663 } 1664 ++start; 1665 } 1666 } 1667 1668 /* turn multi-bit fields into the worst-case value */ 1669 if(leadNorm32&_NORM_CC_MASK) { 1670 leadNorm32|=_NORM_CC_MASK; 1671 } 1672 1673 /* clean up unnecessarily ored bit fields */ 1674 leadNorm32&=~((uint32_t)0xffffffff<<_NORM_EXTRA_SHIFT); 1675 1676 if(leadNorm32==0) { 1677 /* nothing to do (only composition exclusions?) */ 1678 return 0; 1679 } 1680 1681 /* add the extra surrogate index, offset by the BMP top, for the new stage 1 location */ 1682 leadNorm32|=( 1683 (uint32_t)_NORM_EXTRA_INDEX_TOP+ 1684 (uint32_t)((offset-UTRIE_BMP_INDEX_LENGTH)>>UTRIE_SURROGATE_BLOCK_BITS) 1685 )<<_NORM_EXTRA_SHIFT; 1686 1687 return leadNorm32; 1688 } 1689 1690 /* folding value for FCD: use default function (just store the offset (16 bits) if there is any non-0 entry) */ 1691 1692 /* 1693 * folding value for auxiliary data: 1694 * store the non-zero offset in bits 9..0 (FNC bits) 1695 * if there is any non-0 entry; 1696 * "or" [verb!] together data bits 15..10 of all of the 1024 supplementary code points 1697 */ 1698 static uint32_t U_CALLCONV 1699 getFoldedAuxValue(UNewTrie *trie, UChar32 start, int32_t offset) { 1700 uint32_t value, oredValues; 1701 UChar32 limit; 1702 UBool inBlockZero; 1703 1704 oredValues=0; 1705 limit=start+0x400; 1706 while(start<limit) { 1707 value=utrie_get32(trie, start, &inBlockZero); 1708 if(inBlockZero) { 1709 start+=UTRIE_DATA_BLOCK_LENGTH; 1710 } else { 1711 oredValues|=value; 1712 ++start; 1713 } 1714 } 1715 1716 if(oredValues!=0) { 1717 /* move the 10 significant offset bits into bits 9..0 */ 1718 offset>>=UTRIE_SURROGATE_BLOCK_BITS; 1719 if(offset>_NORM_AUX_FNC_MASK) { 1720 fprintf(stderr, "gennorm error: folding offset too large (auxTrie)\n"); 1721 exit(U_INDEX_OUTOFBOUNDS_ERROR); 1722 } 1723 return (uint32_t)offset|(oredValues&~_NORM_AUX_FNC_MASK); 1724 } else { 1725 return 0; 1726 } 1727 } 1728 1729 extern void 1730 processData() { 1731 #if 0 1732 uint16_t i; 1733 #endif 1734 1735 processCombining(); 1736 1737 /* canonically reorder decompositions and assign combining classes for decompositions */ 1738 enumTrie(postParseFn, NULL); 1739 1740 #if 0 1741 for(i=1; i<64; ++i) { 1742 if(combineAndQC[i]) { 1743 printf("combiningFlags==0x%02x qcFlags(NF?C)==0x%02x\n", (i&0xc)>>2, i&0x33); 1744 } 1745 } 1746 #endif 1747 1748 /* add hangul/jamo specials */ 1749 setHangulJamoSpecials(); 1750 1751 /* set this value; will be updated as makeCanonSetFn() adds sets (if there are any, see gStoreFlags) */ 1752 canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]=(uint16_t)canonStartSetsTop; 1753 1754 /* store search tables and USerializedSets for canonical starters (after Hangul/Jamo specials!) */ 1755 if(DO_STORE(UGENNORM_STORE_AUX) && DO_STORE(UGENNORM_STORE_COMPOSITION)) { 1756 enumTrie(makeCanonSetFn, NULL); 1757 } 1758 1759 /* clone the normalization builder trie to make the final data tries */ 1760 if( NULL==utrie_clone(norm32Trie, normTrie, NULL, 0) || 1761 NULL==utrie_clone(fcdTrie, normTrie, NULL, 0) || 1762 NULL==utrie_clone(auxTrie, normTrie, NULL, 0) 1763 ) { 1764 fprintf(stderr, "error: unable to clone the normalization trie\n"); 1765 exit(U_MEMORY_ALLOCATION_ERROR); 1766 } 1767 1768 /* --- finalize data for quick checks & normalization --- */ 1769 1770 /* turn the Norm structs (stage2, norms) into 32-bit data words */ 1771 makeAll32(); 1772 1773 /* --- finalize data for FCD checks --- */ 1774 1775 /* FCD data: take Norm.canonBothCCs and store them in the FCD table */ 1776 makeFCD(); 1777 1778 /* --- finalize auxiliary normalization data --- */ 1779 makeAux(); 1780 1781 if(beVerbose) { 1782 #if 0 1783 printf("number of stage 2 entries: %ld\n", stage2Mem->index); 1784 printf("size of stage 1 (BMP) & 2 (uncompacted) + extra data: %ld bytes\n", _NORM_STAGE_1_BMP_COUNT*2+stage2Mem->index*4+extraMem->index*2); 1785 #endif 1786 printf("combining CPs tops: fwd %u both %u back %u\n", combineFwdTop, combineBothTop, combineBackTop); 1787 printf("combining table count: %u\n", combiningTableTop); 1788 } 1789 } 1790 1791 /* is this a norm32 with a special index for a lead surrogate? */ 1792 static U_INLINE UBool 1793 isNorm32LeadSurrogate(uint32_t norm32) { 1794 return _NORM_MIN_SPECIAL<=norm32 && norm32<_NORM_SURROGATES_TOP; 1795 } 1796 1797 /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */ 1798 static int32_t U_CALLCONV 1799 getFoldingNormOffset(uint32_t norm32) { 1800 if(isNorm32LeadSurrogate(norm32)) { 1801 return 1802 UTRIE_BMP_INDEX_LENGTH+ 1803 (((int32_t)norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))& 1804 (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS)); 1805 } else { 1806 return 0; 1807 } 1808 } 1809 1810 /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */ 1811 static int32_t U_CALLCONV 1812 getFoldingAuxOffset(uint32_t data) { 1813 return (int32_t)(data&_NORM_AUX_FNC_MASK)<<UTRIE_SURROGATE_BLOCK_BITS; 1814 } 1815 1816 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 1817 1818 extern void 1819 generateData(const char *dataDir, UBool csource) { 1820 static uint8_t normTrieBlock[100000], fcdTrieBlock[100000], auxTrieBlock[100000]; 1821 1822 UNewDataMemory *pData; 1823 UErrorCode errorCode=U_ZERO_ERROR; 1824 int32_t size, dataLength; 1825 1826 #if UCONFIG_NO_NORMALIZATION 1827 1828 size=0; 1829 1830 #else 1831 1832 U_STRING_DECL(nxCJKCompatPattern, "[:Ideographic:]", 15); 1833 U_STRING_DECL(nxUnicode32Pattern, "[:^Age=3.2:]", 12); 1834 USet *set; 1835 int32_t normTrieSize, fcdTrieSize, auxTrieSize; 1836 1837 normTrieSize=utrie_serialize(norm32Trie, normTrieBlock, sizeof(normTrieBlock), getFoldedNormValue, FALSE, &errorCode); 1838 if(U_FAILURE(errorCode)) { 1839 fprintf(stderr, "error: utrie_serialize(normalization properties) failed, %s\n", u_errorName(errorCode)); 1840 exit(errorCode); 1841 } 1842 1843 if(DO_STORE(UGENNORM_STORE_FCD)) { 1844 fcdTrieSize=utrie_serialize(fcdTrie, fcdTrieBlock, sizeof(fcdTrieBlock), NULL, TRUE, &errorCode); 1845 if(U_FAILURE(errorCode)) { 1846 fprintf(stderr, "error: utrie_serialize(FCD data) failed, %s\n", u_errorName(errorCode)); 1847 exit(errorCode); 1848 } 1849 } else { 1850 fcdTrieSize=0; 1851 } 1852 1853 if(DO_STORE(UGENNORM_STORE_AUX)) { 1854 auxTrieSize=utrie_serialize(auxTrie, auxTrieBlock, sizeof(auxTrieBlock), getFoldedAuxValue, TRUE, &errorCode); 1855 if(U_FAILURE(errorCode)) { 1856 fprintf(stderr, "error: utrie_serialize(auxiliary data) failed, %s\n", u_errorName(errorCode)); 1857 exit(errorCode); 1858 } 1859 } else { 1860 auxTrieSize=0; 1861 } 1862 1863 /* move the parts of canonStartSets[] together into a contiguous block */ 1864 if( canonStartSetsTop<_NORM_MAX_CANON_SETS && 1865 canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]!=0 1866 ) { 1867 uprv_memmove(canonStartSets+canonStartSetsTop, 1868 canonStartSets+_NORM_MAX_CANON_SETS, 1869 canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]*2); 1870 } 1871 canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]; 1872 1873 if( canonStartSetsTop<(_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH) && 1874 canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]!=0 1875 ) { 1876 uprv_memmove(canonStartSets+canonStartSetsTop, 1877 canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH, 1878 canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]*2); 1879 } 1880 canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]; 1881 1882 /* create the normalization exclusion sets */ 1883 /* 1884 * nxCJKCompatPattern should be [[:Ideographic:]&[:NFD_QC=No:]] 1885 * but we cannot use NFD_QC from the pattern because that would require 1886 * unorm.icu which we are just going to generate. 1887 * Therefore we have manually collected nfdQCNoSet and intersect Ideographic 1888 * with that. 1889 */ 1890 U_STRING_INIT(nxCJKCompatPattern, "[:Ideographic:]", 15); 1891 U_STRING_INIT(nxUnicode32Pattern, "[:^Age=3.2:]", 12); 1892 1893 canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]=canonStartSetsTop; 1894 set=uset_openPattern(nxCJKCompatPattern, -1, &errorCode); 1895 if(U_FAILURE(errorCode)) { 1896 fprintf(stderr, "error: uset_openPattern([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode)); 1897 exit(errorCode); 1898 } 1899 uset_retainAll(set, nfdQCNoSet); 1900 if(DO_NOT_STORE(UGENNORM_STORE_EXCLUSIONS)) { 1901 uset_clear(set); 1902 } 1903 canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode); 1904 if(U_FAILURE(errorCode)) { 1905 fprintf(stderr, "error: uset_serialize([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode)); 1906 exit(errorCode); 1907 } 1908 uset_close(set); 1909 1910 canonStartSets[_NORM_SET_INDEX_NX_UNICODE32_OFFSET]=canonStartSetsTop; 1911 set=uset_openPattern(nxUnicode32Pattern, -1, &errorCode); 1912 if(U_FAILURE(errorCode)) { 1913 fprintf(stderr, "error: uset_openPattern([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode)); 1914 exit(errorCode); 1915 } 1916 if(DO_NOT_STORE(UGENNORM_STORE_EXCLUSIONS)) { 1917 uset_clear(set); 1918 } 1919 canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode); 1920 if(U_FAILURE(errorCode)) { 1921 fprintf(stderr, "error: uset_serialize([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode)); 1922 exit(errorCode); 1923 } 1924 uset_close(set); 1925 1926 canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]=canonStartSetsTop; 1927 1928 /* make sure that the FCD trie is 4-aligned */ 1929 if((utm_countItems(extraMem)+combiningTableTop)&1) { 1930 combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */ 1931 } 1932 1933 /* pad canonStartSets to 4-alignment, too */ 1934 if(canonStartSetsTop&1) { 1935 canonStartSets[canonStartSetsTop++]=0x1235; 1936 } 1937 1938 size= 1939 _NORM_INDEX_TOP*4+ 1940 normTrieSize+ 1941 utm_countItems(extraMem)*2+ 1942 combiningTableTop*2+ 1943 fcdTrieSize+ 1944 auxTrieSize+ 1945 canonStartSetsTop*2; 1946 1947 if(beVerbose) { 1948 printf("size of normalization trie %5u bytes\n", (int)normTrieSize); 1949 printf("size of 16-bit extra memory %5u UChars/uint16_t\n", (int)utm_countItems(extraMem)); 1950 printf(" of that: FC_NFKC_Closure size %5u UChars/uint16_t\n", ((uint16_t *)utm_getStart(extraMem))[0]); 1951 printf("size of combining table %5u uint16_t\n", combiningTableTop); 1952 printf("size of FCD trie %5u bytes\n", (int)fcdTrieSize); 1953 printf("size of auxiliary trie %5u bytes\n", (int)auxTrieSize); 1954 printf("size of canonStartSets[] %5u uint16_t\n", (int)canonStartSetsTop); 1955 printf(" number of indexes %5u uint16_t\n", _NORM_SET_INDEX_TOP); 1956 printf(" size of sets %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP); 1957 printf(" number of sets %5d\n", (int)canonSetsCount); 1958 printf(" size of BMP search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]); 1959 printf(" size of supplementary search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]); 1960 printf(" length of exclusion sets %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]-canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]); 1961 printf("size of " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size); 1962 } 1963 1964 indexes[_NORM_INDEX_TRIE_SIZE]=normTrieSize; 1965 indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)utm_countItems(extraMem); 1966 1967 indexes[_NORM_INDEX_COMBINE_DATA_COUNT]=combiningTableTop; 1968 indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop; 1969 indexes[_NORM_INDEX_COMBINE_BOTH_COUNT]=(uint16_t)(combineBothTop-combineFwdTop); 1970 indexes[_NORM_INDEX_COMBINE_BACK_COUNT]=(uint16_t)(combineBackTop-combineBothTop); 1971 1972 /* the quick check minimum code points are already set */ 1973 1974 indexes[_NORM_INDEX_FCD_TRIE_SIZE]=fcdTrieSize; 1975 indexes[_NORM_INDEX_AUX_TRIE_SIZE]=auxTrieSize; 1976 indexes[_NORM_INDEX_CANON_SET_COUNT]=canonStartSetsTop; 1977 1978 #endif 1979 1980 if(csource) { 1981 #if UCONFIG_NO_NORMALIZATION 1982 /* no csource for dummy mode..? */ 1983 fprintf(stderr, "gennorm error: UCONFIG_NO_NORMALIZATION is on in csource mode.\n"); 1984 exit(1); 1985 #else 1986 /* write .c file for hardcoded data */ 1987 UTrie normRuntimeTrie={ NULL }, fcdRuntimeTrie={ NULL }, auxRuntimeTrie={ NULL }; 1988 UTrie2 *normRuntimeTrie2, *fcdRuntimeTrie2=NULL, *auxRuntimeTrie2=NULL; 1989 FILE *f; 1990 1991 utrie_unserialize(&normRuntimeTrie, normTrieBlock, normTrieSize, &errorCode); 1992 normRuntimeTrie.getFoldingOffset=getFoldingNormOffset; 1993 if(fcdTrieSize>0) { 1994 utrie_unserialize(&fcdRuntimeTrie, fcdTrieBlock, fcdTrieSize, &errorCode); 1995 } 1996 if(auxTrieSize>0) { 1997 utrie_unserialize(&auxRuntimeTrie, auxTrieBlock, auxTrieSize, &errorCode); 1998 auxRuntimeTrie.getFoldingOffset=getFoldingAuxOffset; 1999 } 2000 if(U_FAILURE(errorCode)) { 2001 fprintf( 2002 stderr, 2003 "gennorm error: failed to utrie_unserialize() one of the tries - %s\n", 2004 u_errorName(errorCode)); 2005 exit(errorCode); 2006 } 2007 2008 /* use UTrie2 */ 2009 dataInfo.formatVersion[0]=3; 2010 dataInfo.formatVersion[2]=0; 2011 dataInfo.formatVersion[3]=0; 2012 normRuntimeTrie2=utrie2_fromUTrie(&normRuntimeTrie, 0, &errorCode); 2013 if(fcdTrieSize>0) { 2014 fcdRuntimeTrie2=utrie2_fromUTrie(&fcdRuntimeTrie, 0, &errorCode); 2015 } 2016 if(auxTrieSize>0) { 2017 auxRuntimeTrie2=utrie2_fromUTrie(&auxRuntimeTrie, 0, &errorCode); 2018 } 2019 if(U_FAILURE(errorCode)) { 2020 fprintf( 2021 stderr, 2022 "gennorm error: utrie2_fromUTrie() failed - %s\n", 2023 u_errorName(errorCode)); 2024 exit(errorCode); 2025 } 2026 if(auxTrieSize>0) { 2027 /* delete lead surrogate code unit values */ 2028 UChar lead; 2029 auxRuntimeTrie2=utrie2_cloneAsThawed(auxRuntimeTrie2, &errorCode); 2030 for(lead=0xd800; lead<0xdc00; ++lead) { 2031 utrie2_set32ForLeadSurrogateCodeUnit(auxRuntimeTrie2, lead, auxRuntimeTrie2->initialValue, &errorCode); 2032 } 2033 utrie2_freeze(auxRuntimeTrie2, UTRIE2_16_VALUE_BITS, &errorCode); 2034 if(U_FAILURE(errorCode)) { 2035 fprintf( 2036 stderr, 2037 "gennorm error: deleting lead surrogate code unit values failed - %s\n", 2038 u_errorName(errorCode)); 2039 exit(errorCode); 2040 } 2041 } 2042 2043 f=usrc_create(dataDir, "unorm_props_data.c"); 2044 if(f!=NULL) { 2045 usrc_writeArray(f, 2046 "static const UVersionInfo formatVersion={ ", 2047 dataInfo.formatVersion, 8, 4, 2048 " };\n\n"); 2049 usrc_writeArray(f, 2050 "static const UVersionInfo dataVersion={ ", 2051 dataInfo.dataVersion, 8, 4, 2052 " };\n\n"); 2053 usrc_writeArray(f, 2054 "static const int32_t indexes[_NORM_INDEX_TOP]={\n", 2055 indexes, 32, _NORM_INDEX_TOP, 2056 "\n};\n\n"); 2057 usrc_writeUTrie2Arrays(f, 2058 "static const uint16_t normTrie_index[%ld]={\n", 2059 "static const uint32_t normTrie_data32[%ld]={\n", 2060 normRuntimeTrie2, 2061 "\n};\n\n"); 2062 usrc_writeUTrie2Struct(f, 2063 "static const UTrie2 normTrie={\n", 2064 normRuntimeTrie2, "normTrie_index", "normTrie_data32", 2065 "};\n\n"); 2066 usrc_writeArray(f, 2067 "static const uint16_t extraData[%ld]={\n", 2068 utm_getStart(extraMem), 16, utm_countItems(extraMem), 2069 "\n};\n\n"); 2070 usrc_writeArray(f, 2071 "static const uint16_t combiningTable[%ld]={\n", 2072 combiningTable, 16, combiningTableTop, 2073 "\n};\n\n"); 2074 if(fcdTrieSize>0) { 2075 usrc_writeUTrie2Arrays(f, 2076 "static const uint16_t fcdTrie_index[%ld]={\n", NULL, 2077 fcdRuntimeTrie2, 2078 "\n};\n\n"); 2079 usrc_writeUTrie2Struct(f, 2080 "static const UTrie2 fcdTrie={\n", 2081 fcdRuntimeTrie2, "fcdTrie_index", NULL, 2082 "};\n\n"); 2083 } else { 2084 fputs( "static const UTrie2 fcdTrie={ NULL };\n\n", f); 2085 } 2086 if(auxTrieSize>0) { 2087 usrc_writeUTrie2Arrays(f, 2088 "static const uint16_t auxTrie_index[%ld]={\n", NULL, 2089 auxRuntimeTrie2, 2090 "\n};\n\n"); 2091 usrc_writeUTrie2Struct(f, 2092 "static const UTrie2 auxTrie={\n", 2093 auxRuntimeTrie2, "auxTrie_index", NULL, 2094 "};\n\n"); 2095 } else { 2096 fputs( "static const UTrie2 auxTrie={ NULL };\n\n", f); 2097 } 2098 usrc_writeArray(f, 2099 "static const uint16_t canonStartSets[%ld]={\n", 2100 canonStartSets, 16, canonStartSetsTop, 2101 "\n};\n\n"); 2102 fclose(f); 2103 } 2104 utrie2_close(normRuntimeTrie2); 2105 utrie2_close(fcdRuntimeTrie2); 2106 utrie2_close(auxRuntimeTrie2); 2107 #endif 2108 } else { 2109 /* write the data */ 2110 pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo, 2111 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); 2112 if(U_FAILURE(errorCode)) { 2113 fprintf(stderr, "gennorm: unable to create the output file, error %d\n", errorCode); 2114 exit(errorCode); 2115 } 2116 2117 #if !UCONFIG_NO_NORMALIZATION 2118 2119 udata_writeBlock(pData, indexes, sizeof(indexes)); 2120 udata_writeBlock(pData, normTrieBlock, normTrieSize); 2121 udata_writeBlock(pData, utm_getStart(extraMem), utm_countItems(extraMem)*2); 2122 udata_writeBlock(pData, combiningTable, combiningTableTop*2); 2123 udata_writeBlock(pData, fcdTrieBlock, fcdTrieSize); 2124 udata_writeBlock(pData, auxTrieBlock, auxTrieSize); 2125 udata_writeBlock(pData, canonStartSets, canonStartSetsTop*2); 2126 2127 #endif 2128 2129 /* finish up */ 2130 dataLength=udata_finish(pData, &errorCode); 2131 if(U_FAILURE(errorCode)) { 2132 fprintf(stderr, "gennorm: error %d writing the output file\n", errorCode); 2133 exit(errorCode); 2134 } 2135 2136 if(dataLength!=size) { 2137 fprintf(stderr, "gennorm error: data length %ld != calculated size %ld\n", 2138 (long)dataLength, (long)size); 2139 exit(U_INTERNAL_PROGRAM_ERROR); 2140 } 2141 } 2142 } 2143 2144 #if !UCONFIG_NO_NORMALIZATION 2145 2146 extern void 2147 cleanUpData(void) { 2148 int32_t i, count; 2149 2150 count=utm_countItems(normMem); 2151 for(i=0; i<count; ++i) { 2152 uset_close(norms[i].canonStart); 2153 } 2154 2155 utm_close(normMem); 2156 utm_close(utf32Mem); 2157 utm_close(extraMem); 2158 utm_close(combiningTriplesMem); 2159 utrie_close(normTrie); 2160 utrie_close(norm32Trie); 2161 utrie_close(fcdTrie); 2162 utrie_close(auxTrie); 2163 2164 uset_close(nfdQCNoSet); 2165 2166 uprv_free(normTrie); 2167 uprv_free(norm32Trie); 2168 uprv_free(fcdTrie); 2169 uprv_free(auxTrie); 2170 } 2171 2172 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 2173 2174 /* 2175 * Hey, Emacs, please set the following: 2176 * 2177 * Local Variables: 2178 * indent-tabs-mode: nil 2179 * End: 2180 * 2181 */ 2182