1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2001-2008, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: ucaelems.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created 02/22/2001 14 * created by: Vladimir Weinstein 15 * 16 * This program reads the Franctional UCA table and generates 17 * internal format for UCA table as well as inverse UCA table. 18 * It then writes binary files containing the data: ucadata.dat 19 * & invuca.dat 20 * 21 * date name comments 22 * 03/02/2001 synwee added setMaxExpansion 23 * 03/07/2001 synwee merged UCA's maxexpansion and tailoring's 24 */ 25 26 #include "unicode/utypes.h" 27 28 #if !UCONFIG_NO_COLLATION 29 30 #include "unicode/uchar.h" 31 #include "unicode/unistr.h" 32 #include "unicode/ucoleitr.h" 33 #include "unicode/normlzr.h" 34 #include "ucol_elm.h" 35 #include "ucol_tok.h" 36 #include "ucol_cnt.h" 37 #include "unormimp.h" 38 #include "unicode/caniter.h" 39 #include "cmemory.h" 40 41 static uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements *element, uint32_t existingCE, UErrorCode *status); 42 43 U_CDECL_BEGIN 44 static int32_t U_CALLCONV 45 prefixLookupHash(const UHashTok e) { 46 UCAElements *element = (UCAElements *)e.pointer; 47 UChar buf[256]; 48 UHashTok key; 49 key.pointer = buf; 50 uprv_memcpy(buf, element->cPoints, element->cSize*sizeof(UChar)); 51 buf[element->cSize] = 0; 52 //key.pointer = element->cPoints; 53 //element->cPoints[element->cSize] = 0; 54 return uhash_hashUChars(key); 55 } 56 57 static int8_t U_CALLCONV 58 prefixLookupComp(const UHashTok e1, const UHashTok e2) { 59 UCAElements *element1 = (UCAElements *)e1.pointer; 60 UCAElements *element2 = (UCAElements *)e2.pointer; 61 62 UChar buf1[256]; 63 UHashTok key1; 64 key1.pointer = buf1; 65 
uprv_memcpy(buf1, element1->cPoints, element1->cSize*sizeof(UChar)); 66 buf1[element1->cSize] = 0; 67 68 UChar buf2[256]; 69 UHashTok key2; 70 key2.pointer = buf2; 71 uprv_memcpy(buf2, element2->cPoints, element2->cSize*sizeof(UChar)); 72 buf2[element2->cSize] = 0; 73 74 return uhash_compareUChars(key1, key2); 75 } 76 U_CDECL_END 77 78 static int32_t uprv_uca_addExpansion(ExpansionTable *expansions, uint32_t value, UErrorCode *status) { 79 if(U_FAILURE(*status)) { 80 return 0; 81 } 82 if(expansions->CEs == NULL) { 83 expansions->CEs = (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE*sizeof(uint32_t)); 84 /* test for NULL */ 85 if (expansions->CEs == NULL) { 86 *status = U_MEMORY_ALLOCATION_ERROR; 87 return 0; 88 } 89 expansions->size = INIT_EXP_TABLE_SIZE; 90 expansions->position = 0; 91 } 92 93 if(expansions->position == expansions->size) { 94 uint32_t *newData = (uint32_t *)uprv_realloc(expansions->CEs, 2*expansions->size*sizeof(uint32_t)); 95 if(newData == NULL) { 96 #ifdef UCOL_DEBUG 97 fprintf(stderr, "out of memory for expansions\n"); 98 #endif 99 *status = U_MEMORY_ALLOCATION_ERROR; 100 return -1; 101 } 102 expansions->CEs = newData; 103 expansions->size *= 2; 104 } 105 106 expansions->CEs[expansions->position] = value; 107 return(expansions->position++); 108 } 109 110 U_CAPI tempUCATable* U_EXPORT2 111 uprv_uca_initTempTable(UCATableHeader *image, UColOptionSet *opts, const UCollator *UCA, UColCETags initTag, UColCETags supplementaryInitTag, UErrorCode *status) { 112 MaxJamoExpansionTable *maxjet; 113 MaxExpansionTable *maxet; 114 tempUCATable *t = (tempUCATable *)uprv_malloc(sizeof(tempUCATable)); 115 /* test for NULL */ 116 if (t == NULL) { 117 *status = U_MEMORY_ALLOCATION_ERROR; 118 return NULL; 119 } 120 uprv_memset(t, 0, sizeof(tempUCATable)); 121 122 maxet = (MaxExpansionTable *)uprv_malloc(sizeof(MaxExpansionTable)); 123 if (maxet == NULL) { 124 goto allocation_failure; 125 } 126 uprv_memset(maxet, 0, sizeof(MaxExpansionTable)); 127 t->maxExpansions = 
maxet; 128 129 maxjet = (MaxJamoExpansionTable *)uprv_malloc(sizeof(MaxJamoExpansionTable)); 130 if (maxjet == NULL) { 131 goto allocation_failure; 132 } 133 uprv_memset(maxjet, 0, sizeof(MaxJamoExpansionTable)); 134 t->maxJamoExpansions = maxjet; 135 136 t->image = image; 137 t->options = opts; 138 139 t->UCA = UCA; 140 t->expansions = (ExpansionTable *)uprv_malloc(sizeof(ExpansionTable)); 141 /* test for NULL */ 142 if (t->expansions == NULL) { 143 goto allocation_failure; 144 } 145 uprv_memset(t->expansions, 0, sizeof(ExpansionTable)); 146 147 t->mapping = utrie_open(NULL, NULL, UCOL_ELM_TRIE_CAPACITY, 148 UCOL_SPECIAL_FLAG | (initTag<<24), 149 UCOL_SPECIAL_FLAG | (supplementaryInitTag << 24), 150 TRUE); // Do your own mallocs for the structure, array and have linear Latin 1 151 if (U_FAILURE(*status)) { 152 goto allocation_failure; 153 } 154 t->prefixLookup = uhash_open(prefixLookupHash, prefixLookupComp, NULL, status); 155 if (U_FAILURE(*status)) { 156 goto allocation_failure; 157 } 158 uhash_setValueDeleter(t->prefixLookup, uhash_freeBlock); 159 160 t->contractions = uprv_cnttab_open(t->mapping, status); 161 if (U_FAILURE(*status)) { 162 goto cleanup; 163 } 164 165 /* copy UCA's maxexpansion and merge as we go along */ 166 if (UCA != NULL) { 167 /* adding an extra initial value for easier manipulation */ 168 maxet->size = (UCA->lastEndExpansionCE - UCA->endExpansionCE) 169 + 2; 170 maxet->position = maxet->size - 1; 171 maxet->endExpansionCE = 172 (uint32_t *)uprv_malloc(sizeof(uint32_t) * maxet->size); 173 /* test for NULL */ 174 if (maxet->endExpansionCE == NULL) { 175 goto allocation_failure; 176 } 177 maxet->expansionCESize = 178 (uint8_t *)uprv_malloc(sizeof(uint8_t) * maxet->size); 179 /* test for NULL */ 180 if (maxet->expansionCESize == NULL) { 181 goto allocation_failure; 182 } 183 /* initialized value */ 184 *(maxet->endExpansionCE) = 0; 185 *(maxet->expansionCESize) = 0; 186 uprv_memcpy(maxet->endExpansionCE + 1, UCA->endExpansionCE, 187 
sizeof(uint32_t) * (maxet->size - 1)); 188 uprv_memcpy(maxet->expansionCESize + 1, UCA->expansionCESize, 189 sizeof(uint8_t) * (maxet->size - 1)); 190 } 191 else { 192 maxet->size = 0; 193 } 194 maxjet->endExpansionCE = NULL; 195 maxjet->isV = NULL; 196 maxjet->size = 0; 197 maxjet->position = 0; 198 maxjet->maxLSize = 1; 199 maxjet->maxVSize = 1; 200 maxjet->maxTSize = 1; 201 202 t->unsafeCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE); 203 /* test for NULL */ 204 if (t->unsafeCP == NULL) { 205 goto allocation_failure; 206 } 207 t->contrEndCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE); 208 /* test for NULL */ 209 if (t->contrEndCP == NULL) { 210 goto allocation_failure; 211 } 212 uprv_memset(t->unsafeCP, 0, UCOL_UNSAFECP_TABLE_SIZE); 213 uprv_memset(t->contrEndCP, 0, UCOL_UNSAFECP_TABLE_SIZE); 214 t->cmLookup = NULL; 215 return t; 216 217 allocation_failure: 218 *status = U_MEMORY_ALLOCATION_ERROR; 219 cleanup: 220 uprv_uca_closeTempTable(t); 221 return NULL; 222 } 223 224 static tempUCATable* U_EXPORT2 225 uprv_uca_cloneTempTable(tempUCATable *t, UErrorCode *status) { 226 if(U_FAILURE(*status)) { 227 return NULL; 228 } 229 230 tempUCATable *r = (tempUCATable *)uprv_malloc(sizeof(tempUCATable)); 231 /* test for NULL */ 232 if (r == NULL) { 233 *status = U_MEMORY_ALLOCATION_ERROR; 234 return NULL; 235 } 236 uprv_memset(r, 0, sizeof(tempUCATable)); 237 238 /* mapping */ 239 if(t->mapping != NULL) { 240 /*r->mapping = ucmpe32_clone(t->mapping, status);*/ 241 r->mapping = utrie_clone(NULL, t->mapping, NULL, 0); 242 } 243 244 // a hashing clone function would be very nice. We have none currently... 245 // However, we should be good, as closing should not produce any prefixed elements. 
246 r->prefixLookup = NULL; // prefixes are not used in closing 247 248 /* expansions */ 249 if(t->expansions != NULL) { 250 r->expansions = (ExpansionTable *)uprv_malloc(sizeof(ExpansionTable)); 251 /* test for NULL */ 252 if (r->expansions == NULL) { 253 *status = U_MEMORY_ALLOCATION_ERROR; 254 goto cleanup; 255 } 256 r->expansions->position = t->expansions->position; 257 r->expansions->size = t->expansions->size; 258 if(t->expansions->CEs != NULL) { 259 r->expansions->CEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->expansions->size); 260 /* test for NULL */ 261 if (r->expansions->CEs == NULL) { 262 *status = U_MEMORY_ALLOCATION_ERROR; 263 goto cleanup; 264 } 265 uprv_memcpy(r->expansions->CEs, t->expansions->CEs, sizeof(uint32_t)*t->expansions->position); 266 } else { 267 r->expansions->CEs = NULL; 268 } 269 } 270 271 if(t->contractions != NULL) { 272 r->contractions = uprv_cnttab_clone(t->contractions, status); 273 // Check for cloning failure. 274 if (r->contractions == NULL) { 275 *status = U_MEMORY_ALLOCATION_ERROR; 276 goto cleanup; 277 } 278 r->contractions->mapping = r->mapping; 279 } 280 281 if(t->maxExpansions != NULL) { 282 r->maxExpansions = (MaxExpansionTable *)uprv_malloc(sizeof(MaxExpansionTable)); 283 /* test for NULL */ 284 if (r->maxExpansions == NULL) { 285 *status = U_MEMORY_ALLOCATION_ERROR; 286 goto cleanup; 287 } 288 r->maxExpansions->size = t->maxExpansions->size; 289 r->maxExpansions->position = t->maxExpansions->position; 290 if(t->maxExpansions->endExpansionCE != NULL) { 291 r->maxExpansions->endExpansionCE = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->maxExpansions->size); 292 /* test for NULL */ 293 if (r->maxExpansions->endExpansionCE == NULL) { 294 *status = U_MEMORY_ALLOCATION_ERROR; 295 goto cleanup; 296 } 297 uprv_memset(r->maxExpansions->endExpansionCE, 0xDB, sizeof(uint32_t)*t->maxExpansions->size); 298 uprv_memcpy(r->maxExpansions->endExpansionCE, t->maxExpansions->endExpansionCE, 
t->maxExpansions->position*sizeof(uint32_t)); 299 } else { 300 r->maxExpansions->endExpansionCE = NULL; 301 } 302 if(t->maxExpansions->expansionCESize != NULL) { 303 r->maxExpansions->expansionCESize = (uint8_t *)uprv_malloc(sizeof(uint8_t)*t->maxExpansions->size); 304 /* test for NULL */ 305 if (r->maxExpansions->expansionCESize == NULL) { 306 *status = U_MEMORY_ALLOCATION_ERROR; 307 goto cleanup; 308 } 309 uprv_memset(r->maxExpansions->expansionCESize, 0xDB, sizeof(uint8_t)*t->maxExpansions->size); 310 uprv_memcpy(r->maxExpansions->expansionCESize, t->maxExpansions->expansionCESize, t->maxExpansions->position*sizeof(uint8_t)); 311 } else { 312 r->maxExpansions->expansionCESize = NULL; 313 } 314 } 315 316 if(t->maxJamoExpansions != NULL) { 317 r->maxJamoExpansions = (MaxJamoExpansionTable *)uprv_malloc(sizeof(MaxJamoExpansionTable)); 318 /* test for NULL */ 319 if (r->maxJamoExpansions == NULL) { 320 *status = U_MEMORY_ALLOCATION_ERROR; 321 goto cleanup; 322 } 323 r->maxJamoExpansions->size = t->maxJamoExpansions->size; 324 r->maxJamoExpansions->position = t->maxJamoExpansions->position; 325 r->maxJamoExpansions->maxLSize = t->maxJamoExpansions->maxLSize; 326 r->maxJamoExpansions->maxVSize = t->maxJamoExpansions->maxVSize; 327 r->maxJamoExpansions->maxTSize = t->maxJamoExpansions->maxTSize; 328 if(t->maxJamoExpansions->size != 0) { 329 r->maxJamoExpansions->endExpansionCE = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->maxJamoExpansions->size); 330 /* test for NULL */ 331 if (r->maxJamoExpansions->endExpansionCE == NULL) { 332 *status = U_MEMORY_ALLOCATION_ERROR; 333 goto cleanup; 334 } 335 uprv_memcpy(r->maxJamoExpansions->endExpansionCE, t->maxJamoExpansions->endExpansionCE, t->maxJamoExpansions->position*sizeof(uint32_t)); 336 r->maxJamoExpansions->isV = (UBool *)uprv_malloc(sizeof(UBool)*t->maxJamoExpansions->size); 337 /* test for NULL */ 338 if (r->maxJamoExpansions->isV == NULL) { 339 *status = U_MEMORY_ALLOCATION_ERROR; 340 goto cleanup; 341 } 342 
uprv_memcpy(r->maxJamoExpansions->isV, t->maxJamoExpansions->isV, t->maxJamoExpansions->position*sizeof(UBool)); 343 } else { 344 r->maxJamoExpansions->endExpansionCE = NULL; 345 r->maxJamoExpansions->isV = NULL; 346 } 347 } 348 349 if(t->unsafeCP != NULL) { 350 r->unsafeCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE); 351 /* test for NULL */ 352 if (r->unsafeCP == NULL) { 353 *status = U_MEMORY_ALLOCATION_ERROR; 354 goto cleanup; 355 } 356 uprv_memcpy(r->unsafeCP, t->unsafeCP, UCOL_UNSAFECP_TABLE_SIZE); 357 } 358 359 if(t->contrEndCP != NULL) { 360 r->contrEndCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE); 361 /* test for NULL */ 362 if (r->contrEndCP == NULL) { 363 *status = U_MEMORY_ALLOCATION_ERROR; 364 goto cleanup; 365 } 366 uprv_memcpy(r->contrEndCP, t->contrEndCP, UCOL_UNSAFECP_TABLE_SIZE); 367 } 368 369 r->UCA = t->UCA; 370 r->image = t->image; 371 r->options = t->options; 372 373 return r; 374 cleanup: 375 uprv_uca_closeTempTable(t); 376 return NULL; 377 } 378 379 380 U_CAPI void U_EXPORT2 381 uprv_uca_closeTempTable(tempUCATable *t) { 382 if(t != NULL) { 383 if (t->expansions != NULL) { 384 uprv_free(t->expansions->CEs); 385 uprv_free(t->expansions); 386 } 387 if(t->contractions != NULL) { 388 uprv_cnttab_close(t->contractions); 389 } 390 if (t->mapping != NULL) { 391 utrie_close(t->mapping); 392 } 393 394 if(t->prefixLookup != NULL) { 395 uhash_close(t->prefixLookup); 396 } 397 398 if (t->maxExpansions != NULL) { 399 uprv_free(t->maxExpansions->endExpansionCE); 400 uprv_free(t->maxExpansions->expansionCESize); 401 uprv_free(t->maxExpansions); 402 } 403 404 if (t->maxJamoExpansions->size > 0) { 405 uprv_free(t->maxJamoExpansions->endExpansionCE); 406 uprv_free(t->maxJamoExpansions->isV); 407 } 408 uprv_free(t->maxJamoExpansions); 409 410 uprv_free(t->unsafeCP); 411 uprv_free(t->contrEndCP); 412 413 if (t->cmLookup != NULL) { 414 uprv_free(t->cmLookup->cPoints); 415 uprv_free(t->cmLookup); 416 } 417 418 uprv_free(t); 419 } 420 } 421 422 /** 
423 * Looks for the maximum length of all expansion sequences ending with the same 424 * collation element. The size required for maxexpansion and maxsize is 425 * returned if the arrays are too small. 426 * @param endexpansion the last expansion collation element to be added 427 * @param expansionsize size of the expansion 428 * @param maxexpansion data structure to store the maximum expansion data. 429 * @param status error status 430 * @returns size of the maxexpansion and maxsize used. 431 */ 432 static int uprv_uca_setMaxExpansion(uint32_t endexpansion, 433 uint8_t expansionsize, 434 MaxExpansionTable *maxexpansion, 435 UErrorCode *status) 436 { 437 if (maxexpansion->size == 0) { 438 /* we'll always make the first element 0, for easier manipulation */ 439 maxexpansion->endExpansionCE = 440 (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(int32_t)); 441 /* test for NULL */ 442 if (maxexpansion->endExpansionCE == NULL) { 443 *status = U_MEMORY_ALLOCATION_ERROR; 444 return 0; 445 } 446 *(maxexpansion->endExpansionCE) = 0; 447 maxexpansion->expansionCESize = 448 (uint8_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(uint8_t)); 449 /* test for NULL */; 450 if (maxexpansion->expansionCESize == NULL) { 451 *status = U_MEMORY_ALLOCATION_ERROR; 452 return 0; 453 } 454 *(maxexpansion->expansionCESize) = 0; 455 maxexpansion->size = INIT_EXP_TABLE_SIZE; 456 maxexpansion->position = 0; 457 } 458 459 if (maxexpansion->position + 1 == maxexpansion->size) { 460 uint32_t *neweece = (uint32_t *)uprv_realloc(maxexpansion->endExpansionCE, 461 2 * maxexpansion->size * sizeof(uint32_t)); 462 if (neweece == NULL) { 463 *status = U_MEMORY_ALLOCATION_ERROR; 464 return 0; 465 } 466 maxexpansion->endExpansionCE = neweece; 467 468 uint8_t *neweces = (uint8_t *)uprv_realloc(maxexpansion->expansionCESize, 469 2 * maxexpansion->size * sizeof(uint8_t)); 470 if (neweces == NULL) { 471 *status = U_MEMORY_ALLOCATION_ERROR; 472 return 0; 473 } 474 maxexpansion->expansionCESize = neweces; 475 
maxexpansion->size *= 2; 476 } 477 478 uint32_t *pendexpansionce = maxexpansion->endExpansionCE; 479 uint8_t *pexpansionsize = maxexpansion->expansionCESize; 480 int pos = maxexpansion->position; 481 482 uint32_t *start = pendexpansionce; 483 uint32_t *limit = pendexpansionce + pos; 484 485 /* using binary search to determine if last expansion element is 486 already in the array */ 487 uint32_t *mid; 488 int result = -1; 489 while (start < limit - 1) { 490 mid = start + ((limit - start) >> 1); 491 if (endexpansion <= *mid) { 492 limit = mid; 493 } 494 else { 495 start = mid; 496 } 497 } 498 499 if (*start == endexpansion) { 500 result = start - pendexpansionce; 501 } 502 else if (*limit == endexpansion) { 503 result = limit - pendexpansionce; 504 } 505 506 if (result > -1) { 507 /* found the ce in expansion, we'll just modify the size if it is 508 smaller */ 509 uint8_t *currentsize = pexpansionsize + result; 510 if (*currentsize < expansionsize) { 511 *currentsize = expansionsize; 512 } 513 } 514 else { 515 /* we'll need to squeeze the value into the array. 516 initial implementation. 
*/ 517 /* shifting the subarray down by 1 */ 518 int shiftsize = (pendexpansionce + pos) - start; 519 uint32_t *shiftpos = start + 1; 520 uint8_t *sizeshiftpos = pexpansionsize + (shiftpos - pendexpansionce); 521 522 /* okay need to rearrange the array into sorted order */ 523 if (shiftsize == 0 /*|| *(pendexpansionce + pos) < endexpansion*/) { /* the commented part is actually both redundant and dangerous */ 524 *(pendexpansionce + pos + 1) = endexpansion; 525 *(pexpansionsize + pos + 1) = expansionsize; 526 } 527 else { 528 uprv_memmove(shiftpos + 1, shiftpos, shiftsize * sizeof(int32_t)); 529 uprv_memmove(sizeshiftpos + 1, sizeshiftpos, 530 shiftsize * sizeof(uint8_t)); 531 *shiftpos = endexpansion; 532 *sizeshiftpos = expansionsize; 533 } 534 maxexpansion->position ++; 535 536 #ifdef UCOL_DEBUG 537 int temp; 538 UBool found = FALSE; 539 for (temp = 0; temp < maxexpansion->position; temp ++) { 540 if (pendexpansionce[temp] >= pendexpansionce[temp + 1]) { 541 fprintf(stderr, "expansions %d\n", temp); 542 } 543 if (pendexpansionce[temp] == endexpansion) { 544 found =TRUE; 545 if (pexpansionsize[temp] < expansionsize) { 546 fprintf(stderr, "expansions size %d\n", temp); 547 } 548 } 549 } 550 if (pendexpansionce[temp] == endexpansion) { 551 found =TRUE; 552 if (pexpansionsize[temp] < expansionsize) { 553 fprintf(stderr, "expansions size %d\n", temp); 554 } 555 } 556 if (!found) 557 fprintf(stderr, "expansion not found %d\n", temp); 558 #endif 559 } 560 561 return maxexpansion->position; 562 } 563 564 /** 565 * Sets the maximum length of all jamo expansion sequences ending with the same 566 * collation element. The size required for maxexpansion and maxsize is 567 * returned if the arrays are too small. 568 * @param ch the jamo codepoint 569 * @param endexpansion the last expansion collation element to be added 570 * @param expansionsize size of the expansion 571 * @param maxexpansion data structure to store the maximum expansion data. 
572 * @param status error status 573 * @returns size of the maxexpansion and maxsize used. 574 */ 575 static int uprv_uca_setMaxJamoExpansion(UChar ch, 576 uint32_t endexpansion, 577 uint8_t expansionsize, 578 MaxJamoExpansionTable *maxexpansion, 579 UErrorCode *status) 580 { 581 UBool isV = TRUE; 582 if (((uint32_t)ch - 0x1100) <= (0x1112 - 0x1100)) { 583 /* determines L for Jamo, doesn't need to store this since it is never 584 at the end of a expansion */ 585 if (maxexpansion->maxLSize < expansionsize) { 586 maxexpansion->maxLSize = expansionsize; 587 } 588 return maxexpansion->position; 589 } 590 591 if (((uint32_t)ch - 0x1161) <= (0x1175 - 0x1161)) { 592 /* determines V for Jamo */ 593 if (maxexpansion->maxVSize < expansionsize) { 594 maxexpansion->maxVSize = expansionsize; 595 } 596 } 597 598 if (((uint32_t)ch - 0x11A8) <= (0x11C2 - 0x11A8)) { 599 isV = FALSE; 600 /* determines T for Jamo */ 601 if (maxexpansion->maxTSize < expansionsize) { 602 maxexpansion->maxTSize = expansionsize; 603 } 604 } 605 606 if (maxexpansion->size == 0) { 607 /* we'll always make the first element 0, for easier manipulation */ 608 maxexpansion->endExpansionCE = 609 (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(uint32_t)); 610 /* test for NULL */; 611 if (maxexpansion->endExpansionCE == NULL) { 612 *status = U_MEMORY_ALLOCATION_ERROR; 613 return 0; 614 } 615 *(maxexpansion->endExpansionCE) = 0; 616 maxexpansion->isV = 617 (UBool *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(UBool)); 618 /* test for NULL */; 619 if (maxexpansion->isV == NULL) { 620 *status = U_MEMORY_ALLOCATION_ERROR; 621 uprv_free(maxexpansion->endExpansionCE); 622 maxexpansion->endExpansionCE = NULL; 623 return 0; 624 } 625 *(maxexpansion->isV) = 0; 626 maxexpansion->size = INIT_EXP_TABLE_SIZE; 627 maxexpansion->position = 0; 628 } 629 630 if (maxexpansion->position + 1 == maxexpansion->size) { 631 maxexpansion->size *= 2; 632 maxexpansion->endExpansionCE = (uint32_t 
*)uprv_realloc(maxexpansion->endExpansionCE, 633 maxexpansion->size * sizeof(uint32_t)); 634 if (maxexpansion->endExpansionCE == NULL) { 635 #ifdef UCOL_DEBUG 636 fprintf(stderr, "out of memory for maxExpansions\n"); 637 #endif 638 *status = U_MEMORY_ALLOCATION_ERROR; 639 return 0; 640 } 641 maxexpansion->isV = (UBool *)uprv_realloc(maxexpansion->isV, 642 maxexpansion->size * sizeof(UBool)); 643 if (maxexpansion->isV == NULL) { 644 #ifdef UCOL_DEBUG 645 fprintf(stderr, "out of memory for maxExpansions\n"); 646 #endif 647 *status = U_MEMORY_ALLOCATION_ERROR; 648 uprv_free(maxexpansion->endExpansionCE); 649 maxexpansion->endExpansionCE = NULL; 650 return 0; 651 } 652 } 653 654 uint32_t *pendexpansionce = maxexpansion->endExpansionCE; 655 int pos = maxexpansion->position; 656 657 while (pos > 0) { 658 pos --; 659 if (*(pendexpansionce + pos) == endexpansion) { 660 return maxexpansion->position; 661 } 662 } 663 664 *(pendexpansionce + maxexpansion->position) = endexpansion; 665 *(maxexpansion->isV + maxexpansion->position) = isV; 666 maxexpansion->position ++; 667 668 return maxexpansion->position; 669 } 670 671 672 static void ContrEndCPSet(uint8_t *table, UChar c) { 673 uint32_t hash; 674 uint8_t *htByte; 675 676 hash = c; 677 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { 678 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; 679 } 680 htByte = &table[hash>>3]; 681 *htByte |= (1 << (hash & 7)); 682 } 683 684 685 static void unsafeCPSet(uint8_t *table, UChar c) { 686 uint32_t hash; 687 uint8_t *htByte; 688 689 hash = c; 690 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { 691 if (hash >= 0xd800 && hash <= 0xf8ff) { 692 /* Part of a surrogate, or in private use area. 
*/ 693 /* These don't go in the table */ 694 return; 695 } 696 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; 697 } 698 htByte = &table[hash>>3]; 699 *htByte |= (1 << (hash & 7)); 700 } 701 702 static void 703 uprv_uca_createCMTable(tempUCATable *t, int32_t noOfCM, UErrorCode *status) { 704 t->cmLookup = (CombinClassTable *)uprv_malloc(sizeof(CombinClassTable)); 705 if (t->cmLookup==NULL) { 706 *status = U_MEMORY_ALLOCATION_ERROR; 707 return; 708 } 709 t->cmLookup->cPoints=(UChar *)uprv_malloc(noOfCM*sizeof(UChar)); 710 if (t->cmLookup->cPoints ==NULL) { 711 uprv_free(t->cmLookup); 712 t->cmLookup = NULL; 713 *status = U_MEMORY_ALLOCATION_ERROR; 714 return; 715 } 716 717 t->cmLookup->size=noOfCM; 718 uprv_memset(t->cmLookup->index, 0, sizeof(t->cmLookup->index)); 719 720 return; 721 } 722 723 static void 724 uprv_uca_copyCMTable(tempUCATable *t, UChar *cm, uint16_t *index) { 725 int32_t count=0; 726 727 for (int32_t i=0; i<256; ++i) { 728 if (index[i]>0) { 729 // cPoints is ordered by combining class value. 730 uprv_memcpy(t->cmLookup->cPoints+count, cm+(i<<8), index[i]*sizeof(UChar)); 731 count += index[i]; 732 } 733 t->cmLookup->index[i]=count; 734 } 735 return; 736 } 737 738 /* 1. to the UnsafeCP hash table, add all chars with combining class != 0 */ 739 /* 2. build combining marks table for all chars with combining class != 0 */ 740 static void uprv_uca_unsafeCPAddCCNZ(tempUCATable *t, UErrorCode *status) { 741 742 UChar c; 743 uint16_t fcd; // Hi byte is lead combining class. 744 // lo byte is trailing combing class. 
745 const uint16_t *fcdTrieIndex; 746 UChar32 fcdHighStart; 747 UBool buildCMTable = (t->cmLookup==NULL); // flag for building combining class table 748 UChar *cm=NULL; 749 uint16_t index[256]; 750 int32_t count=0; 751 fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); 752 if (U_FAILURE(*status)) { 753 return; 754 } 755 756 if (buildCMTable) { 757 if (cm==NULL) { 758 cm = (UChar *)uprv_malloc(sizeof(UChar)*UCOL_MAX_CM_TAB); 759 if (cm==NULL) { 760 *status = U_MEMORY_ALLOCATION_ERROR; 761 return; 762 } 763 } 764 uprv_memset(index, 0, sizeof(index)); 765 } 766 for (c=0; c<0xffff; c++) { 767 fcd = unorm_getFCD16(fcdTrieIndex, c); 768 if (fcd >= 0x100 || // if the leading combining class(c) > 0 || 769 (UTF_IS_LEAD(c) && fcd != 0)) {// c is a leading surrogate with some FCD data 770 if (buildCMTable) { 771 uint32_t cClass = fcd & 0xff; 772 //uint32_t temp=(cClass<<8)+index[cClass]; 773 cm[(cClass<<8)+index[cClass]] = c; // 774 index[cClass]++; 775 count++; 776 } 777 unsafeCPSet(t->unsafeCP, c); 778 } 779 } 780 781 // copy to cm table 782 if (buildCMTable) { 783 uprv_uca_createCMTable(t, count, status); 784 if(U_FAILURE(*status)) { 785 if (cm!=NULL) { 786 uprv_free(cm); 787 } 788 return; 789 } 790 uprv_uca_copyCMTable(t, cm, index); 791 } 792 793 if(t->prefixLookup != NULL) { 794 int32_t i = -1; 795 const UHashElement *e = NULL; 796 UCAElements *element = NULL; 797 UChar NFCbuf[256]; 798 uint32_t NFCbufLen = 0; 799 while((e = uhash_nextElement(t->prefixLookup, &i)) != NULL) { 800 element = (UCAElements *)e->value.pointer; 801 // codepoints here are in the NFD form. We need to add the 802 // first code point of the NFC form to unsafe, because 803 // strcoll needs to backup over them. 
804 NFCbufLen = unorm_normalize(element->cPoints, element->cSize, UNORM_NFC, 0, 805 NFCbuf, 256, status); 806 unsafeCPSet(t->unsafeCP, NFCbuf[0]); 807 } 808 } 809 810 if (cm!=NULL) { 811 uprv_free(cm); 812 } 813 } 814 815 static uint32_t uprv_uca_addPrefix(tempUCATable *t, uint32_t CE, 816 UCAElements *element, UErrorCode *status) 817 { 818 // currently the longest prefix we're supporting in Japanese is two characters 819 // long. Although this table could quite easily mimic complete contraction stuff 820 // there is no good reason to make a general solution, as it would require some 821 // error prone messing. 822 CntTable *contractions = t->contractions; 823 UChar32 cp; 824 uint32_t cpsize = 0; 825 UChar *oldCP = element->cPoints; 826 uint32_t oldCPSize = element->cSize; 827 828 829 contractions->currentTag = SPEC_PROC_TAG; 830 831 // here, we will normalize & add prefix to the table. 832 uint32_t j = 0; 833 #ifdef UCOL_DEBUG 834 for(j=0; j<element->cSize; j++) { 835 fprintf(stdout, "CP: %04X ", element->cPoints[j]); 836 } 837 fprintf(stdout, "El: %08X Pref: ", CE); 838 for(j=0; j<element->prefixSize; j++) { 839 fprintf(stdout, "%04X ", element->prefix[j]); 840 } 841 fprintf(stdout, "%08X ", element->mapCE); 842 #endif 843 844 for (j = 1; j<element->prefixSize; j++) { /* First add NFD prefix chars to unsafe CP hash table */ 845 // Unless it is a trail surrogate, which is handled algoritmically and 846 // shouldn't take up space in the table. 847 if(!(UTF_IS_TRAIL(element->prefix[j]))) { 848 unsafeCPSet(t->unsafeCP, element->prefix[j]); 849 } 850 } 851 852 UChar tempPrefix = 0; 853 854 for(j = 0; j < /*nfcSize*/element->prefixSize/2; j++) { // prefixes are going to be looked up backwards 855 // therefore, we will promptly reverse the prefix buffer... 
856 tempPrefix = *(/*nfcBuffer*/element->prefix+element->prefixSize-j-1); 857 *(/*nfcBuffer*/element->prefix+element->prefixSize-j-1) = element->prefix[j]; 858 element->prefix[j] = tempPrefix; 859 } 860 861 #ifdef UCOL_DEBUG 862 fprintf(stdout, "Reversed: "); 863 for(j=0; j<element->prefixSize; j++) { 864 fprintf(stdout, "%04X ", element->prefix[j]); 865 } 866 fprintf(stdout, "%08X\n", element->mapCE); 867 #endif 868 869 // the first codepoint is also unsafe, as it forms a 'contraction' with the prefix 870 if(!(UTF_IS_TRAIL(element->cPoints[0]))) { 871 unsafeCPSet(t->unsafeCP, element->cPoints[0]); 872 } 873 874 // Maybe we need this... To handle prefixes completely in the forward direction... 875 //if(element->cSize == 1) { 876 // if(!(UTF_IS_TRAIL(element->cPoints[0]))) { 877 // ContrEndCPSet(t->contrEndCP, element->cPoints[0]); 878 // } 879 //} 880 881 element->cPoints = element->prefix; 882 element->cSize = element->prefixSize; 883 884 // Add the last char of the contraction to the contraction-end hash table. 
885 // unless it is a trail surrogate, which is handled algorithmically and 886 // shouldn't be in the table 887 if(!(UTF_IS_TRAIL(element->cPoints[element->cSize -1]))) { 888 ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]); 889 } 890 891 // First we need to check if contractions starts with a surrogate 892 UTF_NEXT_CHAR(element->cPoints, cpsize, element->cSize, cp); 893 894 // If there are any Jamos in the contraction, we should turn on special 895 // processing for Jamos 896 if(UCOL_ISJAMO(element->prefix[0])) { 897 t->image->jamoSpecial = TRUE; 898 } 899 /* then we need to deal with it */ 900 /* we could aready have something in table - or we might not */ 901 902 if(!isPrefix(CE)) { 903 /* if it wasn't contraction, we wouldn't end up here*/ 904 int32_t firstContractionOffset = 0; 905 firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, CE, status); 906 uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status); 907 uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->prefix, newCE, status); 908 uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, CE, status); 909 CE = constructContractCE(SPEC_PROC_TAG, firstContractionOffset); 910 } else { /* we are adding to existing contraction */ 911 /* there were already some elements in the table, so we need to add a new contraction */ 912 /* Two things can happen here: either the codepoint is already in the table, or it is not */ 913 int32_t position = uprv_cnttab_findCP(contractions, CE, *element->prefix, status); 914 if(position > 0) { /* if it is we just continue down the chain */ 915 uint32_t eCE = uprv_cnttab_getCE(contractions, CE, position, status); 916 uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status); 917 uprv_cnttab_setContraction(contractions, CE, position, *(element->prefix), newCE, status); 918 } else { /* if it isn't, we will have to create a new 
sequence */
            uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
            uprv_cnttab_insertContraction(contractions, CE, *(element->prefix), element->mapCE, status);
        }
    }

    element->cPoints = oldCP;
    element->cSize = oldCPSize;

    return CE;
}

// Note regarding surrogate handling: We are interested only in the single
// or leading surrogates in a contraction. If a surrogate is somewhere else
// in the contraction, it is going to be handled as a pair of code units,
// as it doesn't affect the performance AND handling surrogates specially
// would complicate code way too much.
/*
 * Adds the code unit sequence element->cPoints (element->cSize units) to the
 * contraction table of t, keyed by its first code point.
 *
 * CE is the value the caller currently has for that first code point.
 *  - More units follow the first code point: the trailing units are marked
 *    unsafe, the last unit is marked as a contraction end, the contraction
 *    chain is created or extended, and the resulting CE is stored in
 *    t->mapping for the first code point.
 *  - Only one code point and CE is not a contraction: element->mapCE is stored
 *    in t->mapping.
 *  - Only one code point and CE is a contraction: slots 0 and 0xFFFF of that
 *    contraction are changed to element->mapCE.
 *
 * element->cPoints / element->cSize are advanced while recursing and restored
 * before returning. Returns the (possibly newly constructed) CE.
 */
static uint32_t uprv_uca_addContraction(tempUCATable *t, uint32_t CE,
                                        UCAElements *element, UErrorCode *status)
{
    CntTable *contractions = t->contractions;
    UChar32 cp;
    uint32_t cpsize = 0;

    // processContraction compares against this tag to recognise "its own" tables
    contractions->currentTag = CONTRACTION_TAG;

    // First we need to check if the contraction starts with a surrogate;
    // after this, cp is the first code point and cpsize its length in code units
    UTF_NEXT_CHAR(element->cPoints, cpsize, element->cSize, cp);

    if(cpsize<element->cSize) { // This is a real contraction, if there are other characters after the first
        uint32_t j = 0;
        for (j=1; j<element->cSize; j++) {   /* First add contraction chars to unsafe CP hash table */
            // Unless it is a trail surrogate, which is handled algorithmically and
            // shouldn't take up space in the table.
            if(!(UTF_IS_TRAIL(element->cPoints[j]))) {
                unsafeCPSet(t->unsafeCP, element->cPoints[j]);
            }
        }
        // Add the last char of the contraction to the contraction-end hash table.
        // unless it is a trail surrogate, which is handled algorithmically and
        // shouldn't be in the table
        if(!(UTF_IS_TRAIL(element->cPoints[element->cSize -1]))) {
            ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]);
        }

        // If there are any Jamos in the contraction, we should turn on special
        // processing for Jamos
        // NOTE(review): only the first code unit is tested here, not "any" - confirm intent
        if(UCOL_ISJAMO(element->cPoints[0])) {
            t->image->jamoSpecial = TRUE;
        }
        /* then we need to deal with it */
        /* we could already have something in table - or we might not */
        // step past the first code point; undone after the table update below
        element->cPoints+=cpsize;
        element->cSize-=cpsize;
        if(!isContraction(CE)) {
            /* if it wasn't contraction, we wouldn't end up here*/
            int32_t firstContractionOffset = 0;
            // new table: slot 0 holds the old CE, then the new branch, then the 0xFFFF terminator slot
            firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, CE, status);
            uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
            uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->cPoints, newCE, status);
            uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, CE, status);
            CE =  constructContractCE(CONTRACTION_TAG, firstContractionOffset);
        } else { /* we are adding to existing contraction */
            /* there were already some elements in the table, so we need to add a new contraction */
            /* Two things can happen here: either the codepoint is already in the table, or it is not */
            int32_t position = uprv_cnttab_findCP(contractions, CE, *element->cPoints, status);
            if(position > 0) {       /* if it is we just continue down the chain */
                uint32_t eCE = uprv_cnttab_getCE(contractions, CE, position, status);
                uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status);
                uprv_cnttab_setContraction(contractions, CE, position, *(element->cPoints), newCE, status);
            } else {                  /* if it isn't, we will have to create a new sequence */
                uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
                uprv_cnttab_insertContraction(contractions, CE, *(element->cPoints), newCE, status);
            }
        }
        // restore the caller's view of the element
        element->cPoints-=cpsize;
        element->cSize+=cpsize;
        /*ucmpe32_set(t->mapping, cp, CE);*/
        utrie_set32(t->mapping, cp, CE);
    } else if(!isContraction(CE)) { /* this is just a surrogate, and there is no contraction */
        /*ucmpe32_set(t->mapping, cp, element->mapCE);*/
        utrie_set32(t->mapping, cp, element->mapCE);
    } else { /* fill out the first stage of the contraction with the surrogate CE */
        uprv_cnttab_changeContraction(contractions, CE, 0, element->mapCE, status);
        uprv_cnttab_changeContraction(contractions, CE, 0xFFFF, element->mapCE, status);
    }
    return CE;
}


/*
 * Recursive worker that threads the remaining code units of element through
 * the contraction table.
 *
 * existingCE is what is currently stored for the code unit at element->cPoints
 * (UCOL_NOT_FOUND when nothing is stored). Each level consumes one code unit by
 * advancing element->cPoints and decrementing element->cSize, recurses, and
 * restores both before returning. The recursion ends when one unit is left.
 *
 * Returns UCOL_NOT_FOUND immediately when *status already indicates failure;
 * otherwise returns the CE the caller should store for the current code unit.
 */
static uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements *element, uint32_t existingCE, UErrorCode *status) {
    int32_t firstContractionOffset = 0;
    //    uint32_t contractionElement = UCOL_NOT_FOUND;

    if(U_FAILURE(*status)) {
        return UCOL_NOT_FOUND;
    }

    /* end of recursion */
    if(element->cSize == 1) {
        // a table of the tag currently being built already hangs off this unit:
        // overwrite its slot 0 and its 0xFFFF slot with the new mapping
        if(isCntTableElement(existingCE) && ((UColCETags)getCETag(existingCE) == contractions->currentTag)) {
            uprv_cnttab_changeContraction(contractions, existingCE, 0, element->mapCE, status);
            uprv_cnttab_changeContraction(contractions, existingCE, 0xFFFF, element->mapCE, status);
            return existingCE;
        } else {
            return element->mapCE; /*can't do just that. existingCe might be a contraction, meaning that we need to do another step */
        }
    }

    /* this recursion currently feeds on the only element we have... We will have to copy it in order to accommodate */
    /* for both backward and forward cycles */

    /* we encountered either an empty space or a non-contraction element */
    /* this means we are constructing a new contraction sequence */
    element->cPoints++;
    element->cSize--;
    if(!isCntTableElement(existingCE)) {
        /* if it wasn't contraction, we wouldn't end up here*/
        firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, existingCE, status);
        uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
        uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->cPoints, newCE, status);
        uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, existingCE, status);
        existingCE =  constructContractCE(contractions->currentTag, firstContractionOffset);
    } else { /* we are adding to existing contraction */
        /* there were already some elements in the table, so we need to add a new contraction */
        /* Two things can happen here: either the codepoint is already in the table, or it is not */
        int32_t position = uprv_cnttab_findCP(contractions, existingCE, *element->cPoints, status);
        if(position > 0) {       /* if it is we just continue down the chain */
            uint32_t eCE = uprv_cnttab_getCE(contractions, existingCE, position, status);
            uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status);
            uprv_cnttab_setContraction(contractions, existingCE, position, *(element->cPoints), newCE, status);
        } else {                  /* if it isn't, we will have to create a new sequence */
            uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
            uprv_cnttab_insertContraction(contractions, existingCE, *(element->cPoints), newCE, status);
        }
    }
    // undo the cursor advance so the caller sees the element unchanged
    element->cPoints--;
    element->cSize++;
    return existingCE;
}

/*
 * Stores element->mapCE for element->cPoints: sequences longer than one code
 * unit go through uprv_uca_addContraction, single units are written to
 * t->mapping (or merged into an existing contraction table).
 * For a single unit the return value is the value found in t->mapping before
 * the update; 0 is returned if a temporary allocation fails.
 */
static uint32_t
uprv_uca_finalizeAddition(tempUCATable *t, UCAElements *element, UErrorCode *status) { 1060 uint32_t CE = UCOL_NOT_FOUND; 1061 // This should add a completely ignorable element to the 1062 // unsafe table, so that backward iteration will skip 1063 // over it when treating contractions. 1064 uint32_t i = 0; 1065 if(element->mapCE == 0) { 1066 for(i = 0; i < element->cSize; i++) { 1067 if(!UTF_IS_TRAIL(element->cPoints[i])) { 1068 unsafeCPSet(t->unsafeCP, element->cPoints[i]); 1069 } 1070 } 1071 } 1072 if(element->cSize > 1) { /* we're adding a contraction */ 1073 uint32_t i = 0; 1074 UChar32 cp; 1075 1076 UTF_NEXT_CHAR(element->cPoints, i, element->cSize, cp); 1077 /*CE = ucmpe32_get(t->mapping, cp);*/ 1078 CE = utrie_get32(t->mapping, cp, NULL); 1079 1080 CE = uprv_uca_addContraction(t, CE, element, status); 1081 } else { /* easy case, */ 1082 /*CE = ucmpe32_get(t->mapping, element->cPoints[0]);*/ 1083 CE = utrie_get32(t->mapping, element->cPoints[0], NULL); 1084 1085 if( CE != UCOL_NOT_FOUND) { 1086 if(isCntTableElement(CE) /*isContraction(CE)*/) { /* adding a non contraction element (thai, expansion, single) to already existing contraction */ 1087 if(!isPrefix(element->mapCE)) { // we cannot reenter prefix elements - as we are going to create a dead loop 1088 // Only expansions and regular CEs can go here... 
Contractions will never happen in this place 1089 uprv_cnttab_setContraction(t->contractions, CE, 0, 0, element->mapCE, status); 1090 /* This loop has to change the CE at the end of contraction REDO!*/ 1091 uprv_cnttab_changeLastCE(t->contractions, CE, element->mapCE, status); 1092 } 1093 } else { 1094 /*ucmpe32_set(t->mapping, element->cPoints[0], element->mapCE);*/ 1095 utrie_set32(t->mapping, element->cPoints[0], element->mapCE); 1096 if ((element->prefixSize!=0) && (getCETag(CE)!=IMPLICIT_TAG)) { 1097 UCAElements *origElem = (UCAElements *)uprv_malloc(sizeof(UCAElements)); 1098 /* test for NULL */ 1099 if (origElem== NULL) { 1100 *status = U_MEMORY_ALLOCATION_ERROR; 1101 return 0; 1102 } 1103 /* copy the original UCA value */ 1104 origElem->prefixSize = 0; 1105 origElem->prefix = NULL; 1106 origElem->cPoints = origElem->uchars; 1107 origElem->cPoints[0] = element->cPoints[0]; 1108 origElem->cSize = 1; 1109 origElem->CEs[0]=CE; 1110 origElem->mapCE=CE; 1111 origElem->noOfCEs=1; 1112 uprv_uca_finalizeAddition(t, origElem, status); 1113 uprv_free(origElem); 1114 } 1115 #ifdef UCOL_DEBUG 1116 fprintf(stderr, "Warning - trying to overwrite existing data %08X for cp %04X with %08X\n", CE, element->cPoints[0], element->CEs[0]); 1117 //*status = U_ILLEGAL_ARGUMENT_ERROR; 1118 #endif 1119 } 1120 } else { 1121 /*ucmpe32_set(t->mapping, element->cPoints[0], element->mapCE);*/ 1122 utrie_set32(t->mapping, element->cPoints[0], element->mapCE); 1123 } 1124 } 1125 return CE; 1126 } 1127 1128 /* This adds a read element, while testing for existence */ 1129 U_CAPI uint32_t U_EXPORT2 1130 uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status) { 1131 U_NAMESPACE_USE 1132 1133 ExpansionTable *expansions = t->expansions; 1134 1135 uint32_t i = 1; 1136 uint32_t expansion = 0; 1137 uint32_t CE; 1138 1139 if(U_FAILURE(*status)) { 1140 return 0xFFFF; 1141 } 1142 1143 element->mapCE = 0; // clear mapCE so that we can catch expansions 1144 1145 
if(element->noOfCEs == 1) { 1146 element->mapCE = element->CEs[0]; 1147 } else { 1148 /* ICU 2.1 long primaries */ 1149 /* unfortunately, it looks like we have to look for a long primary here */ 1150 /* since in canonical closure we are going to hit some long primaries from */ 1151 /* the first phase, and they will come back as continuations/expansions */ 1152 /* destroying the effect of the previous opitimization */ 1153 /* A long primary is a three byte primary with starting secondaries and tertiaries */ 1154 /* It can appear in long runs of only primary differences (like east Asian tailorings) */ 1155 /* also, it should not be an expansion, as expansions would break with this */ 1156 // This part came in from ucol_bld.cpp 1157 //if(tok->expansion == 0 1158 //&& noOfBytes[0] == 3 && noOfBytes[1] == 1 && noOfBytes[2] == 1 1159 //&& CEparts[1] == (UCOL_BYTE_COMMON << 24) && CEparts[2] == (UCOL_BYTE_COMMON << 24)) { 1160 /* we will construct a special CE that will go unchanged to the table */ 1161 if(element->noOfCEs == 2 // a two CE expansion 1162 && isContinuation(element->CEs[1]) // which is a continuation 1163 && (element->CEs[1] & (~(0xFF << 24 | UCOL_CONTINUATION_MARKER))) == 0 // that has only primaries in continuation, 1164 && (((element->CEs[0]>>8) & 0xFF) == UCOL_BYTE_COMMON) // a common secondary 1165 && ((element->CEs[0] & 0xFF) == UCOL_BYTE_COMMON) // and a common tertiary 1166 ) 1167 { 1168 #ifdef UCOL_DEBUG 1169 fprintf(stdout, "Long primary %04X\n", element->cPoints[0]); 1170 #endif 1171 element->mapCE = UCOL_SPECIAL_FLAG | (LONG_PRIMARY_TAG<<24) // a long primary special 1172 | ((element->CEs[0]>>8) & 0xFFFF00) // first and second byte of primary 1173 | ((element->CEs[1]>>24) & 0xFF); // third byte of primary 1174 } 1175 else { 1176 expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT) 1177 | ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4) 1178 & 0xFFFFF0); 1179 1180 for(i = 1; 
i<element->noOfCEs; i++) { 1181 uprv_uca_addExpansion(expansions, element->CEs[i], status); 1182 } 1183 if(element->noOfCEs <= 0xF) { 1184 expansion |= element->noOfCEs; 1185 } else { 1186 uprv_uca_addExpansion(expansions, 0, status); 1187 } 1188 element->mapCE = expansion; 1189 uprv_uca_setMaxExpansion(element->CEs[element->noOfCEs - 1], 1190 (uint8_t)element->noOfCEs, 1191 t->maxExpansions, 1192 status); 1193 if(UCOL_ISJAMO(element->cPoints[0])) { 1194 t->image->jamoSpecial = TRUE; 1195 uprv_uca_setMaxJamoExpansion(element->cPoints[0], 1196 element->CEs[element->noOfCEs - 1], 1197 (uint8_t)element->noOfCEs, 1198 t->maxJamoExpansions, 1199 status); 1200 } 1201 if (U_FAILURE(*status)) { 1202 return 0; 1203 } 1204 } 1205 } 1206 1207 // We treat digits differently - they are "uber special" and should be 1208 // processed differently if numeric collation is on. 1209 UChar32 uniChar = 0; 1210 //printElement(element); 1211 if ((element->cSize == 2) && U16_IS_LEAD(element->cPoints[0])){ 1212 uniChar = U16_GET_SUPPLEMENTARY(element->cPoints[0], element->cPoints[1]); 1213 } else if (element->cSize == 1){ 1214 uniChar = element->cPoints[0]; 1215 } 1216 1217 // Here, we either have one normal CE OR mapCE is set. Therefore, we stuff only 1218 // one element to the expansion buffer. When we encounter a digit and we don't 1219 // do numeric collation, we will just pick the CE we have and break out of case 1220 // (see ucol.cpp ucol_prv_getSpecialCE && ucol_prv_getSpecialPrevCE). If we picked 1221 // a special, further processing will occur. If it's a simple CE, we'll return due 1222 // to how the loop is constructed. 
1223 if (uniChar != 0 && u_isdigit(uniChar)){ 1224 expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (DIGIT_TAG<<UCOL_TAG_SHIFT) | 1); // prepare the element 1225 if(element->mapCE) { // if there is an expansion, we'll pick it here 1226 expansion |= ((uprv_uca_addExpansion(expansions, element->mapCE, status)+(headersize>>2))<<4); 1227 } else { 1228 expansion |= ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4); 1229 } 1230 element->mapCE = expansion; 1231 1232 // Need to go back to the beginning of the digit string if in the middle! 1233 if(uniChar <= 0xFFFF) { // supplementaries are always unsafe. API takes UChars 1234 unsafeCPSet(t->unsafeCP, (UChar)uniChar); 1235 } 1236 } 1237 1238 // here we want to add the prefix structure. 1239 // I will try to process it as a reverse contraction, if possible. 1240 // prefix buffer is already reversed. 1241 1242 if(element->prefixSize!=0) { 1243 // We keep the seen prefix starter elements in a hashtable 1244 // we need it to be able to distinguish between the simple 1245 // codepoints and prefix starters. Also, we need to use it 1246 // for canonical closure. 
1247 1248 UCAElements *composed = (UCAElements *)uprv_malloc(sizeof(UCAElements)); 1249 /* test for NULL */ 1250 if (composed == NULL) { 1251 *status = U_MEMORY_ALLOCATION_ERROR; 1252 return 0; 1253 } 1254 uprv_memcpy(composed, element, sizeof(UCAElements)); 1255 composed->cPoints = composed->uchars; 1256 composed->prefix = composed->prefixChars; 1257 1258 composed->prefixSize = unorm_normalize(element->prefix, element->prefixSize, UNORM_NFC, 0, composed->prefix, 128, status); 1259 1260 1261 if(t->prefixLookup != NULL) { 1262 UCAElements *uCE = (UCAElements *)uhash_get(t->prefixLookup, element); 1263 if(uCE != NULL) { // there is already a set of code points here 1264 element->mapCE = uprv_uca_addPrefix(t, uCE->mapCE, element, status); 1265 } else { // no code points, so this spot is clean 1266 element->mapCE = uprv_uca_addPrefix(t, UCOL_NOT_FOUND, element, status); 1267 uCE = (UCAElements *)uprv_malloc(sizeof(UCAElements)); 1268 /* test for NULL */ 1269 if (uCE == NULL) { 1270 *status = U_MEMORY_ALLOCATION_ERROR; 1271 return 0; 1272 } 1273 uprv_memcpy(uCE, element, sizeof(UCAElements)); 1274 uCE->cPoints = uCE->uchars; 1275 uhash_put(t->prefixLookup, uCE, uCE, status); 1276 } 1277 if(composed->prefixSize != element->prefixSize || uprv_memcmp(composed->prefix, element->prefix, element->prefixSize)) { 1278 // do it! 
1279 composed->mapCE = uprv_uca_addPrefix(t, element->mapCE, composed, status); 1280 } 1281 } 1282 uprv_free(composed); 1283 } 1284 1285 // We need to use the canonical iterator here 1286 // the way we do it is to generate the canonically equivalent strings 1287 // for the contraction and then add the sequences that pass FCD check 1288 if(element->cSize > 1 && !(element->cSize==2 && UTF16_IS_LEAD(element->cPoints[0]) && UTF16_IS_TRAIL(element->cPoints[1]))) { // this is a contraction, we should check whether a composed form should also be included 1289 UnicodeString source(element->cPoints, element->cSize); 1290 CanonicalIterator it(source, *status); 1291 source = it.next(); 1292 while(!source.isBogus()) { 1293 if(Normalizer::quickCheck(source, UNORM_FCD, *status) != UNORM_NO) { 1294 element->cSize = source.extract(element->cPoints, 128, *status); 1295 uprv_uca_finalizeAddition(t, element, status); 1296 } 1297 source = it.next(); 1298 } 1299 CE = element->mapCE; 1300 } else { 1301 CE = uprv_uca_finalizeAddition(t, element, status); 1302 } 1303 1304 return CE; 1305 } 1306 1307 1308 /*void uprv_uca_getMaxExpansionJamo(CompactEIntArray *mapping, */ 1309 static void uprv_uca_getMaxExpansionJamo(UNewTrie *mapping, 1310 MaxExpansionTable *maxexpansion, 1311 MaxJamoExpansionTable *maxjamoexpansion, 1312 UBool jamospecial, 1313 UErrorCode *status) 1314 { 1315 const uint32_t VBASE = 0x1161; 1316 const uint32_t TBASE = 0x11A8; 1317 const uint32_t VCOUNT = 21; 1318 const uint32_t TCOUNT = 28; 1319 1320 uint32_t v = VBASE + VCOUNT - 1; 1321 uint32_t t = TBASE + TCOUNT - 1; 1322 uint32_t ce; 1323 1324 while (v >= VBASE) { 1325 /*ce = ucmpe32_get(mapping, v);*/ 1326 ce = utrie_get32(mapping, v, NULL); 1327 if (ce < UCOL_SPECIAL_FLAG) { 1328 uprv_uca_setMaxExpansion(ce, 2, maxexpansion, status); 1329 } 1330 v --; 1331 } 1332 1333 while (t >= TBASE) 1334 { 1335 /*ce = ucmpe32_get(mapping, t);*/ 1336 ce = utrie_get32(mapping, t, NULL); 1337 if (ce < UCOL_SPECIAL_FLAG) { 1338 
uprv_uca_setMaxExpansion(ce, 3, maxexpansion, status); 1339 } 1340 t --; 1341 } 1342 /* According to the docs, 99% of the time, the Jamo will not be special */ 1343 if (jamospecial) { 1344 /* gets the max expansion in all unicode characters */ 1345 int count = maxjamoexpansion->position; 1346 uint8_t maxTSize = (uint8_t)(maxjamoexpansion->maxLSize + 1347 maxjamoexpansion->maxVSize + 1348 maxjamoexpansion->maxTSize); 1349 uint8_t maxVSize = (uint8_t)(maxjamoexpansion->maxLSize + 1350 maxjamoexpansion->maxVSize); 1351 1352 while (count > 0) { 1353 count --; 1354 if (*(maxjamoexpansion->isV + count) == TRUE) { 1355 uprv_uca_setMaxExpansion( 1356 *(maxjamoexpansion->endExpansionCE + count), 1357 maxVSize, maxexpansion, status); 1358 } 1359 else { 1360 uprv_uca_setMaxExpansion( 1361 *(maxjamoexpansion->endExpansionCE + count), 1362 maxTSize, maxexpansion, status); 1363 } 1364 } 1365 } 1366 } 1367 1368 U_CDECL_BEGIN 1369 static inline uint32_t U_CALLCONV 1370 getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) 1371 { 1372 uint32_t value; 1373 uint32_t tag; 1374 UChar32 limit; 1375 UBool inBlockZero; 1376 1377 limit=start+0x400; 1378 while(start<limit) { 1379 value=utrie_get32(trie, start, &inBlockZero); 1380 tag = getCETag(value); 1381 if(inBlockZero == TRUE) { 1382 start+=UTRIE_DATA_BLOCK_LENGTH; 1383 } else if(!(isSpecial(value) && (tag == IMPLICIT_TAG || tag == NOT_FOUND_TAG))) { 1384 /* These are values that are starting in either UCA (IMPLICIT_TAG) or in the 1385 * tailorings (NOT_FOUND_TAG). Presence of these tags means that there is 1386 * nothing in this position and that it should be skipped. 
1387 */ 1388 #ifdef UCOL_DEBUG 1389 static int32_t count = 1; 1390 fprintf(stdout, "%i, Folded %08X, value %08X\n", count++, start, value); 1391 #endif 1392 return (uint32_t)(UCOL_SPECIAL_FLAG | (SURROGATE_TAG<<24) | offset); 1393 } else { 1394 ++start; 1395 } 1396 } 1397 return 0; 1398 } 1399 U_CDECL_END 1400 1401 #ifdef UCOL_DEBUG 1402 // This is a debug function to print the contents of a trie. 1403 // It is used in conjuction with the code around utrie_unserialize call 1404 void enumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) { 1405 if(start<0x10000) { 1406 fprintf(stdout, "%08X, %08X, %08X\n", start, limit, value); 1407 } else { 1408 fprintf(stdout, "%08X=%04X %04X, %08X=%04X %04X, %08X\n", start, UTF16_LEAD(start), UTF16_TRAIL(start), limit, UTF16_LEAD(limit), UTF16_TRAIL(limit), value); 1409 } 1410 } 1411 1412 int32_t 1413 myGetFoldingOffset(uint32_t data) { 1414 if(data > UCOL_NOT_FOUND && getCETag(data) == SURROGATE_TAG) { 1415 return (data&0xFFFFFF); 1416 } else { 1417 return 0; 1418 } 1419 } 1420 #endif 1421 1422 U_CAPI UCATableHeader* U_EXPORT2 1423 uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) { 1424 /*CompactEIntArray *mapping = t->mapping;*/ 1425 UNewTrie *mapping = t->mapping; 1426 ExpansionTable *expansions = t->expansions; 1427 CntTable *contractions = t->contractions; 1428 MaxExpansionTable *maxexpansion = t->maxExpansions; 1429 1430 if(U_FAILURE(*status)) { 1431 return NULL; 1432 } 1433 1434 uint32_t beforeContractions = (uint32_t)((headersize+paddedsize(expansions->position*sizeof(uint32_t)))/sizeof(UChar)); 1435 1436 int32_t contractionsSize = 0; 1437 contractionsSize = uprv_cnttab_constructTable(contractions, beforeContractions, status); 1438 1439 /* the following operation depends on the trie data. 
Therefore, we have to do it before */ 1440 /* the trie is compacted */ 1441 /* sets jamo expansions */ 1442 uprv_uca_getMaxExpansionJamo(mapping, maxexpansion, t->maxJamoExpansions, 1443 t->image->jamoSpecial, status); 1444 1445 /*ucmpe32_compact(mapping);*/ 1446 /*UMemoryStream *ms = uprv_mstrm_openNew(8192);*/ 1447 /*int32_t mappingSize = ucmpe32_flattenMem(mapping, ms);*/ 1448 /*const uint8_t *flattened = uprv_mstrm_getBuffer(ms, &mappingSize);*/ 1449 1450 // After setting the jamo expansions, compact the trie and get the needed size 1451 int32_t mappingSize = utrie_serialize(mapping, NULL, 0, getFoldedValue /*getFoldedValue*/, FALSE, status); 1452 1453 uint32_t tableOffset = 0; 1454 uint8_t *dataStart; 1455 1456 /* TODO: LATIN1 array is now in the utrie - it should be removed from the calculation */ 1457 1458 uint32_t toAllocate =(uint32_t)(headersize+ 1459 paddedsize(expansions->position*sizeof(uint32_t))+ 1460 paddedsize(mappingSize)+ 1461 paddedsize(contractionsSize*(sizeof(UChar)+sizeof(uint32_t)))+ 1462 //paddedsize(0x100*sizeof(uint32_t)) /* Latin1 is now included in the trie */ 1463 /* maxexpansion array */ 1464 + paddedsize(maxexpansion->position * sizeof(uint32_t)) + 1465 /* maxexpansion size array */ 1466 paddedsize(maxexpansion->position * sizeof(uint8_t)) + 1467 paddedsize(UCOL_UNSAFECP_TABLE_SIZE) + /* Unsafe chars */ 1468 paddedsize(UCOL_UNSAFECP_TABLE_SIZE)); /* Contraction Ending chars */ 1469 1470 1471 dataStart = (uint8_t *)uprv_malloc(toAllocate); 1472 /* test for NULL */ 1473 if (dataStart == NULL) { 1474 *status = U_MEMORY_ALLOCATION_ERROR; 1475 return NULL; 1476 } 1477 1478 UCATableHeader *myData = (UCATableHeader *)dataStart; 1479 // Please, do reset all the fields! 
1480 uprv_memset(dataStart, 0, toAllocate); 1481 // Make sure we know this is reset 1482 myData->magic = UCOL_HEADER_MAGIC; 1483 myData->isBigEndian = U_IS_BIG_ENDIAN; 1484 myData->charSetFamily = U_CHARSET_FAMILY; 1485 myData->formatVersion[0] = UCA_FORMAT_VERSION_0; 1486 myData->formatVersion[1] = UCA_FORMAT_VERSION_1; 1487 myData->formatVersion[2] = UCA_FORMAT_VERSION_2; 1488 myData->formatVersion[3] = UCA_FORMAT_VERSION_3; 1489 myData->jamoSpecial = t->image->jamoSpecial; 1490 1491 // Don't copy stuff from UCA header! 1492 //uprv_memcpy(myData, t->image, sizeof(UCATableHeader)); 1493 1494 myData->contractionSize = contractionsSize; 1495 1496 tableOffset += (uint32_t)(paddedsize(sizeof(UCATableHeader))); 1497 1498 myData->options = tableOffset; 1499 uprv_memcpy(dataStart+tableOffset, t->options, sizeof(UColOptionSet)); 1500 tableOffset += (uint32_t)(paddedsize(sizeof(UColOptionSet))); 1501 1502 /* copy expansions */ 1503 /*myData->expansion = (uint32_t *)dataStart+tableOffset;*/ 1504 myData->expansion = tableOffset; 1505 uprv_memcpy(dataStart+tableOffset, expansions->CEs, expansions->position*sizeof(uint32_t)); 1506 tableOffset += (uint32_t)(paddedsize(expansions->position*sizeof(uint32_t))); 1507 1508 /* contractions block */ 1509 if(contractionsSize != 0) { 1510 /* copy contraction index */ 1511 /*myData->contractionIndex = (UChar *)(dataStart+tableOffset);*/ 1512 myData->contractionIndex = tableOffset; 1513 uprv_memcpy(dataStart+tableOffset, contractions->codePoints, contractionsSize*sizeof(UChar)); 1514 tableOffset += (uint32_t)(paddedsize(contractionsSize*sizeof(UChar))); 1515 1516 /* copy contraction collation elements */ 1517 /*myData->contractionCEs = (uint32_t *)(dataStart+tableOffset);*/ 1518 myData->contractionCEs = tableOffset; 1519 uprv_memcpy(dataStart+tableOffset, contractions->CEs, contractionsSize*sizeof(uint32_t)); 1520 tableOffset += (uint32_t)(paddedsize(contractionsSize*sizeof(uint32_t))); 1521 } else { 1522 myData->contractionIndex = 0; 
1523 myData->contractionCEs = 0; 1524 } 1525 1526 /* copy mapping table */ 1527 /*myData->mappingPosition = dataStart+tableOffset;*/ 1528 /*myData->mappingPosition = tableOffset;*/ 1529 /*uprv_memcpy(dataStart+tableOffset, flattened, mappingSize);*/ 1530 1531 myData->mappingPosition = tableOffset; 1532 utrie_serialize(mapping, dataStart+tableOffset, toAllocate-tableOffset, getFoldedValue, FALSE, status); 1533 #ifdef UCOL_DEBUG 1534 // This is debug code to dump the contents of the trie. It needs two functions defined above 1535 { 1536 UTrie UCAt = { 0 }; 1537 uint32_t trieWord; 1538 utrie_unserialize(&UCAt, dataStart+tableOffset, 9999999, status); 1539 UCAt.getFoldingOffset = myGetFoldingOffset; 1540 if(U_SUCCESS(*status)) { 1541 utrie_enum(&UCAt, NULL, enumRange, NULL); 1542 } 1543 trieWord = UTRIE_GET32_FROM_LEAD(UCAt, 0xDC01) 1544 } 1545 #endif 1546 tableOffset += paddedsize(mappingSize); 1547 1548 1549 int32_t i = 0; 1550 1551 /* copy max expansion table */ 1552 myData->endExpansionCE = tableOffset; 1553 myData->endExpansionCECount = maxexpansion->position - 1; 1554 /* not copying the first element which is a dummy */ 1555 uprv_memcpy(dataStart + tableOffset, maxexpansion->endExpansionCE + 1, 1556 (maxexpansion->position - 1) * sizeof(uint32_t)); 1557 tableOffset += (uint32_t)(paddedsize((maxexpansion->position)* sizeof(uint32_t))); 1558 myData->expansionCESize = tableOffset; 1559 uprv_memcpy(dataStart + tableOffset, maxexpansion->expansionCESize + 1, 1560 (maxexpansion->position - 1) * sizeof(uint8_t)); 1561 tableOffset += (uint32_t)(paddedsize((maxexpansion->position)* sizeof(uint8_t))); 1562 1563 /* Unsafe chars table. Finish it off, then copy it. */ 1564 uprv_uca_unsafeCPAddCCNZ(t, status); 1565 if (t->UCA != 0) { /* Or in unsafebits from UCA, making a combined table. 
*/ 1566 for (i=0; i<UCOL_UNSAFECP_TABLE_SIZE; i++) { 1567 t->unsafeCP[i] |= t->UCA->unsafeCP[i]; 1568 } 1569 } 1570 myData->unsafeCP = tableOffset; 1571 uprv_memcpy(dataStart + tableOffset, t->unsafeCP, UCOL_UNSAFECP_TABLE_SIZE); 1572 tableOffset += paddedsize(UCOL_UNSAFECP_TABLE_SIZE); 1573 1574 1575 /* Finish building Contraction Ending chars hash table and then copy it out. */ 1576 if (t->UCA != 0) { /* Or in unsafebits from UCA, making a combined table. */ 1577 for (i=0; i<UCOL_UNSAFECP_TABLE_SIZE; i++) { 1578 t->contrEndCP[i] |= t->UCA->contrEndCP[i]; 1579 } 1580 } 1581 myData->contrEndCP = tableOffset; 1582 uprv_memcpy(dataStart + tableOffset, t->contrEndCP, UCOL_UNSAFECP_TABLE_SIZE); 1583 tableOffset += paddedsize(UCOL_UNSAFECP_TABLE_SIZE); 1584 1585 if(tableOffset != toAllocate) { 1586 #ifdef UCOL_DEBUG 1587 fprintf(stderr, "calculation screwup!!! Expected to write %i but wrote %i instead!!!\n", toAllocate, tableOffset); 1588 #endif 1589 *status = U_INTERNAL_PROGRAM_ERROR; 1590 uprv_free(dataStart); 1591 return 0; 1592 } 1593 1594 myData->size = tableOffset; 1595 /* This should happen upon ressurection */ 1596 /*const uint8_t *mapPosition = (uint8_t*)myData+myData->mappingPosition;*/ 1597 /*uprv_mstrm_close(ms);*/ 1598 return myData; 1599 } 1600 1601 1602 struct enumStruct { 1603 tempUCATable *t; 1604 UCollator *tempColl; 1605 UCollationElements* colEl; 1606 int32_t noOfClosures; 1607 UErrorCode *status; 1608 }; 1609 U_CDECL_BEGIN 1610 static UBool U_CALLCONV 1611 _enumCategoryRangeClosureCategory(const void *context, UChar32 start, UChar32 limit, UCharCategory type) { 1612 1613 if (type != U_UNASSIGNED && type != U_PRIVATE_USE_CHAR) { // if the range is assigned - we might ommit more categories later 1614 UErrorCode *status = ((enumStruct *)context)->status; 1615 tempUCATable *t = ((enumStruct *)context)->t; 1616 UCollator *tempColl = ((enumStruct *)context)->tempColl; 1617 UCollationElements* colEl = ((enumStruct *)context)->colEl; 1618 UCAElements el; 
1619 UChar decomp[256] = { 0 }; 1620 int32_t noOfDec = 0; 1621 1622 UChar32 u32 = 0; 1623 UChar comp[2]; 1624 uint32_t len = 0; 1625 1626 for(u32 = start; u32 < limit; u32++) { 1627 noOfDec = unorm_getDecomposition(u32, FALSE, decomp, 256); 1628 //if((noOfDec = unorm_normalize(comp, len, UNORM_NFD, 0, decomp, 256, status)) > 1 1629 //|| (noOfDec == 1 && *decomp != (UChar)u32)) 1630 if(noOfDec > 0) // if we're positive, that means there is no decomposition 1631 { 1632 len = 0; 1633 UTF_APPEND_CHAR_UNSAFE(comp, len, u32); 1634 if(ucol_strcoll(tempColl, comp, len, decomp, noOfDec) != UCOL_EQUAL) { 1635 #ifdef UCOL_DEBUG 1636 fprintf(stderr, "Closure: %08X -> ", u32); 1637 uint32_t i = 0; 1638 for(i = 0; i<noOfDec; i++) { 1639 fprintf(stderr, "%04X ", decomp[i]); 1640 } 1641 fprintf(stderr, "\n"); 1642 #endif 1643 ((enumStruct *)context)->noOfClosures++; 1644 el.cPoints = decomp; 1645 el.cSize = noOfDec; 1646 el.noOfCEs = 0; 1647 el.prefix = el.prefixChars; 1648 el.prefixSize = 0; 1649 1650 UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, &el); 1651 el.cPoints = comp; 1652 el.cSize = len; 1653 el.prefix = el.prefixChars; 1654 el.prefixSize = 0; 1655 if(prefix == NULL) { 1656 el.noOfCEs = 0; 1657 ucol_setText(colEl, decomp, noOfDec, status); 1658 while((el.CEs[el.noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) { 1659 el.noOfCEs++; 1660 } 1661 } else { 1662 el.noOfCEs = 1; 1663 el.CEs[0] = prefix->mapCE; 1664 // This character uses a prefix. We have to add it 1665 // to the unsafe table, as it decomposed form is already 1666 // in. 
In Japanese, this happens for \u309e & \u30fe 1667 // Since unsafeCPSet is static in ucol_elm, we are going 1668 // to wrap it up in the uprv_uca_unsafeCPAddCCNZ function 1669 } 1670 uprv_uca_addAnElement(t, &el, status); 1671 } 1672 } 1673 } 1674 } 1675 return TRUE; 1676 } 1677 U_CDECL_END 1678 1679 static void 1680 uprv_uca_setMapCE(tempUCATable *t, UCAElements *element, UErrorCode *status) { 1681 uint32_t expansion = 0; 1682 int32_t j; 1683 1684 ExpansionTable *expansions = t->expansions; 1685 if(element->noOfCEs == 2 // a two CE expansion 1686 && isContinuation(element->CEs[1]) // which is a continuation 1687 && (element->CEs[1] & (~(0xFF << 24 | UCOL_CONTINUATION_MARKER))) == 0 // that has only primaries in continuation, 1688 && (((element->CEs[0]>>8) & 0xFF) == UCOL_BYTE_COMMON) // a common secondary 1689 && ((element->CEs[0] & 0xFF) == UCOL_BYTE_COMMON) // and a common tertiary 1690 ) { 1691 element->mapCE = UCOL_SPECIAL_FLAG | (LONG_PRIMARY_TAG<<24) // a long primary special 1692 | ((element->CEs[0]>>8) & 0xFFFF00) // first and second byte of primary 1693 | ((element->CEs[1]>>24) & 0xFF); // third byte of primary 1694 } else { 1695 expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT) 1696 | ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4) 1697 & 0xFFFFF0); 1698 1699 for(j = 1; j<(int32_t)element->noOfCEs; j++) { 1700 uprv_uca_addExpansion(expansions, element->CEs[j], status); 1701 } 1702 if(element->noOfCEs <= 0xF) { 1703 expansion |= element->noOfCEs; 1704 } else { 1705 uprv_uca_addExpansion(expansions, 0, status); 1706 } 1707 element->mapCE = expansion; 1708 uprv_uca_setMaxExpansion(element->CEs[element->noOfCEs - 1], 1709 (uint8_t)element->noOfCEs, 1710 t->maxExpansions, 1711 status); 1712 } 1713 } 1714 1715 static void 1716 uprv_uca_addFCD4AccentedContractions(tempUCATable *t, 1717 UCollationElements* colEl, 1718 UChar *data, 1719 int32_t len, 1720 UCAElements *el, 1721 UErrorCode *status) { 1722 
UChar decomp[256], comp[256]; 1723 int32_t decLen, compLen; 1724 1725 decLen = unorm_normalize(data, len, UNORM_NFD, 0, decomp, 256, status); 1726 compLen = unorm_normalize(data, len, UNORM_NFC, 0, comp, 256, status); 1727 decomp[decLen] = comp[compLen] = 0; 1728 1729 el->cPoints = decomp; 1730 el->cSize = decLen; 1731 el->noOfCEs = 0; 1732 el->prefixSize = 0; 1733 el->prefix = el->prefixChars; 1734 1735 UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, el); 1736 el->cPoints = comp; 1737 el->cSize = compLen; 1738 el->prefix = el->prefixChars; 1739 el->prefixSize = 0; 1740 if(prefix == NULL) { 1741 el->noOfCEs = 0; 1742 ucol_setText(colEl, decomp, decLen, status); 1743 while((el->CEs[el->noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) { 1744 el->noOfCEs++; 1745 } 1746 uprv_uca_setMapCE(t, el, status); 1747 uprv_uca_addAnElement(t, el, status); 1748 } 1749 } 1750 1751 static void 1752 uprv_uca_addMultiCMContractions(tempUCATable *t, 1753 UCollationElements* colEl, 1754 tempTailorContext *c, 1755 UCAElements *el, 1756 UErrorCode *status) { 1757 CombinClassTable *cmLookup = t->cmLookup; 1758 UChar newDecomp[256]; 1759 int32_t maxComp, newDecLen; 1760 UChar32 fcdHighStart; 1761 const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); 1762 if (U_FAILURE(*status)) { 1763 return; 1764 } 1765 int16_t curClass = (unorm_getFCD16(fcdTrieIndex, c->tailoringCM) & 0xff); 1766 CompData *precomp = c->precomp; 1767 int32_t compLen = c->compLen; 1768 UChar *comp = c->comp; 1769 maxComp = c->precompLen; 1770 1771 for (int32_t j=0; j < maxComp; j++) { 1772 int32_t count=0; 1773 do { 1774 if ( count == 0 ) { // Decompose the saved precomposed char. 
1775 UChar temp[2]; 1776 temp[0]=precomp[j].cp; 1777 temp[1]=0; 1778 newDecLen = unorm_normalize(temp, 1, UNORM_NFD, 0, 1779 newDecomp, sizeof(newDecomp)/sizeof(UChar), status); 1780 newDecomp[newDecLen++] = cmLookup->cPoints[c->cmPos]; 1781 } 1782 else { // swap 2 combining marks when they are equal. 1783 uprv_memcpy(newDecomp, c->decomp, sizeof(UChar)*(c->decompLen)); 1784 newDecLen = c->decompLen; 1785 newDecomp[newDecLen++] = precomp[j].cClass; 1786 } 1787 newDecomp[newDecLen] = 0; 1788 compLen = unorm_normalize(newDecomp, newDecLen, UNORM_NFC, 0, 1789 comp, 256, status); 1790 if (compLen==1) { 1791 comp[compLen++] = newDecomp[newDecLen++] = c->tailoringCM; 1792 comp[compLen] = newDecomp[newDecLen] = 0; 1793 el->cPoints = newDecomp; 1794 el->cSize = newDecLen; 1795 1796 UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, el); 1797 el->cPoints = c->comp; 1798 el->cSize = compLen; 1799 el->prefix = el->prefixChars; 1800 el->prefixSize = 0; 1801 if(prefix == NULL) { 1802 el->noOfCEs = 0; 1803 ucol_setText(colEl, newDecomp, newDecLen, status); 1804 while((el->CEs[el->noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) { 1805 el->noOfCEs++; 1806 } 1807 uprv_uca_setMapCE(t, el, status); 1808 uprv_uca_finalizeAddition(t, el, status); 1809 1810 // Save the current precomposed char and its class to find any 1811 // other combining mark combinations. 
1812 precomp[c->precompLen].cp=comp[0]; 1813 precomp[c->precompLen].cClass = curClass; 1814 c->precompLen++; 1815 } 1816 } 1817 } while (++count<2 && (precomp[j].cClass == curClass)); 1818 } 1819 1820 } 1821 1822 static void 1823 uprv_uca_addTailCanonicalClosures(tempUCATable *t, 1824 UCollationElements* colEl, 1825 UChar baseCh, 1826 UChar cMark, 1827 UCAElements *el, 1828 UErrorCode *status) { 1829 CombinClassTable *cmLookup = t->cmLookup; 1830 UChar32 fcdHighStart; 1831 const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); 1832 if (U_FAILURE(*status)) { 1833 return; 1834 } 1835 int16_t maxIndex = (unorm_getFCD16(fcdTrieIndex, cMark) & 0xff ); 1836 UCAElements element; 1837 uint16_t *index; 1838 UChar decomp[256]; 1839 UChar comp[256]; 1840 CompData precomp[256]; // precomposed array 1841 int32_t precompLen = 0; // count for precomp 1842 int32_t i, len, decompLen, curClass, replacedPos; 1843 tempTailorContext c; 1844 1845 if ( cmLookup == NULL ) { 1846 return; 1847 } 1848 index = cmLookup->index; 1849 int32_t cClass=(unorm_getFCD16(fcdTrieIndex, cMark) & 0xff); 1850 maxIndex = (int32_t)index[(unorm_getFCD16(fcdTrieIndex, cMark) & 0xff)-1]; 1851 c.comp = comp; 1852 c.decomp = decomp; 1853 c.precomp = precomp; 1854 c.tailoringCM = cMark; 1855 1856 if (cClass>0) { 1857 maxIndex = (int32_t)index[cClass-1]; 1858 } 1859 else { 1860 maxIndex=0; 1861 } 1862 decomp[0]=baseCh; 1863 for ( i=0; i<maxIndex ; i++ ) { 1864 decomp[1] = cmLookup->cPoints[i]; 1865 decomp[2]=0; 1866 decompLen=2; 1867 len = unorm_normalize(decomp, decompLen, UNORM_NFC, 0, comp, 256, status); 1868 if (len==1) { 1869 // Save the current precomposed char and its class to find any 1870 // other combining mark combinations. 
1871 precomp[precompLen].cp=comp[0]; 1872 curClass = precomp[precompLen].cClass = 1873 index[unorm_getFCD16(fcdTrieIndex, decomp[1]) & 0xff]; 1874 precompLen++; 1875 replacedPos=0; 1876 for (decompLen=0; decompLen< (int32_t)el->cSize; decompLen++) { 1877 decomp[decompLen] = el->cPoints[decompLen]; 1878 if (decomp[decompLen]==cMark) { 1879 replacedPos = decompLen; // record the position for later use 1880 } 1881 } 1882 if ( replacedPos != 0 ) { 1883 decomp[replacedPos]=cmLookup->cPoints[i]; 1884 } 1885 decomp[decompLen] = 0; 1886 len = unorm_normalize(decomp, decompLen, UNORM_NFC, 0, comp, 256, status); 1887 comp[len++] = decomp[decompLen++] = cMark; 1888 comp[len] = decomp[decompLen] = 0; 1889 element.cPoints = decomp; 1890 element.cSize = decompLen; 1891 element.noOfCEs = 0; 1892 element.prefix = el->prefixChars; 1893 element.prefixSize = 0; 1894 1895 UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, &element); 1896 element.cPoints = comp; 1897 element.cSize = len; 1898 element.prefix = el->prefixChars; 1899 element.prefixSize = 0; 1900 if(prefix == NULL) { 1901 element.noOfCEs = 0; 1902 ucol_setText(colEl, decomp, decompLen, status); 1903 while((element.CEs[element.noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) { 1904 element.noOfCEs++; 1905 } 1906 uprv_uca_setMapCE(t, &element, status); 1907 uprv_uca_finalizeAddition(t, &element, status); 1908 } 1909 1910 // This is a fix for tailoring contractions with accented 1911 // character at the end of contraction string. 
1912 if ((len>2) && 1913 (unorm_getFCD16(fcdTrieIndex, comp[len-2]) & 0xff00)==0) { 1914 uprv_uca_addFCD4AccentedContractions(t, colEl, comp, len, &element, status); 1915 } 1916 1917 if (precompLen >1) { 1918 c.compLen = len; 1919 c.decompLen = decompLen; 1920 c.precompLen = precompLen; 1921 c.cmPos = i; 1922 uprv_uca_addMultiCMContractions(t, colEl, &c, &element, status); 1923 precompLen = c.precompLen; 1924 } 1925 } 1926 } 1927 } 1928 1929 U_CFUNC int32_t U_EXPORT2 1930 uprv_uca_canonicalClosure(tempUCATable *t, 1931 UColTokenParser *src, 1932 UErrorCode *status) 1933 { 1934 enumStruct context; 1935 context.noOfClosures = 0; 1936 UCAElements el; 1937 UColToken *tok; 1938 uint32_t i = 0, j = 0; 1939 UChar baseChar, firstCM; 1940 UChar32 fcdHighStart; 1941 const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); 1942 1943 if(U_FAILURE(*status)) { 1944 return 0; 1945 } 1946 1947 UCollator *tempColl = NULL; 1948 tempUCATable *tempTable = uprv_uca_cloneTempTable(t, status); 1949 // Check for null pointer 1950 if (U_FAILURE(*status)) { 1951 return 0; 1952 } 1953 1954 UCATableHeader *tempData = uprv_uca_assembleTable(tempTable, status); 1955 tempColl = ucol_initCollator(tempData, 0, t->UCA, status); 1956 if ( tempTable->cmLookup != NULL ) { 1957 t->cmLookup = tempTable->cmLookup; // copy over to t 1958 tempTable->cmLookup = NULL; 1959 } 1960 uprv_uca_closeTempTable(tempTable); 1961 1962 if(U_SUCCESS(*status)) { 1963 tempColl->ucaRules = NULL; 1964 tempColl->actualLocale = NULL; 1965 tempColl->validLocale = NULL; 1966 tempColl->requestedLocale = NULL; 1967 tempColl->hasRealData = TRUE; 1968 tempColl->freeImageOnClose = TRUE; 1969 } else if(tempData != 0) { 1970 uprv_free(tempData); 1971 } 1972 1973 /* produce canonical closure */ 1974 UCollationElements* colEl = ucol_openElements(tempColl, NULL, 0, status); 1975 // Check for null pointer 1976 if (U_FAILURE(*status)) { 1977 return 0; 1978 } 1979 context.t = t; 1980 context.tempColl = tempColl; 1981 
context.colEl = colEl; 1982 context.status = status; 1983 u_enumCharTypes(_enumCategoryRangeClosureCategory, &context); 1984 1985 if ( (src==NULL) || !src->buildCCTabFlag ) { 1986 ucol_closeElements(colEl); 1987 ucol_close(tempColl); 1988 return context.noOfClosures; // no extra contraction needed to add 1989 } 1990 1991 for (i=0; i < src->resultLen; i++) { 1992 baseChar = firstCM= (UChar)0; 1993 tok = src->lh[i].first; 1994 while (tok != NULL && U_SUCCESS(*status)) { 1995 el.prefix = el.prefixChars; 1996 el.cPoints = el.uchars; 1997 if(tok->prefix != 0) { 1998 el.prefixSize = tok->prefix>>24; 1999 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar)); 2000 2001 el.cSize = (tok->source >> 24)-(tok->prefix>>24); 2002 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar)); 2003 } else { 2004 el.prefixSize = 0; 2005 *el.prefix = 0; 2006 2007 el.cSize = (tok->source >> 24); 2008 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar)); 2009 } 2010 if(src->UCA != NULL) { 2011 for(j = 0; j<el.cSize; j++) { 2012 int16_t fcd = unorm_getFCD16(fcdTrieIndex, el.cPoints[j]); 2013 if ( (fcd & 0xff) == 0 ) { 2014 baseChar = el.cPoints[j]; // last base character 2015 firstCM=0; // reset combining mark value 2016 } 2017 else { 2018 if ( (baseChar!=0) && (firstCM==0) ) { 2019 firstCM = el.cPoints[j]; // first combining mark 2020 } 2021 } 2022 } 2023 } 2024 if ( (baseChar!= (UChar)0) && (firstCM != (UChar)0) ) { 2025 // find all the canonical rules 2026 uprv_uca_addTailCanonicalClosures(t, colEl, baseChar, firstCM, &el, status); 2027 } 2028 tok = tok->next; 2029 } 2030 } 2031 ucol_closeElements(colEl); 2032 ucol_close(tempColl); 2033 2034 return context.noOfClosures; 2035 } 2036 2037 #endif /* #if !UCONFIG_NO_COLLATION */ 2038