1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2001-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: ucaelems.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created 02/22/2001 14 * created by: Vladimir Weinstein 15 * 16 * This program reads the Franctional UCA table and generates 17 * internal format for UCA table as well as inverse UCA table. 18 * It then writes binary files containing the data: ucadata.dat 19 * & invuca.dat 20 * 21 * date name comments 22 * 03/02/2001 synwee added setMaxExpansion 23 * 03/07/2001 synwee merged UCA's maxexpansion and tailoring's 24 */ 25 26 #include "unicode/utypes.h" 27 28 #if !UCONFIG_NO_COLLATION 29 30 #include "unicode/uchar.h" 31 #include "unicode/unistr.h" 32 #include "unicode/ucoleitr.h" 33 #include "unicode/normlzr.h" 34 #include "normalizer2impl.h" 35 #include "ucol_elm.h" 36 #include "ucol_tok.h" 37 #include "ucol_cnt.h" 38 #include "unicode/caniter.h" 39 #include "cmemory.h" 40 41 U_NAMESPACE_USE 42 43 static uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements *element, uint32_t existingCE, UErrorCode *status); 44 45 U_CDECL_BEGIN 46 static int32_t U_CALLCONV 47 prefixLookupHash(const UHashTok e) { 48 UCAElements *element = (UCAElements *)e.pointer; 49 UChar buf[256]; 50 UHashTok key; 51 key.pointer = buf; 52 uprv_memcpy(buf, element->cPoints, element->cSize*sizeof(UChar)); 53 buf[element->cSize] = 0; 54 //key.pointer = element->cPoints; 55 //element->cPoints[element->cSize] = 0; 56 return uhash_hashUChars(key); 57 } 58 59 static int8_t U_CALLCONV 60 prefixLookupComp(const UHashTok e1, const UHashTok e2) { 61 UCAElements *element1 = (UCAElements *)e1.pointer; 62 UCAElements *element2 = (UCAElements *)e2.pointer; 63 64 UChar buf1[256]; 65 UHashTok key1; 66 
key1.pointer = buf1; 67 uprv_memcpy(buf1, element1->cPoints, element1->cSize*sizeof(UChar)); 68 buf1[element1->cSize] = 0; 69 70 UChar buf2[256]; 71 UHashTok key2; 72 key2.pointer = buf2; 73 uprv_memcpy(buf2, element2->cPoints, element2->cSize*sizeof(UChar)); 74 buf2[element2->cSize] = 0; 75 76 return uhash_compareUChars(key1, key2); 77 } 78 U_CDECL_END 79 80 static int32_t uprv_uca_addExpansion(ExpansionTable *expansions, uint32_t value, UErrorCode *status) { 81 if(U_FAILURE(*status)) { 82 return 0; 83 } 84 if(expansions->CEs == NULL) { 85 expansions->CEs = (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE*sizeof(uint32_t)); 86 /* test for NULL */ 87 if (expansions->CEs == NULL) { 88 *status = U_MEMORY_ALLOCATION_ERROR; 89 return 0; 90 } 91 expansions->size = INIT_EXP_TABLE_SIZE; 92 expansions->position = 0; 93 } 94 95 if(expansions->position == expansions->size) { 96 uint32_t *newData = (uint32_t *)uprv_realloc(expansions->CEs, 2*expansions->size*sizeof(uint32_t)); 97 if(newData == NULL) { 98 #ifdef UCOL_DEBUG 99 fprintf(stderr, "out of memory for expansions\n"); 100 #endif 101 *status = U_MEMORY_ALLOCATION_ERROR; 102 return -1; 103 } 104 expansions->CEs = newData; 105 expansions->size *= 2; 106 } 107 108 expansions->CEs[expansions->position] = value; 109 return(expansions->position++); 110 } 111 112 U_CAPI tempUCATable* U_EXPORT2 113 uprv_uca_initTempTable(UCATableHeader *image, UColOptionSet *opts, const UCollator *UCA, UColCETags initTag, UColCETags supplementaryInitTag, UErrorCode *status) { 114 MaxJamoExpansionTable *maxjet; 115 MaxExpansionTable *maxet; 116 tempUCATable *t = (tempUCATable *)uprv_malloc(sizeof(tempUCATable)); 117 /* test for NULL */ 118 if (t == NULL) { 119 *status = U_MEMORY_ALLOCATION_ERROR; 120 return NULL; 121 } 122 uprv_memset(t, 0, sizeof(tempUCATable)); 123 124 maxet = (MaxExpansionTable *)uprv_malloc(sizeof(MaxExpansionTable)); 125 if (maxet == NULL) { 126 goto allocation_failure; 127 } 128 uprv_memset(maxet, 0, sizeof(MaxExpansionTable)); 
129 t->maxExpansions = maxet; 130 131 maxjet = (MaxJamoExpansionTable *)uprv_malloc(sizeof(MaxJamoExpansionTable)); 132 if (maxjet == NULL) { 133 goto allocation_failure; 134 } 135 uprv_memset(maxjet, 0, sizeof(MaxJamoExpansionTable)); 136 t->maxJamoExpansions = maxjet; 137 138 t->image = image; 139 t->options = opts; 140 141 t->UCA = UCA; 142 t->expansions = (ExpansionTable *)uprv_malloc(sizeof(ExpansionTable)); 143 /* test for NULL */ 144 if (t->expansions == NULL) { 145 goto allocation_failure; 146 } 147 uprv_memset(t->expansions, 0, sizeof(ExpansionTable)); 148 149 t->mapping = utrie_open(NULL, NULL, UCOL_ELM_TRIE_CAPACITY, 150 UCOL_SPECIAL_FLAG | (initTag<<24), 151 UCOL_SPECIAL_FLAG | (supplementaryInitTag << 24), 152 TRUE); // Do your own mallocs for the structure, array and have linear Latin 1 153 if (U_FAILURE(*status)) { 154 goto allocation_failure; 155 } 156 t->prefixLookup = uhash_open(prefixLookupHash, prefixLookupComp, NULL, status); 157 if (U_FAILURE(*status)) { 158 goto allocation_failure; 159 } 160 uhash_setValueDeleter(t->prefixLookup, uhash_freeBlock); 161 162 t->contractions = uprv_cnttab_open(t->mapping, status); 163 if (U_FAILURE(*status)) { 164 goto cleanup; 165 } 166 167 /* copy UCA's maxexpansion and merge as we go along */ 168 if (UCA != NULL) { 169 /* adding an extra initial value for easier manipulation */ 170 maxet->size = (int32_t)(UCA->lastEndExpansionCE - UCA->endExpansionCE) + 2; 171 maxet->position = maxet->size - 1; 172 maxet->endExpansionCE = 173 (uint32_t *)uprv_malloc(sizeof(uint32_t) * maxet->size); 174 /* test for NULL */ 175 if (maxet->endExpansionCE == NULL) { 176 goto allocation_failure; 177 } 178 maxet->expansionCESize = 179 (uint8_t *)uprv_malloc(sizeof(uint8_t) * maxet->size); 180 /* test for NULL */ 181 if (maxet->expansionCESize == NULL) { 182 goto allocation_failure; 183 } 184 /* initialized value */ 185 *(maxet->endExpansionCE) = 0; 186 *(maxet->expansionCESize) = 0; 187 uprv_memcpy(maxet->endExpansionCE + 1, 
UCA->endExpansionCE, 188 sizeof(uint32_t) * (maxet->size - 1)); 189 uprv_memcpy(maxet->expansionCESize + 1, UCA->expansionCESize, 190 sizeof(uint8_t) * (maxet->size - 1)); 191 } 192 else { 193 maxet->size = 0; 194 } 195 maxjet->endExpansionCE = NULL; 196 maxjet->isV = NULL; 197 maxjet->size = 0; 198 maxjet->position = 0; 199 maxjet->maxLSize = 1; 200 maxjet->maxVSize = 1; 201 maxjet->maxTSize = 1; 202 203 t->unsafeCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE); 204 /* test for NULL */ 205 if (t->unsafeCP == NULL) { 206 goto allocation_failure; 207 } 208 t->contrEndCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE); 209 /* test for NULL */ 210 if (t->contrEndCP == NULL) { 211 goto allocation_failure; 212 } 213 uprv_memset(t->unsafeCP, 0, UCOL_UNSAFECP_TABLE_SIZE); 214 uprv_memset(t->contrEndCP, 0, UCOL_UNSAFECP_TABLE_SIZE); 215 t->cmLookup = NULL; 216 return t; 217 218 allocation_failure: 219 *status = U_MEMORY_ALLOCATION_ERROR; 220 cleanup: 221 uprv_uca_closeTempTable(t); 222 return NULL; 223 } 224 225 static tempUCATable* U_EXPORT2 226 uprv_uca_cloneTempTable(tempUCATable *t, UErrorCode *status) { 227 if(U_FAILURE(*status)) { 228 return NULL; 229 } 230 231 tempUCATable *r = (tempUCATable *)uprv_malloc(sizeof(tempUCATable)); 232 /* test for NULL */ 233 if (r == NULL) { 234 *status = U_MEMORY_ALLOCATION_ERROR; 235 return NULL; 236 } 237 uprv_memset(r, 0, sizeof(tempUCATable)); 238 239 /* mapping */ 240 if(t->mapping != NULL) { 241 /*r->mapping = ucmpe32_clone(t->mapping, status);*/ 242 r->mapping = utrie_clone(NULL, t->mapping, NULL, 0); 243 } 244 245 // a hashing clone function would be very nice. We have none currently... 246 // However, we should be good, as closing should not produce any prefixed elements. 
247 r->prefixLookup = NULL; // prefixes are not used in closing 248 249 /* expansions */ 250 if(t->expansions != NULL) { 251 r->expansions = (ExpansionTable *)uprv_malloc(sizeof(ExpansionTable)); 252 /* test for NULL */ 253 if (r->expansions == NULL) { 254 *status = U_MEMORY_ALLOCATION_ERROR; 255 goto cleanup; 256 } 257 r->expansions->position = t->expansions->position; 258 r->expansions->size = t->expansions->size; 259 if(t->expansions->CEs != NULL) { 260 r->expansions->CEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->expansions->size); 261 /* test for NULL */ 262 if (r->expansions->CEs == NULL) { 263 *status = U_MEMORY_ALLOCATION_ERROR; 264 goto cleanup; 265 } 266 uprv_memcpy(r->expansions->CEs, t->expansions->CEs, sizeof(uint32_t)*t->expansions->position); 267 } else { 268 r->expansions->CEs = NULL; 269 } 270 } 271 272 if(t->contractions != NULL) { 273 r->contractions = uprv_cnttab_clone(t->contractions, status); 274 // Check for cloning failure. 275 if (r->contractions == NULL) { 276 *status = U_MEMORY_ALLOCATION_ERROR; 277 goto cleanup; 278 } 279 r->contractions->mapping = r->mapping; 280 } 281 282 if(t->maxExpansions != NULL) { 283 r->maxExpansions = (MaxExpansionTable *)uprv_malloc(sizeof(MaxExpansionTable)); 284 /* test for NULL */ 285 if (r->maxExpansions == NULL) { 286 *status = U_MEMORY_ALLOCATION_ERROR; 287 goto cleanup; 288 } 289 r->maxExpansions->size = t->maxExpansions->size; 290 r->maxExpansions->position = t->maxExpansions->position; 291 if(t->maxExpansions->endExpansionCE != NULL) { 292 r->maxExpansions->endExpansionCE = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->maxExpansions->size); 293 /* test for NULL */ 294 if (r->maxExpansions->endExpansionCE == NULL) { 295 *status = U_MEMORY_ALLOCATION_ERROR; 296 goto cleanup; 297 } 298 uprv_memset(r->maxExpansions->endExpansionCE, 0xDB, sizeof(uint32_t)*t->maxExpansions->size); 299 uprv_memcpy(r->maxExpansions->endExpansionCE, t->maxExpansions->endExpansionCE, 
t->maxExpansions->position*sizeof(uint32_t)); 300 } else { 301 r->maxExpansions->endExpansionCE = NULL; 302 } 303 if(t->maxExpansions->expansionCESize != NULL) { 304 r->maxExpansions->expansionCESize = (uint8_t *)uprv_malloc(sizeof(uint8_t)*t->maxExpansions->size); 305 /* test for NULL */ 306 if (r->maxExpansions->expansionCESize == NULL) { 307 *status = U_MEMORY_ALLOCATION_ERROR; 308 goto cleanup; 309 } 310 uprv_memset(r->maxExpansions->expansionCESize, 0xDB, sizeof(uint8_t)*t->maxExpansions->size); 311 uprv_memcpy(r->maxExpansions->expansionCESize, t->maxExpansions->expansionCESize, t->maxExpansions->position*sizeof(uint8_t)); 312 } else { 313 r->maxExpansions->expansionCESize = NULL; 314 } 315 } 316 317 if(t->maxJamoExpansions != NULL) { 318 r->maxJamoExpansions = (MaxJamoExpansionTable *)uprv_malloc(sizeof(MaxJamoExpansionTable)); 319 /* test for NULL */ 320 if (r->maxJamoExpansions == NULL) { 321 *status = U_MEMORY_ALLOCATION_ERROR; 322 goto cleanup; 323 } 324 r->maxJamoExpansions->size = t->maxJamoExpansions->size; 325 r->maxJamoExpansions->position = t->maxJamoExpansions->position; 326 r->maxJamoExpansions->maxLSize = t->maxJamoExpansions->maxLSize; 327 r->maxJamoExpansions->maxVSize = t->maxJamoExpansions->maxVSize; 328 r->maxJamoExpansions->maxTSize = t->maxJamoExpansions->maxTSize; 329 if(t->maxJamoExpansions->size != 0) { 330 r->maxJamoExpansions->endExpansionCE = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->maxJamoExpansions->size); 331 /* test for NULL */ 332 if (r->maxJamoExpansions->endExpansionCE == NULL) { 333 *status = U_MEMORY_ALLOCATION_ERROR; 334 goto cleanup; 335 } 336 uprv_memcpy(r->maxJamoExpansions->endExpansionCE, t->maxJamoExpansions->endExpansionCE, t->maxJamoExpansions->position*sizeof(uint32_t)); 337 r->maxJamoExpansions->isV = (UBool *)uprv_malloc(sizeof(UBool)*t->maxJamoExpansions->size); 338 /* test for NULL */ 339 if (r->maxJamoExpansions->isV == NULL) { 340 *status = U_MEMORY_ALLOCATION_ERROR; 341 goto cleanup; 342 } 343 
uprv_memcpy(r->maxJamoExpansions->isV, t->maxJamoExpansions->isV, t->maxJamoExpansions->position*sizeof(UBool)); 344 } else { 345 r->maxJamoExpansions->endExpansionCE = NULL; 346 r->maxJamoExpansions->isV = NULL; 347 } 348 } 349 350 if(t->unsafeCP != NULL) { 351 r->unsafeCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE); 352 /* test for NULL */ 353 if (r->unsafeCP == NULL) { 354 *status = U_MEMORY_ALLOCATION_ERROR; 355 goto cleanup; 356 } 357 uprv_memcpy(r->unsafeCP, t->unsafeCP, UCOL_UNSAFECP_TABLE_SIZE); 358 } 359 360 if(t->contrEndCP != NULL) { 361 r->contrEndCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE); 362 /* test for NULL */ 363 if (r->contrEndCP == NULL) { 364 *status = U_MEMORY_ALLOCATION_ERROR; 365 goto cleanup; 366 } 367 uprv_memcpy(r->contrEndCP, t->contrEndCP, UCOL_UNSAFECP_TABLE_SIZE); 368 } 369 370 r->UCA = t->UCA; 371 r->image = t->image; 372 r->options = t->options; 373 374 return r; 375 cleanup: 376 uprv_uca_closeTempTable(t); 377 return NULL; 378 } 379 380 381 U_CAPI void U_EXPORT2 382 uprv_uca_closeTempTable(tempUCATable *t) { 383 if(t != NULL) { 384 if (t->expansions != NULL) { 385 uprv_free(t->expansions->CEs); 386 uprv_free(t->expansions); 387 } 388 if(t->contractions != NULL) { 389 uprv_cnttab_close(t->contractions); 390 } 391 if (t->mapping != NULL) { 392 utrie_close(t->mapping); 393 } 394 395 if(t->prefixLookup != NULL) { 396 uhash_close(t->prefixLookup); 397 } 398 399 if (t->maxExpansions != NULL) { 400 uprv_free(t->maxExpansions->endExpansionCE); 401 uprv_free(t->maxExpansions->expansionCESize); 402 uprv_free(t->maxExpansions); 403 } 404 405 if (t->maxJamoExpansions->size > 0) { 406 uprv_free(t->maxJamoExpansions->endExpansionCE); 407 uprv_free(t->maxJamoExpansions->isV); 408 } 409 uprv_free(t->maxJamoExpansions); 410 411 uprv_free(t->unsafeCP); 412 uprv_free(t->contrEndCP); 413 414 if (t->cmLookup != NULL) { 415 uprv_free(t->cmLookup->cPoints); 416 uprv_free(t->cmLookup); 417 } 418 419 uprv_free(t); 420 } 421 } 422 423 /** 
 * Looks for the maximum length of all expansion sequences ending with the same
 * collation element and records it in maxexpansion. The end CEs are kept in
 * ascending order; entry 0 is a placeholder with CE 0 and size 0. If the end
 * CE is already present only its size is raised (never lowered); otherwise
 * the CE is inserted at its sorted position.
 * @param endexpansion the last expansion collation element to be added
 * @param expansionsize size of the expansion
 * @param maxexpansion data structure to store the maximum expansion data.
 * @param status error status; set to U_MEMORY_ALLOCATION_ERROR if an
 *               allocation fails
 * @returns maxexpansion->position after the update, or 0 if an allocation
 *          failed.
 */
static int uprv_uca_setMaxExpansion(uint32_t           endexpansion,
                                    uint8_t            expansionsize,
                                    MaxExpansionTable *maxexpansion,
                                    UErrorCode        *status)
{
    /* lazily create both parallel arrays on first use */
    if (maxexpansion->size == 0) {
        /* we'll always make the first element 0, for easier manipulation */
        maxexpansion->endExpansionCE =
            (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(int32_t));
        /* test for NULL */
        if (maxexpansion->endExpansionCE == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        *(maxexpansion->endExpansionCE) = 0;
        maxexpansion->expansionCESize =
            (uint8_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(uint8_t));
        /* test for NULL */;
        if (maxexpansion->expansionCESize == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        *(maxexpansion->expansionCESize) = 0;
        maxexpansion->size     = INIT_EXP_TABLE_SIZE;
        maxexpansion->position = 0;
    }

    /* an insertion writes up to index position + 1, so grow while that index
       would still be out of range; size is only doubled after both arrays
       have been reallocated successfully */
    if (maxexpansion->position + 1 == maxexpansion->size) {
        uint32_t *neweece = (uint32_t *)uprv_realloc(maxexpansion->endExpansionCE,
            2 * maxexpansion->size * sizeof(uint32_t));
        if (neweece == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        maxexpansion->endExpansionCE  = neweece;

        uint8_t  *neweces = (uint8_t *)uprv_realloc(maxexpansion->expansionCESize,
            2 * maxexpansion->size * sizeof(uint8_t));
        if (neweces == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        maxexpansion->expansionCESize = neweces;
        maxexpansion->size *= 2;
    }

    uint32_t *pendexpansionce = maxexpansion->endExpansionCE;
    uint8_t  *pexpansionsize  = maxexpansion->expansionCESize;
    int      pos              = maxexpansion->position;

    /* search window covers indices 0..pos inclusive */
    uint32_t *start = pendexpansionce;
    uint32_t *limit = pendexpansionce + pos;

    /* using binary search to determine if last expansion element is
    already in the array */
    uint32_t *mid;
    int       result = -1;
    /* narrow until start and limit are adjacent (or equal); both ends are
       then tested explicitly below */
    while (start < limit - 1) {
        mid = start + ((limit - start) >> 1);
        if (endexpansion <= *mid) {
            limit = mid;
        }
        else {
            start = mid;
        }
    }

    if (*start == endexpansion) {
        result = (int)(start - pendexpansionce);
    }
    else if (*limit == endexpansion) {
        result = (int)(limit - pendexpansionce);
    }

    if (result > -1) {
        /* found the ce in expansion, we'll just modify the size if it is
        smaller */
        uint8_t *currentsize = pexpansionsize + result;
        if (*currentsize < expansionsize) {
            *currentsize = expansionsize;
        }
    }
    else {
        /* we'll need to squeeze the value into the array.
        initial implementation. */
        /* shifting the subarray down by 1 */
        /* number of entries after `start`, i.e. the entries that must move */
        int      shiftsize     = (int)((pendexpansionce + pos) - start);
        uint32_t *shiftpos     = start + 1;
        uint8_t  *sizeshiftpos = pexpansionsize + (shiftpos - pendexpansionce);

        /* okay need to rearrange the array into sorted order */
        if (shiftsize == 0 /*|| *(pendexpansionce + pos) < endexpansion*/) { /* the commented part is actually both redundant and dangerous */
            /* nothing follows `start`: append after the last entry */
            *(pendexpansionce + pos + 1) = endexpansion;
            *(pexpansionsize + pos + 1)  = expansionsize;
        }
        else {
            /* open a gap right after `start` in both parallel arrays */
            uprv_memmove(shiftpos + 1, shiftpos, shiftsize * sizeof(int32_t));
            uprv_memmove(sizeshiftpos + 1, sizeshiftpos,
                shiftsize * sizeof(uint8_t));
            *shiftpos     = endexpansion;
            *sizeshiftpos = expansionsize;
        }
        maxexpansion->position ++;

#ifdef UCOL_DEBUG
        /* debug-only check: report out-of-order neighbours and verify that the
           inserted CE is present with a size of at least expansionsize */
        int   temp;
        UBool found = FALSE;
        for (temp = 0; temp < maxexpansion->position; temp ++) {
            if (pendexpansionce[temp] >= pendexpansionce[temp + 1]) {
                fprintf(stderr, "expansions %d\n", temp);
            }
            if (pendexpansionce[temp] == endexpansion) {
                found =TRUE;
                if (pexpansionsize[temp] < expansionsize) {
                    fprintf(stderr, "expansions size %d\n", temp);
                }
            }
        }
        if (pendexpansionce[temp] == endexpansion) {
            found =TRUE;
            if (pexpansionsize[temp] < expansionsize) {
                fprintf(stderr, "expansions size %d\n", temp);
            }
        }
        if (!found)
            fprintf(stderr, "expansion not found %d\n", temp);
#endif
    }

    return maxexpansion->position;
}

/**
 * Sets the maximum length of all jamo expansion sequences ending with the same
 * collation element. The size required for maxexpansion and maxsize is
 * returned if the arrays are too small.
 * @param ch the jamo codepoint
 * @param endexpansion the last expansion collation element to be added
 * @param expansionsize size of the expansion
 * @param maxexpansion data structure to store the maximum expansion data.
573 * @param status error status 574 * @returns size of the maxexpansion and maxsize used. 575 */ 576 static int uprv_uca_setMaxJamoExpansion(UChar ch, 577 uint32_t endexpansion, 578 uint8_t expansionsize, 579 MaxJamoExpansionTable *maxexpansion, 580 UErrorCode *status) 581 { 582 UBool isV = TRUE; 583 if (((uint32_t)ch - 0x1100) <= (0x1112 - 0x1100)) { 584 /* determines L for Jamo, doesn't need to store this since it is never 585 at the end of a expansion */ 586 if (maxexpansion->maxLSize < expansionsize) { 587 maxexpansion->maxLSize = expansionsize; 588 } 589 return maxexpansion->position; 590 } 591 592 if (((uint32_t)ch - 0x1161) <= (0x1175 - 0x1161)) { 593 /* determines V for Jamo */ 594 if (maxexpansion->maxVSize < expansionsize) { 595 maxexpansion->maxVSize = expansionsize; 596 } 597 } 598 599 if (((uint32_t)ch - 0x11A8) <= (0x11C2 - 0x11A8)) { 600 isV = FALSE; 601 /* determines T for Jamo */ 602 if (maxexpansion->maxTSize < expansionsize) { 603 maxexpansion->maxTSize = expansionsize; 604 } 605 } 606 607 if (maxexpansion->size == 0) { 608 /* we'll always make the first element 0, for easier manipulation */ 609 maxexpansion->endExpansionCE = 610 (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(uint32_t)); 611 /* test for NULL */; 612 if (maxexpansion->endExpansionCE == NULL) { 613 *status = U_MEMORY_ALLOCATION_ERROR; 614 return 0; 615 } 616 *(maxexpansion->endExpansionCE) = 0; 617 maxexpansion->isV = 618 (UBool *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(UBool)); 619 /* test for NULL */; 620 if (maxexpansion->isV == NULL) { 621 *status = U_MEMORY_ALLOCATION_ERROR; 622 uprv_free(maxexpansion->endExpansionCE); 623 maxexpansion->endExpansionCE = NULL; 624 return 0; 625 } 626 *(maxexpansion->isV) = 0; 627 maxexpansion->size = INIT_EXP_TABLE_SIZE; 628 maxexpansion->position = 0; 629 } 630 631 if (maxexpansion->position + 1 == maxexpansion->size) { 632 maxexpansion->size *= 2; 633 maxexpansion->endExpansionCE = (uint32_t 
*)uprv_realloc(maxexpansion->endExpansionCE, 634 maxexpansion->size * sizeof(uint32_t)); 635 if (maxexpansion->endExpansionCE == NULL) { 636 #ifdef UCOL_DEBUG 637 fprintf(stderr, "out of memory for maxExpansions\n"); 638 #endif 639 *status = U_MEMORY_ALLOCATION_ERROR; 640 return 0; 641 } 642 maxexpansion->isV = (UBool *)uprv_realloc(maxexpansion->isV, 643 maxexpansion->size * sizeof(UBool)); 644 if (maxexpansion->isV == NULL) { 645 #ifdef UCOL_DEBUG 646 fprintf(stderr, "out of memory for maxExpansions\n"); 647 #endif 648 *status = U_MEMORY_ALLOCATION_ERROR; 649 uprv_free(maxexpansion->endExpansionCE); 650 maxexpansion->endExpansionCE = NULL; 651 return 0; 652 } 653 } 654 655 uint32_t *pendexpansionce = maxexpansion->endExpansionCE; 656 int pos = maxexpansion->position; 657 658 while (pos > 0) { 659 pos --; 660 if (*(pendexpansionce + pos) == endexpansion) { 661 return maxexpansion->position; 662 } 663 } 664 665 *(pendexpansionce + maxexpansion->position) = endexpansion; 666 *(maxexpansion->isV + maxexpansion->position) = isV; 667 maxexpansion->position ++; 668 669 return maxexpansion->position; 670 } 671 672 673 static void ContrEndCPSet(uint8_t *table, UChar c) { 674 uint32_t hash; 675 uint8_t *htByte; 676 677 hash = c; 678 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { 679 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; 680 } 681 htByte = &table[hash>>3]; 682 *htByte |= (1 << (hash & 7)); 683 } 684 685 686 static void unsafeCPSet(uint8_t *table, UChar c) { 687 uint32_t hash; 688 uint8_t *htByte; 689 690 hash = c; 691 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { 692 if (hash >= 0xd800 && hash <= 0xf8ff) { 693 /* Part of a surrogate, or in private use area. 
*/ 694 /* These don't go in the table */ 695 return; 696 } 697 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; 698 } 699 htByte = &table[hash>>3]; 700 *htByte |= (1 << (hash & 7)); 701 } 702 703 static void 704 uprv_uca_createCMTable(tempUCATable *t, int32_t noOfCM, UErrorCode *status) { 705 t->cmLookup = (CombinClassTable *)uprv_malloc(sizeof(CombinClassTable)); 706 if (t->cmLookup==NULL) { 707 *status = U_MEMORY_ALLOCATION_ERROR; 708 return; 709 } 710 t->cmLookup->cPoints=(UChar *)uprv_malloc(noOfCM*sizeof(UChar)); 711 if (t->cmLookup->cPoints ==NULL) { 712 uprv_free(t->cmLookup); 713 t->cmLookup = NULL; 714 *status = U_MEMORY_ALLOCATION_ERROR; 715 return; 716 } 717 718 t->cmLookup->size=noOfCM; 719 uprv_memset(t->cmLookup->index, 0, sizeof(t->cmLookup->index)); 720 721 return; 722 } 723 724 static void 725 uprv_uca_copyCMTable(tempUCATable *t, UChar *cm, uint16_t *index) { 726 int32_t count=0; 727 728 for (int32_t i=0; i<256; ++i) { 729 if (index[i]>0) { 730 // cPoints is ordered by combining class value. 731 uprv_memcpy(t->cmLookup->cPoints+count, cm+(i<<8), index[i]*sizeof(UChar)); 732 count += index[i]; 733 } 734 t->cmLookup->index[i]=count; 735 } 736 return; 737 } 738 739 /* 1. to the UnsafeCP hash table, add all chars with combining class != 0 */ 740 /* 2. build combining marks table for all chars with combining class != 0 */ 741 static void uprv_uca_unsafeCPAddCCNZ(tempUCATable *t, UErrorCode *status) { 742 743 UChar c; 744 uint16_t fcd; // Hi byte is lead combining class. 745 // lo byte is trailing combing class. 
746 const uint16_t *fcdTrieIndex; 747 UChar32 fcdHighStart; 748 UBool buildCMTable = (t->cmLookup==NULL); // flag for building combining class table 749 UChar *cm=NULL; 750 uint16_t index[256]; 751 int32_t count=0; 752 fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); 753 if (U_FAILURE(*status)) { 754 return; 755 } 756 757 if (buildCMTable) { 758 if (cm==NULL) { 759 cm = (UChar *)uprv_malloc(sizeof(UChar)*UCOL_MAX_CM_TAB); 760 if (cm==NULL) { 761 *status = U_MEMORY_ALLOCATION_ERROR; 762 return; 763 } 764 } 765 uprv_memset(index, 0, sizeof(index)); 766 } 767 for (c=0; c<0xffff; c++) { 768 fcd = unorm_getFCD16(fcdTrieIndex, c); 769 if (fcd >= 0x100 || // if the leading combining class(c) > 0 || 770 (UTF_IS_LEAD(c) && fcd != 0)) {// c is a leading surrogate with some FCD data 771 if (buildCMTable) { 772 uint32_t cClass = fcd & 0xff; 773 //uint32_t temp=(cClass<<8)+index[cClass]; 774 cm[(cClass<<8)+index[cClass]] = c; // 775 index[cClass]++; 776 count++; 777 } 778 unsafeCPSet(t->unsafeCP, c); 779 } 780 } 781 782 // copy to cm table 783 if (buildCMTable) { 784 uprv_uca_createCMTable(t, count, status); 785 if(U_FAILURE(*status)) { 786 if (cm!=NULL) { 787 uprv_free(cm); 788 } 789 return; 790 } 791 uprv_uca_copyCMTable(t, cm, index); 792 } 793 794 if(t->prefixLookup != NULL) { 795 int32_t i = -1; 796 const UHashElement *e = NULL; 797 UCAElements *element = NULL; 798 UChar NFCbuf[256]; 799 uint32_t NFCbufLen = 0; 800 while((e = uhash_nextElement(t->prefixLookup, &i)) != NULL) { 801 element = (UCAElements *)e->value.pointer; 802 // codepoints here are in the NFD form. We need to add the 803 // first code point of the NFC form to unsafe, because 804 // strcoll needs to backup over them. 
805 NFCbufLen = unorm_normalize(element->cPoints, element->cSize, UNORM_NFC, 0, 806 NFCbuf, 256, status); 807 unsafeCPSet(t->unsafeCP, NFCbuf[0]); 808 } 809 } 810 811 if (cm!=NULL) { 812 uprv_free(cm); 813 } 814 } 815 816 static uint32_t uprv_uca_addPrefix(tempUCATable *t, uint32_t CE, 817 UCAElements *element, UErrorCode *status) 818 { 819 // currently the longest prefix we're supporting in Japanese is two characters 820 // long. Although this table could quite easily mimic complete contraction stuff 821 // there is no good reason to make a general solution, as it would require some 822 // error prone messing. 823 CntTable *contractions = t->contractions; 824 UChar32 cp; 825 uint32_t cpsize = 0; 826 UChar *oldCP = element->cPoints; 827 uint32_t oldCPSize = element->cSize; 828 829 830 contractions->currentTag = SPEC_PROC_TAG; 831 832 // here, we will normalize & add prefix to the table. 833 uint32_t j = 0; 834 #ifdef UCOL_DEBUG 835 for(j=0; j<element->cSize; j++) { 836 fprintf(stdout, "CP: %04X ", element->cPoints[j]); 837 } 838 fprintf(stdout, "El: %08X Pref: ", CE); 839 for(j=0; j<element->prefixSize; j++) { 840 fprintf(stdout, "%04X ", element->prefix[j]); 841 } 842 fprintf(stdout, "%08X ", element->mapCE); 843 #endif 844 845 for (j = 1; j<element->prefixSize; j++) { /* First add NFD prefix chars to unsafe CP hash table */ 846 // Unless it is a trail surrogate, which is handled algoritmically and 847 // shouldn't take up space in the table. 848 if(!(UTF_IS_TRAIL(element->prefix[j]))) { 849 unsafeCPSet(t->unsafeCP, element->prefix[j]); 850 } 851 } 852 853 UChar tempPrefix = 0; 854 855 for(j = 0; j < /*nfcSize*/element->prefixSize/2; j++) { // prefixes are going to be looked up backwards 856 // therefore, we will promptly reverse the prefix buffer... 
857 tempPrefix = *(/*nfcBuffer*/element->prefix+element->prefixSize-j-1); 858 *(/*nfcBuffer*/element->prefix+element->prefixSize-j-1) = element->prefix[j]; 859 element->prefix[j] = tempPrefix; 860 } 861 862 #ifdef UCOL_DEBUG 863 fprintf(stdout, "Reversed: "); 864 for(j=0; j<element->prefixSize; j++) { 865 fprintf(stdout, "%04X ", element->prefix[j]); 866 } 867 fprintf(stdout, "%08X\n", element->mapCE); 868 #endif 869 870 // the first codepoint is also unsafe, as it forms a 'contraction' with the prefix 871 if(!(UTF_IS_TRAIL(element->cPoints[0]))) { 872 unsafeCPSet(t->unsafeCP, element->cPoints[0]); 873 } 874 875 // Maybe we need this... To handle prefixes completely in the forward direction... 876 //if(element->cSize == 1) { 877 // if(!(UTF_IS_TRAIL(element->cPoints[0]))) { 878 // ContrEndCPSet(t->contrEndCP, element->cPoints[0]); 879 // } 880 //} 881 882 element->cPoints = element->prefix; 883 element->cSize = element->prefixSize; 884 885 // Add the last char of the contraction to the contraction-end hash table. 
886 // unless it is a trail surrogate, which is handled algorithmically and 887 // shouldn't be in the table 888 if(!(UTF_IS_TRAIL(element->cPoints[element->cSize -1]))) { 889 ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]); 890 } 891 892 // First we need to check if contractions starts with a surrogate 893 UTF_NEXT_CHAR(element->cPoints, cpsize, element->cSize, cp); 894 895 // If there are any Jamos in the contraction, we should turn on special 896 // processing for Jamos 897 if(UCOL_ISJAMO(element->prefix[0])) { 898 t->image->jamoSpecial = TRUE; 899 } 900 /* then we need to deal with it */ 901 /* we could aready have something in table - or we might not */ 902 903 if(!isPrefix(CE)) { 904 /* if it wasn't contraction, we wouldn't end up here*/ 905 int32_t firstContractionOffset = 0; 906 firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, CE, status); 907 uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status); 908 uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->prefix, newCE, status); 909 uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, CE, status); 910 CE = constructContractCE(SPEC_PROC_TAG, firstContractionOffset); 911 } else { /* we are adding to existing contraction */ 912 /* there were already some elements in the table, so we need to add a new contraction */ 913 /* Two things can happen here: either the codepoint is already in the table, or it is not */ 914 int32_t position = uprv_cnttab_findCP(contractions, CE, *element->prefix, status); 915 if(position > 0) { /* if it is we just continue down the chain */ 916 uint32_t eCE = uprv_cnttab_getCE(contractions, CE, position, status); 917 uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status); 918 uprv_cnttab_setContraction(contractions, CE, position, *(element->prefix), newCE, status); 919 } else { /* if it isn't, we will have to create a new 
sequence */
            uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
            uprv_cnttab_insertContraction(contractions, CE, *(element->prefix), element->mapCE, status);
        }
    }

    element->cPoints = oldCP;
    element->cSize = oldCPSize;

    return CE;
}

// Note regarding surrogate handling: We are interested only in the single
// or leading surrogates in a contraction. If a surrogate is somewhere else
// in the contraction, it is going to be handled as a pair of code units,
// as it doesn't affect the performance AND handling surrogates specially
// would complicate code way too much.
//
// Adds the contraction described by `element` to the contraction table of `t`.
//
//   t        temporary table; its contractions, unsafeCP, contrEndCP, image
//            and mapping members are updated here.
//   CE       value the caller found in t->mapping for the first code point
//            of the element.
//   element  element to add; cPoints/cSize are advanced past the first code
//            point while the chain is built and restored before returning.
//   status   ICU error code, passed through to the contraction-table calls.
//
// Returns the CE that represents the first code point afterwards: a freshly
// constructed contraction CE when a new table had to be started, otherwise
// the incoming CE.
static uint32_t uprv_uca_addContraction(tempUCATable *t, uint32_t CE,
                                        UCAElements *element, UErrorCode *status)
{
    CntTable *contractions = t->contractions;
    UChar32 cp;
    uint32_t cpsize = 0;

    // uprv_uca_processContraction builds its CEs with this tag.
    contractions->currentTag = CONTRACTION_TAG;

    // First we need to check if the contraction starts with a surrogate;
    // cpsize becomes the number of code units of the first code point.
    UTF_NEXT_CHAR(element->cPoints, cpsize, element->cSize, cp);

    if(cpsize<element->cSize) { // This is a real contraction, if there are other characters after the first
        uint32_t j = 0;
        for (j=1; j<element->cSize; j++) {   /* First add contraction chars to unsafe CP hash table */
            // Unless it is a trail surrogate, which is handled algorithmically and
            // shouldn't take up space in the table.
            if(!(UTF_IS_TRAIL(element->cPoints[j]))) {
                unsafeCPSet(t->unsafeCP, element->cPoints[j]);
            }
        }
        // Add the last char of the contraction to the contraction-end hash table.
        // unless it is a trail surrogate, which is handled algorithmically and
        // shouldn't be in the table
        if(!(UTF_IS_TRAIL(element->cPoints[element->cSize -1]))) {
            ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]);
        }

        // If there are any Jamos in the contraction, we should turn on special
        // processing for Jamos
        if(UCOL_ISJAMO(element->cPoints[0])) {
            t->image->jamoSpecial = TRUE;
        }
        /* then we need to deal with it */
        /* we could already have something in table - or we might not */
        // Temporarily skip the first code point; undone below.
        element->cPoints+=cpsize;
        element->cSize-=cpsize;
        if(!isContraction(CE)) {
            /* if it wasn't contraction, we wouldn't end up here*/
            // Start a new table: entry 0 and the 0xFFFF entry both carry the
            // previous CE, the entry in between carries the rest of the chain.
            int32_t firstContractionOffset = 0;
            firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, CE, status);
            uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
            uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->cPoints, newCE, status);
            uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, CE, status);
            CE =  constructContractCE(CONTRACTION_TAG, firstContractionOffset);
        } else { /* we are adding to existing contraction */
            /* there were already some elements in the table, so we need to add a new contraction */
            /* Two things can happen here: either the codepoint is already in the table, or it is not */
            int32_t position = uprv_cnttab_findCP(contractions, CE, *element->cPoints, status);
            if(position > 0) {       /* if it is we just continue down the chain */
                uint32_t eCE = uprv_cnttab_getCE(contractions, CE, position, status);
                uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status);
                uprv_cnttab_setContraction(contractions, CE, position, *(element->cPoints), newCE, status);
            } else {                  /* if it isn't, we will have to create a new sequence */
                uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
                uprv_cnttab_insertContraction(contractions, CE, *(element->cPoints), newCE, status);
            }
        }
        // Restore the element to its full code point sequence.
        element->cPoints-=cpsize;
        element->cSize+=cpsize;
        /*ucmpe32_set(t->mapping, cp, CE);*/
        utrie_set32(t->mapping, cp, CE);
    } else if(!isContraction(CE)) { /* this is just a surrogate, and there is no contraction */
        /*ucmpe32_set(t->mapping, cp, element->mapCE);*/
        utrie_set32(t->mapping, cp, element->mapCE);
    } else { /* fill out the first stage of the contraction with the surrogate CE */
        uprv_cnttab_changeContraction(contractions, CE, 0, element->mapCE, status);
        uprv_cnttab_changeContraction(contractions, CE, 0xFFFF, element->mapCE, status);
    }
    return CE;
}


// Recursive worker for uprv_uca_addContraction (the prefix code calls it too).
// Each level handles element->cPoints[0] and recurses on the remaining code
// units; the element is advanced by one unit for the recursive call and put
// back before returning.
//
//   contractions  table being built; contractions->currentTag decides which
//                 tag newly constructed CEs get.
//   element       remaining code units plus the final mapCE.
//   existingCE    what the table currently holds at this position, or
//                 UCOL_NOT_FOUND when the position is empty.
//   status        ICU error code; on entry failure UCOL_NOT_FOUND is returned.
//
// Returns the CE the caller has to store for this position.
static uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements *element, uint32_t existingCE, UErrorCode *status) {
    int32_t firstContractionOffset = 0;
    //    uint32_t contractionElement = UCOL_NOT_FOUND;

    if(U_FAILURE(*status)) {
        return UCOL_NOT_FOUND;
    }

    /* end of recursion */
    if(element->cSize == 1) {
        if(isCntTableElement(existingCE) && ((UColCETags)getCETag(existingCE) == contractions->currentTag)) {
            // A table with the current tag already hangs off this position:
            // overwrite its entry 0 and its 0xFFFF entry with the new CE.
            uprv_cnttab_changeContraction(contractions, existingCE, 0, element->mapCE, status);
            uprv_cnttab_changeContraction(contractions, existingCE, 0xFFFF, element->mapCE, status);
            return existingCE;
        } else {
            return element->mapCE; /*can't do just that. existingCE might be a contraction, meaning that we need to do another step */
        }
    }

    /* this recursion currently feeds on the only element we have... We will have to copy it in order to accommodate */
    /* for both backward and forward cycles */

    /* we encountered either an empty space or a non-contraction element */
    /* this means we are constructing a new contraction sequence */
    element->cPoints++;
    element->cSize--;
    if(!isCntTableElement(existingCE)) {
        /* if it wasn't contraction, we wouldn't end up here*/
        firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, existingCE, status);
        uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
        uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->cPoints, newCE, status);
        uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, existingCE, status);
        existingCE =  constructContractCE(contractions->currentTag, firstContractionOffset);
    } else { /* we are adding to existing contraction */
        /* there were already some elements in the table, so we need to add a new contraction */
        /* Two things can happen here: either the codepoint is already in the table, or it is not */
        int32_t position = uprv_cnttab_findCP(contractions, existingCE, *element->cPoints, status);
        if(position > 0) {       /* if it is we just continue down the chain */
            uint32_t eCE = uprv_cnttab_getCE(contractions, existingCE, position, status);
            uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status);
            uprv_cnttab_setContraction(contractions, existingCE, position, *(element->cPoints), newCE, status);
        } else {                  /* if it isn't, we will have to create a new sequence */
            uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
            uprv_cnttab_insertContraction(contractions, existingCE, *(element->cPoints), newCE, status);
        }
    }
    // Undo the advance so the caller sees the element unchanged.
    element->cPoints--;
    element->cSize++;
    return existingCE;
}

// Writes element->mapCE into the table for the element's code points.
// Sequences longer than one code unit go through uprv_uca_addContraction;
// a single code unit is stored in t->mapping directly or merged into a
// contraction table that already exists for it.
static uint32_t uprv_uca_finalizeAddition(tempUCATable *t, UCAElements *element, UErrorCode *status) {
    uint32_t CE = UCOL_NOT_FOUND;
    // This should add a completely ignorable element to the
    // unsafe table, so that backward iteration will skip
    // over it when treating contractions.
    uint32_t i = 0;
    if(element->mapCE == 0) {
        for(i = 0; i < element->cSize; i++) {
            if(!UTF_IS_TRAIL(element->cPoints[i])) {
                unsafeCPSet(t->unsafeCP, element->cPoints[i]);
            }
        }
    }
    if(element->cSize > 1) { /* we're adding a contraction */
        // NOTE(review): this `i` shadows the outer one; it is only the
        // scratch index consumed by UTF_NEXT_CHAR.
        uint32_t i = 0;
        UChar32 cp;

        UTF_NEXT_CHAR(element->cPoints, i, element->cSize, cp);
        /*CE = ucmpe32_get(t->mapping, cp);*/
        CE = utrie_get32(t->mapping, cp, NULL);

        CE = uprv_uca_addContraction(t, CE, element, status);
    } else { /* easy case, */
        /*CE = ucmpe32_get(t->mapping, element->cPoints[0]);*/
        CE = utrie_get32(t->mapping, element->cPoints[0], NULL);

        if( CE != UCOL_NOT_FOUND) {
            if(isCntTableElement(CE) /*isContraction(CE)*/) { /* adding a non contraction element (thai, expansion, single) to already existing contraction */
                if(!isPrefix(element->mapCE)) { // we cannot reenter prefix elements - as we are going to create a dead loop
                    // Only expansions and regular CEs can go here...
Contractions will never happen in this place 1090 uprv_cnttab_setContraction(t->contractions, CE, 0, 0, element->mapCE, status); 1091 /* This loop has to change the CE at the end of contraction REDO!*/ 1092 uprv_cnttab_changeLastCE(t->contractions, CE, element->mapCE, status); 1093 } 1094 } else { 1095 /*ucmpe32_set(t->mapping, element->cPoints[0], element->mapCE);*/ 1096 utrie_set32(t->mapping, element->cPoints[0], element->mapCE); 1097 if ((element->prefixSize!=0) && (!isSpecial(CE) || (getCETag(CE)!=IMPLICIT_TAG))) { 1098 UCAElements *origElem = (UCAElements *)uprv_malloc(sizeof(UCAElements)); 1099 /* test for NULL */ 1100 if (origElem== NULL) { 1101 *status = U_MEMORY_ALLOCATION_ERROR; 1102 return 0; 1103 } 1104 /* copy the original UCA value */ 1105 origElem->prefixSize = 0; 1106 origElem->prefix = NULL; 1107 origElem->cPoints = origElem->uchars; 1108 origElem->cPoints[0] = element->cPoints[0]; 1109 origElem->cSize = 1; 1110 origElem->CEs[0]=CE; 1111 origElem->mapCE=CE; 1112 origElem->noOfCEs=1; 1113 uprv_uca_finalizeAddition(t, origElem, status); 1114 uprv_free(origElem); 1115 } 1116 #ifdef UCOL_DEBUG 1117 fprintf(stderr, "Warning - trying to overwrite existing data %08X for cp %04X with %08X\n", CE, element->cPoints[0], element->CEs[0]); 1118 //*status = U_ILLEGAL_ARGUMENT_ERROR; 1119 #endif 1120 } 1121 } else { 1122 /*ucmpe32_set(t->mapping, element->cPoints[0], element->mapCE);*/ 1123 utrie_set32(t->mapping, element->cPoints[0], element->mapCE); 1124 } 1125 } 1126 return CE; 1127 } 1128 1129 /* This adds a read element, while testing for existence */ 1130 U_CAPI uint32_t U_EXPORT2 1131 uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status) { 1132 U_NAMESPACE_USE 1133 1134 ExpansionTable *expansions = t->expansions; 1135 1136 uint32_t i = 1; 1137 uint32_t expansion = 0; 1138 uint32_t CE; 1139 1140 if(U_FAILURE(*status)) { 1141 return 0xFFFF; 1142 } 1143 1144 element->mapCE = 0; // clear mapCE so that we can catch expansions 1145 
1146 if(element->noOfCEs == 1) { 1147 element->mapCE = element->CEs[0]; 1148 } else { 1149 /* ICU 2.1 long primaries */ 1150 /* unfortunately, it looks like we have to look for a long primary here */ 1151 /* since in canonical closure we are going to hit some long primaries from */ 1152 /* the first phase, and they will come back as continuations/expansions */ 1153 /* destroying the effect of the previous opitimization */ 1154 /* A long primary is a three byte primary with starting secondaries and tertiaries */ 1155 /* It can appear in long runs of only primary differences (like east Asian tailorings) */ 1156 /* also, it should not be an expansion, as expansions would break with this */ 1157 // This part came in from ucol_bld.cpp 1158 //if(tok->expansion == 0 1159 //&& noOfBytes[0] == 3 && noOfBytes[1] == 1 && noOfBytes[2] == 1 1160 //&& CEparts[1] == (UCOL_BYTE_COMMON << 24) && CEparts[2] == (UCOL_BYTE_COMMON << 24)) { 1161 /* we will construct a special CE that will go unchanged to the table */ 1162 if(element->noOfCEs == 2 // a two CE expansion 1163 && isContinuation(element->CEs[1]) // which is a continuation 1164 && (element->CEs[1] & (~(0xFF << 24 | UCOL_CONTINUATION_MARKER))) == 0 // that has only primaries in continuation, 1165 && (((element->CEs[0]>>8) & 0xFF) == UCOL_BYTE_COMMON) // a common secondary 1166 && ((element->CEs[0] & 0xFF) == UCOL_BYTE_COMMON) // and a common tertiary 1167 ) 1168 { 1169 #ifdef UCOL_DEBUG 1170 fprintf(stdout, "Long primary %04X\n", element->cPoints[0]); 1171 #endif 1172 element->mapCE = UCOL_SPECIAL_FLAG | (LONG_PRIMARY_TAG<<24) // a long primary special 1173 | ((element->CEs[0]>>8) & 0xFFFF00) // first and second byte of primary 1174 | ((element->CEs[1]>>24) & 0xFF); // third byte of primary 1175 } 1176 else { 1177 expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT) 1178 | (((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4) 1179 & 0xFFFFF0)); 1180 1181 for(i = 1; 
i<element->noOfCEs; i++) { 1182 uprv_uca_addExpansion(expansions, element->CEs[i], status); 1183 } 1184 if(element->noOfCEs <= 0xF) { 1185 expansion |= element->noOfCEs; 1186 } else { 1187 uprv_uca_addExpansion(expansions, 0, status); 1188 } 1189 element->mapCE = expansion; 1190 uprv_uca_setMaxExpansion(element->CEs[element->noOfCEs - 1], 1191 (uint8_t)element->noOfCEs, 1192 t->maxExpansions, 1193 status); 1194 if(UCOL_ISJAMO(element->cPoints[0])) { 1195 t->image->jamoSpecial = TRUE; 1196 uprv_uca_setMaxJamoExpansion(element->cPoints[0], 1197 element->CEs[element->noOfCEs - 1], 1198 (uint8_t)element->noOfCEs, 1199 t->maxJamoExpansions, 1200 status); 1201 } 1202 if (U_FAILURE(*status)) { 1203 return 0; 1204 } 1205 } 1206 } 1207 1208 // We treat digits differently - they are "uber special" and should be 1209 // processed differently if numeric collation is on. 1210 UChar32 uniChar = 0; 1211 //printElement(element); 1212 if ((element->cSize == 2) && U16_IS_LEAD(element->cPoints[0])){ 1213 uniChar = U16_GET_SUPPLEMENTARY(element->cPoints[0], element->cPoints[1]); 1214 } else if (element->cSize == 1){ 1215 uniChar = element->cPoints[0]; 1216 } 1217 1218 // Here, we either have one normal CE OR mapCE is set. Therefore, we stuff only 1219 // one element to the expansion buffer. When we encounter a digit and we don't 1220 // do numeric collation, we will just pick the CE we have and break out of case 1221 // (see ucol.cpp ucol_prv_getSpecialCE && ucol_prv_getSpecialPrevCE). If we picked 1222 // a special, further processing will occur. If it's a simple CE, we'll return due 1223 // to how the loop is constructed. 
1224 if (uniChar != 0 && u_isdigit(uniChar)){ 1225 expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (DIGIT_TAG<<UCOL_TAG_SHIFT) | 1); // prepare the element 1226 if(element->mapCE) { // if there is an expansion, we'll pick it here 1227 expansion |= ((uprv_uca_addExpansion(expansions, element->mapCE, status)+(headersize>>2))<<4); 1228 } else { 1229 expansion |= ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4); 1230 } 1231 element->mapCE = expansion; 1232 1233 // Need to go back to the beginning of the digit string if in the middle! 1234 if(uniChar <= 0xFFFF) { // supplementaries are always unsafe. API takes UChars 1235 unsafeCPSet(t->unsafeCP, (UChar)uniChar); 1236 } 1237 } 1238 1239 // here we want to add the prefix structure. 1240 // I will try to process it as a reverse contraction, if possible. 1241 // prefix buffer is already reversed. 1242 1243 if(element->prefixSize!=0) { 1244 // We keep the seen prefix starter elements in a hashtable 1245 // we need it to be able to distinguish between the simple 1246 // codepoints and prefix starters. Also, we need to use it 1247 // for canonical closure. 
1248 1249 UCAElements *composed = (UCAElements *)uprv_malloc(sizeof(UCAElements)); 1250 /* test for NULL */ 1251 if (composed == NULL) { 1252 *status = U_MEMORY_ALLOCATION_ERROR; 1253 return 0; 1254 } 1255 uprv_memcpy(composed, element, sizeof(UCAElements)); 1256 composed->cPoints = composed->uchars; 1257 composed->prefix = composed->prefixChars; 1258 1259 composed->prefixSize = unorm_normalize(element->prefix, element->prefixSize, UNORM_NFC, 0, composed->prefix, 128, status); 1260 1261 1262 if(t->prefixLookup != NULL) { 1263 UCAElements *uCE = (UCAElements *)uhash_get(t->prefixLookup, element); 1264 if(uCE != NULL) { // there is already a set of code points here 1265 element->mapCE = uprv_uca_addPrefix(t, uCE->mapCE, element, status); 1266 } else { // no code points, so this spot is clean 1267 element->mapCE = uprv_uca_addPrefix(t, UCOL_NOT_FOUND, element, status); 1268 uCE = (UCAElements *)uprv_malloc(sizeof(UCAElements)); 1269 /* test for NULL */ 1270 if (uCE == NULL) { 1271 *status = U_MEMORY_ALLOCATION_ERROR; 1272 return 0; 1273 } 1274 uprv_memcpy(uCE, element, sizeof(UCAElements)); 1275 uCE->cPoints = uCE->uchars; 1276 uhash_put(t->prefixLookup, uCE, uCE, status); 1277 } 1278 if(composed->prefixSize != element->prefixSize || uprv_memcmp(composed->prefix, element->prefix, element->prefixSize)) { 1279 // do it! 
1280 composed->mapCE = uprv_uca_addPrefix(t, element->mapCE, composed, status); 1281 } 1282 } 1283 uprv_free(composed); 1284 } 1285 1286 // We need to use the canonical iterator here 1287 // the way we do it is to generate the canonically equivalent strings 1288 // for the contraction and then add the sequences that pass FCD check 1289 if(element->cSize > 1 && !(element->cSize==2 && UTF16_IS_LEAD(element->cPoints[0]) && UTF16_IS_TRAIL(element->cPoints[1]))) { // this is a contraction, we should check whether a composed form should also be included 1290 UnicodeString source(element->cPoints, element->cSize); 1291 CanonicalIterator it(source, *status); 1292 source = it.next(); 1293 while(!source.isBogus()) { 1294 if(Normalizer::quickCheck(source, UNORM_FCD, *status) != UNORM_NO) { 1295 element->cSize = source.extract(element->cPoints, 128, *status); 1296 uprv_uca_finalizeAddition(t, element, status); 1297 } 1298 source = it.next(); 1299 } 1300 CE = element->mapCE; 1301 } else { 1302 CE = uprv_uca_finalizeAddition(t, element, status); 1303 } 1304 1305 return CE; 1306 } 1307 1308 1309 /*void uprv_uca_getMaxExpansionJamo(CompactEIntArray *mapping, */ 1310 static void uprv_uca_getMaxExpansionJamo(UNewTrie *mapping, 1311 MaxExpansionTable *maxexpansion, 1312 MaxJamoExpansionTable *maxjamoexpansion, 1313 UBool jamospecial, 1314 UErrorCode *status) 1315 { 1316 const uint32_t VBASE = 0x1161; 1317 const uint32_t TBASE = 0x11A8; 1318 const uint32_t VCOUNT = 21; 1319 const uint32_t TCOUNT = 28; 1320 1321 uint32_t v = VBASE + VCOUNT - 1; 1322 uint32_t t = TBASE + TCOUNT - 1; 1323 uint32_t ce; 1324 1325 while (v >= VBASE) { 1326 /*ce = ucmpe32_get(mapping, v);*/ 1327 ce = utrie_get32(mapping, v, NULL); 1328 if (ce < UCOL_SPECIAL_FLAG) { 1329 uprv_uca_setMaxExpansion(ce, 2, maxexpansion, status); 1330 } 1331 v --; 1332 } 1333 1334 while (t >= TBASE) 1335 { 1336 /*ce = ucmpe32_get(mapping, t);*/ 1337 ce = utrie_get32(mapping, t, NULL); 1338 if (ce < UCOL_SPECIAL_FLAG) { 1339 
uprv_uca_setMaxExpansion(ce, 3, maxexpansion, status); 1340 } 1341 t --; 1342 } 1343 /* According to the docs, 99% of the time, the Jamo will not be special */ 1344 if (jamospecial) { 1345 /* gets the max expansion in all unicode characters */ 1346 int count = maxjamoexpansion->position; 1347 uint8_t maxTSize = (uint8_t)(maxjamoexpansion->maxLSize + 1348 maxjamoexpansion->maxVSize + 1349 maxjamoexpansion->maxTSize); 1350 uint8_t maxVSize = (uint8_t)(maxjamoexpansion->maxLSize + 1351 maxjamoexpansion->maxVSize); 1352 1353 while (count > 0) { 1354 count --; 1355 if (*(maxjamoexpansion->isV + count) == TRUE) { 1356 uprv_uca_setMaxExpansion( 1357 *(maxjamoexpansion->endExpansionCE + count), 1358 maxVSize, maxexpansion, status); 1359 } 1360 else { 1361 uprv_uca_setMaxExpansion( 1362 *(maxjamoexpansion->endExpansionCE + count), 1363 maxTSize, maxexpansion, status); 1364 } 1365 } 1366 } 1367 } 1368 1369 U_CDECL_BEGIN 1370 static inline uint32_t U_CALLCONV 1371 getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) 1372 { 1373 uint32_t value; 1374 uint32_t tag; 1375 UChar32 limit; 1376 UBool inBlockZero; 1377 1378 limit=start+0x400; 1379 while(start<limit) { 1380 value=utrie_get32(trie, start, &inBlockZero); 1381 tag = getCETag(value); 1382 if(inBlockZero == TRUE) { 1383 start+=UTRIE_DATA_BLOCK_LENGTH; 1384 } else if(!(isSpecial(value) && (tag == IMPLICIT_TAG || tag == NOT_FOUND_TAG))) { 1385 /* These are values that are starting in either UCA (IMPLICIT_TAG) or in the 1386 * tailorings (NOT_FOUND_TAG). Presence of these tags means that there is 1387 * nothing in this position and that it should be skipped. 
1388 */ 1389 #ifdef UCOL_DEBUG 1390 static int32_t count = 1; 1391 fprintf(stdout, "%i, Folded %08X, value %08X\n", count++, start, value); 1392 #endif 1393 return (uint32_t)(UCOL_SPECIAL_FLAG | (SURROGATE_TAG<<24) | offset); 1394 } else { 1395 ++start; 1396 } 1397 } 1398 return 0; 1399 } 1400 U_CDECL_END 1401 1402 #ifdef UCOL_DEBUG 1403 // This is a debug function to print the contents of a trie. 1404 // It is used in conjuction with the code around utrie_unserialize call 1405 UBool enumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) { 1406 if(start<0x10000) { 1407 fprintf(stdout, "%08X, %08X, %08X\n", start, limit, value); 1408 } else { 1409 fprintf(stdout, "%08X=%04X %04X, %08X=%04X %04X, %08X\n", start, UTF16_LEAD(start), UTF16_TRAIL(start), limit, UTF16_LEAD(limit), UTF16_TRAIL(limit), value); 1410 } 1411 return TRUE; 1412 } 1413 1414 int32_t 1415 myGetFoldingOffset(uint32_t data) { 1416 if(data > UCOL_NOT_FOUND && getCETag(data) == SURROGATE_TAG) { 1417 return (data&0xFFFFFF); 1418 } else { 1419 return 0; 1420 } 1421 } 1422 #endif 1423 1424 U_CAPI UCATableHeader* U_EXPORT2 1425 uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) { 1426 /*CompactEIntArray *mapping = t->mapping;*/ 1427 UNewTrie *mapping = t->mapping; 1428 ExpansionTable *expansions = t->expansions; 1429 CntTable *contractions = t->contractions; 1430 MaxExpansionTable *maxexpansion = t->maxExpansions; 1431 1432 if(U_FAILURE(*status)) { 1433 return NULL; 1434 } 1435 1436 uint32_t beforeContractions = (uint32_t)((headersize+paddedsize(expansions->position*sizeof(uint32_t)))/sizeof(UChar)); 1437 1438 int32_t contractionsSize = 0; 1439 contractionsSize = uprv_cnttab_constructTable(contractions, beforeContractions, status); 1440 1441 /* the following operation depends on the trie data. 
Therefore, we have to do it before */ 1442 /* the trie is compacted */ 1443 /* sets jamo expansions */ 1444 uprv_uca_getMaxExpansionJamo(mapping, maxexpansion, t->maxJamoExpansions, 1445 t->image->jamoSpecial, status); 1446 1447 /*ucmpe32_compact(mapping);*/ 1448 /*UMemoryStream *ms = uprv_mstrm_openNew(8192);*/ 1449 /*int32_t mappingSize = ucmpe32_flattenMem(mapping, ms);*/ 1450 /*const uint8_t *flattened = uprv_mstrm_getBuffer(ms, &mappingSize);*/ 1451 1452 // After setting the jamo expansions, compact the trie and get the needed size 1453 int32_t mappingSize = utrie_serialize(mapping, NULL, 0, getFoldedValue /*getFoldedValue*/, FALSE, status); 1454 1455 uint32_t tableOffset = 0; 1456 uint8_t *dataStart; 1457 1458 /* TODO: LATIN1 array is now in the utrie - it should be removed from the calculation */ 1459 1460 uint32_t toAllocate =(uint32_t)(headersize+ 1461 paddedsize(expansions->position*sizeof(uint32_t))+ 1462 paddedsize(mappingSize)+ 1463 paddedsize(contractionsSize*(sizeof(UChar)+sizeof(uint32_t)))+ 1464 //paddedsize(0x100*sizeof(uint32_t)) /* Latin1 is now included in the trie */ 1465 /* maxexpansion array */ 1466 + paddedsize(maxexpansion->position * sizeof(uint32_t)) + 1467 /* maxexpansion size array */ 1468 paddedsize(maxexpansion->position * sizeof(uint8_t)) + 1469 paddedsize(UCOL_UNSAFECP_TABLE_SIZE) + /* Unsafe chars */ 1470 paddedsize(UCOL_UNSAFECP_TABLE_SIZE)); /* Contraction Ending chars */ 1471 1472 1473 dataStart = (uint8_t *)uprv_malloc(toAllocate); 1474 /* test for NULL */ 1475 if (dataStart == NULL) { 1476 *status = U_MEMORY_ALLOCATION_ERROR; 1477 return NULL; 1478 } 1479 1480 UCATableHeader *myData = (UCATableHeader *)dataStart; 1481 // Please, do reset all the fields! 
1482 uprv_memset(dataStart, 0, toAllocate); 1483 // Make sure we know this is reset 1484 myData->magic = UCOL_HEADER_MAGIC; 1485 myData->isBigEndian = U_IS_BIG_ENDIAN; 1486 myData->charSetFamily = U_CHARSET_FAMILY; 1487 myData->formatVersion[0] = UCA_FORMAT_VERSION_0; 1488 myData->formatVersion[1] = UCA_FORMAT_VERSION_1; 1489 myData->formatVersion[2] = UCA_FORMAT_VERSION_2; 1490 myData->formatVersion[3] = UCA_FORMAT_VERSION_3; 1491 myData->jamoSpecial = t->image->jamoSpecial; 1492 1493 // Don't copy stuff from UCA header! 1494 //uprv_memcpy(myData, t->image, sizeof(UCATableHeader)); 1495 1496 myData->contractionSize = contractionsSize; 1497 1498 tableOffset += (uint32_t)(paddedsize(sizeof(UCATableHeader))); 1499 1500 myData->options = tableOffset; 1501 uprv_memcpy(dataStart+tableOffset, t->options, sizeof(UColOptionSet)); 1502 tableOffset += (uint32_t)(paddedsize(sizeof(UColOptionSet))); 1503 1504 /* copy expansions */ 1505 /*myData->expansion = (uint32_t *)dataStart+tableOffset;*/ 1506 myData->expansion = tableOffset; 1507 uprv_memcpy(dataStart+tableOffset, expansions->CEs, expansions->position*sizeof(uint32_t)); 1508 tableOffset += (uint32_t)(paddedsize(expansions->position*sizeof(uint32_t))); 1509 1510 /* contractions block */ 1511 if(contractionsSize != 0) { 1512 /* copy contraction index */ 1513 /*myData->contractionIndex = (UChar *)(dataStart+tableOffset);*/ 1514 myData->contractionIndex = tableOffset; 1515 uprv_memcpy(dataStart+tableOffset, contractions->codePoints, contractionsSize*sizeof(UChar)); 1516 tableOffset += (uint32_t)(paddedsize(contractionsSize*sizeof(UChar))); 1517 1518 /* copy contraction collation elements */ 1519 /*myData->contractionCEs = (uint32_t *)(dataStart+tableOffset);*/ 1520 myData->contractionCEs = tableOffset; 1521 uprv_memcpy(dataStart+tableOffset, contractions->CEs, contractionsSize*sizeof(uint32_t)); 1522 tableOffset += (uint32_t)(paddedsize(contractionsSize*sizeof(uint32_t))); 1523 } else { 1524 myData->contractionIndex = 0; 
1525 myData->contractionCEs = 0; 1526 } 1527 1528 /* copy mapping table */ 1529 /*myData->mappingPosition = dataStart+tableOffset;*/ 1530 /*myData->mappingPosition = tableOffset;*/ 1531 /*uprv_memcpy(dataStart+tableOffset, flattened, mappingSize);*/ 1532 1533 myData->mappingPosition = tableOffset; 1534 utrie_serialize(mapping, dataStart+tableOffset, toAllocate-tableOffset, getFoldedValue, FALSE, status); 1535 #ifdef UCOL_DEBUG 1536 // This is debug code to dump the contents of the trie. It needs two functions defined above 1537 { 1538 UTrie UCAt = { 0 }; 1539 uint32_t trieWord; 1540 utrie_unserialize(&UCAt, dataStart+tableOffset, 9999999, status); 1541 UCAt.getFoldingOffset = myGetFoldingOffset; 1542 if(U_SUCCESS(*status)) { 1543 utrie_enum(&UCAt, NULL, enumRange, NULL); 1544 } 1545 trieWord = UTRIE_GET32_FROM_LEAD(&UCAt, 0xDC01); 1546 } 1547 #endif 1548 tableOffset += paddedsize(mappingSize); 1549 1550 1551 int32_t i = 0; 1552 1553 /* copy max expansion table */ 1554 myData->endExpansionCE = tableOffset; 1555 myData->endExpansionCECount = maxexpansion->position - 1; 1556 /* not copying the first element which is a dummy */ 1557 uprv_memcpy(dataStart + tableOffset, maxexpansion->endExpansionCE + 1, 1558 (maxexpansion->position - 1) * sizeof(uint32_t)); 1559 tableOffset += (uint32_t)(paddedsize((maxexpansion->position)* sizeof(uint32_t))); 1560 myData->expansionCESize = tableOffset; 1561 uprv_memcpy(dataStart + tableOffset, maxexpansion->expansionCESize + 1, 1562 (maxexpansion->position - 1) * sizeof(uint8_t)); 1563 tableOffset += (uint32_t)(paddedsize((maxexpansion->position)* sizeof(uint8_t))); 1564 1565 /* Unsafe chars table. Finish it off, then copy it. */ 1566 uprv_uca_unsafeCPAddCCNZ(t, status); 1567 if (t->UCA != 0) { /* Or in unsafebits from UCA, making a combined table. 
*/ 1568 for (i=0; i<UCOL_UNSAFECP_TABLE_SIZE; i++) { 1569 t->unsafeCP[i] |= t->UCA->unsafeCP[i]; 1570 } 1571 } 1572 myData->unsafeCP = tableOffset; 1573 uprv_memcpy(dataStart + tableOffset, t->unsafeCP, UCOL_UNSAFECP_TABLE_SIZE); 1574 tableOffset += paddedsize(UCOL_UNSAFECP_TABLE_SIZE); 1575 1576 1577 /* Finish building Contraction Ending chars hash table and then copy it out. */ 1578 if (t->UCA != 0) { /* Or in unsafebits from UCA, making a combined table. */ 1579 for (i=0; i<UCOL_UNSAFECP_TABLE_SIZE; i++) { 1580 t->contrEndCP[i] |= t->UCA->contrEndCP[i]; 1581 } 1582 } 1583 myData->contrEndCP = tableOffset; 1584 uprv_memcpy(dataStart + tableOffset, t->contrEndCP, UCOL_UNSAFECP_TABLE_SIZE); 1585 tableOffset += paddedsize(UCOL_UNSAFECP_TABLE_SIZE); 1586 1587 if(tableOffset != toAllocate) { 1588 #ifdef UCOL_DEBUG 1589 fprintf(stderr, "calculation screwup!!! Expected to write %i but wrote %i instead!!!\n", toAllocate, tableOffset); 1590 #endif 1591 *status = U_INTERNAL_PROGRAM_ERROR; 1592 uprv_free(dataStart); 1593 return 0; 1594 } 1595 1596 myData->size = tableOffset; 1597 /* This should happen upon ressurection */ 1598 /*const uint8_t *mapPosition = (uint8_t*)myData+myData->mappingPosition;*/ 1599 /*uprv_mstrm_close(ms);*/ 1600 return myData; 1601 } 1602 1603 1604 struct enumStruct { 1605 tempUCATable *t; 1606 UCollator *tempColl; 1607 UCollationElements* colEl; 1608 const Normalizer2Impl *nfcImpl; 1609 UnicodeSet *closed; 1610 int32_t noOfClosures; 1611 UErrorCode *status; 1612 }; 1613 U_CDECL_BEGIN 1614 static UBool U_CALLCONV 1615 _enumCategoryRangeClosureCategory(const void *context, UChar32 start, UChar32 limit, UCharCategory type) { 1616 1617 if (type != U_UNASSIGNED && type != U_PRIVATE_USE_CHAR) { // if the range is assigned - we might ommit more categories later 1618 UErrorCode *status = ((enumStruct *)context)->status; 1619 tempUCATable *t = ((enumStruct *)context)->t; 1620 UCollator *tempColl = ((enumStruct *)context)->tempColl; 1621 UCollationElements* 
colEl = ((enumStruct *)context)->colEl; 1622 UCAElements el; 1623 UChar decompBuffer[4]; 1624 const UChar *decomp; 1625 int32_t noOfDec = 0; 1626 1627 UChar32 u32 = 0; 1628 UChar comp[2]; 1629 uint32_t len = 0; 1630 1631 for(u32 = start; u32 < limit; u32++) { 1632 decomp = ((enumStruct *)context)->nfcImpl-> 1633 getDecomposition(u32, decompBuffer, noOfDec); 1634 //if((noOfDec = unorm_normalize(comp, len, UNORM_NFD, 0, decomp, 256, status)) > 1 1635 //|| (noOfDec == 1 && *decomp != (UChar)u32)) 1636 if(decomp != NULL) 1637 { 1638 len = 0; 1639 U16_APPEND_UNSAFE(comp, len, u32); 1640 if(ucol_strcoll(tempColl, comp, len, decomp, noOfDec) != UCOL_EQUAL) { 1641 #ifdef UCOL_DEBUG 1642 fprintf(stderr, "Closure: U+%04X -> ", u32); 1643 UChar32 c; 1644 int32_t i = 0; 1645 while(i < noOfDec) { 1646 U16_NEXT(decomp, i, noOfDec, c); 1647 fprintf(stderr, "%04X ", c); 1648 } 1649 fprintf(stderr, "\n"); 1650 // print CEs for code point vs. decomposition 1651 fprintf(stderr, "U+%04X CEs: ", u32); 1652 UCollationElements *iter = ucol_openElements(tempColl, comp, len, status); 1653 int32_t ce; 1654 while((ce = ucol_next(iter, status)) != UCOL_NULLORDER) { 1655 fprintf(stderr, "%08X ", ce); 1656 } 1657 fprintf(stderr, "\nDecomp CEs: "); 1658 ucol_setText(iter, decomp, noOfDec, status); 1659 while((ce = ucol_next(iter, status)) != UCOL_NULLORDER) { 1660 fprintf(stderr, "%08X ", ce); 1661 } 1662 fprintf(stderr, "\n"); 1663 ucol_closeElements(iter); 1664 #endif 1665 if(((enumStruct *)context)->closed != NULL) { 1666 ((enumStruct *)context)->closed->add(u32); 1667 } 1668 ((enumStruct *)context)->noOfClosures++; 1669 el.cPoints = (UChar *)decomp; 1670 el.cSize = noOfDec; 1671 el.noOfCEs = 0; 1672 el.prefix = el.prefixChars; 1673 el.prefixSize = 0; 1674 1675 UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, &el); 1676 el.cPoints = comp; 1677 el.cSize = len; 1678 el.prefix = el.prefixChars; 1679 el.prefixSize = 0; 1680 if(prefix == NULL) { 1681 el.noOfCEs = 0; 1682 
ucol_setText(colEl, decomp, noOfDec, status); 1683 while((el.CEs[el.noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) { 1684 el.noOfCEs++; 1685 } 1686 } else { 1687 el.noOfCEs = 1; 1688 el.CEs[0] = prefix->mapCE; 1689 // This character uses a prefix. We have to add it 1690 // to the unsafe table, as it decomposed form is already 1691 // in. In Japanese, this happens for \u309e & \u30fe 1692 // Since unsafeCPSet is static in ucol_elm, we are going 1693 // to wrap it up in the uprv_uca_unsafeCPAddCCNZ function 1694 } 1695 uprv_uca_addAnElement(t, &el, status); 1696 } 1697 } 1698 } 1699 } 1700 return TRUE; 1701 } 1702 U_CDECL_END 1703 1704 static void 1705 uprv_uca_setMapCE(tempUCATable *t, UCAElements *element, UErrorCode *status) { 1706 uint32_t expansion = 0; 1707 int32_t j; 1708 1709 ExpansionTable *expansions = t->expansions; 1710 if(element->noOfCEs == 2 // a two CE expansion 1711 && isContinuation(element->CEs[1]) // which is a continuation 1712 && (element->CEs[1] & (~(0xFF << 24 | UCOL_CONTINUATION_MARKER))) == 0 // that has only primaries in continuation, 1713 && (((element->CEs[0]>>8) & 0xFF) == UCOL_BYTE_COMMON) // a common secondary 1714 && ((element->CEs[0] & 0xFF) == UCOL_BYTE_COMMON) // and a common tertiary 1715 ) { 1716 element->mapCE = UCOL_SPECIAL_FLAG | (LONG_PRIMARY_TAG<<24) // a long primary special 1717 | ((element->CEs[0]>>8) & 0xFFFF00) // first and second byte of primary 1718 | ((element->CEs[1]>>24) & 0xFF); // third byte of primary 1719 } else { 1720 expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT) 1721 | (((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4) 1722 & 0xFFFFF0)); 1723 1724 for(j = 1; j<(int32_t)element->noOfCEs; j++) { 1725 uprv_uca_addExpansion(expansions, element->CEs[j], status); 1726 } 1727 if(element->noOfCEs <= 0xF) { 1728 expansion |= element->noOfCEs; 1729 } else { 1730 uprv_uca_addExpansion(expansions, 0, status); 1731 } 1732 element->mapCE = 
expansion; 1733 uprv_uca_setMaxExpansion(element->CEs[element->noOfCEs - 1], 1734 (uint8_t)element->noOfCEs, 1735 t->maxExpansions, 1736 status); 1737 } 1738 } 1739 1740 static void 1741 uprv_uca_addFCD4AccentedContractions(tempUCATable *t, 1742 UCollationElements* colEl, 1743 UChar *data, 1744 int32_t len, 1745 UCAElements *el, 1746 UErrorCode *status) { 1747 UChar decomp[256], comp[256]; 1748 int32_t decLen, compLen; 1749 1750 decLen = unorm_normalize(data, len, UNORM_NFD, 0, decomp, 256, status); 1751 compLen = unorm_normalize(data, len, UNORM_NFC, 0, comp, 256, status); 1752 decomp[decLen] = comp[compLen] = 0; 1753 1754 el->cPoints = decomp; 1755 el->cSize = decLen; 1756 el->noOfCEs = 0; 1757 el->prefixSize = 0; 1758 el->prefix = el->prefixChars; 1759 1760 UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, el); 1761 el->cPoints = comp; 1762 el->cSize = compLen; 1763 el->prefix = el->prefixChars; 1764 el->prefixSize = 0; 1765 if(prefix == NULL) { 1766 el->noOfCEs = 0; 1767 ucol_setText(colEl, decomp, decLen, status); 1768 while((el->CEs[el->noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) { 1769 el->noOfCEs++; 1770 } 1771 uprv_uca_setMapCE(t, el, status); 1772 uprv_uca_addAnElement(t, el, status); 1773 } 1774 } 1775 1776 static void 1777 uprv_uca_addMultiCMContractions(tempUCATable *t, 1778 UCollationElements* colEl, 1779 tempTailorContext *c, 1780 UCAElements *el, 1781 UErrorCode *status) { 1782 CombinClassTable *cmLookup = t->cmLookup; 1783 UChar newDecomp[256]; 1784 int32_t maxComp, newDecLen; 1785 UChar32 fcdHighStart; 1786 const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); 1787 if (U_FAILURE(*status)) { 1788 return; 1789 } 1790 int16_t curClass = (unorm_getFCD16(fcdTrieIndex, c->tailoringCM) & 0xff); 1791 CompData *precomp = c->precomp; 1792 int32_t compLen = c->compLen; 1793 UChar *comp = c->comp; 1794 maxComp = c->precompLen; 1795 1796 for (int32_t j=0; j < maxComp; j++) { 1797 int32_t count=0; 1798 do 
{ 1799 if ( count == 0 ) { // Decompose the saved precomposed char. 1800 UChar temp[2]; 1801 temp[0]=precomp[j].cp; 1802 temp[1]=0; 1803 newDecLen = unorm_normalize(temp, 1, UNORM_NFD, 0, 1804 newDecomp, sizeof(newDecomp)/sizeof(UChar), status); 1805 newDecomp[newDecLen++] = cmLookup->cPoints[c->cmPos]; 1806 } 1807 else { // swap 2 combining marks when they are equal. 1808 uprv_memcpy(newDecomp, c->decomp, sizeof(UChar)*(c->decompLen)); 1809 newDecLen = c->decompLen; 1810 newDecomp[newDecLen++] = precomp[j].cClass; 1811 } 1812 newDecomp[newDecLen] = 0; 1813 compLen = unorm_normalize(newDecomp, newDecLen, UNORM_NFC, 0, 1814 comp, 256, status); 1815 if (compLen==1) { 1816 comp[compLen++] = newDecomp[newDecLen++] = c->tailoringCM; 1817 comp[compLen] = newDecomp[newDecLen] = 0; 1818 el->cPoints = newDecomp; 1819 el->cSize = newDecLen; 1820 1821 UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, el); 1822 el->cPoints = c->comp; 1823 el->cSize = compLen; 1824 el->prefix = el->prefixChars; 1825 el->prefixSize = 0; 1826 if(prefix == NULL) { 1827 el->noOfCEs = 0; 1828 ucol_setText(colEl, newDecomp, newDecLen, status); 1829 while((el->CEs[el->noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) { 1830 el->noOfCEs++; 1831 } 1832 uprv_uca_setMapCE(t, el, status); 1833 uprv_uca_finalizeAddition(t, el, status); 1834 1835 // Save the current precomposed char and its class to find any 1836 // other combining mark combinations. 
1837 precomp[c->precompLen].cp=comp[0]; 1838 precomp[c->precompLen].cClass = curClass; 1839 c->precompLen++; 1840 } 1841 } 1842 } while (++count<2 && (precomp[j].cClass == curClass)); 1843 } 1844 1845 } 1846 1847 static void 1848 uprv_uca_addTailCanonicalClosures(tempUCATable *t, 1849 UCollationElements* colEl, 1850 UChar baseCh, 1851 UChar cMark, 1852 UCAElements *el, 1853 UErrorCode *status) { 1854 CombinClassTable *cmLookup = t->cmLookup; 1855 UChar32 fcdHighStart; 1856 const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); 1857 if (U_FAILURE(*status)) { 1858 return; 1859 } 1860 int16_t maxIndex = (unorm_getFCD16(fcdTrieIndex, cMark) & 0xff ); 1861 UCAElements element; 1862 uint16_t *index; 1863 UChar decomp[256]; 1864 UChar comp[256]; 1865 CompData precomp[256]; // precomposed array 1866 int32_t precompLen = 0; // count for precomp 1867 int32_t i, len, decompLen, curClass, replacedPos; 1868 tempTailorContext c; 1869 1870 if ( cmLookup == NULL ) { 1871 return; 1872 } 1873 index = cmLookup->index; 1874 int32_t cClass=(unorm_getFCD16(fcdTrieIndex, cMark) & 0xff); 1875 maxIndex = (int32_t)index[(unorm_getFCD16(fcdTrieIndex, cMark) & 0xff)-1]; 1876 c.comp = comp; 1877 c.decomp = decomp; 1878 c.precomp = precomp; 1879 c.tailoringCM = cMark; 1880 1881 if (cClass>0) { 1882 maxIndex = (int32_t)index[cClass-1]; 1883 } 1884 else { 1885 maxIndex=0; 1886 } 1887 decomp[0]=baseCh; 1888 for ( i=0; i<maxIndex ; i++ ) { 1889 decomp[1] = cmLookup->cPoints[i]; 1890 decomp[2]=0; 1891 decompLen=2; 1892 len = unorm_normalize(decomp, decompLen, UNORM_NFC, 0, comp, 256, status); 1893 if (len==1) { 1894 // Save the current precomposed char and its class to find any 1895 // other combining mark combinations. 
1896 precomp[precompLen].cp=comp[0]; 1897 curClass = precomp[precompLen].cClass = 1898 index[unorm_getFCD16(fcdTrieIndex, decomp[1]) & 0xff]; 1899 precompLen++; 1900 replacedPos=0; 1901 for (decompLen=0; decompLen< (int32_t)el->cSize; decompLen++) { 1902 decomp[decompLen] = el->cPoints[decompLen]; 1903 if (decomp[decompLen]==cMark) { 1904 replacedPos = decompLen; // record the position for later use 1905 } 1906 } 1907 if ( replacedPos != 0 ) { 1908 decomp[replacedPos]=cmLookup->cPoints[i]; 1909 } 1910 decomp[decompLen] = 0; 1911 len = unorm_normalize(decomp, decompLen, UNORM_NFC, 0, comp, 256, status); 1912 comp[len++] = decomp[decompLen++] = cMark; 1913 comp[len] = decomp[decompLen] = 0; 1914 element.cPoints = decomp; 1915 element.cSize = decompLen; 1916 element.noOfCEs = 0; 1917 element.prefix = el->prefixChars; 1918 element.prefixSize = 0; 1919 1920 UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, &element); 1921 element.cPoints = comp; 1922 element.cSize = len; 1923 element.prefix = el->prefixChars; 1924 element.prefixSize = 0; 1925 if(prefix == NULL) { 1926 element.noOfCEs = 0; 1927 ucol_setText(colEl, decomp, decompLen, status); 1928 while((element.CEs[element.noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) { 1929 element.noOfCEs++; 1930 } 1931 uprv_uca_setMapCE(t, &element, status); 1932 uprv_uca_finalizeAddition(t, &element, status); 1933 } 1934 1935 // This is a fix for tailoring contractions with accented 1936 // character at the end of contraction string. 
1937 if ((len>2) && 1938 (unorm_getFCD16(fcdTrieIndex, comp[len-2]) & 0xff00)==0) { 1939 uprv_uca_addFCD4AccentedContractions(t, colEl, comp, len, &element, status); 1940 } 1941 1942 if (precompLen >1) { 1943 c.compLen = len; 1944 c.decompLen = decompLen; 1945 c.precompLen = precompLen; 1946 c.cmPos = i; 1947 uprv_uca_addMultiCMContractions(t, colEl, &c, &element, status); 1948 precompLen = c.precompLen; 1949 } 1950 } 1951 } 1952 } 1953 1954 U_CFUNC int32_t U_EXPORT2 1955 uprv_uca_canonicalClosure(tempUCATable *t, 1956 UColTokenParser *src, 1957 UnicodeSet *closed, 1958 UErrorCode *status) 1959 { 1960 enumStruct context; 1961 context.closed = closed; 1962 context.noOfClosures = 0; 1963 UCAElements el; 1964 UColToken *tok; 1965 uint32_t i = 0, j = 0; 1966 UChar baseChar, firstCM; 1967 UChar32 fcdHighStart; 1968 const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); 1969 context.nfcImpl=Normalizer2Factory::getNFCImpl(*status); 1970 if(U_FAILURE(*status)) { 1971 return 0; 1972 } 1973 1974 UCollator *tempColl = NULL; 1975 tempUCATable *tempTable = uprv_uca_cloneTempTable(t, status); 1976 // Check for null pointer 1977 if (U_FAILURE(*status)) { 1978 return 0; 1979 } 1980 1981 UCATableHeader *tempData = uprv_uca_assembleTable(tempTable, status); 1982 tempColl = ucol_initCollator(tempData, 0, t->UCA, status); 1983 if ( tempTable->cmLookup != NULL ) { 1984 t->cmLookup = tempTable->cmLookup; // copy over to t 1985 tempTable->cmLookup = NULL; 1986 } 1987 uprv_uca_closeTempTable(tempTable); 1988 1989 if(U_SUCCESS(*status)) { 1990 tempColl->ucaRules = NULL; 1991 tempColl->actualLocale = NULL; 1992 tempColl->validLocale = NULL; 1993 tempColl->requestedLocale = NULL; 1994 tempColl->hasRealData = TRUE; 1995 tempColl->freeImageOnClose = TRUE; 1996 } else if(tempData != 0) { 1997 uprv_free(tempData); 1998 } 1999 2000 /* produce canonical closure */ 2001 UCollationElements* colEl = ucol_openElements(tempColl, NULL, 0, status); 2002 // Check for null pointer 2003 
if (U_FAILURE(*status)) { 2004 return 0; 2005 } 2006 context.t = t; 2007 context.tempColl = tempColl; 2008 context.colEl = colEl; 2009 context.status = status; 2010 u_enumCharTypes(_enumCategoryRangeClosureCategory, &context); 2011 2012 if ( (src==NULL) || !src->buildCCTabFlag ) { 2013 ucol_closeElements(colEl); 2014 ucol_close(tempColl); 2015 return context.noOfClosures; // no extra contraction needed to add 2016 } 2017 2018 for (i=0; i < src->resultLen; i++) { 2019 baseChar = firstCM= (UChar)0; 2020 tok = src->lh[i].first; 2021 while (tok != NULL && U_SUCCESS(*status)) { 2022 el.prefix = el.prefixChars; 2023 el.cPoints = el.uchars; 2024 if(tok->prefix != 0) { 2025 el.prefixSize = tok->prefix>>24; 2026 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar)); 2027 2028 el.cSize = (tok->source >> 24)-(tok->prefix>>24); 2029 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar)); 2030 } else { 2031 el.prefixSize = 0; 2032 *el.prefix = 0; 2033 2034 el.cSize = (tok->source >> 24); 2035 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar)); 2036 } 2037 if(src->UCA != NULL) { 2038 for(j = 0; j<el.cSize; j++) { 2039 int16_t fcd = unorm_getFCD16(fcdTrieIndex, el.cPoints[j]); 2040 if ( (fcd & 0xff) == 0 ) { 2041 baseChar = el.cPoints[j]; // last base character 2042 firstCM=0; // reset combining mark value 2043 } 2044 else { 2045 if ( (baseChar!=0) && (firstCM==0) ) { 2046 firstCM = el.cPoints[j]; // first combining mark 2047 } 2048 } 2049 } 2050 } 2051 if ( (baseChar!= (UChar)0) && (firstCM != (UChar)0) ) { 2052 // find all the canonical rules 2053 uprv_uca_addTailCanonicalClosures(t, colEl, baseChar, firstCM, &el, status); 2054 } 2055 tok = tok->next; 2056 } 2057 } 2058 ucol_closeElements(colEl); 2059 ucol_close(tempColl); 2060 2061 return context.noOfClosures; 2062 } 2063 2064 #endif /* #if !UCONFIG_NO_COLLATION */ 2065