1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2001-2012, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: ucaelems.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created 02/22/2001 14 * created by: Vladimir Weinstein 15 * 16 * This program reads the Franctional UCA table and generates 17 * internal format for UCA table as well as inverse UCA table. 18 * It then writes binary files containing the data: ucadata.dat 19 * & invuca.dat 20 * 21 * date name comments 22 * 03/02/2001 synwee added setMaxExpansion 23 * 03/07/2001 synwee merged UCA's maxexpansion and tailoring's 24 */ 25 26 #include "unicode/utypes.h" 27 28 #if !UCONFIG_NO_COLLATION 29 30 #include "unicode/uchar.h" 31 #include "unicode/unistr.h" 32 #include "unicode/ucoleitr.h" 33 #include "unicode/normlzr.h" 34 #include "unicode/utf16.h" 35 #include "normalizer2impl.h" 36 #include "ucol_elm.h" 37 #include "ucol_tok.h" 38 #include "ucol_cnt.h" 39 #include "unicode/caniter.h" 40 #include "cmemory.h" 41 #include "uassert.h" 42 43 U_NAMESPACE_USE 44 45 static uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements *element, uint32_t existingCE, UErrorCode *status); 46 47 U_CDECL_BEGIN 48 static int32_t U_CALLCONV 49 prefixLookupHash(const UHashTok e) { 50 UCAElements *element = (UCAElements *)e.pointer; 51 UChar buf[256]; 52 UHashTok key; 53 key.pointer = buf; 54 uprv_memcpy(buf, element->cPoints, element->cSize*sizeof(UChar)); 55 buf[element->cSize] = 0; 56 //key.pointer = element->cPoints; 57 //element->cPoints[element->cSize] = 0; 58 return uhash_hashUChars(key); 59 } 60 61 static int8_t U_CALLCONV 62 prefixLookupComp(const UHashTok e1, const UHashTok e2) { 63 UCAElements *element1 = (UCAElements *)e1.pointer; 64 UCAElements *element2 = (UCAElements *)e2.pointer; 
65 66 UChar buf1[256]; 67 UHashTok key1; 68 key1.pointer = buf1; 69 uprv_memcpy(buf1, element1->cPoints, element1->cSize*sizeof(UChar)); 70 buf1[element1->cSize] = 0; 71 72 UChar buf2[256]; 73 UHashTok key2; 74 key2.pointer = buf2; 75 uprv_memcpy(buf2, element2->cPoints, element2->cSize*sizeof(UChar)); 76 buf2[element2->cSize] = 0; 77 78 return uhash_compareUChars(key1, key2); 79 } 80 U_CDECL_END 81 82 static int32_t uprv_uca_addExpansion(ExpansionTable *expansions, uint32_t value, UErrorCode *status) { 83 if(U_FAILURE(*status)) { 84 return 0; 85 } 86 if(expansions->CEs == NULL) { 87 expansions->CEs = (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE*sizeof(uint32_t)); 88 /* test for NULL */ 89 if (expansions->CEs == NULL) { 90 *status = U_MEMORY_ALLOCATION_ERROR; 91 return 0; 92 } 93 expansions->size = INIT_EXP_TABLE_SIZE; 94 expansions->position = 0; 95 } 96 97 if(expansions->position == expansions->size) { 98 uint32_t *newData = (uint32_t *)uprv_realloc(expansions->CEs, 2*expansions->size*sizeof(uint32_t)); 99 if(newData == NULL) { 100 #ifdef UCOL_DEBUG 101 fprintf(stderr, "out of memory for expansions\n"); 102 #endif 103 *status = U_MEMORY_ALLOCATION_ERROR; 104 return -1; 105 } 106 expansions->CEs = newData; 107 expansions->size *= 2; 108 } 109 110 expansions->CEs[expansions->position] = value; 111 return(expansions->position++); 112 } 113 114 U_CAPI tempUCATable* U_EXPORT2 115 uprv_uca_initTempTable(UCATableHeader *image, UColOptionSet *opts, const UCollator *UCA, UColCETags initTag, UColCETags supplementaryInitTag, UErrorCode *status) { 116 MaxJamoExpansionTable *maxjet; 117 MaxExpansionTable *maxet; 118 tempUCATable *t = (tempUCATable *)uprv_malloc(sizeof(tempUCATable)); 119 /* test for NULL */ 120 if (t == NULL) { 121 *status = U_MEMORY_ALLOCATION_ERROR; 122 return NULL; 123 } 124 uprv_memset(t, 0, sizeof(tempUCATable)); 125 126 maxet = (MaxExpansionTable *)uprv_malloc(sizeof(MaxExpansionTable)); 127 if (maxet == NULL) { 128 goto allocation_failure; 129 } 130 
uprv_memset(maxet, 0, sizeof(MaxExpansionTable)); 131 t->maxExpansions = maxet; 132 133 maxjet = (MaxJamoExpansionTable *)uprv_malloc(sizeof(MaxJamoExpansionTable)); 134 if (maxjet == NULL) { 135 goto allocation_failure; 136 } 137 uprv_memset(maxjet, 0, sizeof(MaxJamoExpansionTable)); 138 t->maxJamoExpansions = maxjet; 139 140 t->image = image; 141 t->options = opts; 142 143 t->UCA = UCA; 144 t->expansions = (ExpansionTable *)uprv_malloc(sizeof(ExpansionTable)); 145 /* test for NULL */ 146 if (t->expansions == NULL) { 147 goto allocation_failure; 148 } 149 uprv_memset(t->expansions, 0, sizeof(ExpansionTable)); 150 151 t->mapping = utrie_open(NULL, NULL, UCOL_ELM_TRIE_CAPACITY, 152 UCOL_SPECIAL_FLAG | (initTag<<24), 153 UCOL_SPECIAL_FLAG | (supplementaryInitTag << 24), 154 TRUE); // Do your own mallocs for the structure, array and have linear Latin 1 155 if (U_FAILURE(*status)) { 156 goto allocation_failure; 157 } 158 t->prefixLookup = uhash_open(prefixLookupHash, prefixLookupComp, NULL, status); 159 if (U_FAILURE(*status)) { 160 goto allocation_failure; 161 } 162 uhash_setValueDeleter(t->prefixLookup, uprv_free); 163 164 t->contractions = uprv_cnttab_open(t->mapping, status); 165 if (U_FAILURE(*status)) { 166 goto cleanup; 167 } 168 169 /* copy UCA's maxexpansion and merge as we go along */ 170 if (UCA != NULL) { 171 /* adding an extra initial value for easier manipulation */ 172 maxet->size = (int32_t)(UCA->lastEndExpansionCE - UCA->endExpansionCE) + 2; 173 maxet->position = maxet->size - 1; 174 maxet->endExpansionCE = 175 (uint32_t *)uprv_malloc(sizeof(uint32_t) * maxet->size); 176 /* test for NULL */ 177 if (maxet->endExpansionCE == NULL) { 178 goto allocation_failure; 179 } 180 maxet->expansionCESize = 181 (uint8_t *)uprv_malloc(sizeof(uint8_t) * maxet->size); 182 /* test for NULL */ 183 if (maxet->expansionCESize == NULL) { 184 goto allocation_failure; 185 } 186 /* initialized value */ 187 *(maxet->endExpansionCE) = 0; 188 *(maxet->expansionCESize) = 0; 189 
uprv_memcpy(maxet->endExpansionCE + 1, UCA->endExpansionCE, 190 sizeof(uint32_t) * (maxet->size - 1)); 191 uprv_memcpy(maxet->expansionCESize + 1, UCA->expansionCESize, 192 sizeof(uint8_t) * (maxet->size - 1)); 193 } 194 else { 195 maxet->size = 0; 196 } 197 maxjet->endExpansionCE = NULL; 198 maxjet->isV = NULL; 199 maxjet->size = 0; 200 maxjet->position = 0; 201 maxjet->maxLSize = 1; 202 maxjet->maxVSize = 1; 203 maxjet->maxTSize = 1; 204 205 t->unsafeCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE); 206 /* test for NULL */ 207 if (t->unsafeCP == NULL) { 208 goto allocation_failure; 209 } 210 t->contrEndCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE); 211 /* test for NULL */ 212 if (t->contrEndCP == NULL) { 213 goto allocation_failure; 214 } 215 uprv_memset(t->unsafeCP, 0, UCOL_UNSAFECP_TABLE_SIZE); 216 uprv_memset(t->contrEndCP, 0, UCOL_UNSAFECP_TABLE_SIZE); 217 t->cmLookup = NULL; 218 return t; 219 220 allocation_failure: 221 *status = U_MEMORY_ALLOCATION_ERROR; 222 cleanup: 223 uprv_uca_closeTempTable(t); 224 return NULL; 225 } 226 227 static tempUCATable* U_EXPORT2 228 uprv_uca_cloneTempTable(tempUCATable *t, UErrorCode *status) { 229 if(U_FAILURE(*status)) { 230 return NULL; 231 } 232 233 tempUCATable *r = (tempUCATable *)uprv_malloc(sizeof(tempUCATable)); 234 /* test for NULL */ 235 if (r == NULL) { 236 *status = U_MEMORY_ALLOCATION_ERROR; 237 return NULL; 238 } 239 uprv_memset(r, 0, sizeof(tempUCATable)); 240 241 /* mapping */ 242 if(t->mapping != NULL) { 243 /*r->mapping = ucmpe32_clone(t->mapping, status);*/ 244 r->mapping = utrie_clone(NULL, t->mapping, NULL, 0); 245 } 246 247 // a hashing clone function would be very nice. We have none currently... 248 // However, we should be good, as closing should not produce any prefixed elements. 
249 r->prefixLookup = NULL; // prefixes are not used in closing 250 251 /* expansions */ 252 if(t->expansions != NULL) { 253 r->expansions = (ExpansionTable *)uprv_malloc(sizeof(ExpansionTable)); 254 /* test for NULL */ 255 if (r->expansions == NULL) { 256 *status = U_MEMORY_ALLOCATION_ERROR; 257 goto cleanup; 258 } 259 r->expansions->position = t->expansions->position; 260 r->expansions->size = t->expansions->size; 261 if(t->expansions->CEs != NULL) { 262 r->expansions->CEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->expansions->size); 263 /* test for NULL */ 264 if (r->expansions->CEs == NULL) { 265 *status = U_MEMORY_ALLOCATION_ERROR; 266 goto cleanup; 267 } 268 uprv_memcpy(r->expansions->CEs, t->expansions->CEs, sizeof(uint32_t)*t->expansions->position); 269 } else { 270 r->expansions->CEs = NULL; 271 } 272 } 273 274 if(t->contractions != NULL) { 275 r->contractions = uprv_cnttab_clone(t->contractions, status); 276 // Check for cloning failure. 277 if (r->contractions == NULL) { 278 *status = U_MEMORY_ALLOCATION_ERROR; 279 goto cleanup; 280 } 281 r->contractions->mapping = r->mapping; 282 } 283 284 if(t->maxExpansions != NULL) { 285 r->maxExpansions = (MaxExpansionTable *)uprv_malloc(sizeof(MaxExpansionTable)); 286 /* test for NULL */ 287 if (r->maxExpansions == NULL) { 288 *status = U_MEMORY_ALLOCATION_ERROR; 289 goto cleanup; 290 } 291 r->maxExpansions->size = t->maxExpansions->size; 292 r->maxExpansions->position = t->maxExpansions->position; 293 if(t->maxExpansions->endExpansionCE != NULL) { 294 r->maxExpansions->endExpansionCE = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->maxExpansions->size); 295 /* test for NULL */ 296 if (r->maxExpansions->endExpansionCE == NULL) { 297 *status = U_MEMORY_ALLOCATION_ERROR; 298 goto cleanup; 299 } 300 uprv_memset(r->maxExpansions->endExpansionCE, 0xDB, sizeof(uint32_t)*t->maxExpansions->size); 301 uprv_memcpy(r->maxExpansions->endExpansionCE, t->maxExpansions->endExpansionCE, 
t->maxExpansions->position*sizeof(uint32_t)); 302 } else { 303 r->maxExpansions->endExpansionCE = NULL; 304 } 305 if(t->maxExpansions->expansionCESize != NULL) { 306 r->maxExpansions->expansionCESize = (uint8_t *)uprv_malloc(sizeof(uint8_t)*t->maxExpansions->size); 307 /* test for NULL */ 308 if (r->maxExpansions->expansionCESize == NULL) { 309 *status = U_MEMORY_ALLOCATION_ERROR; 310 goto cleanup; 311 } 312 uprv_memset(r->maxExpansions->expansionCESize, 0xDB, sizeof(uint8_t)*t->maxExpansions->size); 313 uprv_memcpy(r->maxExpansions->expansionCESize, t->maxExpansions->expansionCESize, t->maxExpansions->position*sizeof(uint8_t)); 314 } else { 315 r->maxExpansions->expansionCESize = NULL; 316 } 317 } 318 319 if(t->maxJamoExpansions != NULL) { 320 r->maxJamoExpansions = (MaxJamoExpansionTable *)uprv_malloc(sizeof(MaxJamoExpansionTable)); 321 /* test for NULL */ 322 if (r->maxJamoExpansions == NULL) { 323 *status = U_MEMORY_ALLOCATION_ERROR; 324 goto cleanup; 325 } 326 r->maxJamoExpansions->size = t->maxJamoExpansions->size; 327 r->maxJamoExpansions->position = t->maxJamoExpansions->position; 328 r->maxJamoExpansions->maxLSize = t->maxJamoExpansions->maxLSize; 329 r->maxJamoExpansions->maxVSize = t->maxJamoExpansions->maxVSize; 330 r->maxJamoExpansions->maxTSize = t->maxJamoExpansions->maxTSize; 331 if(t->maxJamoExpansions->size != 0) { 332 r->maxJamoExpansions->endExpansionCE = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->maxJamoExpansions->size); 333 /* test for NULL */ 334 if (r->maxJamoExpansions->endExpansionCE == NULL) { 335 *status = U_MEMORY_ALLOCATION_ERROR; 336 goto cleanup; 337 } 338 uprv_memcpy(r->maxJamoExpansions->endExpansionCE, t->maxJamoExpansions->endExpansionCE, t->maxJamoExpansions->position*sizeof(uint32_t)); 339 r->maxJamoExpansions->isV = (UBool *)uprv_malloc(sizeof(UBool)*t->maxJamoExpansions->size); 340 /* test for NULL */ 341 if (r->maxJamoExpansions->isV == NULL) { 342 *status = U_MEMORY_ALLOCATION_ERROR; 343 goto cleanup; 344 } 345 
uprv_memcpy(r->maxJamoExpansions->isV, t->maxJamoExpansions->isV, t->maxJamoExpansions->position*sizeof(UBool)); 346 } else { 347 r->maxJamoExpansions->endExpansionCE = NULL; 348 r->maxJamoExpansions->isV = NULL; 349 } 350 } 351 352 if(t->unsafeCP != NULL) { 353 r->unsafeCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE); 354 /* test for NULL */ 355 if (r->unsafeCP == NULL) { 356 *status = U_MEMORY_ALLOCATION_ERROR; 357 goto cleanup; 358 } 359 uprv_memcpy(r->unsafeCP, t->unsafeCP, UCOL_UNSAFECP_TABLE_SIZE); 360 } 361 362 if(t->contrEndCP != NULL) { 363 r->contrEndCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE); 364 /* test for NULL */ 365 if (r->contrEndCP == NULL) { 366 *status = U_MEMORY_ALLOCATION_ERROR; 367 goto cleanup; 368 } 369 uprv_memcpy(r->contrEndCP, t->contrEndCP, UCOL_UNSAFECP_TABLE_SIZE); 370 } 371 372 r->UCA = t->UCA; 373 r->image = t->image; 374 r->options = t->options; 375 376 return r; 377 cleanup: 378 uprv_uca_closeTempTable(t); 379 return NULL; 380 } 381 382 383 U_CAPI void U_EXPORT2 384 uprv_uca_closeTempTable(tempUCATable *t) { 385 if(t != NULL) { 386 if (t->expansions != NULL) { 387 uprv_free(t->expansions->CEs); 388 uprv_free(t->expansions); 389 } 390 if(t->contractions != NULL) { 391 uprv_cnttab_close(t->contractions); 392 } 393 if (t->mapping != NULL) { 394 utrie_close(t->mapping); 395 } 396 397 if(t->prefixLookup != NULL) { 398 uhash_close(t->prefixLookup); 399 } 400 401 if (t->maxExpansions != NULL) { 402 uprv_free(t->maxExpansions->endExpansionCE); 403 uprv_free(t->maxExpansions->expansionCESize); 404 uprv_free(t->maxExpansions); 405 } 406 407 if (t->maxJamoExpansions->size > 0) { 408 uprv_free(t->maxJamoExpansions->endExpansionCE); 409 uprv_free(t->maxJamoExpansions->isV); 410 } 411 uprv_free(t->maxJamoExpansions); 412 413 uprv_free(t->unsafeCP); 414 uprv_free(t->contrEndCP); 415 416 if (t->cmLookup != NULL) { 417 uprv_free(t->cmLookup->cPoints); 418 uprv_free(t->cmLookup); 419 } 420 421 uprv_free(t); 422 } 423 } 424 425 /** 
426 * Looks for the maximum length of all expansion sequences ending with the same 427 * collation element. The size required for maxexpansion and maxsize is 428 * returned if the arrays are too small. 429 * @param endexpansion the last expansion collation element to be added 430 * @param expansionsize size of the expansion 431 * @param maxexpansion data structure to store the maximum expansion data. 432 * @param status error status 433 * @returns size of the maxexpansion and maxsize used. 434 */ 435 static int uprv_uca_setMaxExpansion(uint32_t endexpansion, 436 uint8_t expansionsize, 437 MaxExpansionTable *maxexpansion, 438 UErrorCode *status) 439 { 440 if (maxexpansion->size == 0) { 441 /* we'll always make the first element 0, for easier manipulation */ 442 maxexpansion->endExpansionCE = 443 (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(int32_t)); 444 /* test for NULL */ 445 if (maxexpansion->endExpansionCE == NULL) { 446 *status = U_MEMORY_ALLOCATION_ERROR; 447 return 0; 448 } 449 *(maxexpansion->endExpansionCE) = 0; 450 maxexpansion->expansionCESize = 451 (uint8_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(uint8_t)); 452 /* test for NULL */; 453 if (maxexpansion->expansionCESize == NULL) { 454 *status = U_MEMORY_ALLOCATION_ERROR; 455 return 0; 456 } 457 *(maxexpansion->expansionCESize) = 0; 458 maxexpansion->size = INIT_EXP_TABLE_SIZE; 459 maxexpansion->position = 0; 460 } 461 462 if (maxexpansion->position + 1 == maxexpansion->size) { 463 uint32_t *neweece = (uint32_t *)uprv_realloc(maxexpansion->endExpansionCE, 464 2 * maxexpansion->size * sizeof(uint32_t)); 465 if (neweece == NULL) { 466 *status = U_MEMORY_ALLOCATION_ERROR; 467 return 0; 468 } 469 maxexpansion->endExpansionCE = neweece; 470 471 uint8_t *neweces = (uint8_t *)uprv_realloc(maxexpansion->expansionCESize, 472 2 * maxexpansion->size * sizeof(uint8_t)); 473 if (neweces == NULL) { 474 *status = U_MEMORY_ALLOCATION_ERROR; 475 return 0; 476 } 477 maxexpansion->expansionCESize = neweces; 478 
maxexpansion->size *= 2; 479 } 480 481 uint32_t *pendexpansionce = maxexpansion->endExpansionCE; 482 uint8_t *pexpansionsize = maxexpansion->expansionCESize; 483 int pos = maxexpansion->position; 484 485 uint32_t *start = pendexpansionce; 486 uint32_t *limit = pendexpansionce + pos; 487 488 /* using binary search to determine if last expansion element is 489 already in the array */ 490 uint32_t *mid; 491 int result = -1; 492 while (start < limit - 1) { 493 mid = start + ((limit - start) >> 1); 494 if (endexpansion <= *mid) { 495 limit = mid; 496 } 497 else { 498 start = mid; 499 } 500 } 501 502 if (*start == endexpansion) { 503 result = (int)(start - pendexpansionce); 504 } 505 else if (*limit == endexpansion) { 506 result = (int)(limit - pendexpansionce); 507 } 508 509 if (result > -1) { 510 /* found the ce in expansion, we'll just modify the size if it is 511 smaller */ 512 uint8_t *currentsize = pexpansionsize + result; 513 if (*currentsize < expansionsize) { 514 *currentsize = expansionsize; 515 } 516 } 517 else { 518 /* we'll need to squeeze the value into the array. 519 initial implementation. 
*/ 520 /* shifting the subarray down by 1 */ 521 int shiftsize = (int)((pendexpansionce + pos) - start); 522 uint32_t *shiftpos = start + 1; 523 uint8_t *sizeshiftpos = pexpansionsize + (shiftpos - pendexpansionce); 524 525 /* okay need to rearrange the array into sorted order */ 526 if (shiftsize == 0 /*|| *(pendexpansionce + pos) < endexpansion*/) { /* the commented part is actually both redundant and dangerous */ 527 *(pendexpansionce + pos + 1) = endexpansion; 528 *(pexpansionsize + pos + 1) = expansionsize; 529 } 530 else { 531 uprv_memmove(shiftpos + 1, shiftpos, shiftsize * sizeof(int32_t)); 532 uprv_memmove(sizeshiftpos + 1, sizeshiftpos, 533 shiftsize * sizeof(uint8_t)); 534 *shiftpos = endexpansion; 535 *sizeshiftpos = expansionsize; 536 } 537 maxexpansion->position ++; 538 539 #ifdef UCOL_DEBUG 540 int temp; 541 UBool found = FALSE; 542 for (temp = 0; temp < maxexpansion->position; temp ++) { 543 if (pendexpansionce[temp] >= pendexpansionce[temp + 1]) { 544 fprintf(stderr, "expansions %d\n", temp); 545 } 546 if (pendexpansionce[temp] == endexpansion) { 547 found =TRUE; 548 if (pexpansionsize[temp] < expansionsize) { 549 fprintf(stderr, "expansions size %d\n", temp); 550 } 551 } 552 } 553 if (pendexpansionce[temp] == endexpansion) { 554 found =TRUE; 555 if (pexpansionsize[temp] < expansionsize) { 556 fprintf(stderr, "expansions size %d\n", temp); 557 } 558 } 559 if (!found) 560 fprintf(stderr, "expansion not found %d\n", temp); 561 #endif 562 } 563 564 return maxexpansion->position; 565 } 566 567 /** 568 * Sets the maximum length of all jamo expansion sequences ending with the same 569 * collation element. The size required for maxexpansion and maxsize is 570 * returned if the arrays are too small. 571 * @param ch the jamo codepoint 572 * @param endexpansion the last expansion collation element to be added 573 * @param expansionsize size of the expansion 574 * @param maxexpansion data structure to store the maximum expansion data. 
575 * @param status error status 576 * @returns size of the maxexpansion and maxsize used. 577 */ 578 static int uprv_uca_setMaxJamoExpansion(UChar ch, 579 uint32_t endexpansion, 580 uint8_t expansionsize, 581 MaxJamoExpansionTable *maxexpansion, 582 UErrorCode *status) 583 { 584 UBool isV = TRUE; 585 if (((uint32_t)ch - 0x1100) <= (0x1112 - 0x1100)) { 586 /* determines L for Jamo, doesn't need to store this since it is never 587 at the end of a expansion */ 588 if (maxexpansion->maxLSize < expansionsize) { 589 maxexpansion->maxLSize = expansionsize; 590 } 591 return maxexpansion->position; 592 } 593 594 if (((uint32_t)ch - 0x1161) <= (0x1175 - 0x1161)) { 595 /* determines V for Jamo */ 596 if (maxexpansion->maxVSize < expansionsize) { 597 maxexpansion->maxVSize = expansionsize; 598 } 599 } 600 601 if (((uint32_t)ch - 0x11A8) <= (0x11C2 - 0x11A8)) { 602 isV = FALSE; 603 /* determines T for Jamo */ 604 if (maxexpansion->maxTSize < expansionsize) { 605 maxexpansion->maxTSize = expansionsize; 606 } 607 } 608 609 if (maxexpansion->size == 0) { 610 /* we'll always make the first element 0, for easier manipulation */ 611 maxexpansion->endExpansionCE = 612 (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(uint32_t)); 613 /* test for NULL */; 614 if (maxexpansion->endExpansionCE == NULL) { 615 *status = U_MEMORY_ALLOCATION_ERROR; 616 return 0; 617 } 618 *(maxexpansion->endExpansionCE) = 0; 619 maxexpansion->isV = 620 (UBool *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(UBool)); 621 /* test for NULL */; 622 if (maxexpansion->isV == NULL) { 623 *status = U_MEMORY_ALLOCATION_ERROR; 624 uprv_free(maxexpansion->endExpansionCE); 625 maxexpansion->endExpansionCE = NULL; 626 return 0; 627 } 628 *(maxexpansion->isV) = 0; 629 maxexpansion->size = INIT_EXP_TABLE_SIZE; 630 maxexpansion->position = 0; 631 } 632 633 if (maxexpansion->position + 1 == maxexpansion->size) { 634 maxexpansion->size *= 2; 635 maxexpansion->endExpansionCE = (uint32_t 
*)uprv_realloc(maxexpansion->endExpansionCE, 636 maxexpansion->size * sizeof(uint32_t)); 637 if (maxexpansion->endExpansionCE == NULL) { 638 #ifdef UCOL_DEBUG 639 fprintf(stderr, "out of memory for maxExpansions\n"); 640 #endif 641 *status = U_MEMORY_ALLOCATION_ERROR; 642 return 0; 643 } 644 maxexpansion->isV = (UBool *)uprv_realloc(maxexpansion->isV, 645 maxexpansion->size * sizeof(UBool)); 646 if (maxexpansion->isV == NULL) { 647 #ifdef UCOL_DEBUG 648 fprintf(stderr, "out of memory for maxExpansions\n"); 649 #endif 650 *status = U_MEMORY_ALLOCATION_ERROR; 651 uprv_free(maxexpansion->endExpansionCE); 652 maxexpansion->endExpansionCE = NULL; 653 return 0; 654 } 655 } 656 657 uint32_t *pendexpansionce = maxexpansion->endExpansionCE; 658 int pos = maxexpansion->position; 659 660 while (pos > 0) { 661 pos --; 662 if (*(pendexpansionce + pos) == endexpansion) { 663 return maxexpansion->position; 664 } 665 } 666 667 *(pendexpansionce + maxexpansion->position) = endexpansion; 668 *(maxexpansion->isV + maxexpansion->position) = isV; 669 maxexpansion->position ++; 670 671 return maxexpansion->position; 672 } 673 674 675 static void ContrEndCPSet(uint8_t *table, UChar c) { 676 uint32_t hash; 677 uint8_t *htByte; 678 679 hash = c; 680 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { 681 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; 682 } 683 htByte = &table[hash>>3]; 684 *htByte |= (1 << (hash & 7)); 685 } 686 687 688 static void unsafeCPSet(uint8_t *table, UChar c) { 689 uint32_t hash; 690 uint8_t *htByte; 691 692 hash = c; 693 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { 694 if (hash >= 0xd800 && hash <= 0xf8ff) { 695 /* Part of a surrogate, or in private use area. 
*/ 696 /* These don't go in the table */ 697 return; 698 } 699 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; 700 } 701 htByte = &table[hash>>3]; 702 *htByte |= (1 << (hash & 7)); 703 } 704 705 static void 706 uprv_uca_createCMTable(tempUCATable *t, int32_t noOfCM, UErrorCode *status) { 707 t->cmLookup = (CombinClassTable *)uprv_malloc(sizeof(CombinClassTable)); 708 if (t->cmLookup==NULL) { 709 *status = U_MEMORY_ALLOCATION_ERROR; 710 return; 711 } 712 t->cmLookup->cPoints=(UChar *)uprv_malloc(noOfCM*sizeof(UChar)); 713 if (t->cmLookup->cPoints ==NULL) { 714 uprv_free(t->cmLookup); 715 t->cmLookup = NULL; 716 *status = U_MEMORY_ALLOCATION_ERROR; 717 return; 718 } 719 720 t->cmLookup->size=noOfCM; 721 uprv_memset(t->cmLookup->index, 0, sizeof(t->cmLookup->index)); 722 723 return; 724 } 725 726 static void 727 uprv_uca_copyCMTable(tempUCATable *t, UChar *cm, uint16_t *index) { 728 int32_t count=0; 729 730 for (int32_t i=0; i<256; ++i) { 731 if (index[i]>0) { 732 // cPoints is ordered by combining class value. 733 uprv_memcpy(t->cmLookup->cPoints+count, cm+(i<<8), index[i]*sizeof(UChar)); 734 count += index[i]; 735 } 736 t->cmLookup->index[i]=count; 737 } 738 return; 739 } 740 741 /* 1. to the UnsafeCP hash table, add all chars with combining class != 0 */ 742 /* 2. build combining marks table for all chars with combining class != 0 */ 743 static void uprv_uca_unsafeCPAddCCNZ(tempUCATable *t, UErrorCode *status) { 744 745 UChar c; 746 uint16_t fcd; // Hi byte is lead combining class. lo byte is trailing combing class. 
747 UBool buildCMTable = (t->cmLookup==NULL); // flag for building combining class table 748 UChar *cm=NULL; 749 uint16_t index[256]; 750 int32_t count=0; 751 const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status); 752 if (U_FAILURE(*status)) { 753 return; 754 } 755 756 if (buildCMTable) { 757 if (cm==NULL) { 758 cm = (UChar *)uprv_malloc(sizeof(UChar)*UCOL_MAX_CM_TAB); 759 if (cm==NULL) { 760 *status = U_MEMORY_ALLOCATION_ERROR; 761 return; 762 } 763 } 764 uprv_memset(index, 0, sizeof(index)); 765 } 766 for (c=0; c<0xffff; c++) { 767 if (U16_IS_LEAD(c)) { 768 fcd = 0; 769 if (nfcImpl->singleLeadMightHaveNonZeroFCD16(c)) { 770 UChar32 supp = U16_GET_SUPPLEMENTARY(c, 0xdc00); 771 UChar32 suppLimit = supp + 0x400; 772 while (supp < suppLimit) { 773 fcd |= nfcImpl->getFCD16FromNormData(supp++); 774 } 775 } 776 } else { 777 fcd = nfcImpl->getFCD16(c); 778 } 779 if (fcd >= 0x100 || // if the leading combining class(c) > 0 || 780 (U16_IS_LEAD(c) && fcd != 0)) {// c is a leading surrogate with some FCD data 781 if (buildCMTable) { 782 uint32_t cClass = fcd & 0xff; 783 //uint32_t temp=(cClass<<8)+index[cClass]; 784 cm[(cClass<<8)+index[cClass]] = c; // 785 index[cClass]++; 786 count++; 787 } 788 unsafeCPSet(t->unsafeCP, c); 789 } 790 } 791 792 // copy to cm table 793 if (buildCMTable) { 794 uprv_uca_createCMTable(t, count, status); 795 if(U_FAILURE(*status)) { 796 if (cm!=NULL) { 797 uprv_free(cm); 798 } 799 return; 800 } 801 uprv_uca_copyCMTable(t, cm, index); 802 } 803 804 if(t->prefixLookup != NULL) { 805 int32_t i = -1; 806 const UHashElement *e = NULL; 807 UCAElements *element = NULL; 808 UChar NFCbuf[256]; 809 while((e = uhash_nextElement(t->prefixLookup, &i)) != NULL) { 810 element = (UCAElements *)e->value.pointer; 811 // codepoints here are in the NFD form. We need to add the 812 // first code point of the NFC form to unsafe, because 813 // strcoll needs to backup over them. 
814 unorm_normalize(element->cPoints, element->cSize, UNORM_NFC, 0, 815 NFCbuf, 256, status); 816 unsafeCPSet(t->unsafeCP, NFCbuf[0]); 817 } 818 } 819 820 if (cm!=NULL) { 821 uprv_free(cm); 822 } 823 } 824 825 static uint32_t uprv_uca_addPrefix(tempUCATable *t, uint32_t CE, 826 UCAElements *element, UErrorCode *status) 827 { 828 // currently the longest prefix we're supporting in Japanese is two characters 829 // long. Although this table could quite easily mimic complete contraction stuff 830 // there is no good reason to make a general solution, as it would require some 831 // error prone messing. 832 CntTable *contractions = t->contractions; 833 UChar32 cp; 834 uint32_t cpsize = 0; 835 UChar *oldCP = element->cPoints; 836 uint32_t oldCPSize = element->cSize; 837 838 839 contractions->currentTag = SPEC_PROC_TAG; 840 841 // here, we will normalize & add prefix to the table. 842 uint32_t j = 0; 843 #ifdef UCOL_DEBUG 844 for(j=0; j<element->cSize; j++) { 845 fprintf(stdout, "CP: %04X ", element->cPoints[j]); 846 } 847 fprintf(stdout, "El: %08X Pref: ", CE); 848 for(j=0; j<element->prefixSize; j++) { 849 fprintf(stdout, "%04X ", element->prefix[j]); 850 } 851 fprintf(stdout, "%08X ", element->mapCE); 852 #endif 853 854 for (j = 1; j<element->prefixSize; j++) { /* First add NFD prefix chars to unsafe CP hash table */ 855 // Unless it is a trail surrogate, which is handled algoritmically and 856 // shouldn't take up space in the table. 857 if(!(U16_IS_TRAIL(element->prefix[j]))) { 858 unsafeCPSet(t->unsafeCP, element->prefix[j]); 859 } 860 } 861 862 UChar tempPrefix = 0; 863 864 for(j = 0; j < /*nfcSize*/element->prefixSize/2; j++) { // prefixes are going to be looked up backwards 865 // therefore, we will promptly reverse the prefix buffer... 
866 tempPrefix = *(/*nfcBuffer*/element->prefix+element->prefixSize-j-1); 867 *(/*nfcBuffer*/element->prefix+element->prefixSize-j-1) = element->prefix[j]; 868 element->prefix[j] = tempPrefix; 869 } 870 871 #ifdef UCOL_DEBUG 872 fprintf(stdout, "Reversed: "); 873 for(j=0; j<element->prefixSize; j++) { 874 fprintf(stdout, "%04X ", element->prefix[j]); 875 } 876 fprintf(stdout, "%08X\n", element->mapCE); 877 #endif 878 879 // the first codepoint is also unsafe, as it forms a 'contraction' with the prefix 880 if(!(U16_IS_TRAIL(element->cPoints[0]))) { 881 unsafeCPSet(t->unsafeCP, element->cPoints[0]); 882 } 883 884 // Maybe we need this... To handle prefixes completely in the forward direction... 885 //if(element->cSize == 1) { 886 // if(!(U16_IS_TRAIL(element->cPoints[0]))) { 887 // ContrEndCPSet(t->contrEndCP, element->cPoints[0]); 888 // } 889 //} 890 891 element->cPoints = element->prefix; 892 element->cSize = element->prefixSize; 893 894 // Add the last char of the contraction to the contraction-end hash table. 
895 // unless it is a trail surrogate, which is handled algorithmically and 896 // shouldn't be in the table 897 if(!(U16_IS_TRAIL(element->cPoints[element->cSize -1]))) { 898 ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]); 899 } 900 901 // First we need to check if contractions starts with a surrogate 902 U16_NEXT(element->cPoints, cpsize, element->cSize, cp); 903 904 // If there are any Jamos in the contraction, we should turn on special 905 // processing for Jamos 906 if(UCOL_ISJAMO(element->prefix[0])) { 907 t->image->jamoSpecial = TRUE; 908 } 909 /* then we need to deal with it */ 910 /* we could aready have something in table - or we might not */ 911 912 if(!isPrefix(CE)) { 913 /* if it wasn't contraction, we wouldn't end up here*/ 914 int32_t firstContractionOffset = 0; 915 firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, CE, status); 916 uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status); 917 uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->prefix, newCE, status); 918 uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, CE, status); 919 CE = constructContractCE(SPEC_PROC_TAG, firstContractionOffset); 920 } else { /* we are adding to existing contraction */ 921 /* there were already some elements in the table, so we need to add a new contraction */ 922 /* Two things can happen here: either the codepoint is already in the table, or it is not */ 923 int32_t position = uprv_cnttab_findCP(contractions, CE, *element->prefix, status); 924 if(position > 0) { /* if it is we just continue down the chain */ 925 uint32_t eCE = uprv_cnttab_getCE(contractions, CE, position, status); 926 uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status); 927 uprv_cnttab_setContraction(contractions, CE, position, *(element->prefix), newCE, status); 928 } else { /* if it isn't, we will have to create a new 
sequence */ 929 uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status); 930 uprv_cnttab_insertContraction(contractions, CE, *(element->prefix), element->mapCE, status); 931 } 932 } 933 934 element->cPoints = oldCP; 935 element->cSize = oldCPSize; 936 937 return CE; 938 } 939 940 // Note regarding surrogate handling: We are interested only in the single 941 // or leading surrogates in a contraction. If a surrogate is somewhere else 942 // in the contraction, it is going to be handled as a pair of code units, 943 // as it doesn't affect the performance AND handling surrogates specially 944 // would complicate code way too much. 945 static uint32_t uprv_uca_addContraction(tempUCATable *t, uint32_t CE, 946 UCAElements *element, UErrorCode *status) 947 { 948 CntTable *contractions = t->contractions; 949 UChar32 cp; 950 uint32_t cpsize = 0; 951 952 contractions->currentTag = CONTRACTION_TAG; 953 954 // First we need to check if contractions starts with a surrogate 955 U16_NEXT(element->cPoints, cpsize, element->cSize, cp); 956 957 if(cpsize<element->cSize) { // This is a real contraction, if there are other characters after the first 958 uint32_t j = 0; 959 for (j=1; j<element->cSize; j++) { /* First add contraction chars to unsafe CP hash table */ 960 // Unless it is a trail surrogate, which is handled algoritmically and 961 // shouldn't take up space in the table. 962 if(!(U16_IS_TRAIL(element->cPoints[j]))) { 963 unsafeCPSet(t->unsafeCP, element->cPoints[j]); 964 } 965 } 966 // Add the last char of the contraction to the contraction-end hash table. 
967 // unless it is a trail surrogate, which is handled algorithmically and 968 // shouldn't be in the table 969 if(!(U16_IS_TRAIL(element->cPoints[element->cSize -1]))) { 970 ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]); 971 } 972 973 // If there are any Jamos in the contraction, we should turn on special 974 // processing for Jamos 975 if(UCOL_ISJAMO(element->cPoints[0])) { 976 t->image->jamoSpecial = TRUE; 977 } 978 /* then we need to deal with it */ 979 /* we could aready have something in table - or we might not */ 980 element->cPoints+=cpsize; 981 element->cSize-=cpsize; 982 if(!isContraction(CE)) { 983 /* if it wasn't contraction, we wouldn't end up here*/ 984 int32_t firstContractionOffset = 0; 985 firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, CE, status); 986 uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status); 987 uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->cPoints, newCE, status); 988 uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, CE, status); 989 CE = constructContractCE(CONTRACTION_TAG, firstContractionOffset); 990 } else { /* we are adding to existing contraction */ 991 /* there were already some elements in the table, so we need to add a new contraction */ 992 /* Two things can happen here: either the codepoint is already in the table, or it is not */ 993 int32_t position = uprv_cnttab_findCP(contractions, CE, *element->cPoints, status); 994 if(position > 0) { /* if it is we just continue down the chain */ 995 uint32_t eCE = uprv_cnttab_getCE(contractions, CE, position, status); 996 uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status); 997 uprv_cnttab_setContraction(contractions, CE, position, *(element->cPoints), newCE, status); 998 } else { /* if it isn't, we will have to create a new sequence */ 999 uint32_t newCE = uprv_uca_processContraction(contractions, 
element, UCOL_NOT_FOUND, status); 1000 uprv_cnttab_insertContraction(contractions, CE, *(element->cPoints), newCE, status); 1001 } 1002 } 1003 element->cPoints-=cpsize; 1004 element->cSize+=cpsize; 1005 /*ucmpe32_set(t->mapping, cp, CE);*/ 1006 utrie_set32(t->mapping, cp, CE); 1007 } else if(!isContraction(CE)) { /* this is just a surrogate, and there is no contraction */ 1008 /*ucmpe32_set(t->mapping, cp, element->mapCE);*/ 1009 utrie_set32(t->mapping, cp, element->mapCE); 1010 } else { /* fill out the first stage of the contraction with the surrogate CE */ 1011 uprv_cnttab_changeContraction(contractions, CE, 0, element->mapCE, status); 1012 uprv_cnttab_changeContraction(contractions, CE, 0xFFFF, element->mapCE, status); 1013 } 1014 return CE; 1015 } 1016 1017 1018 static uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements *element, uint32_t existingCE, UErrorCode *status) { 1019 int32_t firstContractionOffset = 0; 1020 // uint32_t contractionElement = UCOL_NOT_FOUND; 1021 1022 if(U_FAILURE(*status)) { 1023 return UCOL_NOT_FOUND; 1024 } 1025 1026 /* end of recursion */ 1027 if(element->cSize == 1) { 1028 if(isCntTableElement(existingCE) && ((UColCETags)getCETag(existingCE) == contractions->currentTag)) { 1029 uprv_cnttab_changeContraction(contractions, existingCE, 0, element->mapCE, status); 1030 uprv_cnttab_changeContraction(contractions, existingCE, 0xFFFF, element->mapCE, status); 1031 return existingCE; 1032 } else { 1033 return element->mapCE; /*can't do just that. existingCe might be a contraction, meaning that we need to do another step */ 1034 } 1035 } 1036 1037 /* this recursion currently feeds on the only element we have... 
We will have to copy it in order to accomodate */ 1038 /* for both backward and forward cycles */ 1039 1040 /* we encountered either an empty space or a non-contraction element */ 1041 /* this means we are constructing a new contraction sequence */ 1042 element->cPoints++; 1043 element->cSize--; 1044 if(!isCntTableElement(existingCE)) { 1045 /* if it wasn't contraction, we wouldn't end up here*/ 1046 firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, existingCE, status); 1047 uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status); 1048 uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->cPoints, newCE, status); 1049 uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, existingCE, status); 1050 existingCE = constructContractCE(contractions->currentTag, firstContractionOffset); 1051 } else { /* we are adding to existing contraction */ 1052 /* there were already some elements in the table, so we need to add a new contraction */ 1053 /* Two things can happen here: either the codepoint is already in the table, or it is not */ 1054 int32_t position = uprv_cnttab_findCP(contractions, existingCE, *element->cPoints, status); 1055 if(position > 0) { /* if it is we just continue down the chain */ 1056 uint32_t eCE = uprv_cnttab_getCE(contractions, existingCE, position, status); 1057 uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status); 1058 uprv_cnttab_setContraction(contractions, existingCE, position, *(element->cPoints), newCE, status); 1059 } else { /* if it isn't, we will have to create a new sequence */ 1060 uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status); 1061 uprv_cnttab_insertContraction(contractions, existingCE, *(element->cPoints), newCE, status); 1062 } 1063 } 1064 element->cPoints--; 1065 element->cSize++; 1066 return existingCE; 1067 } 1068 1069 static uint32_t 
uprv_uca_finalizeAddition(tempUCATable *t, UCAElements *element, UErrorCode *status) { 1070 uint32_t CE = UCOL_NOT_FOUND; 1071 // This should add a completely ignorable element to the 1072 // unsafe table, so that backward iteration will skip 1073 // over it when treating contractions. 1074 uint32_t i = 0; 1075 if(element->mapCE == 0) { 1076 for(i = 0; i < element->cSize; i++) { 1077 if(!U16_IS_TRAIL(element->cPoints[i])) { 1078 unsafeCPSet(t->unsafeCP, element->cPoints[i]); 1079 } 1080 } 1081 } 1082 if(element->cSize > 1) { /* we're adding a contraction */ 1083 uint32_t i = 0; 1084 UChar32 cp; 1085 1086 U16_NEXT(element->cPoints, i, element->cSize, cp); 1087 /*CE = ucmpe32_get(t->mapping, cp);*/ 1088 CE = utrie_get32(t->mapping, cp, NULL); 1089 1090 CE = uprv_uca_addContraction(t, CE, element, status); 1091 } else { /* easy case, */ 1092 /*CE = ucmpe32_get(t->mapping, element->cPoints[0]);*/ 1093 CE = utrie_get32(t->mapping, element->cPoints[0], NULL); 1094 1095 if( CE != UCOL_NOT_FOUND) { 1096 if(isCntTableElement(CE) /*isContraction(CE)*/) { /* adding a non contraction element (thai, expansion, single) to already existing contraction */ 1097 if(!isPrefix(element->mapCE)) { // we cannot reenter prefix elements - as we are going to create a dead loop 1098 // Only expansions and regular CEs can go here... 
Contractions will never happen in this place 1099 uprv_cnttab_setContraction(t->contractions, CE, 0, 0, element->mapCE, status); 1100 /* This loop has to change the CE at the end of contraction REDO!*/ 1101 uprv_cnttab_changeLastCE(t->contractions, CE, element->mapCE, status); 1102 } 1103 } else { 1104 /*ucmpe32_set(t->mapping, element->cPoints[0], element->mapCE);*/ 1105 utrie_set32(t->mapping, element->cPoints[0], element->mapCE); 1106 if ((element->prefixSize!=0) && (!isSpecial(CE) || (getCETag(CE)!=IMPLICIT_TAG))) { 1107 UCAElements *origElem = (UCAElements *)uprv_malloc(sizeof(UCAElements)); 1108 /* test for NULL */ 1109 if (origElem== NULL) { 1110 *status = U_MEMORY_ALLOCATION_ERROR; 1111 return 0; 1112 } 1113 /* copy the original UCA value */ 1114 origElem->prefixSize = 0; 1115 origElem->prefix = NULL; 1116 origElem->cPoints = origElem->uchars; 1117 origElem->cPoints[0] = element->cPoints[0]; 1118 origElem->cSize = 1; 1119 origElem->CEs[0]=CE; 1120 origElem->mapCE=CE; 1121 origElem->noOfCEs=1; 1122 uprv_uca_finalizeAddition(t, origElem, status); 1123 uprv_free(origElem); 1124 } 1125 #ifdef UCOL_DEBUG 1126 fprintf(stderr, "Warning - trying to overwrite existing data %08X for cp %04X with %08X\n", CE, element->cPoints[0], element->CEs[0]); 1127 //*status = U_ILLEGAL_ARGUMENT_ERROR; 1128 #endif 1129 } 1130 } else { 1131 /*ucmpe32_set(t->mapping, element->cPoints[0], element->mapCE);*/ 1132 utrie_set32(t->mapping, element->cPoints[0], element->mapCE); 1133 } 1134 } 1135 return CE; 1136 } 1137 1138 /* This adds a read element, while testing for existence */ 1139 U_CAPI uint32_t U_EXPORT2 1140 uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status) { 1141 U_NAMESPACE_USE 1142 1143 ExpansionTable *expansions = t->expansions; 1144 1145 uint32_t i = 1; 1146 uint32_t expansion = 0; 1147 uint32_t CE; 1148 1149 if(U_FAILURE(*status)) { 1150 return 0xFFFF; 1151 } 1152 1153 element->mapCE = 0; // clear mapCE so that we can catch expansions 1154 
1155 if(element->noOfCEs == 1) { 1156 element->mapCE = element->CEs[0]; 1157 } else { 1158 /* ICU 2.1 long primaries */ 1159 /* unfortunately, it looks like we have to look for a long primary here */ 1160 /* since in canonical closure we are going to hit some long primaries from */ 1161 /* the first phase, and they will come back as continuations/expansions */ 1162 /* destroying the effect of the previous opitimization */ 1163 /* A long primary is a three byte primary with starting secondaries and tertiaries */ 1164 /* It can appear in long runs of only primary differences (like east Asian tailorings) */ 1165 /* also, it should not be an expansion, as expansions would break with this */ 1166 // This part came in from ucol_bld.cpp 1167 //if(tok->expansion == 0 1168 //&& noOfBytes[0] == 3 && noOfBytes[1] == 1 && noOfBytes[2] == 1 1169 //&& CEparts[1] == (UCOL_BYTE_COMMON << 24) && CEparts[2] == (UCOL_BYTE_COMMON << 24)) { 1170 /* we will construct a special CE that will go unchanged to the table */ 1171 if(element->noOfCEs == 2 // a two CE expansion 1172 && isContinuation(element->CEs[1]) // which is a continuation 1173 && (element->CEs[1] & (~(0xFF << 24 | UCOL_CONTINUATION_MARKER))) == 0 // that has only primaries in continuation, 1174 && (((element->CEs[0]>>8) & 0xFF) == UCOL_BYTE_COMMON) // a common secondary 1175 && ((element->CEs[0] & 0xFF) == UCOL_BYTE_COMMON) // and a common tertiary 1176 ) 1177 { 1178 #ifdef UCOL_DEBUG 1179 fprintf(stdout, "Long primary %04X\n", element->cPoints[0]); 1180 #endif 1181 element->mapCE = UCOL_SPECIAL_FLAG | (LONG_PRIMARY_TAG<<24) // a long primary special 1182 | ((element->CEs[0]>>8) & 0xFFFF00) // first and second byte of primary 1183 | ((element->CEs[1]>>24) & 0xFF); // third byte of primary 1184 } 1185 else { 1186 expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT) 1187 | (((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4) 1188 & 0xFFFFF0)); 1189 1190 for(i = 1; 
i<element->noOfCEs; i++) { 1191 uprv_uca_addExpansion(expansions, element->CEs[i], status); 1192 } 1193 if(element->noOfCEs <= 0xF) { 1194 expansion |= element->noOfCEs; 1195 } else { 1196 uprv_uca_addExpansion(expansions, 0, status); 1197 } 1198 element->mapCE = expansion; 1199 uprv_uca_setMaxExpansion(element->CEs[element->noOfCEs - 1], 1200 (uint8_t)element->noOfCEs, 1201 t->maxExpansions, 1202 status); 1203 if(UCOL_ISJAMO(element->cPoints[0])) { 1204 t->image->jamoSpecial = TRUE; 1205 uprv_uca_setMaxJamoExpansion(element->cPoints[0], 1206 element->CEs[element->noOfCEs - 1], 1207 (uint8_t)element->noOfCEs, 1208 t->maxJamoExpansions, 1209 status); 1210 } 1211 if (U_FAILURE(*status)) { 1212 return 0; 1213 } 1214 } 1215 } 1216 1217 // We treat digits differently - they are "uber special" and should be 1218 // processed differently if numeric collation is on. 1219 UChar32 uniChar = 0; 1220 //printElement(element); 1221 if ((element->cSize == 2) && U16_IS_LEAD(element->cPoints[0])){ 1222 uniChar = U16_GET_SUPPLEMENTARY(element->cPoints[0], element->cPoints[1]); 1223 } else if (element->cSize == 1){ 1224 uniChar = element->cPoints[0]; 1225 } 1226 1227 // Here, we either have one normal CE OR mapCE is set. Therefore, we stuff only 1228 // one element to the expansion buffer. When we encounter a digit and we don't 1229 // do numeric collation, we will just pick the CE we have and break out of case 1230 // (see ucol.cpp ucol_prv_getSpecialCE && ucol_prv_getSpecialPrevCE). If we picked 1231 // a special, further processing will occur. If it's a simple CE, we'll return due 1232 // to how the loop is constructed. 
1233 if (uniChar != 0 && u_isdigit(uniChar)){ 1234 expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (DIGIT_TAG<<UCOL_TAG_SHIFT) | 1); // prepare the element 1235 if(element->mapCE) { // if there is an expansion, we'll pick it here 1236 expansion |= ((uprv_uca_addExpansion(expansions, element->mapCE, status)+(headersize>>2))<<4); 1237 } else { 1238 expansion |= ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4); 1239 } 1240 element->mapCE = expansion; 1241 1242 // Need to go back to the beginning of the digit string if in the middle! 1243 if(uniChar <= 0xFFFF) { // supplementaries are always unsafe. API takes UChars 1244 unsafeCPSet(t->unsafeCP, (UChar)uniChar); 1245 } 1246 } 1247 1248 // here we want to add the prefix structure. 1249 // I will try to process it as a reverse contraction, if possible. 1250 // prefix buffer is already reversed. 1251 1252 if(element->prefixSize!=0) { 1253 // We keep the seen prefix starter elements in a hashtable 1254 // we need it to be able to distinguish between the simple 1255 // codepoints and prefix starters. Also, we need to use it 1256 // for canonical closure. 
1257 1258 UCAElements *composed = (UCAElements *)uprv_malloc(sizeof(UCAElements)); 1259 /* test for NULL */ 1260 if (composed == NULL) { 1261 *status = U_MEMORY_ALLOCATION_ERROR; 1262 return 0; 1263 } 1264 uprv_memcpy(composed, element, sizeof(UCAElements)); 1265 composed->cPoints = composed->uchars; 1266 composed->prefix = composed->prefixChars; 1267 1268 composed->prefixSize = unorm_normalize(element->prefix, element->prefixSize, UNORM_NFC, 0, composed->prefix, 128, status); 1269 1270 1271 if(t->prefixLookup != NULL) { 1272 UCAElements *uCE = (UCAElements *)uhash_get(t->prefixLookup, element); 1273 if(uCE != NULL) { // there is already a set of code points here 1274 element->mapCE = uprv_uca_addPrefix(t, uCE->mapCE, element, status); 1275 } else { // no code points, so this spot is clean 1276 element->mapCE = uprv_uca_addPrefix(t, UCOL_NOT_FOUND, element, status); 1277 uCE = (UCAElements *)uprv_malloc(sizeof(UCAElements)); 1278 /* test for NULL */ 1279 if (uCE == NULL) { 1280 *status = U_MEMORY_ALLOCATION_ERROR; 1281 return 0; 1282 } 1283 uprv_memcpy(uCE, element, sizeof(UCAElements)); 1284 uCE->cPoints = uCE->uchars; 1285 uhash_put(t->prefixLookup, uCE, uCE, status); 1286 } 1287 if(composed->prefixSize != element->prefixSize || uprv_memcmp(composed->prefix, element->prefix, element->prefixSize)) { 1288 // do it! 
1289 composed->mapCE = uprv_uca_addPrefix(t, element->mapCE, composed, status); 1290 } 1291 } 1292 uprv_free(composed); 1293 } 1294 1295 // We need to use the canonical iterator here 1296 // the way we do it is to generate the canonically equivalent strings 1297 // for the contraction and then add the sequences that pass FCD check 1298 if(element->cSize > 1 && !(element->cSize==2 && U16_IS_LEAD(element->cPoints[0]) && U16_IS_TRAIL(element->cPoints[1]))) { // this is a contraction, we should check whether a composed form should also be included 1299 UnicodeString source(element->cPoints, element->cSize); 1300 CanonicalIterator it(source, *status); 1301 source = it.next(); 1302 while(!source.isBogus()) { 1303 if(Normalizer::quickCheck(source, UNORM_FCD, *status) != UNORM_NO) { 1304 element->cSize = source.extract(element->cPoints, 128, *status); 1305 uprv_uca_finalizeAddition(t, element, status); 1306 } 1307 source = it.next(); 1308 } 1309 CE = element->mapCE; 1310 } else { 1311 CE = uprv_uca_finalizeAddition(t, element, status); 1312 } 1313 1314 return CE; 1315 } 1316 1317 1318 /*void uprv_uca_getMaxExpansionJamo(CompactEIntArray *mapping, */ 1319 static void uprv_uca_getMaxExpansionJamo(UNewTrie *mapping, 1320 MaxExpansionTable *maxexpansion, 1321 MaxJamoExpansionTable *maxjamoexpansion, 1322 UBool jamospecial, 1323 UErrorCode *status) 1324 { 1325 const uint32_t VBASE = 0x1161; 1326 const uint32_t TBASE = 0x11A8; 1327 const uint32_t VCOUNT = 21; 1328 const uint32_t TCOUNT = 28; 1329 1330 uint32_t v = VBASE + VCOUNT - 1; 1331 uint32_t t = TBASE + TCOUNT - 1; 1332 uint32_t ce; 1333 1334 while (v >= VBASE) { 1335 /*ce = ucmpe32_get(mapping, v);*/ 1336 ce = utrie_get32(mapping, v, NULL); 1337 if (ce < UCOL_SPECIAL_FLAG) { 1338 uprv_uca_setMaxExpansion(ce, 2, maxexpansion, status); 1339 } 1340 v --; 1341 } 1342 1343 while (t >= TBASE) 1344 { 1345 /*ce = ucmpe32_get(mapping, t);*/ 1346 ce = utrie_get32(mapping, t, NULL); 1347 if (ce < UCOL_SPECIAL_FLAG) { 1348 
uprv_uca_setMaxExpansion(ce, 3, maxexpansion, status); 1349 } 1350 t --; 1351 } 1352 /* According to the docs, 99% of the time, the Jamo will not be special */ 1353 if (jamospecial) { 1354 /* gets the max expansion in all unicode characters */ 1355 int count = maxjamoexpansion->position; 1356 uint8_t maxTSize = (uint8_t)(maxjamoexpansion->maxLSize + 1357 maxjamoexpansion->maxVSize + 1358 maxjamoexpansion->maxTSize); 1359 uint8_t maxVSize = (uint8_t)(maxjamoexpansion->maxLSize + 1360 maxjamoexpansion->maxVSize); 1361 1362 while (count > 0) { 1363 count --; 1364 if (*(maxjamoexpansion->isV + count) == TRUE) { 1365 uprv_uca_setMaxExpansion( 1366 *(maxjamoexpansion->endExpansionCE + count), 1367 maxVSize, maxexpansion, status); 1368 } 1369 else { 1370 uprv_uca_setMaxExpansion( 1371 *(maxjamoexpansion->endExpansionCE + count), 1372 maxTSize, maxexpansion, status); 1373 } 1374 } 1375 } 1376 } 1377 1378 U_CDECL_BEGIN 1379 static inline uint32_t U_CALLCONV 1380 getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) 1381 { 1382 uint32_t value; 1383 uint32_t tag; 1384 UChar32 limit; 1385 UBool inBlockZero; 1386 1387 limit=start+0x400; 1388 while(start<limit) { 1389 value=utrie_get32(trie, start, &inBlockZero); 1390 tag = getCETag(value); 1391 if(inBlockZero == TRUE) { 1392 start+=UTRIE_DATA_BLOCK_LENGTH; 1393 } else if(!(isSpecial(value) && (tag == IMPLICIT_TAG || tag == NOT_FOUND_TAG))) { 1394 /* These are values that are starting in either UCA (IMPLICIT_TAG) or in the 1395 * tailorings (NOT_FOUND_TAG). Presence of these tags means that there is 1396 * nothing in this position and that it should be skipped. 
1397 */ 1398 #ifdef UCOL_DEBUG 1399 static int32_t count = 1; 1400 fprintf(stdout, "%i, Folded %08X, value %08X\n", count++, start, value); 1401 #endif 1402 return (uint32_t)(UCOL_SPECIAL_FLAG | (SURROGATE_TAG<<24) | offset); 1403 } else { 1404 ++start; 1405 } 1406 } 1407 return 0; 1408 } 1409 U_CDECL_END 1410 1411 #ifdef UCOL_DEBUG 1412 // This is a debug function to print the contents of a trie. 1413 // It is used in conjuction with the code around utrie_unserialize call 1414 UBool enumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) { 1415 if(start<0x10000) { 1416 fprintf(stdout, "%08X, %08X, %08X\n", start, limit, value); 1417 } else { 1418 fprintf(stdout, "%08X=%04X %04X, %08X=%04X %04X, %08X\n", start, U16_LEAD(start), U16_TRAIL(start), limit, U16_LEAD(limit), U16_TRAIL(limit), value); 1419 } 1420 return TRUE; 1421 } 1422 1423 int32_t 1424 myGetFoldingOffset(uint32_t data) { 1425 if(data > UCOL_NOT_FOUND && getCETag(data) == SURROGATE_TAG) { 1426 return (data&0xFFFFFF); 1427 } else { 1428 return 0; 1429 } 1430 } 1431 #endif 1432 1433 U_CAPI UCATableHeader* U_EXPORT2 1434 uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) { 1435 /*CompactEIntArray *mapping = t->mapping;*/ 1436 UNewTrie *mapping = t->mapping; 1437 ExpansionTable *expansions = t->expansions; 1438 CntTable *contractions = t->contractions; 1439 MaxExpansionTable *maxexpansion = t->maxExpansions; 1440 1441 if(U_FAILURE(*status)) { 1442 return NULL; 1443 } 1444 1445 uint32_t beforeContractions = (uint32_t)((headersize+paddedsize(expansions->position*sizeof(uint32_t)))/sizeof(UChar)); 1446 1447 int32_t contractionsSize = 0; 1448 contractionsSize = uprv_cnttab_constructTable(contractions, beforeContractions, status); 1449 1450 /* the following operation depends on the trie data. 
Therefore, we have to do it before */
    /* the trie is compacted */
    /* sets jamo expansions */
    uprv_uca_getMaxExpansionJamo(mapping, maxexpansion, t->maxJamoExpansions,
        t->image->jamoSpecial, status);

    /*ucmpe32_compact(mapping);*/
    /*UMemoryStream *ms = uprv_mstrm_openNew(8192);*/
    /*int32_t mappingSize = ucmpe32_flattenMem(mapping, ms);*/
    /*const uint8_t *flattened = uprv_mstrm_getBuffer(ms, &mappingSize);*/

    // After setting the jamo expansions, compact the trie and get the needed size.
    // This first call passes no buffer (NULL, capacity 0); the trie is
    // serialized again further down, into the allocated image.
    int32_t mappingSize = utrie_serialize(mapping, NULL, 0, getFoldedValue /*getFoldedValue*/, FALSE, status);

    uint32_t tableOffset = 0;
    uint8_t *dataStart;

    /* TODO: LATIN1 array is now in the utrie - it should be removed from the calculation */

    /* Total size of the image. Every term below corresponds to one section  */
    /* written further down; the running tableOffset is compared against     */
    /* this sum at the end of the function, so the two must stay in step.    */
    uint32_t toAllocate =(uint32_t)(headersize+
        paddedsize(expansions->position*sizeof(uint32_t))+
        paddedsize(mappingSize)+
        paddedsize(contractionsSize*(sizeof(UChar)+sizeof(uint32_t)))+
        //paddedsize(0x100*sizeof(uint32_t))  /* Latin1 is now included in the trie */
        /* maxexpansion array */
        + paddedsize(maxexpansion->position * sizeof(uint32_t)) +
        /* maxexpansion size array */
        paddedsize(maxexpansion->position * sizeof(uint8_t)) +
        paddedsize(UCOL_UNSAFECP_TABLE_SIZE) +   /*  Unsafe chars             */
        paddedsize(UCOL_UNSAFECP_TABLE_SIZE));    /*  Contraction Ending chars */


    dataStart = (uint8_t *)uprv_malloc(toAllocate);
    /* test for NULL */
    if (dataStart == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    /* The header lives at the very start of the allocated image. */
    UCATableHeader *myData = (UCATableHeader *)dataStart;
    // Please, do reset all the fields!
    uprv_memset(dataStart, 0, toAllocate);
    // Make sure we know this is reset
    myData->magic = UCOL_HEADER_MAGIC;
    myData->isBigEndian = U_IS_BIG_ENDIAN;
    myData->charSetFamily = U_CHARSET_FAMILY;
    myData->formatVersion[0] = UCA_FORMAT_VERSION_0;
    myData->formatVersion[1] = UCA_FORMAT_VERSION_1;
    myData->formatVersion[2] = UCA_FORMAT_VERSION_2;
    myData->formatVersion[3] = UCA_FORMAT_VERSION_3;
    myData->jamoSpecial = t->image->jamoSpecial;

    // Don't copy stuff from UCA header!
    //uprv_memcpy(myData, t->image, sizeof(UCATableHeader));

    myData->contractionSize = contractionsSize;

    /* From here on every section is recorded in the header as a byte offset */
    /* from dataStart, and tableOffset advances by the padded section size.  */
    tableOffset += (uint32_t)(paddedsize(sizeof(UCATableHeader)));

    myData->options = tableOffset;
    uprv_memcpy(dataStart+tableOffset, t->options, sizeof(UColOptionSet));
    tableOffset += (uint32_t)(paddedsize(sizeof(UColOptionSet)));

    /* copy expansions */
    /*myData->expansion = (uint32_t *)dataStart+tableOffset;*/
    myData->expansion = tableOffset;
    uprv_memcpy(dataStart+tableOffset, expansions->CEs, expansions->position*sizeof(uint32_t));
    tableOffset += (uint32_t)(paddedsize(expansions->position*sizeof(uint32_t)));

    /* contractions block */
    if(contractionsSize != 0) {
        /* copy contraction index */
        /*myData->contractionIndex = (UChar *)(dataStart+tableOffset);*/
        myData->contractionIndex = tableOffset;
        uprv_memcpy(dataStart+tableOffset, contractions->codePoints, contractionsSize*sizeof(UChar));
        tableOffset += (uint32_t)(paddedsize(contractionsSize*sizeof(UChar)));

        /* copy contraction collation elements */
        /*myData->contractionCEs = (uint32_t *)(dataStart+tableOffset);*/
        myData->contractionCEs = tableOffset;
        uprv_memcpy(dataStart+tableOffset, contractions->CEs, contractionsSize*sizeof(uint32_t));
        tableOffset += (uint32_t)(paddedsize(contractionsSize*sizeof(uint32_t)));
    } else {
        /* no contractions: both offsets stay 0 */
        myData->contractionIndex = 0;
        myData->contractionCEs = 0;
    }

    /* copy mapping table */
    /*myData->mappingPosition = dataStart+tableOffset;*/
    /*myData->mappingPosition = tableOffset;*/
    /*uprv_memcpy(dataStart+tableOffset, flattened, mappingSize);*/

    /* second serialization pass: this time into the image itself */
    myData->mappingPosition = tableOffset;
    utrie_serialize(mapping, dataStart+tableOffset, toAllocate-tableOffset, getFoldedValue, FALSE, status);
#ifdef UCOL_DEBUG
    // This is debug code to dump the contents of the trie. It needs two functions defined above
    {
        UTrie UCAt = { 0 };
        uint32_t trieWord;
        utrie_unserialize(&UCAt, dataStart+tableOffset, 9999999, status);
        UCAt.getFoldingOffset = myGetFoldingOffset;
        if(U_SUCCESS(*status)) {
            utrie_enum(&UCAt, NULL, enumRange, NULL);
        }
        trieWord = UTRIE_GET32_FROM_LEAD(&UCAt, 0xDC01);
    }
#endif
    tableOffset += paddedsize(mappingSize);


    int32_t i = 0;

    /* copy max expansion table */
    myData->endExpansionCE      = tableOffset;
    myData->endExpansionCECount = maxexpansion->position - 1;
    /* not copying the first element which is a dummy */
    /* (position - 1 entries are copied, but the offset advances by the    */
    /* padded size of position entries, matching the toAllocate terms)     */
    uprv_memcpy(dataStart + tableOffset, maxexpansion->endExpansionCE + 1,
        (maxexpansion->position - 1) * sizeof(uint32_t));
    tableOffset += (uint32_t)(paddedsize((maxexpansion->position)* sizeof(uint32_t)));
    myData->expansionCESize = tableOffset;
    uprv_memcpy(dataStart + tableOffset, maxexpansion->expansionCESize + 1,
        (maxexpansion->position - 1) * sizeof(uint8_t));
    tableOffset += (uint32_t)(paddedsize((maxexpansion->position)* sizeof(uint8_t)));

    /* Unsafe chars table.  Finish it off, then copy it. */
    uprv_uca_unsafeCPAddCCNZ(t, status);
    if (t->UCA != 0) {              /* Or in unsafebits from UCA, making a combined table.
*/
        for (i=0; i<UCOL_UNSAFECP_TABLE_SIZE; i++) {
            t->unsafeCP[i] |= t->UCA->unsafeCP[i];
        }
    }
    myData->unsafeCP = tableOffset;
    uprv_memcpy(dataStart + tableOffset, t->unsafeCP, UCOL_UNSAFECP_TABLE_SIZE);
    tableOffset += paddedsize(UCOL_UNSAFECP_TABLE_SIZE);


    /* Finish building Contraction Ending chars hash table and then copy it out.  */
    if (t->UCA != 0) {              /* Or in unsafebits from UCA, making a combined table.    */
        for (i=0; i<UCOL_UNSAFECP_TABLE_SIZE; i++) {
            t->contrEndCP[i] |= t->UCA->contrEndCP[i];
        }
    }
    myData->contrEndCP = tableOffset;
    uprv_memcpy(dataStart + tableOffset, t->contrEndCP, UCOL_UNSAFECP_TABLE_SIZE);
    tableOffset += paddedsize(UCOL_UNSAFECP_TABLE_SIZE);

    /* Consistency check: the bytes written must add up to the size that   */
    /* was computed up front. On a mismatch the image is discarded.        */
    if(tableOffset != toAllocate) {
#ifdef UCOL_DEBUG
        fprintf(stderr, "calculation screwup!!! Expected to write %i but wrote %i instead!!!\n", toAllocate, tableOffset);
#endif
        *status = U_INTERNAL_PROGRAM_ERROR;
        uprv_free(dataStart);
        return 0;
    }

    myData->size = tableOffset;
    /* This should happen upon resurrection */
    /*const uint8_t *mapPosition = (uint8_t*)myData+myData->mappingPosition;*/
    /*uprv_mstrm_close(ms);*/
    return myData;
}


/* Context handed to _enumCategoryRangeClosureCategory() through its */
/* const void *context argument.                                      */
struct enumStruct {
    tempUCATable *t;                /* table that receives the closure elements */
    UCollator *tempColl;            /* collator used to compare a code point with its decomposition */
    UCollationElements* colEl;      /* iterator used to read the CEs of a decomposition */
    const Normalizer2Impl *nfcImpl; /* source of the decompositions */
    UnicodeSet *closed;             /* optional: collects the code points that were added */
    int32_t noOfClosures;           /* incremented once per added code point */
    UErrorCode *status;             /* error code shared with the ICU calls below */
};
U_CDECL_BEGIN
/*
 * Range callback that performs canonical closure over [start, limit).
 * Ranges of category U_UNASSIGNED or U_PRIVATE_USE_CHAR are skipped. For
 * every other code point for which getDecomposition() returns a
 * decomposition, and which tempColl does not already compare equal to that
 * decomposition, an element for the code point is added to the table.
 * Always returns TRUE.
 */
static UBool U_CALLCONV
_enumCategoryRangeClosureCategory(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {

    if (type != U_UNASSIGNED && type != U_PRIVATE_USE_CHAR) { // if the range is assigned - we might omit more categories later
        UErrorCode *status = ((enumStruct *)context)->status;
        tempUCATable *t = ((enumStruct *)context)->t;
        UCollator *tempColl = ((enumStruct *)context)->tempColl;
        UCollationElements* colEl = ((enumStruct *)context)->colEl;
        UCAElements el;
        UChar decompBuffer[4];
        const UChar *decomp;
        int32_t noOfDec = 0;

        UChar32 u32 = 0;
        UChar comp[2];
        uint32_t len = 0;

        for(u32 = start; u32 < limit; u32++) {
            decomp = ((enumStruct *)context)->nfcImpl->
                getDecomposition(u32, decompBuffer, noOfDec);
            //if((noOfDec = unorm_normalize(comp, len, UNORM_NFD, 0, decomp, 256, status)) > 1
            //|| (noOfDec == 1 && *decomp != (UChar)u32))
            if(decomp != NULL)
            {
                /* comp receives the UTF-16 form of u32 itself */
                len = 0;
                U16_APPEND_UNSAFE(comp, len, u32);
                if(ucol_strcoll(tempColl, comp, len, decomp, noOfDec) != UCOL_EQUAL) {
#ifdef UCOL_DEBUG
                    fprintf(stderr, "Closure: U+%04X -> ", u32);
                    UChar32 c;
                    int32_t i = 0;
                    while(i < noOfDec) {
                        U16_NEXT(decomp, i, noOfDec, c);
                        fprintf(stderr, "%04X ", c);
                    }
                    fprintf(stderr, "\n");
                    // print CEs for code point vs. decomposition
                    fprintf(stderr, "U+%04X CEs: ", u32);
                    UCollationElements *iter = ucol_openElements(tempColl, comp, len, status);
                    int32_t ce;
                    while((ce = ucol_next(iter, status)) != UCOL_NULLORDER) {
                        fprintf(stderr, "%08X ", ce);
                    }
                    fprintf(stderr, "\nDecomp CEs: ");
                    ucol_setText(iter, decomp, noOfDec, status);
                    while((ce = ucol_next(iter, status)) != UCOL_NULLORDER) {
                        fprintf(stderr, "%08X ", ce);
                    }
                    fprintf(stderr, "\n");
                    ucol_closeElements(iter);
#endif
                    if(((enumStruct *)context)->closed != NULL) {
                        ((enumStruct *)context)->closed->add(u32);
                    }
                    ((enumStruct *)context)->noOfClosures++;
                    /* First point el at the decomposition: it is only the  */
                    /* key for the prefixLookup query below.                */
                    el.cPoints = (UChar *)decomp;
                    el.cSize = noOfDec;
                    el.noOfCEs = 0;
                    el.prefix = el.prefixChars;
                    el.prefixSize = 0;

                    UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, &el);
                    /* The element that gets added is the code point itself. */
                    el.cPoints = comp;
                    el.cSize = len;
                    el.prefix = el.prefixChars;
                    el.prefixSize = 0;
                    if(prefix == NULL) {
                        /* no prefix entry: take the CEs of the decomposition */
                        /* NOTE(review): nothing here bounds noOfCEs against  */
                        /* the capacity of el.CEs - confirm that array's size */
                        el.noOfCEs = 0;
                        ucol_setText(colEl, decomp, noOfDec, status);
                        while((el.CEs[el.noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) {
                            el.noOfCEs++;
                        }
                    } else {
                        el.noOfCEs = 1;
                        el.CEs[0] = prefix->mapCE;
                        // This character uses a prefix. We have to add it
                        // to the unsafe table, as its decomposed form is already
                        // in. In Japanese, this happens for \u309e & \u30fe
                        // Since unsafeCPSet is static in ucol_elm, we are going
                        // to wrap it up in the uprv_uca_unsafeCPAddCCNZ function
                    }
                    uprv_uca_addAnElement(t, &el, status);
                }
            }
        }
    }
    return TRUE;
}
U_CDECL_END

/*
 * Computes element->mapCE from element->CEs / element->noOfCEs: a
 * LONG_PRIMARY_TAG special for the two-CE long primary pattern, otherwise
 * an EXPANSION_TAG special whose CEs are appended to t->expansions.
 */
static void
uprv_uca_setMapCE(tempUCATable *t, UCAElements *element, UErrorCode *status) {
    uint32_t expansion = 0;
    int32_t j;

    ExpansionTable *expansions = t->expansions;
    if(element->noOfCEs == 2 // a two CE expansion
        && isContinuation(element->CEs[1]) // which is a continuation
        && (element->CEs[1] & (~(0xFF << 24 | UCOL_CONTINUATION_MARKER))) == 0 // that has only primaries in continuation,
        && (((element->CEs[0]>>8) & 0xFF) == UCOL_BYTE_COMMON) // a common secondary
        && ((element->CEs[0] & 0xFF) == UCOL_BYTE_COMMON) // and a common tertiary
        ) {
            element->mapCE = UCOL_SPECIAL_FLAG | (LONG_PRIMARY_TAG<<24) // a long primary special
                | ((element->CEs[0]>>8) & 0xFFFF00) // first and second byte of primary
                | ((element->CEs[1]>>24) & 0xFF);   // third byte of primary
    } else {
        expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT)
            | (((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4)
            & 0xFFFFF0));

        for(j = 1; j<(int32_t)element->noOfCEs; j++) {
            uprv_uca_addExpansion(expansions, element->CEs[j], status);
        }
        if(element->noOfCEs <= 0xF) {
            expansion |= element->noOfCEs;
        } else {
            /* count does not fit in four bits: append a zero CE instead */
            uprv_uca_addExpansion(expansions, 0, status);
        }
        element->mapCE =
expansion; 1742 uprv_uca_setMaxExpansion(element->CEs[element->noOfCEs - 1], 1743 (uint8_t)element->noOfCEs, 1744 t->maxExpansions, 1745 status); 1746 } 1747 } 1748 1749 static void 1750 uprv_uca_addFCD4AccentedContractions(tempUCATable *t, 1751 UCollationElements* colEl, 1752 UChar *data, 1753 int32_t len, 1754 UCAElements *el, 1755 UErrorCode *status) { 1756 UChar decomp[256], comp[256]; 1757 int32_t decLen, compLen; 1758 1759 decLen = unorm_normalize(data, len, UNORM_NFD, 0, decomp, 256, status); 1760 compLen = unorm_normalize(data, len, UNORM_NFC, 0, comp, 256, status); 1761 decomp[decLen] = comp[compLen] = 0; 1762 1763 el->cPoints = decomp; 1764 el->cSize = decLen; 1765 el->noOfCEs = 0; 1766 el->prefixSize = 0; 1767 el->prefix = el->prefixChars; 1768 1769 UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, el); 1770 el->cPoints = comp; 1771 el->cSize = compLen; 1772 el->prefix = el->prefixChars; 1773 el->prefixSize = 0; 1774 if(prefix == NULL) { 1775 el->noOfCEs = 0; 1776 ucol_setText(colEl, decomp, decLen, status); 1777 while((el->CEs[el->noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) { 1778 el->noOfCEs++; 1779 } 1780 uprv_uca_setMapCE(t, el, status); 1781 uprv_uca_addAnElement(t, el, status); 1782 } 1783 el->cPoints=NULL; /* don't leak reference to stack */ 1784 } 1785 1786 static void 1787 uprv_uca_addMultiCMContractions(tempUCATable *t, 1788 UCollationElements* colEl, 1789 tempTailorContext *c, 1790 UCAElements *el, 1791 UErrorCode *status) { 1792 CombinClassTable *cmLookup = t->cmLookup; 1793 UChar newDecomp[256]; 1794 int32_t maxComp, newDecLen; 1795 const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status); 1796 if (U_FAILURE(*status)) { 1797 return; 1798 } 1799 int16_t curClass = nfcImpl->getFCD16(c->tailoringCM) & 0xff; 1800 CompData *precomp = c->precomp; 1801 int32_t compLen = c->compLen; 1802 UChar *comp = c->comp; 1803 maxComp = c->precompLen; 1804 1805 for (int32_t j=0; j < maxComp; j++) { 1806 int32_t 
count=0; 1807 do { 1808 if ( count == 0 ) { // Decompose the saved precomposed char. 1809 UChar temp[2]; 1810 temp[0]=precomp[j].cp; 1811 temp[1]=0; 1812 newDecLen = unorm_normalize(temp, 1, UNORM_NFD, 0, 1813 newDecomp, sizeof(newDecomp)/sizeof(UChar), status); 1814 newDecomp[newDecLen++] = cmLookup->cPoints[c->cmPos]; 1815 } 1816 else { // swap 2 combining marks when they are equal. 1817 uprv_memcpy(newDecomp, c->decomp, sizeof(UChar)*(c->decompLen)); 1818 newDecLen = c->decompLen; 1819 newDecomp[newDecLen++] = precomp[j].cClass; 1820 } 1821 newDecomp[newDecLen] = 0; 1822 compLen = unorm_normalize(newDecomp, newDecLen, UNORM_NFC, 0, 1823 comp, 256, status); 1824 if (compLen==1) { 1825 comp[compLen++] = newDecomp[newDecLen++] = c->tailoringCM; 1826 comp[compLen] = newDecomp[newDecLen] = 0; 1827 el->cPoints = newDecomp; 1828 el->cSize = newDecLen; 1829 1830 UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, el); 1831 el->cPoints = c->comp; 1832 el->cSize = compLen; 1833 el->prefix = el->prefixChars; 1834 el->prefixSize = 0; 1835 if(prefix == NULL) { 1836 el->noOfCEs = 0; 1837 ucol_setText(colEl, newDecomp, newDecLen, status); 1838 while((el->CEs[el->noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) { 1839 el->noOfCEs++; 1840 } 1841 uprv_uca_setMapCE(t, el, status); 1842 uprv_uca_finalizeAddition(t, el, status); 1843 1844 // Save the current precomposed char and its class to find any 1845 // other combining mark combinations. 
1846 precomp[c->precompLen].cp=comp[0]; 1847 precomp[c->precompLen].cClass = curClass; 1848 c->precompLen++; 1849 } 1850 } 1851 } while (++count<2 && (precomp[j].cClass == curClass)); 1852 } 1853 1854 } 1855 1856 static void 1857 uprv_uca_addTailCanonicalClosures(tempUCATable *t, 1858 UCollationElements* colEl, 1859 UChar baseCh, 1860 UChar cMark, 1861 UCAElements *el, 1862 UErrorCode *status) { 1863 CombinClassTable *cmLookup = t->cmLookup; 1864 const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status); 1865 if (U_FAILURE(*status)) { 1866 return; 1867 } 1868 int16_t maxIndex = nfcImpl->getFCD16(cMark) & 0xff; 1869 UCAElements element; 1870 uint16_t *index; 1871 UChar decomp[256]; 1872 UChar comp[256]; 1873 CompData precomp[256]; // precomposed array 1874 int32_t precompLen = 0; // count for precomp 1875 int32_t i, len, decompLen, replacedPos; 1876 tempTailorContext c; 1877 1878 if ( cmLookup == NULL ) { 1879 return; 1880 } 1881 index = cmLookup->index; 1882 int32_t cClass=nfcImpl->getFCD16(cMark) & 0xff; 1883 maxIndex = (int32_t)index[(nfcImpl->getFCD16(cMark) & 0xff)-1]; 1884 c.comp = comp; 1885 c.decomp = decomp; 1886 c.precomp = precomp; 1887 c.tailoringCM = cMark; 1888 1889 if (cClass>0) { 1890 maxIndex = (int32_t)index[cClass-1]; 1891 } 1892 else { 1893 maxIndex=0; 1894 } 1895 decomp[0]=baseCh; 1896 for ( i=0; i<maxIndex ; i++ ) { 1897 decomp[1] = cmLookup->cPoints[i]; 1898 decomp[2]=0; 1899 decompLen=2; 1900 len = unorm_normalize(decomp, decompLen, UNORM_NFC, 0, comp, 256, status); 1901 if (len==1) { 1902 // Save the current precomposed char and its class to find any 1903 // other combining mark combinations. 
1904 precomp[precompLen].cp=comp[0]; 1905 precomp[precompLen].cClass = 1906 index[nfcImpl->getFCD16(decomp[1]) & 0xff]; 1907 precompLen++; 1908 replacedPos=0; 1909 for (decompLen=0; decompLen< (int32_t)el->cSize; decompLen++) { 1910 decomp[decompLen] = el->cPoints[decompLen]; 1911 if (decomp[decompLen]==cMark) { 1912 replacedPos = decompLen; // record the position for later use 1913 } 1914 } 1915 if ( replacedPos != 0 ) { 1916 decomp[replacedPos]=cmLookup->cPoints[i]; 1917 } 1918 decomp[decompLen] = 0; 1919 len = unorm_normalize(decomp, decompLen, UNORM_NFC, 0, comp, 256, status); 1920 comp[len++] = decomp[decompLen++] = cMark; 1921 comp[len] = decomp[decompLen] = 0; 1922 element.cPoints = decomp; 1923 element.cSize = decompLen; 1924 element.noOfCEs = 0; 1925 element.prefix = el->prefixChars; 1926 element.prefixSize = 0; 1927 1928 UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, &element); 1929 element.cPoints = comp; 1930 element.cSize = len; 1931 element.prefix = el->prefixChars; 1932 element.prefixSize = 0; 1933 if(prefix == NULL) { 1934 element.noOfCEs = 0; 1935 ucol_setText(colEl, decomp, decompLen, status); 1936 while((element.CEs[element.noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) { 1937 element.noOfCEs++; 1938 } 1939 uprv_uca_setMapCE(t, &element, status); 1940 uprv_uca_finalizeAddition(t, &element, status); 1941 } 1942 1943 // This is a fix for tailoring contractions with accented 1944 // character at the end of contraction string. 
1945 if ((len>2) && 1946 (nfcImpl->getFCD16(comp[len-2]) & 0xff00)==0) { 1947 uprv_uca_addFCD4AccentedContractions(t, colEl, comp, len, &element, status); 1948 } 1949 1950 if (precompLen >1) { 1951 c.compLen = len; 1952 c.decompLen = decompLen; 1953 c.precompLen = precompLen; 1954 c.cmPos = i; 1955 uprv_uca_addMultiCMContractions(t, colEl, &c, &element, status); 1956 precompLen = c.precompLen; 1957 } 1958 } 1959 } 1960 } 1961 1962 U_CFUNC int32_t U_EXPORT2 1963 uprv_uca_canonicalClosure(tempUCATable *t, 1964 UColTokenParser *src, 1965 UnicodeSet *closed, 1966 UErrorCode *status) 1967 { 1968 enumStruct context; 1969 context.closed = closed; 1970 context.noOfClosures = 0; 1971 UCAElements el; 1972 UColToken *tok; 1973 uint32_t i = 0, j = 0; 1974 UChar baseChar, firstCM; 1975 context.nfcImpl=Normalizer2Factory::getNFCImpl(*status); 1976 if(U_FAILURE(*status)) { 1977 return 0; 1978 } 1979 1980 UCollator *tempColl = NULL; 1981 tempUCATable *tempTable = uprv_uca_cloneTempTable(t, status); 1982 // Check for null pointer 1983 if (U_FAILURE(*status)) { 1984 return 0; 1985 } 1986 1987 UCATableHeader *tempData = uprv_uca_assembleTable(tempTable, status); 1988 tempColl = ucol_initCollator(tempData, 0, t->UCA, status); 1989 if ( tempTable->cmLookup != NULL ) { 1990 t->cmLookup = tempTable->cmLookup; // copy over to t 1991 tempTable->cmLookup = NULL; 1992 } 1993 uprv_uca_closeTempTable(tempTable); 1994 1995 if(U_SUCCESS(*status)) { 1996 tempColl->ucaRules = NULL; 1997 tempColl->actualLocale = NULL; 1998 tempColl->validLocale = NULL; 1999 tempColl->requestedLocale = NULL; 2000 tempColl->hasRealData = TRUE; 2001 tempColl->freeImageOnClose = TRUE; 2002 } else if(tempData != 0) { 2003 uprv_free(tempData); 2004 } 2005 2006 /* produce canonical closure */ 2007 UCollationElements* colEl = ucol_openElements(tempColl, NULL, 0, status); 2008 // Check for null pointer 2009 if (U_FAILURE(*status)) { 2010 return 0; 2011 } 2012 context.t = t; 2013 context.tempColl = tempColl; 2014 
context.colEl = colEl; 2015 context.status = status; 2016 u_enumCharTypes(_enumCategoryRangeClosureCategory, &context); 2017 2018 if ( (src==NULL) || !src->buildCCTabFlag ) { 2019 ucol_closeElements(colEl); 2020 ucol_close(tempColl); 2021 return context.noOfClosures; // no extra contraction needed to add 2022 } 2023 2024 for (i=0; i < src->resultLen; i++) { 2025 baseChar = firstCM= (UChar)0; 2026 tok = src->lh[i].first; 2027 while (tok != NULL && U_SUCCESS(*status)) { 2028 el.prefix = el.prefixChars; 2029 el.cPoints = el.uchars; 2030 if(tok->prefix != 0) { 2031 el.prefixSize = tok->prefix>>24; 2032 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar)); 2033 2034 el.cSize = (tok->source >> 24)-(tok->prefix>>24); 2035 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar)); 2036 } else { 2037 el.prefixSize = 0; 2038 *el.prefix = 0; 2039 2040 el.cSize = (tok->source >> 24); 2041 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar)); 2042 } 2043 if(src->UCA != NULL) { 2044 for(j = 0; j<el.cSize; j++) { 2045 int16_t fcd = context.nfcImpl->getFCD16(el.cPoints[j]); 2046 if ( (fcd & 0xff) == 0 ) { 2047 baseChar = el.cPoints[j]; // last base character 2048 firstCM=0; // reset combining mark value 2049 } 2050 else { 2051 if ( (baseChar!=0) && (firstCM==0) ) { 2052 firstCM = el.cPoints[j]; // first combining mark 2053 } 2054 } 2055 } 2056 } 2057 if ( (baseChar!= (UChar)0) && (firstCM != (UChar)0) ) { 2058 // find all the canonical rules 2059 uprv_uca_addTailCanonicalClosures(t, colEl, baseChar, firstCM, &el, status); 2060 } 2061 tok = tok->next; 2062 } 2063 } 2064 ucol_closeElements(colEl); 2065 ucol_close(tempColl); 2066 2067 return context.noOfClosures; 2068 } 2069 2070 #endif /* #if !UCONFIG_NO_COLLATION */ 2071