1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2001-2008, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: ucol_bld.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created 02/22/2001 14 * created by: Vladimir Weinstein 15 * 16 * This module builds a collator based on the rule set. 17 * 18 */ 19 20 #include "unicode/utypes.h" 21 22 #if !UCONFIG_NO_COLLATION 23 24 #include "unicode/ucoleitr.h" 25 #include "unicode/udata.h" 26 #include "unicode/uchar.h" 27 #include "unicode/uniset.h" 28 #include "ucol_bld.h" 29 #include "ucol_elm.h" 30 #include "ucol_cnt.h" 31 #include "ucln_in.h" 32 #include "umutex.h" 33 #include "unormimp.h" 34 #include "cmemory.h" 35 36 static const InverseUCATableHeader* _staticInvUCA = NULL; 37 static UDataMemory* invUCA_DATA_MEM = NULL; 38 39 U_CDECL_BEGIN 40 static UBool U_CALLCONV 41 isAcceptableInvUCA(void * /*context*/, 42 const char * /*type*/, const char * /*name*/, 43 const UDataInfo *pInfo) 44 { 45 /* context, type & name are intentionally not used */ 46 if( pInfo->size>=20 && 47 pInfo->isBigEndian==U_IS_BIG_ENDIAN && 48 pInfo->charsetFamily==U_CHARSET_FAMILY && 49 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */ 50 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 && 51 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 && 52 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 && 53 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 && 54 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&& 55 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 && 56 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 && 57 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 && 58 ) 59 { 60 UVersionInfo UCDVersion; 61 u_getUnicodeVersion(UCDVersion); 62 return (pInfo->dataVersion[0]==UCDVersion[0] && 63 pInfo->dataVersion[1]==UCDVersion[1]); 64 //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] && 65 //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] && 66 //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) { 67 } else { 68 return FALSE; 69 } 70 } 71 U_CDECL_END 72 73 /* 74 * Takes two CEs (lead and continuation) and 75 * compares them as CEs should be compared: 76 * primary vs. primary, secondary vs. secondary 77 * tertiary vs. tertiary 78 */ 79 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) { 80 uint32_t s1 = source0, s2, t1 = target0, t2; 81 if(isContinuation(source1)) { 82 s2 = source1; 83 } else { 84 s2 = 0; 85 } 86 if(isContinuation(target1)) { 87 t2 = target1; 88 } else { 89 t2 = 0; 90 } 91 92 uint32_t s = 0, t = 0; 93 if(s1 == t1 && s2 == t2) { 94 return 0; 95 } 96 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16); 97 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16); 98 if(s < t) { 99 return -1; 100 } else if(s > t) { 101 return 1; 102 } else { 103 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8; 104 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8; 105 if(s < t) { 106 return -1; 107 } else if(s > t) { 108 return 1; 109 } else { 110 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF); 111 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF); 112 if(s < t) { 113 return -1; 114 } else { 115 return 1; 116 } 117 } 118 } 119 } 120 121 static 122 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) { 123 uint32_t bottom = 0, top = src->invUCA->tableSize; 124 uint32_t i = 0; 125 uint32_t first = 0, second = 0; 126 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 127 int32_t res = 0; 128 129 while(bottom < top-1) { 130 i = (top+bottom)/2; 131 first = *(CETable+3*i); 132 second = *(CETable+3*i+1); 133 res = compareCEs(first, second, CE, SecondCE); 134 if(res > 0) { 135 top = i; 136 } else if(res < 0) { 137 bottom = i; 138 } else { 139 break; 140 } 141 } 142 143 /* weiv: */ 144 /* in searching for elements, I have removed the failure */ 145 /* The reason for this is that the builder does not rely */ 146 /* on search mechanism telling it that it didn't find an */ 147 /* element. However, indirect positioning relies on being */ 148 /* able to find the elements around any CE, even if it is */ 149 /* not defined in the UCA. */ 150 return i; 151 /* 152 if((first == CE && second == SecondCE)) { 153 return i; 154 } else { 155 return -1; 156 } 157 */ 158 } 159 160 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = { 161 0xFFFF0000, 162 0xFFFFFF00, 163 0xFFFFFFFF 164 }; 165 166 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src, 167 uint32_t CE, uint32_t contCE, 168 uint32_t *nextCE, uint32_t *nextContCE, 169 uint32_t strength) 170 { 171 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 172 int32_t iCE; 173 174 iCE = ucol_inv_findCE(src, CE, contCE); 175 176 if(iCE<0) { 177 *nextCE = UCOL_NOT_FOUND; 178 return -1; 179 } 180 181 CE &= strengthMask[strength]; 182 contCE &= strengthMask[strength]; 183 184 *nextCE = CE; 185 *nextContCE = contCE; 186 187 while((*nextCE & strengthMask[strength]) == CE 188 && (*nextContCE & strengthMask[strength]) == contCE) 189 { 190 *nextCE = (*(CETable+3*(++iCE))); 191 *nextContCE = (*(CETable+3*(iCE)+1)); 192 } 193 194 return iCE; 195 } 196 197 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src, 198 uint32_t CE, uint32_t contCE, 199 uint32_t *prevCE, uint32_t *prevContCE, 200 uint32_t strength) 201 { 202 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 203 int32_t iCE; 204 205 iCE = ucol_inv_findCE(src, CE, contCE); 206 207 if(iCE<0) { 208 *prevCE = UCOL_NOT_FOUND; 209 return -1; 210 } 211 212 CE &= strengthMask[strength]; 213 contCE &= strengthMask[strength]; 214 215 *prevCE = CE; 216 *prevContCE = contCE; 217 218 while((*prevCE & strengthMask[strength]) == CE 219 && (*prevContCE & strengthMask[strength])== contCE 220 && iCE > 0) /* this condition should prevent falling off the edge of the world */ 221 { 222 /* here, we end up in a singularity - zero */ 223 *prevCE = (*(CETable+3*(--iCE))); 224 *prevContCE = (*(CETable+3*(iCE)+1)); 225 } 226 227 return iCE; 228 } 229 230 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE, 231 uint32_t prevCE, uint32_t prevContCE) 232 { 233 if(prevCE == CE && prevContCE == contCE) { 234 return UCOL_IDENTICAL; 235 } 236 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY]) 237 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY])) 238 { 239 return UCOL_PRIMARY; 240 } 241 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY]) 242 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY])) 243 { 244 return UCOL_SECONDARY; 245 } 246 return UCOL_TERTIARY; 247 } 248 249 250 /*static 251 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) { 252 253 uint32_t CE = lh->baseCE; 254 uint32_t SecondCE = lh->baseContCE; 255 256 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 257 uint32_t previousCE, previousContCE; 258 int32_t iCE; 259 260 iCE = ucol_inv_findCE(src, CE, SecondCE); 261 262 if(iCE<0) { 263 return -1; 264 } 265 266 CE &= strengthMask[strength]; 267 SecondCE &= strengthMask[strength]; 268 269 previousCE = CE; 270 previousContCE = SecondCE; 271 272 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) { 273 previousCE = (*(CETable+3*(--iCE))); 274 previousContCE = (*(CETable+3*(iCE)+1)); 275 } 276 lh->previousCE = previousCE; 277 lh->previousContCE = previousContCE; 278 279 return iCE; 280 }*/ 281 282 static 283 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) { 284 uint32_t CE = lh->baseCE; 285 uint32_t SecondCE = lh->baseContCE; 286 287 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 288 uint32_t nextCE, nextContCE; 289 int32_t iCE; 290 291 iCE = ucol_inv_findCE(src, CE, SecondCE); 292 293 if(iCE<0) { 294 return -1; 295 } 296 297 CE &= strengthMask[strength]; 298 SecondCE &= strengthMask[strength]; 299 300 nextCE = CE; 301 nextContCE = SecondCE; 302 303 while((nextCE & strengthMask[strength]) == CE 304 && (nextContCE & strengthMask[strength]) == SecondCE) 305 { 306 nextCE = (*(CETable+3*(++iCE))); 307 nextContCE = (*(CETable+3*(iCE)+1)); 308 } 309 310 lh->nextCE = nextCE; 311 lh->nextContCE = nextContCE; 312 313 return iCE; 314 } 315 316 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) { 317 /* reset all the gaps */ 318 int32_t i = 0; 319 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 320 uint32_t st = 0; 321 uint32_t t1, t2; 322 int32_t pos; 323 324 UColToken *tok = lh->first; 325 uint32_t tokStrength = tok->strength; 326 327 for(i = 0; i<3; i++) { 328 lh->gapsHi[3*i] = 0; 329 lh->gapsHi[3*i+1] = 0; 330 lh->gapsHi[3*i+2] = 0; 331 lh->gapsLo[3*i] = 0; 332 lh->gapsLo[3*i+1] = 0; 333 lh->gapsLo[3*i+2] = 0; 334 lh->numStr[i] = 0; 335 lh->fStrToken[i] = NULL; 336 lh->lStrToken[i] = NULL; 337 lh->pos[i] = -1; 338 } 339 340 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); 341 342 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ 343 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */ 344 lh->pos[0] = 0; 345 t1 = lh->baseCE; 346 t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION; 347 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 348 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 349 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 350 uint32_t primaryCE = t1 & UCOL_PRIMARYMASK | (t2 & UCOL_PRIMARYMASK) >> 16; 351 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1); 352 353 t1 = primaryCE & UCOL_PRIMARYMASK | 0x0505; 354 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER; 355 356 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 357 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 358 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 359 } else if(lh->indirect == TRUE && lh->nextCE != 0) { 360 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) { 361 lh->pos[0] = 0; 362 t1 = lh->baseCE; 363 t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION; 364 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 365 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 366 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 367 t1 = lh->nextCE; 368 t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION; 369 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 370 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 371 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 372 } else { 373 for(;;) { 374 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { 375 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) { 376 lh->fStrToken[tokStrength] = tok; 377 } else { /* The CE must be implicit, since it's not in the table */ 378 /* Error */ 379 *status = U_INTERNAL_PROGRAM_ERROR; 380 } 381 } 382 383 while(tok != NULL && tok->strength >= tokStrength) { 384 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { 385 lh->lStrToken[tokStrength] = tok; 386 } 387 tok = tok->next; 388 } 389 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) { 390 /* check if previous interval is the same and merge the intervals if it is so */ 391 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) { 392 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1]; 393 lh->fStrToken[tokStrength+1] = NULL; 394 lh->lStrToken[tokStrength+1] = NULL; 395 lh->pos[tokStrength+1] = -1; 396 } 397 } 398 if(tok != NULL) { 399 tokStrength = tok->strength; 400 } else { 401 break; 402 } 403 } 404 for(st = 0; st < 3; st++) { 405 if((pos = lh->pos[st]) >= 0) { 406 t1 = *(CETable+3*(pos)); 407 t2 = *(CETable+3*(pos)+1); 408 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 409 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 410 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 411 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; 412 //pos--; 413 //t1 = *(CETable+3*(pos)); 414 //t2 = *(CETable+3*(pos)+1); 415 t1 = lh->baseCE; 416 t2 = lh->baseContCE; 417 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 418 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 419 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; 420 } 421 } 422 } 423 } 424 425 426 #define ucol_countBytes(value, noOfBytes) \ 427 { \ 428 uint32_t mask = 0xFFFFFFFF; \ 429 (noOfBytes) = 0; \ 430 while(mask != 0) { \ 431 if(((value) & mask) != 0) { \ 432 (noOfBytes)++; \ 433 } \ 434 mask >>= 8; \ 435 } \ 436 } 437 438 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) { 439 if(U_SUCCESS(*status)) { 440 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); 441 } 442 return g->current; 443 } 444 445 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) { 446 /* TODO: rename to enum names */ 447 uint32_t high, low, count=1; 448 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF; 449 450 if(strength == UCOL_SECONDARY) { 451 low = UCOL_COMMON_TOP2<<24; 452 high = 0xFFFFFFFF; 453 count = 0xFF - UCOL_COMMON_TOP2; 454 } else { 455 low = UCOL_BYTE_COMMON << 24; //0x05000000; 456 high = 0x40000000; 457 count = 0x40 - UCOL_BYTE_COMMON; 458 } 459 460 if(tok->next != NULL && tok->next->strength == strength) { 461 count = tok->next->toInsert; 462 } 463 464 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); 465 g->current = UCOL_BYTE_COMMON<<24; 466 467 if(g->noOfRanges == 0) { 468 *status = U_INTERNAL_PROGRAM_ERROR; 469 } 470 return g->current; 471 } 472 473 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) { 474 uint32_t strength = tok->strength; 475 uint32_t low = lows[fStrength*3+strength]; 476 uint32_t high = highs[fStrength*3+strength]; 477 uint32_t maxByte = 0; 478 if(strength == UCOL_TERTIARY) { 479 maxByte = 0x3F; 480 } else if(strength == UCOL_PRIMARY) { 481 maxByte = 0xFE; 482 } else { 483 maxByte = 0xFF; 484 } 485 486 uint32_t count = tok->toInsert; 487 488 if(low >= high && strength > UCOL_PRIMARY) { 489 int32_t s = strength; 490 for(;;) { 491 s--; 492 if(lows[fStrength*3+s] != highs[fStrength*3+s]) { 493 if(strength == UCOL_SECONDARY) { 494 if (low < UCOL_COMMON_TOP2<<24 ) { 495 // Override if low range is less than UCOL_COMMON_TOP2. 496 low = UCOL_COMMON_TOP2<<24; 497 } 498 high = 0xFFFFFFFF; 499 } else { 500 // Override if low range is less than UCOL_COMMON_BOT3. 501 if ( low < UCOL_COMMON_BOT3<<24 ) { 502 low = UCOL_COMMON_BOT3<<24; 503 } 504 high = 0x40000000; 505 } 506 break; 507 } 508 if(s<0) { 509 *status = U_INTERNAL_PROGRAM_ERROR; 510 return 0; 511 } 512 } 513 } 514 515 if(low == 0) { 516 low = 0x01000000; 517 } 518 519 if(strength == UCOL_SECONDARY) { /* similar as simple */ 520 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) { 521 low = UCOL_COMMON_TOP2<<24; 522 } 523 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) { 524 high = UCOL_COMMON_TOP2<<24; 525 } 526 if(low < (UCOL_COMMON_BOT2<<24)) { 527 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges); 528 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); 529 //g->current = UCOL_COMMON_BOT2<<24; 530 return g->current; 531 } 532 } 533 534 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); 535 if(g->noOfRanges == 0) { 536 *status = U_INTERNAL_PROGRAM_ERROR; 537 } 538 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); 539 return g->current; 540 } 541 542 static 543 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { 544 uint32_t i = 0; 545 UChar c; 546 547 if(U_FAILURE(*status)) { 548 return 0; 549 } 550 551 if(sourceLen > resLen) { 552 *status = U_MEMORY_ALLOCATION_ERROR; 553 return 0; 554 } 555 556 for(i = 0; i < sourceLen; i++) { 557 c = source[i]; 558 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ 559 switch(c - 0x3000) { 560 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E: 561 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE: 562 c++; 563 break; 564 case 0xF5: 565 c = 0x30AB; 566 break; 567 case 0xF6: 568 c = 0x30B1; 569 break; 570 } 571 } 572 resBuf[i] = c; 573 } 574 return sourceLen; 575 } 576 577 static 578 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { 579 uint32_t i = 0; 580 UChar c; 581 582 if(U_FAILURE(*status)) { 583 return 0; 584 } 585 586 if(sourceLen > resLen) { 587 *status = U_MEMORY_ALLOCATION_ERROR; 588 return 0; 589 } 590 591 for(i = 0; i < sourceLen; i++) { 592 c = source[i]; 593 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ 594 switch(c - 0x3000) { 595 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F: 596 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF: 597 c--; 598 break; 599 case 0xAB: 600 c = 0x30F5; 601 break; 602 case 0xB1: 603 c = 0x30F6; 604 break; 605 } 606 } 607 resBuf[i] = c; 608 } 609 return sourceLen; 610 } 611 612 static 613 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) { 614 uint32_t i = 0; 615 UChar n[128]; 616 uint32_t nLen = 0; 617 uint32_t uCount = 0, lCount = 0; 618 619 collIterate s; 620 uint32_t order = 0; 621 622 if(U_FAILURE(*status)) { 623 return UCOL_LOWER_CASE; 624 } 625 626 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status); 627 if(U_SUCCESS(*status)) { 628 for(i = 0; i < nLen; i++) { 629 uprv_init_collIterate(UCA, &n[i], 1, &s); 630 order = ucol_getNextCE(UCA, &s, status); 631 if(isContinuation(order)) { 632 *status = U_INTERNAL_PROGRAM_ERROR; 633 return UCOL_LOWER_CASE; 634 } 635 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) { 636 uCount++; 637 } else { 638 if(u_islower(n[i])) { 639 lCount++; 640 } else if(U_SUCCESS(*status)) { 641 UChar sk[1], lk[1]; 642 u_toSmallKana(&n[i], 1, sk, 1, status); 643 u_toLargeKana(&n[i], 1, lk, 1, status); 644 if(sk[0] == n[i] && lk[0] != n[i]) { 645 lCount++; 646 } 647 } 648 } 649 } 650 } 651 652 if(uCount != 0 && lCount != 0) { 653 return UCOL_MIXED_CASE; 654 } else if(uCount != 0) { 655 return UCOL_UPPER_CASE; 656 } else { 657 return UCOL_LOWER_CASE; 658 } 659 } 660 661 662 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) { 663 /* this one makes the table and stuff */ 664 uint32_t noOfBytes[3]; 665 uint32_t i; 666 667 for(i = 0; i<3; i++) { 668 ucol_countBytes(CEparts[i], noOfBytes[i]); 669 } 670 671 /* Here we have to pack CEs from parts */ 672 673 uint32_t CEi = 0; 674 uint32_t value = 0; 675 676 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) { 677 if(CEi > 0) { 678 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ 679 } else { 680 value = 0; 681 } 682 683 if(2*CEi<noOfBytes[0]) { 684 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16; 685 } 686 if(CEi<noOfBytes[1]) { 687 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8; 688 } 689 if(CEi<noOfBytes[2]) { 690 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F); 691 } 692 tok->CEs[CEi] = value; 693 CEi++; 694 } 695 if(CEi == 0) { /* totally ignorable */ 696 tok->noOfCEs = 1; 697 tok->CEs[0] = 0; 698 } else { /* there is at least something */ 699 tok->noOfCEs = CEi; 700 } 701 702 703 // we want to set case bits here and now, not later. 704 // Case bits handling 705 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables 706 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field 707 int32_t cSize = (tok->source & 0xFF000000) >> 24; 708 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source; 709 710 if(cSize > 1) { 711 // Do it manually 712 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status); 713 } else { 714 // Copy it from the UCA 715 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status); 716 tok->CEs[0] |= (caseCE & 0xC0); 717 } 718 } 719 720 #if UCOL_DEBUG==2 721 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2])); 722 for(i = 0; i<tok->noOfCEs; i++) { 723 fprintf(stderr, "%08X ", tok->CEs[i]); 724 } 725 fprintf(stderr, "\n"); 726 #endif 727 } 728 729 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) { 730 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT]; 731 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT]; 732 733 UColToken *tok = lh->last; 734 uint32_t t[UCOL_STRENGTH_LIMIT]; 735 736 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t)); 737 738 tok->toInsert = 1; 739 t[tok->strength] = 1; 740 741 while(tok->previous != NULL) { 742 if(tok->previous->strength < tok->strength) { /* going up */ 743 t[tok->strength] = 0; 744 t[tok->previous->strength]++; 745 } else if(tok->previous->strength > tok->strength) { /* going down */ 746 t[tok->previous->strength] = 1; 747 } else { 748 t[tok->strength]++; 749 } 750 tok=tok->previous; 751 tok->toInsert = t[tok->strength]; 752 } 753 754 tok->toInsert = t[tok->strength]; 755 ucol_inv_getGapPositions(src, lh, status); 756 757 #if UCOL_DEBUG 758 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE); 759 int32_t j = 2; 760 for(j = 2; j >= 0; j--) { 761 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]); 762 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]); 763 } 764 tok=lh->first[UCOL_TOK_POLARITY_POSITIVE]; 765 766 do { 767 fprintf(stderr,"%i", tok->strength); 768 tok = tok->next; 769 } while(tok != NULL); 770 fprintf(stderr, "\n"); 771 772 tok=lh->first[UCOL_TOK_POLARITY_POSITIVE]; 773 774 do { 775 fprintf(stderr,"%i", tok->toInsert); 776 tok = tok->next; 777 } while(tok != NULL); 778 #endif 779 780 tok = lh->first; 781 uint32_t fStrength = UCOL_IDENTICAL; 782 uint32_t initStrength = UCOL_IDENTICAL; 783 784 785 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16; 786 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8; 787 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16; 788 789 while (tok != NULL && U_SUCCESS(*status)) { 790 fStrength = tok->strength; 791 if(fStrength < initStrength) { 792 initStrength = fStrength; 793 if(lh->pos[fStrength] == -1) { 794 while(lh->pos[fStrength] == -1 && fStrength > 0) { 795 fStrength--; 796 } 797 if(lh->pos[fStrength] == -1) { 798 *status = U_INTERNAL_PROGRAM_ERROR; 799 return; 800 } 801 } 802 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */ 803 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; 804 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1]; 805 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */ 806 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); 807 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */ 808 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; 809 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/ 810 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); 811 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); 812 } else { /* primaries */ 813 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/ 814 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); 815 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status); 816 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); 817 } 818 } else { 819 if(tok->strength == UCOL_TERTIARY) { 820 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status); 821 } else if(tok->strength == UCOL_SECONDARY) { 822 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status); 823 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); 824 } else if(tok->strength == UCOL_PRIMARY) { 825 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status); 826 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status); 827 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); 828 } 829 } 830 ucol_doCE(src, CEparts, tok, status); 831 tok = tok->next; 832 } 833 } 834 835 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) { 836 UCAElements el; 837 UColToken *tok = lh->first; 838 UColToken *expt = NULL; 839 uint32_t i = 0, j = 0; 840 UChar32 fcdHighStart; 841 const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); 842 843 while(tok != NULL && U_SUCCESS(*status)) { 844 /* first, check if there are any expansions */ 845 /* if there are expansions, we need to do a little bit more processing */ 846 /* since parts of expansion can be tailored, while others are not */ 847 if(tok->expansion != 0) { 848 uint32_t len = tok->expansion >> 24; 849 uint32_t currentSequenceLen = len; 850 uint32_t expOffset = tok->expansion & 0x00FFFFFF; 851 //uint32_t exp = currentSequenceLen | expOffset; 852 UColToken exp; 853 exp.source = currentSequenceLen | expOffset; 854 exp.rulesToParse = src->source; 855 856 while(len > 0) { 857 currentSequenceLen = len; 858 while(currentSequenceLen > 0) { 859 exp.source = (currentSequenceLen << 24) | expOffset; 860 if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */ 861 uint32_t noOfCEsToCopy = expt->noOfCEs; 862 for(j = 0; j<noOfCEsToCopy; j++) { 863 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j]; 864 } 865 tok->noOfExpCEs += noOfCEsToCopy; 866 // Smart people never try to add codepoints and CEs. 867 // For some odd reason, it won't work. 868 expOffset += currentSequenceLen; //noOfCEsToCopy; 869 len -= currentSequenceLen; //noOfCEsToCopy; 870 break; 871 } else { 872 currentSequenceLen--; 873 } 874 } 875 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */ 876 /* will have to get one from UCA */ 877 /* first, get the UChars from the rules */ 878 /* then pick CEs out until there is no more and stuff them into expansion */ 879 collIterate s; 880 uint32_t order = 0; 881 uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s); 882 883 for(;;) { 884 order = ucol_getNextCE(src->UCA, &s, status); 885 if(order == UCOL_NO_MORE_CES) { 886 break; 887 } 888 tok->expCEs[tok->noOfExpCEs++] = order; 889 } 890 expOffset++; 891 len--; 892 } 893 } 894 } else { 895 tok->noOfExpCEs = 0; 896 } 897 898 /* set the ucaelement with obtained values */ 899 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs; 900 /* copy CEs */ 901 for(i = 0; i<tok->noOfCEs; i++) { 902 el.CEs[i] = tok->CEs[i]; 903 } 904 for(i = 0; i<tok->noOfExpCEs; i++) { 905 el.CEs[i+tok->noOfCEs] = tok->expCEs[i]; 906 } 907 908 /* copy UChars */ 909 // We kept prefix and source kind of together, as it is a kind of a contraction. 910 // However, now we have to slice the prefix off the main thing - 911 el.prefix = el.prefixChars; 912 el.cPoints = el.uchars; 913 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the 914 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND 915 // decomposed elements to the unsaf table. 916 el.prefixSize = tok->prefix>>24; 917 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar)); 918 919 el.cSize = (tok->source >> 24)-(tok->prefix>>24); 920 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar)); 921 } else { 922 el.prefixSize = 0; 923 *el.prefix = 0; 924 925 el.cSize = (tok->source >> 24); 926 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar)); 927 } 928 if(src->UCA != NULL) { 929 for(i = 0; i<el.cSize; i++) { 930 if(UCOL_ISJAMO(el.cPoints[i])) { 931 t->image->jamoSpecial = TRUE; 932 } 933 } 934 if (!src->buildCCTabFlag && el.cSize > 0) { 935 // Check the trailing canonical combining class (tccc) of the last character. 936 const UChar *s = el.cPoints + el.cSize; 937 uint16_t fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, el.cPoints, s); 938 if ((fcd & 0xff) != 0) { 939 src->buildCCTabFlag = TRUE; 940 } 941 } 942 } 943 944 /* and then, add it */ 945 #if UCOL_DEBUG==2 946 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]); 947 #endif 948 uprv_uca_addAnElement(t, &el, status); 949 950 #if UCOL_DEBUG_DUPLICATES 951 if(*status != U_ZERO_ERROR) { 952 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource); 953 *status = U_ZERO_ERROR; 954 } 955 #endif 956 957 tok = tok->next; 958 } 959 } 960 961 U_CDECL_BEGIN 962 static UBool U_CALLCONV 963 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) { 964 UErrorCode status = U_ZERO_ERROR; 965 tempUCATable *t = (tempUCATable *)context; 966 if(value == 0) { 967 while(start < limit) { 968 uint32_t CE = utrie_get32(t->mapping, start, NULL); 969 if(CE == UCOL_NOT_FOUND) { 970 UCAElements el; 971 el.isThai = FALSE; 972 el.prefixSize = 0; 973 el.prefixChars[0] = 0; 974 el.prefix = el.prefixChars; 975 el.cPoints = el.uchars; 976 977 el.cSize = 0; 978 UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start); 979 980 el.noOfCEs = 1; 981 el.CEs[0] = 0; 982 uprv_uca_addAnElement(t, &el, &status); 983 984 } 985 start++; 986 } 987 } 988 if(U_FAILURE(status)) { 989 return FALSE; 990 } else { 991 return TRUE; 992 } 993 } 994 U_CDECL_END 995 996 static void 997 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t, 998 UChar32 start, UChar32 end, 999 UErrorCode *status) 1000 { 1001 //UChar decomp[256]; 1002 uint32_t CE = UCOL_NOT_FOUND; 1003 UChar32 u = 0; 1004 UCAElements el; 1005 el.isThai = FALSE; 1006 el.prefixSize = 0; 1007 el.prefixChars[0] = 0; 1008 collIterate colIt; 1009 1010 if(U_SUCCESS(*status)) { 1011 for(u = start; u<=end; u++) { 1012 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND 1013 /* this test is for contractions that are missing the starting element. */ 1014 || ((isCntTableElement(CE)) && 1015 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND)) 1016 ) 1017 { 1018 el.cSize = 0; 1019 U16_APPEND_UNSAFE(el.uchars, el.cSize, u); 1020 //decomp[0] = (UChar)u; 1021 //el.uchars[0] = (UChar)u; 1022 el.cPoints = el.uchars; 1023 //el.cSize = 1; 1024 el.noOfCEs = 0; 1025 el.prefix = el.prefixChars; 1026 el.prefixSize = 0; 1027 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt); 1028 // We actually want to check whether this element is a special 1029 // If it is an implicit element (hangul, CJK - we want to copy the 1030 // special, not the resolved CEs) - for hangul, copying resolved 1031 // would just make things the same (there is an expansion and it 1032 // takes approximately the same amount of time to resolve as 1033 // falling back to the UCA). 1034 /* 1035 UTRIE_GET32(src->UCA->mapping, u, CE); 1036 tag = getCETag(CE); 1037 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG 1038 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG 1039 || tag == LEAD_SURROGATE_TAG) { 1040 el.CEs[el.noOfCEs++] = CE; 1041 } else { 1042 */ 1043 // It turns out that it does not make sense to keep implicits 1044 // unresolved. The cost of resolving them is big enough so that 1045 // it doesn't make any difference whether we have to go to the UCA 1046 // or not. 1047 { 1048 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt); 1049 while(CE != UCOL_NO_MORE_CES) { 1050 CE = ucol_getNextCE(src->UCA, &colIt, status); 1051 if(CE != UCOL_NO_MORE_CES) { 1052 el.CEs[el.noOfCEs++] = CE; 1053 } 1054 } 1055 } 1056 uprv_uca_addAnElement(t, &el, status); 1057 } 1058 } 1059 } 1060 } 1061 1062 UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) { 1063 U_NAMESPACE_USE 1064 1065 uint32_t i = 0; 1066 if(U_FAILURE(*status)) { 1067 return NULL; 1068 } 1069 /* 1070 2. Eliminate the negative lists by doing the following for each non-null negative list: 1071 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE, 1072 create new ListHeader X 1073 o reverse the list, add to the end of X's positive list. Reset the strength of the 1074 first item you add, based on the stronger strength levels of the two lists. 1075 */ 1076 /* 1077 3. For each ListHeader with a non-null positive list: 1078 */ 1079 /* 1080 o Find all character strings with CEs between the baseCE and the 1081 next/previous CE, at the strength of the first token. Add these to the 1082 tailoring. 1083 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the 1084 tailoring has & x < z... 1085 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ... 1086 */ 1087 /* It is possible that this part should be done even while constructing list */ 1088 /* The problem is that it is unknown what is going to be the strongest weight */ 1089 /* So we might as well do it here */ 1090 1091 /* 1092 o Allocate CEs for each token in the list, based on the total number N of the 1093 largest level difference, and the gap G between baseCE and nextCE at that 1094 level. The relation * between the last item and nextCE is the same as the 1095 strongest strength. 1096 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1) 1097 ? There are 3 primary items: a, d, e. Fit them into the primary gap. 1098 Then fit b and c into the secondary gap between a and d, then fit q 1099 into the tertiary gap between b and c. 1100 1101 o Example: baseCE << b <<< q << c * nextCE(X,2) 1102 ? There are 2 secondary items: b, c. Fit them into the secondary gap. 1103 Then fit q into the tertiary gap between b and c. 1104 o When incrementing primary values, we will not cross high byte 1105 boundaries except where there is only a single-byte primary. That is to 1106 ensure that the script reordering will continue to work. 1107 */ 1108 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader)); 1109 /* test for NULL */ 1110 if (image == NULL) { 1111 *status = U_MEMORY_ALLOCATION_ERROR; 1112 return NULL; 1113 } 1114 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader)); 1115 1116 for(i = 0; i<src->resultLen; i++) { 1117 /* now we need to generate the CEs */ 1118 /* We stuff the initial value in the buffers, and increase the appropriate buffer */ 1119 /* According to strength */ 1120 if(U_SUCCESS(*status)) { 1121 if(src->lh[i].first) { // if there are any elements 1122 // due to the way parser works, subsequent tailorings 1123 // may remove all the elements from a sequence, therefore 1124 // leaving an empty tailoring sequence. 1125 ucol_initBuffers(src, &src->lh[i], status); 1126 } 1127 } 1128 if(U_FAILURE(*status)) { 1129 uprv_free(image); 1130 return NULL; 1131 } 1132 } 1133 1134 if(src->varTop != NULL) { /* stuff the variable top value */ 1135 src->opts->variableTopValue = (*(src->varTop->CEs))>>16; 1136 /* remove it from the list */ 1137 if(src->varTop->listHeader->first == src->varTop) { /* first in list */ 1138 src->varTop->listHeader->first = src->varTop->next; 1139 } 1140 if(src->varTop->listHeader->last == src->varTop) { /* first in list */ 1141 src->varTop->listHeader->last = src->varTop->previous; 1142 } 1143 if(src->varTop->next != NULL) { 1144 src->varTop->next->previous = src->varTop->previous; 1145 } 1146 if(src->varTop->previous != NULL) { 1147 src->varTop->previous->next = src->varTop->next; 1148 } 1149 } 1150 1151 1152 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status); 1153 if(U_FAILURE(*status)) { 1154 uprv_free(image); 1155 return NULL; 1156 } 1157 1158 1159 /* After this, we have assigned CE values to all regular CEs */ 1160 /* now we will go through list once more and resolve expansions, */ 1161 /* make UCAElements structs and add them to table */ 1162 for(i = 0; i<src->resultLen; i++) { 1163 /* now we need to generate the CEs */ 1164 /* We stuff the initial value in the buffers, and increase the appropriate buffer */ 1165 /* According to strength */ 1166 if(U_SUCCESS(*status)) { 1167 ucol_createElements(src, t, &src->lh[i], status); 1168 } 1169 } 1170 1171 UCAElements el; 1172 el.isThai = FALSE; 1173 el.prefixSize = 0; 1174 el.prefixChars[0] = 0; 1175 1176 /* add latin-1 stuff */ 1177 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status); 1178 1179 /* add stuff for copying */ 1180 if(src->copySet != NULL) { 1181 int32_t i = 0; 1182 UnicodeSet *set = (UnicodeSet *)src->copySet; 1183 for(i = 0; i < set->getRangeCount(); i++) { 1184 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status); 1185 } 1186 } 1187 1188 if(U_SUCCESS(*status)) { 1189 /* copy contractions from the UCA - this is felt mostly for cyrillic*/ 1190 1191 uint32_t tailoredCE = UCOL_NOT_FOUND; 1192 //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants)); 1193 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos); 1194 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status); 1195 // Check for null pointer 1196 if (ucaEl == NULL) { 1197 *status = U_MEMORY_ALLOCATION_ERROR; 1198 return NULL; 1199 } 1200 while(*conts != 0) { 1201 /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/ 1202 tailoredCE = utrie_get32(t->mapping, *conts, NULL); 1203 if(tailoredCE != UCOL_NOT_FOUND) { 1204 UBool needToAdd = TRUE; 1205 if(isCntTableElement(tailoredCE)) { 1206 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+1, status) == TRUE) { 1207 needToAdd = FALSE; 1208 } 1209 } 1210 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) { 1211 UCAElements elm; 1212 elm.cPoints = el.uchars; 1213 elm.noOfCEs = 0; 1214 elm.uchars[0] = *conts; 1215 elm.uchars[1] = 0; 1216 elm.cSize = 1; 1217 elm.prefixChars[0] = *(conts+2); 1218 elm.isThai = FALSE; 1219 elm.prefix = elm.prefixChars; 1220 elm.prefixSize = 1; 1221 UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm); 1222 if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) { 1223 needToAdd = TRUE; 1224 } 1225 } 1226 if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) { 1227 needToAdd = FALSE; 1228 } 1229 1230 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored. 1231 if (*(conts+1) != 0) { // contractions 1232 el.prefix = el.prefixChars; 1233 el.prefixSize = 0; 1234 el.cPoints = el.uchars; 1235 el.noOfCEs = 0; 1236 el.uchars[0] = *conts; 1237 el.uchars[1] = *(conts+1); 1238 if(*(conts+2)!=0) { 1239 el.uchars[2] = *(conts+2); 1240 el.cSize = 3; 1241 } else { 1242 el.cSize = 2; 1243 } 1244 ucol_setText(ucaEl, el.uchars, el.cSize, status); 1245 } 1246 else { // pre-context character 1247 UChar str[4] = { 0 }; 1248 int32_t len=0; 1249 int32_t preKeyLen=0; 1250 1251 el.cPoints = el.uchars; 1252 el.noOfCEs = 0; 1253 el.uchars[0] = *conts; 1254 el.uchars[1] = 0; 1255 el.cSize = 1; 1256 el.prefixChars[0] = *(conts+2); 1257 el.prefix = el.prefixChars; 1258 el.prefixSize = 1; 1259 if (el.prefixChars[0]!=0) { 1260 // get CE of prefix character first 1261 str[0]=el.prefixChars[0]; 1262 str[1]=0; 1263 ucol_setText(ucaEl, str, 1, status); 1264 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) 1265 != UCOL_NULLORDER) { 1266 preKeyLen++; // count number of keys for prefix character 1267 } 1268 str[len++] = el.prefixChars[0]; 1269 } 1270 1271 str[len++] = el.uchars[0]; 1272 str[len]=0; 1273 ucol_setText(ucaEl, str, len, status); 1274 // Skip the keys for prefix character, then copy the rest to el. 1275 while ((preKeyLen-->0) && 1276 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { 1277 continue; 1278 } 1279 1280 } 1281 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { 1282 el.noOfCEs++; 1283 } 1284 uprv_uca_addAnElement(t, &el, status); 1285 } 1286 1287 } else if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) { 1288 ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status); 1289 } 1290 conts+=3; 1291 } 1292 ucol_closeElements(ucaEl); 1293 } 1294 1295 // Add completely ignorable elements 1296 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t); 1297 1298 // add tailoring characters related canonical closures 1299 uprv_uca_canonicalClosure(t, src, status); 1300 1301 /* still need to produce compatibility closure */ 1302 1303 UCATableHeader *myData = uprv_uca_assembleTable(t, status); 1304 1305 uprv_uca_closeTempTable(t); 1306 uprv_free(image); 1307 1308 return myData; 1309 } 1310 1311 U_CDECL_BEGIN 1312 static UBool U_CALLCONV 1313 ucol_bld_cleanup(void) 1314 { 1315 udata_close(invUCA_DATA_MEM); 1316 invUCA_DATA_MEM = NULL; 1317 _staticInvUCA = NULL; 1318 return TRUE; 1319 } 1320 U_CDECL_END 1321 1322 U_CAPI const InverseUCATableHeader * U_EXPORT2 1323 ucol_initInverseUCA(UErrorCode *status) 1324 { 1325 if(U_FAILURE(*status)) return NULL; 1326 1327 UBool needsInit; 1328 UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit); 1329 1330 if(needsInit) { 1331 InverseUCATableHeader *newInvUCA = NULL; 1332 UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status); 1333 1334 if(U_FAILURE(*status)) { 1335 if (result) { 1336 udata_close(result); 1337 } 1338 // This is not needed, as we are talking about 1339 // memory we got from UData 1340 //uprv_free(newInvUCA); 1341 } 1342 1343 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */ 1344 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result); 1345 UCollator *UCA = ucol_initUCA(status); 1346 // UCA versions of UCA and inverse UCA should match 1347 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) { 1348 *status = U_INVALID_FORMAT_ERROR; 1349 udata_close(result); 1350 return NULL; 1351 } 1352 1353 umtx_lock(NULL); 1354 if(_staticInvUCA == NULL) { 1355 invUCA_DATA_MEM = result; 1356 _staticInvUCA = newInvUCA; 1357 result = NULL; 1358 newInvUCA = NULL; 1359 } 1360 umtx_unlock(NULL); 1361 1362 if(newInvUCA != NULL) { 1363 udata_close(result); 1364 // This is not needed, as we are talking about 1365 // memory we got from UData 1366 //uprv_free(newInvUCA); 1367 } 1368 else { 1369 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup); 1370 } 1371 } 1372 } 1373 return _staticInvUCA; 1374 } 1375 1376 #endif /* #if !UCONFIG_NO_COLLATION */ 1377