1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2001-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: ucol_bld.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created 02/22/2001 14 * created by: Vladimir Weinstein 15 * 16 * This module builds a collator based on the rule set. 17 * 18 */ 19 20 #include "unicode/utypes.h" 21 22 #if !UCONFIG_NO_COLLATION 23 24 #include "unicode/ucoleitr.h" 25 #include "unicode/udata.h" 26 #include "unicode/uchar.h" 27 #include "unicode/uniset.h" 28 #include "unicode/uscript.h" 29 #include "unicode/ustring.h" 30 #include "normalizer2impl.h" 31 #include "ucol_bld.h" 32 #include "ucol_elm.h" 33 #include "ucol_cnt.h" 34 #include "ucln_in.h" 35 #include "umutex.h" 36 #include "cmemory.h" 37 #include "cstring.h" 38 39 U_NAMESPACE_BEGIN 40 41 static const InverseUCATableHeader* _staticInvUCA = NULL; 42 static UDataMemory* invUCA_DATA_MEM = NULL; 43 44 U_CDECL_BEGIN 45 static UBool U_CALLCONV 46 isAcceptableInvUCA(void * /*context*/, 47 const char * /*type*/, const char * /*name*/, 48 const UDataInfo *pInfo) 49 { 50 /* context, type & name are intentionally not used */ 51 if( pInfo->size>=20 && 52 pInfo->isBigEndian==U_IS_BIG_ENDIAN && 53 pInfo->charsetFamily==U_CHARSET_FAMILY && 54 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */ 55 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 && 56 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 && 57 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 && 58 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 && 59 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&& 60 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 && 61 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 && 62 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 && 63 ) 64 { 65 UVersionInfo UCDVersion; 66 u_getUnicodeVersion(UCDVersion); 67 return (pInfo->dataVersion[0]==UCDVersion[0] && 68 pInfo->dataVersion[1]==UCDVersion[1]); 69 //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] && 70 //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] && 71 //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) { 72 } else { 73 return FALSE; 74 } 75 } 76 U_CDECL_END 77 78 /* 79 * Takes two CEs (lead and continuation) and 80 * compares them as CEs should be compared: 81 * primary vs. primary, secondary vs. secondary 82 * tertiary vs. tertiary 83 */ 84 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) { 85 uint32_t s1 = source0, s2, t1 = target0, t2; 86 if(isContinuation(source1)) { 87 s2 = source1; 88 } else { 89 s2 = 0; 90 } 91 if(isContinuation(target1)) { 92 t2 = target1; 93 } else { 94 t2 = 0; 95 } 96 97 uint32_t s = 0, t = 0; 98 if(s1 == t1 && s2 == t2) { 99 return 0; 100 } 101 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16); 102 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16); 103 if(s < t) { 104 return -1; 105 } else if(s > t) { 106 return 1; 107 } else { 108 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8; 109 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8; 110 if(s < t) { 111 return -1; 112 } else if(s > t) { 113 return 1; 114 } else { 115 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF); 116 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF); 117 if(s < t) { 118 return -1; 119 } else { 120 return 1; 121 } 122 } 123 } 124 } 125 126 static 127 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) { 128 uint32_t bottom = 0, top = src->invUCA->tableSize; 129 uint32_t i = 0; 130 uint32_t first = 0, second = 0; 131 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 132 int32_t res = 0; 133 134 while(bottom < top-1) { 135 i = (top+bottom)/2; 136 first = *(CETable+3*i); 137 second = *(CETable+3*i+1); 138 res = compareCEs(first, second, CE, SecondCE); 139 if(res > 0) { 140 top = i; 141 } else if(res < 0) { 142 bottom = i; 143 } else { 144 break; 145 } 146 } 147 148 /* weiv: */ 149 /* in searching for elements, I have removed the failure */ 150 /* The reason for this is that the builder does not rely */ 151 /* on search mechanism telling it that it didn't find an */ 152 /* element. However, indirect positioning relies on being */ 153 /* able to find the elements around any CE, even if it is */ 154 /* not defined in the UCA. */ 155 return i; 156 /* 157 if((first == CE && second == SecondCE)) { 158 return i; 159 } else { 160 return -1; 161 } 162 */ 163 } 164 165 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = { 166 0xFFFF0000, 167 0xFFFFFF00, 168 0xFFFFFFFF 169 }; 170 171 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src, 172 uint32_t CE, uint32_t contCE, 173 uint32_t *nextCE, uint32_t *nextContCE, 174 uint32_t strength) 175 { 176 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 177 int32_t iCE; 178 179 iCE = ucol_inv_findCE(src, CE, contCE); 180 181 if(iCE<0) { 182 *nextCE = UCOL_NOT_FOUND; 183 return -1; 184 } 185 186 CE &= strengthMask[strength]; 187 contCE &= strengthMask[strength]; 188 189 *nextCE = CE; 190 *nextContCE = contCE; 191 192 while((*nextCE & strengthMask[strength]) == CE 193 && (*nextContCE & strengthMask[strength]) == contCE) 194 { 195 *nextCE = (*(CETable+3*(++iCE))); 196 *nextContCE = (*(CETable+3*(iCE)+1)); 197 } 198 199 return iCE; 200 } 201 202 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src, 203 uint32_t CE, uint32_t contCE, 204 uint32_t *prevCE, uint32_t *prevContCE, 205 uint32_t strength) 206 { 207 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 208 int32_t iCE; 209 210 iCE = ucol_inv_findCE(src, CE, contCE); 211 212 if(iCE<0) { 213 *prevCE = UCOL_NOT_FOUND; 214 return -1; 215 } 216 217 CE &= strengthMask[strength]; 218 contCE &= strengthMask[strength]; 219 220 *prevCE = CE; 221 *prevContCE = contCE; 222 223 while((*prevCE & strengthMask[strength]) == CE 224 && (*prevContCE & strengthMask[strength])== contCE 225 && iCE > 0) /* this condition should prevent falling off the edge of the world */ 226 { 227 /* here, we end up in a singularity - zero */ 228 *prevCE = (*(CETable+3*(--iCE))); 229 *prevContCE = (*(CETable+3*(iCE)+1)); 230 } 231 232 return iCE; 233 } 234 235 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE, 236 uint32_t prevCE, uint32_t prevContCE) 237 { 238 if(prevCE == CE && prevContCE == contCE) { 239 return UCOL_IDENTICAL; 240 } 241 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY]) 242 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY])) 243 { 244 return UCOL_PRIMARY; 245 } 246 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY]) 247 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY])) 248 { 249 return UCOL_SECONDARY; 250 } 251 return UCOL_TERTIARY; 252 } 253 254 255 /*static 256 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) { 257 258 uint32_t CE = lh->baseCE; 259 uint32_t SecondCE = lh->baseContCE; 260 261 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 262 uint32_t previousCE, previousContCE; 263 int32_t iCE; 264 265 iCE = ucol_inv_findCE(src, CE, SecondCE); 266 267 if(iCE<0) { 268 return -1; 269 } 270 271 CE &= strengthMask[strength]; 272 SecondCE &= strengthMask[strength]; 273 274 previousCE = CE; 275 previousContCE = SecondCE; 276 277 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) { 278 previousCE = (*(CETable+3*(--iCE))); 279 previousContCE = (*(CETable+3*(iCE)+1)); 280 } 281 lh->previousCE = previousCE; 282 lh->previousContCE = previousContCE; 283 284 return iCE; 285 }*/ 286 287 static 288 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) { 289 uint32_t CE = lh->baseCE; 290 uint32_t SecondCE = lh->baseContCE; 291 292 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 293 uint32_t nextCE, nextContCE; 294 int32_t iCE; 295 296 iCE = ucol_inv_findCE(src, CE, SecondCE); 297 298 if(iCE<0) { 299 return -1; 300 } 301 302 CE &= strengthMask[strength]; 303 SecondCE &= strengthMask[strength]; 304 305 nextCE = CE; 306 nextContCE = SecondCE; 307 308 while((nextCE & strengthMask[strength]) == CE 309 && (nextContCE & strengthMask[strength]) == SecondCE) 310 { 311 nextCE = (*(CETable+3*(++iCE))); 312 nextContCE = (*(CETable+3*(iCE)+1)); 313 } 314 315 lh->nextCE = nextCE; 316 lh->nextContCE = nextContCE; 317 318 return iCE; 319 } 320 321 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) { 322 /* reset all the gaps */ 323 int32_t i = 0; 324 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 325 uint32_t st = 0; 326 uint32_t t1, t2; 327 int32_t pos; 328 329 UColToken *tok = lh->first; 330 uint32_t tokStrength = tok->strength; 331 332 for(i = 0; i<3; i++) { 333 lh->gapsHi[3*i] = 0; 334 lh->gapsHi[3*i+1] = 0; 335 lh->gapsHi[3*i+2] = 0; 336 lh->gapsLo[3*i] = 0; 337 lh->gapsLo[3*i+1] = 0; 338 lh->gapsLo[3*i+2] = 0; 339 lh->numStr[i] = 0; 340 lh->fStrToken[i] = NULL; 341 lh->lStrToken[i] = NULL; 342 lh->pos[i] = -1; 343 } 344 345 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); 346 347 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ 348 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */ 349 lh->pos[0] = 0; 350 t1 = lh->baseCE; 351 t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION; 352 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 353 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 354 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 355 uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16); 356 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1); 357 358 t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; 359 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER; 360 361 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 362 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 363 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 364 } else if(lh->indirect == TRUE && lh->nextCE != 0) { 365 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) { 366 lh->pos[0] = 0; 367 t1 = lh->baseCE; 368 t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION; 369 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 370 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 371 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 372 t1 = lh->nextCE; 373 t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION; 374 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 375 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 376 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 377 } else { 378 for(;;) { 379 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { 380 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) { 381 lh->fStrToken[tokStrength] = tok; 382 } else { /* The CE must be implicit, since it's not in the table */ 383 /* Error */ 384 *status = U_INTERNAL_PROGRAM_ERROR; 385 } 386 } 387 388 while(tok != NULL && tok->strength >= tokStrength) { 389 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { 390 lh->lStrToken[tokStrength] = tok; 391 } 392 tok = tok->next; 393 } 394 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) { 395 /* check if previous interval is the same and merge the intervals if it is so */ 396 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) { 397 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1]; 398 lh->fStrToken[tokStrength+1] = NULL; 399 lh->lStrToken[tokStrength+1] = NULL; 400 lh->pos[tokStrength+1] = -1; 401 } 402 } 403 if(tok != NULL) { 404 tokStrength = tok->strength; 405 } else { 406 break; 407 } 408 } 409 for(st = 0; st < 3; st++) { 410 if((pos = lh->pos[st]) >= 0) { 411 t1 = *(CETable+3*(pos)); 412 t2 = *(CETable+3*(pos)+1); 413 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 414 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 415 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 416 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; 417 //pos--; 418 //t1 = *(CETable+3*(pos)); 419 //t2 = *(CETable+3*(pos)+1); 420 t1 = lh->baseCE; 421 t2 = lh->baseContCE; 422 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 423 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 424 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; 425 } 426 } 427 } 428 } 429 430 431 #define ucol_countBytes(value, noOfBytes) \ 432 { \ 433 uint32_t mask = 0xFFFFFFFF; \ 434 (noOfBytes) = 0; \ 435 while(mask != 0) { \ 436 if(((value) & mask) != 0) { \ 437 (noOfBytes)++; \ 438 } \ 439 mask >>= 8; \ 440 } \ 441 } 442 443 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) { 444 if(U_SUCCESS(*status)) { 445 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); 446 } 447 return g->current; 448 } 449 450 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) { 451 /* TODO: rename to enum names */ 452 uint32_t high, low, count=1; 453 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF; 454 455 if(strength == UCOL_SECONDARY) { 456 low = UCOL_COMMON_TOP2<<24; 457 high = 0xFFFFFFFF; 458 count = 0xFF - UCOL_COMMON_TOP2; 459 } else { 460 low = UCOL_BYTE_COMMON << 24; //0x05000000; 461 high = 0x40000000; 462 count = 0x40 - UCOL_BYTE_COMMON; 463 } 464 465 if(tok->next != NULL && tok->next->strength == strength) { 466 count = tok->next->toInsert; 467 } 468 469 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); 470 g->current = UCOL_BYTE_COMMON<<24; 471 472 if(g->noOfRanges == 0) { 473 *status = U_INTERNAL_PROGRAM_ERROR; 474 } 475 return g->current; 476 } 477 478 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) { 479 uint32_t strength = tok->strength; 480 uint32_t low = lows[fStrength*3+strength]; 481 uint32_t high = highs[fStrength*3+strength]; 482 uint32_t maxByte = 0; 483 if(strength == UCOL_TERTIARY) { 484 maxByte = 0x3F; 485 } else if(strength == UCOL_PRIMARY) { 486 maxByte = 0xFE; 487 } else { 488 maxByte = 0xFF; 489 } 490 491 uint32_t count = tok->toInsert; 492 493 if(low >= high && strength > UCOL_PRIMARY) { 494 int32_t s = strength; 495 for(;;) { 496 s--; 497 if(lows[fStrength*3+s] != highs[fStrength*3+s]) { 498 if(strength == UCOL_SECONDARY) { 499 if (low < UCOL_COMMON_TOP2<<24 ) { 500 // Override if low range is less than UCOL_COMMON_TOP2. 501 low = UCOL_COMMON_TOP2<<24; 502 } 503 high = 0xFFFFFFFF; 504 } else { 505 // Override if low range is less than UCOL_COMMON_BOT3. 506 if ( low < UCOL_COMMON_BOT3<<24 ) { 507 low = UCOL_COMMON_BOT3<<24; 508 } 509 high = 0x40000000; 510 } 511 break; 512 } 513 if(s<0) { 514 *status = U_INTERNAL_PROGRAM_ERROR; 515 return 0; 516 } 517 } 518 } 519 520 if(low < 0x02000000) { 521 // We must not use CE weight byte 02, so we set it as the minimum lower bound. 522 // See http://site.icu-project.org/design/collation/bytes 523 low = 0x02000000; 524 } 525 526 if(strength == UCOL_SECONDARY) { /* similar as simple */ 527 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) { 528 low = UCOL_COMMON_TOP2<<24; 529 } 530 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) { 531 high = UCOL_COMMON_TOP2<<24; 532 } 533 if(low < (UCOL_COMMON_BOT2<<24)) { 534 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges); 535 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); 536 //g->current = UCOL_COMMON_BOT2<<24; 537 return g->current; 538 } 539 } 540 541 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); 542 if(g->noOfRanges == 0) { 543 *status = U_INTERNAL_PROGRAM_ERROR; 544 } 545 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); 546 return g->current; 547 } 548 549 static 550 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { 551 uint32_t i = 0; 552 UChar c; 553 554 if(U_FAILURE(*status)) { 555 return 0; 556 } 557 558 if(sourceLen > resLen) { 559 *status = U_MEMORY_ALLOCATION_ERROR; 560 return 0; 561 } 562 563 for(i = 0; i < sourceLen; i++) { 564 c = source[i]; 565 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ 566 switch(c - 0x3000) { 567 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E: 568 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE: 569 c++; 570 break; 571 case 0xF5: 572 c = 0x30AB; 573 break; 574 case 0xF6: 575 c = 0x30B1; 576 break; 577 } 578 } 579 resBuf[i] = c; 580 } 581 return sourceLen; 582 } 583 584 static 585 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { 586 uint32_t i = 0; 587 UChar c; 588 589 if(U_FAILURE(*status)) { 590 return 0; 591 } 592 593 if(sourceLen > resLen) { 594 *status = U_MEMORY_ALLOCATION_ERROR; 595 return 0; 596 } 597 598 for(i = 0; i < sourceLen; i++) { 599 c = source[i]; 600 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ 601 switch(c - 0x3000) { 602 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F: 603 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF: 604 c--; 605 break; 606 case 0xAB: 607 c = 0x30F5; 608 break; 609 case 0xB1: 610 c = 0x30F6; 611 break; 612 } 613 } 614 resBuf[i] = c; 615 } 616 return sourceLen; 617 } 618 619 static 620 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) { 621 uint32_t i = 0; 622 UChar n[128]; 623 uint32_t nLen = 0; 624 uint32_t uCount = 0, lCount = 0; 625 626 collIterate s; 627 uint32_t order = 0; 628 629 if(U_FAILURE(*status)) { 630 return UCOL_LOWER_CASE; 631 } 632 633 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status); 634 if(U_SUCCESS(*status)) { 635 for(i = 0; i < nLen; i++) { 636 uprv_init_collIterate(UCA, &n[i], 1, &s, status); 637 order = ucol_getNextCE(UCA, &s, status); 638 if(isContinuation(order)) { 639 *status = U_INTERNAL_PROGRAM_ERROR; 640 return UCOL_LOWER_CASE; 641 } 642 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) { 643 uCount++; 644 } else { 645 if(u_islower(n[i])) { 646 lCount++; 647 } else if(U_SUCCESS(*status)) { 648 UChar sk[1], lk[1]; 649 u_toSmallKana(&n[i], 1, sk, 1, status); 650 u_toLargeKana(&n[i], 1, lk, 1, status); 651 if(sk[0] == n[i] && lk[0] != n[i]) { 652 lCount++; 653 } 654 } 655 } 656 } 657 } 658 659 if(uCount != 0 && lCount != 0) { 660 return UCOL_MIXED_CASE; 661 } else if(uCount != 0) { 662 return UCOL_UPPER_CASE; 663 } else { 664 return UCOL_LOWER_CASE; 665 } 666 } 667 668 669 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) { 670 /* this one makes the table and stuff */ 671 uint32_t noOfBytes[3]; 672 uint32_t i; 673 674 for(i = 0; i<3; i++) { 675 ucol_countBytes(CEparts[i], noOfBytes[i]); 676 } 677 678 /* Here we have to pack CEs from parts */ 679 680 uint32_t CEi = 0; 681 uint32_t value = 0; 682 683 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) { 684 if(CEi > 0) { 685 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ 686 } else { 687 value = 0; 688 } 689 690 if(2*CEi<noOfBytes[0]) { 691 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16; 692 } 693 if(CEi<noOfBytes[1]) { 694 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8; 695 } 696 if(CEi<noOfBytes[2]) { 697 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F); 698 } 699 tok->CEs[CEi] = value; 700 CEi++; 701 } 702 if(CEi == 0) { /* totally ignorable */ 703 tok->noOfCEs = 1; 704 tok->CEs[0] = 0; 705 } else { /* there is at least something */ 706 tok->noOfCEs = CEi; 707 } 708 709 710 // we want to set case bits here and now, not later. 711 // Case bits handling 712 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables 713 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field 714 int32_t cSize = (tok->source & 0xFF000000) >> 24; 715 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source; 716 717 if(cSize > 1) { 718 // Do it manually 719 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status); 720 } else { 721 // Copy it from the UCA 722 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status); 723 tok->CEs[0] |= (caseCE & 0xC0); 724 } 725 } 726 727 #if UCOL_DEBUG==2 728 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2])); 729 for(i = 0; i<tok->noOfCEs; i++) { 730 fprintf(stderr, "%08X ", tok->CEs[i]); 731 } 732 fprintf(stderr, "\n"); 733 #endif 734 } 735 736 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) { 737 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT]; 738 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT]; 739 740 UColToken *tok = lh->last; 741 uint32_t t[UCOL_STRENGTH_LIMIT]; 742 743 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t)); 744 745 tok->toInsert = 1; 746 t[tok->strength] = 1; 747 748 while(tok->previous != NULL) { 749 if(tok->previous->strength < tok->strength) { /* going up */ 750 t[tok->strength] = 0; 751 t[tok->previous->strength]++; 752 } else if(tok->previous->strength > tok->strength) { /* going down */ 753 t[tok->previous->strength] = 1; 754 } else { 755 t[tok->strength]++; 756 } 757 tok=tok->previous; 758 tok->toInsert = t[tok->strength]; 759 } 760 761 tok->toInsert = t[tok->strength]; 762 ucol_inv_getGapPositions(src, lh, status); 763 764 #if UCOL_DEBUG 765 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE); 766 int32_t j = 2; 767 for(j = 2; j >= 0; j--) { 768 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]); 769 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]); 770 } 771 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE]; 772 773 do { 774 fprintf(stderr,"%i", tok->strength); 775 tok = tok->next; 776 } while(tok != NULL); 777 fprintf(stderr, "\n"); 778 779 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE]; 780 781 do { 782 fprintf(stderr,"%i", tok->toInsert); 783 tok = tok->next; 784 } while(tok != NULL); 785 #endif 786 787 tok = lh->first; 788 uint32_t fStrength = UCOL_IDENTICAL; 789 uint32_t initStrength = UCOL_IDENTICAL; 790 791 792 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16; 793 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8; 794 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16; 795 796 while (tok != NULL && U_SUCCESS(*status)) { 797 fStrength = tok->strength; 798 if(fStrength < initStrength) { 799 initStrength = fStrength; 800 if(lh->pos[fStrength] == -1) { 801 while(lh->pos[fStrength] == -1 && fStrength > 0) { 802 fStrength--; 803 } 804 if(lh->pos[fStrength] == -1) { 805 *status = U_INTERNAL_PROGRAM_ERROR; 806 return; 807 } 808 } 809 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */ 810 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; 811 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1]; 812 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */ 813 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); 814 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */ 815 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; 816 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/ 817 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); 818 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); 819 } else { /* primaries */ 820 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/ 821 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); 822 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status); 823 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); 824 } 825 } else { 826 if(tok->strength == UCOL_TERTIARY) { 827 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status); 828 } else if(tok->strength == UCOL_SECONDARY) { 829 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status); 830 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); 831 } else if(tok->strength == UCOL_PRIMARY) { 832 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status); 833 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status); 834 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); 835 } 836 } 837 ucol_doCE(src, CEparts, tok, status); 838 tok = tok->next; 839 } 840 } 841 842 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) { 843 UCAElements el; 844 UColToken *tok = lh->first; 845 UColToken *expt = NULL; 846 uint32_t i = 0, j = 0; 847 UChar32 fcdHighStart; 848 const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); 849 850 while(tok != NULL && U_SUCCESS(*status)) { 851 /* first, check if there are any expansions */ 852 /* if there are expansions, we need to do a little bit more processing */ 853 /* since parts of expansion can be tailored, while others are not */ 854 if(tok->expansion != 0) { 855 uint32_t len = tok->expansion >> 24; 856 uint32_t currentSequenceLen = len; 857 uint32_t expOffset = tok->expansion & 0x00FFFFFF; 858 //uint32_t exp = currentSequenceLen | expOffset; 859 UColToken exp; 860 exp.source = currentSequenceLen | expOffset; 861 exp.rulesToParseHdl = &(src->source); 862 863 while(len > 0) { 864 currentSequenceLen = len; 865 while(currentSequenceLen > 0) { 866 exp.source = (currentSequenceLen << 24) | expOffset; 867 if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */ 868 uint32_t noOfCEsToCopy = expt->noOfCEs; 869 for(j = 0; j<noOfCEsToCopy; j++) { 870 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j]; 871 } 872 tok->noOfExpCEs += noOfCEsToCopy; 873 // Smart people never try to add codepoints and CEs. 874 // For some odd reason, it won't work. 875 expOffset += currentSequenceLen; //noOfCEsToCopy; 876 len -= currentSequenceLen; //noOfCEsToCopy; 877 break; 878 } else { 879 currentSequenceLen--; 880 } 881 } 882 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */ 883 /* will have to get one from UCA */ 884 /* first, get the UChars from the rules */ 885 /* then pick CEs out until there is no more and stuff them into expansion */ 886 collIterate s; 887 uint32_t order = 0; 888 uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status); 889 890 for(;;) { 891 order = ucol_getNextCE(src->UCA, &s, status); 892 if(order == UCOL_NO_MORE_CES) { 893 break; 894 } 895 tok->expCEs[tok->noOfExpCEs++] = order; 896 } 897 expOffset++; 898 len--; 899 } 900 } 901 } else { 902 tok->noOfExpCEs = 0; 903 } 904 905 /* set the ucaelement with obtained values */ 906 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs; 907 /* copy CEs */ 908 for(i = 0; i<tok->noOfCEs; i++) { 909 el.CEs[i] = tok->CEs[i]; 910 } 911 for(i = 0; i<tok->noOfExpCEs; i++) { 912 el.CEs[i+tok->noOfCEs] = tok->expCEs[i]; 913 } 914 915 /* copy UChars */ 916 // We kept prefix and source kind of together, as it is a kind of a contraction. 917 // However, now we have to slice the prefix off the main thing - 918 el.prefix = el.prefixChars; 919 el.cPoints = el.uchars; 920 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the 921 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND 922 // decomposed elements to the unsaf table. 923 el.prefixSize = tok->prefix>>24; 924 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar)); 925 926 el.cSize = (tok->source >> 24)-(tok->prefix>>24); 927 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar)); 928 } else { 929 el.prefixSize = 0; 930 *el.prefix = 0; 931 932 el.cSize = (tok->source >> 24); 933 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar)); 934 } 935 if(src->UCA != NULL) { 936 for(i = 0; i<el.cSize; i++) { 937 if(UCOL_ISJAMO(el.cPoints[i])) { 938 t->image->jamoSpecial = TRUE; 939 } 940 } 941 if (!src->buildCCTabFlag && el.cSize > 0) { 942 // Check the trailing canonical combining class (tccc) of the last character. 943 const UChar *s = el.cPoints + el.cSize; 944 uint16_t fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, el.cPoints, s); 945 if ((fcd & 0xff) != 0) { 946 src->buildCCTabFlag = TRUE; 947 } 948 } 949 } 950 951 /* and then, add it */ 952 #if UCOL_DEBUG==2 953 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]); 954 #endif 955 uprv_uca_addAnElement(t, &el, status); 956 957 #if UCOL_DEBUG_DUPLICATES 958 if(*status != U_ZERO_ERROR) { 959 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource); 960 *status = U_ZERO_ERROR; 961 } 962 #endif 963 964 tok = tok->next; 965 } 966 } 967 968 U_CDECL_BEGIN 969 static UBool U_CALLCONV 970 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) { 971 UErrorCode status = U_ZERO_ERROR; 972 tempUCATable *t = (tempUCATable *)context; 973 if(value == 0) { 974 while(start < limit) { 975 uint32_t CE = utrie_get32(t->mapping, start, NULL); 976 if(CE == UCOL_NOT_FOUND) { 977 UCAElements el; 978 el.isThai = FALSE; 979 el.prefixSize = 0; 980 el.prefixChars[0] = 0; 981 el.prefix = el.prefixChars; 982 el.cPoints = el.uchars; 983 984 el.cSize = 0; 985 UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start); 986 987 el.noOfCEs = 1; 988 el.CEs[0] = 0; 989 uprv_uca_addAnElement(t, &el, &status); 990 991 } 992 start++; 993 } 994 } 995 if(U_FAILURE(status)) { 996 return FALSE; 997 } else { 998 return TRUE; 999 } 1000 } 1001 U_CDECL_END 1002 1003 static void 1004 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t, 1005 UChar32 start, UChar32 end, 1006 UErrorCode *status) 1007 { 1008 //UChar decomp[256]; 1009 uint32_t CE = UCOL_NOT_FOUND; 1010 UChar32 u = 0; 1011 UCAElements el; 1012 el.isThai = FALSE; 1013 el.prefixSize = 0; 1014 el.prefixChars[0] = 0; 1015 collIterate colIt; 1016 1017 if(U_SUCCESS(*status)) { 1018 for(u = start; u<=end; u++) { 1019 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND 1020 /* this test is for contractions that are missing the starting element. */ 1021 || ((isCntTableElement(CE)) && 1022 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND)) 1023 ) 1024 { 1025 el.cSize = 0; 1026 U16_APPEND_UNSAFE(el.uchars, el.cSize, u); 1027 //decomp[0] = (UChar)u; 1028 //el.uchars[0] = (UChar)u; 1029 el.cPoints = el.uchars; 1030 //el.cSize = 1; 1031 el.noOfCEs = 0; 1032 el.prefix = el.prefixChars; 1033 el.prefixSize = 0; 1034 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt); 1035 // We actually want to check whether this element is a special 1036 // If it is an implicit element (hangul, CJK - we want to copy the 1037 // special, not the resolved CEs) - for hangul, copying resolved 1038 // would just make things the same (there is an expansion and it 1039 // takes approximately the same amount of time to resolve as 1040 // falling back to the UCA). 1041 /* 1042 UTRIE_GET32(src->UCA->mapping, u, CE); 1043 tag = getCETag(CE); 1044 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG 1045 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG 1046 || tag == LEAD_SURROGATE_TAG) { 1047 el.CEs[el.noOfCEs++] = CE; 1048 } else { 1049 */ 1050 // It turns out that it does not make sense to keep implicits 1051 // unresolved. The cost of resolving them is big enough so that 1052 // it doesn't make any difference whether we have to go to the UCA 1053 // or not. 1054 { 1055 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status); 1056 while(CE != UCOL_NO_MORE_CES) { 1057 CE = ucol_getNextCE(src->UCA, &colIt, status); 1058 if(CE != UCOL_NO_MORE_CES) { 1059 el.CEs[el.noOfCEs++] = CE; 1060 } 1061 } 1062 } 1063 uprv_uca_addAnElement(t, &el, status); 1064 } 1065 } 1066 } 1067 } 1068 1069 U_CFUNC UCATableHeader * 1070 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) { 1071 U_NAMESPACE_USE 1072 1073 uint32_t i = 0; 1074 if(U_FAILURE(*status)) { 1075 return NULL; 1076 } 1077 /* 1078 2. Eliminate the negative lists by doing the following for each non-null negative list: 1079 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE, 1080 create new ListHeader X 1081 o reverse the list, add to the end of X's positive list. Reset the strength of the 1082 first item you add, based on the stronger strength levels of the two lists. 1083 */ 1084 /* 1085 3. For each ListHeader with a non-null positive list: 1086 */ 1087 /* 1088 o Find all character strings with CEs between the baseCE and the 1089 next/previous CE, at the strength of the first token. Add these to the 1090 tailoring. 1091 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the 1092 tailoring has & x < z... 1093 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ... 1094 */ 1095 /* It is possible that this part should be done even while constructing list */ 1096 /* The problem is that it is unknown what is going to be the strongest weight */ 1097 /* So we might as well do it here */ 1098 1099 /* 1100 o Allocate CEs for each token in the list, based on the total number N of the 1101 largest level difference, and the gap G between baseCE and nextCE at that 1102 level. The relation * between the last item and nextCE is the same as the 1103 strongest strength. 1104 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1) 1105 ? There are 3 primary items: a, d, e. Fit them into the primary gap. 1106 Then fit b and c into the secondary gap between a and d, then fit q 1107 into the tertiary gap between b and c. 1108 1109 o Example: baseCE << b <<< q << c * nextCE(X,2) 1110 ? There are 2 secondary items: b, c. Fit them into the secondary gap. 1111 Then fit q into the tertiary gap between b and c. 1112 o When incrementing primary values, we will not cross high byte 1113 boundaries except where there is only a single-byte primary. That is to 1114 ensure that the script reordering will continue to work. 1115 */ 1116 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader)); 1117 /* test for NULL */ 1118 if (image == NULL) { 1119 *status = U_MEMORY_ALLOCATION_ERROR; 1120 return NULL; 1121 } 1122 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader)); 1123 1124 for(i = 0; i<src->resultLen; i++) { 1125 /* now we need to generate the CEs */ 1126 /* We stuff the initial value in the buffers, and increase the appropriate buffer */ 1127 /* According to strength */ 1128 if(U_SUCCESS(*status)) { 1129 if(src->lh[i].first) { // if there are any elements 1130 // due to the way parser works, subsequent tailorings 1131 // may remove all the elements from a sequence, therefore 1132 // leaving an empty tailoring sequence. 1133 ucol_initBuffers(src, &src->lh[i], status); 1134 } 1135 } 1136 if(U_FAILURE(*status)) { 1137 uprv_free(image); 1138 return NULL; 1139 } 1140 } 1141 1142 if(src->varTop != NULL) { /* stuff the variable top value */ 1143 src->opts->variableTopValue = (*(src->varTop->CEs))>>16; 1144 /* remove it from the list */ 1145 if(src->varTop->listHeader->first == src->varTop) { /* first in list */ 1146 src->varTop->listHeader->first = src->varTop->next; 1147 } 1148 if(src->varTop->listHeader->last == src->varTop) { /* first in list */ 1149 src->varTop->listHeader->last = src->varTop->previous; 1150 } 1151 if(src->varTop->next != NULL) { 1152 src->varTop->next->previous = src->varTop->previous; 1153 } 1154 if(src->varTop->previous != NULL) { 1155 src->varTop->previous->next = src->varTop->next; 1156 } 1157 } 1158 1159 1160 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status); 1161 if(U_FAILURE(*status)) { 1162 uprv_free(image); 1163 return NULL; 1164 } 1165 1166 1167 /* After this, we have assigned CE values to all regular CEs */ 1168 /* now we will go through list once more and resolve expansions, */ 1169 /* make UCAElements structs and add them to table */ 1170 for(i = 0; i<src->resultLen; i++) { 1171 /* now we need to generate the CEs */ 1172 /* We stuff the initial value in the buffers, and increase the appropriate buffer */ 1173 /* According to strength */ 1174 if(U_SUCCESS(*status)) { 1175 ucol_createElements(src, t, &src->lh[i], status); 1176 } 1177 } 1178 1179 UCAElements el; 1180 el.isThai = FALSE; 1181 el.prefixSize = 0; 1182 el.prefixChars[0] = 0; 1183 1184 /* add latin-1 stuff */ 1185 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status); 1186 1187 /* add stuff for copying */ 1188 if(src->copySet != NULL) { 1189 int32_t i = 0; 1190 UnicodeSet *set = (UnicodeSet *)src->copySet; 1191 for(i = 0; i < set->getRangeCount(); i++) { 1192 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status); 1193 } 1194 } 1195 1196 if(U_SUCCESS(*status)) { 1197 /* copy contractions from the UCA - this is felt mostly for cyrillic*/ 1198 1199 uint32_t tailoredCE = UCOL_NOT_FOUND; 1200 //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants)); 1201 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos); 1202 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status); 1203 // Check for null pointer 1204 if (ucaEl == NULL) { 1205 *status = U_MEMORY_ALLOCATION_ERROR; 1206 return NULL; 1207 } 1208 while(*conts != 0) { 1209 /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/ 1210 tailoredCE = utrie_get32(t->mapping, *conts, NULL); 1211 if(tailoredCE != UCOL_NOT_FOUND) { 1212 UBool needToAdd = TRUE; 1213 if(isCntTableElement(tailoredCE)) { 1214 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+1, status) == TRUE) { 1215 needToAdd = FALSE; 1216 } 1217 } 1218 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) { 1219 UCAElements elm; 1220 elm.cPoints = el.uchars; 1221 elm.noOfCEs = 0; 1222 elm.uchars[0] = *conts; 1223 elm.uchars[1] = 0; 1224 elm.cSize = 1; 1225 elm.prefixChars[0] = *(conts+2); 1226 elm.isThai = FALSE; 1227 elm.prefix = elm.prefixChars; 1228 elm.prefixSize = 1; 1229 UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm); 1230 if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) { 1231 needToAdd = TRUE; 1232 } 1233 } 1234 if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) { 1235 needToAdd = FALSE; 1236 } 1237 1238 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored. 1239 if (*(conts+1) != 0) { // contractions 1240 el.prefix = el.prefixChars; 1241 el.prefixSize = 0; 1242 el.cPoints = el.uchars; 1243 el.noOfCEs = 0; 1244 el.uchars[0] = *conts; 1245 el.uchars[1] = *(conts+1); 1246 if(*(conts+2)!=0) { 1247 el.uchars[2] = *(conts+2); 1248 el.cSize = 3; 1249 } else { 1250 el.cSize = 2; 1251 } 1252 ucol_setText(ucaEl, el.uchars, el.cSize, status); 1253 } 1254 else { // pre-context character 1255 UChar str[4] = { 0 }; 1256 int32_t len=0; 1257 int32_t preKeyLen=0; 1258 1259 el.cPoints = el.uchars; 1260 el.noOfCEs = 0; 1261 el.uchars[0] = *conts; 1262 el.uchars[1] = 0; 1263 el.cSize = 1; 1264 el.prefixChars[0] = *(conts+2); 1265 el.prefix = el.prefixChars; 1266 el.prefixSize = 1; 1267 if (el.prefixChars[0]!=0) { 1268 // get CE of prefix character first 1269 str[0]=el.prefixChars[0]; 1270 str[1]=0; 1271 ucol_setText(ucaEl, str, 1, status); 1272 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) 1273 != UCOL_NULLORDER) { 1274 preKeyLen++; // count number of keys for prefix character 1275 } 1276 str[len++] = el.prefixChars[0]; 1277 } 1278 1279 str[len++] = el.uchars[0]; 1280 str[len]=0; 1281 ucol_setText(ucaEl, str, len, status); 1282 // Skip the keys for prefix character, then copy the rest to el. 1283 while ((preKeyLen-->0) && 1284 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { 1285 continue; 1286 } 1287 1288 } 1289 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { 1290 el.noOfCEs++; 1291 } 1292 uprv_uca_addAnElement(t, &el, status); 1293 } 1294 1295 } else if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) { 1296 ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status); 1297 } 1298 conts+=3; 1299 } 1300 ucol_closeElements(ucaEl); 1301 } 1302 1303 // Add completely ignorable elements 1304 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t); 1305 1306 // add tailoring characters related canonical closures 1307 uprv_uca_canonicalClosure(t, src, NULL, status); 1308 1309 /* still need to produce compatibility closure */ 1310 1311 UCATableHeader *myData = uprv_uca_assembleTable(t, status); 1312 1313 uprv_uca_closeTempTable(t); 1314 uprv_free(image); 1315 1316 return myData; 1317 } 1318 1319 U_CDECL_BEGIN 1320 static UBool U_CALLCONV 1321 ucol_bld_cleanup(void) 1322 { 1323 udata_close(invUCA_DATA_MEM); 1324 invUCA_DATA_MEM = NULL; 1325 _staticInvUCA = NULL; 1326 return TRUE; 1327 } 1328 U_CDECL_END 1329 1330 U_CAPI const InverseUCATableHeader * U_EXPORT2 1331 ucol_initInverseUCA(UErrorCode *status) 1332 { 1333 if(U_FAILURE(*status)) return NULL; 1334 1335 UBool needsInit; 1336 UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit); 1337 1338 if(needsInit) { 1339 InverseUCATableHeader *newInvUCA = NULL; 1340 UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status); 1341 1342 if(U_FAILURE(*status)) { 1343 if (result) { 1344 udata_close(result); 1345 } 1346 // This is not needed, as we are talking about 1347 // memory we got from UData 1348 //uprv_free(newInvUCA); 1349 } 1350 1351 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */ 1352 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result); 1353 UCollator *UCA = ucol_initUCA(status); 1354 // UCA versions of UCA and inverse UCA should match 1355 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) { 1356 *status = U_INVALID_FORMAT_ERROR; 1357 udata_close(result); 1358 return NULL; 1359 } 1360 1361 umtx_lock(NULL); 1362 if(_staticInvUCA == NULL) { 1363 invUCA_DATA_MEM = result; 1364 _staticInvUCA = newInvUCA; 1365 result = NULL; 1366 newInvUCA = NULL; 1367 } 1368 umtx_unlock(NULL); 1369 1370 if(newInvUCA != NULL) { 1371 udata_close(result); 1372 // This is not needed, as we are talking about 1373 // memory we got from UData 1374 //uprv_free(newInvUCA); 1375 } 1376 else { 1377 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup); 1378 } 1379 } 1380 } 1381 return _staticInvUCA; 1382 } 1383 1384 /* This is the data that is used for non-script reordering codes. These _must_ be kept 1385 * in order that they are to be applied as defaults and in synch with the UColReorderCode enum. 1386 */ 1387 static const char* ReorderingTokenNames[] = { 1388 "SPACE", 1389 "PUNCT", 1390 "SYMBOL", 1391 "CURRENCY", 1392 "DIGIT", 1393 NULL 1394 }; 1395 1396 static void toUpper(const char* src, char* dst, uint32_t length) { 1397 for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) { 1398 *dst = toupper(*src); 1399 } 1400 *dst = '\0'; 1401 } 1402 1403 U_INTERNAL int32_t U_EXPORT2 1404 ucol_findReorderingEntry(const char* name) { 1405 char buffer[32]; 1406 toUpper(name, buffer, 32); 1407 for (uint32_t entry = 0; ReorderingTokenNames[entry] != NULL; entry++) { 1408 if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) { 1409 return entry + UCOL_REORDER_CODE_FIRST; 1410 } 1411 } 1412 return USCRIPT_INVALID_CODE; 1413 } 1414 1415 U_NAMESPACE_END 1416 1417 #endif /* #if !UCONFIG_NO_COLLATION */ 1418