1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2001-2013, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: ucol_bld.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created 02/22/2001 14 * created by: Vladimir Weinstein 15 * 16 * This module builds a collator based on the rule set. 17 * 18 */ 19 20 #include "unicode/utypes.h" 21 22 #if !UCONFIG_NO_COLLATION 23 24 #include "unicode/ucoleitr.h" 25 #include "unicode/udata.h" 26 #include "unicode/uchar.h" 27 #include "unicode/uniset.h" 28 #include "unicode/uscript.h" 29 #include "unicode/ustring.h" 30 #include "unicode/utf16.h" 31 #include "normalizer2impl.h" 32 #include "uassert.h" 33 #include "ucol_bld.h" 34 #include "ucol_elm.h" 35 #include "ucol_cnt.h" 36 #include "ucln_in.h" 37 #include "umutex.h" 38 #include "cmemory.h" 39 #include "cstring.h" 40 41 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 42 43 static const InverseUCATableHeader* _staticInvUCA = NULL; 44 static UDataMemory* invUCA_DATA_MEM = NULL; 45 static icu::UInitOnce gStaticInvUCAInitOnce = U_INITONCE_INITIALIZER; 46 47 U_CDECL_BEGIN 48 static UBool U_CALLCONV 49 isAcceptableInvUCA(void * /*context*/, 50 const char * /*type*/, const char * /*name*/, 51 const UDataInfo *pInfo) 52 { 53 /* context, type & name are intentionally not used */ 54 if( pInfo->size>=20 && 55 pInfo->isBigEndian==U_IS_BIG_ENDIAN && 56 pInfo->charsetFamily==U_CHARSET_FAMILY && 57 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */ 58 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 && 59 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 && 60 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 && 61 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 && 62 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&& 63 
//pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 && 64 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 && 65 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 && 66 ) 67 { 68 // TODO: Check that the invuca data version (pInfo->dataVersion) 69 // matches the ucadata version. 70 return TRUE; 71 } else { 72 return FALSE; 73 } 74 } 75 U_CDECL_END 76 77 /* 78 * Takes two CEs (lead and continuation) and 79 * compares them as CEs should be compared: 80 * primary vs. primary, secondary vs. secondary 81 * tertiary vs. tertiary 82 */ 83 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) { 84 uint32_t s1 = source0, s2, t1 = target0, t2; 85 if(isContinuation(source1)) { 86 s2 = source1; 87 } else { 88 s2 = 0; 89 } 90 if(isContinuation(target1)) { 91 t2 = target1; 92 } else { 93 t2 = 0; 94 } 95 96 uint32_t s = 0, t = 0; 97 if(s1 == t1 && s2 == t2) { 98 return 0; 99 } 100 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16); 101 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16); 102 if(s < t) { 103 return -1; 104 } else if(s > t) { 105 return 1; 106 } else { 107 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8; 108 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8; 109 if(s < t) { 110 return -1; 111 } else if(s > t) { 112 return 1; 113 } else { 114 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF); 115 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF); 116 if(s < t) { 117 return -1; 118 } else { 119 return 1; 120 } 121 } 122 } 123 } 124 125 static 126 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) { 127 uint32_t bottom = 0, top = src->invUCA->tableSize; 128 uint32_t i = 0; 129 uint32_t first = 0, second = 0; 130 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 131 int32_t res = 0; 132 133 while(bottom < top-1) { 134 i = (top+bottom)/2; 135 first = *(CETable+3*i); 136 second = *(CETable+3*i+1); 137 res = compareCEs(first, second, CE, SecondCE); 138 if(res > 0) { 139 top = i; 140 } else if(res 
< 0) { 141 bottom = i; 142 } else { 143 break; 144 } 145 } 146 147 /* weiv: */ 148 /* in searching for elements, I have removed the failure */ 149 /* The reason for this is that the builder does not rely */ 150 /* on search mechanism telling it that it didn't find an */ 151 /* element. However, indirect positioning relies on being */ 152 /* able to find the elements around any CE, even if it is */ 153 /* not defined in the UCA. */ 154 return i; 155 /* 156 if((first == CE && second == SecondCE)) { 157 return i; 158 } else { 159 return -1; 160 } 161 */ 162 } 163 164 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = { 165 0xFFFF0000, 166 0xFFFFFF00, 167 0xFFFFFFFF 168 }; 169 170 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src, 171 uint32_t CE, uint32_t contCE, 172 uint32_t *nextCE, uint32_t *nextContCE, 173 uint32_t strength) 174 { 175 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 176 int32_t iCE; 177 178 iCE = ucol_inv_findCE(src, CE, contCE); 179 180 if(iCE<0) { 181 *nextCE = UCOL_NOT_FOUND; 182 return -1; 183 } 184 185 CE &= strengthMask[strength]; 186 contCE &= strengthMask[strength]; 187 188 *nextCE = CE; 189 *nextContCE = contCE; 190 191 while((*nextCE & strengthMask[strength]) == CE 192 && (*nextContCE & strengthMask[strength]) == contCE) 193 { 194 *nextCE = (*(CETable+3*(++iCE))); 195 *nextContCE = (*(CETable+3*(iCE)+1)); 196 } 197 198 return iCE; 199 } 200 201 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src, 202 uint32_t CE, uint32_t contCE, 203 uint32_t *prevCE, uint32_t *prevContCE, 204 uint32_t strength) 205 { 206 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 207 int32_t iCE; 208 209 iCE = ucol_inv_findCE(src, CE, contCE); 210 211 if(iCE<0) { 212 *prevCE = UCOL_NOT_FOUND; 213 return -1; 214 } 215 216 CE &= strengthMask[strength]; 217 contCE &= strengthMask[strength]; 218 219 *prevCE = CE; 220 *prevContCE = contCE; 221 222 while((*prevCE & 
strengthMask[strength]) == CE 223 && (*prevContCE & strengthMask[strength])== contCE 224 && iCE > 0) /* this condition should prevent falling off the edge of the world */ 225 { 226 /* here, we end up in a singularity - zero */ 227 *prevCE = (*(CETable+3*(--iCE))); 228 *prevContCE = (*(CETable+3*(iCE)+1)); 229 } 230 231 return iCE; 232 } 233 234 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE, 235 uint32_t prevCE, uint32_t prevContCE) 236 { 237 if(prevCE == CE && prevContCE == contCE) { 238 return UCOL_IDENTICAL; 239 } 240 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY]) 241 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY])) 242 { 243 return UCOL_PRIMARY; 244 } 245 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY]) 246 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY])) 247 { 248 return UCOL_SECONDARY; 249 } 250 return UCOL_TERTIARY; 251 } 252 253 254 /*static 255 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) { 256 257 uint32_t CE = lh->baseCE; 258 uint32_t SecondCE = lh->baseContCE; 259 260 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 261 uint32_t previousCE, previousContCE; 262 int32_t iCE; 263 264 iCE = ucol_inv_findCE(src, CE, SecondCE); 265 266 if(iCE<0) { 267 return -1; 268 } 269 270 CE &= strengthMask[strength]; 271 SecondCE &= strengthMask[strength]; 272 273 previousCE = CE; 274 previousContCE = SecondCE; 275 276 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) { 277 previousCE = (*(CETable+3*(--iCE))); 278 previousContCE = (*(CETable+3*(iCE)+1)); 279 } 280 lh->previousCE = previousCE; 281 lh->previousContCE = previousContCE; 282 283 return iCE; 284 }*/ 285 286 static 287 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, 
uint32_t strength) {
    /* Same forward scan as ucol_inv_getNextCE, but starting from the list
     * header's base CE pair and storing the result in lh->nextCE /
     * lh->nextContCE.  Returns the index of the entry the scan stopped on. */
    uint32_t CE = lh->baseCE;
    uint32_t SecondCE = lh->baseContCE;

    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    uint32_t nextCE, nextContCE;
    int32_t iCE;

    iCE = ucol_inv_findCE(src, CE, SecondCE);

    if(iCE<0) {
        return -1;
    }

    CE &= strengthMask[strength];
    SecondCE &= strengthMask[strength];

    nextCE = CE;
    nextContCE = SecondCE;

    /* NOTE(review): unlike the backward scan in ucol_inv_getPrevCE (which
     * stops at iCE == 0) this loop has no bound check against tableSize. */
    while((nextCE & strengthMask[strength]) == CE
        && (nextContCE & strengthMask[strength]) == SecondCE)
    {
        nextCE = (*(CETable+3*(++iCE)));
        nextContCE = (*(CETable+3*(iCE)+1));
    }

    lh->nextCE = nextCE;
    lh->nextContCE = nextContCE;

    return iCE;
}

/*
 * Fills in the low/high weight bounds (lh->gapsLo / lh->gapsHi, three words
 * per strength) between which new weights for the token list may be placed.
 *
 * All per-strength bookkeeping in lh (gaps, numStr, fStrToken, lStrToken,
 * pos) is reset first.  Then one of three paths is taken:
 *   1. the top byte of lh->baseCE lies within
 *      [UCA_PRIMARY_IMPLICIT_MIN, UCA_PRIMARY_IMPLICIT_MAX]: the low bound is
 *      the base CE and the high bound is built from the implicit primary that
 *      follows it (raw value + 1);
 *   2. lh->indirect is set and lh->nextCE is non-zero: the bounds are the
 *      base CE pair and the next CE pair stored in lh;
 *   3. otherwise the token list is walked, ucol_inv_getNext() is called for
 *      each strength encountered, and the bounds are read from the inverse
 *      UCA table entries recorded in lh->pos[].
 *
 * *status is set to U_INTERNAL_PROGRAM_ERROR when ucol_inv_getNext() returns
 * a negative index in path 3.
 */
static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
    /* reset all the gaps */
    int32_t i = 0;
    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    uint32_t st = 0;
    uint32_t t1, t2;
    int32_t pos;

    UColToken *tok = lh->first;
    uint32_t tokStrength = tok->strength;

    for(i = 0; i<3; i++) {
        lh->gapsHi[3*i] = 0;
        lh->gapsHi[3*i+1] = 0;
        lh->gapsHi[3*i+2] = 0;
        lh->gapsLo[3*i] = 0;
        lh->gapsLo[3*i+1] = 0;
        lh->gapsLo[3*i+2] = 0;
        lh->numStr[i] = 0;
        lh->fStrToken[i] = NULL;
        lh->lStrToken[i] = NULL;
        lh->pos[i] = -1;
    }

    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);

    if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
        //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
        lh->pos[0] = 0;
        t1 = lh->baseCE;
        t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
        /* low bound: the base CE pair split into primary / secondary / tertiary words */
        lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
        lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
        lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
        /* high bound: the implicit primary whose raw value is one higher */
        uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
        primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);

        t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
        t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;

        lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
        lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
        lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    } else if(lh->indirect == TRUE && lh->nextCE != 0) {
        //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
        lh->pos[0] = 0;
        t1 = lh->baseCE;
        t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
        lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
        lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
        lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
        t1 = lh->nextCE;
        t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
        lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
        lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
        lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    } else {
        /* walk the token list one run of equal-or-weaker tokens at a time */
        for(;;) {
            if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
                if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
                    lh->fStrToken[tokStrength] = tok;
                } else { /* The CE must be implicit, since it's not in the table */
                    /* Error */
                    *status = U_INTERNAL_PROGRAM_ERROR;
                }
            }

            /* skip over tokens that are not stronger than the current strength,
             * remembering the last one seen */
            while(tok != NULL && tok->strength >= tokStrength) {
                if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
                    lh->lStrToken[tokStrength] = tok;
                }
                tok = tok->next;
            }
            if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
                /* check if previous interval is the same and merge the intervals if it is so */
                if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
                    lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
                    lh->fStrToken[tokStrength+1] = NULL;
                    lh->lStrToken[tokStrength+1] = NULL;
                    lh->pos[tokStrength+1] = -1;
                }
            }
            if(tok != NULL) {
                tokStrength = tok->strength;
            } else {
                break;
            }
        }
        /* high bound comes from the table entry at pos[st], low bound from the base CE */
        for(st = 0; st < 3; st++) {
            if((pos = lh->pos[st]) >= 0) {
                t1 = *(CETable+3*(pos));
                t2 = *(CETable+3*(pos)+1);
                lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
                lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
                //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
                lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
                //pos--;
                //t1 = *(CETable+3*(pos));
                //t2 = *(CETable+3*(pos)+1);
                t1 = lh->baseCE;
                t2 = lh->baseContCE;
                lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
                lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
                lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
            }
        }
    }
}


/*
 * Sets noOfBytes to the number of masks among 0xFFFFFFFF, 0x00FFFFFF,
 * 0x0000FFFF and 0x000000FF that share at least one set bit with value.
 * Both arguments are evaluated more than once; noOfBytes must be an lvalue.
 */
#define ucol_countBytes(value, noOfBytes)   \
{                                           \
    uint32_t mask = 0xFFFFFFFF;             \
    (noOfBytes) = 0;                        \
    while(mask != 0) {                      \
        if(((value) & mask) != 0) {         \
            (noOfBytes)++;                  \
        }                                   \
        mask >>= 8;                         \
    }                                       \
}

/*
 * Advances generator g to its next weight via ucol_nextWeight() and returns
 * it.  When *status already holds a failure the generator is left untouched
 * and its current value is returned unchanged.
 */
static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
    if(U_SUCCESS(*status)) {
        g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
    }
    return g->current;
}

/*
 * Initializes generator g with a fixed weight range chosen only by strength
 * (no gap information is consulted) and returns the generator's current
 * weight.
 */
static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
    /* TODO: rename to enum names */
    uint32_t high, low, count=1;
    uint32_t maxByte = (strength ==
UCOL_TERTIARY)?0x3F:0xFF; 453 454 if(strength == UCOL_SECONDARY) { 455 low = UCOL_COMMON_TOP2<<24; 456 high = 0xFFFFFFFF; 457 count = 0xFF - UCOL_COMMON_TOP2; 458 } else { 459 low = UCOL_BYTE_COMMON << 24; //0x05000000; 460 high = 0x40000000; 461 count = 0x40 - UCOL_BYTE_COMMON; 462 } 463 464 if(tok->next != NULL && tok->next->strength == strength) { 465 count = tok->next->toInsert; 466 } 467 468 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); 469 g->current = UCOL_BYTE_COMMON<<24; 470 471 if(g->noOfRanges == 0) { 472 *status = U_INTERNAL_PROGRAM_ERROR; 473 } 474 return g->current; 475 } 476 477 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) { 478 uint32_t strength = tok->strength; 479 uint32_t low = lows[fStrength*3+strength]; 480 uint32_t high = highs[fStrength*3+strength]; 481 uint32_t maxByte = 0; 482 if(strength == UCOL_TERTIARY) { 483 maxByte = 0x3F; 484 } else if(strength == UCOL_PRIMARY) { 485 maxByte = 0xFE; 486 } else { 487 maxByte = 0xFF; 488 } 489 490 uint32_t count = tok->toInsert; 491 492 if(low >= high && strength > UCOL_PRIMARY) { 493 int32_t s = strength; 494 for(;;) { 495 s--; 496 if(lows[fStrength*3+s] != highs[fStrength*3+s]) { 497 if(strength == UCOL_SECONDARY) { 498 if (low < UCOL_COMMON_TOP2<<24 ) { 499 // Override if low range is less than UCOL_COMMON_TOP2. 500 low = UCOL_COMMON_TOP2<<24; 501 } 502 high = 0xFFFFFFFF; 503 } else { 504 // Override if low range is less than UCOL_COMMON_BOT3. 505 if ( low < UCOL_COMMON_BOT3<<24 ) { 506 low = UCOL_COMMON_BOT3<<24; 507 } 508 high = 0x40000000; 509 } 510 break; 511 } 512 if(s<0) { 513 *status = U_INTERNAL_PROGRAM_ERROR; 514 return 0; 515 } 516 } 517 } 518 519 if(low < 0x02000000) { 520 // We must not use CE weight byte 02, so we set it as the minimum lower bound. 
521 // See http://site.icu-project.org/design/collation/bytes 522 low = 0x02000000; 523 } 524 525 if(strength == UCOL_SECONDARY) { /* similar as simple */ 526 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) { 527 low = UCOL_COMMON_TOP2<<24; 528 } 529 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) { 530 high = UCOL_COMMON_TOP2<<24; 531 } 532 if(low < (UCOL_COMMON_BOT2<<24)) { 533 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges); 534 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); 535 //g->current = UCOL_COMMON_BOT2<<24; 536 return g->current; 537 } 538 } 539 540 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); 541 if(g->noOfRanges == 0) { 542 *status = U_INTERNAL_PROGRAM_ERROR; 543 } 544 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); 545 return g->current; 546 } 547 548 static 549 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { 550 uint32_t i = 0; 551 UChar c; 552 553 if(U_FAILURE(*status)) { 554 return 0; 555 } 556 557 if(sourceLen > resLen) { 558 *status = U_MEMORY_ALLOCATION_ERROR; 559 return 0; 560 } 561 562 for(i = 0; i < sourceLen; i++) { 563 c = source[i]; 564 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ 565 switch(c - 0x3000) { 566 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E: 567 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE: 568 c++; 569 break; 570 case 0xF5: 571 c = 0x30AB; 572 break; 573 case 0xF6: 574 c = 0x30B1; 575 break; 576 } 577 } 578 resBuf[i] = c; 579 } 580 return sourceLen; 581 } 582 583 static 584 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { 585 uint32_t i = 0; 586 UChar c; 587 588 if(U_FAILURE(*status)) { 589 return 0; 590 } 
591 592 if(sourceLen > resLen) { 593 *status = U_MEMORY_ALLOCATION_ERROR; 594 return 0; 595 } 596 597 for(i = 0; i < sourceLen; i++) { 598 c = source[i]; 599 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ 600 switch(c - 0x3000) { 601 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F: 602 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF: 603 c--; 604 break; 605 case 0xAB: 606 c = 0x30F5; 607 break; 608 case 0xB1: 609 c = 0x30F6; 610 break; 611 } 612 } 613 resBuf[i] = c; 614 } 615 return sourceLen; 616 } 617 618 U_NAMESPACE_BEGIN 619 620 static 621 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) { 622 uint32_t i = 0; 623 UChar n[128]; 624 uint32_t nLen = 0; 625 uint32_t uCount = 0, lCount = 0; 626 627 collIterate s; 628 uint32_t order = 0; 629 630 if(U_FAILURE(*status)) { 631 return UCOL_LOWER_CASE; 632 } 633 634 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status); 635 if(U_SUCCESS(*status)) { 636 for(i = 0; i < nLen; i++) { 637 uprv_init_collIterate(UCA, &n[i], 1, &s, status); 638 order = ucol_getNextCE(UCA, &s, status); 639 if(isContinuation(order)) { 640 *status = U_INTERNAL_PROGRAM_ERROR; 641 return UCOL_LOWER_CASE; 642 } 643 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) { 644 uCount++; 645 } else { 646 if(u_islower(n[i])) { 647 lCount++; 648 } else if(U_SUCCESS(*status)) { 649 UChar sk[1], lk[1]; 650 u_toSmallKana(&n[i], 1, sk, 1, status); 651 u_toLargeKana(&n[i], 1, lk, 1, status); 652 if(sk[0] == n[i] && lk[0] != n[i]) { 653 lCount++; 654 } 655 } 656 } 657 } 658 } 659 660 if(uCount != 0 && lCount != 0) { 661 return UCOL_MIXED_CASE; 662 } else if(uCount != 0) { 663 return UCOL_UPPER_CASE; 664 } else { 665 return UCOL_LOWER_CASE; 666 } 667 } 668 669 670 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) { 671 /* this one makes the table and stuff 
*/
    uint32_t noOfBytes[3];
    uint32_t i;

    for(i = 0; i<3; i++) {
        ucol_countBytes(CEparts[i], noOfBytes[i]);
    }

    /* Here we have to pack CEs from parts */

    uint32_t CEi = 0;
    uint32_t value = 0;

    /* each CE takes 16 bits of primary, 8 bits of secondary and 6 bits of tertiary weight */
    while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
        if(CEi > 0) {
            value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
        } else {
            value = 0;
        }

        if(2*CEi<noOfBytes[0]) {
            value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
        }
        if(CEi<noOfBytes[1]) {
            value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
        }
        if(CEi<noOfBytes[2]) {
            value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
        }
        tok->CEs[CEi] = value;
        CEi++;
    }
    if(CEi == 0) { /* totally ignorable */
        tok->noOfCEs = 1;
        tok->CEs[0] = 0;
    } else { /* there is at least something */
        tok->noOfCEs = CEi;
    }


    // we want to set case bits here and now, not later.
    // Case bits handling
    if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
        tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
        /* tok->source packs the length in its top byte and the offset into src->source below it */
        int32_t cSize = (tok->source & 0xFF000000) >> 24;
        UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;

        if(cSize > 1) {
            // Do it manually
            tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
        } else {
            // Copy it from the UCA
            uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
            tok->CEs[0] |= (caseCE & 0xC0);
        }
    }

#if UCOL_DEBUG==2
    fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
    for(i = 0; i<tok->noOfCEs; i++) {
        fprintf(stderr, "%08X ", tok->CEs[i]);
    }
    fprintf(stderr, "\n");
#endif
}

/*
 * Assigns CEs to every token of the list headed by lh.
 *
 * Works in three steps:
 *   1. walks the list backwards from lh->last, storing in each token's
 *      toInsert the running per-strength count kept in t[];
 *   2. calls ucol_inv_getGapPositions() to obtain the weight bounds;
 *   3. walks the list forwards from lh->first, producing the primary,
 *      secondary and tertiary weight words for each token (a fresh generator
 *      whenever a token is stronger than any seen so far, the next generated
 *      weight otherwise) and handing them to ucol_doCE().
 *
 * Sets *status to U_INTERNAL_PROGRAM_ERROR and returns early when no gap
 * position is recorded for a token's strength or any stronger one.
 */
U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
    ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
    uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];

    UColToken *tok = lh->last;
    uint32_t t[UCOL_STRENGTH_LIMIT];

    uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));

    /* must initialize ranges to avoid memory check warnings */
    for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) {
        uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges));
    }

    tok->toInsert = 1;
    t[tok->strength] = 1;

    /* step 1: backward pass computing toInsert */
    while(tok->previous != NULL) {
        if(tok->previous->strength < tok->strength) { /* going up */
            t[tok->strength] = 0;
            t[tok->previous->strength]++;
        } else if(tok->previous->strength > tok->strength) { /* going down */
            t[tok->previous->strength] = 1;
        } else {
            t[tok->strength]++;
        }
        tok=tok->previous;
        tok->toInsert = t[tok->strength];
    }

    tok->toInsert = t[tok->strength];
    /* step 2: find the gaps */
    ucol_inv_getGapPositions(src, lh, status);

#if UCOL_DEBUG
    fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
    int32_t j = 2;
    for(j = 2; j >= 0; j--) {
        fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
        fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
    }
    tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];

    do {
        fprintf(stderr,"%i", tok->strength);
        tok = tok->next;
    } while(tok != NULL);
    fprintf(stderr, "\n");

    tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];

    do {
        fprintf(stderr,"%i", tok->toInsert);
        tok = tok->next;
    } while(tok != NULL);
#endif

    /* step 3: forward pass generating the weights */
    tok = lh->first;
    uint32_t fStrength = UCOL_IDENTICAL;
    uint32_t initStrength = UCOL_IDENTICAL;


    /* start from the base CE pair split into its three weight words */
    CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
    CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
    CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;

    while (tok != NULL && U_SUCCESS(*status)) {
        fStrength = tok->strength;
        if(fStrength < initStrength) {
            /* first token at a strength stronger than anything seen so far */
            initStrength = fStrength;
            if(lh->pos[fStrength] == -1) {
                /* no gap recorded at this strength: fall back to a stronger one */
                while(lh->pos[fStrength] == -1 && fStrength > 0) {
                    fStrength--;
                }
                if(lh->pos[fStrength] == -1) {
                    *status = U_INTERNAL_PROGRAM_ERROR;
                    return;
                }
            }
            if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
                CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
                CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
                /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
                CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
            } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
                CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
                /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
                CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
            } else { /* primaries */
                /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
                CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
                CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
            }
        } else {
            /* continue with the existing generator of this strength and
             * restart the generators of all weaker levels */
            if(tok->strength == UCOL_TERTIARY) {
                CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
            } else if(tok->strength == UCOL_SECONDARY) {
                CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
            } else if(tok->strength == UCOL_PRIMARY) {
                CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
                CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
            }
        }
        ucol_doCE(src, CEparts, tok, status);
        tok = tok->next;
    }
}

/*
 * Turns every token of the list headed by lh into a UCAElements record and
 * adds it to the temporary table t with uprv_uca_addAnElement().
 *
 * For each token:
 *   - when tok->expansion is non-zero (length in the top byte, offset into
 *     src->source below it), the expansion CEs are collected by repeatedly
 *     taking the longest prefix of the remaining expansion found in
 *     src->tailored (and not a reset token); when no prefix is tailored, the
 *     CEs of a single code unit are fetched from src->UCA;
 *   - the token's own CEs followed by the expansion CEs are copied into the
 *     element;
 *   - prefix and source code units are copied from src->source;
 *   - t->image->jamoSpecial and src->buildCCTabFlag are raised as needed.
 *
 * The loop stops at the end of the list or as soon as *status holds a failure.
 */
U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
    UCAElements el;
    UColToken *tok = lh->first;
    UColToken *expt = NULL;
    uint32_t i = 0, j = 0;
    const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);

    while(tok != NULL && U_SUCCESS(*status)) {
        /* first, check if there are any expansions */
        /* if there are expansions, we need to do a little bit more processing */
        /* since parts of expansion can be tailored, while others are not */
        if(tok->expansion != 0) {
            uint32_t len = tok->expansion >> 24;
            uint32_t currentSequenceLen = len;
            uint32_t expOffset = tok->expansion & 0x00FFFFFF;
            //uint32_t exp = currentSequenceLen | expOffset;
            UColToken exp;
            /* NOTE(review): the length is not shifted into the top byte here;
             * exp.source is overwritten below before every lookup. */
            exp.source = currentSequenceLen | expOffset;
            exp.rulesToParseHdl = &(src->source);

            while(len > 0) {
                currentSequenceLen = len;
                /* try the longest remaining subsequence first, then shorter ones */
                while(currentSequenceLen > 0) {
                    exp.source = (currentSequenceLen << 24) | expOffset;
                    if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
                        uint32_t noOfCEsToCopy = expt->noOfCEs;
                        for(j = 0; j<noOfCEsToCopy; j++) {
                            tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
                        }
                        tok->noOfExpCEs += noOfCEsToCopy;
                        // Smart people never try to add codepoints and CEs.
                        // For some odd reason, it won't work.
                        expOffset += currentSequenceLen; //noOfCEsToCopy;
                        len -= currentSequenceLen; //noOfCEsToCopy;
                        break;
                    } else {
                        currentSequenceLen--;
                    }
                }
                if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
                    /* will have to get one from UCA */
                    /* first, get the UChars from the rules */
                    /* then pick CEs out until there is no more and stuff them into expansion */
                    collIterate s;
                    uint32_t order = 0;
                    uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);

                    for(;;) {
                        order = ucol_getNextCE(src->UCA, &s, status);
                        if(order == UCOL_NO_MORE_CES) {
                            break;
                        }
                        tok->expCEs[tok->noOfExpCEs++] = order;
                    }
                    expOffset++;
                    len--;
                }
            }
        } else {
            tok->noOfExpCEs = 0;
        }

        /* set the ucaelement with obtained values */
        el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
        /* copy CEs */
        for(i = 0; i<tok->noOfCEs; i++) {
            el.CEs[i] = tok->CEs[i];
        }
        for(i = 0; i<tok->noOfExpCEs; i++) {
            el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
        }

        /* copy UChars */
        // We kept prefix and source kind of together, as it is a kind of a contraction.
        // However, now we have to slice the prefix off the main thing -
        el.prefix = el.prefixChars;
        el.cPoints = el.uchars;
        if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
            // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
            // decomposed elements to the unsaf table.
            el.prefixSize = tok->prefix>>24;
            uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));

            /* the source length includes the prefix, so skip past it */
            el.cSize = (tok->source >> 24)-(tok->prefix>>24);
            uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
        } else {
            el.prefixSize = 0;
            *el.prefix = 0;

            el.cSize = (tok->source >> 24);
            uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
        }
        if(src->UCA != NULL) {
            for(i = 0; i<el.cSize; i++) {
                if(UCOL_ISJAMO(el.cPoints[i])) {
                    t->image->jamoSpecial = TRUE;
                }
            }
            if (!src->buildCCTabFlag && el.cSize > 0) {
                // Check the trailing canonical combining class (tccc) of the last character.
                const UChar *s = el.cPoints + el.cSize;
                uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s);
                if ((fcd & 0xff) != 0) {
                    src->buildCCTabFlag = TRUE;
                }
            }
        }

        /* and then, add it */
#if UCOL_DEBUG==2
        fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
#endif
        uprv_uca_addAnElement(t, &el, status);

#if UCOL_DEBUG_DUPLICATES
        if(*status != U_ZERO_ERROR) {
            fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
            *status = U_ZERO_ERROR;
        }
#endif

        tok = tok->next;
    }
}

U_CDECL_BEGIN
/*
 * Range callback: for a range whose value is 0, adds an element with the
 * single CE 0 for every code point in [start, limit) that has no mapping
 * (UCOL_NOT_FOUND) in the temporary table passed as context.
 * Returns FALSE if adding an element failed, TRUE otherwise.
 */
static UBool U_CALLCONV
_processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
    UErrorCode status = U_ZERO_ERROR;
    tempUCATable *t = (tempUCATable *)context;
    if(value == 0) {
        while(start < limit) {
            uint32_t CE = utrie_get32(t->mapping, start, NULL);
            if(CE == UCOL_NOT_FOUND) {
                UCAElements el;
                el.isThai = FALSE;
                el.prefixSize = 0;
                el.prefixChars[0] = 0;
                el.prefix = el.prefixChars;
                el.cPoints = el.uchars;

                el.cSize = 0;
                U16_APPEND_UNSAFE(el.uchars, el.cSize, start);

el.noOfCEs = 1; 993 el.CEs[0] = 0; 994 uprv_uca_addAnElement(t, &el, &status); 995 996 } 997 start++; 998 } 999 } 1000 if(U_FAILURE(status)) { 1001 return FALSE; 1002 } else { 1003 return TRUE; 1004 } 1005 } 1006 U_CDECL_END 1007 1008 static void 1009 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t, 1010 UChar32 start, UChar32 end, 1011 UErrorCode *status) 1012 { 1013 //UChar decomp[256]; 1014 uint32_t CE = UCOL_NOT_FOUND; 1015 UChar32 u = 0; 1016 UCAElements el; 1017 el.isThai = FALSE; 1018 el.prefixSize = 0; 1019 el.prefixChars[0] = 0; 1020 collIterate colIt; 1021 1022 if(U_SUCCESS(*status)) { 1023 for(u = start; u<=end; u++) { 1024 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND 1025 /* this test is for contractions that are missing the starting element. */ 1026 || ((isCntTableElement(CE)) && 1027 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND)) 1028 ) 1029 { 1030 el.cSize = 0; 1031 U16_APPEND_UNSAFE(el.uchars, el.cSize, u); 1032 //decomp[0] = (UChar)u; 1033 //el.uchars[0] = (UChar)u; 1034 el.cPoints = el.uchars; 1035 //el.cSize = 1; 1036 el.noOfCEs = 0; 1037 el.prefix = el.prefixChars; 1038 el.prefixSize = 0; 1039 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt); 1040 // We actually want to check whether this element is a special 1041 // If it is an implicit element (hangul, CJK - we want to copy the 1042 // special, not the resolved CEs) - for hangul, copying resolved 1043 // would just make things the same (there is an expansion and it 1044 // takes approximately the same amount of time to resolve as 1045 // falling back to the UCA). 
1046 /* 1047 UTRIE_GET32(src->UCA->mapping, u, CE); 1048 tag = getCETag(CE); 1049 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG 1050 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG 1051 || tag == LEAD_SURROGATE_TAG) { 1052 el.CEs[el.noOfCEs++] = CE; 1053 } else { 1054 */ 1055 // It turns out that it does not make sense to keep implicits 1056 // unresolved. The cost of resolving them is big enough so that 1057 // it doesn't make any difference whether we have to go to the UCA 1058 // or not. 1059 { 1060 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status); 1061 while(CE != UCOL_NO_MORE_CES) { 1062 CE = ucol_getNextCE(src->UCA, &colIt, status); 1063 if(CE != UCOL_NO_MORE_CES) { 1064 el.CEs[el.noOfCEs++] = CE; 1065 } 1066 } 1067 } 1068 uprv_uca_addAnElement(t, &el, status); 1069 } 1070 } 1071 } 1072 } 1073 1074 U_NAMESPACE_END 1075 1076 U_CFUNC UCATableHeader * 1077 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) { 1078 U_NAMESPACE_USE 1079 1080 uint32_t i = 0; 1081 if(U_FAILURE(*status)) { 1082 return NULL; 1083 } 1084 /* 1085 2. Eliminate the negative lists by doing the following for each non-null negative list: 1086 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE, 1087 create new ListHeader X 1088 o reverse the list, add to the end of X's positive list. Reset the strength of the 1089 first item you add, based on the stronger strength levels of the two lists. 1090 */ 1091 /* 1092 3. For each ListHeader with a non-null positive list: 1093 */ 1094 /* 1095 o Find all character strings with CEs between the baseCE and the 1096 next/previous CE, at the strength of the first token. Add these to the 1097 tailoring. 1098 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the 1099 tailoring has & x < z... 1100 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ... 
1101 */ 1102 /* It is possible that this part should be done even while constructing list */ 1103 /* The problem is that it is unknown what is going to be the strongest weight */ 1104 /* So we might as well do it here */ 1105 1106 /* 1107 o Allocate CEs for each token in the list, based on the total number N of the 1108 largest level difference, and the gap G between baseCE and nextCE at that 1109 level. The relation * between the last item and nextCE is the same as the 1110 strongest strength. 1111 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1) 1112 ? There are 3 primary items: a, d, e. Fit them into the primary gap. 1113 Then fit b and c into the secondary gap between a and d, then fit q 1114 into the tertiary gap between b and c. 1115 1116 o Example: baseCE << b <<< q << c * nextCE(X,2) 1117 ? There are 2 secondary items: b, c. Fit them into the secondary gap. 1118 Then fit q into the tertiary gap between b and c. 1119 o When incrementing primary values, we will not cross high byte 1120 boundaries except where there is only a single-byte primary. That is to 1121 ensure that the script reordering will continue to work. 1122 */ 1123 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader)); 1124 /* test for NULL */ 1125 if (image == NULL) { 1126 *status = U_MEMORY_ALLOCATION_ERROR; 1127 return NULL; 1128 } 1129 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader)); 1130 1131 for(i = 0; i<src->resultLen; i++) { 1132 /* now we need to generate the CEs */ 1133 /* We stuff the initial value in the buffers, and increase the appropriate buffer */ 1134 /* According to strength */ 1135 if(U_SUCCESS(*status)) { 1136 if(src->lh[i].first) { // if there are any elements 1137 // due to the way parser works, subsequent tailorings 1138 // may remove all the elements from a sequence, therefore 1139 // leaving an empty tailoring sequence. 
1140 ucol_initBuffers(src, &src->lh[i], status); 1141 } 1142 } 1143 if(U_FAILURE(*status)) { 1144 uprv_free(image); 1145 return NULL; 1146 } 1147 } 1148 1149 if(src->varTop != NULL) { /* stuff the variable top value */ 1150 src->opts->variableTopValue = (*(src->varTop->CEs))>>16; 1151 /* remove it from the list */ 1152 if(src->varTop->listHeader->first == src->varTop) { /* first in list */ 1153 src->varTop->listHeader->first = src->varTop->next; 1154 } 1155 if(src->varTop->listHeader->last == src->varTop) { /* first in list */ 1156 src->varTop->listHeader->last = src->varTop->previous; 1157 } 1158 if(src->varTop->next != NULL) { 1159 src->varTop->next->previous = src->varTop->previous; 1160 } 1161 if(src->varTop->previous != NULL) { 1162 src->varTop->previous->next = src->varTop->next; 1163 } 1164 } 1165 1166 1167 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status); 1168 if(U_FAILURE(*status)) { 1169 uprv_free(image); 1170 return NULL; 1171 } 1172 1173 1174 /* After this, we have assigned CE values to all regular CEs */ 1175 /* now we will go through list once more and resolve expansions, */ 1176 /* make UCAElements structs and add them to table */ 1177 for(i = 0; i<src->resultLen; i++) { 1178 /* now we need to generate the CEs */ 1179 /* We stuff the initial value in the buffers, and increase the appropriate buffer */ 1180 /* According to strength */ 1181 if(U_SUCCESS(*status)) { 1182 ucol_createElements(src, t, &src->lh[i], status); 1183 } 1184 } 1185 1186 UCAElements el; 1187 el.isThai = FALSE; 1188 el.prefixSize = 0; 1189 el.prefixChars[0] = 0; 1190 1191 /* add latin-1 stuff */ 1192 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status); 1193 1194 /* add stuff for copying */ 1195 if(src->copySet != NULL) { 1196 int32_t i = 0; 1197 UnicodeSet *set = (UnicodeSet *)src->copySet; 1198 for(i = 0; i < set->getRangeCount(); i++) { 1199 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), 
set->getRangeEnd(i), status); 1200 } 1201 } 1202 1203 if(U_SUCCESS(*status)) { 1204 /* copy contractions from the UCA - this is felt mostly for cyrillic*/ 1205 1206 uint32_t tailoredCE = UCOL_NOT_FOUND; 1207 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos); 1208 int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth; 1209 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status); 1210 // Check for null pointer 1211 if (ucaEl == NULL) { 1212 *status = U_MEMORY_ALLOCATION_ERROR; 1213 return NULL; 1214 } 1215 while(*conts != 0) { 1216 // A continuation is NUL-terminated and NUL-padded 1217 // except if it has the maximum length. 1218 int32_t contractionLength = maxUCAContractionLength; 1219 while(contractionLength > 0 && conts[contractionLength - 1] == 0) { 1220 --contractionLength; 1221 } 1222 UChar32 first; 1223 int32_t firstLength = 0; 1224 U16_NEXT(conts, firstLength, contractionLength, first); 1225 tailoredCE = utrie_get32(t->mapping, first, NULL); 1226 if(tailoredCE != UCOL_NOT_FOUND) { 1227 UBool needToAdd = TRUE; 1228 if(isCntTableElement(tailoredCE)) { 1229 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) { 1230 needToAdd = FALSE; 1231 } 1232 } 1233 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) { 1234 UCAElements elm; 1235 elm.cPoints = el.uchars; 1236 elm.noOfCEs = 0; 1237 elm.uchars[0] = *conts; 1238 elm.uchars[1] = 0; 1239 elm.cSize = 1; 1240 elm.prefixChars[0] = *(conts+2); 1241 elm.isThai = FALSE; 1242 elm.prefix = elm.prefixChars; 1243 elm.prefixSize = 1; 1244 UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm); 1245 if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) { 1246 needToAdd = TRUE; 1247 } 1248 } 1249 if(src->removeSet != NULL && uset_contains(src->removeSet, first)) { 1250 needToAdd = FALSE; 1251 } 1252 1253 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored. 
1254 if (*(conts+1) != 0) { // contractions 1255 el.prefix = el.prefixChars; 1256 el.prefixSize = 0; 1257 el.cPoints = el.uchars; 1258 el.noOfCEs = 0; 1259 u_memcpy(el.uchars, conts, contractionLength); 1260 el.cSize = contractionLength; 1261 ucol_setText(ucaEl, el.uchars, el.cSize, status); 1262 } 1263 else { // pre-context character 1264 UChar str[4] = { 0 }; 1265 int32_t len=0; 1266 int32_t preKeyLen=0; 1267 1268 el.cPoints = el.uchars; 1269 el.noOfCEs = 0; 1270 el.uchars[0] = *conts; 1271 el.uchars[1] = 0; 1272 el.cSize = 1; 1273 el.prefixChars[0] = *(conts+2); 1274 el.prefix = el.prefixChars; 1275 el.prefixSize = 1; 1276 if (el.prefixChars[0]!=0) { 1277 // get CE of prefix character first 1278 str[0]=el.prefixChars[0]; 1279 str[1]=0; 1280 ucol_setText(ucaEl, str, 1, status); 1281 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) 1282 != UCOL_NULLORDER) { 1283 preKeyLen++; // count number of keys for prefix character 1284 } 1285 str[len++] = el.prefixChars[0]; 1286 } 1287 1288 str[len++] = el.uchars[0]; 1289 str[len]=0; 1290 ucol_setText(ucaEl, str, len, status); 1291 // Skip the keys for prefix character, then copy the rest to el. 
1292 while ((preKeyLen-->0) && 1293 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { 1294 continue; 1295 } 1296 1297 } 1298 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { 1299 el.noOfCEs++; 1300 } 1301 uprv_uca_addAnElement(t, &el, status); 1302 } 1303 1304 } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) { 1305 ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status); 1306 } 1307 conts+=maxUCAContractionLength; 1308 } 1309 ucol_closeElements(ucaEl); 1310 } 1311 1312 // Add completely ignorable elements 1313 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t); 1314 1315 // add tailoring characters related canonical closures 1316 uprv_uca_canonicalClosure(t, src, NULL, status); 1317 1318 /* still need to produce compatibility closure */ 1319 1320 UCATableHeader *myData = uprv_uca_assembleTable(t, status); 1321 1322 uprv_uca_closeTempTable(t); 1323 uprv_free(image); 1324 1325 return myData; 1326 } 1327 1328 U_CDECL_BEGIN 1329 static UBool U_CALLCONV 1330 ucol_bld_cleanup(void) 1331 { 1332 udata_close(invUCA_DATA_MEM); 1333 invUCA_DATA_MEM = NULL; 1334 _staticInvUCA = NULL; 1335 gStaticInvUCAInitOnce.reset(); 1336 return TRUE; 1337 } 1338 U_CDECL_END 1339 1340 static void U_CALLCONV initInverseUCA(UErrorCode &status) { 1341 U_ASSERT(invUCA_DATA_MEM == NULL); 1342 U_ASSERT(_staticInvUCA == NULL); 1343 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup); 1344 InverseUCATableHeader *newInvUCA = NULL; 1345 UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, &status); 1346 1347 if(U_FAILURE(status)) { 1348 if (result) { 1349 udata_close(result); 1350 } 1351 // This is not needed, as we are talking about 1352 // memory we got from UData 1353 //uprv_free(newInvUCA); 1354 return; 1355 } 1356 1357 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */ 1358 
newInvUCA = (InverseUCATableHeader *)udata_getMemory(result); 1359 UCollator *UCA = ucol_initUCA(&status); 1360 // UCA versions of UCA and inverse UCA should match 1361 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) { 1362 status = U_INVALID_FORMAT_ERROR; 1363 udata_close(result); 1364 return; 1365 } 1366 1367 invUCA_DATA_MEM = result; 1368 _staticInvUCA = newInvUCA; 1369 } 1370 } 1371 1372 1373 U_CAPI const InverseUCATableHeader * U_EXPORT2 1374 ucol_initInverseUCA(UErrorCode *status) 1375 { 1376 umtx_initOnce(gStaticInvUCAInitOnce, &initInverseUCA, *status); 1377 return _staticInvUCA; 1378 } 1379 1380 /* This is the data that is used for non-script reordering codes. These _must_ be kept 1381 * in order that they are to be applied as defaults and in synch with the UColReorderCode enum. 1382 */ 1383 static const char * const ReorderingTokenNames[] = { 1384 "SPACE", 1385 "PUNCT", 1386 "SYMBOL", 1387 "CURRENCY", 1388 "DIGIT" 1389 }; 1390 1391 static void toUpper(const char* src, char* dst, uint32_t length) { 1392 for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) { 1393 *dst = uprv_toupper(*src); 1394 } 1395 *dst = '\0'; 1396 } 1397 1398 U_INTERNAL int32_t U_EXPORT2 1399 ucol_findReorderingEntry(const char* name) { 1400 char buffer[32]; 1401 toUpper(name, buffer, 32); 1402 for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) { 1403 if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) { 1404 return entry + UCOL_REORDER_CODE_FIRST; 1405 } 1406 } 1407 return USCRIPT_INVALID_CODE; 1408 } 1409 1410 #endif /* #if !UCONFIG_NO_COLLATION */ 1411