1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2001-2012, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: ucol_bld.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created 02/22/2001 14 * created by: Vladimir Weinstein 15 * 16 * This module builds a collator based on the rule set. 17 * 18 */ 19 20 #include "unicode/utypes.h" 21 22 #if !UCONFIG_NO_COLLATION 23 24 #include "unicode/ucoleitr.h" 25 #include "unicode/udata.h" 26 #include "unicode/uchar.h" 27 #include "unicode/uniset.h" 28 #include "unicode/uscript.h" 29 #include "unicode/ustring.h" 30 #include "unicode/utf16.h" 31 #include "normalizer2impl.h" 32 #include "ucol_bld.h" 33 #include "ucol_elm.h" 34 #include "ucol_cnt.h" 35 #include "ucln_in.h" 36 #include "umutex.h" 37 #include "cmemory.h" 38 #include "cstring.h" 39 40 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 41 42 static const InverseUCATableHeader* _staticInvUCA = NULL; 43 static UDataMemory* invUCA_DATA_MEM = NULL; 44 45 U_CDECL_BEGIN 46 static UBool U_CALLCONV 47 isAcceptableInvUCA(void * /*context*/, 48 const char * /*type*/, const char * /*name*/, 49 const UDataInfo *pInfo) 50 { 51 /* context, type & name are intentionally not used */ 52 if( pInfo->size>=20 && 53 pInfo->isBigEndian==U_IS_BIG_ENDIAN && 54 pInfo->charsetFamily==U_CHARSET_FAMILY && 55 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */ 56 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 && 57 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 && 58 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 && 59 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 && 60 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&& 61 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 && 62 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 && 63 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 && 64 ) 65 { 66 UVersionInfo UCDVersion; 67 u_getUnicodeVersion(UCDVersion); 68 return (pInfo->dataVersion[0]==UCDVersion[0] && 69 pInfo->dataVersion[1]==UCDVersion[1]); 70 //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] && 71 //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] && 72 //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) { 73 } else { 74 return FALSE; 75 } 76 } 77 U_CDECL_END 78 79 /* 80 * Takes two CEs (lead and continuation) and 81 * compares them as CEs should be compared: 82 * primary vs. primary, secondary vs. secondary 83 * tertiary vs. tertiary 84 */ 85 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) { 86 uint32_t s1 = source0, s2, t1 = target0, t2; 87 if(isContinuation(source1)) { 88 s2 = source1; 89 } else { 90 s2 = 0; 91 } 92 if(isContinuation(target1)) { 93 t2 = target1; 94 } else { 95 t2 = 0; 96 } 97 98 uint32_t s = 0, t = 0; 99 if(s1 == t1 && s2 == t2) { 100 return 0; 101 } 102 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16); 103 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16); 104 if(s < t) { 105 return -1; 106 } else if(s > t) { 107 return 1; 108 } else { 109 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8; 110 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8; 111 if(s < t) { 112 return -1; 113 } else if(s > t) { 114 return 1; 115 } else { 116 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF); 117 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF); 118 if(s < t) { 119 return -1; 120 } else { 121 return 1; 122 } 123 } 124 } 125 } 126 127 static 128 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) { 129 uint32_t bottom = 0, top = src->invUCA->tableSize; 130 uint32_t i = 0; 131 uint32_t first = 0, second = 0; 132 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 133 int32_t res = 0; 134 135 while(bottom < top-1) { 136 i = (top+bottom)/2; 137 first = *(CETable+3*i); 138 second = *(CETable+3*i+1); 139 res = compareCEs(first, second, CE, SecondCE); 140 if(res > 0) { 141 top = i; 142 } else if(res < 0) { 143 bottom = i; 144 } else { 145 break; 146 } 147 } 148 149 /* weiv: */ 150 /* in searching for elements, I have removed the failure */ 151 /* The reason for this is that the builder does not rely */ 152 /* on search mechanism telling it that it didn't find an */ 153 /* element. However, indirect positioning relies on being */ 154 /* able to find the elements around any CE, even if it is */ 155 /* not defined in the UCA. */ 156 return i; 157 /* 158 if((first == CE && second == SecondCE)) { 159 return i; 160 } else { 161 return -1; 162 } 163 */ 164 } 165 166 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = { 167 0xFFFF0000, 168 0xFFFFFF00, 169 0xFFFFFFFF 170 }; 171 172 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src, 173 uint32_t CE, uint32_t contCE, 174 uint32_t *nextCE, uint32_t *nextContCE, 175 uint32_t strength) 176 { 177 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 178 int32_t iCE; 179 180 iCE = ucol_inv_findCE(src, CE, contCE); 181 182 if(iCE<0) { 183 *nextCE = UCOL_NOT_FOUND; 184 return -1; 185 } 186 187 CE &= strengthMask[strength]; 188 contCE &= strengthMask[strength]; 189 190 *nextCE = CE; 191 *nextContCE = contCE; 192 193 while((*nextCE & strengthMask[strength]) == CE 194 && (*nextContCE & strengthMask[strength]) == contCE) 195 { 196 *nextCE = (*(CETable+3*(++iCE))); 197 *nextContCE = (*(CETable+3*(iCE)+1)); 198 } 199 200 return iCE; 201 } 202 203 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src, 204 uint32_t CE, uint32_t contCE, 205 uint32_t *prevCE, uint32_t *prevContCE, 206 uint32_t strength) 207 { 208 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 209 int32_t iCE; 210 211 iCE = ucol_inv_findCE(src, CE, contCE); 212 213 if(iCE<0) { 214 *prevCE = UCOL_NOT_FOUND; 215 return -1; 216 } 217 218 CE &= strengthMask[strength]; 219 contCE &= strengthMask[strength]; 220 221 *prevCE = CE; 222 *prevContCE = contCE; 223 224 while((*prevCE & strengthMask[strength]) == CE 225 && (*prevContCE & strengthMask[strength])== contCE 226 && iCE > 0) /* this condition should prevent falling off the edge of the world */ 227 { 228 /* here, we end up in a singularity - zero */ 229 *prevCE = (*(CETable+3*(--iCE))); 230 *prevContCE = (*(CETable+3*(iCE)+1)); 231 } 232 233 return iCE; 234 } 235 236 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE, 237 uint32_t prevCE, uint32_t prevContCE) 238 { 239 if(prevCE == CE && prevContCE == contCE) { 240 return UCOL_IDENTICAL; 241 } 242 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY]) 243 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY])) 244 { 245 return UCOL_PRIMARY; 246 } 247 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY]) 248 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY])) 249 { 250 return UCOL_SECONDARY; 251 } 252 return UCOL_TERTIARY; 253 } 254 255 256 /*static 257 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) { 258 259 uint32_t CE = lh->baseCE; 260 uint32_t SecondCE = lh->baseContCE; 261 262 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 263 uint32_t previousCE, previousContCE; 264 int32_t iCE; 265 266 iCE = ucol_inv_findCE(src, CE, SecondCE); 267 268 if(iCE<0) { 269 return -1; 270 } 271 272 CE &= strengthMask[strength]; 273 SecondCE &= strengthMask[strength]; 274 275 previousCE = CE; 276 previousContCE = SecondCE; 277 278 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) { 279 previousCE = (*(CETable+3*(--iCE))); 280 previousContCE = (*(CETable+3*(iCE)+1)); 281 } 282 lh->previousCE = previousCE; 283 lh->previousContCE = previousContCE; 284 285 return iCE; 286 }*/ 287 288 static 289 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) { 290 uint32_t CE = lh->baseCE; 291 uint32_t SecondCE = lh->baseContCE; 292 293 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 294 uint32_t nextCE, nextContCE; 295 int32_t iCE; 296 297 iCE = ucol_inv_findCE(src, CE, SecondCE); 298 299 if(iCE<0) { 300 return -1; 301 } 302 303 CE &= strengthMask[strength]; 304 SecondCE &= strengthMask[strength]; 305 306 nextCE = CE; 307 nextContCE = SecondCE; 308 309 while((nextCE & strengthMask[strength]) == CE 310 && (nextContCE & strengthMask[strength]) == SecondCE) 311 { 312 nextCE = (*(CETable+3*(++iCE))); 313 nextContCE = (*(CETable+3*(iCE)+1)); 314 } 315 316 lh->nextCE = nextCE; 317 lh->nextContCE = nextContCE; 318 319 return iCE; 320 } 321 322 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) { 323 /* reset all the gaps */ 324 int32_t i = 0; 325 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 326 uint32_t st = 0; 327 uint32_t t1, t2; 328 int32_t pos; 329 330 UColToken *tok = lh->first; 331 uint32_t tokStrength = tok->strength; 332 333 for(i = 0; i<3; i++) { 334 lh->gapsHi[3*i] = 0; 335 lh->gapsHi[3*i+1] = 0; 336 lh->gapsHi[3*i+2] = 0; 337 lh->gapsLo[3*i] = 0; 338 lh->gapsLo[3*i+1] = 0; 339 lh->gapsLo[3*i+2] = 0; 340 lh->numStr[i] = 0; 341 lh->fStrToken[i] = NULL; 342 lh->lStrToken[i] = NULL; 343 lh->pos[i] = -1; 344 } 345 346 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); 347 348 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ 349 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */ 350 lh->pos[0] = 0; 351 t1 = lh->baseCE; 352 t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION; 353 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 354 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 355 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 356 uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16); 357 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1); 358 359 t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; 360 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER; 361 362 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 363 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 364 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 365 } else if(lh->indirect == TRUE && lh->nextCE != 0) { 366 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) { 367 lh->pos[0] = 0; 368 t1 = lh->baseCE; 369 t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION; 370 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 371 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 372 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 373 t1 = lh->nextCE; 374 t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION; 375 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 376 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 377 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 378 } else { 379 for(;;) { 380 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { 381 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) { 382 lh->fStrToken[tokStrength] = tok; 383 } else { /* The CE must be implicit, since it's not in the table */ 384 /* Error */ 385 *status = U_INTERNAL_PROGRAM_ERROR; 386 } 387 } 388 389 while(tok != NULL && tok->strength >= tokStrength) { 390 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { 391 lh->lStrToken[tokStrength] = tok; 392 } 393 tok = tok->next; 394 } 395 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) { 396 /* check if previous interval is the same and merge the intervals if it is so */ 397 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) { 398 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1]; 399 lh->fStrToken[tokStrength+1] = NULL; 400 lh->lStrToken[tokStrength+1] = NULL; 401 lh->pos[tokStrength+1] = -1; 402 } 403 } 404 if(tok != NULL) { 405 tokStrength = tok->strength; 406 } else { 407 break; 408 } 409 } 410 for(st = 0; st < 3; st++) { 411 if((pos = lh->pos[st]) >= 0) { 412 t1 = *(CETable+3*(pos)); 413 t2 = *(CETable+3*(pos)+1); 414 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 415 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 416 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 417 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; 418 //pos--; 419 //t1 = *(CETable+3*(pos)); 420 //t2 = *(CETable+3*(pos)+1); 421 t1 = lh->baseCE; 422 t2 = lh->baseContCE; 423 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 424 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 425 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; 426 } 427 } 428 } 429 } 430 431 432 #define ucol_countBytes(value, noOfBytes) \ 433 { \ 434 uint32_t mask = 0xFFFFFFFF; \ 435 (noOfBytes) = 0; \ 436 while(mask != 0) { \ 437 if(((value) & mask) != 0) { \ 438 (noOfBytes)++; \ 439 } \ 440 mask >>= 8; \ 441 } \ 442 } 443 444 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) { 445 if(U_SUCCESS(*status)) { 446 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); 447 } 448 return g->current; 449 } 450 451 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) { 452 /* TODO: rename to enum names */ 453 uint32_t high, low, count=1; 454 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF; 455 456 if(strength == UCOL_SECONDARY) { 457 low = UCOL_COMMON_TOP2<<24; 458 high = 0xFFFFFFFF; 459 count = 0xFF - UCOL_COMMON_TOP2; 460 } else { 461 low = UCOL_BYTE_COMMON << 24; //0x05000000; 462 high = 0x40000000; 463 count = 0x40 - UCOL_BYTE_COMMON; 464 } 465 466 if(tok->next != NULL && tok->next->strength == strength) { 467 count = tok->next->toInsert; 468 } 469 470 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); 471 g->current = UCOL_BYTE_COMMON<<24; 472 473 if(g->noOfRanges == 0) { 474 *status = U_INTERNAL_PROGRAM_ERROR; 475 } 476 return g->current; 477 } 478 479 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) { 480 uint32_t strength = tok->strength; 481 uint32_t low = lows[fStrength*3+strength]; 482 uint32_t high = highs[fStrength*3+strength]; 483 uint32_t maxByte = 0; 484 if(strength == UCOL_TERTIARY) { 485 maxByte = 0x3F; 486 } else if(strength == UCOL_PRIMARY) { 487 maxByte = 0xFE; 488 } else { 489 maxByte = 0xFF; 490 } 491 492 uint32_t count = tok->toInsert; 493 494 if(low >= high && strength > UCOL_PRIMARY) { 495 int32_t s = strength; 496 for(;;) { 497 s--; 498 if(lows[fStrength*3+s] != highs[fStrength*3+s]) { 499 if(strength == UCOL_SECONDARY) { 500 if (low < UCOL_COMMON_TOP2<<24 ) { 501 // Override if low range is less than UCOL_COMMON_TOP2. 502 low = UCOL_COMMON_TOP2<<24; 503 } 504 high = 0xFFFFFFFF; 505 } else { 506 // Override if low range is less than UCOL_COMMON_BOT3. 507 if ( low < UCOL_COMMON_BOT3<<24 ) { 508 low = UCOL_COMMON_BOT3<<24; 509 } 510 high = 0x40000000; 511 } 512 break; 513 } 514 if(s<0) { 515 *status = U_INTERNAL_PROGRAM_ERROR; 516 return 0; 517 } 518 } 519 } 520 521 if(low < 0x02000000) { 522 // We must not use CE weight byte 02, so we set it as the minimum lower bound. 523 // See http://site.icu-project.org/design/collation/bytes 524 low = 0x02000000; 525 } 526 527 if(strength == UCOL_SECONDARY) { /* similar as simple */ 528 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) { 529 low = UCOL_COMMON_TOP2<<24; 530 } 531 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) { 532 high = UCOL_COMMON_TOP2<<24; 533 } 534 if(low < (UCOL_COMMON_BOT2<<24)) { 535 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges); 536 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); 537 //g->current = UCOL_COMMON_BOT2<<24; 538 return g->current; 539 } 540 } 541 542 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); 543 if(g->noOfRanges == 0) { 544 *status = U_INTERNAL_PROGRAM_ERROR; 545 } 546 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); 547 return g->current; 548 } 549 550 static 551 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { 552 uint32_t i = 0; 553 UChar c; 554 555 if(U_FAILURE(*status)) { 556 return 0; 557 } 558 559 if(sourceLen > resLen) { 560 *status = U_MEMORY_ALLOCATION_ERROR; 561 return 0; 562 } 563 564 for(i = 0; i < sourceLen; i++) { 565 c = source[i]; 566 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ 567 switch(c - 0x3000) { 568 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E: 569 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE: 570 c++; 571 break; 572 case 0xF5: 573 c = 0x30AB; 574 break; 575 case 0xF6: 576 c = 0x30B1; 577 break; 578 } 579 } 580 resBuf[i] = c; 581 } 582 return sourceLen; 583 } 584 585 static 586 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { 587 uint32_t i = 0; 588 UChar c; 589 590 if(U_FAILURE(*status)) { 591 return 0; 592 } 593 594 if(sourceLen > resLen) { 595 *status = U_MEMORY_ALLOCATION_ERROR; 596 return 0; 597 } 598 599 for(i = 0; i < sourceLen; i++) { 600 c = source[i]; 601 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ 602 switch(c - 0x3000) { 603 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F: 604 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF: 605 c--; 606 break; 607 case 0xAB: 608 c = 0x30F5; 609 break; 610 case 0xB1: 611 c = 0x30F6; 612 break; 613 } 614 } 615 resBuf[i] = c; 616 } 617 return sourceLen; 618 } 619 620 U_NAMESPACE_BEGIN 621 622 static 623 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) { 624 uint32_t i = 0; 625 UChar n[128]; 626 uint32_t nLen = 0; 627 uint32_t uCount = 0, lCount = 0; 628 629 collIterate s; 630 uint32_t order = 0; 631 632 if(U_FAILURE(*status)) { 633 return UCOL_LOWER_CASE; 634 } 635 636 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status); 637 if(U_SUCCESS(*status)) { 638 for(i = 0; i < nLen; i++) { 639 uprv_init_collIterate(UCA, &n[i], 1, &s, status); 640 order = ucol_getNextCE(UCA, &s, status); 641 if(isContinuation(order)) { 642 *status = U_INTERNAL_PROGRAM_ERROR; 643 return UCOL_LOWER_CASE; 644 } 645 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) { 646 uCount++; 647 } else { 648 if(u_islower(n[i])) { 649 lCount++; 650 } else if(U_SUCCESS(*status)) { 651 UChar sk[1], lk[1]; 652 u_toSmallKana(&n[i], 1, sk, 1, status); 653 u_toLargeKana(&n[i], 1, lk, 1, status); 654 if(sk[0] == n[i] && lk[0] != n[i]) { 655 lCount++; 656 } 657 } 658 } 659 } 660 } 661 662 if(uCount != 0 && lCount != 0) { 663 return UCOL_MIXED_CASE; 664 } else if(uCount != 0) { 665 return UCOL_UPPER_CASE; 666 } else { 667 return UCOL_LOWER_CASE; 668 } 669 } 670 671 672 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) { 673 /* this one makes the table and stuff */ 674 uint32_t noOfBytes[3]; 675 uint32_t i; 676 677 for(i = 0; i<3; i++) { 678 ucol_countBytes(CEparts[i], noOfBytes[i]); 679 } 680 681 /* Here we have to pack CEs from parts */ 682 683 uint32_t CEi = 0; 684 uint32_t value = 0; 685 686 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) { 687 if(CEi > 0) { 688 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ 689 } else { 690 value = 0; 691 } 692 693 if(2*CEi<noOfBytes[0]) { 694 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16; 695 } 696 if(CEi<noOfBytes[1]) { 697 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8; 698 } 699 if(CEi<noOfBytes[2]) { 700 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F); 701 } 702 tok->CEs[CEi] = value; 703 CEi++; 704 } 705 if(CEi == 0) { /* totally ignorable */ 706 tok->noOfCEs = 1; 707 tok->CEs[0] = 0; 708 } else { /* there is at least something */ 709 tok->noOfCEs = CEi; 710 } 711 712 713 // we want to set case bits here and now, not later. 714 // Case bits handling 715 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables 716 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field 717 int32_t cSize = (tok->source & 0xFF000000) >> 24; 718 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source; 719 720 if(cSize > 1) { 721 // Do it manually 722 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status); 723 } else { 724 // Copy it from the UCA 725 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status); 726 tok->CEs[0] |= (caseCE & 0xC0); 727 } 728 } 729 730 #if UCOL_DEBUG==2 731 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2])); 732 for(i = 0; i<tok->noOfCEs; i++) { 733 fprintf(stderr, "%08X ", tok->CEs[i]); 734 } 735 fprintf(stderr, "\n"); 736 #endif 737 } 738 739 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) { 740 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT]; 741 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT]; 742 743 UColToken *tok = lh->last; 744 uint32_t t[UCOL_STRENGTH_LIMIT]; 745 746 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t)); 747 748 /* must initialize ranges to avoid memory check warnings */ 749 for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) { 750 uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges)); 751 } 752 753 tok->toInsert = 1; 754 t[tok->strength] = 1; 755 756 while(tok->previous != NULL) { 757 if(tok->previous->strength < tok->strength) { /* going up */ 758 t[tok->strength] = 0; 759 t[tok->previous->strength]++; 760 } else if(tok->previous->strength > tok->strength) { /* going down */ 761 t[tok->previous->strength] = 1; 762 } else { 763 t[tok->strength]++; 764 } 765 tok=tok->previous; 766 tok->toInsert = t[tok->strength]; 767 } 768 769 tok->toInsert = t[tok->strength]; 770 ucol_inv_getGapPositions(src, lh, status); 771 772 #if UCOL_DEBUG 773 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE); 774 int32_t j = 2; 775 for(j = 2; j >= 0; j--) { 776 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]); 777 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]); 778 } 779 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE]; 780 781 do { 782 fprintf(stderr,"%i", tok->strength); 783 tok = tok->next; 784 } while(tok != NULL); 785 fprintf(stderr, "\n"); 786 787 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE]; 788 789 do { 790 fprintf(stderr,"%i", tok->toInsert); 791 tok = tok->next; 792 } while(tok != NULL); 793 #endif 794 795 tok = lh->first; 796 uint32_t fStrength = UCOL_IDENTICAL; 797 uint32_t initStrength = UCOL_IDENTICAL; 798 799 800 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16; 801 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8; 802 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16; 803 804 while (tok != NULL && U_SUCCESS(*status)) { 805 fStrength = tok->strength; 806 if(fStrength < initStrength) { 807 initStrength = fStrength; 808 if(lh->pos[fStrength] == -1) { 809 while(lh->pos[fStrength] == -1 && fStrength > 0) { 810 fStrength--; 811 } 812 if(lh->pos[fStrength] == -1) { 813 *status = U_INTERNAL_PROGRAM_ERROR; 814 return; 815 } 816 } 817 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */ 818 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; 819 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1]; 820 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */ 821 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); 822 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */ 823 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; 824 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/ 825 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); 826 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); 827 } else { /* primaries */ 828 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/ 829 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); 830 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status); 831 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); 832 } 833 } else { 834 if(tok->strength == UCOL_TERTIARY) { 835 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status); 836 } else if(tok->strength == UCOL_SECONDARY) { 837 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status); 838 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); 839 } else if(tok->strength == UCOL_PRIMARY) { 840 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status); 841 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status); 842 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); 843 } 844 } 845 ucol_doCE(src, CEparts, tok, status); 846 tok = tok->next; 847 } 848 } 849 850 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) { 851 UCAElements el; 852 UColToken *tok = lh->first; 853 UColToken *expt = NULL; 854 uint32_t i = 0, j = 0; 855 const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status); 856 857 while(tok != NULL && U_SUCCESS(*status)) { 858 /* first, check if there are any expansions */ 859 /* if there are expansions, we need to do a little bit more processing */ 860 /* since parts of expansion can be tailored, while others are not */ 861 if(tok->expansion != 0) { 862 uint32_t len = tok->expansion >> 24; 863 uint32_t currentSequenceLen = len; 864 uint32_t expOffset = tok->expansion & 0x00FFFFFF; 865 //uint32_t exp = currentSequenceLen | expOffset; 866 UColToken exp; 867 exp.source = currentSequenceLen | expOffset; 868 exp.rulesToParseHdl = &(src->source); 869 870 while(len > 0) { 871 currentSequenceLen = len; 872 while(currentSequenceLen > 0) { 873 exp.source = (currentSequenceLen << 24) | expOffset; 874 if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */ 875 uint32_t noOfCEsToCopy = expt->noOfCEs; 876 for(j = 0; j<noOfCEsToCopy; j++) { 877 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j]; 878 } 879 tok->noOfExpCEs += noOfCEsToCopy; 880 // Smart people never try to add codepoints and CEs. 881 // For some odd reason, it won't work. 882 expOffset += currentSequenceLen; //noOfCEsToCopy; 883 len -= currentSequenceLen; //noOfCEsToCopy; 884 break; 885 } else { 886 currentSequenceLen--; 887 } 888 } 889 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */ 890 /* will have to get one from UCA */ 891 /* first, get the UChars from the rules */ 892 /* then pick CEs out until there is no more and stuff them into expansion */ 893 collIterate s; 894 uint32_t order = 0; 895 uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status); 896 897 for(;;) { 898 order = ucol_getNextCE(src->UCA, &s, status); 899 if(order == UCOL_NO_MORE_CES) { 900 break; 901 } 902 tok->expCEs[tok->noOfExpCEs++] = order; 903 } 904 expOffset++; 905 len--; 906 } 907 } 908 } else { 909 tok->noOfExpCEs = 0; 910 } 911 912 /* set the ucaelement with obtained values */ 913 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs; 914 /* copy CEs */ 915 for(i = 0; i<tok->noOfCEs; i++) { 916 el.CEs[i] = tok->CEs[i]; 917 } 918 for(i = 0; i<tok->noOfExpCEs; i++) { 919 el.CEs[i+tok->noOfCEs] = tok->expCEs[i]; 920 } 921 922 /* copy UChars */ 923 // We kept prefix and source kind of together, as it is a kind of a contraction. 924 // However, now we have to slice the prefix off the main thing - 925 el.prefix = el.prefixChars; 926 el.cPoints = el.uchars; 927 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the 928 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND 929 // decomposed elements to the unsaf table. 930 el.prefixSize = tok->prefix>>24; 931 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar)); 932 933 el.cSize = (tok->source >> 24)-(tok->prefix>>24); 934 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar)); 935 } else { 936 el.prefixSize = 0; 937 *el.prefix = 0; 938 939 el.cSize = (tok->source >> 24); 940 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar)); 941 } 942 if(src->UCA != NULL) { 943 for(i = 0; i<el.cSize; i++) { 944 if(UCOL_ISJAMO(el.cPoints[i])) { 945 t->image->jamoSpecial = TRUE; 946 } 947 } 948 if (!src->buildCCTabFlag && el.cSize > 0) { 949 // Check the trailing canonical combining class (tccc) of the last character. 950 const UChar *s = el.cPoints + el.cSize; 951 uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s); 952 if ((fcd & 0xff) != 0) { 953 src->buildCCTabFlag = TRUE; 954 } 955 } 956 } 957 958 /* and then, add it */ 959 #if UCOL_DEBUG==2 960 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]); 961 #endif 962 uprv_uca_addAnElement(t, &el, status); 963 964 #if UCOL_DEBUG_DUPLICATES 965 if(*status != U_ZERO_ERROR) { 966 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource); 967 *status = U_ZERO_ERROR; 968 } 969 #endif 970 971 tok = tok->next; 972 } 973 } 974 975 U_CDECL_BEGIN 976 static UBool U_CALLCONV 977 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) { 978 UErrorCode status = U_ZERO_ERROR; 979 tempUCATable *t = (tempUCATable *)context; 980 if(value == 0) { 981 while(start < limit) { 982 uint32_t CE = utrie_get32(t->mapping, start, NULL); 983 if(CE == UCOL_NOT_FOUND) { 984 UCAElements el; 985 el.isThai = FALSE; 986 el.prefixSize = 0; 987 el.prefixChars[0] = 0; 988 el.prefix = el.prefixChars; 989 el.cPoints = el.uchars; 990 991 el.cSize = 0; 992 U16_APPEND_UNSAFE(el.uchars, el.cSize, start); 993 994 el.noOfCEs = 1; 995 el.CEs[0] = 0; 996 uprv_uca_addAnElement(t, &el, &status); 997 998 } 999 start++; 1000 } 1001 } 1002 if(U_FAILURE(status)) { 1003 return FALSE; 1004 } else { 1005 return TRUE; 1006 } 1007 } 1008 U_CDECL_END 1009 1010 static void 1011 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t, 1012 UChar32 start, UChar32 end, 1013 UErrorCode *status) 1014 { 1015 //UChar decomp[256]; 1016 uint32_t CE = UCOL_NOT_FOUND; 1017 UChar32 u = 0; 1018 UCAElements el; 1019 el.isThai = FALSE; 1020 el.prefixSize = 0; 1021 el.prefixChars[0] = 0; 1022 collIterate colIt; 1023 1024 if(U_SUCCESS(*status)) { 1025 for(u = start; u<=end; u++) { 1026 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND 1027 /* this test is for contractions that are missing the starting element. */ 1028 || ((isCntTableElement(CE)) && 1029 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND)) 1030 ) 1031 { 1032 el.cSize = 0; 1033 U16_APPEND_UNSAFE(el.uchars, el.cSize, u); 1034 //decomp[0] = (UChar)u; 1035 //el.uchars[0] = (UChar)u; 1036 el.cPoints = el.uchars; 1037 //el.cSize = 1; 1038 el.noOfCEs = 0; 1039 el.prefix = el.prefixChars; 1040 el.prefixSize = 0; 1041 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt); 1042 // We actually want to check whether this element is a special 1043 // If it is an implicit element (hangul, CJK - we want to copy the 1044 // special, not the resolved CEs) - for hangul, copying resolved 1045 // would just make things the same (there is an expansion and it 1046 // takes approximately the same amount of time to resolve as 1047 // falling back to the UCA). 1048 /* 1049 UTRIE_GET32(src->UCA->mapping, u, CE); 1050 tag = getCETag(CE); 1051 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG 1052 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG 1053 || tag == LEAD_SURROGATE_TAG) { 1054 el.CEs[el.noOfCEs++] = CE; 1055 } else { 1056 */ 1057 // It turns out that it does not make sense to keep implicits 1058 // unresolved. The cost of resolving them is big enough so that 1059 // it doesn't make any difference whether we have to go to the UCA 1060 // or not. 1061 { 1062 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status); 1063 while(CE != UCOL_NO_MORE_CES) { 1064 CE = ucol_getNextCE(src->UCA, &colIt, status); 1065 if(CE != UCOL_NO_MORE_CES) { 1066 el.CEs[el.noOfCEs++] = CE; 1067 } 1068 } 1069 } 1070 uprv_uca_addAnElement(t, &el, status); 1071 } 1072 } 1073 } 1074 } 1075 1076 U_NAMESPACE_END 1077 1078 U_CFUNC UCATableHeader * 1079 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) { 1080 U_NAMESPACE_USE 1081 1082 uint32_t i = 0; 1083 if(U_FAILURE(*status)) { 1084 return NULL; 1085 } 1086 /* 1087 2. Eliminate the negative lists by doing the following for each non-null negative list: 1088 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE, 1089 create new ListHeader X 1090 o reverse the list, add to the end of X's positive list. Reset the strength of the 1091 first item you add, based on the stronger strength levels of the two lists. 1092 */ 1093 /* 1094 3. For each ListHeader with a non-null positive list: 1095 */ 1096 /* 1097 o Find all character strings with CEs between the baseCE and the 1098 next/previous CE, at the strength of the first token. Add these to the 1099 tailoring. 1100 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the 1101 tailoring has & x < z... 1102 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ... 1103 */ 1104 /* It is possible that this part should be done even while constructing list */ 1105 /* The problem is that it is unknown what is going to be the strongest weight */ 1106 /* So we might as well do it here */ 1107 1108 /* 1109 o Allocate CEs for each token in the list, based on the total number N of the 1110 largest level difference, and the gap G between baseCE and nextCE at that 1111 level. The relation * between the last item and nextCE is the same as the 1112 strongest strength. 1113 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1) 1114 ? There are 3 primary items: a, d, e. Fit them into the primary gap. 1115 Then fit b and c into the secondary gap between a and d, then fit q 1116 into the tertiary gap between b and c. 1117 1118 o Example: baseCE << b <<< q << c * nextCE(X,2) 1119 ? There are 2 secondary items: b, c. Fit them into the secondary gap. 1120 Then fit q into the tertiary gap between b and c. 1121 o When incrementing primary values, we will not cross high byte 1122 boundaries except where there is only a single-byte primary. That is to 1123 ensure that the script reordering will continue to work. 1124 */ 1125 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader)); 1126 /* test for NULL */ 1127 if (image == NULL) { 1128 *status = U_MEMORY_ALLOCATION_ERROR; 1129 return NULL; 1130 } 1131 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader)); 1132 1133 for(i = 0; i<src->resultLen; i++) { 1134 /* now we need to generate the CEs */ 1135 /* We stuff the initial value in the buffers, and increase the appropriate buffer */ 1136 /* According to strength */ 1137 if(U_SUCCESS(*status)) { 1138 if(src->lh[i].first) { // if there are any elements 1139 // due to the way parser works, subsequent tailorings 1140 // may remove all the elements from a sequence, therefore 1141 // leaving an empty tailoring sequence. 1142 ucol_initBuffers(src, &src->lh[i], status); 1143 } 1144 } 1145 if(U_FAILURE(*status)) { 1146 uprv_free(image); 1147 return NULL; 1148 } 1149 } 1150 1151 if(src->varTop != NULL) { /* stuff the variable top value */ 1152 src->opts->variableTopValue = (*(src->varTop->CEs))>>16; 1153 /* remove it from the list */ 1154 if(src->varTop->listHeader->first == src->varTop) { /* first in list */ 1155 src->varTop->listHeader->first = src->varTop->next; 1156 } 1157 if(src->varTop->listHeader->last == src->varTop) { /* first in list */ 1158 src->varTop->listHeader->last = src->varTop->previous; 1159 } 1160 if(src->varTop->next != NULL) { 1161 src->varTop->next->previous = src->varTop->previous; 1162 } 1163 if(src->varTop->previous != NULL) { 1164 src->varTop->previous->next = src->varTop->next; 1165 } 1166 } 1167 1168 1169 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status); 1170 if(U_FAILURE(*status)) { 1171 uprv_free(image); 1172 return NULL; 1173 } 1174 1175 1176 /* After this, we have assigned CE values to all regular CEs */ 1177 /* now we will go through list once more and resolve expansions, */ 1178 /* make UCAElements structs and add them to table */ 1179 for(i = 0; i<src->resultLen; i++) { 1180 /* now we need to generate the CEs */ 1181 /* We stuff the initial value in the buffers, and increase the appropriate buffer */ 1182 /* According to strength */ 1183 if(U_SUCCESS(*status)) { 1184 ucol_createElements(src, t, &src->lh[i], status); 1185 } 1186 } 1187 1188 UCAElements el; 1189 el.isThai = FALSE; 1190 el.prefixSize = 0; 1191 el.prefixChars[0] = 0; 1192 1193 /* add latin-1 stuff */ 1194 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status); 1195 1196 /* add stuff for copying */ 1197 if(src->copySet != NULL) { 1198 int32_t i = 0; 1199 UnicodeSet *set = (UnicodeSet *)src->copySet; 1200 for(i = 0; i < set->getRangeCount(); i++) { 1201 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status); 1202 } 1203 } 1204 1205 if(U_SUCCESS(*status)) { 1206 /* copy contractions from the UCA - this is felt mostly for cyrillic*/ 1207 1208 uint32_t tailoredCE = UCOL_NOT_FOUND; 1209 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos); 1210 int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth; 1211 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status); 1212 // Check for null pointer 1213 if (ucaEl == NULL) { 1214 *status = U_MEMORY_ALLOCATION_ERROR; 1215 return NULL; 1216 } 1217 while(*conts != 0) { 1218 // A continuation is NUL-terminated and NUL-padded 1219 // except if it has the maximum length. 1220 int32_t contractionLength = maxUCAContractionLength; 1221 while(contractionLength > 0 && conts[contractionLength - 1] == 0) { 1222 --contractionLength; 1223 } 1224 UChar32 first; 1225 int32_t firstLength = 0; 1226 U16_NEXT(conts, firstLength, contractionLength, first); 1227 tailoredCE = utrie_get32(t->mapping, first, NULL); 1228 if(tailoredCE != UCOL_NOT_FOUND) { 1229 UBool needToAdd = TRUE; 1230 if(isCntTableElement(tailoredCE)) { 1231 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) { 1232 needToAdd = FALSE; 1233 } 1234 } 1235 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) { 1236 UCAElements elm; 1237 elm.cPoints = el.uchars; 1238 elm.noOfCEs = 0; 1239 elm.uchars[0] = *conts; 1240 elm.uchars[1] = 0; 1241 elm.cSize = 1; 1242 elm.prefixChars[0] = *(conts+2); 1243 elm.isThai = FALSE; 1244 elm.prefix = elm.prefixChars; 1245 elm.prefixSize = 1; 1246 UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm); 1247 if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) { 1248 needToAdd = TRUE; 1249 } 1250 } 1251 if(src->removeSet != NULL && uset_contains(src->removeSet, first)) { 1252 needToAdd = FALSE; 1253 } 1254 1255 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored. 1256 if (*(conts+1) != 0) { // contractions 1257 el.prefix = el.prefixChars; 1258 el.prefixSize = 0; 1259 el.cPoints = el.uchars; 1260 el.noOfCEs = 0; 1261 u_memcpy(el.uchars, conts, contractionLength); 1262 el.cSize = contractionLength; 1263 ucol_setText(ucaEl, el.uchars, el.cSize, status); 1264 } 1265 else { // pre-context character 1266 UChar str[4] = { 0 }; 1267 int32_t len=0; 1268 int32_t preKeyLen=0; 1269 1270 el.cPoints = el.uchars; 1271 el.noOfCEs = 0; 1272 el.uchars[0] = *conts; 1273 el.uchars[1] = 0; 1274 el.cSize = 1; 1275 el.prefixChars[0] = *(conts+2); 1276 el.prefix = el.prefixChars; 1277 el.prefixSize = 1; 1278 if (el.prefixChars[0]!=0) { 1279 // get CE of prefix character first 1280 str[0]=el.prefixChars[0]; 1281 str[1]=0; 1282 ucol_setText(ucaEl, str, 1, status); 1283 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) 1284 != UCOL_NULLORDER) { 1285 preKeyLen++; // count number of keys for prefix character 1286 } 1287 str[len++] = el.prefixChars[0]; 1288 } 1289 1290 str[len++] = el.uchars[0]; 1291 str[len]=0; 1292 ucol_setText(ucaEl, str, len, status); 1293 // Skip the keys for prefix character, then copy the rest to el. 1294 while ((preKeyLen-->0) && 1295 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { 1296 continue; 1297 } 1298 1299 } 1300 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { 1301 el.noOfCEs++; 1302 } 1303 uprv_uca_addAnElement(t, &el, status); 1304 } 1305 1306 } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) { 1307 ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status); 1308 } 1309 conts+=maxUCAContractionLength; 1310 } 1311 ucol_closeElements(ucaEl); 1312 } 1313 1314 // Add completely ignorable elements 1315 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t); 1316 1317 // add tailoring characters related canonical closures 1318 uprv_uca_canonicalClosure(t, src, NULL, status); 1319 1320 /* still need to produce compatibility closure */ 1321 1322 UCATableHeader *myData = uprv_uca_assembleTable(t, status); 1323 1324 uprv_uca_closeTempTable(t); 1325 uprv_free(image); 1326 1327 return myData; 1328 } 1329 1330 U_CDECL_BEGIN 1331 static UBool U_CALLCONV 1332 ucol_bld_cleanup(void) 1333 { 1334 udata_close(invUCA_DATA_MEM); 1335 invUCA_DATA_MEM = NULL; 1336 _staticInvUCA = NULL; 1337 return TRUE; 1338 } 1339 U_CDECL_END 1340 1341 U_CAPI const InverseUCATableHeader * U_EXPORT2 1342 ucol_initInverseUCA(UErrorCode *status) 1343 { 1344 if(U_FAILURE(*status)) return NULL; 1345 1346 UBool needsInit; 1347 UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit); 1348 1349 if(needsInit) { 1350 InverseUCATableHeader *newInvUCA = NULL; 1351 UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status); 1352 1353 if(U_FAILURE(*status)) { 1354 if (result) { 1355 udata_close(result); 1356 } 1357 // This is not needed, as we are talking about 1358 // memory we got from UData 1359 //uprv_free(newInvUCA); 1360 } 1361 1362 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */ 1363 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result); 1364 UCollator *UCA = ucol_initUCA(status); 1365 // UCA versions of UCA and inverse UCA should match 1366 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) { 1367 *status = U_INVALID_FORMAT_ERROR; 1368 udata_close(result); 1369 return NULL; 1370 } 1371 1372 umtx_lock(NULL); 1373 if(_staticInvUCA == NULL) { 1374 invUCA_DATA_MEM = result; 1375 _staticInvUCA = newInvUCA; 1376 result = NULL; 1377 newInvUCA = NULL; 1378 } 1379 umtx_unlock(NULL); 1380 1381 if(newInvUCA != NULL) { 1382 udata_close(result); 1383 // This is not needed, as we are talking about 1384 // memory we got from UData 1385 //uprv_free(newInvUCA); 1386 } 1387 else { 1388 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup); 1389 } 1390 } 1391 } 1392 return _staticInvUCA; 1393 } 1394 1395 /* This is the data that is used for non-script reordering codes. These _must_ be kept 1396 * in order that they are to be applied as defaults and in synch with the UColReorderCode enum. 1397 */ 1398 static const char * const ReorderingTokenNames[] = { 1399 "SPACE", 1400 "PUNCT", 1401 "SYMBOL", 1402 "CURRENCY", 1403 "DIGIT" 1404 }; 1405 1406 static void toUpper(const char* src, char* dst, uint32_t length) { 1407 for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) { 1408 *dst = uprv_toupper(*src); 1409 } 1410 *dst = '\0'; 1411 } 1412 1413 U_INTERNAL int32_t U_EXPORT2 1414 ucol_findReorderingEntry(const char* name) { 1415 char buffer[32]; 1416 toUpper(name, buffer, 32); 1417 for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) { 1418 if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) { 1419 return entry + UCOL_REORDER_CODE_FIRST; 1420 } 1421 } 1422 return USCRIPT_INVALID_CODE; 1423 } 1424 1425 #endif /* #if !UCONFIG_NO_COLLATION */ 1426