1 /* 2 ******************************************************************************* 3 * Copyright (C) 2004-2010, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: ucol_sit.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * Modification history 12 * Date Name Comments 13 * 03/12/2004 weiv Creation 14 */ 15 16 #include "unicode/ustring.h" 17 #include "unicode/udata.h" 18 19 #include "utracimp.h" 20 #include "ucol_imp.h" 21 #include "ucol_tok.h" 22 #include "cmemory.h" 23 #include "cstring.h" 24 #include "uresimp.h" 25 26 #if !UCONFIG_NO_COLLATION 27 28 enum OptionsList { 29 UCOL_SIT_LANGUAGE = 0, 30 UCOL_SIT_SCRIPT, 31 UCOL_SIT_REGION, 32 UCOL_SIT_VARIANT, 33 UCOL_SIT_KEYWORD, 34 UCOL_SIT_BCP47, 35 UCOL_SIT_STRENGTH, 36 UCOL_SIT_CASE_LEVEL, 37 UCOL_SIT_CASE_FIRST, 38 UCOL_SIT_NUMERIC_COLLATION, 39 UCOL_SIT_ALTERNATE_HANDLING, 40 UCOL_SIT_NORMALIZATION_MODE, 41 UCOL_SIT_FRENCH_COLLATION, 42 UCOL_SIT_HIRAGANA_QUATERNARY, 43 UCOL_SIT_VARIABLE_TOP, 44 UCOL_SIT_VARIABLE_TOP_VALUE, 45 UCOL_SIT_ITEMS_COUNT 46 }; 47 48 /* option starters chars. */ 49 static const char alternateHArg = 'A'; 50 static const char variableTopValArg = 'B'; 51 static const char caseFirstArg = 'C'; 52 static const char numericCollArg = 'D'; 53 static const char caseLevelArg = 'E'; 54 static const char frenchCollArg = 'F'; 55 static const char hiraganaQArg = 'H'; 56 static const char keywordArg = 'K'; 57 static const char languageArg = 'L'; 58 static const char normArg = 'N'; 59 static const char regionArg = 'R'; 60 static const char strengthArg = 'S'; 61 static const char variableTopArg = 'T'; 62 static const char variantArg = 'V'; 63 static const char RFC3066Arg = 'X'; 64 static const char scriptArg = 'Z'; 65 66 static const char collationKeyword[] = "@collation="; 67 68 static const int32_t locElementCount = 5; 69 static const int32_t locElementCapacity = 32; 70 static const int32_t loc3066Capacity = 256; 71 static const int32_t internalBufferSize = 512; 72 73 /* structure containing specification of a collator. Initialized 74 * from a short string. Also used to construct a short string from a 75 * collator instance 76 */ 77 struct CollatorSpec { 78 char locElements[locElementCount][locElementCapacity]; 79 char locale[loc3066Capacity]; 80 UColAttributeValue options[UCOL_ATTRIBUTE_COUNT]; 81 uint32_t variableTopValue; 82 UChar variableTopString[locElementCapacity]; 83 int32_t variableTopStringLen; 84 UBool variableTopSet; 85 struct { 86 const char *start; 87 int32_t len; 88 } entries[UCOL_SIT_ITEMS_COUNT]; 89 }; 90 91 92 /* structure for converting between character attribute 93 * representation and real collation attribute value. 94 */ 95 struct AttributeConversion { 96 char letter; 97 UColAttributeValue value; 98 }; 99 100 static const AttributeConversion conversions[12] = { 101 { '1', UCOL_PRIMARY }, 102 { '2', UCOL_SECONDARY }, 103 { '3', UCOL_TERTIARY }, 104 { '4', UCOL_QUATERNARY }, 105 { 'D', UCOL_DEFAULT }, 106 { 'I', UCOL_IDENTICAL }, 107 { 'L', UCOL_LOWER_FIRST }, 108 { 'N', UCOL_NON_IGNORABLE }, 109 { 'O', UCOL_ON }, 110 { 'S', UCOL_SHIFTED }, 111 { 'U', UCOL_UPPER_FIRST }, 112 { 'X', UCOL_OFF } 113 }; 114 115 116 static char 117 ucol_sit_attributeValueToLetter(UColAttributeValue value, UErrorCode *status) { 118 uint32_t i = 0; 119 for(i = 0; i < sizeof(conversions)/sizeof(conversions[0]); i++) { 120 if(conversions[i].value == value) { 121 return conversions[i].letter; 122 } 123 } 124 *status = U_ILLEGAL_ARGUMENT_ERROR; 125 return 0; 126 } 127 128 static UColAttributeValue 129 ucol_sit_letterToAttributeValue(char letter, UErrorCode *status) { 130 uint32_t i = 0; 131 for(i = 0; i < sizeof(conversions)/sizeof(conversions[0]); i++) { 132 if(conversions[i].letter == letter) { 133 return conversions[i].value; 134 } 135 } 136 *status = U_ILLEGAL_ARGUMENT_ERROR; 137 return UCOL_DEFAULT; 138 } 139 140 /* function prototype for functions used to parse a short string */ 141 U_CDECL_BEGIN 142 typedef const char* U_CALLCONV 143 ActionFunction(CollatorSpec *spec, uint32_t value1, const char* string, 144 UErrorCode *status); 145 U_CDECL_END 146 147 U_CDECL_BEGIN 148 static const char* U_CALLCONV 149 _processLocaleElement(CollatorSpec *spec, uint32_t value, const char* string, 150 UErrorCode *status) 151 { 152 int32_t len = 0; 153 do { 154 if(value == 0 || value == 4) { 155 spec->locElements[value][len++] = uprv_tolower(*string); 156 } else { 157 spec->locElements[value][len++] = *string; 158 } 159 } while(*(++string) != '_' && *string && len < locElementCapacity); 160 if(len >= locElementCapacity) { 161 *status = U_BUFFER_OVERFLOW_ERROR; 162 return string; 163 } 164 // don't skip the underscore at the end 165 return string; 166 } 167 U_CDECL_END 168 169 U_CDECL_BEGIN 170 static const char* U_CALLCONV 171 _processRFC3066Locale(CollatorSpec *spec, uint32_t, const char* string, 172 UErrorCode *status) 173 { 174 char terminator = *string; 175 string++; 176 const char *end = uprv_strchr(string+1, terminator); 177 if(end == NULL || end - string >= loc3066Capacity) { 178 *status = U_BUFFER_OVERFLOW_ERROR; 179 return string; 180 } else { 181 uprv_strncpy(spec->locale, string, end-string); 182 return end+1; 183 } 184 } 185 186 U_CDECL_END 187 188 U_CDECL_BEGIN 189 static const char* U_CALLCONV 190 _processCollatorOption(CollatorSpec *spec, uint32_t option, const char* string, 191 UErrorCode *status) 192 { 193 spec->options[option] = ucol_sit_letterToAttributeValue(*string, status); 194 if((*(++string) != '_' && *string) || U_FAILURE(*status)) { 195 *status = U_ILLEGAL_ARGUMENT_ERROR; 196 } 197 return string; 198 } 199 U_CDECL_END 200 201 202 static UChar 203 readHexCodeUnit(const char **string, UErrorCode *status) 204 { 205 UChar result = 0; 206 int32_t value = 0; 207 char c; 208 int32_t noDigits = 0; 209 while((c = **string) != 0 && noDigits < 4) { 210 if( c >= '0' && c <= '9') { 211 value = c - '0'; 212 } else if ( c >= 'a' && c <= 'f') { 213 value = c - 'a' + 10; 214 } else if ( c >= 'A' && c <= 'F') { 215 value = c - 'A' + 10; 216 } else { 217 *status = U_ILLEGAL_ARGUMENT_ERROR; 218 return 0; 219 } 220 result = (result << 4) | (UChar)value; 221 noDigits++; 222 (*string)++; 223 } 224 // if the string was terminated before we read 4 digits, set an error 225 if(noDigits < 4) { 226 *status = U_ILLEGAL_ARGUMENT_ERROR; 227 } 228 return result; 229 } 230 231 U_CDECL_BEGIN 232 static const char* U_CALLCONV 233 _processVariableTop(CollatorSpec *spec, uint32_t value1, const char* string, UErrorCode *status) 234 { 235 // get four digits 236 int32_t i = 0; 237 if(!value1) { 238 while(U_SUCCESS(*status) && i < locElementCapacity && *string != 0 && *string != '_') { 239 spec->variableTopString[i++] = readHexCodeUnit(&string, status); 240 } 241 spec->variableTopStringLen = i; 242 if(i == locElementCapacity && (*string != 0 || *string != '_')) { 243 *status = U_BUFFER_OVERFLOW_ERROR; 244 } 245 } else { 246 spec->variableTopValue = readHexCodeUnit(&string, status); 247 } 248 if(U_SUCCESS(*status)) { 249 spec->variableTopSet = TRUE; 250 } 251 return string; 252 } 253 U_CDECL_END 254 255 256 /* Table for parsing short strings */ 257 struct ShortStringOptions { 258 char optionStart; 259 ActionFunction *action; 260 uint32_t attr; 261 }; 262 263 static const ShortStringOptions options[UCOL_SIT_ITEMS_COUNT] = 264 { 265 /* 10 ALTERNATE_HANDLING */ {alternateHArg, _processCollatorOption, UCOL_ALTERNATE_HANDLING }, // alternate N, S, D 266 /* 15 VARIABLE_TOP_VALUE */ {variableTopValArg, _processVariableTop, 1 }, 267 /* 08 CASE_FIRST */ {caseFirstArg, _processCollatorOption, UCOL_CASE_FIRST }, // case first L, U, X, D 268 /* 09 NUMERIC_COLLATION */ {numericCollArg, _processCollatorOption, UCOL_NUMERIC_COLLATION }, // codan O, X, D 269 /* 07 CASE_LEVEL */ {caseLevelArg, _processCollatorOption, UCOL_CASE_LEVEL }, // case level O, X, D 270 /* 12 FRENCH_COLLATION */ {frenchCollArg, _processCollatorOption, UCOL_FRENCH_COLLATION }, // french O, X, D 271 /* 13 HIRAGANA_QUATERNARY] */ {hiraganaQArg, _processCollatorOption, UCOL_HIRAGANA_QUATERNARY_MODE }, // hiragana O, X, D 272 /* 04 KEYWORD */ {keywordArg, _processLocaleElement, 4 }, // keyword 273 /* 00 LANGUAGE */ {languageArg, _processLocaleElement, 0 }, // language 274 /* 11 NORMALIZATION_MODE */ {normArg, _processCollatorOption, UCOL_NORMALIZATION_MODE }, // norm O, X, D 275 /* 02 REGION */ {regionArg, _processLocaleElement, 2 }, // region 276 /* 06 STRENGTH */ {strengthArg, _processCollatorOption, UCOL_STRENGTH }, // strength 1, 2, 3, 4, I, D 277 /* 14 VARIABLE_TOP */ {variableTopArg, _processVariableTop, 0 }, 278 /* 03 VARIANT */ {variantArg, _processLocaleElement, 3 }, // variant 279 /* 05 RFC3066BIS */ {RFC3066Arg, _processRFC3066Locale, 0 }, // rfc3066bis locale name 280 /* 01 SCRIPT */ {scriptArg, _processLocaleElement, 1 } // script 281 }; 282 283 284 static 285 const char* ucol_sit_readOption(const char *start, CollatorSpec *spec, 286 UErrorCode *status) 287 { 288 int32_t i = 0; 289 290 for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) { 291 if(*start == options[i].optionStart) { 292 spec->entries[i].start = start; 293 const char* end = options[i].action(spec, options[i].attr, start+1, status); 294 spec->entries[i].len = (int32_t)(end - start); 295 return end; 296 } 297 } 298 *status = U_ILLEGAL_ARGUMENT_ERROR; 299 return start; 300 } 301 302 static 303 void ucol_sit_initCollatorSpecs(CollatorSpec *spec) 304 { 305 // reset everything 306 uprv_memset(spec, 0, sizeof(CollatorSpec)); 307 // set collation options to default 308 int32_t i = 0; 309 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { 310 spec->options[i] = UCOL_DEFAULT; 311 } 312 } 313 314 static const char* 315 ucol_sit_readSpecs(CollatorSpec *s, const char *string, 316 UParseError *parseError, UErrorCode *status) 317 { 318 const char *definition = string; 319 while(U_SUCCESS(*status) && *string) { 320 string = ucol_sit_readOption(string, s, status); 321 // advance over '_' 322 while(*string && *string == '_') { 323 string++; 324 } 325 } 326 if(U_FAILURE(*status)) { 327 parseError->offset = (int32_t)(string - definition); 328 } 329 return string; 330 } 331 332 static 333 int32_t ucol_sit_dumpSpecs(CollatorSpec *s, char *destination, int32_t capacity, UErrorCode *status) 334 { 335 int32_t i = 0, j = 0; 336 int32_t len = 0; 337 char optName; 338 if(U_SUCCESS(*status)) { 339 for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) { 340 if(s->entries[i].start) { 341 if(len) { 342 if(len < capacity) { 343 uprv_strcat(destination, "_"); 344 } 345 len++; 346 } 347 optName = *(s->entries[i].start); 348 if(optName == languageArg || optName == regionArg || optName == variantArg || optName == keywordArg) { 349 for(j = 0; j < s->entries[i].len; j++) { 350 if(len + j < capacity) { 351 destination[len+j] = uprv_toupper(*(s->entries[i].start+j)); 352 } 353 } 354 len += s->entries[i].len; 355 } else { 356 len += s->entries[i].len; 357 if(len < capacity) { 358 uprv_strncat(destination,s->entries[i].start, s->entries[i].len); 359 } 360 } 361 } 362 } 363 return len; 364 } else { 365 return 0; 366 } 367 } 368 369 static void 370 ucol_sit_calculateWholeLocale(CollatorSpec *s) { 371 // put the locale together, unless we have a done 372 // locale 373 if(s->locale[0] == 0) { 374 // first the language 375 uprv_strcat(s->locale, s->locElements[0]); 376 // then the script, if present 377 if(*(s->locElements[1])) { 378 uprv_strcat(s->locale, "_"); 379 uprv_strcat(s->locale, s->locElements[1]); 380 } 381 // then the region, if present 382 if(*(s->locElements[2])) { 383 uprv_strcat(s->locale, "_"); 384 uprv_strcat(s->locale, s->locElements[2]); 385 } else if(*(s->locElements[3])) { // if there is a variant, we need an underscore 386 uprv_strcat(s->locale, "_"); 387 } 388 // add variant, if there 389 if(*(s->locElements[3])) { 390 uprv_strcat(s->locale, "_"); 391 uprv_strcat(s->locale, s->locElements[3]); 392 } 393 394 // if there is a collation keyword, add that too 395 if(*(s->locElements[4])) { 396 uprv_strcat(s->locale, collationKeyword); 397 uprv_strcat(s->locale, s->locElements[4]); 398 } 399 } 400 } 401 402 403 U_CAPI void U_EXPORT2 404 ucol_prepareShortStringOpen( const char *definition, 405 UBool, 406 UParseError *parseError, 407 UErrorCode *status) 408 { 409 if(U_FAILURE(*status)) return; 410 411 UParseError internalParseError; 412 413 if(!parseError) { 414 parseError = &internalParseError; 415 } 416 parseError->line = 0; 417 parseError->offset = 0; 418 parseError->preContext[0] = 0; 419 parseError->postContext[0] = 0; 420 421 422 // first we want to pick stuff out of short string. 423 // we'll end up with an UCA version, locale and a bunch of 424 // settings 425 426 // analyse the string in order to get everything we need. 427 CollatorSpec s; 428 ucol_sit_initCollatorSpecs(&s); 429 ucol_sit_readSpecs(&s, definition, parseError, status); 430 ucol_sit_calculateWholeLocale(&s); 431 432 char buffer[internalBufferSize]; 433 uprv_memset(buffer, 0, internalBufferSize); 434 uloc_canonicalize(s.locale, buffer, internalBufferSize, status); 435 436 UResourceBundle *b = ures_open(U_ICUDATA_COLL, buffer, status); 437 /* we try to find stuff from keyword */ 438 UResourceBundle *collations = ures_getByKey(b, "collations", NULL, status); 439 UResourceBundle *collElem = NULL; 440 char keyBuffer[256]; 441 // if there is a keyword, we pick it up and try to get elements 442 if(!uloc_getKeywordValue(buffer, "collation", keyBuffer, 256, status)) { 443 // no keyword. we try to find the default setting, which will give us the keyword value 444 UResourceBundle *defaultColl = ures_getByKeyWithFallback(collations, "default", NULL, status); 445 if(U_SUCCESS(*status)) { 446 int32_t defaultKeyLen = 0; 447 const UChar *defaultKey = ures_getString(defaultColl, &defaultKeyLen, status); 448 u_UCharsToChars(defaultKey, keyBuffer, defaultKeyLen); 449 keyBuffer[defaultKeyLen] = 0; 450 } else { 451 *status = U_INTERNAL_PROGRAM_ERROR; 452 return; 453 } 454 ures_close(defaultColl); 455 } 456 collElem = ures_getByKeyWithFallback(collations, keyBuffer, collElem, status); 457 ures_close(collElem); 458 ures_close(collations); 459 ures_close(b); 460 } 461 462 463 U_CAPI UCollator* U_EXPORT2 464 ucol_openFromShortString( const char *definition, 465 UBool forceDefaults, 466 UParseError *parseError, 467 UErrorCode *status) 468 { 469 UTRACE_ENTRY_OC(UTRACE_UCOL_OPEN_FROM_SHORT_STRING); 470 UTRACE_DATA1(UTRACE_INFO, "short string = \"%s\"", definition); 471 472 if(U_FAILURE(*status)) return 0; 473 474 UParseError internalParseError; 475 476 if(!parseError) { 477 parseError = &internalParseError; 478 } 479 parseError->line = 0; 480 parseError->offset = 0; 481 parseError->preContext[0] = 0; 482 parseError->postContext[0] = 0; 483 484 485 // first we want to pick stuff out of short string. 486 // we'll end up with an UCA version, locale and a bunch of 487 // settings 488 489 // analyse the string in order to get everything we need. 490 const char *string = definition; 491 CollatorSpec s; 492 ucol_sit_initCollatorSpecs(&s); 493 string = ucol_sit_readSpecs(&s, definition, parseError, status); 494 ucol_sit_calculateWholeLocale(&s); 495 496 char buffer[internalBufferSize]; 497 uprv_memset(buffer, 0, internalBufferSize); 498 uloc_canonicalize(s.locale, buffer, internalBufferSize, status); 499 500 UCollator *result = ucol_open(buffer, status); 501 int32_t i = 0; 502 503 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { 504 if(s.options[i] != UCOL_DEFAULT) { 505 if(forceDefaults || ucol_getAttribute(result, (UColAttribute)i, status) != s.options[i]) { 506 ucol_setAttribute(result, (UColAttribute)i, s.options[i], status); 507 } 508 509 if(U_FAILURE(*status)) { 510 parseError->offset = (int32_t)(string - definition); 511 ucol_close(result); 512 return NULL; 513 } 514 515 } 516 } 517 if(s.variableTopSet) { 518 if(s.variableTopString[0]) { 519 ucol_setVariableTop(result, s.variableTopString, s.variableTopStringLen, status); 520 } else { // we set by value, using 'B' 521 ucol_restoreVariableTop(result, s.variableTopValue, status); 522 } 523 } 524 525 526 if(U_FAILURE(*status)) { // here it can only be a bogus value 527 ucol_close(result); 528 result = NULL; 529 } 530 531 UTRACE_EXIT_PTR_STATUS(result, *status); 532 return result; 533 } 534 535 536 static void appendShortStringElement(const char *src, int32_t len, char *result, int32_t *resultSize, int32_t capacity, char arg) 537 { 538 if(len) { 539 if(*resultSize) { 540 if(*resultSize < capacity) { 541 uprv_strcat(result, "_"); 542 } 543 (*resultSize)++; 544 } 545 *resultSize += len + 1; 546 if(*resultSize < capacity) { 547 uprv_strncat(result, &arg, 1); 548 uprv_strncat(result, src, len); 549 } 550 } 551 } 552 553 U_CAPI int32_t U_EXPORT2 554 ucol_getShortDefinitionString(const UCollator *coll, 555 const char *locale, 556 char *dst, 557 int32_t capacity, 558 UErrorCode *status) 559 { 560 if(U_FAILURE(*status)) return 0; 561 char buffer[internalBufferSize]; 562 uprv_memset(buffer, 0, internalBufferSize*sizeof(char)); 563 int32_t resultSize = 0; 564 char tempbuff[internalBufferSize]; 565 char locBuff[internalBufferSize]; 566 uprv_memset(buffer, 0, internalBufferSize*sizeof(char)); 567 int32_t elementSize = 0; 568 UBool isAvailable = 0; 569 CollatorSpec s; 570 ucol_sit_initCollatorSpecs(&s); 571 572 if(!locale) { 573 locale = ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, status); 574 } 575 elementSize = ucol_getFunctionalEquivalent(locBuff, internalBufferSize, "collation", locale, &isAvailable, status); 576 577 if(elementSize) { 578 // we should probably canonicalize here... 579 elementSize = uloc_getLanguage(locBuff, tempbuff, internalBufferSize, status); 580 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, languageArg); 581 elementSize = uloc_getCountry(locBuff, tempbuff, internalBufferSize, status); 582 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, regionArg); 583 elementSize = uloc_getScript(locBuff, tempbuff, internalBufferSize, status); 584 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, scriptArg); 585 elementSize = uloc_getVariant(locBuff, tempbuff, internalBufferSize, status); 586 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, variantArg); 587 elementSize = uloc_getKeywordValue(locBuff, "collation", tempbuff, internalBufferSize, status); 588 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, keywordArg); 589 } 590 591 int32_t i = 0; 592 UColAttributeValue attribute = UCOL_DEFAULT; 593 for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) { 594 if(options[i].action == _processCollatorOption) { 595 attribute = ucol_getAttributeOrDefault(coll, (UColAttribute)options[i].attr, status); 596 if(attribute != UCOL_DEFAULT) { 597 char letter = ucol_sit_attributeValueToLetter(attribute, status); 598 appendShortStringElement(&letter, 1, 599 buffer, &resultSize, /*capacity*/internalBufferSize, options[i].optionStart); 600 } 601 } 602 } 603 if(coll->variableTopValueisDefault == FALSE) { 604 //s.variableTopValue = ucol_getVariableTop(coll, status); 605 elementSize = T_CString_integerToString(tempbuff, coll->variableTopValue, 16); 606 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, variableTopValArg); 607 } 608 609 UParseError parseError; 610 return ucol_normalizeShortDefinitionString(buffer, dst, capacity, &parseError, status); 611 } 612 613 U_CAPI int32_t U_EXPORT2 614 ucol_normalizeShortDefinitionString(const char *definition, 615 char *destination, 616 int32_t capacity, 617 UParseError *parseError, 618 UErrorCode *status) 619 { 620 621 if(U_FAILURE(*status)) { 622 return 0; 623 } 624 625 if(destination) { 626 uprv_memset(destination, 0, capacity*sizeof(char)); 627 } 628 629 UParseError pe; 630 if(!parseError) { 631 parseError = &pe; 632 } 633 634 // validate 635 CollatorSpec s; 636 ucol_sit_initCollatorSpecs(&s); 637 ucol_sit_readSpecs(&s, definition, parseError, status); 638 return ucol_sit_dumpSpecs(&s, destination, capacity, status); 639 } 640 641 U_CAPI UColAttributeValue U_EXPORT2 642 ucol_getAttributeOrDefault(const UCollator *coll, UColAttribute attr, UErrorCode *status) 643 { 644 if(U_FAILURE(*status) || coll == NULL) { 645 return UCOL_DEFAULT; 646 } 647 switch(attr) { 648 case UCOL_NUMERIC_COLLATION: 649 return coll->numericCollationisDefault?UCOL_DEFAULT:coll->numericCollation; 650 case UCOL_HIRAGANA_QUATERNARY_MODE: 651 return coll->hiraganaQisDefault?UCOL_DEFAULT:coll->hiraganaQ; 652 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ 653 return coll->frenchCollationisDefault?UCOL_DEFAULT:coll->frenchCollation; 654 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ 655 return coll->alternateHandlingisDefault?UCOL_DEFAULT:coll->alternateHandling; 656 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ 657 return coll->caseFirstisDefault?UCOL_DEFAULT:coll->caseFirst; 658 case UCOL_CASE_LEVEL: /* do we have an extra case level */ 659 return coll->caseLevelisDefault?UCOL_DEFAULT:coll->caseLevel; 660 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ 661 return coll->normalizationModeisDefault?UCOL_DEFAULT:coll->normalizationMode; 662 case UCOL_STRENGTH: /* attribute for strength */ 663 return coll->strengthisDefault?UCOL_DEFAULT:coll->strength; 664 case UCOL_ATTRIBUTE_COUNT: 665 default: 666 *status = U_ILLEGAL_ARGUMENT_ERROR; 667 break; 668 } 669 return UCOL_DEFAULT; 670 } 671 672 673 struct contContext { 674 const UCollator *coll; 675 USet *conts; 676 USet *expansions; 677 USet *removedContractions; 678 UBool addPrefixes; 679 UErrorCode *status; 680 }; 681 682 683 684 static void 685 addSpecial(contContext *context, UChar *buffer, int32_t bufLen, 686 uint32_t CE, int32_t leftIndex, int32_t rightIndex, UErrorCode *status) 687 { 688 const UCollator *coll = context->coll; 689 USet *contractions = context->conts; 690 USet *expansions = context->expansions; 691 UBool addPrefixes = context->addPrefixes; 692 693 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE); 694 uint32_t newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 695 // we might have a contraction that ends from previous level 696 if(newCE != UCOL_NOT_FOUND) { 697 if(isSpecial(CE) && getCETag(CE) == CONTRACTION_TAG && isSpecial(newCE) && getCETag(newCE) == SPEC_PROC_TAG && addPrefixes) { 698 addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status); 699 } 700 if(contractions && rightIndex-leftIndex > 1) { 701 uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex); 702 if(expansions && isSpecial(CE) && getCETag(CE) == EXPANSION_TAG) { 703 uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex); 704 } 705 } 706 } 707 708 UCharOffset++; 709 // check whether we're doing contraction or prefix 710 if(getCETag(CE) == SPEC_PROC_TAG && addPrefixes) { 711 if(leftIndex == 0) { 712 *status = U_INTERNAL_PROGRAM_ERROR; 713 return; 714 } 715 --leftIndex; 716 while(*UCharOffset != 0xFFFF) { 717 newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 718 buffer[leftIndex] = *UCharOffset; 719 if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) { 720 addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status); 721 } else { 722 if(contractions) { 723 uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex); 724 } 725 if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) { 726 uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex); 727 } 728 } 729 UCharOffset++; 730 } 731 } else if(getCETag(CE) == CONTRACTION_TAG) { 732 if(rightIndex == bufLen-1) { 733 *status = U_INTERNAL_PROGRAM_ERROR; 734 return; 735 } 736 while(*UCharOffset != 0xFFFF) { 737 newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 738 buffer[rightIndex] = *UCharOffset; 739 if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) { 740 addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex+1, status); 741 } else { 742 if(contractions) { 743 uset_addString(contractions, buffer+leftIndex, rightIndex+1-leftIndex); 744 } 745 if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) { 746 uset_addString(expansions, buffer+leftIndex, rightIndex+1-leftIndex); 747 } 748 } 749 UCharOffset++; 750 } 751 } 752 753 } 754 755 U_CDECL_BEGIN 756 static UBool U_CALLCONV 757 _processSpecials(const void *context, UChar32 start, UChar32 limit, uint32_t CE) 758 { 759 UErrorCode *status = ((contContext *)context)->status; 760 USet *expansions = ((contContext *)context)->expansions; 761 USet *removed = ((contContext *)context)->removedContractions; 762 UBool addPrefixes = ((contContext *)context)->addPrefixes; 763 UChar contraction[internalBufferSize]; 764 if(isSpecial(CE)) { 765 if(((getCETag(CE) == SPEC_PROC_TAG && addPrefixes) || getCETag(CE) == CONTRACTION_TAG)) { 766 while(start < limit && U_SUCCESS(*status)) { 767 // if there are suppressed contractions, we don't 768 // want to add them. 769 if(removed && uset_contains(removed, start)) { 770 start++; 771 continue; 772 } 773 // we start our contraction from middle, since we don't know if it 774 // will grow toward right or left 775 contraction[internalBufferSize/2] = (UChar)start; 776 addSpecial(((contContext *)context), contraction, internalBufferSize, CE, internalBufferSize/2, internalBufferSize/2+1, status); 777 start++; 778 } 779 } else if(expansions && getCETag(CE) == EXPANSION_TAG) { 780 while(start < limit && U_SUCCESS(*status)) { 781 uset_add(expansions, start++); 782 } 783 } 784 } 785 if(U_FAILURE(*status)) { 786 return FALSE; 787 } else { 788 return TRUE; 789 } 790 } 791 792 U_CDECL_END 793 794 795 796 /** 797 * Get a set containing the contractions defined by the collator. The set includes 798 * both the UCA contractions and the contractions defined by the collator 799 * @param coll collator 800 * @param conts the set to hold the result 801 * @param status to hold the error code 802 * @return the size of the contraction set 803 */ 804 U_CAPI int32_t U_EXPORT2 805 ucol_getContractions( const UCollator *coll, 806 USet *contractions, 807 UErrorCode *status) 808 { 809 ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status); 810 return uset_getItemCount(contractions); 811 } 812 813 /** 814 * Get a set containing the expansions defined by the collator. The set includes 815 * both the UCA expansions and the expansions defined by the tailoring 816 * @param coll collator 817 * @param conts the set to hold the result 818 * @param addPrefixes add the prefix contextual elements to contractions 819 * @param status to hold the error code 820 * 821 * @draft ICU 3.4 822 */ 823 U_CAPI void U_EXPORT2 824 ucol_getContractionsAndExpansions( const UCollator *coll, 825 USet *contractions, 826 USet *expansions, 827 UBool addPrefixes, 828 UErrorCode *status) 829 { 830 if(U_FAILURE(*status)) { 831 return; 832 } 833 if(coll == NULL) { 834 *status = U_ILLEGAL_ARGUMENT_ERROR; 835 return; 836 } 837 838 if(contractions) { 839 uset_clear(contractions); 840 } 841 if(expansions) { 842 uset_clear(expansions); 843 } 844 int32_t rulesLen = 0; 845 const UChar* rules = ucol_getRules(coll, &rulesLen); 846 UColTokenParser src; 847 ucol_tok_initTokenList(&src, rules, rulesLen, coll->UCA, 848 ucol_tok_getRulesFromBundle, NULL, status); 849 850 contContext c = { NULL, contractions, expansions, src.removeSet, addPrefixes, status }; 851 852 // Add the UCA contractions 853 c.coll = coll->UCA; 854 utrie_enum(&coll->UCA->mapping, NULL, _processSpecials, &c); 855 856 // This is collator specific. Add contractions from a collator 857 c.coll = coll; 858 c.removedContractions = NULL; 859 utrie_enum(&coll->mapping, NULL, _processSpecials, &c); 860 ucol_tok_closeTokenList(&src); 861 } 862 863 U_CAPI int32_t U_EXPORT2 864 ucol_getUnsafeSet( const UCollator *coll, 865 USet *unsafe, 866 UErrorCode *status) 867 { 868 UChar buffer[internalBufferSize]; 869 int32_t len = 0; 870 871 uset_clear(unsafe); 872 873 // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant 874 static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 875 0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 }; 876 877 // add chars that fail the fcd check 878 uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status); 879 880 // add Thai/Lao prevowels 881 uset_addRange(unsafe, 0xe40, 0xe44); 882 uset_addRange(unsafe, 0xec0, 0xec4); 883 // add lead/trail surrogates 884 uset_addRange(unsafe, 0xd800, 0xdfff); 885 886 USet *contractions = uset_open(0,0); 887 888 int32_t i = 0, j = 0; 889 int32_t contsSize = ucol_getContractions(coll, contractions, status); 890 UChar32 c = 0; 891 // Contraction set consists only of strings 892 // to get unsafe code points, we need to 893 // break the strings apart and add them to the unsafe set 894 for(i = 0; i < contsSize; i++) { 895 len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status); 896 if(len > 0) { 897 j = 0; 898 while(j < len) { 899 U16_NEXT(buffer, j, len, c); 900 if(j < len) { 901 uset_add(unsafe, c); 902 } 903 } 904 } 905 } 906 907 uset_close(contractions); 908 909 return uset_size(unsafe); 910 } 911 #endif 912