/*
*******************************************************************************
*   Copyright (C) 2004-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  ucol_sit.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
* Modification history
* Date        Name      Comments
* 03/12/2004  weiv      Creation
*/

#include "unicode/ustring.h"
#include "unicode/udata.h"

#include "utracimp.h"
#include "ucol_imp.h"
#include "ucol_tok.h"
#include "cmemory.h"
#include "cstring.h"
#include "uresimp.h"
#include "unicode/coll.h"

/* stdio is only needed for the optional fprintf tracing of parse errors */
#ifdef UCOL_TRACE_SIT
# include <stdio.h>
#endif

#if !UCONFIG_NO_COLLATION

/*
 * Identifiers for the items that a collator short definition string can
 * carry.  The values up to and including UCOL_SIT_LOCELEMENT_MAX are used as
 * row indices into CollatorSpec::locElements (see _processLocaleElement);
 * UCOL_SIT_ITEMS_COUNT sizes the options[] table and CollatorSpec::entries.
 */
enum OptionsList {
    /* locale elements; explicit values because they are used as array indices */
    UCOL_SIT_LANGUAGE = 0,
    UCOL_SIT_SCRIPT   = 1,
    UCOL_SIT_REGION   = 2,
    UCOL_SIT_VARIANT  = 3,
    UCOL_SIT_KEYWORD  = 4,
    UCOL_SIT_PROVIDER = 5,
    UCOL_SIT_LOCELEMENT_MAX = UCOL_SIT_PROVIDER, /* the last element that's part of LocElements */

    /* remaining items; values continue from UCOL_SIT_LOCELEMENT_MAX + 1 */
    UCOL_SIT_BCP47,
    UCOL_SIT_STRENGTH,
    UCOL_SIT_CASE_LEVEL,
    UCOL_SIT_CASE_FIRST,
    UCOL_SIT_NUMERIC_COLLATION,
    UCOL_SIT_ALTERNATE_HANDLING,
    UCOL_SIT_NORMALIZATION_MODE,
    UCOL_SIT_FRENCH_COLLATION,
    UCOL_SIT_HIRAGANA_QUATERNARY,
    UCOL_SIT_VARIABLE_TOP,
    UCOL_SIT_VARIABLE_TOP_VALUE,
    UCOL_SIT_ITEMS_COUNT /* number of items above; not an item itself */
};

/* option starters chars.
*/
/* Each letter introduces one option in a short definition string; the
 * options[] table further down maps every letter to the function that
 * parses the text following it. */
static const char alternateHArg = 'A';
static const char variableTopValArg = 'B';
static const char caseFirstArg = 'C';
static const char numericCollArg = 'D';
static const char caseLevelArg = 'E';
static const char frenchCollArg = 'F';
static const char hiraganaQArg = 'H';
static const char keywordArg = 'K';
static const char languageArg = 'L';
static const char normArg = 'N';
static const char providerArg = 'P';
static const char regionArg = 'R';
static const char strengthArg = 'S';
static const char variableTopArg = 'T';
static const char variantArg = 'V';
static const char RFC3066Arg = 'X';
static const char scriptArg = 'Z';

/* locale keyword prefixes appended by ucol_sit_calculateWholeLocale */
static const char collationKeyword[]  = "@collation=";
static const char providerKeyword[]  = "@sp=";


/* number of rows in CollatorSpec::locElements */
static const int32_t locElementCount = UCOL_SIT_LOCELEMENT_MAX+1;
/* size of one locElements row and of CollatorSpec::variableTopString */
static const int32_t locElementCapacity = 32;
/* size of CollatorSpec::locale */
static const int32_t loc3066Capacity = 256;
/* size of CollatorSpec::provider */
static const int32_t locProviderCapacity = 10;
/* size of the stack scratch buffers used by the functions in this file */
static const int32_t internalBufferSize = 512;

/* structure containing specification of a collator. Initialized
 * from a short string. Also used to construct a short string from a
 * collator instance
 */
struct CollatorSpec {
    /* one NUL-terminated string per locale element, indexed by the
     * UCOL_SIT_LANGUAGE .. UCOL_SIT_PROVIDER values */
    char locElements[locElementCount][locElementCapacity];
    /* complete locale ID: either copied from an 'X' option or assembled
     * from locElements by ucol_sit_calculateWholeLocale */
    char locale[loc3066Capacity];
    /* NOTE(review): not referenced by any code visible in this file */
    char provider[locProviderCapacity];
    /* attribute values indexed by UColAttribute; UCOL_DEFAULT means "not specified" */
    UColAttributeValue options[UCOL_ATTRIBUTE_COUNT];
    /* variable top given as a single 4-hex-digit value ('B' option) */
    uint32_t variableTopValue;
    /* variable top given as a string of 4-hex-digit code units ('T' option) */
    UChar variableTopString[locElementCapacity];
    int32_t variableTopStringLen;
    /* TRUE once a 'B' or 'T' option has been parsed successfully */
    UBool variableTopSet;
    /* for each slot of the options[] table: where the option starts in the
     * parsed short string and how many chars it spans (including the option
     * letter).  start is NULL when the option was not present. */
    struct {
        const char *start;
        int32_t len;
    } entries[UCOL_SIT_ITEMS_COUNT];
};


/* structure for converting between character attribute
 * representation and real collation attribute value.
107 */ 108 struct AttributeConversion { 109 char letter; 110 UColAttributeValue value; 111 }; 112 113 static const AttributeConversion conversions[12] = { 114 { '1', UCOL_PRIMARY }, 115 { '2', UCOL_SECONDARY }, 116 { '3', UCOL_TERTIARY }, 117 { '4', UCOL_QUATERNARY }, 118 { 'D', UCOL_DEFAULT }, 119 { 'I', UCOL_IDENTICAL }, 120 { 'L', UCOL_LOWER_FIRST }, 121 { 'N', UCOL_NON_IGNORABLE }, 122 { 'O', UCOL_ON }, 123 { 'S', UCOL_SHIFTED }, 124 { 'U', UCOL_UPPER_FIRST }, 125 { 'X', UCOL_OFF } 126 }; 127 128 129 static char 130 ucol_sit_attributeValueToLetter(UColAttributeValue value, UErrorCode *status) { 131 uint32_t i = 0; 132 for(i = 0; i < sizeof(conversions)/sizeof(conversions[0]); i++) { 133 if(conversions[i].value == value) { 134 return conversions[i].letter; 135 } 136 } 137 *status = U_ILLEGAL_ARGUMENT_ERROR; 138 #ifdef UCOL_TRACE_SIT 139 fprintf(stderr, "%s:%d: unknown UColAttributeValue %d: %s\n", __FILE__, __LINE__, value, u_errorName(*status)); 140 #endif 141 return 0; 142 } 143 144 static UColAttributeValue 145 ucol_sit_letterToAttributeValue(char letter, UErrorCode *status) { 146 uint32_t i = 0; 147 for(i = 0; i < sizeof(conversions)/sizeof(conversions[0]); i++) { 148 if(conversions[i].letter == letter) { 149 return conversions[i].value; 150 } 151 } 152 *status = U_ILLEGAL_ARGUMENT_ERROR; 153 #ifdef UCOL_TRACE_SIT 154 fprintf(stderr, "%s:%d: unknown letter %c: %s\n", __FILE__, __LINE__, letter, u_errorName(*status)); 155 #endif 156 return UCOL_DEFAULT; 157 } 158 159 /* function prototype for functions used to parse a short string */ 160 U_CDECL_BEGIN 161 typedef const char* U_CALLCONV 162 ActionFunction(CollatorSpec *spec, uint32_t value1, const char* string, 163 UErrorCode *status); 164 U_CDECL_END 165 166 U_CDECL_BEGIN 167 static const char* U_CALLCONV 168 _processLocaleElement(CollatorSpec *spec, uint32_t value, const char* string, 169 UErrorCode *status) 170 { 171 int32_t len = 0; 172 do { 173 if(value == UCOL_SIT_LANGUAGE || value == UCOL_SIT_KEYWORD || 
value == UCOL_SIT_PROVIDER) { 174 spec->locElements[value][len++] = uprv_tolower(*string); 175 } else { 176 spec->locElements[value][len++] = *string; 177 } 178 } while(*(++string) != '_' && *string && len < locElementCapacity); 179 if(len >= locElementCapacity) { 180 *status = U_BUFFER_OVERFLOW_ERROR; 181 return string; 182 } 183 // don't skip the underscore at the end 184 return string; 185 } 186 U_CDECL_END 187 188 U_CDECL_BEGIN 189 static const char* U_CALLCONV 190 _processRFC3066Locale(CollatorSpec *spec, uint32_t, const char* string, 191 UErrorCode *status) 192 { 193 char terminator = *string; 194 string++; 195 const char *end = uprv_strchr(string+1, terminator); 196 if(end == NULL || end - string >= loc3066Capacity) { 197 *status = U_BUFFER_OVERFLOW_ERROR; 198 return string; 199 } else { 200 uprv_strncpy(spec->locale, string, end-string); 201 return end+1; 202 } 203 } 204 205 U_CDECL_END 206 207 U_CDECL_BEGIN 208 static const char* U_CALLCONV 209 _processCollatorOption(CollatorSpec *spec, uint32_t option, const char* string, 210 UErrorCode *status) 211 { 212 spec->options[option] = ucol_sit_letterToAttributeValue(*string, status); 213 if((*(++string) != '_' && *string) || U_FAILURE(*status)) { 214 #ifdef UCOL_TRACE_SIT 215 fprintf(stderr, "%s:%d: unknown collator option at '%s': %s\n", __FILE__, __LINE__, string, u_errorName(*status)); 216 #endif 217 *status = U_ILLEGAL_ARGUMENT_ERROR; 218 } 219 return string; 220 } 221 U_CDECL_END 222 223 224 static UChar 225 readHexCodeUnit(const char **string, UErrorCode *status) 226 { 227 UChar result = 0; 228 int32_t value = 0; 229 char c; 230 int32_t noDigits = 0; 231 while((c = **string) != 0 && noDigits < 4) { 232 if( c >= '0' && c <= '9') { 233 value = c - '0'; 234 } else if ( c >= 'a' && c <= 'f') { 235 value = c - 'a' + 10; 236 } else if ( c >= 'A' && c <= 'F') { 237 value = c - 'A' + 10; 238 } else { 239 *status = U_ILLEGAL_ARGUMENT_ERROR; 240 #ifdef UCOL_TRACE_SIT 241 fprintf(stderr, "%s:%d: Bad hex char at 
'%s': %s\n", __FILE__, __LINE__, *string, u_errorName(*status)); 242 #endif 243 return 0; 244 } 245 result = (result << 4) | (UChar)value; 246 noDigits++; 247 (*string)++; 248 } 249 // if the string was terminated before we read 4 digits, set an error 250 if(noDigits < 4) { 251 *status = U_ILLEGAL_ARGUMENT_ERROR; 252 #ifdef UCOL_TRACE_SIT 253 fprintf(stderr, "%s:%d: Short (only %d digits, wanted 4) at '%s': %s\n", __FILE__, __LINE__, noDigits,*string, u_errorName(*status)); 254 #endif 255 } 256 return result; 257 } 258 259 U_CDECL_BEGIN 260 static const char* U_CALLCONV 261 _processVariableTop(CollatorSpec *spec, uint32_t value1, const char* string, UErrorCode *status) 262 { 263 // get four digits 264 int32_t i = 0; 265 if(!value1) { 266 while(U_SUCCESS(*status) && i < locElementCapacity && *string != 0 && *string != '_') { 267 spec->variableTopString[i++] = readHexCodeUnit(&string, status); 268 } 269 spec->variableTopStringLen = i; 270 if(i == locElementCapacity && *string != 0 && *string != '_') { 271 *status = U_BUFFER_OVERFLOW_ERROR; 272 } 273 } else { 274 spec->variableTopValue = readHexCodeUnit(&string, status); 275 } 276 if(U_SUCCESS(*status)) { 277 spec->variableTopSet = TRUE; 278 } 279 return string; 280 } 281 U_CDECL_END 282 283 284 /* Table for parsing short strings */ 285 struct ShortStringOptions { 286 char optionStart; 287 ActionFunction *action; 288 uint32_t attr; 289 }; 290 291 static const ShortStringOptions options[UCOL_SIT_ITEMS_COUNT] = 292 { 293 /* 10 ALTERNATE_HANDLING */ {alternateHArg, _processCollatorOption, UCOL_ALTERNATE_HANDLING }, // alternate N, S, D 294 /* 15 VARIABLE_TOP_VALUE */ {variableTopValArg, _processVariableTop, 1 }, 295 /* 08 CASE_FIRST */ {caseFirstArg, _processCollatorOption, UCOL_CASE_FIRST }, // case first L, U, X, D 296 /* 09 NUMERIC_COLLATION */ {numericCollArg, _processCollatorOption, UCOL_NUMERIC_COLLATION }, // codan O, X, D 297 /* 07 CASE_LEVEL */ {caseLevelArg, _processCollatorOption, UCOL_CASE_LEVEL }, // case 
level O, X, D 298 /* 12 FRENCH_COLLATION */ {frenchCollArg, _processCollatorOption, UCOL_FRENCH_COLLATION }, // french O, X, D 299 /* 13 HIRAGANA_QUATERNARY] */ {hiraganaQArg, _processCollatorOption, UCOL_HIRAGANA_QUATERNARY_MODE }, // hiragana O, X, D 300 /* 04 KEYWORD */ {keywordArg, _processLocaleElement, UCOL_SIT_KEYWORD }, // keyword 301 /* 00 LANGUAGE */ {languageArg, _processLocaleElement, UCOL_SIT_LANGUAGE }, // language 302 /* 11 NORMALIZATION_MODE */ {normArg, _processCollatorOption, UCOL_NORMALIZATION_MODE }, // norm O, X, D 303 /* 02 REGION */ {regionArg, _processLocaleElement, UCOL_SIT_REGION }, // region 304 /* 06 STRENGTH */ {strengthArg, _processCollatorOption, UCOL_STRENGTH }, // strength 1, 2, 3, 4, I, D 305 /* 14 VARIABLE_TOP */ {variableTopArg, _processVariableTop, 0 }, 306 /* 03 VARIANT */ {variantArg, _processLocaleElement, UCOL_SIT_VARIANT }, // variant 307 /* 05 RFC3066BIS */ {RFC3066Arg, _processRFC3066Locale, 0 }, // rfc3066bis locale name 308 /* 01 SCRIPT */ {scriptArg, _processLocaleElement, UCOL_SIT_SCRIPT }, // script 309 /* PROVIDER */ {providerArg, _processLocaleElement, UCOL_SIT_PROVIDER } 310 }; 311 312 313 static 314 const char* ucol_sit_readOption(const char *start, CollatorSpec *spec, 315 UErrorCode *status) 316 { 317 int32_t i = 0; 318 319 for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) { 320 if(*start == options[i].optionStart) { 321 spec->entries[i].start = start; 322 const char* end = options[i].action(spec, options[i].attr, start+1, status); 323 spec->entries[i].len = (int32_t)(end - start); 324 return end; 325 } 326 } 327 *status = U_ILLEGAL_ARGUMENT_ERROR; 328 #ifdef UCOL_TRACE_SIT 329 fprintf(stderr, "%s:%d: Unknown option at '%s': %s\n", __FILE__, __LINE__, start, u_errorName(*status)); 330 #endif 331 return start; 332 } 333 334 static 335 void ucol_sit_initCollatorSpecs(CollatorSpec *spec) 336 { 337 // reset everything 338 uprv_memset(spec, 0, sizeof(CollatorSpec)); 339 // set collation options to default 340 int32_t i = 0; 
341 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { 342 spec->options[i] = UCOL_DEFAULT; 343 } 344 } 345 346 static const char* 347 ucol_sit_readSpecs(CollatorSpec *s, const char *string, 348 UParseError *parseError, UErrorCode *status) 349 { 350 const char *definition = string; 351 while(U_SUCCESS(*status) && *string) { 352 string = ucol_sit_readOption(string, s, status); 353 // advance over '_' 354 while(*string && *string == '_') { 355 string++; 356 } 357 } 358 if(U_FAILURE(*status)) { 359 parseError->offset = (int32_t)(string - definition); 360 } 361 return string; 362 } 363 364 static 365 int32_t ucol_sit_dumpSpecs(CollatorSpec *s, char *destination, int32_t capacity, UErrorCode *status) 366 { 367 int32_t i = 0, j = 0; 368 int32_t len = 0; 369 char optName; 370 if(U_SUCCESS(*status)) { 371 for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) { 372 if(s->entries[i].start) { 373 if(len) { 374 if(len < capacity) { 375 uprv_strcat(destination, "_"); 376 } 377 len++; 378 } 379 optName = *(s->entries[i].start); 380 if(optName == languageArg || optName == regionArg || optName == variantArg || optName == keywordArg) { 381 for(j = 0; j < s->entries[i].len; j++) { 382 if(len + j < capacity) { 383 destination[len+j] = uprv_toupper(*(s->entries[i].start+j)); 384 } 385 } 386 len += s->entries[i].len; 387 } else { 388 len += s->entries[i].len; 389 if(len < capacity) { 390 uprv_strncat(destination,s->entries[i].start, s->entries[i].len); 391 } 392 } 393 } 394 } 395 return len; 396 } else { 397 return 0; 398 } 399 } 400 401 static void 402 ucol_sit_calculateWholeLocale(CollatorSpec *s) { 403 // put the locale together, unless we have a done 404 // locale 405 if(s->locale[0] == 0) { 406 // first the language 407 uprv_strcat(s->locale, s->locElements[UCOL_SIT_LANGUAGE]); 408 // then the script, if present 409 if(*(s->locElements[UCOL_SIT_SCRIPT])) { 410 uprv_strcat(s->locale, "_"); 411 uprv_strcat(s->locale, s->locElements[UCOL_SIT_SCRIPT]); 412 } 413 // then the region, if present 414 
if(*(s->locElements[UCOL_SIT_REGION])) { 415 uprv_strcat(s->locale, "_"); 416 uprv_strcat(s->locale, s->locElements[UCOL_SIT_REGION]); 417 } else if(*(s->locElements[UCOL_SIT_VARIANT])) { // if there is a variant, we need an underscore 418 uprv_strcat(s->locale, "_"); 419 } 420 // add variant, if there 421 if(*(s->locElements[UCOL_SIT_VARIANT])) { 422 uprv_strcat(s->locale, "_"); 423 uprv_strcat(s->locale, s->locElements[UCOL_SIT_VARIANT]); 424 } 425 426 // if there is a collation keyword, add that too 427 if(*(s->locElements[UCOL_SIT_KEYWORD])) { 428 uprv_strcat(s->locale, collationKeyword); 429 uprv_strcat(s->locale, s->locElements[UCOL_SIT_KEYWORD]); 430 } 431 432 // if there is a provider keyword, add that too 433 if(*(s->locElements[UCOL_SIT_PROVIDER])) { 434 uprv_strcat(s->locale, providerKeyword); 435 uprv_strcat(s->locale, s->locElements[UCOL_SIT_PROVIDER]); 436 } 437 } 438 } 439 440 441 U_CAPI void U_EXPORT2 442 ucol_prepareShortStringOpen( const char *definition, 443 UBool, 444 UParseError *parseError, 445 UErrorCode *status) 446 { 447 if(U_FAILURE(*status)) return; 448 449 UParseError internalParseError; 450 451 if(!parseError) { 452 parseError = &internalParseError; 453 } 454 parseError->line = 0; 455 parseError->offset = 0; 456 parseError->preContext[0] = 0; 457 parseError->postContext[0] = 0; 458 459 460 // first we want to pick stuff out of short string. 461 // we'll end up with an UCA version, locale and a bunch of 462 // settings 463 464 // analyse the string in order to get everything we need. 
465 CollatorSpec s; 466 ucol_sit_initCollatorSpecs(&s); 467 ucol_sit_readSpecs(&s, definition, parseError, status); 468 ucol_sit_calculateWholeLocale(&s); 469 470 char buffer[internalBufferSize]; 471 uprv_memset(buffer, 0, internalBufferSize); 472 uloc_canonicalize(s.locale, buffer, internalBufferSize, status); 473 474 UResourceBundle *b = ures_open(U_ICUDATA_COLL, buffer, status); 475 /* we try to find stuff from keyword */ 476 UResourceBundle *collations = ures_getByKey(b, "collations", NULL, status); 477 UResourceBundle *collElem = NULL; 478 char keyBuffer[256]; 479 // if there is a keyword, we pick it up and try to get elements 480 if(!uloc_getKeywordValue(buffer, "collation", keyBuffer, 256, status)) { 481 // no keyword. we try to find the default setting, which will give us the keyword value 482 UResourceBundle *defaultColl = ures_getByKeyWithFallback(collations, "default", NULL, status); 483 if(U_SUCCESS(*status)) { 484 int32_t defaultKeyLen = 0; 485 const UChar *defaultKey = ures_getString(defaultColl, &defaultKeyLen, status); 486 u_UCharsToChars(defaultKey, keyBuffer, defaultKeyLen); 487 keyBuffer[defaultKeyLen] = 0; 488 } else { 489 *status = U_INTERNAL_PROGRAM_ERROR; 490 return; 491 } 492 ures_close(defaultColl); 493 } 494 collElem = ures_getByKeyWithFallback(collations, keyBuffer, collElem, status); 495 ures_close(collElem); 496 ures_close(collations); 497 ures_close(b); 498 } 499 500 501 U_CAPI UCollator* U_EXPORT2 502 ucol_openFromShortString( const char *definition, 503 UBool forceDefaults, 504 UParseError *parseError, 505 UErrorCode *status) 506 { 507 UTRACE_ENTRY_OC(UTRACE_UCOL_OPEN_FROM_SHORT_STRING); 508 UTRACE_DATA1(UTRACE_INFO, "short string = \"%s\"", definition); 509 510 if(U_FAILURE(*status)) return 0; 511 512 UParseError internalParseError; 513 514 if(!parseError) { 515 parseError = &internalParseError; 516 } 517 parseError->line = 0; 518 parseError->offset = 0; 519 parseError->preContext[0] = 0; 520 parseError->postContext[0] = 0; 521 522 
523 // first we want to pick stuff out of short string. 524 // we'll end up with an UCA version, locale and a bunch of 525 // settings 526 527 // analyse the string in order to get everything we need. 528 const char *string = definition; 529 CollatorSpec s; 530 ucol_sit_initCollatorSpecs(&s); 531 string = ucol_sit_readSpecs(&s, definition, parseError, status); 532 ucol_sit_calculateWholeLocale(&s); 533 534 char buffer[internalBufferSize]; 535 uprv_memset(buffer, 0, internalBufferSize); 536 uloc_canonicalize(s.locale, buffer, internalBufferSize, status); 537 538 UCollator *result = ucol_open(buffer, status); 539 int32_t i = 0; 540 541 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { 542 if(s.options[i] != UCOL_DEFAULT) { 543 if(forceDefaults || ucol_getAttribute(result, (UColAttribute)i, status) != s.options[i]) { 544 ucol_setAttribute(result, (UColAttribute)i, s.options[i], status); 545 } 546 547 if(U_FAILURE(*status)) { 548 parseError->offset = (int32_t)(string - definition); 549 ucol_close(result); 550 return NULL; 551 } 552 553 } 554 } 555 if(s.variableTopSet) { 556 if(s.variableTopString[0]) { 557 ucol_setVariableTop(result, s.variableTopString, s.variableTopStringLen, status); 558 } else { // we set by value, using 'B' 559 ucol_restoreVariableTop(result, s.variableTopValue, status); 560 } 561 } 562 563 564 if(U_FAILURE(*status)) { // here it can only be a bogus value 565 ucol_close(result); 566 result = NULL; 567 } 568 569 UTRACE_EXIT_PTR_STATUS(result, *status); 570 return result; 571 } 572 573 574 static void appendShortStringElement(const char *src, int32_t len, char *result, int32_t *resultSize, int32_t capacity, char arg) 575 { 576 if(len) { 577 if(*resultSize) { 578 if(*resultSize < capacity) { 579 uprv_strcat(result, "_"); 580 } 581 (*resultSize)++; 582 } 583 *resultSize += len + 1; 584 if(*resultSize < capacity) { 585 uprv_strncat(result, &arg, 1); 586 uprv_strncat(result, src, len); 587 } 588 } 589 } 590 591 U_CAPI int32_t U_EXPORT2 592 
ucol_getShortDefinitionString(const UCollator *coll, 593 const char *locale, 594 char *dst, 595 int32_t capacity, 596 UErrorCode *status) 597 { 598 if(U_FAILURE(*status)) return 0; 599 if(coll->delegate != NULL) { 600 return ((icu::Collator*)coll->delegate)->internalGetShortDefinitionString(locale,dst,capacity,*status); 601 } 602 char buffer[internalBufferSize]; 603 uprv_memset(buffer, 0, internalBufferSize*sizeof(char)); 604 int32_t resultSize = 0; 605 char tempbuff[internalBufferSize]; 606 char locBuff[internalBufferSize]; 607 uprv_memset(buffer, 0, internalBufferSize*sizeof(char)); 608 int32_t elementSize = 0; 609 UBool isAvailable = 0; 610 CollatorSpec s; 611 ucol_sit_initCollatorSpecs(&s); 612 613 if(!locale) { 614 locale = ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, status); 615 } 616 elementSize = ucol_getFunctionalEquivalent(locBuff, internalBufferSize, "collation", locale, &isAvailable, status); 617 618 if(elementSize) { 619 // we should probably canonicalize here... 620 elementSize = uloc_getLanguage(locBuff, tempbuff, internalBufferSize, status); 621 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, languageArg); 622 elementSize = uloc_getCountry(locBuff, tempbuff, internalBufferSize, status); 623 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, regionArg); 624 elementSize = uloc_getScript(locBuff, tempbuff, internalBufferSize, status); 625 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, scriptArg); 626 elementSize = uloc_getVariant(locBuff, tempbuff, internalBufferSize, status); 627 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, variantArg); 628 elementSize = uloc_getKeywordValue(locBuff, "collation", tempbuff, internalBufferSize, status); 629 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, 
keywordArg); 630 } 631 632 int32_t i = 0; 633 UColAttributeValue attribute = UCOL_DEFAULT; 634 for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) { 635 if(options[i].action == _processCollatorOption) { 636 attribute = ucol_getAttributeOrDefault(coll, (UColAttribute)options[i].attr, status); 637 if(attribute != UCOL_DEFAULT) { 638 char letter = ucol_sit_attributeValueToLetter(attribute, status); 639 appendShortStringElement(&letter, 1, 640 buffer, &resultSize, /*capacity*/internalBufferSize, options[i].optionStart); 641 } 642 } 643 } 644 if(coll->variableTopValueisDefault == FALSE) { 645 //s.variableTopValue = ucol_getVariableTop(coll, status); 646 elementSize = T_CString_integerToString(tempbuff, coll->variableTopValue, 16); 647 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, variableTopValArg); 648 } 649 650 UParseError parseError; 651 return ucol_normalizeShortDefinitionString(buffer, dst, capacity, &parseError, status); 652 } 653 654 U_CAPI int32_t U_EXPORT2 655 ucol_normalizeShortDefinitionString(const char *definition, 656 char *destination, 657 int32_t capacity, 658 UParseError *parseError, 659 UErrorCode *status) 660 { 661 662 if(U_FAILURE(*status)) { 663 return 0; 664 } 665 666 if(destination) { 667 uprv_memset(destination, 0, capacity*sizeof(char)); 668 } 669 670 UParseError pe; 671 if(!parseError) { 672 parseError = &pe; 673 } 674 675 // validate 676 CollatorSpec s; 677 ucol_sit_initCollatorSpecs(&s); 678 ucol_sit_readSpecs(&s, definition, parseError, status); 679 return ucol_sit_dumpSpecs(&s, destination, capacity, status); 680 } 681 682 U_CAPI UColAttributeValue U_EXPORT2 683 ucol_getAttributeOrDefault(const UCollator *coll, UColAttribute attr, UErrorCode *status) 684 { 685 if(U_FAILURE(*status) || coll == NULL) { 686 return UCOL_DEFAULT; 687 } 688 switch(attr) { 689 case UCOL_NUMERIC_COLLATION: 690 return coll->numericCollationisDefault?UCOL_DEFAULT:coll->numericCollation; 691 case UCOL_HIRAGANA_QUATERNARY_MODE: 692 return 
coll->hiraganaQisDefault?UCOL_DEFAULT:coll->hiraganaQ; 693 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ 694 return coll->frenchCollationisDefault?UCOL_DEFAULT:coll->frenchCollation; 695 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ 696 return coll->alternateHandlingisDefault?UCOL_DEFAULT:coll->alternateHandling; 697 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ 698 return coll->caseFirstisDefault?UCOL_DEFAULT:coll->caseFirst; 699 case UCOL_CASE_LEVEL: /* do we have an extra case level */ 700 return coll->caseLevelisDefault?UCOL_DEFAULT:coll->caseLevel; 701 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ 702 return coll->normalizationModeisDefault?UCOL_DEFAULT:coll->normalizationMode; 703 case UCOL_STRENGTH: /* attribute for strength */ 704 return coll->strengthisDefault?UCOL_DEFAULT:coll->strength; 705 case UCOL_ATTRIBUTE_COUNT: 706 default: 707 *status = U_ILLEGAL_ARGUMENT_ERROR; 708 #ifdef UCOL_TRACE_SIT 709 fprintf(stderr, "%s:%d: Unknown attr value '%d': %s\n", __FILE__, __LINE__, (int)attr, u_errorName(*status)); 710 #endif 711 break; 712 } 713 return UCOL_DEFAULT; 714 } 715 716 717 struct contContext { 718 const UCollator *coll; 719 USet *conts; 720 USet *expansions; 721 USet *removedContractions; 722 UBool addPrefixes; 723 UErrorCode *status; 724 }; 725 726 727 728 static void 729 addSpecial(contContext *context, UChar *buffer, int32_t bufLen, 730 uint32_t CE, int32_t leftIndex, int32_t rightIndex, UErrorCode *status) 731 { 732 const UCollator *coll = context->coll; 733 USet *contractions = context->conts; 734 USet *expansions = context->expansions; 735 UBool addPrefixes = context->addPrefixes; 736 737 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE); 738 uint32_t newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 739 // we might have a contraction that ends from previous level 740 if(newCE != UCOL_NOT_FOUND) { 741 
if(isSpecial(CE) && getCETag(CE) == CONTRACTION_TAG && isSpecial(newCE) && getCETag(newCE) == SPEC_PROC_TAG && addPrefixes) { 742 addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status); 743 } 744 if(contractions && rightIndex-leftIndex > 1) { 745 uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex); 746 if(expansions && isSpecial(CE) && getCETag(CE) == EXPANSION_TAG) { 747 uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex); 748 } 749 } 750 } 751 752 UCharOffset++; 753 // check whether we're doing contraction or prefix 754 if(getCETag(CE) == SPEC_PROC_TAG && addPrefixes) { 755 if(leftIndex == 0) { 756 *status = U_INTERNAL_PROGRAM_ERROR; 757 return; 758 } 759 --leftIndex; 760 while(*UCharOffset != 0xFFFF) { 761 newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 762 buffer[leftIndex] = *UCharOffset; 763 if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) { 764 addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status); 765 } else { 766 if(contractions) { 767 uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex); 768 } 769 if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) { 770 uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex); 771 } 772 } 773 UCharOffset++; 774 } 775 } else if(getCETag(CE) == CONTRACTION_TAG) { 776 if(rightIndex == bufLen-1) { 777 *status = U_INTERNAL_PROGRAM_ERROR; 778 return; 779 } 780 while(*UCharOffset != 0xFFFF) { 781 newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 782 buffer[rightIndex] = *UCharOffset; 783 if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) { 784 addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex+1, status); 785 } else { 786 if(contractions) { 787 uset_addString(contractions, buffer+leftIndex, rightIndex+1-leftIndex); 788 } 789 if(expansions && isSpecial(newCE) 
&& getCETag(newCE) == EXPANSION_TAG) { 790 uset_addString(expansions, buffer+leftIndex, rightIndex+1-leftIndex); 791 } 792 } 793 UCharOffset++; 794 } 795 } 796 797 } 798 799 U_CDECL_BEGIN 800 static UBool U_CALLCONV 801 _processSpecials(const void *context, UChar32 start, UChar32 limit, uint32_t CE) 802 { 803 UErrorCode *status = ((contContext *)context)->status; 804 USet *expansions = ((contContext *)context)->expansions; 805 USet *removed = ((contContext *)context)->removedContractions; 806 UBool addPrefixes = ((contContext *)context)->addPrefixes; 807 UChar contraction[internalBufferSize]; 808 if(isSpecial(CE)) { 809 if(((getCETag(CE) == SPEC_PROC_TAG && addPrefixes) || getCETag(CE) == CONTRACTION_TAG)) { 810 while(start < limit && U_SUCCESS(*status)) { 811 // if there are suppressed contractions, we don't 812 // want to add them. 813 if(removed && uset_contains(removed, start)) { 814 start++; 815 continue; 816 } 817 // we start our contraction from middle, since we don't know if it 818 // will grow toward right or left 819 contraction[internalBufferSize/2] = (UChar)start; 820 addSpecial(((contContext *)context), contraction, internalBufferSize, CE, internalBufferSize/2, internalBufferSize/2+1, status); 821 start++; 822 } 823 } else if(expansions && getCETag(CE) == EXPANSION_TAG) { 824 while(start < limit && U_SUCCESS(*status)) { 825 uset_add(expansions, start++); 826 } 827 } 828 } 829 if(U_FAILURE(*status)) { 830 return FALSE; 831 } else { 832 return TRUE; 833 } 834 } 835 836 U_CDECL_END 837 838 839 840 /** 841 * Get a set containing the contractions defined by the collator. 
The set includes 842 * both the UCA contractions and the contractions defined by the collator 843 * @param coll collator 844 * @param conts the set to hold the result 845 * @param status to hold the error code 846 * @return the size of the contraction set 847 */ 848 U_CAPI int32_t U_EXPORT2 849 ucol_getContractions( const UCollator *coll, 850 USet *contractions, 851 UErrorCode *status) 852 { 853 ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status); 854 return uset_getItemCount(contractions); 855 } 856 857 /** 858 * Get a set containing the expansions defined by the collator. The set includes 859 * both the UCA expansions and the expansions defined by the tailoring 860 * @param coll collator 861 * @param conts the set to hold the result 862 * @param addPrefixes add the prefix contextual elements to contractions 863 * @param status to hold the error code 864 * 865 * @draft ICU 3.4 866 */ 867 U_CAPI void U_EXPORT2 868 ucol_getContractionsAndExpansions( const UCollator *coll, 869 USet *contractions, 870 USet *expansions, 871 UBool addPrefixes, 872 UErrorCode *status) 873 { 874 if(U_FAILURE(*status)) { 875 return; 876 } 877 if(coll == NULL) { 878 *status = U_ILLEGAL_ARGUMENT_ERROR; 879 return; 880 } 881 882 if(contractions) { 883 uset_clear(contractions); 884 } 885 if(expansions) { 886 uset_clear(expansions); 887 } 888 int32_t rulesLen = 0; 889 const UChar* rules = ucol_getRules(coll, &rulesLen); 890 UColTokenParser src; 891 ucol_tok_initTokenList(&src, rules, rulesLen, coll->UCA, 892 ucol_tok_getRulesFromBundle, NULL, status); 893 894 contContext c = { NULL, contractions, expansions, src.removeSet, addPrefixes, status }; 895 896 // Add the UCA contractions 897 c.coll = coll->UCA; 898 utrie_enum(&coll->UCA->mapping, NULL, _processSpecials, &c); 899 900 // This is collator specific. 
Add contractions from a collator 901 c.coll = coll; 902 c.removedContractions = NULL; 903 utrie_enum(&coll->mapping, NULL, _processSpecials, &c); 904 ucol_tok_closeTokenList(&src); 905 } 906 907 U_CAPI int32_t U_EXPORT2 908 ucol_getUnsafeSet( const UCollator *coll, 909 USet *unsafe, 910 UErrorCode *status) 911 { 912 UChar buffer[internalBufferSize]; 913 int32_t len = 0; 914 915 uset_clear(unsafe); 916 917 // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant 918 static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 919 0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 }; 920 921 // add chars that fail the fcd check 922 uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status); 923 924 // add Thai/Lao prevowels 925 uset_addRange(unsafe, 0xe40, 0xe44); 926 uset_addRange(unsafe, 0xec0, 0xec4); 927 // add lead/trail surrogates 928 uset_addRange(unsafe, 0xd800, 0xdfff); 929 930 USet *contractions = uset_open(0,0); 931 932 int32_t i = 0, j = 0; 933 int32_t contsSize = ucol_getContractions(coll, contractions, status); 934 UChar32 c = 0; 935 // Contraction set consists only of strings 936 // to get unsafe code points, we need to 937 // break the strings apart and add them to the unsafe set 938 for(i = 0; i < contsSize; i++) { 939 len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status); 940 if(len > 0) { 941 j = 0; 942 while(j < len) { 943 U16_NEXT(buffer, j, len, c); 944 if(j < len) { 945 uset_add(unsafe, c); 946 } 947 } 948 } 949 } 950 951 uset_close(contractions); 952 953 return uset_size(unsafe); 954 } 955 #endif 956