1 /* 2 ******************************************************************************* 3 * Copyright (c) 1996-2010, International Business Machines Corporation and others. 4 * All Rights Reserved. 5 ******************************************************************************* 6 */ 7 8 #ifndef UCOL_H 9 #define UCOL_H 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_COLLATION 14 15 #include "unicode/unorm.h" 16 #include "unicode/localpointer.h" 17 #include "unicode/parseerr.h" 18 #include "unicode/uloc.h" 19 #include "unicode/uset.h" 20 21 /** 22 * \file 23 * \brief C API: Collator 24 * 25 * <h2> Collator C API </h2> 26 * 27 * The C API for Collator performs locale-sensitive 28 * string comparison. You use this service to build 29 * searching and sorting routines for natural language text. 30 * <em>Important: </em>The ICU collation service has been reimplemented 31 * in order to achieve better performance and UCA compliance. 32 * For details, see the 33 * <a href="http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm"> 34 * collation design document</a>. 35 * <p> 36 * For more information about the collation service see 37 * <a href="http://icu-project.org/userguide/Collate_Intro.html">the users guide</a>. 38 * <p> 39 * Collation service provides correct sorting orders for most locales supported in ICU. 40 * If specific data for a locale is not available, the orders eventually falls back 41 * to the <a href="http://www.unicode.org/unicode/reports/tr10/">UCA sort order</a>. 42 * <p> 43 * Sort ordering may be customized by providing your own set of rules. For more on 44 * this subject see the 45 * <a href="http://icu-project.org/userguide/Collate_Customization.html"> 46 * Collation customization</a> section of the users guide. 47 * <p> 48 * @see UCollationResult 49 * @see UNormalizationMode 50 * @see UCollationStrength 51 * @see UCollationElements 52 */ 53 54 /** A collator. 55 * For usage in C programs. 
56 */ 57 struct UCollator; 58 /** structure representing a collator object instance 59 * @stable ICU 2.0 60 */ 61 typedef struct UCollator UCollator; 62 63 64 /** 65 * UCOL_LESS is returned if source string is compared to be less than target 66 * string in the u_strcoll() method. 67 * UCOL_EQUAL is returned if source string is compared to be equal to target 68 * string in the u_strcoll() method. 69 * UCOL_GREATER is returned if source string is compared to be greater than 70 * target string in the u_strcoll() method. 71 * @see u_strcoll() 72 * <p> 73 * Possible values for a comparison result 74 * @stable ICU 2.0 75 */ 76 typedef enum { 77 /** string a == string b */ 78 UCOL_EQUAL = 0, 79 /** string a > string b */ 80 UCOL_GREATER = 1, 81 /** string a < string b */ 82 UCOL_LESS = -1 83 } UCollationResult ; 84 85 86 /** Enum containing attribute values for controling collation behavior. 87 * Here are all the allowable values. Not every attribute can take every value. The only 88 * universal value is UCOL_DEFAULT, which resets the attribute value to the predefined 89 * value for that locale 90 * @stable ICU 2.0 91 */ 92 typedef enum { 93 /** accepted by most attributes */ 94 UCOL_DEFAULT = -1, 95 96 /** Primary collation strength */ 97 UCOL_PRIMARY = 0, 98 /** Secondary collation strength */ 99 UCOL_SECONDARY = 1, 100 /** Tertiary collation strength */ 101 UCOL_TERTIARY = 2, 102 /** Default collation strength */ 103 UCOL_DEFAULT_STRENGTH = UCOL_TERTIARY, 104 UCOL_CE_STRENGTH_LIMIT, 105 /** Quaternary collation strength */ 106 UCOL_QUATERNARY=3, 107 /** Identical collation strength */ 108 UCOL_IDENTICAL=15, 109 UCOL_STRENGTH_LIMIT, 110 111 /** Turn the feature off - works for UCOL_FRENCH_COLLATION, 112 UCOL_CASE_LEVEL, UCOL_HIRAGANA_QUATERNARY_MODE 113 & UCOL_DECOMPOSITION_MODE*/ 114 UCOL_OFF = 16, 115 /** Turn the feature on - works for UCOL_FRENCH_COLLATION, 116 UCOL_CASE_LEVEL, UCOL_HIRAGANA_QUATERNARY_MODE 117 & UCOL_DECOMPOSITION_MODE*/ 118 UCOL_ON = 17, 119 120 
/** Valid for UCOL_ALTERNATE_HANDLING. Alternate handling will be shifted */ 121 UCOL_SHIFTED = 20, 122 /** Valid for UCOL_ALTERNATE_HANDLING. Alternate handling will be non ignorable */ 123 UCOL_NON_IGNORABLE = 21, 124 125 /** Valid for UCOL_CASE_FIRST - 126 lower case sorts before upper case */ 127 UCOL_LOWER_FIRST = 24, 128 /** upper case sorts before lower case */ 129 UCOL_UPPER_FIRST = 25, 130 131 UCOL_ATTRIBUTE_VALUE_COUNT 132 133 } UColAttributeValue; 134 135 /** 136 * Base letter represents a primary difference. Set comparison 137 * level to UCOL_PRIMARY to ignore secondary and tertiary differences. 138 * Use this to set the strength of a Collator object. 139 * Example of primary difference, "abc" < "abd" 140 * 141 * Diacritical differences on the same base letter represent a secondary 142 * difference. Set comparison level to UCOL_SECONDARY to ignore tertiary 143 * differences. Use this to set the strength of a Collator object. 144 * Example of secondary difference, "ä" >> "a". 145 * 146 * Uppercase and lowercase versions of the same character represents a 147 * tertiary difference. Set comparison level to UCOL_TERTIARY to include 148 * all comparison differences. Use this to set the strength of a Collator 149 * object. 150 * Example of tertiary difference, "abc" <<< "ABC". 151 * 152 * Two characters are considered "identical" when they have the same 153 * unicode spellings. UCOL_IDENTICAL. 154 * For example, "ä" == "ä". 155 * 156 * UCollationStrength is also used to determine the strength of sort keys 157 * generated from UCollator objects 158 * These values can be now found in the UColAttributeValue enum. 159 * @stable ICU 2.0 160 **/ 161 typedef UColAttributeValue UCollationStrength; 162 163 /** Attributes that collation service understands. All the attributes can take UCOL_DEFAULT 164 * value, as well as the values specific to each one. 
165 * @stable ICU 2.0 166 */ 167 typedef enum { 168 /** Attribute for direction of secondary weights - used in French. 169 * Acceptable values are UCOL_ON, which results in secondary weights 170 * being considered backwards and UCOL_OFF which treats secondary 171 * weights in the order they appear.*/ 172 UCOL_FRENCH_COLLATION, 173 /** Attribute for handling variable elements. 174 * Acceptable values are UCOL_NON_IGNORABLE (default) 175 * which treats all the codepoints with non-ignorable 176 * primary weights in the same way, 177 * and UCOL_SHIFTED which causes codepoints with primary 178 * weights that are equal or below the variable top value 179 * to be ignored on primary level and moved to the quaternary 180 * level.*/ 181 UCOL_ALTERNATE_HANDLING, 182 /** Controls the ordering of upper and lower case letters. 183 * Acceptable values are UCOL_OFF (default), which orders 184 * upper and lower case letters in accordance to their tertiary 185 * weights, UCOL_UPPER_FIRST which forces upper case letters to 186 * sort before lower case letters, and UCOL_LOWER_FIRST which does 187 * the opposite. */ 188 UCOL_CASE_FIRST, 189 /** Controls whether an extra case level (positioned before the third 190 * level) is generated or not. Acceptable values are UCOL_OFF (default), 191 * when case level is not generated, and UCOL_ON which causes the case 192 * level to be generated. Contents of the case level are affected by 193 * the value of UCOL_CASE_FIRST attribute. A simple way to ignore 194 * accent differences in a string is to set the strength to UCOL_PRIMARY 195 * and enable case level. */ 196 UCOL_CASE_LEVEL, 197 /** Controls whether the normalization check and necessary normalizations 198 * are performed. When set to UCOL_OFF (default) no normalization check 199 * is performed. The correctness of the result is guaranteed only if the 200 * input data is in so-called FCD form (see users manual for more info). 
201 * When set to UCOL_ON, an incremental check is performed to see whether 202 * the input data is in the FCD form. If the data is not in the FCD form, 203 * incremental NFD normalization is performed. */ 204 UCOL_NORMALIZATION_MODE, 205 /** An alias for UCOL_NORMALIZATION_MODE attribute */ 206 UCOL_DECOMPOSITION_MODE = UCOL_NORMALIZATION_MODE, 207 /** The strength attribute. Can be either UCOL_PRIMARY, UCOL_SECONDARY, 208 * UCOL_TERTIARY, UCOL_QUATERNARY or UCOL_IDENTICAL. The usual strength 209 * for most locales (except Japanese) is tertiary. Quaternary strength 210 * is useful when combined with shifted setting for alternate handling 211 * attribute and for JIS x 4061 collation, when it is used to distinguish 212 * between Katakana and Hiragana (this is achieved by setting the 213 * UCOL_HIRAGANA_QUATERNARY mode to on. Otherwise, quaternary level 214 * is affected only by the number of non ignorable code points in 215 * the string. Identical strength is rarely useful, as it amounts 216 * to codepoints of the NFD form of the string. */ 217 UCOL_STRENGTH, 218 /** When turned on, this attribute positions Hiragana before all 219 * non-ignorables on quaternary level This is a sneaky way to produce JIS 220 * sort order */ 221 UCOL_HIRAGANA_QUATERNARY_MODE, 222 /** When turned on, this attribute generates a collation key 223 * for the numeric value of substrings of digits. 224 * This is a way to get '100' to sort AFTER '2'. Note that the longest 225 * digit substring that can be treated as a single collation element is 226 * 254 digits (not counting leading zeros). If a digit substring is 227 * longer than that, the digits beyond the limit will be treated as a 228 * separate digit substring associated with a separate collation element. 
*/ 229 UCOL_NUMERIC_COLLATION, 230 UCOL_ATTRIBUTE_COUNT 231 } UColAttribute; 232 233 /** Options for retrieving the rule string 234 * @stable ICU 2.0 235 */ 236 typedef enum { 237 /** Retrieve tailoring only */ 238 UCOL_TAILORING_ONLY, 239 /** Retrieve UCA rules and tailoring */ 240 UCOL_FULL_RULES 241 } UColRuleOption ; 242 243 /** 244 * Open a UCollator for comparing strings. 245 * The UCollator pointer is used in all the calls to the Collation 246 * service. After finished, collator must be disposed of by calling 247 * {@link #ucol_close }. 248 * @param loc The locale containing the required collation rules. 249 * Special values for locales can be passed in - 250 * if NULL is passed for the locale, the default locale 251 * collation rules will be used. If empty string ("") or 252 * "root" are passed, UCA rules will be used. 253 * @param status A pointer to an UErrorCode to receive any errors 254 * @return A pointer to a UCollator, or 0 if an error occurred. 255 * @see ucol_openRules 256 * @see ucol_safeClone 257 * @see ucol_close 258 * @stable ICU 2.0 259 */ 260 U_STABLE UCollator* U_EXPORT2 261 ucol_open(const char *loc, UErrorCode *status); 262 263 /** 264 * Produce an UCollator instance according to the rules supplied. 265 * The rules are used to change the default ordering, defined in the 266 * UCA in a process called tailoring. The resulting UCollator pointer 267 * can be used in the same way as the one obtained by {@link #ucol_strcoll }. 268 * @param rules A string describing the collation rules. For the syntax 269 * of the rules please see users guide. 270 * @param rulesLength The length of rules, or -1 if null-terminated. 
271 * @param normalizationMode The normalization mode: One of 272 * UCOL_OFF (expect the text to not need normalization), 273 * UCOL_ON (normalize), or 274 * UCOL_DEFAULT (set the mode according to the rules) 275 * @param strength The default collation strength; one of UCOL_PRIMARY, UCOL_SECONDARY, 276 * UCOL_TERTIARY, UCOL_IDENTICAL,UCOL_DEFAULT_STRENGTH - can be also set in the rules. 277 * @param parseError A pointer to UParseError to recieve information about errors 278 * occurred during parsing. This argument can currently be set 279 * to NULL, but at users own risk. Please provide a real structure. 280 * @param status A pointer to an UErrorCode to receive any errors 281 * @return A pointer to a UCollator. It is not guaranteed that NULL be returned in case 282 * of error - please use status argument to check for errors. 283 * @see ucol_open 284 * @see ucol_safeClone 285 * @see ucol_close 286 * @stable ICU 2.0 287 */ 288 U_STABLE UCollator* U_EXPORT2 289 ucol_openRules( const UChar *rules, 290 int32_t rulesLength, 291 UColAttributeValue normalizationMode, 292 UCollationStrength strength, 293 UParseError *parseError, 294 UErrorCode *status); 295 296 /** 297 * Open a collator defined by a short form string. 298 * The structure and the syntax of the string is defined in the "Naming collators" 299 * section of the users guide: 300 * http://icu-project.org/userguide/Collate_Concepts.html#Naming_Collators 301 * Attributes are overriden by the subsequent attributes. So, for "S2_S3", final 302 * strength will be 3. 3066bis locale overrides individual locale parts. 303 * The call to this function is equivalent to a call to ucol_open, followed by a 304 * series of calls to ucol_setAttribute and ucol_setVariableTop. 305 * @param definition A short string containing a locale and a set of attributes. 306 * Attributes not explicitly mentioned are left at the default 307 * state for a locale. 
308 * @param parseError if not NULL, structure that will get filled with error's pre 309 * and post context in case of error. 310 * @param forceDefaults if FALSE, the settings that are the same as the collator 311 * default settings will not be applied (for example, setting 312 * French secondary on a French collator would not be executed). 313 * If TRUE, all the settings will be applied regardless of the 314 * collator default value. If the definition 315 * strings are to be cached, should be set to FALSE. 316 * @param status Error code. Apart from regular error conditions connected to 317 * instantiating collators (like out of memory or similar), this 318 * API will return an error if an invalid attribute or attribute/value 319 * combination is specified. 320 * @return A pointer to a UCollator or 0 if an error occured (including an 321 * invalid attribute). 322 * @see ucol_open 323 * @see ucol_setAttribute 324 * @see ucol_setVariableTop 325 * @see ucol_getShortDefinitionString 326 * @see ucol_normalizeShortDefinitionString 327 * @stable ICU 3.0 328 * 329 */ 330 U_STABLE UCollator* U_EXPORT2 331 ucol_openFromShortString( const char *definition, 332 UBool forceDefaults, 333 UParseError *parseError, 334 UErrorCode *status); 335 336 /** 337 * Get a set containing the contractions defined by the collator. The set includes 338 * both the UCA contractions and the contractions defined by the collator. This set 339 * will contain only strings. If a tailoring explicitly suppresses contractions from 340 * the UCA (like Russian), removed contractions will not be in the resulting set. 341 * @param coll collator 342 * @param conts the set to hold the result. It gets emptied before 343 * contractions are added. 
344 * @param status to hold the error code 345 * @return the size of the contraction set 346 * 347 * @deprecated ICU 3.4, use ucol_getContractionsAndExpansions instead 348 */ 349 U_DEPRECATED int32_t U_EXPORT2 350 ucol_getContractions( const UCollator *coll, 351 USet *conts, 352 UErrorCode *status); 353 354 /** 355 * Get a set containing the expansions defined by the collator. The set includes 356 * both the UCA expansions and the expansions defined by the tailoring 357 * @param coll collator 358 * @param contractions if not NULL, the set to hold the contractions 359 * @param expansions if not NULL, the set to hold the expansions 360 * @param addPrefixes add the prefix contextual elements to contractions 361 * @param status to hold the error code 362 * 363 * @stable ICU 3.4 364 */ 365 U_STABLE void U_EXPORT2 366 ucol_getContractionsAndExpansions( const UCollator *coll, 367 USet *contractions, USet *expansions, 368 UBool addPrefixes, UErrorCode *status); 369 370 /** 371 * Close a UCollator. 372 * Once closed, a UCollator should not be used. Every open collator should 373 * be closed. Otherwise, a memory leak will result. 374 * @param coll The UCollator to close. 375 * @see ucol_open 376 * @see ucol_openRules 377 * @see ucol_safeClone 378 * @stable ICU 2.0 379 */ 380 U_STABLE void U_EXPORT2 381 ucol_close(UCollator *coll); 382 383 #if U_SHOW_CPLUSPLUS_API 384 385 U_NAMESPACE_BEGIN 386 387 /** 388 * \class LocalUCollatorPointer 389 * "Smart pointer" class, closes a UCollator via ucol_close(). 390 * For most methods see the LocalPointerBase base class. 391 * 392 * @see LocalPointerBase 393 * @see LocalPointer 394 * @draft ICU 4.4 395 */ 396 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCollatorPointer, UCollator, ucol_close); 397 398 U_NAMESPACE_END 399 400 #endif 401 402 /** 403 * Compare two strings. 404 * The strings will be compared using the options already specified. 405 * @param coll The UCollator containing the comparison rules. 406 * @param source The source string. 
407 * @param sourceLength The length of source, or -1 if null-terminated. 408 * @param target The target string. 409 * @param targetLength The length of target, or -1 if null-terminated. 410 * @return The result of comparing the strings; one of UCOL_EQUAL, 411 * UCOL_GREATER, UCOL_LESS 412 * @see ucol_greater 413 * @see ucol_greaterOrEqual 414 * @see ucol_equal 415 * @stable ICU 2.0 416 */ 417 U_STABLE UCollationResult U_EXPORT2 418 ucol_strcoll( const UCollator *coll, 419 const UChar *source, 420 int32_t sourceLength, 421 const UChar *target, 422 int32_t targetLength); 423 424 /** 425 * Determine if one string is greater than another. 426 * This function is equivalent to {@link #ucol_strcoll } == UCOL_GREATER 427 * @param coll The UCollator containing the comparison rules. 428 * @param source The source string. 429 * @param sourceLength The length of source, or -1 if null-terminated. 430 * @param target The target string. 431 * @param targetLength The length of target, or -1 if null-terminated. 432 * @return TRUE if source is greater than target, FALSE otherwise. 433 * @see ucol_strcoll 434 * @see ucol_greaterOrEqual 435 * @see ucol_equal 436 * @stable ICU 2.0 437 */ 438 U_STABLE UBool U_EXPORT2 439 ucol_greater(const UCollator *coll, 440 const UChar *source, int32_t sourceLength, 441 const UChar *target, int32_t targetLength); 442 443 /** 444 * Determine if one string is greater than or equal to another. 445 * This function is equivalent to {@link #ucol_strcoll } != UCOL_LESS 446 * @param coll The UCollator containing the comparison rules. 447 * @param source The source string. 448 * @param sourceLength The length of source, or -1 if null-terminated. 449 * @param target The target string. 450 * @param targetLength The length of target, or -1 if null-terminated. 451 * @return TRUE if source is greater than or equal to target, FALSE otherwise. 
452 * @see ucol_strcoll 453 * @see ucol_greater 454 * @see ucol_equal 455 * @stable ICU 2.0 456 */ 457 U_STABLE UBool U_EXPORT2 458 ucol_greaterOrEqual(const UCollator *coll, 459 const UChar *source, int32_t sourceLength, 460 const UChar *target, int32_t targetLength); 461 462 /** 463 * Compare two strings for equality. 464 * This function is equivalent to {@link #ucol_strcoll } == UCOL_EQUAL 465 * @param coll The UCollator containing the comparison rules. 466 * @param source The source string. 467 * @param sourceLength The length of source, or -1 if null-terminated. 468 * @param target The target string. 469 * @param targetLength The length of target, or -1 if null-terminated. 470 * @return TRUE if source is equal to target, FALSE otherwise 471 * @see ucol_strcoll 472 * @see ucol_greater 473 * @see ucol_greaterOrEqual 474 * @stable ICU 2.0 475 */ 476 U_STABLE UBool U_EXPORT2 477 ucol_equal(const UCollator *coll, 478 const UChar *source, int32_t sourceLength, 479 const UChar *target, int32_t targetLength); 480 481 /** 482 * Compare two UTF-8 encoded trings. 483 * The strings will be compared using the options already specified. 484 * @param coll The UCollator containing the comparison rules. 485 * @param sIter The source string iterator. 486 * @param tIter The target string iterator. 487 * @return The result of comparing the strings; one of UCOL_EQUAL, 488 * UCOL_GREATER, UCOL_LESS 489 * @param status A pointer to an UErrorCode to receive any errors 490 * @see ucol_strcoll 491 * @stable ICU 2.6 492 */ 493 U_STABLE UCollationResult U_EXPORT2 494 ucol_strcollIter( const UCollator *coll, 495 UCharIterator *sIter, 496 UCharIterator *tIter, 497 UErrorCode *status); 498 499 /** 500 * Get the collation strength used in a UCollator. 501 * The strength influences how strings are compared. 502 * @param coll The UCollator to query. 
503 * @return The collation strength; one of UCOL_PRIMARY, UCOL_SECONDARY, 504 * UCOL_TERTIARY, UCOL_QUATERNARY, UCOL_IDENTICAL 505 * @see ucol_setStrength 506 * @stable ICU 2.0 507 */ 508 U_STABLE UCollationStrength U_EXPORT2 509 ucol_getStrength(const UCollator *coll); 510 511 /** 512 * Set the collation strength used in a UCollator. 513 * The strength influences how strings are compared. 514 * @param coll The UCollator to set. 515 * @param strength The desired collation strength; one of UCOL_PRIMARY, 516 * UCOL_SECONDARY, UCOL_TERTIARY, UCOL_QUATERNARY, UCOL_IDENTICAL, UCOL_DEFAULT 517 * @see ucol_getStrength 518 * @stable ICU 2.0 519 */ 520 U_STABLE void U_EXPORT2 521 ucol_setStrength(UCollator *coll, 522 UCollationStrength strength); 523 524 /** 525 * Get the display name for a UCollator. 526 * The display name is suitable for presentation to a user. 527 * @param objLoc The locale of the collator in question. 528 * @param dispLoc The locale for display. 529 * @param result A pointer to a buffer to receive the attribute. 530 * @param resultLength The maximum size of result. 531 * @param status A pointer to an UErrorCode to receive any errors 532 * @return The total buffer size needed; if greater than resultLength, 533 * the output was truncated. 534 * @stable ICU 2.0 535 */ 536 U_STABLE int32_t U_EXPORT2 537 ucol_getDisplayName( const char *objLoc, 538 const char *dispLoc, 539 UChar *result, 540 int32_t resultLength, 541 UErrorCode *status); 542 543 /** 544 * Get a locale for which collation rules are available. 545 * A UCollator in a locale returned by this function will perform the correct 546 * collation for the locale. 547 * @param localeIndex The index of the desired locale. 548 * @return A locale for which collation rules are available, or 0 if none. 
549 * @see ucol_countAvailable 550 * @stable ICU 2.0 551 */ 552 U_STABLE const char* U_EXPORT2 553 ucol_getAvailable(int32_t localeIndex); 554 555 /** 556 * Determine how many locales have collation rules available. 557 * This function is most useful as determining the loop ending condition for 558 * calls to {@link #ucol_getAvailable }. 559 * @return The number of locales for which collation rules are available. 560 * @see ucol_getAvailable 561 * @stable ICU 2.0 562 */ 563 U_STABLE int32_t U_EXPORT2 564 ucol_countAvailable(void); 565 566 #if !UCONFIG_NO_SERVICE 567 /** 568 * Create a string enumerator of all locales for which a valid 569 * collator may be opened. 570 * @param status input-output error code 571 * @return a string enumeration over locale strings. The caller is 572 * responsible for closing the result. 573 * @stable ICU 3.0 574 */ 575 U_STABLE UEnumeration* U_EXPORT2 576 ucol_openAvailableLocales(UErrorCode *status); 577 #endif 578 579 /** 580 * Create a string enumerator of all possible keywords that are relevant to 581 * collation. At this point, the only recognized keyword for this 582 * service is "collation". 583 * @param status input-output error code 584 * @return a string enumeration over locale strings. The caller is 585 * responsible for closing the result. 586 * @stable ICU 3.0 587 */ 588 U_STABLE UEnumeration* U_EXPORT2 589 ucol_getKeywords(UErrorCode *status); 590 591 /** 592 * Given a keyword, create a string enumeration of all values 593 * for that keyword that are currently in use. 594 * @param keyword a particular keyword as enumerated by 595 * ucol_getKeywords. If any other keyword is passed in, *status is set 596 * to U_ILLEGAL_ARGUMENT_ERROR. 597 * @param status input-output error code 598 * @return a string enumeration over collation keyword values, or NULL 599 * upon error. The caller is responsible for closing the result. 
600 * @stable ICU 3.0 601 */ 602 U_STABLE UEnumeration* U_EXPORT2 603 ucol_getKeywordValues(const char *keyword, UErrorCode *status); 604 605 /** 606 * Given a key and a locale, returns an array of string values in a preferred 607 * order that would make a difference. These are all and only those values where 608 * the open (creation) of the service with the locale formed from the input locale 609 * plus input keyword and that value has different behavior than creation with the 610 * input locale alone. 611 * @param key one of the keys supported by this service. For now, only 612 * "collation" is supported. 613 * @param locale the locale 614 * @param commonlyUsed if set to true it will return only commonly used values 615 * with the given locale in preferred order. Otherwise, 616 * it will return all the available values for the locale. 617 * @param status error status 618 * @return a string enumeration over keyword values for the given key and the locale. 619 * @stable ICU 4.2 620 */ 621 U_STABLE UEnumeration* U_EXPORT2 622 ucol_getKeywordValuesForLocale(const char* key, 623 const char* locale, 624 UBool commonlyUsed, 625 UErrorCode* status); 626 627 /** 628 * Return the functionally equivalent locale for the given 629 * requested locale, with respect to given keyword, for the 630 * collation service. If two locales return the same result, then 631 * collators instantiated for these locales will behave 632 * equivalently. The converse is not always true; two collators 633 * may in fact be equivalent, but return different results, due to 634 * internal details. The return result has no other meaning than 635 * that stated above, and implies nothing as to the relationship 636 * between the two locales. This is intended for use by 637 * applications who wish to cache collators, or otherwise reuse 638 * collators when possible. The functional equivalent may change 639 * over time. 
For more information, please see the <a 640 * href="http://icu-project.org/userguide/locale.html#services"> 641 * Locales and Services</a> section of the ICU User Guide. 642 * @param result fillin for the functionally equivalent locale 643 * @param resultCapacity capacity of the fillin buffer 644 * @param keyword a particular keyword as enumerated by 645 * ucol_getKeywords. 646 * @param locale the requested locale 647 * @param isAvailable if non-NULL, pointer to a fillin parameter that 648 * indicates whether the requested locale was 'available' to the 649 * collation service. A locale is defined as 'available' if it 650 * physically exists within the collation locale data. 651 * @param status pointer to input-output error code 652 * @return the actual buffer size needed for the locale. If greater 653 * than resultCapacity, the returned full name will be truncated and 654 * an error code will be returned. 655 * @stable ICU 3.0 656 */ 657 U_STABLE int32_t U_EXPORT2 658 ucol_getFunctionalEquivalent(char* result, int32_t resultCapacity, 659 const char* keyword, const char* locale, 660 UBool* isAvailable, UErrorCode* status); 661 662 /** 663 * Get the collation rules from a UCollator. 664 * The rules will follow the rule syntax. 665 * @param coll The UCollator to query. 666 * @param length 667 * @return The collation rules. 668 * @stable ICU 2.0 669 */ 670 U_STABLE const UChar* U_EXPORT2 671 ucol_getRules( const UCollator *coll, 672 int32_t *length); 673 674 /** Get the short definition string for a collator. This API harvests the collator's 675 * locale and the attribute set and produces a string that can be used for opening 676 * a collator with the same properties using the ucol_openFromShortString API. 677 * This string will be normalized. 
678 * The structure and the syntax of the string is defined in the "Naming collators" 679 * section of the users guide: 680 * http://icu-project.org/userguide/Collate_Concepts.html#Naming_Collators 681 * This API supports preflighting. 682 * @param coll a collator 683 * @param locale a locale that will appear as a collators locale in the resulting 684 * short string definition. If NULL, the locale will be harvested 685 * from the collator. 686 * @param buffer space to hold the resulting string 687 * @param capacity capacity of the buffer 688 * @param status for returning errors. All the preflighting errors are featured 689 * @return length of the resulting string 690 * @see ucol_openFromShortString 691 * @see ucol_normalizeShortDefinitionString 692 * @stable ICU 3.0 693 */ 694 U_STABLE int32_t U_EXPORT2 695 ucol_getShortDefinitionString(const UCollator *coll, 696 const char *locale, 697 char *buffer, 698 int32_t capacity, 699 UErrorCode *status); 700 701 /** Verifies and normalizes short definition string. 702 * Normalized short definition string has all the option sorted by the argument name, 703 * so that equivalent definition strings are the same. 704 * This API supports preflighting. 705 * @param source definition string 706 * @param destination space to hold the resulting string 707 * @param capacity capacity of the buffer 708 * @param parseError if not NULL, structure that will get filled with error's pre 709 * and post context in case of error. 710 * @param status Error code. This API will return an error if an invalid attribute 711 * or attribute/value combination is specified. All the preflighting 712 * errors are also featured 713 * @return length of the resulting normalized string. 
714 * 715 * @see ucol_openFromShortString 716 * @see ucol_getShortDefinitionString 717 * 718 * @stable ICU 3.0 719 */ 720 721 U_STABLE int32_t U_EXPORT2 722 ucol_normalizeShortDefinitionString(const char *source, 723 char *destination, 724 int32_t capacity, 725 UParseError *parseError, 726 UErrorCode *status); 727 728 729 /** 730 * Get a sort key for a string from a UCollator. 731 * Sort keys may be compared using <TT>strcmp</TT>. 732 * @param coll The UCollator containing the collation rules. 733 * @param source The string to transform. 734 * @param sourceLength The length of source, or -1 if null-terminated. 735 * @param result A pointer to a buffer to receive the attribute. 736 * @param resultLength The maximum size of result. 737 * @return The size needed to fully store the sort key. 738 * If there was an internal error generating the sort key, 739 * a zero value is returned. 740 * @see ucol_keyHashCode 741 * @stable ICU 2.0 742 */ 743 U_STABLE int32_t U_EXPORT2 744 ucol_getSortKey(const UCollator *coll, 745 const UChar *source, 746 int32_t sourceLength, 747 uint8_t *result, 748 int32_t resultLength); 749 750 751 /** Gets the next count bytes of a sort key. Caller needs 752 * to preserve state array between calls and to provide 753 * the same type of UCharIterator set with the same string. 754 * The destination buffer provided must be big enough to store 755 * the number of requested bytes. Generated sortkey is not 756 * compatible with sortkeys generated using ucol_getSortKey 757 * API, since we don't do any compression. If uncompressed 758 * sortkeys are required, this API can be used. 759 * @param coll The UCollator containing the collation rules. 760 * @param iter UCharIterator containing the string we need 761 * the sort key to be calculated for. 762 * @param state Opaque state of sortkey iteration. 763 * @param dest Buffer to hold the resulting sortkey part 764 * @param count number of sort key bytes required. 765 * @param status error code indicator. 
766 * @return the actual number of bytes of a sortkey. It can be 767 * smaller than count if we have reached the end of 768 * the sort key. 769 * @stable ICU 2.6 770 */ 771 U_STABLE int32_t U_EXPORT2 772 ucol_nextSortKeyPart(const UCollator *coll, 773 UCharIterator *iter, 774 uint32_t state[2], 775 uint8_t *dest, int32_t count, 776 UErrorCode *status); 777 778 /** enum that is taken by ucol_getBound API 779 * See below for explanation 780 * do not change the values assigned to the 781 * members of this enum. Underlying code 782 * depends on them having these numbers 783 * @stable ICU 2.0 784 */ 785 typedef enum { 786 /** lower bound */ 787 UCOL_BOUND_LOWER = 0, 788 /** upper bound that will match strings of exact size */ 789 UCOL_BOUND_UPPER = 1, 790 /** upper bound that will match all the strings that have the same initial substring as the given string */ 791 UCOL_BOUND_UPPER_LONG = 2, 792 UCOL_BOUND_VALUE_COUNT 793 } UColBoundMode; 794 795 /** 796 * Produce a bound for a given sortkey and a number of levels. 797 * Return value is always the number of bytes needed, regardless of 798 * whether the result buffer was big enough or even valid.<br> 799 * Resulting bounds can be used to produce a range of strings that are 800 * between upper and lower bounds. For example, if bounds are produced 801 * for a sortkey of string "smith", strings between upper and lower 802 * bounds with one level would include "Smith", "SMITH", "sMiTh".<br> 803 * There are two upper bounds that can be produced. If UCOL_BOUND_UPPER 804 * is produced, strings matched would be as above. However, if bound 805 * produced using UCOL_BOUND_UPPER_LONG is used, the above example will 806 * also match "Smithsonian" and similar.<br> 807 * For more on usage, see example in cintltst/capitst.c in procedure 808 * TestBounds. 809 * Sort keys may be compared using <TT>strcmp</TT>. 810 * @param source The source sortkey. 811 * @param sourceLength The length of source, or -1 if null-terminated. 
 *                     (If an unmodified sortkey is passed, it is always null
 *                      terminated).
 * @param boundType Type of bound required. It can be UCOL_BOUND_LOWER, which
 *                  produces a lower inclusive bound; UCOL_BOUND_UPPER, which
 *                  produces an upper bound that matches strings of the same length;
 *                  or UCOL_BOUND_UPPER_LONG, which matches strings that have the
 *                  same starting substring as the source string.
 * @param noOfLevels Number of levels required in the resulting bound (for most
 *                   uses, the recommended value is 1). See users guide for
 *                   explanation on number of levels a sortkey can have.
 * @param result A pointer to a buffer to receive the resulting sortkey.
 * @param resultLength The maximum size of result.
 * @param status Used for returning error code if something went wrong. If the
 *               number of levels requested is higher than the number of levels
 *               in the source key, a warning (U_SORT_KEY_TOO_SHORT_WARNING) is
 *               issued.
 * @return The size needed to fully store the bound.
 * @see ucol_keyHashCode
 * @stable ICU 2.1
 */
U_STABLE int32_t U_EXPORT2
ucol_getBound(const uint8_t       *source,
              int32_t             sourceLength,
              UColBoundMode       boundType,
              uint32_t            noOfLevels,
              uint8_t             *result,
              int32_t             resultLength,
              UErrorCode          *status);

/**
 * Gets the version information for a Collator. Version is currently
 * an opaque 32-bit number which depends, among other things, on major
 * versions of the collator tailoring and UCA.
 * @param coll The UCollator to query.
 * @param info the version # information, the result will be filled in
 * @stable ICU 2.0
 */
U_STABLE void U_EXPORT2
ucol_getVersion(const UCollator* coll, UVersionInfo info);

/**
 * Gets the UCA version information for a Collator. Version is the
 * UCA version number (3.1.1, 4.0).
 * @param coll The UCollator to query.
 * @param info the version # information, the result will be filled in
 * @stable ICU 2.8
 */
U_STABLE void U_EXPORT2
ucol_getUCAVersion(const UCollator* coll, UVersionInfo info);

/**
 * Merge two sort keys. The levels are merged with their corresponding counterparts
 * (primaries with primaries, secondaries with secondaries etc.). Between the values
 * from the same level a separator is inserted.
 * example (uncompressed):
 * 191B1D 01 050505 01 910505 00 and 1F2123 01 050505 01 910505 00
 * will be merged as
 * 191B1D 02 1F2123 01 050505 02 050505 01 910505 02 910505 00
 * This allows for concatenating of first and last names for sorting, among other things.
 * If the destination buffer is not big enough, the results are undefined.
 * If any of source lengths are zero or any of source pointers are NULL/undefined,
 * result is of size zero.
 * @param src1 pointer to the first sortkey
 * @param src1Length length of the first sortkey
 * @param src2 pointer to the second sortkey
 * @param src2Length length of the second sortkey
 * @param dest buffer to hold the result
 * @param destCapacity size of the buffer for the result
 * @return size of the result. If the buffer is big enough size is always
 *         src1Length+src2Length-1
 *         NOTE(review): the example above merges two 12-byte keys into 24 bytes,
 *         which appears not to match this formula -- confirm against the implementation.
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2
ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
                   const uint8_t *src2, int32_t src2Length,
                   uint8_t *dest, int32_t destCapacity);

/**
 * Universal attribute setter
 * @param coll collator whose attributes are to be changed
 * @param attr attribute type
 * @param value attribute value
 * @param status to indicate whether the operation went on smoothly or there were errors
 * @see UColAttribute
 * @see UColAttributeValue
 * @see ucol_getAttribute
 * @stable ICU 2.0
 */
U_STABLE void U_EXPORT2
ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status);

/**
 * Universal attribute getter
 * @param coll collator whose attributes are to be queried
 * @param attr attribute type
 * @param status to indicate whether the operation went on smoothly or there were errors
 * @return attribute value
 * @see UColAttribute
 * @see UColAttributeValue
 * @see ucol_setAttribute
 * @stable ICU 2.0
 */
U_STABLE UColAttributeValue U_EXPORT2
ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status);

/** Variable top
 * is a two byte primary value which causes all the codepoints with primary values that
 * are less than or equal to the variable top to be shifted when alternate handling is set
 * to UCOL_SHIFTED.
 * Sets the variable top to a collation element value of a string supplied.
 * @param coll collator which variable top needs to be changed
 * @param varTop one or more (if contraction) UChars to which the variable top should be set
 * @param len length of variable top string. If -1 it is considered to be zero terminated.
 * @param status error code. If error code is set, the return value is undefined.
 *    Errors set by this function are: <br>
 *    U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such
 *    contraction<br>
 *    U_PRIMARY_TOO_LONG_ERROR if the primary for the variable top has more than two bytes
 * @return a 32 bit value containing the value of the variable top in upper 16 bits.
 *         Lower 16 bits are undefined
 * @see ucol_getVariableTop
 * @see ucol_restoreVariableTop
 * @stable ICU 2.0
 */
U_STABLE uint32_t U_EXPORT2
ucol_setVariableTop(UCollator *coll,
                    const UChar *varTop, int32_t len,
                    UErrorCode *status);

/**
 * Gets the variable top value of a Collator.
 * Lower 16 bits are undefined and should be ignored.
 * @param coll collator which variable top needs to be retrieved
 * @param status error code (not changed by function). If error code is set,
 *               the return value is undefined.
 * @return the variable top value of a Collator.
 * @see ucol_setVariableTop
 * @see ucol_restoreVariableTop
 * @stable ICU 2.0
 */
U_STABLE uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status);

/**
 * Sets the variable top to a collation element value supplied. Variable top is
 * set to the upper 16 bits.
 * Lower 16 bits are ignored.
 * @param coll collator which variable top needs to be changed
 * @param varTop CE value, as returned by ucol_setVariableTop or ucol_getVariableTop
 * @param status error code (not changed by function)
 * @see ucol_getVariableTop
 * @see ucol_setVariableTop
 * @stable ICU 2.0
 */
U_STABLE void U_EXPORT2
ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status);

/**
 * Thread safe cloning operation. The result is a clone of a given collator.
 * @param coll collator to be cloned
 * @param stackBuffer user allocated space for the new clone.
 *  If NULL new memory will be allocated.
 *  If buffer is not large enough, new memory will be allocated.
 *  Clients can use the U_COL_SAFECLONE_BUFFERSIZE.
 *  This will probably be enough to avoid memory allocations.
 * @param pBufferSize pointer to size of allocated space.
 *  If *pBufferSize == 0, a sufficient size for use in cloning will
 *  be returned ('pre-flighting')
 *  If *pBufferSize is not enough for a stack-based safe clone,
 *  new memory will be allocated.
 * @param status to indicate whether the operation went on smoothly or there were errors
 *    An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any
 *    allocations were necessary.
 * @return pointer to the new clone
 * @see ucol_open
 * @see ucol_openRules
 * @see ucol_close
 * @stable ICU 2.0
 */
U_STABLE UCollator* U_EXPORT2
ucol_safeClone(const UCollator *coll,
               void            *stackBuffer,
               int32_t         *pBufferSize,
               UErrorCode      *status);

/** default memory size for the new clone. It needs to be this large for os/400 large pointers
 * @stable ICU 2.0
 */
#define U_COL_SAFECLONE_BUFFERSIZE 512

/**
 * Returns current rules. Delta defines whether full rules are returned or just the tailoring.
 * Returns number of UChars needed to store rules. If buffer is NULL or bufferLen is not enough
 * to store rules, will store up to available space.
 * @param coll collator to get the rules from
 * @param delta one of UCOL_TAILORING_ONLY, UCOL_FULL_RULES.
 * @param buffer buffer to store the result in. If NULL, you'll get no rules.
 * @param bufferLen length of buffer to store rules in. If less than needed you'll get only the part that fits in.
 * @return number of UChars needed to store the current rules
 * @stable ICU 2.0
 */
U_STABLE int32_t U_EXPORT2
ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen);

/**
 * gets the locale name of the collator. If the collator
 * is instantiated from the rules, then this function returns
 * NULL.
 * @param coll The UCollator for which the locale is needed
 * @param type You can choose between requested, valid and actual
 *             locale. For description see the definition of
 *             ULocDataLocaleType in uloc.h
 * @param status error code of the operation
 * @return real locale name from which the collation data comes.
 *         If the collator was instantiated from rules, returns
 *         NULL.
 * @deprecated ICU 2.8 Use ucol_getLocaleByType instead
 */
U_DEPRECATED const char * U_EXPORT2
ucol_getLocale(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status);


/**
 * gets the locale name of the collator. If the collator
 * is instantiated from the rules, then this function returns
 * NULL.
 * @param coll The UCollator for which the locale is needed
 * @param type You can choose between requested, valid and actual
 *             locale. For description see the definition of
 *             ULocDataLocaleType in uloc.h
 * @param status error code of the operation
 * @return real locale name from which the collation data comes.
 *         If the collator was instantiated from rules, returns
 *         NULL.
 * @stable ICU 2.8
 */
U_STABLE const char * U_EXPORT2
ucol_getLocaleByType(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status);

/**
 * Get an Unicode set that contains all the characters and sequences tailored in
 * this collator. The result must be disposed of by using uset_close.
 * @param coll        The UCollator for which we want to get tailored chars
 * @param status      error code of the operation
 * @return a pointer to newly created USet. Must be disposed of by using uset_close
 * @see ucol_openRules
 * @see uset_close
 * @stable ICU 2.4
 */
U_STABLE USet * U_EXPORT2
ucol_getTailoredSet(const UCollator *coll, UErrorCode *status);

/**
 * Universal attribute getter that returns UCOL_DEFAULT if the value is default
 * @param coll collator whose attributes are to be queried
 * @param attr attribute type
 * @param status to indicate whether the operation went on smoothly or there were errors
 * @return attribute value or UCOL_DEFAULT if the value is default
 * @see UColAttribute
 * @see UColAttributeValue
 * @see ucol_setAttribute
 * @internal ICU 3.0
 */
U_INTERNAL UColAttributeValue U_EXPORT2
ucol_getAttributeOrDefault(const UCollator *coll, UColAttribute attr, UErrorCode *status);

/** Check whether two collators are equal. Collators are considered equal if they
 *  will sort strings the same. This means that both the current attributes and the
 *  rules must be equivalent. Currently used for RuleBasedCollator::operator==.
 *  @param source first collator
 *  @param target second collator
 *  @return TRUE or FALSE
 *  @internal ICU 3.0
 */
U_INTERNAL UBool U_EXPORT2
ucol_equals(const UCollator *source, const UCollator *target);

/** Calculates the set of unsafe code points, given a collator.
 *   A character is unsafe if you could append any character and cause the ordering to alter significantly.
 *   Collation sorts in normalized order, so anything that rearranges in normalization can cause this.
 *   Thus if you have a character like a_umlaut, and you add a lower_dot to it,
 *   then it normalizes to a_lower_dot + umlaut, and sorts differently.
 *  @param coll Collator
 *  @param unsafe a fill-in set to receive the unsafe points
 *  @param status for catching errors
 *  @return number of elements in the set
 *  @internal ICU 3.0
 */
U_INTERNAL int32_t U_EXPORT2
ucol_getUnsafeSet( const UCollator *coll,
                  USet *unsafe,
                  UErrorCode *status);

/** Reset UCA's static pointers. You don't want to use this, unless your static memory can go away.
 *  @internal ICU 3.2.1
 */
U_INTERNAL void U_EXPORT2
ucol_forgetUCA(void);

/** Touches all resources needed for instantiating a collator from a short string definition,
 *  thus filling up the cache.
 * @param definition A short string containing a locale and a set of attributes.
 *                   Attributes not explicitly mentioned are left at the default
 *                   state for a locale.
 * @param forceDefaults if FALSE, the settings that are the same as the collator
 *                   default settings will not be applied (for example, setting
 *                   French secondary on a French collator would not be executed).
 *                   If TRUE, all the settings will be applied regardless of the
 *                   collator default value. If the definition
 *                   strings are to be cached, should be set to FALSE.
 * @param parseError if not NULL, structure that will get filled with error's pre
 *                   and post context in case of error.
 * @param status     Error code. Apart from regular error conditions connected to
 *                   instantiating collators (like out of memory or similar), this
 *                   API will return an error if an invalid attribute or attribute/value
 *                   combination is specified.
 * @see ucol_openFromShortString
 * @internal ICU 3.2.1
 */
U_INTERNAL void U_EXPORT2
ucol_prepareShortStringOpen( const char *definition,
                          UBool forceDefaults,
                          UParseError *parseError,
                          UErrorCode *status);

/** Creates a binary image of a collator. This binary image can be stored and
 *  later used to instantiate a collator using ucol_openBinary.
 *  This API supports preflighting.
 *  @param coll Collator
 *  @param buffer a fill-in buffer to receive the binary image
 *  @param capacity capacity of the destination buffer
 *  @param status for catching errors
 *  @return size of the image
 *  @see ucol_openBinary
 *  @stable ICU 3.2
 */
U_STABLE int32_t U_EXPORT2
ucol_cloneBinary(const UCollator *coll,
                 uint8_t *buffer, int32_t capacity,
                 UErrorCode *status);

/** Opens a collator from a collator binary image created using
 *  ucol_cloneBinary. Binary image used in instantiation of the
 *  collator remains owned by the user and should stay around for
 *  the lifetime of the collator. The API also takes a base collator
 *  which usually should be UCA.
 *  @param bin binary image owned by the user and required through the
 *             lifetime of the collator
 *  @param length size of the image. If negative, the API will try to
 *                figure out the length of the image
 *  @param base fallback collator, usually UCA. Base is required to be
 *              present through the lifetime of the collator. Currently
 *              it cannot be NULL.
 *  @param status for catching errors
 *  @return newly created collator
 *  @see ucol_cloneBinary
 *  @stable ICU 3.2
 */
U_STABLE UCollator* U_EXPORT2
ucol_openBinary(const uint8_t *bin, int32_t length,
                const UCollator *base,
                UErrorCode *status);


#endif /* #if !UCONFIG_NO_COLLATION */

#endif /* UCOL_H */
