1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2005-2009, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: ucasemap.h 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2005may06 14 * created by: Markus W. Scherer 15 * 16 * Case mapping service object and functions using it. 17 */ 18 19 #ifndef __UCASEMAP_H__ 20 #define __UCASEMAP_H__ 21 22 #include "unicode/utypes.h" 23 #include "unicode/ustring.h" 24 25 /** 26 * \file 27 * \brief C API: Unicode case mapping functions using a UCaseMap service object. 28 * 29 * The service object takes care of memory allocations, data loading, and setup 30 * for the attributes, as usual. 31 * 32 * Currently, the functionality provided here does not overlap with uchar.h 33 * and ustring.h, except for ucasemap_toTitle(). 34 * 35 * ucasemap_utf8XYZ() functions operate directly on UTF-8 strings. 36 */ 37 38 /** 39 * UCaseMap is an opaque service object for newer ICU case mapping functions. 40 * Older functions did not use a service object. 41 * @stable ICU 3.4 42 */ 43 struct UCaseMap; 44 typedef struct UCaseMap UCaseMap; /**< C typedef for struct UCaseMap. @stable ICU 3.4 */ 45 46 /** 47 * Open a UCaseMap service object for a locale and a set of options. 48 * The locale ID and options are preprocessed so that functions using the 49 * service object need not process them in each call. 50 * 51 * @param locale ICU locale ID, used for language-dependent 52 * upper-/lower-/title-casing according to the Unicode standard. 53 * Usual semantics: ""=root, NULL=default locale, etc. 54 * @param options Options bit set, used for case folding and string comparisons. 55 * Same flags as for u_foldCase(), u_strFoldCase(), 56 * u_strCaseCompare(), etc. 57 * Use 0 or U_FOLD_CASE_DEFAULT for default behavior. 58 * @param pErrorCode Must be a valid pointer to an error code value, 59 * which must not indicate a failure before the function call. 60 * @return Pointer to a UCaseMap service object, if successful. 61 * 62 * @see U_FOLD_CASE_DEFAULT 63 * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I 64 * @see U_TITLECASE_NO_LOWERCASE 65 * @see U_TITLECASE_NO_BREAK_ADJUSTMENT 66 * @stable ICU 3.4 67 */ 68 U_STABLE UCaseMap * U_EXPORT2 69 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode); 70 71 /** 72 * Close a UCaseMap service object. 73 * @param csm Object to be closed. 74 * @stable ICU 3.4 75 */ 76 U_STABLE void U_EXPORT2 77 ucasemap_close(UCaseMap *csm); 78 79 /** 80 * Get the locale ID that is used for language-dependent case mappings. 81 * @param csm UCaseMap service object. 82 * @return locale ID 83 * @stable ICU 3.4 84 */ 85 U_STABLE const char * U_EXPORT2 86 ucasemap_getLocale(const UCaseMap *csm); 87 88 /** 89 * Get the options bit set that is used for case folding and string comparisons. 90 * @param csm UCaseMap service object. 91 * @return options bit set 92 * @stable ICU 3.4 93 */ 94 U_STABLE uint32_t U_EXPORT2 95 ucasemap_getOptions(const UCaseMap *csm); 96 97 /** 98 * Set the locale ID that is used for language-dependent case mappings. 99 * 100 * @param csm UCaseMap service object. 101 * @param locale Locale ID, see ucasemap_open(). 102 * @param pErrorCode Must be a valid pointer to an error code value, 103 * which must not indicate a failure before the function call. 104 * 105 * @see ucasemap_open 106 * @stable ICU 3.4 107 */ 108 U_STABLE void U_EXPORT2 109 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode); 110 111 /** 112 * Set the options bit set that is used for case folding and string comparisons. 113 * 114 * @param csm UCaseMap service object. 115 * @param options Options bit set, see ucasemap_open(). 116 * @param pErrorCode Must be a valid pointer to an error code value, 117 * which must not indicate a failure before the function call. 118 * 119 * @see ucasemap_open 120 * @stable ICU 3.4 121 */ 122 U_STABLE void U_EXPORT2 123 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode); 124 125 /** 126 * Do not lowercase non-initial parts of words when titlecasing. 127 * Option bit for titlecasing APIs that take an options bit set. 128 * 129 * By default, titlecasing will titlecase the first cased character 130 * of a word and lowercase all other characters. 131 * With this option, the other characters will not be modified. 132 * 133 * @see ucasemap_setOptions 134 * @see ucasemap_toTitle 135 * @see ucasemap_utf8ToTitle 136 * @see UnicodeString::toTitle 137 * @stable ICU 3.8 138 */ 139 #define U_TITLECASE_NO_LOWERCASE 0x100 140 141 /** 142 * Do not adjust the titlecasing indexes from BreakIterator::next() indexes; 143 * titlecase exactly the characters at breaks from the iterator. 144 * Option bit for titlecasing APIs that take an options bit set. 145 * 146 * By default, titlecasing will take each break iterator index, 147 * adjust it by looking for the next cased character, and titlecase that one. 148 * Other characters are lowercased. 149 * 150 * This follows Unicode 4 & 5 section 3.13 Default Case Operations: 151 * 152 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex 153 * #29, "Text Boundaries." Between each pair of word boundaries, find the first 154 * cased character F. If F exists, map F to default_title(F); then map each 155 * subsequent character C to default_lower(C). 156 * 157 * @see ucasemap_setOptions 158 * @see ucasemap_toTitle 159 * @see ucasemap_utf8ToTitle 160 * @see UnicodeString::toTitle 161 * @see U_TITLECASE_NO_LOWERCASE 162 * @stable ICU 3.8 163 */ 164 #define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200 165 166 #if !UCONFIG_NO_BREAK_ITERATION 167 168 /** 169 * Get the break iterator that is used for titlecasing. 170 * Do not modify the returned break iterator. 171 * @param csm UCaseMap service object. 172 * @return titlecasing break iterator 173 * @stable ICU 3.8 174 */ 175 U_DRAFT const UBreakIterator * U_EXPORT2 176 ucasemap_getBreakIterator(const UCaseMap *csm); 177 178 /** 179 * Set the break iterator that is used for titlecasing. 180 * The UCaseMap service object releases a previously set break iterator 181 * and "adopts" this new one, taking ownership of it. 182 * It will be released in a subsequent call to ucasemap_setBreakIterator() 183 * or ucasemap_close(). 184 * 185 * Break iterator operations are not thread-safe. Therefore, titlecasing 186 * functions use non-const UCaseMap objects. It is not possible to titlecase 187 * strings concurrently using the same UCaseMap. 188 * 189 * @param csm UCaseMap service object. 190 * @param iterToAdopt Break iterator to be adopted for titlecasing. 191 * @param pErrorCode Must be a valid pointer to an error code value, 192 * which must not indicate a failure before the function call. 193 * 194 * @see ucasemap_toTitle 195 * @see ucasemap_utf8ToTitle 196 * @stable ICU 3.8 197 */ 198 U_DRAFT void U_EXPORT2 199 ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode *pErrorCode); 200 201 /** 202 * Titlecase a UTF-16 string. This function is almost a duplicate of u_strToTitle(), 203 * except that it takes ucasemap_setOptions() into account and has performance 204 * advantages from being able to use a UCaseMap object for multiple case mapping 205 * operations, saving setup time. 206 * 207 * Casing is locale-dependent and context-sensitive. 208 * Titlecasing uses a break iterator to find the first characters of words 209 * that are to be titlecased. It titlecases those characters and lowercases 210 * all others. (This can be modified with ucasemap_setOptions().) 211 * 212 * Note: This function takes a non-const UCaseMap pointer because it will 213 * open a default break iterator if no break iterator was set yet, 214 * and effectively call ucasemap_setBreakIterator(); 215 * also because the break iterator is stateful and will be modified during 216 * the iteration. 217 * 218 * The titlecase break iterator can be provided to customize for arbitrary 219 * styles, using rules and dictionaries beyond the standard iterators. 220 * The standard titlecase iterator for the root locale implements the 221 * algorithm of Unicode TR 21. 222 * 223 * This function uses only the setUText(), first(), next() and close() methods of the 224 * provided break iterator. 225 * 226 * The result may be longer or shorter than the original. 227 * The source string and the destination buffer must not overlap. 228 * 229 * @param csm UCaseMap service object. This pointer is non-const! 230 * See the note above for details. 231 * @param dest A buffer for the result string. The result will be NUL-terminated if 232 * the buffer is large enough. 233 * The contents is undefined in case of failure. 234 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 235 * dest may be NULL and the function will only return the length of the result 236 * without writing any of the result string. 237 * @param src The original string. 238 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 239 * @param pErrorCode Must be a valid pointer to an error code value, 240 * which must not indicate a failure before the function call. 241 * @return The length of the result string, if successful - or in case of a buffer overflow, 242 * in which case it will be greater than destCapacity. 243 * 244 * @see u_strToTitle 245 * @stable ICU 3.8 246 */ 247 U_DRAFT int32_t U_EXPORT2 248 ucasemap_toTitle(UCaseMap *csm, 249 UChar *dest, int32_t destCapacity, 250 const UChar *src, int32_t srcLength, 251 UErrorCode *pErrorCode); 252 253 #endif 254 255 /** 256 * Lowercase the characters in a UTF-8 string. 257 * Casing is locale-dependent and context-sensitive. 258 * The result may be longer or shorter than the original. 259 * The source string and the destination buffer must not overlap. 260 * 261 * @param csm UCaseMap service object. 262 * @param dest A buffer for the result string. The result will be NUL-terminated if 263 * the buffer is large enough. 264 * The contents is undefined in case of failure. 265 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 266 * dest may be NULL and the function will only return the length of the result 267 * without writing any of the result string. 268 * @param src The original string. 269 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 270 * @param pErrorCode Must be a valid pointer to an error code value, 271 * which must not indicate a failure before the function call. 272 * @return The length of the result string, if successful - or in case of a buffer overflow, 273 * in which case it will be greater than destCapacity. 274 * 275 * @see u_strToLower 276 * @stable ICU 3.4 277 */ 278 U_STABLE int32_t U_EXPORT2 279 ucasemap_utf8ToLower(const UCaseMap *csm, 280 char *dest, int32_t destCapacity, 281 const char *src, int32_t srcLength, 282 UErrorCode *pErrorCode); 283 284 /** 285 * Uppercase the characters in a UTF-8 string. 286 * Casing is locale-dependent and context-sensitive. 287 * The result may be longer or shorter than the original. 288 * The source string and the destination buffer must not overlap. 289 * 290 * @param csm UCaseMap service object. 291 * @param dest A buffer for the result string. The result will be NUL-terminated if 292 * the buffer is large enough. 293 * The contents is undefined in case of failure. 294 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 295 * dest may be NULL and the function will only return the length of the result 296 * without writing any of the result string. 297 * @param src The original string. 298 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 299 * @param pErrorCode Must be a valid pointer to an error code value, 300 * which must not indicate a failure before the function call. 301 * @return The length of the result string, if successful - or in case of a buffer overflow, 302 * in which case it will be greater than destCapacity. 303 * 304 * @see u_strToUpper 305 * @stable ICU 3.4 306 */ 307 U_STABLE int32_t U_EXPORT2 308 ucasemap_utf8ToUpper(const UCaseMap *csm, 309 char *dest, int32_t destCapacity, 310 const char *src, int32_t srcLength, 311 UErrorCode *pErrorCode); 312 313 #if !UCONFIG_NO_BREAK_ITERATION 314 315 /** 316 * Titlecase a UTF-8 string. 317 * Casing is locale-dependent and context-sensitive. 318 * Titlecasing uses a break iterator to find the first characters of words 319 * that are to be titlecased. It titlecases those characters and lowercases 320 * all others. (This can be modified with ucasemap_setOptions().) 321 * 322 * Note: This function takes a non-const UCaseMap pointer because it will 323 * open a default break iterator if no break iterator was set yet, 324 * and effectively call ucasemap_setBreakIterator(); 325 * also because the break iterator is stateful and will be modified during 326 * the iteration. 327 * 328 * The titlecase break iterator can be provided to customize for arbitrary 329 * styles, using rules and dictionaries beyond the standard iterators. 330 * The standard titlecase iterator for the root locale implements the 331 * algorithm of Unicode TR 21. 332 * 333 * This function uses only the setUText(), first(), next() and close() methods of the 334 * provided break iterator. 335 * 336 * The result may be longer or shorter than the original. 337 * The source string and the destination buffer must not overlap. 338 * 339 * @param csm UCaseMap service object. This pointer is non-const! 340 * See the note above for details. 341 * @param dest A buffer for the result string. The result will be NUL-terminated if 342 * the buffer is large enough. 343 * The contents is undefined in case of failure. 344 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 345 * dest may be NULL and the function will only return the length of the result 346 * without writing any of the result string. 347 * @param src The original string. 348 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 349 * @param pErrorCode Must be a valid pointer to an error code value, 350 * which must not indicate a failure before the function call. 351 * @return The length of the result string, if successful - or in case of a buffer overflow, 352 * in which case it will be greater than destCapacity. 353 * 354 * @see u_strToTitle 355 * @see U_TITLECASE_NO_LOWERCASE 356 * @see U_TITLECASE_NO_BREAK_ADJUSTMENT 357 * @stable ICU 3.8 358 */ 359 U_DRAFT int32_t U_EXPORT2 360 ucasemap_utf8ToTitle(UCaseMap *csm, 361 char *dest, int32_t destCapacity, 362 const char *src, int32_t srcLength, 363 UErrorCode *pErrorCode); 364 365 #endif 366 367 /** 368 * Case-fold the characters in a UTF-8 string. 369 * Case-folding is locale-independent and not context-sensitive, 370 * but there is an option for whether to include or exclude mappings for dotted I 371 * and dotless i that are marked with 'I' in CaseFolding.txt. 372 * The result may be longer or shorter than the original. 373 * The source string and the destination buffer must not overlap. 374 * 375 * @param csm UCaseMap service object. 376 * @param dest A buffer for the result string. The result will be NUL-terminated if 377 * the buffer is large enough. 378 * The contents is undefined in case of failure. 379 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then 380 * dest may be NULL and the function will only return the length of the result 381 * without writing any of the result string. 382 * @param src The original string. 383 * @param srcLength The length of the original string. If -1, then src must be NUL-terminated. 384 * @param pErrorCode Must be a valid pointer to an error code value, 385 * which must not indicate a failure before the function call. 386 * @return The length of the result string, if successful - or in case of a buffer overflow, 387 * in which case it will be greater than destCapacity. 388 * 389 * @see u_strFoldCase 390 * @see ucasemap_setOptions 391 * @see U_FOLD_CASE_DEFAULT 392 * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I 393 * @stable ICU 3.8 394 */ 395 U_DRAFT int32_t U_EXPORT2 396 ucasemap_utf8FoldCase(const UCaseMap *csm, 397 char *dest, int32_t destCapacity, 398 const char *src, int32_t srcLength, 399 UErrorCode *pErrorCode); 400 401 #endif 402