Home | History | Annotate | Download | only in unicode
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2005-2009, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucasemap.h
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2005may06
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Case mapping service object and functions using it.
     17 */
     18 
     19 #ifndef __UCASEMAP_H__
     20 #define __UCASEMAP_H__
     21 
     22 #include "unicode/utypes.h"
     23 #include "unicode/ustring.h"
     24 
     25 /**
     26  * \file
     27  * \brief C API: Unicode case mapping functions using a UCaseMap service object.
     28  *
     29  * The service object takes care of memory allocations, data loading, and setup
     30  * for the attributes, as usual.
     31  *
     32  * Currently, the functionality provided here does not overlap with uchar.h
     33  * and ustring.h, except for ucasemap_toTitle().
     34  *
     35  * ucasemap_utf8XYZ() functions operate directly on UTF-8 strings.
     36  */
     37 
     38 /**
     39  * UCaseMap is an opaque service object for newer ICU case mapping functions;
     40  * older functions did not use one. See ucasemap_open() and ucasemap_close().
     41  * @stable ICU 3.4
     42  */
     43 struct UCaseMap;
     44 typedef struct UCaseMap UCaseMap; /**< C typedef for struct UCaseMap. @stable ICU 3.4 */
     45 
     46 /**
     47  * Open a UCaseMap service object for a locale and a set of options.
     48  * The locale ID and options are preprocessed so that functions using the
     49  * service object need not process them in each call.
     50  *
     51  * @param locale ICU locale ID, used for language-dependent
     52  *               upper-/lower-/title-casing according to the Unicode standard.
     53  *               Usual semantics: ""=root, NULL=default locale, etc.
     54  * @param options Options bit set, used for case folding, titlecasing and string comparisons.
     55  *                Same flags as for u_foldCase(), u_strFoldCase(),
     56  *                u_strCaseCompare(), etc., plus the U_TITLECASE_... option bits.
     57  *                Use 0 or U_FOLD_CASE_DEFAULT for default behavior.
     58  * @param pErrorCode Must be a valid pointer to an error code value,
     59  *                   which must not indicate a failure before the function call.
     60  * @return Pointer to a UCaseMap service object, if successful; to be closed with ucasemap_close().
     61  *
     62  * @see U_FOLD_CASE_DEFAULT
     63  * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I
     64  * @see U_TITLECASE_NO_LOWERCASE
     65  * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
     66  * @stable ICU 3.4
     67  */
     68 U_STABLE UCaseMap * U_EXPORT2
     69 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode);
     70 
     71 /**
     72  * Close a UCaseMap service object; a break iterator that it has adopted is released as well.
     73  * @param csm Object to be closed.
     74  * @stable ICU 3.4
     75  */
     76 U_STABLE void U_EXPORT2
     77 ucasemap_close(UCaseMap *csm);
     78 
     79 /**
     80  * Get the locale ID that is used for language-dependent case mappings.
     81  * @param csm UCaseMap service object.
     82  * @return locale ID (see ucasemap_open() and ucasemap_setLocale())
     83  * @stable ICU 3.4
     84  */
     85 U_STABLE const char * U_EXPORT2
     86 ucasemap_getLocale(const UCaseMap *csm);
     87 
     88 /**
     89  * Get the options bit set that is used for case folding, titlecasing and string comparisons.
     90  * @param csm UCaseMap service object.
     91  * @return options bit set (see ucasemap_open() and ucasemap_setOptions())
     92  * @stable ICU 3.4
     93  */
     94 U_STABLE uint32_t U_EXPORT2
     95 ucasemap_getOptions(const UCaseMap *csm);
     96 
     97 /**
     98  * Set the locale ID that is used for language-dependent case mappings.
     99  *
    100  * @param csm UCaseMap service object.
    101  * @param locale Locale ID, see ucasemap_open(): ""=root, NULL=default locale, etc.
    102  * @param pErrorCode Must be a valid pointer to an error code value,
    103  *                   which must not indicate a failure before the function call.
    104  *
    105  * @see ucasemap_open
    106  * @stable ICU 3.4
    107  */
    108 U_STABLE void U_EXPORT2
    109 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode);
    110 
    111 /**
    112  * Set the options bit set that is used for case folding, titlecasing and string comparisons.
    113  *
    114  * @param csm UCaseMap service object.
    115  * @param options Options bit set, see ucasemap_open().
    116  * @param pErrorCode Must be a valid pointer to an error code value,
    117  *                   which must not indicate a failure before the function call.
    118  *
    119  * @see ucasemap_open
    120  * @stable ICU 3.4
    121  */
    122 U_STABLE void U_EXPORT2
    123 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode);
    124 
    125 /**
    126  * Do not lowercase non-initial parts of words when titlecasing.
    127  * Option bit for titlecasing APIs that take an options bit set, e.g. via ucasemap_setOptions().
    128  *
    129  * By default, titlecasing will titlecase the first cased character
    130  * of a word and lowercase all other characters.
    131  * With this option, the other characters will not be modified.
    132  *
    133  * @see ucasemap_setOptions
    134  * @see ucasemap_toTitle
    135  * @see ucasemap_utf8ToTitle
    136  * @see UnicodeString::toTitle
    137  * @stable ICU 3.8
    138  */
    139 #define U_TITLECASE_NO_LOWERCASE 0x100
    140 
    141 /**
    142  * Do not adjust the titlecasing indexes from BreakIterator::next() indexes;
    143  * titlecase exactly the characters at breaks from the iterator.
    144  * Option bit for titlecasing APIs that take an options bit set, e.g. via ucasemap_setOptions().
    145  *
    146  * By default, titlecasing will take each break iterator index,
    147  * adjust it by looking for the next cased character, and titlecase that one.
    148  * Other characters are lowercased.
    149  *
    150  * This default behavior follows Unicode 4 & 5 section 3.13 Default Case Operations:
    151  *
    152  * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
    153  * #29, "Text Boundaries." Between each pair of word boundaries, find the first
    154  * cased character F. If F exists, map F to default_title(F); then map each
    155  * subsequent character C to default_lower(C).
    156  *
    157  * @see ucasemap_setOptions
    158  * @see ucasemap_toTitle
    159  * @see ucasemap_utf8ToTitle
    160  * @see UnicodeString::toTitle
    161  * @see U_TITLECASE_NO_LOWERCASE
    162  * @stable ICU 3.8
    163  */
    164 #define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200
    165 
    166 #if !UCONFIG_NO_BREAK_ITERATION
    167 
    168 /**
    169  * Get the break iterator that is used for titlecasing.
    170  * Do not modify the returned break iterator; it remains owned by the service object.
    171  * @param csm UCaseMap service object.
    172  * @return titlecasing break iterator
    173  * @stable ICU 3.8
    174  */
    175 U_STABLE const UBreakIterator * U_EXPORT2
    176 ucasemap_getBreakIterator(const UCaseMap *csm);
    177 
    178 /**
    179  * Set the break iterator that is used for titlecasing.
    180  * The UCaseMap service object releases a previously set break iterator
    181  * and "adopts" this new one, taking ownership of it.
    182  * It will be released in a subsequent call to ucasemap_setBreakIterator()
    183  * or ucasemap_close(); the caller must not close it separately.
    184  *
    185  * Break iterator operations are not thread-safe. Therefore, titlecasing
    186  * functions use non-const UCaseMap objects. It is not possible to titlecase
    187  * strings concurrently using the same UCaseMap.
    188  *
    189  * @param csm UCaseMap service object.
    190  * @param iterToAdopt Break iterator to be adopted for titlecasing.
    191  * @param pErrorCode Must be a valid pointer to an error code value,
    192  *                   which must not indicate a failure before the function call.
    193  *
    194  * @see ucasemap_toTitle
    195  * @see ucasemap_utf8ToTitle
    196  * @stable ICU 3.8
    197  */
    198 U_STABLE void U_EXPORT2
    199 ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode *pErrorCode);
    200 
    201 /**
    202  * Titlecase a UTF-16 string. This function is almost a duplicate of u_strToTitle(),
    203  * except that it takes ucasemap_setOptions() into account and has performance
    204  * advantages from being able to use a UCaseMap object for multiple case mapping
    205  * operations, saving setup time.
    206  *
    207  * Casing is locale-dependent and context-sensitive.
    208  * Titlecasing uses a break iterator to find the first characters of words
    209  * that are to be titlecased. It titlecases those characters and lowercases
    210  * all others. (This can be modified with ucasemap_setOptions().)
    211  *
    212  * Note: This function takes a non-const UCaseMap pointer because it will
    213  * open a default break iterator if no break iterator was set yet,
    214  * and effectively call ucasemap_setBreakIterator();
    215  * also because the break iterator is stateful and will be modified during
    216  * the iteration.
    217  *
    218  * The titlecase break iterator can be provided to customize for arbitrary
    219  * styles, using rules and dictionaries beyond the standard iterators.
    220  * The standard titlecase iterator for the root locale implements the
    221  * algorithm of Unicode TR 21.
    222  *
    223  * This function uses only the setUText(), first(), next() and close() methods of the
    224  * provided break iterator.
    225  *
    226  * The result may be longer or shorter than the original.
    227  * The source string and the destination buffer must not overlap.
    228  *
    229  * @param csm       UCaseMap service object. This pointer is non-const!
    230  *                  See the note above for details.
    231  * @param dest      A buffer for the result string. The result will be NUL-terminated if
    232  *                  the buffer is large enough.
    233  *                  The contents are undefined in case of failure.
    234  * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
    235  *                  dest may be NULL and the function will only return the length of the result
    236  *                  without writing any of the result string.
    237  * @param src       The original UTF-16 string.
    238  * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
    239  * @param pErrorCode Must be a valid pointer to an error code value,
    240  *                  which must not indicate a failure before the function call.
    241  * @return The length of the result string, if successful - or in case of a buffer overflow,
    242  *         in which case it will be greater than destCapacity.
    243  *
    244  * @see u_strToTitle
    245  * @stable ICU 3.8
    246  */
    247 U_STABLE int32_t U_EXPORT2
    248 ucasemap_toTitle(UCaseMap *csm,
    249                  UChar *dest, int32_t destCapacity,
    250                  const UChar *src, int32_t srcLength,
    251                  UErrorCode *pErrorCode);
    252 
    253 #endif
    254 
    255 /**
    256  * Lowercase the characters in a UTF-8 string.
    257  * Casing is locale-dependent and context-sensitive.
    258  * The result may be longer or shorter than the original.
    259  * The source string and the destination buffer must not overlap.
    260  *
    261  * @param csm       UCaseMap service object.
    262  * @param dest      A buffer for the result string. The result will be NUL-terminated if
    263  *                  the buffer is large enough.
    264  *                  The contents are undefined in case of failure.
    265  * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
    266  *                  dest may be NULL and the function will only return the length of the result
    267  *                  without writing any of the result string.
    268  * @param src       The original UTF-8 string.
    269  * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
    270  * @param pErrorCode Must be a valid pointer to an error code value,
    271  *                  which must not indicate a failure before the function call.
    272  * @return The length of the result string, if successful - or in case of a buffer overflow,
    273  *         in which case it will be greater than destCapacity.
    274  *
    275  * @see u_strToLower
    276  * @stable ICU 3.4
    277  */
    278 U_STABLE int32_t U_EXPORT2
    279 ucasemap_utf8ToLower(const UCaseMap *csm,
    280                      char *dest, int32_t destCapacity,
    281                      const char *src, int32_t srcLength,
    282                      UErrorCode *pErrorCode);
    283 
    284 /**
    285  * Uppercase the characters in a UTF-8 string.
    286  * Casing is locale-dependent and context-sensitive.
    287  * The result may be longer or shorter than the original.
    288  * The source string and the destination buffer must not overlap.
    289  *
    290  * @param csm       UCaseMap service object.
    291  * @param dest      A buffer for the result string. The result will be NUL-terminated if
    292  *                  the buffer is large enough.
    293  *                  The contents are undefined in case of failure.
    294  * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
    295  *                  dest may be NULL and the function will only return the length of the result
    296  *                  without writing any of the result string.
    297  * @param src       The original UTF-8 string.
    298  * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
    299  * @param pErrorCode Must be a valid pointer to an error code value,
    300  *                  which must not indicate a failure before the function call.
    301  * @return The length of the result string, if successful - or in case of a buffer overflow,
    302  *         in which case it will be greater than destCapacity.
    303  *
    304  * @see u_strToUpper
    305  * @stable ICU 3.4
    306  */
    307 U_STABLE int32_t U_EXPORT2
    308 ucasemap_utf8ToUpper(const UCaseMap *csm,
    309                      char *dest, int32_t destCapacity,
    310                      const char *src, int32_t srcLength,
    311                      UErrorCode *pErrorCode);
    312 
    313 #if !UCONFIG_NO_BREAK_ITERATION
    314 
    315 /**
    316  * Titlecase a UTF-8 string.
    317  * Casing is locale-dependent and context-sensitive.
    318  * Titlecasing uses a break iterator to find the first characters of words
    319  * that are to be titlecased. It titlecases those characters and lowercases
    320  * all others. (This can be modified with ucasemap_setOptions().)
    321  *
    322  * Note: This function takes a non-const UCaseMap pointer because it will
    323  * open a default break iterator if no break iterator was set yet,
    324  * and effectively call ucasemap_setBreakIterator();
    325  * also because the break iterator is stateful and will be modified during
    326  * the iteration.
    327  *
    328  * The titlecase break iterator can be provided to customize for arbitrary
    329  * styles, using rules and dictionaries beyond the standard iterators.
    330  * The standard titlecase iterator for the root locale implements the
    331  * algorithm of Unicode TR 21.
    332  *
    333  * This function uses only the setUText(), first(), next() and close() methods of the
    334  * provided break iterator.
    335  *
    336  * The result may be longer or shorter than the original.
    337  * The source string and the destination buffer must not overlap.
    338  *
    339  * @param csm       UCaseMap service object. This pointer is non-const!
    340  *                  See the note above for details.
    341  * @param dest      A buffer for the result string. The result will be NUL-terminated if
    342  *                  the buffer is large enough.
    343  *                  The contents are undefined in case of failure.
    344  * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
    345  *                  dest may be NULL and the function will only return the length of the result
    346  *                  without writing any of the result string.
    347  * @param src       The original UTF-8 string.
    348  * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
    349  * @param pErrorCode Must be a valid pointer to an error code value,
    350  *                  which must not indicate a failure before the function call.
    351  * @return The length of the result string, if successful - or in case of a buffer overflow,
    352  *         in which case it will be greater than destCapacity.
    353  *
    354  * @see u_strToTitle
    355  * @see U_TITLECASE_NO_LOWERCASE
    356  * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
    357  * @stable ICU 3.8
    358  */
    359 U_STABLE int32_t U_EXPORT2
    360 ucasemap_utf8ToTitle(UCaseMap *csm,
    361                      char *dest, int32_t destCapacity,
    362                      const char *src, int32_t srcLength,
    363                      UErrorCode *pErrorCode);
    364 
    365 #endif
    366 
    367 /**
    368  * Case-fold the characters in a UTF-8 string.
    369  * Case-folding is locale-independent and not context-sensitive,
    370  * but there is an option for whether to include or exclude mappings for dotted I
    371  * and dotless i that are marked with 'I' in CaseFolding.txt.
    372  * The result may be longer or shorter than the original.
    373  * The source string and the destination buffer must not overlap.
    374  *
    375  * @param csm       UCaseMap service object.
    376  * @param dest      A buffer for the result string. The result will be NUL-terminated if
    377  *                  the buffer is large enough.
    378  *                  The contents are undefined in case of failure.
    379  * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
    380  *                  dest may be NULL and the function will only return the length of the result
    381  *                  without writing any of the result string.
    382  * @param src       The original UTF-8 string.
    383  * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
    384  * @param pErrorCode Must be a valid pointer to an error code value,
    385  *                  which must not indicate a failure before the function call.
    386  * @return The length of the result string, if successful - or in case of a buffer overflow,
    387  *         in which case it will be greater than destCapacity.
    388  *
    389  * @see u_strFoldCase
    390  * @see ucasemap_setOptions
    391  * @see U_FOLD_CASE_DEFAULT
    392  * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I
    393  * @stable ICU 3.8
    394  */
    395 U_STABLE int32_t U_EXPORT2
    396 ucasemap_utf8FoldCase(const UCaseMap *csm,
    397                       char *dest, int32_t destCapacity,
    398                       const char *src, int32_t srcLength,
    399                       UErrorCode *pErrorCode);
    400 
    401 #endif
    402