Home | History | Annotate | Download | only in unicode
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2005-2010, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucasemap.h
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2005may06
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Case mapping service object and functions using it.
     17 */
     18 
     19 #ifndef __UCASEMAP_H__
     20 #define __UCASEMAP_H__
     21 
     22 #include "unicode/utypes.h"
     23 #include "unicode/ustring.h"
     24 #include "unicode/localpointer.h"
     25 
     26 /**
     27  * \file
     28  * \brief C API: Unicode case mapping functions using a UCaseMap service object.
     29  *
     30  * The service object takes care of memory allocations, data loading, and setup
     31  * for the attributes, as usual.
     32  *
     33  * Currently, the functionality provided here does not overlap with uchar.h
     34  * and ustring.h, except for ucasemap_toTitle().
     35  *
     36  * ucasemap_utf8XYZ() functions operate directly on UTF-8 strings.
     37  */
     38 
     39 /**
     40  * UCaseMap is an opaque service object for newer ICU case mapping functions.
     41  * Older functions did not use a service object. See ucasemap_open() and ucasemap_close().
     42  * @stable ICU 3.4
     43  */
     44 struct UCaseMap;
     45 typedef struct UCaseMap UCaseMap; /**< C typedef for struct UCaseMap. @stable ICU 3.4 */
     46 
     47 /**
     48  * Open a UCaseMap service object for a locale and a set of options.
     49  * The locale ID and options are preprocessed so that functions using the
     50  * service object need not process them in each call.
     51  *
     52  * @param locale ICU locale ID, used for language-dependent
     53  *               upper-/lower-/title-casing according to the Unicode standard.
     54  *               Usual semantics: ""=root, NULL=default locale, etc.
     55  * @param options Options bit set, used for case folding, string comparisons and titlecasing.
     56  *                Same flags as for u_foldCase(), u_strFoldCase(),
     57  *                u_strCaseCompare(), etc., plus the U_TITLECASE_ option bits listed below.
     58  *                Use 0 or U_FOLD_CASE_DEFAULT for default behavior.
     59  * @param pErrorCode Must be a valid pointer to an error code value,
     60  *                   which must not indicate a failure before the function call.
     61  * @return Pointer to a UCaseMap service object, if successful. Release it with ucasemap_close().
     62  *
     63  * @see U_FOLD_CASE_DEFAULT
     64  * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I
     65  * @see U_TITLECASE_NO_LOWERCASE
     66  * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
     67  * @stable ICU 3.4
     68  */
     69 U_STABLE UCaseMap * U_EXPORT2
     70 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode);
     71 
     72 /**
     73  * Close a UCaseMap service object. This also releases a break iterator set with ucasemap_setBreakIterator().
     74  * @param csm Object to be closed.
     75  * @stable ICU 3.4
     76  */
     77 U_STABLE void U_EXPORT2
     78 ucasemap_close(UCaseMap *csm);
     79 
     80 #if U_SHOW_CPLUSPLUS_API
     81 
     82 U_NAMESPACE_BEGIN
     83 
     84 /**
     85  * \class LocalUCaseMapPointer
     86  * "Smart pointer" class, closes a UCaseMap via ucasemap_close().
     87  * For most methods see the LocalPointerBase base class.
     88  *
     89  * @see LocalPointerBase
     90  * @see LocalPointer
     91  * @stable ICU 4.4
     92  */
     93 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCaseMapPointer, UCaseMap, ucasemap_close);
     94 
     95 U_NAMESPACE_END
     96 
     97 #endif  /* U_SHOW_CPLUSPLUS_API */
     98 
     99 /**
    100  * Get the locale ID that is used for language-dependent case mappings.
    101  * @param csm UCaseMap service object.
    102  * @return locale ID (set with ucasemap_open() or ucasemap_setLocale())
    103  * @stable ICU 3.4
    104  */
    105 U_STABLE const char * U_EXPORT2
    106 ucasemap_getLocale(const UCaseMap *csm);
    107 
    108 /**
    109  * Get the options bit set that is used for case folding, string comparisons and titlecasing.
    110  * @param csm UCaseMap service object.
    111  * @return options bit set (set with ucasemap_open() or ucasemap_setOptions())
    112  * @stable ICU 3.4
    113  */
    114 U_STABLE uint32_t U_EXPORT2
    115 ucasemap_getOptions(const UCaseMap *csm);
    116 
    117 /**
    118  * Set the locale ID that is used for language-dependent case mappings.
    119  *
    120  * @param csm UCaseMap service object.
    121  * @param locale Locale ID (""=root, NULL=default locale), see ucasemap_open().
    122  * @param pErrorCode Must be a valid pointer to an error code value,
    123  *                   which must not indicate a failure before the function call.
    124  *
    125  * @see ucasemap_open
    126  * @stable ICU 3.4
    127  */
    128 U_STABLE void U_EXPORT2
    129 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode);
    130 
    131 /**
    132  * Set the options bit set that is used for case folding, string comparisons and titlecasing.
    133  *
    134  * @param csm UCaseMap service object.
    135  * @param options Options bit set, see ucasemap_open(), U_TITLECASE_NO_LOWERCASE and U_TITLECASE_NO_BREAK_ADJUSTMENT.
    136  * @param pErrorCode Must be a valid pointer to an error code value,
    137  *                   which must not indicate a failure before the function call.
    138  *
    139  * @see ucasemap_open
    140  * @stable ICU 3.4
    141  */
    142 U_STABLE void U_EXPORT2
    143 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode);
    144 
    145 /**
    146  * Do not lowercase non-initial parts of words when titlecasing.
    147  * Option bit for titlecasing APIs that take an options bit set (e.g. via ucasemap_setOptions()).
    148  *
    149  * By default, titlecasing will titlecase the first cased character
    150  * of a word and lowercase all other characters.
    151  * With this option, the other characters will not be modified.
    152  *
    153  * @see ucasemap_setOptions
    154  * @see ucasemap_toTitle
    155  * @see ucasemap_utf8ToTitle
    156  * @see UnicodeString::toTitle
    157  * @stable ICU 3.8
    158  */
    159 #define U_TITLECASE_NO_LOWERCASE 0x100
    160 
    161 /**
    162  * Do not adjust the titlecasing indexes from BreakIterator::next() indexes;
    163  * titlecase exactly the characters at breaks from the iterator.
    164  * Option bit for titlecasing APIs that take an options bit set (e.g. via ucasemap_setOptions()).
    165  *
    166  * By default (without this option), titlecasing will take each break iterator index,
    167  * adjust it by looking for the next cased character, and titlecase that one.
    168  * Other characters are lowercased.
    169  *
    170  * This default behavior follows Unicode 4 & 5 section 3.13 Default Case Operations:
    171  *
    172  * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
    173  * #29, "Text Boundaries." Between each pair of word boundaries, find the first
    174  * cased character F. If F exists, map F to default_title(F); then map each
    175  * subsequent character C to default_lower(C).
    176  *
    177  * @see ucasemap_setOptions
    178  * @see ucasemap_toTitle
    179  * @see ucasemap_utf8ToTitle
    180  * @see UnicodeString::toTitle
    181  * @see U_TITLECASE_NO_LOWERCASE
    182  * @stable ICU 3.8
    183  */
    184 #define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200
    185 
    186 #if !UCONFIG_NO_BREAK_ITERATION
    187 
    188 /**
    189  * Get the break iterator that is used for titlecasing.
    190  * Do not modify the returned break iterator; the UCaseMap keeps ownership of it.
    191  * @param csm UCaseMap service object.
    192  * @return titlecasing break iterator
    193  * @stable ICU 3.8
    194  */
    195 U_STABLE const UBreakIterator * U_EXPORT2
    196 ucasemap_getBreakIterator(const UCaseMap *csm);
    197 
    198 /**
    199  * Set the break iterator that is used for titlecasing.
    200  * The UCaseMap service object releases a previously set break iterator
    201  * and "adopts" this new one, taking ownership of it.
    202  * The adopted iterator will be released in a subsequent call to
    203  * ucasemap_setBreakIterator() or ucasemap_close().
    204  *
    205  * Break iterator operations are not thread-safe. Therefore, titlecasing
    206  * functions use non-const UCaseMap objects. It is not possible to titlecase
    207  * strings concurrently using the same UCaseMap.
    208  *
    209  * @param csm UCaseMap service object.
    210  * @param iterToAdopt Break iterator to be adopted for titlecasing (owned by csm afterwards).
    211  * @param pErrorCode Must be a valid pointer to an error code value,
    212  *                   which must not indicate a failure before the function call.
    213  *
    214  * @see ucasemap_toTitle
    215  * @see ucasemap_utf8ToTitle
    216  * @stable ICU 3.8
    217  */
    218 U_STABLE void U_EXPORT2
    219 ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode *pErrorCode);
    220 
    221 /**
    222  * Titlecase a UTF-16 string. This function is almost a duplicate of u_strToTitle(),
    223  * except that it takes ucasemap_setOptions() into account and has performance
    224  * advantages from being able to use a UCaseMap object for multiple case mapping
    225  * operations, saving setup time.
    226  *
    227  * Casing is locale-dependent and context-sensitive.
    228  * Titlecasing uses a break iterator to find the first characters of words
    229  * that are to be titlecased. It titlecases those characters and lowercases
    230  * all others. (This can be modified with ucasemap_setOptions().)
    231  *
    232  * Note: This function takes a non-const UCaseMap pointer because it will
    233  * open a default break iterator if no break iterator was set yet,
    234  * and effectively call ucasemap_setBreakIterator();
    235  * also because the break iterator is stateful and will be modified during
    236  * the iteration.
    237  *
    238  * The titlecase break iterator can be provided to customize for arbitrary
    239  * styles, using rules and dictionaries beyond the standard iterators.
    240  * The standard titlecase iterator for the root locale implements the
    241  * algorithm of Unicode TR 21.
    242  *
    243  * This function uses only the setUText(), first(), next() and close() methods of the
    244  * provided break iterator.
    245  *
    246  * The result may be longer or shorter than the original.
    247  * The source string and the destination buffer must not overlap.
    248  *
    249  * @param csm       UCaseMap service object. This pointer is non-const!
    250  *                  See the note above for details.
    251  * @param dest      A buffer for the result string. The result will be NUL-terminated if
    252  *                  the buffer is large enough.
    253  *                  The contents are undefined in case of failure.
    254  * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
    255  *                  dest may be NULL and the function will only return the length of the result
    256  *                  without writing any of the result string.
    257  * @param src       The original string.
    258  * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
    259  * @param pErrorCode Must be a valid pointer to an error code value,
    260  *                  which must not indicate a failure before the function call.
    261  * @return The length of the result string, if successful. In case of a buffer overflow,
    262  *         the result length is still returned and will be greater than destCapacity.
    263  *
    264  * @see u_strToTitle
    265  * @stable ICU 3.8
    266  */
    267 U_STABLE int32_t U_EXPORT2
    268 ucasemap_toTitle(UCaseMap *csm,
    269                  UChar *dest, int32_t destCapacity,
    270                  const UChar *src, int32_t srcLength,
    271                  UErrorCode *pErrorCode);
    272 
    273 #endif  /* !UCONFIG_NO_BREAK_ITERATION */
    274 
    275 /**
    276  * Lowercase the characters in a UTF-8 string.
    277  * Casing is locale-dependent and context-sensitive.
    278  * The result may be longer or shorter than the original.
    279  * The source string and the destination buffer must not overlap.
    280  *
    281  * @param csm       UCaseMap service object.
    282  * @param dest      A buffer for the result string. The result will be NUL-terminated if
    283  *                  the buffer is large enough.
    284  *                  The contents are undefined in case of failure.
    285  * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
    286  *                  dest may be NULL and the function will only return the length of the result
    287  *                  without writing any of the result string.
    288  * @param src       The original string.
    289  * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
    290  * @param pErrorCode Must be a valid pointer to an error code value,
    291  *                  which must not indicate a failure before the function call.
    292  * @return The length of the result string, if successful. In case of a buffer overflow,
    293  *         the result length is still returned and will be greater than destCapacity.
    294  *
    295  * @see u_strToLower
    296  * @stable ICU 3.4
    297  */
    298 U_STABLE int32_t U_EXPORT2
    299 ucasemap_utf8ToLower(const UCaseMap *csm,
    300                      char *dest, int32_t destCapacity,
    301                      const char *src, int32_t srcLength,
    302                      UErrorCode *pErrorCode);
    303 
    304 /**
    305  * Uppercase the characters in a UTF-8 string.
    306  * Casing is locale-dependent and context-sensitive.
    307  * The result may be longer or shorter than the original.
    308  * The source string and the destination buffer must not overlap.
    309  *
    310  * @param csm       UCaseMap service object.
    311  * @param dest      A buffer for the result string. The result will be NUL-terminated if
    312  *                  the buffer is large enough.
    313  *                  The contents are undefined in case of failure.
    314  * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
    315  *                  dest may be NULL and the function will only return the length of the result
    316  *                  without writing any of the result string.
    317  * @param src       The original string.
    318  * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
    319  * @param pErrorCode Must be a valid pointer to an error code value,
    320  *                  which must not indicate a failure before the function call.
    321  * @return The length of the result string, if successful. In case of a buffer overflow,
    322  *         the result length is still returned and will be greater than destCapacity.
    323  *
    324  * @see u_strToUpper
    325  * @stable ICU 3.4
    326  */
    327 U_STABLE int32_t U_EXPORT2
    328 ucasemap_utf8ToUpper(const UCaseMap *csm,
    329                      char *dest, int32_t destCapacity,
    330                      const char *src, int32_t srcLength,
    331                      UErrorCode *pErrorCode);
    332 
    333 #if !UCONFIG_NO_BREAK_ITERATION
    334 
    335 /**
    336  * Titlecase a UTF-8 string.
    337  * Casing is locale-dependent and context-sensitive.
    338  * Titlecasing uses a break iterator to find the first characters of words
    339  * that are to be titlecased. It titlecases those characters and lowercases
    340  * all others. (This can be modified with ucasemap_setOptions().)
    341  *
    342  * Note: This function takes a non-const UCaseMap pointer because it will
    343  * open a default break iterator if no break iterator was set yet,
    344  * and effectively call ucasemap_setBreakIterator();
    345  * also because the break iterator is stateful and will be modified during
    346  * the iteration.
    347  *
    348  * The titlecase break iterator can be provided to customize for arbitrary
    349  * styles, using rules and dictionaries beyond the standard iterators.
    350  * The standard titlecase iterator for the root locale implements the
    351  * algorithm of Unicode TR 21.
    352  *
    353  * This function uses only the setUText(), first(), next() and close() methods of the
    354  * provided break iterator.
    355  *
    356  * The result may be longer or shorter than the original.
    357  * The source string and the destination buffer must not overlap.
    358  *
    359  * @param csm       UCaseMap service object. This pointer is non-const!
    360  *                  See the note above for details.
    361  * @param dest      A buffer for the result string. The result will be NUL-terminated if
    362  *                  the buffer is large enough.
    363  *                  The contents are undefined in case of failure.
    364  * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
    365  *                  dest may be NULL and the function will only return the length of the result
    366  *                  without writing any of the result string.
    367  * @param src       The original string.
    368  * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
    369  * @param pErrorCode Must be a valid pointer to an error code value,
    370  *                  which must not indicate a failure before the function call.
    371  * @return The length of the result string, if successful. In case of a buffer overflow,
    372  *         the result length is still returned and will be greater than destCapacity.
    373  *
    374  * @see u_strToTitle
    375  * @see U_TITLECASE_NO_LOWERCASE
    376  * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
    377  * @stable ICU 3.8
    378  */
    379 U_STABLE int32_t U_EXPORT2
    380 ucasemap_utf8ToTitle(UCaseMap *csm,
    381                     char *dest, int32_t destCapacity,
    382                     const char *src, int32_t srcLength,
    383                     UErrorCode *pErrorCode);
    384 
    385 #endif  /* !UCONFIG_NO_BREAK_ITERATION */
    386 
    387 /**
    388  * Case-fold the characters in a UTF-8 string.
    389  * Case-folding is locale-independent and not context-sensitive,
    390  * but there is an option for whether to include or exclude mappings for dotted I
    391  * and dotless i that are marked with 'I' in CaseFolding.txt.
    392  * The result may be longer or shorter than the original.
    393  * The source string and the destination buffer must not overlap.
    394  *
    395  * @param csm       UCaseMap service object.
    396  * @param dest      A buffer for the result string. The result will be NUL-terminated if
    397  *                  the buffer is large enough.
    398  *                  The contents are undefined in case of failure.
    399  * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
    400  *                  dest may be NULL and the function will only return the length of the result
    401  *                  without writing any of the result string.
    402  * @param src       The original string.
    403  * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
    404  * @param pErrorCode Must be a valid pointer to an error code value,
    405  *                  which must not indicate a failure before the function call.
    406  * @return The length of the result string, if successful. In case of a buffer overflow,
    407  *         the result length is still returned and will be greater than destCapacity.
    408  *
    409  * @see u_strFoldCase
    410  * @see ucasemap_setOptions
    411  * @see U_FOLD_CASE_DEFAULT
    412  * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I
    413  * @stable ICU 3.8
    414  */
    415 U_STABLE int32_t U_EXPORT2
    416 ucasemap_utf8FoldCase(const UCaseMap *csm,
    417                       char *dest, int32_t destCapacity,
    418                       const char *src, int32_t srcLength,
    419                       UErrorCode *pErrorCode);
    420 
    421 #endif
    422