Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2004-2012, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucase.h
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2004aug30
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Low-level Unicode character/string case mapping code.
     17 */
     18 
     19 #ifndef __UCASE_H__
     20 #define __UCASE_H__
     21 
     22 #include "unicode/utypes.h"
     23 #include "unicode/uset.h"
     24 #include "putilimp.h"
     25 #include "uset_imp.h"
     26 #include "udataswp.h"
     27 
     28 #ifdef __cplusplus
     29 U_NAMESPACE_BEGIN
     30 
     31 class UnicodeString;
     32 
     33 U_NAMESPACE_END
     34 #endif
     35 
     36 /* library API -------------------------------------------------------------- */
     37 
     38 U_CDECL_BEGIN
     39 
     40 struct UCaseProps;                      /* forward declaration only: no struct body appears in this header */
     41 typedef struct UCaseProps UCaseProps;   /* typedef so the declarations below can omit the struct keyword */
     42 
     43 U_CDECL_END
     44 
     45 U_CAPI const UCaseProps * U_EXPORT2
     46 ucase_getSingleton(void);   /* Takes no arguments and returns a pointer to a const UCaseProps.
                                     * NOTE(review): presumably the object to pass as the csp argument
                                     * of the functions below -- confirm against the implementation. */
     47 
     48 U_CFUNC void U_EXPORT2
     49 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode);
        /*
         * Parameters: csp is the (read-only) case properties object, sa is a
         * read-only USetAdder, pErrorCode points to a UErrorCode.
         * NOTE(review): judging by the name, this presumably reports to sa the
         * start of each code point range with uniform case properties, and
         * pErrorCode presumably follows the usual ICU in/out error code
         * convention -- neither is visible in this header; confirm in the
         * implementation.
         */
     50 
     51 /**
     52  * Requires non-NULL locale ID but otherwise does the equivalent of
     53  * checking for language codes as if uloc_getLanguage() were called:
     54  * Accepts both 2- and 3-letter codes and accepts case variants.
         *
         * @param locale Locale ID; must not be NULL.
         * @param locCache NOTE(review): presumably the same cache slot that the
         *                 ucase_toFullLower() documentation describes
         *                 ("Initialize to 0 ... Can be NULL") -- confirm.
         * @return A casing locale type; the UCASE_LOC_* enum that follows this
         *         declaration is labelled as the set of types for this function.
     55  */
     56 U_CFUNC int32_t
     57 ucase_getCaseLocale(const char *locale, int32_t *locCache);
     58 
     59 /* Casing locale types for ucase_getCaseLocale
         *
         * Unnamed enum; the enumerators count up from 0 because none has an
         * explicit value.
         * NOTE(review): UCASE_LOC_UNKNOWN being 0 is presumably what lets a
         * zero-initialized locCache mean "locale not parsed yet" -- confirm in
         * the implementation.
         */
     60 enum {
     61     UCASE_LOC_UNKNOWN,      /* 0 */
     62     UCASE_LOC_ROOT,         /* 1 */
     63     UCASE_LOC_TURKISH,      /* 2 */
     64     UCASE_LOC_LITHUANIAN,   /* 3 */
     65     UCASE_LOC_DUTCH         /* 4 */
     66 };
     67 
     68 /**
     69  * Bit mask for getting just the options from a string compare options word
     70  * that are relevant for case-insensitive string comparison.
     71  * See uchar.h. Also includes _STRNCMP_STYLE and U_COMPARE_CODE_POINT_ORDER.
         * The value 0xffff selects the low 16 bits of the options word.
     72  * @internal
     73  */
     74 #define _STRCASECMP_OPTIONS_MASK 0xffff
     75 
     76 /**
     77  * Bit mask for getting just the options from a string compare options word
     78  * that are relevant for case folding (of a single string or code point).
     79  * See uchar.h.
         * The value 0xff selects the low 8 bits of the options word.
     80  * @internal
     81  */
     82 #define _FOLD_CASE_OPTIONS_MASK 0xff
     83 
     84 /* single-code point functions
         *
         * Each function takes the case properties (csp) and one code point (c)
         * and returns a single UChar32.
         * ucase_fold() additionally takes an options word.
         * NOTE(review): presumably c itself is returned when it has no mapping of
         * the requested kind, and presumably only the _FOLD_CASE_OPTIONS_MASK bits
         * of options matter to ucase_fold() -- neither is visible in this header;
         * confirm in the implementation.
         */
     85 
     86 U_CAPI UChar32 U_EXPORT2
     87 ucase_tolower(const UCaseProps *csp, UChar32 c);
     88 
     89 U_CAPI UChar32 U_EXPORT2
     90 ucase_toupper(const UCaseProps *csp, UChar32 c);
     91 
     92 U_CAPI UChar32 U_EXPORT2
     93 ucase_totitle(const UCaseProps *csp, UChar32 c);
     94 
     95 U_CAPI UChar32 U_EXPORT2
     96 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options);
     97 
     98 /**
     99  * Adds all simple case mappings and the full case folding for c to sa,
    100  * and also adds special case closure mappings.
    101  * c itself is not added.
    102  * For example, the mappings
    103  * - for s include long s
    104  * - for sharp s include ss
    105  * - for k include the Kelvin sign
         *
         * @param csp Case mapping properties.
         * @param c Code point whose case closure is added.
         * @param sa Set adder that receives the mappings.
    106  */
    107 U_CFUNC void U_EXPORT2
    108 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa);
    109 
    110 /**
    111  * Maps the string to single code points and adds the associated case closure
    112  * mappings.
    113  * The string is mapped to code points if it is their full case folding string.
    114  * In other words, this performs a reverse full case folding and then
    115  * adds the case closure items of the resulting code points.
    116  * If the string is found and its closure applied, then
    117  * the string itself is added as well as part of its code points' closure.
    118  * The length argument must be >=0.
    119  *
         * @param csp Case mapping properties.
         * @param s The string to look up.
         * @param length Length of s; must be >=0.
         * @param sa Set adder that receives the closure mappings.
    120  * @return TRUE if the string was found
    121  */
    122 U_CFUNC UBool U_EXPORT2
    123 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa);
    124 
    125 #ifdef __cplusplus
    126 U_NAMESPACE_BEGIN
    127 
    128 /**
    129  * Iterator over characters with more than one code point in the full default Case_Folding.
         * Declared only when this header is compiled as C++ (it sits inside
         * an #ifdef __cplusplus section).
    130  */
    131 class U_COMMON_API FullCaseFoldingIterator {
    132 public:
    133     /** Constructor. */
    134     FullCaseFoldingIterator();
    135     /**
    136      * Returns the next (cp, full) pair where "full" is cp's full default Case_Folding.
    137      * Returns a negative cp value at the end of the iteration.
             *
             * @param full Output: the "full" half of the pair.
             * @return The cp half of the pair, or a negative value when done.
    138      */
    139     UChar32 next(UnicodeString &full);
    140 private:
            /* Copy constructor and assignment operator are declared private and
             * have no definition in this header, so instances cannot be copied. */
    141     FullCaseFoldingIterator(const FullCaseFoldingIterator &);  // no copy
    142     FullCaseFoldingIterator &operator=(const FullCaseFoldingIterator &);  // no assignment
    143 
            /* NOTE(review): this header does not say how the fields below are used.
             * The names line up with the "unfold" (reverse case folding) data
             * constants UCASE_UNFOLD_ROWS, UCASE_UNFOLD_ROW_WIDTH and
             * UCASE_UNFOLD_STRING_WIDTH, so presumably they describe that table,
             * with currentRow/rowCpIndex as the iteration position -- confirm in
             * the implementation. */
    144     const UChar *unfold;
    145     int32_t unfoldRows;
    146     int32_t unfoldRowWidth;
    147     int32_t unfoldStringWidth;
    148     int32_t currentRow;
    149     int32_t rowCpIndex;
    150 };
    151 
    152 U_NAMESPACE_END
    153 #endif
    154 
    155 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE (the 2-bit case type constants defined further down in this header) */
    156 U_CAPI int32_t U_EXPORT2
    157 ucase_getType(const UCaseProps *csp, UChar32 c);
    158 
    159 /** @return same as ucase_getType(), or <0 if c is case-ignorable */
    160 U_CAPI int32_t U_EXPORT2
    161 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c);
    162 
        /* NOTE(review): presumably reports whether c is soft-dotted (compare
         * UCASE_SOFT_DOTTED further down) -- confirm in the implementation. */
    163 U_CAPI UBool U_EXPORT2
    164 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c);
    165 
        /* NOTE(review): presumably reports the UCASE_SENSITIVE flag of c's
         * properties word -- confirm in the implementation. */
    166 U_CAPI UBool U_EXPORT2
    167 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c);
    168 
    169 /* string case mapping functions */
    170 
    171 U_CDECL_BEGIN
    172 
    173 /**
    174  * Iterator function for string case mappings, which need to look at the
    175  * context (surrounding text) of a given character for conditional mappings.
    176  *
    177  * The iterator only needs to go backward or forward away from the
    178  * character in question. It does not use any indexes on this interface.
    179  * It does not support random access or an arbitrary change of
    180  * iteration direction.
    181  *
    182  * The code point being case-mapped itself is never returned by
    183  * this iterator.
    184  *
         * Note that this typedef names a function type, not a pointer type; the
         * mapping functions in this header therefore declare their parameter as
         * "UCaseContextIterator *iter".
         *
    185  * @param context A pointer to the iterator's working data.
    186  * @param dir If <0 then start iterating backward from the character;
    187  *            if >0 then start iterating forward from the character;
    188  *            if 0 then continue iterating in the current direction.
    189  * @return Next code point, or <0 when the iteration is done.
    190  */
    191 typedef UChar32 U_CALLCONV
    192 UCaseContextIterator(void *context, int8_t dir);
    193 
    194 /**
    195  * Sample struct which may be used by some implementations of
    196  * UCaseContextIterator.
         *
         * NOTE(review): this header does not define the meaning of the fields.
         * The names suggest a text pointer (p), offsets bounding the text and
         * the current position (start, index, limit), the bounds of the code
         * point being mapped (cpStart, cpLimit), the iteration direction (dir)
         * and three spare bytes (b1..b3) -- confirm with the code that fills
         * the struct in.
    197  */
    198 struct UCaseContext {
    199     void *p;
    200     int32_t start, index, limit;
    201     int32_t cpStart, cpLimit;
    202     int8_t dir;
    203     int8_t b1, b2, b3;
    204 };
    205 typedef struct UCaseContext UCaseContext;
    206 
    207 U_CDECL_END
    208 
        /* Aggregate initializer with one value per UCaseContext member, in
         * declaration order: p; start, index, limit; cpStart, cpLimit; dir;
         * b1, b2, b3. The pointer is NULL and every integer is 0. */
    209 #define UCASECONTEXT_INITIALIZER { NULL,  0, 0, 0,  0, 0,  0,  0, 0, 0 }
    210 
    210 enum {
    211     /**
    212      * For string case mappings, a single character (a code point) is mapped
    213      * either to itself (in which case in-place mapping functions do nothing),
    214      * or to another single code point, or to a string.
    215      * Aside from the string contents, these are indicated with a single int32_t
    216      * value as follows:
    217      *
    218      * Mapping to self: Negative values (~self instead of -self to support U+0000)
    219      *
    220      * Mapping to another code point: Positive values >UCASE_MAX_STRING_LENGTH
    221      *
    222      * Mapping to a string: The string length (0..UCASE_MAX_STRING_LENGTH) is
    223      * returned. Note that the string result may indeed have zero length.
             *
             * In short, for a result r: r<0 means "unchanged" and ~r is the code
             * point; 0<=r<=UCASE_MAX_STRING_LENGTH is a string length; any larger
             * r is the resulting code point itself.
    224      */
    225     UCASE_MAX_STRING_LENGTH=0x1f    /* 31 */
    226 };
    228 
    229 /**
    230  * Get the full lowercase mapping for c.
    231  *
    232  * @param csp Case mapping properties.
    233  * @param c Character to be mapped.
    234  * @param iter Character iterator, used for context-sensitive mappings.
    235  *             See UCaseContextIterator for details.
    236  *             If iter==NULL then a context-independent result is returned.
    237  * @param context Pointer to be passed into iter.
    238  * @param pString If the mapping result is a string, then the pointer is
    239  *                written to *pString.
    240  * @param locale Locale ID for locale-dependent mappings.
    241  * @param locCache Initialize to 0; may be used to cache the result of parsing
    242  *                 the locale ID for subsequent calls.
    243  *                 Can be NULL.
    244  * @return Output code point or string length, see UCASE_MAX_STRING_LENGTH.
         *         Per that documentation: a negative value is ~c (c maps to
         *         itself), 0..UCASE_MAX_STRING_LENGTH is a string length, and a
         *         larger value is the single resulting code point.
    245  *
    246  * @see UCaseContextIterator
    247  * @see UCASE_MAX_STRING_LENGTH
    248  * @internal
    249  */
    250 U_CAPI int32_t U_EXPORT2
    251 ucase_toFullLower(const UCaseProps *csp, UChar32 c,
    252                   UCaseContextIterator *iter, void *context,
    253                   const UChar **pString,
    254                   const char *locale, int32_t *locCache);
    255 
        /*
         * Declared with the same parameter list and return type as
         * ucase_toFullLower().
         * NOTE(review): presumably the full uppercase counterpart with the same
         * parameter and return conventions -- confirm in the implementation.
         */
    256 U_CAPI int32_t U_EXPORT2
    257 ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
    258                   UCaseContextIterator *iter, void *context,
    259                   const UChar **pString,
    260                   const char *locale, int32_t *locCache);
    261 
        /*
         * Declared with the same parameter list and return type as
         * ucase_toFullLower().
         * NOTE(review): presumably the full titlecase counterpart with the same
         * parameter and return conventions -- confirm in the implementation.
         */
    262 U_CAPI int32_t U_EXPORT2
    263 ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
    264                   UCaseContextIterator *iter, void *context,
    265                   const UChar **pString,
    266                   const char *locale, int32_t *locCache);
    267 
        /*
         * Unlike the three functions above, this one takes no context iterator,
         * context pointer, locale or locale cache; it takes an options word
         * instead.
         * NOTE(review): presumably returns the full case folding of c using the
         * UCASE_MAX_STRING_LENGTH result convention, with options restricted to
         * the _FOLD_CASE_OPTIONS_MASK bits -- confirm in the implementation.
         */
    268 U_CAPI int32_t U_EXPORT2
    269 ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
    270                     const UChar **pString,
    271                     uint32_t options);
    272 
    273 U_CFUNC int32_t U_EXPORT2
    274 ucase_hasBinaryProperty(UChar32 c, UProperty which);
        /*
         * Takes no UCaseProps pointer, unlike most functions in this header, and
         * is declared to return int32_t rather than UBool.
         * NOTE(review): presumably returns non-zero when code point c has the
         * case-related binary property selected by which -- confirm in the
         * implementation.
         */
    275 
    276 
    277 U_CDECL_BEGIN
    278 
    279 /**
    280  * @internal
    281  */
    282 typedef int32_t U_CALLCONV
    283 UCaseMapFull(const UCaseProps *csp, UChar32 c,
    284              UCaseContextIterator *iter, void *context,
    285              const UChar **pString,
    286              const char *locale, int32_t *locCache);
    287 
    288 U_CDECL_END
    289 
    290 /* file definitions --------------------------------------------------------- */
    291 
    292 #define UCASE_DATA_NAME "ucase"     /* NOTE(review): presumably the name of the binary data item -- confirm */
    293 #define UCASE_DATA_TYPE "icu"       /* NOTE(review): presumably the type (file suffix) of the data item -- confirm */
    294 
    295 /* format "cAsE": as ASCII, the four byte values below are 'c' 'A' 'S' 'E' (the third byte is an uppercase S) */
    296 #define UCASE_FMT_0 0x63            /* 'c' */
    297 #define UCASE_FMT_1 0x41            /* 'A' */
    298 #define UCASE_FMT_2 0x53            /* 'S' */
    299 #define UCASE_FMT_3 0x45            /* 'E' */
    300 
    301 /* indexes into indexes[]
         *
         * The first five enumerators take the implicit values 0..4; positions
         * 5..14 have no named constant in this header.
         * NOTE(review): the meaning of each indexes[] entry is not described
         * here; the trailing comments only record the numeric values.
         */
    302 enum {
    303     UCASE_IX_INDEX_TOP,         /* 0 */
    304     UCASE_IX_LENGTH,            /* 1 */
    305     UCASE_IX_TRIE_SIZE,         /* 2 */
    306     UCASE_IX_EXC_LENGTH,        /* 3 */
    307     UCASE_IX_UNFOLD_LENGTH,     /* 4 */
    308 
    309     UCASE_IX_MAX_FULL_LENGTH=15,
    310     UCASE_IX_TOP=16             /* one more than the largest named index */
    311 };
    312 
    313 /* definitions for 16-bit case properties word ------------------------------ */
    314 
    315 /* 2-bit constants for types of cased characters (bits 1..0 of the properties word) */
    316 #define UCASE_TYPE_MASK     3
    317 enum {
    318     UCASE_NONE,     /* 0 */
    319     UCASE_LOWER,    /* 1 */
    320     UCASE_UPPER,    /* 2 */
    321     UCASE_TITLE     /* 3 */
    322 };
    323 
        /* Extracts the 2-bit case type (one of the four enumerators above). */
    324 #define UCASE_GET_TYPE(props) ((props)&UCASE_TYPE_MASK)
        /* Extracts bits 2..0: the case type plus the UCASE_IGNORABLE bit (7 == UCASE_TYPE_MASK|UCASE_IGNORABLE). */
    325 #define UCASE_GET_TYPE_AND_IGNORABLE(props) ((props)&7)
    326 
        /* Single-bit flags: bit 2, bit 3 and bit 4 respectively. */
    327 #define UCASE_IGNORABLE         4
    328 #define UCASE_SENSITIVE         8
    329 #define UCASE_EXCEPTION         0x10
    330 
    331 #define UCASE_DOT_MASK      0x60    /* bits 6..5; the four enumerators below are the possible values of this 2-bit field */
    332 enum {
    333     UCASE_NO_DOT=0,         /* normal characters with cc=0 */
    334     UCASE_SOFT_DOTTED=0x20, /* soft-dotted characters with cc=0 */
    335     UCASE_ABOVE=0x40,       /* "above" accents with cc=230 */
    336     UCASE_OTHER_ACCENT=0x60 /* other accent character (0<cc!=230) */
    337 };
    338 
    339 /* no exception: bits 15..7 are a 9-bit signed case mapping delta
         *
         * UCASE_DELTA_MASK (0xff80) covers exactly bits 15..7.
         * A 9-bit signed value ranges from UCASE_MIN_DELTA (-0x100) to
         * UCASE_MAX_DELTA (0xff).
         *
         * UCASE_GET_DELTA() has two variants:
         * - If a signed right shift is arithmetic, casting to int16_t and
         *   shifting right by 7 sign-extends the delta directly.
         * - Otherwise the sign is extended by hand: when bit 15 is set, the
         *   shifted value is OR-ed with 0xfe00 (bits 15..9, i.e. every bit above
         *   the 9-bit field within 16 bits); when it is clear, the value is
         *   shifted as an unsigned 16-bit quantity. The result is then cast to
         *   int16_t.
         */
    340 #define UCASE_DELTA_SHIFT   7
    341 #define UCASE_DELTA_MASK    0xff80
    342 #define UCASE_MAX_DELTA     0xff
    343 #define UCASE_MIN_DELTA     (-UCASE_MAX_DELTA-1)
    344 
    345 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
    346 #   define UCASE_GET_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT)
    347 #else
    348 #   define UCASE_GET_DELTA(props) (int16_t)(((props)&0x8000) ? (((props)>>UCASE_DELTA_SHIFT)|0xfe00) : ((uint16_t)(props)>>UCASE_DELTA_SHIFT))
    349 #endif
    350 
    351 /* exception: bits 15..5 are an unsigned 11-bit index into the exceptions array
         *
         * UCASE_EXC_MASK (0xffe0) covers exactly bits 15..5, which includes the
         * bits that UCASE_DOT_MASK (0x60) and UCASE_DELTA_MASK (0xff80) use when
         * there is no exception.
         * UCASE_MAX_EXCEPTIONS evaluates to 0x7ff+1 = 0x800 (2048), the number of
         * distinct 11-bit index values.
         */
    352 #define UCASE_EXC_SHIFT     5
    353 #define UCASE_EXC_MASK      0xffe0
    354 #define UCASE_MAX_EXCEPTIONS ((UCASE_EXC_MASK>>UCASE_EXC_SHIFT)+1)
    355 
    356 /* definitions for 16-bit main exceptions word ------------------------------ */
    357 
    358 /* first 8 bits indicate values in optional slots
         *
         * The enumerators below number the slots 0..7.
         * NOTE(review): presumably bit i of the exceptions word being set means
         * that slot i is present -- confirm in the implementation.
         */
    359 enum {
    360     UCASE_EXC_LOWER,        /* 0 */
    361     UCASE_EXC_FOLD,         /* 1 */
    362     UCASE_EXC_UPPER,        /* 2 */
    363     UCASE_EXC_TITLE,        /* 3 */
    364     UCASE_EXC_4,            /* reserved */
    365     UCASE_EXC_5,            /* reserved */
    366     UCASE_EXC_CLOSURE,      /* 6 */
    367     UCASE_EXC_FULL_MAPPINGS, /* 7 */
    368     UCASE_EXC_ALL_SLOTS     /* one past the last slot */
    369 };
    370 
    371 /* each slot is 2 uint16_t instead of 1 (flag is bit 8 of the exceptions word) */
    372 #define UCASE_EXC_DOUBLE_SLOTS      0x100
    373 
    374 /* reserved: exception bits 11..9 */
    375 
    376 /* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK<<UCASE_EXC_DOT_SHIFT, i.e. 0x60<<7 == 0x3000 (bits 13..12) */
    377 #define UCASE_EXC_DOT_SHIFT     7
    378 
    379 /* normally stored in the main word, but pushed out for larger exception indexes;
         * each enumerator equals the corresponding UCASE_DOT_MASK value shifted left
         * by UCASE_EXC_DOT_SHIFT (0x20<<7, 0x40<<7, 0x60<<7) */
    380 #define UCASE_EXC_DOT_MASK      0x3000
    381 enum {
    382     UCASE_EXC_NO_DOT=0,
    383     UCASE_EXC_SOFT_DOTTED=0x1000,
    384     UCASE_EXC_ABOVE=0x2000,         /* "above" accents with cc=230 */
    385     UCASE_EXC_OTHER_ACCENT=0x3000   /* other accent character (0<cc!=230) */
    386 };
    387 
    388 /* complex/conditional mappings (bit 14 and bit 15 of the exceptions word) */
    389 #define UCASE_EXC_CONDITIONAL_SPECIAL   0x4000
    390 #define UCASE_EXC_CONDITIONAL_FOLD      0x8000
    391 
    392 /* definitions for lengths word for full case mappings:
         * four 4-bit fields, one per mapping kind, from the lowest nibble upward */
    393 #define UCASE_FULL_LOWER    0xf
    394 #define UCASE_FULL_FOLDING  0xf0
    395 #define UCASE_FULL_UPPER    0xf00
    396 #define UCASE_FULL_TITLE    0xf000
    397 
    398 /* maximum lengths: four fields of at most 0xf each give 4*0xf = 60 in total */
    399 #define UCASE_FULL_MAPPINGS_MAX_LENGTH (4*0xf)
    400 #define UCASE_CLOSURE_MAX_LENGTH 0xf
    401 
    402 /* constants for reverse case folding ("unfold") data
         *
         * NOTE(review): presumably these are positions of header values at the
         * start of the unfold data (number of rows, row width, string width);
         * this header only fixes their numeric values 0, 1 and 2 -- confirm in
         * the implementation.
         */
    403 enum {
    404     UCASE_UNFOLD_ROWS,          /* 0 */
    405     UCASE_UNFOLD_ROW_WIDTH,     /* 1 */
    406     UCASE_UNFOLD_STRING_WIDTH   /* 2 */
    407 };
    408 
    409 #endif
    410