// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

// ucasemap_imp.h
// created: 2017feb08 Markus W. Scherer
      6 
      7 #ifndef __UCASEMAP_IMP_H__
      8 #define __UCASEMAP_IMP_H__
      9 
     10 #include "unicode/utypes.h"
     11 #include "unicode/ucasemap.h"
     12 #include "unicode/uchar.h"
     13 #include "ucase.h"
     14 
     15 /**
     16  * Bit mask for the titlecasing iterator options bit field.
     17  * Currently only 3 out of 8 values are used:
     18  * 0 (words), U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
     19  * See stringoptions.h.
     20  * @internal
     21  */
     22 #define U_TITLECASE_ITERATOR_MASK 0xe0
     23 
     24 /**
     25  * Bit mask for the titlecasing index adjustment options bit set.
     26  * Currently two bits are defined:
     27  * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED.
     28  * See stringoptions.h.
     29  * @internal
     30  */
     31 #define U_TITLECASE_ADJUSTMENT_MASK 0x600
     32 
     33 /**
     34  * Internal API, used by u_strcasecmp() etc.
     35  * Compare strings case-insensitively,
     36  * in code point order or code unit order.
     37  */
     38 U_CFUNC int32_t
     39 u_strcmpFold(const UChar *s1, int32_t length1,
     40              const UChar *s2, int32_t length2,
     41              uint32_t options,
     42              UErrorCode *pErrorCode);
     43 
     44 /**
     45  * Internal API, used for detecting length of
     46  * shared prefix case-insensitively.
     47  * @param s1            input string 1
     48  * @param length1       length of string 1, or -1 (NULL terminated)
     49  * @param s2            input string 2
     50  * @param length2       length of string 2, or -1 (NULL terminated)
     51  * @param options       compare options
     52  * @param matchLen1     (output) length of partial prefix match in s1
     53  * @param matchLen2     (output) length of partial prefix match in s2
     54  * @param pErrorCode    receives error status
     55  */
     56 U_CAPI void
     57 u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
     58                              const UChar *s2, int32_t length2,
     59                              uint32_t options,
     60                              int32_t *matchLen1, int32_t *matchLen2,
     61                              UErrorCode *pErrorCode);
     62 
     63 /**
     64  * Are the Unicode properties loaded?
     65  * This must be used before internal functions are called that do
     66  * not perform this check.
     67  * Generate a debug assertion failure if data is not loaded.
     68  */
     69 U_CFUNC UBool
     70 uprv_haveProperties(UErrorCode *pErrorCode);
     71 
     72 #ifdef __cplusplus
     73 
     74 U_NAMESPACE_BEGIN
     75 
     76 class BreakIterator;        // unicode/brkiter.h
     77 class ByteSink;
     78 class Locale;               // unicode/locid.h
     79 
     80 /** Returns TRUE if the options are valid. Otherwise FALSE, and sets an error. */
     81 inline UBool ustrcase_checkTitleAdjustmentOptions(uint32_t options, UErrorCode &errorCode) {
     82     if (U_FAILURE(errorCode)) { return FALSE; }
     83     if ((options & U_TITLECASE_ADJUSTMENT_MASK) == U_TITLECASE_ADJUSTMENT_MASK) {
     84         // Both options together.
     85         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
     86         return FALSE;
     87     }
     88     return TRUE;
     89 }
     90 
     91 inline UBool ustrcase_isLNS(UChar32 c) {
     92     // Letter, number, symbol,
     93     // or a private use code point because those are typically used as letters or numbers.
     94     // Consider modifier letters only if they are cased.
     95     const uint32_t LNS = (U_GC_L_MASK|U_GC_N_MASK|U_GC_S_MASK|U_GC_CO_MASK) & ~U_GC_LM_MASK;
     96     int gc = u_charType(c);
     97     return (U_MASK(gc) & LNS) != 0 || (gc == U_MODIFIER_LETTER && ucase_getType(c) != UCASE_NONE);
     98 }
     99 
    100 #if !UCONFIG_NO_BREAK_ITERATION
    101 
    102 /** Returns nullptr if error. Pass in either locale or locID, not both. */
    103 U_CFUNC
    104 BreakIterator *ustrcase_getTitleBreakIterator(
    105         const Locale *locale, const char *locID, uint32_t options, BreakIterator *iter,
    106         LocalPointer<BreakIterator> &ownedIter, UErrorCode &errorCode);
    107 
    108 #endif
    109 
    110 U_NAMESPACE_END
    111 
    112 #include "unicode/unistr.h"  // for UStringCaseMapper
    113 
    114 /*
    115  * Internal string casing functions implementing
    116  * ustring.h/ustrcase.cpp and UnicodeString case mapping functions.
    117  */
    118 
    119 struct UCaseMap : public icu::UMemory {
    120     /** Implements most of ucasemap_open(). */
    121     UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode);
    122     ~UCaseMap();
    123 
    124 #if !UCONFIG_NO_BREAK_ITERATION
    125     icu::BreakIterator *iter;  /* We adopt the iterator, so we own it. */
    126 #endif
    127     char locale[32];
    128     int32_t caseLocale;
    129     uint32_t options;
    130 };
    131 
    132 #if UCONFIG_NO_BREAK_ITERATION
    133 #   define UCASEMAP_BREAK_ITERATOR_PARAM
    134 #   define UCASEMAP_BREAK_ITERATOR_UNUSED
    135 #   define UCASEMAP_BREAK_ITERATOR
    136 #   define UCASEMAP_BREAK_ITERATOR_NULL
    137 #else
    138 #   define UCASEMAP_BREAK_ITERATOR_PARAM icu::BreakIterator *iter,
    139 #   define UCASEMAP_BREAK_ITERATOR_UNUSED icu::BreakIterator *,
    140 #   define UCASEMAP_BREAK_ITERATOR iter,
    141 #   define UCASEMAP_BREAK_ITERATOR_NULL NULL,
    142 #endif
    143 
    144 U_CFUNC int32_t
    145 ustrcase_getCaseLocale(const char *locale);
    146 
    147 // TODO: swap src / dest if approved for new public api
    148 /** Implements UStringCaseMapper. */
    149 U_CFUNC int32_t U_CALLCONV
    150 ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
    151                          UChar *dest, int32_t destCapacity,
    152                          const UChar *src, int32_t srcLength,
    153                          icu::Edits *edits,
    154                          UErrorCode &errorCode);
    155 
    156 /** Implements UStringCaseMapper. */
    157 U_CFUNC int32_t U_CALLCONV
    158 ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
    159                          UChar *dest, int32_t destCapacity,
    160                          const UChar *src, int32_t srcLength,
    161                          icu::Edits *edits,
    162                          UErrorCode &errorCode);
    163 
    164 #if !UCONFIG_NO_BREAK_ITERATION
    165 
    166 /** Implements UStringCaseMapper. */
    167 U_CFUNC int32_t U_CALLCONV
    168 ustrcase_internalToTitle(int32_t caseLocale, uint32_t options,
    169                          icu::BreakIterator *iter,
    170                          UChar *dest, int32_t destCapacity,
    171                          const UChar *src, int32_t srcLength,
    172                          icu::Edits *edits,
    173                          UErrorCode &errorCode);
    174 
    175 #endif
    176 
    177 /** Implements UStringCaseMapper. */
    178 U_CFUNC int32_t U_CALLCONV
    179 ustrcase_internalFold(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
    180                       UChar *dest, int32_t destCapacity,
    181                       const UChar *src, int32_t srcLength,
    182                       icu::Edits *edits,
    183                       UErrorCode &errorCode);
    184 
    185 /**
    186  * Common string case mapping implementation for ucasemap_toXyz() and UnicodeString::toXyz().
    187  * Implements argument checking.
    188  */
    189 U_CFUNC int32_t
    190 ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
    191              UChar *dest, int32_t destCapacity,
    192              const UChar *src, int32_t srcLength,
    193              UStringCaseMapper *stringCaseMapper,
    194              icu::Edits *edits,
    195              UErrorCode &errorCode);
    196 
    197 /**
    198  * Common string case mapping implementation for old-fashioned u_strToXyz() functions
    199  * that allow the source string to overlap the destination buffer.
    200  * Implements argument checking and internally works with an intermediate buffer if necessary.
    201  */
    202 U_CFUNC int32_t
    203 ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
    204                         UChar *dest, int32_t destCapacity,
    205                         const UChar *src, int32_t srcLength,
    206                         UStringCaseMapper *stringCaseMapper,
    207                         UErrorCode &errorCode);
    208 
    209 /**
    210  * UTF-8 string case mapping function type, used by ucasemap_mapUTF8().
    211  * UTF-8 version of UStringCaseMapper.
    212  * All error checking must be done.
    213  * The UCaseMap must be fully initialized, with locale and/or iter set as needed.
    214  */
    215 typedef void U_CALLCONV
    216 UTF8CaseMapper(int32_t caseLocale, uint32_t options,
    217 #if !UCONFIG_NO_BREAK_ITERATION
    218                icu::BreakIterator *iter,
    219 #endif
    220                const uint8_t *src, int32_t srcLength,
    221                icu::ByteSink &sink, icu::Edits *edits,
    222                UErrorCode &errorCode);
    223 
    224 #if !UCONFIG_NO_BREAK_ITERATION
    225 
    226 /** Implements UTF8CaseMapper. */
    227 U_CFUNC void U_CALLCONV
    228 ucasemap_internalUTF8ToTitle(int32_t caseLocale, uint32_t options,
    229         icu::BreakIterator *iter,
    230         const uint8_t *src, int32_t srcLength,
    231         icu::ByteSink &sink, icu::Edits *edits,
    232         UErrorCode &errorCode);
    233 
    234 #endif
    235 
    236 void
    237 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
    238                  const char *src, int32_t srcLength,
    239                  UTF8CaseMapper *stringCaseMapper,
    240                  icu::ByteSink &sink, icu::Edits *edits,
    241                  UErrorCode &errorCode);
    242 
    243 /**
    244  * Implements argument checking and buffer handling
    245  * for UTF-8 string case mapping as a common function.
    246  */
    247 int32_t
    248 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
    249                  char *dest, int32_t destCapacity,
    250                  const char *src, int32_t srcLength,
    251                  UTF8CaseMapper *stringCaseMapper,
    252                  icu::Edits *edits,
    253                  UErrorCode &errorCode);
    254 
    255 U_NAMESPACE_BEGIN
    256 namespace GreekUpper {
    257 
    258 // Data bits.
    259 static const uint32_t UPPER_MASK = 0x3ff;
    260 static const uint32_t HAS_VOWEL = 0x1000;
    261 static const uint32_t HAS_YPOGEGRAMMENI = 0x2000;
    262 static const uint32_t HAS_ACCENT = 0x4000;
    263 static const uint32_t HAS_DIALYTIKA = 0x8000;
    264 // Further bits during data building and processing, not stored in the data map.
    265 static const uint32_t HAS_COMBINING_DIALYTIKA = 0x10000;
    266 static const uint32_t HAS_OTHER_GREEK_DIACRITIC = 0x20000;
    267 
    268 static const uint32_t HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT;
    269 static const uint32_t HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA =
    270         HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA;
    271 static const uint32_t HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA;
    272 
    273 // State bits.
    274 static const uint32_t AFTER_CASED = 1;
    275 static const uint32_t AFTER_VOWEL_WITH_ACCENT = 2;
    276 
    277 uint32_t getLetterData(UChar32 c);
    278 
    279 /**
    280  * Returns a non-zero value for each of the Greek combining diacritics
    281  * listed in The Unicode Standard, version 8, chapter 7.2 Greek,
    282  * plus some perispomeni look-alikes.
    283  */
    284 uint32_t getDiacriticData(UChar32 c);
    285 
    286 }  // namespace GreekUpper
    287 U_NAMESPACE_END
    288 
    289 #endif  // __cplusplus
    290 
    291 #endif  // __UCASEMAP_IMP_H__
    292