Home | History | Annotate | Download | only in common
// © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2005-2016, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  ucasemap.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2005may06
     16 *   created by: Markus W. Scherer
     17 *
     18 *   Case mapping service object and functions using it.
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 #include "unicode/brkiter.h"
     23 #include "unicode/bytestream.h"
     24 #include "unicode/casemap.h"
     25 #include "unicode/edits.h"
     26 #include "unicode/stringoptions.h"
     27 #include "unicode/stringpiece.h"
     28 #include "unicode/ubrk.h"
     29 #include "unicode/uloc.h"
     30 #include "unicode/ustring.h"
     31 #include "unicode/ucasemap.h"
     32 #if !UCONFIG_NO_BREAK_ITERATION
     33 #include "unicode/utext.h"
     34 #endif
     35 #include "unicode/utf.h"
     36 #include "unicode/utf8.h"
     37 #include "unicode/utf16.h"
     38 #include "bytesinkutil.h"
     39 #include "cmemory.h"
     40 #include "cstring.h"
     41 #include "uassert.h"
     42 #include "ucase.h"
     43 #include "ucasemap_imp.h"
     44 #include "ustr_imp.h"
     45 
     46 U_NAMESPACE_USE
     47 
     48 /* UCaseMap service object -------------------------------------------------- */
     49 
     50 UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) :
     51 #if !UCONFIG_NO_BREAK_ITERATION
     52         iter(NULL),
     53 #endif
     54         caseLocale(UCASE_LOC_UNKNOWN), options(opts) {
     55     ucasemap_setLocale(this, localeID, pErrorCode);
     56 }
     57 
     58 UCaseMap::~UCaseMap() {
     59 #if !UCONFIG_NO_BREAK_ITERATION
     60     delete iter;
     61 #endif
     62 }
     63 
     64 U_CAPI UCaseMap * U_EXPORT2
     65 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
     66     if(U_FAILURE(*pErrorCode)) {
     67         return NULL;
     68     }
     69     UCaseMap *csm = new UCaseMap(locale, options, pErrorCode);
     70     if(csm==NULL) {
     71         *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
     72         return NULL;
     73     } else if (U_FAILURE(*pErrorCode)) {
     74         delete csm;
     75         return NULL;
     76     }
     77     return csm;
     78 }
     79 
     80 U_CAPI void U_EXPORT2
     81 ucasemap_close(UCaseMap *csm) {
     82     delete csm;
     83 }
     84 
     85 U_CAPI const char * U_EXPORT2
     86 ucasemap_getLocale(const UCaseMap *csm) {
     87     return csm->locale;
     88 }
     89 
     90 U_CAPI uint32_t U_EXPORT2
     91 ucasemap_getOptions(const UCaseMap *csm) {
     92     return csm->options;
     93 }
     94 
     95 U_CAPI void U_EXPORT2
     96 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
     97     if(U_FAILURE(*pErrorCode)) {
     98         return;
     99     }
    100     if (locale != NULL && *locale == 0) {
    101         csm->locale[0] = 0;
    102         csm->caseLocale = UCASE_LOC_ROOT;
    103         return;
    104     }
    105 
    106     int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
    107     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
    108         *pErrorCode=U_ZERO_ERROR;
    109         /* we only really need the language code for case mappings */
    110         length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
    111     }
    112     if(length==sizeof(csm->locale)) {
    113         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    114     }
    115     if(U_SUCCESS(*pErrorCode)) {
    116         csm->caseLocale=UCASE_LOC_UNKNOWN;
    117         csm->caseLocale = ucase_getCaseLocale(csm->locale);
    118     } else {
    119         csm->locale[0]=0;
    120         csm->caseLocale = UCASE_LOC_ROOT;
    121     }
    122 }
    123 
    124 U_CAPI void U_EXPORT2
    125 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
    126     if(U_FAILURE(*pErrorCode)) {
    127         return;
    128     }
    129     csm->options=options;
    130 }
    131 
    132 /* UTF-8 string case mappings ----------------------------------------------- */
    133 
    134 /* TODO(markus): Move to a new, separate utf8case.cpp file. */
    135 
    136 namespace {
    137 
    138 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
    139 inline UBool
    140 appendResult(int32_t cpLength, int32_t result, const UChar *s,
    141              ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
    142     U_ASSERT(U_SUCCESS(errorCode));
    143 
    144     /* decode the result */
    145     if(result<0) {
    146         /* (not) original code point */
    147         if(edits!=NULL) {
    148             edits->addUnchanged(cpLength);
    149         }
    150         if((options & U_OMIT_UNCHANGED_TEXT) == 0) {
    151             ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);
    152         }
    153     } else {
    154         if(result<=UCASE_MAX_STRING_LENGTH) {
    155             // string: "result" is the UTF-16 length
    156             return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);
    157         } else {
    158             ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);
    159         }
    160     }
    161     return TRUE;
    162 }
    163 
    164 // See unicode/utf8.h U8_APPEND_UNSAFE().
    165 inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
    166 inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
    167 
    168 UChar32 U_CALLCONV
    169 utf8_caseContextIterator(void *context, int8_t dir) {
    170     UCaseContext *csc=(UCaseContext *)context;
    171     UChar32 c;
    172 
    173     if(dir<0) {
    174         /* reset for backward iteration */
    175         csc->index=csc->cpStart;
    176         csc->dir=dir;
    177     } else if(dir>0) {
    178         /* reset for forward iteration */
    179         csc->index=csc->cpLimit;
    180         csc->dir=dir;
    181     } else {
    182         /* continue current iteration direction */
    183         dir=csc->dir;
    184     }
    185 
    186     if(dir<0) {
    187         if(csc->start<csc->index) {
    188             U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
    189             return c;
    190         }
    191     } else {
    192         if(csc->index<csc->limit) {
    193             U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
    194             return c;
    195         }
    196     }
    197     return U_SENTINEL;
    198 }
    199 
    200 /**
    201  * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
    202  * caseLocale < 0: Case-folds [srcStart..srcLimit[.
    203  */
    204 void toLower(int32_t caseLocale, uint32_t options,
    205              const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
    206              icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
    207     const int8_t *latinToLower;
    208     if (caseLocale == UCASE_LOC_ROOT ||
    209             (caseLocale >= 0 ?
    210                 !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
    211                 (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
    212         latinToLower = LatinCase::TO_LOWER_NORMAL;
    213     } else {
    214         latinToLower = LatinCase::TO_LOWER_TR_LT;
    215     }
    216     const UTrie2 *trie = ucase_getTrie();
    217     int32_t prev = srcStart;
    218     int32_t srcIndex = srcStart;
    219     for (;;) {
    220         // fast path for simple cases
    221         int32_t cpStart;
    222         UChar32 c;
    223         for (;;) {
    224             if (U_FAILURE(errorCode) || srcIndex >= srcLimit) {
    225                 c = U_SENTINEL;
    226                 break;
    227             }
    228             uint8_t lead = src[srcIndex++];
    229             if (lead <= 0x7f) {
    230                 int8_t d = latinToLower[lead];
    231                 if (d == LatinCase::EXC) {
    232                     cpStart = srcIndex - 1;
    233                     c = lead;
    234                     break;
    235                 }
    236                 if (d == 0) { continue; }
    237                 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
    238                                               sink, options, edits, errorCode);
    239                 char ascii = (char)(lead + d);
    240                 sink.Append(&ascii, 1);
    241                 if (edits != nullptr) {
    242                     edits->addReplace(1, 1);
    243                 }
    244                 prev = srcIndex;
    245                 continue;
    246             } else if (lead < 0xe3) {
    247                 uint8_t t;
    248                 if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit &&
    249                         (t = src[srcIndex] - 0x80) <= 0x3f) {
    250                     // U+0080..U+017F
    251                     ++srcIndex;
    252                     c = ((lead - 0xc0) << 6) | t;
    253                     int8_t d = latinToLower[c];
    254                     if (d == LatinCase::EXC) {
    255                         cpStart = srcIndex - 2;
    256                         break;
    257                     }
    258                     if (d == 0) { continue; }
    259                     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
    260                                                   sink, options, edits, errorCode);
    261                     ByteSinkUtil::appendTwoBytes(c + d, sink);
    262                     if (edits != nullptr) {
    263                         edits->addReplace(2, 2);
    264                     }
    265                     prev = srcIndex;
    266                     continue;
    267                 }
    268             } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
    269                     (srcIndex + 2) <= srcLimit &&
    270                     U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
    271                 // most of CJK: no case mappings
    272                 srcIndex += 2;
    273                 continue;
    274             }
    275             cpStart = --srcIndex;
    276             U8_NEXT(src, srcIndex, srcLimit, c);
    277             if (c < 0) {
    278                 // ill-formed UTF-8
    279                 continue;
    280             }
    281             uint16_t props = UTRIE2_GET16(trie, c);
    282             if (UCASE_HAS_EXCEPTION(props)) { break; }
    283             int32_t delta;
    284             if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
    285                 continue;
    286             }
    287             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
    288                                           sink, options, edits, errorCode);
    289             ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
    290             prev = srcIndex;
    291         }
    292         if (c < 0) {
    293             break;
    294         }
    295         // slow path
    296         const UChar *s;
    297         if (caseLocale >= 0) {
    298             csc->cpStart = cpStart;
    299             csc->cpLimit = srcIndex;
    300             c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);
    301         } else {
    302             c = ucase_toFullFolding(c, &s, options);
    303         }
    304         if (c >= 0) {
    305             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
    306                                           sink, options, edits, errorCode);
    307             appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
    308             prev = srcIndex;
    309         }
    310     }
    311     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
    312                                   sink, options, edits, errorCode);
    313 }
    314 
    315 void toUpper(int32_t caseLocale, uint32_t options,
    316              const uint8_t *src, UCaseContext *csc, int32_t srcLength,
    317              icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
    318     const int8_t *latinToUpper;
    319     if (caseLocale == UCASE_LOC_TURKISH) {
    320         latinToUpper = LatinCase::TO_UPPER_TR;
    321     } else {
    322         latinToUpper = LatinCase::TO_UPPER_NORMAL;
    323     }
    324     const UTrie2 *trie = ucase_getTrie();
    325     int32_t prev = 0;
    326     int32_t srcIndex = 0;
    327     for (;;) {
    328         // fast path for simple cases
    329         int32_t cpStart;
    330         UChar32 c;
    331         for (;;) {
    332             if (U_FAILURE(errorCode) || srcIndex >= srcLength) {
    333                 c = U_SENTINEL;
    334                 break;
    335             }
    336             uint8_t lead = src[srcIndex++];
    337             if (lead <= 0x7f) {
    338                 int8_t d = latinToUpper[lead];
    339                 if (d == LatinCase::EXC) {
    340                     cpStart = srcIndex - 1;
    341                     c = lead;
    342                     break;
    343                 }
    344                 if (d == 0) { continue; }
    345                 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
    346                                               sink, options, edits, errorCode);
    347                 char ascii = (char)(lead + d);
    348                 sink.Append(&ascii, 1);
    349                 if (edits != nullptr) {
    350                     edits->addReplace(1, 1);
    351                 }
    352                 prev = srcIndex;
    353                 continue;
    354             } else if (lead < 0xe3) {
    355                 uint8_t t;
    356                 if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength &&
    357                         (t = src[srcIndex] - 0x80) <= 0x3f) {
    358                     // U+0080..U+017F
    359                     ++srcIndex;
    360                     c = ((lead - 0xc0) << 6) | t;
    361                     int8_t d = latinToUpper[c];
    362                     if (d == LatinCase::EXC) {
    363                         cpStart = srcIndex - 2;
    364                         break;
    365                     }
    366                     if (d == 0) { continue; }
    367                     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
    368                                                   sink, options, edits, errorCode);
    369                     ByteSinkUtil::appendTwoBytes(c + d, sink);
    370                     if (edits != nullptr) {
    371                         edits->addReplace(2, 2);
    372                     }
    373                     prev = srcIndex;
    374                     continue;
    375                 }
    376             } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
    377                     (srcIndex + 2) <= srcLength &&
    378                     U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
    379                 // most of CJK: no case mappings
    380                 srcIndex += 2;
    381                 continue;
    382             }
    383             cpStart = --srcIndex;
    384             U8_NEXT(src, srcIndex, srcLength, c);
    385             if (c < 0) {
    386                 // ill-formed UTF-8
    387                 continue;
    388             }
    389             uint16_t props = UTRIE2_GET16(trie, c);
    390             if (UCASE_HAS_EXCEPTION(props)) { break; }
    391             int32_t delta;
    392             if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
    393                 continue;
    394             }
    395             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
    396                                           sink, options, edits, errorCode);
    397             ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
    398             prev = srcIndex;
    399         }
    400         if (c < 0) {
    401             break;
    402         }
    403         // slow path
    404         csc->cpStart = cpStart;
    405         csc->cpLimit = srcIndex;
    406         const UChar *s;
    407         c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);
    408         if (c >= 0) {
    409             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
    410                                           sink, options, edits, errorCode);
    411             appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
    412             prev = srcIndex;
    413         }
    414     }
    415     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
    416                                   sink, options, edits, errorCode);
    417 }
    418 
    419 }  // namespace
    420 
    421 #if !UCONFIG_NO_BREAK_ITERATION
    422 
    423 U_CFUNC void U_CALLCONV
    424 ucasemap_internalUTF8ToTitle(
    425         int32_t caseLocale, uint32_t options, BreakIterator *iter,
    426         const uint8_t *src, int32_t srcLength,
    427         ByteSink &sink, icu::Edits *edits,
    428         UErrorCode &errorCode) {
    429     if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
    430         return;
    431     }
    432 
    433     /* set up local variables */
    434     UCaseContext csc=UCASECONTEXT_INITIALIZER;
    435     csc.p=(void *)src;
    436     csc.limit=srcLength;
    437     int32_t prev=0;
    438     UBool isFirstIndex=TRUE;
    439 
    440     /* titlecasing loop */
    441     while(prev<srcLength) {
    442         /* find next index where to titlecase */
    443         int32_t index;
    444         if(isFirstIndex) {
    445             isFirstIndex=FALSE;
    446             index=iter->first();
    447         } else {
    448             index=iter->next();
    449         }
    450         if(index==UBRK_DONE || index>srcLength) {
    451             index=srcLength;
    452         }
    453 
    454         /*
    455          * Segment [prev..index[ into 3 parts:
    456          * a) skipped characters (copy as-is) [prev..titleStart[
    457          * b) first letter (titlecase)              [titleStart..titleLimit[
    458          * c) subsequent characters (lowercase)                 [titleLimit..index[
    459          */
    460         if(prev<index) {
    461             /* find and copy skipped characters [prev..titleStart[ */
    462             int32_t titleStart=prev;
    463             int32_t titleLimit=prev;
    464             UChar32 c;
    465             U8_NEXT(src, titleLimit, index, c);
    466             if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
    467                 // Adjust the titlecasing index to the next cased character,
    468                 // or to the next letter/number/symbol/private use.
    469                 // Stop with titleStart<titleLimit<=index
    470                 // if there is a character to be titlecased,
    471                 // or else stop with titleStart==titleLimit==index.
    472                 UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
    473                 while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
    474                     titleStart=titleLimit;
    475                     if(titleLimit==index) {
    476                         break;
    477                     }
    478                     U8_NEXT(src, titleLimit, index, c);
    479                 }
    480                 if (prev < titleStart) {
    481                     if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
    482                                                        sink, options, edits, errorCode)) {
    483                         return;
    484                     }
    485                 }
    486             }
    487 
    488             if(titleStart<titleLimit) {
    489                 /* titlecase c which is from [titleStart..titleLimit[ */
    490                 if(c>=0) {
    491                     csc.cpStart=titleStart;
    492                     csc.cpLimit=titleLimit;
    493                     const UChar *s;
    494                     c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
    495                     if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {
    496                         return;
    497                     }
    498                 } else {
    499                     // Malformed UTF-8.
    500                     if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
    501                                                        sink, options, edits, errorCode)) {
    502                         return;
    503                     }
    504                 }
    505 
    506                 /* Special case Dutch IJ titlecasing */
    507                 if (titleStart+1 < index &&
    508                         caseLocale == UCASE_LOC_DUTCH &&
    509                         (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
    510                     if (src[titleStart+1] == 0x006A) {
    511                         ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits);
    512                         titleLimit++;
    513                     } else if (src[titleStart+1] == 0x004A) {
    514                         // Keep the capital J from getting lowercased.
    515                         if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1,
    516                                                            sink, options, edits, errorCode)) {
    517                             return;
    518                         }
    519                         titleLimit++;
    520                     }
    521                 }
    522 
    523                 /* lowercase [titleLimit..index[ */
    524                 if(titleLimit<index) {
    525                     if((options&U_TITLECASE_NO_LOWERCASE)==0) {
    526                         /* Normal operation: Lowercase the rest of the word. */
    527                         toLower(caseLocale, options,
    528                                 src, &csc, titleLimit, index,
    529                                 sink, edits, errorCode);
    530                         if(U_FAILURE(errorCode)) {
    531                             return;
    532                         }
    533                     } else {
    534                         /* Optionally just copy the rest of the word unchanged. */
    535                         if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
    536                                                            sink, options, edits, errorCode)) {
    537                             return;
    538                         }
    539                     }
    540                 }
    541             }
    542         }
    543 
    544         prev=index;
    545     }
    546 }
    547 
    548 #endif
    549 
    550 U_NAMESPACE_BEGIN
    551 namespace GreekUpper {
    552 
    553 UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
    554     while (i < length) {
    555         UChar32 c;
    556         U8_NEXT(s, i, length, c);
    557         int32_t type = ucase_getTypeOrIgnorable(c);
    558         if ((type & UCASE_IGNORABLE) != 0) {
    559             // Case-ignorable, continue with the loop.
    560         } else if (type != UCASE_NONE) {
    561             return TRUE;  // Followed by cased letter.
    562         } else {
    563             return FALSE;  // Uncased and not case-ignorable.
    564         }
    565     }
    566     return FALSE;  // Not followed by cased letter.
    567 }
    568 
    569 // Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
    570 void toUpper(uint32_t options,
    571              const uint8_t *src, int32_t srcLength,
    572              ByteSink &sink, Edits *edits,
    573              UErrorCode &errorCode) {
    574     uint32_t state = 0;
    575     for (int32_t i = 0; i < srcLength;) {
    576         int32_t nextIndex = i;
    577         UChar32 c;
    578         U8_NEXT(src, nextIndex, srcLength, c);
    579         uint32_t nextState = 0;
    580         int32_t type = ucase_getTypeOrIgnorable(c);
    581         if ((type & UCASE_IGNORABLE) != 0) {
    582             // c is case-ignorable
    583             nextState |= (state & AFTER_CASED);
    584         } else if (type != UCASE_NONE) {
    585             // c is cased
    586             nextState |= AFTER_CASED;
    587         }
    588         uint32_t data = getLetterData(c);
    589         if (data > 0) {
    590             uint32_t upper = data & UPPER_MASK;
    591             // Add a dialytika to this iota or ypsilon vowel
    592             // if we removed a tonos from the previous vowel,
    593             // and that previous vowel did not also have (or gain) a dialytika.
    594             // Adding one only to the final vowel in a longer sequence
    595             // (which does not occur in normal writing) would require lookahead.
    596             // Set the same flag as for preserving an existing dialytika.
    597             if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
    598                     (upper == 0x399 || upper == 0x3A5)) {
    599                 data |= HAS_DIALYTIKA;
    600             }
    601             int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
    602             if ((data & HAS_YPOGEGRAMMENI) != 0) {
    603                 numYpogegrammeni = 1;
    604             }
    605             // Skip combining diacritics after this Greek letter.
    606             int32_t nextNextIndex = nextIndex;
    607             while (nextIndex < srcLength) {
    608                 UChar32 c2;
    609                 U8_NEXT(src, nextNextIndex, srcLength, c2);
    610                 uint32_t diacriticData = getDiacriticData(c2);
    611                 if (diacriticData != 0) {
    612                     data |= diacriticData;
    613                     if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
    614                         ++numYpogegrammeni;
    615                     }
    616                     nextIndex = nextNextIndex;
    617                 } else {
    618                     break;  // not a Greek diacritic
    619                 }
    620             }
    621             if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
    622                 nextState |= AFTER_VOWEL_WITH_ACCENT;
    623             }
    624             // Map according to Greek rules.
    625             UBool addTonos = FALSE;
    626             if (upper == 0x397 &&
    627                     (data & HAS_ACCENT) != 0 &&
    628                     numYpogegrammeni == 0 &&
    629                     (state & AFTER_CASED) == 0 &&
    630                     !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
    631                 // Keep disjunctive "or" with (only) a tonos.
    632                 // We use the same "word boundary" conditions as for the Final_Sigma test.
    633                 if (i == nextIndex) {
    634                     upper = 0x389;  // Preserve the precomposed form.
    635                 } else {
    636                     addTonos = TRUE;
    637                 }
    638             } else if ((data & HAS_DIALYTIKA) != 0) {
    639                 // Preserve a vowel with dialytika in precomposed form if it exists.
    640                 if (upper == 0x399) {
    641                     upper = 0x3AA;
    642                     data &= ~HAS_EITHER_DIALYTIKA;
    643                 } else if (upper == 0x3A5) {
    644                     upper = 0x3AB;
    645                     data &= ~HAS_EITHER_DIALYTIKA;
    646                 }
    647             }
    648 
    649             UBool change;
    650             if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
    651                 change = TRUE;  // common, simple usage
    652             } else {
    653                 // Find out first whether we are changing the text.
    654                 U_ASSERT(0x370 <= upper && upper <= 0x3ff);  // 2-byte UTF-8, main Greek block
    655                 change = (i + 2) > nextIndex ||
    656                         src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) ||
    657                         numYpogegrammeni > 0;
    658                 int32_t i2 = i + 2;
    659                 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
    660                     change |= (i2 + 2) > nextIndex ||
    661                             src[i2] != (uint8_t)u8"\u0308"[0] ||
    662                             src[i2 + 1] != (uint8_t)u8"\u0308"[1];
    663                     i2 += 2;
    664                 }
    665                 if (addTonos) {
    666                     change |= (i2 + 2) > nextIndex ||
    667                             src[i2] != (uint8_t)u8"\u0301"[0] ||
    668                             src[i2 + 1] != (uint8_t)u8"\u0301"[1];
    669                     i2 += 2;
    670                 }
    671                 int32_t oldLength = nextIndex - i;
    672                 int32_t newLength = (i2 - i) + numYpogegrammeni * 2;  // 2 bytes per U+0399
    673                 change |= oldLength != newLength;
    674                 if (change) {
    675                     if (edits != NULL) {
    676                         edits->addReplace(oldLength, newLength);
    677                     }
    678                 } else {
    679                     if (edits != NULL) {
    680                         edits->addUnchanged(oldLength);
    681                     }
    682                     // Write unchanged text?
    683                     change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
    684                 }
    685             }
    686 
    687             if (change) {
    688                 ByteSinkUtil::appendTwoBytes(upper, sink);
    689                 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
    690                     sink.Append(u8"\u0308", 2);  // restore or add a dialytika
    691                 }
    692                 if (addTonos) {
    693                     sink.Append(u8"\u0301", 2);
    694                 }
    695                 while (numYpogegrammeni > 0) {
    696                     sink.Append(u8"\u0399", 2);
    697                     --numYpogegrammeni;
    698                 }
    699             }
    700         } else if(c>=0) {
    701             const UChar *s;
    702             c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
    703             if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
    704                 return;
    705             }
    706         } else {
    707             // Malformed UTF-8.
    708             if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
    709                                                sink, options, edits, errorCode)) {
    710                 return;
    711             }
    712         }
    713         i = nextIndex;
    714         state = nextState;
    715     }
    716 }
    717 
    718 }  // namespace GreekUpper
    719 U_NAMESPACE_END
    720 
    721 static void U_CALLCONV
    722 ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
    723                              const uint8_t *src, int32_t srcLength,
    724                              icu::ByteSink &sink, icu::Edits *edits,
    725                              UErrorCode &errorCode) {
    726     UCaseContext csc=UCASECONTEXT_INITIALIZER;
    727     csc.p=(void *)src;
    728     csc.limit=srcLength;
    729     toLower(
    730         caseLocale, options,
    731         src, &csc, 0, srcLength,
    732         sink, edits, errorCode);
    733 }
    734 
    735 static void U_CALLCONV
    736 ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
    737                              const uint8_t *src, int32_t srcLength,
    738                              icu::ByteSink &sink, icu::Edits *edits,
    739                              UErrorCode &errorCode) {
    740     if (caseLocale == UCASE_LOC_GREEK) {
    741         GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
    742     } else {
    743         UCaseContext csc=UCASECONTEXT_INITIALIZER;
    744         csc.p=(void *)src;
    745         csc.limit=srcLength;
    746         toUpper(
    747             caseLocale, options,
    748             src, &csc, srcLength,
    749             sink, edits, errorCode);
    750     }
    751 }
    752 
    753 static void U_CALLCONV
    754 ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
    755                           const uint8_t *src, int32_t srcLength,
    756                           icu::ByteSink &sink, icu::Edits *edits,
    757                           UErrorCode &errorCode) {
    758     toLower(
    759         -1, options,
    760         src, nullptr, 0, srcLength,
    761         sink, edits, errorCode);
    762 }
    763 
    764 void
    765 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
    766                  const char *src, int32_t srcLength,
    767                  UTF8CaseMapper *stringCaseMapper,
    768                  icu::ByteSink &sink, icu::Edits *edits,
    769                  UErrorCode &errorCode) {
    770     /* check argument values */
    771     if (U_FAILURE(errorCode)) {
    772         return;
    773     }
    774     if ((src == nullptr && srcLength != 0) || srcLength < -1) {
    775         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    776         return;
    777     }
    778 
    779     // Get the string length.
    780     if (srcLength == -1) {
    781         srcLength = (int32_t)uprv_strlen((const char *)src);
    782     }
    783 
    784     if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
    785         edits->reset();
    786     }
    787     stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
    788                      (const uint8_t *)src, srcLength, sink, edits, errorCode);
    789     sink.Flush();
    790     if (U_SUCCESS(errorCode)) {
    791         if (edits != nullptr) {
    792             edits->copyErrorTo(errorCode);
    793         }
    794     }
    795 }
    796 
    797 int32_t
    798 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
    799                  char *dest, int32_t destCapacity,
    800                  const char *src, int32_t srcLength,
    801                  UTF8CaseMapper *stringCaseMapper,
    802                  icu::Edits *edits,
    803                  UErrorCode &errorCode) {
    804     /* check argument values */
    805     if(U_FAILURE(errorCode)) {
    806         return 0;
    807     }
    808     if( destCapacity<0 ||
    809         (dest==NULL && destCapacity>0) ||
    810         (src==NULL && srcLength!=0) || srcLength<-1
    811     ) {
    812         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    813         return 0;
    814     }
    815 
    816     /* get the string length */
    817     if(srcLength==-1) {
    818         srcLength=(int32_t)uprv_strlen((const char *)src);
    819     }
    820 
    821     /* check for overlapping source and destination */
    822     if( dest!=NULL &&
    823         ((src>=dest && src<(dest+destCapacity)) ||
    824          (dest>=src && dest<(src+srcLength)))
    825     ) {
    826         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    827         return 0;
    828     }
    829 
    830     CheckedArrayByteSink sink(dest, destCapacity);
    831     if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
    832         edits->reset();
    833     }
    834     stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
    835                      (const uint8_t *)src, srcLength, sink, edits, errorCode);
    836     sink.Flush();
    837     if (U_SUCCESS(errorCode)) {
    838         if (sink.Overflowed()) {
    839             errorCode = U_BUFFER_OVERFLOW_ERROR;
    840         } else if (edits != nullptr) {
    841             edits->copyErrorTo(errorCode);
    842         }
    843     }
    844     return u_terminateChars(dest, destCapacity, sink.NumberOfBytesAppended(), &errorCode);
    845 }
    846 
    847 /* public API functions */
    848 
    849 U_CAPI int32_t U_EXPORT2
    850 ucasemap_utf8ToLower(const UCaseMap *csm,
    851                      char *dest, int32_t destCapacity,
    852                      const char *src, int32_t srcLength,
    853                      UErrorCode *pErrorCode) {
    854     return ucasemap_mapUTF8(
    855         csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
    856         dest, destCapacity,
    857         src, srcLength,
    858         ucasemap_internalUTF8ToLower, NULL, *pErrorCode);
    859 }
    860 
    861 U_CAPI int32_t U_EXPORT2
    862 ucasemap_utf8ToUpper(const UCaseMap *csm,
    863                      char *dest, int32_t destCapacity,
    864                      const char *src, int32_t srcLength,
    865                      UErrorCode *pErrorCode) {
    866     return ucasemap_mapUTF8(
    867         csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
    868         dest, destCapacity,
    869         src, srcLength,
    870         ucasemap_internalUTF8ToUpper, NULL, *pErrorCode);
    871 }
    872 
    873 U_CAPI int32_t U_EXPORT2
    874 ucasemap_utf8FoldCase(const UCaseMap *csm,
    875                       char *dest, int32_t destCapacity,
    876                       const char *src, int32_t srcLength,
    877                       UErrorCode *pErrorCode) {
    878     return ucasemap_mapUTF8(
    879         UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
    880         dest, destCapacity,
    881         src, srcLength,
    882         ucasemap_internalUTF8Fold, NULL, *pErrorCode);
    883 }
    884 
    885 U_NAMESPACE_BEGIN
    886 
    887 void CaseMap::utf8ToLower(
    888         const char *locale, uint32_t options,
    889         StringPiece src, ByteSink &sink, Edits *edits,
    890         UErrorCode &errorCode) {
    891     ucasemap_mapUTF8(
    892         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
    893         src.data(), src.length(),
    894         ucasemap_internalUTF8ToLower, sink, edits, errorCode);
    895 }
    896 
    897 void CaseMap::utf8ToUpper(
    898         const char *locale, uint32_t options,
    899         StringPiece src, ByteSink &sink, Edits *edits,
    900         UErrorCode &errorCode) {
    901     ucasemap_mapUTF8(
    902         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
    903         src.data(), src.length(),
    904         ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
    905 }
    906 
    907 void CaseMap::utf8Fold(
    908         uint32_t options,
    909         StringPiece src, ByteSink &sink, Edits *edits,
    910         UErrorCode &errorCode) {
    911     ucasemap_mapUTF8(
    912         UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
    913         src.data(), src.length(),
    914         ucasemap_internalUTF8Fold, sink, edits, errorCode);
    915 }
    916 
    917 int32_t CaseMap::utf8ToLower(
    918         const char *locale, uint32_t options,
    919         const char *src, int32_t srcLength,
    920         char *dest, int32_t destCapacity, Edits *edits,
    921         UErrorCode &errorCode) {
    922     return ucasemap_mapUTF8(
    923         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
    924         dest, destCapacity,
    925         src, srcLength,
    926         ucasemap_internalUTF8ToLower, edits, errorCode);
    927 }
    928 
    929 int32_t CaseMap::utf8ToUpper(
    930         const char *locale, uint32_t options,
    931         const char *src, int32_t srcLength,
    932         char *dest, int32_t destCapacity, Edits *edits,
    933         UErrorCode &errorCode) {
    934     return ucasemap_mapUTF8(
    935         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
    936         dest, destCapacity,
    937         src, srcLength,
    938         ucasemap_internalUTF8ToUpper, edits, errorCode);
    939 }
    940 
    941 int32_t CaseMap::utf8Fold(
    942         uint32_t options,
    943         const char *src, int32_t srcLength,
    944         char *dest, int32_t destCapacity, Edits *edits,
    945         UErrorCode &errorCode) {
    946     return ucasemap_mapUTF8(
    947         UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
    948         dest, destCapacity,
    949         src, srcLength,
    950         ucasemap_internalUTF8Fold, edits, errorCode);
    951 }
    952 
    953 U_NAMESPACE_END
    954