Home | History | Annotate | Download | only in common
// © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2001-2015, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  ustrcase.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2002feb20
     16 *   created by: Markus W. Scherer
     17 *
     18 *   Implementation file for string casing C API functions.
     19 *   Uses functions from uchar.c for basic functionality that requires access
     20 *   to the Unicode Character Database (uprops.dat).
     21 */
     22 
     23 #include "unicode/utypes.h"
     24 #include "unicode/brkiter.h"
     25 #include "unicode/casemap.h"
     26 #include "unicode/edits.h"
     27 #include "unicode/stringoptions.h"
     28 #include "unicode/ustring.h"
     29 #include "unicode/ucasemap.h"
     30 #include "unicode/ubrk.h"
     31 #include "unicode/utf.h"
     32 #include "unicode/utf16.h"
     33 #include "cmemory.h"
     34 #include "ucase.h"
     35 #include "ucasemap_imp.h"
     36 #include "ustr_imp.h"
     37 #include "uassert.h"
     38 
     39 U_NAMESPACE_BEGIN
     40 
     41 namespace {
     42 
     43 int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity,
     44                                    Edits *edits, UErrorCode &errorCode) {
     45     if (U_SUCCESS(errorCode)) {
     46         if (destIndex > destCapacity) {
     47             errorCode = U_BUFFER_OVERFLOW_ERROR;
     48         } else if (edits != NULL) {
     49             edits->copyErrorTo(errorCode);
     50         }
     51     }
     52     return destIndex;
     53 }
     54 
     55 }  // namespace
     56 
     57 U_NAMESPACE_END
     58 
     59 U_NAMESPACE_USE
     60 
     61 /* string casing ------------------------------------------------------------ */
     62 
     63 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
     64 static inline int32_t
     65 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
     66              int32_t result, const UChar *s,
     67              int32_t cpLength, uint32_t options, icu::Edits *edits) {
     68     UChar32 c;
     69     int32_t length;
     70 
     71     /* decode the result */
     72     if(result<0) {
     73         /* (not) original code point */
     74         if(edits!=NULL) {
     75             edits->addUnchanged(cpLength);
     76         }
     77         if(options & U_OMIT_UNCHANGED_TEXT) {
     78             return destIndex;
     79         }
     80         c=~result;
     81         if(destIndex<destCapacity && c<=0xffff) {  // BMP slightly-fastpath
     82             dest[destIndex++]=(UChar)c;
     83             return destIndex;
     84         }
     85         length=cpLength;
     86     } else {
     87         if(result<=UCASE_MAX_STRING_LENGTH) {
     88             c=U_SENTINEL;
     89             length=result;
     90         } else if(destIndex<destCapacity && result<=0xffff) {  // BMP slightly-fastpath
     91             dest[destIndex++]=(UChar)result;
     92             if(edits!=NULL) {
     93                 edits->addReplace(cpLength, 1);
     94             }
     95             return destIndex;
     96         } else {
     97             c=result;
     98             length=U16_LENGTH(c);
     99         }
    100         if(edits!=NULL) {
    101             edits->addReplace(cpLength, length);
    102         }
    103     }
    104     if(length>(INT32_MAX-destIndex)) {
    105         return -1;  // integer overflow
    106     }
    107 
    108     if(destIndex<destCapacity) {
    109         /* append the result */
    110         if(c>=0) {
    111             /* code point */
    112             UBool isError=FALSE;
    113             U16_APPEND(dest, destIndex, destCapacity, c, isError);
    114             if(isError) {
    115                 /* overflow, nothing written */
    116                 destIndex+=length;
    117             }
    118         } else {
    119             /* string */
    120             if((destIndex+length)<=destCapacity) {
    121                 while(length>0) {
    122                     dest[destIndex++]=*s++;
    123                     --length;
    124                 }
    125             } else {
    126                 /* overflow */
    127                 destIndex+=length;
    128             }
    129         }
    130     } else {
    131         /* preflight */
    132         destIndex+=length;
    133     }
    134     return destIndex;
    135 }
    136 
    137 static inline int32_t
    138 appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
    139     if(destIndex<destCapacity) {
    140         dest[destIndex]=c;
    141     } else if(destIndex==INT32_MAX) {
    142         return -1;  // integer overflow
    143     }
    144     return destIndex+1;
    145 }
    146 
    147 static inline int32_t
    148 appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
    149                 const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) {
    150     if(length>0) {
    151         if(edits!=NULL) {
    152             edits->addUnchanged(length);
    153         }
    154         if(options & U_OMIT_UNCHANGED_TEXT) {
    155             return destIndex;
    156         }
    157         if(length>(INT32_MAX-destIndex)) {
    158             return -1;  // integer overflow
    159         }
    160         if((destIndex+length)<=destCapacity) {
    161             u_memcpy(dest+destIndex, s, length);
    162         }
    163         destIndex+=length;
    164     }
    165     return destIndex;
    166 }
    167 
    168 static UChar32 U_CALLCONV
    169 utf16_caseContextIterator(void *context, int8_t dir) {
    170     UCaseContext *csc=(UCaseContext *)context;
    171     UChar32 c;
    172 
    173     if(dir<0) {
    174         /* reset for backward iteration */
    175         csc->index=csc->cpStart;
    176         csc->dir=dir;
    177     } else if(dir>0) {
    178         /* reset for forward iteration */
    179         csc->index=csc->cpLimit;
    180         csc->dir=dir;
    181     } else {
    182         /* continue current iteration direction */
    183         dir=csc->dir;
    184     }
    185 
    186     if(dir<0) {
    187         if(csc->start<csc->index) {
    188             U16_PREV((const UChar *)csc->p, csc->start, csc->index, c);
    189             return c;
    190         }
    191     } else {
    192         if(csc->index<csc->limit) {
    193             U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c);
    194             return c;
    195         }
    196     }
    197     return U_SENTINEL;
    198 }
    199 
    200 /*
    201  * Case-maps [srcStart..srcLimit[ but takes
    202  * context [0..srcLength[ into account.
    203  */
    204 static int32_t
    205 _caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
    206          UChar *dest, int32_t destCapacity,
    207          const UChar *src, UCaseContext *csc,
    208          int32_t srcStart, int32_t srcLimit,
    209          icu::Edits *edits,
    210          UErrorCode &errorCode) {
    211     /* case mapping loop */
    212     int32_t srcIndex=srcStart;
    213     int32_t destIndex=0;
    214     while(srcIndex<srcLimit) {
    215         int32_t cpStart;
    216         csc->cpStart=cpStart=srcIndex;
    217         UChar32 c;
    218         U16_NEXT(src, srcIndex, srcLimit, c);
    219         csc->cpLimit=srcIndex;
    220         const UChar *s;
    221         c=map(c, utf16_caseContextIterator, csc, &s, caseLocale);
    222         destIndex = appendResult(dest, destIndex, destCapacity, c, s,
    223                                  srcIndex - cpStart, options, edits);
    224         if (destIndex < 0) {
    225             errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    226             return 0;
    227         }
    228     }
    229 
    230     return destIndex;
    231 }
    232 
    233 #if !UCONFIG_NO_BREAK_ITERATION
    234 
    235 U_CFUNC int32_t U_CALLCONV
    236 ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter,
    237                          UChar *dest, int32_t destCapacity,
    238                          const UChar *src, int32_t srcLength,
    239                          icu::Edits *edits,
    240                          UErrorCode &errorCode) {
    241     if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
    242         return 0;
    243     }
    244 
    245     /* set up local variables */
    246     UCaseContext csc=UCASECONTEXT_INITIALIZER;
    247     csc.p=(void *)src;
    248     csc.limit=srcLength;
    249     int32_t destIndex=0;
    250     int32_t prev=0;
    251     UBool isFirstIndex=TRUE;
    252 
    253     /* titlecasing loop */
    254     while(prev<srcLength) {
    255         /* find next index where to titlecase */
    256         int32_t index;
    257         if(isFirstIndex) {
    258             isFirstIndex=FALSE;
    259             index=iter->first();
    260         } else {
    261             index=iter->next();
    262         }
    263         if(index==UBRK_DONE || index>srcLength) {
    264             index=srcLength;
    265         }
    266 
    267         /*
    268          * Segment [prev..index[ into 3 parts:
    269          * a) skipped characters (copy as-is) [prev..titleStart[
    270          * b) first letter (titlecase)              [titleStart..titleLimit[
    271          * c) subsequent characters (lowercase)                 [titleLimit..index[
    272          */
    273         if(prev<index) {
    274             // Find and copy skipped characters [prev..titleStart[
    275             int32_t titleStart=prev;
    276             int32_t titleLimit=prev;
    277             UChar32 c;
    278             U16_NEXT(src, titleLimit, index, c);
    279             if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
    280                 // Adjust the titlecasing index to the next cased character,
    281                 // or to the next letter/number/symbol/private use.
    282                 // Stop with titleStart<titleLimit<=index
    283                 // if there is a character to be titlecased,
    284                 // or else stop with titleStart==titleLimit==index.
    285                 UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
    286                 while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
    287                     titleStart=titleLimit;
    288                     if(titleLimit==index) {
    289                         break;
    290                     }
    291                     U16_NEXT(src, titleLimit, index, c);
    292                 }
    293                 if (prev < titleStart) {
    294                     destIndex=appendUnchanged(dest, destIndex, destCapacity,
    295                                               src+prev, titleStart-prev, options, edits);
    296                     if(destIndex<0) {
    297                         errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    298                         return 0;
    299                     }
    300                 }
    301             }
    302 
    303             if(titleStart<titleLimit) {
    304                 /* titlecase c which is from [titleStart..titleLimit[ */
    305                 csc.cpStart=titleStart;
    306                 csc.cpLimit=titleLimit;
    307                 const UChar *s;
    308                 c=ucase_toFullTitle(c, utf16_caseContextIterator, &csc, &s, caseLocale);
    309                 destIndex=appendResult(dest, destIndex, destCapacity, c, s,
    310                                        titleLimit-titleStart, options, edits);
    311                 if(destIndex<0) {
    312                     errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    313                     return 0;
    314                 }
    315 
    316                 /* Special case Dutch IJ titlecasing */
    317                 if (titleStart+1 < index &&
    318                         caseLocale == UCASE_LOC_DUTCH &&
    319                         (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
    320                     if (src[titleStart+1] == 0x006A) {
    321                         destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
    322                         if(destIndex<0) {
    323                             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    324                             return 0;
    325                         }
    326                         if(edits!=NULL) {
    327                             edits->addReplace(1, 1);
    328                         }
    329                         titleLimit++;
    330                     } else if (src[titleStart+1] == 0x004A) {
    331                         // Keep the capital J from getting lowercased.
    332                         destIndex=appendUnchanged(dest, destIndex, destCapacity,
    333                                                   src+titleStart+1, 1, options, edits);
    334                         if(destIndex<0) {
    335                             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    336                             return 0;
    337                         }
    338                         titleLimit++;
    339                     }
    340                 }
    341 
    342                 /* lowercase [titleLimit..index[ */
    343                 if(titleLimit<index) {
    344                     if((options&U_TITLECASE_NO_LOWERCASE)==0) {
    345                         /* Normal operation: Lowercase the rest of the word. */
    346                         destIndex+=
    347                             _caseMap(
    348                                 caseLocale, options, ucase_toFullLower,
    349                                 dest+destIndex, destCapacity-destIndex,
    350                                 src, &csc,
    351                                 titleLimit, index,
    352                                 edits, errorCode);
    353                         if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
    354                             errorCode=U_ZERO_ERROR;
    355                         }
    356                         if(U_FAILURE(errorCode)) {
    357                             return destIndex;
    358                         }
    359                     } else {
    360                         /* Optionally just copy the rest of the word unchanged. */
    361                         destIndex=appendUnchanged(dest, destIndex, destCapacity,
    362                                                   src+titleLimit, index-titleLimit, options, edits);
    363                         if(destIndex<0) {
    364                             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    365                             return 0;
    366                         }
    367                     }
    368                 }
    369             }
    370         }
    371 
    372         prev=index;
    373     }
    374 
    375     return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
    376 }
    377 
    378 #endif  // !UCONFIG_NO_BREAK_ITERATION
    379 
    380 U_NAMESPACE_BEGIN
    381 namespace GreekUpper {
    382 
    383 // Data generated by prototype code, see
    384 // http://site.icu-project.org/design/case/greek-upper
    385 // TODO: Move this data into ucase.icu.
// Per-code point data for U+0370..03FF, indexed by (c - 0x370);
// see getLetterData().
// Each non-zero entry is a code point value in the Greek block, optionally
// combined with HAS_* flag bits (defined outside this file section).
// Unassigned or irrelevant code points have the value 0.
// The "U+xxxx..xxxx" markers below give the code points for the 16 entries
// that follow each marker.
static const uint16_t data0370[] = {
    // U+0370..03FF
    // U+0370..037F
    0x0370,
    0x0370,
    0x0372,
    0x0372,
    0,
    0,
    0x0376,
    0x0376,
    0,
    0,
    0x037A,
    0x03FD,
    0x03FE,
    0x03FF,
    0,
    0x037F,
    // U+0380..038F
    0,
    0,
    0,
    0,
    0,
    0,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    0,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    // U+0390..039F
    0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL,
    0x0392,
    0x0393,
    0x0394,
    0x0395 | HAS_VOWEL,
    0x0396,
    0x0397 | HAS_VOWEL,
    0x0398,
    0x0399 | HAS_VOWEL,
    0x039A,
    0x039B,
    0x039C,
    0x039D,
    0x039E,
    0x039F | HAS_VOWEL,
    // U+03A0..03AF
    0x03A0,
    0x03A1,
    0,
    0x03A3,
    0x03A4,
    0x03A5 | HAS_VOWEL,
    0x03A6,
    0x03A7,
    0x03A8,
    0x03A9 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL | HAS_ACCENT,
    0x0395 | HAS_VOWEL | HAS_ACCENT,
    0x0397 | HAS_VOWEL | HAS_ACCENT,
    0x0399 | HAS_VOWEL | HAS_ACCENT,
    // U+03B0..03BF
    0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    0x0391 | HAS_VOWEL,
    0x0392,
    0x0393,
    0x0394,
    0x0395 | HAS_VOWEL,
    0x0396,
    0x0397 | HAS_VOWEL,
    0x0398,
    0x0399 | HAS_VOWEL,
    0x039A,
    0x039B,
    0x039C,
    0x039D,
    0x039E,
    0x039F | HAS_VOWEL,
    // U+03C0..03CF
    0x03A0,
    0x03A1,
    0x03A3,
    0x03A3,
    0x03A4,
    0x03A5 | HAS_VOWEL,
    0x03A6,
    0x03A7,
    0x03A8,
    0x03A9 | HAS_VOWEL,
    0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
    0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
    0x039F | HAS_VOWEL | HAS_ACCENT,
    0x03A5 | HAS_VOWEL | HAS_ACCENT,
    0x03A9 | HAS_VOWEL | HAS_ACCENT,
    0x03CF,
    // U+03D0..03DF
    0x0392,
    0x0398,
    0x03D2,
    0x03D2 | HAS_ACCENT,
    0x03D2 | HAS_DIALYTIKA,
    0x03A6,
    0x03A0,
    0x03CF,
    0x03D8,
    0x03D8,
    0x03DA,
    0x03DA,
    0x03DC,
    0x03DC,
    0x03DE,
    0x03DE,
    // U+03E0..03EF
    0x03E0,
    0x03E0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    // U+03F0..03FF
    0x039A,
    0x03A1,
    0x03F9,
    0x037F,
    0x03F4,
    0x0395 | HAS_VOWEL,
    0,
    0x03F7,
    0x03F7,
    0x03F9,
    0x03FA,
    0x03FA,
    0x03FC,
    0x03FD,
    0x03FE,
    0x03FF,
};
    533 
    534 static const uint16_t data1F00[] = {
    535     // U+1F00..1FFF
    536     0x0391 | HAS_VOWEL,
    537     0x0391 | HAS_VOWEL,
    538     0x0391 | HAS_VOWEL | HAS_ACCENT,
    539     0x0391 | HAS_VOWEL | HAS_ACCENT,
    540     0x0391 | HAS_VOWEL | HAS_ACCENT,
    541     0x0391 | HAS_VOWEL | HAS_ACCENT,
    542     0x0391 | HAS_VOWEL | HAS_ACCENT,
    543     0x0391 | HAS_VOWEL | HAS_ACCENT,
    544     0x0391 | HAS_VOWEL,
    545     0x0391 | HAS_VOWEL,
    546     0x0391 | HAS_VOWEL | HAS_ACCENT,
    547     0x0391 | HAS_VOWEL | HAS_ACCENT,
    548     0x0391 | HAS_VOWEL | HAS_ACCENT,
    549     0x0391 | HAS_VOWEL | HAS_ACCENT,
    550     0x0391 | HAS_VOWEL | HAS_ACCENT,
    551     0x0391 | HAS_VOWEL | HAS_ACCENT,
    552     0x0395 | HAS_VOWEL,
    553     0x0395 | HAS_VOWEL,
    554     0x0395 | HAS_VOWEL | HAS_ACCENT,
    555     0x0395 | HAS_VOWEL | HAS_ACCENT,
    556     0x0395 | HAS_VOWEL | HAS_ACCENT,
    557     0x0395 | HAS_VOWEL | HAS_ACCENT,
    558     0,
    559     0,
    560     0x0395 | HAS_VOWEL,
    561     0x0395 | HAS_VOWEL,
    562     0x0395 | HAS_VOWEL | HAS_ACCENT,
    563     0x0395 | HAS_VOWEL | HAS_ACCENT,
    564     0x0395 | HAS_VOWEL | HAS_ACCENT,
    565     0x0395 | HAS_VOWEL | HAS_ACCENT,
    566     0,
    567     0,
    568     0x0397 | HAS_VOWEL,
    569     0x0397 | HAS_VOWEL,
    570     0x0397 | HAS_VOWEL | HAS_ACCENT,
    571     0x0397 | HAS_VOWEL | HAS_ACCENT,
    572     0x0397 | HAS_VOWEL | HAS_ACCENT,
    573     0x0397 | HAS_VOWEL | HAS_ACCENT,
    574     0x0397 | HAS_VOWEL | HAS_ACCENT,
    575     0x0397 | HAS_VOWEL | HAS_ACCENT,
    576     0x0397 | HAS_VOWEL,
    577     0x0397 | HAS_VOWEL,
    578     0x0397 | HAS_VOWEL | HAS_ACCENT,
    579     0x0397 | HAS_VOWEL | HAS_ACCENT,
    580     0x0397 | HAS_VOWEL | HAS_ACCENT,
    581     0x0397 | HAS_VOWEL | HAS_ACCENT,
    582     0x0397 | HAS_VOWEL | HAS_ACCENT,
    583     0x0397 | HAS_VOWEL | HAS_ACCENT,
    584     0x0399 | HAS_VOWEL,
    585     0x0399 | HAS_VOWEL,
    586     0x0399 | HAS_VOWEL | HAS_ACCENT,
    587     0x0399 | HAS_VOWEL | HAS_ACCENT,
    588     0x0399 | HAS_VOWEL | HAS_ACCENT,
    589     0x0399 | HAS_VOWEL | HAS_ACCENT,
    590     0x0399 | HAS_VOWEL | HAS_ACCENT,
    591     0x0399 | HAS_VOWEL | HAS_ACCENT,
    592     0x0399 | HAS_VOWEL,
    593     0x0399 | HAS_VOWEL,
    594     0x0399 | HAS_VOWEL | HAS_ACCENT,
    595     0x0399 | HAS_VOWEL | HAS_ACCENT,
    596     0x0399 | HAS_VOWEL | HAS_ACCENT,
    597     0x0399 | HAS_VOWEL | HAS_ACCENT,
    598     0x0399 | HAS_VOWEL | HAS_ACCENT,
    599     0x0399 | HAS_VOWEL | HAS_ACCENT,
    600     0x039F | HAS_VOWEL,
    601     0x039F | HAS_VOWEL,
    602     0x039F | HAS_VOWEL | HAS_ACCENT,
    603     0x039F | HAS_VOWEL | HAS_ACCENT,
    604     0x039F | HAS_VOWEL | HAS_ACCENT,
    605     0x039F | HAS_VOWEL | HAS_ACCENT,
    606     0,
    607     0,
    608     0x039F | HAS_VOWEL,
    609     0x039F | HAS_VOWEL,
    610     0x039F | HAS_VOWEL | HAS_ACCENT,
    611     0x039F | HAS_VOWEL | HAS_ACCENT,
    612     0x039F | HAS_VOWEL | HAS_ACCENT,
    613     0x039F | HAS_VOWEL | HAS_ACCENT,
    614     0,
    615     0,
    616     0x03A5 | HAS_VOWEL,
    617     0x03A5 | HAS_VOWEL,
    618     0x03A5 | HAS_VOWEL | HAS_ACCENT,
    619     0x03A5 | HAS_VOWEL | HAS_ACCENT,
    620     0x03A5 | HAS_VOWEL | HAS_ACCENT,
    621     0x03A5 | HAS_VOWEL | HAS_ACCENT,
    622     0x03A5 | HAS_VOWEL | HAS_ACCENT,
    623     0x03A5 | HAS_VOWEL | HAS_ACCENT,
    624     0,
    625     0x03A5 | HAS_VOWEL,
    626     0,
    627     0x03A5 | HAS_VOWEL | HAS_ACCENT,
    628     0,
    629     0x03A5 | HAS_VOWEL | HAS_ACCENT,
    630     0,
    631     0x03A5 | HAS_VOWEL | HAS_ACCENT,
    632     0x03A9 | HAS_VOWEL,
    633     0x03A9 | HAS_VOWEL,
    634     0x03A9 | HAS_VOWEL | HAS_ACCENT,
    635     0x03A9 | HAS_VOWEL | HAS_ACCENT,
    636     0x03A9 | HAS_VOWEL | HAS_ACCENT,
    637     0x03A9 | HAS_VOWEL | HAS_ACCENT,
    638     0x03A9 | HAS_VOWEL | HAS_ACCENT,
    639     0x03A9 | HAS_VOWEL | HAS_ACCENT,
    640     0x03A9 | HAS_VOWEL,
    641     0x03A9 | HAS_VOWEL,
    642     0x03A9 | HAS_VOWEL | HAS_ACCENT,
    643     0x03A9 | HAS_VOWEL | HAS_ACCENT,
    644     0x03A9 | HAS_VOWEL | HAS_ACCENT,
    645     0x03A9 | HAS_VOWEL | HAS_ACCENT,
    646     0x03A9 | HAS_VOWEL | HAS_ACCENT,
    647     0x03A9 | HAS_VOWEL | HAS_ACCENT,
    648     0x0391 | HAS_VOWEL | HAS_ACCENT,
    649     0x0391 | HAS_VOWEL | HAS_ACCENT,
    650     0x0395 | HAS_VOWEL | HAS_ACCENT,
    651     0x0395 | HAS_VOWEL | HAS_ACCENT,
    652     0x0397 | HAS_VOWEL | HAS_ACCENT,
    653     0x0397 | HAS_VOWEL | HAS_ACCENT,
    654     0x0399 | HAS_VOWEL | HAS_ACCENT,
    655     0x0399 | HAS_VOWEL | HAS_ACCENT,
    656     0x039F | HAS_VOWEL | HAS_ACCENT,
    657     0x039F | HAS_VOWEL | HAS_ACCENT,
    658     0x03A5 | HAS_VOWEL | HAS_ACCENT,
    659     0x03A5 | HAS_VOWEL | HAS_ACCENT,
    660     0x03A9 | HAS_VOWEL | HAS_ACCENT,
    661     0x03A9 | HAS_VOWEL | HAS_ACCENT,
    662     0,
    663     0,
    664     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    665     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    666     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    667     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    668     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    669     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    670     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    671     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    672     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    673     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    674     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    675     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    676     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    677     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    678     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    679     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    680     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    681     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    682     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    683     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    684     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    685     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    686     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    687     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    688     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    689     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    690     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    691     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    692     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    693     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    694     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    695     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    696     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    697     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    698     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    699     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    700     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    701     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    702     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    703     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    704     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    705     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    706     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    707     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    708     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    709     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    710     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    711     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    712     0x0391 | HAS_VOWEL,
    713     0x0391 | HAS_VOWEL,
    714     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    715     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    716     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    717     0,
    718     0x0391 | HAS_VOWEL | HAS_ACCENT,
    719     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    720     0x0391 | HAS_VOWEL,
    721     0x0391 | HAS_VOWEL,
    722     0x0391 | HAS_VOWEL | HAS_ACCENT,
    723     0x0391 | HAS_VOWEL | HAS_ACCENT,
    724     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    725     0,
    726     0x0399 | HAS_VOWEL,
    727     0,
    728     0,
    729     0,
    730     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    731     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    732     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    733     0,
    734     0x0397 | HAS_VOWEL | HAS_ACCENT,
    735     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    736     0x0395 | HAS_VOWEL | HAS_ACCENT,
    737     0x0395 | HAS_VOWEL | HAS_ACCENT,
    738     0x0397 | HAS_VOWEL | HAS_ACCENT,
    739     0x0397 | HAS_VOWEL | HAS_ACCENT,
    740     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    741     0,
    742     0,
    743     0,
    744     0x0399 | HAS_VOWEL,
    745     0x0399 | HAS_VOWEL,
    746     0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    747     0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    748     0,
    749     0,
    750     0x0399 | HAS_VOWEL | HAS_ACCENT,
    751     0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    752     0x0399 | HAS_VOWEL,
    753     0x0399 | HAS_VOWEL,
    754     0x0399 | HAS_VOWEL | HAS_ACCENT,
    755     0x0399 | HAS_VOWEL | HAS_ACCENT,
    756     0,
    757     0,
    758     0,
    759     0,
    760     0x03A5 | HAS_VOWEL,
    761     0x03A5 | HAS_VOWEL,
    762     0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    763     0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    764     0x03A1,
    765     0x03A1,
    766     0x03A5 | HAS_VOWEL | HAS_ACCENT,
    767     0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
    768     0x03A5 | HAS_VOWEL,
    769     0x03A5 | HAS_VOWEL,
    770     0x03A5 | HAS_VOWEL | HAS_ACCENT,
    771     0x03A5 | HAS_VOWEL | HAS_ACCENT,
    772     0x03A1,
    773     0,
    774     0,
    775     0,
    776     0,
    777     0,
    778     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    779     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    780     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    781     0,
    782     0x03A9 | HAS_VOWEL | HAS_ACCENT,
    783     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
    784     0x039F | HAS_VOWEL | HAS_ACCENT,
    785     0x039F | HAS_VOWEL | HAS_ACCENT,
    786     0x03A9 | HAS_VOWEL | HAS_ACCENT,
    787     0x03A9 | HAS_VOWEL | HAS_ACCENT,
    788     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
    789     0,
    790     0,
    791     0,
    792 };
    793 
// U+2126 Ohm sign
// The only code point outside the two tables above for which getLetterData()
// returns data; it gets the same value as the U+03A9 entry in data0370.
static const uint16_t data2126 = 0x03A9 | HAS_VOWEL;
    796 
    797 uint32_t getLetterData(UChar32 c) {
    798     if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
    799         return 0;
    800     } else if (c <= 0x3ff) {
    801         return data0370[c - 0x370];
    802     } else if (c <= 0x1fff) {
    803         return data1F00[c - 0x1f00];
    804     } else if (c == 0x2126) {
    805         return data2126;
    806     } else {
    807         return 0;
    808     }
    809 }
    810 
    811 uint32_t getDiacriticData(UChar32 c) {
    812     switch (c) {
    813     case 0x0300:  // varia
    814     case 0x0301:  // tonos = oxia
    815     case 0x0342:  // perispomeni
    816     case 0x0302:  // circumflex can look like perispomeni
    817     case 0x0303:  // tilde can look like perispomeni
    818     case 0x0311:  // inverted breve can look like perispomeni
    819         return HAS_ACCENT;
    820     case 0x0308:  // dialytika = diaeresis
    821         return HAS_COMBINING_DIALYTIKA;
    822     case 0x0344:  // dialytika tonos
    823         return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
    824     case 0x0345:  // ypogegrammeni = iota subscript
    825         return HAS_YPOGEGRAMMENI;
    826     case 0x0304:  // macron
    827     case 0x0306:  // breve
    828     case 0x0313:  // comma above
    829     case 0x0314:  // reversed comma above
    830     case 0x0343:  // koronis
    831         return HAS_OTHER_GREEK_DIACRITIC;
    832     default:
    833         return 0;
    834     }
    835 }
    836 
    837 UBool isFollowedByCasedLetter(const UChar *s, int32_t i, int32_t length) {
    838     while (i < length) {
    839         UChar32 c;
    840         U16_NEXT(s, i, length, c);
    841         int32_t type = ucase_getTypeOrIgnorable(c);
    842         if ((type & UCASE_IGNORABLE) != 0) {
    843             // Case-ignorable, continue with the loop.
    844         } else if (type != UCASE_NONE) {
    845             return TRUE;  // Followed by cased letter.
    846         } else {
    847             return FALSE;  // Uncased and not case-ignorable.
    848         }
    849     }
    850     return FALSE;  // Not followed by cased letter.
    851 }
    852 
    853 /**
    854  * Greek string uppercasing with a state machine.
    855  * Probably simpler than a stateless function that has to figure out complex context-before
    856  * for each character.
    857  * TODO: Try to re-consolidate one way or another with the non-Greek function.
    858  */
    859 int32_t toUpper(uint32_t options,
    860                 UChar *dest, int32_t destCapacity,
    861                 const UChar *src, int32_t srcLength,
    862                 Edits *edits,
    863                 UErrorCode &errorCode) {
    864     int32_t destIndex=0;
    865     uint32_t state = 0;
    866     for (int32_t i = 0; i < srcLength;) {
    867         int32_t nextIndex = i;
    868         UChar32 c;
    869         U16_NEXT(src, nextIndex, srcLength, c);
    870         uint32_t nextState = 0;
    871         int32_t type = ucase_getTypeOrIgnorable(c);
    872         if ((type & UCASE_IGNORABLE) != 0) {
    873             // c is case-ignorable
    874             nextState |= (state & AFTER_CASED);
    875         } else if (type != UCASE_NONE) {
    876             // c is cased
    877             nextState |= AFTER_CASED;
    878         }
    879         uint32_t data = getLetterData(c);
    880         if (data > 0) {
    881             uint32_t upper = data & UPPER_MASK;
    882             // Add a dialytika to this iota or ypsilon vowel
    883             // if we removed a tonos from the previous vowel,
    884             // and that previous vowel did not also have (or gain) a dialytika.
    885             // Adding one only to the final vowel in a longer sequence
    886             // (which does not occur in normal writing) would require lookahead.
    887             // Set the same flag as for preserving an existing dialytika.
    888             if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
    889                     (upper == 0x399 || upper == 0x3A5)) {
    890                 data |= HAS_DIALYTIKA;
    891             }
    892             int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
    893             if ((data & HAS_YPOGEGRAMMENI) != 0) {
    894                 numYpogegrammeni = 1;
    895             }
    896             // Skip combining diacritics after this Greek letter.
    897             while (nextIndex < srcLength) {
    898                 uint32_t diacriticData = getDiacriticData(src[nextIndex]);
    899                 if (diacriticData != 0) {
    900                     data |= diacriticData;
    901                     if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
    902                         ++numYpogegrammeni;
    903                     }
    904                     ++nextIndex;
    905                 } else {
    906                     break;  // not a Greek diacritic
    907                 }
    908             }
    909             if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
    910                 nextState |= AFTER_VOWEL_WITH_ACCENT;
    911             }
    912             // Map according to Greek rules.
    913             UBool addTonos = FALSE;
    914             if (upper == 0x397 &&
    915                     (data & HAS_ACCENT) != 0 &&
    916                     numYpogegrammeni == 0 &&
    917                     (state & AFTER_CASED) == 0 &&
    918                     !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
    919                 // Keep disjunctive "or" with (only) a tonos.
    920                 // We use the same "word boundary" conditions as for the Final_Sigma test.
    921                 if (i == nextIndex) {
    922                     upper = 0x389;  // Preserve the precomposed form.
    923                 } else {
    924                     addTonos = TRUE;
    925                 }
    926             } else if ((data & HAS_DIALYTIKA) != 0) {
    927                 // Preserve a vowel with dialytika in precomposed form if it exists.
    928                 if (upper == 0x399) {
    929                     upper = 0x3AA;
    930                     data &= ~HAS_EITHER_DIALYTIKA;
    931                 } else if (upper == 0x3A5) {
    932                     upper = 0x3AB;
    933                     data &= ~HAS_EITHER_DIALYTIKA;
    934                 }
    935             }
    936 
    937             UBool change;
    938             if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
    939                 change = TRUE;  // common, simple usage
    940             } else {
    941                 // Find out first whether we are changing the text.
    942                 change = src[i] != upper || numYpogegrammeni > 0;
    943                 int32_t i2 = i + 1;
    944                 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
    945                     change |= i2 >= nextIndex || src[i2] != 0x308;
    946                     ++i2;
    947                 }
    948                 if (addTonos) {
    949                     change |= i2 >= nextIndex || src[i2] != 0x301;
    950                     ++i2;
    951                 }
    952                 int32_t oldLength = nextIndex - i;
    953                 int32_t newLength = (i2 - i) + numYpogegrammeni;
    954                 change |= oldLength != newLength;
    955                 if (change) {
    956                     if (edits != NULL) {
    957                         edits->addReplace(oldLength, newLength);
    958                     }
    959                 } else {
    960                     if (edits != NULL) {
    961                         edits->addUnchanged(oldLength);
    962                     }
    963                     // Write unchanged text?
    964                     change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
    965                 }
    966             }
    967 
    968             if (change) {
    969                 destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper);
    970                 if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
    971                     destIndex=appendUChar(dest, destIndex, destCapacity, 0x308);  // restore or add a dialytika
    972                 }
    973                 if (destIndex >= 0 && addTonos) {
    974                     destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
    975                 }
    976                 while (destIndex >= 0 && numYpogegrammeni > 0) {
    977                     destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
    978                     --numYpogegrammeni;
    979                 }
    980                 if(destIndex<0) {
    981                     errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    982                     return 0;
    983                 }
    984             }
    985         } else {
    986             const UChar *s;
    987             c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
    988             destIndex = appendResult(dest, destIndex, destCapacity, c, s,
    989                                      nextIndex - i, options, edits);
    990             if (destIndex < 0) {
    991                 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
    992                 return 0;
    993             }
    994         }
    995         i = nextIndex;
    996         state = nextState;
    997     }
    998 
    999     return destIndex;
   1000 }
   1001 
   1002 }  // namespace GreekUpper
   1003 U_NAMESPACE_END
   1004 
   1005 /* functions available in the common library (for unistr_case.cpp) */
   1006 
   1007 U_CFUNC int32_t U_CALLCONV
   1008 ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
   1009                          UChar *dest, int32_t destCapacity,
   1010                          const UChar *src, int32_t srcLength,
   1011                          icu::Edits *edits,
   1012                          UErrorCode &errorCode) {
   1013     UCaseContext csc=UCASECONTEXT_INITIALIZER;
   1014     csc.p=(void *)src;
   1015     csc.limit=srcLength;
   1016     int32_t destIndex = _caseMap(
   1017         caseLocale, options, ucase_toFullLower,
   1018         dest, destCapacity,
   1019         src, &csc, 0, srcLength,
   1020         edits, errorCode);
   1021     return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
   1022 }
   1023 
   1024 U_CFUNC int32_t U_CALLCONV
   1025 ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
   1026                          UChar *dest, int32_t destCapacity,
   1027                          const UChar *src, int32_t srcLength,
   1028                          icu::Edits *edits,
   1029                          UErrorCode &errorCode) {
   1030     int32_t destIndex;
   1031     if (caseLocale == UCASE_LOC_GREEK) {
   1032         destIndex = GreekUpper::toUpper(options, dest, destCapacity,
   1033                                         src, srcLength, edits, errorCode);
   1034     } else {
   1035         UCaseContext csc=UCASECONTEXT_INITIALIZER;
   1036         csc.p=(void *)src;
   1037         csc.limit=srcLength;
   1038         destIndex = _caseMap(
   1039             caseLocale, options, ucase_toFullUpper,
   1040             dest, destCapacity,
   1041             src, &csc, 0, srcLength,
   1042             edits, errorCode);
   1043     }
   1044     return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
   1045 }
   1046 
   1047 U_CFUNC int32_t U_CALLCONV
   1048 ustrcase_internalFold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
   1049                       UChar *dest, int32_t destCapacity,
   1050                       const UChar *src, int32_t srcLength,
   1051                       icu::Edits *edits,
   1052                       UErrorCode &errorCode) {
   1053     /* case mapping loop */
   1054     int32_t srcIndex = 0;
   1055     int32_t destIndex = 0;
   1056     while (srcIndex < srcLength) {
   1057         int32_t cpStart = srcIndex;
   1058         UChar32 c;
   1059         U16_NEXT(src, srcIndex, srcLength, c);
   1060         const UChar *s;
   1061         c = ucase_toFullFolding(c, &s, options);
   1062         destIndex = appendResult(dest, destIndex, destCapacity, c, s,
   1063                                  srcIndex - cpStart, options, edits);
   1064         if (destIndex < 0) {
   1065             errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
   1066             return 0;
   1067         }
   1068     }
   1069 
   1070     return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
   1071 }
   1072 
   1073 U_CFUNC int32_t
   1074 ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
   1075              UChar *dest, int32_t destCapacity,
   1076              const UChar *src, int32_t srcLength,
   1077              UStringCaseMapper *stringCaseMapper,
   1078              icu::Edits *edits,
   1079              UErrorCode &errorCode) {
   1080     int32_t destLength;
   1081 
   1082     /* check argument values */
   1083     if(U_FAILURE(errorCode)) {
   1084         return 0;
   1085     }
   1086     if( destCapacity<0 ||
   1087         (dest==NULL && destCapacity>0) ||
   1088         src==NULL ||
   1089         srcLength<-1
   1090     ) {
   1091         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1092         return 0;
   1093     }
   1094 
   1095     /* get the string length */
   1096     if(srcLength==-1) {
   1097         srcLength=u_strlen(src);
   1098     }
   1099 
   1100     /* check for overlapping source and destination */
   1101     if( dest!=NULL &&
   1102         ((src>=dest && src<(dest+destCapacity)) ||
   1103          (dest>=src && dest<(src+srcLength)))
   1104     ) {
   1105         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1106         return 0;
   1107     }
   1108 
   1109     if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
   1110         edits->reset();
   1111     }
   1112     destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
   1113                                 dest, destCapacity, src, srcLength, edits, errorCode);
   1114     return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
   1115 }
   1116 
   1117 U_CFUNC int32_t
   1118 ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
   1119                         UChar *dest, int32_t destCapacity,
   1120                         const UChar *src, int32_t srcLength,
   1121                         UStringCaseMapper *stringCaseMapper,
   1122                         UErrorCode &errorCode) {
   1123     UChar buffer[300];
   1124     UChar *temp;
   1125 
   1126     int32_t destLength;
   1127 
   1128     /* check argument values */
   1129     if(U_FAILURE(errorCode)) {
   1130         return 0;
   1131     }
   1132     if( destCapacity<0 ||
   1133         (dest==NULL && destCapacity>0) ||
   1134         src==NULL ||
   1135         srcLength<-1
   1136     ) {
   1137         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1138         return 0;
   1139     }
   1140 
   1141     /* get the string length */
   1142     if(srcLength==-1) {
   1143         srcLength=u_strlen(src);
   1144     }
   1145 
   1146     /* check for overlapping source and destination */
   1147     if( dest!=NULL &&
   1148         ((src>=dest && src<(dest+destCapacity)) ||
   1149          (dest>=src && dest<(src+srcLength)))
   1150     ) {
   1151         /* overlap: provide a temporary destination buffer and later copy the result */
   1152         if(destCapacity<=UPRV_LENGTHOF(buffer)) {
   1153             /* the stack buffer is large enough */
   1154             temp=buffer;
   1155         } else {
   1156             /* allocate a buffer */
   1157             temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR);
   1158             if(temp==NULL) {
   1159                 errorCode=U_MEMORY_ALLOCATION_ERROR;
   1160                 return 0;
   1161             }
   1162         }
   1163     } else {
   1164         temp=dest;
   1165     }
   1166 
   1167     destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
   1168                                 temp, destCapacity, src, srcLength, NULL, errorCode);
   1169     if(temp!=dest) {
   1170         /* copy the result string to the destination buffer */
   1171         if (U_SUCCESS(errorCode) && 0 < destLength && destLength <= destCapacity) {
   1172             u_memmove(dest, temp, destLength);
   1173         }
   1174         if(temp!=buffer) {
   1175             uprv_free(temp);
   1176         }
   1177     }
   1178 
   1179     return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
   1180 }
   1181 
   1182 /* public API functions */
   1183 
   1184 U_CAPI int32_t U_EXPORT2
   1185 u_strFoldCase(UChar *dest, int32_t destCapacity,
   1186               const UChar *src, int32_t srcLength,
   1187               uint32_t options,
   1188               UErrorCode *pErrorCode) {
   1189     return ustrcase_mapWithOverlap(
   1190         UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
   1191         dest, destCapacity,
   1192         src, srcLength,
   1193         ustrcase_internalFold, *pErrorCode);
   1194 }
   1195 
   1196 U_NAMESPACE_BEGIN
   1197 
   1198 int32_t CaseMap::fold(
   1199         uint32_t options,
   1200         const UChar *src, int32_t srcLength,
   1201         UChar *dest, int32_t destCapacity, Edits *edits,
   1202         UErrorCode &errorCode) {
   1203     return ustrcase_map(
   1204         UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
   1205         dest, destCapacity,
   1206         src, srcLength,
   1207         ustrcase_internalFold, edits, errorCode);
   1208 }
   1209 
   1210 U_NAMESPACE_END
   1211 
   1212 /* case-insensitive string comparisons -------------------------------------- */
   1213 
   1214 /*
   1215  * This function is a copy of unorm_cmpEquivFold() minus the parts for
   1216  * canonical equivalence.
   1217  * Keep the functions in sync, and see there for how this works.
   1218  * The duplication is for modularization:
   1219  * It makes caseless (but not canonical caseless) matches independent of
   1220  * the normalization code.
   1221  */
   1222 
   1223 /* stack element for previous-level source/decomposition pointers */
   1224 struct CmpEquivLevel {
   1225     const UChar *start, *s, *limit;
   1226 };
   1227 typedef struct CmpEquivLevel CmpEquivLevel;
   1228 
   1229 /**
   1230  * Internal implementation code comparing string with case fold.
   1231  * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch().
   1232  *
   1233  * @param s1            input string 1
   1234  * @param length1       length of string 1, or -1 (NULL terminated)
   1235  * @param s2            input string 2
   1236  * @param length2       length of string 2, or -1 (NULL terminated)
   1237  * @param options       compare options
   1238  * @param matchLen1     (output) length of partial prefix match in s1
   1239  * @param matchLen2     (output) length of partial prefix match in s2
   1240  * @param pErrorCode    receives error status
   1241  * @return The result of comparison
   1242  */
   1243 static int32_t _cmpFold(
   1244             const UChar *s1, int32_t length1,
   1245             const UChar *s2, int32_t length2,
   1246             uint32_t options,
   1247             int32_t *matchLen1, int32_t *matchLen2,
   1248             UErrorCode *pErrorCode) {
   1249     int32_t cmpRes = 0;
   1250 
   1251     /* current-level start/limit - s1/s2 as current */
   1252     const UChar *start1, *start2, *limit1, *limit2;
   1253 
   1254     /* points to the original start address */
   1255     const UChar *org1, *org2;
   1256 
   1257     /* points to the end of match + 1 */
   1258     const UChar *m1, *m2;
   1259 
   1260     /* case folding variables */
   1261     const UChar *p;
   1262     int32_t length;
   1263 
   1264     /* stacks of previous-level start/current/limit */
   1265     CmpEquivLevel stack1[2], stack2[2];
   1266 
   1267     /* case folding buffers, only use current-level start/limit */
   1268     UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
   1269 
   1270     /* track which is the current level per string */
   1271     int32_t level1, level2;
   1272 
   1273     /* current code units, and code points for lookups */
   1274     UChar32 c1, c2, cp1, cp2;
   1275 
   1276     /* no argument error checking because this itself is not an API */
   1277 
   1278     /*
   1279      * assume that at least the option U_COMPARE_IGNORE_CASE is set
   1280      * otherwise this function would have to behave exactly as uprv_strCompare()
   1281      */
   1282     if(U_FAILURE(*pErrorCode)) {
   1283         return 0;
   1284     }
   1285 
   1286     /* initialize */
   1287     if(matchLen1) {
   1288         U_ASSERT(matchLen2 !=NULL);
   1289         *matchLen1=0;
   1290         *matchLen2=0;
   1291     }
   1292 
   1293     start1=m1=org1=s1;
   1294     if(length1==-1) {
   1295         limit1=NULL;
   1296     } else {
   1297         limit1=s1+length1;
   1298     }
   1299 
   1300     start2=m2=org2=s2;
   1301     if(length2==-1) {
   1302         limit2=NULL;
   1303     } else {
   1304         limit2=s2+length2;
   1305     }
   1306 
   1307     level1=level2=0;
   1308     c1=c2=-1;
   1309 
   1310     /* comparison loop */
   1311     for(;;) {
   1312         /*
   1313          * here a code unit value of -1 means "get another code unit"
   1314          * below it will mean "this source is finished"
   1315          */
   1316 
   1317         if(c1<0) {
   1318             /* get next code unit from string 1, post-increment */
   1319             for(;;) {
   1320                 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
   1321                     if(level1==0) {
   1322                         c1=-1;
   1323                         break;
   1324                     }
   1325                 } else {
   1326                     ++s1;
   1327                     break;
   1328                 }
   1329 
   1330                 /* reached end of level buffer, pop one level */
   1331                 do {
   1332                     --level1;
   1333                     start1=stack1[level1].start;    /*Not uninitialized*/
   1334                 } while(start1==NULL);
   1335                 s1=stack1[level1].s;                /*Not uninitialized*/
   1336                 limit1=stack1[level1].limit;        /*Not uninitialized*/
   1337             }
   1338         }
   1339 
   1340         if(c2<0) {
   1341             /* get next code unit from string 2, post-increment */
   1342             for(;;) {
   1343                 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
   1344                     if(level2==0) {
   1345                         c2=-1;
   1346                         break;
   1347                     }
   1348                 } else {
   1349                     ++s2;
   1350                     break;
   1351                 }
   1352 
   1353                 /* reached end of level buffer, pop one level */
   1354                 do {
   1355                     --level2;
   1356                     start2=stack2[level2].start;    /*Not uninitialized*/
   1357                 } while(start2==NULL);
   1358                 s2=stack2[level2].s;                /*Not uninitialized*/
   1359                 limit2=stack2[level2].limit;        /*Not uninitialized*/
   1360             }
   1361         }
   1362 
   1363         /*
   1364          * compare c1 and c2
   1365          * either variable c1, c2 is -1 only if the corresponding string is finished
   1366          */
   1367         if(c1==c2) {
   1368             const UChar *next1, *next2;
   1369 
   1370             if(c1<0) {
   1371                 cmpRes=0;   /* c1==c2==-1 indicating end of strings */
   1372                 break;
   1373             }
   1374 
   1375             /*
   1376              * Note: Move the match positions in both strings at the same time
   1377              *      only when corresponding code point(s) in the original strings
   1378              *      are fully consumed. For example, when comparing s1="Fust" and
   1379              *      s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches
   1380              *      the first code point in the case-folded data. But the second "s"
   1381              *      has no matching code point in s1, so this implementation returns
   1382              *      2 as the prefix match length ("Fu").
   1383              */
   1384             next1=next2=NULL;
   1385             if(level1==0) {
   1386                 next1=s1;
   1387             } else if(s1==limit1) {
   1388                 /* Note: This implementation only use a single level of stack.
   1389                  *      If this code needs to be changed to use multiple levels
   1390                  *      of stacks, the code above should check if the current
   1391                  *      code is at the end of all stacks.
   1392                  */
   1393                 U_ASSERT(level1==1);
   1394 
   1395                 /* is s1 at the end of the current stack? */
   1396                 next1=stack1[0].s;
   1397             }
   1398 
   1399             if (next1!=NULL) {
   1400                 if(level2==0) {
   1401                     next2=s2;
   1402                 } else if(s2==limit2) {
   1403                     U_ASSERT(level2==1);
   1404 
   1405                     /* is s2 at the end of the current stack? */
   1406                     next2=stack2[0].s;
   1407                 }
   1408                 if(next2!=NULL) {
   1409                     m1=next1;
   1410                     m2=next2;
   1411                 }
   1412             }
   1413             c1=c2=-1;       /* make us fetch new code units */
   1414             continue;
   1415         } else if(c1<0) {
   1416             cmpRes=-1;      /* string 1 ends before string 2 */
   1417             break;
   1418         } else if(c2<0) {
   1419             cmpRes=1;       /* string 2 ends before string 1 */
   1420             break;
   1421         }
   1422         /* c1!=c2 && c1>=0 && c2>=0 */
   1423 
   1424         /* get complete code points for c1, c2 for lookups if either is a surrogate */
   1425         cp1=c1;
   1426         if(U_IS_SURROGATE(c1)) {
   1427             UChar c;
   1428 
   1429             if(U_IS_SURROGATE_LEAD(c1)) {
   1430                 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
   1431                     /* advance ++s1; only below if cp1 decomposes/case-folds */
   1432                     cp1=U16_GET_SUPPLEMENTARY(c1, c);
   1433                 }
   1434             } else /* isTrail(c1) */ {
   1435                 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
   1436                     cp1=U16_GET_SUPPLEMENTARY(c, c1);
   1437                 }
   1438             }
   1439         }
   1440 
   1441         cp2=c2;
   1442         if(U_IS_SURROGATE(c2)) {
   1443             UChar c;
   1444 
   1445             if(U_IS_SURROGATE_LEAD(c2)) {
   1446                 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
   1447                     /* advance ++s2; only below if cp2 decomposes/case-folds */
   1448                     cp2=U16_GET_SUPPLEMENTARY(c2, c);
   1449                 }
   1450             } else /* isTrail(c2) */ {
   1451                 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
   1452                     cp2=U16_GET_SUPPLEMENTARY(c, c2);
   1453                 }
   1454             }
   1455         }
   1456 
   1457         /*
   1458          * go down one level for each string
   1459          * continue with the main loop as soon as there is a real change
   1460          */
   1461 
   1462         if( level1==0 &&
   1463             (length=ucase_toFullFolding((UChar32)cp1, &p, options))>=0
   1464         ) {
   1465             /* cp1 case-folds to the code point "length" or to p[length] */
   1466             if(U_IS_SURROGATE(c1)) {
   1467                 if(U_IS_SURROGATE_LEAD(c1)) {
   1468                     /* advance beyond source surrogate pair if it case-folds */
   1469                     ++s1;
   1470                 } else /* isTrail(c1) */ {
   1471                     /*
   1472                      * we got a supplementary code point when hitting its trail surrogate,
   1473                      * therefore the lead surrogate must have been the same as in the other string;
   1474                      * compare this decomposition with the lead surrogate in the other string
   1475                      * remember that this simulates bulk text replacement:
   1476                      * the decomposition would replace the entire code point
   1477                      */
   1478                     --s2;
   1479                     --m2;
   1480                     c2=*(s2-1);
   1481                 }
   1482             }
   1483 
   1484             /* push current level pointers */
   1485             stack1[0].start=start1;
   1486             stack1[0].s=s1;
   1487             stack1[0].limit=limit1;
   1488             ++level1;
   1489 
   1490             /* copy the folding result to fold1[] */
   1491             if(length<=UCASE_MAX_STRING_LENGTH) {
   1492                 u_memcpy(fold1, p, length);
   1493             } else {
   1494                 int32_t i=0;
   1495                 U16_APPEND_UNSAFE(fold1, i, length);
   1496                 length=i;
   1497             }
   1498 
   1499             /* set next level pointers to case folding */
   1500             start1=s1=fold1;
   1501             limit1=fold1+length;
   1502 
   1503             /* get ready to read from decomposition, continue with loop */
   1504             c1=-1;
   1505             continue;
   1506         }
   1507 
   1508         if( level2==0 &&
   1509             (length=ucase_toFullFolding((UChar32)cp2, &p, options))>=0
   1510         ) {
   1511             /* cp2 case-folds to the code point "length" or to p[length] */
   1512             if(U_IS_SURROGATE(c2)) {
   1513                 if(U_IS_SURROGATE_LEAD(c2)) {
   1514                     /* advance beyond source surrogate pair if it case-folds */
   1515                     ++s2;
   1516                 } else /* isTrail(c2) */ {
   1517                     /*
   1518                      * we got a supplementary code point when hitting its trail surrogate,
   1519                      * therefore the lead surrogate must have been the same as in the other string;
   1520                      * compare this decomposition with the lead surrogate in the other string
   1521                      * remember that this simulates bulk text replacement:
   1522                      * the decomposition would replace the entire code point
   1523                      */
   1524                     --s1;
   1525                     --m2;
   1526                     c1=*(s1-1);
   1527                 }
   1528             }
   1529 
   1530             /* push current level pointers */
   1531             stack2[0].start=start2;
   1532             stack2[0].s=s2;
   1533             stack2[0].limit=limit2;
   1534             ++level2;
   1535 
   1536             /* copy the folding result to fold2[] */
   1537             if(length<=UCASE_MAX_STRING_LENGTH) {
   1538                 u_memcpy(fold2, p, length);
   1539             } else {
   1540                 int32_t i=0;
   1541                 U16_APPEND_UNSAFE(fold2, i, length);
   1542                 length=i;
   1543             }
   1544 
   1545             /* set next level pointers to case folding */
   1546             start2=s2=fold2;
   1547             limit2=fold2+length;
   1548 
   1549             /* get ready to read from decomposition, continue with loop */
   1550             c2=-1;
   1551             continue;
   1552         }
   1553 
   1554         /*
   1555          * no decomposition/case folding, max level for both sides:
   1556          * return difference result
   1557          *
   1558          * code point order comparison must not just return cp1-cp2
   1559          * because when single surrogates are present then the surrogate pairs
   1560          * that formed cp1 and cp2 may be from different string indexes
   1561          *
   1562          * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
   1563          * c1=d800 cp1=10001 c2=dc00 cp2=10000
   1564          * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
   1565          *
   1566          * therefore, use same fix-up as in ustring.c/uprv_strCompare()
   1567          * except: uprv_strCompare() fetches c=*s while this function fetches c=*s++
   1568          * so we have slightly different pointer/start/limit comparisons here
   1569          */
   1570 
   1571         if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
   1572             /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
   1573             if(
   1574                 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
   1575                 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
   1576             ) {
   1577                 /* part of a surrogate pair, leave >=d800 */
   1578             } else {
   1579                 /* BMP code point - may be surrogate code point - make <d800 */
   1580                 c1-=0x2800;
   1581             }
   1582 
   1583             if(
   1584                 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
   1585                 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
   1586             ) {
   1587                 /* part of a surrogate pair, leave >=d800 */
   1588             } else {
   1589                 /* BMP code point - may be surrogate code point - make <d800 */
   1590                 c2-=0x2800;
   1591             }
   1592         }
   1593 
   1594         cmpRes=c1-c2;
   1595         break;
   1596     }
   1597 
   1598     if(matchLen1) {
   1599         *matchLen1=m1-org1;
   1600         *matchLen2=m2-org2;
   1601     }
   1602     return cmpRes;
   1603 }
   1604 
   1605 /* internal function */
   1606 U_CFUNC int32_t
   1607 u_strcmpFold(const UChar *s1, int32_t length1,
   1608              const UChar *s2, int32_t length2,
   1609              uint32_t options,
   1610              UErrorCode *pErrorCode) {
   1611     return _cmpFold(s1, length1, s2, length2, options, NULL, NULL, pErrorCode);
   1612 }
   1613 
   1614 /* public API functions */
   1615 
   1616 U_CAPI int32_t U_EXPORT2
   1617 u_strCaseCompare(const UChar *s1, int32_t length1,
   1618                  const UChar *s2, int32_t length2,
   1619                  uint32_t options,
   1620                  UErrorCode *pErrorCode) {
   1621     /* argument checking */
   1622     if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
   1623         return 0;
   1624     }
   1625     if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
   1626         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1627         return 0;
   1628     }
   1629     return u_strcmpFold(s1, length1, s2, length2,
   1630                         options|U_COMPARE_IGNORE_CASE,
   1631                         pErrorCode);
   1632 }
   1633 
   1634 U_CAPI int32_t U_EXPORT2
   1635 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) {
   1636     UErrorCode errorCode=U_ZERO_ERROR;
   1637     return u_strcmpFold(s1, -1, s2, -1,
   1638                         options|U_COMPARE_IGNORE_CASE,
   1639                         &errorCode);
   1640 }
   1641 
   1642 U_CAPI int32_t U_EXPORT2
   1643 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) {
   1644     UErrorCode errorCode=U_ZERO_ERROR;
   1645     return u_strcmpFold(s1, length, s2, length,
   1646                         options|U_COMPARE_IGNORE_CASE,
   1647                         &errorCode);
   1648 }
   1649 
   1650 U_CAPI int32_t U_EXPORT2
   1651 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) {
   1652     UErrorCode errorCode=U_ZERO_ERROR;
   1653     return u_strcmpFold(s1, n, s2, n,
   1654                         options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE),
   1655                         &errorCode);
   1656 }
   1657 
   1658 /* internal API - detect length of shared prefix */
   1659 U_CAPI void
   1660 u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
   1661                              const UChar *s2, int32_t length2,
   1662                              uint32_t options,
   1663                              int32_t *matchLen1, int32_t *matchLen2,
   1664                              UErrorCode *pErrorCode) {
   1665     _cmpFold(s1, length1, s2, length2, options,
   1666         matchLen1, matchLen2, pErrorCode);
   1667 }
   1668