Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2005-2011, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucasemap.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2005may06
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Case mapping service object and functions using it.
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 #include "unicode/brkiter.h"
     21 #include "unicode/ubrk.h"
     22 #include "unicode/uloc.h"
     23 #include "unicode/ustring.h"
     24 #include "unicode/ucasemap.h"
     25 #if !UCONFIG_NO_BREAK_ITERATION
     26 #include "unicode/utext.h"
     27 #endif
     28 #include "unicode/utf.h"
     29 #include "unicode/utf8.h"
     30 #include "unicode/utf16.h"
     31 #include "cmemory.h"
     32 #include "cstring.h"
     33 #include "ucase.h"
     34 #include "ustr_imp.h"
     35 
     36 U_NAMESPACE_USE
     37 
     38 /* UCaseMap service object -------------------------------------------------- */
     39 
     40 U_CAPI UCaseMap * U_EXPORT2
     41 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
     42     UCaseMap *csm;
     43 
     44     if(U_FAILURE(*pErrorCode)) {
     45         return NULL;
     46     }
     47 
     48     csm=(UCaseMap *)uprv_malloc(sizeof(UCaseMap));
     49     if(csm==NULL) {
     50         return NULL;
     51     }
     52     uprv_memset(csm, 0, sizeof(UCaseMap));
     53 
     54     csm->csp=ucase_getSingleton();
     55     ucasemap_setLocale(csm, locale, pErrorCode);
     56     if(U_FAILURE(*pErrorCode)) {
     57         uprv_free(csm);
     58         return NULL;
     59     }
     60 
     61     csm->options=options;
     62     return csm;
     63 }
     64 
     65 U_CAPI void U_EXPORT2
     66 ucasemap_close(UCaseMap *csm) {
     67     if(csm!=NULL) {
     68 #if !UCONFIG_NO_BREAK_ITERATION
     69         // Do not call ubrk_close() so that we do not depend on all of the BreakIterator code.
     70         delete reinterpret_cast<BreakIterator *>(csm->iter);
     71 #endif
     72         uprv_free(csm);
     73     }
     74 }
     75 
     76 U_CAPI const char * U_EXPORT2
     77 ucasemap_getLocale(const UCaseMap *csm) {
     78     return csm->locale;
     79 }
     80 
     81 U_CAPI uint32_t U_EXPORT2
     82 ucasemap_getOptions(const UCaseMap *csm) {
     83     return csm->options;
     84 }
     85 
     86 U_CAPI void U_EXPORT2
     87 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
     88     int32_t length;
     89 
     90     if(U_FAILURE(*pErrorCode)) {
     91         return;
     92     }
     93 
     94     length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
     95     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
     96         *pErrorCode=U_ZERO_ERROR;
     97         /* we only really need the language code for case mappings */
     98         length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
     99     }
    100     if(length==sizeof(csm->locale)) {
    101         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    102     }
    103     csm->locCache=0;
    104     if(U_SUCCESS(*pErrorCode)) {
    105         ucase_getCaseLocale(csm->locale, &csm->locCache);
    106     } else {
    107         csm->locale[0]=0;
    108     }
    109 }
    110 
    111 U_CAPI void U_EXPORT2
    112 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode * /*pErrorCode*/) {
    113     csm->options=options;
    114 }
    115 
    116 /* UTF-8 string case mappings ----------------------------------------------- */
    117 
    118 /* TODO(markus): Move to a new, separate utf8case.c file. */
    119 
    120 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
    121 static inline int32_t
    122 appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity,
    123              int32_t result, const UChar *s) {
    124     UChar32 c;
    125     int32_t length, destLength;
    126     UErrorCode errorCode;
    127 
    128     /* decode the result */
    129     if(result<0) {
    130         /* (not) original code point */
    131         c=~result;
    132         length=-1;
    133     } else if(result<=UCASE_MAX_STRING_LENGTH) {
    134         c=U_SENTINEL;
    135         length=result;
    136     } else {
    137         c=result;
    138         length=-1;
    139     }
    140 
    141     if(destIndex<destCapacity) {
    142         /* append the result */
    143         if(length<0) {
    144             /* code point */
    145             UBool isError=FALSE;
    146             U8_APPEND(dest, destIndex, destCapacity, c, isError);
    147             if(isError) {
    148                 /* overflow, nothing written */
    149                 destIndex+=U8_LENGTH(c);
    150             }
    151         } else {
    152             /* string */
    153             errorCode=U_ZERO_ERROR;
    154             u_strToUTF8(
    155                 (char *)(dest+destIndex), destCapacity-destIndex, &destLength,
    156                 s, length,
    157                 &errorCode);
    158             destIndex+=destLength;
    159             /* we might have an overflow, but we know the actual length */
    160         }
    161     } else {
    162         /* preflight */
    163         if(length<0) {
    164             destIndex+=U8_LENGTH(c);
    165         } else {
    166             errorCode=U_ZERO_ERROR;
    167             u_strToUTF8(
    168                 NULL, 0, &destLength,
    169                 s, length,
    170                 &errorCode);
    171             destIndex+=destLength;
    172         }
    173     }
    174     return destIndex;
    175 }
    176 
    177 static UChar32 U_CALLCONV
    178 utf8_caseContextIterator(void *context, int8_t dir) {
    179     UCaseContext *csc=(UCaseContext *)context;
    180     UChar32 c;
    181 
    182     if(dir<0) {
    183         /* reset for backward iteration */
    184         csc->index=csc->cpStart;
    185         csc->dir=dir;
    186     } else if(dir>0) {
    187         /* reset for forward iteration */
    188         csc->index=csc->cpLimit;
    189         csc->dir=dir;
    190     } else {
    191         /* continue current iteration direction */
    192         dir=csc->dir;
    193     }
    194 
    195     if(dir<0) {
    196         if(csc->start<csc->index) {
    197             U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
    198             return c;
    199         }
    200     } else {
    201         if(csc->index<csc->limit) {
    202             U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
    203             return c;
    204         }
    205     }
    206     return U_SENTINEL;
    207 }
    208 
    209 /*
    210  * Case-maps [srcStart..srcLimit[ but takes
    211  * context [0..srcLength[ into account.
    212  */
    213 static int32_t
    214 _caseMap(const UCaseMap *csm, UCaseMapFull *map,
    215          uint8_t *dest, int32_t destCapacity,
    216          const uint8_t *src, UCaseContext *csc,
    217          int32_t srcStart, int32_t srcLimit,
    218          UErrorCode *pErrorCode) {
    219     const UChar *s;
    220     UChar32 c, c2 = 0;
    221     int32_t srcIndex, destIndex;
    222     int32_t locCache;
    223 
    224     locCache=csm->locCache;
    225 
    226     /* case mapping loop */
    227     srcIndex=srcStart;
    228     destIndex=0;
    229     while(srcIndex<srcLimit) {
    230         csc->cpStart=srcIndex;
    231         U8_NEXT(src, srcIndex, srcLimit, c);
    232         csc->cpLimit=srcIndex;
    233         if(c<0) {
    234             int32_t i=csc->cpStart;
    235             while(destIndex<destCapacity && i<srcIndex) {
    236                 dest[destIndex++]=src[i++];
    237             }
    238             continue;
    239         }
    240         c=map(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &locCache);
    241         if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) {
    242             /* fast path version of appendResult() for ASCII results */
    243             dest[destIndex++]=(uint8_t)c2;
    244         } else {
    245             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
    246         }
    247     }
    248 
    249     if(destIndex>destCapacity) {
    250         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    251     }
    252     return destIndex;
    253 }
    254 
    255 #if !UCONFIG_NO_BREAK_ITERATION
    256 
    257 U_CFUNC int32_t U_CALLCONV
    258 ucasemap_internalUTF8ToTitle(const UCaseMap *csm,
    259          uint8_t *dest, int32_t destCapacity,
    260          const uint8_t *src, int32_t srcLength,
    261          UErrorCode *pErrorCode) {
    262     const UChar *s;
    263     UChar32 c;
    264     int32_t prev, titleStart, titleLimit, idx, destIndex, length;
    265     UBool isFirstIndex;
    266 
    267     if(U_FAILURE(*pErrorCode)) {
    268         return 0;
    269     }
    270 
    271     // Use the C++ abstract base class to minimize dependencies.
    272     // TODO: Change UCaseMap.iter to store a BreakIterator directly.
    273     BreakIterator *bi=reinterpret_cast<BreakIterator *>(csm->iter);
    274 
    275     /* set up local variables */
    276     int32_t locCache=csm->locCache;
    277     UCaseContext csc=UCASECONTEXT_INITIALIZER;
    278     csc.p=(void *)src;
    279     csc.limit=srcLength;
    280     destIndex=0;
    281     prev=0;
    282     isFirstIndex=TRUE;
    283 
    284     /* titlecasing loop */
    285     while(prev<srcLength) {
    286         /* find next index where to titlecase */
    287         if(isFirstIndex) {
    288             isFirstIndex=FALSE;
    289             idx=bi->first();
    290         } else {
    291             idx=bi->next();
    292         }
    293         if(idx==UBRK_DONE || idx>srcLength) {
    294             idx=srcLength;
    295         }
    296 
    297         /*
    298          * Unicode 4 & 5 section 3.13 Default Case Operations:
    299          *
    300          * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
    301          * #29, "Text Boundaries." Between each pair of word boundaries, find the first
    302          * cased character F. If F exists, map F to default_title(F); then map each
    303          * subsequent character C to default_lower(C).
    304          *
    305          * In this implementation, segment [prev..index[ into 3 parts:
    306          * a) uncased characters (copy as-is) [prev..titleStart[
    307          * b) first case letter (titlecase)         [titleStart..titleLimit[
    308          * c) subsequent characters (lowercase)                 [titleLimit..index[
    309          */
    310         if(prev<idx) {
    311             /* find and copy uncased characters [prev..titleStart[ */
    312             titleStart=titleLimit=prev;
    313             U8_NEXT(src, titleLimit, idx, c);
    314             if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
    315                 /* Adjust the titlecasing index (titleStart) to the next cased character. */
    316                 for(;;) {
    317                     titleStart=titleLimit;
    318                     if(titleLimit==idx) {
    319                         /*
    320                          * only uncased characters in [prev..index[
    321                          * stop with titleStart==titleLimit==index
    322                          */
    323                         break;
    324                     }
    325                     U8_NEXT(src, titleLimit, idx, c);
    326                     if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
    327                         break; /* cased letter at [titleStart..titleLimit[ */
    328                     }
    329                 }
    330                 length=titleStart-prev;
    331                 if(length>0) {
    332                     if((destIndex+length)<=destCapacity) {
    333                         uprv_memcpy(dest+destIndex, src+prev, length);
    334                     }
    335                     destIndex+=length;
    336                 }
    337             }
    338 
    339             if(titleStart<titleLimit) {
    340                 /* titlecase c which is from [titleStart..titleLimit[ */
    341                 csc.cpStart=titleStart;
    342                 csc.cpLimit=titleLimit;
    343                 c=ucase_toFullTitle(csm->csp, c, utf8_caseContextIterator, &csc, &s, csm->locale, &locCache);
    344                 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
    345 
    346                 /* Special case Dutch IJ titlecasing */
    347                 if ( titleStart+1 < idx &&
    348                      ucase_getCaseLocale(csm->locale, &locCache) == UCASE_LOC_DUTCH &&
    349                      ( src[titleStart] == 0x0049 || src[titleStart] == 0x0069 ) &&
    350                      ( src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A )) {
    351                             c=0x004A;
    352                             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
    353                             titleLimit++;
    354                 }
    355                 /* lowercase [titleLimit..index[ */
    356                 if(titleLimit<idx) {
    357                     if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
    358                         /* Normal operation: Lowercase the rest of the word. */
    359                         destIndex+=
    360                             _caseMap(
    361                                 csm, ucase_toFullLower,
    362                                 dest+destIndex, destCapacity-destIndex,
    363                                 src, &csc,
    364                                 titleLimit, idx,
    365                                 pErrorCode);
    366                     } else {
    367                         /* Optionally just copy the rest of the word unchanged. */
    368                         length=idx-titleLimit;
    369                         if((destIndex+length)<=destCapacity) {
    370                             uprv_memcpy(dest+destIndex, src+titleLimit, length);
    371                         }
    372                         destIndex+=length;
    373                     }
    374                 }
    375             }
    376         }
    377 
    378         prev=idx;
    379     }
    380 
    381     if(destIndex>destCapacity) {
    382         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    383     }
    384     return destIndex;
    385 }
    386 
    387 #endif
    388 
    389 static int32_t U_CALLCONV
    390 ucasemap_internalUTF8ToLower(const UCaseMap *csm,
    391                              uint8_t *dest, int32_t destCapacity,
    392                              const uint8_t *src, int32_t srcLength,
    393                              UErrorCode *pErrorCode) {
    394     UCaseContext csc=UCASECONTEXT_INITIALIZER;
    395     csc.p=(void *)src;
    396     csc.limit=srcLength;
    397     return _caseMap(
    398         csm, ucase_toFullLower,
    399         dest, destCapacity,
    400         src, &csc, 0, srcLength,
    401         pErrorCode);
    402 }
    403 
    404 static int32_t U_CALLCONV
    405 ucasemap_internalUTF8ToUpper(const UCaseMap *csm,
    406                              uint8_t *dest, int32_t destCapacity,
    407                              const uint8_t *src, int32_t srcLength,
    408                              UErrorCode *pErrorCode) {
    409     UCaseContext csc=UCASECONTEXT_INITIALIZER;
    410     csc.p=(void *)src;
    411     csc.limit=srcLength;
    412     return _caseMap(
    413         csm, ucase_toFullUpper,
    414         dest, destCapacity,
    415         src, &csc, 0, srcLength,
    416         pErrorCode);
    417 }
    418 
    419 static int32_t
    420 utf8_foldCase(const UCaseProps *csp,
    421               uint8_t *dest, int32_t destCapacity,
    422               const uint8_t *src, int32_t srcLength,
    423               uint32_t options,
    424               UErrorCode *pErrorCode) {
    425     int32_t srcIndex, destIndex;
    426 
    427     const UChar *s;
    428     UChar32 c, c2;
    429     int32_t start;
    430 
    431     /* case mapping loop */
    432     srcIndex=destIndex=0;
    433     while(srcIndex<srcLength) {
    434         start=srcIndex;
    435         U8_NEXT(src, srcIndex, srcLength, c);
    436         if(c<0) {
    437             while(destIndex<destCapacity && start<srcIndex) {
    438                 dest[destIndex++]=src[start++];
    439             }
    440             continue;
    441         }
    442         c=ucase_toFullFolding(csp, c, &s, options);
    443         if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) {
    444             /* fast path version of appendResult() for ASCII results */
    445             dest[destIndex++]=(uint8_t)c2;
    446         } else {
    447             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
    448         }
    449     }
    450 
    451     if(destIndex>destCapacity) {
    452         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    453     }
    454     return destIndex;
    455 }
    456 
    457 static int32_t U_CALLCONV
    458 ucasemap_internalUTF8Fold(const UCaseMap *csm,
    459                           uint8_t *dest, int32_t destCapacity,
    460                           const uint8_t *src, int32_t srcLength,
    461                           UErrorCode *pErrorCode) {
    462     return utf8_foldCase(csm->csp, dest, destCapacity, src, srcLength, csm->options, pErrorCode);
    463 }
    464 
    465 U_CFUNC int32_t
    466 ucasemap_mapUTF8(const UCaseMap *csm,
    467                  uint8_t *dest, int32_t destCapacity,
    468                  const uint8_t *src, int32_t srcLength,
    469                  UTF8CaseMapper *stringCaseMapper,
    470                  UErrorCode *pErrorCode) {
    471     int32_t destLength;
    472 
    473     /* check argument values */
    474     if(U_FAILURE(*pErrorCode)) {
    475         return 0;
    476     }
    477     if( destCapacity<0 ||
    478         (dest==NULL && destCapacity>0) ||
    479         src==NULL ||
    480         srcLength<-1
    481     ) {
    482         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    483         return 0;
    484     }
    485 
    486     /* get the string length */
    487     if(srcLength==-1) {
    488         srcLength=(int32_t)uprv_strlen((const char *)src);
    489     }
    490 
    491     /* check for overlapping source and destination */
    492     if( dest!=NULL &&
    493         ((src>=dest && src<(dest+destCapacity)) ||
    494          (dest>=src && dest<(src+srcLength)))
    495     ) {
    496         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    497         return 0;
    498     }
    499 
    500     destLength=stringCaseMapper(csm, dest, destCapacity, src, srcLength, pErrorCode);
    501     return u_terminateChars((char *)dest, destCapacity, destLength, pErrorCode);
    502 }
    503 
    504 /* public API functions */
    505 
    506 U_CAPI int32_t U_EXPORT2
    507 ucasemap_utf8ToLower(const UCaseMap *csm,
    508                      char *dest, int32_t destCapacity,
    509                      const char *src, int32_t srcLength,
    510                      UErrorCode *pErrorCode) {
    511     return ucasemap_mapUTF8(csm,
    512                    (uint8_t *)dest, destCapacity,
    513                    (const uint8_t *)src, srcLength,
    514                    ucasemap_internalUTF8ToLower, pErrorCode);
    515 }
    516 
    517 U_CAPI int32_t U_EXPORT2
    518 ucasemap_utf8ToUpper(const UCaseMap *csm,
    519                      char *dest, int32_t destCapacity,
    520                      const char *src, int32_t srcLength,
    521                      UErrorCode *pErrorCode) {
    522     return ucasemap_mapUTF8(csm,
    523                    (uint8_t *)dest, destCapacity,
    524                    (const uint8_t *)src, srcLength,
    525                    ucasemap_internalUTF8ToUpper, pErrorCode);
    526 }
    527 
    528 U_CAPI int32_t U_EXPORT2
    529 ucasemap_utf8FoldCase(const UCaseMap *csm,
    530                       char *dest, int32_t destCapacity,
    531                       const char *src, int32_t srcLength,
    532                       UErrorCode *pErrorCode) {
    533     return ucasemap_mapUTF8(csm,
    534                    (uint8_t *)dest, destCapacity,
    535                    (const uint8_t *)src, srcLength,
    536                    ucasemap_internalUTF8Fold, pErrorCode);
    537 }
    538