Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2005-2010, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucasemap.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2005may06
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Case mapping service object and functions using it.
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 #include "unicode/uloc.h"
     21 #include "unicode/ustring.h"
     22 #include "unicode/ucasemap.h"
     23 #if !UCONFIG_NO_BREAK_ITERATION
     24 #include "unicode/ubrk.h"
     25 #include "unicode/utext.h"
     26 #endif
     27 #include "cmemory.h"
     28 #include "cstring.h"
     29 #include "ucase.h"
     30 #include "ustr_imp.h"
     31 
     32 /* UCaseMap service object -------------------------------------------------- */
     33 
     34 U_CAPI UCaseMap * U_EXPORT2
     35 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
     36     UCaseMap *csm;
     37 
     38     if(U_FAILURE(*pErrorCode)) {
     39         return NULL;
     40     }
     41 
     42     csm=(UCaseMap *)uprv_malloc(sizeof(UCaseMap));
     43     if(csm==NULL) {
     44         return NULL;
     45     }
     46     uprv_memset(csm, 0, sizeof(UCaseMap));
     47 
     48     csm->csp=ucase_getSingleton();
     49     ucasemap_setLocale(csm, locale, pErrorCode);
     50     if(U_FAILURE(*pErrorCode)) {
     51         uprv_free(csm);
     52         return NULL;
     53     }
     54 
     55     csm->options=options;
     56     return csm;
     57 }
     58 
     59 U_CAPI void U_EXPORT2
     60 ucasemap_close(UCaseMap *csm) {
     61     if(csm!=NULL) {
     62 #if !UCONFIG_NO_BREAK_ITERATION
     63         ubrk_close(csm->iter);
     64 #endif
     65         uprv_free(csm);
     66     }
     67 }
     68 
     69 U_CAPI const char * U_EXPORT2
     70 ucasemap_getLocale(const UCaseMap *csm) {
     71     return csm->locale;
     72 }
     73 
     74 U_CAPI uint32_t U_EXPORT2
     75 ucasemap_getOptions(const UCaseMap *csm) {
     76     return csm->options;
     77 }
     78 
     79 U_CAPI void U_EXPORT2
     80 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
     81     int32_t length;
     82 
     83     if(U_FAILURE(*pErrorCode)) {
     84         return;
     85     }
     86 
     87     length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
     88     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
     89         *pErrorCode=U_ZERO_ERROR;
     90         /* we only really need the language code for case mappings */
     91         length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
     92     }
     93     if(length==sizeof(csm->locale)) {
     94         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
     95     }
     96     csm->locCache=0;
     97     if(U_SUCCESS(*pErrorCode)) {
     98         ucase_getCaseLocale(csm->locale, &csm->locCache);
     99     } else {
    100         csm->locale[0]=0;
    101     }
    102 }
    103 
    104 U_CAPI void U_EXPORT2
    105 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
    106     csm->options=options;
    107 }
    108 
    109 #if !UCONFIG_NO_BREAK_ITERATION
    110 
    111 U_CAPI const UBreakIterator * U_EXPORT2
    112 ucasemap_getBreakIterator(const UCaseMap *csm) {
    113     return csm->iter;
    114 }
    115 
    116 U_CAPI void U_EXPORT2
    117 ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode *pErrorCode) {
    118     ubrk_close(csm->iter);
    119     csm->iter=iterToAdopt;
    120 }
    121 
    122 #endif
    123 
    124 /* UTF-8 string case mappings ----------------------------------------------- */
    125 
    126 /* TODO(markus): Move to a new, separate utf8case.c file. */
    127 
    128 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
    129 static U_INLINE int32_t
    130 appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity,
    131              int32_t result, const UChar *s) {
    132     UChar32 c;
    133     int32_t length, destLength;
    134     UErrorCode errorCode;
    135 
    136     /* decode the result */
    137     if(result<0) {
    138         /* (not) original code point */
    139         c=~result;
    140         length=-1;
    141     } else if(result<=UCASE_MAX_STRING_LENGTH) {
    142         c=U_SENTINEL;
    143         length=result;
    144     } else {
    145         c=result;
    146         length=-1;
    147     }
    148 
    149     if(destIndex<destCapacity) {
    150         /* append the result */
    151         if(length<0) {
    152             /* code point */
    153             UBool isError=FALSE;
    154             U8_APPEND(dest, destIndex, destCapacity, c, isError);
    155             if(isError) {
    156                 /* overflow, nothing written */
    157                 destIndex+=U8_LENGTH(c);
    158             }
    159         } else {
    160             /* string */
    161             errorCode=U_ZERO_ERROR;
    162             u_strToUTF8(
    163                 (char *)(dest+destIndex), destCapacity-destIndex, &destLength,
    164                 s, length,
    165                 &errorCode);
    166             destIndex+=destLength;
    167             /* we might have an overflow, but we know the actual length */
    168         }
    169     } else {
    170         /* preflight */
    171         if(length<0) {
    172             destIndex+=U8_LENGTH(c);
    173         } else {
    174             errorCode=U_ZERO_ERROR;
    175             u_strToUTF8(
    176                 NULL, 0, &destLength,
    177                 s, length,
    178                 &errorCode);
    179             destIndex+=destLength;
    180         }
    181     }
    182     return destIndex;
    183 }
    184 
    185 static UChar32 U_CALLCONV
    186 utf8_caseContextIterator(void *context, int8_t dir) {
    187     UCaseContext *csc=(UCaseContext *)context;
    188     UChar32 c;
    189 
    190     if(dir<0) {
    191         /* reset for backward iteration */
    192         csc->index=csc->cpStart;
    193         csc->dir=dir;
    194     } else if(dir>0) {
    195         /* reset for forward iteration */
    196         csc->index=csc->cpLimit;
    197         csc->dir=dir;
    198     } else {
    199         /* continue current iteration direction */
    200         dir=csc->dir;
    201     }
    202 
    203     if(dir<0) {
    204         if(csc->start<csc->index) {
    205             U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
    206             return c;
    207         }
    208     } else {
    209         if(csc->index<csc->limit) {
    210             U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
    211             return c;
    212         }
    213     }
    214     return U_SENTINEL;
    215 }
    216 
    217 /*
    218  * Case-maps [srcStart..srcLimit[ but takes
    219  * context [0..srcLength[ into account.
    220  */
    221 static int32_t
    222 _caseMap(const UCaseMap *csm, UCaseMapFull *map,
    223          uint8_t *dest, int32_t destCapacity,
    224          const uint8_t *src, UCaseContext *csc,
    225          int32_t srcStart, int32_t srcLimit,
    226          UErrorCode *pErrorCode) {
    227     const UChar *s;
    228     UChar32 c, c2 = 0;
    229     int32_t srcIndex, destIndex;
    230     int32_t locCache;
    231 
    232     locCache=csm->locCache;
    233 
    234     /* case mapping loop */
    235     srcIndex=srcStart;
    236     destIndex=0;
    237     while(srcIndex<srcLimit) {
    238         csc->cpStart=srcIndex;
    239         U8_NEXT(src, srcIndex, srcLimit, c);
    240         csc->cpLimit=srcIndex;
    241         if(c<0) {
    242             int32_t i=csc->cpStart;
    243             while(destIndex<destCapacity && i<srcIndex) {
    244                 dest[destIndex++]=src[i++];
    245             }
    246             continue;
    247         }
    248         c=map(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &locCache);
    249         if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) {
    250             /* fast path version of appendResult() for ASCII results */
    251             dest[destIndex++]=(uint8_t)c2;
    252         } else {
    253             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
    254         }
    255     }
    256 
    257     if(destIndex>destCapacity) {
    258         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    259     }
    260     return destIndex;
    261 }
    262 
    263 #if !UCONFIG_NO_BREAK_ITERATION
    264 
    265 /*
    266  * Internal titlecasing function.
    267  */
    268 static int32_t
    269 _toTitle(UCaseMap *csm,
    270          uint8_t *dest, int32_t destCapacity,
    271          const uint8_t *src, UCaseContext *csc,
    272          int32_t srcLength,
    273          UErrorCode *pErrorCode) {
    274     UText utext=UTEXT_INITIALIZER;
    275     const UChar *s;
    276     UChar32 c;
    277     int32_t prev, titleStart, titleLimit, idx, destIndex, length;
    278     UBool isFirstIndex;
    279 
    280     utext_openUTF8(&utext, (const char *)src, srcLength, pErrorCode);
    281     if(U_FAILURE(*pErrorCode)) {
    282         return 0;
    283     }
    284     if(csm->iter==NULL) {
    285         csm->iter=ubrk_open(UBRK_WORD, csm->locale,
    286                             NULL, 0,
    287                             pErrorCode);
    288     }
    289     ubrk_setUText(csm->iter, &utext, pErrorCode);
    290     if(U_FAILURE(*pErrorCode)) {
    291         utext_close(&utext);
    292         return 0;
    293     }
    294 
    295     /* set up local variables */
    296     destIndex=0;
    297     prev=0;
    298     isFirstIndex=TRUE;
    299 
    300     /* titlecasing loop */
    301     while(prev<srcLength) {
    302         /* find next index where to titlecase */
    303         if(isFirstIndex) {
    304             isFirstIndex=FALSE;
    305             idx=ubrk_first(csm->iter);
    306         } else {
    307             idx=ubrk_next(csm->iter);
    308         }
    309         if(idx==UBRK_DONE || idx>srcLength) {
    310             idx=srcLength;
    311         }
    312 
    313         /*
    314          * Unicode 4 & 5 section 3.13 Default Case Operations:
    315          *
    316          * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
    317          * #29, "Text Boundaries." Between each pair of word boundaries, find the first
    318          * cased character F. If F exists, map F to default_title(F); then map each
    319          * subsequent character C to default_lower(C).
    320          *
    321          * In this implementation, segment [prev..index[ into 3 parts:
    322          * a) uncased characters (copy as-is) [prev..titleStart[
    323          * b) first case letter (titlecase)         [titleStart..titleLimit[
    324          * c) subsequent characters (lowercase)                 [titleLimit..index[
    325          */
    326         if(prev<idx) {
    327             /* find and copy uncased characters [prev..titleStart[ */
    328             titleStart=titleLimit=prev;
    329             U8_NEXT(src, titleLimit, idx, c);
    330             if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
    331                 /* Adjust the titlecasing index (titleStart) to the next cased character. */
    332                 for(;;) {
    333                     titleStart=titleLimit;
    334                     if(titleLimit==idx) {
    335                         /*
    336                          * only uncased characters in [prev..index[
    337                          * stop with titleStart==titleLimit==index
    338                          */
    339                         break;
    340                     }
    341                     U8_NEXT(src, titleLimit, idx, c);
    342                     if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
    343                         break; /* cased letter at [titleStart..titleLimit[ */
    344                     }
    345                 }
    346                 length=titleStart-prev;
    347                 if(length>0) {
    348                     if((destIndex+length)<=destCapacity) {
    349                         uprv_memcpy(dest+destIndex, src+prev, length);
    350                     }
    351                     destIndex+=length;
    352                 }
    353             }
    354 
    355             if(titleStart<titleLimit) {
    356                 /* titlecase c which is from [titleStart..titleLimit[ */
    357                 csc->cpStart=titleStart;
    358                 csc->cpLimit=titleLimit;
    359                 c=ucase_toFullTitle(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &csm->locCache);
    360                 destIndex=appendResult(dest, destIndex, destCapacity, c, s);
    361 
    362 
    363                 /* Special case Dutch IJ titlecasing */
    364                 if ( titleStart+1 < idx &&
    365                      ucase_getCaseLocale(csm->locale,&csm->locCache) == UCASE_LOC_DUTCH &&
    366                      ( src[titleStart] == 0x0049 || src[titleStart] == 0x0069 ) &&
    367                      ( src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A )) {
    368                             c=0x004A;
    369                             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
    370                             titleLimit++;
    371                 }
    372                 /* lowercase [titleLimit..index[ */
    373                 if(titleLimit<idx) {
    374                     if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
    375                         /* Normal operation: Lowercase the rest of the word. */
    376                         destIndex+=
    377                             _caseMap(
    378                                 csm, ucase_toFullLower,
    379                                 dest+destIndex, destCapacity-destIndex,
    380                                 src, csc,
    381                                 titleLimit, idx,
    382                                 pErrorCode);
    383                     } else {
    384                         /* Optionally just copy the rest of the word unchanged. */
    385                         length=idx-titleLimit;
    386                         if((destIndex+length)<=destCapacity) {
    387                             uprv_memcpy(dest+destIndex, src+titleLimit, length);
    388                         }
    389                         destIndex+=length;
    390                     }
    391                 }
    392             }
    393         }
    394 
    395         prev=idx;
    396     }
    397 
    398     if(destIndex>destCapacity) {
    399         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    400     }
    401     utext_close(&utext);
    402     return destIndex;
    403 }
    404 
    405 #endif
    406 
    407 static int32_t
    408 utf8_foldCase(const UCaseProps *csp,
    409               uint8_t *dest, int32_t destCapacity,
    410               const uint8_t *src, int32_t srcLength,
    411               uint32_t options,
    412               UErrorCode *pErrorCode) {
    413     int32_t srcIndex, destIndex;
    414 
    415     const UChar *s;
    416     UChar32 c, c2;
    417     int32_t start;
    418 
    419     /* case mapping loop */
    420     srcIndex=destIndex=0;
    421     while(srcIndex<srcLength) {
    422         start=srcIndex;
    423         U8_NEXT(src, srcIndex, srcLength, c);
    424         if(c<0) {
    425             while(destIndex<destCapacity && start<srcIndex) {
    426                 dest[destIndex++]=src[start++];
    427             }
    428             continue;
    429         }
    430         c=ucase_toFullFolding(csp, c, &s, options);
    431         if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) {
    432             /* fast path version of appendResult() for ASCII results */
    433             dest[destIndex++]=(uint8_t)c2;
    434         } else {
    435             destIndex=appendResult(dest, destIndex, destCapacity, c, s);
    436         }
    437     }
    438 
    439     if(destIndex>destCapacity) {
    440         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    441     }
    442     return destIndex;
    443 }
    444 
    445 /*
    446  * Implement argument checking and buffer handling
    447  * for string case mapping as a common function.
    448  */
    449 
    450 /* common internal function for public API functions */
    451 
    452 static int32_t
    453 caseMap(const UCaseMap *csm,
    454         uint8_t *dest, int32_t destCapacity,
    455         const uint8_t *src, int32_t srcLength,
    456         int32_t toWhichCase,
    457         UErrorCode *pErrorCode) {
    458     int32_t destLength;
    459 
    460     /* check argument values */
    461     if(U_FAILURE(*pErrorCode)) {
    462         return 0;
    463     }
    464     if( destCapacity<0 ||
    465         (dest==NULL && destCapacity>0) ||
    466         src==NULL ||
    467         srcLength<-1
    468     ) {
    469         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    470         return 0;
    471     }
    472 
    473     /* get the string length */
    474     if(srcLength==-1) {
    475         srcLength=(int32_t)uprv_strlen((const char *)src);
    476     }
    477 
    478     /* check for overlapping source and destination */
    479     if( dest!=NULL &&
    480         ((src>=dest && src<(dest+destCapacity)) ||
    481          (dest>=src && dest<(src+srcLength)))
    482     ) {
    483         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    484         return 0;
    485     }
    486 
    487     destLength=0;
    488 
    489     if(toWhichCase==FOLD_CASE) {
    490         destLength=utf8_foldCase(csm->csp, dest, destCapacity, src, srcLength,
    491                                  csm->options, pErrorCode);
    492     } else {
    493         UCaseContext csc={ NULL };
    494 
    495         csc.p=(void *)src;
    496         csc.limit=srcLength;
    497 
    498         if(toWhichCase==TO_LOWER) {
    499             destLength=_caseMap(csm, ucase_toFullLower,
    500                                 dest, destCapacity,
    501                                 src, &csc,
    502                                 0, srcLength,
    503                                 pErrorCode);
    504         } else if(toWhichCase==TO_UPPER) {
    505             destLength=_caseMap(csm, ucase_toFullUpper,
    506                                 dest, destCapacity,
    507                                 src, &csc,
    508                                 0, srcLength,
    509                                 pErrorCode);
    510         } else /* if(toWhichCase==TO_TITLE) */ {
    511 #if UCONFIG_NO_BREAK_ITERATION
    512             *pErrorCode=U_UNSUPPORTED_ERROR;
    513 #else
    514             /* UCaseMap is actually non-const in toTitle() APIs. */
    515             UCaseMap *tmp = (UCaseMap *)csm;
    516             destLength=_toTitle(tmp, dest, destCapacity,
    517                                 src, &csc, srcLength,
    518                                 pErrorCode);
    519 #endif
    520         }
    521     }
    522 
    523     return u_terminateChars((char *)dest, destCapacity, destLength, pErrorCode);
    524 }
    525 
    526 /* public API functions */
    527 
    528 U_CAPI int32_t U_EXPORT2
    529 ucasemap_utf8ToLower(const UCaseMap *csm,
    530                      char *dest, int32_t destCapacity,
    531                      const char *src, int32_t srcLength,
    532                      UErrorCode *pErrorCode) {
    533     return caseMap(csm,
    534                    (uint8_t *)dest, destCapacity,
    535                    (const uint8_t *)src, srcLength,
    536                    TO_LOWER, pErrorCode);
    537 }
    538 
    539 U_CAPI int32_t U_EXPORT2
    540 ucasemap_utf8ToUpper(const UCaseMap *csm,
    541                      char *dest, int32_t destCapacity,
    542                      const char *src, int32_t srcLength,
    543                      UErrorCode *pErrorCode) {
    544     return caseMap(csm,
    545                    (uint8_t *)dest, destCapacity,
    546                    (const uint8_t *)src, srcLength,
    547                    TO_UPPER, pErrorCode);
    548 }
    549 
    550 #if !UCONFIG_NO_BREAK_ITERATION
    551 
    552 U_CAPI int32_t U_EXPORT2
    553 ucasemap_utf8ToTitle(UCaseMap *csm,
    554                      char *dest, int32_t destCapacity,
    555                      const char *src, int32_t srcLength,
    556                      UErrorCode *pErrorCode) {
    557     return caseMap(csm,
    558                    (uint8_t *)dest, destCapacity,
    559                    (const uint8_t *)src, srcLength,
    560                    TO_TITLE, pErrorCode);
    561 }
    562 
    563 #endif
    564 
    565 U_CAPI int32_t U_EXPORT2
    566 ucasemap_utf8FoldCase(const UCaseMap *csm,
    567                       char *dest, int32_t destCapacity,
    568                       const char *src, int32_t srcLength,
    569                       UErrorCode *pErrorCode) {
    570     return caseMap(csm,
    571                    (uint8_t *)dest, destCapacity,
    572                    (const uint8_t *)src, srcLength,
    573                    FOLD_CASE, pErrorCode);
    574 }
    575