Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 1999-2009, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  unistr_cnv.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:2
     12 *
     13 *   created on: 2004aug19
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Character conversion functions moved here from unistr.cpp
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 
     21 #if !UCONFIG_NO_CONVERSION
     22 
     23 #include "unicode/putil.h"
     24 #include "cstring.h"
     25 #include "cmemory.h"
     26 #include "unicode/ustring.h"
     27 #include "unicode/unistr.h"
     28 #include "unicode/ucnv.h"
     29 #include "ucnv_imp.h"
     30 #include "putilimp.h"
     31 #include "ustr_cnv.h"
     32 #include "ustr_imp.h"
     33 
     34 U_NAMESPACE_BEGIN
     35 
     36 //========================================
     37 // Constructors
     38 //========================================
     39 
     40 #if !U_CHARSET_IS_UTF8
     41 
     42 UnicodeString::UnicodeString(const char *codepageData)
     43   : fShortLength(0),
     44     fFlags(kShortString)
     45 {
     46     if(codepageData != 0) {
     47         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
     48     }
     49 }
     50 
     51 UnicodeString::UnicodeString(const char *codepageData,
     52                              int32_t dataLength)
     53   : fShortLength(0),
     54     fFlags(kShortString)
     55 {
     56     if(codepageData != 0) {
     57         doCodepageCreate(codepageData, dataLength, 0);
     58     }
     59 }
     60 
     61 // else see unistr.cpp
     62 #endif
     63 
     64 UnicodeString::UnicodeString(const char *codepageData,
     65                              const char *codepage)
     66   : fShortLength(0),
     67     fFlags(kShortString)
     68 {
     69     if(codepageData != 0) {
     70         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
     71     }
     72 }
     73 
     74 UnicodeString::UnicodeString(const char *codepageData,
     75                              int32_t dataLength,
     76                              const char *codepage)
     77   : fShortLength(0),
     78     fFlags(kShortString)
     79 {
     80     if(codepageData != 0) {
     81         doCodepageCreate(codepageData, dataLength, codepage);
     82     }
     83 }
     84 
     85 UnicodeString::UnicodeString(const char *src, int32_t srcLength,
     86                              UConverter *cnv,
     87                              UErrorCode &errorCode)
     88   : fShortLength(0),
     89     fFlags(kShortString)
     90 {
     91     if(U_SUCCESS(errorCode)) {
     92         // check arguments
     93         if(src==NULL) {
     94             // treat as an empty string, do nothing more
     95         } else if(srcLength<-1) {
     96             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
     97         } else {
     98             // get input length
     99             if(srcLength==-1) {
    100                 srcLength=(int32_t)uprv_strlen(src);
    101             }
    102             if(srcLength>0) {
    103                 if(cnv!=0) {
    104                     // use the provided converter
    105                     ucnv_resetToUnicode(cnv);
    106                     doCodepageCreate(src, srcLength, cnv, errorCode);
    107                 } else {
    108                     // use the default converter
    109                     cnv=u_getDefaultConverter(&errorCode);
    110                     doCodepageCreate(src, srcLength, cnv, errorCode);
    111                     u_releaseDefaultConverter(cnv);
    112                 }
    113             }
    114         }
    115 
    116         if(U_FAILURE(errorCode)) {
    117             setToBogus();
    118         }
    119     }
    120 }
    121 
    122 //========================================
    123 // Codeset conversion
    124 //========================================
    125 
    126 #if !U_CHARSET_IS_UTF8
    127 
    128 int32_t
    129 UnicodeString::extract(int32_t start,
    130                        int32_t length,
    131                        char *target,
    132                        uint32_t dstSize) const {
    133     return extract(start, length, target, dstSize, 0);
    134 }
    135 
    136 // else see unistr.cpp
    137 #endif
    138 
    139 int32_t
    140 UnicodeString::extract(int32_t start,
    141                        int32_t length,
    142                        char *target,
    143                        uint32_t dstSize,
    144                        const char *codepage) const
    145 {
    146     // if the arguments are illegal, then do nothing
    147     if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
    148         return 0;
    149     }
    150 
    151     // pin the indices to legal values
    152     pinIndices(start, length);
    153 
    154     // We need to cast dstSize to int32_t for all subsequent code.
    155     // I don't know why the API was defined with uint32_t but we are stuck with it.
    156     // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
    157     // as a limit in some functions, it may wrap around and yield a pointer
    158     // that compares less-than target.
    159     int32_t capacity;
    160     if(dstSize < 0x7fffffff) {
    161         // Assume that the capacity is real and a limit pointer won't wrap around.
    162         capacity = (int32_t)dstSize;
    163     } else {
    164         char *targetLimit = target + 0x7fffffff;
    165         if(targetLimit < target) {
    166             // Pin the capacity so that a limit pointer does not wrap around.
    167             targetLimit = (char *)U_MAX_PTR(target);
    168             capacity = (int32_t)(targetLimit - target);
    169         } else {
    170             // Pin the capacity to the maximum int32_t value.
    171             capacity = 0x7fffffff;
    172         }
    173     }
    174 
    175     // create the converter
    176     UConverter *converter;
    177     UErrorCode status = U_ZERO_ERROR;
    178 
    179     // just write the NUL if the string length is 0
    180     if(length == 0) {
    181         return u_terminateChars(target, capacity, 0, &status);
    182     }
    183 
    184     // if the codepage is the default, use our cache
    185     // if it is an empty string, then use the "invariant character" conversion
    186     if (codepage == 0) {
    187         const char *defaultName = ucnv_getDefaultName();
    188         if(UCNV_FAST_IS_UTF8(defaultName)) {
    189             return toUTF8(start, length, target, capacity);
    190         }
    191         converter = u_getDefaultConverter(&status);
    192     } else if (*codepage == 0) {
    193         // use the "invariant characters" conversion
    194         int32_t destLength;
    195         if(length <= capacity) {
    196             destLength = length;
    197         } else {
    198             destLength = capacity;
    199         }
    200         u_UCharsToChars(getArrayStart() + start, target, destLength);
    201         return u_terminateChars(target, capacity, length, &status);
    202     } else {
    203         converter = ucnv_open(codepage, &status);
    204     }
    205 
    206     length = doExtract(start, length, target, capacity, converter, status);
    207 
    208     // close the converter
    209     if (codepage == 0) {
    210         u_releaseDefaultConverter(converter);
    211     } else {
    212         ucnv_close(converter);
    213     }
    214 
    215     return length;
    216 }
    217 
    218 int32_t
    219 UnicodeString::extract(char *dest, int32_t destCapacity,
    220                        UConverter *cnv,
    221                        UErrorCode &errorCode) const
    222 {
    223     if(U_FAILURE(errorCode)) {
    224         return 0;
    225     }
    226 
    227     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
    228         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    229         return 0;
    230     }
    231 
    232     // nothing to do?
    233     if(isEmpty()) {
    234         return u_terminateChars(dest, destCapacity, 0, &errorCode);
    235     }
    236 
    237     // get the converter
    238     UBool isDefaultConverter;
    239     if(cnv==0) {
    240         isDefaultConverter=TRUE;
    241         cnv=u_getDefaultConverter(&errorCode);
    242         if(U_FAILURE(errorCode)) {
    243             return 0;
    244         }
    245     } else {
    246         isDefaultConverter=FALSE;
    247         ucnv_resetFromUnicode(cnv);
    248     }
    249 
    250     // convert
    251     int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
    252 
    253     // release the converter
    254     if(isDefaultConverter) {
    255         u_releaseDefaultConverter(cnv);
    256     }
    257 
    258     return len;
    259 }
    260 
    261 int32_t
    262 UnicodeString::doExtract(int32_t start, int32_t length,
    263                          char *dest, int32_t destCapacity,
    264                          UConverter *cnv,
    265                          UErrorCode &errorCode) const
    266 {
    267     if(U_FAILURE(errorCode)) {
    268         if(destCapacity!=0) {
    269             *dest=0;
    270         }
    271         return 0;
    272     }
    273 
    274     const UChar *src=getArrayStart()+start, *srcLimit=src+length;
    275     char *originalDest=dest;
    276     const char *destLimit;
    277 
    278     if(destCapacity==0) {
    279         destLimit=dest=0;
    280     } else if(destCapacity==-1) {
    281         // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
    282         destLimit=(char*)U_MAX_PTR(dest);
    283         // for NUL-termination, translate into highest int32_t
    284         destCapacity=0x7fffffff;
    285     } else {
    286         destLimit=dest+destCapacity;
    287     }
    288 
    289     // perform the conversion
    290     ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
    291     length=(int32_t)(dest-originalDest);
    292 
    293     // if an overflow occurs, then get the preflighting length
    294     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
    295         char buffer[1024];
    296 
    297         destLimit=buffer+sizeof(buffer);
    298         do {
    299             dest=buffer;
    300             errorCode=U_ZERO_ERROR;
    301             ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
    302             length+=(int32_t)(dest-buffer);
    303         } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
    304     }
    305 
    306     return u_terminateChars(originalDest, destCapacity, length, &errorCode);
    307 }
    308 
    309 void
    310 UnicodeString::doCodepageCreate(const char *codepageData,
    311                                 int32_t dataLength,
    312                                 const char *codepage)
    313 {
    314     // if there's nothing to convert, do nothing
    315     if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
    316         return;
    317     }
    318     if(dataLength == -1) {
    319         dataLength = (int32_t)uprv_strlen(codepageData);
    320     }
    321 
    322     UErrorCode status = U_ZERO_ERROR;
    323 
    324     // create the converter
    325     // if the codepage is the default, use our cache
    326     // if it is an empty string, then use the "invariant character" conversion
    327     UConverter *converter;
    328     if (codepage == 0) {
    329         const char *defaultName = ucnv_getDefaultName();
    330         if(UCNV_FAST_IS_UTF8(defaultName)) {
    331             setToUTF8(StringPiece(codepageData, dataLength));
    332             return;
    333         }
    334         converter = u_getDefaultConverter(&status);
    335     } else if(*codepage == 0) {
    336         // use the "invariant characters" conversion
    337         if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
    338             u_charsToUChars(codepageData, getArrayStart(), dataLength);
    339             setLength(dataLength);
    340         } else {
    341             setToBogus();
    342         }
    343         return;
    344     } else {
    345         converter = ucnv_open(codepage, &status);
    346     }
    347 
    348     // if we failed, set the appropriate flags and return
    349     if(U_FAILURE(status)) {
    350         setToBogus();
    351         return;
    352     }
    353 
    354     // perform the conversion
    355     doCodepageCreate(codepageData, dataLength, converter, status);
    356     if(U_FAILURE(status)) {
    357         setToBogus();
    358     }
    359 
    360     // close the converter
    361     if(codepage == 0) {
    362         u_releaseDefaultConverter(converter);
    363     } else {
    364         ucnv_close(converter);
    365     }
    366 }
    367 
    368 void
    369 UnicodeString::doCodepageCreate(const char *codepageData,
    370                                 int32_t dataLength,
    371                                 UConverter *converter,
    372                                 UErrorCode &status)
    373 {
    374     if(U_FAILURE(status)) {
    375         return;
    376     }
    377 
    378     // set up the conversion parameters
    379     const char *mySource     = codepageData;
    380     const char *mySourceEnd  = mySource + dataLength;
    381     UChar *array, *myTarget;
    382 
    383     // estimate the size needed:
    384     int32_t arraySize;
    385     if(dataLength <= US_STACKBUF_SIZE) {
    386         // try to use the stack buffer
    387         arraySize = US_STACKBUF_SIZE;
    388     } else {
    389         // 1.25 UChar's per source byte should cover most cases
    390         arraySize = dataLength + (dataLength >> 2);
    391     }
    392 
    393     // we do not care about the current contents
    394     UBool doCopyArray = FALSE;
    395     for(;;) {
    396         if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
    397             setToBogus();
    398             break;
    399         }
    400 
    401         // perform the conversion
    402         array = getArrayStart();
    403         myTarget = array + length();
    404         ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
    405             &mySource, mySourceEnd, 0, TRUE, &status);
    406 
    407         // update the conversion parameters
    408         setLength((int32_t)(myTarget - array));
    409 
    410         // allocate more space and copy data, if needed
    411         if(status == U_BUFFER_OVERFLOW_ERROR) {
    412             // reset the error code
    413             status = U_ZERO_ERROR;
    414 
    415             // keep the previous conversion results
    416             doCopyArray = TRUE;
    417 
    418             // estimate the new size needed, larger than before
    419             // try 2 UChar's per remaining source byte
    420             arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
    421         } else {
    422             break;
    423         }
    424     }
    425 }
    426 
    427 U_NAMESPACE_END
    428 
    429 #endif
    430