Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 1999-2010, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  unistr_cnv.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:2
     12 *
     13 *   created on: 2004aug19
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Character conversion functions moved here from unistr.cpp
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 
     21 #if !UCONFIG_NO_CONVERSION
     22 
     23 #include "unicode/putil.h"
     24 #include "cstring.h"
     25 #include "cmemory.h"
     26 #include "unicode/ustring.h"
     27 #include "unicode/unistr.h"
     28 #include "unicode/ucnv.h"
     29 #include "ucnv_imp.h"
     30 #include "putilimp.h"
     31 #include "ustr_cnv.h"
     32 #include "ustr_imp.h"
     33 
     34 U_NAMESPACE_BEGIN
     35 
     36 //========================================
     37 // Constructors
     38 //========================================
     39 
     40 #if !U_CHARSET_IS_UTF8
     41 
     42 UnicodeString::UnicodeString(const char *codepageData)
     43   : fShortLength(0),
     44     fFlags(kShortString)
     45 {
     46     if(codepageData != 0) {
     47         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
     48     }
     49 }
     50 
     51 UnicodeString::UnicodeString(const char *codepageData,
     52                              int32_t dataLength)
     53   : fShortLength(0),
     54     fFlags(kShortString)
     55 {
     56     if(codepageData != 0) {
     57         doCodepageCreate(codepageData, dataLength, 0);
     58     }
     59 }
     60 
     61 // else see unistr.cpp
     62 #endif
     63 
     64 UnicodeString::UnicodeString(const char *codepageData,
     65                              const char *codepage)
     66   : fShortLength(0),
     67     fFlags(kShortString)
     68 {
     69     if(codepageData != 0) {
     70         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
     71     }
     72 }
     73 
     74 UnicodeString::UnicodeString(const char *codepageData,
     75                              int32_t dataLength,
     76                              const char *codepage)
     77   : fShortLength(0),
     78     fFlags(kShortString)
     79 {
     80     if(codepageData != 0) {
     81         doCodepageCreate(codepageData, dataLength, codepage);
     82     }
     83 }
     84 
     85 UnicodeString::UnicodeString(const char *src, int32_t srcLength,
     86                              UConverter *cnv,
     87                              UErrorCode &errorCode)
     88   : fShortLength(0),
     89     fFlags(kShortString)
     90 {
     91     if(U_SUCCESS(errorCode)) {
     92         // check arguments
     93         if(src==NULL) {
     94             // treat as an empty string, do nothing more
     95         } else if(srcLength<-1) {
     96             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
     97         } else {
     98             // get input length
     99             if(srcLength==-1) {
    100                 srcLength=(int32_t)uprv_strlen(src);
    101             }
    102             if(srcLength>0) {
    103                 if(cnv!=0) {
    104                     // use the provided converter
    105                     ucnv_resetToUnicode(cnv);
    106                     doCodepageCreate(src, srcLength, cnv, errorCode);
    107                 } else {
    108                     // use the default converter
    109                     cnv=u_getDefaultConverter(&errorCode);
    110                     doCodepageCreate(src, srcLength, cnv, errorCode);
    111                     u_releaseDefaultConverter(cnv);
    112                 }
    113             }
    114         }
    115 
    116         if(U_FAILURE(errorCode)) {
    117             setToBogus();
    118         }
    119     }
    120 }
    121 
    122 //========================================
    123 // Codeset conversion
    124 //========================================
    125 
    126 #if !U_CHARSET_IS_UTF8
    127 
    128 int32_t
    129 UnicodeString::extract(int32_t start,
    130                        int32_t length,
    131                        char *target,
    132                        uint32_t dstSize) const {
    133     return extract(start, length, target, dstSize, 0);
    134 }
    135 
    136 // else see unistr.cpp
    137 #endif
    138 
    139 int32_t
    140 UnicodeString::extract(int32_t start,
    141                        int32_t length,
    142                        char *target,
    143                        uint32_t dstSize,
    144                        const char *codepage) const
    145 {
    146     // if the arguments are illegal, then do nothing
    147     if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
    148         return 0;
    149     }
    150 
    151     // pin the indices to legal values
    152     pinIndices(start, length);
    153 
    154     // We need to cast dstSize to int32_t for all subsequent code.
    155     // I don't know why the API was defined with uint32_t but we are stuck with it.
    156     // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
    157     // as a limit in some functions, it may wrap around and yield a pointer
    158     // that compares less-than target.
    159     int32_t capacity;
    160     if(dstSize < 0x7fffffff) {
    161         // Assume that the capacity is real and a limit pointer won't wrap around.
    162         capacity = (int32_t)dstSize;
    163     } else {
    164         // Pin the capacity so that a limit pointer does not wrap around.
    165         char *targetLimit = (char *)U_MAX_PTR(target);
    166         // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
    167         // greater than target and does not wrap around the top of the address space.
    168         capacity = (int32_t)(targetLimit - target);
    169     }
    170 
    171     // create the converter
    172     UConverter *converter;
    173     UErrorCode status = U_ZERO_ERROR;
    174 
    175     // just write the NUL if the string length is 0
    176     if(length == 0) {
    177         return u_terminateChars(target, capacity, 0, &status);
    178     }
    179 
    180     // if the codepage is the default, use our cache
    181     // if it is an empty string, then use the "invariant character" conversion
    182     if (codepage == 0) {
    183         const char *defaultName = ucnv_getDefaultName();
    184         if(UCNV_FAST_IS_UTF8(defaultName)) {
    185             return toUTF8(start, length, target, capacity);
    186         }
    187         converter = u_getDefaultConverter(&status);
    188     } else if (*codepage == 0) {
    189         // use the "invariant characters" conversion
    190         int32_t destLength;
    191         if(length <= capacity) {
    192             destLength = length;
    193         } else {
    194             destLength = capacity;
    195         }
    196         u_UCharsToChars(getArrayStart() + start, target, destLength);
    197         return u_terminateChars(target, capacity, length, &status);
    198     } else {
    199         converter = ucnv_open(codepage, &status);
    200     }
    201 
    202     length = doExtract(start, length, target, capacity, converter, status);
    203 
    204     // close the converter
    205     if (codepage == 0) {
    206         u_releaseDefaultConverter(converter);
    207     } else {
    208         ucnv_close(converter);
    209     }
    210 
    211     return length;
    212 }
    213 
    214 int32_t
    215 UnicodeString::extract(char *dest, int32_t destCapacity,
    216                        UConverter *cnv,
    217                        UErrorCode &errorCode) const
    218 {
    219     if(U_FAILURE(errorCode)) {
    220         return 0;
    221     }
    222 
    223     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
    224         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    225         return 0;
    226     }
    227 
    228     // nothing to do?
    229     if(isEmpty()) {
    230         return u_terminateChars(dest, destCapacity, 0, &errorCode);
    231     }
    232 
    233     // get the converter
    234     UBool isDefaultConverter;
    235     if(cnv==0) {
    236         isDefaultConverter=TRUE;
    237         cnv=u_getDefaultConverter(&errorCode);
    238         if(U_FAILURE(errorCode)) {
    239             return 0;
    240         }
    241     } else {
    242         isDefaultConverter=FALSE;
    243         ucnv_resetFromUnicode(cnv);
    244     }
    245 
    246     // convert
    247     int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
    248 
    249     // release the converter
    250     if(isDefaultConverter) {
    251         u_releaseDefaultConverter(cnv);
    252     }
    253 
    254     return len;
    255 }
    256 
    257 int32_t
    258 UnicodeString::doExtract(int32_t start, int32_t length,
    259                          char *dest, int32_t destCapacity,
    260                          UConverter *cnv,
    261                          UErrorCode &errorCode) const
    262 {
    263     if(U_FAILURE(errorCode)) {
    264         if(destCapacity!=0) {
    265             *dest=0;
    266         }
    267         return 0;
    268     }
    269 
    270     const UChar *src=getArrayStart()+start, *srcLimit=src+length;
    271     char *originalDest=dest;
    272     const char *destLimit;
    273 
    274     if(destCapacity==0) {
    275         destLimit=dest=0;
    276     } else if(destCapacity==-1) {
    277         // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
    278         destLimit=(char*)U_MAX_PTR(dest);
    279         // for NUL-termination, translate into highest int32_t
    280         destCapacity=0x7fffffff;
    281     } else {
    282         destLimit=dest+destCapacity;
    283     }
    284 
    285     // perform the conversion
    286     ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
    287     length=(int32_t)(dest-originalDest);
    288 
    289     // if an overflow occurs, then get the preflighting length
    290     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
    291         char buffer[1024];
    292 
    293         destLimit=buffer+sizeof(buffer);
    294         do {
    295             dest=buffer;
    296             errorCode=U_ZERO_ERROR;
    297             ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
    298             length+=(int32_t)(dest-buffer);
    299         } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
    300     }
    301 
    302     return u_terminateChars(originalDest, destCapacity, length, &errorCode);
    303 }
    304 
    305 void
    306 UnicodeString::doCodepageCreate(const char *codepageData,
    307                                 int32_t dataLength,
    308                                 const char *codepage)
    309 {
    310     // if there's nothing to convert, do nothing
    311     if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
    312         return;
    313     }
    314     if(dataLength == -1) {
    315         dataLength = (int32_t)uprv_strlen(codepageData);
    316     }
    317 
    318     UErrorCode status = U_ZERO_ERROR;
    319 
    320     // create the converter
    321     // if the codepage is the default, use our cache
    322     // if it is an empty string, then use the "invariant character" conversion
    323     UConverter *converter;
    324     if (codepage == 0) {
    325         const char *defaultName = ucnv_getDefaultName();
    326         if(UCNV_FAST_IS_UTF8(defaultName)) {
    327             setToUTF8(StringPiece(codepageData, dataLength));
    328             return;
    329         }
    330         converter = u_getDefaultConverter(&status);
    331     } else if(*codepage == 0) {
    332         // use the "invariant characters" conversion
    333         if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
    334             u_charsToUChars(codepageData, getArrayStart(), dataLength);
    335             setLength(dataLength);
    336         } else {
    337             setToBogus();
    338         }
    339         return;
    340     } else {
    341         converter = ucnv_open(codepage, &status);
    342     }
    343 
    344     // if we failed, set the appropriate flags and return
    345     if(U_FAILURE(status)) {
    346         setToBogus();
    347         return;
    348     }
    349 
    350     // perform the conversion
    351     doCodepageCreate(codepageData, dataLength, converter, status);
    352     if(U_FAILURE(status)) {
    353         setToBogus();
    354     }
    355 
    356     // close the converter
    357     if(codepage == 0) {
    358         u_releaseDefaultConverter(converter);
    359     } else {
    360         ucnv_close(converter);
    361     }
    362 }
    363 
    364 void
    365 UnicodeString::doCodepageCreate(const char *codepageData,
    366                                 int32_t dataLength,
    367                                 UConverter *converter,
    368                                 UErrorCode &status)
    369 {
    370     if(U_FAILURE(status)) {
    371         return;
    372     }
    373 
    374     // set up the conversion parameters
    375     const char *mySource     = codepageData;
    376     const char *mySourceEnd  = mySource + dataLength;
    377     UChar *array, *myTarget;
    378 
    379     // estimate the size needed:
    380     int32_t arraySize;
    381     if(dataLength <= US_STACKBUF_SIZE) {
    382         // try to use the stack buffer
    383         arraySize = US_STACKBUF_SIZE;
    384     } else {
    385         // 1.25 UChar's per source byte should cover most cases
    386         arraySize = dataLength + (dataLength >> 2);
    387     }
    388 
    389     // we do not care about the current contents
    390     UBool doCopyArray = FALSE;
    391     for(;;) {
    392         if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
    393             setToBogus();
    394             break;
    395         }
    396 
    397         // perform the conversion
    398         array = getArrayStart();
    399         myTarget = array + length();
    400         ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
    401             &mySource, mySourceEnd, 0, TRUE, &status);
    402 
    403         // update the conversion parameters
    404         setLength((int32_t)(myTarget - array));
    405 
    406         // allocate more space and copy data, if needed
    407         if(status == U_BUFFER_OVERFLOW_ERROR) {
    408             // reset the error code
    409             status = U_ZERO_ERROR;
    410 
    411             // keep the previous conversion results
    412             doCopyArray = TRUE;
    413 
    414             // estimate the new size needed, larger than before
    415             // try 2 UChar's per remaining source byte
    416             arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
    417         } else {
    418             break;
    419         }
    420     }
    421 }
    422 
    423 U_NAMESPACE_END
    424 
    425 #endif
    426