Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 1999-2014, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  unistr_cnv.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:2
     12 *
     13 *   created on: 2004aug19
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Character conversion functions moved here from unistr.cpp
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 
     21 #if !UCONFIG_NO_CONVERSION
     22 
     23 #include "unicode/putil.h"
     24 #include "cstring.h"
     25 #include "cmemory.h"
     26 #include "unicode/ustring.h"
     27 #include "unicode/unistr.h"
     28 #include "unicode/ucnv.h"
     29 #include "ucnv_imp.h"
     30 #include "putilimp.h"
     31 #include "ustr_cnv.h"
     32 #include "ustr_imp.h"
     33 
     34 U_NAMESPACE_BEGIN
     35 
     36 //========================================
     37 // Constructors
     38 //========================================
     39 
     40 #if !U_CHARSET_IS_UTF8
     41 
     42 UnicodeString::UnicodeString(const char *codepageData) {
     43     fUnion.fFields.fLengthAndFlags = kShortString;
     44     if(codepageData != 0) {
     45         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
     46     }
     47 }
     48 
     49 UnicodeString::UnicodeString(const char *codepageData,
     50                              int32_t dataLength) {
     51     fUnion.fFields.fLengthAndFlags = kShortString;
     52     if(codepageData != 0) {
     53         doCodepageCreate(codepageData, dataLength, 0);
     54     }
     55 }
     56 
     57 // else see unistr.cpp
     58 #endif
     59 
     60 UnicodeString::UnicodeString(const char *codepageData,
     61                              const char *codepage) {
     62     fUnion.fFields.fLengthAndFlags = kShortString;
     63     if(codepageData != 0) {
     64         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
     65     }
     66 }
     67 
     68 UnicodeString::UnicodeString(const char *codepageData,
     69                              int32_t dataLength,
     70                              const char *codepage) {
     71     fUnion.fFields.fLengthAndFlags = kShortString;
     72     if(codepageData != 0) {
     73         doCodepageCreate(codepageData, dataLength, codepage);
     74     }
     75 }
     76 
     77 UnicodeString::UnicodeString(const char *src, int32_t srcLength,
     78                              UConverter *cnv,
     79                              UErrorCode &errorCode) {
     80     fUnion.fFields.fLengthAndFlags = kShortString;
     81     if(U_SUCCESS(errorCode)) {
     82         // check arguments
     83         if(src==NULL) {
     84             // treat as an empty string, do nothing more
     85         } else if(srcLength<-1) {
     86             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
     87         } else {
     88             // get input length
     89             if(srcLength==-1) {
     90                 srcLength=(int32_t)uprv_strlen(src);
     91             }
     92             if(srcLength>0) {
     93                 if(cnv!=0) {
     94                     // use the provided converter
     95                     ucnv_resetToUnicode(cnv);
     96                     doCodepageCreate(src, srcLength, cnv, errorCode);
     97                 } else {
     98                     // use the default converter
     99                     cnv=u_getDefaultConverter(&errorCode);
    100                     doCodepageCreate(src, srcLength, cnv, errorCode);
    101                     u_releaseDefaultConverter(cnv);
    102                 }
    103             }
    104         }
    105 
    106         if(U_FAILURE(errorCode)) {
    107             setToBogus();
    108         }
    109     }
    110 }
    111 
    112 //========================================
    113 // Codeset conversion
    114 //========================================
    115 
    116 #if !U_CHARSET_IS_UTF8
    117 
    118 int32_t
    119 UnicodeString::extract(int32_t start,
    120                        int32_t length,
    121                        char *target,
    122                        uint32_t dstSize) const {
    123     return extract(start, length, target, dstSize, 0);
    124 }
    125 
    126 // else see unistr.cpp
    127 #endif
    128 
    129 int32_t
    130 UnicodeString::extract(int32_t start,
    131                        int32_t length,
    132                        char *target,
    133                        uint32_t dstSize,
    134                        const char *codepage) const
    135 {
    136     // if the arguments are illegal, then do nothing
    137     if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
    138         return 0;
    139     }
    140 
    141     // pin the indices to legal values
    142     pinIndices(start, length);
    143 
    144     // We need to cast dstSize to int32_t for all subsequent code.
    145     // I don't know why the API was defined with uint32_t but we are stuck with it.
    146     // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
    147     // as a limit in some functions, it may wrap around and yield a pointer
    148     // that compares less-than target.
    149     int32_t capacity;
    150     if(dstSize < 0x7fffffff) {
    151         // Assume that the capacity is real and a limit pointer won't wrap around.
    152         capacity = (int32_t)dstSize;
    153     } else {
    154         // Pin the capacity so that a limit pointer does not wrap around.
    155         char *targetLimit = (char *)U_MAX_PTR(target);
    156         // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
    157         // greater than target and does not wrap around the top of the address space.
    158         capacity = (int32_t)(targetLimit - target);
    159     }
    160 
    161     // create the converter
    162     UConverter *converter;
    163     UErrorCode status = U_ZERO_ERROR;
    164 
    165     // just write the NUL if the string length is 0
    166     if(length == 0) {
    167         return u_terminateChars(target, capacity, 0, &status);
    168     }
    169 
    170     // if the codepage is the default, use our cache
    171     // if it is an empty string, then use the "invariant character" conversion
    172     if (codepage == 0) {
    173         const char *defaultName = ucnv_getDefaultName();
    174         if(UCNV_FAST_IS_UTF8(defaultName)) {
    175             return toUTF8(start, length, target, capacity);
    176         }
    177         converter = u_getDefaultConverter(&status);
    178     } else if (*codepage == 0) {
    179         // use the "invariant characters" conversion
    180         int32_t destLength;
    181         if(length <= capacity) {
    182             destLength = length;
    183         } else {
    184             destLength = capacity;
    185         }
    186         u_UCharsToChars(getArrayStart() + start, target, destLength);
    187         return u_terminateChars(target, capacity, length, &status);
    188     } else {
    189         converter = ucnv_open(codepage, &status);
    190     }
    191 
    192     length = doExtract(start, length, target, capacity, converter, status);
    193 
    194     // close the converter
    195     if (codepage == 0) {
    196         u_releaseDefaultConverter(converter);
    197     } else {
    198         ucnv_close(converter);
    199     }
    200 
    201     return length;
    202 }
    203 
    204 int32_t
    205 UnicodeString::extract(char *dest, int32_t destCapacity,
    206                        UConverter *cnv,
    207                        UErrorCode &errorCode) const
    208 {
    209     if(U_FAILURE(errorCode)) {
    210         return 0;
    211     }
    212 
    213     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
    214         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    215         return 0;
    216     }
    217 
    218     // nothing to do?
    219     if(isEmpty()) {
    220         return u_terminateChars(dest, destCapacity, 0, &errorCode);
    221     }
    222 
    223     // get the converter
    224     UBool isDefaultConverter;
    225     if(cnv==0) {
    226         isDefaultConverter=TRUE;
    227         cnv=u_getDefaultConverter(&errorCode);
    228         if(U_FAILURE(errorCode)) {
    229             return 0;
    230         }
    231     } else {
    232         isDefaultConverter=FALSE;
    233         ucnv_resetFromUnicode(cnv);
    234     }
    235 
    236     // convert
    237     int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
    238 
    239     // release the converter
    240     if(isDefaultConverter) {
    241         u_releaseDefaultConverter(cnv);
    242     }
    243 
    244     return len;
    245 }
    246 
    247 int32_t
    248 UnicodeString::doExtract(int32_t start, int32_t length,
    249                          char *dest, int32_t destCapacity,
    250                          UConverter *cnv,
    251                          UErrorCode &errorCode) const
    252 {
    253     if(U_FAILURE(errorCode)) {
    254         if(destCapacity!=0) {
    255             *dest=0;
    256         }
    257         return 0;
    258     }
    259 
    260     const UChar *src=getArrayStart()+start, *srcLimit=src+length;
    261     char *originalDest=dest;
    262     const char *destLimit;
    263 
    264     if(destCapacity==0) {
    265         destLimit=dest=0;
    266     } else if(destCapacity==-1) {
    267         // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
    268         destLimit=(char*)U_MAX_PTR(dest);
    269         // for NUL-termination, translate into highest int32_t
    270         destCapacity=0x7fffffff;
    271     } else {
    272         destLimit=dest+destCapacity;
    273     }
    274 
    275     // perform the conversion
    276     ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
    277     length=(int32_t)(dest-originalDest);
    278 
    279     // if an overflow occurs, then get the preflighting length
    280     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
    281         char buffer[1024];
    282 
    283         destLimit=buffer+sizeof(buffer);
    284         do {
    285             dest=buffer;
    286             errorCode=U_ZERO_ERROR;
    287             ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
    288             length+=(int32_t)(dest-buffer);
    289         } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
    290     }
    291 
    292     return u_terminateChars(originalDest, destCapacity, length, &errorCode);
    293 }
    294 
    295 void
    296 UnicodeString::doCodepageCreate(const char *codepageData,
    297                                 int32_t dataLength,
    298                                 const char *codepage)
    299 {
    300     // if there's nothing to convert, do nothing
    301     if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
    302         return;
    303     }
    304     if(dataLength == -1) {
    305         dataLength = (int32_t)uprv_strlen(codepageData);
    306     }
    307 
    308     UErrorCode status = U_ZERO_ERROR;
    309 
    310     // create the converter
    311     // if the codepage is the default, use our cache
    312     // if it is an empty string, then use the "invariant character" conversion
    313     UConverter *converter;
    314     if (codepage == 0) {
    315         const char *defaultName = ucnv_getDefaultName();
    316         if(UCNV_FAST_IS_UTF8(defaultName)) {
    317             setToUTF8(StringPiece(codepageData, dataLength));
    318             return;
    319         }
    320         converter = u_getDefaultConverter(&status);
    321     } else if(*codepage == 0) {
    322         // use the "invariant characters" conversion
    323         if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
    324             u_charsToUChars(codepageData, getArrayStart(), dataLength);
    325             setLength(dataLength);
    326         } else {
    327             setToBogus();
    328         }
    329         return;
    330     } else {
    331         converter = ucnv_open(codepage, &status);
    332     }
    333 
    334     // if we failed, set the appropriate flags and return
    335     if(U_FAILURE(status)) {
    336         setToBogus();
    337         return;
    338     }
    339 
    340     // perform the conversion
    341     doCodepageCreate(codepageData, dataLength, converter, status);
    342     if(U_FAILURE(status)) {
    343         setToBogus();
    344     }
    345 
    346     // close the converter
    347     if(codepage == 0) {
    348         u_releaseDefaultConverter(converter);
    349     } else {
    350         ucnv_close(converter);
    351     }
    352 }
    353 
    354 void
    355 UnicodeString::doCodepageCreate(const char *codepageData,
    356                                 int32_t dataLength,
    357                                 UConverter *converter,
    358                                 UErrorCode &status)
    359 {
    360     if(U_FAILURE(status)) {
    361         return;
    362     }
    363 
    364     // set up the conversion parameters
    365     const char *mySource     = codepageData;
    366     const char *mySourceEnd  = mySource + dataLength;
    367     UChar *array, *myTarget;
    368 
    369     // estimate the size needed:
    370     int32_t arraySize;
    371     if(dataLength <= US_STACKBUF_SIZE) {
    372         // try to use the stack buffer
    373         arraySize = US_STACKBUF_SIZE;
    374     } else {
    375         // 1.25 UChar's per source byte should cover most cases
    376         arraySize = dataLength + (dataLength >> 2);
    377     }
    378 
    379     // we do not care about the current contents
    380     UBool doCopyArray = FALSE;
    381     for(;;) {
    382         if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
    383             setToBogus();
    384             break;
    385         }
    386 
    387         // perform the conversion
    388         array = getArrayStart();
    389         myTarget = array + length();
    390         ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
    391             &mySource, mySourceEnd, 0, TRUE, &status);
    392 
    393         // update the conversion parameters
    394         setLength((int32_t)(myTarget - array));
    395 
    396         // allocate more space and copy data, if needed
    397         if(status == U_BUFFER_OVERFLOW_ERROR) {
    398             // reset the error code
    399             status = U_ZERO_ERROR;
    400 
    401             // keep the previous conversion results
    402             doCopyArray = TRUE;
    403 
    404             // estimate the new size needed, larger than before
    405             // try 2 UChar's per remaining source byte
    406             arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
    407         } else {
    408             break;
    409         }
    410     }
    411 }
    412 
    413 U_NAMESPACE_END
    414 
    415 #endif
    416