Home | History | Annotate | Download | only in common
// © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 1999-2014, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  unistr_cnv.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:2
     14 *
     15 *   created on: 2004aug19
     16 *   created by: Markus W. Scherer
     17 *
     18 *   Character conversion functions moved here from unistr.cpp
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 
     23 #if !UCONFIG_NO_CONVERSION
     24 
     25 #include "unicode/putil.h"
     26 #include "cstring.h"
     27 #include "cmemory.h"
     28 #include "unicode/ustring.h"
     29 #include "unicode/unistr.h"
     30 #include "unicode/ucnv.h"
     31 #include "ucnv_imp.h"
     32 #include "putilimp.h"
     33 #include "ustr_cnv.h"
     34 #include "ustr_imp.h"
     35 
     36 U_NAMESPACE_BEGIN
     37 
     38 //========================================
     39 // Constructors
     40 //========================================
     41 
     42 #if !U_CHARSET_IS_UTF8
     43 
     44 UnicodeString::UnicodeString(const char *codepageData) {
     45     fUnion.fFields.fLengthAndFlags = kShortString;
     46     if(codepageData != 0) {
     47         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
     48     }
     49 }
     50 
     51 UnicodeString::UnicodeString(const char *codepageData,
     52                              int32_t dataLength) {
     53     fUnion.fFields.fLengthAndFlags = kShortString;
     54     if(codepageData != 0) {
     55         doCodepageCreate(codepageData, dataLength, 0);
     56     }
     57 }
     58 
     59 // else see unistr.cpp
     60 #endif
     61 
     62 UnicodeString::UnicodeString(const char *codepageData,
     63                              const char *codepage) {
     64     fUnion.fFields.fLengthAndFlags = kShortString;
     65     if(codepageData != 0) {
     66         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
     67     }
     68 }
     69 
     70 UnicodeString::UnicodeString(const char *codepageData,
     71                              int32_t dataLength,
     72                              const char *codepage) {
     73     fUnion.fFields.fLengthAndFlags = kShortString;
     74     if(codepageData != 0) {
     75         doCodepageCreate(codepageData, dataLength, codepage);
     76     }
     77 }
     78 
     79 UnicodeString::UnicodeString(const char *src, int32_t srcLength,
     80                              UConverter *cnv,
     81                              UErrorCode &errorCode) {
     82     fUnion.fFields.fLengthAndFlags = kShortString;
     83     if(U_SUCCESS(errorCode)) {
     84         // check arguments
     85         if(src==NULL) {
     86             // treat as an empty string, do nothing more
     87         } else if(srcLength<-1) {
     88             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
     89         } else {
     90             // get input length
     91             if(srcLength==-1) {
     92                 srcLength=(int32_t)uprv_strlen(src);
     93             }
     94             if(srcLength>0) {
     95                 if(cnv!=0) {
     96                     // use the provided converter
     97                     ucnv_resetToUnicode(cnv);
     98                     doCodepageCreate(src, srcLength, cnv, errorCode);
     99                 } else {
    100                     // use the default converter
    101                     cnv=u_getDefaultConverter(&errorCode);
    102                     doCodepageCreate(src, srcLength, cnv, errorCode);
    103                     u_releaseDefaultConverter(cnv);
    104                 }
    105             }
    106         }
    107 
    108         if(U_FAILURE(errorCode)) {
    109             setToBogus();
    110         }
    111     }
    112 }
    113 
    114 //========================================
    115 // Codeset conversion
    116 //========================================
    117 
    118 #if !U_CHARSET_IS_UTF8
    119 
    120 int32_t
    121 UnicodeString::extract(int32_t start,
    122                        int32_t length,
    123                        char *target,
    124                        uint32_t dstSize) const {
    125     return extract(start, length, target, dstSize, 0);
    126 }
    127 
    128 // else see unistr.cpp
    129 #endif
    130 
    131 int32_t
    132 UnicodeString::extract(int32_t start,
    133                        int32_t length,
    134                        char *target,
    135                        uint32_t dstSize,
    136                        const char *codepage) const
    137 {
    138     // if the arguments are illegal, then do nothing
    139     if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
    140         return 0;
    141     }
    142 
    143     // pin the indices to legal values
    144     pinIndices(start, length);
    145 
    146     // We need to cast dstSize to int32_t for all subsequent code.
    147     // I don't know why the API was defined with uint32_t but we are stuck with it.
    148     // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
    149     // as a limit in some functions, it may wrap around and yield a pointer
    150     // that compares less-than target.
    151     int32_t capacity;
    152     if(dstSize < 0x7fffffff) {
    153         // Assume that the capacity is real and a limit pointer won't wrap around.
    154         capacity = (int32_t)dstSize;
    155     } else {
    156         // Pin the capacity so that a limit pointer does not wrap around.
    157         char *targetLimit = (char *)U_MAX_PTR(target);
    158         // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
    159         // greater than target and does not wrap around the top of the address space.
    160         capacity = (int32_t)(targetLimit - target);
    161     }
    162 
    163     // create the converter
    164     UConverter *converter;
    165     UErrorCode status = U_ZERO_ERROR;
    166 
    167     // just write the NUL if the string length is 0
    168     if(length == 0) {
    169         return u_terminateChars(target, capacity, 0, &status);
    170     }
    171 
    172     // if the codepage is the default, use our cache
    173     // if it is an empty string, then use the "invariant character" conversion
    174     if (codepage == 0) {
    175         const char *defaultName = ucnv_getDefaultName();
    176         if(UCNV_FAST_IS_UTF8(defaultName)) {
    177             return toUTF8(start, length, target, capacity);
    178         }
    179         converter = u_getDefaultConverter(&status);
    180     } else if (*codepage == 0) {
    181         // use the "invariant characters" conversion
    182         int32_t destLength;
    183         if(length <= capacity) {
    184             destLength = length;
    185         } else {
    186             destLength = capacity;
    187         }
    188         u_UCharsToChars(getArrayStart() + start, target, destLength);
    189         return u_terminateChars(target, capacity, length, &status);
    190     } else {
    191         converter = ucnv_open(codepage, &status);
    192     }
    193 
    194     length = doExtract(start, length, target, capacity, converter, status);
    195 
    196     // close the converter
    197     if (codepage == 0) {
    198         u_releaseDefaultConverter(converter);
    199     } else {
    200         ucnv_close(converter);
    201     }
    202 
    203     return length;
    204 }
    205 
    206 int32_t
    207 UnicodeString::extract(char *dest, int32_t destCapacity,
    208                        UConverter *cnv,
    209                        UErrorCode &errorCode) const
    210 {
    211     if(U_FAILURE(errorCode)) {
    212         return 0;
    213     }
    214 
    215     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
    216         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    217         return 0;
    218     }
    219 
    220     // nothing to do?
    221     if(isEmpty()) {
    222         return u_terminateChars(dest, destCapacity, 0, &errorCode);
    223     }
    224 
    225     // get the converter
    226     UBool isDefaultConverter;
    227     if(cnv==0) {
    228         isDefaultConverter=TRUE;
    229         cnv=u_getDefaultConverter(&errorCode);
    230         if(U_FAILURE(errorCode)) {
    231             return 0;
    232         }
    233     } else {
    234         isDefaultConverter=FALSE;
    235         ucnv_resetFromUnicode(cnv);
    236     }
    237 
    238     // convert
    239     int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
    240 
    241     // release the converter
    242     if(isDefaultConverter) {
    243         u_releaseDefaultConverter(cnv);
    244     }
    245 
    246     return len;
    247 }
    248 
    249 int32_t
    250 UnicodeString::doExtract(int32_t start, int32_t length,
    251                          char *dest, int32_t destCapacity,
    252                          UConverter *cnv,
    253                          UErrorCode &errorCode) const
    254 {
    255     if(U_FAILURE(errorCode)) {
    256         if(destCapacity!=0) {
    257             *dest=0;
    258         }
    259         return 0;
    260     }
    261 
    262     const UChar *src=getArrayStart()+start, *srcLimit=src+length;
    263     char *originalDest=dest;
    264     const char *destLimit;
    265 
    266     if(destCapacity==0) {
    267         destLimit=dest=0;
    268     } else if(destCapacity==-1) {
    269         // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
    270         destLimit=(char*)U_MAX_PTR(dest);
    271         // for NUL-termination, translate into highest int32_t
    272         destCapacity=0x7fffffff;
    273     } else {
    274         destLimit=dest+destCapacity;
    275     }
    276 
    277     // perform the conversion
    278     ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
    279     length=(int32_t)(dest-originalDest);
    280 
    281     // if an overflow occurs, then get the preflighting length
    282     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
    283         char buffer[1024];
    284 
    285         destLimit=buffer+sizeof(buffer);
    286         do {
    287             dest=buffer;
    288             errorCode=U_ZERO_ERROR;
    289             ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
    290             length+=(int32_t)(dest-buffer);
    291         } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
    292     }
    293 
    294     return u_terminateChars(originalDest, destCapacity, length, &errorCode);
    295 }
    296 
    297 void
    298 UnicodeString::doCodepageCreate(const char *codepageData,
    299                                 int32_t dataLength,
    300                                 const char *codepage)
    301 {
    302     // if there's nothing to convert, do nothing
    303     if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
    304         return;
    305     }
    306     if(dataLength == -1) {
    307         dataLength = (int32_t)uprv_strlen(codepageData);
    308     }
    309 
    310     UErrorCode status = U_ZERO_ERROR;
    311 
    312     // create the converter
    313     // if the codepage is the default, use our cache
    314     // if it is an empty string, then use the "invariant character" conversion
    315     UConverter *converter;
    316     if (codepage == 0) {
    317         const char *defaultName = ucnv_getDefaultName();
    318         if(UCNV_FAST_IS_UTF8(defaultName)) {
    319             setToUTF8(StringPiece(codepageData, dataLength));
    320             return;
    321         }
    322         converter = u_getDefaultConverter(&status);
    323     } else if(*codepage == 0) {
    324         // use the "invariant characters" conversion
    325         if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
    326             u_charsToUChars(codepageData, getArrayStart(), dataLength);
    327             setLength(dataLength);
    328         } else {
    329             setToBogus();
    330         }
    331         return;
    332     } else {
    333         converter = ucnv_open(codepage, &status);
    334     }
    335 
    336     // if we failed, set the appropriate flags and return
    337     if(U_FAILURE(status)) {
    338         setToBogus();
    339         return;
    340     }
    341 
    342     // perform the conversion
    343     doCodepageCreate(codepageData, dataLength, converter, status);
    344     if(U_FAILURE(status)) {
    345         setToBogus();
    346     }
    347 
    348     // close the converter
    349     if(codepage == 0) {
    350         u_releaseDefaultConverter(converter);
    351     } else {
    352         ucnv_close(converter);
    353     }
    354 }
    355 
    356 void
    357 UnicodeString::doCodepageCreate(const char *codepageData,
    358                                 int32_t dataLength,
    359                                 UConverter *converter,
    360                                 UErrorCode &status)
    361 {
    362     if(U_FAILURE(status)) {
    363         return;
    364     }
    365 
    366     // set up the conversion parameters
    367     const char *mySource     = codepageData;
    368     const char *mySourceEnd  = mySource + dataLength;
    369     UChar *array, *myTarget;
    370 
    371     // estimate the size needed:
    372     int32_t arraySize;
    373     if(dataLength <= US_STACKBUF_SIZE) {
    374         // try to use the stack buffer
    375         arraySize = US_STACKBUF_SIZE;
    376     } else {
    377         // 1.25 UChar's per source byte should cover most cases
    378         arraySize = dataLength + (dataLength >> 2);
    379     }
    380 
    381     // we do not care about the current contents
    382     UBool doCopyArray = FALSE;
    383     for(;;) {
    384         if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
    385             setToBogus();
    386             break;
    387         }
    388 
    389         // perform the conversion
    390         array = getArrayStart();
    391         myTarget = array + length();
    392         ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
    393             &mySource, mySourceEnd, 0, TRUE, &status);
    394 
    395         // update the conversion parameters
    396         setLength((int32_t)(myTarget - array));
    397 
    398         // allocate more space and copy data, if needed
    399         if(status == U_BUFFER_OVERFLOW_ERROR) {
    400             // reset the error code
    401             status = U_ZERO_ERROR;
    402 
    403             // keep the previous conversion results
    404             doCopyArray = TRUE;
    405 
    406             // estimate the new size needed, larger than before
    407             // try 2 UChar's per remaining source byte
    408             arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
    409         } else {
    410             break;
    411         }
    412     }
    413 }
    414 
    415 U_NAMESPACE_END
    416 
    417 #endif
    418