1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 1999-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: unistr_cnv.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:2 14 * 15 * created on: 2004aug19 16 * created by: Markus W. Scherer 17 * 18 * Character conversion functions moved here from unistr.cpp 19 */ 20 21 #include "unicode/utypes.h" 22 23 #if !UCONFIG_NO_CONVERSION 24 25 #include "unicode/putil.h" 26 #include "cstring.h" 27 #include "cmemory.h" 28 #include "unicode/ustring.h" 29 #include "unicode/unistr.h" 30 #include "unicode/ucnv.h" 31 #include "ucnv_imp.h" 32 #include "putilimp.h" 33 #include "ustr_cnv.h" 34 #include "ustr_imp.h" 35 36 U_NAMESPACE_BEGIN 37 38 //======================================== 39 // Constructors 40 //======================================== 41 42 #if !U_CHARSET_IS_UTF8 43 44 UnicodeString::UnicodeString(const char *codepageData) { 45 fUnion.fFields.fLengthAndFlags = kShortString; 46 if(codepageData != 0) { 47 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0); 48 } 49 } 50 51 UnicodeString::UnicodeString(const char *codepageData, 52 int32_t dataLength) { 53 fUnion.fFields.fLengthAndFlags = kShortString; 54 if(codepageData != 0) { 55 doCodepageCreate(codepageData, dataLength, 0); 56 } 57 } 58 59 // else see unistr.cpp 60 #endif 61 62 UnicodeString::UnicodeString(const char *codepageData, 63 const char *codepage) { 64 fUnion.fFields.fLengthAndFlags = kShortString; 65 if(codepageData != 0) { 66 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage); 67 } 68 } 69 70 UnicodeString::UnicodeString(const char *codepageData, 71 int32_t dataLength, 72 const char *codepage) { 73 fUnion.fFields.fLengthAndFlags = kShortString; 74 if(codepageData != 0) { 75 doCodepageCreate(codepageData, dataLength, codepage); 76 } 77 } 78 79 UnicodeString::UnicodeString(const char *src, int32_t srcLength, 80 UConverter *cnv, 81 UErrorCode &errorCode) { 82 fUnion.fFields.fLengthAndFlags = kShortString; 83 if(U_SUCCESS(errorCode)) { 84 // check arguments 85 if(src==NULL) { 86 // treat as an empty string, do nothing more 87 } else if(srcLength<-1) { 88 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 89 } else { 90 // get input length 91 if(srcLength==-1) { 92 srcLength=(int32_t)uprv_strlen(src); 93 } 94 if(srcLength>0) { 95 if(cnv!=0) { 96 // use the provided converter 97 ucnv_resetToUnicode(cnv); 98 doCodepageCreate(src, srcLength, cnv, errorCode); 99 } else { 100 // use the default converter 101 cnv=u_getDefaultConverter(&errorCode); 102 doCodepageCreate(src, srcLength, cnv, errorCode); 103 u_releaseDefaultConverter(cnv); 104 } 105 } 106 } 107 108 if(U_FAILURE(errorCode)) { 109 setToBogus(); 110 } 111 } 112 } 113 114 //======================================== 115 // Codeset conversion 116 //======================================== 117 118 #if !U_CHARSET_IS_UTF8 119 120 int32_t 121 UnicodeString::extract(int32_t start, 122 int32_t length, 123 char *target, 124 uint32_t dstSize) const { 125 return extract(start, length, target, dstSize, 0); 126 } 127 128 // else see unistr.cpp 129 #endif 130 131 int32_t 132 UnicodeString::extract(int32_t start, 133 int32_t length, 134 char *target, 135 uint32_t dstSize, 136 const char *codepage) const 137 { 138 // if the arguments are illegal, then do nothing 139 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) { 140 return 0; 141 } 142 143 // pin the indices to legal values 144 pinIndices(start, length); 145 146 // We need to cast dstSize to int32_t for all subsequent code. 147 // I don't know why the API was defined with uint32_t but we are stuck with it. 148 // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize 149 // as a limit in some functions, it may wrap around and yield a pointer 150 // that compares less-than target. 151 int32_t capacity; 152 if(dstSize < 0x7fffffff) { 153 // Assume that the capacity is real and a limit pointer won't wrap around. 154 capacity = (int32_t)dstSize; 155 } else { 156 // Pin the capacity so that a limit pointer does not wrap around. 157 char *targetLimit = (char *)U_MAX_PTR(target); 158 // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff 159 // greater than target and does not wrap around the top of the address space. 160 capacity = (int32_t)(targetLimit - target); 161 } 162 163 // create the converter 164 UConverter *converter; 165 UErrorCode status = U_ZERO_ERROR; 166 167 // just write the NUL if the string length is 0 168 if(length == 0) { 169 return u_terminateChars(target, capacity, 0, &status); 170 } 171 172 // if the codepage is the default, use our cache 173 // if it is an empty string, then use the "invariant character" conversion 174 if (codepage == 0) { 175 const char *defaultName = ucnv_getDefaultName(); 176 if(UCNV_FAST_IS_UTF8(defaultName)) { 177 return toUTF8(start, length, target, capacity); 178 } 179 converter = u_getDefaultConverter(&status); 180 } else if (*codepage == 0) { 181 // use the "invariant characters" conversion 182 int32_t destLength; 183 if(length <= capacity) { 184 destLength = length; 185 } else { 186 destLength = capacity; 187 } 188 u_UCharsToChars(getArrayStart() + start, target, destLength); 189 return u_terminateChars(target, capacity, length, &status); 190 } else { 191 converter = ucnv_open(codepage, &status); 192 } 193 194 length = doExtract(start, length, target, capacity, converter, status); 195 196 // close the converter 197 if (codepage == 0) { 198 u_releaseDefaultConverter(converter); 199 } else { 200 ucnv_close(converter); 201 } 202 203 return length; 204 } 205 206 int32_t 207 UnicodeString::extract(char *dest, int32_t destCapacity, 208 UConverter *cnv, 209 UErrorCode &errorCode) const 210 { 211 if(U_FAILURE(errorCode)) { 212 return 0; 213 } 214 215 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { 216 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 217 return 0; 218 } 219 220 // nothing to do? 221 if(isEmpty()) { 222 return u_terminateChars(dest, destCapacity, 0, &errorCode); 223 } 224 225 // get the converter 226 UBool isDefaultConverter; 227 if(cnv==0) { 228 isDefaultConverter=TRUE; 229 cnv=u_getDefaultConverter(&errorCode); 230 if(U_FAILURE(errorCode)) { 231 return 0; 232 } 233 } else { 234 isDefaultConverter=FALSE; 235 ucnv_resetFromUnicode(cnv); 236 } 237 238 // convert 239 int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode); 240 241 // release the converter 242 if(isDefaultConverter) { 243 u_releaseDefaultConverter(cnv); 244 } 245 246 return len; 247 } 248 249 int32_t 250 UnicodeString::doExtract(int32_t start, int32_t length, 251 char *dest, int32_t destCapacity, 252 UConverter *cnv, 253 UErrorCode &errorCode) const 254 { 255 if(U_FAILURE(errorCode)) { 256 if(destCapacity!=0) { 257 *dest=0; 258 } 259 return 0; 260 } 261 262 const UChar *src=getArrayStart()+start, *srcLimit=src+length; 263 char *originalDest=dest; 264 const char *destLimit; 265 266 if(destCapacity==0) { 267 destLimit=dest=0; 268 } else if(destCapacity==-1) { 269 // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used. 270 destLimit=(char*)U_MAX_PTR(dest); 271 // for NUL-termination, translate into highest int32_t 272 destCapacity=0x7fffffff; 273 } else { 274 destLimit=dest+destCapacity; 275 } 276 277 // perform the conversion 278 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); 279 length=(int32_t)(dest-originalDest); 280 281 // if an overflow occurs, then get the preflighting length 282 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 283 char buffer[1024]; 284 285 destLimit=buffer+sizeof(buffer); 286 do { 287 dest=buffer; 288 errorCode=U_ZERO_ERROR; 289 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); 290 length+=(int32_t)(dest-buffer); 291 } while(errorCode==U_BUFFER_OVERFLOW_ERROR); 292 } 293 294 return u_terminateChars(originalDest, destCapacity, length, &errorCode); 295 } 296 297 void 298 UnicodeString::doCodepageCreate(const char *codepageData, 299 int32_t dataLength, 300 const char *codepage) 301 { 302 // if there's nothing to convert, do nothing 303 if(codepageData == 0 || dataLength == 0 || dataLength < -1) { 304 return; 305 } 306 if(dataLength == -1) { 307 dataLength = (int32_t)uprv_strlen(codepageData); 308 } 309 310 UErrorCode status = U_ZERO_ERROR; 311 312 // create the converter 313 // if the codepage is the default, use our cache 314 // if it is an empty string, then use the "invariant character" conversion 315 UConverter *converter; 316 if (codepage == 0) { 317 const char *defaultName = ucnv_getDefaultName(); 318 if(UCNV_FAST_IS_UTF8(defaultName)) { 319 setToUTF8(StringPiece(codepageData, dataLength)); 320 return; 321 } 322 converter = u_getDefaultConverter(&status); 323 } else if(*codepage == 0) { 324 // use the "invariant characters" conversion 325 if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) { 326 u_charsToUChars(codepageData, getArrayStart(), dataLength); 327 setLength(dataLength); 328 } else { 329 setToBogus(); 330 } 331 return; 332 } else { 333 converter = ucnv_open(codepage, &status); 334 } 335 336 // if we failed, set the appropriate flags and return 337 if(U_FAILURE(status)) { 338 setToBogus(); 339 return; 340 } 341 342 // perform the conversion 343 doCodepageCreate(codepageData, dataLength, converter, status); 344 if(U_FAILURE(status)) { 345 setToBogus(); 346 } 347 348 // close the converter 349 if(codepage == 0) { 350 u_releaseDefaultConverter(converter); 351 } else { 352 ucnv_close(converter); 353 } 354 } 355 356 void 357 UnicodeString::doCodepageCreate(const char *codepageData, 358 int32_t dataLength, 359 UConverter *converter, 360 UErrorCode &status) 361 { 362 if(U_FAILURE(status)) { 363 return; 364 } 365 366 // set up the conversion parameters 367 const char *mySource = codepageData; 368 const char *mySourceEnd = mySource + dataLength; 369 UChar *array, *myTarget; 370 371 // estimate the size needed: 372 int32_t arraySize; 373 if(dataLength <= US_STACKBUF_SIZE) { 374 // try to use the stack buffer 375 arraySize = US_STACKBUF_SIZE; 376 } else { 377 // 1.25 UChar's per source byte should cover most cases 378 arraySize = dataLength + (dataLength >> 2); 379 } 380 381 // we do not care about the current contents 382 UBool doCopyArray = FALSE; 383 for(;;) { 384 if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) { 385 setToBogus(); 386 break; 387 } 388 389 // perform the conversion 390 array = getArrayStart(); 391 myTarget = array + length(); 392 ucnv_toUnicode(converter, &myTarget, array + getCapacity(), 393 &mySource, mySourceEnd, 0, TRUE, &status); 394 395 // update the conversion parameters 396 setLength((int32_t)(myTarget - array)); 397 398 // allocate more space and copy data, if needed 399 if(status == U_BUFFER_OVERFLOW_ERROR) { 400 // reset the error code 401 status = U_ZERO_ERROR; 402 403 // keep the previous conversion results 404 doCopyArray = TRUE; 405 406 // estimate the new size needed, larger than before 407 // try 2 UChar's per remaining source byte 408 arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource)); 409 } else { 410 break; 411 } 412 } 413 } 414 415 U_NAMESPACE_END 416 417 #endif 418