1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 1999-2014, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: unistr_cnv.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:2 12 * 13 * created on: 2004aug19 14 * created by: Markus W. Scherer 15 * 16 * Character conversion functions moved here from unistr.cpp 17 */ 18 19 #include "unicode/utypes.h" 20 21 #if !UCONFIG_NO_CONVERSION 22 23 #include "unicode/putil.h" 24 #include "cstring.h" 25 #include "cmemory.h" 26 #include "unicode/ustring.h" 27 #include "unicode/unistr.h" 28 #include "unicode/ucnv.h" 29 #include "ucnv_imp.h" 30 #include "putilimp.h" 31 #include "ustr_cnv.h" 32 #include "ustr_imp.h" 33 34 U_NAMESPACE_BEGIN 35 36 //======================================== 37 // Constructors 38 //======================================== 39 40 #if !U_CHARSET_IS_UTF8 41 42 UnicodeString::UnicodeString(const char *codepageData) { 43 fUnion.fFields.fLengthAndFlags = kShortString; 44 if(codepageData != 0) { 45 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0); 46 } 47 } 48 49 UnicodeString::UnicodeString(const char *codepageData, 50 int32_t dataLength) { 51 fUnion.fFields.fLengthAndFlags = kShortString; 52 if(codepageData != 0) { 53 doCodepageCreate(codepageData, dataLength, 0); 54 } 55 } 56 57 // else see unistr.cpp 58 #endif 59 60 UnicodeString::UnicodeString(const char *codepageData, 61 const char *codepage) { 62 fUnion.fFields.fLengthAndFlags = kShortString; 63 if(codepageData != 0) { 64 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage); 65 } 66 } 67 68 UnicodeString::UnicodeString(const char *codepageData, 69 int32_t dataLength, 70 const char *codepage) { 71 fUnion.fFields.fLengthAndFlags = kShortString; 72 if(codepageData != 0) { 73 doCodepageCreate(codepageData, dataLength, codepage); 74 } 75 } 76 77 UnicodeString::UnicodeString(const char *src, int32_t srcLength, 78 UConverter *cnv, 79 UErrorCode &errorCode) { 80 fUnion.fFields.fLengthAndFlags = kShortString; 81 if(U_SUCCESS(errorCode)) { 82 // check arguments 83 if(src==NULL) { 84 // treat as an empty string, do nothing more 85 } else if(srcLength<-1) { 86 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 87 } else { 88 // get input length 89 if(srcLength==-1) { 90 srcLength=(int32_t)uprv_strlen(src); 91 } 92 if(srcLength>0) { 93 if(cnv!=0) { 94 // use the provided converter 95 ucnv_resetToUnicode(cnv); 96 doCodepageCreate(src, srcLength, cnv, errorCode); 97 } else { 98 // use the default converter 99 cnv=u_getDefaultConverter(&errorCode); 100 doCodepageCreate(src, srcLength, cnv, errorCode); 101 u_releaseDefaultConverter(cnv); 102 } 103 } 104 } 105 106 if(U_FAILURE(errorCode)) { 107 setToBogus(); 108 } 109 } 110 } 111 112 //======================================== 113 // Codeset conversion 114 //======================================== 115 116 #if !U_CHARSET_IS_UTF8 117 118 int32_t 119 UnicodeString::extract(int32_t start, 120 int32_t length, 121 char *target, 122 uint32_t dstSize) const { 123 return extract(start, length, target, dstSize, 0); 124 } 125 126 // else see unistr.cpp 127 #endif 128 129 int32_t 130 UnicodeString::extract(int32_t start, 131 int32_t length, 132 char *target, 133 uint32_t dstSize, 134 const char *codepage) const 135 { 136 // if the arguments are illegal, then do nothing 137 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) { 138 return 0; 139 } 140 141 // pin the indices to legal values 142 pinIndices(start, length); 143 144 // We need to cast dstSize to int32_t for all subsequent code. 145 // I don't know why the API was defined with uint32_t but we are stuck with it. 146 // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize 147 // as a limit in some functions, it may wrap around and yield a pointer 148 // that compares less-than target. 149 int32_t capacity; 150 if(dstSize < 0x7fffffff) { 151 // Assume that the capacity is real and a limit pointer won't wrap around. 152 capacity = (int32_t)dstSize; 153 } else { 154 // Pin the capacity so that a limit pointer does not wrap around. 155 char *targetLimit = (char *)U_MAX_PTR(target); 156 // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff 157 // greater than target and does not wrap around the top of the address space. 158 capacity = (int32_t)(targetLimit - target); 159 } 160 161 // create the converter 162 UConverter *converter; 163 UErrorCode status = U_ZERO_ERROR; 164 165 // just write the NUL if the string length is 0 166 if(length == 0) { 167 return u_terminateChars(target, capacity, 0, &status); 168 } 169 170 // if the codepage is the default, use our cache 171 // if it is an empty string, then use the "invariant character" conversion 172 if (codepage == 0) { 173 const char *defaultName = ucnv_getDefaultName(); 174 if(UCNV_FAST_IS_UTF8(defaultName)) { 175 return toUTF8(start, length, target, capacity); 176 } 177 converter = u_getDefaultConverter(&status); 178 } else if (*codepage == 0) { 179 // use the "invariant characters" conversion 180 int32_t destLength; 181 if(length <= capacity) { 182 destLength = length; 183 } else { 184 destLength = capacity; 185 } 186 u_UCharsToChars(getArrayStart() + start, target, destLength); 187 return u_terminateChars(target, capacity, length, &status); 188 } else { 189 converter = ucnv_open(codepage, &status); 190 } 191 192 length = doExtract(start, length, target, capacity, converter, status); 193 194 // close the converter 195 if (codepage == 0) { 196 u_releaseDefaultConverter(converter); 197 } else { 198 ucnv_close(converter); 199 } 200 201 return length; 202 } 203 204 int32_t 205 UnicodeString::extract(char *dest, int32_t destCapacity, 206 UConverter *cnv, 207 UErrorCode &errorCode) const 208 { 209 if(U_FAILURE(errorCode)) { 210 return 0; 211 } 212 213 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { 214 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 215 return 0; 216 } 217 218 // nothing to do? 219 if(isEmpty()) { 220 return u_terminateChars(dest, destCapacity, 0, &errorCode); 221 } 222 223 // get the converter 224 UBool isDefaultConverter; 225 if(cnv==0) { 226 isDefaultConverter=TRUE; 227 cnv=u_getDefaultConverter(&errorCode); 228 if(U_FAILURE(errorCode)) { 229 return 0; 230 } 231 } else { 232 isDefaultConverter=FALSE; 233 ucnv_resetFromUnicode(cnv); 234 } 235 236 // convert 237 int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode); 238 239 // release the converter 240 if(isDefaultConverter) { 241 u_releaseDefaultConverter(cnv); 242 } 243 244 return len; 245 } 246 247 int32_t 248 UnicodeString::doExtract(int32_t start, int32_t length, 249 char *dest, int32_t destCapacity, 250 UConverter *cnv, 251 UErrorCode &errorCode) const 252 { 253 if(U_FAILURE(errorCode)) { 254 if(destCapacity!=0) { 255 *dest=0; 256 } 257 return 0; 258 } 259 260 const UChar *src=getArrayStart()+start, *srcLimit=src+length; 261 char *originalDest=dest; 262 const char *destLimit; 263 264 if(destCapacity==0) { 265 destLimit=dest=0; 266 } else if(destCapacity==-1) { 267 // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used. 268 destLimit=(char*)U_MAX_PTR(dest); 269 // for NUL-termination, translate into highest int32_t 270 destCapacity=0x7fffffff; 271 } else { 272 destLimit=dest+destCapacity; 273 } 274 275 // perform the conversion 276 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); 277 length=(int32_t)(dest-originalDest); 278 279 // if an overflow occurs, then get the preflighting length 280 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 281 char buffer[1024]; 282 283 destLimit=buffer+sizeof(buffer); 284 do { 285 dest=buffer; 286 errorCode=U_ZERO_ERROR; 287 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); 288 length+=(int32_t)(dest-buffer); 289 } while(errorCode==U_BUFFER_OVERFLOW_ERROR); 290 } 291 292 return u_terminateChars(originalDest, destCapacity, length, &errorCode); 293 } 294 295 void 296 UnicodeString::doCodepageCreate(const char *codepageData, 297 int32_t dataLength, 298 const char *codepage) 299 { 300 // if there's nothing to convert, do nothing 301 if(codepageData == 0 || dataLength == 0 || dataLength < -1) { 302 return; 303 } 304 if(dataLength == -1) { 305 dataLength = (int32_t)uprv_strlen(codepageData); 306 } 307 308 UErrorCode status = U_ZERO_ERROR; 309 310 // create the converter 311 // if the codepage is the default, use our cache 312 // if it is an empty string, then use the "invariant character" conversion 313 UConverter *converter; 314 if (codepage == 0) { 315 const char *defaultName = ucnv_getDefaultName(); 316 if(UCNV_FAST_IS_UTF8(defaultName)) { 317 setToUTF8(StringPiece(codepageData, dataLength)); 318 return; 319 } 320 converter = u_getDefaultConverter(&status); 321 } else if(*codepage == 0) { 322 // use the "invariant characters" conversion 323 if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) { 324 u_charsToUChars(codepageData, getArrayStart(), dataLength); 325 setLength(dataLength); 326 } else { 327 setToBogus(); 328 } 329 return; 330 } else { 331 converter = ucnv_open(codepage, &status); 332 } 333 334 // if we failed, set the appropriate flags and return 335 if(U_FAILURE(status)) { 336 setToBogus(); 337 return; 338 } 339 340 // perform the conversion 341 doCodepageCreate(codepageData, dataLength, converter, status); 342 if(U_FAILURE(status)) { 343 setToBogus(); 344 } 345 346 // close the converter 347 if(codepage == 0) { 348 u_releaseDefaultConverter(converter); 349 } else { 350 ucnv_close(converter); 351 } 352 } 353 354 void 355 UnicodeString::doCodepageCreate(const char *codepageData, 356 int32_t dataLength, 357 UConverter *converter, 358 UErrorCode &status) 359 { 360 if(U_FAILURE(status)) { 361 return; 362 } 363 364 // set up the conversion parameters 365 const char *mySource = codepageData; 366 const char *mySourceEnd = mySource + dataLength; 367 UChar *array, *myTarget; 368 369 // estimate the size needed: 370 int32_t arraySize; 371 if(dataLength <= US_STACKBUF_SIZE) { 372 // try to use the stack buffer 373 arraySize = US_STACKBUF_SIZE; 374 } else { 375 // 1.25 UChar's per source byte should cover most cases 376 arraySize = dataLength + (dataLength >> 2); 377 } 378 379 // we do not care about the current contents 380 UBool doCopyArray = FALSE; 381 for(;;) { 382 if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) { 383 setToBogus(); 384 break; 385 } 386 387 // perform the conversion 388 array = getArrayStart(); 389 myTarget = array + length(); 390 ucnv_toUnicode(converter, &myTarget, array + getCapacity(), 391 &mySource, mySourceEnd, 0, TRUE, &status); 392 393 // update the conversion parameters 394 setLength((int32_t)(myTarget - array)); 395 396 // allocate more space and copy data, if needed 397 if(status == U_BUFFER_OVERFLOW_ERROR) { 398 // reset the error code 399 status = U_ZERO_ERROR; 400 401 // keep the previous conversion results 402 doCopyArray = TRUE; 403 404 // estimate the new size needed, larger than before 405 // try 2 UChar's per remaining source byte 406 arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource)); 407 } else { 408 break; 409 } 410 } 411 } 412 413 U_NAMESPACE_END 414 415 #endif 416