1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 1999-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: unistr_cnv.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:2 12 * 13 * created on: 2004aug19 14 * created by: Markus W. Scherer 15 * 16 * Character conversion functions moved here from unistr.cpp 17 */ 18 19 #include "unicode/utypes.h" 20 21 #if !UCONFIG_NO_CONVERSION 22 23 #include "unicode/putil.h" 24 #include "cstring.h" 25 #include "cmemory.h" 26 #include "unicode/ustring.h" 27 #include "unicode/unistr.h" 28 #include "unicode/ucnv.h" 29 #include "ucnv_imp.h" 30 #include "putilimp.h" 31 #include "ustr_cnv.h" 32 #include "ustr_imp.h" 33 34 U_NAMESPACE_BEGIN 35 36 //======================================== 37 // Constructors 38 //======================================== 39 40 #if !U_CHARSET_IS_UTF8 41 42 UnicodeString::UnicodeString(const char *codepageData) 43 : fShortLength(0), 44 fFlags(kShortString) 45 { 46 if(codepageData != 0) { 47 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0); 48 } 49 } 50 51 UnicodeString::UnicodeString(const char *codepageData, 52 int32_t dataLength) 53 : fShortLength(0), 54 fFlags(kShortString) 55 { 56 if(codepageData != 0) { 57 doCodepageCreate(codepageData, dataLength, 0); 58 } 59 } 60 61 // else see unistr.cpp 62 #endif 63 64 UnicodeString::UnicodeString(const char *codepageData, 65 const char *codepage) 66 : fShortLength(0), 67 fFlags(kShortString) 68 { 69 if(codepageData != 0) { 70 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage); 71 } 72 } 73 74 UnicodeString::UnicodeString(const char *codepageData, 75 int32_t dataLength, 76 const char *codepage) 77 : fShortLength(0), 78 fFlags(kShortString) 79 { 80 if(codepageData != 0) { 81 doCodepageCreate(codepageData, dataLength, codepage); 82 } 83 } 84 85 UnicodeString::UnicodeString(const char *src, int32_t srcLength, 86 UConverter *cnv, 87 UErrorCode &errorCode) 88 : fShortLength(0), 89 fFlags(kShortString) 90 { 91 if(U_SUCCESS(errorCode)) { 92 // check arguments 93 if(src==NULL) { 94 // treat as an empty string, do nothing more 95 } else if(srcLength<-1) { 96 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 97 } else { 98 // get input length 99 if(srcLength==-1) { 100 srcLength=(int32_t)uprv_strlen(src); 101 } 102 if(srcLength>0) { 103 if(cnv!=0) { 104 // use the provided converter 105 ucnv_resetToUnicode(cnv); 106 doCodepageCreate(src, srcLength, cnv, errorCode); 107 } else { 108 // use the default converter 109 cnv=u_getDefaultConverter(&errorCode); 110 doCodepageCreate(src, srcLength, cnv, errorCode); 111 u_releaseDefaultConverter(cnv); 112 } 113 } 114 } 115 116 if(U_FAILURE(errorCode)) { 117 setToBogus(); 118 } 119 } 120 } 121 122 //======================================== 123 // Codeset conversion 124 //======================================== 125 126 #if !U_CHARSET_IS_UTF8 127 128 int32_t 129 UnicodeString::extract(int32_t start, 130 int32_t length, 131 char *target, 132 uint32_t dstSize) const { 133 return extract(start, length, target, dstSize, 0); 134 } 135 136 // else see unistr.cpp 137 #endif 138 139 int32_t 140 UnicodeString::extract(int32_t start, 141 int32_t length, 142 char *target, 143 uint32_t dstSize, 144 const char *codepage) const 145 { 146 // if the arguments are illegal, then do nothing 147 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) { 148 return 0; 149 } 150 151 // pin the indices to legal values 152 pinIndices(start, length); 153 154 // We need to cast dstSize to int32_t for all subsequent code. 155 // I don't know why the API was defined with uint32_t but we are stuck with it. 156 // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize 157 // as a limit in some functions, it may wrap around and yield a pointer 158 // that compares less-than target. 159 int32_t capacity; 160 if(dstSize < 0x7fffffff) { 161 // Assume that the capacity is real and a limit pointer won't wrap around. 162 capacity = (int32_t)dstSize; 163 } else { 164 // Pin the capacity so that a limit pointer does not wrap around. 165 char *targetLimit = (char *)U_MAX_PTR(target); 166 // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff 167 // greater than target and does not wrap around the top of the address space. 168 capacity = (int32_t)(targetLimit - target); 169 } 170 171 // create the converter 172 UConverter *converter; 173 UErrorCode status = U_ZERO_ERROR; 174 175 // just write the NUL if the string length is 0 176 if(length == 0) { 177 return u_terminateChars(target, capacity, 0, &status); 178 } 179 180 // if the codepage is the default, use our cache 181 // if it is an empty string, then use the "invariant character" conversion 182 if (codepage == 0) { 183 const char *defaultName = ucnv_getDefaultName(); 184 if(UCNV_FAST_IS_UTF8(defaultName)) { 185 return toUTF8(start, length, target, capacity); 186 } 187 converter = u_getDefaultConverter(&status); 188 } else if (*codepage == 0) { 189 // use the "invariant characters" conversion 190 int32_t destLength; 191 if(length <= capacity) { 192 destLength = length; 193 } else { 194 destLength = capacity; 195 } 196 u_UCharsToChars(getArrayStart() + start, target, destLength); 197 return u_terminateChars(target, capacity, length, &status); 198 } else { 199 converter = ucnv_open(codepage, &status); 200 } 201 202 length = doExtract(start, length, target, capacity, converter, status); 203 204 // close the converter 205 if (codepage == 0) { 206 u_releaseDefaultConverter(converter); 207 } else { 208 ucnv_close(converter); 209 } 210 211 return length; 212 } 213 214 int32_t 215 UnicodeString::extract(char *dest, int32_t destCapacity, 216 UConverter *cnv, 217 UErrorCode &errorCode) const 218 { 219 if(U_FAILURE(errorCode)) { 220 return 0; 221 } 222 223 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { 224 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 225 return 0; 226 } 227 228 // nothing to do? 229 if(isEmpty()) { 230 return u_terminateChars(dest, destCapacity, 0, &errorCode); 231 } 232 233 // get the converter 234 UBool isDefaultConverter; 235 if(cnv==0) { 236 isDefaultConverter=TRUE; 237 cnv=u_getDefaultConverter(&errorCode); 238 if(U_FAILURE(errorCode)) { 239 return 0; 240 } 241 } else { 242 isDefaultConverter=FALSE; 243 ucnv_resetFromUnicode(cnv); 244 } 245 246 // convert 247 int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode); 248 249 // release the converter 250 if(isDefaultConverter) { 251 u_releaseDefaultConverter(cnv); 252 } 253 254 return len; 255 } 256 257 int32_t 258 UnicodeString::doExtract(int32_t start, int32_t length, 259 char *dest, int32_t destCapacity, 260 UConverter *cnv, 261 UErrorCode &errorCode) const 262 { 263 if(U_FAILURE(errorCode)) { 264 if(destCapacity!=0) { 265 *dest=0; 266 } 267 return 0; 268 } 269 270 const UChar *src=getArrayStart()+start, *srcLimit=src+length; 271 char *originalDest=dest; 272 const char *destLimit; 273 274 if(destCapacity==0) { 275 destLimit=dest=0; 276 } else if(destCapacity==-1) { 277 // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used. 278 destLimit=(char*)U_MAX_PTR(dest); 279 // for NUL-termination, translate into highest int32_t 280 destCapacity=0x7fffffff; 281 } else { 282 destLimit=dest+destCapacity; 283 } 284 285 // perform the conversion 286 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); 287 length=(int32_t)(dest-originalDest); 288 289 // if an overflow occurs, then get the preflighting length 290 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 291 char buffer[1024]; 292 293 destLimit=buffer+sizeof(buffer); 294 do { 295 dest=buffer; 296 errorCode=U_ZERO_ERROR; 297 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode); 298 length+=(int32_t)(dest-buffer); 299 } while(errorCode==U_BUFFER_OVERFLOW_ERROR); 300 } 301 302 return u_terminateChars(originalDest, destCapacity, length, &errorCode); 303 } 304 305 void 306 UnicodeString::doCodepageCreate(const char *codepageData, 307 int32_t dataLength, 308 const char *codepage) 309 { 310 // if there's nothing to convert, do nothing 311 if(codepageData == 0 || dataLength == 0 || dataLength < -1) { 312 return; 313 } 314 if(dataLength == -1) { 315 dataLength = (int32_t)uprv_strlen(codepageData); 316 } 317 318 UErrorCode status = U_ZERO_ERROR; 319 320 // create the converter 321 // if the codepage is the default, use our cache 322 // if it is an empty string, then use the "invariant character" conversion 323 UConverter *converter; 324 if (codepage == 0) { 325 const char *defaultName = ucnv_getDefaultName(); 326 if(UCNV_FAST_IS_UTF8(defaultName)) { 327 setToUTF8(StringPiece(codepageData, dataLength)); 328 return; 329 } 330 converter = u_getDefaultConverter(&status); 331 } else if(*codepage == 0) { 332 // use the "invariant characters" conversion 333 if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) { 334 u_charsToUChars(codepageData, getArrayStart(), dataLength); 335 setLength(dataLength); 336 } else { 337 setToBogus(); 338 } 339 return; 340 } else { 341 converter = ucnv_open(codepage, &status); 342 } 343 344 // if we failed, set the appropriate flags and return 345 if(U_FAILURE(status)) { 346 setToBogus(); 347 return; 348 } 349 350 // perform the conversion 351 doCodepageCreate(codepageData, dataLength, converter, status); 352 if(U_FAILURE(status)) { 353 setToBogus(); 354 } 355 356 // close the converter 357 if(codepage == 0) { 358 u_releaseDefaultConverter(converter); 359 } else { 360 ucnv_close(converter); 361 } 362 } 363 364 void 365 UnicodeString::doCodepageCreate(const char *codepageData, 366 int32_t dataLength, 367 UConverter *converter, 368 UErrorCode &status) 369 { 370 if(U_FAILURE(status)) { 371 return; 372 } 373 374 // set up the conversion parameters 375 const char *mySource = codepageData; 376 const char *mySourceEnd = mySource + dataLength; 377 UChar *array, *myTarget; 378 379 // estimate the size needed: 380 int32_t arraySize; 381 if(dataLength <= US_STACKBUF_SIZE) { 382 // try to use the stack buffer 383 arraySize = US_STACKBUF_SIZE; 384 } else { 385 // 1.25 UChar's per source byte should cover most cases 386 arraySize = dataLength + (dataLength >> 2); 387 } 388 389 // we do not care about the current contents 390 UBool doCopyArray = FALSE; 391 for(;;) { 392 if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) { 393 setToBogus(); 394 break; 395 } 396 397 // perform the conversion 398 array = getArrayStart(); 399 myTarget = array + length(); 400 ucnv_toUnicode(converter, &myTarget, array + getCapacity(), 401 &mySource, mySourceEnd, 0, TRUE, &status); 402 403 // update the conversion parameters 404 setLength((int32_t)(myTarget - array)); 405 406 // allocate more space and copy data, if needed 407 if(status == U_BUFFER_OVERFLOW_ERROR) { 408 // reset the error code 409 status = U_ZERO_ERROR; 410 411 // keep the previous conversion results 412 doCopyArray = TRUE; 413 414 // estimate the new size needed, larger than before 415 // try 2 UChar's per remaining source byte 416 arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource)); 417 } else { 418 break; 419 } 420 } 421 } 422 423 U_NAMESPACE_END 424 425 #endif 426