1 /* 2 ****************************************************************************** 3 * Copyright (C) 1999-2015, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ****************************************************************************** 6 * 7 * File unistr.cpp 8 * 9 * Modification History: 10 * 11 * Date Name Description 12 * 09/25/98 stephen Creation. 13 * 04/20/99 stephen Overhauled per 4/16 code review. 14 * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX 15 * 11/18/99 aliu Added handleReplaceBetween() to make inherit from 16 * Replaceable. 17 * 06/25/01 grhoten Removed the dependency on iostream 18 ****************************************************************************** 19 */ 20 21 #include "unicode/utypes.h" 22 #include "unicode/appendable.h" 23 #include "unicode/putil.h" 24 #include "cstring.h" 25 #include "cmemory.h" 26 #include "unicode/ustring.h" 27 #include "unicode/unistr.h" 28 #include "unicode/utf.h" 29 #include "unicode/utf16.h" 30 #include "uelement.h" 31 #include "ustr_imp.h" 32 #include "umutex.h" 33 #include "uassert.h" 34 35 #if 0 36 37 #include <iostream> 38 using namespace std; 39 40 //DEBUGGING 41 void 42 print(const UnicodeString& s, 43 const char *name) 44 { 45 UChar c; 46 cout << name << ":|"; 47 for(int i = 0; i < s.length(); ++i) { 48 c = s[i]; 49 if(c>= 0x007E || c < 0x0020) 50 cout << "[0x" << hex << s[i] << "]"; 51 else 52 cout << (char) s[i]; 53 } 54 cout << '|' << endl; 55 } 56 57 void 58 print(const UChar *s, 59 int32_t len, 60 const char *name) 61 { 62 UChar c; 63 cout << name << ":|"; 64 for(int i = 0; i < len; ++i) { 65 c = s[i]; 66 if(c>= 0x007E || c < 0x0020) 67 cout << "[0x" << hex << s[i] << "]"; 68 else 69 cout << (char) s[i]; 70 } 71 cout << '|' << endl; 72 } 73 // END DEBUGGING 74 #endif 75 76 // Local function definitions for now 77 78 // need to copy areas that may overlap 79 static 80 inline void 81 us_arrayCopy(const UChar *src, int32_t srcStart, 82 UChar *dst, int32_t dstStart, int32_t count) 83 { 84 if(count>0) { 85 uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src))); 86 } 87 } 88 89 // u_unescapeAt() callback to get a UChar from a UnicodeString 90 U_CDECL_BEGIN 91 static UChar U_CALLCONV 92 UnicodeString_charAt(int32_t offset, void *context) { 93 return ((icu::UnicodeString*) context)->charAt(offset); 94 } 95 U_CDECL_END 96 97 U_NAMESPACE_BEGIN 98 99 /* The Replaceable virtual destructor can't be defined in the header 100 due to how AIX works with multiple definitions of virtual functions. 101 */ 102 Replaceable::~Replaceable() {} 103 104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString) 105 106 UnicodeString U_EXPORT2 107 operator+ (const UnicodeString &s1, const UnicodeString &s2) { 108 return 109 UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0). 110 append(s1). 111 append(s2); 112 } 113 114 //======================================== 115 // Reference Counting functions, put at top of file so that optimizing compilers 116 // have a chance to automatically inline. 117 //======================================== 118 119 void 120 UnicodeString::addRef() { 121 umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1); 122 } 123 124 int32_t 125 UnicodeString::removeRef() { 126 return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1); 127 } 128 129 int32_t 130 UnicodeString::refCount() const { 131 return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1)); 132 } 133 134 void 135 UnicodeString::releaseArray() { 136 if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) { 137 uprv_free((int32_t *)fUnion.fFields.fArray - 1); 138 } 139 } 140 141 142 143 //======================================== 144 // Constructors 145 //======================================== 146 147 // The default constructor is inline in unistr.h. 148 149 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) { 150 fUnion.fFields.fLengthAndFlags = 0; 151 if(count <= 0 || (uint32_t)c > 0x10ffff) { 152 // just allocate and do not do anything else 153 allocate(capacity); 154 } else { 155 // count > 0, allocate and fill the new string with count c's 156 int32_t unitCount = U16_LENGTH(c), length = count * unitCount; 157 if(capacity < length) { 158 capacity = length; 159 } 160 if(allocate(capacity)) { 161 UChar *array = getArrayStart(); 162 int32_t i = 0; 163 164 // fill the new string with c 165 if(unitCount == 1) { 166 // fill with length UChars 167 while(i < length) { 168 array[i++] = (UChar)c; 169 } 170 } else { 171 // get the code units for c 172 UChar units[U16_MAX_LENGTH]; 173 U16_APPEND_UNSAFE(units, i, c); 174 175 // now it must be i==unitCount 176 i = 0; 177 178 // for Unicode, unitCount can only be 1, 2, 3, or 4 179 // 1 is handled above 180 while(i < length) { 181 int32_t unitIdx = 0; 182 while(unitIdx < unitCount) { 183 array[i++]=units[unitIdx++]; 184 } 185 } 186 } 187 } 188 setLength(length); 189 } 190 } 191 192 UnicodeString::UnicodeString(UChar ch) { 193 fUnion.fFields.fLengthAndFlags = kLength1 | kShortString; 194 fUnion.fStackFields.fBuffer[0] = ch; 195 } 196 197 UnicodeString::UnicodeString(UChar32 ch) { 198 fUnion.fFields.fLengthAndFlags = kShortString; 199 int32_t i = 0; 200 UBool isError = FALSE; 201 U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError); 202 // We test isError so that the compiler does not complain that we don't. 203 // If isError then i==0 which is what we want anyway. 204 if(!isError) { 205 setShortLength(i); 206 } 207 } 208 209 UnicodeString::UnicodeString(const UChar *text) { 210 fUnion.fFields.fLengthAndFlags = kShortString; 211 doAppend(text, 0, -1); 212 } 213 214 UnicodeString::UnicodeString(const UChar *text, 215 int32_t textLength) { 216 fUnion.fFields.fLengthAndFlags = kShortString; 217 doAppend(text, 0, textLength); 218 } 219 220 UnicodeString::UnicodeString(UBool isTerminated, 221 const UChar *text, 222 int32_t textLength) { 223 fUnion.fFields.fLengthAndFlags = kReadonlyAlias; 224 if(text == NULL) { 225 // treat as an empty string, do not alias 226 setToEmpty(); 227 } else if(textLength < -1 || 228 (textLength == -1 && !isTerminated) || 229 (textLength >= 0 && isTerminated && text[textLength] != 0) 230 ) { 231 setToBogus(); 232 } else { 233 if(textLength == -1) { 234 // text is terminated, or else it would have failed the above test 235 textLength = u_strlen(text); 236 } 237 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength); 238 } 239 } 240 241 UnicodeString::UnicodeString(UChar *buff, 242 int32_t buffLength, 243 int32_t buffCapacity) { 244 fUnion.fFields.fLengthAndFlags = kWritableAlias; 245 if(buff == NULL) { 246 // treat as an empty string, do not alias 247 setToEmpty(); 248 } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) { 249 setToBogus(); 250 } else { 251 if(buffLength == -1) { 252 // fLength = u_strlen(buff); but do not look beyond buffCapacity 253 const UChar *p = buff, *limit = buff + buffCapacity; 254 while(p != limit && *p != 0) { 255 ++p; 256 } 257 buffLength = (int32_t)(p - buff); 258 } 259 setArray(buff, buffLength, buffCapacity); 260 } 261 } 262 263 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) { 264 fUnion.fFields.fLengthAndFlags = kShortString; 265 if(src==NULL) { 266 // treat as an empty string 267 } else { 268 if(length<0) { 269 length=(int32_t)uprv_strlen(src); 270 } 271 if(cloneArrayIfNeeded(length, length, FALSE)) { 272 u_charsToUChars(src, getArrayStart(), length); 273 setLength(length); 274 } else { 275 setToBogus(); 276 } 277 } 278 } 279 280 #if U_CHARSET_IS_UTF8 281 282 UnicodeString::UnicodeString(const char *codepageData) { 283 fUnion.fFields.fLengthAndFlags = kShortString; 284 if(codepageData != 0) { 285 setToUTF8(codepageData); 286 } 287 } 288 289 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) { 290 fUnion.fFields.fLengthAndFlags = kShortString; 291 // if there's nothing to convert, do nothing 292 if(codepageData == 0 || dataLength == 0 || dataLength < -1) { 293 return; 294 } 295 if(dataLength == -1) { 296 dataLength = (int32_t)uprv_strlen(codepageData); 297 } 298 setToUTF8(StringPiece(codepageData, dataLength)); 299 } 300 301 // else see unistr_cnv.cpp 302 #endif 303 304 UnicodeString::UnicodeString(const UnicodeString& that) { 305 fUnion.fFields.fLengthAndFlags = kShortString; 306 copyFrom(that); 307 } 308 309 #if U_HAVE_RVALUE_REFERENCES 310 UnicodeString::UnicodeString(UnicodeString &&src) U_NOEXCEPT { 311 fUnion.fFields.fLengthAndFlags = kShortString; 312 moveFrom(src); 313 } 314 #endif 315 316 UnicodeString::UnicodeString(const UnicodeString& that, 317 int32_t srcStart) { 318 fUnion.fFields.fLengthAndFlags = kShortString; 319 setTo(that, srcStart); 320 } 321 322 UnicodeString::UnicodeString(const UnicodeString& that, 323 int32_t srcStart, 324 int32_t srcLength) { 325 fUnion.fFields.fLengthAndFlags = kShortString; 326 setTo(that, srcStart, srcLength); 327 } 328 329 // Replaceable base class clone() default implementation, does not clone 330 Replaceable * 331 Replaceable::clone() const { 332 return NULL; 333 } 334 335 // UnicodeString overrides clone() with a real implementation 336 Replaceable * 337 UnicodeString::clone() const { 338 return new UnicodeString(*this); 339 } 340 341 //======================================== 342 // array allocation 343 //======================================== 344 345 UBool 346 UnicodeString::allocate(int32_t capacity) { 347 if(capacity <= US_STACKBUF_SIZE) { 348 fUnion.fFields.fLengthAndFlags = kShortString; 349 } else { 350 // count bytes for the refCounter and the string capacity, and 351 // round up to a multiple of 16; then divide by 4 and allocate int32_t's 352 // to be safely aligned for the refCount 353 // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer() 354 int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2); 355 int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words ); 356 if(array != 0) { 357 // set initial refCount and point behind the refCount 358 *array++ = 1; 359 360 // have fArray point to the first UChar 361 fUnion.fFields.fArray = (UChar *)array; 362 fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR)); 363 fUnion.fFields.fLengthAndFlags = kLongString; 364 } else { 365 fUnion.fFields.fLengthAndFlags = kIsBogus; 366 fUnion.fFields.fArray = 0; 367 fUnion.fFields.fCapacity = 0; 368 return FALSE; 369 } 370 } 371 return TRUE; 372 } 373 374 //======================================== 375 // Destructor 376 //======================================== 377 378 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS 379 static u_atomic_int32_t finalLengthCounts[0x400]; // UnicodeString::kMaxShortLength+1 380 static u_atomic_int32_t beyondCount(0); 381 382 U_CAPI void unistr_printLengths() { 383 int32_t i; 384 for(i = 0; i <= 59; ++i) { 385 printf("%2d, %9d\n", i, (int32_t)finalLengthCounts[i]); 386 } 387 int32_t beyond = beyondCount; 388 for(; i < UPRV_LENGTHOF(finalLengthCounts); ++i) { 389 beyond += finalLengthCounts[i]; 390 } 391 printf(">59, %9d\n", beyond); 392 } 393 #endif 394 395 UnicodeString::~UnicodeString() 396 { 397 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS 398 // Count lengths of strings at the end of their lifetime. 399 // Useful for discussion of a desirable stack buffer size. 400 // Count the contents length, not the optional NUL terminator nor further capacity. 401 // Ignore open-buffer strings and strings which alias external storage. 402 if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) { 403 if(hasShortLength()) { 404 umtx_atomic_inc(finalLengthCounts + getShortLength()); 405 } else { 406 umtx_atomic_inc(&beyondCount); 407 } 408 } 409 #endif 410 411 releaseArray(); 412 } 413 414 //======================================== 415 // Factory methods 416 //======================================== 417 418 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) { 419 UnicodeString result; 420 result.setToUTF8(utf8); 421 return result; 422 } 423 424 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) { 425 UnicodeString result; 426 int32_t capacity; 427 // Most UTF-32 strings will be BMP-only and result in a same-length 428 // UTF-16 string. We overestimate the capacity just slightly, 429 // just in case there are a few supplementary characters. 430 if(length <= US_STACKBUF_SIZE) { 431 capacity = US_STACKBUF_SIZE; 432 } else { 433 capacity = length + (length >> 4) + 4; 434 } 435 do { 436 UChar *utf16 = result.getBuffer(capacity); 437 int32_t length16; 438 UErrorCode errorCode = U_ZERO_ERROR; 439 u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16, 440 utf32, length, 441 0xfffd, // Substitution character. 442 NULL, // Don't care about number of substitutions. 443 &errorCode); 444 result.releaseBuffer(length16); 445 if(errorCode == U_BUFFER_OVERFLOW_ERROR) { 446 capacity = length16 + 1; // +1 for the terminating NUL. 447 continue; 448 } else if(U_FAILURE(errorCode)) { 449 result.setToBogus(); 450 } 451 break; 452 } while(TRUE); 453 return result; 454 } 455 456 //======================================== 457 // Assignment 458 //======================================== 459 460 UnicodeString & 461 UnicodeString::operator=(const UnicodeString &src) { 462 return copyFrom(src); 463 } 464 465 UnicodeString & 466 UnicodeString::fastCopyFrom(const UnicodeString &src) { 467 return copyFrom(src, TRUE); 468 } 469 470 UnicodeString & 471 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) { 472 // if assigning to ourselves, do nothing 473 if(this == &src) { 474 return *this; 475 } 476 477 // is the right side bogus? 478 if(src.isBogus()) { 479 setToBogus(); 480 return *this; 481 } 482 483 // delete the current contents 484 releaseArray(); 485 486 if(src.isEmpty()) { 487 // empty string - use the stack buffer 488 setToEmpty(); 489 return *this; 490 } 491 492 // fLength>0 and not an "open" src.getBuffer(minCapacity) 493 fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags; 494 switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) { 495 case kShortString: 496 // short string using the stack buffer, do the same 497 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer, 498 getShortLength() * U_SIZEOF_UCHAR); 499 break; 500 case kLongString: 501 // src uses a refCounted string buffer, use that buffer with refCount 502 // src is const, use a cast - we don't actually change it 503 ((UnicodeString &)src).addRef(); 504 // copy all fields, share the reference-counted buffer 505 fUnion.fFields.fArray = src.fUnion.fFields.fArray; 506 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; 507 if(!hasShortLength()) { 508 fUnion.fFields.fLength = src.fUnion.fFields.fLength; 509 } 510 break; 511 case kReadonlyAlias: 512 if(fastCopy) { 513 // src is a readonly alias, do the same 514 // -> maintain the readonly alias as such 515 fUnion.fFields.fArray = src.fUnion.fFields.fArray; 516 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; 517 if(!hasShortLength()) { 518 fUnion.fFields.fLength = src.fUnion.fFields.fLength; 519 } 520 break; 521 } 522 // else if(!fastCopy) fall through to case kWritableAlias 523 // -> allocate a new buffer and copy the contents 524 case kWritableAlias: { 525 // src is a writable alias; we make a copy of that instead 526 int32_t srcLength = src.length(); 527 if(allocate(srcLength)) { 528 uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR); 529 setLength(srcLength); 530 break; 531 } 532 // if there is not enough memory, then fall through to setting to bogus 533 } 534 default: 535 // if src is bogus, set ourselves to bogus 536 // do not call setToBogus() here because fArray and flags are not consistent here 537 fUnion.fFields.fLengthAndFlags = kIsBogus; 538 fUnion.fFields.fArray = 0; 539 fUnion.fFields.fCapacity = 0; 540 break; 541 } 542 543 return *this; 544 } 545 546 UnicodeString &UnicodeString::moveFrom(UnicodeString &src) U_NOEXCEPT { 547 // No explicit check for self move assignment, consistent with standard library. 548 // Self move assignment causes no crash nor leak but might make the object bogus. 549 releaseArray(); 550 copyFieldsFrom(src, TRUE); 551 return *this; 552 } 553 554 // Same as moveFrom() except without memory management. 555 void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) U_NOEXCEPT { 556 int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags; 557 if(lengthAndFlags & kUsingStackBuffer) { 558 // Short string using the stack buffer, copy the contents. 559 // Check for self assignment to prevent "overlap in memcpy" warnings, 560 // although it should be harmless to copy a buffer to itself exactly. 561 if(this != &src) { 562 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer, 563 getShortLength() * U_SIZEOF_UCHAR); 564 } 565 } else { 566 // In all other cases, copy all fields. 567 fUnion.fFields.fArray = src.fUnion.fFields.fArray; 568 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; 569 if(!hasShortLength()) { 570 fUnion.fFields.fLength = src.fUnion.fFields.fLength; 571 } 572 if(setSrcToBogus) { 573 // Set src to bogus without releasing any memory. 574 src.fUnion.fFields.fLengthAndFlags = kIsBogus; 575 src.fUnion.fFields.fArray = NULL; 576 src.fUnion.fFields.fCapacity = 0; 577 } 578 } 579 } 580 581 void UnicodeString::swap(UnicodeString &other) U_NOEXCEPT { 582 UnicodeString temp; // Empty short string: Known not to need releaseArray(). 583 // Copy fields without resetting source values in between. 584 temp.copyFieldsFrom(*this, FALSE); 585 this->copyFieldsFrom(other, FALSE); 586 other.copyFieldsFrom(temp, FALSE); 587 // Set temp to an empty string so that other's memory is not released twice. 588 temp.fUnion.fFields.fLengthAndFlags = kShortString; 589 } 590 591 //======================================== 592 // Miscellaneous operations 593 //======================================== 594 595 UnicodeString UnicodeString::unescape() const { 596 UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity 597 if (result.isBogus()) { 598 return result; 599 } 600 const UChar *array = getBuffer(); 601 int32_t len = length(); 602 int32_t prev = 0; 603 for (int32_t i=0;;) { 604 if (i == len) { 605 result.append(array, prev, len - prev); 606 break; 607 } 608 if (array[i++] == 0x5C /*'\\'*/) { 609 result.append(array, prev, (i - 1) - prev); 610 UChar32 c = unescapeAt(i); // advances i 611 if (c < 0) { 612 result.remove(); // return empty string 613 break; // invalid escape sequence 614 } 615 result.append(c); 616 prev = i; 617 } 618 } 619 return result; 620 } 621 622 UChar32 UnicodeString::unescapeAt(int32_t &offset) const { 623 return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this); 624 } 625 626 //======================================== 627 // Read-only implementation 628 //======================================== 629 UBool 630 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const { 631 // Requires: this & text not bogus and have same lengths. 632 // Byte-wise comparison works for equality regardless of endianness. 633 return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0; 634 } 635 636 int8_t 637 UnicodeString::doCompare( int32_t start, 638 int32_t length, 639 const UChar *srcChars, 640 int32_t srcStart, 641 int32_t srcLength) const 642 { 643 // compare illegal string values 644 if(isBogus()) { 645 return -1; 646 } 647 648 // pin indices to legal values 649 pinIndices(start, length); 650 651 if(srcChars == NULL) { 652 // treat const UChar *srcChars==NULL as an empty string 653 return length == 0 ? 0 : 1; 654 } 655 656 // get the correct pointer 657 const UChar *chars = getArrayStart(); 658 659 chars += start; 660 srcChars += srcStart; 661 662 int32_t minLength; 663 int8_t lengthResult; 664 665 // get the srcLength if necessary 666 if(srcLength < 0) { 667 srcLength = u_strlen(srcChars + srcStart); 668 } 669 670 // are we comparing different lengths? 671 if(length != srcLength) { 672 if(length < srcLength) { 673 minLength = length; 674 lengthResult = -1; 675 } else { 676 minLength = srcLength; 677 lengthResult = 1; 678 } 679 } else { 680 minLength = length; 681 lengthResult = 0; 682 } 683 684 /* 685 * note that uprv_memcmp() returns an int but we return an int8_t; 686 * we need to take care not to truncate the result - 687 * one way to do this is to right-shift the value to 688 * move the sign bit into the lower 8 bits and making sure that this 689 * does not become 0 itself 690 */ 691 692 if(minLength > 0 && chars != srcChars) { 693 int32_t result; 694 695 # if U_IS_BIG_ENDIAN 696 // big-endian: byte comparison works 697 result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar)); 698 if(result != 0) { 699 return (int8_t)(result >> 15 | 1); 700 } 701 # else 702 // little-endian: compare UChar units 703 do { 704 result = ((int32_t)*(chars++) - (int32_t)*(srcChars++)); 705 if(result != 0) { 706 return (int8_t)(result >> 15 | 1); 707 } 708 } while(--minLength > 0); 709 # endif 710 } 711 return lengthResult; 712 } 713 714 /* String compare in code point order - doCompare() compares in code unit order. */ 715 int8_t 716 UnicodeString::doCompareCodePointOrder(int32_t start, 717 int32_t length, 718 const UChar *srcChars, 719 int32_t srcStart, 720 int32_t srcLength) const 721 { 722 // compare illegal string values 723 // treat const UChar *srcChars==NULL as an empty string 724 if(isBogus()) { 725 return -1; 726 } 727 728 // pin indices to legal values 729 pinIndices(start, length); 730 731 if(srcChars == NULL) { 732 srcStart = srcLength = 0; 733 } 734 735 int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE); 736 /* translate the 32-bit result into an 8-bit one */ 737 if(diff!=0) { 738 return (int8_t)(diff >> 15 | 1); 739 } else { 740 return 0; 741 } 742 } 743 744 int32_t 745 UnicodeString::getLength() const { 746 return length(); 747 } 748 749 UChar 750 UnicodeString::getCharAt(int32_t offset) const { 751 return charAt(offset); 752 } 753 754 UChar32 755 UnicodeString::getChar32At(int32_t offset) const { 756 return char32At(offset); 757 } 758 759 UChar32 760 UnicodeString::char32At(int32_t offset) const 761 { 762 int32_t len = length(); 763 if((uint32_t)offset < (uint32_t)len) { 764 const UChar *array = getArrayStart(); 765 UChar32 c; 766 U16_GET(array, 0, offset, len, c); 767 return c; 768 } else { 769 return kInvalidUChar; 770 } 771 } 772 773 int32_t 774 UnicodeString::getChar32Start(int32_t offset) const { 775 if((uint32_t)offset < (uint32_t)length()) { 776 const UChar *array = getArrayStart(); 777 U16_SET_CP_START(array, 0, offset); 778 return offset; 779 } else { 780 return 0; 781 } 782 } 783 784 int32_t 785 UnicodeString::getChar32Limit(int32_t offset) const { 786 int32_t len = length(); 787 if((uint32_t)offset < (uint32_t)len) { 788 const UChar *array = getArrayStart(); 789 U16_SET_CP_LIMIT(array, 0, offset, len); 790 return offset; 791 } else { 792 return len; 793 } 794 } 795 796 int32_t 797 UnicodeString::countChar32(int32_t start, int32_t length) const { 798 pinIndices(start, length); 799 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL 800 return u_countChar32(getArrayStart()+start, length); 801 } 802 803 UBool 804 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const { 805 pinIndices(start, length); 806 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL 807 return u_strHasMoreChar32Than(getArrayStart()+start, length, number); 808 } 809 810 int32_t 811 UnicodeString::moveIndex32(int32_t index, int32_t delta) const { 812 // pin index 813 int32_t len = length(); 814 if(index<0) { 815 index=0; 816 } else if(index>len) { 817 index=len; 818 } 819 820 const UChar *array = getArrayStart(); 821 if(delta>0) { 822 U16_FWD_N(array, index, len, delta); 823 } else { 824 U16_BACK_N(array, 0, index, -delta); 825 } 826 827 return index; 828 } 829 830 void 831 UnicodeString::doExtract(int32_t start, 832 int32_t length, 833 UChar *dst, 834 int32_t dstStart) const 835 { 836 // pin indices to legal values 837 pinIndices(start, length); 838 839 // do not copy anything if we alias dst itself 840 const UChar *array = getArrayStart(); 841 if(array + start != dst + dstStart) { 842 us_arrayCopy(array, start, dst, dstStart, length); 843 } 844 } 845 846 int32_t 847 UnicodeString::extract(UChar *dest, int32_t destCapacity, 848 UErrorCode &errorCode) const { 849 int32_t len = length(); 850 if(U_SUCCESS(errorCode)) { 851 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { 852 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 853 } else { 854 const UChar *array = getArrayStart(); 855 if(len>0 && len<=destCapacity && array!=dest) { 856 uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR); 857 } 858 return u_terminateUChars(dest, destCapacity, len, &errorCode); 859 } 860 } 861 862 return len; 863 } 864 865 int32_t 866 UnicodeString::extract(int32_t start, 867 int32_t length, 868 char *target, 869 int32_t targetCapacity, 870 enum EInvariant) const 871 { 872 // if the arguments are illegal, then do nothing 873 if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) { 874 return 0; 875 } 876 877 // pin the indices to legal values 878 pinIndices(start, length); 879 880 if(length <= targetCapacity) { 881 u_UCharsToChars(getArrayStart() + start, target, length); 882 } 883 UErrorCode status = U_ZERO_ERROR; 884 return u_terminateChars(target, targetCapacity, length, &status); 885 } 886 887 UnicodeString 888 UnicodeString::tempSubString(int32_t start, int32_t len) const { 889 pinIndices(start, len); 890 const UChar *array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer 891 if(array==NULL) { 892 array=fUnion.fStackFields.fBuffer; // anything not NULL because that would make an empty string 893 len=-2; // bogus result string 894 } 895 return UnicodeString(FALSE, array + start, len); 896 } 897 898 int32_t 899 UnicodeString::toUTF8(int32_t start, int32_t len, 900 char *target, int32_t capacity) const { 901 pinIndices(start, len); 902 int32_t length8; 903 UErrorCode errorCode = U_ZERO_ERROR; 904 u_strToUTF8WithSub(target, capacity, &length8, 905 getBuffer() + start, len, 906 0xFFFD, // Standard substitution character. 907 NULL, // Don't care about number of substitutions. 908 &errorCode); 909 return length8; 910 } 911 912 #if U_CHARSET_IS_UTF8 913 914 int32_t 915 UnicodeString::extract(int32_t start, int32_t len, 916 char *target, uint32_t dstSize) const { 917 // if the arguments are illegal, then do nothing 918 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) { 919 return 0; 920 } 921 return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff); 922 } 923 924 // else see unistr_cnv.cpp 925 #endif 926 927 void 928 UnicodeString::extractBetween(int32_t start, 929 int32_t limit, 930 UnicodeString& target) const { 931 pinIndex(start); 932 pinIndex(limit); 933 doExtract(start, limit - start, target); 934 } 935 936 // When converting from UTF-16 to UTF-8, the result will have at most 3 times 937 // as many bytes as the source has UChars. 938 // The "worst cases" are writing systems like Indic, Thai and CJK with 939 // 3:1 bytes:UChars. 940 void 941 UnicodeString::toUTF8(ByteSink &sink) const { 942 int32_t length16 = length(); 943 if(length16 != 0) { 944 char stackBuffer[1024]; 945 int32_t capacity = (int32_t)sizeof(stackBuffer); 946 UBool utf8IsOwned = FALSE; 947 char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity, 948 3*length16, 949 stackBuffer, capacity, 950 &capacity); 951 int32_t length8 = 0; 952 UErrorCode errorCode = U_ZERO_ERROR; 953 u_strToUTF8WithSub(utf8, capacity, &length8, 954 getBuffer(), length16, 955 0xFFFD, // Standard substitution character. 956 NULL, // Don't care about number of substitutions. 957 &errorCode); 958 if(errorCode == U_BUFFER_OVERFLOW_ERROR) { 959 utf8 = (char *)uprv_malloc(length8); 960 if(utf8 != NULL) { 961 utf8IsOwned = TRUE; 962 errorCode = U_ZERO_ERROR; 963 u_strToUTF8WithSub(utf8, length8, &length8, 964 getBuffer(), length16, 965 0xFFFD, // Standard substitution character. 966 NULL, // Don't care about number of substitutions. 967 &errorCode); 968 } else { 969 errorCode = U_MEMORY_ALLOCATION_ERROR; 970 } 971 } 972 if(U_SUCCESS(errorCode)) { 973 sink.Append(utf8, length8); 974 sink.Flush(); 975 } 976 if(utf8IsOwned) { 977 uprv_free(utf8); 978 } 979 } 980 } 981 982 int32_t 983 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const { 984 int32_t length32=0; 985 if(U_SUCCESS(errorCode)) { 986 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments. 987 u_strToUTF32WithSub(utf32, capacity, &length32, 988 getBuffer(), length(), 989 0xfffd, // Substitution character. 990 NULL, // Don't care about number of substitutions. 991 &errorCode); 992 } 993 return length32; 994 } 995 996 int32_t 997 UnicodeString::indexOf(const UChar *srcChars, 998 int32_t srcStart, 999 int32_t srcLength, 1000 int32_t start, 1001 int32_t length) const 1002 { 1003 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) { 1004 return -1; 1005 } 1006 1007 // UnicodeString does not find empty substrings 1008 if(srcLength < 0 && srcChars[srcStart] == 0) { 1009 return -1; 1010 } 1011 1012 // get the indices within bounds 1013 pinIndices(start, length); 1014 1015 // find the first occurrence of the substring 1016 const UChar *array = getArrayStart(); 1017 const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength); 1018 if(match == NULL) { 1019 return -1; 1020 } else { 1021 return (int32_t)(match - array); 1022 } 1023 } 1024 1025 int32_t 1026 UnicodeString::doIndexOf(UChar c, 1027 int32_t start, 1028 int32_t length) const 1029 { 1030 // pin indices 1031 pinIndices(start, length); 1032 1033 // find the first occurrence of c 1034 const UChar *array = getArrayStart(); 1035 const UChar *match = u_memchr(array + start, c, length); 1036 if(match == NULL) { 1037 return -1; 1038 } else { 1039 return (int32_t)(match - array); 1040 } 1041 } 1042 1043 int32_t 1044 UnicodeString::doIndexOf(UChar32 c, 1045 int32_t start, 1046 int32_t length) const { 1047 // pin indices 1048 pinIndices(start, length); 1049 1050 // find the first occurrence of c 1051 const UChar *array = getArrayStart(); 1052 const UChar *match = u_memchr32(array + start, c, length); 1053 if(match == NULL) { 1054 return -1; 1055 } else { 1056 return (int32_t)(match - array); 1057 } 1058 } 1059 1060 int32_t 1061 UnicodeString::lastIndexOf(const UChar *srcChars, 1062 int32_t srcStart, 1063 int32_t srcLength, 1064 int32_t start, 1065 int32_t length) const 1066 { 1067 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) { 1068 return -1; 1069 } 1070 1071 // UnicodeString does not find empty substrings 1072 if(srcLength < 0 && srcChars[srcStart] == 0) { 1073 return -1; 1074 } 1075 1076 // get the indices within bounds 1077 pinIndices(start, length); 1078 1079 // find the last occurrence of the substring 1080 const UChar *array = getArrayStart(); 1081 const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength); 1082 if(match == NULL) { 1083 return -1; 1084 } else { 1085 return (int32_t)(match - array); 1086 } 1087 } 1088 1089 int32_t 1090 UnicodeString::doLastIndexOf(UChar c, 1091 int32_t start, 1092 int32_t length) const 1093 { 1094 if(isBogus()) { 1095 return -1; 1096 } 1097 1098 // pin indices 1099 pinIndices(start, length); 1100 1101 // find the last occurrence of c 1102 const UChar *array = getArrayStart(); 1103 const UChar *match = u_memrchr(array + start, c, length); 1104 if(match == NULL) { 1105 return -1; 1106 } else { 1107 return (int32_t)(match - array); 1108 } 1109 } 1110 1111 int32_t 1112 UnicodeString::doLastIndexOf(UChar32 c, 1113 int32_t start, 1114 int32_t length) const { 1115 // pin indices 1116 pinIndices(start, length); 1117 1118 // find the last occurrence of c 1119 const UChar *array = getArrayStart(); 1120 const UChar *match = u_memrchr32(array + start, c, length); 1121 if(match == NULL) { 1122 return -1; 1123 } else { 1124 return (int32_t)(match - array); 1125 } 1126 } 1127 1128 //======================================== 1129 // Write implementation 1130 //======================================== 1131 1132 UnicodeString& 1133 UnicodeString::findAndReplace(int32_t start, 1134 int32_t length, 1135 const UnicodeString& oldText, 1136 int32_t oldStart, 1137 int32_t oldLength, 1138 const UnicodeString& newText, 1139 int32_t newStart, 1140 int32_t newLength) 1141 { 1142 if(isBogus() || oldText.isBogus() || newText.isBogus()) { 1143 return *this; 1144 } 1145 1146 pinIndices(start, length); 1147 oldText.pinIndices(oldStart, oldLength); 1148 newText.pinIndices(newStart, newLength); 1149 1150 if(oldLength == 0) { 1151 return *this; 1152 } 1153 1154 while(length > 0 && length >= oldLength) { 1155 int32_t pos = indexOf(oldText, oldStart, oldLength, start, length); 1156 if(pos < 0) { 1157 // no more oldText's here: done 1158 break; 1159 } else { 1160 // we found oldText, replace it by newText and go beyond it 1161 replace(pos, oldLength, newText, newStart, newLength); 1162 length -= pos + oldLength - start; 1163 start = pos + newLength; 1164 } 1165 } 1166 1167 return *this; 1168 } 1169 1170 1171 void 1172 UnicodeString::setToBogus() 1173 { 1174 releaseArray(); 1175 1176 fUnion.fFields.fLengthAndFlags = kIsBogus; 1177 fUnion.fFields.fArray = 0; 1178 fUnion.fFields.fCapacity = 0; 1179 } 1180 1181 // turn a bogus string into an empty one 1182 void 1183 UnicodeString::unBogus() { 1184 if(fUnion.fFields.fLengthAndFlags & kIsBogus) { 1185 setToEmpty(); 1186 } 1187 } 1188 1189 const UChar * 1190 UnicodeString::getTerminatedBuffer() { 1191 if(!isWritable()) { 1192 return 0; 1193 } 1194 UChar *array = getArrayStart(); 1195 int32_t len = length(); 1196 if(len < getCapacity()) { 1197 if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) { 1198 // If len<capacity on a read-only alias, then array[len] is 1199 // either the original NUL (if constructed with (TRUE, s, length)) 1200 // or one of the original string contents characters (if later truncated), 1201 // therefore we can assume that array[len] is initialized memory. 1202 if(array[len] == 0) { 1203 return array; 1204 } 1205 } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) { 1206 // kRefCounted: Do not write the NUL if the buffer is shared. 1207 // That is mostly safe, except when the length of one copy was modified 1208 // without copy-on-write, e.g., via truncate(newLength) or remove(void). 1209 // Then the NUL would be written into the middle of another copy's string. 1210 1211 // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL. 1212 // Do not test if there is a NUL already because it might be uninitialized memory. 1213 // (That would be safe, but tools like valgrind & Purify would complain.) 1214 array[len] = 0; 1215 return array; 1216 } 1217 } 1218 if(cloneArrayIfNeeded(len+1)) { 1219 array = getArrayStart(); 1220 array[len] = 0; 1221 return array; 1222 } else { 1223 return NULL; 1224 } 1225 } 1226 1227 // setTo() analogous to the readonly-aliasing constructor with the same signature 1228 UnicodeString & 1229 UnicodeString::setTo(UBool isTerminated, 1230 const UChar *text, 1231 int32_t textLength) 1232 { 1233 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) { 1234 // do not modify a string that has an "open" getBuffer(minCapacity) 1235 return *this; 1236 } 1237 1238 if(text == NULL) { 1239 // treat as an empty string, do not alias 1240 releaseArray(); 1241 setToEmpty(); 1242 return *this; 1243 } 1244 1245 if( textLength < -1 || 1246 (textLength == -1 && !isTerminated) || 1247 (textLength >= 0 && isTerminated && text[textLength] != 0) 1248 ) { 1249 setToBogus(); 1250 return *this; 1251 } 1252 1253 releaseArray(); 1254 1255 if(textLength == -1) { 1256 // text is terminated, or else it would have failed the above test 1257 textLength = u_strlen(text); 1258 } 1259 fUnion.fFields.fLengthAndFlags = kReadonlyAlias; 1260 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength); 1261 return *this; 1262 } 1263 1264 // setTo() analogous to the writable-aliasing constructor with the same signature 1265 UnicodeString & 1266 UnicodeString::setTo(UChar *buffer, 1267 int32_t buffLength, 1268 int32_t buffCapacity) { 1269 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) { 1270 // do not modify a string that has an "open" getBuffer(minCapacity) 1271 return *this; 1272 } 1273 1274 if(buffer == NULL) { 1275 // treat as an empty string, do not alias 1276 releaseArray(); 1277 setToEmpty(); 1278 return *this; 1279 } 1280 1281 if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) { 1282 setToBogus(); 1283 return *this; 1284 } else if(buffLength == -1) { 1285 // buffLength = u_strlen(buff); but do not look beyond buffCapacity 1286 const UChar *p = buffer, *limit = buffer + buffCapacity; 1287 while(p != limit && *p != 0) { 1288 ++p; 1289 } 1290 buffLength = (int32_t)(p - buffer); 1291 } 1292 1293 releaseArray(); 1294 1295 fUnion.fFields.fLengthAndFlags = kWritableAlias; 1296 setArray(buffer, buffLength, buffCapacity); 1297 return *this; 1298 } 1299 1300 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) { 1301 unBogus(); 1302 int32_t length = utf8.length(); 1303 int32_t capacity; 1304 // The UTF-16 string will be at most as long as the UTF-8 string. 1305 if(length <= US_STACKBUF_SIZE) { 1306 capacity = US_STACKBUF_SIZE; 1307 } else { 1308 capacity = length + 1; // +1 for the terminating NUL. 1309 } 1310 UChar *utf16 = getBuffer(capacity); 1311 int32_t length16; 1312 UErrorCode errorCode = U_ZERO_ERROR; 1313 u_strFromUTF8WithSub(utf16, getCapacity(), &length16, 1314 utf8.data(), length, 1315 0xfffd, // Substitution character. 1316 NULL, // Don't care about number of substitutions. 1317 &errorCode); 1318 releaseBuffer(length16); 1319 if(U_FAILURE(errorCode)) { 1320 setToBogus(); 1321 } 1322 return *this; 1323 } 1324 1325 UnicodeString& 1326 UnicodeString::setCharAt(int32_t offset, 1327 UChar c) 1328 { 1329 int32_t len = length(); 1330 if(cloneArrayIfNeeded() && len > 0) { 1331 if(offset < 0) { 1332 offset = 0; 1333 } else if(offset >= len) { 1334 offset = len - 1; 1335 } 1336 1337 getArrayStart()[offset] = c; 1338 } 1339 return *this; 1340 } 1341 1342 UnicodeString& 1343 UnicodeString::replace(int32_t start, 1344 int32_t _length, 1345 UChar32 srcChar) { 1346 UChar buffer[U16_MAX_LENGTH]; 1347 int32_t count = 0; 1348 UBool isError = FALSE; 1349 U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError); 1350 // We test isError so that the compiler does not complain that we don't. 1351 // If isError (srcChar is not a valid code point) then count==0 which means 1352 // we remove the source segment rather than replacing it with srcChar. 1353 return doReplace(start, _length, buffer, 0, isError ? 0 : count); 1354 } 1355 1356 UnicodeString& 1357 UnicodeString::append(UChar32 srcChar) { 1358 UChar buffer[U16_MAX_LENGTH]; 1359 int32_t _length = 0; 1360 UBool isError = FALSE; 1361 U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError); 1362 // We test isError so that the compiler does not complain that we don't. 1363 // If isError then _length==0 which turns the doAppend() into a no-op anyway. 1364 return isError ? *this : doAppend(buffer, 0, _length); 1365 } 1366 1367 UnicodeString& 1368 UnicodeString::doReplace( int32_t start, 1369 int32_t length, 1370 const UnicodeString& src, 1371 int32_t srcStart, 1372 int32_t srcLength) 1373 { 1374 // pin the indices to legal values 1375 src.pinIndices(srcStart, srcLength); 1376 1377 // get the characters from src 1378 // and replace the range in ourselves with them 1379 return doReplace(start, length, src.getArrayStart(), srcStart, srcLength); 1380 } 1381 1382 UnicodeString& 1383 UnicodeString::doReplace(int32_t start, 1384 int32_t length, 1385 const UChar *srcChars, 1386 int32_t srcStart, 1387 int32_t srcLength) 1388 { 1389 if(!isWritable()) { 1390 return *this; 1391 } 1392 1393 int32_t oldLength = this->length(); 1394 1395 // optimize (read-only alias).remove(0, start) and .remove(start, end) 1396 if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) { 1397 if(start == 0) { 1398 // remove prefix by adjusting the array pointer 1399 pinIndex(length); 1400 fUnion.fFields.fArray += length; 1401 fUnion.fFields.fCapacity -= length; 1402 setLength(oldLength - length); 1403 return *this; 1404 } else { 1405 pinIndex(start); 1406 if(length >= (oldLength - start)) { 1407 // remove suffix by reducing the length (like truncate()) 1408 setLength(start); 1409 fUnion.fFields.fCapacity = start; // not NUL-terminated any more 1410 return *this; 1411 } 1412 } 1413 } 1414 1415 if(start == oldLength) { 1416 return doAppend(srcChars, srcStart, srcLength); 1417 } 1418 1419 if(srcChars == 0) { 1420 srcStart = srcLength = 0; 1421 } else if(srcLength < 0) { 1422 // get the srcLength if necessary 1423 srcLength = u_strlen(srcChars + srcStart); 1424 } 1425 1426 // pin the indices to legal values 1427 pinIndices(start, length); 1428 1429 // calculate the size of the string after the replace 1430 int32_t newLength = oldLength - length + srcLength; 1431 1432 // cloneArrayIfNeeded(doCopyArray=FALSE) may change fArray but will not copy the current contents; 1433 // therefore we need to keep the current fArray 1434 UChar oldStackBuffer[US_STACKBUF_SIZE]; 1435 UChar *oldArray; 1436 if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) { 1437 // copy the stack buffer contents because it will be overwritten with 1438 // fUnion.fFields values 1439 u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength); 1440 oldArray = oldStackBuffer; 1441 } else { 1442 oldArray = getArrayStart(); 1443 } 1444 1445 // clone our array and allocate a bigger array if needed 1446 int32_t *bufferToDelete = 0; 1447 if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize, 1448 FALSE, &bufferToDelete) 1449 ) { 1450 return *this; 1451 } 1452 1453 // now do the replace 1454 1455 UChar *newArray = getArrayStart(); 1456 if(newArray != oldArray) { 1457 // if fArray changed, then we need to copy everything except what will change 1458 us_arrayCopy(oldArray, 0, newArray, 0, start); 1459 us_arrayCopy(oldArray, start + length, 1460 newArray, start + srcLength, 1461 oldLength - (start + length)); 1462 } else if(length != srcLength) { 1463 // fArray did not change; copy only the portion that isn't changing, leaving a hole 1464 us_arrayCopy(oldArray, start + length, 1465 newArray, start + srcLength, 1466 oldLength - (start + length)); 1467 } 1468 1469 // now fill in the hole with the new string 1470 us_arrayCopy(srcChars, srcStart, newArray, start, srcLength); 1471 1472 setLength(newLength); 1473 1474 // delayed delete in case srcChars == fArray when we started, and 1475 // to keep oldArray alive for the above operations 1476 if (bufferToDelete) { 1477 uprv_free(bufferToDelete); 1478 } 1479 1480 return *this; 1481 } 1482 1483 // Versions of doReplace() only for append() variants. 1484 // doReplace() and doAppend() optimize for different cases. 1485 1486 UnicodeString& 1487 UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) { 1488 if(srcLength == 0) { 1489 return *this; 1490 } 1491 1492 // pin the indices to legal values 1493 src.pinIndices(srcStart, srcLength); 1494 return doAppend(src.getArrayStart(), srcStart, srcLength); 1495 } 1496 1497 UnicodeString& 1498 UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLength) { 1499 if(!isWritable() || srcLength == 0 || srcChars == NULL) { 1500 return *this; 1501 } 1502 1503 if(srcLength < 0) { 1504 // get the srcLength if necessary 1505 if((srcLength = u_strlen(srcChars + srcStart)) == 0) { 1506 return *this; 1507 } 1508 } 1509 1510 int32_t oldLength = length(); 1511 int32_t newLength = oldLength + srcLength; 1512 // optimize append() onto a large-enough, owned string 1513 if((newLength <= getCapacity() && isBufferWritable()) || 1514 cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize)) { 1515 UChar *newArray = getArrayStart(); 1516 // Do not copy characters when 1517 // UChar *buffer=str.getAppendBuffer(...); 1518 // is followed by 1519 // str.append(buffer, length); 1520 // or 1521 // str.appendString(buffer, length) 1522 // or similar. 1523 if(srcChars + srcStart != newArray + oldLength) { 1524 us_arrayCopy(srcChars, srcStart, newArray, oldLength, srcLength); 1525 } 1526 setLength(newLength); 1527 } 1528 return *this; 1529 } 1530 1531 /** 1532 * Replaceable API 1533 */ 1534 void 1535 UnicodeString::handleReplaceBetween(int32_t start, 1536 int32_t limit, 1537 const UnicodeString& text) { 1538 replaceBetween(start, limit, text); 1539 } 1540 1541 /** 1542 * Replaceable API 1543 */ 1544 void 1545 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) { 1546 if (limit <= start) { 1547 return; // Nothing to do; avoid bogus malloc call 1548 } 1549 UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) ); 1550 // Check to make sure text is not null. 1551 if (text != NULL) { 1552 extractBetween(start, limit, text, 0); 1553 insert(dest, text, 0, limit - start); 1554 uprv_free(text); 1555 } 1556 } 1557 1558 /** 1559 * Replaceable API 1560 * 1561 * NOTE: This is for the Replaceable class. There is no rep.cpp, 1562 * so we implement this function here. 1563 */ 1564 UBool Replaceable::hasMetaData() const { 1565 return TRUE; 1566 } 1567 1568 /** 1569 * Replaceable API 1570 */ 1571 UBool UnicodeString::hasMetaData() const { 1572 return FALSE; 1573 } 1574 1575 UnicodeString& 1576 UnicodeString::doReverse(int32_t start, int32_t length) { 1577 if(length <= 1 || !cloneArrayIfNeeded()) { 1578 return *this; 1579 } 1580 1581 // pin the indices to legal values 1582 pinIndices(start, length); 1583 if(length <= 1) { // pinIndices() might have shrunk the length 1584 return *this; 1585 } 1586 1587 UChar *left = getArrayStart() + start; 1588 UChar *right = left + length - 1; // -1 for inclusive boundary (length>=2) 1589 UChar swap; 1590 UBool hasSupplementary = FALSE; 1591 1592 // Before the loop we know left<right because length>=2. 1593 do { 1594 hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left); 1595 hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right); 1596 *right-- = swap; 1597 } while(left < right); 1598 // Make sure to test the middle code unit of an odd-length string. 1599 // Redundant if the length is even. 1600 hasSupplementary |= (UBool)U16_IS_LEAD(*left); 1601 1602 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */ 1603 if(hasSupplementary) { 1604 UChar swap2; 1605 1606 left = getArrayStart() + start; 1607 right = left + length - 1; // -1 so that we can look at *(left+1) if left<right 1608 while(left < right) { 1609 if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) { 1610 *left++ = swap2; 1611 *left++ = swap; 1612 } else { 1613 ++left; 1614 } 1615 } 1616 } 1617 1618 return *this; 1619 } 1620 1621 UBool 1622 UnicodeString::padLeading(int32_t targetLength, 1623 UChar padChar) 1624 { 1625 int32_t oldLength = length(); 1626 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { 1627 return FALSE; 1628 } else { 1629 // move contents up by padding width 1630 UChar *array = getArrayStart(); 1631 int32_t start = targetLength - oldLength; 1632 us_arrayCopy(array, 0, array, start, oldLength); 1633 1634 // fill in padding character 1635 while(--start >= 0) { 1636 array[start] = padChar; 1637 } 1638 setLength(targetLength); 1639 return TRUE; 1640 } 1641 } 1642 1643 UBool 1644 UnicodeString::padTrailing(int32_t targetLength, 1645 UChar padChar) 1646 { 1647 int32_t oldLength = length(); 1648 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { 1649 return FALSE; 1650 } else { 1651 // fill in padding character 1652 UChar *array = getArrayStart(); 1653 int32_t length = targetLength; 1654 while(--length >= oldLength) { 1655 array[length] = padChar; 1656 } 1657 setLength(targetLength); 1658 return TRUE; 1659 } 1660 } 1661 1662 //======================================== 1663 // Hashing 1664 //======================================== 1665 int32_t 1666 UnicodeString::doHashCode() const 1667 { 1668 /* Delegate hash computation to uhash. This makes UnicodeString 1669 * hashing consistent with UChar* hashing. */ 1670 int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length()); 1671 if (hashCode == kInvalidHashCode) { 1672 hashCode = kEmptyHashCode; 1673 } 1674 return hashCode; 1675 } 1676 1677 //======================================== 1678 // External Buffer 1679 //======================================== 1680 1681 UChar * 1682 UnicodeString::getBuffer(int32_t minCapacity) { 1683 if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) { 1684 fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer; 1685 setZeroLength(); 1686 return getArrayStart(); 1687 } else { 1688 return 0; 1689 } 1690 } 1691 1692 void 1693 UnicodeString::releaseBuffer(int32_t newLength) { 1694 if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) { 1695 // set the new fLength 1696 int32_t capacity=getCapacity(); 1697 if(newLength==-1) { 1698 // the new length is the string length, capped by fCapacity 1699 const UChar *array=getArrayStart(), *p=array, *limit=array+capacity; 1700 while(p<limit && *p!=0) { 1701 ++p; 1702 } 1703 newLength=(int32_t)(p-array); 1704 } else if(newLength>capacity) { 1705 newLength=capacity; 1706 } 1707 setLength(newLength); 1708 fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer; 1709 } 1710 } 1711 1712 //======================================== 1713 // Miscellaneous 1714 //======================================== 1715 UBool 1716 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity, 1717 int32_t growCapacity, 1718 UBool doCopyArray, 1719 int32_t **pBufferToDelete, 1720 UBool forceClone) { 1721 // default parameters need to be static, therefore 1722 // the defaults are -1 to have convenience defaults 1723 if(newCapacity == -1) { 1724 newCapacity = getCapacity(); 1725 } 1726 1727 // while a getBuffer(minCapacity) is "open", 1728 // prevent any modifications of the string by returning FALSE here 1729 // if the string is bogus, then only an assignment or similar can revive it 1730 if(!isWritable()) { 1731 return FALSE; 1732 } 1733 1734 /* 1735 * We need to make a copy of the array if 1736 * the buffer is read-only, or 1737 * the buffer is refCounted (shared), and refCount>1, or 1738 * the buffer is too small. 1739 * Return FALSE if memory could not be allocated. 1740 */ 1741 if(forceClone || 1742 fUnion.fFields.fLengthAndFlags & kBufferIsReadonly || 1743 (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) || 1744 newCapacity > getCapacity() 1745 ) { 1746 // check growCapacity for default value and use of the stack buffer 1747 if(growCapacity < 0) { 1748 growCapacity = newCapacity; 1749 } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) { 1750 growCapacity = US_STACKBUF_SIZE; 1751 } 1752 1753 // save old values 1754 UChar oldStackBuffer[US_STACKBUF_SIZE]; 1755 UChar *oldArray; 1756 int32_t oldLength = length(); 1757 int16_t flags = fUnion.fFields.fLengthAndFlags; 1758 1759 if(flags&kUsingStackBuffer) { 1760 U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */ 1761 if(doCopyArray && growCapacity > US_STACKBUF_SIZE) { 1762 // copy the stack buffer contents because it will be overwritten with 1763 // fUnion.fFields values 1764 us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength); 1765 oldArray = oldStackBuffer; 1766 } else { 1767 oldArray = NULL; // no need to copy from the stack buffer to itself 1768 } 1769 } else { 1770 oldArray = fUnion.fFields.fArray; 1771 U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */ 1772 } 1773 1774 // allocate a new array 1775 if(allocate(growCapacity) || 1776 (newCapacity < growCapacity && allocate(newCapacity)) 1777 ) { 1778 if(doCopyArray) { 1779 // copy the contents 1780 // do not copy more than what fits - it may be smaller than before 1781 int32_t minLength = oldLength; 1782 newCapacity = getCapacity(); 1783 if(newCapacity < minLength) { 1784 minLength = newCapacity; 1785 } 1786 if(oldArray != NULL) { 1787 us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength); 1788 } 1789 setLength(minLength); 1790 } else { 1791 setZeroLength(); 1792 } 1793 1794 // release the old array 1795 if(flags & kRefCounted) { 1796 // the array is refCounted; decrement and release if 0 1797 u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1); 1798 if(umtx_atomic_dec(pRefCount) == 0) { 1799 if(pBufferToDelete == 0) { 1800 // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t 1801 // is defined as volatile. (Volatile has useful non-standard behavior 1802 // with this compiler.) 1803 uprv_free((void *)pRefCount); 1804 } else { 1805 // the caller requested to delete it himself 1806 *pBufferToDelete = (int32_t *)pRefCount; 1807 } 1808 } 1809 } 1810 } else { 1811 // not enough memory for growCapacity and not even for the smaller newCapacity 1812 // reset the old values for setToBogus() to release the array 1813 if(!(flags&kUsingStackBuffer)) { 1814 fUnion.fFields.fArray = oldArray; 1815 } 1816 fUnion.fFields.fLengthAndFlags = flags; 1817 setToBogus(); 1818 return FALSE; 1819 } 1820 } 1821 return TRUE; 1822 } 1823 1824 // UnicodeStringAppendable ------------------------------------------------- *** 1825 1826 UnicodeStringAppendable::~UnicodeStringAppendable() {} 1827 1828 UBool 1829 UnicodeStringAppendable::appendCodeUnit(UChar c) { 1830 return str.doAppend(&c, 0, 1).isWritable(); 1831 } 1832 1833 UBool 1834 UnicodeStringAppendable::appendCodePoint(UChar32 c) { 1835 UChar buffer[U16_MAX_LENGTH]; 1836 int32_t cLength = 0; 1837 UBool isError = FALSE; 1838 U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError); 1839 return !isError && str.doAppend(buffer, 0, cLength).isWritable(); 1840 } 1841 1842 UBool 1843 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) { 1844 return str.doAppend(s, 0, length).isWritable(); 1845 } 1846 1847 UBool 1848 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) { 1849 return str.cloneArrayIfNeeded(str.length() + appendCapacity); 1850 } 1851 1852 UChar * 1853 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity, 1854 int32_t desiredCapacityHint, 1855 UChar *scratch, int32_t scratchCapacity, 1856 int32_t *resultCapacity) { 1857 if(minCapacity < 1 || scratchCapacity < minCapacity) { 1858 *resultCapacity = 0; 1859 return NULL; 1860 } 1861 int32_t oldLength = str.length(); 1862 if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) { 1863 *resultCapacity = str.getCapacity() - oldLength; 1864 return str.getArrayStart() + oldLength; 1865 } 1866 *resultCapacity = scratchCapacity; 1867 return scratch; 1868 } 1869 1870 U_NAMESPACE_END 1871 1872 U_NAMESPACE_USE 1873 1874 U_CAPI int32_t U_EXPORT2 1875 uhash_hashUnicodeString(const UElement key) { 1876 const UnicodeString *str = (const UnicodeString*) key.pointer; 1877 return (str == NULL) ? 0 : str->hashCode(); 1878 } 1879 1880 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString* 1881 // does not depend on hashtable code. 1882 U_CAPI UBool U_EXPORT2 1883 uhash_compareUnicodeString(const UElement key1, const UElement key2) { 1884 const UnicodeString *str1 = (const UnicodeString*) key1.pointer; 1885 const UnicodeString *str2 = (const UnicodeString*) key2.pointer; 1886 if (str1 == str2) { 1887 return TRUE; 1888 } 1889 if (str1 == NULL || str2 == NULL) { 1890 return FALSE; 1891 } 1892 return *str1 == *str2; 1893 } 1894 1895 #ifdef U_STATIC_IMPLEMENTATION 1896 /* 1897 This should never be called. It is defined here to make sure that the 1898 virtual vector deleting destructor is defined within unistr.cpp. 1899 The vector deleting destructor is already a part of UObject, 1900 but defining it here makes sure that it is included with this object file. 1901 This makes sure that static library dependencies are kept to a minimum. 1902 */ 1903 static void uprv_UnicodeStringDummy(void) { 1904 delete [] (new UnicodeString[2]); 1905 } 1906 #endif 1907