1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * Copyright (C) 1999-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ****************************************************************************** 8 * 9 * File unistr.cpp 10 * 11 * Modification History: 12 * 13 * Date Name Description 14 * 09/25/98 stephen Creation. 15 * 04/20/99 stephen Overhauled per 4/16 code review. 16 * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX 17 * 11/18/99 aliu Added handleReplaceBetween() to make inherit from 18 * Replaceable. 19 * 06/25/01 grhoten Removed the dependency on iostream 20 ****************************************************************************** 21 */ 22 23 #include "unicode/utypes.h" 24 #include "unicode/appendable.h" 25 #include "unicode/putil.h" 26 #include "cstring.h" 27 #include "cmemory.h" 28 #include "unicode/ustring.h" 29 #include "unicode/unistr.h" 30 #include "unicode/utf.h" 31 #include "unicode/utf16.h" 32 #include "uelement.h" 33 #include "ustr_imp.h" 34 #include "umutex.h" 35 #include "uassert.h" 36 37 #if 0 38 39 #include <iostream> 40 using namespace std; 41 42 //DEBUGGING 43 void 44 print(const UnicodeString& s, 45 const char *name) 46 { 47 UChar c; 48 cout << name << ":|"; 49 for(int i = 0; i < s.length(); ++i) { 50 c = s[i]; 51 if(c>= 0x007E || c < 0x0020) 52 cout << "[0x" << hex << s[i] << "]"; 53 else 54 cout << (char) s[i]; 55 } 56 cout << '|' << endl; 57 } 58 59 void 60 print(const UChar *s, 61 int32_t len, 62 const char *name) 63 { 64 UChar c; 65 cout << name << ":|"; 66 for(int i = 0; i < len; ++i) { 67 c = s[i]; 68 if(c>= 0x007E || c < 0x0020) 69 cout << "[0x" << hex << s[i] << "]"; 70 else 71 cout << (char) s[i]; 72 } 73 cout << '|' << endl; 74 } 75 // END DEBUGGING 76 #endif 77 78 // Local function definitions for now 79 80 // need to copy areas that may overlap 81 static 82 inline void 83 us_arrayCopy(const UChar *src, int32_t srcStart, 84 UChar *dst, int32_t dstStart, int32_t count) 85 { 86 if(count>0) { 87 uprv_memmove(dst+dstStart, src+srcStart, (size_t)count*sizeof(*src)); 88 } 89 } 90 91 // u_unescapeAt() callback to get a UChar from a UnicodeString 92 U_CDECL_BEGIN 93 static UChar U_CALLCONV 94 UnicodeString_charAt(int32_t offset, void *context) { 95 return ((icu::UnicodeString*) context)->charAt(offset); 96 } 97 U_CDECL_END 98 99 U_NAMESPACE_BEGIN 100 101 /* The Replaceable virtual destructor can't be defined in the header 102 due to how AIX works with multiple definitions of virtual functions. 103 */ 104 Replaceable::~Replaceable() {} 105 106 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString) 107 108 UnicodeString U_EXPORT2 109 operator+ (const UnicodeString &s1, const UnicodeString &s2) { 110 return 111 UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0). 112 append(s1). 113 append(s2); 114 } 115 116 //======================================== 117 // Reference Counting functions, put at top of file so that optimizing compilers 118 // have a chance to automatically inline. 119 //======================================== 120 121 void 122 UnicodeString::addRef() { 123 umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1); 124 } 125 126 int32_t 127 UnicodeString::removeRef() { 128 return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1); 129 } 130 131 int32_t 132 UnicodeString::refCount() const { 133 return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1)); 134 } 135 136 void 137 UnicodeString::releaseArray() { 138 if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) { 139 uprv_free((int32_t *)fUnion.fFields.fArray - 1); 140 } 141 } 142 143 144 145 //======================================== 146 // Constructors 147 //======================================== 148 149 // The default constructor is inline in unistr.h. 150 151 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) { 152 fUnion.fFields.fLengthAndFlags = 0; 153 if(count <= 0 || (uint32_t)c > 0x10ffff) { 154 // just allocate and do not do anything else 155 allocate(capacity); 156 } else if(c <= 0xffff) { 157 int32_t length = count; 158 if(capacity < length) { 159 capacity = length; 160 } 161 if(allocate(capacity)) { 162 UChar *array = getArrayStart(); 163 UChar unit = (UChar)c; 164 for(int32_t i = 0; i < length; ++i) { 165 array[i] = unit; 166 } 167 setLength(length); 168 } 169 } else { // supplementary code point, write surrogate pairs 170 if(count > (INT32_MAX / 2)) { 171 // We would get more than 2G UChars. 172 allocate(capacity); 173 return; 174 } 175 int32_t length = count * 2; 176 if(capacity < length) { 177 capacity = length; 178 } 179 if(allocate(capacity)) { 180 UChar *array = getArrayStart(); 181 UChar lead = U16_LEAD(c); 182 UChar trail = U16_TRAIL(c); 183 for(int32_t i = 0; i < length; i += 2) { 184 array[i] = lead; 185 array[i + 1] = trail; 186 } 187 setLength(length); 188 } 189 } 190 } 191 192 UnicodeString::UnicodeString(UChar ch) { 193 fUnion.fFields.fLengthAndFlags = kLength1 | kShortString; 194 fUnion.fStackFields.fBuffer[0] = ch; 195 } 196 197 UnicodeString::UnicodeString(UChar32 ch) { 198 fUnion.fFields.fLengthAndFlags = kShortString; 199 int32_t i = 0; 200 UBool isError = FALSE; 201 U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError); 202 // We test isError so that the compiler does not complain that we don't. 203 // If isError then i==0 which is what we want anyway. 204 if(!isError) { 205 setShortLength(i); 206 } 207 } 208 209 UnicodeString::UnicodeString(const UChar *text) { 210 fUnion.fFields.fLengthAndFlags = kShortString; 211 doAppend(text, 0, -1); 212 } 213 214 UnicodeString::UnicodeString(const UChar *text, 215 int32_t textLength) { 216 fUnion.fFields.fLengthAndFlags = kShortString; 217 doAppend(text, 0, textLength); 218 } 219 220 UnicodeString::UnicodeString(UBool isTerminated, 221 const UChar *text, 222 int32_t textLength) { 223 fUnion.fFields.fLengthAndFlags = kReadonlyAlias; 224 if(text == NULL) { 225 // treat as an empty string, do not alias 226 setToEmpty(); 227 } else if(textLength < -1 || 228 (textLength == -1 && !isTerminated) || 229 (textLength >= 0 && isTerminated && text[textLength] != 0) 230 ) { 231 setToBogus(); 232 } else { 233 if(textLength == -1) { 234 // text is terminated, or else it would have failed the above test 235 textLength = u_strlen(text); 236 } 237 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength); 238 } 239 } 240 241 UnicodeString::UnicodeString(UChar *buff, 242 int32_t buffLength, 243 int32_t buffCapacity) { 244 fUnion.fFields.fLengthAndFlags = kWritableAlias; 245 if(buff == NULL) { 246 // treat as an empty string, do not alias 247 setToEmpty(); 248 } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) { 249 setToBogus(); 250 } else { 251 if(buffLength == -1) { 252 // fLength = u_strlen(buff); but do not look beyond buffCapacity 253 const UChar *p = buff, *limit = buff + buffCapacity; 254 while(p != limit && *p != 0) { 255 ++p; 256 } 257 buffLength = (int32_t)(p - buff); 258 } 259 setArray(buff, buffLength, buffCapacity); 260 } 261 } 262 263 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) { 264 fUnion.fFields.fLengthAndFlags = kShortString; 265 if(src==NULL) { 266 // treat as an empty string 267 } else { 268 if(length<0) { 269 length=(int32_t)uprv_strlen(src); 270 } 271 if(cloneArrayIfNeeded(length, length, FALSE)) { 272 u_charsToUChars(src, getArrayStart(), length); 273 setLength(length); 274 } else { 275 setToBogus(); 276 } 277 } 278 } 279 280 #if U_CHARSET_IS_UTF8 281 282 UnicodeString::UnicodeString(const char *codepageData) { 283 fUnion.fFields.fLengthAndFlags = kShortString; 284 if(codepageData != 0) { 285 setToUTF8(codepageData); 286 } 287 } 288 289 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) { 290 fUnion.fFields.fLengthAndFlags = kShortString; 291 // if there's nothing to convert, do nothing 292 if(codepageData == 0 || dataLength == 0 || dataLength < -1) { 293 return; 294 } 295 if(dataLength == -1) { 296 dataLength = (int32_t)uprv_strlen(codepageData); 297 } 298 setToUTF8(StringPiece(codepageData, dataLength)); 299 } 300 301 // else see unistr_cnv.cpp 302 #endif 303 304 UnicodeString::UnicodeString(const UnicodeString& that) { 305 fUnion.fFields.fLengthAndFlags = kShortString; 306 copyFrom(that); 307 } 308 309 #if U_HAVE_RVALUE_REFERENCES 310 UnicodeString::UnicodeString(UnicodeString &&src) U_NOEXCEPT { 311 fUnion.fFields.fLengthAndFlags = kShortString; 312 moveFrom(src); 313 } 314 #endif 315 316 UnicodeString::UnicodeString(const UnicodeString& that, 317 int32_t srcStart) { 318 fUnion.fFields.fLengthAndFlags = kShortString; 319 setTo(that, srcStart); 320 } 321 322 UnicodeString::UnicodeString(const UnicodeString& that, 323 int32_t srcStart, 324 int32_t srcLength) { 325 fUnion.fFields.fLengthAndFlags = kShortString; 326 setTo(that, srcStart, srcLength); 327 } 328 329 // Replaceable base class clone() default implementation, does not clone 330 Replaceable * 331 Replaceable::clone() const { 332 return NULL; 333 } 334 335 // UnicodeString overrides clone() with a real implementation 336 Replaceable * 337 UnicodeString::clone() const { 338 return new UnicodeString(*this); 339 } 340 341 //======================================== 342 // array allocation 343 //======================================== 344 345 namespace { 346 347 const int32_t kGrowSize = 128; 348 349 // The number of bytes for one int32_t reference counter and capacity UChars 350 // must fit into a 32-bit size_t (at least when on a 32-bit platform). 351 // We also add one for the NUL terminator, to avoid reallocation in getTerminatedBuffer(), 352 // and round up to a multiple of 16 bytes. 353 // This means that capacity must be at most (0xfffffff0 - 4) / 2 - 1 = 0x7ffffff5. 354 // (With more complicated checks we could go up to 0x7ffffffd without rounding up, 355 // but that does not seem worth it.) 356 const int32_t kMaxCapacity = 0x7ffffff5; 357 358 int32_t getGrowCapacity(int32_t newLength) { 359 int32_t growSize = (newLength >> 2) + kGrowSize; 360 if(growSize <= (kMaxCapacity - newLength)) { 361 return newLength + growSize; 362 } else { 363 return kMaxCapacity; 364 } 365 } 366 367 } // namespace 368 369 UBool 370 UnicodeString::allocate(int32_t capacity) { 371 if(capacity <= US_STACKBUF_SIZE) { 372 fUnion.fFields.fLengthAndFlags = kShortString; 373 return TRUE; 374 } 375 if(capacity <= kMaxCapacity) { 376 ++capacity; // for the NUL 377 // Switch to size_t which is unsigned so that we can allocate up to 4GB. 378 // Reference counter + UChars. 379 size_t numBytes = sizeof(int32_t) + (size_t)capacity * U_SIZEOF_UCHAR; 380 // Round up to a multiple of 16. 381 numBytes = (numBytes + 15) & ~15; 382 int32_t *array = (int32_t *) uprv_malloc(numBytes); 383 if(array != NULL) { 384 // set initial refCount and point behind the refCount 385 *array++ = 1; 386 numBytes -= sizeof(int32_t); 387 388 // have fArray point to the first UChar 389 fUnion.fFields.fArray = (UChar *)array; 390 fUnion.fFields.fCapacity = (int32_t)(numBytes / U_SIZEOF_UCHAR); 391 fUnion.fFields.fLengthAndFlags = kLongString; 392 return TRUE; 393 } 394 } 395 fUnion.fFields.fLengthAndFlags = kIsBogus; 396 fUnion.fFields.fArray = 0; 397 fUnion.fFields.fCapacity = 0; 398 return FALSE; 399 } 400 401 //======================================== 402 // Destructor 403 //======================================== 404 405 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS 406 static u_atomic_int32_t finalLengthCounts[0x400]; // UnicodeString::kMaxShortLength+1 407 static u_atomic_int32_t beyondCount(0); 408 409 U_CAPI void unistr_printLengths() { 410 int32_t i; 411 for(i = 0; i <= 59; ++i) { 412 printf("%2d, %9d\n", i, (int32_t)finalLengthCounts[i]); 413 } 414 int32_t beyond = beyondCount; 415 for(; i < UPRV_LENGTHOF(finalLengthCounts); ++i) { 416 beyond += finalLengthCounts[i]; 417 } 418 printf(">59, %9d\n", beyond); 419 } 420 #endif 421 422 UnicodeString::~UnicodeString() 423 { 424 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS 425 // Count lengths of strings at the end of their lifetime. 426 // Useful for discussion of a desirable stack buffer size. 427 // Count the contents length, not the optional NUL terminator nor further capacity. 428 // Ignore open-buffer strings and strings which alias external storage. 429 if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) { 430 if(hasShortLength()) { 431 umtx_atomic_inc(finalLengthCounts + getShortLength()); 432 } else { 433 umtx_atomic_inc(&beyondCount); 434 } 435 } 436 #endif 437 438 releaseArray(); 439 } 440 441 //======================================== 442 // Factory methods 443 //======================================== 444 445 UnicodeString UnicodeString::fromUTF8(StringPiece utf8) { 446 UnicodeString result; 447 result.setToUTF8(utf8); 448 return result; 449 } 450 451 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) { 452 UnicodeString result; 453 int32_t capacity; 454 // Most UTF-32 strings will be BMP-only and result in a same-length 455 // UTF-16 string. We overestimate the capacity just slightly, 456 // just in case there are a few supplementary characters. 457 if(length <= US_STACKBUF_SIZE) { 458 capacity = US_STACKBUF_SIZE; 459 } else { 460 capacity = length + (length >> 4) + 4; 461 } 462 do { 463 UChar *utf16 = result.getBuffer(capacity); 464 int32_t length16; 465 UErrorCode errorCode = U_ZERO_ERROR; 466 u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16, 467 utf32, length, 468 0xfffd, // Substitution character. 469 NULL, // Don't care about number of substitutions. 470 &errorCode); 471 result.releaseBuffer(length16); 472 if(errorCode == U_BUFFER_OVERFLOW_ERROR) { 473 capacity = length16 + 1; // +1 for the terminating NUL. 474 continue; 475 } else if(U_FAILURE(errorCode)) { 476 result.setToBogus(); 477 } 478 break; 479 } while(TRUE); 480 return result; 481 } 482 483 //======================================== 484 // Assignment 485 //======================================== 486 487 UnicodeString & 488 UnicodeString::operator=(const UnicodeString &src) { 489 return copyFrom(src); 490 } 491 492 UnicodeString & 493 UnicodeString::fastCopyFrom(const UnicodeString &src) { 494 return copyFrom(src, TRUE); 495 } 496 497 UnicodeString & 498 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) { 499 // if assigning to ourselves, do nothing 500 if(this == &src) { 501 return *this; 502 } 503 504 // is the right side bogus? 505 if(src.isBogus()) { 506 setToBogus(); 507 return *this; 508 } 509 510 // delete the current contents 511 releaseArray(); 512 513 if(src.isEmpty()) { 514 // empty string - use the stack buffer 515 setToEmpty(); 516 return *this; 517 } 518 519 // fLength>0 and not an "open" src.getBuffer(minCapacity) 520 fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags; 521 switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) { 522 case kShortString: 523 // short string using the stack buffer, do the same 524 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer, 525 getShortLength() * U_SIZEOF_UCHAR); 526 break; 527 case kLongString: 528 // src uses a refCounted string buffer, use that buffer with refCount 529 // src is const, use a cast - we don't actually change it 530 ((UnicodeString &)src).addRef(); 531 // copy all fields, share the reference-counted buffer 532 fUnion.fFields.fArray = src.fUnion.fFields.fArray; 533 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; 534 if(!hasShortLength()) { 535 fUnion.fFields.fLength = src.fUnion.fFields.fLength; 536 } 537 break; 538 case kReadonlyAlias: 539 if(fastCopy) { 540 // src is a readonly alias, do the same 541 // -> maintain the readonly alias as such 542 fUnion.fFields.fArray = src.fUnion.fFields.fArray; 543 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; 544 if(!hasShortLength()) { 545 fUnion.fFields.fLength = src.fUnion.fFields.fLength; 546 } 547 break; 548 } 549 // else if(!fastCopy) fall through to case kWritableAlias 550 // -> allocate a new buffer and copy the contents 551 U_FALLTHROUGH; 552 case kWritableAlias: { 553 // src is a writable alias; we make a copy of that instead 554 int32_t srcLength = src.length(); 555 if(allocate(srcLength)) { 556 u_memcpy(getArrayStart(), src.getArrayStart(), srcLength); 557 setLength(srcLength); 558 break; 559 } 560 // if there is not enough memory, then fall through to setting to bogus 561 U_FALLTHROUGH; 562 } 563 default: 564 // if src is bogus, set ourselves to bogus 565 // do not call setToBogus() here because fArray and flags are not consistent here 566 fUnion.fFields.fLengthAndFlags = kIsBogus; 567 fUnion.fFields.fArray = 0; 568 fUnion.fFields.fCapacity = 0; 569 break; 570 } 571 572 return *this; 573 } 574 575 UnicodeString &UnicodeString::moveFrom(UnicodeString &src) U_NOEXCEPT { 576 // No explicit check for self move assignment, consistent with standard library. 577 // Self move assignment causes no crash nor leak but might make the object bogus. 578 releaseArray(); 579 copyFieldsFrom(src, TRUE); 580 return *this; 581 } 582 583 // Same as moveFrom() except without memory management. 584 void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) U_NOEXCEPT { 585 int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags; 586 if(lengthAndFlags & kUsingStackBuffer) { 587 // Short string using the stack buffer, copy the contents. 588 // Check for self assignment to prevent "overlap in memcpy" warnings, 589 // although it should be harmless to copy a buffer to itself exactly. 590 if(this != &src) { 591 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer, 592 getShortLength() * U_SIZEOF_UCHAR); 593 } 594 } else { 595 // In all other cases, copy all fields. 596 fUnion.fFields.fArray = src.fUnion.fFields.fArray; 597 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; 598 if(!hasShortLength()) { 599 fUnion.fFields.fLength = src.fUnion.fFields.fLength; 600 } 601 if(setSrcToBogus) { 602 // Set src to bogus without releasing any memory. 603 src.fUnion.fFields.fLengthAndFlags = kIsBogus; 604 src.fUnion.fFields.fArray = NULL; 605 src.fUnion.fFields.fCapacity = 0; 606 } 607 } 608 } 609 610 void UnicodeString::swap(UnicodeString &other) U_NOEXCEPT { 611 UnicodeString temp; // Empty short string: Known not to need releaseArray(). 612 // Copy fields without resetting source values in between. 613 temp.copyFieldsFrom(*this, FALSE); 614 this->copyFieldsFrom(other, FALSE); 615 other.copyFieldsFrom(temp, FALSE); 616 // Set temp to an empty string so that other's memory is not released twice. 617 temp.fUnion.fFields.fLengthAndFlags = kShortString; 618 } 619 620 //======================================== 621 // Miscellaneous operations 622 //======================================== 623 624 UnicodeString UnicodeString::unescape() const { 625 UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity 626 if (result.isBogus()) { 627 return result; 628 } 629 const UChar *array = getBuffer(); 630 int32_t len = length(); 631 int32_t prev = 0; 632 for (int32_t i=0;;) { 633 if (i == len) { 634 result.append(array, prev, len - prev); 635 break; 636 } 637 if (array[i++] == 0x5C /*'\\'*/) { 638 result.append(array, prev, (i - 1) - prev); 639 UChar32 c = unescapeAt(i); // advances i 640 if (c < 0) { 641 result.remove(); // return empty string 642 break; // invalid escape sequence 643 } 644 result.append(c); 645 prev = i; 646 } 647 } 648 return result; 649 } 650 651 UChar32 UnicodeString::unescapeAt(int32_t &offset) const { 652 return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this); 653 } 654 655 //======================================== 656 // Read-only implementation 657 //======================================== 658 UBool 659 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const { 660 // Requires: this & text not bogus and have same lengths. 661 // Byte-wise comparison works for equality regardless of endianness. 662 return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0; 663 } 664 665 int8_t 666 UnicodeString::doCompare( int32_t start, 667 int32_t length, 668 const UChar *srcChars, 669 int32_t srcStart, 670 int32_t srcLength) const 671 { 672 // compare illegal string values 673 if(isBogus()) { 674 return -1; 675 } 676 677 // pin indices to legal values 678 pinIndices(start, length); 679 680 if(srcChars == NULL) { 681 // treat const UChar *srcChars==NULL as an empty string 682 return length == 0 ? 0 : 1; 683 } 684 685 // get the correct pointer 686 const UChar *chars = getArrayStart(); 687 688 chars += start; 689 srcChars += srcStart; 690 691 int32_t minLength; 692 int8_t lengthResult; 693 694 // get the srcLength if necessary 695 if(srcLength < 0) { 696 srcLength = u_strlen(srcChars + srcStart); 697 } 698 699 // are we comparing different lengths? 700 if(length != srcLength) { 701 if(length < srcLength) { 702 minLength = length; 703 lengthResult = -1; 704 } else { 705 minLength = srcLength; 706 lengthResult = 1; 707 } 708 } else { 709 minLength = length; 710 lengthResult = 0; 711 } 712 713 /* 714 * note that uprv_memcmp() returns an int but we return an int8_t; 715 * we need to take care not to truncate the result - 716 * one way to do this is to right-shift the value to 717 * move the sign bit into the lower 8 bits and making sure that this 718 * does not become 0 itself 719 */ 720 721 if(minLength > 0 && chars != srcChars) { 722 int32_t result; 723 724 # if U_IS_BIG_ENDIAN 725 // big-endian: byte comparison works 726 result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar)); 727 if(result != 0) { 728 return (int8_t)(result >> 15 | 1); 729 } 730 # else 731 // little-endian: compare UChar units 732 do { 733 result = ((int32_t)*(chars++) - (int32_t)*(srcChars++)); 734 if(result != 0) { 735 return (int8_t)(result >> 15 | 1); 736 } 737 } while(--minLength > 0); 738 # endif 739 } 740 return lengthResult; 741 } 742 743 /* String compare in code point order - doCompare() compares in code unit order. */ 744 int8_t 745 UnicodeString::doCompareCodePointOrder(int32_t start, 746 int32_t length, 747 const UChar *srcChars, 748 int32_t srcStart, 749 int32_t srcLength) const 750 { 751 // compare illegal string values 752 // treat const UChar *srcChars==NULL as an empty string 753 if(isBogus()) { 754 return -1; 755 } 756 757 // pin indices to legal values 758 pinIndices(start, length); 759 760 if(srcChars == NULL) { 761 srcStart = srcLength = 0; 762 } 763 764 int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE); 765 /* translate the 32-bit result into an 8-bit one */ 766 if(diff!=0) { 767 return (int8_t)(diff >> 15 | 1); 768 } else { 769 return 0; 770 } 771 } 772 773 int32_t 774 UnicodeString::getLength() const { 775 return length(); 776 } 777 778 UChar 779 UnicodeString::getCharAt(int32_t offset) const { 780 return charAt(offset); 781 } 782 783 UChar32 784 UnicodeString::getChar32At(int32_t offset) const { 785 return char32At(offset); 786 } 787 788 UChar32 789 UnicodeString::char32At(int32_t offset) const 790 { 791 int32_t len = length(); 792 if((uint32_t)offset < (uint32_t)len) { 793 const UChar *array = getArrayStart(); 794 UChar32 c; 795 U16_GET(array, 0, offset, len, c); 796 return c; 797 } else { 798 return kInvalidUChar; 799 } 800 } 801 802 int32_t 803 UnicodeString::getChar32Start(int32_t offset) const { 804 if((uint32_t)offset < (uint32_t)length()) { 805 const UChar *array = getArrayStart(); 806 U16_SET_CP_START(array, 0, offset); 807 return offset; 808 } else { 809 return 0; 810 } 811 } 812 813 int32_t 814 UnicodeString::getChar32Limit(int32_t offset) const { 815 int32_t len = length(); 816 if((uint32_t)offset < (uint32_t)len) { 817 const UChar *array = getArrayStart(); 818 U16_SET_CP_LIMIT(array, 0, offset, len); 819 return offset; 820 } else { 821 return len; 822 } 823 } 824 825 int32_t 826 UnicodeString::countChar32(int32_t start, int32_t length) const { 827 pinIndices(start, length); 828 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL 829 return u_countChar32(getArrayStart()+start, length); 830 } 831 832 UBool 833 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const { 834 pinIndices(start, length); 835 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL 836 return u_strHasMoreChar32Than(getArrayStart()+start, length, number); 837 } 838 839 int32_t 840 UnicodeString::moveIndex32(int32_t index, int32_t delta) const { 841 // pin index 842 int32_t len = length(); 843 if(index<0) { 844 index=0; 845 } else if(index>len) { 846 index=len; 847 } 848 849 const UChar *array = getArrayStart(); 850 if(delta>0) { 851 U16_FWD_N(array, index, len, delta); 852 } else { 853 U16_BACK_N(array, 0, index, -delta); 854 } 855 856 return index; 857 } 858 859 void 860 UnicodeString::doExtract(int32_t start, 861 int32_t length, 862 UChar *dst, 863 int32_t dstStart) const 864 { 865 // pin indices to legal values 866 pinIndices(start, length); 867 868 // do not copy anything if we alias dst itself 869 const UChar *array = getArrayStart(); 870 if(array + start != dst + dstStart) { 871 us_arrayCopy(array, start, dst, dstStart, length); 872 } 873 } 874 875 int32_t 876 UnicodeString::extract(UChar *dest, int32_t destCapacity, 877 UErrorCode &errorCode) const { 878 int32_t len = length(); 879 if(U_SUCCESS(errorCode)) { 880 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { 881 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 882 } else { 883 const UChar *array = getArrayStart(); 884 if(len>0 && len<=destCapacity && array!=dest) { 885 u_memcpy(dest, array, len); 886 } 887 return u_terminateUChars(dest, destCapacity, len, &errorCode); 888 } 889 } 890 891 return len; 892 } 893 894 int32_t 895 UnicodeString::extract(int32_t start, 896 int32_t length, 897 char *target, 898 int32_t targetCapacity, 899 enum EInvariant) const 900 { 901 // if the arguments are illegal, then do nothing 902 if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) { 903 return 0; 904 } 905 906 // pin the indices to legal values 907 pinIndices(start, length); 908 909 if(length <= targetCapacity) { 910 u_UCharsToChars(getArrayStart() + start, target, length); 911 } 912 UErrorCode status = U_ZERO_ERROR; 913 return u_terminateChars(target, targetCapacity, length, &status); 914 } 915 916 UnicodeString 917 UnicodeString::tempSubString(int32_t start, int32_t len) const { 918 pinIndices(start, len); 919 const UChar *array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer 920 if(array==NULL) { 921 array=fUnion.fStackFields.fBuffer; // anything not NULL because that would make an empty string 922 len=-2; // bogus result string 923 } 924 return UnicodeString(FALSE, array + start, len); 925 } 926 927 int32_t 928 UnicodeString::toUTF8(int32_t start, int32_t len, 929 char *target, int32_t capacity) const { 930 pinIndices(start, len); 931 int32_t length8; 932 UErrorCode errorCode = U_ZERO_ERROR; 933 u_strToUTF8WithSub(target, capacity, &length8, 934 getBuffer() + start, len, 935 0xFFFD, // Standard substitution character. 936 NULL, // Don't care about number of substitutions. 937 &errorCode); 938 return length8; 939 } 940 941 #if U_CHARSET_IS_UTF8 942 943 int32_t 944 UnicodeString::extract(int32_t start, int32_t len, 945 char *target, uint32_t dstSize) const { 946 // if the arguments are illegal, then do nothing 947 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) { 948 return 0; 949 } 950 return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff); 951 } 952 953 // else see unistr_cnv.cpp 954 #endif 955 956 void 957 UnicodeString::extractBetween(int32_t start, 958 int32_t limit, 959 UnicodeString& target) const { 960 pinIndex(start); 961 pinIndex(limit); 962 doExtract(start, limit - start, target); 963 } 964 965 // When converting from UTF-16 to UTF-8, the result will have at most 3 times 966 // as many bytes as the source has UChars. 967 // The "worst cases" are writing systems like Indic, Thai and CJK with 968 // 3:1 bytes:UChars. 969 void 970 UnicodeString::toUTF8(ByteSink &sink) const { 971 int32_t length16 = length(); 972 if(length16 != 0) { 973 char stackBuffer[1024]; 974 int32_t capacity = (int32_t)sizeof(stackBuffer); 975 UBool utf8IsOwned = FALSE; 976 char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity, 977 3*length16, 978 stackBuffer, capacity, 979 &capacity); 980 int32_t length8 = 0; 981 UErrorCode errorCode = U_ZERO_ERROR; 982 u_strToUTF8WithSub(utf8, capacity, &length8, 983 getBuffer(), length16, 984 0xFFFD, // Standard substitution character. 985 NULL, // Don't care about number of substitutions. 986 &errorCode); 987 if(errorCode == U_BUFFER_OVERFLOW_ERROR) { 988 utf8 = (char *)uprv_malloc(length8); 989 if(utf8 != NULL) { 990 utf8IsOwned = TRUE; 991 errorCode = U_ZERO_ERROR; 992 u_strToUTF8WithSub(utf8, length8, &length8, 993 getBuffer(), length16, 994 0xFFFD, // Standard substitution character. 995 NULL, // Don't care about number of substitutions. 996 &errorCode); 997 } else { 998 errorCode = U_MEMORY_ALLOCATION_ERROR; 999 } 1000 } 1001 if(U_SUCCESS(errorCode)) { 1002 sink.Append(utf8, length8); 1003 sink.Flush(); 1004 } 1005 if(utf8IsOwned) { 1006 uprv_free(utf8); 1007 } 1008 } 1009 } 1010 1011 int32_t 1012 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const { 1013 int32_t length32=0; 1014 if(U_SUCCESS(errorCode)) { 1015 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments. 1016 u_strToUTF32WithSub(utf32, capacity, &length32, 1017 getBuffer(), length(), 1018 0xfffd, // Substitution character. 1019 NULL, // Don't care about number of substitutions. 1020 &errorCode); 1021 } 1022 return length32; 1023 } 1024 1025 int32_t 1026 UnicodeString::indexOf(const UChar *srcChars, 1027 int32_t srcStart, 1028 int32_t srcLength, 1029 int32_t start, 1030 int32_t length) const 1031 { 1032 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) { 1033 return -1; 1034 } 1035 1036 // UnicodeString does not find empty substrings 1037 if(srcLength < 0 && srcChars[srcStart] == 0) { 1038 return -1; 1039 } 1040 1041 // get the indices within bounds 1042 pinIndices(start, length); 1043 1044 // find the first occurrence of the substring 1045 const UChar *array = getArrayStart(); 1046 const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength); 1047 if(match == NULL) { 1048 return -1; 1049 } else { 1050 return (int32_t)(match - array); 1051 } 1052 } 1053 1054 int32_t 1055 UnicodeString::doIndexOf(UChar c, 1056 int32_t start, 1057 int32_t length) const 1058 { 1059 // pin indices 1060 pinIndices(start, length); 1061 1062 // find the first occurrence of c 1063 const UChar *array = getArrayStart(); 1064 const UChar *match = u_memchr(array + start, c, length); 1065 if(match == NULL) { 1066 return -1; 1067 } else { 1068 return (int32_t)(match - array); 1069 } 1070 } 1071 1072 int32_t 1073 UnicodeString::doIndexOf(UChar32 c, 1074 int32_t start, 1075 int32_t length) const { 1076 // pin indices 1077 pinIndices(start, length); 1078 1079 // find the first occurrence of c 1080 const UChar *array = getArrayStart(); 1081 const UChar *match = u_memchr32(array + start, c, length); 1082 if(match == NULL) { 1083 return -1; 1084 } else { 1085 return (int32_t)(match - array); 1086 } 1087 } 1088 1089 int32_t 1090 UnicodeString::lastIndexOf(const UChar *srcChars, 1091 int32_t srcStart, 1092 int32_t srcLength, 1093 int32_t start, 1094 int32_t length) const 1095 { 1096 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) { 1097 return -1; 1098 } 1099 1100 // UnicodeString does not find empty substrings 1101 if(srcLength < 0 && srcChars[srcStart] == 0) { 1102 return -1; 1103 } 1104 1105 // get the indices within bounds 1106 pinIndices(start, length); 1107 1108 // find the last occurrence of the substring 1109 const UChar *array = getArrayStart(); 1110 const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength); 1111 if(match == NULL) { 1112 return -1; 1113 } else { 1114 return (int32_t)(match - array); 1115 } 1116 } 1117 1118 int32_t 1119 UnicodeString::doLastIndexOf(UChar c, 1120 int32_t start, 1121 int32_t length) const 1122 { 1123 if(isBogus()) { 1124 return -1; 1125 } 1126 1127 // pin indices 1128 pinIndices(start, length); 1129 1130 // find the last occurrence of c 1131 const UChar *array = getArrayStart(); 1132 const UChar *match = u_memrchr(array + start, c, length); 1133 if(match == NULL) { 1134 return -1; 1135 } else { 1136 return (int32_t)(match - array); 1137 } 1138 } 1139 1140 int32_t 1141 UnicodeString::doLastIndexOf(UChar32 c, 1142 int32_t start, 1143 int32_t length) const { 1144 // pin indices 1145 pinIndices(start, length); 1146 1147 // find the last occurrence of c 1148 const UChar *array = getArrayStart(); 1149 const UChar *match = u_memrchr32(array + start, c, length); 1150 if(match == NULL) { 1151 return -1; 1152 } else { 1153 return (int32_t)(match - array); 1154 } 1155 } 1156 1157 //======================================== 1158 // Write implementation 1159 //======================================== 1160 1161 UnicodeString& 1162 UnicodeString::findAndReplace(int32_t start, 1163 int32_t length, 1164 const UnicodeString& oldText, 1165 int32_t oldStart, 1166 int32_t oldLength, 1167 const UnicodeString& newText, 1168 int32_t newStart, 1169 int32_t newLength) 1170 { 1171 if(isBogus() || oldText.isBogus() || newText.isBogus()) { 1172 return *this; 1173 } 1174 1175 pinIndices(start, length); 1176 oldText.pinIndices(oldStart, oldLength); 1177 newText.pinIndices(newStart, newLength); 1178 1179 if(oldLength == 0) { 1180 return *this; 1181 } 1182 1183 while(length > 0 && length >= oldLength) { 1184 int32_t pos = indexOf(oldText, oldStart, oldLength, start, length); 1185 if(pos < 0) { 1186 // no more oldText's here: done 1187 break; 1188 } else { 1189 // we found oldText, replace it by newText and go beyond it 1190 replace(pos, oldLength, newText, newStart, newLength); 1191 length -= pos + oldLength - start; 1192 start = pos + newLength; 1193 } 1194 } 1195 1196 return *this; 1197 } 1198 1199 1200 void 1201 UnicodeString::setToBogus() 1202 { 1203 releaseArray(); 1204 1205 fUnion.fFields.fLengthAndFlags = kIsBogus; 1206 fUnion.fFields.fArray = 0; 1207 fUnion.fFields.fCapacity = 0; 1208 } 1209 1210 // turn a bogus string into an empty one 1211 void 1212 UnicodeString::unBogus() { 1213 if(fUnion.fFields.fLengthAndFlags & kIsBogus) { 1214 setToEmpty(); 1215 } 1216 } 1217 1218 const UChar * 1219 UnicodeString::getTerminatedBuffer() { 1220 if(!isWritable()) { 1221 return 0; 1222 } 1223 UChar *array = getArrayStart(); 1224 int32_t len = length(); 1225 if(len < getCapacity()) { 1226 if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) { 1227 // If len<capacity on a read-only alias, then array[len] is 1228 // either the original NUL (if constructed with (TRUE, s, length)) 1229 // or one of the original string contents characters (if later truncated), 1230 // therefore we can assume that array[len] is initialized memory. 1231 if(array[len] == 0) { 1232 return array; 1233 } 1234 } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) { 1235 // kRefCounted: Do not write the NUL if the buffer is shared. 1236 // That is mostly safe, except when the length of one copy was modified 1237 // without copy-on-write, e.g., via truncate(newLength) or remove(void). 1238 // Then the NUL would be written into the middle of another copy's string. 1239 1240 // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL. 1241 // Do not test if there is a NUL already because it might be uninitialized memory. 1242 // (That would be safe, but tools like valgrind & Purify would complain.) 1243 array[len] = 0; 1244 return array; 1245 } 1246 } 1247 if(len<INT32_MAX && cloneArrayIfNeeded(len+1)) { 1248 array = getArrayStart(); 1249 array[len] = 0; 1250 return array; 1251 } else { 1252 return NULL; 1253 } 1254 } 1255 1256 // setTo() analogous to the readonly-aliasing constructor with the same signature 1257 UnicodeString & 1258 UnicodeString::setTo(UBool isTerminated, 1259 const UChar *text, 1260 int32_t textLength) 1261 { 1262 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) { 1263 // do not modify a string that has an "open" getBuffer(minCapacity) 1264 return *this; 1265 } 1266 1267 if(text == NULL) { 1268 // treat as an empty string, do not alias 1269 releaseArray(); 1270 setToEmpty(); 1271 return *this; 1272 } 1273 1274 if( textLength < -1 || 1275 (textLength == -1 && !isTerminated) || 1276 (textLength >= 0 && isTerminated && text[textLength] != 0) 1277 ) { 1278 setToBogus(); 1279 return *this; 1280 } 1281 1282 releaseArray(); 1283 1284 if(textLength == -1) { 1285 // text is terminated, or else it would have failed the above test 1286 textLength = u_strlen(text); 1287 } 1288 fUnion.fFields.fLengthAndFlags = kReadonlyAlias; 1289 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength); 1290 return *this; 1291 } 1292 1293 // setTo() analogous to the writable-aliasing constructor with the same signature 1294 UnicodeString & 1295 UnicodeString::setTo(UChar *buffer, 1296 int32_t buffLength, 1297 int32_t buffCapacity) { 1298 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) { 1299 // do not modify a string that has an "open" getBuffer(minCapacity) 1300 return *this; 1301 } 1302 1303 if(buffer == NULL) { 1304 // treat as an empty string, do not alias 1305 releaseArray(); 1306 setToEmpty(); 1307 return *this; 1308 } 1309 1310 if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) { 1311 setToBogus(); 1312 return *this; 1313 } else if(buffLength == -1) { 1314 // buffLength = u_strlen(buff); but do not look beyond buffCapacity 1315 const UChar *p = buffer, *limit = buffer + buffCapacity; 1316 while(p != limit && *p != 0) { 1317 ++p; 1318 } 1319 buffLength = (int32_t)(p - buffer); 1320 } 1321 1322 releaseArray(); 1323 1324 fUnion.fFields.fLengthAndFlags = kWritableAlias; 1325 setArray(buffer, buffLength, buffCapacity); 1326 return *this; 1327 } 1328 1329 UnicodeString &UnicodeString::setToUTF8(StringPiece utf8) { 1330 unBogus(); 1331 int32_t length = utf8.length(); 1332 int32_t capacity; 1333 // The UTF-16 string will be at most as long as the UTF-8 string. 1334 if(length <= US_STACKBUF_SIZE) { 1335 capacity = US_STACKBUF_SIZE; 1336 } else { 1337 capacity = length + 1; // +1 for the terminating NUL. 1338 } 1339 UChar *utf16 = getBuffer(capacity); 1340 int32_t length16; 1341 UErrorCode errorCode = U_ZERO_ERROR; 1342 u_strFromUTF8WithSub(utf16, getCapacity(), &length16, 1343 utf8.data(), length, 1344 0xfffd, // Substitution character. 1345 NULL, // Don't care about number of substitutions. 1346 &errorCode); 1347 releaseBuffer(length16); 1348 if(U_FAILURE(errorCode)) { 1349 setToBogus(); 1350 } 1351 return *this; 1352 } 1353 1354 UnicodeString& 1355 UnicodeString::setCharAt(int32_t offset, 1356 UChar c) 1357 { 1358 int32_t len = length(); 1359 if(cloneArrayIfNeeded() && len > 0) { 1360 if(offset < 0) { 1361 offset = 0; 1362 } else if(offset >= len) { 1363 offset = len - 1; 1364 } 1365 1366 getArrayStart()[offset] = c; 1367 } 1368 return *this; 1369 } 1370 1371 UnicodeString& 1372 UnicodeString::replace(int32_t start, 1373 int32_t _length, 1374 UChar32 srcChar) { 1375 UChar buffer[U16_MAX_LENGTH]; 1376 int32_t count = 0; 1377 UBool isError = FALSE; 1378 U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError); 1379 // We test isError so that the compiler does not complain that we don't. 1380 // If isError (srcChar is not a valid code point) then count==0 which means 1381 // we remove the source segment rather than replacing it with srcChar. 1382 return doReplace(start, _length, buffer, 0, isError ? 0 : count); 1383 } 1384 1385 UnicodeString& 1386 UnicodeString::append(UChar32 srcChar) { 1387 UChar buffer[U16_MAX_LENGTH]; 1388 int32_t _length = 0; 1389 UBool isError = FALSE; 1390 U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError); 1391 // We test isError so that the compiler does not complain that we don't. 1392 // If isError then _length==0 which turns the doAppend() into a no-op anyway. 1393 return isError ? *this : doAppend(buffer, 0, _length); 1394 } 1395 1396 UnicodeString& 1397 UnicodeString::doReplace( int32_t start, 1398 int32_t length, 1399 const UnicodeString& src, 1400 int32_t srcStart, 1401 int32_t srcLength) 1402 { 1403 // pin the indices to legal values 1404 src.pinIndices(srcStart, srcLength); 1405 1406 // get the characters from src 1407 // and replace the range in ourselves with them 1408 return doReplace(start, length, src.getArrayStart(), srcStart, srcLength); 1409 } 1410 1411 UnicodeString& 1412 UnicodeString::doReplace(int32_t start, 1413 int32_t length, 1414 const UChar *srcChars, 1415 int32_t srcStart, 1416 int32_t srcLength) 1417 { 1418 if(!isWritable()) { 1419 return *this; 1420 } 1421 1422 int32_t oldLength = this->length(); 1423 1424 // optimize (read-only alias).remove(0, start) and .remove(start, end) 1425 if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) { 1426 if(start == 0) { 1427 // remove prefix by adjusting the array pointer 1428 pinIndex(length); 1429 fUnion.fFields.fArray += length; 1430 fUnion.fFields.fCapacity -= length; 1431 setLength(oldLength - length); 1432 return *this; 1433 } else { 1434 pinIndex(start); 1435 if(length >= (oldLength - start)) { 1436 // remove suffix by reducing the length (like truncate()) 1437 setLength(start); 1438 fUnion.fFields.fCapacity = start; // not NUL-terminated any more 1439 return *this; 1440 } 1441 } 1442 } 1443 1444 if(start == oldLength) { 1445 return doAppend(srcChars, srcStart, srcLength); 1446 } 1447 1448 if(srcChars == 0) { 1449 srcStart = srcLength = 0; 1450 } else if(srcLength < 0) { 1451 // get the srcLength if necessary 1452 srcLength = u_strlen(srcChars + srcStart); 1453 } 1454 1455 // pin the indices to legal values 1456 pinIndices(start, length); 1457 1458 // Calculate the size of the string after the replace. 1459 // Avoid int32_t overflow. 1460 int32_t newLength = oldLength - length; 1461 if(srcLength > (INT32_MAX - newLength)) { 1462 setToBogus(); 1463 return *this; 1464 } 1465 newLength += srcLength; 1466 1467 // cloneArrayIfNeeded(doCopyArray=FALSE) may change fArray but will not copy the current contents; 1468 // therefore we need to keep the current fArray 1469 UChar oldStackBuffer[US_STACKBUF_SIZE]; 1470 UChar *oldArray; 1471 if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) { 1472 // copy the stack buffer contents because it will be overwritten with 1473 // fUnion.fFields values 1474 u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength); 1475 oldArray = oldStackBuffer; 1476 } else { 1477 oldArray = getArrayStart(); 1478 } 1479 1480 // clone our array and allocate a bigger array if needed 1481 int32_t *bufferToDelete = 0; 1482 if(!cloneArrayIfNeeded(newLength, getGrowCapacity(newLength), 1483 FALSE, &bufferToDelete) 1484 ) { 1485 return *this; 1486 } 1487 1488 // now do the replace 1489 1490 UChar *newArray = getArrayStart(); 1491 if(newArray != oldArray) { 1492 // if fArray changed, then we need to copy everything except what will change 1493 us_arrayCopy(oldArray, 0, newArray, 0, start); 1494 us_arrayCopy(oldArray, start + length, 1495 newArray, start + srcLength, 1496 oldLength - (start + length)); 1497 } else if(length != srcLength) { 1498 // fArray did not change; copy only the portion that isn't changing, leaving a hole 1499 us_arrayCopy(oldArray, start + length, 1500 newArray, start + srcLength, 1501 oldLength - (start + length)); 1502 } 1503 1504 // now fill in the hole with the new string 1505 us_arrayCopy(srcChars, srcStart, newArray, start, srcLength); 1506 1507 setLength(newLength); 1508 1509 // delayed delete in case srcChars == fArray when we started, and 1510 // to keep oldArray alive for the above operations 1511 if (bufferToDelete) { 1512 uprv_free(bufferToDelete); 1513 } 1514 1515 return *this; 1516 } 1517 1518 // Versions of doReplace() only for append() variants. 1519 // doReplace() and doAppend() optimize for different cases. 1520 1521 UnicodeString& 1522 UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) { 1523 if(srcLength == 0) { 1524 return *this; 1525 } 1526 1527 // pin the indices to legal values 1528 src.pinIndices(srcStart, srcLength); 1529 return doAppend(src.getArrayStart(), srcStart, srcLength); 1530 } 1531 1532 UnicodeString& 1533 UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLength) { 1534 if(!isWritable() || srcLength == 0 || srcChars == NULL) { 1535 return *this; 1536 } 1537 1538 if(srcLength < 0) { 1539 // get the srcLength if necessary 1540 if((srcLength = u_strlen(srcChars + srcStart)) == 0) { 1541 return *this; 1542 } 1543 } 1544 1545 int32_t oldLength = length(); 1546 int32_t newLength = oldLength + srcLength; 1547 // optimize append() onto a large-enough, owned string 1548 if((newLength <= getCapacity() && isBufferWritable()) || 1549 cloneArrayIfNeeded(newLength, getGrowCapacity(newLength))) { 1550 UChar *newArray = getArrayStart(); 1551 // Do not copy characters when 1552 // UChar *buffer=str.getAppendBuffer(...); 1553 // is followed by 1554 // str.append(buffer, length); 1555 // or 1556 // str.appendString(buffer, length) 1557 // or similar. 1558 if(srcChars + srcStart != newArray + oldLength) { 1559 us_arrayCopy(srcChars, srcStart, newArray, oldLength, srcLength); 1560 } 1561 setLength(newLength); 1562 } 1563 return *this; 1564 } 1565 1566 /** 1567 * Replaceable API 1568 */ 1569 void 1570 UnicodeString::handleReplaceBetween(int32_t start, 1571 int32_t limit, 1572 const UnicodeString& text) { 1573 replaceBetween(start, limit, text); 1574 } 1575 1576 /** 1577 * Replaceable API 1578 */ 1579 void 1580 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) { 1581 if (limit <= start) { 1582 return; // Nothing to do; avoid bogus malloc call 1583 } 1584 UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) ); 1585 // Check to make sure text is not null. 1586 if (text != NULL) { 1587 extractBetween(start, limit, text, 0); 1588 insert(dest, text, 0, limit - start); 1589 uprv_free(text); 1590 } 1591 } 1592 1593 /** 1594 * Replaceable API 1595 * 1596 * NOTE: This is for the Replaceable class. There is no rep.cpp, 1597 * so we implement this function here. 1598 */ 1599 UBool Replaceable::hasMetaData() const { 1600 return TRUE; 1601 } 1602 1603 /** 1604 * Replaceable API 1605 */ 1606 UBool UnicodeString::hasMetaData() const { 1607 return FALSE; 1608 } 1609 1610 UnicodeString& 1611 UnicodeString::doReverse(int32_t start, int32_t length) { 1612 if(length <= 1 || !cloneArrayIfNeeded()) { 1613 return *this; 1614 } 1615 1616 // pin the indices to legal values 1617 pinIndices(start, length); 1618 if(length <= 1) { // pinIndices() might have shrunk the length 1619 return *this; 1620 } 1621 1622 UChar *left = getArrayStart() + start; 1623 UChar *right = left + length - 1; // -1 for inclusive boundary (length>=2) 1624 UChar swap; 1625 UBool hasSupplementary = FALSE; 1626 1627 // Before the loop we know left<right because length>=2. 1628 do { 1629 hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left); 1630 hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right); 1631 *right-- = swap; 1632 } while(left < right); 1633 // Make sure to test the middle code unit of an odd-length string. 1634 // Redundant if the length is even. 1635 hasSupplementary |= (UBool)U16_IS_LEAD(*left); 1636 1637 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */ 1638 if(hasSupplementary) { 1639 UChar swap2; 1640 1641 left = getArrayStart() + start; 1642 right = left + length - 1; // -1 so that we can look at *(left+1) if left<right 1643 while(left < right) { 1644 if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) { 1645 *left++ = swap2; 1646 *left++ = swap; 1647 } else { 1648 ++left; 1649 } 1650 } 1651 } 1652 1653 return *this; 1654 } 1655 1656 UBool 1657 UnicodeString::padLeading(int32_t targetLength, 1658 UChar padChar) 1659 { 1660 int32_t oldLength = length(); 1661 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { 1662 return FALSE; 1663 } else { 1664 // move contents up by padding width 1665 UChar *array = getArrayStart(); 1666 int32_t start = targetLength - oldLength; 1667 us_arrayCopy(array, 0, array, start, oldLength); 1668 1669 // fill in padding character 1670 while(--start >= 0) { 1671 array[start] = padChar; 1672 } 1673 setLength(targetLength); 1674 return TRUE; 1675 } 1676 } 1677 1678 UBool 1679 UnicodeString::padTrailing(int32_t targetLength, 1680 UChar padChar) 1681 { 1682 int32_t oldLength = length(); 1683 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { 1684 return FALSE; 1685 } else { 1686 // fill in padding character 1687 UChar *array = getArrayStart(); 1688 int32_t length = targetLength; 1689 while(--length >= oldLength) { 1690 array[length] = padChar; 1691 } 1692 setLength(targetLength); 1693 return TRUE; 1694 } 1695 } 1696 1697 //======================================== 1698 // Hashing 1699 //======================================== 1700 int32_t 1701 UnicodeString::doHashCode() const 1702 { 1703 /* Delegate hash computation to uhash. This makes UnicodeString 1704 * hashing consistent with UChar* hashing. */ 1705 int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length()); 1706 if (hashCode == kInvalidHashCode) { 1707 hashCode = kEmptyHashCode; 1708 } 1709 return hashCode; 1710 } 1711 1712 //======================================== 1713 // External Buffer 1714 //======================================== 1715 1716 UChar * 1717 UnicodeString::getBuffer(int32_t minCapacity) { 1718 if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) { 1719 fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer; 1720 setZeroLength(); 1721 return getArrayStart(); 1722 } else { 1723 return 0; 1724 } 1725 } 1726 1727 void 1728 UnicodeString::releaseBuffer(int32_t newLength) { 1729 if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) { 1730 // set the new fLength 1731 int32_t capacity=getCapacity(); 1732 if(newLength==-1) { 1733 // the new length is the string length, capped by fCapacity 1734 const UChar *array=getArrayStart(), *p=array, *limit=array+capacity; 1735 while(p<limit && *p!=0) { 1736 ++p; 1737 } 1738 newLength=(int32_t)(p-array); 1739 } else if(newLength>capacity) { 1740 newLength=capacity; 1741 } 1742 setLength(newLength); 1743 fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer; 1744 } 1745 } 1746 1747 //======================================== 1748 // Miscellaneous 1749 //======================================== 1750 UBool 1751 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity, 1752 int32_t growCapacity, 1753 UBool doCopyArray, 1754 int32_t **pBufferToDelete, 1755 UBool forceClone) { 1756 // default parameters need to be static, therefore 1757 // the defaults are -1 to have convenience defaults 1758 if(newCapacity == -1) { 1759 newCapacity = getCapacity(); 1760 } 1761 1762 // while a getBuffer(minCapacity) is "open", 1763 // prevent any modifications of the string by returning FALSE here 1764 // if the string is bogus, then only an assignment or similar can revive it 1765 if(!isWritable()) { 1766 return FALSE; 1767 } 1768 1769 /* 1770 * We need to make a copy of the array if 1771 * the buffer is read-only, or 1772 * the buffer is refCounted (shared), and refCount>1, or 1773 * the buffer is too small. 1774 * Return FALSE if memory could not be allocated. 1775 */ 1776 if(forceClone || 1777 fUnion.fFields.fLengthAndFlags & kBufferIsReadonly || 1778 (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) || 1779 newCapacity > getCapacity() 1780 ) { 1781 // check growCapacity for default value and use of the stack buffer 1782 if(growCapacity < 0) { 1783 growCapacity = newCapacity; 1784 } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) { 1785 growCapacity = US_STACKBUF_SIZE; 1786 } 1787 1788 // save old values 1789 UChar oldStackBuffer[US_STACKBUF_SIZE]; 1790 UChar *oldArray; 1791 int32_t oldLength = length(); 1792 int16_t flags = fUnion.fFields.fLengthAndFlags; 1793 1794 if(flags&kUsingStackBuffer) { 1795 U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */ 1796 if(doCopyArray && growCapacity > US_STACKBUF_SIZE) { 1797 // copy the stack buffer contents because it will be overwritten with 1798 // fUnion.fFields values 1799 us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength); 1800 oldArray = oldStackBuffer; 1801 } else { 1802 oldArray = NULL; // no need to copy from the stack buffer to itself 1803 } 1804 } else { 1805 oldArray = fUnion.fFields.fArray; 1806 U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */ 1807 } 1808 1809 // allocate a new array 1810 if(allocate(growCapacity) || 1811 (newCapacity < growCapacity && allocate(newCapacity)) 1812 ) { 1813 if(doCopyArray) { 1814 // copy the contents 1815 // do not copy more than what fits - it may be smaller than before 1816 int32_t minLength = oldLength; 1817 newCapacity = getCapacity(); 1818 if(newCapacity < minLength) { 1819 minLength = newCapacity; 1820 } 1821 if(oldArray != NULL) { 1822 us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength); 1823 } 1824 setLength(minLength); 1825 } else { 1826 setZeroLength(); 1827 } 1828 1829 // release the old array 1830 if(flags & kRefCounted) { 1831 // the array is refCounted; decrement and release if 0 1832 u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1); 1833 if(umtx_atomic_dec(pRefCount) == 0) { 1834 if(pBufferToDelete == 0) { 1835 // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t 1836 // is defined as volatile. (Volatile has useful non-standard behavior 1837 // with this compiler.) 1838 uprv_free((void *)pRefCount); 1839 } else { 1840 // the caller requested to delete it himself 1841 *pBufferToDelete = (int32_t *)pRefCount; 1842 } 1843 } 1844 } 1845 } else { 1846 // not enough memory for growCapacity and not even for the smaller newCapacity 1847 // reset the old values for setToBogus() to release the array 1848 if(!(flags&kUsingStackBuffer)) { 1849 fUnion.fFields.fArray = oldArray; 1850 } 1851 fUnion.fFields.fLengthAndFlags = flags; 1852 setToBogus(); 1853 return FALSE; 1854 } 1855 } 1856 return TRUE; 1857 } 1858 1859 // UnicodeStringAppendable ------------------------------------------------- *** 1860 1861 UnicodeStringAppendable::~UnicodeStringAppendable() {} 1862 1863 UBool 1864 UnicodeStringAppendable::appendCodeUnit(UChar c) { 1865 return str.doAppend(&c, 0, 1).isWritable(); 1866 } 1867 1868 UBool 1869 UnicodeStringAppendable::appendCodePoint(UChar32 c) { 1870 UChar buffer[U16_MAX_LENGTH]; 1871 int32_t cLength = 0; 1872 UBool isError = FALSE; 1873 U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError); 1874 return !isError && str.doAppend(buffer, 0, cLength).isWritable(); 1875 } 1876 1877 UBool 1878 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) { 1879 return str.doAppend(s, 0, length).isWritable(); 1880 } 1881 1882 UBool 1883 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) { 1884 return str.cloneArrayIfNeeded(str.length() + appendCapacity); 1885 } 1886 1887 UChar * 1888 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity, 1889 int32_t desiredCapacityHint, 1890 UChar *scratch, int32_t scratchCapacity, 1891 int32_t *resultCapacity) { 1892 if(minCapacity < 1 || scratchCapacity < minCapacity) { 1893 *resultCapacity = 0; 1894 return NULL; 1895 } 1896 int32_t oldLength = str.length(); 1897 if(minCapacity <= (kMaxCapacity - oldLength) && 1898 desiredCapacityHint <= (kMaxCapacity - oldLength) && 1899 str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) { 1900 *resultCapacity = str.getCapacity() - oldLength; 1901 return str.getArrayStart() + oldLength; 1902 } 1903 *resultCapacity = scratchCapacity; 1904 return scratch; 1905 } 1906 1907 U_NAMESPACE_END 1908 1909 U_NAMESPACE_USE 1910 1911 U_CAPI int32_t U_EXPORT2 1912 uhash_hashUnicodeString(const UElement key) { 1913 const UnicodeString *str = (const UnicodeString*) key.pointer; 1914 return (str == NULL) ? 0 : str->hashCode(); 1915 } 1916 1917 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString* 1918 // does not depend on hashtable code. 1919 U_CAPI UBool U_EXPORT2 1920 uhash_compareUnicodeString(const UElement key1, const UElement key2) { 1921 const UnicodeString *str1 = (const UnicodeString*) key1.pointer; 1922 const UnicodeString *str2 = (const UnicodeString*) key2.pointer; 1923 if (str1 == str2) { 1924 return TRUE; 1925 } 1926 if (str1 == NULL || str2 == NULL) { 1927 return FALSE; 1928 } 1929 return *str1 == *str2; 1930 } 1931 1932 #ifdef U_STATIC_IMPLEMENTATION 1933 /* 1934 This should never be called. It is defined here to make sure that the 1935 virtual vector deleting destructor is defined within unistr.cpp. 1936 The vector deleting destructor is already a part of UObject, 1937 but defining it here makes sure that it is included with this object file. 1938 This makes sure that static library dependencies are kept to a minimum. 1939 */ 1940 static void uprv_UnicodeStringDummy(void) { 1941 delete [] (new UnicodeString[2]); 1942 } 1943 #endif 1944