1 /* 2 ****************************************************************************** 3 * Copyright (C) 1999-2012, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ****************************************************************************** 6 * 7 * File unistr.cpp 8 * 9 * Modification History: 10 * 11 * Date Name Description 12 * 09/25/98 stephen Creation. 13 * 04/20/99 stephen Overhauled per 4/16 code review. 14 * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX 15 * 11/18/99 aliu Added handleReplaceBetween() to make inherit from 16 * Replaceable. 17 * 06/25/01 grhoten Removed the dependency on iostream 18 ****************************************************************************** 19 */ 20 21 #include "unicode/utypes.h" 22 #include "unicode/appendable.h" 23 #include "unicode/putil.h" 24 #include "cstring.h" 25 #include "cmemory.h" 26 #include "unicode/ustring.h" 27 #include "unicode/unistr.h" 28 #include "unicode/utf.h" 29 #include "unicode/utf16.h" 30 #include "uelement.h" 31 #include "ustr_imp.h" 32 #include "umutex.h" 33 #include "uassert.h" 34 35 #if 0 36 37 #include <iostream> 38 using namespace std; 39 40 //DEBUGGING 41 void 42 print(const UnicodeString& s, 43 const char *name) 44 { 45 UChar c; 46 cout << name << ":|"; 47 for(int i = 0; i < s.length(); ++i) { 48 c = s[i]; 49 if(c>= 0x007E || c < 0x0020) 50 cout << "[0x" << hex << s[i] << "]"; 51 else 52 cout << (char) s[i]; 53 } 54 cout << '|' << endl; 55 } 56 57 void 58 print(const UChar *s, 59 int32_t len, 60 const char *name) 61 { 62 UChar c; 63 cout << name << ":|"; 64 for(int i = 0; i < len; ++i) { 65 c = s[i]; 66 if(c>= 0x007E || c < 0x0020) 67 cout << "[0x" << hex << s[i] << "]"; 68 else 69 cout << (char) s[i]; 70 } 71 cout << '|' << endl; 72 } 73 // END DEBUGGING 74 #endif 75 76 // Local function definitions for now 77 78 // need to copy areas that may overlap 79 static 80 inline void 81 us_arrayCopy(const UChar *src, int32_t srcStart, 82 UChar *dst, int32_t dstStart, int32_t count) 83 { 84 if(count>0) { 85 uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src))); 86 } 87 } 88 89 // u_unescapeAt() callback to get a UChar from a UnicodeString 90 U_CDECL_BEGIN 91 static UChar U_CALLCONV 92 UnicodeString_charAt(int32_t offset, void *context) { 93 return ((icu::UnicodeString*) context)->charAt(offset); 94 } 95 U_CDECL_END 96 97 U_NAMESPACE_BEGIN 98 99 /* The Replaceable virtual destructor can't be defined in the header 100 due to how AIX works with multiple definitions of virtual functions. 101 */ 102 Replaceable::~Replaceable() {} 103 104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString) 105 106 UnicodeString U_EXPORT2 107 operator+ (const UnicodeString &s1, const UnicodeString &s2) { 108 return 109 UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0). 110 append(s1). 111 append(s2); 112 } 113 114 //======================================== 115 // Reference Counting functions, put at top of file so that optimizing compilers 116 // have a chance to automatically inline. 117 //======================================== 118 119 void 120 UnicodeString::addRef() 121 { umtx_atomic_inc((int32_t *)fUnion.fFields.fArray - 1);} 122 123 int32_t 124 UnicodeString::removeRef() 125 { return umtx_atomic_dec((int32_t *)fUnion.fFields.fArray - 1);} 126 127 int32_t 128 UnicodeString::refCount() const 129 { 130 umtx_lock(NULL); 131 // Note: without the lock to force a memory barrier, we might see a very 132 // stale value on some multi-processor systems. 133 int32_t count = *((int32_t *)fUnion.fFields.fArray - 1); 134 umtx_unlock(NULL); 135 return count; 136 } 137 138 void 139 UnicodeString::releaseArray() { 140 if((fFlags & kRefCounted) && removeRef() == 0) { 141 uprv_free((int32_t *)fUnion.fFields.fArray - 1); 142 } 143 } 144 145 146 147 //======================================== 148 // Constructors 149 //======================================== 150 151 // The default constructor is inline in unistr.h. 152 153 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) 154 : fShortLength(0), 155 fFlags(0) 156 { 157 if(count <= 0 || (uint32_t)c > 0x10ffff) { 158 // just allocate and do not do anything else 159 allocate(capacity); 160 } else { 161 // count > 0, allocate and fill the new string with count c's 162 int32_t unitCount = U16_LENGTH(c), length = count * unitCount; 163 if(capacity < length) { 164 capacity = length; 165 } 166 if(allocate(capacity)) { 167 UChar *array = getArrayStart(); 168 int32_t i = 0; 169 170 // fill the new string with c 171 if(unitCount == 1) { 172 // fill with length UChars 173 while(i < length) { 174 array[i++] = (UChar)c; 175 } 176 } else { 177 // get the code units for c 178 UChar units[U16_MAX_LENGTH]; 179 U16_APPEND_UNSAFE(units, i, c); 180 181 // now it must be i==unitCount 182 i = 0; 183 184 // for Unicode, unitCount can only be 1, 2, 3, or 4 185 // 1 is handled above 186 while(i < length) { 187 int32_t unitIdx = 0; 188 while(unitIdx < unitCount) { 189 array[i++]=units[unitIdx++]; 190 } 191 } 192 } 193 } 194 setLength(length); 195 } 196 } 197 198 UnicodeString::UnicodeString(UChar ch) 199 : fShortLength(1), 200 fFlags(kShortString) 201 { 202 fUnion.fStackBuffer[0] = ch; 203 } 204 205 UnicodeString::UnicodeString(UChar32 ch) 206 : fShortLength(0), 207 fFlags(kShortString) 208 { 209 int32_t i = 0; 210 UBool isError = FALSE; 211 U16_APPEND(fUnion.fStackBuffer, i, US_STACKBUF_SIZE, ch, isError); 212 // We test isError so that the compiler does not complain that we don't. 213 // If isError then i==0 which is what we want anyway. 214 if(!isError) { 215 fShortLength = (int8_t)i; 216 } 217 } 218 219 UnicodeString::UnicodeString(const UChar *text) 220 : fShortLength(0), 221 fFlags(kShortString) 222 { 223 doReplace(0, 0, text, 0, -1); 224 } 225 226 UnicodeString::UnicodeString(const UChar *text, 227 int32_t textLength) 228 : fShortLength(0), 229 fFlags(kShortString) 230 { 231 doReplace(0, 0, text, 0, textLength); 232 } 233 234 UnicodeString::UnicodeString(UBool isTerminated, 235 const UChar *text, 236 int32_t textLength) 237 : fShortLength(0), 238 fFlags(kReadonlyAlias) 239 { 240 if(text == NULL) { 241 // treat as an empty string, do not alias 242 setToEmpty(); 243 } else if(textLength < -1 || 244 (textLength == -1 && !isTerminated) || 245 (textLength >= 0 && isTerminated && text[textLength] != 0) 246 ) { 247 setToBogus(); 248 } else { 249 if(textLength == -1) { 250 // text is terminated, or else it would have failed the above test 251 textLength = u_strlen(text); 252 } 253 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength); 254 } 255 } 256 257 UnicodeString::UnicodeString(UChar *buff, 258 int32_t buffLength, 259 int32_t buffCapacity) 260 : fShortLength(0), 261 fFlags(kWritableAlias) 262 { 263 if(buff == NULL) { 264 // treat as an empty string, do not alias 265 setToEmpty(); 266 } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) { 267 setToBogus(); 268 } else { 269 if(buffLength == -1) { 270 // fLength = u_strlen(buff); but do not look beyond buffCapacity 271 const UChar *p = buff, *limit = buff + buffCapacity; 272 while(p != limit && *p != 0) { 273 ++p; 274 } 275 buffLength = (int32_t)(p - buff); 276 } 277 setArray(buff, buffLength, buffCapacity); 278 } 279 } 280 281 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) 282 : fShortLength(0), 283 fFlags(kShortString) 284 { 285 if(src==NULL) { 286 // treat as an empty string 287 } else { 288 if(length<0) { 289 length=(int32_t)uprv_strlen(src); 290 } 291 if(cloneArrayIfNeeded(length, length, FALSE)) { 292 u_charsToUChars(src, getArrayStart(), length); 293 setLength(length); 294 } else { 295 setToBogus(); 296 } 297 } 298 } 299 300 #if U_CHARSET_IS_UTF8 301 302 UnicodeString::UnicodeString(const char *codepageData) 303 : fShortLength(0), 304 fFlags(kShortString) { 305 if(codepageData != 0) { 306 setToUTF8(codepageData); 307 } 308 } 309 310 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) 311 : fShortLength(0), 312 fFlags(kShortString) { 313 // if there's nothing to convert, do nothing 314 if(codepageData == 0 || dataLength == 0 || dataLength < -1) { 315 return; 316 } 317 if(dataLength == -1) { 318 dataLength = (int32_t)uprv_strlen(codepageData); 319 } 320 setToUTF8(StringPiece(codepageData, dataLength)); 321 } 322 323 // else see unistr_cnv.cpp 324 #endif 325 326 UnicodeString::UnicodeString(const UnicodeString& that) 327 : Replaceable(), 328 fShortLength(0), 329 fFlags(kShortString) 330 { 331 copyFrom(that); 332 } 333 334 UnicodeString::UnicodeString(const UnicodeString& that, 335 int32_t srcStart) 336 : Replaceable(), 337 fShortLength(0), 338 fFlags(kShortString) 339 { 340 setTo(that, srcStart); 341 } 342 343 UnicodeString::UnicodeString(const UnicodeString& that, 344 int32_t srcStart, 345 int32_t srcLength) 346 : Replaceable(), 347 fShortLength(0), 348 fFlags(kShortString) 349 { 350 setTo(that, srcStart, srcLength); 351 } 352 353 // Replaceable base class clone() default implementation, does not clone 354 Replaceable * 355 Replaceable::clone() const { 356 return NULL; 357 } 358 359 // UnicodeString overrides clone() with a real implementation 360 Replaceable * 361 UnicodeString::clone() const { 362 return new UnicodeString(*this); 363 } 364 365 //======================================== 366 // array allocation 367 //======================================== 368 369 UBool 370 UnicodeString::allocate(int32_t capacity) { 371 if(capacity <= US_STACKBUF_SIZE) { 372 fFlags = kShortString; 373 } else { 374 // count bytes for the refCounter and the string capacity, and 375 // round up to a multiple of 16; then divide by 4 and allocate int32_t's 376 // to be safely aligned for the refCount 377 // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer() 378 int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2); 379 int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words ); 380 if(array != 0) { 381 // set initial refCount and point behind the refCount 382 *array++ = 1; 383 384 // have fArray point to the first UChar 385 fUnion.fFields.fArray = (UChar *)array; 386 fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR)); 387 fFlags = kLongString; 388 } else { 389 fShortLength = 0; 390 fUnion.fFields.fArray = 0; 391 fUnion.fFields.fCapacity = 0; 392 fFlags = kIsBogus; 393 return FALSE; 394 } 395 } 396 return TRUE; 397 } 398 399 //======================================== 400 // Destructor 401 //======================================== 402 UnicodeString::~UnicodeString() 403 { 404 releaseArray(); 405 } 406 407 //======================================== 408 // Factory methods 409 //======================================== 410 411 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) { 412 UnicodeString result; 413 result.setToUTF8(utf8); 414 return result; 415 } 416 417 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) { 418 UnicodeString result; 419 int32_t capacity; 420 // Most UTF-32 strings will be BMP-only and result in a same-length 421 // UTF-16 string. We overestimate the capacity just slightly, 422 // just in case there are a few supplementary characters. 423 if(length <= US_STACKBUF_SIZE) { 424 capacity = US_STACKBUF_SIZE; 425 } else { 426 capacity = length + (length >> 4) + 4; 427 } 428 do { 429 UChar *utf16 = result.getBuffer(capacity); 430 int32_t length16; 431 UErrorCode errorCode = U_ZERO_ERROR; 432 u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16, 433 utf32, length, 434 0xfffd, // Substitution character. 435 NULL, // Don't care about number of substitutions. 436 &errorCode); 437 result.releaseBuffer(length16); 438 if(errorCode == U_BUFFER_OVERFLOW_ERROR) { 439 capacity = length16 + 1; // +1 for the terminating NUL. 440 continue; 441 } else if(U_FAILURE(errorCode)) { 442 result.setToBogus(); 443 } 444 break; 445 } while(TRUE); 446 return result; 447 } 448 449 //======================================== 450 // Assignment 451 //======================================== 452 453 UnicodeString & 454 UnicodeString::operator=(const UnicodeString &src) { 455 return copyFrom(src); 456 } 457 458 UnicodeString & 459 UnicodeString::fastCopyFrom(const UnicodeString &src) { 460 return copyFrom(src, TRUE); 461 } 462 463 UnicodeString & 464 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) { 465 // if assigning to ourselves, do nothing 466 if(this == 0 || this == &src) { 467 return *this; 468 } 469 470 // is the right side bogus? 471 if(&src == 0 || src.isBogus()) { 472 setToBogus(); 473 return *this; 474 } 475 476 // delete the current contents 477 releaseArray(); 478 479 if(src.isEmpty()) { 480 // empty string - use the stack buffer 481 setToEmpty(); 482 return *this; 483 } 484 485 // we always copy the length 486 int32_t srcLength = src.length(); 487 setLength(srcLength); 488 489 // fLength>0 and not an "open" src.getBuffer(minCapacity) 490 switch(src.fFlags) { 491 case kShortString: 492 // short string using the stack buffer, do the same 493 fFlags = kShortString; 494 uprv_memcpy(fUnion.fStackBuffer, src.fUnion.fStackBuffer, srcLength * U_SIZEOF_UCHAR); 495 break; 496 case kLongString: 497 // src uses a refCounted string buffer, use that buffer with refCount 498 // src is const, use a cast - we don't really change it 499 ((UnicodeString &)src).addRef(); 500 // copy all fields, share the reference-counted buffer 501 fUnion.fFields.fArray = src.fUnion.fFields.fArray; 502 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; 503 fFlags = src.fFlags; 504 break; 505 case kReadonlyAlias: 506 if(fastCopy) { 507 // src is a readonly alias, do the same 508 // -> maintain the readonly alias as such 509 fUnion.fFields.fArray = src.fUnion.fFields.fArray; 510 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; 511 fFlags = src.fFlags; 512 break; 513 } 514 // else if(!fastCopy) fall through to case kWritableAlias 515 // -> allocate a new buffer and copy the contents 516 case kWritableAlias: 517 // src is a writable alias; we make a copy of that instead 518 if(allocate(srcLength)) { 519 uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR); 520 break; 521 } 522 // if there is not enough memory, then fall through to setting to bogus 523 default: 524 // if src is bogus, set ourselves to bogus 525 // do not call setToBogus() here because fArray and fFlags are not consistent here 526 fShortLength = 0; 527 fUnion.fFields.fArray = 0; 528 fUnion.fFields.fCapacity = 0; 529 fFlags = kIsBogus; 530 break; 531 } 532 533 return *this; 534 } 535 536 //======================================== 537 // Miscellaneous operations 538 //======================================== 539 540 UnicodeString UnicodeString::unescape() const { 541 UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity 542 const UChar *array = getBuffer(); 543 int32_t len = length(); 544 int32_t prev = 0; 545 for (int32_t i=0;;) { 546 if (i == len) { 547 result.append(array, prev, len - prev); 548 break; 549 } 550 if (array[i++] == 0x5C /*'\\'*/) { 551 result.append(array, prev, (i - 1) - prev); 552 UChar32 c = unescapeAt(i); // advances i 553 if (c < 0) { 554 result.remove(); // return empty string 555 break; // invalid escape sequence 556 } 557 result.append(c); 558 prev = i; 559 } 560 } 561 return result; 562 } 563 564 UChar32 UnicodeString::unescapeAt(int32_t &offset) const { 565 return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this); 566 } 567 568 //======================================== 569 // Read-only implementation 570 //======================================== 571 UBool 572 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const { 573 // Requires: this & text not bogus and have same lengths. 574 // Byte-wise comparison works for equality regardless of endianness. 575 return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0; 576 } 577 578 int8_t 579 UnicodeString::doCompare( int32_t start, 580 int32_t length, 581 const UChar *srcChars, 582 int32_t srcStart, 583 int32_t srcLength) const 584 { 585 // compare illegal string values 586 if(isBogus()) { 587 return -1; 588 } 589 590 // pin indices to legal values 591 pinIndices(start, length); 592 593 if(srcChars == NULL) { 594 // treat const UChar *srcChars==NULL as an empty string 595 return length == 0 ? 0 : 1; 596 } 597 598 // get the correct pointer 599 const UChar *chars = getArrayStart(); 600 601 chars += start; 602 srcChars += srcStart; 603 604 int32_t minLength; 605 int8_t lengthResult; 606 607 // get the srcLength if necessary 608 if(srcLength < 0) { 609 srcLength = u_strlen(srcChars + srcStart); 610 } 611 612 // are we comparing different lengths? 613 if(length != srcLength) { 614 if(length < srcLength) { 615 minLength = length; 616 lengthResult = -1; 617 } else { 618 minLength = srcLength; 619 lengthResult = 1; 620 } 621 } else { 622 minLength = length; 623 lengthResult = 0; 624 } 625 626 /* 627 * note that uprv_memcmp() returns an int but we return an int8_t; 628 * we need to take care not to truncate the result - 629 * one way to do this is to right-shift the value to 630 * move the sign bit into the lower 8 bits and making sure that this 631 * does not become 0 itself 632 */ 633 634 if(minLength > 0 && chars != srcChars) { 635 int32_t result; 636 637 # if U_IS_BIG_ENDIAN 638 // big-endian: byte comparison works 639 result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar)); 640 if(result != 0) { 641 return (int8_t)(result >> 15 | 1); 642 } 643 # else 644 // little-endian: compare UChar units 645 do { 646 result = ((int32_t)*(chars++) - (int32_t)*(srcChars++)); 647 if(result != 0) { 648 return (int8_t)(result >> 15 | 1); 649 } 650 } while(--minLength > 0); 651 # endif 652 } 653 return lengthResult; 654 } 655 656 /* String compare in code point order - doCompare() compares in code unit order. */ 657 int8_t 658 UnicodeString::doCompareCodePointOrder(int32_t start, 659 int32_t length, 660 const UChar *srcChars, 661 int32_t srcStart, 662 int32_t srcLength) const 663 { 664 // compare illegal string values 665 // treat const UChar *srcChars==NULL as an empty string 666 if(isBogus()) { 667 return -1; 668 } 669 670 // pin indices to legal values 671 pinIndices(start, length); 672 673 if(srcChars == NULL) { 674 srcStart = srcLength = 0; 675 } 676 677 int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE); 678 /* translate the 32-bit result into an 8-bit one */ 679 if(diff!=0) { 680 return (int8_t)(diff >> 15 | 1); 681 } else { 682 return 0; 683 } 684 } 685 686 int32_t 687 UnicodeString::getLength() const { 688 return length(); 689 } 690 691 UChar 692 UnicodeString::getCharAt(int32_t offset) const { 693 return charAt(offset); 694 } 695 696 UChar32 697 UnicodeString::getChar32At(int32_t offset) const { 698 return char32At(offset); 699 } 700 701 UChar32 702 UnicodeString::char32At(int32_t offset) const 703 { 704 int32_t len = length(); 705 if((uint32_t)offset < (uint32_t)len) { 706 const UChar *array = getArrayStart(); 707 UChar32 c; 708 U16_GET(array, 0, offset, len, c); 709 return c; 710 } else { 711 return kInvalidUChar; 712 } 713 } 714 715 int32_t 716 UnicodeString::getChar32Start(int32_t offset) const { 717 if((uint32_t)offset < (uint32_t)length()) { 718 const UChar *array = getArrayStart(); 719 U16_SET_CP_START(array, 0, offset); 720 return offset; 721 } else { 722 return 0; 723 } 724 } 725 726 int32_t 727 UnicodeString::getChar32Limit(int32_t offset) const { 728 int32_t len = length(); 729 if((uint32_t)offset < (uint32_t)len) { 730 const UChar *array = getArrayStart(); 731 U16_SET_CP_LIMIT(array, 0, offset, len); 732 return offset; 733 } else { 734 return len; 735 } 736 } 737 738 int32_t 739 UnicodeString::countChar32(int32_t start, int32_t length) const { 740 pinIndices(start, length); 741 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL 742 return u_countChar32(getArrayStart()+start, length); 743 } 744 745 UBool 746 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const { 747 pinIndices(start, length); 748 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL 749 return u_strHasMoreChar32Than(getArrayStart()+start, length, number); 750 } 751 752 int32_t 753 UnicodeString::moveIndex32(int32_t index, int32_t delta) const { 754 // pin index 755 int32_t len = length(); 756 if(index<0) { 757 index=0; 758 } else if(index>len) { 759 index=len; 760 } 761 762 const UChar *array = getArrayStart(); 763 if(delta>0) { 764 U16_FWD_N(array, index, len, delta); 765 } else { 766 U16_BACK_N(array, 0, index, -delta); 767 } 768 769 return index; 770 } 771 772 void 773 UnicodeString::doExtract(int32_t start, 774 int32_t length, 775 UChar *dst, 776 int32_t dstStart) const 777 { 778 // pin indices to legal values 779 pinIndices(start, length); 780 781 // do not copy anything if we alias dst itself 782 const UChar *array = getArrayStart(); 783 if(array + start != dst + dstStart) { 784 us_arrayCopy(array, start, dst, dstStart, length); 785 } 786 } 787 788 int32_t 789 UnicodeString::extract(UChar *dest, int32_t destCapacity, 790 UErrorCode &errorCode) const { 791 int32_t len = length(); 792 if(U_SUCCESS(errorCode)) { 793 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { 794 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 795 } else { 796 const UChar *array = getArrayStart(); 797 if(len>0 && len<=destCapacity && array!=dest) { 798 uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR); 799 } 800 return u_terminateUChars(dest, destCapacity, len, &errorCode); 801 } 802 } 803 804 return len; 805 } 806 807 int32_t 808 UnicodeString::extract(int32_t start, 809 int32_t length, 810 char *target, 811 int32_t targetCapacity, 812 enum EInvariant) const 813 { 814 // if the arguments are illegal, then do nothing 815 if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) { 816 return 0; 817 } 818 819 // pin the indices to legal values 820 pinIndices(start, length); 821 822 if(length <= targetCapacity) { 823 u_UCharsToChars(getArrayStart() + start, target, length); 824 } 825 UErrorCode status = U_ZERO_ERROR; 826 return u_terminateChars(target, targetCapacity, length, &status); 827 } 828 829 UnicodeString 830 UnicodeString::tempSubString(int32_t start, int32_t len) const { 831 pinIndices(start, len); 832 const UChar *array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer 833 if(array==NULL) { 834 array=fUnion.fStackBuffer; // anything not NULL because that would make an empty string 835 len=-2; // bogus result string 836 } 837 return UnicodeString(FALSE, array + start, len); 838 } 839 840 int32_t 841 UnicodeString::toUTF8(int32_t start, int32_t len, 842 char *target, int32_t capacity) const { 843 pinIndices(start, len); 844 int32_t length8; 845 UErrorCode errorCode = U_ZERO_ERROR; 846 u_strToUTF8WithSub(target, capacity, &length8, 847 getBuffer() + start, len, 848 0xFFFD, // Standard substitution character. 849 NULL, // Don't care about number of substitutions. 850 &errorCode); 851 return length8; 852 } 853 854 #if U_CHARSET_IS_UTF8 855 856 int32_t 857 UnicodeString::extract(int32_t start, int32_t len, 858 char *target, uint32_t dstSize) const { 859 // if the arguments are illegal, then do nothing 860 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) { 861 return 0; 862 } 863 return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff); 864 } 865 866 // else see unistr_cnv.cpp 867 #endif 868 869 void 870 UnicodeString::extractBetween(int32_t start, 871 int32_t limit, 872 UnicodeString& target) const { 873 pinIndex(start); 874 pinIndex(limit); 875 doExtract(start, limit - start, target); 876 } 877 878 // When converting from UTF-16 to UTF-8, the result will have at most 3 times 879 // as many bytes as the source has UChars. 880 // The "worst cases" are writing systems like Indic, Thai and CJK with 881 // 3:1 bytes:UChars. 882 void 883 UnicodeString::toUTF8(ByteSink &sink) const { 884 int32_t length16 = length(); 885 if(length16 != 0) { 886 char stackBuffer[1024]; 887 int32_t capacity = (int32_t)sizeof(stackBuffer); 888 UBool utf8IsOwned = FALSE; 889 char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity, 890 3*length16, 891 stackBuffer, capacity, 892 &capacity); 893 int32_t length8 = 0; 894 UErrorCode errorCode = U_ZERO_ERROR; 895 u_strToUTF8WithSub(utf8, capacity, &length8, 896 getBuffer(), length16, 897 0xFFFD, // Standard substitution character. 898 NULL, // Don't care about number of substitutions. 899 &errorCode); 900 if(errorCode == U_BUFFER_OVERFLOW_ERROR) { 901 utf8 = (char *)uprv_malloc(length8); 902 if(utf8 != NULL) { 903 utf8IsOwned = TRUE; 904 errorCode = U_ZERO_ERROR; 905 u_strToUTF8WithSub(utf8, length8, &length8, 906 getBuffer(), length16, 907 0xFFFD, // Standard substitution character. 908 NULL, // Don't care about number of substitutions. 909 &errorCode); 910 } else { 911 errorCode = U_MEMORY_ALLOCATION_ERROR; 912 } 913 } 914 if(U_SUCCESS(errorCode)) { 915 sink.Append(utf8, length8); 916 sink.Flush(); 917 } 918 if(utf8IsOwned) { 919 uprv_free(utf8); 920 } 921 } 922 } 923 924 int32_t 925 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const { 926 int32_t length32=0; 927 if(U_SUCCESS(errorCode)) { 928 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments. 929 u_strToUTF32WithSub(utf32, capacity, &length32, 930 getBuffer(), length(), 931 0xfffd, // Substitution character. 932 NULL, // Don't care about number of substitutions. 933 &errorCode); 934 } 935 return length32; 936 } 937 938 int32_t 939 UnicodeString::indexOf(const UChar *srcChars, 940 int32_t srcStart, 941 int32_t srcLength, 942 int32_t start, 943 int32_t length) const 944 { 945 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) { 946 return -1; 947 } 948 949 // UnicodeString does not find empty substrings 950 if(srcLength < 0 && srcChars[srcStart] == 0) { 951 return -1; 952 } 953 954 // get the indices within bounds 955 pinIndices(start, length); 956 957 // find the first occurrence of the substring 958 const UChar *array = getArrayStart(); 959 const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength); 960 if(match == NULL) { 961 return -1; 962 } else { 963 return (int32_t)(match - array); 964 } 965 } 966 967 int32_t 968 UnicodeString::doIndexOf(UChar c, 969 int32_t start, 970 int32_t length) const 971 { 972 // pin indices 973 pinIndices(start, length); 974 975 // find the first occurrence of c 976 const UChar *array = getArrayStart(); 977 const UChar *match = u_memchr(array + start, c, length); 978 if(match == NULL) { 979 return -1; 980 } else { 981 return (int32_t)(match - array); 982 } 983 } 984 985 int32_t 986 UnicodeString::doIndexOf(UChar32 c, 987 int32_t start, 988 int32_t length) const { 989 // pin indices 990 pinIndices(start, length); 991 992 // find the first occurrence of c 993 const UChar *array = getArrayStart(); 994 const UChar *match = u_memchr32(array + start, c, length); 995 if(match == NULL) { 996 return -1; 997 } else { 998 return (int32_t)(match - array); 999 } 1000 } 1001 1002 int32_t 1003 UnicodeString::lastIndexOf(const UChar *srcChars, 1004 int32_t srcStart, 1005 int32_t srcLength, 1006 int32_t start, 1007 int32_t length) const 1008 { 1009 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) { 1010 return -1; 1011 } 1012 1013 // UnicodeString does not find empty substrings 1014 if(srcLength < 0 && srcChars[srcStart] == 0) { 1015 return -1; 1016 } 1017 1018 // get the indices within bounds 1019 pinIndices(start, length); 1020 1021 // find the last occurrence of the substring 1022 const UChar *array = getArrayStart(); 1023 const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength); 1024 if(match == NULL) { 1025 return -1; 1026 } else { 1027 return (int32_t)(match - array); 1028 } 1029 } 1030 1031 int32_t 1032 UnicodeString::doLastIndexOf(UChar c, 1033 int32_t start, 1034 int32_t length) const 1035 { 1036 if(isBogus()) { 1037 return -1; 1038 } 1039 1040 // pin indices 1041 pinIndices(start, length); 1042 1043 // find the last occurrence of c 1044 const UChar *array = getArrayStart(); 1045 const UChar *match = u_memrchr(array + start, c, length); 1046 if(match == NULL) { 1047 return -1; 1048 } else { 1049 return (int32_t)(match - array); 1050 } 1051 } 1052 1053 int32_t 1054 UnicodeString::doLastIndexOf(UChar32 c, 1055 int32_t start, 1056 int32_t length) const { 1057 // pin indices 1058 pinIndices(start, length); 1059 1060 // find the last occurrence of c 1061 const UChar *array = getArrayStart(); 1062 const UChar *match = u_memrchr32(array + start, c, length); 1063 if(match == NULL) { 1064 return -1; 1065 } else { 1066 return (int32_t)(match - array); 1067 } 1068 } 1069 1070 //======================================== 1071 // Write implementation 1072 //======================================== 1073 1074 UnicodeString& 1075 UnicodeString::findAndReplace(int32_t start, 1076 int32_t length, 1077 const UnicodeString& oldText, 1078 int32_t oldStart, 1079 int32_t oldLength, 1080 const UnicodeString& newText, 1081 int32_t newStart, 1082 int32_t newLength) 1083 { 1084 if(isBogus() || oldText.isBogus() || newText.isBogus()) { 1085 return *this; 1086 } 1087 1088 pinIndices(start, length); 1089 oldText.pinIndices(oldStart, oldLength); 1090 newText.pinIndices(newStart, newLength); 1091 1092 if(oldLength == 0) { 1093 return *this; 1094 } 1095 1096 while(length > 0 && length >= oldLength) { 1097 int32_t pos = indexOf(oldText, oldStart, oldLength, start, length); 1098 if(pos < 0) { 1099 // no more oldText's here: done 1100 break; 1101 } else { 1102 // we found oldText, replace it by newText and go beyond it 1103 replace(pos, oldLength, newText, newStart, newLength); 1104 length -= pos + oldLength - start; 1105 start = pos + newLength; 1106 } 1107 } 1108 1109 return *this; 1110 } 1111 1112 1113 void 1114 UnicodeString::setToBogus() 1115 { 1116 releaseArray(); 1117 1118 fShortLength = 0; 1119 fUnion.fFields.fArray = 0; 1120 fUnion.fFields.fCapacity = 0; 1121 fFlags = kIsBogus; 1122 } 1123 1124 // turn a bogus string into an empty one 1125 void 1126 UnicodeString::unBogus() { 1127 if(fFlags & kIsBogus) { 1128 setToEmpty(); 1129 } 1130 } 1131 1132 // setTo() analogous to the readonly-aliasing constructor with the same signature 1133 UnicodeString & 1134 UnicodeString::setTo(UBool isTerminated, 1135 const UChar *text, 1136 int32_t textLength) 1137 { 1138 if(fFlags & kOpenGetBuffer) { 1139 // do not modify a string that has an "open" getBuffer(minCapacity) 1140 return *this; 1141 } 1142 1143 if(text == NULL) { 1144 // treat as an empty string, do not alias 1145 releaseArray(); 1146 setToEmpty(); 1147 return *this; 1148 } 1149 1150 if( textLength < -1 || 1151 (textLength == -1 && !isTerminated) || 1152 (textLength >= 0 && isTerminated && text[textLength] != 0) 1153 ) { 1154 setToBogus(); 1155 return *this; 1156 } 1157 1158 releaseArray(); 1159 1160 if(textLength == -1) { 1161 // text is terminated, or else it would have failed the above test 1162 textLength = u_strlen(text); 1163 } 1164 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength); 1165 1166 fFlags = kReadonlyAlias; 1167 return *this; 1168 } 1169 1170 // setTo() analogous to the writable-aliasing constructor with the same signature 1171 UnicodeString & 1172 UnicodeString::setTo(UChar *buffer, 1173 int32_t buffLength, 1174 int32_t buffCapacity) { 1175 if(fFlags & kOpenGetBuffer) { 1176 // do not modify a string that has an "open" getBuffer(minCapacity) 1177 return *this; 1178 } 1179 1180 if(buffer == NULL) { 1181 // treat as an empty string, do not alias 1182 releaseArray(); 1183 setToEmpty(); 1184 return *this; 1185 } 1186 1187 if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) { 1188 setToBogus(); 1189 return *this; 1190 } else if(buffLength == -1) { 1191 // buffLength = u_strlen(buff); but do not look beyond buffCapacity 1192 const UChar *p = buffer, *limit = buffer + buffCapacity; 1193 while(p != limit && *p != 0) { 1194 ++p; 1195 } 1196 buffLength = (int32_t)(p - buffer); 1197 } 1198 1199 releaseArray(); 1200 1201 setArray(buffer, buffLength, buffCapacity); 1202 fFlags = kWritableAlias; 1203 return *this; 1204 } 1205 1206 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) { 1207 unBogus(); 1208 int32_t length = utf8.length(); 1209 int32_t capacity; 1210 // The UTF-16 string will be at most as long as the UTF-8 string. 1211 if(length <= US_STACKBUF_SIZE) { 1212 capacity = US_STACKBUF_SIZE; 1213 } else { 1214 capacity = length + 1; // +1 for the terminating NUL. 1215 } 1216 UChar *utf16 = getBuffer(capacity); 1217 int32_t length16; 1218 UErrorCode errorCode = U_ZERO_ERROR; 1219 u_strFromUTF8WithSub(utf16, getCapacity(), &length16, 1220 utf8.data(), length, 1221 0xfffd, // Substitution character. 1222 NULL, // Don't care about number of substitutions. 1223 &errorCode); 1224 releaseBuffer(length16); 1225 if(U_FAILURE(errorCode)) { 1226 setToBogus(); 1227 } 1228 return *this; 1229 } 1230 1231 UnicodeString& 1232 UnicodeString::setCharAt(int32_t offset, 1233 UChar c) 1234 { 1235 int32_t len = length(); 1236 if(cloneArrayIfNeeded() && len > 0) { 1237 if(offset < 0) { 1238 offset = 0; 1239 } else if(offset >= len) { 1240 offset = len - 1; 1241 } 1242 1243 getArrayStart()[offset] = c; 1244 } 1245 return *this; 1246 } 1247 1248 UnicodeString& 1249 UnicodeString::replace(int32_t start, 1250 int32_t _length, 1251 UChar32 srcChar) { 1252 UChar buffer[U16_MAX_LENGTH]; 1253 int32_t count = 0; 1254 UBool isError = FALSE; 1255 U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError); 1256 // We test isError so that the compiler does not complain that we don't. 1257 // If isError (srcChar is not a valid code point) then count==0 which means 1258 // we remove the source segment rather than replacing it with srcChar. 1259 return doReplace(start, _length, buffer, 0, isError ? 0 : count); 1260 } 1261 1262 UnicodeString& 1263 UnicodeString::append(UChar32 srcChar) { 1264 UChar buffer[U16_MAX_LENGTH]; 1265 int32_t _length = 0; 1266 UBool isError = FALSE; 1267 U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError); 1268 // We test isError so that the compiler does not complain that we don't. 1269 // If isError then _length==0 which turns the doReplace() into a no-op anyway. 1270 return isError ? *this : doReplace(length(), 0, buffer, 0, _length); 1271 } 1272 1273 UnicodeString& 1274 UnicodeString::doReplace( int32_t start, 1275 int32_t length, 1276 const UnicodeString& src, 1277 int32_t srcStart, 1278 int32_t srcLength) 1279 { 1280 if(!src.isBogus()) { 1281 // pin the indices to legal values 1282 src.pinIndices(srcStart, srcLength); 1283 1284 // get the characters from src 1285 // and replace the range in ourselves with them 1286 return doReplace(start, length, src.getArrayStart(), srcStart, srcLength); 1287 } else { 1288 // remove the range 1289 return doReplace(start, length, 0, 0, 0); 1290 } 1291 } 1292 1293 UnicodeString& 1294 UnicodeString::doReplace(int32_t start, 1295 int32_t length, 1296 const UChar *srcChars, 1297 int32_t srcStart, 1298 int32_t srcLength) 1299 { 1300 if(!isWritable()) { 1301 return *this; 1302 } 1303 1304 int32_t oldLength = this->length(); 1305 1306 // optimize (read-only alias).remove(0, start) and .remove(start, end) 1307 if((fFlags&kBufferIsReadonly) && srcLength == 0) { 1308 if(start == 0) { 1309 // remove prefix by adjusting the array pointer 1310 pinIndex(length); 1311 fUnion.fFields.fArray += length; 1312 fUnion.fFields.fCapacity -= length; 1313 setLength(oldLength - length); 1314 return *this; 1315 } else { 1316 pinIndex(start); 1317 if(length >= (oldLength - start)) { 1318 // remove suffix by reducing the length (like truncate()) 1319 setLength(start); 1320 fUnion.fFields.fCapacity = start; // not NUL-terminated any more 1321 return *this; 1322 } 1323 } 1324 } 1325 1326 if(srcChars == 0) { 1327 srcStart = srcLength = 0; 1328 } else if(srcLength < 0) { 1329 // get the srcLength if necessary 1330 srcLength = u_strlen(srcChars + srcStart); 1331 } 1332 1333 // calculate the size of the string after the replace 1334 int32_t newLength; 1335 1336 // optimize append() onto a large-enough, owned string 1337 if(start >= oldLength) { 1338 if(srcLength == 0) { 1339 return *this; 1340 } 1341 newLength = oldLength + srcLength; 1342 if(newLength <= getCapacity() && isBufferWritable()) { 1343 UChar *oldArray = getArrayStart(); 1344 // Do not copy characters when 1345 // UChar *buffer=str.getAppendBuffer(...); 1346 // is followed by 1347 // str.append(buffer, length); 1348 // or 1349 // str.appendString(buffer, length) 1350 // or similar. 1351 if(srcChars + srcStart != oldArray + start || start > oldLength) { 1352 us_arrayCopy(srcChars, srcStart, oldArray, oldLength, srcLength); 1353 } 1354 setLength(newLength); 1355 return *this; 1356 } else { 1357 // pin the indices to legal values 1358 start = oldLength; 1359 length = 0; 1360 } 1361 } else { 1362 // pin the indices to legal values 1363 pinIndices(start, length); 1364 1365 newLength = oldLength - length + srcLength; 1366 } 1367 1368 // the following may change fArray but will not copy the current contents; 1369 // therefore we need to keep the current fArray 1370 UChar oldStackBuffer[US_STACKBUF_SIZE]; 1371 UChar *oldArray; 1372 if((fFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) { 1373 // copy the stack buffer contents because it will be overwritten with 1374 // fUnion.fFields values 1375 u_memcpy(oldStackBuffer, fUnion.fStackBuffer, oldLength); 1376 oldArray = oldStackBuffer; 1377 } else { 1378 oldArray = getArrayStart(); 1379 } 1380 1381 // clone our array and allocate a bigger array if needed 1382 int32_t *bufferToDelete = 0; 1383 if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize, 1384 FALSE, &bufferToDelete) 1385 ) { 1386 return *this; 1387 } 1388 1389 // now do the replace 1390 1391 UChar *newArray = getArrayStart(); 1392 if(newArray != oldArray) { 1393 // if fArray changed, then we need to copy everything except what will change 1394 us_arrayCopy(oldArray, 0, newArray, 0, start); 1395 us_arrayCopy(oldArray, start + length, 1396 newArray, start + srcLength, 1397 oldLength - (start + length)); 1398 } else if(length != srcLength) { 1399 // fArray did not change; copy only the portion that isn't changing, leaving a hole 1400 us_arrayCopy(oldArray, start + length, 1401 newArray, start + srcLength, 1402 oldLength - (start + length)); 1403 } 1404 1405 // now fill in the hole with the new string 1406 us_arrayCopy(srcChars, srcStart, newArray, start, srcLength); 1407 1408 setLength(newLength); 1409 1410 // delayed delete in case srcChars == fArray when we started, and 1411 // to keep oldArray alive for the above operations 1412 if (bufferToDelete) { 1413 uprv_free(bufferToDelete); 1414 } 1415 1416 return *this; 1417 } 1418 1419 /** 1420 * Replaceable API 1421 */ 1422 void 1423 UnicodeString::handleReplaceBetween(int32_t start, 1424 int32_t limit, 1425 const UnicodeString& text) { 1426 replaceBetween(start, limit, text); 1427 } 1428 1429 /** 1430 * Replaceable API 1431 */ 1432 void 1433 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) { 1434 if (limit <= start) { 1435 return; // Nothing to do; avoid bogus malloc call 1436 } 1437 UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) ); 1438 // Check to make sure text is not null. 1439 if (text != NULL) { 1440 extractBetween(start, limit, text, 0); 1441 insert(dest, text, 0, limit - start); 1442 uprv_free(text); 1443 } 1444 } 1445 1446 /** 1447 * Replaceable API 1448 * 1449 * NOTE: This is for the Replaceable class. There is no rep.cpp, 1450 * so we implement this function here. 1451 */ 1452 UBool Replaceable::hasMetaData() const { 1453 return TRUE; 1454 } 1455 1456 /** 1457 * Replaceable API 1458 */ 1459 UBool UnicodeString::hasMetaData() const { 1460 return FALSE; 1461 } 1462 1463 UnicodeString& 1464 UnicodeString::doReverse(int32_t start, int32_t length) { 1465 if(length <= 1 || !cloneArrayIfNeeded()) { 1466 return *this; 1467 } 1468 1469 // pin the indices to legal values 1470 pinIndices(start, length); 1471 if(length <= 1) { // pinIndices() might have shrunk the length 1472 return *this; 1473 } 1474 1475 UChar *left = getArrayStart() + start; 1476 UChar *right = left + length - 1; // -1 for inclusive boundary (length>=2) 1477 UChar swap; 1478 UBool hasSupplementary = FALSE; 1479 1480 // Before the loop we know left<right because length>=2. 1481 do { 1482 hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left); 1483 hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right); 1484 *right-- = swap; 1485 } while(left < right); 1486 // Make sure to test the middle code unit of an odd-length string. 1487 // Redundant if the length is even. 1488 hasSupplementary |= (UBool)U16_IS_LEAD(*left); 1489 1490 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */ 1491 if(hasSupplementary) { 1492 UChar swap2; 1493 1494 left = getArrayStart() + start; 1495 right = left + length - 1; // -1 so that we can look at *(left+1) if left<right 1496 while(left < right) { 1497 if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) { 1498 *left++ = swap2; 1499 *left++ = swap; 1500 } else { 1501 ++left; 1502 } 1503 } 1504 } 1505 1506 return *this; 1507 } 1508 1509 UBool 1510 UnicodeString::padLeading(int32_t targetLength, 1511 UChar padChar) 1512 { 1513 int32_t oldLength = length(); 1514 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { 1515 return FALSE; 1516 } else { 1517 // move contents up by padding width 1518 UChar *array = getArrayStart(); 1519 int32_t start = targetLength - oldLength; 1520 us_arrayCopy(array, 0, array, start, oldLength); 1521 1522 // fill in padding character 1523 while(--start >= 0) { 1524 array[start] = padChar; 1525 } 1526 setLength(targetLength); 1527 return TRUE; 1528 } 1529 } 1530 1531 UBool 1532 UnicodeString::padTrailing(int32_t targetLength, 1533 UChar padChar) 1534 { 1535 int32_t oldLength = length(); 1536 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { 1537 return FALSE; 1538 } else { 1539 // fill in padding character 1540 UChar *array = getArrayStart(); 1541 int32_t length = targetLength; 1542 while(--length >= oldLength) { 1543 array[length] = padChar; 1544 } 1545 setLength(targetLength); 1546 return TRUE; 1547 } 1548 } 1549 1550 //======================================== 1551 // Hashing 1552 //======================================== 1553 int32_t 1554 UnicodeString::doHashCode() const 1555 { 1556 /* Delegate hash computation to uhash. This makes UnicodeString 1557 * hashing consistent with UChar* hashing. */ 1558 int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length()); 1559 if (hashCode == kInvalidHashCode) { 1560 hashCode = kEmptyHashCode; 1561 } 1562 return hashCode; 1563 } 1564 1565 //======================================== 1566 // External Buffer 1567 //======================================== 1568 1569 UChar * 1570 UnicodeString::getBuffer(int32_t minCapacity) { 1571 if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) { 1572 fFlags|=kOpenGetBuffer; 1573 fShortLength=0; 1574 return getArrayStart(); 1575 } else { 1576 return 0; 1577 } 1578 } 1579 1580 void 1581 UnicodeString::releaseBuffer(int32_t newLength) { 1582 if(fFlags&kOpenGetBuffer && newLength>=-1) { 1583 // set the new fLength 1584 int32_t capacity=getCapacity(); 1585 if(newLength==-1) { 1586 // the new length is the string length, capped by fCapacity 1587 const UChar *array=getArrayStart(), *p=array, *limit=array+capacity; 1588 while(p<limit && *p!=0) { 1589 ++p; 1590 } 1591 newLength=(int32_t)(p-array); 1592 } else if(newLength>capacity) { 1593 newLength=capacity; 1594 } 1595 setLength(newLength); 1596 fFlags&=~kOpenGetBuffer; 1597 } 1598 } 1599 1600 //======================================== 1601 // Miscellaneous 1602 //======================================== 1603 UBool 1604 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity, 1605 int32_t growCapacity, 1606 UBool doCopyArray, 1607 int32_t **pBufferToDelete, 1608 UBool forceClone) { 1609 // default parameters need to be static, therefore 1610 // the defaults are -1 to have convenience defaults 1611 if(newCapacity == -1) { 1612 newCapacity = getCapacity(); 1613 } 1614 1615 // while a getBuffer(minCapacity) is "open", 1616 // prevent any modifications of the string by returning FALSE here 1617 // if the string is bogus, then only an assignment or similar can revive it 1618 if(!isWritable()) { 1619 return FALSE; 1620 } 1621 1622 /* 1623 * We need to make a copy of the array if 1624 * the buffer is read-only, or 1625 * the buffer is refCounted (shared), and refCount>1, or 1626 * the buffer is too small. 1627 * Return FALSE if memory could not be allocated. 1628 */ 1629 if(forceClone || 1630 fFlags & kBufferIsReadonly || 1631 (fFlags & kRefCounted && refCount() > 1) || 1632 newCapacity > getCapacity() 1633 ) { 1634 // check growCapacity for default value and use of the stack buffer 1635 if(growCapacity < 0) { 1636 growCapacity = newCapacity; 1637 } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) { 1638 growCapacity = US_STACKBUF_SIZE; 1639 } 1640 1641 // save old values 1642 UChar oldStackBuffer[US_STACKBUF_SIZE]; 1643 UChar *oldArray; 1644 uint8_t flags = fFlags; 1645 1646 if(flags&kUsingStackBuffer) { 1647 U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */ 1648 if(doCopyArray && growCapacity > US_STACKBUF_SIZE) { 1649 // copy the stack buffer contents because it will be overwritten with 1650 // fUnion.fFields values 1651 us_arrayCopy(fUnion.fStackBuffer, 0, oldStackBuffer, 0, fShortLength); 1652 oldArray = oldStackBuffer; 1653 } else { 1654 oldArray = 0; // no need to copy from stack buffer to itself 1655 } 1656 } else { 1657 oldArray = fUnion.fFields.fArray; 1658 U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */ 1659 } 1660 1661 // allocate a new array 1662 if(allocate(growCapacity) || 1663 (newCapacity < growCapacity && allocate(newCapacity)) 1664 ) { 1665 if(doCopyArray && oldArray != 0) { 1666 // copy the contents 1667 // do not copy more than what fits - it may be smaller than before 1668 int32_t minLength = length(); 1669 newCapacity = getCapacity(); 1670 if(newCapacity < minLength) { 1671 minLength = newCapacity; 1672 setLength(minLength); 1673 } 1674 us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength); 1675 } else { 1676 fShortLength = 0; 1677 } 1678 1679 // release the old array 1680 if(flags & kRefCounted) { 1681 // the array is refCounted; decrement and release if 0 1682 int32_t *pRefCount = ((int32_t *)oldArray - 1); 1683 if(umtx_atomic_dec(pRefCount) == 0) { 1684 if(pBufferToDelete == 0) { 1685 uprv_free(pRefCount); 1686 } else { 1687 // the caller requested to delete it himself 1688 *pBufferToDelete = pRefCount; 1689 } 1690 } 1691 } 1692 } else { 1693 // not enough memory for growCapacity and not even for the smaller newCapacity 1694 // reset the old values for setToBogus() to release the array 1695 if(!(flags&kUsingStackBuffer)) { 1696 fUnion.fFields.fArray = oldArray; 1697 } 1698 fFlags = flags; 1699 setToBogus(); 1700 return FALSE; 1701 } 1702 } 1703 return TRUE; 1704 } 1705 1706 // UnicodeStringAppendable ------------------------------------------------- *** 1707 1708 UnicodeStringAppendable::~UnicodeStringAppendable() {} 1709 1710 UBool 1711 UnicodeStringAppendable::appendCodeUnit(UChar c) { 1712 return str.doReplace(str.length(), 0, &c, 0, 1).isWritable(); 1713 } 1714 1715 UBool 1716 UnicodeStringAppendable::appendCodePoint(UChar32 c) { 1717 UChar buffer[U16_MAX_LENGTH]; 1718 int32_t cLength = 0; 1719 UBool isError = FALSE; 1720 U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError); 1721 return !isError && str.doReplace(str.length(), 0, buffer, 0, cLength).isWritable(); 1722 } 1723 1724 UBool 1725 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) { 1726 return str.doReplace(str.length(), 0, s, 0, length).isWritable(); 1727 } 1728 1729 UBool 1730 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) { 1731 return str.cloneArrayIfNeeded(str.length() + appendCapacity); 1732 } 1733 1734 UChar * 1735 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity, 1736 int32_t desiredCapacityHint, 1737 UChar *scratch, int32_t scratchCapacity, 1738 int32_t *resultCapacity) { 1739 if(minCapacity < 1 || scratchCapacity < minCapacity) { 1740 *resultCapacity = 0; 1741 return NULL; 1742 } 1743 int32_t oldLength = str.length(); 1744 if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) { 1745 *resultCapacity = str.getCapacity() - oldLength; 1746 return str.getArrayStart() + oldLength; 1747 } 1748 *resultCapacity = scratchCapacity; 1749 return scratch; 1750 } 1751 1752 U_NAMESPACE_END 1753 1754 U_NAMESPACE_USE 1755 1756 U_CAPI int32_t U_EXPORT2 1757 uhash_hashUnicodeString(const UElement key) { 1758 const UnicodeString *str = (const UnicodeString*) key.pointer; 1759 return (str == NULL) ? 0 : str->hashCode(); 1760 } 1761 1762 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString* 1763 // does not depend on hashtable code. 1764 U_CAPI UBool U_EXPORT2 1765 uhash_compareUnicodeString(const UElement key1, const UElement key2) { 1766 const UnicodeString *str1 = (const UnicodeString*) key1.pointer; 1767 const UnicodeString *str2 = (const UnicodeString*) key2.pointer; 1768 if (str1 == str2) { 1769 return TRUE; 1770 } 1771 if (str1 == NULL || str2 == NULL) { 1772 return FALSE; 1773 } 1774 return *str1 == *str2; 1775 } 1776 1777 #ifdef U_STATIC_IMPLEMENTATION 1778 /* 1779 This should never be called. It is defined here to make sure that the 1780 virtual vector deleting destructor is defined within unistr.cpp. 1781 The vector deleting destructor is already a part of UObject, 1782 but defining it here makes sure that it is included with this object file. 1783 This makes sure that static library dependencies are kept to a minimum. 1784 */ 1785 static void uprv_UnicodeStringDummy(void) { 1786 delete [] (new UnicodeString[2]); 1787 } 1788 #endif 1789