1 /* 2 ****************************************************************************** 3 * Copyright (C) 1999-2012, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ****************************************************************************** 6 * 7 * File unistr.cpp 8 * 9 * Modification History: 10 * 11 * Date Name Description 12 * 09/25/98 stephen Creation. 13 * 04/20/99 stephen Overhauled per 4/16 code review. 14 * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX 15 * 11/18/99 aliu Added handleReplaceBetween() to make inherit from 16 * Replaceable. 17 * 06/25/01 grhoten Removed the dependency on iostream 18 ****************************************************************************** 19 */ 20 21 #include "unicode/utypes.h" 22 #include "unicode/appendable.h" 23 #include "unicode/putil.h" 24 #include "cstring.h" 25 #include "cmemory.h" 26 #include "unicode/ustring.h" 27 #include "unicode/unistr.h" 28 #include "unicode/utf.h" 29 #include "unicode/utf16.h" 30 #include "uelement.h" 31 #include "ustr_imp.h" 32 #include "umutex.h" 33 #include "uassert.h" 34 35 #if 0 36 37 #include <iostream> 38 using namespace std; 39 40 //DEBUGGING 41 void 42 print(const UnicodeString& s, 43 const char *name) 44 { 45 UChar c; 46 cout << name << ":|"; 47 for(int i = 0; i < s.length(); ++i) { 48 c = s[i]; 49 if(c>= 0x007E || c < 0x0020) 50 cout << "[0x" << hex << s[i] << "]"; 51 else 52 cout << (char) s[i]; 53 } 54 cout << '|' << endl; 55 } 56 57 void 58 print(const UChar *s, 59 int32_t len, 60 const char *name) 61 { 62 UChar c; 63 cout << name << ":|"; 64 for(int i = 0; i < len; ++i) { 65 c = s[i]; 66 if(c>= 0x007E || c < 0x0020) 67 cout << "[0x" << hex << s[i] << "]"; 68 else 69 cout << (char) s[i]; 70 } 71 cout << '|' << endl; 72 } 73 // END DEBUGGING 74 #endif 75 76 // Local function definitions for now 77 78 // need to copy areas that may overlap 79 static 80 inline void 81 us_arrayCopy(const UChar *src, int32_t srcStart, 82 UChar *dst, int32_t dstStart, int32_t count) 83 { 84 if(count>0) { 85 uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src))); 86 } 87 } 88 89 // u_unescapeAt() callback to get a UChar from a UnicodeString 90 U_CDECL_BEGIN 91 static UChar U_CALLCONV 92 UnicodeString_charAt(int32_t offset, void *context) { 93 return ((icu::UnicodeString*) context)->charAt(offset); 94 } 95 U_CDECL_END 96 97 U_NAMESPACE_BEGIN 98 99 /* The Replaceable virtual destructor can't be defined in the header 100 due to how AIX works with multiple definitions of virtual functions. 101 */ 102 Replaceable::~Replaceable() {} 103 Replaceable::Replaceable() {} 104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString) 105 106 UnicodeString U_EXPORT2 107 operator+ (const UnicodeString &s1, const UnicodeString &s2) { 108 return 109 UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0). 110 append(s1). 111 append(s2); 112 } 113 114 //======================================== 115 // Reference Counting functions, put at top of file so that optimizing compilers 116 // have a chance to automatically inline. 117 //======================================== 118 119 void 120 UnicodeString::addRef() 121 { umtx_atomic_inc((int32_t *)fUnion.fFields.fArray - 1);} 122 123 int32_t 124 UnicodeString::removeRef() 125 { return umtx_atomic_dec((int32_t *)fUnion.fFields.fArray - 1);} 126 127 int32_t 128 UnicodeString::refCount() const 129 { 130 umtx_lock(NULL); 131 // Note: without the lock to force a memory barrier, we might see a very 132 // stale value on some multi-processor systems. 133 int32_t count = *((int32_t *)fUnion.fFields.fArray - 1); 134 umtx_unlock(NULL); 135 return count; 136 } 137 138 void 139 UnicodeString::releaseArray() { 140 if((fFlags & kRefCounted) && removeRef() == 0) { 141 uprv_free((int32_t *)fUnion.fFields.fArray - 1); 142 } 143 } 144 145 146 147 //======================================== 148 // Constructors 149 //======================================== 150 UnicodeString::UnicodeString() 151 : fShortLength(0), 152 fFlags(kShortString) 153 {} 154 155 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) 156 : fShortLength(0), 157 fFlags(0) 158 { 159 if(count <= 0 || (uint32_t)c > 0x10ffff) { 160 // just allocate and do not do anything else 161 allocate(capacity); 162 } else { 163 // count > 0, allocate and fill the new string with count c's 164 int32_t unitCount = U16_LENGTH(c), length = count * unitCount; 165 if(capacity < length) { 166 capacity = length; 167 } 168 if(allocate(capacity)) { 169 UChar *array = getArrayStart(); 170 int32_t i = 0; 171 172 // fill the new string with c 173 if(unitCount == 1) { 174 // fill with length UChars 175 while(i < length) { 176 array[i++] = (UChar)c; 177 } 178 } else { 179 // get the code units for c 180 UChar units[U16_MAX_LENGTH]; 181 U16_APPEND_UNSAFE(units, i, c); 182 183 // now it must be i==unitCount 184 i = 0; 185 186 // for Unicode, unitCount can only be 1, 2, 3, or 4 187 // 1 is handled above 188 while(i < length) { 189 int32_t unitIdx = 0; 190 while(unitIdx < unitCount) { 191 array[i++]=units[unitIdx++]; 192 } 193 } 194 } 195 } 196 setLength(length); 197 } 198 } 199 200 UnicodeString::UnicodeString(UChar ch) 201 : fShortLength(1), 202 fFlags(kShortString) 203 { 204 fUnion.fStackBuffer[0] = ch; 205 } 206 207 UnicodeString::UnicodeString(UChar32 ch) 208 : fShortLength(0), 209 fFlags(kShortString) 210 { 211 int32_t i = 0; 212 UBool isError = FALSE; 213 U16_APPEND(fUnion.fStackBuffer, i, US_STACKBUF_SIZE, ch, isError); 214 // We test isError so that the compiler does not complain that we don't. 215 // If isError then i==0 which is what we want anyway. 216 if(!isError) { 217 fShortLength = (int8_t)i; 218 } 219 } 220 221 UnicodeString::UnicodeString(const UChar *text) 222 : fShortLength(0), 223 fFlags(kShortString) 224 { 225 doReplace(0, 0, text, 0, -1); 226 } 227 228 UnicodeString::UnicodeString(const UChar *text, 229 int32_t textLength) 230 : fShortLength(0), 231 fFlags(kShortString) 232 { 233 doReplace(0, 0, text, 0, textLength); 234 } 235 236 UnicodeString::UnicodeString(UBool isTerminated, 237 const UChar *text, 238 int32_t textLength) 239 : fShortLength(0), 240 fFlags(kReadonlyAlias) 241 { 242 if(text == NULL) { 243 // treat as an empty string, do not alias 244 setToEmpty(); 245 } else if(textLength < -1 || 246 (textLength == -1 && !isTerminated) || 247 (textLength >= 0 && isTerminated && text[textLength] != 0) 248 ) { 249 setToBogus(); 250 } else { 251 if(textLength == -1) { 252 // text is terminated, or else it would have failed the above test 253 textLength = u_strlen(text); 254 } 255 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength); 256 } 257 } 258 259 UnicodeString::UnicodeString(UChar *buff, 260 int32_t buffLength, 261 int32_t buffCapacity) 262 : fShortLength(0), 263 fFlags(kWritableAlias) 264 { 265 if(buff == NULL) { 266 // treat as an empty string, do not alias 267 setToEmpty(); 268 } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) { 269 setToBogus(); 270 } else { 271 if(buffLength == -1) { 272 // fLength = u_strlen(buff); but do not look beyond buffCapacity 273 const UChar *p = buff, *limit = buff + buffCapacity; 274 while(p != limit && *p != 0) { 275 ++p; 276 } 277 buffLength = (int32_t)(p - buff); 278 } 279 setArray(buff, buffLength, buffCapacity); 280 } 281 } 282 283 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) 284 : fShortLength(0), 285 fFlags(kShortString) 286 { 287 if(src==NULL) { 288 // treat as an empty string 289 } else { 290 if(length<0) { 291 length=(int32_t)uprv_strlen(src); 292 } 293 if(cloneArrayIfNeeded(length, length, FALSE)) { 294 u_charsToUChars(src, getArrayStart(), length); 295 setLength(length); 296 } else { 297 setToBogus(); 298 } 299 } 300 } 301 302 #if U_CHARSET_IS_UTF8 303 304 UnicodeString::UnicodeString(const char *codepageData) 305 : fShortLength(0), 306 fFlags(kShortString) { 307 if(codepageData != 0) { 308 setToUTF8(codepageData); 309 } 310 } 311 312 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) 313 : fShortLength(0), 314 fFlags(kShortString) { 315 // if there's nothing to convert, do nothing 316 if(codepageData == 0 || dataLength == 0 || dataLength < -1) { 317 return; 318 } 319 if(dataLength == -1) { 320 dataLength = (int32_t)uprv_strlen(codepageData); 321 } 322 setToUTF8(StringPiece(codepageData, dataLength)); 323 } 324 325 // else see unistr_cnv.cpp 326 #endif 327 328 UnicodeString::UnicodeString(const UnicodeString& that) 329 : Replaceable(), 330 fShortLength(0), 331 fFlags(kShortString) 332 { 333 copyFrom(that); 334 } 335 336 UnicodeString::UnicodeString(const UnicodeString& that, 337 int32_t srcStart) 338 : Replaceable(), 339 fShortLength(0), 340 fFlags(kShortString) 341 { 342 setTo(that, srcStart); 343 } 344 345 UnicodeString::UnicodeString(const UnicodeString& that, 346 int32_t srcStart, 347 int32_t srcLength) 348 : Replaceable(), 349 fShortLength(0), 350 fFlags(kShortString) 351 { 352 setTo(that, srcStart, srcLength); 353 } 354 355 // Replaceable base class clone() default implementation, does not clone 356 Replaceable * 357 Replaceable::clone() const { 358 return NULL; 359 } 360 361 // UnicodeString overrides clone() with a real implementation 362 Replaceable * 363 UnicodeString::clone() const { 364 return new UnicodeString(*this); 365 } 366 367 //======================================== 368 // array allocation 369 //======================================== 370 371 UBool 372 UnicodeString::allocate(int32_t capacity) { 373 if(capacity <= US_STACKBUF_SIZE) { 374 fFlags = kShortString; 375 } else { 376 // count bytes for the refCounter and the string capacity, and 377 // round up to a multiple of 16; then divide by 4 and allocate int32_t's 378 // to be safely aligned for the refCount 379 // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer() 380 int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2); 381 int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words ); 382 if(array != 0) { 383 // set initial refCount and point behind the refCount 384 *array++ = 1; 385 386 // have fArray point to the first UChar 387 fUnion.fFields.fArray = (UChar *)array; 388 fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR)); 389 fFlags = kLongString; 390 } else { 391 fShortLength = 0; 392 fUnion.fFields.fArray = 0; 393 fUnion.fFields.fCapacity = 0; 394 fFlags = kIsBogus; 395 return FALSE; 396 } 397 } 398 return TRUE; 399 } 400 401 //======================================== 402 // Destructor 403 //======================================== 404 UnicodeString::~UnicodeString() 405 { 406 releaseArray(); 407 } 408 409 //======================================== 410 // Factory methods 411 //======================================== 412 413 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) { 414 UnicodeString result; 415 result.setToUTF8(utf8); 416 return result; 417 } 418 419 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) { 420 UnicodeString result; 421 int32_t capacity; 422 // Most UTF-32 strings will be BMP-only and result in a same-length 423 // UTF-16 string. We overestimate the capacity just slightly, 424 // just in case there are a few supplementary characters. 425 if(length <= US_STACKBUF_SIZE) { 426 capacity = US_STACKBUF_SIZE; 427 } else { 428 capacity = length + (length >> 4) + 4; 429 } 430 do { 431 UChar *utf16 = result.getBuffer(capacity); 432 int32_t length16; 433 UErrorCode errorCode = U_ZERO_ERROR; 434 u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16, 435 utf32, length, 436 0xfffd, // Substitution character. 437 NULL, // Don't care about number of substitutions. 438 &errorCode); 439 result.releaseBuffer(length16); 440 if(errorCode == U_BUFFER_OVERFLOW_ERROR) { 441 capacity = length16 + 1; // +1 for the terminating NUL. 442 continue; 443 } else if(U_FAILURE(errorCode)) { 444 result.setToBogus(); 445 } 446 break; 447 } while(TRUE); 448 return result; 449 } 450 451 //======================================== 452 // Assignment 453 //======================================== 454 455 UnicodeString & 456 UnicodeString::operator=(const UnicodeString &src) { 457 return copyFrom(src); 458 } 459 460 UnicodeString & 461 UnicodeString::fastCopyFrom(const UnicodeString &src) { 462 return copyFrom(src, TRUE); 463 } 464 465 UnicodeString & 466 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) { 467 // if assigning to ourselves, do nothing 468 if(this == 0 || this == &src) { 469 return *this; 470 } 471 472 // is the right side bogus? 473 if(&src == 0 || src.isBogus()) { 474 setToBogus(); 475 return *this; 476 } 477 478 // delete the current contents 479 releaseArray(); 480 481 if(src.isEmpty()) { 482 // empty string - use the stack buffer 483 setToEmpty(); 484 return *this; 485 } 486 487 // we always copy the length 488 int32_t srcLength = src.length(); 489 setLength(srcLength); 490 491 // fLength>0 and not an "open" src.getBuffer(minCapacity) 492 switch(src.fFlags) { 493 case kShortString: 494 // short string using the stack buffer, do the same 495 fFlags = kShortString; 496 uprv_memcpy(fUnion.fStackBuffer, src.fUnion.fStackBuffer, srcLength * U_SIZEOF_UCHAR); 497 break; 498 case kLongString: 499 // src uses a refCounted string buffer, use that buffer with refCount 500 // src is const, use a cast - we don't really change it 501 ((UnicodeString &)src).addRef(); 502 // copy all fields, share the reference-counted buffer 503 fUnion.fFields.fArray = src.fUnion.fFields.fArray; 504 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; 505 fFlags = src.fFlags; 506 break; 507 case kReadonlyAlias: 508 if(fastCopy) { 509 // src is a readonly alias, do the same 510 // -> maintain the readonly alias as such 511 fUnion.fFields.fArray = src.fUnion.fFields.fArray; 512 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; 513 fFlags = src.fFlags; 514 break; 515 } 516 // else if(!fastCopy) fall through to case kWritableAlias 517 // -> allocate a new buffer and copy the contents 518 case kWritableAlias: 519 // src is a writable alias; we make a copy of that instead 520 if(allocate(srcLength)) { 521 uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR); 522 break; 523 } 524 // if there is not enough memory, then fall through to setting to bogus 525 default: 526 // if src is bogus, set ourselves to bogus 527 // do not call setToBogus() here because fArray and fFlags are not consistent here 528 fShortLength = 0; 529 fUnion.fFields.fArray = 0; 530 fUnion.fFields.fCapacity = 0; 531 fFlags = kIsBogus; 532 break; 533 } 534 535 return *this; 536 } 537 538 //======================================== 539 // Miscellaneous operations 540 //======================================== 541 542 UnicodeString UnicodeString::unescape() const { 543 UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity 544 const UChar *array = getBuffer(); 545 int32_t len = length(); 546 int32_t prev = 0; 547 for (int32_t i=0;;) { 548 if (i == len) { 549 result.append(array, prev, len - prev); 550 break; 551 } 552 if (array[i++] == 0x5C /*'\\'*/) { 553 result.append(array, prev, (i - 1) - prev); 554 UChar32 c = unescapeAt(i); // advances i 555 if (c < 0) { 556 result.remove(); // return empty string 557 break; // invalid escape sequence 558 } 559 result.append(c); 560 prev = i; 561 } 562 } 563 return result; 564 } 565 566 UChar32 UnicodeString::unescapeAt(int32_t &offset) const { 567 return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this); 568 } 569 570 //======================================== 571 // Read-only implementation 572 //======================================== 573 UBool 574 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const { 575 // Requires: this & text not bogus and have same lengths. 576 // Byte-wise comparison works for equality regardless of endianness. 577 return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0; 578 } 579 580 int8_t 581 UnicodeString::doCompare( int32_t start, 582 int32_t length, 583 const UChar *srcChars, 584 int32_t srcStart, 585 int32_t srcLength) const 586 { 587 // compare illegal string values 588 if(isBogus()) { 589 return -1; 590 } 591 592 // pin indices to legal values 593 pinIndices(start, length); 594 595 if(srcChars == NULL) { 596 // treat const UChar *srcChars==NULL as an empty string 597 return length == 0 ? 0 : 1; 598 } 599 600 // get the correct pointer 601 const UChar *chars = getArrayStart(); 602 603 chars += start; 604 srcChars += srcStart; 605 606 int32_t minLength; 607 int8_t lengthResult; 608 609 // get the srcLength if necessary 610 if(srcLength < 0) { 611 srcLength = u_strlen(srcChars + srcStart); 612 } 613 614 // are we comparing different lengths? 615 if(length != srcLength) { 616 if(length < srcLength) { 617 minLength = length; 618 lengthResult = -1; 619 } else { 620 minLength = srcLength; 621 lengthResult = 1; 622 } 623 } else { 624 minLength = length; 625 lengthResult = 0; 626 } 627 628 /* 629 * note that uprv_memcmp() returns an int but we return an int8_t; 630 * we need to take care not to truncate the result - 631 * one way to do this is to right-shift the value to 632 * move the sign bit into the lower 8 bits and making sure that this 633 * does not become 0 itself 634 */ 635 636 if(minLength > 0 && chars != srcChars) { 637 int32_t result; 638 639 # if U_IS_BIG_ENDIAN 640 // big-endian: byte comparison works 641 result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar)); 642 if(result != 0) { 643 return (int8_t)(result >> 15 | 1); 644 } 645 # else 646 // little-endian: compare UChar units 647 do { 648 result = ((int32_t)*(chars++) - (int32_t)*(srcChars++)); 649 if(result != 0) { 650 return (int8_t)(result >> 15 | 1); 651 } 652 } while(--minLength > 0); 653 # endif 654 } 655 return lengthResult; 656 } 657 658 /* String compare in code point order - doCompare() compares in code unit order. */ 659 int8_t 660 UnicodeString::doCompareCodePointOrder(int32_t start, 661 int32_t length, 662 const UChar *srcChars, 663 int32_t srcStart, 664 int32_t srcLength) const 665 { 666 // compare illegal string values 667 // treat const UChar *srcChars==NULL as an empty string 668 if(isBogus()) { 669 return -1; 670 } 671 672 // pin indices to legal values 673 pinIndices(start, length); 674 675 if(srcChars == NULL) { 676 srcStart = srcLength = 0; 677 } 678 679 int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE); 680 /* translate the 32-bit result into an 8-bit one */ 681 if(diff!=0) { 682 return (int8_t)(diff >> 15 | 1); 683 } else { 684 return 0; 685 } 686 } 687 688 int32_t 689 UnicodeString::getLength() const { 690 return length(); 691 } 692 693 UChar 694 UnicodeString::getCharAt(int32_t offset) const { 695 return charAt(offset); 696 } 697 698 UChar32 699 UnicodeString::getChar32At(int32_t offset) const { 700 return char32At(offset); 701 } 702 703 UChar32 704 UnicodeString::char32At(int32_t offset) const 705 { 706 int32_t len = length(); 707 if((uint32_t)offset < (uint32_t)len) { 708 const UChar *array = getArrayStart(); 709 UChar32 c; 710 U16_GET(array, 0, offset, len, c); 711 return c; 712 } else { 713 return kInvalidUChar; 714 } 715 } 716 717 int32_t 718 UnicodeString::getChar32Start(int32_t offset) const { 719 if((uint32_t)offset < (uint32_t)length()) { 720 const UChar *array = getArrayStart(); 721 U16_SET_CP_START(array, 0, offset); 722 return offset; 723 } else { 724 return 0; 725 } 726 } 727 728 int32_t 729 UnicodeString::getChar32Limit(int32_t offset) const { 730 int32_t len = length(); 731 if((uint32_t)offset < (uint32_t)len) { 732 const UChar *array = getArrayStart(); 733 U16_SET_CP_LIMIT(array, 0, offset, len); 734 return offset; 735 } else { 736 return len; 737 } 738 } 739 740 int32_t 741 UnicodeString::countChar32(int32_t start, int32_t length) const { 742 pinIndices(start, length); 743 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL 744 return u_countChar32(getArrayStart()+start, length); 745 } 746 747 UBool 748 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const { 749 pinIndices(start, length); 750 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL 751 return u_strHasMoreChar32Than(getArrayStart()+start, length, number); 752 } 753 754 int32_t 755 UnicodeString::moveIndex32(int32_t index, int32_t delta) const { 756 // pin index 757 int32_t len = length(); 758 if(index<0) { 759 index=0; 760 } else if(index>len) { 761 index=len; 762 } 763 764 const UChar *array = getArrayStart(); 765 if(delta>0) { 766 U16_FWD_N(array, index, len, delta); 767 } else { 768 U16_BACK_N(array, 0, index, -delta); 769 } 770 771 return index; 772 } 773 774 void 775 UnicodeString::doExtract(int32_t start, 776 int32_t length, 777 UChar *dst, 778 int32_t dstStart) const 779 { 780 // pin indices to legal values 781 pinIndices(start, length); 782 783 // do not copy anything if we alias dst itself 784 const UChar *array = getArrayStart(); 785 if(array + start != dst + dstStart) { 786 us_arrayCopy(array, start, dst, dstStart, length); 787 } 788 } 789 790 int32_t 791 UnicodeString::extract(UChar *dest, int32_t destCapacity, 792 UErrorCode &errorCode) const { 793 int32_t len = length(); 794 if(U_SUCCESS(errorCode)) { 795 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { 796 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 797 } else { 798 const UChar *array = getArrayStart(); 799 if(len>0 && len<=destCapacity && array!=dest) { 800 uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR); 801 } 802 return u_terminateUChars(dest, destCapacity, len, &errorCode); 803 } 804 } 805 806 return len; 807 } 808 809 int32_t 810 UnicodeString::extract(int32_t start, 811 int32_t length, 812 char *target, 813 int32_t targetCapacity, 814 enum EInvariant) const 815 { 816 // if the arguments are illegal, then do nothing 817 if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) { 818 return 0; 819 } 820 821 // pin the indices to legal values 822 pinIndices(start, length); 823 824 if(length <= targetCapacity) { 825 u_UCharsToChars(getArrayStart() + start, target, length); 826 } 827 UErrorCode status = U_ZERO_ERROR; 828 return u_terminateChars(target, targetCapacity, length, &status); 829 } 830 831 UnicodeString 832 UnicodeString::tempSubString(int32_t start, int32_t len) const { 833 pinIndices(start, len); 834 const UChar *array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer 835 if(array==NULL) { 836 array=fUnion.fStackBuffer; // anything not NULL because that would make an empty string 837 len=-2; // bogus result string 838 } 839 return UnicodeString(FALSE, array + start, len); 840 } 841 842 int32_t 843 UnicodeString::toUTF8(int32_t start, int32_t len, 844 char *target, int32_t capacity) const { 845 pinIndices(start, len); 846 int32_t length8; 847 UErrorCode errorCode = U_ZERO_ERROR; 848 u_strToUTF8WithSub(target, capacity, &length8, 849 getBuffer() + start, len, 850 0xFFFD, // Standard substitution character. 851 NULL, // Don't care about number of substitutions. 852 &errorCode); 853 return length8; 854 } 855 856 #if U_CHARSET_IS_UTF8 857 858 int32_t 859 UnicodeString::extract(int32_t start, int32_t len, 860 char *target, uint32_t dstSize) const { 861 // if the arguments are illegal, then do nothing 862 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) { 863 return 0; 864 } 865 return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff); 866 } 867 868 // else see unistr_cnv.cpp 869 #endif 870 871 void 872 UnicodeString::extractBetween(int32_t start, 873 int32_t limit, 874 UnicodeString& target) const { 875 pinIndex(start); 876 pinIndex(limit); 877 doExtract(start, limit - start, target); 878 } 879 880 // When converting from UTF-16 to UTF-8, the result will have at most 3 times 881 // as many bytes as the source has UChars. 882 // The "worst cases" are writing systems like Indic, Thai and CJK with 883 // 3:1 bytes:UChars. 884 void 885 UnicodeString::toUTF8(ByteSink &sink) const { 886 int32_t length16 = length(); 887 if(length16 != 0) { 888 char stackBuffer[1024]; 889 int32_t capacity = (int32_t)sizeof(stackBuffer); 890 UBool utf8IsOwned = FALSE; 891 char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity, 892 3*length16, 893 stackBuffer, capacity, 894 &capacity); 895 int32_t length8 = 0; 896 UErrorCode errorCode = U_ZERO_ERROR; 897 u_strToUTF8WithSub(utf8, capacity, &length8, 898 getBuffer(), length16, 899 0xFFFD, // Standard substitution character. 900 NULL, // Don't care about number of substitutions. 901 &errorCode); 902 if(errorCode == U_BUFFER_OVERFLOW_ERROR) { 903 utf8 = (char *)uprv_malloc(length8); 904 if(utf8 != NULL) { 905 utf8IsOwned = TRUE; 906 errorCode = U_ZERO_ERROR; 907 u_strToUTF8WithSub(utf8, length8, &length8, 908 getBuffer(), length16, 909 0xFFFD, // Standard substitution character. 910 NULL, // Don't care about number of substitutions. 911 &errorCode); 912 } else { 913 errorCode = U_MEMORY_ALLOCATION_ERROR; 914 } 915 } 916 if(U_SUCCESS(errorCode)) { 917 sink.Append(utf8, length8); 918 sink.Flush(); 919 } 920 if(utf8IsOwned) { 921 uprv_free(utf8); 922 } 923 } 924 } 925 926 int32_t 927 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const { 928 int32_t length32=0; 929 if(U_SUCCESS(errorCode)) { 930 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments. 931 u_strToUTF32WithSub(utf32, capacity, &length32, 932 getBuffer(), length(), 933 0xfffd, // Substitution character. 934 NULL, // Don't care about number of substitutions. 935 &errorCode); 936 } 937 return length32; 938 } 939 940 int32_t 941 UnicodeString::indexOf(const UChar *srcChars, 942 int32_t srcStart, 943 int32_t srcLength, 944 int32_t start, 945 int32_t length) const 946 { 947 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) { 948 return -1; 949 } 950 951 // UnicodeString does not find empty substrings 952 if(srcLength < 0 && srcChars[srcStart] == 0) { 953 return -1; 954 } 955 956 // get the indices within bounds 957 pinIndices(start, length); 958 959 // find the first occurrence of the substring 960 const UChar *array = getArrayStart(); 961 const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength); 962 if(match == NULL) { 963 return -1; 964 } else { 965 return (int32_t)(match - array); 966 } 967 } 968 969 int32_t 970 UnicodeString::doIndexOf(UChar c, 971 int32_t start, 972 int32_t length) const 973 { 974 // pin indices 975 pinIndices(start, length); 976 977 // find the first occurrence of c 978 const UChar *array = getArrayStart(); 979 const UChar *match = u_memchr(array + start, c, length); 980 if(match == NULL) { 981 return -1; 982 } else { 983 return (int32_t)(match - array); 984 } 985 } 986 987 int32_t 988 UnicodeString::doIndexOf(UChar32 c, 989 int32_t start, 990 int32_t length) const { 991 // pin indices 992 pinIndices(start, length); 993 994 // find the first occurrence of c 995 const UChar *array = getArrayStart(); 996 const UChar *match = u_memchr32(array + start, c, length); 997 if(match == NULL) { 998 return -1; 999 } else { 1000 return (int32_t)(match - array); 1001 } 1002 } 1003 1004 int32_t 1005 UnicodeString::lastIndexOf(const UChar *srcChars, 1006 int32_t srcStart, 1007 int32_t srcLength, 1008 int32_t start, 1009 int32_t length) const 1010 { 1011 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) { 1012 return -1; 1013 } 1014 1015 // UnicodeString does not find empty substrings 1016 if(srcLength < 0 && srcChars[srcStart] == 0) { 1017 return -1; 1018 } 1019 1020 // get the indices within bounds 1021 pinIndices(start, length); 1022 1023 // find the last occurrence of the substring 1024 const UChar *array = getArrayStart(); 1025 const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength); 1026 if(match == NULL) { 1027 return -1; 1028 } else { 1029 return (int32_t)(match - array); 1030 } 1031 } 1032 1033 int32_t 1034 UnicodeString::doLastIndexOf(UChar c, 1035 int32_t start, 1036 int32_t length) const 1037 { 1038 if(isBogus()) { 1039 return -1; 1040 } 1041 1042 // pin indices 1043 pinIndices(start, length); 1044 1045 // find the last occurrence of c 1046 const UChar *array = getArrayStart(); 1047 const UChar *match = u_memrchr(array + start, c, length); 1048 if(match == NULL) { 1049 return -1; 1050 } else { 1051 return (int32_t)(match - array); 1052 } 1053 } 1054 1055 int32_t 1056 UnicodeString::doLastIndexOf(UChar32 c, 1057 int32_t start, 1058 int32_t length) const { 1059 // pin indices 1060 pinIndices(start, length); 1061 1062 // find the last occurrence of c 1063 const UChar *array = getArrayStart(); 1064 const UChar *match = u_memrchr32(array + start, c, length); 1065 if(match == NULL) { 1066 return -1; 1067 } else { 1068 return (int32_t)(match - array); 1069 } 1070 } 1071 1072 //======================================== 1073 // Write implementation 1074 //======================================== 1075 1076 UnicodeString& 1077 UnicodeString::findAndReplace(int32_t start, 1078 int32_t length, 1079 const UnicodeString& oldText, 1080 int32_t oldStart, 1081 int32_t oldLength, 1082 const UnicodeString& newText, 1083 int32_t newStart, 1084 int32_t newLength) 1085 { 1086 if(isBogus() || oldText.isBogus() || newText.isBogus()) { 1087 return *this; 1088 } 1089 1090 pinIndices(start, length); 1091 oldText.pinIndices(oldStart, oldLength); 1092 newText.pinIndices(newStart, newLength); 1093 1094 if(oldLength == 0) { 1095 return *this; 1096 } 1097 1098 while(length > 0 && length >= oldLength) { 1099 int32_t pos = indexOf(oldText, oldStart, oldLength, start, length); 1100 if(pos < 0) { 1101 // no more oldText's here: done 1102 break; 1103 } else { 1104 // we found oldText, replace it by newText and go beyond it 1105 replace(pos, oldLength, newText, newStart, newLength); 1106 length -= pos + oldLength - start; 1107 start = pos + newLength; 1108 } 1109 } 1110 1111 return *this; 1112 } 1113 1114 1115 void 1116 UnicodeString::setToBogus() 1117 { 1118 releaseArray(); 1119 1120 fShortLength = 0; 1121 fUnion.fFields.fArray = 0; 1122 fUnion.fFields.fCapacity = 0; 1123 fFlags = kIsBogus; 1124 } 1125 1126 // turn a bogus string into an empty one 1127 void 1128 UnicodeString::unBogus() { 1129 if(fFlags & kIsBogus) { 1130 setToEmpty(); 1131 } 1132 } 1133 1134 // setTo() analogous to the readonly-aliasing constructor with the same signature 1135 UnicodeString & 1136 UnicodeString::setTo(UBool isTerminated, 1137 const UChar *text, 1138 int32_t textLength) 1139 { 1140 if(fFlags & kOpenGetBuffer) { 1141 // do not modify a string that has an "open" getBuffer(minCapacity) 1142 return *this; 1143 } 1144 1145 if(text == NULL) { 1146 // treat as an empty string, do not alias 1147 releaseArray(); 1148 setToEmpty(); 1149 return *this; 1150 } 1151 1152 if( textLength < -1 || 1153 (textLength == -1 && !isTerminated) || 1154 (textLength >= 0 && isTerminated && text[textLength] != 0) 1155 ) { 1156 setToBogus(); 1157 return *this; 1158 } 1159 1160 releaseArray(); 1161 1162 if(textLength == -1) { 1163 // text is terminated, or else it would have failed the above test 1164 textLength = u_strlen(text); 1165 } 1166 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength); 1167 1168 fFlags = kReadonlyAlias; 1169 return *this; 1170 } 1171 1172 // setTo() analogous to the writable-aliasing constructor with the same signature 1173 UnicodeString & 1174 UnicodeString::setTo(UChar *buffer, 1175 int32_t buffLength, 1176 int32_t buffCapacity) { 1177 if(fFlags & kOpenGetBuffer) { 1178 // do not modify a string that has an "open" getBuffer(minCapacity) 1179 return *this; 1180 } 1181 1182 if(buffer == NULL) { 1183 // treat as an empty string, do not alias 1184 releaseArray(); 1185 setToEmpty(); 1186 return *this; 1187 } 1188 1189 if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) { 1190 setToBogus(); 1191 return *this; 1192 } else if(buffLength == -1) { 1193 // buffLength = u_strlen(buff); but do not look beyond buffCapacity 1194 const UChar *p = buffer, *limit = buffer + buffCapacity; 1195 while(p != limit && *p != 0) { 1196 ++p; 1197 } 1198 buffLength = (int32_t)(p - buffer); 1199 } 1200 1201 releaseArray(); 1202 1203 setArray(buffer, buffLength, buffCapacity); 1204 fFlags = kWritableAlias; 1205 return *this; 1206 } 1207 1208 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) { 1209 unBogus(); 1210 int32_t length = utf8.length(); 1211 int32_t capacity; 1212 // The UTF-16 string will be at most as long as the UTF-8 string. 1213 if(length <= US_STACKBUF_SIZE) { 1214 capacity = US_STACKBUF_SIZE; 1215 } else { 1216 capacity = length + 1; // +1 for the terminating NUL. 1217 } 1218 UChar *utf16 = getBuffer(capacity); 1219 int32_t length16; 1220 UErrorCode errorCode = U_ZERO_ERROR; 1221 u_strFromUTF8WithSub(utf16, getCapacity(), &length16, 1222 utf8.data(), length, 1223 0xfffd, // Substitution character. 1224 NULL, // Don't care about number of substitutions. 1225 &errorCode); 1226 releaseBuffer(length16); 1227 if(U_FAILURE(errorCode)) { 1228 setToBogus(); 1229 } 1230 return *this; 1231 } 1232 1233 UnicodeString& 1234 UnicodeString::setCharAt(int32_t offset, 1235 UChar c) 1236 { 1237 int32_t len = length(); 1238 if(cloneArrayIfNeeded() && len > 0) { 1239 if(offset < 0) { 1240 offset = 0; 1241 } else if(offset >= len) { 1242 offset = len - 1; 1243 } 1244 1245 getArrayStart()[offset] = c; 1246 } 1247 return *this; 1248 } 1249 1250 UnicodeString& 1251 UnicodeString::replace(int32_t start, 1252 int32_t _length, 1253 UChar32 srcChar) { 1254 UChar buffer[U16_MAX_LENGTH]; 1255 int32_t count = 0; 1256 UBool isError = FALSE; 1257 U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError); 1258 // We test isError so that the compiler does not complain that we don't. 1259 // If isError (srcChar is not a valid code point) then count==0 which means 1260 // we remove the source segment rather than replacing it with srcChar. 1261 return doReplace(start, _length, buffer, 0, isError ? 0 : count); 1262 } 1263 1264 UnicodeString& 1265 UnicodeString::append(UChar32 srcChar) { 1266 UChar buffer[U16_MAX_LENGTH]; 1267 int32_t _length = 0; 1268 UBool isError = FALSE; 1269 U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError); 1270 // We test isError so that the compiler does not complain that we don't. 1271 // If isError then _length==0 which turns the doReplace() into a no-op anyway. 1272 return isError ? *this : doReplace(length(), 0, buffer, 0, _length); 1273 } 1274 1275 UnicodeString& 1276 UnicodeString::doReplace( int32_t start, 1277 int32_t length, 1278 const UnicodeString& src, 1279 int32_t srcStart, 1280 int32_t srcLength) 1281 { 1282 if(!src.isBogus()) { 1283 // pin the indices to legal values 1284 src.pinIndices(srcStart, srcLength); 1285 1286 // get the characters from src 1287 // and replace the range in ourselves with them 1288 return doReplace(start, length, src.getArrayStart(), srcStart, srcLength); 1289 } else { 1290 // remove the range 1291 return doReplace(start, length, 0, 0, 0); 1292 } 1293 } 1294 1295 UnicodeString& 1296 UnicodeString::doReplace(int32_t start, 1297 int32_t length, 1298 const UChar *srcChars, 1299 int32_t srcStart, 1300 int32_t srcLength) 1301 { 1302 if(!isWritable()) { 1303 return *this; 1304 } 1305 1306 int32_t oldLength = this->length(); 1307 1308 // optimize (read-only alias).remove(0, start) and .remove(start, end) 1309 if((fFlags&kBufferIsReadonly) && srcLength == 0) { 1310 if(start == 0) { 1311 // remove prefix by adjusting the array pointer 1312 pinIndex(length); 1313 fUnion.fFields.fArray += length; 1314 fUnion.fFields.fCapacity -= length; 1315 setLength(oldLength - length); 1316 return *this; 1317 } else { 1318 pinIndex(start); 1319 if(length >= (oldLength - start)) { 1320 // remove suffix by reducing the length (like truncate()) 1321 setLength(start); 1322 fUnion.fFields.fCapacity = start; // not NUL-terminated any more 1323 return *this; 1324 } 1325 } 1326 } 1327 1328 if(srcChars == 0) { 1329 srcStart = srcLength = 0; 1330 } else if(srcLength < 0) { 1331 // get the srcLength if necessary 1332 srcLength = u_strlen(srcChars + srcStart); 1333 } 1334 1335 // calculate the size of the string after the replace 1336 int32_t newLength; 1337 1338 // optimize append() onto a large-enough, owned string 1339 if(start >= oldLength) { 1340 if(srcLength == 0) { 1341 return *this; 1342 } 1343 newLength = oldLength + srcLength; 1344 if(newLength <= getCapacity() && isBufferWritable()) { 1345 UChar *oldArray = getArrayStart(); 1346 // Do not copy characters when 1347 // UChar *buffer=str.getAppendBuffer(...); 1348 // is followed by 1349 // str.append(buffer, length); 1350 // or 1351 // str.appendString(buffer, length) 1352 // or similar. 1353 if(srcChars + srcStart != oldArray + start || start > oldLength) { 1354 us_arrayCopy(srcChars, srcStart, oldArray, oldLength, srcLength); 1355 } 1356 setLength(newLength); 1357 return *this; 1358 } else { 1359 // pin the indices to legal values 1360 start = oldLength; 1361 length = 0; 1362 } 1363 } else { 1364 // pin the indices to legal values 1365 pinIndices(start, length); 1366 1367 newLength = oldLength - length + srcLength; 1368 } 1369 1370 // the following may change fArray but will not copy the current contents; 1371 // therefore we need to keep the current fArray 1372 UChar oldStackBuffer[US_STACKBUF_SIZE]; 1373 UChar *oldArray; 1374 if((fFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) { 1375 // copy the stack buffer contents because it will be overwritten with 1376 // fUnion.fFields values 1377 u_memcpy(oldStackBuffer, fUnion.fStackBuffer, oldLength); 1378 oldArray = oldStackBuffer; 1379 } else { 1380 oldArray = getArrayStart(); 1381 } 1382 1383 // clone our array and allocate a bigger array if needed 1384 int32_t *bufferToDelete = 0; 1385 if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize, 1386 FALSE, &bufferToDelete) 1387 ) { 1388 return *this; 1389 } 1390 1391 // now do the replace 1392 1393 UChar *newArray = getArrayStart(); 1394 if(newArray != oldArray) { 1395 // if fArray changed, then we need to copy everything except what will change 1396 us_arrayCopy(oldArray, 0, newArray, 0, start); 1397 us_arrayCopy(oldArray, start + length, 1398 newArray, start + srcLength, 1399 oldLength - (start + length)); 1400 } else if(length != srcLength) { 1401 // fArray did not change; copy only the portion that isn't changing, leaving a hole 1402 us_arrayCopy(oldArray, start + length, 1403 newArray, start + srcLength, 1404 oldLength - (start + length)); 1405 } 1406 1407 // now fill in the hole with the new string 1408 us_arrayCopy(srcChars, srcStart, newArray, start, srcLength); 1409 1410 setLength(newLength); 1411 1412 // delayed delete in case srcChars == fArray when we started, and 1413 // to keep oldArray alive for the above operations 1414 if (bufferToDelete) { 1415 uprv_free(bufferToDelete); 1416 } 1417 1418 return *this; 1419 } 1420 1421 /** 1422 * Replaceable API 1423 */ 1424 void 1425 UnicodeString::handleReplaceBetween(int32_t start, 1426 int32_t limit, 1427 const UnicodeString& text) { 1428 replaceBetween(start, limit, text); 1429 } 1430 1431 /** 1432 * Replaceable API 1433 */ 1434 void 1435 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) { 1436 if (limit <= start) { 1437 return; // Nothing to do; avoid bogus malloc call 1438 } 1439 UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) ); 1440 // Check to make sure text is not null. 1441 if (text != NULL) { 1442 extractBetween(start, limit, text, 0); 1443 insert(dest, text, 0, limit - start); 1444 uprv_free(text); 1445 } 1446 } 1447 1448 /** 1449 * Replaceable API 1450 * 1451 * NOTE: This is for the Replaceable class. There is no rep.cpp, 1452 * so we implement this function here. 1453 */ 1454 UBool Replaceable::hasMetaData() const { 1455 return TRUE; 1456 } 1457 1458 /** 1459 * Replaceable API 1460 */ 1461 UBool UnicodeString::hasMetaData() const { 1462 return FALSE; 1463 } 1464 1465 UnicodeString& 1466 UnicodeString::doReverse(int32_t start, int32_t length) { 1467 if(length <= 1 || !cloneArrayIfNeeded()) { 1468 return *this; 1469 } 1470 1471 // pin the indices to legal values 1472 pinIndices(start, length); 1473 if(length <= 1) { // pinIndices() might have shrunk the length 1474 return *this; 1475 } 1476 1477 UChar *left = getArrayStart() + start; 1478 UChar *right = left + length - 1; // -1 for inclusive boundary (length>=2) 1479 UChar swap; 1480 UBool hasSupplementary = FALSE; 1481 1482 // Before the loop we know left<right because length>=2. 1483 do { 1484 hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left); 1485 hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right); 1486 *right-- = swap; 1487 } while(left < right); 1488 // Make sure to test the middle code unit of an odd-length string. 1489 // Redundant if the length is even. 1490 hasSupplementary |= (UBool)U16_IS_LEAD(*left); 1491 1492 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */ 1493 if(hasSupplementary) { 1494 UChar swap2; 1495 1496 left = getArrayStart() + start; 1497 right = left + length - 1; // -1 so that we can look at *(left+1) if left<right 1498 while(left < right) { 1499 if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) { 1500 *left++ = swap2; 1501 *left++ = swap; 1502 } else { 1503 ++left; 1504 } 1505 } 1506 } 1507 1508 return *this; 1509 } 1510 1511 UBool 1512 UnicodeString::padLeading(int32_t targetLength, 1513 UChar padChar) 1514 { 1515 int32_t oldLength = length(); 1516 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { 1517 return FALSE; 1518 } else { 1519 // move contents up by padding width 1520 UChar *array = getArrayStart(); 1521 int32_t start = targetLength - oldLength; 1522 us_arrayCopy(array, 0, array, start, oldLength); 1523 1524 // fill in padding character 1525 while(--start >= 0) { 1526 array[start] = padChar; 1527 } 1528 setLength(targetLength); 1529 return TRUE; 1530 } 1531 } 1532 1533 UBool 1534 UnicodeString::padTrailing(int32_t targetLength, 1535 UChar padChar) 1536 { 1537 int32_t oldLength = length(); 1538 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { 1539 return FALSE; 1540 } else { 1541 // fill in padding character 1542 UChar *array = getArrayStart(); 1543 int32_t length = targetLength; 1544 while(--length >= oldLength) { 1545 array[length] = padChar; 1546 } 1547 setLength(targetLength); 1548 return TRUE; 1549 } 1550 } 1551 1552 //======================================== 1553 // Hashing 1554 //======================================== 1555 int32_t 1556 UnicodeString::doHashCode() const 1557 { 1558 /* Delegate hash computation to uhash. This makes UnicodeString 1559 * hashing consistent with UChar* hashing. */ 1560 int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length()); 1561 if (hashCode == kInvalidHashCode) { 1562 hashCode = kEmptyHashCode; 1563 } 1564 return hashCode; 1565 } 1566 1567 //======================================== 1568 // External Buffer 1569 //======================================== 1570 1571 UChar * 1572 UnicodeString::getBuffer(int32_t minCapacity) { 1573 if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) { 1574 fFlags|=kOpenGetBuffer; 1575 fShortLength=0; 1576 return getArrayStart(); 1577 } else { 1578 return 0; 1579 } 1580 } 1581 1582 void 1583 UnicodeString::releaseBuffer(int32_t newLength) { 1584 if(fFlags&kOpenGetBuffer && newLength>=-1) { 1585 // set the new fLength 1586 int32_t capacity=getCapacity(); 1587 if(newLength==-1) { 1588 // the new length is the string length, capped by fCapacity 1589 const UChar *array=getArrayStart(), *p=array, *limit=array+capacity; 1590 while(p<limit && *p!=0) { 1591 ++p; 1592 } 1593 newLength=(int32_t)(p-array); 1594 } else if(newLength>capacity) { 1595 newLength=capacity; 1596 } 1597 setLength(newLength); 1598 fFlags&=~kOpenGetBuffer; 1599 } 1600 } 1601 1602 //======================================== 1603 // Miscellaneous 1604 //======================================== 1605 UBool 1606 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity, 1607 int32_t growCapacity, 1608 UBool doCopyArray, 1609 int32_t **pBufferToDelete, 1610 UBool forceClone) { 1611 // default parameters need to be static, therefore 1612 // the defaults are -1 to have convenience defaults 1613 if(newCapacity == -1) { 1614 newCapacity = getCapacity(); 1615 } 1616 1617 // while a getBuffer(minCapacity) is "open", 1618 // prevent any modifications of the string by returning FALSE here 1619 // if the string is bogus, then only an assignment or similar can revive it 1620 if(!isWritable()) { 1621 return FALSE; 1622 } 1623 1624 /* 1625 * We need to make a copy of the array if 1626 * the buffer is read-only, or 1627 * the buffer is refCounted (shared), and refCount>1, or 1628 * the buffer is too small. 1629 * Return FALSE if memory could not be allocated. 1630 */ 1631 if(forceClone || 1632 fFlags & kBufferIsReadonly || 1633 (fFlags & kRefCounted && refCount() > 1) || 1634 newCapacity > getCapacity() 1635 ) { 1636 // check growCapacity for default value and use of the stack buffer 1637 if(growCapacity < 0) { 1638 growCapacity = newCapacity; 1639 } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) { 1640 growCapacity = US_STACKBUF_SIZE; 1641 } 1642 1643 // save old values 1644 UChar oldStackBuffer[US_STACKBUF_SIZE]; 1645 UChar *oldArray; 1646 uint8_t flags = fFlags; 1647 1648 if(flags&kUsingStackBuffer) { 1649 U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */ 1650 if(doCopyArray && growCapacity > US_STACKBUF_SIZE) { 1651 // copy the stack buffer contents because it will be overwritten with 1652 // fUnion.fFields values 1653 us_arrayCopy(fUnion.fStackBuffer, 0, oldStackBuffer, 0, fShortLength); 1654 oldArray = oldStackBuffer; 1655 } else { 1656 oldArray = 0; // no need to copy from stack buffer to itself 1657 } 1658 } else { 1659 oldArray = fUnion.fFields.fArray; 1660 U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */ 1661 } 1662 1663 // allocate a new array 1664 if(allocate(growCapacity) || 1665 (newCapacity < growCapacity && allocate(newCapacity)) 1666 ) { 1667 if(doCopyArray && oldArray != 0) { 1668 // copy the contents 1669 // do not copy more than what fits - it may be smaller than before 1670 int32_t minLength = length(); 1671 newCapacity = getCapacity(); 1672 if(newCapacity < minLength) { 1673 minLength = newCapacity; 1674 setLength(minLength); 1675 } 1676 us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength); 1677 } else { 1678 fShortLength = 0; 1679 } 1680 1681 // release the old array 1682 if(flags & kRefCounted) { 1683 // the array is refCounted; decrement and release if 0 1684 int32_t *pRefCount = ((int32_t *)oldArray - 1); 1685 if(umtx_atomic_dec(pRefCount) == 0) { 1686 if(pBufferToDelete == 0) { 1687 uprv_free(pRefCount); 1688 } else { 1689 // the caller requested to delete it himself 1690 *pBufferToDelete = pRefCount; 1691 } 1692 } 1693 } 1694 } else { 1695 // not enough memory for growCapacity and not even for the smaller newCapacity 1696 // reset the old values for setToBogus() to release the array 1697 if(!(flags&kUsingStackBuffer)) { 1698 fUnion.fFields.fArray = oldArray; 1699 } 1700 fFlags = flags; 1701 setToBogus(); 1702 return FALSE; 1703 } 1704 } 1705 return TRUE; 1706 } 1707 1708 // UnicodeStringAppendable ------------------------------------------------- *** 1709 1710 UnicodeStringAppendable::~UnicodeStringAppendable() {} 1711 1712 UBool 1713 UnicodeStringAppendable::appendCodeUnit(UChar c) { 1714 return str.doReplace(str.length(), 0, &c, 0, 1).isWritable(); 1715 } 1716 1717 UBool 1718 UnicodeStringAppendable::appendCodePoint(UChar32 c) { 1719 UChar buffer[U16_MAX_LENGTH]; 1720 int32_t cLength = 0; 1721 UBool isError = FALSE; 1722 U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError); 1723 return !isError && str.doReplace(str.length(), 0, buffer, 0, cLength).isWritable(); 1724 } 1725 1726 UBool 1727 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) { 1728 return str.doReplace(str.length(), 0, s, 0, length).isWritable(); 1729 } 1730 1731 UBool 1732 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) { 1733 return str.cloneArrayIfNeeded(str.length() + appendCapacity); 1734 } 1735 1736 UChar * 1737 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity, 1738 int32_t desiredCapacityHint, 1739 UChar *scratch, int32_t scratchCapacity, 1740 int32_t *resultCapacity) { 1741 if(minCapacity < 1 || scratchCapacity < minCapacity) { 1742 *resultCapacity = 0; 1743 return NULL; 1744 } 1745 int32_t oldLength = str.length(); 1746 if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) { 1747 *resultCapacity = str.getCapacity() - oldLength; 1748 return str.getArrayStart() + oldLength; 1749 } 1750 *resultCapacity = scratchCapacity; 1751 return scratch; 1752 } 1753 1754 U_NAMESPACE_END 1755 1756 U_NAMESPACE_USE 1757 1758 U_CAPI int32_t U_EXPORT2 1759 uhash_hashUnicodeString(const UElement key) { 1760 const UnicodeString *str = (const UnicodeString*) key.pointer; 1761 return (str == NULL) ? 0 : str->hashCode(); 1762 } 1763 1764 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString* 1765 // does not depend on hashtable code. 1766 U_CAPI UBool U_EXPORT2 1767 uhash_compareUnicodeString(const UElement key1, const UElement key2) { 1768 const UnicodeString *str1 = (const UnicodeString*) key1.pointer; 1769 const UnicodeString *str2 = (const UnicodeString*) key2.pointer; 1770 if (str1 == str2) { 1771 return TRUE; 1772 } 1773 if (str1 == NULL || str2 == NULL) { 1774 return FALSE; 1775 } 1776 return *str1 == *str2; 1777 } 1778 1779 #ifdef U_STATIC_IMPLEMENTATION 1780 /* 1781 This should never be called. It is defined here to make sure that the 1782 virtual vector deleting destructor is defined within unistr.cpp. 1783 The vector deleting destructor is already a part of UObject, 1784 but defining it here makes sure that it is included with this object file. 1785 This makes sure that static library dependencies are kept to a minimum. 1786 */ 1787 static void uprv_UnicodeStringDummy(void) { 1788 delete [] (new UnicodeString[2]); 1789 } 1790 #endif 1791