Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 * Copyright (C) 1999-2015, International Business Machines Corporation and
      4 * others. All Rights Reserved.
      5 ******************************************************************************
      6 *
      7 * File unistr.cpp
      8 *
      9 * Modification History:
     10 *
     11 *   Date        Name        Description
     12 *   09/25/98    stephen     Creation.
     13 *   04/20/99    stephen     Overhauled per 4/16 code review.
     14 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
     15 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
     16 *                           Replaceable.
     17 *   06/25/01    grhoten     Removed the dependency on iostream
     18 ******************************************************************************
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 #include "unicode/appendable.h"
     23 #include "unicode/putil.h"
     24 #include "cstring.h"
     25 #include "cmemory.h"
     26 #include "unicode/ustring.h"
     27 #include "unicode/unistr.h"
     28 #include "unicode/utf.h"
     29 #include "unicode/utf16.h"
     30 #include "uelement.h"
     31 #include "ustr_imp.h"
     32 #include "umutex.h"
     33 #include "uassert.h"
     34 
     35 #if 0
     36 
     37 #include <iostream>
     38 using namespace std;
     39 
     40 //DEBUGGING
     41 void
     42 print(const UnicodeString& s,
     43       const char *name)
     44 {
     45   UChar c;
     46   cout << name << ":|";
     47   for(int i = 0; i < s.length(); ++i) {
     48     c = s[i];
     49     if(c>= 0x007E || c < 0x0020)
     50       cout << "[0x" << hex << s[i] << "]";
     51     else
     52       cout << (char) s[i];
     53   }
     54   cout << '|' << endl;
     55 }
     56 
     57 void
     58 print(const UChar *s,
     59       int32_t len,
     60       const char *name)
     61 {
     62   UChar c;
     63   cout << name << ":|";
     64   for(int i = 0; i < len; ++i) {
     65     c = s[i];
     66     if(c>= 0x007E || c < 0x0020)
     67       cout << "[0x" << hex << s[i] << "]";
     68     else
     69       cout << (char) s[i];
     70   }
     71   cout << '|' << endl;
     72 }
     73 // END DEBUGGING
     74 #endif
     75 
     76 // Local function definitions for now
     77 
     78 // need to copy areas that may overlap
     79 static
     80 inline void
     81 us_arrayCopy(const UChar *src, int32_t srcStart,
     82          UChar *dst, int32_t dstStart, int32_t count)
     83 {
     84   if(count>0) {
     85     uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
     86   }
     87 }
     88 
     89 // u_unescapeAt() callback to get a UChar from a UnicodeString
     90 U_CDECL_BEGIN
     91 static UChar U_CALLCONV
     92 UnicodeString_charAt(int32_t offset, void *context) {
     93     return ((icu::UnicodeString*) context)->charAt(offset);
     94 }
     95 U_CDECL_END
     96 
     97 U_NAMESPACE_BEGIN
     98 
     99 /* The Replaceable virtual destructor can't be defined in the header
    100    due to how AIX works with multiple definitions of virtual functions.
    101 */
    102 Replaceable::~Replaceable() {}
    103 
    104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
    105 
    106 UnicodeString U_EXPORT2
    107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
    108     return
    109         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
    110             append(s1).
    111                 append(s2);
    112 }
    113 
    114 //========================================
    115 // Reference Counting functions, put at top of file so that optimizing compilers
    116 //                               have a chance to automatically inline.
    117 //========================================
    118 
    119 void
    120 UnicodeString::addRef() {
    121   umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
    122 }
    123 
    124 int32_t
    125 UnicodeString::removeRef() {
    126   return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
    127 }
    128 
    129 int32_t
    130 UnicodeString::refCount() const {
    131   return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
    132 }
    133 
    134 void
    135 UnicodeString::releaseArray() {
    136   if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
    137     uprv_free((int32_t *)fUnion.fFields.fArray - 1);
    138   }
    139 }
    140 
    141 
    142 
    143 //========================================
    144 // Constructors
    145 //========================================
    146 
    147 // The default constructor is inline in unistr.h.
    148 
    149 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
    150   fUnion.fFields.fLengthAndFlags = 0;
    151   if(count <= 0 || (uint32_t)c > 0x10ffff) {
    152     // just allocate and do not do anything else
    153     allocate(capacity);
    154   } else {
    155     // count > 0, allocate and fill the new string with count c's
    156     int32_t unitCount = U16_LENGTH(c), length = count * unitCount;
    157     if(capacity < length) {
    158       capacity = length;
    159     }
    160     if(allocate(capacity)) {
    161       UChar *array = getArrayStart();
    162       int32_t i = 0;
    163 
    164       // fill the new string with c
    165       if(unitCount == 1) {
    166         // fill with length UChars
    167         while(i < length) {
    168           array[i++] = (UChar)c;
    169         }
    170       } else {
    171         // get the code units for c
    172         UChar units[U16_MAX_LENGTH];
    173         U16_APPEND_UNSAFE(units, i, c);
    174 
    175         // now it must be i==unitCount
    176         i = 0;
    177 
    178         // for Unicode, unitCount can only be 1, 2, 3, or 4
    179         // 1 is handled above
    180         while(i < length) {
    181           int32_t unitIdx = 0;
    182           while(unitIdx < unitCount) {
    183             array[i++]=units[unitIdx++];
    184           }
    185         }
    186       }
    187     }
    188     setLength(length);
    189   }
    190 }
    191 
    192 UnicodeString::UnicodeString(UChar ch) {
    193   fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
    194   fUnion.fStackFields.fBuffer[0] = ch;
    195 }
    196 
    197 UnicodeString::UnicodeString(UChar32 ch) {
    198   fUnion.fFields.fLengthAndFlags = kShortString;
    199   int32_t i = 0;
    200   UBool isError = FALSE;
    201   U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
    202   // We test isError so that the compiler does not complain that we don't.
    203   // If isError then i==0 which is what we want anyway.
    204   if(!isError) {
    205     setShortLength(i);
    206   }
    207 }
    208 
    209 UnicodeString::UnicodeString(const UChar *text) {
    210   fUnion.fFields.fLengthAndFlags = kShortString;
    211   doAppend(text, 0, -1);
    212 }
    213 
    214 UnicodeString::UnicodeString(const UChar *text,
    215                              int32_t textLength) {
    216   fUnion.fFields.fLengthAndFlags = kShortString;
    217   doAppend(text, 0, textLength);
    218 }
    219 
    220 UnicodeString::UnicodeString(UBool isTerminated,
    221                              const UChar *text,
    222                              int32_t textLength) {
    223   fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
    224   if(text == NULL) {
    225     // treat as an empty string, do not alias
    226     setToEmpty();
    227   } else if(textLength < -1 ||
    228             (textLength == -1 && !isTerminated) ||
    229             (textLength >= 0 && isTerminated && text[textLength] != 0)
    230   ) {
    231     setToBogus();
    232   } else {
    233     if(textLength == -1) {
    234       // text is terminated, or else it would have failed the above test
    235       textLength = u_strlen(text);
    236     }
    237     setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
    238   }
    239 }
    240 
    241 UnicodeString::UnicodeString(UChar *buff,
    242                              int32_t buffLength,
    243                              int32_t buffCapacity) {
    244   fUnion.fFields.fLengthAndFlags = kWritableAlias;
    245   if(buff == NULL) {
    246     // treat as an empty string, do not alias
    247     setToEmpty();
    248   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
    249     setToBogus();
    250   } else {
    251     if(buffLength == -1) {
    252       // fLength = u_strlen(buff); but do not look beyond buffCapacity
    253       const UChar *p = buff, *limit = buff + buffCapacity;
    254       while(p != limit && *p != 0) {
    255         ++p;
    256       }
    257       buffLength = (int32_t)(p - buff);
    258     }
    259     setArray(buff, buffLength, buffCapacity);
    260   }
    261 }
    262 
    263 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
    264   fUnion.fFields.fLengthAndFlags = kShortString;
    265   if(src==NULL) {
    266     // treat as an empty string
    267   } else {
    268     if(length<0) {
    269       length=(int32_t)uprv_strlen(src);
    270     }
    271     if(cloneArrayIfNeeded(length, length, FALSE)) {
    272       u_charsToUChars(src, getArrayStart(), length);
    273       setLength(length);
    274     } else {
    275       setToBogus();
    276     }
    277   }
    278 }
    279 
    280 #if U_CHARSET_IS_UTF8
    281 
    282 UnicodeString::UnicodeString(const char *codepageData) {
    283   fUnion.fFields.fLengthAndFlags = kShortString;
    284   if(codepageData != 0) {
    285     setToUTF8(codepageData);
    286   }
    287 }
    288 
    289 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
    290   fUnion.fFields.fLengthAndFlags = kShortString;
    291   // if there's nothing to convert, do nothing
    292   if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
    293     return;
    294   }
    295   if(dataLength == -1) {
    296     dataLength = (int32_t)uprv_strlen(codepageData);
    297   }
    298   setToUTF8(StringPiece(codepageData, dataLength));
    299 }
    300 
    301 // else see unistr_cnv.cpp
    302 #endif
    303 
    304 UnicodeString::UnicodeString(const UnicodeString& that) {
    305   fUnion.fFields.fLengthAndFlags = kShortString;
    306   copyFrom(that);
    307 }
    308 
    309 #if U_HAVE_RVALUE_REFERENCES
    310 UnicodeString::UnicodeString(UnicodeString &&src) U_NOEXCEPT {
    311   fUnion.fFields.fLengthAndFlags = kShortString;
    312   moveFrom(src);
    313 }
    314 #endif
    315 
    316 UnicodeString::UnicodeString(const UnicodeString& that,
    317                              int32_t srcStart) {
    318   fUnion.fFields.fLengthAndFlags = kShortString;
    319   setTo(that, srcStart);
    320 }
    321 
    322 UnicodeString::UnicodeString(const UnicodeString& that,
    323                              int32_t srcStart,
    324                              int32_t srcLength) {
    325   fUnion.fFields.fLengthAndFlags = kShortString;
    326   setTo(that, srcStart, srcLength);
    327 }
    328 
    329 // Replaceable base class clone() default implementation, does not clone
    330 Replaceable *
    331 Replaceable::clone() const {
    332   return NULL;
    333 }
    334 
    335 // UnicodeString overrides clone() with a real implementation
    336 Replaceable *
    337 UnicodeString::clone() const {
    338   return new UnicodeString(*this);
    339 }
    340 
    341 //========================================
    342 // array allocation
    343 //========================================
    344 
    345 UBool
    346 UnicodeString::allocate(int32_t capacity) {
    347   if(capacity <= US_STACKBUF_SIZE) {
    348     fUnion.fFields.fLengthAndFlags = kShortString;
    349   } else {
    350     // count bytes for the refCounter and the string capacity, and
    351     // round up to a multiple of 16; then divide by 4 and allocate int32_t's
    352     // to be safely aligned for the refCount
    353     // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
    354     int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
    355     int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
    356     if(array != 0) {
    357       // set initial refCount and point behind the refCount
    358       *array++ = 1;
    359 
    360       // have fArray point to the first UChar
    361       fUnion.fFields.fArray = (UChar *)array;
    362       fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
    363       fUnion.fFields.fLengthAndFlags = kLongString;
    364     } else {
    365       fUnion.fFields.fLengthAndFlags = kIsBogus;
    366       fUnion.fFields.fArray = 0;
    367       fUnion.fFields.fCapacity = 0;
    368       return FALSE;
    369     }
    370   }
    371   return TRUE;
    372 }
    373 
    374 //========================================
    375 // Destructor
    376 //========================================
    377 
    378 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
    379 static u_atomic_int32_t finalLengthCounts[0x400];  // UnicodeString::kMaxShortLength+1
    380 static u_atomic_int32_t beyondCount(0);
    381 
    382 U_CAPI void unistr_printLengths() {
    383   int32_t i;
    384   for(i = 0; i <= 59; ++i) {
    385     printf("%2d,  %9d\n", i, (int32_t)finalLengthCounts[i]);
    386   }
    387   int32_t beyond = beyondCount;
    388   for(; i < UPRV_LENGTHOF(finalLengthCounts); ++i) {
    389     beyond += finalLengthCounts[i];
    390   }
    391   printf(">59, %9d\n", beyond);
    392 }
    393 #endif
    394 
    395 UnicodeString::~UnicodeString()
    396 {
    397 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
    398   // Count lengths of strings at the end of their lifetime.
    399   // Useful for discussion of a desirable stack buffer size.
    400   // Count the contents length, not the optional NUL terminator nor further capacity.
    401   // Ignore open-buffer strings and strings which alias external storage.
    402   if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) {
    403     if(hasShortLength()) {
    404       umtx_atomic_inc(finalLengthCounts + getShortLength());
    405     } else {
    406       umtx_atomic_inc(&beyondCount);
    407     }
    408   }
    409 #endif
    410 
    411   releaseArray();
    412 }
    413 
    414 //========================================
    415 // Factory methods
    416 //========================================
    417 
    418 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
    419   UnicodeString result;
    420   result.setToUTF8(utf8);
    421   return result;
    422 }
    423 
    424 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
    425   UnicodeString result;
    426   int32_t capacity;
    427   // Most UTF-32 strings will be BMP-only and result in a same-length
    428   // UTF-16 string. We overestimate the capacity just slightly,
    429   // just in case there are a few supplementary characters.
    430   if(length <= US_STACKBUF_SIZE) {
    431     capacity = US_STACKBUF_SIZE;
    432   } else {
    433     capacity = length + (length >> 4) + 4;
    434   }
    435   do {
    436     UChar *utf16 = result.getBuffer(capacity);
    437     int32_t length16;
    438     UErrorCode errorCode = U_ZERO_ERROR;
    439     u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
    440         utf32, length,
    441         0xfffd,  // Substitution character.
    442         NULL,    // Don't care about number of substitutions.
    443         &errorCode);
    444     result.releaseBuffer(length16);
    445     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
    446       capacity = length16 + 1;  // +1 for the terminating NUL.
    447       continue;
    448     } else if(U_FAILURE(errorCode)) {
    449       result.setToBogus();
    450     }
    451     break;
    452   } while(TRUE);
    453   return result;
    454 }
    455 
    456 //========================================
    457 // Assignment
    458 //========================================
    459 
    460 UnicodeString &
    461 UnicodeString::operator=(const UnicodeString &src) {
    462   return copyFrom(src);
    463 }
    464 
    465 UnicodeString &
    466 UnicodeString::fastCopyFrom(const UnicodeString &src) {
    467   return copyFrom(src, TRUE);
    468 }
    469 
    470 UnicodeString &
    471 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
    472   // if assigning to ourselves, do nothing
    473   if(this == &src) {
    474     return *this;
    475   }
    476 
    477   // is the right side bogus?
    478   if(src.isBogus()) {
    479     setToBogus();
    480     return *this;
    481   }
    482 
    483   // delete the current contents
    484   releaseArray();
    485 
    486   if(src.isEmpty()) {
    487     // empty string - use the stack buffer
    488     setToEmpty();
    489     return *this;
    490   }
    491 
    492   // fLength>0 and not an "open" src.getBuffer(minCapacity)
    493   fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
    494   switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
    495   case kShortString:
    496     // short string using the stack buffer, do the same
    497     uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
    498                 getShortLength() * U_SIZEOF_UCHAR);
    499     break;
    500   case kLongString:
    501     // src uses a refCounted string buffer, use that buffer with refCount
    502     // src is const, use a cast - we don't actually change it
    503     ((UnicodeString &)src).addRef();
    504     // copy all fields, share the reference-counted buffer
    505     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
    506     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
    507     if(!hasShortLength()) {
    508       fUnion.fFields.fLength = src.fUnion.fFields.fLength;
    509     }
    510     break;
    511   case kReadonlyAlias:
    512     if(fastCopy) {
    513       // src is a readonly alias, do the same
    514       // -> maintain the readonly alias as such
    515       fUnion.fFields.fArray = src.fUnion.fFields.fArray;
    516       fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
    517       if(!hasShortLength()) {
    518         fUnion.fFields.fLength = src.fUnion.fFields.fLength;
    519       }
    520       break;
    521     }
    522     // else if(!fastCopy) fall through to case kWritableAlias
    523     // -> allocate a new buffer and copy the contents
    524   case kWritableAlias: {
    525     // src is a writable alias; we make a copy of that instead
    526     int32_t srcLength = src.length();
    527     if(allocate(srcLength)) {
    528       uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR);
    529       setLength(srcLength);
    530       break;
    531     }
    532     // if there is not enough memory, then fall through to setting to bogus
    533   }
    534   default:
    535     // if src is bogus, set ourselves to bogus
    536     // do not call setToBogus() here because fArray and flags are not consistent here
    537     fUnion.fFields.fLengthAndFlags = kIsBogus;
    538     fUnion.fFields.fArray = 0;
    539     fUnion.fFields.fCapacity = 0;
    540     break;
    541   }
    542 
    543   return *this;
    544 }
    545 
    546 UnicodeString &UnicodeString::moveFrom(UnicodeString &src) U_NOEXCEPT {
    547   // No explicit check for self move assignment, consistent with standard library.
    548   // Self move assignment causes no crash nor leak but might make the object bogus.
    549   releaseArray();
    550   copyFieldsFrom(src, TRUE);
    551   return *this;
    552 }
    553 
    554 // Same as moveFrom() except without memory management.
    555 void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) U_NOEXCEPT {
    556   int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
    557   if(lengthAndFlags & kUsingStackBuffer) {
    558     // Short string using the stack buffer, copy the contents.
    559     // Check for self assignment to prevent "overlap in memcpy" warnings,
    560     // although it should be harmless to copy a buffer to itself exactly.
    561     if(this != &src) {
    562       uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
    563                   getShortLength() * U_SIZEOF_UCHAR);
    564     }
    565   } else {
    566     // In all other cases, copy all fields.
    567     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
    568     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
    569     if(!hasShortLength()) {
    570       fUnion.fFields.fLength = src.fUnion.fFields.fLength;
    571     }
    572     if(setSrcToBogus) {
    573       // Set src to bogus without releasing any memory.
    574       src.fUnion.fFields.fLengthAndFlags = kIsBogus;
    575       src.fUnion.fFields.fArray = NULL;
    576       src.fUnion.fFields.fCapacity = 0;
    577     }
    578   }
    579 }
    580 
    581 void UnicodeString::swap(UnicodeString &other) U_NOEXCEPT {
    582   UnicodeString temp;  // Empty short string: Known not to need releaseArray().
    583   // Copy fields without resetting source values in between.
    584   temp.copyFieldsFrom(*this, FALSE);
    585   this->copyFieldsFrom(other, FALSE);
    586   other.copyFieldsFrom(temp, FALSE);
    587   // Set temp to an empty string so that other's memory is not released twice.
    588   temp.fUnion.fFields.fLengthAndFlags = kShortString;
    589 }
    590 
    591 //========================================
    592 // Miscellaneous operations
    593 //========================================
    594 
    595 UnicodeString UnicodeString::unescape() const {
    596     UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
    597     if (result.isBogus()) {
    598         return result;
    599     }
    600     const UChar *array = getBuffer();
    601     int32_t len = length();
    602     int32_t prev = 0;
    603     for (int32_t i=0;;) {
    604         if (i == len) {
    605             result.append(array, prev, len - prev);
    606             break;
    607         }
    608         if (array[i++] == 0x5C /*'\\'*/) {
    609             result.append(array, prev, (i - 1) - prev);
    610             UChar32 c = unescapeAt(i); // advances i
    611             if (c < 0) {
    612                 result.remove(); // return empty string
    613                 break; // invalid escape sequence
    614             }
    615             result.append(c);
    616             prev = i;
    617         }
    618     }
    619     return result;
    620 }
    621 
    622 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
    623     return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
    624 }
    625 
    626 //========================================
    627 // Read-only implementation
    628 //========================================
    629 UBool
    630 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
    631   // Requires: this & text not bogus and have same lengths.
    632   // Byte-wise comparison works for equality regardless of endianness.
    633   return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
    634 }
    635 
    636 int8_t
    637 UnicodeString::doCompare( int32_t start,
    638               int32_t length,
    639               const UChar *srcChars,
    640               int32_t srcStart,
    641               int32_t srcLength) const
    642 {
    643   // compare illegal string values
    644   if(isBogus()) {
    645     return -1;
    646   }
    647 
    648   // pin indices to legal values
    649   pinIndices(start, length);
    650 
    651   if(srcChars == NULL) {
    652     // treat const UChar *srcChars==NULL as an empty string
    653     return length == 0 ? 0 : 1;
    654   }
    655 
    656   // get the correct pointer
    657   const UChar *chars = getArrayStart();
    658 
    659   chars += start;
    660   srcChars += srcStart;
    661 
    662   int32_t minLength;
    663   int8_t lengthResult;
    664 
    665   // get the srcLength if necessary
    666   if(srcLength < 0) {
    667     srcLength = u_strlen(srcChars + srcStart);
    668   }
    669 
    670   // are we comparing different lengths?
    671   if(length != srcLength) {
    672     if(length < srcLength) {
    673       minLength = length;
    674       lengthResult = -1;
    675     } else {
    676       minLength = srcLength;
    677       lengthResult = 1;
    678     }
    679   } else {
    680     minLength = length;
    681     lengthResult = 0;
    682   }
    683 
    684   /*
    685    * note that uprv_memcmp() returns an int but we return an int8_t;
    686    * we need to take care not to truncate the result -
    687    * one way to do this is to right-shift the value to
    688    * move the sign bit into the lower 8 bits and making sure that this
    689    * does not become 0 itself
    690    */
    691 
    692   if(minLength > 0 && chars != srcChars) {
    693     int32_t result;
    694 
    695 #   if U_IS_BIG_ENDIAN
    696       // big-endian: byte comparison works
    697       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
    698       if(result != 0) {
    699         return (int8_t)(result >> 15 | 1);
    700       }
    701 #   else
    702       // little-endian: compare UChar units
    703       do {
    704         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
    705         if(result != 0) {
    706           return (int8_t)(result >> 15 | 1);
    707         }
    708       } while(--minLength > 0);
    709 #   endif
    710   }
    711   return lengthResult;
    712 }
    713 
    714 /* String compare in code point order - doCompare() compares in code unit order. */
    715 int8_t
    716 UnicodeString::doCompareCodePointOrder(int32_t start,
    717                                        int32_t length,
    718                                        const UChar *srcChars,
    719                                        int32_t srcStart,
    720                                        int32_t srcLength) const
    721 {
    722   // compare illegal string values
    723   // treat const UChar *srcChars==NULL as an empty string
    724   if(isBogus()) {
    725     return -1;
    726   }
    727 
    728   // pin indices to legal values
    729   pinIndices(start, length);
    730 
    731   if(srcChars == NULL) {
    732     srcStart = srcLength = 0;
    733   }
    734 
    735   int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
    736   /* translate the 32-bit result into an 8-bit one */
    737   if(diff!=0) {
    738     return (int8_t)(diff >> 15 | 1);
    739   } else {
    740     return 0;
    741   }
    742 }
    743 
    744 int32_t
    745 UnicodeString::getLength() const {
    746     return length();
    747 }
    748 
    749 UChar
    750 UnicodeString::getCharAt(int32_t offset) const {
    751   return charAt(offset);
    752 }
    753 
    754 UChar32
    755 UnicodeString::getChar32At(int32_t offset) const {
    756   return char32At(offset);
    757 }
    758 
    759 UChar32
    760 UnicodeString::char32At(int32_t offset) const
    761 {
    762   int32_t len = length();
    763   if((uint32_t)offset < (uint32_t)len) {
    764     const UChar *array = getArrayStart();
    765     UChar32 c;
    766     U16_GET(array, 0, offset, len, c);
    767     return c;
    768   } else {
    769     return kInvalidUChar;
    770   }
    771 }
    772 
    773 int32_t
    774 UnicodeString::getChar32Start(int32_t offset) const {
    775   if((uint32_t)offset < (uint32_t)length()) {
    776     const UChar *array = getArrayStart();
    777     U16_SET_CP_START(array, 0, offset);
    778     return offset;
    779   } else {
    780     return 0;
    781   }
    782 }
    783 
    784 int32_t
    785 UnicodeString::getChar32Limit(int32_t offset) const {
    786   int32_t len = length();
    787   if((uint32_t)offset < (uint32_t)len) {
    788     const UChar *array = getArrayStart();
    789     U16_SET_CP_LIMIT(array, 0, offset, len);
    790     return offset;
    791   } else {
    792     return len;
    793   }
    794 }
    795 
    796 int32_t
    797 UnicodeString::countChar32(int32_t start, int32_t length) const {
    798   pinIndices(start, length);
    799   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
    800   return u_countChar32(getArrayStart()+start, length);
    801 }
    802 
    803 UBool
    804 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
    805   pinIndices(start, length);
    806   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
    807   return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
    808 }
    809 
    810 int32_t
    811 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
    812   // pin index
    813   int32_t len = length();
    814   if(index<0) {
    815     index=0;
    816   } else if(index>len) {
    817     index=len;
    818   }
    819 
    820   const UChar *array = getArrayStart();
    821   if(delta>0) {
    822     U16_FWD_N(array, index, len, delta);
    823   } else {
    824     U16_BACK_N(array, 0, index, -delta);
    825   }
    826 
    827   return index;
    828 }
    829 
    830 void
    831 UnicodeString::doExtract(int32_t start,
    832              int32_t length,
    833              UChar *dst,
    834              int32_t dstStart) const
    835 {
    836   // pin indices to legal values
    837   pinIndices(start, length);
    838 
    839   // do not copy anything if we alias dst itself
    840   const UChar *array = getArrayStart();
    841   if(array + start != dst + dstStart) {
    842     us_arrayCopy(array, start, dst, dstStart, length);
    843   }
    844 }
    845 
    846 int32_t
    847 UnicodeString::extract(UChar *dest, int32_t destCapacity,
    848                        UErrorCode &errorCode) const {
    849   int32_t len = length();
    850   if(U_SUCCESS(errorCode)) {
    851     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
    852       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    853     } else {
    854       const UChar *array = getArrayStart();
    855       if(len>0 && len<=destCapacity && array!=dest) {
    856         uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR);
    857       }
    858       return u_terminateUChars(dest, destCapacity, len, &errorCode);
    859     }
    860   }
    861 
    862   return len;
    863 }
    864 
    865 int32_t
    866 UnicodeString::extract(int32_t start,
    867                        int32_t length,
    868                        char *target,
    869                        int32_t targetCapacity,
    870                        enum EInvariant) const
    871 {
    872   // if the arguments are illegal, then do nothing
    873   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
    874     return 0;
    875   }
    876 
    877   // pin the indices to legal values
    878   pinIndices(start, length);
    879 
    880   if(length <= targetCapacity) {
    881     u_UCharsToChars(getArrayStart() + start, target, length);
    882   }
    883   UErrorCode status = U_ZERO_ERROR;
    884   return u_terminateChars(target, targetCapacity, length, &status);
    885 }
    886 
    887 UnicodeString
    888 UnicodeString::tempSubString(int32_t start, int32_t len) const {
    889   pinIndices(start, len);
    890   const UChar *array = getBuffer();  // not getArrayStart() to check kIsBogus & kOpenGetBuffer
    891   if(array==NULL) {
    892     array=fUnion.fStackFields.fBuffer;  // anything not NULL because that would make an empty string
    893     len=-2;  // bogus result string
    894   }
    895   return UnicodeString(FALSE, array + start, len);
    896 }
    897 
    898 int32_t
    899 UnicodeString::toUTF8(int32_t start, int32_t len,
    900                       char *target, int32_t capacity) const {
    901   pinIndices(start, len);
    902   int32_t length8;
    903   UErrorCode errorCode = U_ZERO_ERROR;
    904   u_strToUTF8WithSub(target, capacity, &length8,
    905                      getBuffer() + start, len,
    906                      0xFFFD,  // Standard substitution character.
    907                      NULL,    // Don't care about number of substitutions.
    908                      &errorCode);
    909   return length8;
    910 }
    911 
    912 #if U_CHARSET_IS_UTF8
    913 
    914 int32_t
    915 UnicodeString::extract(int32_t start, int32_t len,
    916                        char *target, uint32_t dstSize) const {
    917   // if the arguments are illegal, then do nothing
    918   if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
    919     return 0;
    920   }
    921   return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
    922 }
    923 
    924 // else see unistr_cnv.cpp
    925 #endif
    926 
    927 void
    928 UnicodeString::extractBetween(int32_t start,
    929                   int32_t limit,
    930                   UnicodeString& target) const {
    931   pinIndex(start);
    932   pinIndex(limit);
    933   doExtract(start, limit - start, target);
    934 }
    935 
    936 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
    937 // as many bytes as the source has UChars.
    938 // The "worst cases" are writing systems like Indic, Thai and CJK with
    939 // 3:1 bytes:UChars.
    940 void
    941 UnicodeString::toUTF8(ByteSink &sink) const {
    942   int32_t length16 = length();
    943   if(length16 != 0) {
    944     char stackBuffer[1024];
    945     int32_t capacity = (int32_t)sizeof(stackBuffer);
    946     UBool utf8IsOwned = FALSE;
    947     char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
    948                                       3*length16,
    949                                       stackBuffer, capacity,
    950                                       &capacity);
    951     int32_t length8 = 0;
    952     UErrorCode errorCode = U_ZERO_ERROR;
    953     u_strToUTF8WithSub(utf8, capacity, &length8,
    954                        getBuffer(), length16,
    955                        0xFFFD,  // Standard substitution character.
    956                        NULL,    // Don't care about number of substitutions.
    957                        &errorCode);
    958     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
    959       utf8 = (char *)uprv_malloc(length8);
    960       if(utf8 != NULL) {
    961         utf8IsOwned = TRUE;
    962         errorCode = U_ZERO_ERROR;
    963         u_strToUTF8WithSub(utf8, length8, &length8,
    964                            getBuffer(), length16,
    965                            0xFFFD,  // Standard substitution character.
    966                            NULL,    // Don't care about number of substitutions.
    967                            &errorCode);
    968       } else {
    969         errorCode = U_MEMORY_ALLOCATION_ERROR;
    970       }
    971     }
    972     if(U_SUCCESS(errorCode)) {
    973       sink.Append(utf8, length8);
    974       sink.Flush();
    975     }
    976     if(utf8IsOwned) {
    977       uprv_free(utf8);
    978     }
    979   }
    980 }
    981 
    982 int32_t
    983 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
    984   int32_t length32=0;
    985   if(U_SUCCESS(errorCode)) {
    986     // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
    987     u_strToUTF32WithSub(utf32, capacity, &length32,
    988         getBuffer(), length(),
    989         0xfffd,  // Substitution character.
    990         NULL,    // Don't care about number of substitutions.
    991         &errorCode);
    992   }
    993   return length32;
    994 }
    995 
    996 int32_t
    997 UnicodeString::indexOf(const UChar *srcChars,
    998                int32_t srcStart,
    999                int32_t srcLength,
   1000                int32_t start,
   1001                int32_t length) const
   1002 {
   1003   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
   1004     return -1;
   1005   }
   1006 
   1007   // UnicodeString does not find empty substrings
   1008   if(srcLength < 0 && srcChars[srcStart] == 0) {
   1009     return -1;
   1010   }
   1011 
   1012   // get the indices within bounds
   1013   pinIndices(start, length);
   1014 
   1015   // find the first occurrence of the substring
   1016   const UChar *array = getArrayStart();
   1017   const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
   1018   if(match == NULL) {
   1019     return -1;
   1020   } else {
   1021     return (int32_t)(match - array);
   1022   }
   1023 }
   1024 
   1025 int32_t
   1026 UnicodeString::doIndexOf(UChar c,
   1027              int32_t start,
   1028              int32_t length) const
   1029 {
   1030   // pin indices
   1031   pinIndices(start, length);
   1032 
   1033   // find the first occurrence of c
   1034   const UChar *array = getArrayStart();
   1035   const UChar *match = u_memchr(array + start, c, length);
   1036   if(match == NULL) {
   1037     return -1;
   1038   } else {
   1039     return (int32_t)(match - array);
   1040   }
   1041 }
   1042 
   1043 int32_t
   1044 UnicodeString::doIndexOf(UChar32 c,
   1045                          int32_t start,
   1046                          int32_t length) const {
   1047   // pin indices
   1048   pinIndices(start, length);
   1049 
   1050   // find the first occurrence of c
   1051   const UChar *array = getArrayStart();
   1052   const UChar *match = u_memchr32(array + start, c, length);
   1053   if(match == NULL) {
   1054     return -1;
   1055   } else {
   1056     return (int32_t)(match - array);
   1057   }
   1058 }
   1059 
   1060 int32_t
   1061 UnicodeString::lastIndexOf(const UChar *srcChars,
   1062                int32_t srcStart,
   1063                int32_t srcLength,
   1064                int32_t start,
   1065                int32_t length) const
   1066 {
   1067   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
   1068     return -1;
   1069   }
   1070 
   1071   // UnicodeString does not find empty substrings
   1072   if(srcLength < 0 && srcChars[srcStart] == 0) {
   1073     return -1;
   1074   }
   1075 
   1076   // get the indices within bounds
   1077   pinIndices(start, length);
   1078 
   1079   // find the last occurrence of the substring
   1080   const UChar *array = getArrayStart();
   1081   const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
   1082   if(match == NULL) {
   1083     return -1;
   1084   } else {
   1085     return (int32_t)(match - array);
   1086   }
   1087 }
   1088 
   1089 int32_t
   1090 UnicodeString::doLastIndexOf(UChar c,
   1091                  int32_t start,
   1092                  int32_t length) const
   1093 {
   1094   if(isBogus()) {
   1095     return -1;
   1096   }
   1097 
   1098   // pin indices
   1099   pinIndices(start, length);
   1100 
   1101   // find the last occurrence of c
   1102   const UChar *array = getArrayStart();
   1103   const UChar *match = u_memrchr(array + start, c, length);
   1104   if(match == NULL) {
   1105     return -1;
   1106   } else {
   1107     return (int32_t)(match - array);
   1108   }
   1109 }
   1110 
   1111 int32_t
   1112 UnicodeString::doLastIndexOf(UChar32 c,
   1113                              int32_t start,
   1114                              int32_t length) const {
   1115   // pin indices
   1116   pinIndices(start, length);
   1117 
   1118   // find the last occurrence of c
   1119   const UChar *array = getArrayStart();
   1120   const UChar *match = u_memrchr32(array + start, c, length);
   1121   if(match == NULL) {
   1122     return -1;
   1123   } else {
   1124     return (int32_t)(match - array);
   1125   }
   1126 }
   1127 
   1128 //========================================
   1129 // Write implementation
   1130 //========================================
   1131 
   1132 UnicodeString&
   1133 UnicodeString::findAndReplace(int32_t start,
   1134                   int32_t length,
   1135                   const UnicodeString& oldText,
   1136                   int32_t oldStart,
   1137                   int32_t oldLength,
   1138                   const UnicodeString& newText,
   1139                   int32_t newStart,
   1140                   int32_t newLength)
   1141 {
   1142   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
   1143     return *this;
   1144   }
   1145 
   1146   pinIndices(start, length);
   1147   oldText.pinIndices(oldStart, oldLength);
   1148   newText.pinIndices(newStart, newLength);
   1149 
   1150   if(oldLength == 0) {
   1151     return *this;
   1152   }
   1153 
   1154   while(length > 0 && length >= oldLength) {
   1155     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
   1156     if(pos < 0) {
   1157       // no more oldText's here: done
   1158       break;
   1159     } else {
   1160       // we found oldText, replace it by newText and go beyond it
   1161       replace(pos, oldLength, newText, newStart, newLength);
   1162       length -= pos + oldLength - start;
   1163       start = pos + newLength;
   1164     }
   1165   }
   1166 
   1167   return *this;
   1168 }
   1169 
   1170 
   1171 void
   1172 UnicodeString::setToBogus()
   1173 {
   1174   releaseArray();
   1175 
   1176   fUnion.fFields.fLengthAndFlags = kIsBogus;
   1177   fUnion.fFields.fArray = 0;
   1178   fUnion.fFields.fCapacity = 0;
   1179 }
   1180 
   1181 // turn a bogus string into an empty one
   1182 void
   1183 UnicodeString::unBogus() {
   1184   if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
   1185     setToEmpty();
   1186   }
   1187 }
   1188 
   1189 const UChar *
   1190 UnicodeString::getTerminatedBuffer() {
   1191   if(!isWritable()) {
   1192     return 0;
   1193   }
   1194   UChar *array = getArrayStart();
   1195   int32_t len = length();
   1196   if(len < getCapacity()) {
   1197     if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
   1198       // If len<capacity on a read-only alias, then array[len] is
   1199       // either the original NUL (if constructed with (TRUE, s, length))
   1200       // or one of the original string contents characters (if later truncated),
   1201       // therefore we can assume that array[len] is initialized memory.
   1202       if(array[len] == 0) {
   1203         return array;
   1204       }
   1205     } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
   1206       // kRefCounted: Do not write the NUL if the buffer is shared.
   1207       // That is mostly safe, except when the length of one copy was modified
   1208       // without copy-on-write, e.g., via truncate(newLength) or remove(void).
   1209       // Then the NUL would be written into the middle of another copy's string.
   1210 
   1211       // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
   1212       // Do not test if there is a NUL already because it might be uninitialized memory.
   1213       // (That would be safe, but tools like valgrind & Purify would complain.)
   1214       array[len] = 0;
   1215       return array;
   1216     }
   1217   }
   1218   if(cloneArrayIfNeeded(len+1)) {
   1219     array = getArrayStart();
   1220     array[len] = 0;
   1221     return array;
   1222   } else {
   1223     return NULL;
   1224   }
   1225 }
   1226 
   1227 // setTo() analogous to the readonly-aliasing constructor with the same signature
   1228 UnicodeString &
   1229 UnicodeString::setTo(UBool isTerminated,
   1230                      const UChar *text,
   1231                      int32_t textLength)
   1232 {
   1233   if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
   1234     // do not modify a string that has an "open" getBuffer(minCapacity)
   1235     return *this;
   1236   }
   1237 
   1238   if(text == NULL) {
   1239     // treat as an empty string, do not alias
   1240     releaseArray();
   1241     setToEmpty();
   1242     return *this;
   1243   }
   1244 
   1245   if( textLength < -1 ||
   1246       (textLength == -1 && !isTerminated) ||
   1247       (textLength >= 0 && isTerminated && text[textLength] != 0)
   1248   ) {
   1249     setToBogus();
   1250     return *this;
   1251   }
   1252 
   1253   releaseArray();
   1254 
   1255   if(textLength == -1) {
   1256     // text is terminated, or else it would have failed the above test
   1257     textLength = u_strlen(text);
   1258   }
   1259   fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
   1260   setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
   1261   return *this;
   1262 }
   1263 
   1264 // setTo() analogous to the writable-aliasing constructor with the same signature
   1265 UnicodeString &
   1266 UnicodeString::setTo(UChar *buffer,
   1267                      int32_t buffLength,
   1268                      int32_t buffCapacity) {
   1269   if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
   1270     // do not modify a string that has an "open" getBuffer(minCapacity)
   1271     return *this;
   1272   }
   1273 
   1274   if(buffer == NULL) {
   1275     // treat as an empty string, do not alias
   1276     releaseArray();
   1277     setToEmpty();
   1278     return *this;
   1279   }
   1280 
   1281   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
   1282     setToBogus();
   1283     return *this;
   1284   } else if(buffLength == -1) {
   1285     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
   1286     const UChar *p = buffer, *limit = buffer + buffCapacity;
   1287     while(p != limit && *p != 0) {
   1288       ++p;
   1289     }
   1290     buffLength = (int32_t)(p - buffer);
   1291   }
   1292 
   1293   releaseArray();
   1294 
   1295   fUnion.fFields.fLengthAndFlags = kWritableAlias;
   1296   setArray(buffer, buffLength, buffCapacity);
   1297   return *this;
   1298 }
   1299 
   1300 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) {
   1301   unBogus();
   1302   int32_t length = utf8.length();
   1303   int32_t capacity;
   1304   // The UTF-16 string will be at most as long as the UTF-8 string.
   1305   if(length <= US_STACKBUF_SIZE) {
   1306     capacity = US_STACKBUF_SIZE;
   1307   } else {
   1308     capacity = length + 1;  // +1 for the terminating NUL.
   1309   }
   1310   UChar *utf16 = getBuffer(capacity);
   1311   int32_t length16;
   1312   UErrorCode errorCode = U_ZERO_ERROR;
   1313   u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
   1314       utf8.data(), length,
   1315       0xfffd,  // Substitution character.
   1316       NULL,    // Don't care about number of substitutions.
   1317       &errorCode);
   1318   releaseBuffer(length16);
   1319   if(U_FAILURE(errorCode)) {
   1320     setToBogus();
   1321   }
   1322   return *this;
   1323 }
   1324 
   1325 UnicodeString&
   1326 UnicodeString::setCharAt(int32_t offset,
   1327              UChar c)
   1328 {
   1329   int32_t len = length();
   1330   if(cloneArrayIfNeeded() && len > 0) {
   1331     if(offset < 0) {
   1332       offset = 0;
   1333     } else if(offset >= len) {
   1334       offset = len - 1;
   1335     }
   1336 
   1337     getArrayStart()[offset] = c;
   1338   }
   1339   return *this;
   1340 }
   1341 
   1342 UnicodeString&
   1343 UnicodeString::replace(int32_t start,
   1344                int32_t _length,
   1345                UChar32 srcChar) {
   1346   UChar buffer[U16_MAX_LENGTH];
   1347   int32_t count = 0;
   1348   UBool isError = FALSE;
   1349   U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
   1350   // We test isError so that the compiler does not complain that we don't.
   1351   // If isError (srcChar is not a valid code point) then count==0 which means
   1352   // we remove the source segment rather than replacing it with srcChar.
   1353   return doReplace(start, _length, buffer, 0, isError ? 0 : count);
   1354 }
   1355 
   1356 UnicodeString&
   1357 UnicodeString::append(UChar32 srcChar) {
   1358   UChar buffer[U16_MAX_LENGTH];
   1359   int32_t _length = 0;
   1360   UBool isError = FALSE;
   1361   U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
   1362   // We test isError so that the compiler does not complain that we don't.
   1363   // If isError then _length==0 which turns the doAppend() into a no-op anyway.
   1364   return isError ? *this : doAppend(buffer, 0, _length);
   1365 }
   1366 
   1367 UnicodeString&
   1368 UnicodeString::doReplace( int32_t start,
   1369               int32_t length,
   1370               const UnicodeString& src,
   1371               int32_t srcStart,
   1372               int32_t srcLength)
   1373 {
   1374   // pin the indices to legal values
   1375   src.pinIndices(srcStart, srcLength);
   1376 
   1377   // get the characters from src
   1378   // and replace the range in ourselves with them
   1379   return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
   1380 }
   1381 
   1382 UnicodeString&
   1383 UnicodeString::doReplace(int32_t start,
   1384              int32_t length,
   1385              const UChar *srcChars,
   1386              int32_t srcStart,
   1387              int32_t srcLength)
   1388 {
   1389   if(!isWritable()) {
   1390     return *this;
   1391   }
   1392 
   1393   int32_t oldLength = this->length();
   1394 
   1395   // optimize (read-only alias).remove(0, start) and .remove(start, end)
   1396   if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
   1397     if(start == 0) {
   1398       // remove prefix by adjusting the array pointer
   1399       pinIndex(length);
   1400       fUnion.fFields.fArray += length;
   1401       fUnion.fFields.fCapacity -= length;
   1402       setLength(oldLength - length);
   1403       return *this;
   1404     } else {
   1405       pinIndex(start);
   1406       if(length >= (oldLength - start)) {
   1407         // remove suffix by reducing the length (like truncate())
   1408         setLength(start);
   1409         fUnion.fFields.fCapacity = start;  // not NUL-terminated any more
   1410         return *this;
   1411       }
   1412     }
   1413   }
   1414 
   1415   if(start == oldLength) {
   1416     return doAppend(srcChars, srcStart, srcLength);
   1417   }
   1418 
   1419   if(srcChars == 0) {
   1420     srcStart = srcLength = 0;
   1421   } else if(srcLength < 0) {
   1422     // get the srcLength if necessary
   1423     srcLength = u_strlen(srcChars + srcStart);
   1424   }
   1425 
   1426   // pin the indices to legal values
   1427   pinIndices(start, length);
   1428 
   1429   // calculate the size of the string after the replace
   1430   int32_t newLength = oldLength - length + srcLength;
   1431 
   1432   // cloneArrayIfNeeded(doCopyArray=FALSE) may change fArray but will not copy the current contents;
   1433   // therefore we need to keep the current fArray
   1434   UChar oldStackBuffer[US_STACKBUF_SIZE];
   1435   UChar *oldArray;
   1436   if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
   1437     // copy the stack buffer contents because it will be overwritten with
   1438     // fUnion.fFields values
   1439     u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength);
   1440     oldArray = oldStackBuffer;
   1441   } else {
   1442     oldArray = getArrayStart();
   1443   }
   1444 
   1445   // clone our array and allocate a bigger array if needed
   1446   int32_t *bufferToDelete = 0;
   1447   if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize,
   1448                          FALSE, &bufferToDelete)
   1449   ) {
   1450     return *this;
   1451   }
   1452 
   1453   // now do the replace
   1454 
   1455   UChar *newArray = getArrayStart();
   1456   if(newArray != oldArray) {
   1457     // if fArray changed, then we need to copy everything except what will change
   1458     us_arrayCopy(oldArray, 0, newArray, 0, start);
   1459     us_arrayCopy(oldArray, start + length,
   1460                  newArray, start + srcLength,
   1461                  oldLength - (start + length));
   1462   } else if(length != srcLength) {
   1463     // fArray did not change; copy only the portion that isn't changing, leaving a hole
   1464     us_arrayCopy(oldArray, start + length,
   1465                  newArray, start + srcLength,
   1466                  oldLength - (start + length));
   1467   }
   1468 
   1469   // now fill in the hole with the new string
   1470   us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
   1471 
   1472   setLength(newLength);
   1473 
   1474   // delayed delete in case srcChars == fArray when we started, and
   1475   // to keep oldArray alive for the above operations
   1476   if (bufferToDelete) {
   1477     uprv_free(bufferToDelete);
   1478   }
   1479 
   1480   return *this;
   1481 }
   1482 
   1483 // Versions of doReplace() only for append() variants.
   1484 // doReplace() and doAppend() optimize for different cases.
   1485 
   1486 UnicodeString&
   1487 UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) {
   1488   if(srcLength == 0) {
   1489     return *this;
   1490   }
   1491 
   1492   // pin the indices to legal values
   1493   src.pinIndices(srcStart, srcLength);
   1494   return doAppend(src.getArrayStart(), srcStart, srcLength);
   1495 }
   1496 
   1497 UnicodeString&
   1498 UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLength) {
   1499   if(!isWritable() || srcLength == 0 || srcChars == NULL) {
   1500     return *this;
   1501   }
   1502 
   1503   if(srcLength < 0) {
   1504     // get the srcLength if necessary
   1505     if((srcLength = u_strlen(srcChars + srcStart)) == 0) {
   1506       return *this;
   1507     }
   1508   }
   1509 
   1510   int32_t oldLength = length();
   1511   int32_t newLength = oldLength + srcLength;
   1512   // optimize append() onto a large-enough, owned string
   1513   if((newLength <= getCapacity() && isBufferWritable()) ||
   1514       cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize)) {
   1515     UChar *newArray = getArrayStart();
   1516     // Do not copy characters when
   1517     //   UChar *buffer=str.getAppendBuffer(...);
   1518     // is followed by
   1519     //   str.append(buffer, length);
   1520     // or
   1521     //   str.appendString(buffer, length)
   1522     // or similar.
   1523     if(srcChars + srcStart != newArray + oldLength) {
   1524       us_arrayCopy(srcChars, srcStart, newArray, oldLength, srcLength);
   1525     }
   1526     setLength(newLength);
   1527   }
   1528   return *this;
   1529 }
   1530 
   1531 /**
   1532  * Replaceable API
   1533  */
   1534 void
   1535 UnicodeString::handleReplaceBetween(int32_t start,
   1536                                     int32_t limit,
   1537                                     const UnicodeString& text) {
   1538     replaceBetween(start, limit, text);
   1539 }
   1540 
   1541 /**
   1542  * Replaceable API
   1543  */
   1544 void
   1545 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
   1546     if (limit <= start) {
   1547         return; // Nothing to do; avoid bogus malloc call
   1548     }
   1549     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
   1550     // Check to make sure text is not null.
   1551     if (text != NULL) {
   1552 	    extractBetween(start, limit, text, 0);
   1553 	    insert(dest, text, 0, limit - start);
   1554 	    uprv_free(text);
   1555     }
   1556 }
   1557 
   1558 /**
   1559  * Replaceable API
   1560  *
   1561  * NOTE: This is for the Replaceable class.  There is no rep.cpp,
   1562  * so we implement this function here.
   1563  */
   1564 UBool Replaceable::hasMetaData() const {
   1565     return TRUE;
   1566 }
   1567 
   1568 /**
   1569  * Replaceable API
   1570  */
   1571 UBool UnicodeString::hasMetaData() const {
   1572     return FALSE;
   1573 }
   1574 
   1575 UnicodeString&
   1576 UnicodeString::doReverse(int32_t start, int32_t length) {
   1577   if(length <= 1 || !cloneArrayIfNeeded()) {
   1578     return *this;
   1579   }
   1580 
   1581   // pin the indices to legal values
   1582   pinIndices(start, length);
   1583   if(length <= 1) {  // pinIndices() might have shrunk the length
   1584     return *this;
   1585   }
   1586 
   1587   UChar *left = getArrayStart() + start;
   1588   UChar *right = left + length - 1;  // -1 for inclusive boundary (length>=2)
   1589   UChar swap;
   1590   UBool hasSupplementary = FALSE;
   1591 
   1592   // Before the loop we know left<right because length>=2.
   1593   do {
   1594     hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
   1595     hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
   1596     *right-- = swap;
   1597   } while(left < right);
   1598   // Make sure to test the middle code unit of an odd-length string.
   1599   // Redundant if the length is even.
   1600   hasSupplementary |= (UBool)U16_IS_LEAD(*left);
   1601 
   1602   /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
   1603   if(hasSupplementary) {
   1604     UChar swap2;
   1605 
   1606     left = getArrayStart() + start;
   1607     right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
   1608     while(left < right) {
   1609       if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
   1610         *left++ = swap2;
   1611         *left++ = swap;
   1612       } else {
   1613         ++left;
   1614       }
   1615     }
   1616   }
   1617 
   1618   return *this;
   1619 }
   1620 
   1621 UBool
   1622 UnicodeString::padLeading(int32_t targetLength,
   1623                           UChar padChar)
   1624 {
   1625   int32_t oldLength = length();
   1626   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
   1627     return FALSE;
   1628   } else {
   1629     // move contents up by padding width
   1630     UChar *array = getArrayStart();
   1631     int32_t start = targetLength - oldLength;
   1632     us_arrayCopy(array, 0, array, start, oldLength);
   1633 
   1634     // fill in padding character
   1635     while(--start >= 0) {
   1636       array[start] = padChar;
   1637     }
   1638     setLength(targetLength);
   1639     return TRUE;
   1640   }
   1641 }
   1642 
   1643 UBool
   1644 UnicodeString::padTrailing(int32_t targetLength,
   1645                            UChar padChar)
   1646 {
   1647   int32_t oldLength = length();
   1648   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
   1649     return FALSE;
   1650   } else {
   1651     // fill in padding character
   1652     UChar *array = getArrayStart();
   1653     int32_t length = targetLength;
   1654     while(--length >= oldLength) {
   1655       array[length] = padChar;
   1656     }
   1657     setLength(targetLength);
   1658     return TRUE;
   1659   }
   1660 }
   1661 
   1662 //========================================
   1663 // Hashing
   1664 //========================================
   1665 int32_t
   1666 UnicodeString::doHashCode() const
   1667 {
   1668     /* Delegate hash computation to uhash.  This makes UnicodeString
   1669      * hashing consistent with UChar* hashing.  */
   1670     int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
   1671     if (hashCode == kInvalidHashCode) {
   1672         hashCode = kEmptyHashCode;
   1673     }
   1674     return hashCode;
   1675 }
   1676 
   1677 //========================================
   1678 // External Buffer
   1679 //========================================
   1680 
   1681 UChar *
   1682 UnicodeString::getBuffer(int32_t minCapacity) {
   1683   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
   1684     fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
   1685     setZeroLength();
   1686     return getArrayStart();
   1687   } else {
   1688     return 0;
   1689   }
   1690 }
   1691 
   1692 void
   1693 UnicodeString::releaseBuffer(int32_t newLength) {
   1694   if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
   1695     // set the new fLength
   1696     int32_t capacity=getCapacity();
   1697     if(newLength==-1) {
   1698       // the new length is the string length, capped by fCapacity
   1699       const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
   1700       while(p<limit && *p!=0) {
   1701         ++p;
   1702       }
   1703       newLength=(int32_t)(p-array);
   1704     } else if(newLength>capacity) {
   1705       newLength=capacity;
   1706     }
   1707     setLength(newLength);
   1708     fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
   1709   }
   1710 }
   1711 
   1712 //========================================
   1713 // Miscellaneous
   1714 //========================================
   1715 UBool
   1716 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
   1717                                   int32_t growCapacity,
   1718                                   UBool doCopyArray,
   1719                                   int32_t **pBufferToDelete,
   1720                                   UBool forceClone) {
   1721   // default parameters need to be static, therefore
   1722   // the defaults are -1 to have convenience defaults
   1723   if(newCapacity == -1) {
   1724     newCapacity = getCapacity();
   1725   }
   1726 
   1727   // while a getBuffer(minCapacity) is "open",
   1728   // prevent any modifications of the string by returning FALSE here
   1729   // if the string is bogus, then only an assignment or similar can revive it
   1730   if(!isWritable()) {
   1731     return FALSE;
   1732   }
   1733 
   1734   /*
   1735    * We need to make a copy of the array if
   1736    * the buffer is read-only, or
   1737    * the buffer is refCounted (shared), and refCount>1, or
   1738    * the buffer is too small.
   1739    * Return FALSE if memory could not be allocated.
   1740    */
   1741   if(forceClone ||
   1742      fUnion.fFields.fLengthAndFlags & kBufferIsReadonly ||
   1743      (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) ||
   1744      newCapacity > getCapacity()
   1745   ) {
   1746     // check growCapacity for default value and use of the stack buffer
   1747     if(growCapacity < 0) {
   1748       growCapacity = newCapacity;
   1749     } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
   1750       growCapacity = US_STACKBUF_SIZE;
   1751     }
   1752 
   1753     // save old values
   1754     UChar oldStackBuffer[US_STACKBUF_SIZE];
   1755     UChar *oldArray;
   1756     int32_t oldLength = length();
   1757     int16_t flags = fUnion.fFields.fLengthAndFlags;
   1758 
   1759     if(flags&kUsingStackBuffer) {
   1760       U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */
   1761       if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
   1762         // copy the stack buffer contents because it will be overwritten with
   1763         // fUnion.fFields values
   1764         us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength);
   1765         oldArray = oldStackBuffer;
   1766       } else {
   1767         oldArray = NULL; // no need to copy from the stack buffer to itself
   1768       }
   1769     } else {
   1770       oldArray = fUnion.fFields.fArray;
   1771       U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
   1772     }
   1773 
   1774     // allocate a new array
   1775     if(allocate(growCapacity) ||
   1776        (newCapacity < growCapacity && allocate(newCapacity))
   1777     ) {
   1778       if(doCopyArray) {
   1779         // copy the contents
   1780         // do not copy more than what fits - it may be smaller than before
   1781         int32_t minLength = oldLength;
   1782         newCapacity = getCapacity();
   1783         if(newCapacity < minLength) {
   1784           minLength = newCapacity;
   1785         }
   1786         if(oldArray != NULL) {
   1787           us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
   1788         }
   1789         setLength(minLength);
   1790       } else {
   1791         setZeroLength();
   1792       }
   1793 
   1794       // release the old array
   1795       if(flags & kRefCounted) {
   1796         // the array is refCounted; decrement and release if 0
   1797         u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
   1798         if(umtx_atomic_dec(pRefCount) == 0) {
   1799           if(pBufferToDelete == 0) {
   1800               // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
   1801               // is defined as volatile. (Volatile has useful non-standard behavior
   1802               //   with this compiler.)
   1803             uprv_free((void *)pRefCount);
   1804           } else {
   1805             // the caller requested to delete it himself
   1806             *pBufferToDelete = (int32_t *)pRefCount;
   1807           }
   1808         }
   1809       }
   1810     } else {
   1811       // not enough memory for growCapacity and not even for the smaller newCapacity
   1812       // reset the old values for setToBogus() to release the array
   1813       if(!(flags&kUsingStackBuffer)) {
   1814         fUnion.fFields.fArray = oldArray;
   1815       }
   1816       fUnion.fFields.fLengthAndFlags = flags;
   1817       setToBogus();
   1818       return FALSE;
   1819     }
   1820   }
   1821   return TRUE;
   1822 }
   1823 
   1824 // UnicodeStringAppendable ------------------------------------------------- ***
   1825 
   1826 UnicodeStringAppendable::~UnicodeStringAppendable() {}
   1827 
   1828 UBool
   1829 UnicodeStringAppendable::appendCodeUnit(UChar c) {
   1830   return str.doAppend(&c, 0, 1).isWritable();
   1831 }
   1832 
   1833 UBool
   1834 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
   1835   UChar buffer[U16_MAX_LENGTH];
   1836   int32_t cLength = 0;
   1837   UBool isError = FALSE;
   1838   U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
   1839   return !isError && str.doAppend(buffer, 0, cLength).isWritable();
   1840 }
   1841 
   1842 UBool
   1843 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
   1844   return str.doAppend(s, 0, length).isWritable();
   1845 }
   1846 
   1847 UBool
   1848 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
   1849   return str.cloneArrayIfNeeded(str.length() + appendCapacity);
   1850 }
   1851 
   1852 UChar *
   1853 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
   1854                                          int32_t desiredCapacityHint,
   1855                                          UChar *scratch, int32_t scratchCapacity,
   1856                                          int32_t *resultCapacity) {
   1857   if(minCapacity < 1 || scratchCapacity < minCapacity) {
   1858     *resultCapacity = 0;
   1859     return NULL;
   1860   }
   1861   int32_t oldLength = str.length();
   1862   if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
   1863     *resultCapacity = str.getCapacity() - oldLength;
   1864     return str.getArrayStart() + oldLength;
   1865   }
   1866   *resultCapacity = scratchCapacity;
   1867   return scratch;
   1868 }
   1869 
   1870 U_NAMESPACE_END
   1871 
   1872 U_NAMESPACE_USE
   1873 
   1874 U_CAPI int32_t U_EXPORT2
   1875 uhash_hashUnicodeString(const UElement key) {
   1876     const UnicodeString *str = (const UnicodeString*) key.pointer;
   1877     return (str == NULL) ? 0 : str->hashCode();
   1878 }
   1879 
   1880 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
   1881 // does not depend on hashtable code.
   1882 U_CAPI UBool U_EXPORT2
   1883 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
   1884     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
   1885     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
   1886     if (str1 == str2) {
   1887         return TRUE;
   1888     }
   1889     if (str1 == NULL || str2 == NULL) {
   1890         return FALSE;
   1891     }
   1892     return *str1 == *str2;
   1893 }
   1894 
   1895 #ifdef U_STATIC_IMPLEMENTATION
   1896 /*
   1897 This should never be called. It is defined here to make sure that the
   1898 virtual vector deleting destructor is defined within unistr.cpp.
   1899 The vector deleting destructor is already a part of UObject,
   1900 but defining it here makes sure that it is included with this object file.
   1901 This makes sure that static library dependencies are kept to a minimum.
   1902 */
   1903 static void uprv_UnicodeStringDummy(void) {
   1904     delete [] (new UnicodeString[2]);
   1905 }
   1906 #endif
   1907