Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 * Copyright (C) 1999-2013, International Business Machines Corporation and
      4 * others. All Rights Reserved.
      5 ******************************************************************************
      6 *
      7 * File unistr.cpp
      8 *
      9 * Modification History:
     10 *
     11 *   Date        Name        Description
     12 *   09/25/98    stephen     Creation.
     13 *   04/20/99    stephen     Overhauled per 4/16 code review.
     14 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
     15 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
     16 *                           Replaceable.
     17 *   06/25/01    grhoten     Removed the dependency on iostream
     18 ******************************************************************************
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 #include "unicode/appendable.h"
     23 #include "unicode/putil.h"
     24 #include "cstring.h"
     25 #include "cmemory.h"
     26 #include "unicode/ustring.h"
     27 #include "unicode/unistr.h"
     28 #include "unicode/utf.h"
     29 #include "unicode/utf16.h"
     30 #include "uelement.h"
     31 #include "ustr_imp.h"
     32 #include "umutex.h"
     33 #include "uassert.h"
     34 
     35 #if 0
     36 
     37 #include <iostream>
     38 using namespace std;
     39 
     40 //DEBUGGING
     41 void
     42 print(const UnicodeString& s,
     43       const char *name)
     44 {
     45   UChar c;
     46   cout << name << ":|";
     47   for(int i = 0; i < s.length(); ++i) {
     48     c = s[i];
     49     if(c>= 0x007E || c < 0x0020)
     50       cout << "[0x" << hex << s[i] << "]";
     51     else
     52       cout << (char) s[i];
     53   }
     54   cout << '|' << endl;
     55 }
     56 
     57 void
     58 print(const UChar *s,
     59       int32_t len,
     60       const char *name)
     61 {
     62   UChar c;
     63   cout << name << ":|";
     64   for(int i = 0; i < len; ++i) {
     65     c = s[i];
     66     if(c>= 0x007E || c < 0x0020)
     67       cout << "[0x" << hex << s[i] << "]";
     68     else
     69       cout << (char) s[i];
     70   }
     71   cout << '|' << endl;
     72 }
     73 // END DEBUGGING
     74 #endif
     75 
     76 // Local function definitions for now
     77 
     78 // need to copy areas that may overlap
     79 static
     80 inline void
     81 us_arrayCopy(const UChar *src, int32_t srcStart,
     82          UChar *dst, int32_t dstStart, int32_t count)
     83 {
     84   if(count>0) {
     85     uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
     86   }
     87 }
     88 
     89 // u_unescapeAt() callback to get a UChar from a UnicodeString
     90 U_CDECL_BEGIN
     91 static UChar U_CALLCONV
     92 UnicodeString_charAt(int32_t offset, void *context) {
     93     return ((icu::UnicodeString*) context)->charAt(offset);
     94 }
     95 U_CDECL_END
     96 
     97 U_NAMESPACE_BEGIN
     98 
     99 /* The Replaceable virtual destructor can't be defined in the header
    100    due to how AIX works with multiple definitions of virtual functions.
    101 */
    102 Replaceable::~Replaceable() {}
    103 
    104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
    105 
    106 UnicodeString U_EXPORT2
    107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
    108     return
    109         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
    110             append(s1).
    111                 append(s2);
    112 }
    113 
    114 //========================================
    115 // Reference Counting functions, put at top of file so that optimizing compilers
    116 //                               have a chance to automatically inline.
    117 //========================================
    118 
    119 void
    120 UnicodeString::addRef() {
    121   umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
    122 }
    123 
    124 int32_t
    125 UnicodeString::removeRef() {
    126   return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
    127 }
    128 
    129 int32_t
    130 UnicodeString::refCount() const {
    131   return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
    132 }
    133 
    134 void
    135 UnicodeString::releaseArray() {
    136   if((fFlags & kRefCounted) && removeRef() == 0) {
    137     uprv_free((int32_t *)fUnion.fFields.fArray - 1);
    138   }
    139 }
    140 
    141 
    142 
    143 //========================================
    144 // Constructors
    145 //========================================
    146 
    147 // The default constructor is inline in unistr.h.
    148 
    149 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count)
    150   : fShortLength(0),
    151     fFlags(0)
    152 {
    153   if(count <= 0 || (uint32_t)c > 0x10ffff) {
    154     // just allocate and do not do anything else
    155     allocate(capacity);
    156   } else {
    157     // count > 0, allocate and fill the new string with count c's
    158     int32_t unitCount = U16_LENGTH(c), length = count * unitCount;
    159     if(capacity < length) {
    160       capacity = length;
    161     }
    162     if(allocate(capacity)) {
    163       UChar *array = getArrayStart();
    164       int32_t i = 0;
    165 
    166       // fill the new string with c
    167       if(unitCount == 1) {
    168         // fill with length UChars
    169         while(i < length) {
    170           array[i++] = (UChar)c;
    171         }
    172       } else {
    173         // get the code units for c
    174         UChar units[U16_MAX_LENGTH];
    175         U16_APPEND_UNSAFE(units, i, c);
    176 
    177         // now it must be i==unitCount
    178         i = 0;
    179 
    180         // for Unicode, unitCount can only be 1, 2, 3, or 4
    181         // 1 is handled above
    182         while(i < length) {
    183           int32_t unitIdx = 0;
    184           while(unitIdx < unitCount) {
    185             array[i++]=units[unitIdx++];
    186           }
    187         }
    188       }
    189     }
    190     setLength(length);
    191   }
    192 }
    193 
    194 UnicodeString::UnicodeString(UChar ch)
    195   : fShortLength(1),
    196     fFlags(kShortString)
    197 {
    198   fUnion.fStackBuffer[0] = ch;
    199 }
    200 
    201 UnicodeString::UnicodeString(UChar32 ch)
    202   : fShortLength(0),
    203     fFlags(kShortString)
    204 {
    205   int32_t i = 0;
    206   UBool isError = FALSE;
    207   U16_APPEND(fUnion.fStackBuffer, i, US_STACKBUF_SIZE, ch, isError);
    208   // We test isError so that the compiler does not complain that we don't.
    209   // If isError then i==0 which is what we want anyway.
    210   if(!isError) {
    211     fShortLength = (int8_t)i;
    212   }
    213 }
    214 
    215 UnicodeString::UnicodeString(const UChar *text)
    216   : fShortLength(0),
    217     fFlags(kShortString)
    218 {
    219   doReplace(0, 0, text, 0, -1);
    220 }
    221 
    222 UnicodeString::UnicodeString(const UChar *text,
    223                              int32_t textLength)
    224   : fShortLength(0),
    225     fFlags(kShortString)
    226 {
    227   doReplace(0, 0, text, 0, textLength);
    228 }
    229 
    230 UnicodeString::UnicodeString(UBool isTerminated,
    231                              const UChar *text,
    232                              int32_t textLength)
    233   : fShortLength(0),
    234     fFlags(kReadonlyAlias)
    235 {
    236   if(text == NULL) {
    237     // treat as an empty string, do not alias
    238     setToEmpty();
    239   } else if(textLength < -1 ||
    240             (textLength == -1 && !isTerminated) ||
    241             (textLength >= 0 && isTerminated && text[textLength] != 0)
    242   ) {
    243     setToBogus();
    244   } else {
    245     if(textLength == -1) {
    246       // text is terminated, or else it would have failed the above test
    247       textLength = u_strlen(text);
    248     }
    249     setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
    250   }
    251 }
    252 
    253 UnicodeString::UnicodeString(UChar *buff,
    254                              int32_t buffLength,
    255                              int32_t buffCapacity)
    256   : fShortLength(0),
    257     fFlags(kWritableAlias)
    258 {
    259   if(buff == NULL) {
    260     // treat as an empty string, do not alias
    261     setToEmpty();
    262   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
    263     setToBogus();
    264   } else {
    265     if(buffLength == -1) {
    266       // fLength = u_strlen(buff); but do not look beyond buffCapacity
    267       const UChar *p = buff, *limit = buff + buffCapacity;
    268       while(p != limit && *p != 0) {
    269         ++p;
    270       }
    271       buffLength = (int32_t)(p - buff);
    272     }
    273     setArray(buff, buffLength, buffCapacity);
    274   }
    275 }
    276 
    277 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant)
    278   : fShortLength(0),
    279     fFlags(kShortString)
    280 {
    281   if(src==NULL) {
    282     // treat as an empty string
    283   } else {
    284     if(length<0) {
    285       length=(int32_t)uprv_strlen(src);
    286     }
    287     if(cloneArrayIfNeeded(length, length, FALSE)) {
    288       u_charsToUChars(src, getArrayStart(), length);
    289       setLength(length);
    290     } else {
    291       setToBogus();
    292     }
    293   }
    294 }
    295 
    296 #if U_CHARSET_IS_UTF8
    297 
    298 UnicodeString::UnicodeString(const char *codepageData)
    299   : fShortLength(0),
    300     fFlags(kShortString) {
    301   if(codepageData != 0) {
    302     setToUTF8(codepageData);
    303   }
    304 }
    305 
    306 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength)
    307   : fShortLength(0),
    308     fFlags(kShortString) {
    309   // if there's nothing to convert, do nothing
    310   if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
    311     return;
    312   }
    313   if(dataLength == -1) {
    314     dataLength = (int32_t)uprv_strlen(codepageData);
    315   }
    316   setToUTF8(StringPiece(codepageData, dataLength));
    317 }
    318 
    319 // else see unistr_cnv.cpp
    320 #endif
    321 
    322 UnicodeString::UnicodeString(const UnicodeString& that)
    323   : Replaceable(),
    324     fShortLength(0),
    325     fFlags(kShortString)
    326 {
    327   copyFrom(that);
    328 }
    329 
    330 UnicodeString::UnicodeString(const UnicodeString& that,
    331                              int32_t srcStart)
    332   : Replaceable(),
    333     fShortLength(0),
    334     fFlags(kShortString)
    335 {
    336   setTo(that, srcStart);
    337 }
    338 
    339 UnicodeString::UnicodeString(const UnicodeString& that,
    340                              int32_t srcStart,
    341                              int32_t srcLength)
    342   : Replaceable(),
    343     fShortLength(0),
    344     fFlags(kShortString)
    345 {
    346   setTo(that, srcStart, srcLength);
    347 }
    348 
    349 // Replaceable base class clone() default implementation, does not clone
    350 Replaceable *
    351 Replaceable::clone() const {
    352   return NULL;
    353 }
    354 
    355 // UnicodeString overrides clone() with a real implementation
    356 Replaceable *
    357 UnicodeString::clone() const {
    358   return new UnicodeString(*this);
    359 }
    360 
    361 //========================================
    362 // array allocation
    363 //========================================
    364 
    365 UBool
    366 UnicodeString::allocate(int32_t capacity) {
    367   if(capacity <= US_STACKBUF_SIZE) {
    368     fFlags = kShortString;
    369   } else {
    370     // count bytes for the refCounter and the string capacity, and
    371     // round up to a multiple of 16; then divide by 4 and allocate int32_t's
    372     // to be safely aligned for the refCount
    373     // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
    374     int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
    375     int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
    376     if(array != 0) {
    377       // set initial refCount and point behind the refCount
    378       *array++ = 1;
    379 
    380       // have fArray point to the first UChar
    381       fUnion.fFields.fArray = (UChar *)array;
    382       fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
    383       fFlags = kLongString;
    384     } else {
    385       fShortLength = 0;
    386       fUnion.fFields.fArray = 0;
    387       fUnion.fFields.fCapacity = 0;
    388       fFlags = kIsBogus;
    389       return FALSE;
    390     }
    391   }
    392   return TRUE;
    393 }
    394 
    395 //========================================
    396 // Destructor
    397 //========================================
    398 UnicodeString::~UnicodeString()
    399 {
    400   releaseArray();
    401 }
    402 
    403 //========================================
    404 // Factory methods
    405 //========================================
    406 
    407 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
    408   UnicodeString result;
    409   result.setToUTF8(utf8);
    410   return result;
    411 }
    412 
    413 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
    414   UnicodeString result;
    415   int32_t capacity;
    416   // Most UTF-32 strings will be BMP-only and result in a same-length
    417   // UTF-16 string. We overestimate the capacity just slightly,
    418   // just in case there are a few supplementary characters.
    419   if(length <= US_STACKBUF_SIZE) {
    420     capacity = US_STACKBUF_SIZE;
    421   } else {
    422     capacity = length + (length >> 4) + 4;
    423   }
    424   do {
    425     UChar *utf16 = result.getBuffer(capacity);
    426     int32_t length16;
    427     UErrorCode errorCode = U_ZERO_ERROR;
    428     u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
    429         utf32, length,
    430         0xfffd,  // Substitution character.
    431         NULL,    // Don't care about number of substitutions.
    432         &errorCode);
    433     result.releaseBuffer(length16);
    434     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
    435       capacity = length16 + 1;  // +1 for the terminating NUL.
    436       continue;
    437     } else if(U_FAILURE(errorCode)) {
    438       result.setToBogus();
    439     }
    440     break;
    441   } while(TRUE);
    442   return result;
    443 }
    444 
    445 //========================================
    446 // Assignment
    447 //========================================
    448 
    449 UnicodeString &
    450 UnicodeString::operator=(const UnicodeString &src) {
    451   return copyFrom(src);
    452 }
    453 
    454 UnicodeString &
    455 UnicodeString::fastCopyFrom(const UnicodeString &src) {
    456   return copyFrom(src, TRUE);
    457 }
    458 
    459 UnicodeString &
    460 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
    461   // if assigning to ourselves, do nothing
    462   if(this == 0 || this == &src) {
    463     return *this;
    464   }
    465 
    466   // is the right side bogus?
    467   if(&src == 0 || src.isBogus()) {
    468     setToBogus();
    469     return *this;
    470   }
    471 
    472   // delete the current contents
    473   releaseArray();
    474 
    475   if(src.isEmpty()) {
    476     // empty string - use the stack buffer
    477     setToEmpty();
    478     return *this;
    479   }
    480 
    481   // we always copy the length
    482   int32_t srcLength = src.length();
    483   setLength(srcLength);
    484 
    485   // fLength>0 and not an "open" src.getBuffer(minCapacity)
    486   switch(src.fFlags) {
    487   case kShortString:
    488     // short string using the stack buffer, do the same
    489     fFlags = kShortString;
    490     uprv_memcpy(fUnion.fStackBuffer, src.fUnion.fStackBuffer, srcLength * U_SIZEOF_UCHAR);
    491     break;
    492   case kLongString:
    493     // src uses a refCounted string buffer, use that buffer with refCount
    494     // src is const, use a cast - we don't really change it
    495     ((UnicodeString &)src).addRef();
    496     // copy all fields, share the reference-counted buffer
    497     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
    498     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
    499     fFlags = src.fFlags;
    500     break;
    501   case kReadonlyAlias:
    502     if(fastCopy) {
    503       // src is a readonly alias, do the same
    504       // -> maintain the readonly alias as such
    505       fUnion.fFields.fArray = src.fUnion.fFields.fArray;
    506       fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
    507       fFlags = src.fFlags;
    508       break;
    509     }
    510     // else if(!fastCopy) fall through to case kWritableAlias
    511     // -> allocate a new buffer and copy the contents
    512   case kWritableAlias:
    513     // src is a writable alias; we make a copy of that instead
    514     if(allocate(srcLength)) {
    515       uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR);
    516       break;
    517     }
    518     // if there is not enough memory, then fall through to setting to bogus
    519   default:
    520     // if src is bogus, set ourselves to bogus
    521     // do not call setToBogus() here because fArray and fFlags are not consistent here
    522     fShortLength = 0;
    523     fUnion.fFields.fArray = 0;
    524     fUnion.fFields.fCapacity = 0;
    525     fFlags = kIsBogus;
    526     break;
    527   }
    528 
    529   return *this;
    530 }
    531 
    532 //========================================
    533 // Miscellaneous operations
    534 //========================================
    535 
    536 UnicodeString UnicodeString::unescape() const {
    537     UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
    538     const UChar *array = getBuffer();
    539     int32_t len = length();
    540     int32_t prev = 0;
    541     for (int32_t i=0;;) {
    542         if (i == len) {
    543             result.append(array, prev, len - prev);
    544             break;
    545         }
    546         if (array[i++] == 0x5C /*'\\'*/) {
    547             result.append(array, prev, (i - 1) - prev);
    548             UChar32 c = unescapeAt(i); // advances i
    549             if (c < 0) {
    550                 result.remove(); // return empty string
    551                 break; // invalid escape sequence
    552             }
    553             result.append(c);
    554             prev = i;
    555         }
    556     }
    557     return result;
    558 }
    559 
    560 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
    561     return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
    562 }
    563 
    564 //========================================
    565 // Read-only implementation
    566 //========================================
    567 UBool
    568 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
    569   // Requires: this & text not bogus and have same lengths.
    570   // Byte-wise comparison works for equality regardless of endianness.
    571   return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
    572 }
    573 
    574 int8_t
    575 UnicodeString::doCompare( int32_t start,
    576               int32_t length,
    577               const UChar *srcChars,
    578               int32_t srcStart,
    579               int32_t srcLength) const
    580 {
    581   // compare illegal string values
    582   if(isBogus()) {
    583     return -1;
    584   }
    585 
    586   // pin indices to legal values
    587   pinIndices(start, length);
    588 
    589   if(srcChars == NULL) {
    590     // treat const UChar *srcChars==NULL as an empty string
    591     return length == 0 ? 0 : 1;
    592   }
    593 
    594   // get the correct pointer
    595   const UChar *chars = getArrayStart();
    596 
    597   chars += start;
    598   srcChars += srcStart;
    599 
    600   int32_t minLength;
    601   int8_t lengthResult;
    602 
    603   // get the srcLength if necessary
    604   if(srcLength < 0) {
    605     srcLength = u_strlen(srcChars + srcStart);
    606   }
    607 
    608   // are we comparing different lengths?
    609   if(length != srcLength) {
    610     if(length < srcLength) {
    611       minLength = length;
    612       lengthResult = -1;
    613     } else {
    614       minLength = srcLength;
    615       lengthResult = 1;
    616     }
    617   } else {
    618     minLength = length;
    619     lengthResult = 0;
    620   }
    621 
    622   /*
    623    * note that uprv_memcmp() returns an int but we return an int8_t;
    624    * we need to take care not to truncate the result -
    625    * one way to do this is to right-shift the value to
    626    * move the sign bit into the lower 8 bits and making sure that this
    627    * does not become 0 itself
    628    */
    629 
    630   if(minLength > 0 && chars != srcChars) {
    631     int32_t result;
    632 
    633 #   if U_IS_BIG_ENDIAN
    634       // big-endian: byte comparison works
    635       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
    636       if(result != 0) {
    637         return (int8_t)(result >> 15 | 1);
    638       }
    639 #   else
    640       // little-endian: compare UChar units
    641       do {
    642         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
    643         if(result != 0) {
    644           return (int8_t)(result >> 15 | 1);
    645         }
    646       } while(--minLength > 0);
    647 #   endif
    648   }
    649   return lengthResult;
    650 }
    651 
    652 /* String compare in code point order - doCompare() compares in code unit order. */
    653 int8_t
    654 UnicodeString::doCompareCodePointOrder(int32_t start,
    655                                        int32_t length,
    656                                        const UChar *srcChars,
    657                                        int32_t srcStart,
    658                                        int32_t srcLength) const
    659 {
    660   // compare illegal string values
    661   // treat const UChar *srcChars==NULL as an empty string
    662   if(isBogus()) {
    663     return -1;
    664   }
    665 
    666   // pin indices to legal values
    667   pinIndices(start, length);
    668 
    669   if(srcChars == NULL) {
    670     srcStart = srcLength = 0;
    671   }
    672 
    673   int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
    674   /* translate the 32-bit result into an 8-bit one */
    675   if(diff!=0) {
    676     return (int8_t)(diff >> 15 | 1);
    677   } else {
    678     return 0;
    679   }
    680 }
    681 
    682 int32_t
    683 UnicodeString::getLength() const {
    684     return length();
    685 }
    686 
    687 UChar
    688 UnicodeString::getCharAt(int32_t offset) const {
    689   return charAt(offset);
    690 }
    691 
    692 UChar32
    693 UnicodeString::getChar32At(int32_t offset) const {
    694   return char32At(offset);
    695 }
    696 
    697 UChar32
    698 UnicodeString::char32At(int32_t offset) const
    699 {
    700   int32_t len = length();
    701   if((uint32_t)offset < (uint32_t)len) {
    702     const UChar *array = getArrayStart();
    703     UChar32 c;
    704     U16_GET(array, 0, offset, len, c);
    705     return c;
    706   } else {
    707     return kInvalidUChar;
    708   }
    709 }
    710 
    711 int32_t
    712 UnicodeString::getChar32Start(int32_t offset) const {
    713   if((uint32_t)offset < (uint32_t)length()) {
    714     const UChar *array = getArrayStart();
    715     U16_SET_CP_START(array, 0, offset);
    716     return offset;
    717   } else {
    718     return 0;
    719   }
    720 }
    721 
    722 int32_t
    723 UnicodeString::getChar32Limit(int32_t offset) const {
    724   int32_t len = length();
    725   if((uint32_t)offset < (uint32_t)len) {
    726     const UChar *array = getArrayStart();
    727     U16_SET_CP_LIMIT(array, 0, offset, len);
    728     return offset;
    729   } else {
    730     return len;
    731   }
    732 }
    733 
    734 int32_t
    735 UnicodeString::countChar32(int32_t start, int32_t length) const {
    736   pinIndices(start, length);
    737   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
    738   return u_countChar32(getArrayStart()+start, length);
    739 }
    740 
    741 UBool
    742 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
    743   pinIndices(start, length);
    744   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
    745   return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
    746 }
    747 
    748 int32_t
    749 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
    750   // pin index
    751   int32_t len = length();
    752   if(index<0) {
    753     index=0;
    754   } else if(index>len) {
    755     index=len;
    756   }
    757 
    758   const UChar *array = getArrayStart();
    759   if(delta>0) {
    760     U16_FWD_N(array, index, len, delta);
    761   } else {
    762     U16_BACK_N(array, 0, index, -delta);
    763   }
    764 
    765   return index;
    766 }
    767 
    768 void
    769 UnicodeString::doExtract(int32_t start,
    770              int32_t length,
    771              UChar *dst,
    772              int32_t dstStart) const
    773 {
    774   // pin indices to legal values
    775   pinIndices(start, length);
    776 
    777   // do not copy anything if we alias dst itself
    778   const UChar *array = getArrayStart();
    779   if(array + start != dst + dstStart) {
    780     us_arrayCopy(array, start, dst, dstStart, length);
    781   }
    782 }
    783 
    784 int32_t
    785 UnicodeString::extract(UChar *dest, int32_t destCapacity,
    786                        UErrorCode &errorCode) const {
    787   int32_t len = length();
    788   if(U_SUCCESS(errorCode)) {
    789     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
    790       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    791     } else {
    792       const UChar *array = getArrayStart();
    793       if(len>0 && len<=destCapacity && array!=dest) {
    794         uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR);
    795       }
    796       return u_terminateUChars(dest, destCapacity, len, &errorCode);
    797     }
    798   }
    799 
    800   return len;
    801 }
    802 
    803 int32_t
    804 UnicodeString::extract(int32_t start,
    805                        int32_t length,
    806                        char *target,
    807                        int32_t targetCapacity,
    808                        enum EInvariant) const
    809 {
    810   // if the arguments are illegal, then do nothing
    811   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
    812     return 0;
    813   }
    814 
    815   // pin the indices to legal values
    816   pinIndices(start, length);
    817 
    818   if(length <= targetCapacity) {
    819     u_UCharsToChars(getArrayStart() + start, target, length);
    820   }
    821   UErrorCode status = U_ZERO_ERROR;
    822   return u_terminateChars(target, targetCapacity, length, &status);
    823 }
    824 
    825 UnicodeString
    826 UnicodeString::tempSubString(int32_t start, int32_t len) const {
    827   pinIndices(start, len);
    828   const UChar *array = getBuffer();  // not getArrayStart() to check kIsBogus & kOpenGetBuffer
    829   if(array==NULL) {
    830     array=fUnion.fStackBuffer;  // anything not NULL because that would make an empty string
    831     len=-2;  // bogus result string
    832   }
    833   return UnicodeString(FALSE, array + start, len);
    834 }
    835 
    836 int32_t
    837 UnicodeString::toUTF8(int32_t start, int32_t len,
    838                       char *target, int32_t capacity) const {
    839   pinIndices(start, len);
    840   int32_t length8;
    841   UErrorCode errorCode = U_ZERO_ERROR;
    842   u_strToUTF8WithSub(target, capacity, &length8,
    843                      getBuffer() + start, len,
    844                      0xFFFD,  // Standard substitution character.
    845                      NULL,    // Don't care about number of substitutions.
    846                      &errorCode);
    847   return length8;
    848 }
    849 
    850 #if U_CHARSET_IS_UTF8
    851 
    852 int32_t
    853 UnicodeString::extract(int32_t start, int32_t len,
    854                        char *target, uint32_t dstSize) const {
    855   // if the arguments are illegal, then do nothing
    856   if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
    857     return 0;
    858   }
    859   return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
    860 }
    861 
    862 // else see unistr_cnv.cpp
    863 #endif
    864 
    865 void
    866 UnicodeString::extractBetween(int32_t start,
    867                   int32_t limit,
    868                   UnicodeString& target) const {
    869   pinIndex(start);
    870   pinIndex(limit);
    871   doExtract(start, limit - start, target);
    872 }
    873 
    874 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
    875 // as many bytes as the source has UChars.
    876 // The "worst cases" are writing systems like Indic, Thai and CJK with
    877 // 3:1 bytes:UChars.
    878 void
    879 UnicodeString::toUTF8(ByteSink &sink) const {
    880   int32_t length16 = length();
    881   if(length16 != 0) {
    882     char stackBuffer[1024];
    883     int32_t capacity = (int32_t)sizeof(stackBuffer);
    884     UBool utf8IsOwned = FALSE;
    885     char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
    886                                       3*length16,
    887                                       stackBuffer, capacity,
    888                                       &capacity);
    889     int32_t length8 = 0;
    890     UErrorCode errorCode = U_ZERO_ERROR;
    891     u_strToUTF8WithSub(utf8, capacity, &length8,
    892                        getBuffer(), length16,
    893                        0xFFFD,  // Standard substitution character.
    894                        NULL,    // Don't care about number of substitutions.
    895                        &errorCode);
    896     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
    897       utf8 = (char *)uprv_malloc(length8);
    898       if(utf8 != NULL) {
    899         utf8IsOwned = TRUE;
    900         errorCode = U_ZERO_ERROR;
    901         u_strToUTF8WithSub(utf8, length8, &length8,
    902                            getBuffer(), length16,
    903                            0xFFFD,  // Standard substitution character.
    904                            NULL,    // Don't care about number of substitutions.
    905                            &errorCode);
    906       } else {
    907         errorCode = U_MEMORY_ALLOCATION_ERROR;
    908       }
    909     }
    910     if(U_SUCCESS(errorCode)) {
    911       sink.Append(utf8, length8);
    912       sink.Flush();
    913     }
    914     if(utf8IsOwned) {
    915       uprv_free(utf8);
    916     }
    917   }
    918 }
    919 
    920 int32_t
    921 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
    922   int32_t length32=0;
    923   if(U_SUCCESS(errorCode)) {
    924     // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
    925     u_strToUTF32WithSub(utf32, capacity, &length32,
    926         getBuffer(), length(),
    927         0xfffd,  // Substitution character.
    928         NULL,    // Don't care about number of substitutions.
    929         &errorCode);
    930   }
    931   return length32;
    932 }
    933 
    934 int32_t
    935 UnicodeString::indexOf(const UChar *srcChars,
    936                int32_t srcStart,
    937                int32_t srcLength,
    938                int32_t start,
    939                int32_t length) const
    940 {
    941   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
    942     return -1;
    943   }
    944 
    945   // UnicodeString does not find empty substrings
    946   if(srcLength < 0 && srcChars[srcStart] == 0) {
    947     return -1;
    948   }
    949 
    950   // get the indices within bounds
    951   pinIndices(start, length);
    952 
    953   // find the first occurrence of the substring
    954   const UChar *array = getArrayStart();
    955   const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
    956   if(match == NULL) {
    957     return -1;
    958   } else {
    959     return (int32_t)(match - array);
    960   }
    961 }
    962 
    963 int32_t
    964 UnicodeString::doIndexOf(UChar c,
    965              int32_t start,
    966              int32_t length) const
    967 {
    968   // pin indices
    969   pinIndices(start, length);
    970 
    971   // find the first occurrence of c
    972   const UChar *array = getArrayStart();
    973   const UChar *match = u_memchr(array + start, c, length);
    974   if(match == NULL) {
    975     return -1;
    976   } else {
    977     return (int32_t)(match - array);
    978   }
    979 }
    980 
    981 int32_t
    982 UnicodeString::doIndexOf(UChar32 c,
    983                          int32_t start,
    984                          int32_t length) const {
    985   // pin indices
    986   pinIndices(start, length);
    987 
    988   // find the first occurrence of c
    989   const UChar *array = getArrayStart();
    990   const UChar *match = u_memchr32(array + start, c, length);
    991   if(match == NULL) {
    992     return -1;
    993   } else {
    994     return (int32_t)(match - array);
    995   }
    996 }
    997 
    998 int32_t
    999 UnicodeString::lastIndexOf(const UChar *srcChars,
   1000                int32_t srcStart,
   1001                int32_t srcLength,
   1002                int32_t start,
   1003                int32_t length) const
   1004 {
   1005   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
   1006     return -1;
   1007   }
   1008 
   1009   // UnicodeString does not find empty substrings
   1010   if(srcLength < 0 && srcChars[srcStart] == 0) {
   1011     return -1;
   1012   }
   1013 
   1014   // get the indices within bounds
   1015   pinIndices(start, length);
   1016 
   1017   // find the last occurrence of the substring
   1018   const UChar *array = getArrayStart();
   1019   const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
   1020   if(match == NULL) {
   1021     return -1;
   1022   } else {
   1023     return (int32_t)(match - array);
   1024   }
   1025 }
   1026 
   1027 int32_t
   1028 UnicodeString::doLastIndexOf(UChar c,
   1029                  int32_t start,
   1030                  int32_t length) const
   1031 {
   1032   if(isBogus()) {
   1033     return -1;
   1034   }
   1035 
   1036   // pin indices
   1037   pinIndices(start, length);
   1038 
   1039   // find the last occurrence of c
   1040   const UChar *array = getArrayStart();
   1041   const UChar *match = u_memrchr(array + start, c, length);
   1042   if(match == NULL) {
   1043     return -1;
   1044   } else {
   1045     return (int32_t)(match - array);
   1046   }
   1047 }
   1048 
   1049 int32_t
   1050 UnicodeString::doLastIndexOf(UChar32 c,
   1051                              int32_t start,
   1052                              int32_t length) const {
   1053   // pin indices
   1054   pinIndices(start, length);
   1055 
   1056   // find the last occurrence of c
   1057   const UChar *array = getArrayStart();
   1058   const UChar *match = u_memrchr32(array + start, c, length);
   1059   if(match == NULL) {
   1060     return -1;
   1061   } else {
   1062     return (int32_t)(match - array);
   1063   }
   1064 }
   1065 
   1066 //========================================
   1067 // Write implementation
   1068 //========================================
   1069 
   1070 UnicodeString&
   1071 UnicodeString::findAndReplace(int32_t start,
   1072                   int32_t length,
   1073                   const UnicodeString& oldText,
   1074                   int32_t oldStart,
   1075                   int32_t oldLength,
   1076                   const UnicodeString& newText,
   1077                   int32_t newStart,
   1078                   int32_t newLength)
   1079 {
   1080   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
   1081     return *this;
   1082   }
   1083 
   1084   pinIndices(start, length);
   1085   oldText.pinIndices(oldStart, oldLength);
   1086   newText.pinIndices(newStart, newLength);
   1087 
   1088   if(oldLength == 0) {
   1089     return *this;
   1090   }
   1091 
   1092   while(length > 0 && length >= oldLength) {
   1093     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
   1094     if(pos < 0) {
   1095       // no more oldText's here: done
   1096       break;
   1097     } else {
   1098       // we found oldText, replace it by newText and go beyond it
   1099       replace(pos, oldLength, newText, newStart, newLength);
   1100       length -= pos + oldLength - start;
   1101       start = pos + newLength;
   1102     }
   1103   }
   1104 
   1105   return *this;
   1106 }
   1107 
   1108 
   1109 void
   1110 UnicodeString::setToBogus()
   1111 {
   1112   releaseArray();
   1113 
   1114   fShortLength = 0;
   1115   fUnion.fFields.fArray = 0;
   1116   fUnion.fFields.fCapacity = 0;
   1117   fFlags = kIsBogus;
   1118 }
   1119 
   1120 // turn a bogus string into an empty one
   1121 void
   1122 UnicodeString::unBogus() {
   1123   if(fFlags & kIsBogus) {
   1124     setToEmpty();
   1125   }
   1126 }
   1127 
   1128 const UChar *
   1129 UnicodeString::getTerminatedBuffer() {
   1130   if(!isWritable()) {
   1131     return 0;
   1132   }
   1133   UChar *array = getArrayStart();
   1134   int32_t len = length();
   1135   if(len < getCapacity()) {
   1136     if(fFlags & kBufferIsReadonly) {
   1137       // If len<capacity on a read-only alias, then array[len] is
   1138       // either the original NUL (if constructed with (TRUE, s, length))
   1139       // or one of the original string contents characters (if later truncated),
   1140       // therefore we can assume that array[len] is initialized memory.
   1141       if(array[len] == 0) {
   1142         return array;
   1143       }
   1144     } else if(((fFlags & kRefCounted) == 0 || refCount() == 1)) {
   1145       // kRefCounted: Do not write the NUL if the buffer is shared.
   1146       // That is mostly safe, except when the length of one copy was modified
   1147       // without copy-on-write, e.g., via truncate(newLength) or remove(void).
   1148       // Then the NUL would be written into the middle of another copy's string.
   1149 
   1150       // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
   1151       // Do not test if there is a NUL already because it might be uninitialized memory.
   1152       // (That would be safe, but tools like valgrind & Purify would complain.)
   1153       array[len] = 0;
   1154       return array;
   1155     }
   1156   }
   1157   if(cloneArrayIfNeeded(len+1)) {
   1158     array = getArrayStart();
   1159     array[len] = 0;
   1160     return array;
   1161   } else {
   1162     return NULL;
   1163   }
   1164 }
   1165 
   1166 // setTo() analogous to the readonly-aliasing constructor with the same signature
   1167 UnicodeString &
   1168 UnicodeString::setTo(UBool isTerminated,
   1169                      const UChar *text,
   1170                      int32_t textLength)
   1171 {
   1172   if(fFlags & kOpenGetBuffer) {
   1173     // do not modify a string that has an "open" getBuffer(minCapacity)
   1174     return *this;
   1175   }
   1176 
   1177   if(text == NULL) {
   1178     // treat as an empty string, do not alias
   1179     releaseArray();
   1180     setToEmpty();
   1181     return *this;
   1182   }
   1183 
   1184   if( textLength < -1 ||
   1185       (textLength == -1 && !isTerminated) ||
   1186       (textLength >= 0 && isTerminated && text[textLength] != 0)
   1187   ) {
   1188     setToBogus();
   1189     return *this;
   1190   }
   1191 
   1192   releaseArray();
   1193 
   1194   if(textLength == -1) {
   1195     // text is terminated, or else it would have failed the above test
   1196     textLength = u_strlen(text);
   1197   }
   1198   setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
   1199 
   1200   fFlags = kReadonlyAlias;
   1201   return *this;
   1202 }
   1203 
   1204 // setTo() analogous to the writable-aliasing constructor with the same signature
   1205 UnicodeString &
   1206 UnicodeString::setTo(UChar *buffer,
   1207                      int32_t buffLength,
   1208                      int32_t buffCapacity) {
   1209   if(fFlags & kOpenGetBuffer) {
   1210     // do not modify a string that has an "open" getBuffer(minCapacity)
   1211     return *this;
   1212   }
   1213 
   1214   if(buffer == NULL) {
   1215     // treat as an empty string, do not alias
   1216     releaseArray();
   1217     setToEmpty();
   1218     return *this;
   1219   }
   1220 
   1221   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
   1222     setToBogus();
   1223     return *this;
   1224   } else if(buffLength == -1) {
   1225     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
   1226     const UChar *p = buffer, *limit = buffer + buffCapacity;
   1227     while(p != limit && *p != 0) {
   1228       ++p;
   1229     }
   1230     buffLength = (int32_t)(p - buffer);
   1231   }
   1232 
   1233   releaseArray();
   1234 
   1235   setArray(buffer, buffLength, buffCapacity);
   1236   fFlags = kWritableAlias;
   1237   return *this;
   1238 }
   1239 
   1240 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) {
   1241   unBogus();
   1242   int32_t length = utf8.length();
   1243   int32_t capacity;
   1244   // The UTF-16 string will be at most as long as the UTF-8 string.
   1245   if(length <= US_STACKBUF_SIZE) {
   1246     capacity = US_STACKBUF_SIZE;
   1247   } else {
   1248     capacity = length + 1;  // +1 for the terminating NUL.
   1249   }
   1250   UChar *utf16 = getBuffer(capacity);
   1251   int32_t length16;
   1252   UErrorCode errorCode = U_ZERO_ERROR;
   1253   u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
   1254       utf8.data(), length,
   1255       0xfffd,  // Substitution character.
   1256       NULL,    // Don't care about number of substitutions.
   1257       &errorCode);
   1258   releaseBuffer(length16);
   1259   if(U_FAILURE(errorCode)) {
   1260     setToBogus();
   1261   }
   1262   return *this;
   1263 }
   1264 
   1265 UnicodeString&
   1266 UnicodeString::setCharAt(int32_t offset,
   1267              UChar c)
   1268 {
   1269   int32_t len = length();
   1270   if(cloneArrayIfNeeded() && len > 0) {
   1271     if(offset < 0) {
   1272       offset = 0;
   1273     } else if(offset >= len) {
   1274       offset = len - 1;
   1275     }
   1276 
   1277     getArrayStart()[offset] = c;
   1278   }
   1279   return *this;
   1280 }
   1281 
   1282 UnicodeString&
   1283 UnicodeString::replace(int32_t start,
   1284                int32_t _length,
   1285                UChar32 srcChar) {
   1286   UChar buffer[U16_MAX_LENGTH];
   1287   int32_t count = 0;
   1288   UBool isError = FALSE;
   1289   U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
   1290   // We test isError so that the compiler does not complain that we don't.
   1291   // If isError (srcChar is not a valid code point) then count==0 which means
   1292   // we remove the source segment rather than replacing it with srcChar.
   1293   return doReplace(start, _length, buffer, 0, isError ? 0 : count);
   1294 }
   1295 
   1296 UnicodeString&
   1297 UnicodeString::append(UChar32 srcChar) {
   1298   UChar buffer[U16_MAX_LENGTH];
   1299   int32_t _length = 0;
   1300   UBool isError = FALSE;
   1301   U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
   1302   // We test isError so that the compiler does not complain that we don't.
   1303   // If isError then _length==0 which turns the doReplace() into a no-op anyway.
   1304   return isError ? *this : doReplace(length(), 0, buffer, 0, _length);
   1305 }
   1306 
   1307 UnicodeString&
   1308 UnicodeString::doReplace( int32_t start,
   1309               int32_t length,
   1310               const UnicodeString& src,
   1311               int32_t srcStart,
   1312               int32_t srcLength)
   1313 {
   1314   if(!src.isBogus()) {
   1315     // pin the indices to legal values
   1316     src.pinIndices(srcStart, srcLength);
   1317 
   1318     // get the characters from src
   1319     // and replace the range in ourselves with them
   1320     return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
   1321   } else {
   1322     // remove the range
   1323     return doReplace(start, length, 0, 0, 0);
   1324   }
   1325 }
   1326 
   1327 UnicodeString&
   1328 UnicodeString::doReplace(int32_t start,
   1329              int32_t length,
   1330              const UChar *srcChars,
   1331              int32_t srcStart,
   1332              int32_t srcLength)
   1333 {
   1334   if(!isWritable()) {
   1335     return *this;
   1336   }
   1337 
   1338   int32_t oldLength = this->length();
   1339 
   1340   // optimize (read-only alias).remove(0, start) and .remove(start, end)
   1341   if((fFlags&kBufferIsReadonly) && srcLength == 0) {
   1342     if(start == 0) {
   1343       // remove prefix by adjusting the array pointer
   1344       pinIndex(length);
   1345       fUnion.fFields.fArray += length;
   1346       fUnion.fFields.fCapacity -= length;
   1347       setLength(oldLength - length);
   1348       return *this;
   1349     } else {
   1350       pinIndex(start);
   1351       if(length >= (oldLength - start)) {
   1352         // remove suffix by reducing the length (like truncate())
   1353         setLength(start);
   1354         fUnion.fFields.fCapacity = start;  // not NUL-terminated any more
   1355         return *this;
   1356       }
   1357     }
   1358   }
   1359 
   1360   if(srcChars == 0) {
   1361     srcStart = srcLength = 0;
   1362   } else if(srcLength < 0) {
   1363     // get the srcLength if necessary
   1364     srcLength = u_strlen(srcChars + srcStart);
   1365   }
   1366 
   1367   // calculate the size of the string after the replace
   1368   int32_t newLength;
   1369 
   1370   // optimize append() onto a large-enough, owned string
   1371   if(start >= oldLength) {
   1372     if(srcLength == 0) {
   1373       return *this;
   1374     }
   1375     newLength = oldLength + srcLength;
   1376     if(newLength <= getCapacity() && isBufferWritable()) {
   1377       UChar *oldArray = getArrayStart();
   1378       // Do not copy characters when
   1379       //   UChar *buffer=str.getAppendBuffer(...);
   1380       // is followed by
   1381       //   str.append(buffer, length);
   1382       // or
   1383       //   str.appendString(buffer, length)
   1384       // or similar.
   1385       if(srcChars + srcStart != oldArray + start || start > oldLength) {
   1386         us_arrayCopy(srcChars, srcStart, oldArray, oldLength, srcLength);
   1387       }
   1388       setLength(newLength);
   1389       return *this;
   1390     } else {
   1391       // pin the indices to legal values
   1392       start = oldLength;
   1393       length = 0;
   1394     }
   1395   } else {
   1396     // pin the indices to legal values
   1397     pinIndices(start, length);
   1398 
   1399     newLength = oldLength - length + srcLength;
   1400   }
   1401 
   1402   // the following may change fArray but will not copy the current contents;
   1403   // therefore we need to keep the current fArray
   1404   UChar oldStackBuffer[US_STACKBUF_SIZE];
   1405   UChar *oldArray;
   1406   if((fFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
   1407     // copy the stack buffer contents because it will be overwritten with
   1408     // fUnion.fFields values
   1409     u_memcpy(oldStackBuffer, fUnion.fStackBuffer, oldLength);
   1410     oldArray = oldStackBuffer;
   1411   } else {
   1412     oldArray = getArrayStart();
   1413   }
   1414 
   1415   // clone our array and allocate a bigger array if needed
   1416   int32_t *bufferToDelete = 0;
   1417   if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize,
   1418                          FALSE, &bufferToDelete)
   1419   ) {
   1420     return *this;
   1421   }
   1422 
   1423   // now do the replace
   1424 
   1425   UChar *newArray = getArrayStart();
   1426   if(newArray != oldArray) {
   1427     // if fArray changed, then we need to copy everything except what will change
   1428     us_arrayCopy(oldArray, 0, newArray, 0, start);
   1429     us_arrayCopy(oldArray, start + length,
   1430                  newArray, start + srcLength,
   1431                  oldLength - (start + length));
   1432   } else if(length != srcLength) {
   1433     // fArray did not change; copy only the portion that isn't changing, leaving a hole
   1434     us_arrayCopy(oldArray, start + length,
   1435                  newArray, start + srcLength,
   1436                  oldLength - (start + length));
   1437   }
   1438 
   1439   // now fill in the hole with the new string
   1440   us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
   1441 
   1442   setLength(newLength);
   1443 
   1444   // delayed delete in case srcChars == fArray when we started, and
   1445   // to keep oldArray alive for the above operations
   1446   if (bufferToDelete) {
   1447     uprv_free(bufferToDelete);
   1448   }
   1449 
   1450   return *this;
   1451 }
   1452 
   1453 /**
   1454  * Replaceable API
   1455  */
   1456 void
   1457 UnicodeString::handleReplaceBetween(int32_t start,
   1458                                     int32_t limit,
   1459                                     const UnicodeString& text) {
   1460     replaceBetween(start, limit, text);
   1461 }
   1462 
   1463 /**
   1464  * Replaceable API
   1465  */
   1466 void
   1467 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
   1468     if (limit <= start) {
   1469         return; // Nothing to do; avoid bogus malloc call
   1470     }
   1471     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
   1472     // Check to make sure text is not null.
   1473     if (text != NULL) {
   1474 	    extractBetween(start, limit, text, 0);
   1475 	    insert(dest, text, 0, limit - start);
   1476 	    uprv_free(text);
   1477     }
   1478 }
   1479 
   1480 /**
   1481  * Replaceable API
   1482  *
   1483  * NOTE: This is for the Replaceable class.  There is no rep.cpp,
   1484  * so we implement this function here.
   1485  */
   1486 UBool Replaceable::hasMetaData() const {
   1487     return TRUE;
   1488 }
   1489 
   1490 /**
   1491  * Replaceable API
   1492  */
   1493 UBool UnicodeString::hasMetaData() const {
   1494     return FALSE;
   1495 }
   1496 
   1497 UnicodeString&
   1498 UnicodeString::doReverse(int32_t start, int32_t length) {
   1499   if(length <= 1 || !cloneArrayIfNeeded()) {
   1500     return *this;
   1501   }
   1502 
   1503   // pin the indices to legal values
   1504   pinIndices(start, length);
   1505   if(length <= 1) {  // pinIndices() might have shrunk the length
   1506     return *this;
   1507   }
   1508 
   1509   UChar *left = getArrayStart() + start;
   1510   UChar *right = left + length - 1;  // -1 for inclusive boundary (length>=2)
   1511   UChar swap;
   1512   UBool hasSupplementary = FALSE;
   1513 
   1514   // Before the loop we know left<right because length>=2.
   1515   do {
   1516     hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
   1517     hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
   1518     *right-- = swap;
   1519   } while(left < right);
   1520   // Make sure to test the middle code unit of an odd-length string.
   1521   // Redundant if the length is even.
   1522   hasSupplementary |= (UBool)U16_IS_LEAD(*left);
   1523 
   1524   /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
   1525   if(hasSupplementary) {
   1526     UChar swap2;
   1527 
   1528     left = getArrayStart() + start;
   1529     right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
   1530     while(left < right) {
   1531       if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
   1532         *left++ = swap2;
   1533         *left++ = swap;
   1534       } else {
   1535         ++left;
   1536       }
   1537     }
   1538   }
   1539 
   1540   return *this;
   1541 }
   1542 
   1543 UBool
   1544 UnicodeString::padLeading(int32_t targetLength,
   1545                           UChar padChar)
   1546 {
   1547   int32_t oldLength = length();
   1548   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
   1549     return FALSE;
   1550   } else {
   1551     // move contents up by padding width
   1552     UChar *array = getArrayStart();
   1553     int32_t start = targetLength - oldLength;
   1554     us_arrayCopy(array, 0, array, start, oldLength);
   1555 
   1556     // fill in padding character
   1557     while(--start >= 0) {
   1558       array[start] = padChar;
   1559     }
   1560     setLength(targetLength);
   1561     return TRUE;
   1562   }
   1563 }
   1564 
   1565 UBool
   1566 UnicodeString::padTrailing(int32_t targetLength,
   1567                            UChar padChar)
   1568 {
   1569   int32_t oldLength = length();
   1570   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
   1571     return FALSE;
   1572   } else {
   1573     // fill in padding character
   1574     UChar *array = getArrayStart();
   1575     int32_t length = targetLength;
   1576     while(--length >= oldLength) {
   1577       array[length] = padChar;
   1578     }
   1579     setLength(targetLength);
   1580     return TRUE;
   1581   }
   1582 }
   1583 
   1584 //========================================
   1585 // Hashing
   1586 //========================================
   1587 int32_t
   1588 UnicodeString::doHashCode() const
   1589 {
   1590     /* Delegate hash computation to uhash.  This makes UnicodeString
   1591      * hashing consistent with UChar* hashing.  */
   1592     int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
   1593     if (hashCode == kInvalidHashCode) {
   1594         hashCode = kEmptyHashCode;
   1595     }
   1596     return hashCode;
   1597 }
   1598 
   1599 //========================================
   1600 // External Buffer
   1601 //========================================
   1602 
   1603 UChar *
   1604 UnicodeString::getBuffer(int32_t minCapacity) {
   1605   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
   1606     fFlags|=kOpenGetBuffer;
   1607     fShortLength=0;
   1608     return getArrayStart();
   1609   } else {
   1610     return 0;
   1611   }
   1612 }
   1613 
   1614 void
   1615 UnicodeString::releaseBuffer(int32_t newLength) {
   1616   if(fFlags&kOpenGetBuffer && newLength>=-1) {
   1617     // set the new fLength
   1618     int32_t capacity=getCapacity();
   1619     if(newLength==-1) {
   1620       // the new length is the string length, capped by fCapacity
   1621       const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
   1622       while(p<limit && *p!=0) {
   1623         ++p;
   1624       }
   1625       newLength=(int32_t)(p-array);
   1626     } else if(newLength>capacity) {
   1627       newLength=capacity;
   1628     }
   1629     setLength(newLength);
   1630     fFlags&=~kOpenGetBuffer;
   1631   }
   1632 }
   1633 
   1634 //========================================
   1635 // Miscellaneous
   1636 //========================================
// Makes sure that this string has its own writable buffer with a capacity
// of at least newCapacity code units, allocating a new buffer if necessary.
//
// newCapacity      required capacity; -1 means "the current capacity"
// growCapacity     capacity to try first when allocating; a negative value
//                  means "same as newCapacity"
// doCopyArray      if TRUE, the old contents are copied into a new buffer;
//                  if FALSE, a new buffer starts with length 0
// pBufferToDelete  if not 0 and the old ref-counted buffer is no longer
//                  referenced, the buffer is not freed here; its address is
//                  stored in *pBufferToDelete for the caller to free
// forceClone       if TRUE, a new buffer is allocated unconditionally
//
// Returns TRUE on success. Returns FALSE if the string is not writable, or
// if no memory could be allocated, in which case the string is set to bogus.
UBool
UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
                                  int32_t growCapacity,
                                  UBool doCopyArray,
                                  int32_t **pBufferToDelete,
                                  UBool forceClone) {
  // default parameters need to be static, therefore
  // the defaults are -1 to have convenience defaults
  if(newCapacity == -1) {
    newCapacity = getCapacity();
  }

  // while a getBuffer(minCapacity) is "open",
  // prevent any modifications of the string by returning FALSE here
  // if the string is bogus, then only an assignment or similar can revive it
  if(!isWritable()) {
    return FALSE;
  }

  /*
   * We need to make a copy of the array if
   * the buffer is read-only, or
   * the buffer is refCounted (shared), and refCount>1, or
   * the buffer is too small.
   * Return FALSE if memory could not be allocated.
   */
  if(forceClone ||
     fFlags & kBufferIsReadonly ||
     (fFlags & kRefCounted && refCount() > 1) ||
     newCapacity > getCapacity()
  ) {
    // check growCapacity for default value and use of the stack buffer
    if(growCapacity < 0) {
      growCapacity = newCapacity;
    } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
      // the required capacity fits into the stack buffer: do not grow beyond it
      growCapacity = US_STACKBUF_SIZE;
    }

    // save old values
    UChar oldStackBuffer[US_STACKBUF_SIZE];
    UChar *oldArray;
    uint8_t flags = fFlags;

    if(flags&kUsingStackBuffer) {
      U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutually exclusive */
      if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
        // copy the stack buffer contents because it will be overwritten with
        // fUnion.fFields values
        us_arrayCopy(fUnion.fStackBuffer, 0, oldStackBuffer, 0, fShortLength);
        oldArray = oldStackBuffer;
      } else {
        oldArray = 0; // no need to copy from stack buffer to itself
      }
    } else {
      oldArray = fUnion.fFields.fArray;
      U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
    }

    // allocate a new array
    // (try the larger growCapacity first, then fall back to newCapacity)
    if(allocate(growCapacity) ||
       (newCapacity < growCapacity && allocate(newCapacity))
    ) {
      if(doCopyArray && oldArray != 0) {
        // copy the contents
        // do not copy more than what fits - it may be smaller than before
        int32_t minLength = length();
        newCapacity = getCapacity();
        if(newCapacity < minLength) {
          minLength = newCapacity;
          setLength(minLength);
        }
        us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
      } else {
        // NOTE(review): this branch is also taken when doCopyArray is TRUE
        // but oldArray == 0 (the string was and stays in the stack buffer),
        // which resets the length to 0 - confirm that this is intended.
        fShortLength = 0;
      }

      // release the old array
      if(flags & kRefCounted) {
        // the array is refCounted; decrement and release if 0
        // (the reference counter is stored directly in front of the array)
        u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
        if(umtx_atomic_dec(pRefCount) == 0) {
          if(pBufferToDelete == 0) {
              // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
              // is defined as volatile. (Volatile has useful non-standard behavior
              //   with this compiler.)
            uprv_free((void *)pRefCount);
          } else {
            // the caller requested to delete it himself
            *pBufferToDelete = (int32_t *)pRefCount;
          }
        }
      }
    } else {
      // not enough memory for growCapacity and not even for the smaller newCapacity
      // reset the old values for setToBogus() to release the array
      if(!(flags&kUsingStackBuffer)) {
        fUnion.fFields.fArray = oldArray;
      }
      fFlags = flags;
      setToBogus();
      return FALSE;
    }
  }
  return TRUE;
}
   1742 
   1743 // UnicodeStringAppendable ------------------------------------------------- ***
   1744 
   1745 UnicodeStringAppendable::~UnicodeStringAppendable() {}
   1746 
   1747 UBool
   1748 UnicodeStringAppendable::appendCodeUnit(UChar c) {
   1749   return str.doReplace(str.length(), 0, &c, 0, 1).isWritable();
   1750 }
   1751 
   1752 UBool
   1753 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
   1754   UChar buffer[U16_MAX_LENGTH];
   1755   int32_t cLength = 0;
   1756   UBool isError = FALSE;
   1757   U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
   1758   return !isError && str.doReplace(str.length(), 0, buffer, 0, cLength).isWritable();
   1759 }
   1760 
   1761 UBool
   1762 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
   1763   return str.doReplace(str.length(), 0, s, 0, length).isWritable();
   1764 }
   1765 
   1766 UBool
   1767 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
   1768   return str.cloneArrayIfNeeded(str.length() + appendCapacity);
   1769 }
   1770 
   1771 UChar *
   1772 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
   1773                                          int32_t desiredCapacityHint,
   1774                                          UChar *scratch, int32_t scratchCapacity,
   1775                                          int32_t *resultCapacity) {
   1776   if(minCapacity < 1 || scratchCapacity < minCapacity) {
   1777     *resultCapacity = 0;
   1778     return NULL;
   1779   }
   1780   int32_t oldLength = str.length();
   1781   if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
   1782     *resultCapacity = str.getCapacity() - oldLength;
   1783     return str.getArrayStart() + oldLength;
   1784   }
   1785   *resultCapacity = scratchCapacity;
   1786   return scratch;
   1787 }
   1788 
   1789 U_NAMESPACE_END
   1790 
   1791 U_NAMESPACE_USE
   1792 
   1793 U_CAPI int32_t U_EXPORT2
   1794 uhash_hashUnicodeString(const UElement key) {
   1795     const UnicodeString *str = (const UnicodeString*) key.pointer;
   1796     return (str == NULL) ? 0 : str->hashCode();
   1797 }
   1798 
   1799 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
   1800 // does not depend on hashtable code.
   1801 U_CAPI UBool U_EXPORT2
   1802 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
   1803     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
   1804     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
   1805     if (str1 == str2) {
   1806         return TRUE;
   1807     }
   1808     if (str1 == NULL || str2 == NULL) {
   1809         return FALSE;
   1810     }
   1811     return *str1 == *str2;
   1812 }
   1813 
   1814 #ifdef U_STATIC_IMPLEMENTATION
   1815 /*
   1816 This should never be called. It is defined here to make sure that the
   1817 virtual vector deleting destructor is defined within unistr.cpp.
   1818 The vector deleting destructor is already a part of UObject,
   1819 but defining it here makes sure that it is included with this object file.
   1820 This makes sure that static library dependencies are kept to a minimum.
   1821 */
   1822 static void uprv_UnicodeStringDummy(void) {
   1823     delete [] (new UnicodeString[2]);
   1824 }
   1825 #endif
   1826