Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 * Copyright (C) 1999-2012, International Business Machines Corporation and
      4 * others. All Rights Reserved.
      5 ******************************************************************************
      6 *
      7 * File unistr.cpp
      8 *
      9 * Modification History:
     10 *
     11 *   Date        Name        Description
     12 *   09/25/98    stephen     Creation.
     13 *   04/20/99    stephen     Overhauled per 4/16 code review.
     14 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
     15 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
     16 *                           Replaceable.
     17 *   06/25/01    grhoten     Removed the dependency on iostream
     18 ******************************************************************************
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 #include "unicode/appendable.h"
     23 #include "unicode/putil.h"
     24 #include "cstring.h"
     25 #include "cmemory.h"
     26 #include "unicode/ustring.h"
     27 #include "unicode/unistr.h"
     28 #include "unicode/utf.h"
     29 #include "unicode/utf16.h"
     30 #include "uelement.h"
     31 #include "ustr_imp.h"
     32 #include "umutex.h"
     33 #include "uassert.h"
     34 
     35 #if 0
     36 
     37 #include <iostream>
     38 using namespace std;
     39 
     40 //DEBUGGING
     41 void
     42 print(const UnicodeString& s,
     43       const char *name)
     44 {
     45   UChar c;
     46   cout << name << ":|";
     47   for(int i = 0; i < s.length(); ++i) {
     48     c = s[i];
     49     if(c>= 0x007E || c < 0x0020)
     50       cout << "[0x" << hex << s[i] << "]";
     51     else
     52       cout << (char) s[i];
     53   }
     54   cout << '|' << endl;
     55 }
     56 
     57 void
     58 print(const UChar *s,
     59       int32_t len,
     60       const char *name)
     61 {
     62   UChar c;
     63   cout << name << ":|";
     64   for(int i = 0; i < len; ++i) {
     65     c = s[i];
     66     if(c>= 0x007E || c < 0x0020)
     67       cout << "[0x" << hex << s[i] << "]";
     68     else
     69       cout << (char) s[i];
     70   }
     71   cout << '|' << endl;
     72 }
     73 // END DEBUGGING
     74 #endif
     75 
     76 // Local function definitions for now
     77 
     78 // need to copy areas that may overlap
     79 static
     80 inline void
     81 us_arrayCopy(const UChar *src, int32_t srcStart,
     82          UChar *dst, int32_t dstStart, int32_t count)
     83 {
     84   if(count>0) {
     85     uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
     86   }
     87 }
     88 
     89 // u_unescapeAt() callback to get a UChar from a UnicodeString
     90 U_CDECL_BEGIN
     91 static UChar U_CALLCONV
     92 UnicodeString_charAt(int32_t offset, void *context) {
     93     return ((icu::UnicodeString*) context)->charAt(offset);
     94 }
     95 U_CDECL_END
     96 
     97 U_NAMESPACE_BEGIN
     98 
     99 /* The Replaceable virtual destructor can't be defined in the header
    100    due to how AIX works with multiple definitions of virtual functions.
    101 */
    102 Replaceable::~Replaceable() {}
    103 Replaceable::Replaceable() {}
    104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
    105 
    106 UnicodeString U_EXPORT2
    107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
    108     return
    109         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
    110             append(s1).
    111                 append(s2);
    112 }
    113 
    114 //========================================
    115 // Reference Counting functions, put at top of file so that optimizing compilers
    116 //                               have a chance to automatically inline.
    117 //========================================
    118 
    119 void
    120 UnicodeString::addRef()
    121 {  umtx_atomic_inc((int32_t *)fUnion.fFields.fArray - 1);}
    122 
    123 int32_t
    124 UnicodeString::removeRef()
    125 { return umtx_atomic_dec((int32_t *)fUnion.fFields.fArray - 1);}
    126 
    127 int32_t
    128 UnicodeString::refCount() const
    129 {
    130     umtx_lock(NULL);
    131     // Note: without the lock to force a memory barrier, we might see a very
    132     //       stale value on some multi-processor systems.
    133     int32_t  count = *((int32_t *)fUnion.fFields.fArray - 1);
    134     umtx_unlock(NULL);
    135     return count;
    136  }
    137 
    138 void
    139 UnicodeString::releaseArray() {
    140   if((fFlags & kRefCounted) && removeRef() == 0) {
    141     uprv_free((int32_t *)fUnion.fFields.fArray - 1);
    142   }
    143 }
    144 
    145 
    146 
    147 //========================================
    148 // Constructors
    149 //========================================
    150 UnicodeString::UnicodeString()
    151   : fShortLength(0),
    152     fFlags(kShortString)
    153 {}
    154 
    155 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count)
    156   : fShortLength(0),
    157     fFlags(0)
    158 {
    159   if(count <= 0 || (uint32_t)c > 0x10ffff) {
    160     // just allocate and do not do anything else
    161     allocate(capacity);
    162   } else {
    163     // count > 0, allocate and fill the new string with count c's
    164     int32_t unitCount = U16_LENGTH(c), length = count * unitCount;
    165     if(capacity < length) {
    166       capacity = length;
    167     }
    168     if(allocate(capacity)) {
    169       UChar *array = getArrayStart();
    170       int32_t i = 0;
    171 
    172       // fill the new string with c
    173       if(unitCount == 1) {
    174         // fill with length UChars
    175         while(i < length) {
    176           array[i++] = (UChar)c;
    177         }
    178       } else {
    179         // get the code units for c
    180         UChar units[U16_MAX_LENGTH];
    181         U16_APPEND_UNSAFE(units, i, c);
    182 
    183         // now it must be i==unitCount
    184         i = 0;
    185 
    186         // for Unicode, unitCount can only be 1, 2, 3, or 4
    187         // 1 is handled above
    188         while(i < length) {
    189           int32_t unitIdx = 0;
    190           while(unitIdx < unitCount) {
    191             array[i++]=units[unitIdx++];
    192           }
    193         }
    194       }
    195     }
    196     setLength(length);
    197   }
    198 }
    199 
    200 UnicodeString::UnicodeString(UChar ch)
    201   : fShortLength(1),
    202     fFlags(kShortString)
    203 {
    204   fUnion.fStackBuffer[0] = ch;
    205 }
    206 
    207 UnicodeString::UnicodeString(UChar32 ch)
    208   : fShortLength(0),
    209     fFlags(kShortString)
    210 {
    211   int32_t i = 0;
    212   UBool isError = FALSE;
    213   U16_APPEND(fUnion.fStackBuffer, i, US_STACKBUF_SIZE, ch, isError);
    214   // We test isError so that the compiler does not complain that we don't.
    215   // If isError then i==0 which is what we want anyway.
    216   if(!isError) {
    217     fShortLength = (int8_t)i;
    218   }
    219 }
    220 
    221 UnicodeString::UnicodeString(const UChar *text)
    222   : fShortLength(0),
    223     fFlags(kShortString)
    224 {
    225   doReplace(0, 0, text, 0, -1);
    226 }
    227 
    228 UnicodeString::UnicodeString(const UChar *text,
    229                              int32_t textLength)
    230   : fShortLength(0),
    231     fFlags(kShortString)
    232 {
    233   doReplace(0, 0, text, 0, textLength);
    234 }
    235 
    236 UnicodeString::UnicodeString(UBool isTerminated,
    237                              const UChar *text,
    238                              int32_t textLength)
    239   : fShortLength(0),
    240     fFlags(kReadonlyAlias)
    241 {
    242   if(text == NULL) {
    243     // treat as an empty string, do not alias
    244     setToEmpty();
    245   } else if(textLength < -1 ||
    246             (textLength == -1 && !isTerminated) ||
    247             (textLength >= 0 && isTerminated && text[textLength] != 0)
    248   ) {
    249     setToBogus();
    250   } else {
    251     if(textLength == -1) {
    252       // text is terminated, or else it would have failed the above test
    253       textLength = u_strlen(text);
    254     }
    255     setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
    256   }
    257 }
    258 
    259 UnicodeString::UnicodeString(UChar *buff,
    260                              int32_t buffLength,
    261                              int32_t buffCapacity)
    262   : fShortLength(0),
    263     fFlags(kWritableAlias)
    264 {
    265   if(buff == NULL) {
    266     // treat as an empty string, do not alias
    267     setToEmpty();
    268   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
    269     setToBogus();
    270   } else {
    271     if(buffLength == -1) {
    272       // fLength = u_strlen(buff); but do not look beyond buffCapacity
    273       const UChar *p = buff, *limit = buff + buffCapacity;
    274       while(p != limit && *p != 0) {
    275         ++p;
    276       }
    277       buffLength = (int32_t)(p - buff);
    278     }
    279     setArray(buff, buffLength, buffCapacity);
    280   }
    281 }
    282 
    283 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant)
    284   : fShortLength(0),
    285     fFlags(kShortString)
    286 {
    287   if(src==NULL) {
    288     // treat as an empty string
    289   } else {
    290     if(length<0) {
    291       length=(int32_t)uprv_strlen(src);
    292     }
    293     if(cloneArrayIfNeeded(length, length, FALSE)) {
    294       u_charsToUChars(src, getArrayStart(), length);
    295       setLength(length);
    296     } else {
    297       setToBogus();
    298     }
    299   }
    300 }
    301 
    302 #if U_CHARSET_IS_UTF8
    303 
    304 UnicodeString::UnicodeString(const char *codepageData)
    305   : fShortLength(0),
    306     fFlags(kShortString) {
    307   if(codepageData != 0) {
    308     setToUTF8(codepageData);
    309   }
    310 }
    311 
    312 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength)
    313   : fShortLength(0),
    314     fFlags(kShortString) {
    315   // if there's nothing to convert, do nothing
    316   if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
    317     return;
    318   }
    319   if(dataLength == -1) {
    320     dataLength = (int32_t)uprv_strlen(codepageData);
    321   }
    322   setToUTF8(StringPiece(codepageData, dataLength));
    323 }
    324 
    325 // else see unistr_cnv.cpp
    326 #endif
    327 
    328 UnicodeString::UnicodeString(const UnicodeString& that)
    329   : Replaceable(),
    330     fShortLength(0),
    331     fFlags(kShortString)
    332 {
    333   copyFrom(that);
    334 }
    335 
    336 UnicodeString::UnicodeString(const UnicodeString& that,
    337                              int32_t srcStart)
    338   : Replaceable(),
    339     fShortLength(0),
    340     fFlags(kShortString)
    341 {
    342   setTo(that, srcStart);
    343 }
    344 
    345 UnicodeString::UnicodeString(const UnicodeString& that,
    346                              int32_t srcStart,
    347                              int32_t srcLength)
    348   : Replaceable(),
    349     fShortLength(0),
    350     fFlags(kShortString)
    351 {
    352   setTo(that, srcStart, srcLength);
    353 }
    354 
    355 // Replaceable base class clone() default implementation, does not clone
    356 Replaceable *
    357 Replaceable::clone() const {
    358   return NULL;
    359 }
    360 
    361 // UnicodeString overrides clone() with a real implementation
    362 Replaceable *
    363 UnicodeString::clone() const {
    364   return new UnicodeString(*this);
    365 }
    366 
    367 //========================================
    368 // array allocation
    369 //========================================
    370 
    371 UBool
    372 UnicodeString::allocate(int32_t capacity) {
    373   if(capacity <= US_STACKBUF_SIZE) {
    374     fFlags = kShortString;
    375   } else {
    376     // count bytes for the refCounter and the string capacity, and
    377     // round up to a multiple of 16; then divide by 4 and allocate int32_t's
    378     // to be safely aligned for the refCount
    379     // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
    380     int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
    381     int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
    382     if(array != 0) {
    383       // set initial refCount and point behind the refCount
    384       *array++ = 1;
    385 
    386       // have fArray point to the first UChar
    387       fUnion.fFields.fArray = (UChar *)array;
    388       fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
    389       fFlags = kLongString;
    390     } else {
    391       fShortLength = 0;
    392       fUnion.fFields.fArray = 0;
    393       fUnion.fFields.fCapacity = 0;
    394       fFlags = kIsBogus;
    395       return FALSE;
    396     }
    397   }
    398   return TRUE;
    399 }
    400 
    401 //========================================
    402 // Destructor
    403 //========================================
    404 UnicodeString::~UnicodeString()
    405 {
    406   releaseArray();
    407 }
    408 
    409 //========================================
    410 // Factory methods
    411 //========================================
    412 
    413 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
    414   UnicodeString result;
    415   result.setToUTF8(utf8);
    416   return result;
    417 }
    418 
    419 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
    420   UnicodeString result;
    421   int32_t capacity;
    422   // Most UTF-32 strings will be BMP-only and result in a same-length
    423   // UTF-16 string. We overestimate the capacity just slightly,
    424   // just in case there are a few supplementary characters.
    425   if(length <= US_STACKBUF_SIZE) {
    426     capacity = US_STACKBUF_SIZE;
    427   } else {
    428     capacity = length + (length >> 4) + 4;
    429   }
    430   do {
    431     UChar *utf16 = result.getBuffer(capacity);
    432     int32_t length16;
    433     UErrorCode errorCode = U_ZERO_ERROR;
    434     u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
    435         utf32, length,
    436         0xfffd,  // Substitution character.
    437         NULL,    // Don't care about number of substitutions.
    438         &errorCode);
    439     result.releaseBuffer(length16);
    440     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
    441       capacity = length16 + 1;  // +1 for the terminating NUL.
    442       continue;
    443     } else if(U_FAILURE(errorCode)) {
    444       result.setToBogus();
    445     }
    446     break;
    447   } while(TRUE);
    448   return result;
    449 }
    450 
    451 //========================================
    452 // Assignment
    453 //========================================
    454 
    455 UnicodeString &
    456 UnicodeString::operator=(const UnicodeString &src) {
    457   return copyFrom(src);
    458 }
    459 
    460 UnicodeString &
    461 UnicodeString::fastCopyFrom(const UnicodeString &src) {
    462   return copyFrom(src, TRUE);
    463 }
    464 
    465 UnicodeString &
    466 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
    467   // if assigning to ourselves, do nothing
    468   if(this == 0 || this == &src) {
    469     return *this;
    470   }
    471 
    472   // is the right side bogus?
    473   if(&src == 0 || src.isBogus()) {
    474     setToBogus();
    475     return *this;
    476   }
    477 
    478   // delete the current contents
    479   releaseArray();
    480 
    481   if(src.isEmpty()) {
    482     // empty string - use the stack buffer
    483     setToEmpty();
    484     return *this;
    485   }
    486 
    487   // we always copy the length
    488   int32_t srcLength = src.length();
    489   setLength(srcLength);
    490 
    491   // fLength>0 and not an "open" src.getBuffer(minCapacity)
    492   switch(src.fFlags) {
    493   case kShortString:
    494     // short string using the stack buffer, do the same
    495     fFlags = kShortString;
    496     uprv_memcpy(fUnion.fStackBuffer, src.fUnion.fStackBuffer, srcLength * U_SIZEOF_UCHAR);
    497     break;
    498   case kLongString:
    499     // src uses a refCounted string buffer, use that buffer with refCount
    500     // src is const, use a cast - we don't really change it
    501     ((UnicodeString &)src).addRef();
    502     // copy all fields, share the reference-counted buffer
    503     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
    504     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
    505     fFlags = src.fFlags;
    506     break;
    507   case kReadonlyAlias:
    508     if(fastCopy) {
    509       // src is a readonly alias, do the same
    510       // -> maintain the readonly alias as such
    511       fUnion.fFields.fArray = src.fUnion.fFields.fArray;
    512       fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
    513       fFlags = src.fFlags;
    514       break;
    515     }
    516     // else if(!fastCopy) fall through to case kWritableAlias
    517     // -> allocate a new buffer and copy the contents
    518   case kWritableAlias:
    519     // src is a writable alias; we make a copy of that instead
    520     if(allocate(srcLength)) {
    521       uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR);
    522       break;
    523     }
    524     // if there is not enough memory, then fall through to setting to bogus
    525   default:
    526     // if src is bogus, set ourselves to bogus
    527     // do not call setToBogus() here because fArray and fFlags are not consistent here
    528     fShortLength = 0;
    529     fUnion.fFields.fArray = 0;
    530     fUnion.fFields.fCapacity = 0;
    531     fFlags = kIsBogus;
    532     break;
    533   }
    534 
    535   return *this;
    536 }
    537 
    538 //========================================
    539 // Miscellaneous operations
    540 //========================================
    541 
    542 UnicodeString UnicodeString::unescape() const {
    543     UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
    544     const UChar *array = getBuffer();
    545     int32_t len = length();
    546     int32_t prev = 0;
    547     for (int32_t i=0;;) {
    548         if (i == len) {
    549             result.append(array, prev, len - prev);
    550             break;
    551         }
    552         if (array[i++] == 0x5C /*'\\'*/) {
    553             result.append(array, prev, (i - 1) - prev);
    554             UChar32 c = unescapeAt(i); // advances i
    555             if (c < 0) {
    556                 result.remove(); // return empty string
    557                 break; // invalid escape sequence
    558             }
    559             result.append(c);
    560             prev = i;
    561         }
    562     }
    563     return result;
    564 }
    565 
    566 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
    567     return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
    568 }
    569 
    570 //========================================
    571 // Read-only implementation
    572 //========================================
    573 UBool
    574 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
    575   // Requires: this & text not bogus and have same lengths.
    576   // Byte-wise comparison works for equality regardless of endianness.
    577   return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
    578 }
    579 
    580 int8_t
    581 UnicodeString::doCompare( int32_t start,
    582               int32_t length,
    583               const UChar *srcChars,
    584               int32_t srcStart,
    585               int32_t srcLength) const
    586 {
    587   // compare illegal string values
    588   if(isBogus()) {
    589     return -1;
    590   }
    591 
    592   // pin indices to legal values
    593   pinIndices(start, length);
    594 
    595   if(srcChars == NULL) {
    596     // treat const UChar *srcChars==NULL as an empty string
    597     return length == 0 ? 0 : 1;
    598   }
    599 
    600   // get the correct pointer
    601   const UChar *chars = getArrayStart();
    602 
    603   chars += start;
    604   srcChars += srcStart;
    605 
    606   int32_t minLength;
    607   int8_t lengthResult;
    608 
    609   // get the srcLength if necessary
    610   if(srcLength < 0) {
    611     srcLength = u_strlen(srcChars + srcStart);
    612   }
    613 
    614   // are we comparing different lengths?
    615   if(length != srcLength) {
    616     if(length < srcLength) {
    617       minLength = length;
    618       lengthResult = -1;
    619     } else {
    620       minLength = srcLength;
    621       lengthResult = 1;
    622     }
    623   } else {
    624     minLength = length;
    625     lengthResult = 0;
    626   }
    627 
    628   /*
    629    * note that uprv_memcmp() returns an int but we return an int8_t;
    630    * we need to take care not to truncate the result -
    631    * one way to do this is to right-shift the value to
    632    * move the sign bit into the lower 8 bits and making sure that this
    633    * does not become 0 itself
    634    */
    635 
    636   if(minLength > 0 && chars != srcChars) {
    637     int32_t result;
    638 
    639 #   if U_IS_BIG_ENDIAN
    640       // big-endian: byte comparison works
    641       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
    642       if(result != 0) {
    643         return (int8_t)(result >> 15 | 1);
    644       }
    645 #   else
    646       // little-endian: compare UChar units
    647       do {
    648         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
    649         if(result != 0) {
    650           return (int8_t)(result >> 15 | 1);
    651         }
    652       } while(--minLength > 0);
    653 #   endif
    654   }
    655   return lengthResult;
    656 }
    657 
    658 /* String compare in code point order - doCompare() compares in code unit order. */
    659 int8_t
    660 UnicodeString::doCompareCodePointOrder(int32_t start,
    661                                        int32_t length,
    662                                        const UChar *srcChars,
    663                                        int32_t srcStart,
    664                                        int32_t srcLength) const
    665 {
    666   // compare illegal string values
    667   // treat const UChar *srcChars==NULL as an empty string
    668   if(isBogus()) {
    669     return -1;
    670   }
    671 
    672   // pin indices to legal values
    673   pinIndices(start, length);
    674 
    675   if(srcChars == NULL) {
    676     srcStart = srcLength = 0;
    677   }
    678 
    679   int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
    680   /* translate the 32-bit result into an 8-bit one */
    681   if(diff!=0) {
    682     return (int8_t)(diff >> 15 | 1);
    683   } else {
    684     return 0;
    685   }
    686 }
    687 
    688 int32_t
    689 UnicodeString::getLength() const {
    690     return length();
    691 }
    692 
    693 UChar
    694 UnicodeString::getCharAt(int32_t offset) const {
    695   return charAt(offset);
    696 }
    697 
    698 UChar32
    699 UnicodeString::getChar32At(int32_t offset) const {
    700   return char32At(offset);
    701 }
    702 
    703 UChar32
    704 UnicodeString::char32At(int32_t offset) const
    705 {
    706   int32_t len = length();
    707   if((uint32_t)offset < (uint32_t)len) {
    708     const UChar *array = getArrayStart();
    709     UChar32 c;
    710     U16_GET(array, 0, offset, len, c);
    711     return c;
    712   } else {
    713     return kInvalidUChar;
    714   }
    715 }
    716 
    717 int32_t
    718 UnicodeString::getChar32Start(int32_t offset) const {
    719   if((uint32_t)offset < (uint32_t)length()) {
    720     const UChar *array = getArrayStart();
    721     U16_SET_CP_START(array, 0, offset);
    722     return offset;
    723   } else {
    724     return 0;
    725   }
    726 }
    727 
    728 int32_t
    729 UnicodeString::getChar32Limit(int32_t offset) const {
    730   int32_t len = length();
    731   if((uint32_t)offset < (uint32_t)len) {
    732     const UChar *array = getArrayStart();
    733     U16_SET_CP_LIMIT(array, 0, offset, len);
    734     return offset;
    735   } else {
    736     return len;
    737   }
    738 }
    739 
    740 int32_t
    741 UnicodeString::countChar32(int32_t start, int32_t length) const {
    742   pinIndices(start, length);
    743   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
    744   return u_countChar32(getArrayStart()+start, length);
    745 }
    746 
    747 UBool
    748 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
    749   pinIndices(start, length);
    750   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
    751   return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
    752 }
    753 
    754 int32_t
    755 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
    756   // pin index
    757   int32_t len = length();
    758   if(index<0) {
    759     index=0;
    760   } else if(index>len) {
    761     index=len;
    762   }
    763 
    764   const UChar *array = getArrayStart();
    765   if(delta>0) {
    766     U16_FWD_N(array, index, len, delta);
    767   } else {
    768     U16_BACK_N(array, 0, index, -delta);
    769   }
    770 
    771   return index;
    772 }
    773 
    774 void
    775 UnicodeString::doExtract(int32_t start,
    776              int32_t length,
    777              UChar *dst,
    778              int32_t dstStart) const
    779 {
    780   // pin indices to legal values
    781   pinIndices(start, length);
    782 
    783   // do not copy anything if we alias dst itself
    784   const UChar *array = getArrayStart();
    785   if(array + start != dst + dstStart) {
    786     us_arrayCopy(array, start, dst, dstStart, length);
    787   }
    788 }
    789 
    790 int32_t
    791 UnicodeString::extract(UChar *dest, int32_t destCapacity,
    792                        UErrorCode &errorCode) const {
    793   int32_t len = length();
    794   if(U_SUCCESS(errorCode)) {
    795     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
    796       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    797     } else {
    798       const UChar *array = getArrayStart();
    799       if(len>0 && len<=destCapacity && array!=dest) {
    800         uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR);
    801       }
    802       return u_terminateUChars(dest, destCapacity, len, &errorCode);
    803     }
    804   }
    805 
    806   return len;
    807 }
    808 
    809 int32_t
    810 UnicodeString::extract(int32_t start,
    811                        int32_t length,
    812                        char *target,
    813                        int32_t targetCapacity,
    814                        enum EInvariant) const
    815 {
    816   // if the arguments are illegal, then do nothing
    817   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
    818     return 0;
    819   }
    820 
    821   // pin the indices to legal values
    822   pinIndices(start, length);
    823 
    824   if(length <= targetCapacity) {
    825     u_UCharsToChars(getArrayStart() + start, target, length);
    826   }
    827   UErrorCode status = U_ZERO_ERROR;
    828   return u_terminateChars(target, targetCapacity, length, &status);
    829 }
    830 
    831 UnicodeString
    832 UnicodeString::tempSubString(int32_t start, int32_t len) const {
    833   pinIndices(start, len);
    834   const UChar *array = getBuffer();  // not getArrayStart() to check kIsBogus & kOpenGetBuffer
    835   if(array==NULL) {
    836     array=fUnion.fStackBuffer;  // anything not NULL because that would make an empty string
    837     len=-2;  // bogus result string
    838   }
    839   return UnicodeString(FALSE, array + start, len);
    840 }
    841 
    842 int32_t
    843 UnicodeString::toUTF8(int32_t start, int32_t len,
    844                       char *target, int32_t capacity) const {
    845   pinIndices(start, len);
    846   int32_t length8;
    847   UErrorCode errorCode = U_ZERO_ERROR;
    848   u_strToUTF8WithSub(target, capacity, &length8,
    849                      getBuffer() + start, len,
    850                      0xFFFD,  // Standard substitution character.
    851                      NULL,    // Don't care about number of substitutions.
    852                      &errorCode);
    853   return length8;
    854 }
    855 
    856 #if U_CHARSET_IS_UTF8
    857 
    858 int32_t
    859 UnicodeString::extract(int32_t start, int32_t len,
    860                        char *target, uint32_t dstSize) const {
    861   // if the arguments are illegal, then do nothing
    862   if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
    863     return 0;
    864   }
    865   return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
    866 }
    867 
    868 // else see unistr_cnv.cpp
    869 #endif
    870 
    871 void
    872 UnicodeString::extractBetween(int32_t start,
    873                   int32_t limit,
    874                   UnicodeString& target) const {
    875   pinIndex(start);
    876   pinIndex(limit);
    877   doExtract(start, limit - start, target);
    878 }
    879 
    880 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
    881 // as many bytes as the source has UChars.
    882 // The "worst cases" are writing systems like Indic, Thai and CJK with
    883 // 3:1 bytes:UChars.
    884 void
    885 UnicodeString::toUTF8(ByteSink &sink) const {
    886   int32_t length16 = length();
    887   if(length16 != 0) {
    888     char stackBuffer[1024];
    889     int32_t capacity = (int32_t)sizeof(stackBuffer);
    890     UBool utf8IsOwned = FALSE;
    891     char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
    892                                       3*length16,
    893                                       stackBuffer, capacity,
    894                                       &capacity);
    895     int32_t length8 = 0;
    896     UErrorCode errorCode = U_ZERO_ERROR;
    897     u_strToUTF8WithSub(utf8, capacity, &length8,
    898                        getBuffer(), length16,
    899                        0xFFFD,  // Standard substitution character.
    900                        NULL,    // Don't care about number of substitutions.
    901                        &errorCode);
    902     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
    903       utf8 = (char *)uprv_malloc(length8);
    904       if(utf8 != NULL) {
    905         utf8IsOwned = TRUE;
    906         errorCode = U_ZERO_ERROR;
    907         u_strToUTF8WithSub(utf8, length8, &length8,
    908                            getBuffer(), length16,
    909                            0xFFFD,  // Standard substitution character.
    910                            NULL,    // Don't care about number of substitutions.
    911                            &errorCode);
    912       } else {
    913         errorCode = U_MEMORY_ALLOCATION_ERROR;
    914       }
    915     }
    916     if(U_SUCCESS(errorCode)) {
    917       sink.Append(utf8, length8);
    918       sink.Flush();
    919     }
    920     if(utf8IsOwned) {
    921       uprv_free(utf8);
    922     }
    923   }
    924 }
    925 
    926 int32_t
    927 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
    928   int32_t length32=0;
    929   if(U_SUCCESS(errorCode)) {
    930     // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
    931     u_strToUTF32WithSub(utf32, capacity, &length32,
    932         getBuffer(), length(),
    933         0xfffd,  // Substitution character.
    934         NULL,    // Don't care about number of substitutions.
    935         &errorCode);
    936   }
    937   return length32;
    938 }
    939 
    940 int32_t
    941 UnicodeString::indexOf(const UChar *srcChars,
    942                int32_t srcStart,
    943                int32_t srcLength,
    944                int32_t start,
    945                int32_t length) const
    946 {
    947   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
    948     return -1;
    949   }
    950 
    951   // UnicodeString does not find empty substrings
    952   if(srcLength < 0 && srcChars[srcStart] == 0) {
    953     return -1;
    954   }
    955 
    956   // get the indices within bounds
    957   pinIndices(start, length);
    958 
    959   // find the first occurrence of the substring
    960   const UChar *array = getArrayStart();
    961   const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
    962   if(match == NULL) {
    963     return -1;
    964   } else {
    965     return (int32_t)(match - array);
    966   }
    967 }
    968 
    969 int32_t
    970 UnicodeString::doIndexOf(UChar c,
    971              int32_t start,
    972              int32_t length) const
    973 {
    974   // pin indices
    975   pinIndices(start, length);
    976 
    977   // find the first occurrence of c
    978   const UChar *array = getArrayStart();
    979   const UChar *match = u_memchr(array + start, c, length);
    980   if(match == NULL) {
    981     return -1;
    982   } else {
    983     return (int32_t)(match - array);
    984   }
    985 }
    986 
    987 int32_t
    988 UnicodeString::doIndexOf(UChar32 c,
    989                          int32_t start,
    990                          int32_t length) const {
    991   // pin indices
    992   pinIndices(start, length);
    993 
    994   // find the first occurrence of c
    995   const UChar *array = getArrayStart();
    996   const UChar *match = u_memchr32(array + start, c, length);
    997   if(match == NULL) {
    998     return -1;
    999   } else {
   1000     return (int32_t)(match - array);
   1001   }
   1002 }
   1003 
   1004 int32_t
   1005 UnicodeString::lastIndexOf(const UChar *srcChars,
   1006                int32_t srcStart,
   1007                int32_t srcLength,
   1008                int32_t start,
   1009                int32_t length) const
   1010 {
   1011   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
   1012     return -1;
   1013   }
   1014 
   1015   // UnicodeString does not find empty substrings
   1016   if(srcLength < 0 && srcChars[srcStart] == 0) {
   1017     return -1;
   1018   }
   1019 
   1020   // get the indices within bounds
   1021   pinIndices(start, length);
   1022 
   1023   // find the last occurrence of the substring
   1024   const UChar *array = getArrayStart();
   1025   const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
   1026   if(match == NULL) {
   1027     return -1;
   1028   } else {
   1029     return (int32_t)(match - array);
   1030   }
   1031 }
   1032 
   1033 int32_t
   1034 UnicodeString::doLastIndexOf(UChar c,
   1035                  int32_t start,
   1036                  int32_t length) const
   1037 {
   1038   if(isBogus()) {
   1039     return -1;
   1040   }
   1041 
   1042   // pin indices
   1043   pinIndices(start, length);
   1044 
   1045   // find the last occurrence of c
   1046   const UChar *array = getArrayStart();
   1047   const UChar *match = u_memrchr(array + start, c, length);
   1048   if(match == NULL) {
   1049     return -1;
   1050   } else {
   1051     return (int32_t)(match - array);
   1052   }
   1053 }
   1054 
   1055 int32_t
   1056 UnicodeString::doLastIndexOf(UChar32 c,
   1057                              int32_t start,
   1058                              int32_t length) const {
   1059   // pin indices
   1060   pinIndices(start, length);
   1061 
   1062   // find the last occurrence of c
   1063   const UChar *array = getArrayStart();
   1064   const UChar *match = u_memrchr32(array + start, c, length);
   1065   if(match == NULL) {
   1066     return -1;
   1067   } else {
   1068     return (int32_t)(match - array);
   1069   }
   1070 }
   1071 
   1072 //========================================
   1073 // Write implementation
   1074 //========================================
   1075 
   1076 UnicodeString&
   1077 UnicodeString::findAndReplace(int32_t start,
   1078                   int32_t length,
   1079                   const UnicodeString& oldText,
   1080                   int32_t oldStart,
   1081                   int32_t oldLength,
   1082                   const UnicodeString& newText,
   1083                   int32_t newStart,
   1084                   int32_t newLength)
   1085 {
   1086   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
   1087     return *this;
   1088   }
   1089 
   1090   pinIndices(start, length);
   1091   oldText.pinIndices(oldStart, oldLength);
   1092   newText.pinIndices(newStart, newLength);
   1093 
   1094   if(oldLength == 0) {
   1095     return *this;
   1096   }
   1097 
   1098   while(length > 0 && length >= oldLength) {
   1099     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
   1100     if(pos < 0) {
   1101       // no more oldText's here: done
   1102       break;
   1103     } else {
   1104       // we found oldText, replace it by newText and go beyond it
   1105       replace(pos, oldLength, newText, newStart, newLength);
   1106       length -= pos + oldLength - start;
   1107       start = pos + newLength;
   1108     }
   1109   }
   1110 
   1111   return *this;
   1112 }
   1113 
   1114 
   1115 void
   1116 UnicodeString::setToBogus()
   1117 {
   1118   releaseArray();
   1119 
   1120   fShortLength = 0;
   1121   fUnion.fFields.fArray = 0;
   1122   fUnion.fFields.fCapacity = 0;
   1123   fFlags = kIsBogus;
   1124 }
   1125 
   1126 // turn a bogus string into an empty one
   1127 void
   1128 UnicodeString::unBogus() {
   1129   if(fFlags & kIsBogus) {
   1130     setToEmpty();
   1131   }
   1132 }
   1133 
   1134 // setTo() analogous to the readonly-aliasing constructor with the same signature
   1135 UnicodeString &
   1136 UnicodeString::setTo(UBool isTerminated,
   1137                      const UChar *text,
   1138                      int32_t textLength)
   1139 {
   1140   if(fFlags & kOpenGetBuffer) {
   1141     // do not modify a string that has an "open" getBuffer(minCapacity)
   1142     return *this;
   1143   }
   1144 
   1145   if(text == NULL) {
   1146     // treat as an empty string, do not alias
   1147     releaseArray();
   1148     setToEmpty();
   1149     return *this;
   1150   }
   1151 
   1152   if( textLength < -1 ||
   1153       (textLength == -1 && !isTerminated) ||
   1154       (textLength >= 0 && isTerminated && text[textLength] != 0)
   1155   ) {
   1156     setToBogus();
   1157     return *this;
   1158   }
   1159 
   1160   releaseArray();
   1161 
   1162   if(textLength == -1) {
   1163     // text is terminated, or else it would have failed the above test
   1164     textLength = u_strlen(text);
   1165   }
   1166   setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
   1167 
   1168   fFlags = kReadonlyAlias;
   1169   return *this;
   1170 }
   1171 
   1172 // setTo() analogous to the writable-aliasing constructor with the same signature
   1173 UnicodeString &
   1174 UnicodeString::setTo(UChar *buffer,
   1175                      int32_t buffLength,
   1176                      int32_t buffCapacity) {
   1177   if(fFlags & kOpenGetBuffer) {
   1178     // do not modify a string that has an "open" getBuffer(minCapacity)
   1179     return *this;
   1180   }
   1181 
   1182   if(buffer == NULL) {
   1183     // treat as an empty string, do not alias
   1184     releaseArray();
   1185     setToEmpty();
   1186     return *this;
   1187   }
   1188 
   1189   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
   1190     setToBogus();
   1191     return *this;
   1192   } else if(buffLength == -1) {
   1193     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
   1194     const UChar *p = buffer, *limit = buffer + buffCapacity;
   1195     while(p != limit && *p != 0) {
   1196       ++p;
   1197     }
   1198     buffLength = (int32_t)(p - buffer);
   1199   }
   1200 
   1201   releaseArray();
   1202 
   1203   setArray(buffer, buffLength, buffCapacity);
   1204   fFlags = kWritableAlias;
   1205   return *this;
   1206 }
   1207 
   1208 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) {
   1209   unBogus();
   1210   int32_t length = utf8.length();
   1211   int32_t capacity;
   1212   // The UTF-16 string will be at most as long as the UTF-8 string.
   1213   if(length <= US_STACKBUF_SIZE) {
   1214     capacity = US_STACKBUF_SIZE;
   1215   } else {
   1216     capacity = length + 1;  // +1 for the terminating NUL.
   1217   }
   1218   UChar *utf16 = getBuffer(capacity);
   1219   int32_t length16;
   1220   UErrorCode errorCode = U_ZERO_ERROR;
   1221   u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
   1222       utf8.data(), length,
   1223       0xfffd,  // Substitution character.
   1224       NULL,    // Don't care about number of substitutions.
   1225       &errorCode);
   1226   releaseBuffer(length16);
   1227   if(U_FAILURE(errorCode)) {
   1228     setToBogus();
   1229   }
   1230   return *this;
   1231 }
   1232 
   1233 UnicodeString&
   1234 UnicodeString::setCharAt(int32_t offset,
   1235              UChar c)
   1236 {
   1237   int32_t len = length();
   1238   if(cloneArrayIfNeeded() && len > 0) {
   1239     if(offset < 0) {
   1240       offset = 0;
   1241     } else if(offset >= len) {
   1242       offset = len - 1;
   1243     }
   1244 
   1245     getArrayStart()[offset] = c;
   1246   }
   1247   return *this;
   1248 }
   1249 
   1250 UnicodeString&
   1251 UnicodeString::replace(int32_t start,
   1252                int32_t _length,
   1253                UChar32 srcChar) {
   1254   UChar buffer[U16_MAX_LENGTH];
   1255   int32_t count = 0;
   1256   UBool isError = FALSE;
   1257   U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
   1258   // We test isError so that the compiler does not complain that we don't.
   1259   // If isError (srcChar is not a valid code point) then count==0 which means
   1260   // we remove the source segment rather than replacing it with srcChar.
   1261   return doReplace(start, _length, buffer, 0, isError ? 0 : count);
   1262 }
   1263 
   1264 UnicodeString&
   1265 UnicodeString::append(UChar32 srcChar) {
   1266   UChar buffer[U16_MAX_LENGTH];
   1267   int32_t _length = 0;
   1268   UBool isError = FALSE;
   1269   U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
   1270   // We test isError so that the compiler does not complain that we don't.
   1271   // If isError then _length==0 which turns the doReplace() into a no-op anyway.
   1272   return isError ? *this : doReplace(length(), 0, buffer, 0, _length);
   1273 }
   1274 
   1275 UnicodeString&
   1276 UnicodeString::doReplace( int32_t start,
   1277               int32_t length,
   1278               const UnicodeString& src,
   1279               int32_t srcStart,
   1280               int32_t srcLength)
   1281 {
   1282   if(!src.isBogus()) {
   1283     // pin the indices to legal values
   1284     src.pinIndices(srcStart, srcLength);
   1285 
   1286     // get the characters from src
   1287     // and replace the range in ourselves with them
   1288     return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
   1289   } else {
   1290     // remove the range
   1291     return doReplace(start, length, 0, 0, 0);
   1292   }
   1293 }
   1294 
   1295 UnicodeString&
   1296 UnicodeString::doReplace(int32_t start,
   1297              int32_t length,
   1298              const UChar *srcChars,
   1299              int32_t srcStart,
   1300              int32_t srcLength)
   1301 {
   1302   if(!isWritable()) {
   1303     return *this;
   1304   }
   1305 
   1306   int32_t oldLength = this->length();
   1307 
   1308   // optimize (read-only alias).remove(0, start) and .remove(start, end)
   1309   if((fFlags&kBufferIsReadonly) && srcLength == 0) {
   1310     if(start == 0) {
   1311       // remove prefix by adjusting the array pointer
   1312       pinIndex(length);
   1313       fUnion.fFields.fArray += length;
   1314       fUnion.fFields.fCapacity -= length;
   1315       setLength(oldLength - length);
   1316       return *this;
   1317     } else {
   1318       pinIndex(start);
   1319       if(length >= (oldLength - start)) {
   1320         // remove suffix by reducing the length (like truncate())
   1321         setLength(start);
   1322         fUnion.fFields.fCapacity = start;  // not NUL-terminated any more
   1323         return *this;
   1324       }
   1325     }
   1326   }
   1327 
   1328   if(srcChars == 0) {
   1329     srcStart = srcLength = 0;
   1330   } else if(srcLength < 0) {
   1331     // get the srcLength if necessary
   1332     srcLength = u_strlen(srcChars + srcStart);
   1333   }
   1334 
   1335   // calculate the size of the string after the replace
   1336   int32_t newLength;
   1337 
   1338   // optimize append() onto a large-enough, owned string
   1339   if(start >= oldLength) {
   1340     if(srcLength == 0) {
   1341       return *this;
   1342     }
   1343     newLength = oldLength + srcLength;
   1344     if(newLength <= getCapacity() && isBufferWritable()) {
   1345       UChar *oldArray = getArrayStart();
   1346       // Do not copy characters when
   1347       //   UChar *buffer=str.getAppendBuffer(...);
   1348       // is followed by
   1349       //   str.append(buffer, length);
   1350       // or
   1351       //   str.appendString(buffer, length)
   1352       // or similar.
   1353       if(srcChars + srcStart != oldArray + start || start > oldLength) {
   1354         us_arrayCopy(srcChars, srcStart, oldArray, oldLength, srcLength);
   1355       }
   1356       setLength(newLength);
   1357       return *this;
   1358     } else {
   1359       // pin the indices to legal values
   1360       start = oldLength;
   1361       length = 0;
   1362     }
   1363   } else {
   1364     // pin the indices to legal values
   1365     pinIndices(start, length);
   1366 
   1367     newLength = oldLength - length + srcLength;
   1368   }
   1369 
   1370   // the following may change fArray but will not copy the current contents;
   1371   // therefore we need to keep the current fArray
   1372   UChar oldStackBuffer[US_STACKBUF_SIZE];
   1373   UChar *oldArray;
   1374   if((fFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
   1375     // copy the stack buffer contents because it will be overwritten with
   1376     // fUnion.fFields values
   1377     u_memcpy(oldStackBuffer, fUnion.fStackBuffer, oldLength);
   1378     oldArray = oldStackBuffer;
   1379   } else {
   1380     oldArray = getArrayStart();
   1381   }
   1382 
   1383   // clone our array and allocate a bigger array if needed
   1384   int32_t *bufferToDelete = 0;
   1385   if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize,
   1386                          FALSE, &bufferToDelete)
   1387   ) {
   1388     return *this;
   1389   }
   1390 
   1391   // now do the replace
   1392 
   1393   UChar *newArray = getArrayStart();
   1394   if(newArray != oldArray) {
   1395     // if fArray changed, then we need to copy everything except what will change
   1396     us_arrayCopy(oldArray, 0, newArray, 0, start);
   1397     us_arrayCopy(oldArray, start + length,
   1398                  newArray, start + srcLength,
   1399                  oldLength - (start + length));
   1400   } else if(length != srcLength) {
   1401     // fArray did not change; copy only the portion that isn't changing, leaving a hole
   1402     us_arrayCopy(oldArray, start + length,
   1403                  newArray, start + srcLength,
   1404                  oldLength - (start + length));
   1405   }
   1406 
   1407   // now fill in the hole with the new string
   1408   us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
   1409 
   1410   setLength(newLength);
   1411 
   1412   // delayed delete in case srcChars == fArray when we started, and
   1413   // to keep oldArray alive for the above operations
   1414   if (bufferToDelete) {
   1415     uprv_free(bufferToDelete);
   1416   }
   1417 
   1418   return *this;
   1419 }
   1420 
   1421 /**
   1422  * Replaceable API
   1423  */
   1424 void
   1425 UnicodeString::handleReplaceBetween(int32_t start,
   1426                                     int32_t limit,
   1427                                     const UnicodeString& text) {
   1428     replaceBetween(start, limit, text);
   1429 }
   1430 
   1431 /**
   1432  * Replaceable API
   1433  */
   1434 void
   1435 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
   1436     if (limit <= start) {
   1437         return; // Nothing to do; avoid bogus malloc call
   1438     }
   1439     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
   1440     // Check to make sure text is not null.
   1441     if (text != NULL) {
   1442 	    extractBetween(start, limit, text, 0);
   1443 	    insert(dest, text, 0, limit - start);
   1444 	    uprv_free(text);
   1445     }
   1446 }
   1447 
   1448 /**
   1449  * Replaceable API
   1450  *
   1451  * NOTE: This is for the Replaceable class.  There is no rep.cpp,
   1452  * so we implement this function here.
   1453  */
   1454 UBool Replaceable::hasMetaData() const {
   1455     return TRUE;
   1456 }
   1457 
   1458 /**
   1459  * Replaceable API
   1460  */
   1461 UBool UnicodeString::hasMetaData() const {
   1462     return FALSE;
   1463 }
   1464 
   1465 UnicodeString&
   1466 UnicodeString::doReverse(int32_t start, int32_t length) {
   1467   if(length <= 1 || !cloneArrayIfNeeded()) {
   1468     return *this;
   1469   }
   1470 
   1471   // pin the indices to legal values
   1472   pinIndices(start, length);
   1473   if(length <= 1) {  // pinIndices() might have shrunk the length
   1474     return *this;
   1475   }
   1476 
   1477   UChar *left = getArrayStart() + start;
   1478   UChar *right = left + length - 1;  // -1 for inclusive boundary (length>=2)
   1479   UChar swap;
   1480   UBool hasSupplementary = FALSE;
   1481 
   1482   // Before the loop we know left<right because length>=2.
   1483   do {
   1484     hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
   1485     hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
   1486     *right-- = swap;
   1487   } while(left < right);
   1488   // Make sure to test the middle code unit of an odd-length string.
   1489   // Redundant if the length is even.
   1490   hasSupplementary |= (UBool)U16_IS_LEAD(*left);
   1491 
   1492   /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
   1493   if(hasSupplementary) {
   1494     UChar swap2;
   1495 
   1496     left = getArrayStart() + start;
   1497     right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
   1498     while(left < right) {
   1499       if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
   1500         *left++ = swap2;
   1501         *left++ = swap;
   1502       } else {
   1503         ++left;
   1504       }
   1505     }
   1506   }
   1507 
   1508   return *this;
   1509 }
   1510 
   1511 UBool
   1512 UnicodeString::padLeading(int32_t targetLength,
   1513                           UChar padChar)
   1514 {
   1515   int32_t oldLength = length();
   1516   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
   1517     return FALSE;
   1518   } else {
   1519     // move contents up by padding width
   1520     UChar *array = getArrayStart();
   1521     int32_t start = targetLength - oldLength;
   1522     us_arrayCopy(array, 0, array, start, oldLength);
   1523 
   1524     // fill in padding character
   1525     while(--start >= 0) {
   1526       array[start] = padChar;
   1527     }
   1528     setLength(targetLength);
   1529     return TRUE;
   1530   }
   1531 }
   1532 
   1533 UBool
   1534 UnicodeString::padTrailing(int32_t targetLength,
   1535                            UChar padChar)
   1536 {
   1537   int32_t oldLength = length();
   1538   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
   1539     return FALSE;
   1540   } else {
   1541     // fill in padding character
   1542     UChar *array = getArrayStart();
   1543     int32_t length = targetLength;
   1544     while(--length >= oldLength) {
   1545       array[length] = padChar;
   1546     }
   1547     setLength(targetLength);
   1548     return TRUE;
   1549   }
   1550 }
   1551 
   1552 //========================================
   1553 // Hashing
   1554 //========================================
   1555 int32_t
   1556 UnicodeString::doHashCode() const
   1557 {
   1558     /* Delegate hash computation to uhash.  This makes UnicodeString
   1559      * hashing consistent with UChar* hashing.  */
   1560     int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
   1561     if (hashCode == kInvalidHashCode) {
   1562         hashCode = kEmptyHashCode;
   1563     }
   1564     return hashCode;
   1565 }
   1566 
   1567 //========================================
   1568 // External Buffer
   1569 //========================================
   1570 
   1571 UChar *
   1572 UnicodeString::getBuffer(int32_t minCapacity) {
   1573   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
   1574     fFlags|=kOpenGetBuffer;
   1575     fShortLength=0;
   1576     return getArrayStart();
   1577   } else {
   1578     return 0;
   1579   }
   1580 }
   1581 
   1582 void
   1583 UnicodeString::releaseBuffer(int32_t newLength) {
   1584   if(fFlags&kOpenGetBuffer && newLength>=-1) {
   1585     // set the new fLength
   1586     int32_t capacity=getCapacity();
   1587     if(newLength==-1) {
   1588       // the new length is the string length, capped by fCapacity
   1589       const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
   1590       while(p<limit && *p!=0) {
   1591         ++p;
   1592       }
   1593       newLength=(int32_t)(p-array);
   1594     } else if(newLength>capacity) {
   1595       newLength=capacity;
   1596     }
   1597     setLength(newLength);
   1598     fFlags&=~kOpenGetBuffer;
   1599   }
   1600 }
   1601 
   1602 //========================================
   1603 // Miscellaneous
   1604 //========================================
   1605 UBool
   1606 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
   1607                                   int32_t growCapacity,
   1608                                   UBool doCopyArray,
   1609                                   int32_t **pBufferToDelete,
   1610                                   UBool forceClone) {
          // Makes sure that this string has its own array with room for at
          // least newCapacity UChars, allocating a new array when the current
          // one is read-only, shared, or too small.
          //
          //   newCapacity      required capacity; -1 means the current capacity
          //   growCapacity     capacity that is tried first when allocating;
          //                    a negative value means "same as newCapacity"
          //   doCopyArray      if TRUE, the old contents are copied into a newly
          //                    allocated array; otherwise the length becomes 0
          //   pBufferToDelete  if not 0 and the old ref-counted block's count
          //                    drops to 0, the block is handed to the caller
          //                    through this pointer instead of being freed here
          //   forceClone       if TRUE, allocate a new array unconditionally
          //
          // Returns TRUE on success. Returns FALSE if the string is not writable
          // (it is left alone) or if no memory could be allocated (it is then
          // set to bogus).
   1611   // default parameters need to be static, therefore
   1612   // the defaults are -1 to have convenience defaults
   1613   if(newCapacity == -1) {
   1614     newCapacity = getCapacity();
   1615   }
   1616 
   1617   // while a getBuffer(minCapacity) is "open",
   1618   // prevent any modifications of the string by returning FALSE here
   1619   // if the string is bogus, then only an assignment or similar can revive it
   1620   if(!isWritable()) {
   1621     return FALSE;
   1622   }
   1623 
   1624   /*
   1625    * We need to make a copy of the array if
   1626    * the buffer is read-only, or
   1627    * the buffer is refCounted (shared), and refCount>1, or
   1628    * the buffer is too small.
   1629    * Return FALSE if memory could not be allocated.
   1630    */
   1631   if(forceClone ||
   1632      fFlags & kBufferIsReadonly ||
   1633      (fFlags & kRefCounted && refCount() > 1) ||
   1634      newCapacity > getCapacity()
   1635   ) {
   1636     // check growCapacity for default value and use of the stack buffer
            // (a request that fits into the stack buffer is never grown beyond it)
   1637     if(growCapacity < 0) {
   1638       growCapacity = newCapacity;
   1639     } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
   1640       growCapacity = US_STACKBUF_SIZE;
   1641     }
   1642 
   1643     // save old values
   1644     UChar oldStackBuffer[US_STACKBUF_SIZE];
   1645     UChar *oldArray;
   1646     uint8_t flags = fFlags;
   1647 
   1648     if(flags&kUsingStackBuffer) {
   1649       U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutually exclusive */
   1650       if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
   1651         // copy the stack buffer contents because it will be overwritten with
   1652         // fUnion.fFields values
   1653         us_arrayCopy(fUnion.fStackBuffer, 0, oldStackBuffer, 0, fShortLength);
   1654         oldArray = oldStackBuffer;
   1655       } else {
   1656         oldArray = 0; // no need to copy from stack buffer to itself
   1657       }
   1658     } else {
   1659       oldArray = fUnion.fFields.fArray;
   1660       U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
   1661     }
   1662 
   1663     // allocate a new array
            // try the larger growCapacity first, then fall back to the bare newCapacity
   1664     if(allocate(growCapacity) ||
   1665        (newCapacity < growCapacity && allocate(newCapacity))
   1666     ) {
   1667       if(doCopyArray && oldArray != 0) {
   1668         // copy the contents
   1669         // do not copy more than what fits - it may be smaller than before
   1670         int32_t minLength = length();
   1671         newCapacity = getCapacity();
   1672         if(newCapacity < minLength) {
   1673           minLength = newCapacity;
   1674           setLength(minLength);
   1675         }
   1676         us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
   1677       } else {
   1678         fShortLength = 0;
   1679       }
   1680 
   1681       // release the old array
              // (decided by the flags saved before allocate() was called)
   1682       if(flags & kRefCounted) {
   1683         // the array is refCounted; decrement and release if 0
                // the reference count is the int32_t stored directly before the array
   1684         int32_t *pRefCount = ((int32_t *)oldArray - 1);
   1685         if(umtx_atomic_dec(pRefCount) == 0) {
   1686           if(pBufferToDelete == 0) {
   1687             uprv_free(pRefCount);
   1688           } else {
   1689             // the caller requested to delete it himself
   1690             *pBufferToDelete = pRefCount;
   1691           }
   1692         }
   1693       }
   1694     } else {
   1695       // not enough memory for growCapacity and not even for the smaller newCapacity
   1696       // reset the old values for setToBogus() to release the array
   1697       if(!(flags&kUsingStackBuffer)) {
   1698         fUnion.fFields.fArray = oldArray;
   1699       }
   1700       fFlags = flags;
   1701       setToBogus();
   1702       return FALSE;
   1703     }
   1704   }
   1705   return TRUE;
   1706 }
   1707 
   1708 // UnicodeStringAppendable ------------------------------------------------- ***
   1709 
   1710 UnicodeStringAppendable::~UnicodeStringAppendable() {}
   1711 
   1712 UBool
   1713 UnicodeStringAppendable::appendCodeUnit(UChar c) {
   1714   return str.doReplace(str.length(), 0, &c, 0, 1).isWritable();
   1715 }
   1716 
   1717 UBool
   1718 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
   1719   UChar buffer[U16_MAX_LENGTH];
   1720   int32_t cLength = 0;
   1721   UBool isError = FALSE;
   1722   U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
   1723   return !isError && str.doReplace(str.length(), 0, buffer, 0, cLength).isWritable();
   1724 }
   1725 
   1726 UBool
   1727 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
   1728   return str.doReplace(str.length(), 0, s, 0, length).isWritable();
   1729 }
   1730 
   1731 UBool
   1732 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
   1733   return str.cloneArrayIfNeeded(str.length() + appendCapacity);
   1734 }
   1735 
   1736 UChar *
   1737 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
   1738                                          int32_t desiredCapacityHint,
   1739                                          UChar *scratch, int32_t scratchCapacity,
   1740                                          int32_t *resultCapacity) {
   1741   if(minCapacity < 1 || scratchCapacity < minCapacity) {
   1742     *resultCapacity = 0;
   1743     return NULL;
   1744   }
   1745   int32_t oldLength = str.length();
   1746   if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
   1747     *resultCapacity = str.getCapacity() - oldLength;
   1748     return str.getArrayStart() + oldLength;
   1749   }
   1750   *resultCapacity = scratchCapacity;
   1751   return scratch;
   1752 }
   1753 
   1754 U_NAMESPACE_END
   1755 
   1756 U_NAMESPACE_USE
   1757 
   1758 U_CAPI int32_t U_EXPORT2
   1759 uhash_hashUnicodeString(const UElement key) {
   1760     const UnicodeString *str = (const UnicodeString*) key.pointer;
   1761     return (str == NULL) ? 0 : str->hashCode();
   1762 }
   1763 
   1764 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
   1765 // does not depend on hashtable code.
   1766 U_CAPI UBool U_EXPORT2
   1767 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
   1768     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
   1769     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
   1770     if (str1 == str2) {
   1771         return TRUE;
   1772     }
   1773     if (str1 == NULL || str2 == NULL) {
   1774         return FALSE;
   1775     }
   1776     return *str1 == *str2;
   1777 }
   1778 
   1779 #ifdef U_STATIC_IMPLEMENTATION
   1780 /*
   1781 This should never be called. It is defined here to make sure that the
   1782 virtual vector deleting destructor is defined within unistr.cpp.
   1783 The vector deleting destructor is already a part of UObject,
   1784 but defining it here makes sure that it is included with this object file.
   1785 This makes sure that static library dependencies are kept to a minimum.
   1786 */
   1787 static void uprv_UnicodeStringDummy(void) {
   1788     delete [] (new UnicodeString[2]);
   1789 }
   1790 #endif
   1791