Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 * Copyright (C) 1999-2012, International Business Machines Corporation and
      4 * others. All Rights Reserved.
      5 ******************************************************************************
      6 *
      7 * File unistr.cpp
      8 *
      9 * Modification History:
     10 *
     11 *   Date        Name        Description
     12 *   09/25/98    stephen     Creation.
     13 *   04/20/99    stephen     Overhauled per 4/16 code review.
     14 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
     15 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
     16 *                           Replaceable.
     17 *   06/25/01    grhoten     Removed the dependency on iostream
     18 ******************************************************************************
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 #include "unicode/appendable.h"
     23 #include "unicode/putil.h"
     24 #include "cstring.h"
     25 #include "cmemory.h"
     26 #include "unicode/ustring.h"
     27 #include "unicode/unistr.h"
     28 #include "unicode/utf.h"
     29 #include "unicode/utf16.h"
     30 #include "uelement.h"
     31 #include "ustr_imp.h"
     32 #include "umutex.h"
     33 #include "uassert.h"
     34 
     35 #if 0
     36 
     37 #include <iostream>
     38 using namespace std;
     39 
     40 //DEBUGGING
     41 void
     42 print(const UnicodeString& s,
     43       const char *name)
     44 {
     45   UChar c;
     46   cout << name << ":|";
     47   for(int i = 0; i < s.length(); ++i) {
     48     c = s[i];
     49     if(c>= 0x007E || c < 0x0020)
     50       cout << "[0x" << hex << s[i] << "]";
     51     else
     52       cout << (char) s[i];
     53   }
     54   cout << '|' << endl;
     55 }
     56 
     57 void
     58 print(const UChar *s,
     59       int32_t len,
     60       const char *name)
     61 {
     62   UChar c;
     63   cout << name << ":|";
     64   for(int i = 0; i < len; ++i) {
     65     c = s[i];
     66     if(c>= 0x007E || c < 0x0020)
     67       cout << "[0x" << hex << s[i] << "]";
     68     else
     69       cout << (char) s[i];
     70   }
     71   cout << '|' << endl;
     72 }
     73 // END DEBUGGING
     74 #endif
     75 
     76 // Local function definitions for now
     77 
     78 // need to copy areas that may overlap
     79 static
     80 inline void
     81 us_arrayCopy(const UChar *src, int32_t srcStart,
     82          UChar *dst, int32_t dstStart, int32_t count)
     83 {
     84   if(count>0) {
     85     uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
     86   }
     87 }
     88 
     89 // u_unescapeAt() callback to get a UChar from a UnicodeString
     90 U_CDECL_BEGIN
     91 static UChar U_CALLCONV
     92 UnicodeString_charAt(int32_t offset, void *context) {
     93     return ((icu::UnicodeString*) context)->charAt(offset);
     94 }
     95 U_CDECL_END
     96 
     97 U_NAMESPACE_BEGIN
     98 
     99 /* The Replaceable virtual destructor can't be defined in the header
    100    due to how AIX works with multiple definitions of virtual functions.
    101 */
    102 Replaceable::~Replaceable() {}
    103 
    104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
    105 
    106 UnicodeString U_EXPORT2
    107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
    108     return
    109         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
    110             append(s1).
    111                 append(s2);
    112 }
    113 
    114 //========================================
    115 // Reference Counting functions, put at top of file so that optimizing compilers
    116 //                               have a chance to automatically inline.
    117 //========================================
    118 
    119 void
    120 UnicodeString::addRef()
    121 {  umtx_atomic_inc((int32_t *)fUnion.fFields.fArray - 1);}
    122 
    123 int32_t
    124 UnicodeString::removeRef()
    125 { return umtx_atomic_dec((int32_t *)fUnion.fFields.fArray - 1);}
    126 
    127 int32_t
    128 UnicodeString::refCount() const
    129 {
    130     umtx_lock(NULL);
    131     // Note: without the lock to force a memory barrier, we might see a very
    132     //       stale value on some multi-processor systems.
    133     int32_t  count = *((int32_t *)fUnion.fFields.fArray - 1);
    134     umtx_unlock(NULL);
    135     return count;
    136  }
    137 
    138 void
    139 UnicodeString::releaseArray() {
    140   if((fFlags & kRefCounted) && removeRef() == 0) {
    141     uprv_free((int32_t *)fUnion.fFields.fArray - 1);
    142   }
    143 }
    144 
    145 
    146 
    147 //========================================
    148 // Constructors
    149 //========================================
    150 
    151 // The default constructor is inline in unistr.h.
    152 
    153 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count)
    154   : fShortLength(0),
    155     fFlags(0)
    156 {
    157   if(count <= 0 || (uint32_t)c > 0x10ffff) {
    158     // just allocate and do not do anything else
    159     allocate(capacity);
    160   } else {
    161     // count > 0, allocate and fill the new string with count c's
    162     int32_t unitCount = U16_LENGTH(c), length = count * unitCount;
    163     if(capacity < length) {
    164       capacity = length;
    165     }
    166     if(allocate(capacity)) {
    167       UChar *array = getArrayStart();
    168       int32_t i = 0;
    169 
    170       // fill the new string with c
    171       if(unitCount == 1) {
    172         // fill with length UChars
    173         while(i < length) {
    174           array[i++] = (UChar)c;
    175         }
    176       } else {
    177         // get the code units for c
    178         UChar units[U16_MAX_LENGTH];
    179         U16_APPEND_UNSAFE(units, i, c);
    180 
    181         // now it must be i==unitCount
    182         i = 0;
    183 
    184         // for Unicode, unitCount can only be 1, 2, 3, or 4
    185         // 1 is handled above
    186         while(i < length) {
    187           int32_t unitIdx = 0;
    188           while(unitIdx < unitCount) {
    189             array[i++]=units[unitIdx++];
    190           }
    191         }
    192       }
    193     }
    194     setLength(length);
    195   }
    196 }
    197 
    198 UnicodeString::UnicodeString(UChar ch)
    199   : fShortLength(1),
    200     fFlags(kShortString)
    201 {
    202   fUnion.fStackBuffer[0] = ch;
    203 }
    204 
    205 UnicodeString::UnicodeString(UChar32 ch)
    206   : fShortLength(0),
    207     fFlags(kShortString)
    208 {
    209   int32_t i = 0;
    210   UBool isError = FALSE;
    211   U16_APPEND(fUnion.fStackBuffer, i, US_STACKBUF_SIZE, ch, isError);
    212   // We test isError so that the compiler does not complain that we don't.
    213   // If isError then i==0 which is what we want anyway.
    214   if(!isError) {
    215     fShortLength = (int8_t)i;
    216   }
    217 }
    218 
    219 UnicodeString::UnicodeString(const UChar *text)
    220   : fShortLength(0),
    221     fFlags(kShortString)
    222 {
    223   doReplace(0, 0, text, 0, -1);
    224 }
    225 
    226 UnicodeString::UnicodeString(const UChar *text,
    227                              int32_t textLength)
    228   : fShortLength(0),
    229     fFlags(kShortString)
    230 {
    231   doReplace(0, 0, text, 0, textLength);
    232 }
    233 
    234 UnicodeString::UnicodeString(UBool isTerminated,
    235                              const UChar *text,
    236                              int32_t textLength)
    237   : fShortLength(0),
    238     fFlags(kReadonlyAlias)
    239 {
    240   if(text == NULL) {
    241     // treat as an empty string, do not alias
    242     setToEmpty();
    243   } else if(textLength < -1 ||
    244             (textLength == -1 && !isTerminated) ||
    245             (textLength >= 0 && isTerminated && text[textLength] != 0)
    246   ) {
    247     setToBogus();
    248   } else {
    249     if(textLength == -1) {
    250       // text is terminated, or else it would have failed the above test
    251       textLength = u_strlen(text);
    252     }
    253     setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
    254   }
    255 }
    256 
    257 UnicodeString::UnicodeString(UChar *buff,
    258                              int32_t buffLength,
    259                              int32_t buffCapacity)
    260   : fShortLength(0),
    261     fFlags(kWritableAlias)
    262 {
    263   if(buff == NULL) {
    264     // treat as an empty string, do not alias
    265     setToEmpty();
    266   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
    267     setToBogus();
    268   } else {
    269     if(buffLength == -1) {
    270       // fLength = u_strlen(buff); but do not look beyond buffCapacity
    271       const UChar *p = buff, *limit = buff + buffCapacity;
    272       while(p != limit && *p != 0) {
    273         ++p;
    274       }
    275       buffLength = (int32_t)(p - buff);
    276     }
    277     setArray(buff, buffLength, buffCapacity);
    278   }
    279 }
    280 
    281 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant)
    282   : fShortLength(0),
    283     fFlags(kShortString)
    284 {
    285   if(src==NULL) {
    286     // treat as an empty string
    287   } else {
    288     if(length<0) {
    289       length=(int32_t)uprv_strlen(src);
    290     }
    291     if(cloneArrayIfNeeded(length, length, FALSE)) {
    292       u_charsToUChars(src, getArrayStart(), length);
    293       setLength(length);
    294     } else {
    295       setToBogus();
    296     }
    297   }
    298 }
    299 
    300 #if U_CHARSET_IS_UTF8
    301 
    302 UnicodeString::UnicodeString(const char *codepageData)
    303   : fShortLength(0),
    304     fFlags(kShortString) {
    305   if(codepageData != 0) {
    306     setToUTF8(codepageData);
    307   }
    308 }
    309 
    310 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength)
    311   : fShortLength(0),
    312     fFlags(kShortString) {
    313   // if there's nothing to convert, do nothing
    314   if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
    315     return;
    316   }
    317   if(dataLength == -1) {
    318     dataLength = (int32_t)uprv_strlen(codepageData);
    319   }
    320   setToUTF8(StringPiece(codepageData, dataLength));
    321 }
    322 
    323 // else see unistr_cnv.cpp
    324 #endif
    325 
    326 UnicodeString::UnicodeString(const UnicodeString& that)
    327   : Replaceable(),
    328     fShortLength(0),
    329     fFlags(kShortString)
    330 {
    331   copyFrom(that);
    332 }
    333 
    334 UnicodeString::UnicodeString(const UnicodeString& that,
    335                              int32_t srcStart)
    336   : Replaceable(),
    337     fShortLength(0),
    338     fFlags(kShortString)
    339 {
    340   setTo(that, srcStart);
    341 }
    342 
    343 UnicodeString::UnicodeString(const UnicodeString& that,
    344                              int32_t srcStart,
    345                              int32_t srcLength)
    346   : Replaceable(),
    347     fShortLength(0),
    348     fFlags(kShortString)
    349 {
    350   setTo(that, srcStart, srcLength);
    351 }
    352 
    353 // Replaceable base class clone() default implementation, does not clone
    354 Replaceable *
    355 Replaceable::clone() const {
    356   return NULL;
    357 }
    358 
    359 // UnicodeString overrides clone() with a real implementation
    360 Replaceable *
    361 UnicodeString::clone() const {
    362   return new UnicodeString(*this);
    363 }
    364 
    365 //========================================
    366 // array allocation
    367 //========================================
    368 
    369 UBool
    370 UnicodeString::allocate(int32_t capacity) {
    371   if(capacity <= US_STACKBUF_SIZE) {
    372     fFlags = kShortString;
    373   } else {
    374     // count bytes for the refCounter and the string capacity, and
    375     // round up to a multiple of 16; then divide by 4 and allocate int32_t's
    376     // to be safely aligned for the refCount
    377     // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
    378     int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
    379     int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
    380     if(array != 0) {
    381       // set initial refCount and point behind the refCount
    382       *array++ = 1;
    383 
    384       // have fArray point to the first UChar
    385       fUnion.fFields.fArray = (UChar *)array;
    386       fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
    387       fFlags = kLongString;
    388     } else {
    389       fShortLength = 0;
    390       fUnion.fFields.fArray = 0;
    391       fUnion.fFields.fCapacity = 0;
    392       fFlags = kIsBogus;
    393       return FALSE;
    394     }
    395   }
    396   return TRUE;
    397 }
    398 
    399 //========================================
    400 // Destructor
    401 //========================================
    402 UnicodeString::~UnicodeString()
    403 {
    404   releaseArray();
    405 }
    406 
    407 //========================================
    408 // Factory methods
    409 //========================================
    410 
    411 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
    412   UnicodeString result;
    413   result.setToUTF8(utf8);
    414   return result;
    415 }
    416 
    417 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
    418   UnicodeString result;
    419   int32_t capacity;
    420   // Most UTF-32 strings will be BMP-only and result in a same-length
    421   // UTF-16 string. We overestimate the capacity just slightly,
    422   // just in case there are a few supplementary characters.
    423   if(length <= US_STACKBUF_SIZE) {
    424     capacity = US_STACKBUF_SIZE;
    425   } else {
    426     capacity = length + (length >> 4) + 4;
    427   }
    428   do {
    429     UChar *utf16 = result.getBuffer(capacity);
    430     int32_t length16;
    431     UErrorCode errorCode = U_ZERO_ERROR;
    432     u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
    433         utf32, length,
    434         0xfffd,  // Substitution character.
    435         NULL,    // Don't care about number of substitutions.
    436         &errorCode);
    437     result.releaseBuffer(length16);
    438     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
    439       capacity = length16 + 1;  // +1 for the terminating NUL.
    440       continue;
    441     } else if(U_FAILURE(errorCode)) {
    442       result.setToBogus();
    443     }
    444     break;
    445   } while(TRUE);
    446   return result;
    447 }
    448 
    449 //========================================
    450 // Assignment
    451 //========================================
    452 
    453 UnicodeString &
    454 UnicodeString::operator=(const UnicodeString &src) {
    455   return copyFrom(src);
    456 }
    457 
    458 UnicodeString &
    459 UnicodeString::fastCopyFrom(const UnicodeString &src) {
    460   return copyFrom(src, TRUE);
    461 }
    462 
    463 UnicodeString &
    464 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
    465   // if assigning to ourselves, do nothing
    466   if(this == 0 || this == &src) {
    467     return *this;
    468   }
    469 
    470   // is the right side bogus?
    471   if(&src == 0 || src.isBogus()) {
    472     setToBogus();
    473     return *this;
    474   }
    475 
    476   // delete the current contents
    477   releaseArray();
    478 
    479   if(src.isEmpty()) {
    480     // empty string - use the stack buffer
    481     setToEmpty();
    482     return *this;
    483   }
    484 
    485   // we always copy the length
    486   int32_t srcLength = src.length();
    487   setLength(srcLength);
    488 
    489   // fLength>0 and not an "open" src.getBuffer(minCapacity)
    490   switch(src.fFlags) {
    491   case kShortString:
    492     // short string using the stack buffer, do the same
    493     fFlags = kShortString;
    494     uprv_memcpy(fUnion.fStackBuffer, src.fUnion.fStackBuffer, srcLength * U_SIZEOF_UCHAR);
    495     break;
    496   case kLongString:
    497     // src uses a refCounted string buffer, use that buffer with refCount
    498     // src is const, use a cast - we don't really change it
    499     ((UnicodeString &)src).addRef();
    500     // copy all fields, share the reference-counted buffer
    501     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
    502     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
    503     fFlags = src.fFlags;
    504     break;
    505   case kReadonlyAlias:
    506     if(fastCopy) {
    507       // src is a readonly alias, do the same
    508       // -> maintain the readonly alias as such
    509       fUnion.fFields.fArray = src.fUnion.fFields.fArray;
    510       fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
    511       fFlags = src.fFlags;
    512       break;
    513     }
    514     // else if(!fastCopy) fall through to case kWritableAlias
    515     // -> allocate a new buffer and copy the contents
    516   case kWritableAlias:
    517     // src is a writable alias; we make a copy of that instead
    518     if(allocate(srcLength)) {
    519       uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR);
    520       break;
    521     }
    522     // if there is not enough memory, then fall through to setting to bogus
    523   default:
    524     // if src is bogus, set ourselves to bogus
    525     // do not call setToBogus() here because fArray and fFlags are not consistent here
    526     fShortLength = 0;
    527     fUnion.fFields.fArray = 0;
    528     fUnion.fFields.fCapacity = 0;
    529     fFlags = kIsBogus;
    530     break;
    531   }
    532 
    533   return *this;
    534 }
    535 
    536 //========================================
    537 // Miscellaneous operations
    538 //========================================
    539 
    540 UnicodeString UnicodeString::unescape() const {
    541     UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
    542     const UChar *array = getBuffer();
    543     int32_t len = length();
    544     int32_t prev = 0;
    545     for (int32_t i=0;;) {
    546         if (i == len) {
    547             result.append(array, prev, len - prev);
    548             break;
    549         }
    550         if (array[i++] == 0x5C /*'\\'*/) {
    551             result.append(array, prev, (i - 1) - prev);
    552             UChar32 c = unescapeAt(i); // advances i
    553             if (c < 0) {
    554                 result.remove(); // return empty string
    555                 break; // invalid escape sequence
    556             }
    557             result.append(c);
    558             prev = i;
    559         }
    560     }
    561     return result;
    562 }
    563 
    564 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
    565     return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
    566 }
    567 
    568 //========================================
    569 // Read-only implementation
    570 //========================================
    571 UBool
    572 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
    573   // Requires: this & text not bogus and have same lengths.
    574   // Byte-wise comparison works for equality regardless of endianness.
    575   return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
    576 }
    577 
    578 int8_t
    579 UnicodeString::doCompare( int32_t start,
    580               int32_t length,
    581               const UChar *srcChars,
    582               int32_t srcStart,
    583               int32_t srcLength) const
    584 {
    585   // compare illegal string values
    586   if(isBogus()) {
    587     return -1;
    588   }
    589 
    590   // pin indices to legal values
    591   pinIndices(start, length);
    592 
    593   if(srcChars == NULL) {
    594     // treat const UChar *srcChars==NULL as an empty string
    595     return length == 0 ? 0 : 1;
    596   }
    597 
    598   // get the correct pointer
    599   const UChar *chars = getArrayStart();
    600 
    601   chars += start;
    602   srcChars += srcStart;
    603 
    604   int32_t minLength;
    605   int8_t lengthResult;
    606 
    607   // get the srcLength if necessary
    608   if(srcLength < 0) {
    609     srcLength = u_strlen(srcChars + srcStart);
    610   }
    611 
    612   // are we comparing different lengths?
    613   if(length != srcLength) {
    614     if(length < srcLength) {
    615       minLength = length;
    616       lengthResult = -1;
    617     } else {
    618       minLength = srcLength;
    619       lengthResult = 1;
    620     }
    621   } else {
    622     minLength = length;
    623     lengthResult = 0;
    624   }
    625 
    626   /*
    627    * note that uprv_memcmp() returns an int but we return an int8_t;
    628    * we need to take care not to truncate the result -
    629    * one way to do this is to right-shift the value to
    630    * move the sign bit into the lower 8 bits and making sure that this
    631    * does not become 0 itself
    632    */
    633 
    634   if(minLength > 0 && chars != srcChars) {
    635     int32_t result;
    636 
    637 #   if U_IS_BIG_ENDIAN
    638       // big-endian: byte comparison works
    639       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
    640       if(result != 0) {
    641         return (int8_t)(result >> 15 | 1);
    642       }
    643 #   else
    644       // little-endian: compare UChar units
    645       do {
    646         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
    647         if(result != 0) {
    648           return (int8_t)(result >> 15 | 1);
    649         }
    650       } while(--minLength > 0);
    651 #   endif
    652   }
    653   return lengthResult;
    654 }
    655 
    656 /* String compare in code point order - doCompare() compares in code unit order. */
    657 int8_t
    658 UnicodeString::doCompareCodePointOrder(int32_t start,
    659                                        int32_t length,
    660                                        const UChar *srcChars,
    661                                        int32_t srcStart,
    662                                        int32_t srcLength) const
    663 {
    664   // compare illegal string values
    665   // treat const UChar *srcChars==NULL as an empty string
    666   if(isBogus()) {
    667     return -1;
    668   }
    669 
    670   // pin indices to legal values
    671   pinIndices(start, length);
    672 
    673   if(srcChars == NULL) {
    674     srcStart = srcLength = 0;
    675   }
    676 
    677   int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
    678   /* translate the 32-bit result into an 8-bit one */
    679   if(diff!=0) {
    680     return (int8_t)(diff >> 15 | 1);
    681   } else {
    682     return 0;
    683   }
    684 }
    685 
    686 int32_t
    687 UnicodeString::getLength() const {
    688     return length();
    689 }
    690 
    691 UChar
    692 UnicodeString::getCharAt(int32_t offset) const {
    693   return charAt(offset);
    694 }
    695 
    696 UChar32
    697 UnicodeString::getChar32At(int32_t offset) const {
    698   return char32At(offset);
    699 }
    700 
    701 UChar32
    702 UnicodeString::char32At(int32_t offset) const
    703 {
    704   int32_t len = length();
    705   if((uint32_t)offset < (uint32_t)len) {
    706     const UChar *array = getArrayStart();
    707     UChar32 c;
    708     U16_GET(array, 0, offset, len, c);
    709     return c;
    710   } else {
    711     return kInvalidUChar;
    712   }
    713 }
    714 
    715 int32_t
    716 UnicodeString::getChar32Start(int32_t offset) const {
    717   if((uint32_t)offset < (uint32_t)length()) {
    718     const UChar *array = getArrayStart();
    719     U16_SET_CP_START(array, 0, offset);
    720     return offset;
    721   } else {
    722     return 0;
    723   }
    724 }
    725 
    726 int32_t
    727 UnicodeString::getChar32Limit(int32_t offset) const {
    728   int32_t len = length();
    729   if((uint32_t)offset < (uint32_t)len) {
    730     const UChar *array = getArrayStart();
    731     U16_SET_CP_LIMIT(array, 0, offset, len);
    732     return offset;
    733   } else {
    734     return len;
    735   }
    736 }
    737 
    738 int32_t
    739 UnicodeString::countChar32(int32_t start, int32_t length) const {
    740   pinIndices(start, length);
    741   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
    742   return u_countChar32(getArrayStart()+start, length);
    743 }
    744 
    745 UBool
    746 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
    747   pinIndices(start, length);
    748   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
    749   return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
    750 }
    751 
    752 int32_t
    753 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
    754   // pin index
    755   int32_t len = length();
    756   if(index<0) {
    757     index=0;
    758   } else if(index>len) {
    759     index=len;
    760   }
    761 
    762   const UChar *array = getArrayStart();
    763   if(delta>0) {
    764     U16_FWD_N(array, index, len, delta);
    765   } else {
    766     U16_BACK_N(array, 0, index, -delta);
    767   }
    768 
    769   return index;
    770 }
    771 
    772 void
    773 UnicodeString::doExtract(int32_t start,
    774              int32_t length,
    775              UChar *dst,
    776              int32_t dstStart) const
    777 {
    778   // pin indices to legal values
    779   pinIndices(start, length);
    780 
    781   // do not copy anything if we alias dst itself
    782   const UChar *array = getArrayStart();
    783   if(array + start != dst + dstStart) {
    784     us_arrayCopy(array, start, dst, dstStart, length);
    785   }
    786 }
    787 
    788 int32_t
    789 UnicodeString::extract(UChar *dest, int32_t destCapacity,
    790                        UErrorCode &errorCode) const {
    791   int32_t len = length();
    792   if(U_SUCCESS(errorCode)) {
    793     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
    794       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    795     } else {
    796       const UChar *array = getArrayStart();
    797       if(len>0 && len<=destCapacity && array!=dest) {
    798         uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR);
    799       }
    800       return u_terminateUChars(dest, destCapacity, len, &errorCode);
    801     }
    802   }
    803 
    804   return len;
    805 }
    806 
    807 int32_t
    808 UnicodeString::extract(int32_t start,
    809                        int32_t length,
    810                        char *target,
    811                        int32_t targetCapacity,
    812                        enum EInvariant) const
    813 {
    814   // if the arguments are illegal, then do nothing
    815   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
    816     return 0;
    817   }
    818 
    819   // pin the indices to legal values
    820   pinIndices(start, length);
    821 
    822   if(length <= targetCapacity) {
    823     u_UCharsToChars(getArrayStart() + start, target, length);
    824   }
    825   UErrorCode status = U_ZERO_ERROR;
    826   return u_terminateChars(target, targetCapacity, length, &status);
    827 }
    828 
    829 UnicodeString
    830 UnicodeString::tempSubString(int32_t start, int32_t len) const {
    831   pinIndices(start, len);
    832   const UChar *array = getBuffer();  // not getArrayStart() to check kIsBogus & kOpenGetBuffer
    833   if(array==NULL) {
    834     array=fUnion.fStackBuffer;  // anything not NULL because that would make an empty string
    835     len=-2;  // bogus result string
    836   }
    837   return UnicodeString(FALSE, array + start, len);
    838 }
    839 
    840 int32_t
    841 UnicodeString::toUTF8(int32_t start, int32_t len,
    842                       char *target, int32_t capacity) const {
    843   pinIndices(start, len);
    844   int32_t length8;
    845   UErrorCode errorCode = U_ZERO_ERROR;
    846   u_strToUTF8WithSub(target, capacity, &length8,
    847                      getBuffer() + start, len,
    848                      0xFFFD,  // Standard substitution character.
    849                      NULL,    // Don't care about number of substitutions.
    850                      &errorCode);
    851   return length8;
    852 }
    853 
    854 #if U_CHARSET_IS_UTF8
    855 
    856 int32_t
    857 UnicodeString::extract(int32_t start, int32_t len,
    858                        char *target, uint32_t dstSize) const {
    859   // if the arguments are illegal, then do nothing
    860   if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
    861     return 0;
    862   }
    863   return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
    864 }
    865 
    866 // else see unistr_cnv.cpp
    867 #endif
    868 
    869 void
    870 UnicodeString::extractBetween(int32_t start,
    871                   int32_t limit,
    872                   UnicodeString& target) const {
    873   pinIndex(start);
    874   pinIndex(limit);
    875   doExtract(start, limit - start, target);
    876 }
    877 
    878 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
    879 // as many bytes as the source has UChars.
    880 // The "worst cases" are writing systems like Indic, Thai and CJK with
    881 // 3:1 bytes:UChars.
    882 void
    883 UnicodeString::toUTF8(ByteSink &sink) const {
    884   int32_t length16 = length();
    885   if(length16 != 0) {
    886     char stackBuffer[1024];
    887     int32_t capacity = (int32_t)sizeof(stackBuffer);
    888     UBool utf8IsOwned = FALSE;
    889     char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
    890                                       3*length16,
    891                                       stackBuffer, capacity,
    892                                       &capacity);
    893     int32_t length8 = 0;
    894     UErrorCode errorCode = U_ZERO_ERROR;
    895     u_strToUTF8WithSub(utf8, capacity, &length8,
    896                        getBuffer(), length16,
    897                        0xFFFD,  // Standard substitution character.
    898                        NULL,    // Don't care about number of substitutions.
    899                        &errorCode);
    900     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
    901       utf8 = (char *)uprv_malloc(length8);
    902       if(utf8 != NULL) {
    903         utf8IsOwned = TRUE;
    904         errorCode = U_ZERO_ERROR;
    905         u_strToUTF8WithSub(utf8, length8, &length8,
    906                            getBuffer(), length16,
    907                            0xFFFD,  // Standard substitution character.
    908                            NULL,    // Don't care about number of substitutions.
    909                            &errorCode);
    910       } else {
    911         errorCode = U_MEMORY_ALLOCATION_ERROR;
    912       }
    913     }
    914     if(U_SUCCESS(errorCode)) {
    915       sink.Append(utf8, length8);
    916       sink.Flush();
    917     }
    918     if(utf8IsOwned) {
    919       uprv_free(utf8);
    920     }
    921   }
    922 }
    923 
    924 int32_t
    925 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
    926   int32_t length32=0;
    927   if(U_SUCCESS(errorCode)) {
    928     // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
    929     u_strToUTF32WithSub(utf32, capacity, &length32,
    930         getBuffer(), length(),
    931         0xfffd,  // Substitution character.
    932         NULL,    // Don't care about number of substitutions.
    933         &errorCode);
    934   }
    935   return length32;
    936 }
    937 
    938 int32_t
    939 UnicodeString::indexOf(const UChar *srcChars,
    940                int32_t srcStart,
    941                int32_t srcLength,
    942                int32_t start,
    943                int32_t length) const
    944 {
    945   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
    946     return -1;
    947   }
    948 
    949   // UnicodeString does not find empty substrings
    950   if(srcLength < 0 && srcChars[srcStart] == 0) {
    951     return -1;
    952   }
    953 
    954   // get the indices within bounds
    955   pinIndices(start, length);
    956 
    957   // find the first occurrence of the substring
    958   const UChar *array = getArrayStart();
    959   const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
    960   if(match == NULL) {
    961     return -1;
    962   } else {
    963     return (int32_t)(match - array);
    964   }
    965 }
    966 
    967 int32_t
    968 UnicodeString::doIndexOf(UChar c,
    969              int32_t start,
    970              int32_t length) const
    971 {
    972   // pin indices
    973   pinIndices(start, length);
    974 
    975   // find the first occurrence of c
    976   const UChar *array = getArrayStart();
    977   const UChar *match = u_memchr(array + start, c, length);
    978   if(match == NULL) {
    979     return -1;
    980   } else {
    981     return (int32_t)(match - array);
    982   }
    983 }
    984 
    985 int32_t
    986 UnicodeString::doIndexOf(UChar32 c,
    987                          int32_t start,
    988                          int32_t length) const {
    989   // pin indices
    990   pinIndices(start, length);
    991 
    992   // find the first occurrence of c
    993   const UChar *array = getArrayStart();
    994   const UChar *match = u_memchr32(array + start, c, length);
    995   if(match == NULL) {
    996     return -1;
    997   } else {
    998     return (int32_t)(match - array);
    999   }
   1000 }
   1001 
   1002 int32_t
   1003 UnicodeString::lastIndexOf(const UChar *srcChars,
   1004                int32_t srcStart,
   1005                int32_t srcLength,
   1006                int32_t start,
   1007                int32_t length) const
   1008 {
   1009   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
   1010     return -1;
   1011   }
   1012 
   1013   // UnicodeString does not find empty substrings
   1014   if(srcLength < 0 && srcChars[srcStart] == 0) {
   1015     return -1;
   1016   }
   1017 
   1018   // get the indices within bounds
   1019   pinIndices(start, length);
   1020 
   1021   // find the last occurrence of the substring
   1022   const UChar *array = getArrayStart();
   1023   const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
   1024   if(match == NULL) {
   1025     return -1;
   1026   } else {
   1027     return (int32_t)(match - array);
   1028   }
   1029 }
   1030 
   1031 int32_t
   1032 UnicodeString::doLastIndexOf(UChar c,
   1033                  int32_t start,
   1034                  int32_t length) const
   1035 {
   1036   if(isBogus()) {
   1037     return -1;
   1038   }
   1039 
   1040   // pin indices
   1041   pinIndices(start, length);
   1042 
   1043   // find the last occurrence of c
   1044   const UChar *array = getArrayStart();
   1045   const UChar *match = u_memrchr(array + start, c, length);
   1046   if(match == NULL) {
   1047     return -1;
   1048   } else {
   1049     return (int32_t)(match - array);
   1050   }
   1051 }
   1052 
   1053 int32_t
   1054 UnicodeString::doLastIndexOf(UChar32 c,
   1055                              int32_t start,
   1056                              int32_t length) const {
   1057   // pin indices
   1058   pinIndices(start, length);
   1059 
   1060   // find the last occurrence of c
   1061   const UChar *array = getArrayStart();
   1062   const UChar *match = u_memrchr32(array + start, c, length);
   1063   if(match == NULL) {
   1064     return -1;
   1065   } else {
   1066     return (int32_t)(match - array);
   1067   }
   1068 }
   1069 
   1070 //========================================
   1071 // Write implementation
   1072 //========================================
   1073 
   1074 UnicodeString&
   1075 UnicodeString::findAndReplace(int32_t start,
   1076                   int32_t length,
   1077                   const UnicodeString& oldText,
   1078                   int32_t oldStart,
   1079                   int32_t oldLength,
   1080                   const UnicodeString& newText,
   1081                   int32_t newStart,
   1082                   int32_t newLength)
   1083 {
   1084   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
   1085     return *this;
   1086   }
   1087 
   1088   pinIndices(start, length);
   1089   oldText.pinIndices(oldStart, oldLength);
   1090   newText.pinIndices(newStart, newLength);
   1091 
   1092   if(oldLength == 0) {
   1093     return *this;
   1094   }
   1095 
   1096   while(length > 0 && length >= oldLength) {
   1097     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
   1098     if(pos < 0) {
   1099       // no more oldText's here: done
   1100       break;
   1101     } else {
   1102       // we found oldText, replace it by newText and go beyond it
   1103       replace(pos, oldLength, newText, newStart, newLength);
   1104       length -= pos + oldLength - start;
   1105       start = pos + newLength;
   1106     }
   1107   }
   1108 
   1109   return *this;
   1110 }
   1111 
   1112 
   1113 void
   1114 UnicodeString::setToBogus()
   1115 {
   1116   releaseArray();
   1117 
   1118   fShortLength = 0;
   1119   fUnion.fFields.fArray = 0;
   1120   fUnion.fFields.fCapacity = 0;
   1121   fFlags = kIsBogus;
   1122 }
   1123 
   1124 // turn a bogus string into an empty one
   1125 void
   1126 UnicodeString::unBogus() {
   1127   if(fFlags & kIsBogus) {
   1128     setToEmpty();
   1129   }
   1130 }
   1131 
   1132 // setTo() analogous to the readonly-aliasing constructor with the same signature
   1133 UnicodeString &
   1134 UnicodeString::setTo(UBool isTerminated,
   1135                      const UChar *text,
   1136                      int32_t textLength)
   1137 {
   1138   if(fFlags & kOpenGetBuffer) {
   1139     // do not modify a string that has an "open" getBuffer(minCapacity)
   1140     return *this;
   1141   }
   1142 
   1143   if(text == NULL) {
   1144     // treat as an empty string, do not alias
   1145     releaseArray();
   1146     setToEmpty();
   1147     return *this;
   1148   }
   1149 
   1150   if( textLength < -1 ||
   1151       (textLength == -1 && !isTerminated) ||
   1152       (textLength >= 0 && isTerminated && text[textLength] != 0)
   1153   ) {
   1154     setToBogus();
   1155     return *this;
   1156   }
   1157 
   1158   releaseArray();
   1159 
   1160   if(textLength == -1) {
   1161     // text is terminated, or else it would have failed the above test
   1162     textLength = u_strlen(text);
   1163   }
   1164   setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
   1165 
   1166   fFlags = kReadonlyAlias;
   1167   return *this;
   1168 }
   1169 
   1170 // setTo() analogous to the writable-aliasing constructor with the same signature
   1171 UnicodeString &
   1172 UnicodeString::setTo(UChar *buffer,
   1173                      int32_t buffLength,
   1174                      int32_t buffCapacity) {
   1175   if(fFlags & kOpenGetBuffer) {
   1176     // do not modify a string that has an "open" getBuffer(minCapacity)
   1177     return *this;
   1178   }
   1179 
   1180   if(buffer == NULL) {
   1181     // treat as an empty string, do not alias
   1182     releaseArray();
   1183     setToEmpty();
   1184     return *this;
   1185   }
   1186 
   1187   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
   1188     setToBogus();
   1189     return *this;
   1190   } else if(buffLength == -1) {
   1191     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
   1192     const UChar *p = buffer, *limit = buffer + buffCapacity;
   1193     while(p != limit && *p != 0) {
   1194       ++p;
   1195     }
   1196     buffLength = (int32_t)(p - buffer);
   1197   }
   1198 
   1199   releaseArray();
   1200 
   1201   setArray(buffer, buffLength, buffCapacity);
   1202   fFlags = kWritableAlias;
   1203   return *this;
   1204 }
   1205 
   1206 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) {
   1207   unBogus();
   1208   int32_t length = utf8.length();
   1209   int32_t capacity;
   1210   // The UTF-16 string will be at most as long as the UTF-8 string.
   1211   if(length <= US_STACKBUF_SIZE) {
   1212     capacity = US_STACKBUF_SIZE;
   1213   } else {
   1214     capacity = length + 1;  // +1 for the terminating NUL.
   1215   }
   1216   UChar *utf16 = getBuffer(capacity);
   1217   int32_t length16;
   1218   UErrorCode errorCode = U_ZERO_ERROR;
   1219   u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
   1220       utf8.data(), length,
   1221       0xfffd,  // Substitution character.
   1222       NULL,    // Don't care about number of substitutions.
   1223       &errorCode);
   1224   releaseBuffer(length16);
   1225   if(U_FAILURE(errorCode)) {
   1226     setToBogus();
   1227   }
   1228   return *this;
   1229 }
   1230 
   1231 UnicodeString&
   1232 UnicodeString::setCharAt(int32_t offset,
   1233              UChar c)
   1234 {
   1235   int32_t len = length();
   1236   if(cloneArrayIfNeeded() && len > 0) {
   1237     if(offset < 0) {
   1238       offset = 0;
   1239     } else if(offset >= len) {
   1240       offset = len - 1;
   1241     }
   1242 
   1243     getArrayStart()[offset] = c;
   1244   }
   1245   return *this;
   1246 }
   1247 
   1248 UnicodeString&
   1249 UnicodeString::replace(int32_t start,
   1250                int32_t _length,
   1251                UChar32 srcChar) {
   1252   UChar buffer[U16_MAX_LENGTH];
   1253   int32_t count = 0;
   1254   UBool isError = FALSE;
   1255   U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
   1256   // We test isError so that the compiler does not complain that we don't.
   1257   // If isError (srcChar is not a valid code point) then count==0 which means
   1258   // we remove the source segment rather than replacing it with srcChar.
   1259   return doReplace(start, _length, buffer, 0, isError ? 0 : count);
   1260 }
   1261 
   1262 UnicodeString&
   1263 UnicodeString::append(UChar32 srcChar) {
   1264   UChar buffer[U16_MAX_LENGTH];
   1265   int32_t _length = 0;
   1266   UBool isError = FALSE;
   1267   U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
   1268   // We test isError so that the compiler does not complain that we don't.
   1269   // If isError then _length==0 which turns the doReplace() into a no-op anyway.
   1270   return isError ? *this : doReplace(length(), 0, buffer, 0, _length);
   1271 }
   1272 
   1273 UnicodeString&
   1274 UnicodeString::doReplace( int32_t start,
   1275               int32_t length,
   1276               const UnicodeString& src,
   1277               int32_t srcStart,
   1278               int32_t srcLength)
   1279 {
   1280   if(!src.isBogus()) {
   1281     // pin the indices to legal values
   1282     src.pinIndices(srcStart, srcLength);
   1283 
   1284     // get the characters from src
   1285     // and replace the range in ourselves with them
   1286     return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
   1287   } else {
   1288     // remove the range
   1289     return doReplace(start, length, 0, 0, 0);
   1290   }
   1291 }
   1292 
   1293 UnicodeString&
   1294 UnicodeString::doReplace(int32_t start,
   1295              int32_t length,
   1296              const UChar *srcChars,
   1297              int32_t srcStart,
   1298              int32_t srcLength)
   1299 {
   1300   if(!isWritable()) {
   1301     return *this;
   1302   }
   1303 
   1304   int32_t oldLength = this->length();
   1305 
   1306   // optimize (read-only alias).remove(0, start) and .remove(start, end)
   1307   if((fFlags&kBufferIsReadonly) && srcLength == 0) {
   1308     if(start == 0) {
   1309       // remove prefix by adjusting the array pointer
   1310       pinIndex(length);
   1311       fUnion.fFields.fArray += length;
   1312       fUnion.fFields.fCapacity -= length;
   1313       setLength(oldLength - length);
   1314       return *this;
   1315     } else {
   1316       pinIndex(start);
   1317       if(length >= (oldLength - start)) {
   1318         // remove suffix by reducing the length (like truncate())
   1319         setLength(start);
   1320         fUnion.fFields.fCapacity = start;  // not NUL-terminated any more
   1321         return *this;
   1322       }
   1323     }
   1324   }
   1325 
   1326   if(srcChars == 0) {
   1327     srcStart = srcLength = 0;
   1328   } else if(srcLength < 0) {
   1329     // get the srcLength if necessary
   1330     srcLength = u_strlen(srcChars + srcStart);
   1331   }
   1332 
   1333   // calculate the size of the string after the replace
   1334   int32_t newLength;
   1335 
   1336   // optimize append() onto a large-enough, owned string
   1337   if(start >= oldLength) {
   1338     if(srcLength == 0) {
   1339       return *this;
   1340     }
   1341     newLength = oldLength + srcLength;
   1342     if(newLength <= getCapacity() && isBufferWritable()) {
   1343       UChar *oldArray = getArrayStart();
   1344       // Do not copy characters when
   1345       //   UChar *buffer=str.getAppendBuffer(...);
   1346       // is followed by
   1347       //   str.append(buffer, length);
   1348       // or
   1349       //   str.appendString(buffer, length)
   1350       // or similar.
   1351       if(srcChars + srcStart != oldArray + start || start > oldLength) {
   1352         us_arrayCopy(srcChars, srcStart, oldArray, oldLength, srcLength);
   1353       }
   1354       setLength(newLength);
   1355       return *this;
   1356     } else {
   1357       // pin the indices to legal values
   1358       start = oldLength;
   1359       length = 0;
   1360     }
   1361   } else {
   1362     // pin the indices to legal values
   1363     pinIndices(start, length);
   1364 
   1365     newLength = oldLength - length + srcLength;
   1366   }
   1367 
   1368   // the following may change fArray but will not copy the current contents;
   1369   // therefore we need to keep the current fArray
   1370   UChar oldStackBuffer[US_STACKBUF_SIZE];
   1371   UChar *oldArray;
   1372   if((fFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
   1373     // copy the stack buffer contents because it will be overwritten with
   1374     // fUnion.fFields values
   1375     u_memcpy(oldStackBuffer, fUnion.fStackBuffer, oldLength);
   1376     oldArray = oldStackBuffer;
   1377   } else {
   1378     oldArray = getArrayStart();
   1379   }
   1380 
   1381   // clone our array and allocate a bigger array if needed
   1382   int32_t *bufferToDelete = 0;
   1383   if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize,
   1384                          FALSE, &bufferToDelete)
   1385   ) {
   1386     return *this;
   1387   }
   1388 
   1389   // now do the replace
   1390 
   1391   UChar *newArray = getArrayStart();
   1392   if(newArray != oldArray) {
   1393     // if fArray changed, then we need to copy everything except what will change
   1394     us_arrayCopy(oldArray, 0, newArray, 0, start);
   1395     us_arrayCopy(oldArray, start + length,
   1396                  newArray, start + srcLength,
   1397                  oldLength - (start + length));
   1398   } else if(length != srcLength) {
   1399     // fArray did not change; copy only the portion that isn't changing, leaving a hole
   1400     us_arrayCopy(oldArray, start + length,
   1401                  newArray, start + srcLength,
   1402                  oldLength - (start + length));
   1403   }
   1404 
   1405   // now fill in the hole with the new string
   1406   us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
   1407 
   1408   setLength(newLength);
   1409 
   1410   // delayed delete in case srcChars == fArray when we started, and
   1411   // to keep oldArray alive for the above operations
   1412   if (bufferToDelete) {
   1413     uprv_free(bufferToDelete);
   1414   }
   1415 
   1416   return *this;
   1417 }
   1418 
   1419 /**
   1420  * Replaceable API
   1421  */
   1422 void
   1423 UnicodeString::handleReplaceBetween(int32_t start,
   1424                                     int32_t limit,
   1425                                     const UnicodeString& text) {
   1426     replaceBetween(start, limit, text);
   1427 }
   1428 
   1429 /**
   1430  * Replaceable API
   1431  */
   1432 void
   1433 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
   1434     if (limit <= start) {
   1435         return; // Nothing to do; avoid bogus malloc call
   1436     }
   1437     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
   1438     // Check to make sure text is not null.
   1439     if (text != NULL) {
   1440 	    extractBetween(start, limit, text, 0);
   1441 	    insert(dest, text, 0, limit - start);
   1442 	    uprv_free(text);
   1443     }
   1444 }
   1445 
   1446 /**
   1447  * Replaceable API
   1448  *
   1449  * NOTE: This is for the Replaceable class.  There is no rep.cpp,
   1450  * so we implement this function here.
   1451  */
   1452 UBool Replaceable::hasMetaData() const {
   1453     return TRUE;
   1454 }
   1455 
   1456 /**
   1457  * Replaceable API
   1458  */
   1459 UBool UnicodeString::hasMetaData() const {
   1460     return FALSE;
   1461 }
   1462 
   1463 UnicodeString&
   1464 UnicodeString::doReverse(int32_t start, int32_t length) {
   1465   if(length <= 1 || !cloneArrayIfNeeded()) {
   1466     return *this;
   1467   }
   1468 
   1469   // pin the indices to legal values
   1470   pinIndices(start, length);
   1471   if(length <= 1) {  // pinIndices() might have shrunk the length
   1472     return *this;
   1473   }
   1474 
   1475   UChar *left = getArrayStart() + start;
   1476   UChar *right = left + length - 1;  // -1 for inclusive boundary (length>=2)
   1477   UChar swap;
   1478   UBool hasSupplementary = FALSE;
   1479 
   1480   // Before the loop we know left<right because length>=2.
   1481   do {
   1482     hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
   1483     hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
   1484     *right-- = swap;
   1485   } while(left < right);
   1486   // Make sure to test the middle code unit of an odd-length string.
   1487   // Redundant if the length is even.
   1488   hasSupplementary |= (UBool)U16_IS_LEAD(*left);
   1489 
   1490   /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
   1491   if(hasSupplementary) {
   1492     UChar swap2;
   1493 
   1494     left = getArrayStart() + start;
   1495     right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
   1496     while(left < right) {
   1497       if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
   1498         *left++ = swap2;
   1499         *left++ = swap;
   1500       } else {
   1501         ++left;
   1502       }
   1503     }
   1504   }
   1505 
   1506   return *this;
   1507 }
   1508 
   1509 UBool
   1510 UnicodeString::padLeading(int32_t targetLength,
   1511                           UChar padChar)
   1512 {
   1513   int32_t oldLength = length();
   1514   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
   1515     return FALSE;
   1516   } else {
   1517     // move contents up by padding width
   1518     UChar *array = getArrayStart();
   1519     int32_t start = targetLength - oldLength;
   1520     us_arrayCopy(array, 0, array, start, oldLength);
   1521 
   1522     // fill in padding character
   1523     while(--start >= 0) {
   1524       array[start] = padChar;
   1525     }
   1526     setLength(targetLength);
   1527     return TRUE;
   1528   }
   1529 }
   1530 
   1531 UBool
   1532 UnicodeString::padTrailing(int32_t targetLength,
   1533                            UChar padChar)
   1534 {
   1535   int32_t oldLength = length();
   1536   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
   1537     return FALSE;
   1538   } else {
   1539     // fill in padding character
   1540     UChar *array = getArrayStart();
   1541     int32_t length = targetLength;
   1542     while(--length >= oldLength) {
   1543       array[length] = padChar;
   1544     }
   1545     setLength(targetLength);
   1546     return TRUE;
   1547   }
   1548 }
   1549 
   1550 //========================================
   1551 // Hashing
   1552 //========================================
   1553 int32_t
   1554 UnicodeString::doHashCode() const
   1555 {
   1556     /* Delegate hash computation to uhash.  This makes UnicodeString
   1557      * hashing consistent with UChar* hashing.  */
   1558     int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
   1559     if (hashCode == kInvalidHashCode) {
   1560         hashCode = kEmptyHashCode;
   1561     }
   1562     return hashCode;
   1563 }
   1564 
   1565 //========================================
   1566 // External Buffer
   1567 //========================================
   1568 
   1569 UChar *
   1570 UnicodeString::getBuffer(int32_t minCapacity) {
   1571   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
   1572     fFlags|=kOpenGetBuffer;
   1573     fShortLength=0;
   1574     return getArrayStart();
   1575   } else {
   1576     return 0;
   1577   }
   1578 }
   1579 
   1580 void
   1581 UnicodeString::releaseBuffer(int32_t newLength) {
   1582   if(fFlags&kOpenGetBuffer && newLength>=-1) {
   1583     // set the new fLength
   1584     int32_t capacity=getCapacity();
   1585     if(newLength==-1) {
   1586       // the new length is the string length, capped by fCapacity
   1587       const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
   1588       while(p<limit && *p!=0) {
   1589         ++p;
   1590       }
   1591       newLength=(int32_t)(p-array);
   1592     } else if(newLength>capacity) {
   1593       newLength=capacity;
   1594     }
   1595     setLength(newLength);
   1596     fFlags&=~kOpenGetBuffer;
   1597   }
   1598 }
   1599 
   1600 //========================================
   1601 // Miscellaneous
   1602 //========================================
   1603 UBool
   1604 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
   1605                                   int32_t growCapacity,
   1606                                   UBool doCopyArray,
   1607                                   int32_t **pBufferToDelete,
   1608                                   UBool forceClone) {
   1609   // default parameters need to be static, therefore
   1610   // the defaults are -1 to have convenience defaults
   1611   if(newCapacity == -1) {
   1612     newCapacity = getCapacity();
   1613   }
   1614 
   1615   // while a getBuffer(minCapacity) is "open",
   1616   // prevent any modifications of the string by returning FALSE here
   1617   // if the string is bogus, then only an assignment or similar can revive it
   1618   if(!isWritable()) {
   1619     return FALSE;
   1620   }
   1621 
   1622   /*
   1623    * We need to make a copy of the array if
   1624    * the buffer is read-only, or
   1625    * the buffer is refCounted (shared), and refCount>1, or
   1626    * the buffer is too small.
   1627    * Return FALSE if memory could not be allocated.
   1628    */
   1629   if(forceClone ||
   1630      fFlags & kBufferIsReadonly ||
   1631      (fFlags & kRefCounted && refCount() > 1) ||
   1632      newCapacity > getCapacity()
   1633   ) {
   1634     // check growCapacity for default value and use of the stack buffer
   1635     if(growCapacity < 0) {
   1636       growCapacity = newCapacity;
   1637     } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
   1638       growCapacity = US_STACKBUF_SIZE;
   1639     }
   1640 
   1641     // save old values
   1642     UChar oldStackBuffer[US_STACKBUF_SIZE];
   1643     UChar *oldArray;
   1644     uint8_t flags = fFlags;
   1645 
   1646     if(flags&kUsingStackBuffer) {
   1647       U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */
   1648       if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
   1649         // copy the stack buffer contents because it will be overwritten with
   1650         // fUnion.fFields values
   1651         us_arrayCopy(fUnion.fStackBuffer, 0, oldStackBuffer, 0, fShortLength);
   1652         oldArray = oldStackBuffer;
   1653       } else {
   1654         oldArray = 0; // no need to copy from stack buffer to itself
   1655       }
   1656     } else {
   1657       oldArray = fUnion.fFields.fArray;
   1658       U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
   1659     }
   1660 
   1661     // allocate a new array
   1662     if(allocate(growCapacity) ||
   1663        (newCapacity < growCapacity && allocate(newCapacity))
   1664     ) {
   1665       if(doCopyArray && oldArray != 0) {
   1666         // copy the contents
   1667         // do not copy more than what fits - it may be smaller than before
   1668         int32_t minLength = length();
   1669         newCapacity = getCapacity();
   1670         if(newCapacity < minLength) {
   1671           minLength = newCapacity;
   1672           setLength(minLength);
   1673         }
   1674         us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
   1675       } else {
   1676         fShortLength = 0;
   1677       }
   1678 
   1679       // release the old array
   1680       if(flags & kRefCounted) {
   1681         // the array is refCounted; decrement and release if 0
   1682         int32_t *pRefCount = ((int32_t *)oldArray - 1);
   1683         if(umtx_atomic_dec(pRefCount) == 0) {
   1684           if(pBufferToDelete == 0) {
   1685             uprv_free(pRefCount);
   1686           } else {
   1687             // the caller requested to delete it himself
   1688             *pBufferToDelete = pRefCount;
   1689           }
   1690         }
   1691       }
   1692     } else {
   1693       // not enough memory for growCapacity and not even for the smaller newCapacity
   1694       // reset the old values for setToBogus() to release the array
   1695       if(!(flags&kUsingStackBuffer)) {
   1696         fUnion.fFields.fArray = oldArray;
   1697       }
   1698       fFlags = flags;
   1699       setToBogus();
   1700       return FALSE;
   1701     }
   1702   }
   1703   return TRUE;
   1704 }
   1705 
   1706 // UnicodeStringAppendable ------------------------------------------------- ***
   1707 
   1708 UnicodeStringAppendable::~UnicodeStringAppendable() {}
   1709 
   1710 UBool
   1711 UnicodeStringAppendable::appendCodeUnit(UChar c) {
   1712   return str.doReplace(str.length(), 0, &c, 0, 1).isWritable();
   1713 }
   1714 
   1715 UBool
   1716 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
   1717   UChar buffer[U16_MAX_LENGTH];
   1718   int32_t cLength = 0;
   1719   UBool isError = FALSE;
   1720   U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
   1721   return !isError && str.doReplace(str.length(), 0, buffer, 0, cLength).isWritable();
   1722 }
   1723 
   1724 UBool
   1725 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
   1726   return str.doReplace(str.length(), 0, s, 0, length).isWritable();
   1727 }
   1728 
   1729 UBool
   1730 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
   1731   return str.cloneArrayIfNeeded(str.length() + appendCapacity);
   1732 }
   1733 
   1734 UChar *
   1735 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
   1736                                          int32_t desiredCapacityHint,
   1737                                          UChar *scratch, int32_t scratchCapacity,
   1738                                          int32_t *resultCapacity) {
   1739   if(minCapacity < 1 || scratchCapacity < minCapacity) {
   1740     *resultCapacity = 0;
   1741     return NULL;
   1742   }
   1743   int32_t oldLength = str.length();
   1744   if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
   1745     *resultCapacity = str.getCapacity() - oldLength;
   1746     return str.getArrayStart() + oldLength;
   1747   }
   1748   *resultCapacity = scratchCapacity;
   1749   return scratch;
   1750 }
   1751 
   1752 U_NAMESPACE_END
   1753 
   1754 U_NAMESPACE_USE
   1755 
   1756 U_CAPI int32_t U_EXPORT2
   1757 uhash_hashUnicodeString(const UElement key) {
   1758     const UnicodeString *str = (const UnicodeString*) key.pointer;
   1759     return (str == NULL) ? 0 : str->hashCode();
   1760 }
   1761 
   1762 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
   1763 // does not depend on hashtable code.
   1764 U_CAPI UBool U_EXPORT2
   1765 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
   1766     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
   1767     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
   1768     if (str1 == str2) {
   1769         return TRUE;
   1770     }
   1771     if (str1 == NULL || str2 == NULL) {
   1772         return FALSE;
   1773     }
   1774     return *str1 == *str2;
   1775 }
   1776 
   1777 #ifdef U_STATIC_IMPLEMENTATION
   1778 /*
   1779 This should never be called. It is defined here to make sure that the
   1780 virtual vector deleting destructor is defined within unistr.cpp.
   1781 The vector deleting destructor is already a part of UObject,
   1782 but defining it here makes sure that it is included with this object file.
   1783 This makes sure that static library dependencies are kept to a minimum.
   1784 */
   1785 static void uprv_UnicodeStringDummy(void) {
   1786     delete [] (new UnicodeString[2]);
   1787 }
   1788 #endif
   1789