Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 * Copyright (C) 1999-2014, International Business Machines Corporation and
      4 * others. All Rights Reserved.
      5 ******************************************************************************
      6 *
      7 * File unistr.cpp
      8 *
      9 * Modification History:
     10 *
     11 *   Date        Name        Description
     12 *   09/25/98    stephen     Creation.
     13 *   04/20/99    stephen     Overhauled per 4/16 code review.
     14 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
     15 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
     16 *                           Replaceable.
     17 *   06/25/01    grhoten     Removed the dependency on iostream
     18 ******************************************************************************
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 #include "unicode/appendable.h"
     23 #include "unicode/putil.h"
     24 #include "cstring.h"
     25 #include "cmemory.h"
     26 #include "unicode/ustring.h"
     27 #include "unicode/unistr.h"
     28 #include "unicode/utf.h"
     29 #include "unicode/utf16.h"
     30 #include "uelement.h"
     31 #include "ustr_imp.h"
     32 #include "umutex.h"
     33 #include "uassert.h"
     34 
     35 #if 0
     36 
     37 #include <iostream>
     38 using namespace std;
     39 
     40 //DEBUGGING
     41 void
     42 print(const UnicodeString& s,
     43       const char *name)
     44 {
     45   UChar c;
     46   cout << name << ":|";
     47   for(int i = 0; i < s.length(); ++i) {
     48     c = s[i];
     49     if(c>= 0x007E || c < 0x0020)
     50       cout << "[0x" << hex << s[i] << "]";
     51     else
     52       cout << (char) s[i];
     53   }
     54   cout << '|' << endl;
     55 }
     56 
     57 void
     58 print(const UChar *s,
     59       int32_t len,
     60       const char *name)
     61 {
     62   UChar c;
     63   cout << name << ":|";
     64   for(int i = 0; i < len; ++i) {
     65     c = s[i];
     66     if(c>= 0x007E || c < 0x0020)
     67       cout << "[0x" << hex << s[i] << "]";
     68     else
     69       cout << (char) s[i];
     70   }
     71   cout << '|' << endl;
     72 }
     73 // END DEBUGGING
     74 #endif
     75 
     76 // Local function definitions for now
     77 
     78 // need to copy areas that may overlap
     79 static
     80 inline void
     81 us_arrayCopy(const UChar *src, int32_t srcStart,
     82          UChar *dst, int32_t dstStart, int32_t count)
     83 {
     84   if(count>0) {
     85     uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
     86   }
     87 }
     88 
     89 // u_unescapeAt() callback to get a UChar from a UnicodeString
     90 U_CDECL_BEGIN
     91 static UChar U_CALLCONV
     92 UnicodeString_charAt(int32_t offset, void *context) {
     93     return ((icu::UnicodeString*) context)->charAt(offset);
     94 }
     95 U_CDECL_END
     96 
     97 U_NAMESPACE_BEGIN
     98 
     99 /* The Replaceable virtual destructor can't be defined in the header
    100    due to how AIX works with multiple definitions of virtual functions.
    101 */
    102 Replaceable::~Replaceable() {}
    103 
    104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
    105 
    106 UnicodeString U_EXPORT2
    107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
    108     return
    109         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
    110             append(s1).
    111                 append(s2);
    112 }
    113 
    114 //========================================
    115 // Reference Counting functions, put at top of file so that optimizing compilers
    116 //                               have a chance to automatically inline.
    117 //========================================
    118 
    119 void
    120 UnicodeString::addRef() {
    121   umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
    122 }
    123 
    124 int32_t
    125 UnicodeString::removeRef() {
    126   return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
    127 }
    128 
    129 int32_t
    130 UnicodeString::refCount() const {
    131   return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
    132 }
    133 
    134 void
    135 UnicodeString::releaseArray() {
    136   if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
    137     uprv_free((int32_t *)fUnion.fFields.fArray - 1);
    138   }
    139 }
    140 
    141 
    142 
    143 //========================================
    144 // Constructors
    145 //========================================
    146 
    147 // The default constructor is inline in unistr.h.
    148 
    149 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
    150   fUnion.fFields.fLengthAndFlags = 0;
    151   if(count <= 0 || (uint32_t)c > 0x10ffff) {
    152     // just allocate and do not do anything else
    153     allocate(capacity);
    154   } else {
    155     // count > 0, allocate and fill the new string with count c's
    156     int32_t unitCount = U16_LENGTH(c), length = count * unitCount;
    157     if(capacity < length) {
    158       capacity = length;
    159     }
    160     if(allocate(capacity)) {
    161       UChar *array = getArrayStart();
    162       int32_t i = 0;
    163 
    164       // fill the new string with c
    165       if(unitCount == 1) {
    166         // fill with length UChars
    167         while(i < length) {
    168           array[i++] = (UChar)c;
    169         }
    170       } else {
    171         // get the code units for c
    172         UChar units[U16_MAX_LENGTH];
    173         U16_APPEND_UNSAFE(units, i, c);
    174 
    175         // now it must be i==unitCount
    176         i = 0;
    177 
    178         // for Unicode, unitCount can only be 1, 2, 3, or 4
    179         // 1 is handled above
    180         while(i < length) {
    181           int32_t unitIdx = 0;
    182           while(unitIdx < unitCount) {
    183             array[i++]=units[unitIdx++];
    184           }
    185         }
    186       }
    187     }
    188     setLength(length);
    189   }
    190 }
    191 
    192 UnicodeString::UnicodeString(UChar ch) {
    193   fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
    194   fUnion.fStackFields.fBuffer[0] = ch;
    195 }
    196 
    197 UnicodeString::UnicodeString(UChar32 ch) {
    198   fUnion.fFields.fLengthAndFlags = kShortString;
    199   int32_t i = 0;
    200   UBool isError = FALSE;
    201   U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
    202   // We test isError so that the compiler does not complain that we don't.
    203   // If isError then i==0 which is what we want anyway.
    204   if(!isError) {
    205     setShortLength(i);
    206   }
    207 }
    208 
    209 UnicodeString::UnicodeString(const UChar *text) {
    210   fUnion.fFields.fLengthAndFlags = kShortString;
    211   doReplace(0, 0, text, 0, -1);
    212 }
    213 
    214 UnicodeString::UnicodeString(const UChar *text,
    215                              int32_t textLength) {
    216   fUnion.fFields.fLengthAndFlags = kShortString;
    217   doReplace(0, 0, text, 0, textLength);
    218 }
    219 
    220 UnicodeString::UnicodeString(UBool isTerminated,
    221                              const UChar *text,
    222                              int32_t textLength) {
    223   fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
    224   if(text == NULL) {
    225     // treat as an empty string, do not alias
    226     setToEmpty();
    227   } else if(textLength < -1 ||
    228             (textLength == -1 && !isTerminated) ||
    229             (textLength >= 0 && isTerminated && text[textLength] != 0)
    230   ) {
    231     setToBogus();
    232   } else {
    233     if(textLength == -1) {
    234       // text is terminated, or else it would have failed the above test
    235       textLength = u_strlen(text);
    236     }
    237     setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
    238   }
    239 }
    240 
    241 UnicodeString::UnicodeString(UChar *buff,
    242                              int32_t buffLength,
    243                              int32_t buffCapacity) {
    244   fUnion.fFields.fLengthAndFlags = kWritableAlias;
    245   if(buff == NULL) {
    246     // treat as an empty string, do not alias
    247     setToEmpty();
    248   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
    249     setToBogus();
    250   } else {
    251     if(buffLength == -1) {
    252       // fLength = u_strlen(buff); but do not look beyond buffCapacity
    253       const UChar *p = buff, *limit = buff + buffCapacity;
    254       while(p != limit && *p != 0) {
    255         ++p;
    256       }
    257       buffLength = (int32_t)(p - buff);
    258     }
    259     setArray(buff, buffLength, buffCapacity);
    260   }
    261 }
    262 
    263 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
    264   fUnion.fFields.fLengthAndFlags = kShortString;
    265   if(src==NULL) {
    266     // treat as an empty string
    267   } else {
    268     if(length<0) {
    269       length=(int32_t)uprv_strlen(src);
    270     }
    271     if(cloneArrayIfNeeded(length, length, FALSE)) {
    272       u_charsToUChars(src, getArrayStart(), length);
    273       setLength(length);
    274     } else {
    275       setToBogus();
    276     }
    277   }
    278 }
    279 
    280 #if U_CHARSET_IS_UTF8
    281 
    282 UnicodeString::UnicodeString(const char *codepageData) {
    283   fUnion.fFields.fLengthAndFlags = kShortString;
    284   if(codepageData != 0) {
    285     setToUTF8(codepageData);
    286   }
    287 }
    288 
    289 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
    290   fUnion.fFields.fLengthAndFlags = kShortString;
    291   // if there's nothing to convert, do nothing
    292   if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
    293     return;
    294   }
    295   if(dataLength == -1) {
    296     dataLength = (int32_t)uprv_strlen(codepageData);
    297   }
    298   setToUTF8(StringPiece(codepageData, dataLength));
    299 }
    300 
    301 // else see unistr_cnv.cpp
    302 #endif
    303 
    304 UnicodeString::UnicodeString(const UnicodeString& that) {
    305   fUnion.fFields.fLengthAndFlags = kShortString;
    306   copyFrom(that);
    307 }
    308 
    309 UnicodeString::UnicodeString(const UnicodeString& that,
    310                              int32_t srcStart) {
    311   fUnion.fFields.fLengthAndFlags = kShortString;
    312   setTo(that, srcStart);
    313 }
    314 
    315 UnicodeString::UnicodeString(const UnicodeString& that,
    316                              int32_t srcStart,
    317                              int32_t srcLength) {
    318   fUnion.fFields.fLengthAndFlags = kShortString;
    319   setTo(that, srcStart, srcLength);
    320 }
    321 
    322 // Replaceable base class clone() default implementation, does not clone
    323 Replaceable *
    324 Replaceable::clone() const {
    325   return NULL;
    326 }
    327 
    328 // UnicodeString overrides clone() with a real implementation
    329 Replaceable *
    330 UnicodeString::clone() const {
    331   return new UnicodeString(*this);
    332 }
    333 
    334 //========================================
    335 // array allocation
    336 //========================================
    337 
    338 UBool
    339 UnicodeString::allocate(int32_t capacity) {
    340   if(capacity <= US_STACKBUF_SIZE) {
    341     fUnion.fFields.fLengthAndFlags = kShortString;
    342   } else {
    343     // count bytes for the refCounter and the string capacity, and
    344     // round up to a multiple of 16; then divide by 4 and allocate int32_t's
    345     // to be safely aligned for the refCount
    346     // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
    347     int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
    348     int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
    349     if(array != 0) {
    350       // set initial refCount and point behind the refCount
    351       *array++ = 1;
    352 
    353       // have fArray point to the first UChar
    354       fUnion.fFields.fArray = (UChar *)array;
    355       fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
    356       fUnion.fFields.fLengthAndFlags = kLongString;
    357     } else {
    358       fUnion.fFields.fLengthAndFlags = kIsBogus;
    359       fUnion.fFields.fArray = 0;
    360       fUnion.fFields.fCapacity = 0;
    361       return FALSE;
    362     }
    363   }
    364   return TRUE;
    365 }
    366 
    367 //========================================
    368 // Destructor
    369 //========================================
    370 UnicodeString::~UnicodeString()
    371 {
    372   releaseArray();
    373 }
    374 
    375 //========================================
    376 // Factory methods
    377 //========================================
    378 
    379 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
    380   UnicodeString result;
    381   result.setToUTF8(utf8);
    382   return result;
    383 }
    384 
    385 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
    386   UnicodeString result;
    387   int32_t capacity;
    388   // Most UTF-32 strings will be BMP-only and result in a same-length
    389   // UTF-16 string. We overestimate the capacity just slightly,
    390   // just in case there are a few supplementary characters.
    391   if(length <= US_STACKBUF_SIZE) {
    392     capacity = US_STACKBUF_SIZE;
    393   } else {
    394     capacity = length + (length >> 4) + 4;
    395   }
    396   do {
    397     UChar *utf16 = result.getBuffer(capacity);
    398     int32_t length16;
    399     UErrorCode errorCode = U_ZERO_ERROR;
    400     u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
    401         utf32, length,
    402         0xfffd,  // Substitution character.
    403         NULL,    // Don't care about number of substitutions.
    404         &errorCode);
    405     result.releaseBuffer(length16);
    406     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
    407       capacity = length16 + 1;  // +1 for the terminating NUL.
    408       continue;
    409     } else if(U_FAILURE(errorCode)) {
    410       result.setToBogus();
    411     }
    412     break;
    413   } while(TRUE);
    414   return result;
    415 }
    416 
    417 //========================================
    418 // Assignment
    419 //========================================
    420 
    421 UnicodeString &
    422 UnicodeString::operator=(const UnicodeString &src) {
    423   return copyFrom(src);
    424 }
    425 
    426 UnicodeString &
    427 UnicodeString::fastCopyFrom(const UnicodeString &src) {
    428   return copyFrom(src, TRUE);
    429 }
    430 
    431 UnicodeString &
    432 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
    433   // if assigning to ourselves, do nothing
    434   if(this == &src) {
    435     return *this;
    436   }
    437 
    438   // is the right side bogus?
    439   if(src.isBogus()) {
    440     setToBogus();
    441     return *this;
    442   }
    443 
    444   // delete the current contents
    445   releaseArray();
    446 
    447   if(src.isEmpty()) {
    448     // empty string - use the stack buffer
    449     setToEmpty();
    450     return *this;
    451   }
    452 
    453   // fLength>0 and not an "open" src.getBuffer(minCapacity)
    454   fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
    455   switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
    456   case kShortString:
    457     // short string using the stack buffer, do the same
    458     uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
    459                 getShortLength() * U_SIZEOF_UCHAR);
    460     break;
    461   case kLongString:
    462     // src uses a refCounted string buffer, use that buffer with refCount
    463     // src is const, use a cast - we don't actually change it
    464     ((UnicodeString &)src).addRef();
    465     // copy all fields, share the reference-counted buffer
    466     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
    467     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
    468     if(!hasShortLength()) {
    469       fUnion.fFields.fLength = src.fUnion.fFields.fLength;
    470     }
    471     break;
    472   case kReadonlyAlias:
    473     if(fastCopy) {
    474       // src is a readonly alias, do the same
    475       // -> maintain the readonly alias as such
    476       fUnion.fFields.fArray = src.fUnion.fFields.fArray;
    477       fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
    478       if(!hasShortLength()) {
    479         fUnion.fFields.fLength = src.fUnion.fFields.fLength;
    480       }
    481       break;
    482     }
    483     // else if(!fastCopy) fall through to case kWritableAlias
    484     // -> allocate a new buffer and copy the contents
    485   case kWritableAlias: {
    486     // src is a writable alias; we make a copy of that instead
    487     int32_t srcLength = src.length();
    488     if(allocate(srcLength)) {
    489       uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR);
    490       setLength(srcLength);
    491       break;
    492     }
    493     // if there is not enough memory, then fall through to setting to bogus
    494   }
    495   default:
    496     // if src is bogus, set ourselves to bogus
    497     // do not call setToBogus() here because fArray and flags are not consistent here
    498     fUnion.fFields.fLengthAndFlags = kIsBogus;
    499     fUnion.fFields.fArray = 0;
    500     fUnion.fFields.fCapacity = 0;
    501     break;
    502   }
    503 
    504   return *this;
    505 }
    506 
    507 //========================================
    508 // Miscellaneous operations
    509 //========================================
    510 
    511 UnicodeString UnicodeString::unescape() const {
    512     UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
    513     const UChar *array = getBuffer();
    514     int32_t len = length();
    515     int32_t prev = 0;
    516     for (int32_t i=0;;) {
    517         if (i == len) {
    518             result.append(array, prev, len - prev);
    519             break;
    520         }
    521         if (array[i++] == 0x5C /*'\\'*/) {
    522             result.append(array, prev, (i - 1) - prev);
    523             UChar32 c = unescapeAt(i); // advances i
    524             if (c < 0) {
    525                 result.remove(); // return empty string
    526                 break; // invalid escape sequence
    527             }
    528             result.append(c);
    529             prev = i;
    530         }
    531     }
    532     return result;
    533 }
    534 
    535 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
    536     return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
    537 }
    538 
    539 //========================================
    540 // Read-only implementation
    541 //========================================
    542 UBool
    543 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
    544   // Requires: this & text not bogus and have same lengths.
    545   // Byte-wise comparison works for equality regardless of endianness.
    546   return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
    547 }
    548 
    549 int8_t
    550 UnicodeString::doCompare( int32_t start,
    551               int32_t length,
    552               const UChar *srcChars,
    553               int32_t srcStart,
    554               int32_t srcLength) const
    555 {
    556   // compare illegal string values
    557   if(isBogus()) {
    558     return -1;
    559   }
    560 
    561   // pin indices to legal values
    562   pinIndices(start, length);
    563 
    564   if(srcChars == NULL) {
    565     // treat const UChar *srcChars==NULL as an empty string
    566     return length == 0 ? 0 : 1;
    567   }
    568 
    569   // get the correct pointer
    570   const UChar *chars = getArrayStart();
    571 
    572   chars += start;
    573   srcChars += srcStart;
    574 
    575   int32_t minLength;
    576   int8_t lengthResult;
    577 
    578   // get the srcLength if necessary
    579   if(srcLength < 0) {
    580     srcLength = u_strlen(srcChars + srcStart);
    581   }
    582 
    583   // are we comparing different lengths?
    584   if(length != srcLength) {
    585     if(length < srcLength) {
    586       minLength = length;
    587       lengthResult = -1;
    588     } else {
    589       minLength = srcLength;
    590       lengthResult = 1;
    591     }
    592   } else {
    593     minLength = length;
    594     lengthResult = 0;
    595   }
    596 
    597   /*
    598    * note that uprv_memcmp() returns an int but we return an int8_t;
    599    * we need to take care not to truncate the result -
    600    * one way to do this is to right-shift the value to
    601    * move the sign bit into the lower 8 bits and making sure that this
    602    * does not become 0 itself
    603    */
    604 
    605   if(minLength > 0 && chars != srcChars) {
    606     int32_t result;
    607 
    608 #   if U_IS_BIG_ENDIAN
    609       // big-endian: byte comparison works
    610       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
    611       if(result != 0) {
    612         return (int8_t)(result >> 15 | 1);
    613       }
    614 #   else
    615       // little-endian: compare UChar units
    616       do {
    617         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
    618         if(result != 0) {
    619           return (int8_t)(result >> 15 | 1);
    620         }
    621       } while(--minLength > 0);
    622 #   endif
    623   }
    624   return lengthResult;
    625 }
    626 
    627 /* String compare in code point order - doCompare() compares in code unit order. */
    628 int8_t
    629 UnicodeString::doCompareCodePointOrder(int32_t start,
    630                                        int32_t length,
    631                                        const UChar *srcChars,
    632                                        int32_t srcStart,
    633                                        int32_t srcLength) const
    634 {
    635   // compare illegal string values
    636   // treat const UChar *srcChars==NULL as an empty string
    637   if(isBogus()) {
    638     return -1;
    639   }
    640 
    641   // pin indices to legal values
    642   pinIndices(start, length);
    643 
    644   if(srcChars == NULL) {
    645     srcStart = srcLength = 0;
    646   }
    647 
    648   int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
    649   /* translate the 32-bit result into an 8-bit one */
    650   if(diff!=0) {
    651     return (int8_t)(diff >> 15 | 1);
    652   } else {
    653     return 0;
    654   }
    655 }
    656 
    657 int32_t
    658 UnicodeString::getLength() const {
    659     return length();
    660 }
    661 
    662 UChar
    663 UnicodeString::getCharAt(int32_t offset) const {
    664   return charAt(offset);
    665 }
    666 
    667 UChar32
    668 UnicodeString::getChar32At(int32_t offset) const {
    669   return char32At(offset);
    670 }
    671 
    672 UChar32
    673 UnicodeString::char32At(int32_t offset) const
    674 {
    675   int32_t len = length();
    676   if((uint32_t)offset < (uint32_t)len) {
    677     const UChar *array = getArrayStart();
    678     UChar32 c;
    679     U16_GET(array, 0, offset, len, c);
    680     return c;
    681   } else {
    682     return kInvalidUChar;
    683   }
    684 }
    685 
    686 int32_t
    687 UnicodeString::getChar32Start(int32_t offset) const {
    688   if((uint32_t)offset < (uint32_t)length()) {
    689     const UChar *array = getArrayStart();
    690     U16_SET_CP_START(array, 0, offset);
    691     return offset;
    692   } else {
    693     return 0;
    694   }
    695 }
    696 
    697 int32_t
    698 UnicodeString::getChar32Limit(int32_t offset) const {
    699   int32_t len = length();
    700   if((uint32_t)offset < (uint32_t)len) {
    701     const UChar *array = getArrayStart();
    702     U16_SET_CP_LIMIT(array, 0, offset, len);
    703     return offset;
    704   } else {
    705     return len;
    706   }
    707 }
    708 
    709 int32_t
    710 UnicodeString::countChar32(int32_t start, int32_t length) const {
    711   pinIndices(start, length);
    712   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
    713   return u_countChar32(getArrayStart()+start, length);
    714 }
    715 
    716 UBool
    717 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
    718   pinIndices(start, length);
    719   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
    720   return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
    721 }
    722 
    723 int32_t
    724 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
    725   // pin index
    726   int32_t len = length();
    727   if(index<0) {
    728     index=0;
    729   } else if(index>len) {
    730     index=len;
    731   }
    732 
    733   const UChar *array = getArrayStart();
    734   if(delta>0) {
    735     U16_FWD_N(array, index, len, delta);
    736   } else {
    737     U16_BACK_N(array, 0, index, -delta);
    738   }
    739 
    740   return index;
    741 }
    742 
    743 void
    744 UnicodeString::doExtract(int32_t start,
    745              int32_t length,
    746              UChar *dst,
    747              int32_t dstStart) const
    748 {
    749   // pin indices to legal values
    750   pinIndices(start, length);
    751 
    752   // do not copy anything if we alias dst itself
    753   const UChar *array = getArrayStart();
    754   if(array + start != dst + dstStart) {
    755     us_arrayCopy(array, start, dst, dstStart, length);
    756   }
    757 }
    758 
    759 int32_t
    760 UnicodeString::extract(UChar *dest, int32_t destCapacity,
    761                        UErrorCode &errorCode) const {
    762   int32_t len = length();
    763   if(U_SUCCESS(errorCode)) {
    764     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
    765       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    766     } else {
    767       const UChar *array = getArrayStart();
    768       if(len>0 && len<=destCapacity && array!=dest) {
    769         uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR);
    770       }
    771       return u_terminateUChars(dest, destCapacity, len, &errorCode);
    772     }
    773   }
    774 
    775   return len;
    776 }
    777 
    778 int32_t
    779 UnicodeString::extract(int32_t start,
    780                        int32_t length,
    781                        char *target,
    782                        int32_t targetCapacity,
    783                        enum EInvariant) const
    784 {
    785   // if the arguments are illegal, then do nothing
    786   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
    787     return 0;
    788   }
    789 
    790   // pin the indices to legal values
    791   pinIndices(start, length);
    792 
    793   if(length <= targetCapacity) {
    794     u_UCharsToChars(getArrayStart() + start, target, length);
    795   }
    796   UErrorCode status = U_ZERO_ERROR;
    797   return u_terminateChars(target, targetCapacity, length, &status);
    798 }
    799 
    800 UnicodeString
    801 UnicodeString::tempSubString(int32_t start, int32_t len) const {
    802   pinIndices(start, len);
    803   const UChar *array = getBuffer();  // not getArrayStart() to check kIsBogus & kOpenGetBuffer
    804   if(array==NULL) {
    805     array=fUnion.fStackFields.fBuffer;  // anything not NULL because that would make an empty string
    806     len=-2;  // bogus result string
    807   }
    808   return UnicodeString(FALSE, array + start, len);
    809 }
    810 
    811 int32_t
    812 UnicodeString::toUTF8(int32_t start, int32_t len,
    813                       char *target, int32_t capacity) const {
    814   pinIndices(start, len);
    815   int32_t length8;
    816   UErrorCode errorCode = U_ZERO_ERROR;
    817   u_strToUTF8WithSub(target, capacity, &length8,
    818                      getBuffer() + start, len,
    819                      0xFFFD,  // Standard substitution character.
    820                      NULL,    // Don't care about number of substitutions.
    821                      &errorCode);
    822   return length8;
    823 }
    824 
    825 #if U_CHARSET_IS_UTF8
    826 
    827 int32_t
    828 UnicodeString::extract(int32_t start, int32_t len,
    829                        char *target, uint32_t dstSize) const {
    830   // if the arguments are illegal, then do nothing
    831   if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
    832     return 0;
    833   }
    834   return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
    835 }
    836 
    837 // else see unistr_cnv.cpp
    838 #endif
    839 
    840 void
    841 UnicodeString::extractBetween(int32_t start,
    842                   int32_t limit,
    843                   UnicodeString& target) const {
    844   pinIndex(start);
    845   pinIndex(limit);
    846   doExtract(start, limit - start, target);
    847 }
    848 
    849 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
    850 // as many bytes as the source has UChars.
    851 // The "worst cases" are writing systems like Indic, Thai and CJK with
    852 // 3:1 bytes:UChars.
    853 void
    854 UnicodeString::toUTF8(ByteSink &sink) const {
    855   int32_t length16 = length();
    856   if(length16 != 0) {
    857     char stackBuffer[1024];
    858     int32_t capacity = (int32_t)sizeof(stackBuffer);
    859     UBool utf8IsOwned = FALSE;
    860     char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
    861                                       3*length16,
    862                                       stackBuffer, capacity,
    863                                       &capacity);
    864     int32_t length8 = 0;
    865     UErrorCode errorCode = U_ZERO_ERROR;
    866     u_strToUTF8WithSub(utf8, capacity, &length8,
    867                        getBuffer(), length16,
    868                        0xFFFD,  // Standard substitution character.
    869                        NULL,    // Don't care about number of substitutions.
    870                        &errorCode);
    871     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
    872       utf8 = (char *)uprv_malloc(length8);
    873       if(utf8 != NULL) {
    874         utf8IsOwned = TRUE;
    875         errorCode = U_ZERO_ERROR;
    876         u_strToUTF8WithSub(utf8, length8, &length8,
    877                            getBuffer(), length16,
    878                            0xFFFD,  // Standard substitution character.
    879                            NULL,    // Don't care about number of substitutions.
    880                            &errorCode);
    881       } else {
    882         errorCode = U_MEMORY_ALLOCATION_ERROR;
    883       }
    884     }
    885     if(U_SUCCESS(errorCode)) {
    886       sink.Append(utf8, length8);
    887       sink.Flush();
    888     }
    889     if(utf8IsOwned) {
    890       uprv_free(utf8);
    891     }
    892   }
    893 }
    894 
    895 int32_t
    896 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
    897   int32_t length32=0;
    898   if(U_SUCCESS(errorCode)) {
    899     // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
    900     u_strToUTF32WithSub(utf32, capacity, &length32,
    901         getBuffer(), length(),
    902         0xfffd,  // Substitution character.
    903         NULL,    // Don't care about number of substitutions.
    904         &errorCode);
    905   }
    906   return length32;
    907 }
    908 
    909 int32_t
    910 UnicodeString::indexOf(const UChar *srcChars,
    911                int32_t srcStart,
    912                int32_t srcLength,
    913                int32_t start,
    914                int32_t length) const
    915 {
    916   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
    917     return -1;
    918   }
    919 
    920   // UnicodeString does not find empty substrings
    921   if(srcLength < 0 && srcChars[srcStart] == 0) {
    922     return -1;
    923   }
    924 
    925   // get the indices within bounds
    926   pinIndices(start, length);
    927 
    928   // find the first occurrence of the substring
    929   const UChar *array = getArrayStart();
    930   const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
    931   if(match == NULL) {
    932     return -1;
    933   } else {
    934     return (int32_t)(match - array);
    935   }
    936 }
    937 
    938 int32_t
    939 UnicodeString::doIndexOf(UChar c,
    940              int32_t start,
    941              int32_t length) const
    942 {
    943   // pin indices
    944   pinIndices(start, length);
    945 
    946   // find the first occurrence of c
    947   const UChar *array = getArrayStart();
    948   const UChar *match = u_memchr(array + start, c, length);
    949   if(match == NULL) {
    950     return -1;
    951   } else {
    952     return (int32_t)(match - array);
    953   }
    954 }
    955 
    956 int32_t
    957 UnicodeString::doIndexOf(UChar32 c,
    958                          int32_t start,
    959                          int32_t length) const {
    960   // pin indices
    961   pinIndices(start, length);
    962 
    963   // find the first occurrence of c
    964   const UChar *array = getArrayStart();
    965   const UChar *match = u_memchr32(array + start, c, length);
    966   if(match == NULL) {
    967     return -1;
    968   } else {
    969     return (int32_t)(match - array);
    970   }
    971 }
    972 
    973 int32_t
    974 UnicodeString::lastIndexOf(const UChar *srcChars,
    975                int32_t srcStart,
    976                int32_t srcLength,
    977                int32_t start,
    978                int32_t length) const
    979 {
    980   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
    981     return -1;
    982   }
    983 
    984   // UnicodeString does not find empty substrings
    985   if(srcLength < 0 && srcChars[srcStart] == 0) {
    986     return -1;
    987   }
    988 
    989   // get the indices within bounds
    990   pinIndices(start, length);
    991 
    992   // find the last occurrence of the substring
    993   const UChar *array = getArrayStart();
    994   const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
    995   if(match == NULL) {
    996     return -1;
    997   } else {
    998     return (int32_t)(match - array);
    999   }
   1000 }
   1001 
   1002 int32_t
   1003 UnicodeString::doLastIndexOf(UChar c,
   1004                  int32_t start,
   1005                  int32_t length) const
   1006 {
   1007   if(isBogus()) {
   1008     return -1;
   1009   }
   1010 
   1011   // pin indices
   1012   pinIndices(start, length);
   1013 
   1014   // find the last occurrence of c
   1015   const UChar *array = getArrayStart();
   1016   const UChar *match = u_memrchr(array + start, c, length);
   1017   if(match == NULL) {
   1018     return -1;
   1019   } else {
   1020     return (int32_t)(match - array);
   1021   }
   1022 }
   1023 
   1024 int32_t
   1025 UnicodeString::doLastIndexOf(UChar32 c,
   1026                              int32_t start,
   1027                              int32_t length) const {
   1028   // pin indices
   1029   pinIndices(start, length);
   1030 
   1031   // find the last occurrence of c
   1032   const UChar *array = getArrayStart();
   1033   const UChar *match = u_memrchr32(array + start, c, length);
   1034   if(match == NULL) {
   1035     return -1;
   1036   } else {
   1037     return (int32_t)(match - array);
   1038   }
   1039 }
   1040 
   1041 //========================================
   1042 // Write implementation
   1043 //========================================
   1044 
   1045 UnicodeString&
   1046 UnicodeString::findAndReplace(int32_t start,
   1047                   int32_t length,
   1048                   const UnicodeString& oldText,
   1049                   int32_t oldStart,
   1050                   int32_t oldLength,
   1051                   const UnicodeString& newText,
   1052                   int32_t newStart,
   1053                   int32_t newLength)
   1054 {
   1055   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
   1056     return *this;
   1057   }
   1058 
   1059   pinIndices(start, length);
   1060   oldText.pinIndices(oldStart, oldLength);
   1061   newText.pinIndices(newStart, newLength);
   1062 
   1063   if(oldLength == 0) {
   1064     return *this;
   1065   }
   1066 
   1067   while(length > 0 && length >= oldLength) {
   1068     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
   1069     if(pos < 0) {
   1070       // no more oldText's here: done
   1071       break;
   1072     } else {
   1073       // we found oldText, replace it by newText and go beyond it
   1074       replace(pos, oldLength, newText, newStart, newLength);
   1075       length -= pos + oldLength - start;
   1076       start = pos + newLength;
   1077     }
   1078   }
   1079 
   1080   return *this;
   1081 }
   1082 
   1083 
   1084 void
   1085 UnicodeString::setToBogus()
   1086 {
   1087   releaseArray();
   1088 
   1089   fUnion.fFields.fLengthAndFlags = kIsBogus;
   1090   fUnion.fFields.fArray = 0;
   1091   fUnion.fFields.fCapacity = 0;
   1092 }
   1093 
   1094 // turn a bogus string into an empty one
   1095 void
   1096 UnicodeString::unBogus() {
   1097   if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
   1098     setToEmpty();
   1099   }
   1100 }
   1101 
   1102 const UChar *
   1103 UnicodeString::getTerminatedBuffer() {
   1104   if(!isWritable()) {
   1105     return 0;
   1106   }
   1107   UChar *array = getArrayStart();
   1108   int32_t len = length();
   1109   if(len < getCapacity()) {
   1110     if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
   1111       // If len<capacity on a read-only alias, then array[len] is
   1112       // either the original NUL (if constructed with (TRUE, s, length))
   1113       // or one of the original string contents characters (if later truncated),
   1114       // therefore we can assume that array[len] is initialized memory.
   1115       if(array[len] == 0) {
   1116         return array;
   1117       }
   1118     } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
   1119       // kRefCounted: Do not write the NUL if the buffer is shared.
   1120       // That is mostly safe, except when the length of one copy was modified
   1121       // without copy-on-write, e.g., via truncate(newLength) or remove(void).
   1122       // Then the NUL would be written into the middle of another copy's string.
   1123 
   1124       // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
   1125       // Do not test if there is a NUL already because it might be uninitialized memory.
   1126       // (That would be safe, but tools like valgrind & Purify would complain.)
   1127       array[len] = 0;
   1128       return array;
   1129     }
   1130   }
   1131   if(cloneArrayIfNeeded(len+1)) {
   1132     array = getArrayStart();
   1133     array[len] = 0;
   1134     return array;
   1135   } else {
   1136     return NULL;
   1137   }
   1138 }
   1139 
   1140 // setTo() analogous to the readonly-aliasing constructor with the same signature
   1141 UnicodeString &
   1142 UnicodeString::setTo(UBool isTerminated,
   1143                      const UChar *text,
   1144                      int32_t textLength)
   1145 {
   1146   if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
   1147     // do not modify a string that has an "open" getBuffer(minCapacity)
   1148     return *this;
   1149   }
   1150 
   1151   if(text == NULL) {
   1152     // treat as an empty string, do not alias
   1153     releaseArray();
   1154     setToEmpty();
   1155     return *this;
   1156   }
   1157 
   1158   if( textLength < -1 ||
   1159       (textLength == -1 && !isTerminated) ||
   1160       (textLength >= 0 && isTerminated && text[textLength] != 0)
   1161   ) {
   1162     setToBogus();
   1163     return *this;
   1164   }
   1165 
   1166   releaseArray();
   1167 
   1168   if(textLength == -1) {
   1169     // text is terminated, or else it would have failed the above test
   1170     textLength = u_strlen(text);
   1171   }
   1172   fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
   1173   setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
   1174   return *this;
   1175 }
   1176 
   1177 // setTo() analogous to the writable-aliasing constructor with the same signature
   1178 UnicodeString &
   1179 UnicodeString::setTo(UChar *buffer,
   1180                      int32_t buffLength,
   1181                      int32_t buffCapacity) {
   1182   if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
   1183     // do not modify a string that has an "open" getBuffer(minCapacity)
   1184     return *this;
   1185   }
   1186 
   1187   if(buffer == NULL) {
   1188     // treat as an empty string, do not alias
   1189     releaseArray();
   1190     setToEmpty();
   1191     return *this;
   1192   }
   1193 
   1194   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
   1195     setToBogus();
   1196     return *this;
   1197   } else if(buffLength == -1) {
   1198     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
   1199     const UChar *p = buffer, *limit = buffer + buffCapacity;
   1200     while(p != limit && *p != 0) {
   1201       ++p;
   1202     }
   1203     buffLength = (int32_t)(p - buffer);
   1204   }
   1205 
   1206   releaseArray();
   1207 
   1208   fUnion.fFields.fLengthAndFlags = kWritableAlias;
   1209   setArray(buffer, buffLength, buffCapacity);
   1210   return *this;
   1211 }
   1212 
   1213 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) {
   1214   unBogus();
   1215   int32_t length = utf8.length();
   1216   int32_t capacity;
   1217   // The UTF-16 string will be at most as long as the UTF-8 string.
   1218   if(length <= US_STACKBUF_SIZE) {
   1219     capacity = US_STACKBUF_SIZE;
   1220   } else {
   1221     capacity = length + 1;  // +1 for the terminating NUL.
   1222   }
   1223   UChar *utf16 = getBuffer(capacity);
   1224   int32_t length16;
   1225   UErrorCode errorCode = U_ZERO_ERROR;
   1226   u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
   1227       utf8.data(), length,
   1228       0xfffd,  // Substitution character.
   1229       NULL,    // Don't care about number of substitutions.
   1230       &errorCode);
   1231   releaseBuffer(length16);
   1232   if(U_FAILURE(errorCode)) {
   1233     setToBogus();
   1234   }
   1235   return *this;
   1236 }
   1237 
   1238 UnicodeString&
   1239 UnicodeString::setCharAt(int32_t offset,
   1240              UChar c)
   1241 {
   1242   int32_t len = length();
   1243   if(cloneArrayIfNeeded() && len > 0) {
   1244     if(offset < 0) {
   1245       offset = 0;
   1246     } else if(offset >= len) {
   1247       offset = len - 1;
   1248     }
   1249 
   1250     getArrayStart()[offset] = c;
   1251   }
   1252   return *this;
   1253 }
   1254 
   1255 UnicodeString&
   1256 UnicodeString::replace(int32_t start,
   1257                int32_t _length,
   1258                UChar32 srcChar) {
   1259   UChar buffer[U16_MAX_LENGTH];
   1260   int32_t count = 0;
   1261   UBool isError = FALSE;
   1262   U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
   1263   // We test isError so that the compiler does not complain that we don't.
   1264   // If isError (srcChar is not a valid code point) then count==0 which means
   1265   // we remove the source segment rather than replacing it with srcChar.
   1266   return doReplace(start, _length, buffer, 0, isError ? 0 : count);
   1267 }
   1268 
   1269 UnicodeString&
   1270 UnicodeString::append(UChar32 srcChar) {
   1271   UChar buffer[U16_MAX_LENGTH];
   1272   int32_t _length = 0;
   1273   UBool isError = FALSE;
   1274   U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
   1275   // We test isError so that the compiler does not complain that we don't.
   1276   // If isError then _length==0 which turns the doReplace() into a no-op anyway.
   1277   return isError ? *this : doReplace(length(), 0, buffer, 0, _length);
   1278 }
   1279 
   1280 UnicodeString&
   1281 UnicodeString::doReplace( int32_t start,
   1282               int32_t length,
   1283               const UnicodeString& src,
   1284               int32_t srcStart,
   1285               int32_t srcLength)
   1286 {
   1287   if(!src.isBogus()) {
   1288     // pin the indices to legal values
   1289     src.pinIndices(srcStart, srcLength);
   1290 
   1291     // get the characters from src
   1292     // and replace the range in ourselves with them
   1293     return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
   1294   } else {
   1295     // remove the range
   1296     return doReplace(start, length, 0, 0, 0);
   1297   }
   1298 }
   1299 
   1300 UnicodeString&
   1301 UnicodeString::doReplace(int32_t start,
   1302              int32_t length,
   1303              const UChar *srcChars,
   1304              int32_t srcStart,
   1305              int32_t srcLength)
   1306 {
   1307   if(!isWritable()) {
   1308     return *this;
   1309   }
   1310 
   1311   int32_t oldLength = this->length();
   1312 
   1313   // optimize (read-only alias).remove(0, start) and .remove(start, end)
   1314   if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
   1315     if(start == 0) {
   1316       // remove prefix by adjusting the array pointer
   1317       pinIndex(length);
   1318       fUnion.fFields.fArray += length;
   1319       fUnion.fFields.fCapacity -= length;
   1320       setLength(oldLength - length);
   1321       return *this;
   1322     } else {
   1323       pinIndex(start);
   1324       if(length >= (oldLength - start)) {
   1325         // remove suffix by reducing the length (like truncate())
   1326         setLength(start);
   1327         fUnion.fFields.fCapacity = start;  // not NUL-terminated any more
   1328         return *this;
   1329       }
   1330     }
   1331   }
   1332 
   1333   if(srcChars == 0) {
   1334     srcStart = srcLength = 0;
   1335   } else if(srcLength < 0) {
   1336     // get the srcLength if necessary
   1337     srcLength = u_strlen(srcChars + srcStart);
   1338   }
   1339 
   1340   // calculate the size of the string after the replace
   1341   int32_t newLength;
   1342 
   1343   // optimize append() onto a large-enough, owned string
   1344   if(start >= oldLength) {
   1345     if(srcLength == 0) {
   1346       return *this;
   1347     }
   1348     newLength = oldLength + srcLength;
   1349     if(newLength <= getCapacity() && isBufferWritable()) {
   1350       UChar *oldArray = getArrayStart();
   1351       // Do not copy characters when
   1352       //   UChar *buffer=str.getAppendBuffer(...);
   1353       // is followed by
   1354       //   str.append(buffer, length);
   1355       // or
   1356       //   str.appendString(buffer, length)
   1357       // or similar.
   1358       if(srcChars + srcStart != oldArray + start || start > oldLength) {
   1359         us_arrayCopy(srcChars, srcStart, oldArray, oldLength, srcLength);
   1360       }
   1361       setLength(newLength);
   1362       return *this;
   1363     } else {
   1364       // pin the indices to legal values
   1365       start = oldLength;
   1366       length = 0;
   1367     }
   1368   } else {
   1369     // pin the indices to legal values
   1370     pinIndices(start, length);
   1371 
   1372     newLength = oldLength - length + srcLength;
   1373   }
   1374 
   1375   // the following may change fArray but will not copy the current contents;
   1376   // therefore we need to keep the current fArray
   1377   UChar oldStackBuffer[US_STACKBUF_SIZE];
   1378   UChar *oldArray;
   1379   if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
   1380     // copy the stack buffer contents because it will be overwritten with
   1381     // fUnion.fFields values
   1382     u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength);
   1383     oldArray = oldStackBuffer;
   1384   } else {
   1385     oldArray = getArrayStart();
   1386   }
   1387 
   1388   // clone our array and allocate a bigger array if needed
   1389   int32_t *bufferToDelete = 0;
   1390   if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize,
   1391                          FALSE, &bufferToDelete)
   1392   ) {
   1393     return *this;
   1394   }
   1395 
   1396   // now do the replace
   1397 
   1398   UChar *newArray = getArrayStart();
   1399   if(newArray != oldArray) {
   1400     // if fArray changed, then we need to copy everything except what will change
   1401     us_arrayCopy(oldArray, 0, newArray, 0, start);
   1402     us_arrayCopy(oldArray, start + length,
   1403                  newArray, start + srcLength,
   1404                  oldLength - (start + length));
   1405   } else if(length != srcLength) {
   1406     // fArray did not change; copy only the portion that isn't changing, leaving a hole
   1407     us_arrayCopy(oldArray, start + length,
   1408                  newArray, start + srcLength,
   1409                  oldLength - (start + length));
   1410   }
   1411 
   1412   // now fill in the hole with the new string
   1413   us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
   1414 
   1415   setLength(newLength);
   1416 
   1417   // delayed delete in case srcChars == fArray when we started, and
   1418   // to keep oldArray alive for the above operations
   1419   if (bufferToDelete) {
   1420     uprv_free(bufferToDelete);
   1421   }
   1422 
   1423   return *this;
   1424 }
   1425 
   1426 /**
   1427  * Replaceable API
   1428  */
   1429 void
   1430 UnicodeString::handleReplaceBetween(int32_t start,
   1431                                     int32_t limit,
   1432                                     const UnicodeString& text) {
   1433     replaceBetween(start, limit, text);
   1434 }
   1435 
   1436 /**
   1437  * Replaceable API
   1438  */
   1439 void
   1440 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
   1441     if (limit <= start) {
   1442         return; // Nothing to do; avoid bogus malloc call
   1443     }
   1444     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
   1445     // Check to make sure text is not null.
   1446     if (text != NULL) {
   1447 	    extractBetween(start, limit, text, 0);
   1448 	    insert(dest, text, 0, limit - start);
   1449 	    uprv_free(text);
   1450     }
   1451 }
   1452 
   1453 /**
   1454  * Replaceable API
   1455  *
   1456  * NOTE: This is for the Replaceable class.  There is no rep.cpp,
   1457  * so we implement this function here.
   1458  */
   1459 UBool Replaceable::hasMetaData() const {
   1460     return TRUE;
   1461 }
   1462 
   1463 /**
   1464  * Replaceable API
   1465  */
   1466 UBool UnicodeString::hasMetaData() const {
   1467     return FALSE;
   1468 }
   1469 
   1470 UnicodeString&
   1471 UnicodeString::doReverse(int32_t start, int32_t length) {
   1472   if(length <= 1 || !cloneArrayIfNeeded()) {
   1473     return *this;
   1474   }
   1475 
   1476   // pin the indices to legal values
   1477   pinIndices(start, length);
   1478   if(length <= 1) {  // pinIndices() might have shrunk the length
   1479     return *this;
   1480   }
   1481 
   1482   UChar *left = getArrayStart() + start;
   1483   UChar *right = left + length - 1;  // -1 for inclusive boundary (length>=2)
   1484   UChar swap;
   1485   UBool hasSupplementary = FALSE;
   1486 
   1487   // Before the loop we know left<right because length>=2.
   1488   do {
   1489     hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
   1490     hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
   1491     *right-- = swap;
   1492   } while(left < right);
   1493   // Make sure to test the middle code unit of an odd-length string.
   1494   // Redundant if the length is even.
   1495   hasSupplementary |= (UBool)U16_IS_LEAD(*left);
   1496 
   1497   /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
   1498   if(hasSupplementary) {
   1499     UChar swap2;
   1500 
   1501     left = getArrayStart() + start;
   1502     right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
   1503     while(left < right) {
   1504       if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
   1505         *left++ = swap2;
   1506         *left++ = swap;
   1507       } else {
   1508         ++left;
   1509       }
   1510     }
   1511   }
   1512 
   1513   return *this;
   1514 }
   1515 
   1516 UBool
   1517 UnicodeString::padLeading(int32_t targetLength,
   1518                           UChar padChar)
   1519 {
   1520   int32_t oldLength = length();
   1521   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
   1522     return FALSE;
   1523   } else {
   1524     // move contents up by padding width
   1525     UChar *array = getArrayStart();
   1526     int32_t start = targetLength - oldLength;
   1527     us_arrayCopy(array, 0, array, start, oldLength);
   1528 
   1529     // fill in padding character
   1530     while(--start >= 0) {
   1531       array[start] = padChar;
   1532     }
   1533     setLength(targetLength);
   1534     return TRUE;
   1535   }
   1536 }
   1537 
   1538 UBool
   1539 UnicodeString::padTrailing(int32_t targetLength,
   1540                            UChar padChar)
   1541 {
   1542   int32_t oldLength = length();
   1543   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
   1544     return FALSE;
   1545   } else {
   1546     // fill in padding character
   1547     UChar *array = getArrayStart();
   1548     int32_t length = targetLength;
   1549     while(--length >= oldLength) {
   1550       array[length] = padChar;
   1551     }
   1552     setLength(targetLength);
   1553     return TRUE;
   1554   }
   1555 }
   1556 
   1557 //========================================
   1558 // Hashing
   1559 //========================================
   1560 int32_t
   1561 UnicodeString::doHashCode() const
   1562 {
   1563     /* Delegate hash computation to uhash.  This makes UnicodeString
   1564      * hashing consistent with UChar* hashing.  */
   1565     int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
   1566     if (hashCode == kInvalidHashCode) {
   1567         hashCode = kEmptyHashCode;
   1568     }
   1569     return hashCode;
   1570 }
   1571 
   1572 //========================================
   1573 // External Buffer
   1574 //========================================
   1575 
   1576 UChar *
   1577 UnicodeString::getBuffer(int32_t minCapacity) {
   1578   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
   1579     fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
   1580     setZeroLength();
   1581     return getArrayStart();
   1582   } else {
   1583     return 0;
   1584   }
   1585 }
   1586 
   1587 void
   1588 UnicodeString::releaseBuffer(int32_t newLength) {
   1589   if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
   1590     // set the new fLength
   1591     int32_t capacity=getCapacity();
   1592     if(newLength==-1) {
   1593       // the new length is the string length, capped by fCapacity
   1594       const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
   1595       while(p<limit && *p!=0) {
   1596         ++p;
   1597       }
   1598       newLength=(int32_t)(p-array);
   1599     } else if(newLength>capacity) {
   1600       newLength=capacity;
   1601     }
   1602     setLength(newLength);
   1603     fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
   1604   }
   1605 }
   1606 
   1607 //========================================
   1608 // Miscellaneous
   1609 //========================================
   1610 UBool
   1611 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
   1612                                   int32_t growCapacity,
   1613                                   UBool doCopyArray,
   1614                                   int32_t **pBufferToDelete,
   1615                                   UBool forceClone) {
   1616   // default parameters need to be static, therefore
   1617   // the defaults are -1 to have convenience defaults
   1618   if(newCapacity == -1) {
   1619     newCapacity = getCapacity();
   1620   }
   1621 
   1622   // while a getBuffer(minCapacity) is "open",
   1623   // prevent any modifications of the string by returning FALSE here
   1624   // if the string is bogus, then only an assignment or similar can revive it
   1625   if(!isWritable()) {
   1626     return FALSE;
   1627   }
   1628 
   1629   /*
   1630    * We need to make a copy of the array if
   1631    * the buffer is read-only, or
   1632    * the buffer is refCounted (shared), and refCount>1, or
   1633    * the buffer is too small.
   1634    * Return FALSE if memory could not be allocated.
   1635    */
   1636   if(forceClone ||
   1637      fUnion.fFields.fLengthAndFlags & kBufferIsReadonly ||
   1638      (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) ||
   1639      newCapacity > getCapacity()
   1640   ) {
   1641     // check growCapacity for default value and use of the stack buffer
   1642     if(growCapacity < 0) {
   1643       growCapacity = newCapacity;
   1644     } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
   1645       growCapacity = US_STACKBUF_SIZE;
   1646     }
   1647 
   1648     // save old values
   1649     UChar oldStackBuffer[US_STACKBUF_SIZE];
   1650     UChar *oldArray;
   1651     int32_t oldLength = length();
   1652     int16_t flags = fUnion.fFields.fLengthAndFlags;
   1653 
   1654     if(flags&kUsingStackBuffer) {
   1655       U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */
   1656       if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
   1657         // copy the stack buffer contents because it will be overwritten with
   1658         // fUnion.fFields values
   1659         us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength);
   1660         oldArray = oldStackBuffer;
   1661       } else {
   1662         oldArray = NULL; // no need to copy from the stack buffer to itself
   1663       }
   1664     } else {
   1665       oldArray = fUnion.fFields.fArray;
   1666       U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
   1667     }
   1668 
   1669     // allocate a new array
   1670     if(allocate(growCapacity) ||
   1671        (newCapacity < growCapacity && allocate(newCapacity))
   1672     ) {
   1673       if(doCopyArray) {
   1674         // copy the contents
   1675         // do not copy more than what fits - it may be smaller than before
   1676         int32_t minLength = oldLength;
   1677         newCapacity = getCapacity();
   1678         if(newCapacity < minLength) {
   1679           minLength = newCapacity;
   1680         }
   1681         if(oldArray != NULL) {
   1682           us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
   1683         }
   1684         setLength(minLength);
   1685       } else {
   1686         setZeroLength();
   1687       }
   1688 
   1689       // release the old array
   1690       if(flags & kRefCounted) {
   1691         // the array is refCounted; decrement and release if 0
   1692         u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
   1693         if(umtx_atomic_dec(pRefCount) == 0) {
   1694           if(pBufferToDelete == 0) {
   1695               // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
   1696               // is defined as volatile. (Volatile has useful non-standard behavior
   1697               //   with this compiler.)
   1698             uprv_free((void *)pRefCount);
   1699           } else {
   1700             // the caller requested to delete it himself
   1701             *pBufferToDelete = (int32_t *)pRefCount;
   1702           }
   1703         }
   1704       }
   1705     } else {
   1706       // not enough memory for growCapacity and not even for the smaller newCapacity
   1707       // reset the old values for setToBogus() to release the array
   1708       if(!(flags&kUsingStackBuffer)) {
   1709         fUnion.fFields.fArray = oldArray;
   1710       }
   1711       fUnion.fFields.fLengthAndFlags = flags;
   1712       setToBogus();
   1713       return FALSE;
   1714     }
   1715   }
   1716   return TRUE;
   1717 }
   1718 
   1719 // UnicodeStringAppendable ------------------------------------------------- ***
   1720 
   1721 UnicodeStringAppendable::~UnicodeStringAppendable() {}
   1722 
   1723 UBool
   1724 UnicodeStringAppendable::appendCodeUnit(UChar c) {
   1725   return str.doReplace(str.length(), 0, &c, 0, 1).isWritable();
   1726 }
   1727 
   1728 UBool
   1729 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
   1730   UChar buffer[U16_MAX_LENGTH];
   1731   int32_t cLength = 0;
   1732   UBool isError = FALSE;
   1733   U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
   1734   return !isError && str.doReplace(str.length(), 0, buffer, 0, cLength).isWritable();
   1735 }
   1736 
   1737 UBool
   1738 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
   1739   return str.doReplace(str.length(), 0, s, 0, length).isWritable();
   1740 }
   1741 
   1742 UBool
   1743 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
   1744   return str.cloneArrayIfNeeded(str.length() + appendCapacity);
   1745 }
   1746 
   1747 UChar *
   1748 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
   1749                                          int32_t desiredCapacityHint,
   1750                                          UChar *scratch, int32_t scratchCapacity,
   1751                                          int32_t *resultCapacity) {
   1752   if(minCapacity < 1 || scratchCapacity < minCapacity) {
   1753     *resultCapacity = 0;
   1754     return NULL;
   1755   }
   1756   int32_t oldLength = str.length();
   1757   if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
   1758     *resultCapacity = str.getCapacity() - oldLength;
   1759     return str.getArrayStart() + oldLength;
   1760   }
   1761   *resultCapacity = scratchCapacity;
   1762   return scratch;
   1763 }
   1764 
   1765 U_NAMESPACE_END
   1766 
   1767 U_NAMESPACE_USE
   1768 
   1769 U_CAPI int32_t U_EXPORT2
   1770 uhash_hashUnicodeString(const UElement key) {
   1771     const UnicodeString *str = (const UnicodeString*) key.pointer;
   1772     return (str == NULL) ? 0 : str->hashCode();
   1773 }
   1774 
   1775 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
   1776 // does not depend on hashtable code.
   1777 U_CAPI UBool U_EXPORT2
   1778 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
   1779     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
   1780     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
   1781     if (str1 == str2) {
   1782         return TRUE;
   1783     }
   1784     if (str1 == NULL || str2 == NULL) {
   1785         return FALSE;
   1786     }
   1787     return *str1 == *str2;
   1788 }
   1789 
   1790 #ifdef U_STATIC_IMPLEMENTATION
   1791 /*
   1792 This should never be called. It is defined here to make sure that the
   1793 virtual vector deleting destructor is defined within unistr.cpp.
   1794 The vector deleting destructor is already a part of UObject,
   1795 but defining it here makes sure that it is included with this object file.
   1796 This makes sure that static library dependencies are kept to a minimum.
   1797 */
   1798 static void uprv_UnicodeStringDummy(void) {
   1799     delete [] (new UnicodeString[2]);
   1800 }
   1801 #endif
   1802