Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 2004-2010, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  regex.cpp
      7 */
      8 
      9 #include "unicode/utypes.h"
     10 
     11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     12 
     13 #include "unicode/regex.h"
     14 #include "unicode/uregex.h"
     15 #include "unicode/unistr.h"
     16 #include "unicode/ustring.h"
     17 #include "unicode/uchar.h"
     18 #include "unicode/uobject.h"
     19 #include "umutex.h"
     20 #include "uassert.h"
     21 #include "cmemory.h"
     22 
     23 #include "regextxt.h"
     24 
     25 #include <stdio.h>
     26 
     27 U_NAMESPACE_BEGIN
     28 
     29 #define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)
     30 
     31 struct RegularExpression: public UMemory {
     32 public:
     33     RegularExpression();
     34     ~RegularExpression();
     35     int32_t           fMagic;
     36     RegexPattern     *fPat;
     37     int32_t          *fPatRefCount;
     38     UChar            *fPatString;
     39     int32_t           fPatStringLen;
     40     RegexMatcher     *fMatcher;
     41     const UChar      *fText;         // Text from setText()
     42     int32_t           fTextLength;   // Length provided by user with setText(), which
     43                                      //  may be -1.
     44     UBool             fOwnsText;
     45 };
     46 
     47 static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII
     48 
     49 RegularExpression::RegularExpression() {
     50     fMagic        = REXP_MAGIC;
     51     fPat          = NULL;
     52     fPatRefCount  = NULL;
     53     fPatString    = NULL;
     54     fPatStringLen = 0;
     55     fMatcher      = NULL;
     56     fText         = NULL;
     57     fTextLength   = 0;
     58     fOwnsText     = FALSE;
     59 }
     60 
     61 RegularExpression::~RegularExpression() {
     62     delete fMatcher;
     63     fMatcher = NULL;
     64     if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
     65         delete fPat;
     66         uprv_free(fPatString);
     67         uprv_free(fPatRefCount);
     68     }
     69     if (fOwnsText && fText!=NULL) {
     70         uprv_free((void *)fText);
     71     }
     72     fMagic = 0;
     73 }
     74 
     75 U_NAMESPACE_END
     76 
     77 U_NAMESPACE_USE
     78 
     79 //----------------------------------------------------------------------------------------
     80 //
     81 //   validateRE    Do boilerplate style checks on API function parameters.
     82 //                 Return TRUE if they look OK.
     83 //----------------------------------------------------------------------------------------
     84 static UBool validateRE(const RegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) {
     85     if (U_FAILURE(*status)) {
     86         return FALSE;
     87     }
     88     if (re == NULL || re->fMagic != REXP_MAGIC) {
     89         *status = U_ILLEGAL_ARGUMENT_ERROR;
     90         return FALSE;
     91     }
     92     // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
     93     if (requiresText && re->fText == NULL && !re->fOwnsText) {
     94         *status = U_REGEX_INVALID_STATE;
     95         return FALSE;
     96     }
     97     return TRUE;
     98 }
     99 
    100 //----------------------------------------------------------------------------------------
    101 //
    102 //    uregex_open
    103 //
    104 //----------------------------------------------------------------------------------------
    105 U_CAPI URegularExpression *  U_EXPORT2
    106 uregex_open( const  UChar          *pattern,
    107                     int32_t         patternLength,
    108                     uint32_t        flags,
    109                     UParseError    *pe,
    110                     UErrorCode     *status) {
    111 
    112     if (U_FAILURE(*status)) {
    113         return NULL;
    114     }
    115     if (pattern == NULL || patternLength < -1 || patternLength == 0) {
    116         *status = U_ILLEGAL_ARGUMENT_ERROR;
    117         return NULL;
    118     }
    119     int32_t actualPatLen = patternLength;
    120     if (actualPatLen == -1) {
    121         actualPatLen = u_strlen(pattern);
    122     }
    123 
    124     RegularExpression *re     = new RegularExpression;
    125     int32_t            *refC   = (int32_t *)uprv_malloc(sizeof(int32_t));
    126     UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
    127     if (re == NULL || refC == NULL || patBuf == NULL) {
    128         *status = U_MEMORY_ALLOCATION_ERROR;
    129         delete re;
    130         uprv_free(refC);
    131         uprv_free(patBuf);
    132         return NULL;
    133     }
    134     re->fPatRefCount = refC;
    135     *re->fPatRefCount = 1;
    136 
    137     //
    138     // Make a copy of the pattern string, so we can return it later if asked.
    139     //    For compiling the pattern, we will use a UText wrapper around
    140     //    this local copy, to avoid making even more copies.
    141     //
    142     re->fPatString    = patBuf;
    143     re->fPatStringLen = patternLength;
    144     u_memcpy(patBuf, pattern, actualPatLen);
    145     patBuf[actualPatLen] = 0;
    146 
    147     UText patText = UTEXT_INITIALIZER;
    148     utext_openUChars(&patText, patBuf, patternLength, status);
    149 
    150     //
    151     // Compile the pattern
    152     //
    153     if (pe != NULL) {
    154         re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
    155     } else {
    156         re->fPat = RegexPattern::compile(&patText, flags, *status);
    157     }
    158     utext_close(&patText);
    159 
    160     if (U_FAILURE(*status)) {
    161         goto ErrorExit;
    162     }
    163 
    164     //
    165     // Create the matcher object
    166     //
    167     re->fMatcher = re->fPat->matcher(*status);
    168     if (U_SUCCESS(*status)) {
    169         return (URegularExpression*)re;
    170     }
    171 
    172 ErrorExit:
    173     delete re;
    174     return NULL;
    175 
    176 }
    177 
    178 //----------------------------------------------------------------------------------------
    179 //
    180 //    uregex_openUText
    181 //
    182 //----------------------------------------------------------------------------------------
    183 U_CAPI URegularExpression *  U_EXPORT2
    184 uregex_openUText(UText          *pattern,
    185                  uint32_t        flags,
    186                  UParseError    *pe,
    187                  UErrorCode     *status) {
    188 
    189     if (U_FAILURE(*status)) {
    190         return NULL;
    191     }
    192     if (pattern == NULL) {
    193         *status = U_ILLEGAL_ARGUMENT_ERROR;
    194         return NULL;
    195     }
    196 
    197     int64_t patternNativeLength = utext_nativeLength(pattern);
    198 
    199     if (patternNativeLength == 0) {
    200         *status = U_ILLEGAL_ARGUMENT_ERROR;
    201         return NULL;
    202     }
    203 
    204     RegularExpression *re     = new RegularExpression;
    205 
    206     UErrorCode lengthStatus = U_ZERO_ERROR;
    207     int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus);
    208 
    209     int32_t            *refC   = (int32_t *)uprv_malloc(sizeof(int32_t));
    210     UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1));
    211     if (re == NULL || refC == NULL || patBuf == NULL) {
    212         *status = U_MEMORY_ALLOCATION_ERROR;
    213         delete re;
    214         uprv_free(refC);
    215         uprv_free(patBuf);
    216         return NULL;
    217     }
    218     re->fPatRefCount = refC;
    219     *re->fPatRefCount = 1;
    220 
    221     //
    222     // Make a copy of the pattern string, so we can return it later if asked.
    223     //    For compiling the pattern, we will use a read-only UText wrapper
    224     //    around this local copy, to avoid making even more copies.
    225     //
    226     re->fPatString    = patBuf;
    227     re->fPatStringLen = pattern16Length;
    228     utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status);
    229 
    230     UText patText = UTEXT_INITIALIZER;
    231     utext_openUChars(&patText, patBuf, pattern16Length, status);
    232 
    233     //
    234     // Compile the pattern
    235     //
    236     if (pe != NULL) {
    237         re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
    238     } else {
    239         re->fPat = RegexPattern::compile(&patText, flags, *status);
    240     }
    241     utext_close(&patText);
    242 
    243     if (U_FAILURE(*status)) {
    244         goto ErrorExit;
    245     }
    246 
    247     //
    248     // Create the matcher object
    249     //
    250     re->fMatcher = re->fPat->matcher(*status);
    251     if (U_SUCCESS(*status)) {
    252         return (URegularExpression*)re;
    253     }
    254 
    255 ErrorExit:
    256     delete re;
    257     return NULL;
    258 
    259 }
    260 
    261 //----------------------------------------------------------------------------------------
    262 //
    263 //    uregex_close
    264 //
    265 //----------------------------------------------------------------------------------------
    266 U_CAPI void  U_EXPORT2
    267 uregex_close(URegularExpression  *re2) {
    268     RegularExpression *re = (RegularExpression*)re2;
    269     UErrorCode  status = U_ZERO_ERROR;
    270     if (validateRE(re, &status, FALSE) == FALSE) {
    271         return;
    272     }
    273     delete re;
    274 }
    275 
    276 
    277 //----------------------------------------------------------------------------------------
    278 //
    279 //    uregex_clone
    280 //
    281 //----------------------------------------------------------------------------------------
    282 U_CAPI URegularExpression * U_EXPORT2
    283 uregex_clone(const URegularExpression *source2, UErrorCode *status)  {
    284     RegularExpression *source = (RegularExpression*)source2;
    285     if (validateRE(source, status, FALSE) == FALSE) {
    286         return NULL;
    287     }
    288 
    289     RegularExpression *clone = new RegularExpression;
    290     if (clone == NULL) {
    291         *status = U_MEMORY_ALLOCATION_ERROR;
    292         return NULL;
    293     }
    294 
    295     clone->fMatcher = source->fPat->matcher(*status);
    296     if (U_FAILURE(*status)) {
    297         delete clone;
    298         return NULL;
    299     }
    300 
    301     clone->fPat          = source->fPat;
    302     clone->fPatRefCount  = source->fPatRefCount;
    303     clone->fPatString    = source->fPatString;
    304     clone->fPatStringLen = source->fPatStringLen;
    305     umtx_atomic_inc(source->fPatRefCount);
    306     // Note:  fText is not cloned.
    307 
    308     return (URegularExpression*)clone;
    309 }
    310 
    311 
    312 
    313 
    314 //------------------------------------------------------------------------------
    315 //
    316 //    uregex_pattern
    317 //
    318 //------------------------------------------------------------------------------
    319 U_CAPI const UChar * U_EXPORT2
    320 uregex_pattern(const  URegularExpression *regexp2,
    321                       int32_t            *patLength,
    322                       UErrorCode         *status)  {
    323     RegularExpression *regexp = (RegularExpression*)regexp2;
    324 
    325     if (validateRE(regexp, status, FALSE) == FALSE) {
    326         return NULL;
    327     }
    328     if (patLength != NULL) {
    329         *patLength = regexp->fPatStringLen;
    330     }
    331     return regexp->fPatString;
    332 }
    333 
    334 
    335 //------------------------------------------------------------------------------
    336 //
    337 //    uregex_patternUText
    338 //
    339 //------------------------------------------------------------------------------
    340 U_CAPI UText * U_EXPORT2
    341 uregex_patternUText(const URegularExpression *regexp2,
    342                           UErrorCode         *status)  {
    343     RegularExpression *regexp = (RegularExpression*)regexp2;
    344     (void)status;
    345     return regexp->fPat->patternText();
    346 }
    347 
    348 
    349 //------------------------------------------------------------------------------
    350 //
    351 //    uregex_flags
    352 //
    353 //------------------------------------------------------------------------------
    354 U_CAPI int32_t U_EXPORT2
    355 uregex_flags(const URegularExpression *regexp2, UErrorCode *status)  {
    356     RegularExpression *regexp = (RegularExpression*)regexp2;
    357     if (validateRE(regexp, status, FALSE) == FALSE) {
    358         return 0;
    359     }
    360     int32_t flags = regexp->fPat->flags();
    361     return flags;
    362 }
    363 
    364 
    365 //------------------------------------------------------------------------------
    366 //
    367 //    uregex_setText
    368 //
    369 //------------------------------------------------------------------------------
    370 U_CAPI void U_EXPORT2
    371 uregex_setText(URegularExpression *regexp2,
    372                const UChar        *text,
    373                int32_t             textLength,
    374                UErrorCode         *status)  {
    375     RegularExpression *regexp = (RegularExpression*)regexp2;
    376     if (validateRE(regexp, status, FALSE) == FALSE) {
    377         return;
    378     }
    379     if (text == NULL || textLength < -1) {
    380         *status = U_ILLEGAL_ARGUMENT_ERROR;
    381         return;
    382     }
    383 
    384     if (regexp->fOwnsText && regexp->fText != NULL) {
    385         uprv_free((void *)regexp->fText);
    386     }
    387 
    388     regexp->fText       = text;
    389     regexp->fTextLength = textLength;
    390     regexp->fOwnsText   = FALSE;
    391 
    392     UText input = UTEXT_INITIALIZER;
    393     utext_openUChars(&input, text, textLength, status);
    394     regexp->fMatcher->reset(&input);
    395     utext_close(&input); // reset() made a shallow clone, so we don't need this copy
    396 }
    397 
    398 
    399 //------------------------------------------------------------------------------
    400 //
    401 //    uregex_setUText
    402 //
    403 //------------------------------------------------------------------------------
    404 U_CAPI void U_EXPORT2
    405 uregex_setUText(URegularExpression *regexp2,
    406                 UText              *text,
    407                 UErrorCode         *status) {
    408     RegularExpression *regexp = (RegularExpression*)regexp2;
    409     if (validateRE(regexp, status, FALSE) == FALSE) {
    410         return;
    411     }
    412     if (text == NULL) {
    413         *status = U_ILLEGAL_ARGUMENT_ERROR;
    414         return;
    415     }
    416 
    417     if (regexp->fOwnsText && regexp->fText != NULL) {
    418         uprv_free((void *)regexp->fText);
    419     }
    420 
    421     regexp->fText       = NULL; // only fill it in on request
    422     regexp->fTextLength = -1;
    423     regexp->fOwnsText   = TRUE;
    424     regexp->fMatcher->reset(text);
    425 }
    426 
    427 
    428 
    429 //------------------------------------------------------------------------------
    430 //
    431 //    uregex_getText
    432 //
    433 //------------------------------------------------------------------------------
    434 U_CAPI const UChar * U_EXPORT2
    435 uregex_getText(URegularExpression *regexp2,
    436                int32_t            *textLength,
    437                UErrorCode         *status)  {
    438     RegularExpression *regexp = (RegularExpression*)regexp2;
    439     if (validateRE(regexp, status, FALSE) == FALSE) {
    440         return NULL;
    441     }
    442 
    443     if (regexp->fText == NULL) {
    444         // need to fill in the text
    445         UText *inputText = regexp->fMatcher->inputText();
    446         int64_t inputNativeLength = utext_nativeLength(inputText);
    447         if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) {
    448             regexp->fText = inputText->chunkContents;
    449             regexp->fTextLength = (int32_t)inputNativeLength;
    450             regexp->fOwnsText = FALSE; // because the UText owns it
    451         } else {
    452             UErrorCode lengthStatus = U_ZERO_ERROR;
    453             regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error
    454             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1));
    455 
    456             utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status);
    457             regexp->fText = inputChars;
    458             regexp->fOwnsText = TRUE; // should already be set but just in case
    459         }
    460     }
    461 
    462     if (textLength != NULL) {
    463         *textLength = regexp->fTextLength;
    464     }
    465     return regexp->fText;
    466 }
    467 
    468 
    469 //------------------------------------------------------------------------------
    470 //
    471 //    uregex_getUText
    472 //
    473 //------------------------------------------------------------------------------
    474 U_CAPI UText * U_EXPORT2
    475 uregex_getUText(URegularExpression *regexp2,
    476                 UText              *dest,
    477                 UErrorCode         *status)  {
    478     RegularExpression *regexp = (RegularExpression*)regexp2;
    479     if (validateRE(regexp, status, FALSE) == FALSE) {
    480         return dest;
    481     }
    482     return regexp->fMatcher->getInput(dest);
    483 }
    484 
    485 // BEGIN android-added
    486 // Removed this function after Android upgrade to ICU4.6.
    487 //------------------------------------------------------------------------------
    488 //
    489 //    uregex_refreshUText
    490 //
    491 //------------------------------------------------------------------------------
    492 U_CAPI void U_EXPORT2
    493 uregex_refreshUText(URegularExpression *regexp2,
    494                     UText              *text,
    495                     UErrorCode         *status) {
    496     RegularExpression *regexp = (RegularExpression*)regexp2;
    497     if (validateRE(regexp, status, FALSE) == FALSE) {
    498         return;
    499     }
    500     regexp->fMatcher->refreshInputText(text, *status);
    501 }
    502 // END android-added
    503 
    504 
    505 //------------------------------------------------------------------------------
    506 //
    507 //    uregex_matches
    508 //
    509 //------------------------------------------------------------------------------
    510 U_CAPI UBool U_EXPORT2
    511 uregex_matches(URegularExpression *regexp2,
    512                 int32_t            startIndex,
    513                 UErrorCode        *status)  {
    514     RegularExpression *regexp = (RegularExpression*)regexp2;
    515     UBool result = FALSE;
    516     if (validateRE(regexp, status) == FALSE) {
    517         return result;
    518     }
    519     if (startIndex == -1) {
    520         result = regexp->fMatcher->matches(*status);
    521     } else {
    522         result = regexp->fMatcher->matches(startIndex, *status);
    523     }
    524     return result;
    525 }
    526 
    527 
    528 
    529 //------------------------------------------------------------------------------
    530 //
    531 //    uregex_lookingAt
    532 //
    533 //------------------------------------------------------------------------------
    534 U_CAPI UBool U_EXPORT2
    535 uregex_lookingAt(URegularExpression *regexp2,
    536                  int32_t             startIndex,
    537                  UErrorCode         *status)  {
    538     RegularExpression *regexp = (RegularExpression*)regexp2;
    539     UBool result = FALSE;
    540     if (validateRE(regexp, status) == FALSE) {
    541         return result;
    542     }
    543     if (startIndex == -1) {
    544         result = regexp->fMatcher->lookingAt(*status);
    545     } else {
    546         result = regexp->fMatcher->lookingAt(startIndex, *status);
    547     }
    548     return result;
    549 }
    550 
    551 
    552 
    553 //------------------------------------------------------------------------------
    554 //
    555 //    uregex_find
    556 //
    557 //------------------------------------------------------------------------------
    558 U_CAPI UBool U_EXPORT2
    559 uregex_find(URegularExpression *regexp2,
    560             int32_t             startIndex,
    561             UErrorCode         *status)  {
    562     RegularExpression *regexp = (RegularExpression*)regexp2;
    563     UBool result = FALSE;
    564     if (validateRE(regexp, status) == FALSE) {
    565         return result;
    566     }
    567     if (startIndex == -1) {
    568         regexp->fMatcher->resetPreserveRegion();
    569         result = regexp->fMatcher->find();
    570     } else {
    571         result = regexp->fMatcher->find(startIndex, *status);
    572     }
    573     return result;
    574 }
    575 
    576 //------------------------------------------------------------------------------
    577 //
    578 //    uregex_findNext
    579 //
    580 //------------------------------------------------------------------------------
    581 U_CAPI UBool U_EXPORT2
    582 uregex_findNext(URegularExpression *regexp2,
    583                 UErrorCode         *status)  {
    584     RegularExpression *regexp = (RegularExpression*)regexp2;
    585     if (validateRE(regexp, status) == FALSE) {
    586         return FALSE;
    587     }
    588     UBool result = regexp->fMatcher->find();
    589     return result;
    590 }
    591 
    592 //------------------------------------------------------------------------------
    593 //
    594 //    uregex_groupCount
    595 //
    596 //------------------------------------------------------------------------------
    597 U_CAPI int32_t U_EXPORT2
    598 uregex_groupCount(URegularExpression *regexp2,
    599                   UErrorCode         *status)  {
    600     RegularExpression *regexp = (RegularExpression*)regexp2;
    601     if (validateRE(regexp, status, FALSE) == FALSE) {
    602         return 0;
    603     }
    604     int32_t  result = regexp->fMatcher->groupCount();
    605     return result;
    606 }
    607 
    608 
    609 //------------------------------------------------------------------------------
    610 //
    611 //    uregex_group
    612 //
    613 //------------------------------------------------------------------------------
    614 U_CAPI int32_t U_EXPORT2
    615 uregex_group(URegularExpression *regexp2,
    616              int32_t             groupNum,
    617              UChar              *dest,
    618              int32_t             destCapacity,
    619              UErrorCode          *status)  {
    620     RegularExpression *regexp = (RegularExpression*)regexp2;
    621     if (validateRE(regexp, status) == FALSE) {
    622         return 0;
    623     }
    624     if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
    625         *status = U_ILLEGAL_ARGUMENT_ERROR;
    626         return 0;
    627     }
    628 
    629     if (destCapacity == 0 || regexp->fText != NULL) {
    630         // If preflighting or if we already have the text as UChars,
    631         // this is a little cheaper than going through uregex_groupUText()
    632 
    633         //
    634         // Pick up the range of characters from the matcher
    635         //
    636         int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
    637         int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
    638         if (U_FAILURE(*status)) {
    639             return 0;
    640         }
    641 
    642         //
    643         // Trim length based on buffer capacity
    644         //
    645         int32_t fullLength = endIx - startIx;
    646         int32_t copyLength = fullLength;
    647         if (copyLength < destCapacity) {
    648             dest[copyLength] = 0;
    649         } else if (copyLength == destCapacity) {
    650             *status = U_STRING_NOT_TERMINATED_WARNING;
    651         } else {
    652             copyLength = destCapacity;
    653             *status = U_BUFFER_OVERFLOW_ERROR;
    654         }
    655 
    656         //
    657         // Copy capture group to user's buffer
    658         //
    659         if (copyLength > 0) {
    660             u_memcpy(dest, &regexp->fText[startIx], copyLength);
    661         }
    662         return fullLength;
    663     } else {
    664         UText *groupText = uregex_groupUText(regexp2, groupNum, NULL, status);
    665         int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status);
    666         utext_close(groupText);
    667         return result;
    668     }
    669 }
    670 
    671 
    672 //------------------------------------------------------------------------------
    673 //
    674 //    uregex_groupUText
    675 //
    676 //------------------------------------------------------------------------------
    677 U_CAPI UText * U_EXPORT2
    678 uregex_groupUText(URegularExpression *regexp2,
    679                   int32_t             groupNum,
    680                   UText              *dest,
    681                   UErrorCode         *status)  {
    682     RegularExpression *regexp = (RegularExpression*)regexp2;
    683     if (validateRE(regexp, status) == FALSE) {
    684         UErrorCode emptyTextStatus = U_ZERO_ERROR;
    685         return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
    686     }
    687 
    688     if (regexp->fText != NULL) {
    689         //
    690         // Pick up the range of characters from the matcher
    691         // and use our already-extracted characters
    692         //
    693         int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
    694         int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
    695         if (U_FAILURE(*status)) {
    696             UErrorCode emptyTextStatus = U_ZERO_ERROR;
    697             return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
    698         }
    699 
    700         if (dest) {
    701             utext_replace(dest, 0, utext_nativeLength(dest), &regexp->fText[startIx], endIx - startIx, status);
    702         } else {
    703             UText groupText = UTEXT_INITIALIZER;
    704             utext_openUChars(&groupText, &regexp->fText[startIx], endIx - startIx, status);
    705             dest = utext_clone(NULL, &groupText, TRUE, FALSE, status);
    706             utext_close(&groupText);
    707         }
    708 
    709         return dest;
    710     } else {
    711         return regexp->fMatcher->group(groupNum, dest, *status);
    712     }
    713 }
    714 
    715 
    716 //------------------------------------------------------------------------------
    717 //
    718 //    uregex_start
    719 //
    720 //------------------------------------------------------------------------------
    721 U_CAPI int32_t U_EXPORT2
    722 uregex_start(URegularExpression *regexp2,
    723              int32_t             groupNum,
    724              UErrorCode          *status)  {
    725     RegularExpression *regexp = (RegularExpression*)regexp2;
    726     if (validateRE(regexp, status) == FALSE) {
    727         return 0;
    728     }
    729     int32_t result = regexp->fMatcher->start(groupNum, *status);
    730     return result;
    731 }
    732 
    733 
    734 //------------------------------------------------------------------------------
    735 //
    736 //    uregex_end
    737 //
    738 //------------------------------------------------------------------------------
    739 U_CAPI int32_t U_EXPORT2
    740 uregex_end(URegularExpression   *regexp2,
    741            int32_t               groupNum,
    742            UErrorCode           *status)  {
    743     RegularExpression *regexp = (RegularExpression*)regexp2;
    744     if (validateRE(regexp, status) == FALSE) {
    745         return 0;
    746     }
    747     int32_t result = regexp->fMatcher->end(groupNum, *status);
    748     return result;
    749 }
    750 
    751 //------------------------------------------------------------------------------
    752 //
    753 //    uregex_reset
    754 //
    755 //------------------------------------------------------------------------------
    756 U_CAPI void U_EXPORT2
    757 uregex_reset(URegularExpression    *regexp2,
    758              int32_t               index,
    759              UErrorCode            *status)  {
    760     RegularExpression *regexp = (RegularExpression*)regexp2;
    761     if (validateRE(regexp, status) == FALSE) {
    762         return;
    763     }
    764     regexp->fMatcher->reset(index, *status);
    765 }
    766 
    767 
    768 //------------------------------------------------------------------------------
    769 //
    770 //    uregex_setRegion
    771 //
    772 //------------------------------------------------------------------------------
    773 U_CAPI void U_EXPORT2
    774 uregex_setRegion(URegularExpression   *regexp2,
    775                  int32_t               regionStart,
    776                  int32_t               regionLimit,
    777                  UErrorCode           *status)  {
    778     RegularExpression *regexp = (RegularExpression*)regexp2;
    779     if (validateRE(regexp, status) == FALSE) {
    780         return;
    781     }
    782     regexp->fMatcher->region(regionStart, regionLimit, *status);
    783 }
    784 
    785 
    786 //------------------------------------------------------------------------------
    787 //
    788 //    uregex_regionStart
    789 //
    790 //------------------------------------------------------------------------------
    791 U_CAPI int32_t U_EXPORT2
    792 uregex_regionStart(const  URegularExpression   *regexp2,
    793                           UErrorCode           *status)  {
    794     RegularExpression *regexp = (RegularExpression*)regexp2;
    795     if (validateRE(regexp, status) == FALSE) {
    796         return 0;
    797     }
    798     return regexp->fMatcher->regionStart();
    799 }
    800 
    801 
    802 //------------------------------------------------------------------------------
    803 //
    804 //    uregex_regionEnd
    805 //
    806 //------------------------------------------------------------------------------
    807 U_CAPI int32_t U_EXPORT2
    808 uregex_regionEnd(const  URegularExpression   *regexp2,
    809                         UErrorCode           *status)  {
    810     RegularExpression *regexp = (RegularExpression*)regexp2;
    811     if (validateRE(regexp, status) == FALSE) {
    812         return 0;
    813     }
    814     return regexp->fMatcher->regionEnd();
    815 }
    816 
    817 
    818 //------------------------------------------------------------------------------
    819 //
    820 //    uregex_hasTransparentBounds
    821 //
    822 //------------------------------------------------------------------------------
    823 U_CAPI UBool U_EXPORT2
    824 uregex_hasTransparentBounds(const  URegularExpression   *regexp2,
    825                                    UErrorCode           *status)  {
    826     RegularExpression *regexp = (RegularExpression*)regexp2;
    827     if (validateRE(regexp, status) == FALSE) {
    828         return FALSE;
    829     }
    830     return regexp->fMatcher->hasTransparentBounds();
    831 }
    832 
    833 
    834 //------------------------------------------------------------------------------
    835 //
    836 //    uregex_useTransparentBounds
    837 //
    838 //------------------------------------------------------------------------------
    839 U_CAPI void U_EXPORT2
    840 uregex_useTransparentBounds(URegularExpression    *regexp2,
    841                             UBool                  b,
    842                             UErrorCode            *status)  {
    843     RegularExpression *regexp = (RegularExpression*)regexp2;
    844     if (validateRE(regexp, status) == FALSE) {
    845         return;
    846     }
    847     regexp->fMatcher->useTransparentBounds(b);
    848 }
    849 
    850 
    851 //------------------------------------------------------------------------------
    852 //
    853 //    uregex_hasAnchoringBounds
    854 //
    855 //------------------------------------------------------------------------------
    856 U_CAPI UBool U_EXPORT2
    857 uregex_hasAnchoringBounds(const  URegularExpression   *regexp2,
    858                                  UErrorCode           *status)  {
    859     RegularExpression *regexp = (RegularExpression*)regexp2;
    860     if (validateRE(regexp, status) == FALSE) {
    861         return FALSE;
    862     }
    863     return regexp->fMatcher->hasAnchoringBounds();
    864 }
    865 
    866 
    867 //------------------------------------------------------------------------------
    868 //
    869 //    uregex_useAnchoringBounds
    870 //
    871 //------------------------------------------------------------------------------
    872 U_CAPI void U_EXPORT2
    873 uregex_useAnchoringBounds(URegularExpression    *regexp2,
    874                           UBool                  b,
    875                           UErrorCode            *status)  {
    876     RegularExpression *regexp = (RegularExpression*)regexp2;
    877     if (validateRE(regexp, status) == FALSE) {
    878         return;
    879     }
    880     regexp->fMatcher->useAnchoringBounds(b);
    881 }
    882 
    883 
    884 //------------------------------------------------------------------------------
    885 //
    886 //    uregex_hitEnd
    887 //
    888 //------------------------------------------------------------------------------
    889 U_CAPI UBool U_EXPORT2
    890 uregex_hitEnd(const  URegularExpression   *regexp2,
    891                      UErrorCode           *status)  {
    892     RegularExpression *regexp = (RegularExpression*)regexp2;
    893     if (validateRE(regexp, status) == FALSE) {
    894         return FALSE;
    895     }
    896     return regexp->fMatcher->hitEnd();
    897 }
    898 
    899 
    900 //------------------------------------------------------------------------------
    901 //
    902 //    uregex_requireEnd
    903 //
    904 //------------------------------------------------------------------------------
    905 U_CAPI UBool U_EXPORT2
    906 uregex_requireEnd(const  URegularExpression   *regexp2,
    907                          UErrorCode           *status)  {
    908     RegularExpression *regexp = (RegularExpression*)regexp2;
    909     if (validateRE(regexp, status) == FALSE) {
    910         return FALSE;
    911     }
    912     return regexp->fMatcher->requireEnd();
    913 }
    914 
    915 
    916 //------------------------------------------------------------------------------
    917 //
    918 //    uregex_setTimeLimit
    919 //
    920 //------------------------------------------------------------------------------
    921 U_CAPI void U_EXPORT2
    922 uregex_setTimeLimit(URegularExpression   *regexp2,
    923                     int32_t               limit,
    924                     UErrorCode           *status) {
    925     RegularExpression *regexp = (RegularExpression*)regexp2;
    926     if (validateRE(regexp, status)) {
    927         regexp->fMatcher->setTimeLimit(limit, *status);
    928     }
    929 }
    930 
    931 
    932 
    933 //------------------------------------------------------------------------------
    934 //
    935 //    uregex_getTimeLimit
    936 //
    937 //------------------------------------------------------------------------------
    938 U_CAPI int32_t U_EXPORT2
    939 uregex_getTimeLimit(const  URegularExpression   *regexp2,
    940                            UErrorCode           *status) {
    941     int32_t retVal = 0;
    942     RegularExpression *regexp = (RegularExpression*)regexp2;
    943     if (validateRE(regexp, status)) {
    944         retVal = regexp->fMatcher->getTimeLimit();
    945     }
    946     return retVal;
    947 }
    948 
    949 
    950 
    951 //------------------------------------------------------------------------------
    952 //
    953 //    uregex_setStackLimit
    954 //
    955 //------------------------------------------------------------------------------
    956 U_CAPI void U_EXPORT2
    957 uregex_setStackLimit(URegularExpression   *regexp2,
    958                      int32_t               limit,
    959                      UErrorCode           *status) {
    960     RegularExpression *regexp = (RegularExpression*)regexp2;
    961     if (validateRE(regexp, status)) {
    962         regexp->fMatcher->setStackLimit(limit, *status);
    963     }
    964 }
    965 
    966 
    967 
    968 //------------------------------------------------------------------------------
    969 //
    970 //    uregex_getStackLimit
    971 //
    972 //------------------------------------------------------------------------------
    973 U_CAPI int32_t U_EXPORT2
    974 uregex_getStackLimit(const  URegularExpression   *regexp2,
    975                             UErrorCode           *status) {
    976     int32_t retVal = 0;
    977     RegularExpression *regexp = (RegularExpression*)regexp2;
    978     if (validateRE(regexp, status)) {
    979         retVal = regexp->fMatcher->getStackLimit();
    980     }
    981     return retVal;
    982 }
    983 
    984 
    985 //------------------------------------------------------------------------------
    986 //
    987 //    uregex_setMatchCallback
    988 //
    989 //------------------------------------------------------------------------------
    990 U_CAPI void U_EXPORT2
    991 uregex_setMatchCallback(URegularExpression      *regexp2,
    992                         URegexMatchCallback     *callback,
    993                         const void              *context,
    994                         UErrorCode              *status) {
    995     RegularExpression *regexp = (RegularExpression*)regexp2;
    996     if (validateRE(regexp, status)) {
    997         regexp->fMatcher->setMatchCallback(callback, context, *status);
    998     }
    999 }
   1000 
   1001 
   1002 //------------------------------------------------------------------------------
   1003 //
   1004 //    uregex_getMatchCallback
   1005 //
   1006 //------------------------------------------------------------------------------
   1007 U_CAPI void U_EXPORT2
   1008 uregex_getMatchCallback(const URegularExpression    *regexp2,
   1009                         URegexMatchCallback        **callback,
   1010                         const void                 **context,
   1011                         UErrorCode                  *status) {
   1012     RegularExpression *regexp = (RegularExpression*)regexp2;
   1013      if (validateRE(regexp, status)) {
   1014          regexp->fMatcher->getMatchCallback(*callback, *context, *status);
   1015      }
   1016 }
   1017 
   1018 
   1019 //------------------------------------------------------------------------------
   1020 //
   1021 //    uregex_replaceAll
   1022 //
   1023 //------------------------------------------------------------------------------
   1024 U_CAPI int32_t U_EXPORT2
   1025 uregex_replaceAll(URegularExpression    *regexp2,
   1026                   const UChar           *replacementText,
   1027                   int32_t                replacementLength,
   1028                   UChar                 *destBuf,
   1029                   int32_t                destCapacity,
   1030                   UErrorCode            *status)  {
   1031     RegularExpression *regexp = (RegularExpression*)regexp2;
   1032     if (validateRE(regexp, status) == FALSE) {
   1033         return 0;
   1034     }
   1035     if (replacementText == NULL || replacementLength < -1 ||
   1036         destBuf == NULL && destCapacity > 0 ||
   1037         destCapacity < 0) {
   1038         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1039         return 0;
   1040     }
   1041 
   1042     int32_t   len = 0;
   1043 
   1044     uregex_reset(regexp2, 0, status);
   1045 
   1046     // Note: Seperate error code variables for findNext() and appendReplacement()
   1047     //       are used so that destination buffer overflow errors
   1048     //       in appendReplacement won't stop findNext() from working.
   1049     //       appendReplacement() and appendTail() special case incoming buffer
   1050     //       overflow errors, continuing to return the correct length.
   1051     UErrorCode  findStatus = *status;
   1052     while (uregex_findNext(regexp2, &findStatus)) {
   1053         len += uregex_appendReplacement(regexp2, replacementText, replacementLength,
   1054                                         &destBuf, &destCapacity, status);
   1055     }
   1056     len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
   1057 
   1058     if (U_FAILURE(findStatus)) {
   1059         // If anything went wrong with the findNext(), make that error trump
   1060         //   whatever may have happened with the append() operations.
   1061         //   Errors in findNext() are not expected.
   1062         *status = findStatus;
   1063     }
   1064 
   1065     return len;
   1066 }
   1067 
   1068 
   1069 //------------------------------------------------------------------------------
   1070 //
   1071 //    uregex_replaceAllUText
   1072 //
   1073 //------------------------------------------------------------------------------
   1074 U_CAPI UText * U_EXPORT2
   1075 uregex_replaceAllUText(URegularExpression    *regexp2,
   1076                        UText                 *replacementText,
   1077                        UText                 *dest,
   1078                        UErrorCode            *status)  {
   1079     RegularExpression *regexp = (RegularExpression*)regexp2;
   1080     if (validateRE(regexp, status) == FALSE) {
   1081         return 0;
   1082     }
   1083     if (replacementText == NULL) {
   1084         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1085         return 0;
   1086     }
   1087 
   1088     dest = regexp->fMatcher->replaceAll(replacementText, dest, *status);
   1089     return dest;
   1090 }
   1091 
   1092 
   1093 //------------------------------------------------------------------------------
   1094 //
   1095 //    uregex_replaceFirst
   1096 //
   1097 //------------------------------------------------------------------------------
   1098 U_CAPI int32_t U_EXPORT2
   1099 uregex_replaceFirst(URegularExpression  *regexp2,
   1100                     const UChar         *replacementText,
   1101                     int32_t              replacementLength,
   1102                     UChar               *destBuf,
   1103                     int32_t              destCapacity,
   1104                     UErrorCode          *status)  {
   1105     RegularExpression *regexp = (RegularExpression*)regexp2;
   1106     if (validateRE(regexp, status) == FALSE) {
   1107         return 0;
   1108     }
   1109     if (replacementText == NULL || replacementLength < -1 ||
   1110         destBuf == NULL && destCapacity > 0 ||
   1111         destCapacity < 0) {
   1112         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1113         return 0;
   1114     }
   1115 
   1116     int32_t   len = 0;
   1117     UBool     findSucceeded;
   1118     uregex_reset(regexp2, 0, status);
   1119     findSucceeded = uregex_find(regexp2, 0, status);
   1120     if (findSucceeded) {
   1121         len = uregex_appendReplacement(regexp2, replacementText, replacementLength,
   1122                                        &destBuf, &destCapacity, status);
   1123     }
   1124     len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
   1125 
   1126     return len;
   1127 }
   1128 
   1129 
   1130 //------------------------------------------------------------------------------
   1131 //
   1132 //    uregex_replaceFirstUText
   1133 //
   1134 //------------------------------------------------------------------------------
   1135 U_CAPI UText * U_EXPORT2
   1136 uregex_replaceFirstUText(URegularExpression  *regexp2,
   1137                          UText                 *replacementText,
   1138                          UText                 *dest,
   1139                          UErrorCode            *status)  {
   1140     RegularExpression *regexp = (RegularExpression*)regexp2;
   1141     if (validateRE(regexp, status) == FALSE) {
   1142         return 0;
   1143     }
   1144     if (replacementText == NULL) {
   1145         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1146         return 0;
   1147     }
   1148 
   1149     dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status);
   1150     return dest;
   1151 }
   1152 
   1153 
   1154 //------------------------------------------------------------------------------
   1155 //
   1156 //    uregex_appendReplacement
   1157 //
   1158 //------------------------------------------------------------------------------
   1159 
   1160 U_NAMESPACE_BEGIN
   1161 //
   1162 //  Dummy class, because these functions need to be friends of class RegexMatcher,
   1163 //               and stand-alone C functions don't work as friends
   1164 //
   1165 class RegexCImpl {
   1166  public:
   1167    inline static  int32_t appendReplacement(RegularExpression    *regexp,
   1168                       const UChar           *replacementText,
   1169                       int32_t                replacementLength,
   1170                       UChar                **destBuf,
   1171                       int32_t               *destCapacity,
   1172                       UErrorCode            *status);
   1173 
   1174    inline static int32_t appendTail(RegularExpression    *regexp,
   1175         UChar                **destBuf,
   1176         int32_t               *destCapacity,
   1177         UErrorCode            *status);
   1178 
   1179     inline static int32_t split(RegularExpression    *regexp,
   1180         UChar                 *destBuf,
   1181         int32_t                destCapacity,
   1182         int32_t               *requiredCapacity,
   1183         UChar                 *destFields[],
   1184         int32_t                destFieldsCapacity,
   1185         UErrorCode            *status);
   1186 };
   1187 
   1188 U_NAMESPACE_END
   1189 
   1190 
   1191 
   1192 static const UChar BACKSLASH  = 0x5c;
   1193 static const UChar DOLLARSIGN = 0x24;
   1194 
   1195 //
   1196 //  Move a character to an output buffer, with bounds checking on the index.
   1197 //      Index advances even if capacity is exceeded, for preflight size computations.
   1198 //      This little sequence is used a LOT.
   1199 //
   1200 static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
   1201     if (*idx < bufCapacity) {
   1202         buf[*idx] = c;
   1203     }
   1204     (*idx)++;
   1205 }
   1206 
   1207 
   1208 //
   1209 //  appendReplacement, the actual implementation.
   1210 //
   1211 int32_t RegexCImpl::appendReplacement(RegularExpression    *regexp,
   1212                                       const UChar           *replacementText,
   1213                                       int32_t                replacementLength,
   1214                                       UChar                **destBuf,
   1215                                       int32_t               *destCapacity,
   1216                                       UErrorCode            *status)  {
   1217 
   1218     // If we come in with a buffer overflow error, don't suppress the operation.
   1219     //  A series of appendReplacements, appendTail need to correctly preflight
   1220     //  the buffer size when an overflow happens somewhere in the middle.
   1221     UBool pendingBufferOverflow = FALSE;
   1222     if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
   1223         pendingBufferOverflow = TRUE;
   1224         *status = U_ZERO_ERROR;
   1225     }
   1226 
   1227     //
   1228     // Validate all paramters
   1229     //
   1230     if (validateRE(regexp, status) == FALSE) {
   1231         return 0;
   1232     }
   1233     if (replacementText == NULL || replacementLength < -1 ||
   1234         destCapacity == NULL || destBuf == NULL ||
   1235         *destBuf == NULL && *destCapacity > 0 ||
   1236         *destCapacity < 0) {
   1237         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1238         return 0;
   1239     }
   1240 
   1241     RegexMatcher *m = regexp->fMatcher;
   1242     if (m->fMatch == FALSE) {
   1243         *status = U_REGEX_INVALID_STATE;
   1244         return 0;
   1245     }
   1246 
   1247     UChar    *dest             = *destBuf;
   1248     int32_t   capacity         = *destCapacity;
   1249     int32_t   destIdx          =  0;
   1250     int32_t   i;
   1251 
   1252     // If it wasn't supplied by the caller,  get the length of the replacement text.
   1253     //   TODO:  slightly smarter logic in the copy loop could watch for the NUL on
   1254     //          the fly and avoid this step.
   1255     if (replacementLength == -1) {
   1256         replacementLength = u_strlen(replacementText);
   1257     }
   1258 
   1259     // Copy input string from the end of previous match to start of current match
   1260     if (regexp->fText != NULL) {
   1261         int32_t matchStart;
   1262         int32_t lastMatchEnd;
   1263         if (UTEXT_USES_U16(m->fInputText)) {
   1264             lastMatchEnd = (int32_t)m->fLastMatchEnd;
   1265             matchStart = (int32_t)m->fMatchStart;
   1266         } else {
   1267             // !!!: Would like a better way to do this!
   1268             UErrorCode status = U_ZERO_ERROR;
   1269             lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status);
   1270             status = U_ZERO_ERROR;
   1271             matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status);
   1272         }
   1273         for (i=lastMatchEnd; i<matchStart; i++) {
   1274             appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
   1275         }
   1276     } else {
   1277         UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore
   1278         destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart,
   1279                                  &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), &possibleOverflowError);
   1280     }
   1281 
   1282 
   1283     // scan the replacement text, looking for substitutions ($n) and \escapes.
   1284     int32_t  replIdx = 0;
   1285     while (replIdx < replacementLength) {
   1286         UChar  c = replacementText[replIdx];
   1287         replIdx++;
   1288         if (c != DOLLARSIGN && c != BACKSLASH) {
   1289             // Common case, no substitution, no escaping,
   1290             //  just copy the char to the dest buf.
   1291             appendToBuf(c, &destIdx, dest, capacity);
   1292             continue;
   1293         }
   1294 
   1295         if (c == BACKSLASH) {
   1296             // Backslash Escape.  Copy the following char out without further checks.
   1297             //                    Note:  Surrogate pairs don't need any special handling
   1298             //                           The second half wont be a '$' or a '\', and
   1299             //                           will move to the dest normally on the next
   1300             //                           loop iteration.
   1301             if (replIdx >= replacementLength) {
   1302                 break;
   1303             }
   1304             c = replacementText[replIdx];
   1305 
   1306             if (c==0x55/*U*/ || c==0x75/*u*/) {
   1307                 // We have a \udddd or \Udddddddd escape sequence.
   1308                 UChar32 escapedChar =
   1309                     u_unescapeAt(uregex_ucstr_unescape_charAt,
   1310                        &replIdx,                   // Index is updated by unescapeAt
   1311                        replacementLength,          // Length of replacement text
   1312                        (void *)replacementText);
   1313 
   1314                 if (escapedChar != (UChar32)0xFFFFFFFF) {
   1315                     if (escapedChar <= 0xffff) {
   1316                         appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
   1317                     } else {
   1318                         appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
   1319                         appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
   1320                     }
   1321                     continue;
   1322                 }
   1323                 // Note:  if the \u escape was invalid, just fall through and
   1324                 //        treat it as a plain \<anything> escape.
   1325             }
   1326 
   1327             // Plain backslash escape.  Just put out the escaped character.
   1328             appendToBuf(c, &destIdx, dest, capacity);
   1329 
   1330             replIdx++;
   1331             continue;
   1332         }
   1333 
   1334 
   1335 
   1336         // We've got a $.  Pick up a capture group number if one follows.
   1337         // Consume at most the number of digits necessary for the largest capture
   1338         // number that is valid for this pattern.
   1339 
   1340         int32_t numDigits = 0;
   1341         int32_t groupNum  = 0;
   1342         UChar32 digitC;
   1343         for (;;) {
   1344             if (replIdx >= replacementLength) {
   1345                 break;
   1346             }
   1347             U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
   1348             if (u_isdigit(digitC) == FALSE) {
   1349                 break;
   1350             }
   1351 
   1352             U16_FWD_1(replacementText, replIdx, replacementLength);
   1353             groupNum=groupNum*10 + u_charDigitValue(digitC);
   1354             numDigits++;
   1355             if (numDigits >= m->fPattern->fMaxCaptureDigits) {
   1356                 break;
   1357             }
   1358         }
   1359 
   1360 
   1361         if (numDigits == 0) {
   1362             // The $ didn't introduce a group number at all.
   1363             // Treat it as just part of the substitution text.
   1364             appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
   1365             continue;
   1366         }
   1367 
   1368         // Finally, append the capture group data to the destination.
   1369         destIdx += uregex_group((URegularExpression*)regexp, groupNum, &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
   1370         if (*status == U_BUFFER_OVERFLOW_ERROR) {
   1371             // Ignore buffer overflow when extracting the group.  We need to
   1372             //   continue on to get full size of the untruncated result.  We will
   1373             //   raise our own buffer overflow error at the end.
   1374             *status = U_ZERO_ERROR;
   1375         }
   1376 
   1377         if (U_FAILURE(*status)) {
   1378             // Can fail if group number is out of range.
   1379             break;
   1380         }
   1381 
   1382     }
   1383 
   1384     //
   1385     //  Nul Terminate the dest buffer if possible.
   1386     //  Set the appropriate buffer overflow or not terminated error, if needed.
   1387     //
   1388     if (destIdx < capacity) {
   1389         dest[destIdx] = 0;
   1390     } else if (destIdx == *destCapacity) {
   1391         *status = U_STRING_NOT_TERMINATED_WARNING;
   1392     } else {
   1393         *status = U_BUFFER_OVERFLOW_ERROR;
   1394     }
   1395 
   1396     //
   1397     // Return an updated dest buffer and capacity to the caller.
   1398     //
   1399     if (destIdx > 0 &&  *destCapacity > 0) {
   1400         if (destIdx < capacity) {
   1401             *destBuf      += destIdx;
   1402             *destCapacity -= destIdx;
   1403         } else {
   1404             *destBuf      += capacity;
   1405             *destCapacity =  0;
   1406         }
   1407     }
   1408 
   1409     // If we came in with a buffer overflow, make sure we go out with one also.
   1410     //   (A zero length match right at the end of the previous match could
   1411     //    make this function succeed even though a previous call had overflowed the buf)
   1412     if (pendingBufferOverflow && U_SUCCESS(*status)) {
   1413         *status = U_BUFFER_OVERFLOW_ERROR;
   1414     }
   1415 
   1416     return destIdx;
   1417 }
   1418 
   1419 //
   1420 //   appendReplacement   the actual API function,
   1421 //
   1422 U_CAPI int32_t U_EXPORT2
   1423 uregex_appendReplacement(URegularExpression    *regexp2,
   1424                          const UChar           *replacementText,
   1425                          int32_t                replacementLength,
   1426                          UChar                **destBuf,
   1427                          int32_t               *destCapacity,
   1428                          UErrorCode            *status) {
   1429 
   1430     RegularExpression *regexp = (RegularExpression*)regexp2;
   1431     return RegexCImpl::appendReplacement(
   1432         regexp, replacementText, replacementLength,destBuf, destCapacity, status);
   1433 }
   1434 
   1435 //
   1436 //   uregex_appendReplacementUText...can just use the normal C++ method
   1437 //
   1438 U_CAPI void U_EXPORT2
   1439 uregex_appendReplacementUText(URegularExpression    *regexp2,
   1440                               UText                 *replText,
   1441                               UText                 *dest,
   1442                               UErrorCode            *status)  {
   1443     RegularExpression *regexp = (RegularExpression*)regexp2;
   1444     regexp->fMatcher->appendReplacement(dest, replText, *status);
   1445 }
   1446 
   1447 
   1448 //------------------------------------------------------------------------------
   1449 //
   1450 //    uregex_appendTail
   1451 //
   1452 //------------------------------------------------------------------------------
   1453 int32_t RegexCImpl::appendTail(RegularExpression    *regexp,
   1454                                UChar                **destBuf,
   1455                                int32_t               *destCapacity,
   1456                                UErrorCode            *status)
   1457 {
   1458 
   1459     // If we come in with a buffer overflow error, don't suppress the operation.
   1460     //  A series of appendReplacements, appendTail need to correctly preflight
   1461     //  the buffer size when an overflow happens somewhere in the middle.
   1462     UBool pendingBufferOverflow = FALSE;
   1463     if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
   1464         pendingBufferOverflow = TRUE;
   1465         *status = U_ZERO_ERROR;
   1466     }
   1467 
   1468     if (validateRE(regexp, status) == FALSE) {
   1469         return 0;
   1470     }
   1471 
   1472     if (destCapacity == NULL || destBuf == NULL ||
   1473         *destBuf == NULL && *destCapacity > 0 ||
   1474         *destCapacity < 0)
   1475     {
   1476         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1477         return 0;
   1478     }
   1479 
   1480     RegexMatcher *m = regexp->fMatcher;
   1481 
   1482     int32_t  destIdx     = 0;
   1483     int32_t  destCap     = *destCapacity;
   1484     UChar    *dest       = *destBuf;
   1485 
   1486     if (regexp->fText != NULL) {
   1487         int32_t srcIdx;
   1488         int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd);
   1489         if (nativeIdx == -1) {
   1490             srcIdx = 0;
   1491         } else if (UTEXT_USES_U16(m->fInputText)) {
   1492             srcIdx = (int32_t)nativeIdx;
   1493         } else {
   1494             UErrorCode status = U_ZERO_ERROR;
   1495             srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status);
   1496         }
   1497 
   1498         for (;;) {
   1499             if (srcIdx == regexp->fTextLength) {
   1500                 break;
   1501             }
   1502             UChar c = regexp->fText[srcIdx];
   1503             if (c == 0 && regexp->fTextLength == -1) {
   1504                 regexp->fTextLength = srcIdx;
   1505                 break;
   1506             }
   1507             if (destIdx < destCap) {
   1508                 dest[destIdx] = c;
   1509             } else {
   1510                 // We've overflowed the dest buffer.
   1511                 //  If the total input string length is known, we can
   1512                 //    compute the total buffer size needed without scanning through the string.
   1513                 if (regexp->fTextLength > 0) {
   1514                     destIdx += (regexp->fTextLength - srcIdx);
   1515                     break;
   1516                 }
   1517             }
   1518             srcIdx++;
   1519             destIdx++;
   1520         }
   1521     } else {
   1522         int64_t  srcIdx;
   1523         if (m->fMatch) {
   1524             // The most recent call to find() succeeded.
   1525             srcIdx = m->fMatchEnd;
   1526         } else {
   1527             // The last call to find() on this matcher failed().
   1528             //   Look back to the end of the last find() that succeeded for src index.
   1529             srcIdx = m->fLastMatchEnd;
   1530             if (srcIdx == -1)  {
   1531                 // There has been no successful match with this matcher.
   1532                 //   We want to copy the whole string.
   1533                 srcIdx = 0;
   1534             }
   1535         }
   1536 
   1537         destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status);
   1538     }
   1539 
   1540     //
   1541     //  NUL terminate the output string, if possible, otherwise issue the
   1542     //   appropriate error or warning.
   1543     //
   1544     if (destIdx < destCap) {
   1545         dest[destIdx] = 0;
   1546     } else  if (destIdx == destCap) {
   1547         *status = U_STRING_NOT_TERMINATED_WARNING;
   1548     } else {
   1549         *status = U_BUFFER_OVERFLOW_ERROR;
   1550     }
   1551 
   1552     //
   1553     // Update the user's buffer ptr and capacity vars to reflect the
   1554     //   amount used.
   1555     //
   1556     if (destIdx < destCap) {
   1557         *destBuf      += destIdx;
   1558         *destCapacity -= destIdx;
   1559     } else {
   1560         *destBuf      += destCap;
   1561         *destCapacity  = 0;
   1562     }
   1563 
   1564     if (pendingBufferOverflow && U_SUCCESS(*status)) {
   1565         *status = U_BUFFER_OVERFLOW_ERROR;
   1566     }
   1567 
   1568     return destIdx;
   1569 }
   1570 
   1571 
   1572 //
   1573 //   appendTail   the actual API function
   1574 //
   1575 U_CAPI int32_t U_EXPORT2
   1576 uregex_appendTail(URegularExpression    *regexp2,
   1577                   UChar                **destBuf,
   1578                   int32_t               *destCapacity,
   1579                   UErrorCode            *status)  {
   1580     RegularExpression *regexp = (RegularExpression*)regexp2;
   1581     return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
   1582 }
   1583 
   1584 
   1585 //
   1586 //   uregex_appendTailUText...can just use the normal C++ method
   1587 //
   1588 U_CAPI UText * U_EXPORT2
   1589 uregex_appendTailUText(URegularExpression    *regexp2,
   1590                        UText                 *dest)  {
   1591     RegularExpression *regexp = (RegularExpression*)regexp2;
   1592     return regexp->fMatcher->appendTail(dest);
   1593 }
   1594 
   1595 
   1596 //------------------------------------------------------------------------------
   1597 //
   1598 //    copyString     Internal utility to copy a string to an output buffer,
   1599 //                   while managing buffer overflow and preflight size
   1600 //                   computation.  NUL termination is added to destination,
   1601 //                   and the NUL is counted in the output size.
   1602 //
   1603 //------------------------------------------------------------------------------
   1604 #if 0
   1605 static void copyString(UChar        *destBuffer,    //  Destination buffer.
   1606                        int32_t       destCapacity,  //  Total capacity of dest buffer
   1607                        int32_t      *destIndex,     //  Index into dest buffer.  Updated on return.
   1608                                                     //    Update not clipped to destCapacity.
   1609                        const UChar  *srcPtr,        //  Pointer to source string
   1610                        int32_t       srcLen)        //  Source string len.
   1611 {
   1612     int32_t  si;
   1613     int32_t  di = *destIndex;
   1614     UChar    c;
   1615 
   1616     for (si=0; si<srcLen;  si++) {
   1617         c = srcPtr[si];
   1618         if (di < destCapacity) {
   1619             destBuffer[di] = c;
   1620             di++;
   1621         } else {
   1622             di += srcLen - si;
   1623             break;
   1624         }
   1625     }
   1626     if (di<destCapacity) {
   1627         destBuffer[di] = 0;
   1628     }
   1629     di++;
   1630     *destIndex = di;
   1631 }
   1632 #endif
   1633 
   1634 //------------------------------------------------------------------------------
   1635 //
   1636 //    uregex_split
   1637 //
   1638 //------------------------------------------------------------------------------
   1639 int32_t RegexCImpl::split(RegularExpression     *regexp,
   1640                           UChar                 *destBuf,
   1641                           int32_t                destCapacity,
   1642                           int32_t               *requiredCapacity,
   1643                           UChar                 *destFields[],
   1644                           int32_t                destFieldsCapacity,
   1645                           UErrorCode            *status) {
   1646     //
   1647     // Reset for the input text
   1648     //
   1649     regexp->fMatcher->reset();
   1650     UText *inputText = regexp->fMatcher->fInputText;
   1651     int64_t   nextOutputStringStart = 0;
   1652     int64_t   inputLen = regexp->fMatcher->fInputLength;
   1653     if (inputLen == 0) {
   1654         return 0;
   1655     }
   1656 
   1657     //
   1658     // Loop through the input text, searching for the delimiter pattern
   1659     //
   1660     int32_t   i;             // Index of the field being processed.
   1661     int32_t   destIdx = 0;   // Next available position in destBuf;
   1662     int32_t   numCaptureGroups = regexp->fMatcher->groupCount();
   1663     UErrorCode  tStatus = U_ZERO_ERROR;   // Want to ignore any buffer overflow errors so that the strings are still counted
   1664     for (i=0; ; i++) {
   1665         if (i>=destFieldsCapacity-1) {
   1666             // There are one or zero output strings left.
   1667             // Fill the last output string with whatever is left from the input, then exit the loop.
   1668             //  ( i will be == destFieldsCapacity if we filled the output array while processing
   1669             //    capture groups of the delimiter expression, in which case we will discard the
   1670             //    last capture group saved in favor of the unprocessed remainder of the
   1671             //    input string.)
   1672             if (inputLen > nextOutputStringStart) {
   1673                 if (i != destFieldsCapacity-1) {
   1674                     // No fields are left.  Recycle the last one for holding the trailing part of
   1675                     //   the input string.
   1676                     i = destFieldsCapacity-1;
   1677                     destIdx = (int32_t)(destFields[i] - destFields[0]);
   1678                 }
   1679 
   1680                 destFields[i] = &destBuf[destIdx];
   1681                 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
   1682                                              &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
   1683             }
   1684             break;
   1685         }
   1686 
   1687         if (regexp->fMatcher->find()) {
   1688             // We found another delimiter.  Move everything from where we started looking
   1689             //  up until the start of the delimiter into the next output string.
   1690             destFields[i] = &destBuf[destIdx];
   1691 
   1692             destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart,
   1693                                          &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
   1694             if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
   1695                 tStatus = U_ZERO_ERROR;
   1696             } else {
   1697                 *status = tStatus;
   1698             }
   1699             nextOutputStringStart = regexp->fMatcher->fMatchEnd;
   1700 
   1701             // If the delimiter pattern has capturing parentheses, the captured
   1702             //  text goes out into the next n destination strings.
   1703             int32_t groupNum;
   1704             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
   1705                 // If we've run out of output string slots, bail out.
   1706                 if (i==destFieldsCapacity-1) {
   1707                     break;
   1708                 }
   1709                 i++;
   1710 
   1711                 // Set up to extract the capture group contents into the dest buffer.
   1712                 destFields[i] = &destBuf[destIdx];
   1713                 tStatus = U_ZERO_ERROR;
   1714                 int32_t t = uregex_group((URegularExpression*)regexp, groupNum, destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
   1715                 destIdx += t + 1;    // Record the space used in the output string buffer.
   1716                                      //  +1 for the NUL that terminates the string.
   1717                 if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
   1718                     tStatus = U_ZERO_ERROR;
   1719                 } else {
   1720                     *status = tStatus;
   1721                 }
   1722             }
   1723 
   1724             if (nextOutputStringStart == inputLen) {
   1725                 // The delimiter was at the end of the string.  We're done.
   1726                 break;
   1727             }
   1728 
   1729         }
   1730         else
   1731         {
   1732             // We ran off the end of the input while looking for the next delimiter.
   1733             // All the remaining text goes into the current output string.
   1734             destFields[i] = &destBuf[destIdx];
   1735             destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
   1736                                          &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
   1737             break;
   1738         }
   1739     }
   1740 
   1741     // Zero out any unused portion of the destFields array
   1742     int j;
   1743     for (j=i+1; j<destFieldsCapacity; j++) {
   1744         destFields[j] = NULL;
   1745     }
   1746 
   1747     if (requiredCapacity != NULL) {
   1748         *requiredCapacity = destIdx;
   1749     }
   1750     if (destIdx > destCapacity) {
   1751         *status = U_BUFFER_OVERFLOW_ERROR;
   1752     }
   1753     return i+1;
   1754 }
   1755 
   1756 //
   1757 //   uregex_split   The actual API function
   1758 //
   1759 U_CAPI int32_t U_EXPORT2
   1760 uregex_split(URegularExpression      *regexp2,
   1761              UChar                   *destBuf,
   1762              int32_t                  destCapacity,
   1763              int32_t                 *requiredCapacity,
   1764              UChar                   *destFields[],
   1765              int32_t                  destFieldsCapacity,
   1766              UErrorCode              *status) {
   1767     RegularExpression *regexp = (RegularExpression*)regexp2;
   1768     if (validateRE(regexp, status) == FALSE) {
   1769         return 0;
   1770     }
   1771     if (destBuf == NULL && destCapacity > 0 ||
   1772         destCapacity < 0 ||
   1773         destFields == NULL ||
   1774         destFieldsCapacity < 1 ) {
   1775         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1776         return 0;
   1777     }
   1778 
   1779     return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status);
   1780 }
   1781 
   1782 
   1783 //
   1784 //   uregex_splitUText...can just use the normal C++ method
   1785 //
   1786 U_CAPI int32_t U_EXPORT2
   1787 uregex_splitUText(URegularExpression    *regexp2,
   1788                   UText                 *destFields[],
   1789                   int32_t                destFieldsCapacity,
   1790                   UErrorCode            *status) {
   1791     RegularExpression *regexp = (RegularExpression*)regexp2;
   1792     return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status);
   1793 }
   1794 
   1795 
   1796 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
   1797 
   1798