Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 2004-2010, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  regex.cpp
      7 */
      8 
      9 #include "unicode/utypes.h"
     10 
     11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     12 
     13 #include "unicode/regex.h"
     14 #include "unicode/uregex.h"
     15 #include "unicode/unistr.h"
     16 #include "unicode/ustring.h"
     17 #include "unicode/uchar.h"
     18 #include "unicode/uobject.h"
     19 #include "umutex.h"
     20 #include "uassert.h"
     21 #include "cmemory.h"
     22 
     23 #include "regextxt.h"
     24 
     25 #include <stdio.h>
     26 
     27 U_NAMESPACE_BEGIN
     28 
     29 #define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)
     30 
     31 struct RegularExpression: public UMemory {
     32 public:
     33     RegularExpression();
     34     ~RegularExpression();
     35     int32_t           fMagic;
     36     RegexPattern     *fPat;
     37     int32_t          *fPatRefCount;
     38     UChar            *fPatString;
     39     int32_t           fPatStringLen;
     40     RegexMatcher     *fMatcher;
     41     const UChar      *fText;         // Text from setText()
     42     int32_t           fTextLength;   // Length provided by user with setText(), which
     43                                      //  may be -1.
     44     UBool             fOwnsText;
     45 };
     46 
     47 static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII
     48 
     49 RegularExpression::RegularExpression() {
     50     fMagic        = REXP_MAGIC;
     51     fPat          = NULL;
     52     fPatRefCount  = NULL;
     53     fPatString    = NULL;
     54     fPatStringLen = 0;
     55     fMatcher      = NULL;
     56     fText         = NULL;
     57     fTextLength   = 0;
     58     fOwnsText     = FALSE;
     59 }
     60 
     61 RegularExpression::~RegularExpression() {
     62     delete fMatcher;
     63     fMatcher = NULL;
     64     if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
     65         delete fPat;
     66         uprv_free(fPatString);
     67         uprv_free(fPatRefCount);
     68     }
     69     if (fOwnsText && fText!=NULL) {
     70         uprv_free((void *)fText);
     71     }
     72     fMagic = 0;
     73 }
     74 
     75 U_NAMESPACE_END
     76 
     77 U_NAMESPACE_USE
     78 
     79 //----------------------------------------------------------------------------------------
     80 //
     81 //   validateRE    Do boilerplate style checks on API function parameters.
     82 //                 Return TRUE if they look OK.
     83 //----------------------------------------------------------------------------------------
     84 static UBool validateRE(const RegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) {
     85     if (U_FAILURE(*status)) {
     86         return FALSE;
     87     }
     88     if (re == NULL || re->fMagic != REXP_MAGIC) {
     89         *status = U_ILLEGAL_ARGUMENT_ERROR;
     90         return FALSE;
     91     }
     92     // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
     93     if (requiresText && re->fText == NULL && !re->fOwnsText) {
     94         *status = U_REGEX_INVALID_STATE;
     95         return FALSE;
     96     }
     97     return TRUE;
     98 }
     99 
    100 //----------------------------------------------------------------------------------------
    101 //
    102 //    uregex_open
    103 //
    104 //----------------------------------------------------------------------------------------
    105 U_CAPI URegularExpression *  U_EXPORT2
    106 uregex_open( const  UChar          *pattern,
    107                     int32_t         patternLength,
    108                     uint32_t        flags,
    109                     UParseError    *pe,
    110                     UErrorCode     *status) {
    111 
    112     if (U_FAILURE(*status)) {
    113         return NULL;
    114     }
    115     if (pattern == NULL || patternLength < -1 || patternLength == 0) {
    116         *status = U_ILLEGAL_ARGUMENT_ERROR;
    117         return NULL;
    118     }
    119     int32_t actualPatLen = patternLength;
    120     if (actualPatLen == -1) {
    121         actualPatLen = u_strlen(pattern);
    122     }
    123 
    124     RegularExpression *re     = new RegularExpression;
    125     int32_t            *refC   = (int32_t *)uprv_malloc(sizeof(int32_t));
    126     UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
    127     if (re == NULL || refC == NULL || patBuf == NULL) {
    128         *status = U_MEMORY_ALLOCATION_ERROR;
    129         delete re;
    130         uprv_free(refC);
    131         uprv_free(patBuf);
    132         return NULL;
    133     }
    134     re->fPatRefCount = refC;
    135     *re->fPatRefCount = 1;
    136 
    137     //
    138     // Make a copy of the pattern string, so we can return it later if asked.
    139     //    For compiling the pattern, we will use a UText wrapper around
    140     //    this local copy, to avoid making even more copies.
    141     //
    142     re->fPatString    = patBuf;
    143     re->fPatStringLen = patternLength;
    144     u_memcpy(patBuf, pattern, actualPatLen);
    145     patBuf[actualPatLen] = 0;
    146 
    147     UText patText = UTEXT_INITIALIZER;
    148     utext_openUChars(&patText, patBuf, patternLength, status);
    149 
    150     //
    151     // Compile the pattern
    152     //
    153     if (pe != NULL) {
    154         re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
    155     } else {
    156         re->fPat = RegexPattern::compile(&patText, flags, *status);
    157     }
    158     utext_close(&patText);
    159 
    160     if (U_FAILURE(*status)) {
    161         goto ErrorExit;
    162     }
    163 
    164     //
    165     // Create the matcher object
    166     //
    167     re->fMatcher = re->fPat->matcher(*status);
    168     if (U_SUCCESS(*status)) {
    169         return (URegularExpression*)re;
    170     }
    171 
    172 ErrorExit:
    173     delete re;
    174     return NULL;
    175 
    176 }
    177 
    178 //----------------------------------------------------------------------------------------
    179 //
    180 //    uregex_openUText
    181 //
    182 //----------------------------------------------------------------------------------------
    183 U_CAPI URegularExpression *  U_EXPORT2
    184 uregex_openUText(UText          *pattern,
    185                  uint32_t        flags,
    186                  UParseError    *pe,
    187                  UErrorCode     *status) {
    188 
    189     if (U_FAILURE(*status)) {
    190         return NULL;
    191     }
    192     if (pattern == NULL) {
    193         *status = U_ILLEGAL_ARGUMENT_ERROR;
    194         return NULL;
    195     }
    196 
    197     int64_t patternNativeLength = utext_nativeLength(pattern);
    198 
    199     if (patternNativeLength == 0) {
    200         *status = U_ILLEGAL_ARGUMENT_ERROR;
    201         return NULL;
    202     }
    203 
    204     RegularExpression *re     = new RegularExpression;
    205 
    206     UErrorCode lengthStatus = U_ZERO_ERROR;
    207     int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus);
    208 
    209     int32_t            *refC   = (int32_t *)uprv_malloc(sizeof(int32_t));
    210     UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1));
    211     if (re == NULL || refC == NULL || patBuf == NULL) {
    212         *status = U_MEMORY_ALLOCATION_ERROR;
    213         delete re;
    214         uprv_free(refC);
    215         uprv_free(patBuf);
    216         return NULL;
    217     }
    218     re->fPatRefCount = refC;
    219     *re->fPatRefCount = 1;
    220 
    221     //
    222     // Make a copy of the pattern string, so we can return it later if asked.
    223     //    For compiling the pattern, we will use a read-only UText wrapper
    224     //    around this local copy, to avoid making even more copies.
    225     //
    226     re->fPatString    = patBuf;
    227     re->fPatStringLen = pattern16Length;
    228     utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status);
    229 
    230     UText patText = UTEXT_INITIALIZER;
    231     utext_openUChars(&patText, patBuf, pattern16Length, status);
    232 
    233     //
    234     // Compile the pattern
    235     //
    236     if (pe != NULL) {
    237         re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
    238     } else {
    239         re->fPat = RegexPattern::compile(&patText, flags, *status);
    240     }
    241     utext_close(&patText);
    242 
    243     if (U_FAILURE(*status)) {
    244         goto ErrorExit;
    245     }
    246 
    247     //
    248     // Create the matcher object
    249     //
    250     re->fMatcher = re->fPat->matcher(*status);
    251     if (U_SUCCESS(*status)) {
    252         return (URegularExpression*)re;
    253     }
    254 
    255 ErrorExit:
    256     delete re;
    257     return NULL;
    258 
    259 }
    260 
    261 //----------------------------------------------------------------------------------------
    262 //
    263 //    uregex_close
    264 //
    265 //----------------------------------------------------------------------------------------
    266 U_CAPI void  U_EXPORT2
    267 uregex_close(URegularExpression  *re2) {
    268     RegularExpression *re = (RegularExpression*)re2;
    269     UErrorCode  status = U_ZERO_ERROR;
    270     if (validateRE(re, &status, FALSE) == FALSE) {
    271         return;
    272     }
    273     delete re;
    274 }
    275 
    276 
    277 //----------------------------------------------------------------------------------------
    278 //
    279 //    uregex_clone
    280 //
    281 //----------------------------------------------------------------------------------------
    282 U_CAPI URegularExpression * U_EXPORT2
    283 uregex_clone(const URegularExpression *source2, UErrorCode *status)  {
    284     RegularExpression *source = (RegularExpression*)source2;
    285     if (validateRE(source, status, FALSE) == FALSE) {
    286         return NULL;
    287     }
    288 
    289     RegularExpression *clone = new RegularExpression;
    290     if (clone == NULL) {
    291         *status = U_MEMORY_ALLOCATION_ERROR;
    292         return NULL;
    293     }
    294 
    295     clone->fMatcher = source->fPat->matcher(*status);
    296     if (U_FAILURE(*status)) {
    297         delete clone;
    298         return NULL;
    299     }
    300 
    301     clone->fPat          = source->fPat;
    302     clone->fPatRefCount  = source->fPatRefCount;
    303     clone->fPatString    = source->fPatString;
    304     clone->fPatStringLen = source->fPatStringLen;
    305     umtx_atomic_inc(source->fPatRefCount);
    306     // Note:  fText is not cloned.
    307 
    308     return (URegularExpression*)clone;
    309 }
    310 
    311 
    312 
    313 
    314 //------------------------------------------------------------------------------
    315 //
    316 //    uregex_pattern
    317 //
    318 //------------------------------------------------------------------------------
    319 U_CAPI const UChar * U_EXPORT2
    320 uregex_pattern(const  URegularExpression *regexp2,
    321                       int32_t            *patLength,
    322                       UErrorCode         *status)  {
    323     RegularExpression *regexp = (RegularExpression*)regexp2;
    324 
    325     if (validateRE(regexp, status, FALSE) == FALSE) {
    326         return NULL;
    327     }
    328     if (patLength != NULL) {
    329         *patLength = regexp->fPatStringLen;
    330     }
    331     return regexp->fPatString;
    332 }
    333 
    334 
    335 //------------------------------------------------------------------------------
    336 //
    337 //    uregex_patternUText
    338 //
    339 //------------------------------------------------------------------------------
    340 U_CAPI UText * U_EXPORT2
    341 uregex_patternUText(const URegularExpression *regexp2,
    342                           UErrorCode         *status)  {
    343     RegularExpression *regexp = (RegularExpression*)regexp2;
    344     return regexp->fPat->patternText(*status);
    345 }
    346 
    347 
    348 //------------------------------------------------------------------------------
    349 //
    350 //    uregex_flags
    351 //
    352 //------------------------------------------------------------------------------
    353 U_CAPI int32_t U_EXPORT2
    354 uregex_flags(const URegularExpression *regexp2, UErrorCode *status)  {
    355     RegularExpression *regexp = (RegularExpression*)regexp2;
    356     if (validateRE(regexp, status, FALSE) == FALSE) {
    357         return 0;
    358     }
    359     int32_t flags = regexp->fPat->flags();
    360     return flags;
    361 }
    362 
    363 
    364 //------------------------------------------------------------------------------
    365 //
    366 //    uregex_setText
    367 //
    368 //------------------------------------------------------------------------------
    369 U_CAPI void U_EXPORT2
    370 uregex_setText(URegularExpression *regexp2,
    371                const UChar        *text,
    372                int32_t             textLength,
    373                UErrorCode         *status)  {
    374     RegularExpression *regexp = (RegularExpression*)regexp2;
    375     if (validateRE(regexp, status, FALSE) == FALSE) {
    376         return;
    377     }
    378     if (text == NULL || textLength < -1) {
    379         *status = U_ILLEGAL_ARGUMENT_ERROR;
    380         return;
    381     }
    382 
    383     if (regexp->fOwnsText && regexp->fText != NULL) {
    384         uprv_free((void *)regexp->fText);
    385     }
    386 
    387     regexp->fText       = text;
    388     regexp->fTextLength = textLength;
    389     regexp->fOwnsText   = FALSE;
    390 
    391     UText input = UTEXT_INITIALIZER;
    392     utext_openUChars(&input, text, textLength, status);
    393     regexp->fMatcher->reset(&input);
    394     utext_close(&input); // reset() made a shallow clone, so we don't need this copy
    395 }
    396 
    397 
    398 //------------------------------------------------------------------------------
    399 //
    400 //    uregex_setUText
    401 //
    402 //------------------------------------------------------------------------------
    403 U_CAPI void U_EXPORT2
    404 uregex_setUText(URegularExpression *regexp2,
    405                 UText              *text,
    406                 UErrorCode         *status) {
    407     RegularExpression *regexp = (RegularExpression*)regexp2;
    408     if (validateRE(regexp, status, FALSE) == FALSE) {
    409         return;
    410     }
    411     if (text == NULL) {
    412         *status = U_ILLEGAL_ARGUMENT_ERROR;
    413         return;
    414     }
    415 
    416     if (regexp->fOwnsText && regexp->fText != NULL) {
    417         uprv_free((void *)regexp->fText);
    418     }
    419 
    420     regexp->fText       = NULL; // only fill it in on request
    421     regexp->fTextLength = -1;
    422     regexp->fOwnsText   = TRUE;
    423     regexp->fMatcher->reset(text);
    424 }
    425 
    426 
    427 
    428 //------------------------------------------------------------------------------
    429 //
    430 //    uregex_getText
    431 //
    432 //------------------------------------------------------------------------------
    433 U_CAPI const UChar * U_EXPORT2
    434 uregex_getText(URegularExpression *regexp2,
    435                int32_t            *textLength,
    436                UErrorCode         *status)  {
    437     RegularExpression *regexp = (RegularExpression*)regexp2;
    438     if (validateRE(regexp, status, FALSE) == FALSE) {
    439         return NULL;
    440     }
    441 
    442     if (regexp->fText == NULL) {
    443         // need to fill in the text
    444         UText *inputText = regexp->fMatcher->inputText();
    445         int64_t inputNativeLength = utext_nativeLength(inputText);
    446         if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) {
    447             regexp->fText = inputText->chunkContents;
    448             regexp->fTextLength = (int32_t)inputNativeLength;
    449             regexp->fOwnsText = FALSE; // because the UText owns it
    450         } else {
    451             UErrorCode lengthStatus = U_ZERO_ERROR;
    452             regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error
    453             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1));
    454 
    455             utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status);
    456             regexp->fText = inputChars;
    457             regexp->fOwnsText = TRUE; // should already be set but just in case
    458         }
    459     }
    460 
    461     if (textLength != NULL) {
    462         *textLength = regexp->fTextLength;
    463     }
    464     return regexp->fText;
    465 }
    466 
    467 
    468 //------------------------------------------------------------------------------
    469 //
    470 //    uregex_getUText
    471 //
    472 //------------------------------------------------------------------------------
    473 U_CAPI UText * U_EXPORT2
    474 uregex_getUText(URegularExpression *regexp2,
    475                 UText              *dest,
    476                 UErrorCode         *status)  {
    477     RegularExpression *regexp = (RegularExpression*)regexp2;
    478     if (validateRE(regexp, status, FALSE) == FALSE) {
    479         return dest;
    480     }
    481     return regexp->fMatcher->getInput(dest, *status);
    482 }
    483 
    484 // BEGIN android-added
    485 // Removed this function after Android upgrade to ICU4.8.
    486 //------------------------------------------------------------------------------
    487 //
    488 //    uregex_refreshUText
    489 //
    490 //------------------------------------------------------------------------------
    491 U_CAPI void U_EXPORT2
    492 uregex_refreshUText(URegularExpression *regexp2,
    493                     UText              *text,
    494                     UErrorCode         *status) {
    495     RegularExpression *regexp = (RegularExpression*)regexp2;
    496     if (validateRE(regexp, status, FALSE) == FALSE) {
    497         return;
    498     }
    499     regexp->fMatcher->refreshInputText(text, *status);
    500 }
    501 // END android-added
    502 
    503 //------------------------------------------------------------------------------
    504 //
    505 //    uregex_matches
    506 //
    507 //------------------------------------------------------------------------------
    508 U_CAPI UBool U_EXPORT2
    509 uregex_matches(URegularExpression *regexp2,
    510                int32_t            startIndex,
    511                UErrorCode        *status)  {
    512     return uregex_matches64( regexp2, (int64_t)startIndex, status);
    513 }
    514 
    515 U_CAPI UBool U_EXPORT2
    516 uregex_matches64(URegularExpression *regexp2,
    517                  int64_t            startIndex,
    518                  UErrorCode        *status)  {
    519     RegularExpression *regexp = (RegularExpression*)regexp2;
    520     UBool result = FALSE;
    521     if (validateRE(regexp, status) == FALSE) {
    522         return result;
    523     }
    524     if (startIndex == -1) {
    525         result = regexp->fMatcher->matches(*status);
    526     } else {
    527         result = regexp->fMatcher->matches(startIndex, *status);
    528     }
    529     return result;
    530 }
    531 
    532 
    533 //------------------------------------------------------------------------------
    534 //
    535 //    uregex_lookingAt
    536 //
    537 //------------------------------------------------------------------------------
    538 U_CAPI UBool U_EXPORT2
    539 uregex_lookingAt(URegularExpression *regexp2,
    540                  int32_t             startIndex,
    541                  UErrorCode         *status)  {
    542     return uregex_lookingAt64( regexp2, (int64_t)startIndex, status);
    543 }
    544 
    545 U_CAPI UBool U_EXPORT2
    546 uregex_lookingAt64(URegularExpression *regexp2,
    547                    int64_t             startIndex,
    548                    UErrorCode         *status)  {
    549     RegularExpression *regexp = (RegularExpression*)regexp2;
    550     UBool result = FALSE;
    551     if (validateRE(regexp, status) == FALSE) {
    552         return result;
    553     }
    554     if (startIndex == -1) {
    555         result = regexp->fMatcher->lookingAt(*status);
    556     } else {
    557         result = regexp->fMatcher->lookingAt(startIndex, *status);
    558     }
    559     return result;
    560 }
    561 
    562 
    563 
    564 //------------------------------------------------------------------------------
    565 //
    566 //    uregex_find
    567 //
    568 //------------------------------------------------------------------------------
    569 U_CAPI UBool U_EXPORT2
    570 uregex_find(URegularExpression *regexp2,
    571             int32_t             startIndex,
    572             UErrorCode         *status)  {
    573     return uregex_find64( regexp2, (int64_t)startIndex, status);
    574 }
    575 
    576 U_CAPI UBool U_EXPORT2
    577 uregex_find64(URegularExpression *regexp2,
    578               int64_t             startIndex,
    579               UErrorCode         *status)  {
    580     RegularExpression *regexp = (RegularExpression*)regexp2;
    581     UBool result = FALSE;
    582     if (validateRE(regexp, status) == FALSE) {
    583         return result;
    584     }
    585     if (startIndex == -1) {
    586         regexp->fMatcher->resetPreserveRegion();
    587         result = regexp->fMatcher->find();
    588     } else {
    589         result = regexp->fMatcher->find(startIndex, *status);
    590     }
    591     return result;
    592 }
    593 
    594 
    595 //------------------------------------------------------------------------------
    596 //
    597 //    uregex_findNext
    598 //
    599 //------------------------------------------------------------------------------
    600 U_CAPI UBool U_EXPORT2
    601 uregex_findNext(URegularExpression *regexp2,
    602                 UErrorCode         *status)  {
    603     RegularExpression *regexp = (RegularExpression*)regexp2;
    604     if (validateRE(regexp, status) == FALSE) {
    605         return FALSE;
    606     }
    607     UBool result = regexp->fMatcher->find();
    608     return result;
    609 }
    610 
    611 //------------------------------------------------------------------------------
    612 //
    613 //    uregex_groupCount
    614 //
    615 //------------------------------------------------------------------------------
    616 U_CAPI int32_t U_EXPORT2
    617 uregex_groupCount(URegularExpression *regexp2,
    618                   UErrorCode         *status)  {
    619     RegularExpression *regexp = (RegularExpression*)regexp2;
    620     if (validateRE(regexp, status, FALSE) == FALSE) {
    621         return 0;
    622     }
    623     int32_t  result = regexp->fMatcher->groupCount();
    624     return result;
    625 }
    626 
    627 
    628 //------------------------------------------------------------------------------
    629 //
    630 //    uregex_group
    631 //
    632 //------------------------------------------------------------------------------
    633 U_CAPI int32_t U_EXPORT2
    634 uregex_group(URegularExpression *regexp2,
    635              int32_t             groupNum,
    636              UChar              *dest,
    637              int32_t             destCapacity,
    638              UErrorCode          *status)  {
    639     RegularExpression *regexp = (RegularExpression*)regexp2;
    640     if (validateRE(regexp, status) == FALSE) {
    641         return 0;
    642     }
    643     if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
    644         *status = U_ILLEGAL_ARGUMENT_ERROR;
    645         return 0;
    646     }
    647 
    648     if (destCapacity == 0 || regexp->fText != NULL) {
    649         // If preflighting or if we already have the text as UChars,
    650         // this is a little cheaper than going through uregex_groupUTextDeep()
    651 
    652         //
    653         // Pick up the range of characters from the matcher
    654         //
    655         int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
    656         int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
    657         if (U_FAILURE(*status)) {
    658             return 0;
    659         }
    660 
    661         //
    662         // Trim length based on buffer capacity
    663         //
    664         int32_t fullLength = endIx - startIx;
    665         int32_t copyLength = fullLength;
    666         if (copyLength < destCapacity) {
    667             dest[copyLength] = 0;
    668         } else if (copyLength == destCapacity) {
    669             *status = U_STRING_NOT_TERMINATED_WARNING;
    670         } else {
    671             copyLength = destCapacity;
    672             *status = U_BUFFER_OVERFLOW_ERROR;
    673         }
    674 
    675         //
    676         // Copy capture group to user's buffer
    677         //
    678         if (copyLength > 0) {
    679             u_memcpy(dest, &regexp->fText[startIx], copyLength);
    680         }
    681         return fullLength;
    682     } else {
    683         UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status);
    684         int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status);
    685         utext_close(groupText);
    686         return result;
    687     }
    688 }
    689 
    690 
    691 //------------------------------------------------------------------------------
    692 //
    693 //    uregex_groupUText
    694 //
    695 //------------------------------------------------------------------------------
    696 U_CAPI UText * U_EXPORT2
    697 uregex_groupUText(URegularExpression *regexp2,
    698                   int32_t             groupNum,
    699                   UText              *dest,
    700                   int64_t            *groupLength,
    701                   UErrorCode         *status)  {
    702     RegularExpression *regexp = (RegularExpression*)regexp2;
    703     if (validateRE(regexp, status) == FALSE) {
    704         UErrorCode emptyTextStatus = U_ZERO_ERROR;
    705         return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
    706     }
    707 
    708     return regexp->fMatcher->group(groupNum, dest, *groupLength, *status);
    709 }
    710 
    711 //------------------------------------------------------------------------------
    712 //
    713 //    uregex_groupUTextDeep
    714 //
    715 //------------------------------------------------------------------------------
    716 U_CAPI UText * U_EXPORT2
    717 uregex_groupUTextDeep(URegularExpression *regexp2,
    718                   int32_t             groupNum,
    719                   UText              *dest,
    720                   UErrorCode         *status)  {
    721     RegularExpression *regexp = (RegularExpression*)regexp2;
    722     if (validateRE(regexp, status) == FALSE) {
    723         UErrorCode emptyTextStatus = U_ZERO_ERROR;
    724         return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
    725     }
    726 
    727     if (regexp->fText != NULL) {
    728         //
    729         // Pick up the range of characters from the matcher
    730         // and use our already-extracted characters
    731         //
    732         int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
    733         int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
    734         if (U_FAILURE(*status)) {
    735             UErrorCode emptyTextStatus = U_ZERO_ERROR;
    736             return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
    737         }
    738 
    739         if (dest) {
    740             utext_replace(dest, 0, utext_nativeLength(dest), &regexp->fText[startIx], endIx - startIx, status);
    741         } else {
    742             UText groupText = UTEXT_INITIALIZER;
    743             utext_openUChars(&groupText, &regexp->fText[startIx], endIx - startIx, status);
    744             dest = utext_clone(NULL, &groupText, TRUE, FALSE, status);
    745             utext_close(&groupText);
    746         }
    747 
    748         return dest;
    749     } else {
    750         return regexp->fMatcher->group(groupNum, dest, *status);
    751     }
    752 }
    753 
    754 //------------------------------------------------------------------------------
    755 //
    756 //    uregex_start
    757 //
    758 //------------------------------------------------------------------------------
    759 U_CAPI int32_t U_EXPORT2
    760 uregex_start(URegularExpression *regexp2,
    761              int32_t             groupNum,
    762              UErrorCode          *status)  {
    763     return (int32_t)uregex_start64( regexp2, groupNum, status);
    764 }
    765 
    766 U_CAPI int64_t U_EXPORT2
    767 uregex_start64(URegularExpression *regexp2,
    768                int32_t             groupNum,
    769                UErrorCode          *status)  {
    770     RegularExpression *regexp = (RegularExpression*)regexp2;
    771     if (validateRE(regexp, status) == FALSE) {
    772         return 0;
    773     }
    774     int32_t result = regexp->fMatcher->start(groupNum, *status);
    775     return result;
    776 }
    777 
    778 //------------------------------------------------------------------------------
    779 //
    780 //    uregex_end
    781 //
    782 //------------------------------------------------------------------------------
    783 U_CAPI int32_t U_EXPORT2
    784 uregex_end(URegularExpression   *regexp2,
    785            int32_t               groupNum,
    786            UErrorCode           *status)  {
    787     return (int32_t)uregex_end64( regexp2, groupNum, status);
    788 }
    789 
    790 U_CAPI int64_t U_EXPORT2
    791 uregex_end64(URegularExpression   *regexp2,
    792              int32_t               groupNum,
    793              UErrorCode           *status)  {
    794     RegularExpression *regexp = (RegularExpression*)regexp2;
    795     if (validateRE(regexp, status) == FALSE) {
    796         return 0;
    797     }
    798     int32_t result = regexp->fMatcher->end(groupNum, *status);
    799     return result;
    800 }
    801 
    802 //------------------------------------------------------------------------------
    803 //
    804 //    uregex_reset
    805 //
    806 //------------------------------------------------------------------------------
    807 U_CAPI void U_EXPORT2
    808 uregex_reset(URegularExpression    *regexp2,
    809              int32_t               index,
    810              UErrorCode            *status)  {
    811     uregex_reset64( regexp2, (int64_t)index, status);
    812 }
    813 
    814 U_CAPI void U_EXPORT2
    815 uregex_reset64(URegularExpression    *regexp2,
    816                int64_t               index,
    817                UErrorCode            *status)  {
    818     RegularExpression *regexp = (RegularExpression*)regexp2;
    819     if (validateRE(regexp, status) == FALSE) {
    820         return;
    821     }
    822     regexp->fMatcher->reset(index, *status);
    823 }
    824 
    825 
    826 //------------------------------------------------------------------------------
    827 //
    828 //    uregex_setRegion
    829 //
    830 //------------------------------------------------------------------------------
    831 U_CAPI void U_EXPORT2
    832 uregex_setRegion(URegularExpression   *regexp2,
    833                  int32_t               regionStart,
    834                  int32_t               regionLimit,
    835                  UErrorCode           *status)  {
    836     uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status);
    837 }
    838 
    839 U_CAPI void U_EXPORT2
    840 uregex_setRegion64(URegularExpression   *regexp2,
    841                    int64_t               regionStart,
    842                    int64_t               regionLimit,
    843                    UErrorCode           *status)  {
    844     RegularExpression *regexp = (RegularExpression*)regexp2;
    845     if (validateRE(regexp, status) == FALSE) {
    846         return;
    847     }
    848     regexp->fMatcher->region(regionStart, regionLimit, *status);
    849 }
    850 
    851 
    852 //------------------------------------------------------------------------------
    853 //
    854 //    uregex_setRegionAndStart
    855 //
    856 //------------------------------------------------------------------------------
    857 U_DRAFT void U_EXPORT2
    858 uregex_setRegionAndStart(URegularExpression   *regexp2,
    859                  int64_t               regionStart,
    860                  int64_t               regionLimit,
    861                  int64_t               startIndex,
    862                  UErrorCode           *status)  {
    863     RegularExpression *regexp = (RegularExpression*)regexp2;
    864     if (validateRE(regexp, status) == FALSE) {
    865         return;
    866     }
    867     regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status);
    868 }
    869 
    870 //------------------------------------------------------------------------------
    871 //
    872 //    uregex_regionStart
    873 //
    874 //------------------------------------------------------------------------------
    875 U_CAPI int32_t U_EXPORT2
    876 uregex_regionStart(const  URegularExpression   *regexp2,
    877                           UErrorCode           *status)  {
    878     return (int32_t)uregex_regionStart64(regexp2, status);
    879 }
    880 
    881 U_CAPI int64_t U_EXPORT2
    882 uregex_regionStart64(const  URegularExpression   *regexp2,
    883                             UErrorCode           *status)  {
    884     RegularExpression *regexp = (RegularExpression*)regexp2;
    885     if (validateRE(regexp, status) == FALSE) {
    886         return 0;
    887     }
    888     return regexp->fMatcher->regionStart();
    889 }
    890 
    891 
    892 //------------------------------------------------------------------------------
    893 //
    894 //    uregex_regionEnd
    895 //
    896 //------------------------------------------------------------------------------
    897 U_CAPI int32_t U_EXPORT2
    898 uregex_regionEnd(const  URegularExpression   *regexp2,
    899                         UErrorCode           *status)  {
    900     return (int32_t)uregex_regionEnd64(regexp2, status);
    901 }
    902 
    903 U_CAPI int64_t U_EXPORT2
    904 uregex_regionEnd64(const  URegularExpression   *regexp2,
    905                           UErrorCode           *status)  {
    906     RegularExpression *regexp = (RegularExpression*)regexp2;
    907     if (validateRE(regexp, status) == FALSE) {
    908         return 0;
    909     }
    910     return regexp->fMatcher->regionEnd();
    911 }
    912 
    913 
    914 //------------------------------------------------------------------------------
    915 //
    916 //    uregex_hasTransparentBounds
    917 //
    918 //------------------------------------------------------------------------------
    919 U_CAPI UBool U_EXPORT2
    920 uregex_hasTransparentBounds(const  URegularExpression   *regexp2,
    921                                    UErrorCode           *status)  {
    922     RegularExpression *regexp = (RegularExpression*)regexp2;
    923     if (validateRE(regexp, status) == FALSE) {
    924         return FALSE;
    925     }
    926     return regexp->fMatcher->hasTransparentBounds();
    927 }
    928 
    929 
    930 //------------------------------------------------------------------------------
    931 //
    932 //    uregex_useTransparentBounds
    933 //
    934 //------------------------------------------------------------------------------
    935 U_CAPI void U_EXPORT2
    936 uregex_useTransparentBounds(URegularExpression    *regexp2,
    937                             UBool                  b,
    938                             UErrorCode            *status)  {
    939     RegularExpression *regexp = (RegularExpression*)regexp2;
    940     if (validateRE(regexp, status) == FALSE) {
    941         return;
    942     }
    943     regexp->fMatcher->useTransparentBounds(b);
    944 }
    945 
    946 
    947 //------------------------------------------------------------------------------
    948 //
    949 //    uregex_hasAnchoringBounds
    950 //
    951 //------------------------------------------------------------------------------
    952 U_CAPI UBool U_EXPORT2
    953 uregex_hasAnchoringBounds(const  URegularExpression   *regexp2,
    954                                  UErrorCode           *status)  {
    955     RegularExpression *regexp = (RegularExpression*)regexp2;
    956     if (validateRE(regexp, status) == FALSE) {
    957         return FALSE;
    958     }
    959     return regexp->fMatcher->hasAnchoringBounds();
    960 }
    961 
    962 
    963 //------------------------------------------------------------------------------
    964 //
    965 //    uregex_useAnchoringBounds
    966 //
    967 //------------------------------------------------------------------------------
    968 U_CAPI void U_EXPORT2
    969 uregex_useAnchoringBounds(URegularExpression    *regexp2,
    970                           UBool                  b,
    971                           UErrorCode            *status)  {
    972     RegularExpression *regexp = (RegularExpression*)regexp2;
    973     if (validateRE(regexp, status) == FALSE) {
    974         return;
    975     }
    976     regexp->fMatcher->useAnchoringBounds(b);
    977 }
    978 
    979 
    980 //------------------------------------------------------------------------------
    981 //
    982 //    uregex_hitEnd
    983 //
    984 //------------------------------------------------------------------------------
    985 U_CAPI UBool U_EXPORT2
    986 uregex_hitEnd(const  URegularExpression   *regexp2,
    987                      UErrorCode           *status)  {
    988     RegularExpression *regexp = (RegularExpression*)regexp2;
    989     if (validateRE(regexp, status) == FALSE) {
    990         return FALSE;
    991     }
    992     return regexp->fMatcher->hitEnd();
    993 }
    994 
    995 
    996 //------------------------------------------------------------------------------
    997 //
    998 //    uregex_requireEnd
    999 //
   1000 //------------------------------------------------------------------------------
   1001 U_CAPI UBool U_EXPORT2
   1002 uregex_requireEnd(const  URegularExpression   *regexp2,
   1003                          UErrorCode           *status)  {
   1004     RegularExpression *regexp = (RegularExpression*)regexp2;
   1005     if (validateRE(regexp, status) == FALSE) {
   1006         return FALSE;
   1007     }
   1008     return regexp->fMatcher->requireEnd();
   1009 }
   1010 
   1011 
   1012 //------------------------------------------------------------------------------
   1013 //
   1014 //    uregex_setTimeLimit
   1015 //
   1016 //------------------------------------------------------------------------------
   1017 U_CAPI void U_EXPORT2
   1018 uregex_setTimeLimit(URegularExpression   *regexp2,
   1019                     int32_t               limit,
   1020                     UErrorCode           *status) {
   1021     RegularExpression *regexp = (RegularExpression*)regexp2;
   1022     if (validateRE(regexp, status)) {
   1023         regexp->fMatcher->setTimeLimit(limit, *status);
   1024     }
   1025 }
   1026 
   1027 
   1028 
   1029 //------------------------------------------------------------------------------
   1030 //
   1031 //    uregex_getTimeLimit
   1032 //
   1033 //------------------------------------------------------------------------------
   1034 U_CAPI int32_t U_EXPORT2
   1035 uregex_getTimeLimit(const  URegularExpression   *regexp2,
   1036                            UErrorCode           *status) {
   1037     int32_t retVal = 0;
   1038     RegularExpression *regexp = (RegularExpression*)regexp2;
   1039     if (validateRE(regexp, status)) {
   1040         retVal = regexp->fMatcher->getTimeLimit();
   1041     }
   1042     return retVal;
   1043 }
   1044 
   1045 
   1046 
   1047 //------------------------------------------------------------------------------
   1048 //
   1049 //    uregex_setStackLimit
   1050 //
   1051 //------------------------------------------------------------------------------
   1052 U_CAPI void U_EXPORT2
   1053 uregex_setStackLimit(URegularExpression   *regexp2,
   1054                      int32_t               limit,
   1055                      UErrorCode           *status) {
   1056     RegularExpression *regexp = (RegularExpression*)regexp2;
   1057     if (validateRE(regexp, status)) {
   1058         regexp->fMatcher->setStackLimit(limit, *status);
   1059     }
   1060 }
   1061 
   1062 
   1063 
   1064 //------------------------------------------------------------------------------
   1065 //
   1066 //    uregex_getStackLimit
   1067 //
   1068 //------------------------------------------------------------------------------
   1069 U_CAPI int32_t U_EXPORT2
   1070 uregex_getStackLimit(const  URegularExpression   *regexp2,
   1071                             UErrorCode           *status) {
   1072     int32_t retVal = 0;
   1073     RegularExpression *regexp = (RegularExpression*)regexp2;
   1074     if (validateRE(regexp, status)) {
   1075         retVal = regexp->fMatcher->getStackLimit();
   1076     }
   1077     return retVal;
   1078 }
   1079 
   1080 
   1081 //------------------------------------------------------------------------------
   1082 //
   1083 //    uregex_setMatchCallback
   1084 //
   1085 //------------------------------------------------------------------------------
   1086 U_CAPI void U_EXPORT2
   1087 uregex_setMatchCallback(URegularExpression      *regexp2,
   1088                         URegexMatchCallback     *callback,
   1089                         const void              *context,
   1090                         UErrorCode              *status) {
   1091     RegularExpression *regexp = (RegularExpression*)regexp2;
   1092     if (validateRE(regexp, status)) {
   1093         regexp->fMatcher->setMatchCallback(callback, context, *status);
   1094     }
   1095 }
   1096 
   1097 
   1098 //------------------------------------------------------------------------------
   1099 //
   1100 //    uregex_getMatchCallback
   1101 //
   1102 //------------------------------------------------------------------------------
   1103 U_CAPI void U_EXPORT2
   1104 uregex_getMatchCallback(const URegularExpression    *regexp2,
   1105                         URegexMatchCallback        **callback,
   1106                         const void                 **context,
   1107                         UErrorCode                  *status) {
   1108     RegularExpression *regexp = (RegularExpression*)regexp2;
   1109      if (validateRE(regexp, status)) {
   1110          regexp->fMatcher->getMatchCallback(*callback, *context, *status);
   1111      }
   1112 }
   1113 
   1114 
   1115 //------------------------------------------------------------------------------
   1116 //
   1117 //    uregex_setMatchProgressCallback
   1118 //
   1119 //------------------------------------------------------------------------------
   1120 U_CAPI void U_EXPORT2
   1121 uregex_setFindProgressCallback(URegularExpression              *regexp2,
   1122                                 URegexFindProgressCallback      *callback,
   1123                                 const void                      *context,
   1124                                 UErrorCode                      *status) {
   1125     RegularExpression *regexp = (RegularExpression*)regexp2;
   1126     if (validateRE(regexp, status)) {
   1127         regexp->fMatcher->setFindProgressCallback(callback, context, *status);
   1128     }
   1129 }
   1130 
   1131 
   1132 //------------------------------------------------------------------------------
   1133 //
   1134 //    uregex_getMatchCallback
   1135 //
   1136 //------------------------------------------------------------------------------
   1137 U_CAPI void U_EXPORT2
   1138 uregex_getFindProgressCallback(const URegularExpression          *regexp2,
   1139                                 URegexFindProgressCallback        **callback,
   1140                                 const void                        **context,
   1141                                 UErrorCode                        *status) {
   1142     RegularExpression *regexp = (RegularExpression*)regexp2;
   1143      if (validateRE(regexp, status)) {
   1144          regexp->fMatcher->getFindProgressCallback(*callback, *context, *status);
   1145      }
   1146 }
   1147 
   1148 
   1149 //------------------------------------------------------------------------------
   1150 //
   1151 //    uregex_replaceAll
   1152 //
   1153 //------------------------------------------------------------------------------
   1154 U_CAPI int32_t U_EXPORT2
   1155 uregex_replaceAll(URegularExpression    *regexp2,
   1156                   const UChar           *replacementText,
   1157                   int32_t                replacementLength,
   1158                   UChar                 *destBuf,
   1159                   int32_t                destCapacity,
   1160                   UErrorCode            *status)  {
   1161     RegularExpression *regexp = (RegularExpression*)regexp2;
   1162     if (validateRE(regexp, status) == FALSE) {
   1163         return 0;
   1164     }
   1165     if (replacementText == NULL || replacementLength < -1 ||
   1166         (destBuf == NULL && destCapacity > 0) ||
   1167         destCapacity < 0) {
   1168         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1169         return 0;
   1170     }
   1171 
   1172     int32_t   len = 0;
   1173 
   1174     uregex_reset(regexp2, 0, status);
   1175 
   1176     // Note: Seperate error code variables for findNext() and appendReplacement()
   1177     //       are used so that destination buffer overflow errors
   1178     //       in appendReplacement won't stop findNext() from working.
   1179     //       appendReplacement() and appendTail() special case incoming buffer
   1180     //       overflow errors, continuing to return the correct length.
   1181     UErrorCode  findStatus = *status;
   1182     while (uregex_findNext(regexp2, &findStatus)) {
   1183         len += uregex_appendReplacement(regexp2, replacementText, replacementLength,
   1184                                         &destBuf, &destCapacity, status);
   1185     }
   1186     len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
   1187 
   1188     if (U_FAILURE(findStatus)) {
   1189         // If anything went wrong with the findNext(), make that error trump
   1190         //   whatever may have happened with the append() operations.
   1191         //   Errors in findNext() are not expected.
   1192         *status = findStatus;
   1193     }
   1194 
   1195     return len;
   1196 }
   1197 
   1198 
   1199 //------------------------------------------------------------------------------
   1200 //
   1201 //    uregex_replaceAllUText
   1202 //
   1203 //------------------------------------------------------------------------------
   1204 U_CAPI UText * U_EXPORT2
   1205 uregex_replaceAllUText(URegularExpression    *regexp2,
   1206                        UText                 *replacementText,
   1207                        UText                 *dest,
   1208                        UErrorCode            *status)  {
   1209     RegularExpression *regexp = (RegularExpression*)regexp2;
   1210     if (validateRE(regexp, status) == FALSE) {
   1211         return 0;
   1212     }
   1213     if (replacementText == NULL) {
   1214         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1215         return 0;
   1216     }
   1217 
   1218     dest = regexp->fMatcher->replaceAll(replacementText, dest, *status);
   1219     return dest;
   1220 }
   1221 
   1222 
   1223 //------------------------------------------------------------------------------
   1224 //
   1225 //    uregex_replaceFirst
   1226 //
   1227 //------------------------------------------------------------------------------
   1228 U_CAPI int32_t U_EXPORT2
   1229 uregex_replaceFirst(URegularExpression  *regexp2,
   1230                     const UChar         *replacementText,
   1231                     int32_t              replacementLength,
   1232                     UChar               *destBuf,
   1233                     int32_t              destCapacity,
   1234                     UErrorCode          *status)  {
   1235     RegularExpression *regexp = (RegularExpression*)regexp2;
   1236     if (validateRE(regexp, status) == FALSE) {
   1237         return 0;
   1238     }
   1239     if (replacementText == NULL || replacementLength < -1 ||
   1240         (destBuf == NULL && destCapacity > 0) ||
   1241         destCapacity < 0) {
   1242         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1243         return 0;
   1244     }
   1245 
   1246     int32_t   len = 0;
   1247     UBool     findSucceeded;
   1248     uregex_reset(regexp2, 0, status);
   1249     findSucceeded = uregex_find(regexp2, 0, status);
   1250     if (findSucceeded) {
   1251         len = uregex_appendReplacement(regexp2, replacementText, replacementLength,
   1252                                        &destBuf, &destCapacity, status);
   1253     }
   1254     len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
   1255 
   1256     return len;
   1257 }
   1258 
   1259 
   1260 //------------------------------------------------------------------------------
   1261 //
   1262 //    uregex_replaceFirstUText
   1263 //
   1264 //------------------------------------------------------------------------------
   1265 U_CAPI UText * U_EXPORT2
   1266 uregex_replaceFirstUText(URegularExpression  *regexp2,
   1267                          UText                 *replacementText,
   1268                          UText                 *dest,
   1269                          UErrorCode            *status)  {
   1270     RegularExpression *regexp = (RegularExpression*)regexp2;
   1271     if (validateRE(regexp, status) == FALSE) {
   1272         return 0;
   1273     }
   1274     if (replacementText == NULL) {
   1275         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1276         return 0;
   1277     }
   1278 
   1279     dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status);
   1280     return dest;
   1281 }
   1282 
   1283 
   1284 //------------------------------------------------------------------------------
   1285 //
   1286 //    uregex_appendReplacement
   1287 //
   1288 //------------------------------------------------------------------------------
   1289 
   1290 U_NAMESPACE_BEGIN
   1291 //
   1292 //  Dummy class, because these functions need to be friends of class RegexMatcher,
   1293 //               and stand-alone C functions don't work as friends
   1294 //
   1295 class RegexCImpl {
   1296  public:
   1297    inline static  int32_t appendReplacement(RegularExpression    *regexp,
   1298                       const UChar           *replacementText,
   1299                       int32_t                replacementLength,
   1300                       UChar                **destBuf,
   1301                       int32_t               *destCapacity,
   1302                       UErrorCode            *status);
   1303 
   1304    inline static int32_t appendTail(RegularExpression    *regexp,
   1305         UChar                **destBuf,
   1306         int32_t               *destCapacity,
   1307         UErrorCode            *status);
   1308 
   1309     inline static int32_t split(RegularExpression    *regexp,
   1310         UChar                 *destBuf,
   1311         int32_t                destCapacity,
   1312         int32_t               *requiredCapacity,
   1313         UChar                 *destFields[],
   1314         int32_t                destFieldsCapacity,
   1315         UErrorCode            *status);
   1316 };
   1317 
   1318 U_NAMESPACE_END
   1319 
   1320 
   1321 
   1322 static const UChar BACKSLASH  = 0x5c;
   1323 static const UChar DOLLARSIGN = 0x24;
   1324 
   1325 //
   1326 //  Move a character to an output buffer, with bounds checking on the index.
   1327 //      Index advances even if capacity is exceeded, for preflight size computations.
   1328 //      This little sequence is used a LOT.
   1329 //
   1330 static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
   1331     if (*idx < bufCapacity) {
   1332         buf[*idx] = c;
   1333     }
   1334     (*idx)++;
   1335 }
   1336 
   1337 
   1338 //
   1339 //  appendReplacement, the actual implementation.
   1340 //
   1341 int32_t RegexCImpl::appendReplacement(RegularExpression    *regexp,
   1342                                       const UChar           *replacementText,
   1343                                       int32_t                replacementLength,
   1344                                       UChar                **destBuf,
   1345                                       int32_t               *destCapacity,
   1346                                       UErrorCode            *status)  {
   1347 
   1348     // If we come in with a buffer overflow error, don't suppress the operation.
   1349     //  A series of appendReplacements, appendTail need to correctly preflight
   1350     //  the buffer size when an overflow happens somewhere in the middle.
   1351     UBool pendingBufferOverflow = FALSE;
   1352     if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
   1353         pendingBufferOverflow = TRUE;
   1354         *status = U_ZERO_ERROR;
   1355     }
   1356 
   1357     //
   1358     // Validate all paramters
   1359     //
   1360     if (validateRE(regexp, status) == FALSE) {
   1361         return 0;
   1362     }
   1363     if (replacementText == NULL || replacementLength < -1 ||
   1364         destCapacity == NULL || destBuf == NULL ||
   1365         (*destBuf == NULL && *destCapacity > 0) ||
   1366         *destCapacity < 0) {
   1367         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1368         return 0;
   1369     }
   1370 
   1371     RegexMatcher *m = regexp->fMatcher;
   1372     if (m->fMatch == FALSE) {
   1373         *status = U_REGEX_INVALID_STATE;
   1374         return 0;
   1375     }
   1376 
   1377     UChar    *dest             = *destBuf;
   1378     int32_t   capacity         = *destCapacity;
   1379     int32_t   destIdx          =  0;
   1380     int32_t   i;
   1381 
   1382     // If it wasn't supplied by the caller,  get the length of the replacement text.
   1383     //   TODO:  slightly smarter logic in the copy loop could watch for the NUL on
   1384     //          the fly and avoid this step.
   1385     if (replacementLength == -1) {
   1386         replacementLength = u_strlen(replacementText);
   1387     }
   1388 
   1389     // Copy input string from the end of previous match to start of current match
   1390     if (regexp->fText != NULL) {
   1391         int32_t matchStart;
   1392         int32_t lastMatchEnd;
   1393         if (UTEXT_USES_U16(m->fInputText)) {
   1394             lastMatchEnd = (int32_t)m->fLastMatchEnd;
   1395             matchStart = (int32_t)m->fMatchStart;
   1396         } else {
   1397             // !!!: Would like a better way to do this!
   1398             UErrorCode status = U_ZERO_ERROR;
   1399             lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status);
   1400             status = U_ZERO_ERROR;
   1401             matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status);
   1402         }
   1403         for (i=lastMatchEnd; i<matchStart; i++) {
   1404             appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
   1405         }
   1406     } else {
   1407         UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore
   1408         destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart,
   1409                                  &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), &possibleOverflowError);
   1410     }
   1411 
   1412 
   1413     // scan the replacement text, looking for substitutions ($n) and \escapes.
   1414     int32_t  replIdx = 0;
   1415     while (replIdx < replacementLength) {
   1416         UChar  c = replacementText[replIdx];
   1417         replIdx++;
   1418         if (c != DOLLARSIGN && c != BACKSLASH) {
   1419             // Common case, no substitution, no escaping,
   1420             //  just copy the char to the dest buf.
   1421             appendToBuf(c, &destIdx, dest, capacity);
   1422             continue;
   1423         }
   1424 
   1425         if (c == BACKSLASH) {
   1426             // Backslash Escape.  Copy the following char out without further checks.
   1427             //                    Note:  Surrogate pairs don't need any special handling
   1428             //                           The second half wont be a '$' or a '\', and
   1429             //                           will move to the dest normally on the next
   1430             //                           loop iteration.
   1431             if (replIdx >= replacementLength) {
   1432                 break;
   1433             }
   1434             c = replacementText[replIdx];
   1435 
   1436             if (c==0x55/*U*/ || c==0x75/*u*/) {
   1437                 // We have a \udddd or \Udddddddd escape sequence.
   1438                 UChar32 escapedChar =
   1439                     u_unescapeAt(uregex_ucstr_unescape_charAt,
   1440                        &replIdx,                   // Index is updated by unescapeAt
   1441                        replacementLength,          // Length of replacement text
   1442                        (void *)replacementText);
   1443 
   1444                 if (escapedChar != (UChar32)0xFFFFFFFF) {
   1445                     if (escapedChar <= 0xffff) {
   1446                         appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
   1447                     } else {
   1448                         appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
   1449                         appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
   1450                     }
   1451                     continue;
   1452                 }
   1453                 // Note:  if the \u escape was invalid, just fall through and
   1454                 //        treat it as a plain \<anything> escape.
   1455             }
   1456 
   1457             // Plain backslash escape.  Just put out the escaped character.
   1458             appendToBuf(c, &destIdx, dest, capacity);
   1459 
   1460             replIdx++;
   1461             continue;
   1462         }
   1463 
   1464 
   1465 
   1466         // We've got a $.  Pick up a capture group number if one follows.
   1467         // Consume at most the number of digits necessary for the largest capture
   1468         // number that is valid for this pattern.
   1469 
   1470         int32_t numDigits = 0;
   1471         int32_t groupNum  = 0;
   1472         UChar32 digitC;
   1473         for (;;) {
   1474             if (replIdx >= replacementLength) {
   1475                 break;
   1476             }
   1477             U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
   1478             if (u_isdigit(digitC) == FALSE) {
   1479                 break;
   1480             }
   1481 
   1482             U16_FWD_1(replacementText, replIdx, replacementLength);
   1483             groupNum=groupNum*10 + u_charDigitValue(digitC);
   1484             numDigits++;
   1485             if (numDigits >= m->fPattern->fMaxCaptureDigits) {
   1486                 break;
   1487             }
   1488         }
   1489 
   1490 
   1491         if (numDigits == 0) {
   1492             // The $ didn't introduce a group number at all.
   1493             // Treat it as just part of the substitution text.
   1494             appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
   1495             continue;
   1496         }
   1497 
   1498         // Finally, append the capture group data to the destination.
   1499         destIdx += uregex_group((URegularExpression*)regexp, groupNum, &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
   1500         if (*status == U_BUFFER_OVERFLOW_ERROR) {
   1501             // Ignore buffer overflow when extracting the group.  We need to
   1502             //   continue on to get full size of the untruncated result.  We will
   1503             //   raise our own buffer overflow error at the end.
   1504             *status = U_ZERO_ERROR;
   1505         }
   1506 
   1507         if (U_FAILURE(*status)) {
   1508             // Can fail if group number is out of range.
   1509             break;
   1510         }
   1511 
   1512     }
   1513 
   1514     //
   1515     //  Nul Terminate the dest buffer if possible.
   1516     //  Set the appropriate buffer overflow or not terminated error, if needed.
   1517     //
   1518     if (destIdx < capacity) {
   1519         dest[destIdx] = 0;
   1520     } else if (destIdx == *destCapacity) {
   1521         *status = U_STRING_NOT_TERMINATED_WARNING;
   1522     } else {
   1523         *status = U_BUFFER_OVERFLOW_ERROR;
   1524     }
   1525 
   1526     //
   1527     // Return an updated dest buffer and capacity to the caller.
   1528     //
   1529     if (destIdx > 0 &&  *destCapacity > 0) {
   1530         if (destIdx < capacity) {
   1531             *destBuf      += destIdx;
   1532             *destCapacity -= destIdx;
   1533         } else {
   1534             *destBuf      += capacity;
   1535             *destCapacity =  0;
   1536         }
   1537     }
   1538 
   1539     // If we came in with a buffer overflow, make sure we go out with one also.
   1540     //   (A zero length match right at the end of the previous match could
   1541     //    make this function succeed even though a previous call had overflowed the buf)
   1542     if (pendingBufferOverflow && U_SUCCESS(*status)) {
   1543         *status = U_BUFFER_OVERFLOW_ERROR;
   1544     }
   1545 
   1546     return destIdx;
   1547 }
   1548 
   1549 //
   1550 //   appendReplacement   the actual API function,
   1551 //
   1552 U_CAPI int32_t U_EXPORT2
   1553 uregex_appendReplacement(URegularExpression    *regexp2,
   1554                          const UChar           *replacementText,
   1555                          int32_t                replacementLength,
   1556                          UChar                **destBuf,
   1557                          int32_t               *destCapacity,
   1558                          UErrorCode            *status) {
   1559 
   1560     RegularExpression *regexp = (RegularExpression*)regexp2;
   1561     return RegexCImpl::appendReplacement(
   1562         regexp, replacementText, replacementLength,destBuf, destCapacity, status);
   1563 }
   1564 
   1565 //
   1566 //   uregex_appendReplacementUText...can just use the normal C++ method
   1567 //
   1568 U_CAPI void U_EXPORT2
   1569 uregex_appendReplacementUText(URegularExpression    *regexp2,
   1570                               UText                 *replText,
   1571                               UText                 *dest,
   1572                               UErrorCode            *status)  {
   1573     RegularExpression *regexp = (RegularExpression*)regexp2;
   1574     regexp->fMatcher->appendReplacement(dest, replText, *status);
   1575 }
   1576 
   1577 
   1578 //------------------------------------------------------------------------------
   1579 //
   1580 //    uregex_appendTail
   1581 //
   1582 //------------------------------------------------------------------------------
   1583 int32_t RegexCImpl::appendTail(RegularExpression    *regexp,
   1584                                UChar                **destBuf,
   1585                                int32_t               *destCapacity,
   1586                                UErrorCode            *status)
   1587 {
   1588 
   1589     // If we come in with a buffer overflow error, don't suppress the operation.
   1590     //  A series of appendReplacements, appendTail need to correctly preflight
   1591     //  the buffer size when an overflow happens somewhere in the middle.
   1592     UBool pendingBufferOverflow = FALSE;
   1593     if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
   1594         pendingBufferOverflow = TRUE;
   1595         *status = U_ZERO_ERROR;
   1596     }
   1597 
   1598     if (validateRE(regexp, status) == FALSE) {
   1599         return 0;
   1600     }
   1601 
   1602     if (destCapacity == NULL || destBuf == NULL ||
   1603         (*destBuf == NULL && *destCapacity > 0) ||
   1604         *destCapacity < 0)
   1605     {
   1606         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1607         return 0;
   1608     }
   1609 
   1610     RegexMatcher *m = regexp->fMatcher;
   1611 
   1612     int32_t  destIdx     = 0;
   1613     int32_t  destCap     = *destCapacity;
   1614     UChar    *dest       = *destBuf;
   1615 
   1616     if (regexp->fText != NULL) {
   1617         int32_t srcIdx;
   1618         int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd);
   1619         if (nativeIdx == -1) {
   1620             srcIdx = 0;
   1621         } else if (UTEXT_USES_U16(m->fInputText)) {
   1622             srcIdx = (int32_t)nativeIdx;
   1623         } else {
   1624             UErrorCode status = U_ZERO_ERROR;
   1625             srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status);
   1626         }
   1627 
   1628         for (;;) {
   1629             if (srcIdx == regexp->fTextLength) {
   1630                 break;
   1631             }
   1632             UChar c = regexp->fText[srcIdx];
   1633             if (c == 0 && regexp->fTextLength == -1) {
   1634                 regexp->fTextLength = srcIdx;
   1635                 break;
   1636             }
   1637             if (destIdx < destCap) {
   1638                 dest[destIdx] = c;
   1639             } else {
   1640                 // We've overflowed the dest buffer.
   1641                 //  If the total input string length is known, we can
   1642                 //    compute the total buffer size needed without scanning through the string.
   1643                 if (regexp->fTextLength > 0) {
   1644                     destIdx += (regexp->fTextLength - srcIdx);
   1645                     break;
   1646                 }
   1647             }
   1648             srcIdx++;
   1649             destIdx++;
   1650         }
   1651     } else {
   1652         int64_t  srcIdx;
   1653         if (m->fMatch) {
   1654             // The most recent call to find() succeeded.
   1655             srcIdx = m->fMatchEnd;
   1656         } else {
   1657             // The last call to find() on this matcher failed().
   1658             //   Look back to the end of the last find() that succeeded for src index.
   1659             srcIdx = m->fLastMatchEnd;
   1660             if (srcIdx == -1)  {
   1661                 // There has been no successful match with this matcher.
   1662                 //   We want to copy the whole string.
   1663                 srcIdx = 0;
   1664             }
   1665         }
   1666 
   1667         destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status);
   1668     }
   1669 
   1670     //
   1671     //  NUL terminate the output string, if possible, otherwise issue the
   1672     //   appropriate error or warning.
   1673     //
   1674     if (destIdx < destCap) {
   1675         dest[destIdx] = 0;
   1676     } else  if (destIdx == destCap) {
   1677         *status = U_STRING_NOT_TERMINATED_WARNING;
   1678     } else {
   1679         *status = U_BUFFER_OVERFLOW_ERROR;
   1680     }
   1681 
   1682     //
   1683     // Update the user's buffer ptr and capacity vars to reflect the
   1684     //   amount used.
   1685     //
   1686     if (destIdx < destCap) {
   1687         *destBuf      += destIdx;
   1688         *destCapacity -= destIdx;
   1689     } else {
   1690         *destBuf      += destCap;
   1691         *destCapacity  = 0;
   1692     }
   1693 
   1694     if (pendingBufferOverflow && U_SUCCESS(*status)) {
   1695         *status = U_BUFFER_OVERFLOW_ERROR;
   1696     }
   1697 
   1698     return destIdx;
   1699 }
   1700 
   1701 
   1702 //
   1703 //   appendTail   the actual API function
   1704 //
   1705 U_CAPI int32_t U_EXPORT2
   1706 uregex_appendTail(URegularExpression    *regexp2,
   1707                   UChar                **destBuf,
   1708                   int32_t               *destCapacity,
   1709                   UErrorCode            *status)  {
   1710     RegularExpression *regexp = (RegularExpression*)regexp2;
   1711     return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
   1712 }
   1713 
   1714 
   1715 //
   1716 //   uregex_appendTailUText...can just use the normal C++ method
   1717 //
   1718 U_CAPI UText * U_EXPORT2
   1719 uregex_appendTailUText(URegularExpression    *regexp2,
   1720                        UText                 *dest,
   1721                        UErrorCode            *status)  {
   1722     RegularExpression *regexp = (RegularExpression*)regexp2;
   1723     return regexp->fMatcher->appendTail(dest, *status);
   1724 }
   1725 
   1726 
   //------------------------------------------------------------------------------
   //
   //    copyString     Internal utility (currently compiled out with #if 0).
   //                   Copies a string into an output buffer starting at
   //                   *destIndex, never writing at or beyond destCapacity.
   //                   When the buffer is too small, copying stops but the
   //                   index keeps counting, so the caller still learns the
   //                   size that would have been required.  A terminating NUL
   //                   is stored if there is room for it, and is always
   //                   counted in the updated index.
   //
   //------------------------------------------------------------------------------
   #if 0
   static void copyString(UChar        *destBuffer,    //  Destination buffer.
                          int32_t       destCapacity,  //  Total capacity of dest buffer
                          int32_t      *destIndex,     //  Index into dest buffer.  Updated on return.
                                                       //    Update not clipped to destCapacity.
                          const UChar  *srcPtr,        //  Pointer to source string
                          int32_t       srcLen)        //  Source string len.
   {
       int32_t  writePos = *destIndex;
       int32_t  readPos  = 0;

       // Copy for as long as there is both source left and room in the buffer.
       while (readPos < srcLen && writePos < destCapacity) {
           destBuffer[writePos++] = srcPtr[readPos++];
       }

       // Ran out of room first: account for the part that could not be copied.
       if (readPos < srcLen) {
           writePos += srcLen - readPos;
       }

       // Terminate when possible; the NUL is counted whether or not it was stored.
       if (writePos < destCapacity) {
           destBuffer[writePos] = 0;
       }
       *destIndex = writePos + 1;
   }
   #endif
   1764 
   1765 //------------------------------------------------------------------------------
   1766 //
   1767 //    uregex_split
   1768 //
   1769 //------------------------------------------------------------------------------
   1770 int32_t RegexCImpl::split(RegularExpression     *regexp,
   1771                           UChar                 *destBuf,
   1772                           int32_t                destCapacity,
   1773                           int32_t               *requiredCapacity,
   1774                           UChar                 *destFields[],
   1775                           int32_t                destFieldsCapacity,
   1776                           UErrorCode            *status) {
   1777     //
   1778     // Reset for the input text
   1779     //
   1780     regexp->fMatcher->reset();
   1781     UText *inputText = regexp->fMatcher->fInputText;
   1782     int64_t   nextOutputStringStart = 0;
   1783     int64_t   inputLen = regexp->fMatcher->fInputLength;
   1784     if (inputLen == 0) {
   1785         return 0;
   1786     }
   1787 
   1788     //
   1789     // Loop through the input text, searching for the delimiter pattern
   1790     //
   1791     int32_t   i;             // Index of the field being processed.
   1792     int32_t   destIdx = 0;   // Next available position in destBuf;
   1793     int32_t   numCaptureGroups = regexp->fMatcher->groupCount();
   1794     UErrorCode  tStatus = U_ZERO_ERROR;   // Want to ignore any buffer overflow errors so that the strings are still counted
   1795     for (i=0; ; i++) {
   1796         if (i>=destFieldsCapacity-1) {
   1797             // There are one or zero output strings left.
   1798             // Fill the last output string with whatever is left from the input, then exit the loop.
   1799             //  ( i will be == destFieldsCapacity if we filled the output array while processing
   1800             //    capture groups of the delimiter expression, in which case we will discard the
   1801             //    last capture group saved in favor of the unprocessed remainder of the
   1802             //    input string.)
   1803             if (inputLen > nextOutputStringStart) {
   1804                 if (i != destFieldsCapacity-1) {
   1805                     // No fields are left.  Recycle the last one for holding the trailing part of
   1806                     //   the input string.
   1807                     i = destFieldsCapacity-1;
   1808                     destIdx = (int32_t)(destFields[i] - destFields[0]);
   1809                 }
   1810 
   1811                 destFields[i] = &destBuf[destIdx];
   1812                 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
   1813                                              &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
   1814             }
   1815             break;
   1816         }
   1817 
   1818         if (regexp->fMatcher->find()) {
   1819             // We found another delimiter.  Move everything from where we started looking
   1820             //  up until the start of the delimiter into the next output string.
   1821             destFields[i] = &destBuf[destIdx];
   1822 
   1823             destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart,
   1824                                          &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
   1825             if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
   1826                 tStatus = U_ZERO_ERROR;
   1827             } else {
   1828                 *status = tStatus;
   1829             }
   1830             nextOutputStringStart = regexp->fMatcher->fMatchEnd;
   1831 
   1832             // If the delimiter pattern has capturing parentheses, the captured
   1833             //  text goes out into the next n destination strings.
   1834             int32_t groupNum;
   1835             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
   1836                 // If we've run out of output string slots, bail out.
   1837                 if (i==destFieldsCapacity-1) {
   1838                     break;
   1839                 }
   1840                 i++;
   1841 
   1842                 // Set up to extract the capture group contents into the dest buffer.
   1843                 destFields[i] = &destBuf[destIdx];
   1844                 tStatus = U_ZERO_ERROR;
   1845                 int32_t t = uregex_group((URegularExpression*)regexp, groupNum, destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
   1846                 destIdx += t + 1;    // Record the space used in the output string buffer.
   1847                                      //  +1 for the NUL that terminates the string.
   1848                 if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
   1849                     tStatus = U_ZERO_ERROR;
   1850                 } else {
   1851                     *status = tStatus;
   1852                 }
   1853             }
   1854 
   1855             if (nextOutputStringStart == inputLen) {
   1856                 // The delimiter was at the end of the string.  We're done.
   1857                 break;
   1858             }
   1859 
   1860         }
   1861         else
   1862         {
   1863             // We ran off the end of the input while looking for the next delimiter.
   1864             // All the remaining text goes into the current output string.
   1865             destFields[i] = &destBuf[destIdx];
   1866             destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
   1867                                          &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
   1868             break;
   1869         }
   1870     }
   1871 
   1872     // Zero out any unused portion of the destFields array
   1873     int j;
   1874     for (j=i+1; j<destFieldsCapacity; j++) {
   1875         destFields[j] = NULL;
   1876     }
   1877 
   1878     if (requiredCapacity != NULL) {
   1879         *requiredCapacity = destIdx;
   1880     }
   1881     if (destIdx > destCapacity) {
   1882         *status = U_BUFFER_OVERFLOW_ERROR;
   1883     }
   1884     return i+1;
   1885 }
   1886 
   1887 //
   1888 //   uregex_split   The actual API function
   1889 //
   1890 U_CAPI int32_t U_EXPORT2
   1891 uregex_split(URegularExpression      *regexp2,
   1892              UChar                   *destBuf,
   1893              int32_t                  destCapacity,
   1894              int32_t                 *requiredCapacity,
   1895              UChar                   *destFields[],
   1896              int32_t                  destFieldsCapacity,
   1897              UErrorCode              *status) {
   1898     RegularExpression *regexp = (RegularExpression*)regexp2;
   1899     if (validateRE(regexp, status) == FALSE) {
   1900         return 0;
   1901     }
   1902     if ((destBuf == NULL && destCapacity > 0) ||
   1903         destCapacity < 0 ||
   1904         destFields == NULL ||
   1905         destFieldsCapacity < 1 ) {
   1906         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1907         return 0;
   1908     }
   1909 
   1910     return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status);
   1911 }
   1912 
   1913 
   1914 //
   1915 //   uregex_splitUText...can just use the normal C++ method
   1916 //
   1917 U_CAPI int32_t U_EXPORT2
   1918 uregex_splitUText(URegularExpression    *regexp2,
   1919                   UText                 *destFields[],
   1920                   int32_t                destFieldsCapacity,
   1921                   UErrorCode            *status) {
   1922     RegularExpression *regexp = (RegularExpression*)regexp2;
   1923     return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status);
   1924 }
   1925 
   1926 
   1927 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
   1928 
   1929