Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 2004-2010, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  regex.cpp
      7 */
      8 
      9 #include "unicode/utypes.h"
     10 
     11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     12 
     13 #include "unicode/regex.h"
     14 #include "unicode/uregex.h"
     15 #include "unicode/unistr.h"
     16 #include "unicode/ustring.h"
     17 #include "unicode/uchar.h"
     18 #include "unicode/uobject.h"
     19 #include "umutex.h"
     20 #include "uassert.h"
     21 #include "cmemory.h"
     22 
     23 #include "regextxt.h"
     24 
     25 #include <stdio.h>
     26 
     27 U_NAMESPACE_BEGIN
     28 
     29 #define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)
     30 
     31 struct RegularExpression: public UMemory {
     32 public:
     33     RegularExpression();
     34     ~RegularExpression();
     35     int32_t           fMagic;
     36     RegexPattern     *fPat;
     37     int32_t          *fPatRefCount;
     38     UChar            *fPatString;
     39     int32_t           fPatStringLen;
     40     RegexMatcher     *fMatcher;
     41     const UChar      *fText;         // Text from setText()
     42     int32_t           fTextLength;   // Length provided by user with setText(), which
     43                                      //  may be -1.
     44     UBool             fOwnsText;
     45 };
     46 
// Signature stored in RegularExpression::fMagic by the constructor and
// compared by validateRE() to reject handles that are not live objects.
static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII
     48 
     49 RegularExpression::RegularExpression() {
     50     fMagic        = REXP_MAGIC;
     51     fPat          = NULL;
     52     fPatRefCount  = NULL;
     53     fPatString    = NULL;
     54     fPatStringLen = 0;
     55     fMatcher      = NULL;
     56     fText         = NULL;
     57     fTextLength   = 0;
     58     fOwnsText     = FALSE;
     59 }
     60 
     61 RegularExpression::~RegularExpression() {
     62     delete fMatcher;
     63     fMatcher = NULL;
     64     if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
     65         delete fPat;
     66         uprv_free(fPatString);
     67         uprv_free(fPatRefCount);
     68     }
     69     if (fOwnsText && fText!=NULL) {
     70         uprv_free((void *)fText);
     71     }
     72     fMagic = 0;
     73 }
     74 
     75 U_NAMESPACE_END
     76 
     77 U_NAMESPACE_USE
     78 
     79 //----------------------------------------------------------------------------------------
     80 //
     81 //   validateRE    Do boilerplate style checks on API function parameters.
     82 //                 Return TRUE if they look OK.
     83 //----------------------------------------------------------------------------------------
     84 static UBool validateRE(const RegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) {
     85     if (U_FAILURE(*status)) {
     86         return FALSE;
     87     }
     88     if (re == NULL || re->fMagic != REXP_MAGIC) {
     89         *status = U_ILLEGAL_ARGUMENT_ERROR;
     90         return FALSE;
     91     }
     92     // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
     93     if (requiresText && re->fText == NULL && !re->fOwnsText) {
     94         *status = U_REGEX_INVALID_STATE;
     95         return FALSE;
     96     }
     97     return TRUE;
     98 }
     99 
    100 //----------------------------------------------------------------------------------------
    101 //
    102 //    uregex_open
    103 //
    104 //----------------------------------------------------------------------------------------
    105 U_CAPI URegularExpression *  U_EXPORT2
    106 uregex_open( const  UChar          *pattern,
    107                     int32_t         patternLength,
    108                     uint32_t        flags,
    109                     UParseError    *pe,
    110                     UErrorCode     *status) {
    111 
    112     if (U_FAILURE(*status)) {
    113         return NULL;
    114     }
    115     if (pattern == NULL || patternLength < -1 || patternLength == 0) {
    116         *status = U_ILLEGAL_ARGUMENT_ERROR;
    117         return NULL;
    118     }
    119     int32_t actualPatLen = patternLength;
    120     if (actualPatLen == -1) {
    121         actualPatLen = u_strlen(pattern);
    122     }
    123 
    124     RegularExpression *re     = new RegularExpression;
    125     int32_t            *refC   = (int32_t *)uprv_malloc(sizeof(int32_t));
    126     UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
    127     if (re == NULL || refC == NULL || patBuf == NULL) {
    128         *status = U_MEMORY_ALLOCATION_ERROR;
    129         delete re;
    130         uprv_free(refC);
    131         uprv_free(patBuf);
    132         return NULL;
    133     }
    134     re->fPatRefCount = refC;
    135     *re->fPatRefCount = 1;
    136 
    137     //
    138     // Make a copy of the pattern string, so we can return it later if asked.
    139     //    For compiling the pattern, we will use a UText wrapper around
    140     //    this local copy, to avoid making even more copies.
    141     //
    142     re->fPatString    = patBuf;
    143     re->fPatStringLen = patternLength;
    144     u_memcpy(patBuf, pattern, actualPatLen);
    145     patBuf[actualPatLen] = 0;
    146 
    147     UText patText = UTEXT_INITIALIZER;
    148     utext_openUChars(&patText, patBuf, patternLength, status);
    149 
    150     //
    151     // Compile the pattern
    152     //
    153     if (pe != NULL) {
    154         re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
    155     } else {
    156         re->fPat = RegexPattern::compile(&patText, flags, *status);
    157     }
    158     utext_close(&patText);
    159 
    160     if (U_FAILURE(*status)) {
    161         goto ErrorExit;
    162     }
    163 
    164     //
    165     // Create the matcher object
    166     //
    167     re->fMatcher = re->fPat->matcher(*status);
    168     if (U_SUCCESS(*status)) {
    169         return (URegularExpression*)re;
    170     }
    171 
    172 ErrorExit:
    173     delete re;
    174     return NULL;
    175 
    176 }
    177 
    178 //----------------------------------------------------------------------------------------
    179 //
    180 //    uregex_openUText
    181 //
    182 //----------------------------------------------------------------------------------------
    183 U_CAPI URegularExpression *  U_EXPORT2
    184 uregex_openUText(UText          *pattern,
    185                  uint32_t        flags,
    186                  UParseError    *pe,
    187                  UErrorCode     *status) {
    188 
    189     if (U_FAILURE(*status)) {
    190         return NULL;
    191     }
    192     if (pattern == NULL) {
    193         *status = U_ILLEGAL_ARGUMENT_ERROR;
    194         return NULL;
    195     }
    196 
    197     int64_t patternNativeLength = utext_nativeLength(pattern);
    198 
    199     if (patternNativeLength == 0) {
    200         *status = U_ILLEGAL_ARGUMENT_ERROR;
    201         return NULL;
    202     }
    203 
    204     RegularExpression *re     = new RegularExpression;
    205 
    206     UErrorCode lengthStatus = U_ZERO_ERROR;
    207     int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus);
    208 
    209     int32_t            *refC   = (int32_t *)uprv_malloc(sizeof(int32_t));
    210     UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1));
    211     if (re == NULL || refC == NULL || patBuf == NULL) {
    212         *status = U_MEMORY_ALLOCATION_ERROR;
    213         delete re;
    214         uprv_free(refC);
    215         uprv_free(patBuf);
    216         return NULL;
    217     }
    218     re->fPatRefCount = refC;
    219     *re->fPatRefCount = 1;
    220 
    221     //
    222     // Make a copy of the pattern string, so we can return it later if asked.
    223     //    For compiling the pattern, we will use a read-only UText wrapper
    224     //    around this local copy, to avoid making even more copies.
    225     //
    226     re->fPatString    = patBuf;
    227     re->fPatStringLen = pattern16Length;
    228     utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status);
    229 
    230     UText patText = UTEXT_INITIALIZER;
    231     utext_openUChars(&patText, patBuf, pattern16Length, status);
    232 
    233     //
    234     // Compile the pattern
    235     //
    236     if (pe != NULL) {
    237         re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
    238     } else {
    239         re->fPat = RegexPattern::compile(&patText, flags, *status);
    240     }
    241     utext_close(&patText);
    242 
    243     if (U_FAILURE(*status)) {
    244         goto ErrorExit;
    245     }
    246 
    247     //
    248     // Create the matcher object
    249     //
    250     re->fMatcher = re->fPat->matcher(*status);
    251     if (U_SUCCESS(*status)) {
    252         return (URegularExpression*)re;
    253     }
    254 
    255 ErrorExit:
    256     delete re;
    257     return NULL;
    258 
    259 }
    260 
    261 //----------------------------------------------------------------------------------------
    262 //
    263 //    uregex_close
    264 //
    265 //----------------------------------------------------------------------------------------
    266 U_CAPI void  U_EXPORT2
    267 uregex_close(URegularExpression  *re2) {
    268     RegularExpression *re = (RegularExpression*)re2;
    269     UErrorCode  status = U_ZERO_ERROR;
    270     if (validateRE(re, &status, FALSE) == FALSE) {
    271         return;
    272     }
    273     delete re;
    274 }
    275 
    276 
    277 //----------------------------------------------------------------------------------------
    278 //
    279 //    uregex_clone
    280 //
    281 //----------------------------------------------------------------------------------------
    282 U_CAPI URegularExpression * U_EXPORT2
    283 uregex_clone(const URegularExpression *source2, UErrorCode *status)  {
    284     RegularExpression *source = (RegularExpression*)source2;
    285     if (validateRE(source, status, FALSE) == FALSE) {
    286         return NULL;
    287     }
    288 
    289     RegularExpression *clone = new RegularExpression;
    290     if (clone == NULL) {
    291         *status = U_MEMORY_ALLOCATION_ERROR;
    292         return NULL;
    293     }
    294 
    295     clone->fMatcher = source->fPat->matcher(*status);
    296     if (U_FAILURE(*status)) {
    297         delete clone;
    298         return NULL;
    299     }
    300 
    301     clone->fPat          = source->fPat;
    302     clone->fPatRefCount  = source->fPatRefCount;
    303     clone->fPatString    = source->fPatString;
    304     clone->fPatStringLen = source->fPatStringLen;
    305     umtx_atomic_inc(source->fPatRefCount);
    306     // Note:  fText is not cloned.
    307 
    308     return (URegularExpression*)clone;
    309 }
    310 
    311 
    312 
    313 
    314 //------------------------------------------------------------------------------
    315 //
    316 //    uregex_pattern
    317 //
    318 //------------------------------------------------------------------------------
    319 U_CAPI const UChar * U_EXPORT2
    320 uregex_pattern(const  URegularExpression *regexp2,
    321                       int32_t            *patLength,
    322                       UErrorCode         *status)  {
    323     RegularExpression *regexp = (RegularExpression*)regexp2;
    324 
    325     if (validateRE(regexp, status, FALSE) == FALSE) {
    326         return NULL;
    327     }
    328     if (patLength != NULL) {
    329         *patLength = regexp->fPatStringLen;
    330     }
    331     return regexp->fPatString;
    332 }
    333 
    334 
    335 //------------------------------------------------------------------------------
    336 //
    337 //    uregex_patternUText
    338 //
    339 //------------------------------------------------------------------------------
    340 U_CAPI UText * U_EXPORT2
    341 uregex_patternUText(const URegularExpression *regexp2,
    342                           UErrorCode         *status)  {
    343     RegularExpression *regexp = (RegularExpression*)regexp2;
    344     return regexp->fPat->patternText(*status);
    345 }
    346 
    347 
    348 //------------------------------------------------------------------------------
    349 //
    350 //    uregex_flags
    351 //
    352 //------------------------------------------------------------------------------
    353 U_CAPI int32_t U_EXPORT2
    354 uregex_flags(const URegularExpression *regexp2, UErrorCode *status)  {
    355     RegularExpression *regexp = (RegularExpression*)regexp2;
    356     if (validateRE(regexp, status, FALSE) == FALSE) {
    357         return 0;
    358     }
    359     int32_t flags = regexp->fPat->flags();
    360     return flags;
    361 }
    362 
    363 
    364 //------------------------------------------------------------------------------
    365 //
    366 //    uregex_setText
    367 //
    368 //------------------------------------------------------------------------------
    369 U_CAPI void U_EXPORT2
    370 uregex_setText(URegularExpression *regexp2,
    371                const UChar        *text,
    372                int32_t             textLength,
    373                UErrorCode         *status)  {
    374     RegularExpression *regexp = (RegularExpression*)regexp2;
    375     if (validateRE(regexp, status, FALSE) == FALSE) {
    376         return;
    377     }
    378     if (text == NULL || textLength < -1) {
    379         *status = U_ILLEGAL_ARGUMENT_ERROR;
    380         return;
    381     }
    382 
    383     if (regexp->fOwnsText && regexp->fText != NULL) {
    384         uprv_free((void *)regexp->fText);
    385     }
    386 
    387     regexp->fText       = text;
    388     regexp->fTextLength = textLength;
    389     regexp->fOwnsText   = FALSE;
    390 
    391     UText input = UTEXT_INITIALIZER;
    392     utext_openUChars(&input, text, textLength, status);
    393     regexp->fMatcher->reset(&input);
    394     utext_close(&input); // reset() made a shallow clone, so we don't need this copy
    395 }
    396 
    397 
    398 //------------------------------------------------------------------------------
    399 //
    400 //    uregex_setUText
    401 //
    402 //------------------------------------------------------------------------------
    403 U_CAPI void U_EXPORT2
    404 uregex_setUText(URegularExpression *regexp2,
    405                 UText              *text,
    406                 UErrorCode         *status) {
    407     RegularExpression *regexp = (RegularExpression*)regexp2;
    408     if (validateRE(regexp, status, FALSE) == FALSE) {
    409         return;
    410     }
    411     if (text == NULL) {
    412         *status = U_ILLEGAL_ARGUMENT_ERROR;
    413         return;
    414     }
    415 
    416     if (regexp->fOwnsText && regexp->fText != NULL) {
    417         uprv_free((void *)regexp->fText);
    418     }
    419 
    420     regexp->fText       = NULL; // only fill it in on request
    421     regexp->fTextLength = -1;
    422     regexp->fOwnsText   = TRUE;
    423     regexp->fMatcher->reset(text);
    424 }
    425 
    426 
    427 
    428 //------------------------------------------------------------------------------
    429 //
    430 //    uregex_getText
    431 //
    432 //------------------------------------------------------------------------------
    433 U_CAPI const UChar * U_EXPORT2
    434 uregex_getText(URegularExpression *regexp2,
    435                int32_t            *textLength,
    436                UErrorCode         *status)  {
    437     RegularExpression *regexp = (RegularExpression*)regexp2;
    438     if (validateRE(regexp, status, FALSE) == FALSE) {
    439         return NULL;
    440     }
    441 
    442     if (regexp->fText == NULL) {
    443         // need to fill in the text
    444         UText *inputText = regexp->fMatcher->inputText();
    445         int64_t inputNativeLength = utext_nativeLength(inputText);
    446         if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) {
    447             regexp->fText = inputText->chunkContents;
    448             regexp->fTextLength = (int32_t)inputNativeLength;
    449             regexp->fOwnsText = FALSE; // because the UText owns it
    450         } else {
    451             UErrorCode lengthStatus = U_ZERO_ERROR;
    452             regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error
    453             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1));
    454 
    455             utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status);
    456             regexp->fText = inputChars;
    457             regexp->fOwnsText = TRUE; // should already be set but just in case
    458         }
    459     }
    460 
    461     if (textLength != NULL) {
    462         *textLength = regexp->fTextLength;
    463     }
    464     return regexp->fText;
    465 }
    466 
    467 
    468 //------------------------------------------------------------------------------
    469 //
    470 //    uregex_getUText
    471 //
    472 //------------------------------------------------------------------------------
    473 U_CAPI UText * U_EXPORT2
    474 uregex_getUText(URegularExpression *regexp2,
    475                 UText              *dest,
    476                 UErrorCode         *status)  {
    477     RegularExpression *regexp = (RegularExpression*)regexp2;
    478     if (validateRE(regexp, status, FALSE) == FALSE) {
    479         return dest;
    480     }
    481     return regexp->fMatcher->getInput(dest, *status);
    482 }
    483 
    484 
    485 //------------------------------------------------------------------------------
    486 //
    487 //    uregex_matches
    488 //
    489 //------------------------------------------------------------------------------
    490 U_CAPI UBool U_EXPORT2
    491 uregex_matches(URegularExpression *regexp2,
    492                int32_t            startIndex,
    493                UErrorCode        *status)  {
    494     return uregex_matches64( regexp2, (int64_t)startIndex, status);
    495 }
    496 
    497 U_CAPI UBool U_EXPORT2
    498 uregex_matches64(URegularExpression *regexp2,
    499                  int64_t            startIndex,
    500                  UErrorCode        *status)  {
    501     RegularExpression *regexp = (RegularExpression*)regexp2;
    502     UBool result = FALSE;
    503     if (validateRE(regexp, status) == FALSE) {
    504         return result;
    505     }
    506     if (startIndex == -1) {
    507         result = regexp->fMatcher->matches(*status);
    508     } else {
    509         result = regexp->fMatcher->matches(startIndex, *status);
    510     }
    511     return result;
    512 }
    513 
    514 
    515 //------------------------------------------------------------------------------
    516 //
    517 //    uregex_lookingAt
    518 //
    519 //------------------------------------------------------------------------------
    520 U_CAPI UBool U_EXPORT2
    521 uregex_lookingAt(URegularExpression *regexp2,
    522                  int32_t             startIndex,
    523                  UErrorCode         *status)  {
    524     return uregex_lookingAt64( regexp2, (int64_t)startIndex, status);
    525 }
    526 
    527 U_CAPI UBool U_EXPORT2
    528 uregex_lookingAt64(URegularExpression *regexp2,
    529                    int64_t             startIndex,
    530                    UErrorCode         *status)  {
    531     RegularExpression *regexp = (RegularExpression*)regexp2;
    532     UBool result = FALSE;
    533     if (validateRE(regexp, status) == FALSE) {
    534         return result;
    535     }
    536     if (startIndex == -1) {
    537         result = regexp->fMatcher->lookingAt(*status);
    538     } else {
    539         result = regexp->fMatcher->lookingAt(startIndex, *status);
    540     }
    541     return result;
    542 }
    543 
    544 
    545 
    546 //------------------------------------------------------------------------------
    547 //
    548 //    uregex_find
    549 //
    550 //------------------------------------------------------------------------------
    551 U_CAPI UBool U_EXPORT2
    552 uregex_find(URegularExpression *regexp2,
    553             int32_t             startIndex,
    554             UErrorCode         *status)  {
    555     return uregex_find64( regexp2, (int64_t)startIndex, status);
    556 }
    557 
    558 U_CAPI UBool U_EXPORT2
    559 uregex_find64(URegularExpression *regexp2,
    560               int64_t             startIndex,
    561               UErrorCode         *status)  {
    562     RegularExpression *regexp = (RegularExpression*)regexp2;
    563     UBool result = FALSE;
    564     if (validateRE(regexp, status) == FALSE) {
    565         return result;
    566     }
    567     if (startIndex == -1) {
    568         regexp->fMatcher->resetPreserveRegion();
    569         result = regexp->fMatcher->find();
    570     } else {
    571         result = regexp->fMatcher->find(startIndex, *status);
    572     }
    573     return result;
    574 }
    575 
    576 
    577 //------------------------------------------------------------------------------
    578 //
    579 //    uregex_findNext
    580 //
    581 //------------------------------------------------------------------------------
    582 U_CAPI UBool U_EXPORT2
    583 uregex_findNext(URegularExpression *regexp2,
    584                 UErrorCode         *status)  {
    585     RegularExpression *regexp = (RegularExpression*)regexp2;
    586     if (validateRE(regexp, status) == FALSE) {
    587         return FALSE;
    588     }
    589     UBool result = regexp->fMatcher->find();
    590     return result;
    591 }
    592 
    593 //------------------------------------------------------------------------------
    594 //
    595 //    uregex_groupCount
    596 //
    597 //------------------------------------------------------------------------------
    598 U_CAPI int32_t U_EXPORT2
    599 uregex_groupCount(URegularExpression *regexp2,
    600                   UErrorCode         *status)  {
    601     RegularExpression *regexp = (RegularExpression*)regexp2;
    602     if (validateRE(regexp, status, FALSE) == FALSE) {
    603         return 0;
    604     }
    605     int32_t  result = regexp->fMatcher->groupCount();
    606     return result;
    607 }
    608 
    609 
    610 //------------------------------------------------------------------------------
    611 //
    612 //    uregex_group
    613 //
    614 //------------------------------------------------------------------------------
    615 U_CAPI int32_t U_EXPORT2
    616 uregex_group(URegularExpression *regexp2,
    617              int32_t             groupNum,
    618              UChar              *dest,
    619              int32_t             destCapacity,
    620              UErrorCode          *status)  {
    621     RegularExpression *regexp = (RegularExpression*)regexp2;
    622     if (validateRE(regexp, status) == FALSE) {
    623         return 0;
    624     }
    625     if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
    626         *status = U_ILLEGAL_ARGUMENT_ERROR;
    627         return 0;
    628     }
    629 
    630     if (destCapacity == 0 || regexp->fText != NULL) {
    631         // If preflighting or if we already have the text as UChars,
    632         // this is a little cheaper than going through uregex_groupUTextDeep()
    633 
    634         //
    635         // Pick up the range of characters from the matcher
    636         //
    637         int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
    638         int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
    639         if (U_FAILURE(*status)) {
    640             return 0;
    641         }
    642 
    643         //
    644         // Trim length based on buffer capacity
    645         //
    646         int32_t fullLength = endIx - startIx;
    647         int32_t copyLength = fullLength;
    648         if (copyLength < destCapacity) {
    649             dest[copyLength] = 0;
    650         } else if (copyLength == destCapacity) {
    651             *status = U_STRING_NOT_TERMINATED_WARNING;
    652         } else {
    653             copyLength = destCapacity;
    654             *status = U_BUFFER_OVERFLOW_ERROR;
    655         }
    656 
    657         //
    658         // Copy capture group to user's buffer
    659         //
    660         if (copyLength > 0) {
    661             u_memcpy(dest, &regexp->fText[startIx], copyLength);
    662         }
    663         return fullLength;
    664     } else {
    665         UText *groupText = uregex_groupUTextDeep(regexp2, groupNum, NULL, status);
    666         int32_t result = utext_extract(groupText, 0, utext_nativeLength(groupText), dest, destCapacity, status);
    667         utext_close(groupText);
    668         return result;
    669     }
    670 }
    671 
    672 
    673 //------------------------------------------------------------------------------
    674 //
    675 //    uregex_groupUText
    676 //
    677 //------------------------------------------------------------------------------
    678 U_CAPI UText * U_EXPORT2
    679 uregex_groupUText(URegularExpression *regexp2,
    680                   int32_t             groupNum,
    681                   UText              *dest,
    682                   int64_t            *groupLength,
    683                   UErrorCode         *status)  {
    684     RegularExpression *regexp = (RegularExpression*)regexp2;
    685     if (validateRE(regexp, status) == FALSE) {
    686         UErrorCode emptyTextStatus = U_ZERO_ERROR;
    687         return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
    688     }
    689 
    690     return regexp->fMatcher->group(groupNum, dest, *groupLength, *status);
    691 }
    692 
    693 //------------------------------------------------------------------------------
    694 //
    695 //    uregex_groupUTextDeep
    696 //
    697 //------------------------------------------------------------------------------
    698 U_CAPI UText * U_EXPORT2
    699 uregex_groupUTextDeep(URegularExpression *regexp2,
    700                   int32_t             groupNum,
    701                   UText              *dest,
    702                   UErrorCode         *status)  {
    703     RegularExpression *regexp = (RegularExpression*)regexp2;
    704     if (validateRE(regexp, status) == FALSE) {
    705         UErrorCode emptyTextStatus = U_ZERO_ERROR;
    706         return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
    707     }
    708 
    709     if (regexp->fText != NULL) {
    710         //
    711         // Pick up the range of characters from the matcher
    712         // and use our already-extracted characters
    713         //
    714         int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
    715         int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
    716         if (U_FAILURE(*status)) {
    717             UErrorCode emptyTextStatus = U_ZERO_ERROR;
    718             return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
    719         }
    720 
    721         if (dest) {
    722             utext_replace(dest, 0, utext_nativeLength(dest), &regexp->fText[startIx], endIx - startIx, status);
    723         } else {
    724             UText groupText = UTEXT_INITIALIZER;
    725             utext_openUChars(&groupText, &regexp->fText[startIx], endIx - startIx, status);
    726             dest = utext_clone(NULL, &groupText, TRUE, FALSE, status);
    727             utext_close(&groupText);
    728         }
    729 
    730         return dest;
    731     } else {
    732         return regexp->fMatcher->group(groupNum, dest, *status);
    733     }
    734 }
    735 
    736 //------------------------------------------------------------------------------
    737 //
    738 //    uregex_start
    739 //
    740 //------------------------------------------------------------------------------
    741 U_CAPI int32_t U_EXPORT2
    742 uregex_start(URegularExpression *regexp2,
    743              int32_t             groupNum,
    744              UErrorCode          *status)  {
    745     return (int32_t)uregex_start64( regexp2, groupNum, status);
    746 }
    747 
    748 U_CAPI int64_t U_EXPORT2
    749 uregex_start64(URegularExpression *regexp2,
    750                int32_t             groupNum,
    751                UErrorCode          *status)  {
    752     RegularExpression *regexp = (RegularExpression*)regexp2;
    753     if (validateRE(regexp, status) == FALSE) {
    754         return 0;
    755     }
    756     int32_t result = regexp->fMatcher->start(groupNum, *status);
    757     return result;
    758 }
    759 
    760 //------------------------------------------------------------------------------
    761 //
    762 //    uregex_end
    763 //
    764 //------------------------------------------------------------------------------
    765 U_CAPI int32_t U_EXPORT2
    766 uregex_end(URegularExpression   *regexp2,
    767            int32_t               groupNum,
    768            UErrorCode           *status)  {
    769     return (int32_t)uregex_end64( regexp2, groupNum, status);
    770 }
    771 
    772 U_CAPI int64_t U_EXPORT2
    773 uregex_end64(URegularExpression   *regexp2,
    774              int32_t               groupNum,
    775              UErrorCode           *status)  {
    776     RegularExpression *regexp = (RegularExpression*)regexp2;
    777     if (validateRE(regexp, status) == FALSE) {
    778         return 0;
    779     }
    780     int32_t result = regexp->fMatcher->end(groupNum, *status);
    781     return result;
    782 }
    783 
    784 //------------------------------------------------------------------------------
    785 //
    786 //    uregex_reset
    787 //
    788 //------------------------------------------------------------------------------
    789 U_CAPI void U_EXPORT2
    790 uregex_reset(URegularExpression    *regexp2,
    791              int32_t               index,
    792              UErrorCode            *status)  {
    793     uregex_reset64( regexp2, (int64_t)index, status);
    794 }
    795 
    796 U_CAPI void U_EXPORT2
    797 uregex_reset64(URegularExpression    *regexp2,
    798                int64_t               index,
    799                UErrorCode            *status)  {
    800     RegularExpression *regexp = (RegularExpression*)regexp2;
    801     if (validateRE(regexp, status) == FALSE) {
    802         return;
    803     }
    804     regexp->fMatcher->reset(index, *status);
    805 }
    806 
    807 
    808 //------------------------------------------------------------------------------
    809 //
    810 //    uregex_setRegion
    811 //
    812 //------------------------------------------------------------------------------
    813 U_CAPI void U_EXPORT2
    814 uregex_setRegion(URegularExpression   *regexp2,
    815                  int32_t               regionStart,
    816                  int32_t               regionLimit,
    817                  UErrorCode           *status)  {
    818     uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status);
    819 }
    820 
    821 U_CAPI void U_EXPORT2
    822 uregex_setRegion64(URegularExpression   *regexp2,
    823                    int64_t               regionStart,
    824                    int64_t               regionLimit,
    825                    UErrorCode           *status)  {
    826     RegularExpression *regexp = (RegularExpression*)regexp2;
    827     if (validateRE(regexp, status) == FALSE) {
    828         return;
    829     }
    830     regexp->fMatcher->region(regionStart, regionLimit, *status);
    831 }
    832 
    833 
    834 //------------------------------------------------------------------------------
    835 //
    836 //    uregex_setRegionAndStart
    837 //
    838 //------------------------------------------------------------------------------
    839 U_DRAFT void U_EXPORT2
    840 uregex_setRegionAndStart(URegularExpression   *regexp2,
    841                  int64_t               regionStart,
    842                  int64_t               regionLimit,
    843                  int64_t               startIndex,
    844                  UErrorCode           *status)  {
    845     RegularExpression *regexp = (RegularExpression*)regexp2;
    846     if (validateRE(regexp, status) == FALSE) {
    847         return;
    848     }
    849     regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status);
    850 }
    851 
    852 //------------------------------------------------------------------------------
    853 //
    854 //    uregex_regionStart
    855 //
    856 //------------------------------------------------------------------------------
    857 U_CAPI int32_t U_EXPORT2
    858 uregex_regionStart(const  URegularExpression   *regexp2,
    859                           UErrorCode           *status)  {
    860     return (int32_t)uregex_regionStart64(regexp2, status);
    861 }
    862 
    863 U_CAPI int64_t U_EXPORT2
    864 uregex_regionStart64(const  URegularExpression   *regexp2,
    865                             UErrorCode           *status)  {
    866     RegularExpression *regexp = (RegularExpression*)regexp2;
    867     if (validateRE(regexp, status) == FALSE) {
    868         return 0;
    869     }
    870     return regexp->fMatcher->regionStart();
    871 }
    872 
    873 
    874 //------------------------------------------------------------------------------
    875 //
    876 //    uregex_regionEnd
    877 //
    878 //------------------------------------------------------------------------------
    879 U_CAPI int32_t U_EXPORT2
    880 uregex_regionEnd(const  URegularExpression   *regexp2,
    881                         UErrorCode           *status)  {
    882     return (int32_t)uregex_regionEnd64(regexp2, status);
    883 }
    884 
    885 U_CAPI int64_t U_EXPORT2
    886 uregex_regionEnd64(const  URegularExpression   *regexp2,
    887                           UErrorCode           *status)  {
    888     RegularExpression *regexp = (RegularExpression*)regexp2;
    889     if (validateRE(regexp, status) == FALSE) {
    890         return 0;
    891     }
    892     return regexp->fMatcher->regionEnd();
    893 }
    894 
    895 
    896 //------------------------------------------------------------------------------
    897 //
    898 //    uregex_hasTransparentBounds
    899 //
    900 //------------------------------------------------------------------------------
    901 U_CAPI UBool U_EXPORT2
    902 uregex_hasTransparentBounds(const  URegularExpression   *regexp2,
    903                                    UErrorCode           *status)  {
    904     RegularExpression *regexp = (RegularExpression*)regexp2;
    905     if (validateRE(regexp, status) == FALSE) {
    906         return FALSE;
    907     }
    908     return regexp->fMatcher->hasTransparentBounds();
    909 }
    910 
    911 
    912 //------------------------------------------------------------------------------
    913 //
    914 //    uregex_useTransparentBounds
    915 //
    916 //------------------------------------------------------------------------------
    917 U_CAPI void U_EXPORT2
    918 uregex_useTransparentBounds(URegularExpression    *regexp2,
    919                             UBool                  b,
    920                             UErrorCode            *status)  {
    921     RegularExpression *regexp = (RegularExpression*)regexp2;
    922     if (validateRE(regexp, status) == FALSE) {
    923         return;
    924     }
    925     regexp->fMatcher->useTransparentBounds(b);
    926 }
    927 
    928 
    929 //------------------------------------------------------------------------------
    930 //
    931 //    uregex_hasAnchoringBounds
    932 //
    933 //------------------------------------------------------------------------------
    934 U_CAPI UBool U_EXPORT2
    935 uregex_hasAnchoringBounds(const  URegularExpression   *regexp2,
    936                                  UErrorCode           *status)  {
    937     RegularExpression *regexp = (RegularExpression*)regexp2;
    938     if (validateRE(regexp, status) == FALSE) {
    939         return FALSE;
    940     }
    941     return regexp->fMatcher->hasAnchoringBounds();
    942 }
    943 
    944 
    945 //------------------------------------------------------------------------------
    946 //
    947 //    uregex_useAnchoringBounds
    948 //
    949 //------------------------------------------------------------------------------
    950 U_CAPI void U_EXPORT2
    951 uregex_useAnchoringBounds(URegularExpression    *regexp2,
    952                           UBool                  b,
    953                           UErrorCode            *status)  {
    954     RegularExpression *regexp = (RegularExpression*)regexp2;
    955     if (validateRE(regexp, status) == FALSE) {
    956         return;
    957     }
    958     regexp->fMatcher->useAnchoringBounds(b);
    959 }
    960 
    961 
    962 //------------------------------------------------------------------------------
    963 //
    964 //    uregex_hitEnd
    965 //
    966 //------------------------------------------------------------------------------
    967 U_CAPI UBool U_EXPORT2
    968 uregex_hitEnd(const  URegularExpression   *regexp2,
    969                      UErrorCode           *status)  {
    970     RegularExpression *regexp = (RegularExpression*)regexp2;
    971     if (validateRE(regexp, status) == FALSE) {
    972         return FALSE;
    973     }
    974     return regexp->fMatcher->hitEnd();
    975 }
    976 
    977 
    978 //------------------------------------------------------------------------------
    979 //
    980 //    uregex_requireEnd
    981 //
    982 //------------------------------------------------------------------------------
    983 U_CAPI UBool U_EXPORT2
    984 uregex_requireEnd(const  URegularExpression   *regexp2,
    985                          UErrorCode           *status)  {
    986     RegularExpression *regexp = (RegularExpression*)regexp2;
    987     if (validateRE(regexp, status) == FALSE) {
    988         return FALSE;
    989     }
    990     return regexp->fMatcher->requireEnd();
    991 }
    992 
    993 
    994 //------------------------------------------------------------------------------
    995 //
    996 //    uregex_setTimeLimit
    997 //
    998 //------------------------------------------------------------------------------
    999 U_CAPI void U_EXPORT2
   1000 uregex_setTimeLimit(URegularExpression   *regexp2,
   1001                     int32_t               limit,
   1002                     UErrorCode           *status) {
   1003     RegularExpression *regexp = (RegularExpression*)regexp2;
   1004     if (validateRE(regexp, status)) {
   1005         regexp->fMatcher->setTimeLimit(limit, *status);
   1006     }
   1007 }
   1008 
   1009 
   1010 
   1011 //------------------------------------------------------------------------------
   1012 //
   1013 //    uregex_getTimeLimit
   1014 //
   1015 //------------------------------------------------------------------------------
   1016 U_CAPI int32_t U_EXPORT2
   1017 uregex_getTimeLimit(const  URegularExpression   *regexp2,
   1018                            UErrorCode           *status) {
   1019     int32_t retVal = 0;
   1020     RegularExpression *regexp = (RegularExpression*)regexp2;
   1021     if (validateRE(regexp, status)) {
   1022         retVal = regexp->fMatcher->getTimeLimit();
   1023     }
   1024     return retVal;
   1025 }
   1026 
   1027 
   1028 
   1029 //------------------------------------------------------------------------------
   1030 //
   1031 //    uregex_setStackLimit
   1032 //
   1033 //------------------------------------------------------------------------------
   1034 U_CAPI void U_EXPORT2
   1035 uregex_setStackLimit(URegularExpression   *regexp2,
   1036                      int32_t               limit,
   1037                      UErrorCode           *status) {
   1038     RegularExpression *regexp = (RegularExpression*)regexp2;
   1039     if (validateRE(regexp, status)) {
   1040         regexp->fMatcher->setStackLimit(limit, *status);
   1041     }
   1042 }
   1043 
   1044 
   1045 
   1046 //------------------------------------------------------------------------------
   1047 //
   1048 //    uregex_getStackLimit
   1049 //
   1050 //------------------------------------------------------------------------------
   1051 U_CAPI int32_t U_EXPORT2
   1052 uregex_getStackLimit(const  URegularExpression   *regexp2,
   1053                             UErrorCode           *status) {
   1054     int32_t retVal = 0;
   1055     RegularExpression *regexp = (RegularExpression*)regexp2;
   1056     if (validateRE(regexp, status)) {
   1057         retVal = regexp->fMatcher->getStackLimit();
   1058     }
   1059     return retVal;
   1060 }
   1061 
   1062 
   1063 //------------------------------------------------------------------------------
   1064 //
   1065 //    uregex_setMatchCallback
   1066 //
   1067 //------------------------------------------------------------------------------
   1068 U_CAPI void U_EXPORT2
   1069 uregex_setMatchCallback(URegularExpression      *regexp2,
   1070                         URegexMatchCallback     *callback,
   1071                         const void              *context,
   1072                         UErrorCode              *status) {
   1073     RegularExpression *regexp = (RegularExpression*)regexp2;
   1074     if (validateRE(regexp, status)) {
   1075         regexp->fMatcher->setMatchCallback(callback, context, *status);
   1076     }
   1077 }
   1078 
   1079 
   1080 //------------------------------------------------------------------------------
   1081 //
   1082 //    uregex_getMatchCallback
   1083 //
   1084 //------------------------------------------------------------------------------
   1085 U_CAPI void U_EXPORT2
   1086 uregex_getMatchCallback(const URegularExpression    *regexp2,
   1087                         URegexMatchCallback        **callback,
   1088                         const void                 **context,
   1089                         UErrorCode                  *status) {
   1090     RegularExpression *regexp = (RegularExpression*)regexp2;
   1091      if (validateRE(regexp, status)) {
   1092          regexp->fMatcher->getMatchCallback(*callback, *context, *status);
   1093      }
   1094 }
   1095 
   1096 
   1097 //------------------------------------------------------------------------------
   1098 //
    1099 //    uregex_setFindProgressCallback
   1100 //
   1101 //------------------------------------------------------------------------------
   1102 U_CAPI void U_EXPORT2
   1103 uregex_setFindProgressCallback(URegularExpression              *regexp2,
   1104                                 URegexFindProgressCallback      *callback,
   1105                                 const void                      *context,
   1106                                 UErrorCode                      *status) {
   1107     RegularExpression *regexp = (RegularExpression*)regexp2;
   1108     if (validateRE(regexp, status)) {
   1109         regexp->fMatcher->setFindProgressCallback(callback, context, *status);
   1110     }
   1111 }
   1112 
   1113 
   1114 //------------------------------------------------------------------------------
   1115 //
    1116 //    uregex_getFindProgressCallback
   1117 //
   1118 //------------------------------------------------------------------------------
   1119 U_CAPI void U_EXPORT2
   1120 uregex_getFindProgressCallback(const URegularExpression          *regexp2,
   1121                                 URegexFindProgressCallback        **callback,
   1122                                 const void                        **context,
   1123                                 UErrorCode                        *status) {
   1124     RegularExpression *regexp = (RegularExpression*)regexp2;
   1125      if (validateRE(regexp, status)) {
   1126          regexp->fMatcher->getFindProgressCallback(*callback, *context, *status);
   1127      }
   1128 }
   1129 
   1130 
   1131 //------------------------------------------------------------------------------
   1132 //
   1133 //    uregex_replaceAll
   1134 //
   1135 //------------------------------------------------------------------------------
   1136 U_CAPI int32_t U_EXPORT2
   1137 uregex_replaceAll(URegularExpression    *regexp2,
   1138                   const UChar           *replacementText,
   1139                   int32_t                replacementLength,
   1140                   UChar                 *destBuf,
   1141                   int32_t                destCapacity,
   1142                   UErrorCode            *status)  {
   1143     RegularExpression *regexp = (RegularExpression*)regexp2;
   1144     if (validateRE(regexp, status) == FALSE) {
   1145         return 0;
   1146     }
   1147     if (replacementText == NULL || replacementLength < -1 ||
   1148         (destBuf == NULL && destCapacity > 0) ||
   1149         destCapacity < 0) {
   1150         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1151         return 0;
   1152     }
   1153 
   1154     int32_t   len = 0;
   1155 
   1156     uregex_reset(regexp2, 0, status);
   1157 
   1158     // Note: Seperate error code variables for findNext() and appendReplacement()
   1159     //       are used so that destination buffer overflow errors
   1160     //       in appendReplacement won't stop findNext() from working.
   1161     //       appendReplacement() and appendTail() special case incoming buffer
   1162     //       overflow errors, continuing to return the correct length.
   1163     UErrorCode  findStatus = *status;
   1164     while (uregex_findNext(regexp2, &findStatus)) {
   1165         len += uregex_appendReplacement(regexp2, replacementText, replacementLength,
   1166                                         &destBuf, &destCapacity, status);
   1167     }
   1168     len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
   1169 
   1170     if (U_FAILURE(findStatus)) {
   1171         // If anything went wrong with the findNext(), make that error trump
   1172         //   whatever may have happened with the append() operations.
   1173         //   Errors in findNext() are not expected.
   1174         *status = findStatus;
   1175     }
   1176 
   1177     return len;
   1178 }
   1179 
   1180 
   1181 //------------------------------------------------------------------------------
   1182 //
   1183 //    uregex_replaceAllUText
   1184 //
   1185 //------------------------------------------------------------------------------
   1186 U_CAPI UText * U_EXPORT2
   1187 uregex_replaceAllUText(URegularExpression    *regexp2,
   1188                        UText                 *replacementText,
   1189                        UText                 *dest,
   1190                        UErrorCode            *status)  {
   1191     RegularExpression *regexp = (RegularExpression*)regexp2;
   1192     if (validateRE(regexp, status) == FALSE) {
   1193         return 0;
   1194     }
   1195     if (replacementText == NULL) {
   1196         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1197         return 0;
   1198     }
   1199 
   1200     dest = regexp->fMatcher->replaceAll(replacementText, dest, *status);
   1201     return dest;
   1202 }
   1203 
   1204 
   1205 //------------------------------------------------------------------------------
   1206 //
   1207 //    uregex_replaceFirst
   1208 //
   1209 //------------------------------------------------------------------------------
   1210 U_CAPI int32_t U_EXPORT2
   1211 uregex_replaceFirst(URegularExpression  *regexp2,
   1212                     const UChar         *replacementText,
   1213                     int32_t              replacementLength,
   1214                     UChar               *destBuf,
   1215                     int32_t              destCapacity,
   1216                     UErrorCode          *status)  {
   1217     RegularExpression *regexp = (RegularExpression*)regexp2;
   1218     if (validateRE(regexp, status) == FALSE) {
   1219         return 0;
   1220     }
   1221     if (replacementText == NULL || replacementLength < -1 ||
   1222         (destBuf == NULL && destCapacity > 0) ||
   1223         destCapacity < 0) {
   1224         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1225         return 0;
   1226     }
   1227 
   1228     int32_t   len = 0;
   1229     UBool     findSucceeded;
   1230     uregex_reset(regexp2, 0, status);
   1231     findSucceeded = uregex_find(regexp2, 0, status);
   1232     if (findSucceeded) {
   1233         len = uregex_appendReplacement(regexp2, replacementText, replacementLength,
   1234                                        &destBuf, &destCapacity, status);
   1235     }
   1236     len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
   1237 
   1238     return len;
   1239 }
   1240 
   1241 
   1242 //------------------------------------------------------------------------------
   1243 //
   1244 //    uregex_replaceFirstUText
   1245 //
   1246 //------------------------------------------------------------------------------
   1247 U_CAPI UText * U_EXPORT2
   1248 uregex_replaceFirstUText(URegularExpression  *regexp2,
   1249                          UText                 *replacementText,
   1250                          UText                 *dest,
   1251                          UErrorCode            *status)  {
   1252     RegularExpression *regexp = (RegularExpression*)regexp2;
   1253     if (validateRE(regexp, status) == FALSE) {
   1254         return 0;
   1255     }
   1256     if (replacementText == NULL) {
   1257         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1258         return 0;
   1259     }
   1260 
   1261     dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status);
   1262     return dest;
   1263 }
   1264 
   1265 
   1266 //------------------------------------------------------------------------------
   1267 //
   1268 //    uregex_appendReplacement
   1269 //
   1270 //------------------------------------------------------------------------------
   1271 
   1272 U_NAMESPACE_BEGIN
   1273 //
   1274 //  Dummy class, because these functions need to be friends of class RegexMatcher,
   1275 //               and stand-alone C functions don't work as friends
   1276 //
   1277 class RegexCImpl {
   1278  public:
   1279    inline static  int32_t appendReplacement(RegularExpression    *regexp,
   1280                       const UChar           *replacementText,
   1281                       int32_t                replacementLength,
   1282                       UChar                **destBuf,
   1283                       int32_t               *destCapacity,
   1284                       UErrorCode            *status);
   1285 
   1286    inline static int32_t appendTail(RegularExpression    *regexp,
   1287         UChar                **destBuf,
   1288         int32_t               *destCapacity,
   1289         UErrorCode            *status);
   1290 
   1291     inline static int32_t split(RegularExpression    *regexp,
   1292         UChar                 *destBuf,
   1293         int32_t                destCapacity,
   1294         int32_t               *requiredCapacity,
   1295         UChar                 *destFields[],
   1296         int32_t                destFieldsCapacity,
   1297         UErrorCode            *status);
   1298 };
   1299 
   1300 U_NAMESPACE_END
   1301 
   1302 
   1303 
   1304 static const UChar BACKSLASH  = 0x5c;
   1305 static const UChar DOLLARSIGN = 0x24;
   1306 
   1307 //
   1308 //  Move a character to an output buffer, with bounds checking on the index.
   1309 //      Index advances even if capacity is exceeded, for preflight size computations.
   1310 //      This little sequence is used a LOT.
   1311 //
   1312 static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
   1313     if (*idx < bufCapacity) {
   1314         buf[*idx] = c;
   1315     }
   1316     (*idx)++;
   1317 }
   1318 
   1319 
   1320 //
   1321 //  appendReplacement, the actual implementation.
   1322 //
   1323 int32_t RegexCImpl::appendReplacement(RegularExpression    *regexp,
   1324                                       const UChar           *replacementText,
   1325                                       int32_t                replacementLength,
   1326                                       UChar                **destBuf,
   1327                                       int32_t               *destCapacity,
   1328                                       UErrorCode            *status)  {
   1329 
   1330     // If we come in with a buffer overflow error, don't suppress the operation.
   1331     //  A series of appendReplacements, appendTail need to correctly preflight
   1332     //  the buffer size when an overflow happens somewhere in the middle.
   1333     UBool pendingBufferOverflow = FALSE;
   1334     if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
   1335         pendingBufferOverflow = TRUE;
   1336         *status = U_ZERO_ERROR;
   1337     }
   1338 
   1339     //
   1340     // Validate all paramters
   1341     //
   1342     if (validateRE(regexp, status) == FALSE) {
   1343         return 0;
   1344     }
   1345     if (replacementText == NULL || replacementLength < -1 ||
   1346         destCapacity == NULL || destBuf == NULL ||
   1347         (*destBuf == NULL && *destCapacity > 0) ||
   1348         *destCapacity < 0) {
   1349         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1350         return 0;
   1351     }
   1352 
   1353     RegexMatcher *m = regexp->fMatcher;
   1354     if (m->fMatch == FALSE) {
   1355         *status = U_REGEX_INVALID_STATE;
   1356         return 0;
   1357     }
   1358 
   1359     UChar    *dest             = *destBuf;
   1360     int32_t   capacity         = *destCapacity;
   1361     int32_t   destIdx          =  0;
   1362     int32_t   i;
   1363 
   1364     // If it wasn't supplied by the caller,  get the length of the replacement text.
   1365     //   TODO:  slightly smarter logic in the copy loop could watch for the NUL on
   1366     //          the fly and avoid this step.
   1367     if (replacementLength == -1) {
   1368         replacementLength = u_strlen(replacementText);
   1369     }
   1370 
   1371     // Copy input string from the end of previous match to start of current match
   1372     if (regexp->fText != NULL) {
   1373         int32_t matchStart;
   1374         int32_t lastMatchEnd;
   1375         if (UTEXT_USES_U16(m->fInputText)) {
   1376             lastMatchEnd = (int32_t)m->fLastMatchEnd;
   1377             matchStart = (int32_t)m->fMatchStart;
   1378         } else {
   1379             // !!!: Would like a better way to do this!
   1380             UErrorCode status = U_ZERO_ERROR;
   1381             lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status);
   1382             status = U_ZERO_ERROR;
   1383             matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status);
   1384         }
   1385         for (i=lastMatchEnd; i<matchStart; i++) {
   1386             appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
   1387         }
   1388     } else {
   1389         UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore
   1390         destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart,
   1391                                  &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), &possibleOverflowError);
   1392     }
   1393 
   1394 
   1395     // scan the replacement text, looking for substitutions ($n) and \escapes.
   1396     int32_t  replIdx = 0;
   1397     while (replIdx < replacementLength) {
   1398         UChar  c = replacementText[replIdx];
   1399         replIdx++;
   1400         if (c != DOLLARSIGN && c != BACKSLASH) {
   1401             // Common case, no substitution, no escaping,
   1402             //  just copy the char to the dest buf.
   1403             appendToBuf(c, &destIdx, dest, capacity);
   1404             continue;
   1405         }
   1406 
   1407         if (c == BACKSLASH) {
   1408             // Backslash Escape.  Copy the following char out without further checks.
   1409             //                    Note:  Surrogate pairs don't need any special handling
   1410             //                           The second half wont be a '$' or a '\', and
   1411             //                           will move to the dest normally on the next
   1412             //                           loop iteration.
   1413             if (replIdx >= replacementLength) {
   1414                 break;
   1415             }
   1416             c = replacementText[replIdx];
   1417 
   1418             if (c==0x55/*U*/ || c==0x75/*u*/) {
   1419                 // We have a \udddd or \Udddddddd escape sequence.
   1420                 UChar32 escapedChar =
   1421                     u_unescapeAt(uregex_ucstr_unescape_charAt,
   1422                        &replIdx,                   // Index is updated by unescapeAt
   1423                        replacementLength,          // Length of replacement text
   1424                        (void *)replacementText);
   1425 
   1426                 if (escapedChar != (UChar32)0xFFFFFFFF) {
   1427                     if (escapedChar <= 0xffff) {
   1428                         appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
   1429                     } else {
   1430                         appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
   1431                         appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
   1432                     }
   1433                     continue;
   1434                 }
   1435                 // Note:  if the \u escape was invalid, just fall through and
   1436                 //        treat it as a plain \<anything> escape.
   1437             }
   1438 
   1439             // Plain backslash escape.  Just put out the escaped character.
   1440             appendToBuf(c, &destIdx, dest, capacity);
   1441 
   1442             replIdx++;
   1443             continue;
   1444         }
   1445 
   1446 
   1447 
   1448         // We've got a $.  Pick up a capture group number if one follows.
   1449         // Consume at most the number of digits necessary for the largest capture
   1450         // number that is valid for this pattern.
   1451 
   1452         int32_t numDigits = 0;
   1453         int32_t groupNum  = 0;
   1454         UChar32 digitC;
   1455         for (;;) {
   1456             if (replIdx >= replacementLength) {
   1457                 break;
   1458             }
   1459             U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
   1460             if (u_isdigit(digitC) == FALSE) {
   1461                 break;
   1462             }
   1463 
   1464             U16_FWD_1(replacementText, replIdx, replacementLength);
   1465             groupNum=groupNum*10 + u_charDigitValue(digitC);
   1466             numDigits++;
   1467             if (numDigits >= m->fPattern->fMaxCaptureDigits) {
   1468                 break;
   1469             }
   1470         }
   1471 
   1472 
   1473         if (numDigits == 0) {
   1474             // The $ didn't introduce a group number at all.
   1475             // Treat it as just part of the substitution text.
   1476             appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
   1477             continue;
   1478         }
   1479 
   1480         // Finally, append the capture group data to the destination.
   1481         destIdx += uregex_group((URegularExpression*)regexp, groupNum, &dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
   1482         if (*status == U_BUFFER_OVERFLOW_ERROR) {
   1483             // Ignore buffer overflow when extracting the group.  We need to
   1484             //   continue on to get full size of the untruncated result.  We will
   1485             //   raise our own buffer overflow error at the end.
   1486             *status = U_ZERO_ERROR;
   1487         }
   1488 
   1489         if (U_FAILURE(*status)) {
   1490             // Can fail if group number is out of range.
   1491             break;
   1492         }
   1493 
   1494     }
   1495 
   1496     //
   1497     //  Nul Terminate the dest buffer if possible.
   1498     //  Set the appropriate buffer overflow or not terminated error, if needed.
   1499     //
   1500     if (destIdx < capacity) {
   1501         dest[destIdx] = 0;
   1502     } else if (destIdx == *destCapacity) {
   1503         *status = U_STRING_NOT_TERMINATED_WARNING;
   1504     } else {
   1505         *status = U_BUFFER_OVERFLOW_ERROR;
   1506     }
   1507 
   1508     //
   1509     // Return an updated dest buffer and capacity to the caller.
   1510     //
   1511     if (destIdx > 0 &&  *destCapacity > 0) {
   1512         if (destIdx < capacity) {
   1513             *destBuf      += destIdx;
   1514             *destCapacity -= destIdx;
   1515         } else {
   1516             *destBuf      += capacity;
   1517             *destCapacity =  0;
   1518         }
   1519     }
   1520 
   1521     // If we came in with a buffer overflow, make sure we go out with one also.
   1522     //   (A zero length match right at the end of the previous match could
   1523     //    make this function succeed even though a previous call had overflowed the buf)
   1524     if (pendingBufferOverflow && U_SUCCESS(*status)) {
   1525         *status = U_BUFFER_OVERFLOW_ERROR;
   1526     }
   1527 
   1528     return destIdx;
   1529 }
   1530 
   1531 //
   1532 //   appendReplacement   the actual API function,
   1533 //
   1534 U_CAPI int32_t U_EXPORT2
   1535 uregex_appendReplacement(URegularExpression    *regexp2,
   1536                          const UChar           *replacementText,
   1537                          int32_t                replacementLength,
   1538                          UChar                **destBuf,
   1539                          int32_t               *destCapacity,
   1540                          UErrorCode            *status) {
   1541 
   1542     RegularExpression *regexp = (RegularExpression*)regexp2;
   1543     return RegexCImpl::appendReplacement(
   1544         regexp, replacementText, replacementLength,destBuf, destCapacity, status);
   1545 }
   1546 
   1547 //
   1548 //   uregex_appendReplacementUText...can just use the normal C++ method
   1549 //
   1550 U_CAPI void U_EXPORT2
   1551 uregex_appendReplacementUText(URegularExpression    *regexp2,
   1552                               UText                 *replText,
   1553                               UText                 *dest,
   1554                               UErrorCode            *status)  {
   1555     RegularExpression *regexp = (RegularExpression*)regexp2;
   1556     regexp->fMatcher->appendReplacement(dest, replText, *status);
   1557 }
   1558 
   1559 
   1560 //------------------------------------------------------------------------------
   1561 //
   1562 //    uregex_appendTail
   1563 //
   1564 //------------------------------------------------------------------------------
   1565 int32_t RegexCImpl::appendTail(RegularExpression    *regexp,
   1566                                UChar                **destBuf,
   1567                                int32_t               *destCapacity,
   1568                                UErrorCode            *status)
   1569 {
   1570 
   1571     // If we come in with a buffer overflow error, don't suppress the operation.
   1572     //  A series of appendReplacements, appendTail need to correctly preflight
   1573     //  the buffer size when an overflow happens somewhere in the middle.
   1574     UBool pendingBufferOverflow = FALSE;
   1575     if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
   1576         pendingBufferOverflow = TRUE;
   1577         *status = U_ZERO_ERROR;
   1578     }
   1579 
   1580     if (validateRE(regexp, status) == FALSE) {
   1581         return 0;
   1582     }
   1583 
   1584     if (destCapacity == NULL || destBuf == NULL ||
   1585         (*destBuf == NULL && *destCapacity > 0) ||
   1586         *destCapacity < 0)
   1587     {
   1588         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1589         return 0;
   1590     }
   1591 
   1592     RegexMatcher *m = regexp->fMatcher;
   1593 
   1594     int32_t  destIdx     = 0;
   1595     int32_t  destCap     = *destCapacity;
   1596     UChar    *dest       = *destBuf;
   1597 
   1598     if (regexp->fText != NULL) {
   1599         int32_t srcIdx;
   1600         int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd);
   1601         if (nativeIdx == -1) {
   1602             srcIdx = 0;
   1603         } else if (UTEXT_USES_U16(m->fInputText)) {
   1604             srcIdx = (int32_t)nativeIdx;
   1605         } else {
   1606             UErrorCode status = U_ZERO_ERROR;
   1607             srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status);
   1608         }
   1609 
   1610         for (;;) {
   1611             if (srcIdx == regexp->fTextLength) {
   1612                 break;
   1613             }
   1614             UChar c = regexp->fText[srcIdx];
   1615             if (c == 0 && regexp->fTextLength == -1) {
   1616                 regexp->fTextLength = srcIdx;
   1617                 break;
   1618             }
   1619             if (destIdx < destCap) {
   1620                 dest[destIdx] = c;
   1621             } else {
   1622                 // We've overflowed the dest buffer.
   1623                 //  If the total input string length is known, we can
   1624                 //    compute the total buffer size needed without scanning through the string.
   1625                 if (regexp->fTextLength > 0) {
   1626                     destIdx += (regexp->fTextLength - srcIdx);
   1627                     break;
   1628                 }
   1629             }
   1630             srcIdx++;
   1631             destIdx++;
   1632         }
   1633     } else {
   1634         int64_t  srcIdx;
   1635         if (m->fMatch) {
   1636             // The most recent call to find() succeeded.
   1637             srcIdx = m->fMatchEnd;
   1638         } else {
   1639             // The last call to find() on this matcher failed().
   1640             //   Look back to the end of the last find() that succeeded for src index.
   1641             srcIdx = m->fLastMatchEnd;
   1642             if (srcIdx == -1)  {
   1643                 // There has been no successful match with this matcher.
   1644                 //   We want to copy the whole string.
   1645                 srcIdx = 0;
   1646             }
   1647         }
   1648 
   1649         destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status);
   1650     }
   1651 
   1652     //
   1653     //  NUL terminate the output string, if possible, otherwise issue the
   1654     //   appropriate error or warning.
   1655     //
   1656     if (destIdx < destCap) {
   1657         dest[destIdx] = 0;
   1658     } else  if (destIdx == destCap) {
   1659         *status = U_STRING_NOT_TERMINATED_WARNING;
   1660     } else {
   1661         *status = U_BUFFER_OVERFLOW_ERROR;
   1662     }
   1663 
   1664     //
   1665     // Update the user's buffer ptr and capacity vars to reflect the
   1666     //   amount used.
   1667     //
   1668     if (destIdx < destCap) {
   1669         *destBuf      += destIdx;
   1670         *destCapacity -= destIdx;
   1671     } else {
   1672         *destBuf      += destCap;
   1673         *destCapacity  = 0;
   1674     }
   1675 
   1676     if (pendingBufferOverflow && U_SUCCESS(*status)) {
   1677         *status = U_BUFFER_OVERFLOW_ERROR;
   1678     }
   1679 
   1680     return destIdx;
   1681 }
   1682 
   1683 
   1684 //
   1685 //   appendTail   the actual API function
   1686 //
   1687 U_CAPI int32_t U_EXPORT2
   1688 uregex_appendTail(URegularExpression    *regexp2,
   1689                   UChar                **destBuf,
   1690                   int32_t               *destCapacity,
   1691                   UErrorCode            *status)  {
   1692     RegularExpression *regexp = (RegularExpression*)regexp2;
   1693     return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
   1694 }
   1695 
   1696 
   1697 //
   1698 //   uregex_appendTailUText...can just use the normal C++ method
   1699 //
   1700 U_CAPI UText * U_EXPORT2
   1701 uregex_appendTailUText(URegularExpression    *regexp2,
   1702                        UText                 *dest,
   1703                        UErrorCode            *status)  {
   1704     RegularExpression *regexp = (RegularExpression*)regexp2;
   1705     return regexp->fMatcher->appendTail(dest, *status);
   1706 }
   1707 
   1708 
   1709 //------------------------------------------------------------------------------
   1710 //
   1711 //    copyString     Internal utility to copy a string to an output buffer,
   1712 //                   while managing buffer overflow and preflight size
   1713 //                   computation.  NUL termination is added to destination,
   1714 //                   and the NUL is counted in the output size.
   1715 //
   1716 //------------------------------------------------------------------------------
   1717 #if 0
   1718 static void copyString(UChar        *destBuffer,    //  Destination buffer.
   1719                        int32_t       destCapacity,  //  Total capacity of dest buffer
   1720                        int32_t      *destIndex,     //  Index into dest buffer.  Updated on return.
   1721                                                     //    Update not clipped to destCapacity.
   1722                        const UChar  *srcPtr,        //  Pointer to source string
   1723                        int32_t       srcLen)        //  Source string len.
   1724 {
   1725     int32_t  si;
   1726     int32_t  di = *destIndex;
   1727     UChar    c;
   1728 
   1729     for (si=0; si<srcLen;  si++) {
   1730         c = srcPtr[si];
   1731         if (di < destCapacity) {
   1732             destBuffer[di] = c;
   1733             di++;
   1734         } else {
   1735             di += srcLen - si;
   1736             break;
   1737         }
   1738     }
   1739     if (di<destCapacity) {
   1740         destBuffer[di] = 0;
   1741     }
   1742     di++;
   1743     *destIndex = di;
   1744 }
   1745 #endif
   1746 
   1747 //------------------------------------------------------------------------------
   1748 //
   1749 //    uregex_split
   1750 //
   1751 //------------------------------------------------------------------------------
   1752 int32_t RegexCImpl::split(RegularExpression     *regexp,
   1753                           UChar                 *destBuf,
   1754                           int32_t                destCapacity,
   1755                           int32_t               *requiredCapacity,
   1756                           UChar                 *destFields[],
   1757                           int32_t                destFieldsCapacity,
   1758                           UErrorCode            *status) {
   1759     //
   1760     // Reset for the input text
   1761     //
   1762     regexp->fMatcher->reset();
   1763     UText *inputText = regexp->fMatcher->fInputText;
   1764     int64_t   nextOutputStringStart = 0;
   1765     int64_t   inputLen = regexp->fMatcher->fInputLength;
   1766     if (inputLen == 0) {
   1767         return 0;
   1768     }
   1769 
   1770     //
   1771     // Loop through the input text, searching for the delimiter pattern
   1772     //
   1773     int32_t   i;             // Index of the field being processed.
   1774     int32_t   destIdx = 0;   // Next available position in destBuf;
   1775     int32_t   numCaptureGroups = regexp->fMatcher->groupCount();
   1776     UErrorCode  tStatus = U_ZERO_ERROR;   // Want to ignore any buffer overflow errors so that the strings are still counted
   1777     for (i=0; ; i++) {
   1778         if (i>=destFieldsCapacity-1) {
   1779             // There are one or zero output strings left.
   1780             // Fill the last output string with whatever is left from the input, then exit the loop.
   1781             //  ( i will be == destFieldsCapacity if we filled the output array while processing
   1782             //    capture groups of the delimiter expression, in which case we will discard the
   1783             //    last capture group saved in favor of the unprocessed remainder of the
   1784             //    input string.)
   1785             if (inputLen > nextOutputStringStart) {
   1786                 if (i != destFieldsCapacity-1) {
   1787                     // No fields are left.  Recycle the last one for holding the trailing part of
   1788                     //   the input string.
   1789                     i = destFieldsCapacity-1;
   1790                     destIdx = (int32_t)(destFields[i] - destFields[0]);
   1791                 }
   1792 
   1793                 destFields[i] = &destBuf[destIdx];
   1794                 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
   1795                                              &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
   1796             }
   1797             break;
   1798         }
   1799 
   1800         if (regexp->fMatcher->find()) {
   1801             // We found another delimiter.  Move everything from where we started looking
   1802             //  up until the start of the delimiter into the next output string.
   1803             destFields[i] = &destBuf[destIdx];
   1804 
   1805             destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart,
   1806                                          &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
   1807             if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
   1808                 tStatus = U_ZERO_ERROR;
   1809             } else {
   1810                 *status = tStatus;
   1811             }
   1812             nextOutputStringStart = regexp->fMatcher->fMatchEnd;
   1813 
   1814             // If the delimiter pattern has capturing parentheses, the captured
   1815             //  text goes out into the next n destination strings.
   1816             int32_t groupNum;
   1817             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
   1818                 // If we've run out of output string slots, bail out.
   1819                 if (i==destFieldsCapacity-1) {
   1820                     break;
   1821                 }
   1822                 i++;
   1823 
   1824                 // Set up to extract the capture group contents into the dest buffer.
   1825                 destFields[i] = &destBuf[destIdx];
   1826                 tStatus = U_ZERO_ERROR;
   1827                 int32_t t = uregex_group((URegularExpression*)regexp, groupNum, destFields[i], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
   1828                 destIdx += t + 1;    // Record the space used in the output string buffer.
   1829                                      //  +1 for the NUL that terminates the string.
   1830                 if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
   1831                     tStatus = U_ZERO_ERROR;
   1832                 } else {
   1833                     *status = tStatus;
   1834                 }
   1835             }
   1836 
   1837             if (nextOutputStringStart == inputLen) {
   1838                 // The delimiter was at the end of the string.  We're done.
   1839                 break;
   1840             }
   1841 
   1842         }
   1843         else
   1844         {
   1845             // We ran off the end of the input while looking for the next delimiter.
   1846             // All the remaining text goes into the current output string.
   1847             destFields[i] = &destBuf[destIdx];
   1848             destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
   1849                                          &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
   1850             break;
   1851         }
   1852     }
   1853 
   1854     // Zero out any unused portion of the destFields array
   1855     int j;
   1856     for (j=i+1; j<destFieldsCapacity; j++) {
   1857         destFields[j] = NULL;
   1858     }
   1859 
   1860     if (requiredCapacity != NULL) {
   1861         *requiredCapacity = destIdx;
   1862     }
   1863     if (destIdx > destCapacity) {
   1864         *status = U_BUFFER_OVERFLOW_ERROR;
   1865     }
   1866     return i+1;
   1867 }
   1868 
   1869 //
   1870 //   uregex_split   The actual API function
   1871 //
   1872 U_CAPI int32_t U_EXPORT2
   1873 uregex_split(URegularExpression      *regexp2,
   1874              UChar                   *destBuf,
   1875              int32_t                  destCapacity,
   1876              int32_t                 *requiredCapacity,
   1877              UChar                   *destFields[],
   1878              int32_t                  destFieldsCapacity,
   1879              UErrorCode              *status) {
   1880     RegularExpression *regexp = (RegularExpression*)regexp2;
   1881     if (validateRE(regexp, status) == FALSE) {
   1882         return 0;
   1883     }
   1884     if ((destBuf == NULL && destCapacity > 0) ||
   1885         destCapacity < 0 ||
   1886         destFields == NULL ||
   1887         destFieldsCapacity < 1 ) {
   1888         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1889         return 0;
   1890     }
   1891 
   1892     return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status);
   1893 }
   1894 
   1895 
   1896 //
   1897 //   uregex_splitUText...can just use the normal C++ method
   1898 //
   1899 U_CAPI int32_t U_EXPORT2
   1900 uregex_splitUText(URegularExpression    *regexp2,
   1901                   UText                 *destFields[],
   1902                   int32_t                destFieldsCapacity,
   1903                   UErrorCode            *status) {
   1904     RegularExpression *regexp = (RegularExpression*)regexp2;
   1905     return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status);
   1906 }
   1907 
   1908 
   1909 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
   1910 
   1911