Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 2004-2009, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  regex.cpp
      7 */
      8 
      9 #include "unicode/utypes.h"
     10 
     11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     12 
     13 #include "unicode/regex.h"
     14 #include "unicode/uregex.h"
     15 #include "unicode/unistr.h"
     16 #include "unicode/ustring.h"
     17 #include "unicode/uchar.h"
     18 #include "unicode/uobject.h"
     19 #include "umutex.h"
     20 #include "uassert.h"
     21 #include "cmemory.h"
     22 
     23 U_NAMESPACE_USE
     24 
     25 struct URegularExpression: public UMemory {
     26 public:
     27     URegularExpression();
     28     ~URegularExpression();
     29     int32_t           fMagic;
     30     RegexPattern     *fPat;
     31     int32_t          *fPatRefCount;
     32     UChar            *fPatString;
     33     int32_t           fPatStringLen;
     34     RegexMatcher     *fMatcher;
     35     const UChar      *fText;         // Text from setText()
     36     int32_t           fTextLength;   // Length provided by user with setText(), which
     37                                      //  may be -1.
     38 
     39     UnicodeString     fTextString;   // The setText(text) is wrapped into a UnicodeString.
     40                                      // TODO: regexp engine should not depend on UnicodeString.
     41 };
     42 
// Marker value stored in URegularExpression::fMagic by the constructor and
// compared by validateRE() to reject pointers that are not live handles.
static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII
     44 
     45 URegularExpression::URegularExpression() {
     46     fMagic        = REXP_MAGIC;
     47     fPat          = NULL;
     48     fPatRefCount  = NULL;
     49     fPatString    = NULL;
     50     fPatStringLen = 0;
     51     fMatcher      = NULL;
     52     fText         = NULL;
     53     fTextLength   = 0;
     54 }
     55 
     56 URegularExpression::~URegularExpression() {
     57     delete fMatcher;
     58     fMatcher = NULL;
     59     if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
     60         delete fPat;
     61         uprv_free(fPatString);
     62         uprv_free(fPatRefCount);
     63     }
     64     fMagic = 0;
     65 }
     66 
     67 //----------------------------------------------------------------------------------------
     68 //
     69 //   validateRE    Do boilerplate style checks on API function parameters.
     70 //                 Return TRUE if they look OK.
     71 //----------------------------------------------------------------------------------------
     72 static UBool validateRE(const URegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) {
     73     if (U_FAILURE(*status)) {
     74         return FALSE;
     75     }
     76     if (re == NULL || re->fMagic != REXP_MAGIC) {
     77         *status = U_ILLEGAL_ARGUMENT_ERROR;
     78         return FALSE;
     79     }
     80     if (requiresText && re->fText == NULL) {
     81         *status = U_REGEX_INVALID_STATE;
     82         return FALSE;
     83     }
     84     return TRUE;
     85 }
     86 
     87 //----------------------------------------------------------------------------------------
     88 //
     89 //    uregex_open
     90 //
     91 //----------------------------------------------------------------------------------------
     92 U_CAPI URegularExpression *  U_EXPORT2
     93 uregex_open( const  UChar          *pattern,
     94                     int32_t         patternLength,
     95                     uint32_t        flags,
     96                     UParseError    *pe,
     97                     UErrorCode     *status) {
     98 
     99     if (U_FAILURE(*status)) {
    100         return NULL;
    101     }
    102     if (pattern == NULL || patternLength < -1 || patternLength == 0) {
    103         *status = U_ILLEGAL_ARGUMENT_ERROR;
    104         return NULL;
    105     }
    106     int32_t actualPatLen = patternLength;
    107     if (actualPatLen == -1) {
    108         actualPatLen = u_strlen(pattern);
    109     }
    110 
    111     URegularExpression *re     = new URegularExpression;
    112     int32_t            *refC   = (int32_t *)uprv_malloc(sizeof(int32_t));
    113     UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
    114     if (re == NULL || refC == NULL || patBuf == NULL) {
    115         *status = U_MEMORY_ALLOCATION_ERROR;
    116         delete re;
    117         uprv_free(refC);
    118         uprv_free(patBuf);
    119         return NULL;
    120     }
    121     re->fPatRefCount = refC;
    122     *re->fPatRefCount = 1;
    123 
    124     //
    125     // Make a copy of the pattern string, so we can return it later if asked.
    126     //    For compiling the pattern, we will use a read-only-aliased UnicodeString
    127     //    of this local copy, to avoid making even more copies.
    128     //
    129     re->fPatString    = patBuf;
    130     re->fPatStringLen = patternLength;
    131     u_memcpy(patBuf, pattern, actualPatLen);
    132     patBuf[actualPatLen] = 0;
    133     UnicodeString  patString(patternLength==-1, patBuf, patternLength);
    134 
    135     //
    136     // Compile the pattern
    137     //
    138     if (pe != NULL) {
    139         re->fPat = RegexPattern::compile(patString, flags, *pe, *status);
    140     } else {
    141         re->fPat = RegexPattern::compile(patString, flags, *status);
    142     }
    143     if (U_FAILURE(*status)) {
    144         goto ErrorExit;
    145     }
    146 
    147     //
    148     // Create the matcher object
    149     //
    150     re->fMatcher = re->fPat->matcher(*status);
    151     if (U_SUCCESS(*status)) {
    152         return re;
    153     }
    154 
    155 ErrorExit:
    156     delete re;
    157     return NULL;
    158 
    159 }
    160 
    161 //----------------------------------------------------------------------------------------
    162 //
    163 //    uregex_close
    164 //
    165 //----------------------------------------------------------------------------------------
    166 U_CAPI void  U_EXPORT2
    167 uregex_close(URegularExpression  *re) {
    168     UErrorCode  status = U_ZERO_ERROR;
    169     if (validateRE(re, &status, FALSE) == FALSE) {
    170         return;
    171     }
    172     delete re;
    173 }
    174 
    175 
    176 //----------------------------------------------------------------------------------------
    177 //
    178 //    uregex_clone
    179 //
    180 //----------------------------------------------------------------------------------------
    181 U_CAPI URegularExpression * U_EXPORT2
    182 uregex_clone(const URegularExpression *source, UErrorCode *status)  {
    183     if (validateRE(source, status, FALSE) == FALSE) {
    184         return NULL;
    185     }
    186 
    187     URegularExpression *clone = new URegularExpression;
    188     if (clone == NULL) {
    189         *status = U_MEMORY_ALLOCATION_ERROR;
    190         return NULL;
    191     }
    192 
    193     clone->fMatcher = source->fPat->matcher(*status);
    194     if (U_FAILURE(*status)) {
    195         delete clone;
    196         return NULL;
    197     }
    198 
    199     clone->fPat          = source->fPat;
    200     clone->fPatRefCount  = source->fPatRefCount;
    201     clone->fPatString    = source->fPatString;
    202     clone->fPatStringLen = source->fPatStringLen;
    203     umtx_atomic_inc(source->fPatRefCount);
    204     // Note:  fText is not cloned.
    205 
    206     return clone;
    207 }
    208 
    209 
    210 
    211 
    212 //------------------------------------------------------------------------------
    213 //
    214 //    uregex_pattern
    215 //
    216 //------------------------------------------------------------------------------
    217 U_CAPI const UChar * U_EXPORT2
    218 uregex_pattern(const  URegularExpression *regexp,
    219                int32_t            *patLength,
    220                UErrorCode         *status)  {
    221 
    222     if (validateRE(regexp, status, FALSE) == FALSE) {
    223         return NULL;
    224     }
    225     if (patLength != NULL) {
    226         *patLength = regexp->fPatStringLen;
    227     }
    228     return regexp->fPatString;
    229 }
    230 
    231 
    232 //------------------------------------------------------------------------------
    233 //
    234 //    uregex_flags
    235 //
    236 //------------------------------------------------------------------------------
    237 U_CAPI int32_t U_EXPORT2
    238 uregex_flags(const URegularExpression *regexp, UErrorCode *status)  {
    239     if (validateRE(regexp, status, FALSE) == FALSE) {
    240         return 0;
    241     }
    242     int32_t flags = regexp->fPat->flags();
    243     return flags;
    244 }
    245 
    246 
    247 //------------------------------------------------------------------------------
    248 //
    249 //    uregex_setText
    250 //
    251 //------------------------------------------------------------------------------
    252 U_CAPI void U_EXPORT2
    253 uregex_setText(URegularExpression *regexp,
    254                const UChar        *text,
    255                int32_t             textLength,
    256                UErrorCode         *status)  {
    257     if (validateRE(regexp, status, FALSE) == FALSE) {
    258         return;
    259     }
    260     if (text == NULL || textLength < -1) {
    261         *status = U_ILLEGAL_ARGUMENT_ERROR;
    262         return;
    263     }
    264     regexp->fText       = text;
    265     regexp->fTextLength = textLength;
    266     UBool isTerminated  = (textLength == -1);
    267 
    268     regexp->fTextString.setTo(isTerminated, text, textLength);
    269     regexp->fMatcher->reset(regexp->fTextString);
    270 }
    271 
    272 
    273 
    274 //------------------------------------------------------------------------------
    275 //
    276 //    uregex_getText
    277 //
    278 //------------------------------------------------------------------------------
    279 U_CAPI const UChar * U_EXPORT2
    280 uregex_getText(URegularExpression *regexp,
    281                int32_t            *textLength,
    282                UErrorCode         *status)  {
    283     if (validateRE(regexp, status, FALSE) == FALSE) {
    284         return NULL;
    285     }
    286     if (textLength != NULL) {
    287         *textLength = regexp->fTextLength;
    288     }
    289     return regexp->fText;
    290 }
    291 
    292 
    293 //------------------------------------------------------------------------------
    294 //
    295 //    uregex_matches
    296 //
    297 //------------------------------------------------------------------------------
    298 U_CAPI UBool U_EXPORT2
    299 uregex_matches(URegularExpression *regexp,
    300                 int32_t            startIndex,
    301                 UErrorCode        *status)  {
    302     UBool result = FALSE;
    303     if (validateRE(regexp, status) == FALSE) {
    304         return result;
    305     }
    306     if (startIndex == -1) {
    307         result = regexp->fMatcher->matches(*status);
    308     } else {
    309         result = regexp->fMatcher->matches(startIndex, *status);
    310     }
    311     return result;
    312 }
    313 
    314 
    315 
    316 //------------------------------------------------------------------------------
    317 //
    318 //    uregex_lookingAt
    319 //
    320 //------------------------------------------------------------------------------
    321 U_CAPI UBool U_EXPORT2
    322 uregex_lookingAt(URegularExpression *regexp,
    323                  int32_t             startIndex,
    324                  UErrorCode         *status)  {
    325     UBool result = FALSE;
    326     if (validateRE(regexp, status) == FALSE) {
    327         return result;
    328     }
    329     if (startIndex == -1) {
    330         result = regexp->fMatcher->lookingAt(*status);
    331     } else {
    332         result = regexp->fMatcher->lookingAt(startIndex, *status);
    333     }
    334     return result;
    335 }
    336 
    337 
    338 
    339 //------------------------------------------------------------------------------
    340 //
    341 //    uregex_find
    342 //
    343 //------------------------------------------------------------------------------
    344 U_CAPI UBool U_EXPORT2
    345 uregex_find(URegularExpression *regexp,
    346             int32_t             startIndex,
    347             UErrorCode         *status)  {
    348     UBool result = FALSE;
    349     if (validateRE(regexp, status) == FALSE) {
    350         return result;
    351     }
    352     if (startIndex == -1) {
    353         regexp->fMatcher->resetPreserveRegion();
    354         result = regexp->fMatcher->find();
    355     } else {
    356         result = regexp->fMatcher->find(startIndex, *status);
    357     }
    358     return result;
    359 }
    360 
    361 //------------------------------------------------------------------------------
    362 //
    363 //    uregex_findNext
    364 //
    365 //------------------------------------------------------------------------------
    366 U_CAPI UBool U_EXPORT2
    367 uregex_findNext(URegularExpression *regexp,
    368                 UErrorCode         *status)  {
    369     if (validateRE(regexp, status) == FALSE) {
    370         return FALSE;
    371     }
    372     UBool result = regexp->fMatcher->find();
    373     return result;
    374 }
    375 
    376 //------------------------------------------------------------------------------
    377 //
    378 //    uregex_groupCount
    379 //
    380 //------------------------------------------------------------------------------
    381 U_CAPI int32_t U_EXPORT2
    382 uregex_groupCount(URegularExpression *regexp,
    383                   UErrorCode         *status)  {
    384     if (validateRE(regexp, status, FALSE) == FALSE) {
    385         return 0;
    386     }
    387     int32_t  result = regexp->fMatcher->groupCount();
    388     return result;
    389 }
    390 
    391 
    392 //------------------------------------------------------------------------------
    393 //
    394 //    uregex_group
    395 //
    396 //------------------------------------------------------------------------------
    397 U_CAPI int32_t U_EXPORT2
    398 uregex_group(URegularExpression *regexp,
    399              int32_t             groupNum,
    400              UChar              *dest,
    401              int32_t             destCapacity,
    402              UErrorCode          *status)  {
    403     if (validateRE(regexp, status) == FALSE) {
    404         return 0;
    405     }
    406     if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
    407         *status = U_ILLEGAL_ARGUMENT_ERROR;
    408         return 0;
    409     }
    410 
    411     //
    412     // Pick up the range of characters from the matcher
    413     //
    414     int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
    415     int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
    416     if (U_FAILURE(*status)) {
    417         return 0;
    418     }
    419 
    420     //
    421     // Trim length based on buffer capacity
    422     //
    423     int32_t fullLength = endIx - startIx;
    424     int32_t copyLength = fullLength;
    425     if (copyLength < destCapacity) {
    426         dest[copyLength] = 0;
    427     } else  if (copyLength == destCapacity) {
    428         *status = U_STRING_NOT_TERMINATED_WARNING;
    429     } else {
    430         copyLength = destCapacity;
    431         *status = U_BUFFER_OVERFLOW_ERROR;
    432     }
    433 
    434     //
    435     // Copy capture group to user's buffer
    436     //
    437     if (copyLength > 0) {
    438         u_memcpy(dest, &regexp->fText[startIx], copyLength);
    439     }
    440     return fullLength;
    441 }
    442 
    443 
    444 //------------------------------------------------------------------------------
    445 //
    446 //    uregex_start
    447 //
    448 //------------------------------------------------------------------------------
    449 U_CAPI int32_t U_EXPORT2
    450 uregex_start(URegularExpression *regexp,
    451              int32_t             groupNum,
    452              UErrorCode          *status)  {
    453     if (validateRE(regexp, status) == FALSE) {
    454         return 0;
    455     }
    456     int32_t result = regexp->fMatcher->start(groupNum, *status);
    457     return result;
    458 }
    459 
    460 
    461 //------------------------------------------------------------------------------
    462 //
    463 //    uregex_end
    464 //
    465 //------------------------------------------------------------------------------
    466 U_CAPI int32_t U_EXPORT2
    467 uregex_end(URegularExpression   *regexp,
    468            int32_t               groupNum,
    469            UErrorCode           *status)  {
    470     if (validateRE(regexp, status) == FALSE) {
    471         return 0;
    472     }
    473     int32_t result = regexp->fMatcher->end(groupNum, *status);
    474     return result;
    475 }
    476 
    477 //------------------------------------------------------------------------------
    478 //
    479 //    uregex_reset
    480 //
    481 //------------------------------------------------------------------------------
    482 U_CAPI void U_EXPORT2
    483 uregex_reset(URegularExpression    *regexp,
    484              int32_t               index,
    485              UErrorCode            *status)  {
    486     if (validateRE(regexp, status) == FALSE) {
    487         return;
    488     }
    489     regexp->fMatcher->reset(index, *status);
    490 }
    491 
    492 
    493 //------------------------------------------------------------------------------
    494 //
    495 //    uregex_setRegion
    496 //
    497 //------------------------------------------------------------------------------
    498 U_CAPI void U_EXPORT2
    499 uregex_setRegion(URegularExpression   *regexp,
    500                  int32_t               regionStart,
    501                  int32_t               regionLimit,
    502                  UErrorCode           *status)  {
    503     if (validateRE(regexp, status) == FALSE) {
    504         return;
    505     }
    506     regexp->fMatcher->region(regionStart, regionLimit, *status);
    507 }
    508 
    509 
    510 //------------------------------------------------------------------------------
    511 //
    512 //    uregex_regionStart
    513 //
    514 //------------------------------------------------------------------------------
    515 U_CAPI int32_t U_EXPORT2
    516 uregex_regionStart(const  URegularExpression   *regexp,
    517                           UErrorCode           *status)  {
    518     if (validateRE(regexp, status) == FALSE) {
    519         return 0;
    520     }
    521     return regexp->fMatcher->regionStart();
    522 }
    523 
    524 
    525 //------------------------------------------------------------------------------
    526 //
    527 //    uregex_regionEnd
    528 //
    529 //------------------------------------------------------------------------------
    530 U_CAPI int32_t U_EXPORT2
    531 uregex_regionEnd(const  URegularExpression   *regexp,
    532                         UErrorCode           *status)  {
    533     if (validateRE(regexp, status) == FALSE) {
    534         return 0;
    535     }
    536     return regexp->fMatcher->regionEnd();
    537 }
    538 
    539 
    540 //------------------------------------------------------------------------------
    541 //
    542 //    uregex_hasTransparentBounds
    543 //
    544 //------------------------------------------------------------------------------
    545 U_CAPI UBool U_EXPORT2
    546 uregex_hasTransparentBounds(const  URegularExpression   *regexp,
    547                                    UErrorCode           *status)  {
    548     if (validateRE(regexp, status) == FALSE) {
    549         return FALSE;
    550     }
    551     return regexp->fMatcher->hasTransparentBounds();
    552 }
    553 
    554 
    555 //------------------------------------------------------------------------------
    556 //
    557 //    uregex_useTransparentBounds
    558 //
    559 //------------------------------------------------------------------------------
    560 U_CAPI void U_EXPORT2
    561 uregex_useTransparentBounds(URegularExpression    *regexp,
    562              UBool                 b,
    563              UErrorCode            *status)  {
    564     if (validateRE(regexp, status) == FALSE) {
    565         return;
    566     }
    567     regexp->fMatcher->useTransparentBounds(b);
    568 }
    569 
    570 
    571 //------------------------------------------------------------------------------
    572 //
    573 //    uregex_hasAnchoringBounds
    574 //
    575 //------------------------------------------------------------------------------
    576 U_CAPI UBool U_EXPORT2
    577 uregex_hasAnchoringBounds(const  URegularExpression   *regexp,
    578                                    UErrorCode           *status)  {
    579     if (validateRE(regexp, status) == FALSE) {
    580         return FALSE;
    581     }
    582     return regexp->fMatcher->hasAnchoringBounds();
    583 }
    584 
    585 
    586 //------------------------------------------------------------------------------
    587 //
    588 //    uregex_useAnchoringBounds
    589 //
    590 //------------------------------------------------------------------------------
    591 U_CAPI void U_EXPORT2
    592 uregex_useAnchoringBounds(URegularExpression    *regexp,
    593              UBool                 b,
    594              UErrorCode            *status)  {
    595     if (validateRE(regexp, status) == FALSE) {
    596         return;
    597     }
    598     regexp->fMatcher->useAnchoringBounds(b);
    599 }
    600 
    601 
    602 //------------------------------------------------------------------------------
    603 //
    604 //    uregex_hitEnd
    605 //
    606 //------------------------------------------------------------------------------
    607 U_CAPI UBool U_EXPORT2
    608 uregex_hitEnd(const  URegularExpression   *regexp,
    609                      UErrorCode           *status)  {
    610     if (validateRE(regexp, status) == FALSE) {
    611         return FALSE;
    612     }
    613     return regexp->fMatcher->hitEnd();
    614 }
    615 
    616 
    617 //------------------------------------------------------------------------------
    618 //
    619 //    uregex_requireEnd
    620 //
    621 //------------------------------------------------------------------------------
    622 U_CAPI UBool U_EXPORT2
    623 uregex_requireEnd(const  URegularExpression   *regexp,
    624                          UErrorCode           *status)  {
    625     if (validateRE(regexp, status) == FALSE) {
    626         return FALSE;
    627     }
    628     return regexp->fMatcher->requireEnd();
    629 }
    630 
    631 
    632 //------------------------------------------------------------------------------
    633 //
    634 //    uregex_setTimeLimit
    635 //
    636 //------------------------------------------------------------------------------
    637 U_CAPI void U_EXPORT2
    638 uregex_setTimeLimit(URegularExpression   *regexp,
    639                     int32_t               limit,
    640                     UErrorCode           *status) {
    641     if (validateRE(regexp, status)) {
    642         regexp->fMatcher->setTimeLimit(limit, *status);
    643     }
    644 }
    645 
    646 
    647 
    648 //------------------------------------------------------------------------------
    649 //
    650 //    uregex_getTimeLimit
    651 //
    652 //------------------------------------------------------------------------------
    653 U_CAPI int32_t U_EXPORT2
    654 uregex_getTimeLimit(const  URegularExpression   *regexp,
    655                            UErrorCode           *status) {
    656     int32_t retVal = 0;
    657     if (validateRE(regexp, status)) {
    658         retVal = regexp->fMatcher->getTimeLimit();
    659     }
    660     return retVal;
    661 }
    662 
    663 
    664 
    665 //------------------------------------------------------------------------------
    666 //
    667 //    uregex_setStackLimit
    668 //
    669 //------------------------------------------------------------------------------
    670 U_CAPI void U_EXPORT2
    671 uregex_setStackLimit(URegularExpression   *regexp,
    672                     int32_t               limit,
    673                     UErrorCode           *status) {
    674     if (validateRE(regexp, status)) {
    675         regexp->fMatcher->setStackLimit(limit, *status);
    676     }
    677 }
    678 
    679 
    680 
    681 //------------------------------------------------------------------------------
    682 //
    683 //    uregex_getStackLimit
    684 //
    685 //------------------------------------------------------------------------------
    686 U_CAPI int32_t U_EXPORT2
    687 uregex_getStackLimit(const  URegularExpression   *regexp,
    688                            UErrorCode           *status) {
    689     int32_t retVal = 0;
    690     if (validateRE(regexp, status)) {
    691         retVal = regexp->fMatcher->getStackLimit();
    692     }
    693     return retVal;
    694 }
    695 
    696 
    697 //------------------------------------------------------------------------------
    698 //
    699 //    uregex_setMatchCallback
    700 //
    701 //------------------------------------------------------------------------------
    702 U_CAPI void U_EXPORT2
    703 uregex_setMatchCallback(URegularExpression      *regexp,
    704                         URegexMatchCallback     *callback,
    705                         const void              *context,
    706                         UErrorCode              *status) {
    707     if (validateRE(regexp, status)) {
    708       regexp->fMatcher->setMatchCallback(callback, context, *status);
    709     }
    710 }
    711 
    712 
    713 //------------------------------------------------------------------------------
    714 //
    715 //    uregex_getMatchCallback
    716 //
    717 //------------------------------------------------------------------------------
    718 U_CAPI void U_EXPORT2
    719 uregex_getMatchCallback(const URegularExpression    *regexp,
    720                         URegexMatchCallback        **callback,
    721                         const void                 **context,
    722                         UErrorCode                  *status) {
    723      if (validateRE(regexp, status)) {
    724          regexp->fMatcher->getMatchCallback(*callback, *context, *status);
    725      }
    726 }
    727 
    728 
    729 //------------------------------------------------------------------------------
    730 //
    731 //    uregex_replaceAll
    732 //
    733 //------------------------------------------------------------------------------
    734 U_CAPI int32_t U_EXPORT2
    735 uregex_replaceAll(URegularExpression    *regexp,
    736                   const UChar           *replacementText,
    737                   int32_t                replacementLength,
    738                   UChar                 *destBuf,
    739                   int32_t                destCapacity,
    740                   UErrorCode            *status)  {
    741     if (validateRE(regexp, status) == FALSE) {
    742         return 0;
    743     }
    744     if (replacementText == NULL || replacementLength < -1 ||
    745         destBuf == NULL && destCapacity > 0 ||
    746         destCapacity < 0) {
    747         *status = U_ILLEGAL_ARGUMENT_ERROR;
    748         return 0;
    749     }
    750 
    751     int32_t   len = 0;
    752 
    753     uregex_reset(regexp, 0, status);
    754 
    755     // Note: Seperate error code variables for findNext() and appendReplacement()
    756     //       are used so that destination buffer overflow errors
    757     //       in appendReplacement won't stop findNext() from working.
    758     //       appendReplacement() and appendTail() special case incoming buffer
    759     //       overflow errors, continuing to return the correct length.
    760     UErrorCode  findStatus = *status;
    761     while (uregex_findNext(regexp, &findStatus)) {
    762         len += uregex_appendReplacement(regexp, replacementText, replacementLength,
    763                                         &destBuf, &destCapacity, status);
    764     }
    765     len += uregex_appendTail(regexp, &destBuf, &destCapacity, status);
    766 
    767     if (U_FAILURE(findStatus)) {
    768         // If anything went wrong with the findNext(), make that error trump
    769         //   whatever may have happened with the append() operations.
    770         //   Errors in findNext() are not expected.
    771         *status = findStatus;
    772     }
    773 
    774     return len;
    775 }
    776 
    777 
    778 //------------------------------------------------------------------------------
    779 //
    780 //    uregex_replaceFirst
    781 //
    782 //------------------------------------------------------------------------------
    783 U_CAPI int32_t U_EXPORT2
    784 uregex_replaceFirst(URegularExpression  *regexp,
    785                     const UChar         *replacementText,
    786                     int32_t              replacementLength,
    787                     UChar               *destBuf,
    788                     int32_t              destCapacity,
    789                     UErrorCode          *status)  {
    790     if (validateRE(regexp, status) == FALSE) {
    791         return 0;
    792     }
    793     if (replacementText == NULL || replacementLength < -1 ||
    794         destBuf == NULL && destCapacity > 0 ||
    795         destCapacity < 0) {
    796         *status = U_ILLEGAL_ARGUMENT_ERROR;
    797         return 0;
    798     }
    799 
    800     int32_t   len = 0;
    801     UBool     findSucceeded;
    802     uregex_reset(regexp, 0, status);
    803     findSucceeded = uregex_find(regexp, 0, status);
    804     if (findSucceeded) {
    805         len = uregex_appendReplacement(regexp, replacementText, replacementLength,
    806                                        &destBuf, &destCapacity, status);
    807     }
    808     len += uregex_appendTail(regexp, &destBuf, &destCapacity, status);
    809 
    810     return len;
    811 }
    812 
    813 
    814 //------------------------------------------------------------------------------
    815 //
    816 //    uregex_appendReplacement
    817 //
    818 //------------------------------------------------------------------------------
    819 
    820 
    821 //
    822 //  Dummy class, because these functions need to be friends of class RegexMatcher,
    823 //               and stand-alone C functions don't work as friends
    824 //
    825 U_NAMESPACE_BEGIN
//  Holder for the static functions that implement the C-API append operations.
//  Only the declarations are given here; the definitions follow later in the file.
class RegexCImpl {
 public:
   // Implementation of appendReplacement.  destBuf and destCapacity are
   // passed by pointer.
   // NOTE(review): presumably both are updated as output is produced --
   // confirm against the definition.
   inline static  int32_t appendReplacement(URegularExpression    *regexp,
                      const UChar           *replacementText,
                      int32_t                replacementLength,
                      UChar                **destBuf,
                      int32_t               *destCapacity,
                      UErrorCode            *status);

   // Implementation of appendTail.  Takes the same by-pointer destination
   // arguments as appendReplacement.
   inline static int32_t appendTail(URegularExpression    *regexp,
                  UChar                **destBuf,
                  int32_t               *destCapacity,
                  UErrorCode            *status);
};
    840 U_NAMESPACE_END
    841 
    842 
    843 //
    844 //  Call-back function for u_unescapeAt(), used when we encounter
    845 //    \uxxxx or \Uxxxxxxxxx escapes in the replacement text.
    846 //
    847 U_CDECL_BEGIN
    848 static UChar U_CALLCONV
    849 unescape_charAt(int32_t offset, void *context) {
    850     UChar c16 = ((UChar *)context)[offset];
    851     return c16;
    852 }
    853 U_CDECL_END
    854 
    855 
    856 static const UChar BACKSLASH  = 0x5c;
    857 static const UChar DOLLARSIGN = 0x24;
    858 
    859 //
    860 //  Move a character to an output buffer, with bounds checking on the index.
    861 //      Index advances even if capacity is exceeded, for preflight size computations.
    862 //      This little sequence is used a LOT.
    863 //
    864 static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
    865     if (*idx < bufCapacity) {
    866         buf[*idx] = c;
    867     }
    868     (*idx)++;
    869 }
    870 
    871 
    872 //
    873 //  appendReplacement, the actual implementation.
    874 //
    875 int32_t RegexCImpl::appendReplacement(URegularExpression    *regexp,
    876                   const UChar           *replacementText,
    877                   int32_t                replacementLength,
    878                   UChar                **destBuf,
    879                   int32_t               *destCapacity,
    880                   UErrorCode            *status)  {
    881 
    882     // If we come in with a buffer overflow error, don't suppress the operation.
    883     //  A series of appendReplacements, appendTail need to correctly preflight
    884     //  the buffer size when an overflow happens somewhere in the middle.
    885     UBool pendingBufferOverflow = FALSE;
    886     if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
    887         pendingBufferOverflow = TRUE;
    888         *status = U_ZERO_ERROR;
    889     }
    890 
    891     //
    892     // Validate all paramters
    893     //
    894     if (validateRE(regexp, status) == FALSE) {
    895         return 0;
    896     }
    897     if (replacementText == NULL || replacementLength < -1 ||
    898         destCapacity == NULL || destBuf == NULL ||
    899         *destBuf == NULL && *destCapacity > 0 ||
    900         *destCapacity < 0) {
    901         *status = U_ILLEGAL_ARGUMENT_ERROR;
    902         return 0;
    903     }
    904 
    905     RegexMatcher *m = regexp->fMatcher;
    906     if (m->fMatch == FALSE) {
    907         *status = U_REGEX_INVALID_STATE;
    908         return 0;
    909     }
    910 
    911     UChar    *dest             = *destBuf;
    912     int32_t   capacity         = *destCapacity;
    913     int32_t   destIdx          =  0;
    914     int32_t   i;
    915 
    916     // If it wasn't supplied by the caller,  get the length of the replacement text.
    917     //   TODO:  slightly smarter logic in the copy loop could watch for the NUL on
    918     //          the fly and avoid this step.
    919     if (replacementLength == -1) {
    920         replacementLength = u_strlen(replacementText);
    921     }
    922 
    923     // Copy input string from the end of previous match to start of current match
    924     for (i=m->fLastMatchEnd; i<m->fMatchStart; i++) {
    925         appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
    926     }
    927 
    928 
    929 
    930     // scan the replacement text, looking for substitutions ($n) and \escapes.
    931     int32_t  replIdx = 0;
    932     while (replIdx < replacementLength) {
    933         UChar  c = replacementText[replIdx];
    934         replIdx++;
    935         if (c != DOLLARSIGN && c != BACKSLASH) {
    936             // Common case, no substitution, no escaping,
    937             //  just copy the char to the dest buf.
    938             appendToBuf(c, &destIdx, dest, capacity);
    939             continue;
    940         }
    941 
    942         if (c == BACKSLASH) {
    943             // Backslash Escape.  Copy the following char out without further checks.
    944             //                    Note:  Surrogate pairs don't need any special handling
    945             //                           The second half wont be a '$' or a '\', and
    946             //                           will move to the dest normally on the next
    947             //                           loop iteration.
    948             if (replIdx >= replacementLength) {
    949                 break;
    950             }
    951             c = replacementText[replIdx];
    952 
    953             if (c==0x55/*U*/ || c==0x75/*u*/) {
    954                 // We have a \udddd or \Udddddddd escape sequence.
    955                 UChar32 escapedChar =
    956                     u_unescapeAt(unescape_charAt,
    957                        &replIdx,                   // Index is updated by unescapeAt
    958                        replacementLength,          // Length of replacement text
    959                        (void *)replacementText);
    960 
    961                 if (escapedChar != (UChar32)0xFFFFFFFF) {
    962                     if (escapedChar <= 0xffff) {
    963                         appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
    964                     } else {
    965                         appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
    966                         appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
    967                     }
    968                     continue;
    969                 }
    970                 // Note:  if the \u escape was invalid, just fall through and
    971                 //        treat it as a plain \<anything> escape.
    972             }
    973 
    974             // Plain backslash escape.  Just put out the escaped character.
    975             appendToBuf(c, &destIdx, dest, capacity);
    976 
    977             replIdx++;
    978             continue;
    979         }
    980 
    981 
    982 
    983         // We've got a $.  Pick up a capture group number if one follows.
    984         // Consume at most the number of digits necessary for the largest capture
    985         // number that is valid for this pattern.
    986 
    987         int32_t numDigits = 0;
    988         int32_t groupNum  = 0;
    989         UChar32 digitC;
    990         for (;;) {
    991             if (replIdx >= replacementLength) {
    992                 break;
    993             }
    994             U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
    995             if (u_isdigit(digitC) == FALSE) {
    996                 break;
    997             }
    998 
    999             U16_FWD_1(replacementText, replIdx, replacementLength);
   1000             groupNum=groupNum*10 + u_charDigitValue(digitC);
   1001             numDigits++;
   1002             if (numDigits >= m->fPattern->fMaxCaptureDigits) {
   1003                 break;
   1004             }
   1005         }
   1006 
   1007 
   1008         if (numDigits == 0) {
   1009             // The $ didn't introduce a group number at all.
   1010             // Treat it as just part of the substitution text.
   1011             appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
   1012             continue;
   1013         }
   1014 
   1015         // Finally, append the capture group data to the destination.
   1016         int32_t  capacityRemaining = capacity - destIdx;
   1017         if (capacityRemaining < 0) {
   1018             capacityRemaining = 0;
   1019         }
   1020         destIdx += uregex_group(regexp, groupNum, dest+destIdx, capacityRemaining, status);
   1021         if (*status == U_BUFFER_OVERFLOW_ERROR) {
   1022             // Ignore buffer overflow when extracting the group.  We need to
   1023             //   continue on to get full size of the untruncated result.  We will
   1024             //   raise our own buffer overflow error at the end.
   1025             *status = U_ZERO_ERROR;
   1026         }
   1027 
   1028         if (U_FAILURE(*status)) {
   1029             // Can fail if group number is out of range.
   1030             break;
   1031         }
   1032 
   1033     }
   1034 
   1035     //
   1036     //  Nul Terminate the dest buffer if possible.
   1037     //  Set the appropriate buffer overflow or not terminated error, if needed.
   1038     //
   1039     if (destIdx < capacity) {
   1040         dest[destIdx] = 0;
   1041     } else if (destIdx == *destCapacity) {
   1042         *status = U_STRING_NOT_TERMINATED_WARNING;
   1043     } else {
   1044         *status = U_BUFFER_OVERFLOW_ERROR;
   1045     }
   1046 
   1047     //
   1048     // Return an updated dest buffer and capacity to the caller.
   1049     //
   1050     if (destIdx > 0 &&  *destCapacity > 0) {
   1051         if (destIdx < capacity) {
   1052             *destBuf      += destIdx;
   1053             *destCapacity -= destIdx;
   1054         } else {
   1055             *destBuf      += capacity;
   1056             *destCapacity =  0;
   1057         }
   1058     }
   1059 
   1060     // If we came in with a buffer overflow, make sure we go out with one also.
   1061     //   (A zero length match right at the end of the previous match could
   1062     //    make this function succeed even though a previous call had overflowed the buf)
   1063     if (pendingBufferOverflow && U_SUCCESS(*status)) {
   1064         *status = U_BUFFER_OVERFLOW_ERROR;
   1065     }
   1066 
   1067     return destIdx;
   1068 }
   1069 
   1070 //
   1071 //   appendReplacement   the acutal API function,
   1072 //
   1073 U_CAPI int32_t U_EXPORT2
   1074 uregex_appendReplacement(URegularExpression    *regexp,
   1075                   const UChar           *replacementText,
   1076                   int32_t                replacementLength,
   1077                   UChar                **destBuf,
   1078                   int32_t               *destCapacity,
   1079                   UErrorCode            *status) {
   1080     return RegexCImpl::appendReplacement(
   1081         regexp, replacementText, replacementLength,destBuf, destCapacity, status);
   1082 }
   1083 
   1084 
   1085 //------------------------------------------------------------------------------
   1086 //
   1087 //    uregex_appendTail
   1088 //
   1089 //------------------------------------------------------------------------------
   1090 int32_t RegexCImpl::appendTail(URegularExpression    *regexp,
   1091                   UChar                **destBuf,
   1092                   int32_t               *destCapacity,
   1093                   UErrorCode            *status)
   1094 {
   1095 
   1096     // If we come in with a buffer overflow error, don't suppress the operation.
   1097     //  A series of appendReplacements, appendTail need to correctly preflight
   1098     //  the buffer size when an overflow happens somewhere in the middle.
   1099     UBool pendingBufferOverflow = FALSE;
   1100     if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
   1101         pendingBufferOverflow = TRUE;
   1102         *status = U_ZERO_ERROR;
   1103     }
   1104 
   1105     if (validateRE(regexp, status) == FALSE) {
   1106         return 0;
   1107     }
   1108 
   1109     if (destCapacity == NULL || destBuf == NULL ||
   1110         *destBuf == NULL && *destCapacity > 0 ||
   1111         *destCapacity < 0)
   1112     {
   1113         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1114         return 0;
   1115     }
   1116 
   1117     RegexMatcher *m = regexp->fMatcher;
   1118 
   1119     int32_t  srcIdx;
   1120     if (m->fMatch) {
   1121         // The most recent call to find() succeeded.
   1122         srcIdx = m->fMatchEnd;
   1123     } else {
   1124         // The last call to find() on this matcher failed().
   1125         //   Look back to the end of the last find() that succeeded for src index.
   1126         srcIdx = m->fLastMatchEnd;
   1127         if (srcIdx == -1)  {
   1128             // There has been no successful match with this matcher.
   1129             //   We want to copy the whole string.
   1130             srcIdx = 0;
   1131         }
   1132     }
   1133 
   1134     int32_t  destIdx     = 0;
   1135     int32_t  destCap     = *destCapacity;
   1136     UChar    *dest       = *destBuf;
   1137 
   1138     for (;;) {
   1139         if (srcIdx == regexp->fTextLength) {
   1140             break;
   1141         }
   1142         UChar c = regexp->fText[srcIdx];
   1143         if (c == 0 && regexp->fTextLength == -1) {
   1144             break;
   1145         }
   1146         if (destIdx < destCap) {
   1147             dest[destIdx] = c;
   1148         } else {
   1149             // We've overflowed the dest buffer.
   1150             //  If the total input string length is known, we can
   1151             //    compute the total buffer size needed without scanning through the string.
   1152             if (regexp->fTextLength > 0) {
   1153                 destIdx += (regexp->fTextLength - srcIdx);
   1154                 break;
   1155             }
   1156         }
   1157         srcIdx++;
   1158         destIdx++;
   1159     }
   1160 
   1161     //
   1162     //  NUL terminate the output string, if possible, otherwise issue the
   1163     //   appropriate error or warning.
   1164     //
   1165     if (destIdx < destCap) {
   1166         dest[destIdx] = 0;
   1167     } else  if (destIdx == destCap) {
   1168         *status = U_STRING_NOT_TERMINATED_WARNING;
   1169     } else {
   1170         *status = U_BUFFER_OVERFLOW_ERROR;
   1171     }
   1172 
   1173     //
   1174     // Update the user's buffer ptr and capacity vars to reflect the
   1175     //   amount used.
   1176     //
   1177     if (destIdx < destCap) {
   1178         *destBuf      += destIdx;
   1179         *destCapacity -= destIdx;
   1180     } else {
   1181         *destBuf      += destCap;
   1182         *destCapacity  = 0;
   1183     }
   1184 
   1185     if (pendingBufferOverflow && U_SUCCESS(*status)) {
   1186         *status = U_BUFFER_OVERFLOW_ERROR;
   1187     }
   1188 
   1189     return destIdx;
   1190 }
   1191 
   1192 
   1193 U_CAPI int32_t U_EXPORT2
   1194 uregex_appendTail(URegularExpression    *regexp,
   1195                   UChar                **destBuf,
   1196                   int32_t               *destCapacity,
   1197                   UErrorCode            *status)  {
   1198     return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
   1199 }
   1200 
   1201 
   1202 //------------------------------------------------------------------------------
   1203 //
   1204 //    copyString     Internal utility to copy a string to an output buffer,
   1205 //                   while managing buffer overflow and preflight size
   1206 //                   computation.  NUL termination is added to destination,
   1207 //                   and the NUL is counted in the output size.
   1208 //
   1209 //------------------------------------------------------------------------------
   1210 static void copyString(UChar        *destBuffer,    //  Destination buffer.
   1211                        int32_t       destCapacity,  //  Total capacity of dest buffer
   1212                        int32_t      *destIndex,     //  Index into dest buffer.  Updated on return.
   1213                                                     //    Update not clipped to destCapacity.
   1214                        const UChar  *srcPtr,        //  Pointer to source string
   1215                        int32_t       srcLen)        //  Source string len.
   1216 {
   1217     int32_t  si;
   1218     int32_t  di = *destIndex;
   1219     UChar    c;
   1220 
   1221     for (si=0; si<srcLen;  si++) {
   1222         c = srcPtr[si];
   1223         if (di < destCapacity) {
   1224             destBuffer[di] = c;
   1225             di++;
   1226         } else {
   1227             di += srcLen - si;
   1228             break;
   1229         }
   1230     }
   1231     if (di<destCapacity) {
   1232         destBuffer[di] = 0;
   1233     }
   1234     di++;
   1235     *destIndex = di;
   1236 }
   1237 
   1238 
   1239 //------------------------------------------------------------------------------
   1240 //
   1241 //    uregex_split
   1242 //
   1243 //------------------------------------------------------------------------------
   1244 U_CAPI int32_t U_EXPORT2
   1245 uregex_split(   URegularExpression      *regexp,
   1246                   UChar                 *destBuf,
   1247                   int32_t                destCapacity,
   1248                   int32_t               *requiredCapacity,
   1249                   UChar                 *destFields[],
   1250                   int32_t                destFieldsCapacity,
   1251                   UErrorCode            *status) {
   1252     if (validateRE(regexp, status) == FALSE) {
   1253         return 0;
   1254     }
   1255     if (destBuf == NULL && destCapacity > 0 ||
   1256         destCapacity < 0 ||
   1257         destFields == NULL ||
   1258         destFieldsCapacity < 1 ) {
   1259         *status = U_ILLEGAL_ARGUMENT_ERROR;
   1260         return 0;
   1261     }
   1262 
   1263     //
   1264     // Reset for the input text
   1265     //
   1266     regexp->fMatcher->reset();
   1267     int32_t   inputLen = regexp->fTextString.length();
   1268     int32_t   nextOutputStringStart = 0;
   1269     if (inputLen == 0) {
   1270         return 0;
   1271     }
   1272 
   1273 
   1274     //
   1275     // Loop through the input text, searching for the delimiter pattern
   1276     //
   1277     int32_t   i;             // Index of the field being processed.
   1278     int32_t   destIdx = 0;   // Next available position in destBuf;
   1279     int32_t   numCaptureGroups = regexp->fMatcher->groupCount();
   1280     for (i=0; ; i++) {
   1281         if (i>=destFieldsCapacity-1) {
   1282             // There are one or zero output string left.
   1283             // Fill the last output string with whatever is left from the input, then exit the loop.
   1284             //  ( i will be == destFieldsCapacity if we filled the output array while processing
   1285             //    capture groups of the delimiter expression, in which case we will discard the
   1286             //    last capture group saved in favor of the unprocessed remainder of the
   1287             //    input string.)
   1288             int32_t remainingLength = inputLen-nextOutputStringStart;
   1289             if (remainingLength > 0) {
   1290             }
   1291             if (i >= destFieldsCapacity) {
   1292                 // No fields are left.  Recycle the last one for holding the trailing part of
   1293                 //   the input string.
   1294                 i = destFieldsCapacity-1;
   1295                 destIdx = (int32_t)(destFields[i] - destFields[0]);
   1296             }
   1297 
   1298             destFields[i] = &destBuf[destIdx];
   1299             copyString(destBuf, destCapacity, &destIdx,
   1300                 &regexp->fText[nextOutputStringStart], remainingLength);
   1301             break;
   1302         }
   1303 
   1304         if (regexp->fMatcher->find()) {
   1305             // We found another delimiter.  Move everything from where we started looking
   1306             //  up until the start of the delimiter into the next output string.
   1307             int32_t fieldLen = regexp->fMatcher->start(*status) - nextOutputStringStart;
   1308             destFields[i] = &destBuf[destIdx];
   1309             copyString(destBuf, destCapacity, &destIdx,
   1310                 &regexp->fText[nextOutputStringStart], fieldLen);
   1311             nextOutputStringStart =  regexp->fMatcher->end(*status);
   1312 
   1313             // If the delimiter pattern has capturing parentheses, the captured
   1314             //  text goes out into the next n destination strings.
   1315             int32_t groupNum;
   1316             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
   1317                 // If we've run out of output string slots, bail out.
   1318                 if (i==destFieldsCapacity-1) {
   1319                     break;
   1320                 }
   1321                 i++;
   1322 
   1323                 // Set up to extract the capture group contents into the dest buffer.
   1324                 UErrorCode  tStatus = U_ZERO_ERROR;   // Want to ignore any buffer overflow
   1325                                                       //  error while extracting this group.
   1326                 int32_t remainingCapacity = destCapacity - destIdx;
   1327                 if (remainingCapacity < 0) {
   1328                     remainingCapacity = 0;
   1329                 }
   1330                 destFields[i] = &destBuf[destIdx];
   1331                 int32_t t = uregex_group(regexp, groupNum, destFields[i], remainingCapacity, &tStatus);
   1332                 destIdx += t + 1;    // Record the space used in the output string buffer.
   1333                                      //  +1 for the NUL that terminates the string.
   1334             }
   1335 
   1336             if (nextOutputStringStart == inputLen) {
   1337                 // The delimiter was at the end of the string.  We're done.
   1338                 break;
   1339             }
   1340 
   1341         }
   1342         else
   1343         {
   1344             // We ran off the end of the input while looking for the next delimiter.
   1345             // All the remaining text goes into the current output string.
   1346             destFields[i] = &destBuf[destIdx];
   1347             copyString(destBuf, destCapacity, &destIdx,
   1348                          &regexp->fText[nextOutputStringStart], inputLen-nextOutputStringStart);
   1349             break;
   1350         }
   1351     }
   1352 
   1353     // Zero out any unused portion of the destFields array
   1354     int j;
   1355     for (j=i+1; j<destFieldsCapacity; j++) {
   1356         destFields[j] = NULL;
   1357     }
   1358 
   1359     if (requiredCapacity != NULL) {
   1360         *requiredCapacity = destIdx;
   1361     }
   1362     if (destIdx > destCapacity) {
   1363         *status = U_BUFFER_OVERFLOW_ERROR;
   1364     }
   1365     return i+1;
   1366 }
   1367 
   1368 
   1369 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
   1370 
   1371