Home | History | Annotate | Download | only in intltest
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /********************************************************************
      4  * COPYRIGHT:
      5  * Copyright (c) 2002-2016, International Business Machines Corporation and
      6  * others. All Rights Reserved.
      7  ********************************************************************/
      8 
      9 //
     10 //   regextst.cpp
     11 //
     12 //      ICU Regular Expressions test, part of intltest.
     13 //
     14 
     15 /*
     16      NOTE!!
     17 
     18      PLEASE be careful about ASCII assumptions in this test.
     19      This test is one of the worst repeat offenders.
     20      If you have questions, contact someone on the ICU PMC
     21      who has access to an EBCDIC system.
     22 
     23  */
     24 
     25 #include "intltest.h"
     26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     27 
     28 #include <stdlib.h>
     29 #include <stdio.h>
     30 #include <string.h>
     31 
     32 #include "unicode/localpointer.h"
     33 #include "unicode/regex.h"
     34 #include "unicode/uchar.h"
     35 #include "unicode/ucnv.h"
     36 #include "unicode/uniset.h"
     37 #include "unicode/uregex.h"
     38 #include "unicode/usetiter.h"
     39 #include "unicode/ustring.h"
     40 #include "unicode/utext.h"
     41 
     42 #include "regextst.h"
     43 #include "regexcmp.h"
     44 #include "uvector.h"
     45 #include "util.h"
     46 #include "cmemory.h"
     47 #include "cstring.h"
     48 #include "uinvchar.h"
     49 
     50 #define SUPPORT_MUTATING_INPUT_STRING   0
     51 
     52 //---------------------------------------------------------------------------
     53 //
     54 //  Test class boilerplate
     55 //
     56 //---------------------------------------------------------------------------
     57 RegexTest::RegexTest()
     58 {
     59 }
     60 
     61 
     62 RegexTest::~RegexTest()
     63 {
     64 }
     65 
     66 
     67 
     68 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
     69 {
     70     if (exec) logln("TestSuite RegexTest: ");
     71     switch (index) {
     72 
     73         case 0: name = "Basic";
     74             if (exec) Basic();
     75             break;
     76         case 1: name = "API_Match";
     77             if (exec) API_Match();
     78             break;
     79         case 2: name = "API_Replace";
     80             if (exec) API_Replace();
     81             break;
     82         case 3: name = "API_Pattern";
     83             if (exec) API_Pattern();
     84             break;
     85         case 4:
     86 #if !UCONFIG_NO_FILE_IO
     87             name = "Extended";
     88             if (exec) Extended();
     89 #else
     90             name = "skip";
     91 #endif
     92             break;
     93         case 5: name = "Errors";
     94             if (exec) Errors();
     95             break;
     96         case 6: name = "PerlTests";
     97             if (exec) PerlTests();
     98             break;
     99         case 7: name = "Callbacks";
    100             if (exec) Callbacks();
    101             break;
    102         case 8: name = "FindProgressCallbacks";
    103             if (exec) FindProgressCallbacks();
    104             break;
    105         case 9: name = "Bug 6149";
    106              if (exec) Bug6149();
    107              break;
    108         case 10: name = "UTextBasic";
    109           if (exec) UTextBasic();
    110           break;
    111         case 11: name = "API_Match_UTF8";
    112           if (exec) API_Match_UTF8();
    113           break;
    114         case 12: name = "API_Replace_UTF8";
    115           if (exec) API_Replace_UTF8();
    116           break;
    117         case 13: name = "API_Pattern_UTF8";
    118           if (exec) API_Pattern_UTF8();
    119           break;
    120         case 14: name = "PerlTestsUTF8";
    121           if (exec) PerlTestsUTF8();
    122           break;
    123         case 15: name = "PreAllocatedUTextCAPI";
    124           if (exec) PreAllocatedUTextCAPI();
    125           break;
    126         case 16: name = "Bug 7651";
    127              if (exec) Bug7651();
    128              break;
    129         case 17: name = "Bug 7740";
    130             if (exec) Bug7740();
    131             break;
    132         case 18: name = "Bug 8479";
    133             if (exec) Bug8479();
    134             break;
    135         case 19: name = "Bug 7029";
    136             if (exec) Bug7029();
    137             break;
    138         case 20: name = "CheckInvBufSize";
    139             if (exec) CheckInvBufSize();
    140             break;
    141         case 21: name = "Bug 9283";
    142             if (exec) Bug9283();
    143             break;
    144         case 22: name = "Bug10459";
    145             if (exec) Bug10459();
    146             break;
    147         case 23: name = "TestCaseInsensitiveStarters";
    148             if (exec) TestCaseInsensitiveStarters();
    149             break;
    150         case 24: name = "TestBug11049";
    151             if (exec) TestBug11049();
    152             break;
    153         case 25: name = "TestBug11371";
    154             if (exec) TestBug11371();
    155             break;
    156         case 26: name = "TestBug11480";
    157             if (exec) TestBug11480();
    158             break;
    159         case 27: name = "NamedCapture";
    160             if (exec) NamedCapture();
    161             break;
    162         case 28: name = "NamedCaptureLimits";
    163             if (exec) NamedCaptureLimits();
    164             break;
    165         default: name = "";
    166             break; //needed to end loop
    167     }
    168 }
    169 
    170 
    171 
    172 /**
    173  * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
    174  * into ASCII.
    175  * @see utext_openUTF8
    176  */
    177 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
    178 
    179 //---------------------------------------------------------------------------
    180 //
    181 //   Error Checking / Reporting macros used in all of the tests.
    182 //
    183 //---------------------------------------------------------------------------
    184 
    185 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
    186   int64_t oldIndex = utext_getNativeIndex(text);
    187   utext_setNativeIndex(text, 0);
    188   char *bufPtr = buf;
    189   UChar32 c = utext_next32From(text, 0);
    190   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
    191     if (0x000020<=c && c<0x00007e) {
    192       *bufPtr = c;
    193     } else {
    194 #if 0
    195       sprintf(bufPtr,"U+%04X", c);
    196       bufPtr+= strlen(bufPtr)-1;
    197 #else
    198       *bufPtr = '%';
    199 #endif
    200     }
    201     bufPtr++;
    202     c = UTEXT_NEXT32(text);
    203   }
    204   *bufPtr = 0;
    205 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
    206   char *ebuf = (char*)malloc(bufLen);
    207   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
    208   uprv_strncpy(buf, ebuf, bufLen);
    209   free((void*)ebuf);
    210 #endif
    211   utext_setNativeIndex(text, oldIndex);
    212 }
    213 
    214 
    215 static char ASSERT_BUF[1024];
    216 
    217 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
    218   if(message.length()==0) {
    219     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
    220   } else {
    221     UnicodeString buf;
    222     IntlTest::prettify(message,buf);
    223     if(buf.length()==0) {
    224       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
    225     } else {
    226       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
    227       if(ASSERT_BUF[0]==0) {
    228         ASSERT_BUF[0]=0;
    229         for(int32_t i=0;i<buf.length();i++) {
    230           UChar ch = buf[i];
    231           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
    232         }
    233       }
    234     }
    235   }
    236   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
    237   return ASSERT_BUF;
    238 }
    239 
    240 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
    241 
    242 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure.  status=%s", \
    243                                                               __FILE__, __LINE__, u_errorName(status)); return;}}
    244 
    245 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}
    246 
    247 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
    248 if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
    249     __LINE__, u_errorName(errcode), u_errorName(status));};}
    250 
    251 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    252     "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}
    253 
    254 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    255     errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
    256 
    257 // expected: const char * , restricted to invariant characters.
    258 // actual: const UnicodeString &
    259 #define REGEX_ASSERT_UNISTR(expected, actual) { \
    260     if (UnicodeString(expected, -1, US_INV) != (actual)) { \
    261         errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n",  \
    262                 __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
    263 
    264 
    265 static UBool testUTextEqual(UText *uta, UText *utb) {
    266     UChar32 ca = 0;
    267     UChar32 cb = 0;
    268     utext_setNativeIndex(uta, 0);
    269     utext_setNativeIndex(utb, 0);
    270     do {
    271         ca = utext_next32(uta);
    272         cb = utext_next32(utb);
    273         if (ca != cb) {
    274             break;
    275         }
    276     } while (ca != U_SENTINEL);
    277     return ca == cb;
    278 }
    279 
    280 
    281 /**
    282  * @param expected expected text in UTF-8 (not platform) codepage
    283  */
    284 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
    285     UErrorCode status = U_ZERO_ERROR;
    286     UText expectedText = UTEXT_INITIALIZER;
    287     utext_openUTF8(&expectedText, expected, -1, &status);
    288     if(U_FAILURE(status)) {
    289       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
    290       return;
    291     }
    292     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
    293       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
    294       return;
    295     }
    296     utext_setNativeIndex(actual, 0);
    297     if (!testUTextEqual(&expectedText, actual)) {
    298         char buf[201 /*21*/];
    299         char expectedBuf[201];
    300         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
    301         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
    302         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
    303     }
    304     utext_close(&expectedText);
    305 }
    306 /**
    307  * @param expected invariant (platform local text) input
    308  */
    309 
    310 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
    311     UErrorCode status = U_ZERO_ERROR;
    312     UText expectedText = UTEXT_INITIALIZER;
    313     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
    314     if(U_FAILURE(status)) {
    315       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
    316       return;
    317     }
    318     utext_setNativeIndex(actual, 0);
    319     if (!testUTextEqual(&expectedText, actual)) {
    320         char buf[201 /*21*/];
    321         char expectedBuf[201];
    322         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
    323         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
    324         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
    325     }
    326     utext_close(&expectedText);
    327 }
    328 
    329 /**
    330  * Assumes utf-8 input
    331  */
    332 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
    333 /**
    334  * Assumes Invariant input
    335  */
    336 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
    337 
    338 /**
    339  * This buffer ( inv_buf ) is used to hold the UTF-8 strings
    340  * passed into utext_openUTF8. An error will be given if
    341  * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
    342  */
    343 
    344 #define INV_BUFSIZ 2048 /* increase this if too small */
    345 
    346 static int64_t inv_next=0;
    347 
    348 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
    349 static char inv_buf[INV_BUFSIZ];
    350 #endif
    351 
    352 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
    353   if(length==-1) length=strlen(inv);
    354 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
    355   inv_next+=length;
    356   return utext_openUTF8(ut, inv, length, status);
    357 #else
    358   if(inv_next+length+1>INV_BUFSIZ) {
    359     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
    360             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
    361     *status = U_MEMORY_ALLOCATION_ERROR;
    362     return NULL;
    363   }
    364 
    365   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
    366   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
    367   inv_next+=length;
    368 
    369 #if 0
    370   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
    371 #endif
    372 
    373   return utext_openUTF8(ut, (const char*)buf, length, status);
    374 #endif
    375 }
    376 
    377 
    378 //---------------------------------------------------------------------------
    379 //
    380 //    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
    381 //                       for the LookingAt() and  Match() functions.
    382 //
    383 //       usage:
    384 //          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
    385 //
    386 //          The expected results are UBool - TRUE or FALSE.
    387 //          The input text is unescaped.  The pattern is not.
    388 //
    389 //
    390 //---------------------------------------------------------------------------
    391 
    392 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
    393 
    394 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    395     const UnicodeString pattern(pat, -1, US_INV);
    396     const UnicodeString inputText(text, -1, US_INV);
    397     UErrorCode          status  = U_ZERO_ERROR;
    398     UParseError         pe;
    399     RegexPattern        *REPattern = NULL;
    400     RegexMatcher        *REMatcher = NULL;
    401     UBool               retVal     = TRUE;
    402 
    403     UnicodeString patString(pat, -1, US_INV);
    404     REPattern = RegexPattern::compile(patString, 0, pe, status);
    405     if (U_FAILURE(status)) {
    406         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
    407             line, u_errorName(status));
    408         return FALSE;
    409     }
    410     if (line==376) { REPattern->dumpPattern();}
    411 
    412     UnicodeString inputString(inputText);
    413     UnicodeString unEscapedInput = inputString.unescape();
    414     REMatcher = REPattern->matcher(unEscapedInput, status);
    415     if (U_FAILURE(status)) {
    416         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
    417             line, u_errorName(status));
    418         return FALSE;
    419     }
    420 
    421     UBool actualmatch;
    422     actualmatch = REMatcher->lookingAt(status);
    423     if (U_FAILURE(status)) {
    424         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
    425             line, u_errorName(status));
    426         retVal =  FALSE;
    427     }
    428     if (actualmatch != looking) {
    429         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
    430         retVal = FALSE;
    431     }
    432 
    433     status = U_ZERO_ERROR;
    434     actualmatch = REMatcher->matches(status);
    435     if (U_FAILURE(status)) {
    436         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
    437             line, u_errorName(status));
    438         retVal = FALSE;
    439     }
    440     if (actualmatch != match) {
    441         errln("RegexTest: wrong return from matches() at line %d.\n", line);
    442         retVal = FALSE;
    443     }
    444 
    445     if (retVal == FALSE) {
    446         REPattern->dumpPattern();
    447     }
    448 
    449     delete REPattern;
    450     delete REMatcher;
    451     return retVal;
    452 }
    453 
    454 
    455 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    456     UText               pattern    = UTEXT_INITIALIZER;
    457     int32_t             inputUTF8Length;
    458     char                *textChars = NULL;
    459     UText               inputText  = UTEXT_INITIALIZER;
    460     UErrorCode          status     = U_ZERO_ERROR;
    461     UParseError         pe;
    462     RegexPattern        *REPattern = NULL;
    463     RegexMatcher        *REMatcher = NULL;
    464     UBool               retVal     = TRUE;
    465 
    466     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
    467     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
    468     if (U_FAILURE(status)) {
    469         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
    470             line, u_errorName(status));
    471         return FALSE;
    472     }
    473 
    474     UnicodeString inputString(text, -1, US_INV);
    475     UnicodeString unEscapedInput = inputString.unescape();
    476     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
    477     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
    478 
    479     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
    480     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
    481         // UTF-8 does not allow unpaired surrogates, so this could actually happen
    482         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
    483         return TRUE; // not a failure of the Regex engine
    484     }
    485     status = U_ZERO_ERROR; // buffer overflow
    486     textChars = new char[inputUTF8Length+1];
    487     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
    488     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
    489 
    490     REMatcher = &REPattern->matcher(status)->reset(&inputText);
    491     if (U_FAILURE(status)) {
    492         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
    493             line, u_errorName(status));
    494         return FALSE;
    495     }
    496 
    497     UBool actualmatch;
    498     actualmatch = REMatcher->lookingAt(status);
    499     if (U_FAILURE(status)) {
    500         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
    501             line, u_errorName(status));
    502         retVal =  FALSE;
    503     }
    504     if (actualmatch != looking) {
    505         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
    506         retVal = FALSE;
    507     }
    508 
    509     status = U_ZERO_ERROR;
    510     actualmatch = REMatcher->matches(status);
    511     if (U_FAILURE(status)) {
    512         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
    513             line, u_errorName(status));
    514         retVal = FALSE;
    515     }
    516     if (actualmatch != match) {
    517         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
    518         retVal = FALSE;
    519     }
    520 
    521     if (retVal == FALSE) {
    522         REPattern->dumpPattern();
    523     }
    524 
    525     delete REPattern;
    526     delete REMatcher;
    527     utext_close(&inputText);
    528     utext_close(&pattern);
    529     delete[] textChars;
    530     return retVal;
    531 }
    532 
    533 
    534 
    535 //---------------------------------------------------------------------------
    536 //
    537 //    REGEX_ERR       Macro + invocation function to simplify writing tests
    538 //                       regex tests for incorrect patterns
    539 //
    540 //       usage:
    541 //          REGEX_ERR("pattern",   expected error line, column, expected status);
    542 //
    543 //---------------------------------------------------------------------------
    544 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
    545 
    546 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
    547                           UErrorCode expectedStatus, int32_t line) {
    548     UnicodeString       pattern(pat);
    549 
    550     UErrorCode          status         = U_ZERO_ERROR;
    551     UParseError         pe;
    552     RegexPattern        *callerPattern = NULL;
    553 
    554     //
    555     //  Compile the caller's pattern
    556     //
    557     UnicodeString patString(pat);
    558     callerPattern = RegexPattern::compile(patString, 0, pe, status);
    559     if (status != expectedStatus) {
    560         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    561     } else {
    562         if (status != U_ZERO_ERROR) {
    563             if (pe.line != errLine || pe.offset != errCol) {
    564                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
    565                     line, errLine, errCol, pe.line, pe.offset);
    566             }
    567         }
    568     }
    569 
    570     delete callerPattern;
    571 
    572     //
    573     //  Compile again, using a UTF-8-based UText
    574     //
    575     UText patternText = UTEXT_INITIALIZER;
    576     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
    577     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
    578     if (status != expectedStatus) {
    579         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    580     } else {
    581         if (status != U_ZERO_ERROR) {
    582             if (pe.line != errLine || pe.offset != errCol) {
    583                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
    584                     line, errLine, errCol, pe.line, pe.offset);
    585             }
    586         }
    587     }
    588 
    589     delete callerPattern;
    590     utext_close(&patternText);
    591 }
    592 
    593 
    594 
    595 //---------------------------------------------------------------------------
    596 //
    597 //      Basic      Check for basic functionality of regex pattern matching.
    598 //                 Avoid the use of REGEX_FIND test macro, which has
    599 //                 substantial dependencies on basic Regex functionality.
    600 //
    601 //---------------------------------------------------------------------------
     602 void RegexTest::Basic() {
             // Every REGEX_TESTLM(pattern, input, looking, match) line below runs the
             // pattern against the input through doRegexLMTest() and doRegexLMTestUTF8().
             // The last two arguments are the expected results of lookingAt() and
             // matches() respectively.
     603 
     604 
     605 //
     606 // Debug - slide failing test cases early
     607 //
         // Disabled scratch area for running a single case in isolation.
         // Note the exit(1) that follows the block when it is enabled.
     608 #if 0
     609     {
     610         // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
     611         UParseError pe;
     612         UErrorCode  status = U_ZERO_ERROR;
     613         RegexPattern *pattern;
     614         pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
     615         pattern->dumpPattern();
     616         RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
     617         UBool result = m->find();
     618         printf("result = %d\n", result);
     619         // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
     620         // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
     621     }
     622     exit(1);
     623 #endif
     624 
     625 
     626     //
     627     // Pattern with parentheses
     628     //
     629     REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
     630     REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
     631     REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
     632 
     633     //
     634     // Patterns with *
     635     //
     636     REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
     637     REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
     638     REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
     639     REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
     640     REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
     641 
     642     REGEX_TESTLM("a*", "",  TRUE, TRUE);
     643     REGEX_TESTLM("a*", "b", TRUE, FALSE);
     644 
     645 
     646     //
     647     //  Patterns with "."
     648     //
     649     REGEX_TESTLM(".", "abc", TRUE, FALSE);
     650     REGEX_TESTLM("...", "abc", TRUE, TRUE);
     651     REGEX_TESTLM("....", "abc", FALSE, FALSE);
     652     REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
     653     REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
     654     REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
     655     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
     656     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
     657 
     658     //
     659     //  Patterns with * applied to chars at end of literal string
     660     //
     661     REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
     662     REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
     663 
     664     //
     665     //  Supplemental chars match as single chars, not a pair of surrogates.
     666     //
     667     REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
     668     REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
     669     REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
     670 
     671 
     672     //
     673     //  UnicodeSets in the pattern
     674     //
     675     REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
     676     REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
     677     REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
     678     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
             // NOTE(review): the next case repeats the previous one verbatim;
             // possibly a different input was intended - confirm.
     679     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
     680     REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
     681 
     682     REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
     683     REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
     684     REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
     685     REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurrences.
     686     REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
     687 
     688     //
     689     //   OR operator in patterns
     690     //
     691     REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
     692     REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
     693     REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
     694     REGEX_TESTLM("a|b", "b", TRUE, TRUE);
     695 
     696     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
     697     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
     698     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
     699     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
     700     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
     701     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
     702 
     703     //
     704     //  +
     705     //
     706     REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
     707     REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
     708     REGEX_TESTLM("b+", "", FALSE, FALSE);
     709     REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
     710     REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
     711     REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
     712 
     713     //
     714     //   ?
     715     //
     716     REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
     717     REGEX_TESTLM("ab?", "a", TRUE, TRUE);
     718     REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
     719     REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
     720     REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
     721     REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
     722     REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
     723     REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
     724     REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
     725 
     726     //
     727     //  Escape sequences that become single literal chars, handled internally
     728     //   by ICU's Unescape.
     729     //
     730 
     731     // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
     732     REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
     733     REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
     734     REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
     735     REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
     736     REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
     737     REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
     738     REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
     739     REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
     740     REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
     741 
     742     REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
     743     REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
     744 
     745     // Escape of special chars in patterns
             // The pattern backslash-escapes each special character.  The input holds
             // the same characters as literals; its leading backslash is doubled
             // because the input string is unescaped before matching.
     746     REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
     747 }
    748 
    749 
    750 //---------------------------------------------------------------------------
    751 //
    752 //    UTextBasic   Check for quirks that are specific to the UText
    753 //                 implementation.
    754 //                 The UTF-8 text "abc" is used as both pattern and input; the input must still read back as "abc" after each reset(), including a reset with the matcher's own inputText().
    755 //---------------------------------------------------------------------------
    756 void RegexTest::UTextBasic() {
    757     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc, given as byte values; presumably to avoid the ASCII/EBCDIC assumptions warned about at the top of this file */
    758     UErrorCode status = U_ZERO_ERROR;
    759     UText pattern = UTEXT_INITIALIZER;
    760     utext_openUTF8(&pattern, str_abc, -1, &status);  // NOTE(review): length -1 presumably means NUL-terminated -- confirm against utext.h
    761     RegexMatcher matcher(&pattern, 0, status);       // the pattern is supplied as a UText rather than a UnicodeString
    762     REGEX_CHECK_STATUS;
    763     // Supply the same bytes as the input text, through a second, separate UText.
    764     UText input = UTEXT_INITIALIZER;
    765     utext_openUTF8(&input, str_abc, -1, &status);
    766     REGEX_CHECK_STATUS;
    767     matcher.reset(&input);
    768     REGEX_CHECK_STATUS;
    769     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
    770     // Reset the matcher with the UText it already holds as its input; the input must be unchanged afterwards.
    771     matcher.reset(matcher.inputText());
    772     REGEX_CHECK_STATUS;
    773     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
    774     // NOTE(review): if REGEX_CHECK_STATUS leaves the function on failure, the closes below are skipped -- confirm against the macro definition.
    775     utext_close(&pattern);
    776     utext_close(&input);
    777 }
    778 
    779 
    780 //---------------------------------------------------------------------------
    781 //
    782 //      API_Match   Test that the API for class RegexMatcher
    783 //                  is present and nominally working, but excluding functions
    784 //                  implementing replace operations.
    785 //                  Sections: matcher creation/reset, matches() and lookingAt() at a position, capture groups, find(), zero-length matches, regions, hitEnd()/requireEnd(), time limits, stack limits.
    786 //---------------------------------------------------------------------------
    787 void RegexTest::API_Match() {
    788     UParseError         pe;
    789     UErrorCode          status=U_ZERO_ERROR;
    790     int32_t             flags = 0;
    791 
    792     //
    793     // Debug - slide failing test cases early: put a case inside the disabled block below and enable it; the return then skips the rest of this function.
    794     //
    795 #if 0
    796     {
    797     }
    798     return;
    799 #endif
    800 
    801     //
    802     // Simple pattern compilation
    803     //
    804     {
    805         UnicodeString       re("abc");
    806         RegexPattern        *pat2;
    807         pat2 = RegexPattern::compile(re, flags, pe, status);
    808         REGEX_CHECK_STATUS;
    809 
    810         UnicodeString inStr1 = "abcdef this is a test";
    811         UnicodeString instr2 = "not abc";
    812         UnicodeString empty  = "";
    813 
    814 
    815         //
    816         // Matcher creation and reset.
    817         //
    818         RegexMatcher *m1 = pat2->matcher(inStr1, status);
    819         REGEX_CHECK_STATUS;
    820         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    821         REGEX_ASSERT(m1->input() == inStr1);
    822         m1->reset(instr2);
    823         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
    824         REGEX_ASSERT(m1->input() == instr2);
    825         m1->reset(inStr1);
    826         REGEX_ASSERT(m1->input() == inStr1);
    827         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    828         m1->reset(empty);
    829         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
    830         REGEX_ASSERT(m1->input() == empty);
    831         REGEX_ASSERT(&m1->pattern() == pat2);   // compared by address: the matcher must refer to the pattern object that created it
    832 
    833         //
    834         //  reset(pos, status)
    835         //
    836         m1->reset(inStr1);
    837         m1->reset(4, status);
    838         REGEX_CHECK_STATUS;
    839         REGEX_ASSERT(m1->input() == inStr1);
    840         REGEX_ASSERT(m1->lookingAt(status) == TRUE);   // expected TRUE although index 4 of inStr1 is 'e': lookingAt(status) is expected to ignore the reset position
    841 
    842         m1->reset(-1, status);
    843         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    844         status = U_ZERO_ERROR;
    845 
    846         m1->reset(0, status);
    847         REGEX_CHECK_STATUS;
    848         status = U_ZERO_ERROR;
    849 
    850         int32_t len = m1->input().length();
    851         m1->reset(len-1, status);
    852         REGEX_CHECK_STATUS;
    853         status = U_ZERO_ERROR;
    854         // A position equal to the input length is accepted; length+1 (below) is out of bounds.
    855         m1->reset(len, status);
    856         REGEX_CHECK_STATUS;
    857         status = U_ZERO_ERROR;
    858 
    859         m1->reset(len+1, status);
    860         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    861         status = U_ZERO_ERROR;
    862 
    863         //
    864         // matches(pos, status)
    865         //
    866         m1->reset(instr2);  // "not abc"
    867         REGEX_ASSERT(m1->matches(4, status) == TRUE);   // the text from index 4 to the end is exactly "abc"
    868         m1->reset();
    869         REGEX_ASSERT(m1->matches(3, status) == FALSE);
    870         m1->reset();
    871         REGEX_ASSERT(m1->matches(5, status) == FALSE);
    872         REGEX_ASSERT(m1->matches(4, status) == TRUE);
    873         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
    874         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    875 
    876         // matches() at end of string should fail, but should not
    877         //  be an error.
    878         status = U_ZERO_ERROR;
    879         len = m1->input().length();
    880         REGEX_ASSERT(m1->matches(len, status) == FALSE);
    881         REGEX_CHECK_STATUS;
    882 
    883         // Match beyond end of string should fail with an error.
    884         status = U_ZERO_ERROR;
    885         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
    886         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    887 
    888         // Successful match at end of string.
    889         {
    890             status = U_ZERO_ERROR;
    891             RegexMatcher m("A?", 0, status);  // will match zero length string.
    892             REGEX_CHECK_STATUS;
    893             m.reset(inStr1);
    894             len = inStr1.length();
    895             REGEX_ASSERT(m.matches(len, status) == TRUE);
    896             REGEX_CHECK_STATUS;
    897             m.reset(empty);
    898             REGEX_ASSERT(m.matches(0, status) == TRUE);
    899             REGEX_CHECK_STATUS;
    900         }
    901 
    902 
    903         //
    904         // lookingAt(pos, status)
    905         //
    906         status = U_ZERO_ERROR;
    907         m1->reset(instr2);  // "not abc"
    908         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
    909         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
    910         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
    911         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
    912         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
    913         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    914         status = U_ZERO_ERROR;
    915         len = m1->input().length();
    916         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);   // at the end of input: no match, but not an error
    917         REGEX_CHECK_STATUS;
    918         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
    919         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    920 
    921         delete m1;
    922         delete pat2;
    923     }
    924 
    925 
    926     //
    927     // Capture Group.
    928     //     RegexMatcher::start();
    929     //     RegexMatcher::end();
    930     //     RegexMatcher::groupCount();
    931     //
    932     {
    933         int32_t             flags=0;   // these three shadow the function-level flags/pe/status inside this sub-test
    934         UParseError         pe;
    935         UErrorCode          status=U_ZERO_ERROR;
    936 
    937         UnicodeString       re("01(23(45)67)(.*)");
    938         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    939         REGEX_CHECK_STATUS;
    940         UnicodeString data = "0123456789";
    941 
    942         RegexMatcher *matcher = pat->matcher(data, status);
    943         REGEX_CHECK_STATUS;
    944         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
    945         static const int32_t matchStarts[] = {0,  2, 4, 8};    // expected start index of groups 0..3
    946         static const int32_t matchEnds[]   = {10, 8, 6, 10};   // expected end index (one past the last char) of groups 0..3
    947         int32_t i;
    948         for (i=0; i<4; i++) {
    949             int32_t actualStart = matcher->start(i, status);
    950             REGEX_CHECK_STATUS;
    951             if (actualStart != matchStarts[i]) {
    952                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
    953                     __LINE__, i, matchStarts[i], actualStart);
    954             }
    955             int32_t actualEnd = matcher->end(i, status);
    956             REGEX_CHECK_STATUS;
    957             if (actualEnd != matchEnds[i]) {
    958                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
    959                     __LINE__, i, matchEnds[i], actualEnd);
    960             }
    961         }
    962         // Group 0 is the overall match, so start(0)/end(0) must agree with start()/end().
    963         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
    964         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
    965 
    966         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    967         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);   // the pattern has only capture groups 1..3
    968         matcher->reset();
    969         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);       // no match has been attempted since the reset()
    970 
    971         matcher->lookingAt(status);
    972         REGEX_ASSERT(matcher->group(status)    == "0123456789");
    973         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
    974         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
    975         REGEX_ASSERT(matcher->group(2, status) == "45"        );
    976         REGEX_ASSERT(matcher->group(3, status) == "89"        );
    977         REGEX_CHECK_STATUS;
    978         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    979         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
    980         matcher->reset();
    981         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
    982 
    983         delete matcher;
    984         delete pat;
    985 
    986     }
    987 
    988     //
    989     //  find
    990     //
    991     {
    992         int32_t             flags=0;
    993         UParseError         pe;
    994         UErrorCode          status=U_ZERO_ERROR;
    995 
    996         UnicodeString       re("abc");
    997         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    998         REGEX_CHECK_STATUS;
    999         UnicodeString data = ".abc..abc...abc..";
   1000         //                    012345678901234567
   1001 
   1002         RegexMatcher *matcher = pat->matcher(data, status);
   1003         REGEX_CHECK_STATUS;
   1004         REGEX_ASSERT(matcher->find());
   1005         REGEX_ASSERT(matcher->start(status) == 1);
   1006         REGEX_ASSERT(matcher->find());
   1007         REGEX_ASSERT(matcher->start(status) == 6);
   1008         REGEX_ASSERT(matcher->find());
   1009         REGEX_ASSERT(matcher->start(status) == 12);
   1010         REGEX_ASSERT(matcher->find() == FALSE);
   1011         REGEX_ASSERT(matcher->find() == FALSE);   // a further find() after the input is exhausted must keep returning FALSE
   1012 
   1013         matcher->reset();
   1014         REGEX_ASSERT(matcher->find());
   1015         REGEX_ASSERT(matcher->start(status) == 1);
   1016         // find(pos, status): the search starts at the given index of the input.
   1017         REGEX_ASSERT(matcher->find(0, status));
   1018         REGEX_ASSERT(matcher->start(status) == 1);
   1019         REGEX_ASSERT(matcher->find(1, status));
   1020         REGEX_ASSERT(matcher->start(status) == 1);
   1021         REGEX_ASSERT(matcher->find(2, status));
   1022         REGEX_ASSERT(matcher->start(status) == 6);
   1023         REGEX_ASSERT(matcher->find(12, status));
   1024         REGEX_ASSERT(matcher->start(status) == 12);
   1025         REGEX_ASSERT(matcher->find(13, status) == FALSE);
   1026         REGEX_ASSERT(matcher->find(16, status) == FALSE);
   1027         REGEX_ASSERT(matcher->find(17, status) == FALSE);   // 17 is the length of data: a legal start position with nothing left to find
   1028         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
   1029 
   1030         status = U_ZERO_ERROR;
   1031         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   1032         status = U_ZERO_ERROR;
   1033         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);   // one past the length of data
   1034 
   1035         REGEX_ASSERT(matcher->groupCount() == 0);
   1036 
   1037         delete matcher;
   1038         delete pat;
   1039     }
   1040 
   1041 
   1042     //
   1043     //  find, with \G in pattern (true if at the end of a previous match).
   1044     //
   1045     {
   1046         int32_t             flags=0;
   1047         UParseError         pe;
   1048         UErrorCode          status=U_ZERO_ERROR;
   1049 
   1050         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
   1051         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
   1052         REGEX_CHECK_STATUS;
   1053         UnicodeString data = ".abcabc.abc..";
   1054         //                    012345678901234567
   1055 
   1056         RegexMatcher *matcher = pat->matcher(data, status);
   1057         REGEX_CHECK_STATUS;
   1058         REGEX_ASSERT(matcher->find());
   1059         REGEX_ASSERT(matcher->start(status) == 0);
   1060         REGEX_ASSERT(matcher->start(1, status) == -1);   // group 1 (the \G alternative) did not participate in the first match
   1061         REGEX_ASSERT(matcher->start(2, status) == 1);    // the plain "abc" alternative matched at index 1
   1062 
   1063         REGEX_ASSERT(matcher->find());
   1064         REGEX_ASSERT(matcher->start(status) == 4);
   1065         REGEX_ASSERT(matcher->start(1, status) == 4);    // the second "abc" begins where the first match ended, so the \G alternative matches
   1066         REGEX_ASSERT(matcher->start(2, status) == -1);
   1067         REGEX_CHECK_STATUS;
   1068 
   1069         delete matcher;
   1070         delete pat;
   1071     }
   1072 
   1073     //
   1074     //   find with zero length matches, match position should bump ahead
   1075     //     to prevent loops.
   1076     //
   1077     {
   1078         int32_t                 i;
   1079         UErrorCode          status=U_ZERO_ERROR;
   1080         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will produce zero-length matches anywhere,
   1081                                                       //   using an always-true look-ahead.
   1082         REGEX_CHECK_STATUS;
   1083         UnicodeString s("    ");
   1084         m.reset(s);
   1085         for (i=0; ; i++) {
   1086             if (m.find() == FALSE) {
   1087                 break;
   1088             }
   1089             REGEX_ASSERT(m.start(status) == i);
   1090             REGEX_ASSERT(m.end(status) == i);
   1091         }
   1092         REGEX_ASSERT(i==5);   // one empty match at each of the positions 0..4 of the four-space string
   1093 
   1094         // Check that the bump goes over surrogate pairs OK
   1095         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
   1096         s = s.unescape();
   1097         m.reset(s);
   1098         for (i=0; ; i+=2) {
   1099             if (m.find() == FALSE) {
   1100                 break;
   1101             }
   1102             REGEX_ASSERT(m.start(status) == i);
   1103             REGEX_ASSERT(m.end(status) == i);
   1104         }
   1105         REGEX_ASSERT(i==10);   // matches expected at 0, 2, 4, 6 and 8: each step of 2 skips one whole surrogate pair
   1106     }
   1107     {
   1108         // find() loop breaking test.
   1109         //        with pattern of /.?/, should see a series of one char matches, then a single
   1110         //        match of zero length at the end of the input string.
   1111         int32_t                 i;
   1112         UErrorCode          status=U_ZERO_ERROR;
   1113         RegexMatcher        m(".?", 0, status);
   1114         REGEX_CHECK_STATUS;
   1115         UnicodeString s("    ");
   1116         m.reset(s);
   1117         for (i=0; ; i++) {
   1118             if (m.find() == FALSE) {
   1119                 break;
   1120             }
   1121             REGEX_ASSERT(m.start(status) == i);
   1122             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));   // i == 4 is the final, empty match at the end of the input
   1123         }
   1124         REGEX_ASSERT(i==5);
   1125     }
   1126 
   1127 
   1128     //
   1129     // Matchers with no input string behave as if they had an empty input string.
   1130     //
   1131 
   1132     {
   1133         UErrorCode status = U_ZERO_ERROR;
   1134         RegexMatcher  m(".?", 0, status);
   1135         REGEX_CHECK_STATUS;
   1136         REGEX_ASSERT(m.find());
   1137         REGEX_ASSERT(m.start(status) == 0);
   1138         REGEX_ASSERT(m.input() == "");
   1139     }
   1140     {
   1141         UErrorCode status = U_ZERO_ERROR;
   1142         RegexPattern  *p = RegexPattern::compile(".", 0, status);
   1143         RegexMatcher  *m = p->matcher(status);   // NOTE(review): p is dereferenced before status is checked; verify compile() cannot return NULL here
   1144         REGEX_CHECK_STATUS;
   1145 
   1146         REGEX_ASSERT(m->find() == FALSE);
   1147         REGEX_ASSERT(m->input() == "");
   1148         delete m;
   1149         delete p;
   1150     }
   1151 
   1152     //
   1153     // Regions
   1154     //
   1155     {
   1156         UErrorCode status = U_ZERO_ERROR;
   1157         UnicodeString testString("This is test data");
   1158         RegexMatcher m(".*", testString,  0, status);
   1159         REGEX_CHECK_STATUS;
   1160         REGEX_ASSERT(m.regionStart() == 0);
   1161         REGEX_ASSERT(m.regionEnd() == testString.length());
   1162         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1163         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1164         // Restrict matching to [2, 4): matches() must then cover exactly that range.
   1165         m.region(2,4, status);
   1166         REGEX_CHECK_STATUS;
   1167         REGEX_ASSERT(m.matches(status));
   1168         REGEX_ASSERT(m.start(status)==2);
   1169         REGEX_ASSERT(m.end(status)==4);
   1170         REGEX_CHECK_STATUS;
   1171         // reset() with no arguments is expected to restore the region to the whole input.
   1172         m.reset();
   1173         REGEX_ASSERT(m.regionStart() == 0);
   1174         REGEX_ASSERT(m.regionEnd() == testString.length());
   1175 
   1176         UnicodeString shorterString("short");
   1177         m.reset(shorterString);
   1178         REGEX_ASSERT(m.regionStart() == 0);
   1179         REGEX_ASSERT(m.regionEnd() == shorterString.length());
   1180         // Each use...Bounds() setter and reset() must return the matcher itself (compared by address), and the chosen setting must survive reset().
   1181         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1182         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
   1183         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   1184         REGEX_ASSERT(&m == &m.reset());
   1185         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   1186 
   1187         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
   1188         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1189         REGEX_ASSERT(&m == &m.reset());
   1190         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1191 
   1192         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1193         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
   1194         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   1195         REGEX_ASSERT(&m == &m.reset());
   1196         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   1197 
   1198         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
   1199         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1200         REGEX_ASSERT(&m == &m.reset());
   1201         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1202 
   1203     }
   1204 
   1205     //
   1206     // hitEnd() and requireEnd()
   1207     //
   1208     {
   1209         UErrorCode status = U_ZERO_ERROR;
   1210         UnicodeString testString("aabb");
   1211         RegexMatcher m1(".*", testString,  0, status);
   1212         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
   1213         REGEX_ASSERT(m1.hitEnd() == TRUE);
   1214         REGEX_ASSERT(m1.requireEnd() == FALSE);
   1215         REGEX_CHECK_STATUS;
   1216         // "a*" stops at the first 'b' of "aabb", so the end of input is expected not to have been hit.
   1217         status = U_ZERO_ERROR;
   1218         RegexMatcher m2("a*", testString, 0, status);
   1219         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
   1220         REGEX_ASSERT(m2.hitEnd() == FALSE);
   1221         REGEX_ASSERT(m2.requireEnd() == FALSE);
   1222         REGEX_CHECK_STATUS;
   1223         // With a trailing '$' in the pattern, requireEnd() is expected to report TRUE as well.
   1224         status = U_ZERO_ERROR;
   1225         RegexMatcher m3(".*$", testString, 0, status);
   1226         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
   1227         REGEX_ASSERT(m3.hitEnd() == TRUE);
   1228         REGEX_ASSERT(m3.requireEnd() == TRUE);
   1229         REGEX_CHECK_STATUS;
   1230     }
   1231 
   1232 
   1233     //
   1234     // Compilation error on reset with UChar *
   1235     //   These were a hazard that people were stumbling over with runtime errors.
   1236     //   Changed them to compiler errors by adding private methods that more closely
   1237     //   matched the incorrect use of the functions.
   1238     //
   1239 #if 0
   1240     {
   1241         UErrorCode status = U_ZERO_ERROR;
   1242         UChar ucharString[20];
   1243         RegexMatcher m(".", 0, status);
   1244         m.reset(ucharString);  // should not compile.
   1245 
   1246         RegexPattern *p = RegexPattern::compile(".", 0, status);
   1247         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
   1248 
   1249         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
   1250     }
   1251 #endif
   1252 
   1253     //
   1254     //  Time Outs.
   1255     //       Note:  These tests will need to be changed when the regexp engine is
   1256     //              able to detect and cut short the exponential time behavior on
   1257     //              this type of match.
   1258     //
   1259     {
   1260         UErrorCode status = U_ZERO_ERROR;
   1261         //    Enough 'a's in the string to cause the match to time out.
   1262         //       (Each additional 'a' doubles the time)
   1263         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
   1264         RegexMatcher matcher("(a+)+b", testString, 0, status);
   1265         REGEX_CHECK_STATUS;
   1266         REGEX_ASSERT(matcher.getTimeLimit() == 0);   // a new matcher is expected to report a time limit of 0
   1267         matcher.setTimeLimit(100, status);
   1268         REGEX_ASSERT(matcher.getTimeLimit() == 100);
   1269         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1270         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
   1271     }
   1272     {
   1273         UErrorCode status = U_ZERO_ERROR;
   1274         //   Few enough 'a's to slip in under the time limit.
   1275         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
   1276         RegexMatcher matcher("(a+)+b", testString, 0, status);
   1277         REGEX_CHECK_STATUS;
   1278         matcher.setTimeLimit(100, status);
   1279         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);   // no 'b' in the input, so the match fails, but without a time-out error
   1280         REGEX_CHECK_STATUS;
   1281     }
   1282 
   1283     //
   1284     //  Stack Limits
   1285     //
   1286     {
   1287         UErrorCode status = U_ZERO_ERROR;
   1288         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
   1289 
   1290         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
   1291         //   of the '+', and makes the stack frames larger.
   1292         RegexMatcher matcher("(A)+A$", testString, 0, status);
   1293 
   1294         // With the default stack, this match should fail to run
   1295         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1296         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
   1297 
   1298         // With unlimited stack, it should run
   1299         status = U_ZERO_ERROR;
   1300         matcher.setStackLimit(0, status);
   1301         REGEX_CHECK_STATUS;
   1302         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
   1303         REGEX_CHECK_STATUS;
   1304         REGEX_ASSERT(matcher.getStackLimit() == 0);
   1305 
   1306         // With a limited stack, the match should fail
   1307         status = U_ZERO_ERROR;
   1308         matcher.setStackLimit(10000, status);
   1309         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1310         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
   1311         REGEX_ASSERT(matcher.getStackLimit() == 10000);
   1312     }
   1313 
   1314         // A pattern that doesn't save state should work with
   1315         //   a minimal sized stack
   1316     {
   1317         UErrorCode status = U_ZERO_ERROR;
   1318         UnicodeString testString = "abc";
   1319         RegexMatcher matcher("abc", testString, 0, status);
   1320         REGEX_CHECK_STATUS;
   1321         matcher.setStackLimit(30, status);
   1322         REGEX_CHECK_STATUS;
   1323         REGEX_ASSERT(matcher.matches(status) == TRUE);
   1324         REGEX_CHECK_STATUS;
   1325         REGEX_ASSERT(matcher.getStackLimit() == 30);
   1326 
   1327         // Negative stack sizes should fail, leaving the previously set limit (1000) in place
   1328         status = U_ZERO_ERROR;
   1329         matcher.setStackLimit(1000, status);
   1330         REGEX_CHECK_STATUS;
   1331         matcher.setStackLimit(-1, status);
   1332         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   1333         REGEX_ASSERT(matcher.getStackLimit() == 1000);
   1334     }
   1335 
   1336 
   1337 }
   1338 
   1339 
   1340 
   1341 
   1342 
   1343 
   1344 //---------------------------------------------------------------------------
   1345 //
   1346 //      API_Replace        API test for class RegexMatcher, testing the
   1347 //                         Replace family of functions.
   1348 //                         Covers replaceFirst(), replaceAll(), $n group references and hex escapes in the replacement text, and appendReplacement()/appendTail().
   1349 //---------------------------------------------------------------------------
   1350 void RegexTest::API_Replace() {
   1351     //
   1352     //  Replace
   1353     //
   1354     int32_t             flags=0;
   1355     UParseError         pe;
   1356     UErrorCode          status=U_ZERO_ERROR;
   1357 
   1358     UnicodeString       re("abc");
   1359     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
   1360     REGEX_CHECK_STATUS;
   1361     UnicodeString data = ".abc..abc...abc..";
   1362     //                    012345678901234567
   1363     RegexMatcher *matcher = pat->matcher(data, status);   // NOTE(review): status is not checked until after the first replaceFirst() call below
   1364 
   1365     //
   1366     //  Plain vanilla matches.
   1367     //
   1368     UnicodeString  dest;
   1369     dest = matcher->replaceFirst("yz", status);
   1370     REGEX_CHECK_STATUS;
   1371     REGEX_ASSERT(dest == ".yz..abc...abc..");
   1372 
   1373     dest = matcher->replaceAll("yz", status);
   1374     REGEX_CHECK_STATUS;
   1375     REGEX_ASSERT(dest == ".yz..yz...yz..");
   1376 
   1377     //
   1378     //  Plain vanilla non-matches.
   1379     //
   1380     UnicodeString d2 = ".abx..abx...abx..";
   1381     matcher->reset(d2);
   1382     dest = matcher->replaceFirst("yz", status);
   1383     REGEX_CHECK_STATUS;
   1384     REGEX_ASSERT(dest == ".abx..abx...abx..");   // nothing matched: the result equals the input
   1385 
   1386     dest = matcher->replaceAll("yz", status);
   1387     REGEX_CHECK_STATUS;
   1388     REGEX_ASSERT(dest == ".abx..abx...abx..");
   1389 
   1390     //
   1391     // Empty source string
   1392     //
   1393     UnicodeString d3 = "";
   1394     matcher->reset(d3);
   1395     dest = matcher->replaceFirst("yz", status);
   1396     REGEX_CHECK_STATUS;
   1397     REGEX_ASSERT(dest == "");
   1398 
   1399     dest = matcher->replaceAll("yz", status);
   1400     REGEX_CHECK_STATUS;
   1401     REGEX_ASSERT(dest == "");
   1402 
   1403     //
   1404     // Empty substitution string
   1405     //
   1406     matcher->reset(data);              // ".abc..abc...abc.."
   1407     dest = matcher->replaceFirst("", status);
   1408     REGEX_CHECK_STATUS;
   1409     REGEX_ASSERT(dest == "...abc...abc..");
   1410 
   1411     dest = matcher->replaceAll("", status);
   1412     REGEX_CHECK_STATUS;
   1413     REGEX_ASSERT(dest == "........");   // only the eight '.' characters of the input remain
   1414 
   1415     //
   1416     // match whole string
   1417     //
   1418     UnicodeString d4 = "abc";
   1419     matcher->reset(d4);
   1420     dest = matcher->replaceFirst("xyz", status);
   1421     REGEX_CHECK_STATUS;
   1422     REGEX_ASSERT(dest == "xyz");
   1423 
   1424     dest = matcher->replaceAll("xyz", status);
   1425     REGEX_CHECK_STATUS;
   1426     REGEX_ASSERT(dest == "xyz");
   1427 
   1428     //
   1429     // Capture Group, simple case
   1430     //
   1431     UnicodeString       re2("a(..)");
   1432     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
   1433     REGEX_CHECK_STATUS;
   1434     UnicodeString d5 = "abcdefg";
   1435     RegexMatcher *matcher2 = pat2->matcher(d5, status);
   1436     REGEX_CHECK_STATUS;
   1437     dest = matcher2->replaceFirst("$1$1", status);
   1438     REGEX_CHECK_STATUS;
   1439     REGEX_ASSERT(dest == "bcbcdefg");   // group 1 is "bc"; the match "abc" is replaced by the group text twice
   1440 
   1441     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
   1442     REGEX_CHECK_STATUS;
   1443     REGEX_ASSERT(dest == "The value of $1 is bc.defg");   // the backslash-escaped "$1" stays literal; the unescaped one is substituted
   1444 
   1445     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
   1446     REGEX_ASSERT(U_FAILURE(status));   // a '$' that is not followed by a group number is expected to be rejected
   1447     status = U_ZERO_ERROR;
   1448 
   1449     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
   1450     replacement = replacement.unescape();
   1451     dest = matcher2->replaceFirst(replacement, status);
   1452     REGEX_CHECK_STATUS;
   1453     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");   // the supplementary character after '$' is expected to act as group number 1
   1454 
   1455     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);   // pattern "a(..)" has a single capture group
   1456 
   1457 
   1458     //
   1459     // Replacement String with \u hex escapes
   1460     //
   1461     {
   1462         UnicodeString  src = "abc 1 abc 2 abc 3";
   1463         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
   1464         matcher->reset(src);
   1465         UnicodeString  result = matcher->replaceAll(substitute, status);
   1466         REGEX_CHECK_STATUS;
   1467         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
   1468     }
   1469     {
   1470         UnicodeString  src = "abc !";
   1471         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
   1472         matcher->reset(src);
   1473         UnicodeString  result = matcher->replaceAll(substitute, status);
   1474         REGEX_CHECK_STATUS;
   1475         UnicodeString expected = UnicodeString("--");
   1476         expected.append((UChar32)0x10000);
   1477         expected.append("-- !");
   1478         REGEX_ASSERT(result == expected);
   1479     }
   1480     // TODO:  need more thorough testing of capture substitutions.
   1481 
   1482     // Bug 4057
   1483     //
   1484     {
   1485         status = U_ZERO_ERROR;
   1486         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
   1487         RegexMatcher m("ss(.*?)ee", 0, status);
   1488         REGEX_CHECK_STATUS;
   1489         UnicodeString result;
   1490 
   1491         // Multiple finds do NOT bump up the previous appendReplacement position.
   1492         m.reset(s);
   1493         m.find();
   1494         m.find();
   1495         m.appendReplacement(result, "ooh", status);
   1496         REGEX_CHECK_STATUS;
   1497         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");   // text before the second match is copied unchanged, including the first match
   1498 
   1499         // After a reset into the interior of a string, appendReplacement still starts at beginning.
   1500         status = U_ZERO_ERROR;
   1501         result.truncate(0);
   1502         m.reset(10, status);
   1503         m.find();
   1504         m.find();
   1505         m.appendReplacement(result, "ooh", status);
   1506         REGEX_CHECK_STATUS;
   1507         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1508 
   1509         // find() at interior of string, appendReplacement still starts at beginning.
   1510         status = U_ZERO_ERROR;
   1511         result.truncate(0);
   1512         m.reset();
   1513         m.find(10, status);
   1514         m.find();
   1515         m.appendReplacement(result, "ooh", status);
   1516         REGEX_CHECK_STATUS;
   1517         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1518 
   1519         m.appendTail(result);   // appends the input that follows the last match (" fin")
   1520         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
   1521 
   1522     }
   1523 
   1524     delete matcher2;
   1525     delete pat2;
   1526     delete matcher;
   1527     delete pat;
   1528 }
   1529 
   1530 
   1531 //---------------------------------------------------------------------------
   1532 //
   1533 //      API_Pattern       Test that the API for class RegexPattern is
   1534 //                        present and nominally working.
   1535 //
   1536 //---------------------------------------------------------------------------
   1537 void RegexTest::API_Pattern() {
   1538     RegexPattern        pata;    // Test default constructor to not crash.
   1539     RegexPattern        patb;
   1540 
   1541     REGEX_ASSERT(pata == patb);
   1542     REGEX_ASSERT(pata == pata);
   1543 
   1544     UnicodeString re1("abc[a-l][m-z]");
   1545     UnicodeString re2("def");
   1546     UErrorCode    status = U_ZERO_ERROR;
   1547     UParseError   pe;
   1548 
   1549     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
   1550     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
   1551     REGEX_CHECK_STATUS;
   1552     REGEX_ASSERT(*pat1 == *pat1);
   1553     REGEX_ASSERT(*pat1 != pata);
   1554 
   1555     // Assign
   1556     patb = *pat1;
   1557     REGEX_ASSERT(patb == *pat1);
   1558 
   1559     // Copy Construct
   1560     RegexPattern patc(*pat1);
   1561     REGEX_ASSERT(patc == *pat1);
   1562     REGEX_ASSERT(patb == patc);
   1563     REGEX_ASSERT(pat1 != pat2);
   1564     patb = *pat2;
   1565     REGEX_ASSERT(patb != patc);
   1566     REGEX_ASSERT(patb == *pat2);
   1567 
   1568     // Compile with no flags.
   1569     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
   1570     REGEX_ASSERT(*pat1a == *pat1);
   1571 
   1572     REGEX_ASSERT(pat1a->flags() == 0);
   1573 
   1574     // Compile with different flags should be not equal
   1575     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
   1576     REGEX_CHECK_STATUS;
   1577 
   1578     REGEX_ASSERT(*pat1b != *pat1a);
   1579     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
   1580     REGEX_ASSERT(pat1a->flags() == 0);
   1581     delete pat1b;
   1582 
   1583     // clone
   1584     RegexPattern *pat1c = pat1->clone();
   1585     REGEX_ASSERT(*pat1c == *pat1);
   1586     REGEX_ASSERT(*pat1c != *pat2);
   1587 
   1588     delete pat1c;
   1589     delete pat1a;
   1590     delete pat1;
   1591     delete pat2;
   1592 
   1593 
   1594     //
   1595     //   Verify that a matcher created from a cloned pattern works.
   1596     //     (Jitterbug 3423)
   1597     //
   1598     {
   1599         UErrorCode     status     = U_ZERO_ERROR;
   1600         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
   1601         RegexPattern  *pClone     = pSource->clone();
   1602         delete         pSource;
   1603         RegexMatcher  *mFromClone = pClone->matcher(status);
   1604         REGEX_CHECK_STATUS;
   1605         UnicodeString s = "Hello World";
   1606         mFromClone->reset(s);
   1607         REGEX_ASSERT(mFromClone->find() == TRUE);
   1608         REGEX_ASSERT(mFromClone->group(status) == "Hello");
   1609         REGEX_ASSERT(mFromClone->find() == TRUE);
   1610         REGEX_ASSERT(mFromClone->group(status) == "World");
   1611         REGEX_ASSERT(mFromClone->find() == FALSE);
   1612         delete mFromClone;
   1613         delete pClone;
   1614     }
   1615 
   1616     //
   1617     //   matches convenience API
   1618     //
   1619     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
   1620     REGEX_CHECK_STATUS;
   1621     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
   1622     REGEX_CHECK_STATUS;
   1623     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
   1624     REGEX_CHECK_STATUS;
   1625     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
   1626     REGEX_CHECK_STATUS;
   1627     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
   1628     REGEX_CHECK_STATUS;
   1629     status = U_INDEX_OUTOFBOUNDS_ERROR;
   1630     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
   1631     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1632 
   1633 
   1634     //
   1635     // Split()
   1636     //
   1637     status = U_ZERO_ERROR;
   1638     pat1 = RegexPattern::compile(" +",  pe, status);
   1639     REGEX_CHECK_STATUS;
   1640     UnicodeString  fields[10];
   1641 
   1642     int32_t n;
   1643     n = pat1->split("Now is the time", fields, 10, status);
   1644     REGEX_CHECK_STATUS;
   1645     REGEX_ASSERT(n==4);
   1646     REGEX_ASSERT(fields[0]=="Now");
   1647     REGEX_ASSERT(fields[1]=="is");
   1648     REGEX_ASSERT(fields[2]=="the");
   1649     REGEX_ASSERT(fields[3]=="time");
   1650     REGEX_ASSERT(fields[4]=="");
   1651 
   1652     n = pat1->split("Now is the time", fields, 2, status);
   1653     REGEX_CHECK_STATUS;
   1654     REGEX_ASSERT(n==2);
   1655     REGEX_ASSERT(fields[0]=="Now");
   1656     REGEX_ASSERT(fields[1]=="is the time");
   1657     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
   1658 
   1659     fields[1] = "*";
   1660     status = U_ZERO_ERROR;
   1661     n = pat1->split("Now is the time", fields, 1, status);
   1662     REGEX_CHECK_STATUS;
   1663     REGEX_ASSERT(n==1);
   1664     REGEX_ASSERT(fields[0]=="Now is the time");
   1665     REGEX_ASSERT(fields[1]=="*");
   1666     status = U_ZERO_ERROR;
   1667 
   1668     n = pat1->split("    Now       is the time   ", fields, 10, status);
   1669     REGEX_CHECK_STATUS;
   1670     REGEX_ASSERT(n==6);
   1671     REGEX_ASSERT(fields[0]=="");
   1672     REGEX_ASSERT(fields[1]=="Now");
   1673     REGEX_ASSERT(fields[2]=="is");
   1674     REGEX_ASSERT(fields[3]=="the");
   1675     REGEX_ASSERT(fields[4]=="time");
   1676     REGEX_ASSERT(fields[5]=="");
   1677 
   1678     n = pat1->split("     ", fields, 10, status);
   1679     REGEX_CHECK_STATUS;
   1680     REGEX_ASSERT(n==2);
   1681     REGEX_ASSERT(fields[0]=="");
   1682     REGEX_ASSERT(fields[1]=="");
   1683 
   1684     fields[0] = "foo";
   1685     n = pat1->split("", fields, 10, status);
   1686     REGEX_CHECK_STATUS;
   1687     REGEX_ASSERT(n==0);
   1688     REGEX_ASSERT(fields[0]=="foo");
   1689 
   1690     delete pat1;
   1691 
   1692     //  split, with a pattern with (capture)
   1693     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
   1694     REGEX_CHECK_STATUS;
   1695 
   1696     status = U_ZERO_ERROR;
   1697     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
   1698     REGEX_CHECK_STATUS;
   1699     REGEX_ASSERT(n==7);
   1700     REGEX_ASSERT(fields[0]=="");
   1701     REGEX_ASSERT(fields[1]=="a");
   1702     REGEX_ASSERT(fields[2]=="Now is ");
   1703     REGEX_ASSERT(fields[3]=="b");
   1704     REGEX_ASSERT(fields[4]=="the time");
   1705     REGEX_ASSERT(fields[5]=="c");
   1706     REGEX_ASSERT(fields[6]=="");
   1707     REGEX_ASSERT(status==U_ZERO_ERROR);
   1708 
   1709     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
   1710     REGEX_CHECK_STATUS;
   1711     REGEX_ASSERT(n==7);
   1712     REGEX_ASSERT(fields[0]=="  ");
   1713     REGEX_ASSERT(fields[1]=="a");
   1714     REGEX_ASSERT(fields[2]=="Now is ");
   1715     REGEX_ASSERT(fields[3]=="b");
   1716     REGEX_ASSERT(fields[4]=="the time");
   1717     REGEX_ASSERT(fields[5]=="c");
   1718     REGEX_ASSERT(fields[6]=="");
   1719 
   1720     status = U_ZERO_ERROR;
   1721     fields[6] = "foo";
   1722     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
   1723     REGEX_CHECK_STATUS;
   1724     REGEX_ASSERT(n==6);
   1725     REGEX_ASSERT(fields[0]=="  ");
   1726     REGEX_ASSERT(fields[1]=="a");
   1727     REGEX_ASSERT(fields[2]=="Now is ");
   1728     REGEX_ASSERT(fields[3]=="b");
   1729     REGEX_ASSERT(fields[4]=="the time");
   1730     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
   1731     REGEX_ASSERT(fields[6]=="foo");
   1732 
   1733     status = U_ZERO_ERROR;
   1734     fields[5] = "foo";
   1735     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
   1736     REGEX_CHECK_STATUS;
   1737     REGEX_ASSERT(n==5);
   1738     REGEX_ASSERT(fields[0]=="  ");
   1739     REGEX_ASSERT(fields[1]=="a");
   1740     REGEX_ASSERT(fields[2]=="Now is ");
   1741     REGEX_ASSERT(fields[3]=="b");
   1742     REGEX_ASSERT(fields[4]=="the time<c>");
   1743     REGEX_ASSERT(fields[5]=="foo");
   1744 
   1745     status = U_ZERO_ERROR;
   1746     fields[5] = "foo";
   1747     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
   1748     REGEX_CHECK_STATUS;
   1749     REGEX_ASSERT(n==5);
   1750     REGEX_ASSERT(fields[0]=="  ");
   1751     REGEX_ASSERT(fields[1]=="a");
   1752     REGEX_ASSERT(fields[2]=="Now is ");
   1753     REGEX_ASSERT(fields[3]=="b");
   1754     REGEX_ASSERT(fields[4]=="the time");
   1755     REGEX_ASSERT(fields[5]=="foo");
   1756 
   1757     status = U_ZERO_ERROR;
   1758     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
   1759     REGEX_CHECK_STATUS;
   1760     REGEX_ASSERT(n==4);
   1761     REGEX_ASSERT(fields[0]=="  ");
   1762     REGEX_ASSERT(fields[1]=="a");
   1763     REGEX_ASSERT(fields[2]=="Now is ");
   1764     REGEX_ASSERT(fields[3]=="the time<c>");
   1765     status = U_ZERO_ERROR;
   1766     delete pat1;
   1767 
   1768     pat1 = RegexPattern::compile("([-,])",  pe, status);
   1769     REGEX_CHECK_STATUS;
   1770     n = pat1->split("1-10,20", fields, 10, status);
   1771     REGEX_CHECK_STATUS;
   1772     REGEX_ASSERT(n==5);
   1773     REGEX_ASSERT(fields[0]=="1");
   1774     REGEX_ASSERT(fields[1]=="-");
   1775     REGEX_ASSERT(fields[2]=="10");
   1776     REGEX_ASSERT(fields[3]==",");
   1777     REGEX_ASSERT(fields[4]=="20");
   1778     delete pat1;
   1779 
   1780     // Test split of string with empty trailing fields
   1781     pat1 = RegexPattern::compile(",", pe, status);
   1782     REGEX_CHECK_STATUS;
   1783     n = pat1->split("a,b,c,", fields, 10, status);
   1784     REGEX_CHECK_STATUS;
   1785     REGEX_ASSERT(n==4);
   1786     REGEX_ASSERT(fields[0]=="a");
   1787     REGEX_ASSERT(fields[1]=="b");
   1788     REGEX_ASSERT(fields[2]=="c");
   1789     REGEX_ASSERT(fields[3]=="");
   1790 
   1791     n = pat1->split("a,,,", fields, 10, status);
   1792     REGEX_CHECK_STATUS;
   1793     REGEX_ASSERT(n==4);
   1794     REGEX_ASSERT(fields[0]=="a");
   1795     REGEX_ASSERT(fields[1]=="");
   1796     REGEX_ASSERT(fields[2]=="");
   1797     REGEX_ASSERT(fields[3]=="");
   1798     delete pat1;
   1799 
   1800     // Split Separator with zero length match.
   1801     pat1 = RegexPattern::compile(":?", pe, status);
   1802     REGEX_CHECK_STATUS;
   1803     n = pat1->split("abc", fields, 10, status);
   1804     REGEX_CHECK_STATUS;
   1805     REGEX_ASSERT(n==5);
   1806     REGEX_ASSERT(fields[0]=="");
   1807     REGEX_ASSERT(fields[1]=="a");
   1808     REGEX_ASSERT(fields[2]=="b");
   1809     REGEX_ASSERT(fields[3]=="c");
   1810     REGEX_ASSERT(fields[4]=="");
   1811 
   1812     delete pat1;
   1813 
   1814     //
   1815     // RegexPattern::pattern()
   1816     //
   1817     pat1 = new RegexPattern();
   1818     REGEX_ASSERT(pat1->pattern() == "");
   1819     delete pat1;
   1820 
   1821     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
   1822     REGEX_CHECK_STATUS;
   1823     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
   1824     delete pat1;
   1825 
   1826 
   1827     //
   1828     // classID functions
   1829     //
   1830     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
   1831     REGEX_CHECK_STATUS;
   1832     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
   1833     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
   1834     UnicodeString Hello("Hello, world.");
   1835     RegexMatcher *m = pat1->matcher(Hello, status);
   1836     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
   1837     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
   1838     REGEX_ASSERT(m->getDynamicClassID() != NULL);
   1839     delete m;
   1840     delete pat1;
   1841 
   1842 }
   1843 
   1844 //---------------------------------------------------------------------------
   1845 //
   1846 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
   1847 //                       is present and working, but excluding functions
   1848 //                       implementing replace operations.
   1849 //
   1850 //---------------------------------------------------------------------------
   1851 void RegexTest::API_Match_UTF8() {
   1852     UParseError         pe;
   1853     UErrorCode          status=U_ZERO_ERROR;
   1854     int32_t             flags = 0;
   1855 
   1856     //
   1857     // Debug - slide failing test cases early
   1858     //
   1859 #if 0
   1860     {
   1861     }
   1862     return;
   1863 #endif
   1864 
   1865     //
   1866     // Simple pattern compilation
   1867     //
   1868     {
   1869         UText               re = UTEXT_INITIALIZER;
   1870         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
   1871         REGEX_VERBOSE_TEXT(&re);
   1872         RegexPattern        *pat2;
   1873         pat2 = RegexPattern::compile(&re, flags, pe, status);
   1874         REGEX_CHECK_STATUS;
   1875 
   1876         UText input1 = UTEXT_INITIALIZER;
   1877         UText input2 = UTEXT_INITIALIZER;
   1878         UText empty  = UTEXT_INITIALIZER;
   1879         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
   1880         REGEX_VERBOSE_TEXT(&input1);
   1881         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
   1882         REGEX_VERBOSE_TEXT(&input2);
   1883         utext_openUChars(&empty, NULL, 0, &status);
   1884 
   1885         int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
   1886         int32_t input2Len = strlen("not abc");
   1887 
   1888 
   1889         //
   1890         // Matcher creation and reset.
   1891         //
   1892         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
   1893         REGEX_CHECK_STATUS;
   1894         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1895         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
   1896         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1897         m1->reset(&input2);
   1898         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
   1899         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
   1900         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
   1901         m1->reset(&input1);
   1902         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1903         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1904         m1->reset(&empty);
   1905         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
   1906         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
   1907 
   1908         //
   1909         //  reset(pos, status)
   1910         //
   1911         m1->reset(&input1);
   1912         m1->reset(4, status);
   1913         REGEX_CHECK_STATUS;
   1914         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1915         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1916 
   1917         m1->reset(-1, status);
   1918         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1919         status = U_ZERO_ERROR;
   1920 
   1921         m1->reset(0, status);
   1922         REGEX_CHECK_STATUS;
   1923         status = U_ZERO_ERROR;
   1924 
   1925         m1->reset(input1Len-1, status);
   1926         REGEX_CHECK_STATUS;
   1927         status = U_ZERO_ERROR;
   1928 
   1929         m1->reset(input1Len, status);
   1930         REGEX_CHECK_STATUS;
   1931         status = U_ZERO_ERROR;
   1932 
   1933         m1->reset(input1Len+1, status);
   1934         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1935         status = U_ZERO_ERROR;
   1936 
   1937         //
   1938         // match(pos, status)
   1939         //
   1940         m1->reset(&input2);
   1941         REGEX_ASSERT(m1->matches(4, status) == TRUE);
   1942         m1->reset();
   1943         REGEX_ASSERT(m1->matches(3, status) == FALSE);
   1944         m1->reset();
   1945         REGEX_ASSERT(m1->matches(5, status) == FALSE);
   1946         REGEX_ASSERT(m1->matches(4, status) == TRUE);
   1947         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
   1948         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1949 
   1950         // Match() at end of string should fail, but should not
   1951         //  be an error.
   1952         status = U_ZERO_ERROR;
   1953         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
   1954         REGEX_CHECK_STATUS;
   1955 
   1956         // Match beyond end of string should fail with an error.
   1957         status = U_ZERO_ERROR;
   1958         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
   1959         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1960 
   1961         // Successful match at end of string.
   1962         {
   1963             status = U_ZERO_ERROR;
   1964             RegexMatcher m("A?", 0, status);  // will match zero length string.
   1965             REGEX_CHECK_STATUS;
   1966             m.reset(&input1);
   1967             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
   1968             REGEX_CHECK_STATUS;
   1969             m.reset(&empty);
   1970             REGEX_ASSERT(m.matches(0, status) == TRUE);
   1971             REGEX_CHECK_STATUS;
   1972         }
   1973 
   1974 
   1975         //
   1976         // lookingAt(pos, status)
   1977         //
   1978         status = U_ZERO_ERROR;
   1979         m1->reset(&input2);  // "not abc"
   1980         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
   1981         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
   1982         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
   1983         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
   1984         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
   1985         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1986         status = U_ZERO_ERROR;
   1987         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
   1988         REGEX_CHECK_STATUS;
   1989         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
   1990         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1991 
   1992         delete m1;
   1993         delete pat2;
   1994 
   1995         utext_close(&re);
   1996         utext_close(&input1);
   1997         utext_close(&input2);
   1998         utext_close(&empty);
   1999     }
   2000 
   2001 
   2002     //
   2003     // Capture Group.
   2004     //     RegexMatcher::start();
   2005     //     RegexMatcher::end();
   2006     //     RegexMatcher::groupCount();
   2007     //
   2008     {
   2009         int32_t             flags=0;
   2010         UParseError         pe;
   2011         UErrorCode          status=U_ZERO_ERROR;
   2012         UText               re=UTEXT_INITIALIZER;
   2013         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
   2014         utext_openUTF8(&re, str_01234567_pat, -1, &status);
   2015 
   2016         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2017         REGEX_CHECK_STATUS;
   2018 
   2019         UText input = UTEXT_INITIALIZER;
   2020         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
   2021         utext_openUTF8(&input, str_0123456789, -1, &status);
   2022 
   2023         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   2024         REGEX_CHECK_STATUS;
   2025         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
   2026         static const int32_t matchStarts[] = {0,  2, 4, 8};
   2027         static const int32_t matchEnds[]   = {10, 8, 6, 10};
   2028         int32_t i;
   2029         for (i=0; i<4; i++) {
   2030             int32_t actualStart = matcher->start(i, status);
   2031             REGEX_CHECK_STATUS;
   2032             if (actualStart != matchStarts[i]) {
   2033                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
   2034                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
   2035             }
   2036             int32_t actualEnd = matcher->end(i, status);
   2037             REGEX_CHECK_STATUS;
   2038             if (actualEnd != matchEnds[i]) {
   2039                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
   2040                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
   2041             }
   2042         }
   2043 
   2044         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
   2045         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
   2046 
   2047         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2048         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2049         matcher->reset();
   2050         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
   2051 
   2052         matcher->lookingAt(status);
   2053 
   2054         UnicodeString dest;
   2055         UText destText = UTEXT_INITIALIZER;
   2056         utext_openUnicodeString(&destText, &dest, &status);
   2057         UText *result;
   2058         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
   2059         //  Test shallow-clone API
   2060         int64_t   group_len;
   2061         result = matcher->group((UText *)NULL, group_len, status);
   2062         REGEX_CHECK_STATUS;
   2063         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2064         utext_close(result);
   2065         result = matcher->group(0, &destText, group_len, status);
   2066         REGEX_CHECK_STATUS;
   2067         REGEX_ASSERT(result == &destText);
   2068         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2069         //  destText is now immutable, reopen it
   2070         utext_close(&destText);
   2071         utext_openUnicodeString(&destText, &dest, &status);
   2072 
   2073         int64_t length;
   2074         result = matcher->group(0, NULL, length, status);
   2075         REGEX_CHECK_STATUS;
   2076         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2077         utext_close(result);
   2078         result = matcher->group(0, &destText, length, status);
   2079         REGEX_CHECK_STATUS;
   2080         REGEX_ASSERT(result == &destText);
   2081         REGEX_ASSERT(utext_getNativeIndex(result) == 0);
   2082         REGEX_ASSERT(length == 10);
   2083         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2084 
   2085         // Capture Group 1 == "234567"
   2086         result = matcher->group(1, NULL, length, status);
   2087         REGEX_CHECK_STATUS;
   2088         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
   2089         REGEX_ASSERT(length == 6);
   2090         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2091         utext_close(result);
   2092 
   2093         result = matcher->group(1, &destText, length, status);
   2094         REGEX_CHECK_STATUS;
   2095         REGEX_ASSERT(result == &destText);
   2096         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
   2097         REGEX_ASSERT(length == 6);
   2098         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2099         utext_close(result);
   2100 
   2101         // Capture Group 2 == "45"
   2102         result = matcher->group(2, NULL, length, status);
   2103         REGEX_CHECK_STATUS;
   2104         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
   2105         REGEX_ASSERT(length == 2);
   2106         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2107         utext_close(result);
   2108 
   2109         result = matcher->group(2, &destText, length, status);
   2110         REGEX_CHECK_STATUS;
   2111         REGEX_ASSERT(result == &destText);
   2112         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
   2113         REGEX_ASSERT(length == 2);
   2114         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2115         utext_close(result);
   2116 
   2117         // Capture Group 3 == "89"
   2118         result = matcher->group(3, NULL, length, status);
   2119         REGEX_CHECK_STATUS;
   2120         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
   2121         REGEX_ASSERT(length == 2);
   2122         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2123         utext_close(result);
   2124 
   2125         result = matcher->group(3, &destText, length, status);
   2126         REGEX_CHECK_STATUS;
   2127         REGEX_ASSERT(result == &destText);
   2128         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
   2129         REGEX_ASSERT(length == 2);
   2130         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2131         utext_close(result);
   2132 
   2133         // Capture Group number out of range.
   2134         status = U_ZERO_ERROR;
   2135         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2136         status = U_ZERO_ERROR;
   2137         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2138         status = U_ZERO_ERROR;
   2139         matcher->reset();
   2140         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
   2141 
   2142         delete matcher;
   2143         delete pat;
   2144 
   2145         utext_close(&destText);
   2146         utext_close(&input);
   2147         utext_close(&re);
   2148     }
   2149 
   2150     //
   2151     //  find
   2152     //
   2153     {
   2154         int32_t             flags=0;
   2155         UParseError         pe;
   2156         UErrorCode          status=U_ZERO_ERROR;
   2157         UText               re=UTEXT_INITIALIZER;
   2158         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2159         utext_openUTF8(&re, str_abc, -1, &status);
   2160 
   2161         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2162         REGEX_CHECK_STATUS;
   2163         UText input = UTEXT_INITIALIZER;
   2164         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
   2165         utext_openUTF8(&input, str_abcabcabc, -1, &status);
   2166         //                      012345678901234567
   2167 
   2168         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   2169         REGEX_CHECK_STATUS;
   2170         REGEX_ASSERT(matcher->find());
   2171         REGEX_ASSERT(matcher->start(status) == 1);
   2172         REGEX_ASSERT(matcher->find());
   2173         REGEX_ASSERT(matcher->start(status) == 6);
   2174         REGEX_ASSERT(matcher->find());
   2175         REGEX_ASSERT(matcher->start(status) == 12);
   2176         REGEX_ASSERT(matcher->find() == FALSE);
   2177         REGEX_ASSERT(matcher->find() == FALSE);
   2178 
   2179         matcher->reset();
   2180         REGEX_ASSERT(matcher->find());
   2181         REGEX_ASSERT(matcher->start(status) == 1);
   2182 
   2183         REGEX_ASSERT(matcher->find(0, status));
   2184         REGEX_ASSERT(matcher->start(status) == 1);
   2185         REGEX_ASSERT(matcher->find(1, status));
   2186         REGEX_ASSERT(matcher->start(status) == 1);
   2187         REGEX_ASSERT(matcher->find(2, status));
   2188         REGEX_ASSERT(matcher->start(status) == 6);
   2189         REGEX_ASSERT(matcher->find(12, status));
   2190         REGEX_ASSERT(matcher->start(status) == 12);
   2191         REGEX_ASSERT(matcher->find(13, status) == FALSE);
   2192         REGEX_ASSERT(matcher->find(16, status) == FALSE);
   2193         REGEX_ASSERT(matcher->find(17, status) == FALSE);
   2194         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
   2195 
   2196         status = U_ZERO_ERROR;
   2197         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2198         status = U_ZERO_ERROR;
   2199         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2200 
   2201         REGEX_ASSERT(matcher->groupCount() == 0);
   2202 
   2203         delete matcher;
   2204         delete pat;
   2205 
   2206         utext_close(&input);
   2207         utext_close(&re);
   2208     }
   2209 
   2210 
   2211     //
   2212     //  find, with \G in pattern (true if at the end of a previous match).
   2213     //
   2214     {
   2215         int32_t             flags=0;
   2216         UParseError         pe;
   2217         UErrorCode          status=U_ZERO_ERROR;
   2218         UText               re=UTEXT_INITIALIZER;
   2219         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
   2220         utext_openUTF8(&re, str_Gabcabc, -1, &status);
   2221 
   2222         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2223 
   2224         REGEX_CHECK_STATUS;
   2225         UText input = UTEXT_INITIALIZER;
   2226         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
   2227         utext_openUTF8(&input, str_abcabcabc, -1, &status);
   2228         //                      012345678901234567
   2229 
   2230         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   2231         REGEX_CHECK_STATUS;
   2232         REGEX_ASSERT(matcher->find());
   2233         REGEX_ASSERT(matcher->start(status) == 0);
   2234         REGEX_ASSERT(matcher->start(1, status) == -1);
   2235         REGEX_ASSERT(matcher->start(2, status) == 1);
   2236 
   2237         REGEX_ASSERT(matcher->find());
   2238         REGEX_ASSERT(matcher->start(status) == 4);
   2239         REGEX_ASSERT(matcher->start(1, status) == 4);
   2240         REGEX_ASSERT(matcher->start(2, status) == -1);
   2241         REGEX_CHECK_STATUS;
   2242 
   2243         delete matcher;
   2244         delete pat;
   2245 
   2246         utext_close(&input);
   2247         utext_close(&re);
   2248     }
   2249 
   2250     //
   2251     //   find with zero length matches, match position should bump ahead
   2252     //     to prevent loops.
   2253     //
   2254     {
   2255         int32_t                 i;
   2256         UErrorCode          status=U_ZERO_ERROR;
   2257         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
   2258                                                       //   using an always-true look-ahead.
   2259         REGEX_CHECK_STATUS;
   2260         UText s = UTEXT_INITIALIZER;
   2261         utext_openUTF8(&s, "    ", -1, &status);
   2262         m.reset(&s);
   2263         for (i=0; ; i++) {
   2264             if (m.find() == FALSE) {
   2265                 break;
   2266             }
   2267             REGEX_ASSERT(m.start(status) == i);
   2268             REGEX_ASSERT(m.end(status) == i);
   2269         }
   2270         REGEX_ASSERT(i==5);
   2271 
   2272         // Check that the bump goes over characters outside the BMP OK
   2273         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
   2274         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
   2275         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
   2276         m.reset(&s);
   2277         for (i=0; ; i+=4) {
   2278             if (m.find() == FALSE) {
   2279                 break;
   2280             }
   2281             REGEX_ASSERT(m.start(status) == i);
   2282             REGEX_ASSERT(m.end(status) == i);
   2283         }
   2284         REGEX_ASSERT(i==20);
   2285 
   2286         utext_close(&s);
   2287     }
   2288     {
   2289         // find() loop breaking test.
   2290         //        with pattern of /.?/, should see a series of one char matches, then a single
   2291         //        match of zero length at the end of the input string.
   2292         int32_t                 i;
   2293         UErrorCode          status=U_ZERO_ERROR;
   2294         RegexMatcher        m(".?", 0, status);
   2295         REGEX_CHECK_STATUS;
   2296         UText s = UTEXT_INITIALIZER;
   2297         utext_openUTF8(&s, "    ", -1, &status);
   2298         m.reset(&s);
   2299         for (i=0; ; i++) {
   2300             if (m.find() == FALSE) {
   2301                 break;
   2302             }
   2303             REGEX_ASSERT(m.start(status) == i);
   2304             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
   2305         }
   2306         REGEX_ASSERT(i==5);
   2307 
   2308         utext_close(&s);
   2309     }
   2310 
   2311 
   2312     //
   2313     // Matchers with no input string behave as if they had an empty input string.
   2314     //
   2315 
   2316     {
   2317         UErrorCode status = U_ZERO_ERROR;
   2318         RegexMatcher  m(".?", 0, status);
   2319         REGEX_CHECK_STATUS;
   2320         REGEX_ASSERT(m.find());
   2321         REGEX_ASSERT(m.start(status) == 0);
   2322         REGEX_ASSERT(m.input() == "");
   2323     }
   2324     {
   2325         UErrorCode status = U_ZERO_ERROR;
   2326         RegexPattern  *p = RegexPattern::compile(".", 0, status);
   2327         RegexMatcher  *m = p->matcher(status);
   2328         REGEX_CHECK_STATUS;
   2329 
   2330         REGEX_ASSERT(m->find() == FALSE);
   2331         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
   2332         delete m;
   2333         delete p;
   2334     }
   2335 
   2336     //
   2337     // Regions
   2338     //
   2339     {
   2340         UErrorCode status = U_ZERO_ERROR;
   2341         UText testPattern = UTEXT_INITIALIZER;
   2342         UText testText    = UTEXT_INITIALIZER;
   2343         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
   2344         REGEX_VERBOSE_TEXT(&testPattern);
   2345         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
   2346         REGEX_VERBOSE_TEXT(&testText);
   2347 
   2348         RegexMatcher m(&testPattern, &testText, 0, status);
   2349         REGEX_CHECK_STATUS;
   2350         REGEX_ASSERT(m.regionStart() == 0);
   2351         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
   2352         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2353         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2354 
   2355         m.region(2,4, status);
   2356         REGEX_CHECK_STATUS;
   2357         REGEX_ASSERT(m.matches(status));
   2358         REGEX_ASSERT(m.start(status)==2);
   2359         REGEX_ASSERT(m.end(status)==4);
   2360         REGEX_CHECK_STATUS;
   2361 
   2362         m.reset();
   2363         REGEX_ASSERT(m.regionStart() == 0);
   2364         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
   2365 
   2366         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
   2367         REGEX_VERBOSE_TEXT(&testText);
   2368         m.reset(&testText);
   2369         REGEX_ASSERT(m.regionStart() == 0);
   2370         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
   2371 
   2372         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2373         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
   2374         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   2375         REGEX_ASSERT(&m == &m.reset());
   2376         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   2377 
   2378         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
   2379         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2380         REGEX_ASSERT(&m == &m.reset());
   2381         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2382 
   2383         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2384         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
   2385         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   2386         REGEX_ASSERT(&m == &m.reset());
   2387         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   2388 
   2389         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
   2390         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2391         REGEX_ASSERT(&m == &m.reset());
   2392         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2393 
   2394         utext_close(&testText);
   2395         utext_close(&testPattern);
   2396     }
   2397 
   2398     //
   2399     // hitEnd() and requireEnd()
   2400     //
   2401     {
   2402         UErrorCode status = U_ZERO_ERROR;
   2403         UText testPattern = UTEXT_INITIALIZER;
   2404         UText testText    = UTEXT_INITIALIZER;
   2405         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
   2406         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
   2407         utext_openUTF8(&testPattern, str_, -1, &status);
   2408         utext_openUTF8(&testText, str_aabb, -1, &status);
   2409 
   2410         RegexMatcher m1(&testPattern, &testText,  0, status);
   2411         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
   2412         REGEX_ASSERT(m1.hitEnd() == TRUE);
   2413         REGEX_ASSERT(m1.requireEnd() == FALSE);
   2414         REGEX_CHECK_STATUS;
   2415 
   2416         status = U_ZERO_ERROR;
   2417         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
   2418         utext_openUTF8(&testPattern, str_a, -1, &status);
   2419         RegexMatcher m2(&testPattern, &testText, 0, status);
   2420         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
   2421         REGEX_ASSERT(m2.hitEnd() == FALSE);
   2422         REGEX_ASSERT(m2.requireEnd() == FALSE);
   2423         REGEX_CHECK_STATUS;
   2424 
   2425         status = U_ZERO_ERROR;
   2426         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
   2427         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
   2428         RegexMatcher m3(&testPattern, &testText, 0, status);
   2429         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
   2430         REGEX_ASSERT(m3.hitEnd() == TRUE);
   2431         REGEX_ASSERT(m3.requireEnd() == TRUE);
   2432         REGEX_CHECK_STATUS;
   2433 
   2434         utext_close(&testText);
   2435         utext_close(&testPattern);
   2436     }
   2437 }
   2438 
   2439 
   2440 //---------------------------------------------------------------------------
   2441 //
   2442 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
   2443 //                         Replace family of functions.
   2444 //
        //      Exercises replaceFirst() and replaceAll() on UText input, each in two
        //      forms: with a NULL destination (the returned UText is checked and then
        //      closed by the test) and with a caller-supplied destination (the call
        //      must return that same UText).  The final section ("Bug 4057") covers
        //      appendReplacement() and appendTail().
        //
        //      Most test strings are written as explicit byte values, with the
        //      intended text in a trailing comment, and opened with utext_openUTF8().
        //      NOTE(review): presumably this keeps the bytes independent of the
        //      compiler's execution character set, per the ASCII/EBCDIC warning at
        //      the top of this file -- confirm.
        //
        //      The UText objects dataText, replText, destText and re are reused for
        //      the whole function by re-opening them on new content, so the order of
        //      the statements below matters.
        //
   2445 //---------------------------------------------------------------------------
   2446 void RegexTest::API_Replace_UTF8() {
   2447     //
   2448     //  Replace
   2449     //
   2450     int32_t             flags=0;
   2451     UParseError         pe;
   2452     UErrorCode          status=U_ZERO_ERROR;
   2453 
            // Pattern "abc", compiled from a UText.
   2454     UText               re=UTEXT_INITIALIZER;
   2455     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
   2456     REGEX_VERBOSE_TEXT(&re);
   2457     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2458     REGEX_CHECK_STATUS;
   2459 
            // Input text containing three occurrences of "abc".  The array is not
            // const; it is reopened later for the "Empty substitution string" case.
   2460     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
   2461     //             012345678901234567
   2462     UText dataText = UTEXT_INITIALIZER;
   2463     utext_openUTF8(&dataText, data, -1, &status);
   2464     REGEX_CHECK_STATUS;
   2465     REGEX_VERBOSE_TEXT(&dataText);
            // Create a matcher for the pattern and point it at dataText.
   2466     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
   2467 
   2468     //
   2469     //  Plain vanilla matches.
   2470     //
            // destText is a UText over the UnicodeString dest; it serves as the
            // caller-supplied destination throughout.  result receives the return
            // value of each replace call.
   2471     UnicodeString  dest;
   2472     UText destText = UTEXT_INITIALIZER;
   2473     utext_openUnicodeString(&destText, &dest, &status);
   2474     UText *result;
   2475 
   2476     UText replText = UTEXT_INITIALIZER;
   2477 
   2478     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
   2479     utext_openUTF8(&replText, str_yz, -1, &status);
   2480     REGEX_VERBOSE_TEXT(&replText);
            // NULL destination: check the returned UText, then close it.
   2481     result = matcher->replaceFirst(&replText, NULL, status);
   2482     REGEX_CHECK_STATUS;
   2483     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
   2484     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
   2485     utext_close(result);
            // Caller-supplied destination: the same UText must come back.
   2486     result = matcher->replaceFirst(&replText, &destText, status);
   2487     REGEX_CHECK_STATUS;
   2488     REGEX_ASSERT(result == &destText);
   2489     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
   2490 
   2491     result = matcher->replaceAll(&replText, NULL, status);
   2492     REGEX_CHECK_STATUS;
   2493     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
   2494     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
   2495     utext_close(result);
   2496 
            // Empty destText (replace its whole range with nothing) before reusing it.
            // NOTE(review): this is done before most replaceAll() calls into destText
            // but not before every replaceFirst(); presumably replaceAll() appends to
            // the existing destination content -- confirm against the RegexMatcher docs.
   2497     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2498     result = matcher->replaceAll(&replText, &destText, status);
   2499     REGEX_CHECK_STATUS;
   2500     REGEX_ASSERT(result == &destText);
   2501     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
   2502 
   2503     //
   2504     //  Plain vanilla non-matches.
   2505     //
            // The input contains no "abc", so every call is expected to produce
            // output equal to the input.
   2506     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
   2507     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
   2508     matcher->reset(&dataText);
   2509 
   2510     result = matcher->replaceFirst(&replText, NULL, status);
   2511     REGEX_CHECK_STATUS;
   2512     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2513     utext_close(result);
   2514     result = matcher->replaceFirst(&replText, &destText, status);
   2515     REGEX_CHECK_STATUS;
   2516     REGEX_ASSERT(result == &destText);
   2517     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2518 
   2519     result = matcher->replaceAll(&replText, NULL, status);
   2520     REGEX_CHECK_STATUS;
   2521     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2522     utext_close(result);
   2523     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2524     result = matcher->replaceAll(&replText, &destText, status);
   2525     REGEX_CHECK_STATUS;
   2526     REGEX_ASSERT(result == &destText);
   2527     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2528 
   2529     //
   2530     // Empty source string
   2531     //
            // dataText is reopened on a NULL pointer with length 0; every call is
            // expected to produce an empty result.
   2532     utext_openUTF8(&dataText, NULL, 0, &status);
   2533     matcher->reset(&dataText);
   2534 
   2535     result = matcher->replaceFirst(&replText, NULL, status);
   2536     REGEX_CHECK_STATUS;
   2537     REGEX_ASSERT_UTEXT_UTF8("", result);
   2538     utext_close(result);
   2539     result = matcher->replaceFirst(&replText, &destText, status);
   2540     REGEX_CHECK_STATUS;
   2541     REGEX_ASSERT(result == &destText);
   2542     REGEX_ASSERT_UTEXT_UTF8("", result);
   2543 
   2544     result = matcher->replaceAll(&replText, NULL, status);
   2545     REGEX_CHECK_STATUS;
   2546     REGEX_ASSERT_UTEXT_UTF8("", result);
   2547     utext_close(result);
   2548     result = matcher->replaceAll(&replText, &destText, status);
   2549     REGEX_CHECK_STATUS;
   2550     REGEX_ASSERT(result == &destText);
   2551     REGEX_ASSERT_UTEXT_UTF8("", result);
   2552 
   2553     //
   2554     // Empty substitution string
   2555     //
            // With a zero-length replacement, each replaced "abc" simply disappears
            // from the output.
   2556     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
   2557     matcher->reset(&dataText);
   2558 
   2559     utext_openUTF8(&replText, NULL, 0, &status);
   2560     result = matcher->replaceFirst(&replText, NULL, status);
   2561     REGEX_CHECK_STATUS;
   2562     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
   2563     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
   2564     utext_close(result);
   2565     result = matcher->replaceFirst(&replText, &destText, status);
   2566     REGEX_CHECK_STATUS;
   2567     REGEX_ASSERT(result == &destText);
   2568     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
   2569 
   2570     result = matcher->replaceAll(&replText, NULL, status);
   2571     REGEX_CHECK_STATUS;
   2572     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
   2573     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
   2574     utext_close(result);
   2575     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2576     result = matcher->replaceAll(&replText, &destText, status);
   2577     REGEX_CHECK_STATUS;
   2578     REGEX_ASSERT(result == &destText);
   2579     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
   2580 
   2581     //
   2582     // match whole string
   2583     //
            // The input is exactly "abc", so the output is expected to be exactly
            // the replacement text "xyz".
   2584     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2585     utext_openUTF8(&dataText, str_abc, -1, &status);
   2586     matcher->reset(&dataText);
   2587 
   2588     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
   2589     utext_openUTF8(&replText, str_xyz, -1, &status);
   2590     result = matcher->replaceFirst(&replText, NULL, status);
   2591     REGEX_CHECK_STATUS;
   2592     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2593     utext_close(result);
   2594     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2595     result = matcher->replaceFirst(&replText, &destText, status);
   2596     REGEX_CHECK_STATUS;
   2597     REGEX_ASSERT(result == &destText);
   2598     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2599 
   2600     result = matcher->replaceAll(&replText, NULL, status);
   2601     REGEX_CHECK_STATUS;
   2602     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2603     utext_close(result);
   2604     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2605     result = matcher->replaceAll(&replText, &destText, status);
   2606     REGEX_CHECK_STATUS;
   2607     REGEX_ASSERT(result == &destText);
   2608     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2609 
   2610     //
   2611     // Capture Group, simple case
   2612     //
            // A second pattern, a(..), with one capture group, gets its own matcher
            // (matcher2) over "abcdefg"; group 1 is "bc" for that input.
   2613     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
   2614     utext_openUTF8(&re, str_add, -1, &status);
   2615     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
   2616     REGEX_CHECK_STATUS;
   2617 
   2618     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
   2619     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
   2620     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
   2621     REGEX_CHECK_STATUS;
   2622 
            // "$1$1" substitutes group 1 twice.
   2623     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
   2624     utext_openUTF8(&replText, str_11, -1, &status);
   2625     result = matcher2->replaceFirst(&replText, NULL, status);
   2626     REGEX_CHECK_STATUS;
   2627     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
   2628     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
   2629     utext_close(result);
   2630     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2631     result = matcher2->replaceFirst(&replText, &destText, status);
   2632     REGEX_CHECK_STATUS;
   2633     REGEX_ASSERT(result == &destText);
   2634     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
   2635 
            // A backslash-escaped dollar sign is expected to come out as a literal
            // "$1", while the unescaped $1 is replaced by group 1.
   2636     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
   2637     utext_openUTF8(&replText, str_v, -1, &status);
   2638     REGEX_VERBOSE_TEXT(&replText);
   2639     result = matcher2->replaceFirst(&replText, NULL, status);
   2640     REGEX_CHECK_STATUS;
   2641     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
   2642     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
   2643     utext_close(result);
   2644     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2645     result = matcher2->replaceFirst(&replText, &destText, status);
   2646     REGEX_CHECK_STATUS;
   2647     REGEX_ASSERT(result == &destText);
   2648     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
   2649 
            // Escaped dollar signs with no group number after them are expected to
            // come out as plain "$" characters.
   2650     const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
   2651                0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
   2652                0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
   2653     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
   2654     result = matcher2->replaceFirst(&replText, NULL, status);
   2655     REGEX_CHECK_STATUS;
   2656     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
   2657     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
   2658     utext_close(result);
   2659     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2660     result = matcher2->replaceFirst(&replText, &destText, status);
   2661     REGEX_CHECK_STATUS;
   2662     REGEX_ASSERT(result == &destText);
   2663     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
   2664 
            // Group number written with a supplementary-plane digit: the four 'x'
            // placeholder bytes (indexes 22-25) are overwritten below with
            // F0 9D 9F 8F, the UTF-8 encoding of U+1D7CF.  The expected output shows
            // that "$" followed by that digit is substituted with group 1 ("bc").
   2665     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
   2666     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
   2667     //                                 012345678901234567890123456
   2668     supplDigitChars[22] = 0xF0;
   2669     supplDigitChars[23] = 0x9D;
   2670     supplDigitChars[24] = 0x9F;
   2671     supplDigitChars[25] = 0x8F;
   2672     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
   2673 
   2674     result = matcher2->replaceFirst(&replText, NULL, status);
   2675     REGEX_CHECK_STATUS;
   2676     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
   2677     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
   2678     utext_close(result);
   2679     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2680     result = matcher2->replaceFirst(&replText, &destText, status);
   2681     REGEX_CHECK_STATUS;
   2682     REGEX_ASSERT(result == &destText);
   2683     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
            // The pattern a(..) has only one capture group, so a reference to $5 is
            // expected to fail with U_INDEX_OUTOFBOUNDS_ERROR in both call forms.
   2684     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5... */
   2685     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
   2686     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
   2687 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
   2688     utext_close(result);
   2689     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2690     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
   2691     REGEX_ASSERT(result == &destText);
   2692 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
   2693 
   2694     //
   2695     // Replacement String with \u hex escapes
   2696     //
            // These two sub-blocks go back to the first matcher (pattern "abc").
   2697     {
              // A four-hex-digit escape for U+0043 in the replacement text is
              // expected to appear as the letter C in the output.
   2698       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
   2699       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
   2700         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
   2701         utext_openUTF8(&replText, str_u0043, -1, &status);
   2702         matcher->reset(&dataText);
   2703 
   2704         result = matcher->replaceAll(&replText, NULL, status);
   2705         REGEX_CHECK_STATUS;
   2706         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
   2707         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
   2708         utext_close(result);
   2709         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2710         result = matcher->replaceAll(&replText, &destText, status);
   2711         REGEX_CHECK_STATUS;
   2712         REGEX_ASSERT(result == &destText);
   2713         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
   2714     }
   2715     {
              // An eight-hex-digit escape for U+10000.  This str_abc shadows the
              // outer str_abc and holds different text ("abc !").
   2716       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
   2717         utext_openUTF8(&dataText, str_abc, -1, &status);
   2718         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
   2719         utext_openUTF8(&replText, str_U00010000, -1, &status);
   2720         matcher->reset(&dataText);
   2721 
                // Expected output: the 'x' placeholders at indexes 2-5 are overwritten
                // with F0 90 80 80, the UTF-8 encoding of U+10000.
   2722         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
   2723         //                          0123456789
   2724         expected[2] = 0xF0;
   2725         expected[3] = 0x90;
   2726         expected[4] = 0x80;
   2727         expected[5] = 0x80;
   2728 
   2729         result = matcher->replaceAll(&replText, NULL, status);
   2730         REGEX_CHECK_STATUS;
   2731         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
   2732         utext_close(result);
   2733         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2734         result = matcher->replaceAll(&replText, &destText, status);
   2735         REGEX_CHECK_STATUS;
   2736         REGEX_ASSERT(result == &destText);
   2737         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
   2738     }
   2739     // TODO:  need more thorough testing of capture substitutions.
   2740 
   2741     // Bug 4057
   2742     //
            // appendReplacement() / appendTail() with a separate, locally scoped
            // matcher m.  The input contains two matches of ss(.*?)ee.  In each of the
            // three scenarios below, find() is called twice before appendReplacement(),
            // and the expected output is the input from its very beginning up to the
            // second match, with only that second match replaced by "ooh".
   2743     {
   2744         status = U_ZERO_ERROR;
   2745 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
   2746 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
   2747 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
   2748         utext_openUTF8(&re, str_ssee, -1, &status);
   2749         utext_openUTF8(&dataText, str_blah, -1, &status);
   2750         utext_openUTF8(&replText, str_ooh, -1, &status);
   2751 
   2752         RegexMatcher m(&re, 0, status);
   2753         REGEX_CHECK_STATUS;
   2754 
                // This UnicodeString named result shadows the outer UText *result for
                // the rest of this scope; resultText is the UText view onto it.
   2755         UnicodeString result;
   2756         UText resultText = UTEXT_INITIALIZER;
   2757         utext_openUnicodeString(&resultText, &result, &status);
   2758 
   2759         // Multiple finds do NOT bump up the previous appendReplacement position.
   2760         m.reset(&dataText);
   2761         m.find();
   2762         m.find();
   2763         m.appendReplacement(&resultText, &replText, status);
   2764         REGEX_CHECK_STATUS;
   2765         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2766         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
   2767 
   2768         // After a reset into the interior of a string, appendReplacement still starts at beginning.
   2769         status = U_ZERO_ERROR;
                // Clear the accumulated output and reopen the UText over the emptied string.
   2770         result.truncate(0);
   2771         utext_openUnicodeString(&resultText, &result, &status);
   2772         m.reset(10, status);
   2773         m.find();
   2774         m.find();
   2775         m.appendReplacement(&resultText, &replText, status);
   2776         REGEX_CHECK_STATUS;
   2777         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2778         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
   2779 
   2780         // find() at interior of string, appendReplacement still starts at beginning.
   2781         status = U_ZERO_ERROR;
   2782         result.truncate(0);
   2783         utext_openUnicodeString(&resultText, &result, &status);
   2784         m.reset();
   2785         m.find(10, status);
   2786         m.find();
   2787         m.appendReplacement(&resultText, &replText, status);
   2788         REGEX_CHECK_STATUS;
   2789         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2790         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
   2791 
                // appendTail() is expected to add the input remaining after the last
                // match (" fin").  Status is not checked after this call.
   2792         m.appendTail(&resultText, status);
   2793         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
   2794         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
   2795 
   2796         utext_close(&resultText);
   2797     }
   2798 
            // Release the heap-allocated matchers and patterns, then close the
            // UText objects that were reused throughout the function.
   2799     delete matcher2;
   2800     delete pat2;
   2801     delete matcher;
   2802     delete pat;
   2803 
   2804     utext_close(&dataText);
   2805     utext_close(&replText);
   2806     utext_close(&destText);
   2807     utext_close(&re);
   2808 }
   2809 
   2810 
   2811 //---------------------------------------------------------------------------
   2812 //
   2813 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
   2814 //                        present and nominally working.
   2815 //
   2816 //---------------------------------------------------------------------------
   2817 void RegexTest::API_Pattern_UTF8() {
   2818     RegexPattern        pata;    // Test default constructor to not crash.
   2819     RegexPattern        patb;
   2820 
   2821     REGEX_ASSERT(pata == patb);
   2822     REGEX_ASSERT(pata == pata);
   2823 
   2824     UText         re1 = UTEXT_INITIALIZER;
   2825     UText         re2 = UTEXT_INITIALIZER;
   2826     UErrorCode    status = U_ZERO_ERROR;
   2827     UParseError   pe;
   2828 
   2829     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
   2830     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
   2831     utext_openUTF8(&re1, str_abcalmz, -1, &status);
   2832     utext_openUTF8(&re2, str_def, -1, &status);
   2833 
   2834     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
   2835     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
   2836     REGEX_CHECK_STATUS;
   2837     REGEX_ASSERT(*pat1 == *pat1);
   2838     REGEX_ASSERT(*pat1 != pata);
   2839 
   2840     // Assign
   2841     patb = *pat1;
   2842     REGEX_ASSERT(patb == *pat1);
   2843 
   2844     // Copy Construct
   2845     RegexPattern patc(*pat1);
   2846     REGEX_ASSERT(patc == *pat1);
   2847     REGEX_ASSERT(patb == patc);
   2848     REGEX_ASSERT(pat1 != pat2);
   2849     patb = *pat2;
   2850     REGEX_ASSERT(patb != patc);
   2851     REGEX_ASSERT(patb == *pat2);
   2852 
   2853     // Compile with no flags.
   2854     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
   2855     REGEX_ASSERT(*pat1a == *pat1);
   2856 
   2857     REGEX_ASSERT(pat1a->flags() == 0);
   2858 
   2859     // Compile with different flags should be not equal
   2860     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
   2861     REGEX_CHECK_STATUS;
   2862 
   2863     REGEX_ASSERT(*pat1b != *pat1a);
   2864     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
   2865     REGEX_ASSERT(pat1a->flags() == 0);
   2866     delete pat1b;
   2867 
   2868     // clone
   2869     RegexPattern *pat1c = pat1->clone();
   2870     REGEX_ASSERT(*pat1c == *pat1);
   2871     REGEX_ASSERT(*pat1c != *pat2);
   2872 
   2873     delete pat1c;
   2874     delete pat1a;
   2875     delete pat1;
   2876     delete pat2;
   2877 
   2878     utext_close(&re1);
   2879     utext_close(&re2);
   2880 
   2881 
   2882     //
   2883     //   Verify that a matcher created from a cloned pattern works.
   2884     //     (Jitterbug 3423)
   2885     //
   2886     {
   2887         UErrorCode     status     = U_ZERO_ERROR;
   2888         UText          pattern    = UTEXT_INITIALIZER;
   2889         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
   2890         utext_openUTF8(&pattern, str_pL, -1, &status);
   2891 
   2892         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
   2893         RegexPattern  *pClone     = pSource->clone();
   2894         delete         pSource;
   2895         RegexMatcher  *mFromClone = pClone->matcher(status);
   2896         REGEX_CHECK_STATUS;
   2897 
   2898         UText          input      = UTEXT_INITIALIZER;
   2899         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
   2900         utext_openUTF8(&input, str_HelloWorld, -1, &status);
   2901         mFromClone->reset(&input);
   2902         REGEX_ASSERT(mFromClone->find() == TRUE);
   2903         REGEX_ASSERT(mFromClone->group(status) == "Hello");
   2904         REGEX_ASSERT(mFromClone->find() == TRUE);
   2905         REGEX_ASSERT(mFromClone->group(status) == "World");
   2906         REGEX_ASSERT(mFromClone->find() == FALSE);
   2907         delete mFromClone;
   2908         delete pClone;
   2909 
   2910         utext_close(&input);
   2911         utext_close(&pattern);
   2912     }
   2913 
   2914     //
   2915     //   matches convenience API
   2916     //
   2917     {
   2918         UErrorCode status  = U_ZERO_ERROR;
   2919         UText      pattern = UTEXT_INITIALIZER;
   2920         UText      input   = UTEXT_INITIALIZER;
   2921 
   2922         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
   2923         utext_openUTF8(&input, str_randominput, -1, &status);
   2924 
   2925         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
   2926         utext_openUTF8(&pattern, str_dotstar, -1, &status);
   2927         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
   2928         REGEX_CHECK_STATUS;
   2929 
   2930         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2931         utext_openUTF8(&pattern, str_abc, -1, &status);
   2932         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
   2933         REGEX_CHECK_STATUS;
   2934 
   2935         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
   2936         utext_openUTF8(&pattern, str_nput, -1, &status);
   2937         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
   2938         REGEX_CHECK_STATUS;
   2939 
   2940         utext_openUTF8(&pattern, str_randominput, -1, &status);
   2941         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
   2942         REGEX_CHECK_STATUS;
   2943 
   2944         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
   2945         utext_openUTF8(&pattern, str_u, -1, &status);
   2946         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
   2947         REGEX_CHECK_STATUS;
   2948 
   2949         utext_openUTF8(&input, str_abc, -1, &status);
   2950         utext_openUTF8(&pattern, str_abc, -1, &status);
   2951         status = U_INDEX_OUTOFBOUNDS_ERROR;
   2952         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
   2953         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   2954 
   2955         utext_close(&input);
   2956         utext_close(&pattern);
   2957     }
   2958 
   2959 
   2960     //
   2961     // Split()
   2962     //
   2963     status = U_ZERO_ERROR;
   2964     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
   2965     utext_openUTF8(&re1, str_spaceplus, -1, &status);
   2966     pat1 = RegexPattern::compile(&re1, pe, status);
   2967     REGEX_CHECK_STATUS;
   2968     UnicodeString  fields[10];
   2969 
   2970     int32_t n;
   2971     n = pat1->split("Now is the time", fields, 10, status);
   2972     REGEX_CHECK_STATUS;
   2973     REGEX_ASSERT(n==4);
   2974     REGEX_ASSERT(fields[0]=="Now");
   2975     REGEX_ASSERT(fields[1]=="is");
   2976     REGEX_ASSERT(fields[2]=="the");
   2977     REGEX_ASSERT(fields[3]=="time");
   2978     REGEX_ASSERT(fields[4]=="");
   2979 
   2980     n = pat1->split("Now is the time", fields, 2, status);
   2981     REGEX_CHECK_STATUS;
   2982     REGEX_ASSERT(n==2);
   2983     REGEX_ASSERT(fields[0]=="Now");
   2984     REGEX_ASSERT(fields[1]=="is the time");
   2985     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
   2986 
   2987     fields[1] = "*";
   2988     status = U_ZERO_ERROR;
   2989     n = pat1->split("Now is the time", fields, 1, status);
   2990     REGEX_CHECK_STATUS;
   2991     REGEX_ASSERT(n==1);
   2992     REGEX_ASSERT(fields[0]=="Now is the time");
   2993     REGEX_ASSERT(fields[1]=="*");
   2994     status = U_ZERO_ERROR;
   2995 
   2996     n = pat1->split("    Now       is the time   ", fields, 10, status);
   2997     REGEX_CHECK_STATUS;
   2998     REGEX_ASSERT(n==6);
   2999     REGEX_ASSERT(fields[0]=="");
   3000     REGEX_ASSERT(fields[1]=="Now");
   3001     REGEX_ASSERT(fields[2]=="is");
   3002     REGEX_ASSERT(fields[3]=="the");
   3003     REGEX_ASSERT(fields[4]=="time");
   3004     REGEX_ASSERT(fields[5]=="");
   3005     REGEX_ASSERT(fields[6]=="");
   3006 
   3007     fields[2] = "*";
   3008     n = pat1->split("     ", fields, 10, status);
   3009     REGEX_CHECK_STATUS;
   3010     REGEX_ASSERT(n==2);
   3011     REGEX_ASSERT(fields[0]=="");
   3012     REGEX_ASSERT(fields[1]=="");
   3013     REGEX_ASSERT(fields[2]=="*");
   3014 
   3015     fields[0] = "foo";
   3016     n = pat1->split("", fields, 10, status);
   3017     REGEX_CHECK_STATUS;
   3018     REGEX_ASSERT(n==0);
   3019     REGEX_ASSERT(fields[0]=="foo");
   3020 
   3021     delete pat1;
   3022 
   3023     //  split, with a pattern with (capture)
   3024     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
   3025     pat1 = RegexPattern::compile(&re1,  pe, status);
   3026     REGEX_CHECK_STATUS;
   3027 
   3028     status = U_ZERO_ERROR;
   3029     fields[6] = fields[7] = "*";
   3030     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
   3031     REGEX_CHECK_STATUS;
   3032     REGEX_ASSERT(n==7);
   3033     REGEX_ASSERT(fields[0]=="");
   3034     REGEX_ASSERT(fields[1]=="a");
   3035     REGEX_ASSERT(fields[2]=="Now is ");
   3036     REGEX_ASSERT(fields[3]=="b");
   3037     REGEX_ASSERT(fields[4]=="the time");
   3038     REGEX_ASSERT(fields[5]=="c");
   3039     REGEX_ASSERT(fields[6]=="");
   3040     REGEX_ASSERT(fields[7]=="*");
   3041     REGEX_ASSERT(status==U_ZERO_ERROR);
   3042 
   3043     fields[6] = fields[7] = "*";
   3044     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
   3045     REGEX_CHECK_STATUS;
   3046     REGEX_ASSERT(n==7);
   3047     REGEX_ASSERT(fields[0]=="  ");
   3048     REGEX_ASSERT(fields[1]=="a");
   3049     REGEX_ASSERT(fields[2]=="Now is ");
   3050     REGEX_ASSERT(fields[3]=="b");
   3051     REGEX_ASSERT(fields[4]=="the time");
   3052     REGEX_ASSERT(fields[5]=="c");
   3053     REGEX_ASSERT(fields[6]=="");
   3054     REGEX_ASSERT(fields[7]=="*");
   3055 
   3056     status = U_ZERO_ERROR;
   3057     fields[6] = "foo";
   3058     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
   3059     REGEX_CHECK_STATUS;
   3060     REGEX_ASSERT(n==6);
   3061     REGEX_ASSERT(fields[0]=="  ");
   3062     REGEX_ASSERT(fields[1]=="a");
   3063     REGEX_ASSERT(fields[2]=="Now is ");
   3064     REGEX_ASSERT(fields[3]=="b");
   3065     REGEX_ASSERT(fields[4]=="the time");
   3066     REGEX_ASSERT(fields[5]==" ");
   3067     REGEX_ASSERT(fields[6]=="foo");
   3068 
   3069     status = U_ZERO_ERROR;
   3070     fields[5] = "foo";
   3071     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
   3072     REGEX_CHECK_STATUS;
   3073     REGEX_ASSERT(n==5);
   3074     REGEX_ASSERT(fields[0]=="  ");
   3075     REGEX_ASSERT(fields[1]=="a");
   3076     REGEX_ASSERT(fields[2]=="Now is ");
   3077     REGEX_ASSERT(fields[3]=="b");
   3078     REGEX_ASSERT(fields[4]=="the time<c>");
   3079     REGEX_ASSERT(fields[5]=="foo");
   3080 
   3081     status = U_ZERO_ERROR;
   3082     fields[5] = "foo";
   3083     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
   3084     REGEX_CHECK_STATUS;
   3085     REGEX_ASSERT(n==5);
   3086     REGEX_ASSERT(fields[0]=="  ");
   3087     REGEX_ASSERT(fields[1]=="a");
   3088     REGEX_ASSERT(fields[2]=="Now is ");
   3089     REGEX_ASSERT(fields[3]=="b");
   3090     REGEX_ASSERT(fields[4]=="the time");
   3091     REGEX_ASSERT(fields[5]=="foo");
   3092 
   3093     status = U_ZERO_ERROR;
   3094     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
   3095     REGEX_CHECK_STATUS;
   3096     REGEX_ASSERT(n==4);
   3097     REGEX_ASSERT(fields[0]=="  ");
   3098     REGEX_ASSERT(fields[1]=="a");
   3099     REGEX_ASSERT(fields[2]=="Now is ");
   3100     REGEX_ASSERT(fields[3]=="the time<c>");
   3101     status = U_ZERO_ERROR;
   3102     delete pat1;
   3103 
   3104     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
   3105     pat1 = RegexPattern::compile(&re1, pe, status);
   3106     REGEX_CHECK_STATUS;
   3107     n = pat1->split("1-10,20", fields, 10, status);
   3108     REGEX_CHECK_STATUS;
   3109     REGEX_ASSERT(n==5);
   3110     REGEX_ASSERT(fields[0]=="1");
   3111     REGEX_ASSERT(fields[1]=="-");
   3112     REGEX_ASSERT(fields[2]=="10");
   3113     REGEX_ASSERT(fields[3]==",");
   3114     REGEX_ASSERT(fields[4]=="20");
   3115     delete pat1;
   3116 
   3117 
   3118     //
   3119     // split of a UText based string, with library allocating output UTexts.
   3120     //
   3121     {
   3122         status = U_ZERO_ERROR;
   3123         RegexMatcher matcher(UnicodeString("(:)"), 0, status);
   3124         UnicodeString stringToSplit("first:second:third");
   3125         UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
   3126         REGEX_CHECK_STATUS;
   3127 
   3128         UText *splits[10] = {NULL};
   3129         int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
   3130         REGEX_CHECK_STATUS;
   3131         REGEX_ASSERT(numFields == 5);
   3132         REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
   3133         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
   3134         REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
   3135         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
   3136         REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
   3137         REGEX_ASSERT(splits[5] == NULL);
   3138 
   3139         for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
   3140             if (splits[i]) {
   3141                 utext_close(splits[i]);
   3142                 splits[i] = NULL;
   3143             }
   3144         }
   3145         utext_close(textToSplit);
   3146     }
   3147 
   3148 
   3149     //
   3150     // RegexPattern::pattern() and patternText()
   3151     //
   3152     pat1 = new RegexPattern();
   3153     REGEX_ASSERT(pat1->pattern() == "");
   3154     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
   3155     delete pat1;
   3156     const char *helloWorldInvariant = "(Hello, world)*";
   3157     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
   3158     pat1 = RegexPattern::compile(&re1, pe, status);
   3159     REGEX_CHECK_STATUS;
   3160     REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
   3161     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
   3162     delete pat1;
   3163 
   3164     utext_close(&re1);
   3165 }
   3166 
   3167 
   3168 //---------------------------------------------------------------------------
   3169 //
   3170 //      Extended       A more thorough check for features of regex patterns
   3171 //                     The test cases are in a separate data file,
   3172 //                       source/test/testdata/regextst.txt
   3173 //                     A description of the test data format is included in that file.
   3174 //
   3175 //---------------------------------------------------------------------------
   3176 
   3177 const char *
   3178 RegexTest::getPath(char buffer[2048], const char *filename) {
   3179     UErrorCode status=U_ZERO_ERROR;
   3180     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   3181     if (U_FAILURE(status)) {
   3182         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
   3183         return NULL;
   3184     }
   3185 
   3186     strcpy(buffer, testDataDirectory);
   3187     strcat(buffer, filename);
   3188     return buffer;
   3189 }
   3190 
   3191 void RegexTest::Extended() {
   3192     char tdd[2048];
   3193     const char *srcPath;
   3194     UErrorCode  status  = U_ZERO_ERROR;
   3195     int32_t     lineNum = 0;
   3196 
   3197     //
   3198     //  Open and read the test data file.
   3199     //
   3200     srcPath=getPath(tdd, "regextst.txt");
   3201     if(srcPath==NULL) {
   3202         return; /* something went wrong, error already output */
   3203     }
   3204 
   3205     int32_t    len;
   3206     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
   3207     if (U_FAILURE(status)) {
   3208         return; /* something went wrong, error already output */
   3209     }
   3210 
   3211     //
   3212     //  Put the test data into a UnicodeString
   3213     //
   3214     UnicodeString testString(FALSE, testData, len);
   3215 
   3216     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
   3217     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
   3218     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
   3219 
   3220     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
   3221     UnicodeString   testPattern;   // The pattern for test from the test file.
   3222     UnicodeString   testFlags;     // the flags   for a test.
   3223     UnicodeString   matchString;   // The marked up string to be used as input
   3224 
   3225     if (U_FAILURE(status)){
   3226         dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
   3227         delete [] testData;
   3228         return;
   3229     }
   3230 
   3231     //
   3232     //  Loop over the test data file, once per line.
   3233     //
   3234     while (lineMat.find()) {
   3235         lineNum++;
   3236         if (U_FAILURE(status)) {
   3237           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
   3238         }
   3239 
   3240         status = U_ZERO_ERROR;
   3241         UnicodeString testLine = lineMat.group(1, status);
   3242         if (testLine.length() == 0) {
   3243             continue;
   3244         }
   3245 
   3246         //
   3247         // Parse the test line.  Skip blank and comment only lines.
   3248         // Separate out the three main fields - pattern, flags, target.
   3249         //
   3250 
   3251         commentMat.reset(testLine);
   3252         if (commentMat.lookingAt(status)) {
   3253             // This line is a comment, or blank.
   3254             continue;
   3255         }
   3256 
   3257         //
   3258         //  Pull out the pattern field, remove it from the test file line.
   3259         //
   3260         quotedStuffMat.reset(testLine);
   3261         if (quotedStuffMat.lookingAt(status)) {
   3262             testPattern = quotedStuffMat.group(2, status);
   3263             testLine.remove(0, quotedStuffMat.end(0, status));
   3264         } else {
   3265             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
   3266             continue;
   3267         }
   3268 
   3269 
   3270         //
   3271         //  Pull out the flags from the test file line.
   3272         //
   3273         flagsMat.reset(testLine);
   3274         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
   3275         testFlags = flagsMat.group(1, status);
   3276         if (flagsMat.group(2, status).length() > 0) {
   3277             errln("Bad Match flag at line %d. Scanning %c\n",
   3278                 lineNum, flagsMat.group(2, status).charAt(0));
   3279             continue;
   3280         }
   3281         testLine.remove(0, flagsMat.end(0, status));
   3282 
   3283         //
   3284         //  Pull out the match string, as a whole.
   3285         //    We'll process the <tags> later.
   3286         //
   3287         quotedStuffMat.reset(testLine);
   3288         if (quotedStuffMat.lookingAt(status)) {
   3289             matchString = quotedStuffMat.group(2, status);
   3290             testLine.remove(0, quotedStuffMat.end(0, status));
   3291         } else {
   3292             errln("Bad match string at test file line %d", lineNum);
   3293             continue;
   3294         }
   3295 
   3296         //
   3297         //  The only thing left from the input line should be an optional trailing comment.
   3298         //
   3299         commentMat.reset(testLine);
   3300         if (commentMat.lookingAt(status) == FALSE) {
   3301             errln("Line %d: unexpected characters at end of test line.", lineNum);
   3302             continue;
   3303         }
   3304 
   3305         //
   3306         //  Run the test
   3307         //
   3308         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
   3309     }
   3310 
   3311     delete [] testData;
   3312 
   3313 }
   3314 
   3315 
   3316 
   3317 //---------------------------------------------------------------------------
   3318 //
   3319 //    regex_find(pattern, flags, inputString, lineNumber)
   3320 //
   3321 //         Function to run a single test from the Extended (data driven) tests.
   3322 //         See file test/testdata/regextst.txt for a description of the
   3323 //         pattern and inputString fields, and the allowed flags.
   3324 //         lineNumber is the source line in regextst.txt of the test.
   3325 //
   3326 //---------------------------------------------------------------------------
   3327 
   3328 
   3329 //  Set a value into a UVector at position specified by a decimal number in
   3330 //   a UnicodeString.   This is a utility function needed by the actual test function,
   3331 //   which follows.
   3332 static void set(UVector &vec, int32_t val, UnicodeString index) {
   3333     UErrorCode  status=U_ZERO_ERROR;
   3334     int32_t  idx = 0;
   3335     for (int32_t i=0; i<index.length(); i++) {
   3336         int32_t d=u_charDigitValue(index.charAt(i));
   3337         if (d<0) {return;}
   3338         idx = idx*10 + d;
   3339     }
   3340     while (vec.size()<idx+1) {vec.addElement(-1, status);}
   3341     vec.setElementAt(val, idx);
   3342 }
   3343 
   3344 static void setInt(UVector &vec, int32_t val, int32_t idx) {
   3345     UErrorCode  status=U_ZERO_ERROR;
   3346     while (vec.size()<idx+1) {vec.addElement(-1, status);}
   3347     vec.setElementAt(val, idx);
   3348 }
   3349 
   3350 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
   3351 {
   3352     UBool couldFind = TRUE;
   3353     UTEXT_SETNATIVEINDEX(utext, 0);
   3354     int32_t i = 0;
   3355     while (i < unistrOffset) {
   3356         UChar32 c = UTEXT_NEXT32(utext);
   3357         if (c != U_SENTINEL) {
   3358             i += U16_LENGTH(c);
   3359         } else {
   3360             couldFind = FALSE;
   3361             break;
   3362         }
   3363     }
   3364     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
   3365     return couldFind;
   3366 }
   3367 
   3368 
   3369 void RegexTest::regex_find(const UnicodeString &pattern,
   3370                            const UnicodeString &flags,
   3371                            const UnicodeString &inputString,
   3372                            const char *srcPath,
   3373                            int32_t line) {
   3374     UnicodeString       unEscapedInput;
   3375     UnicodeString       deTaggedInput;
   3376 
   3377     int32_t             patternUTF8Length,      inputUTF8Length;
   3378     char                *patternChars  = NULL, *inputChars = NULL;
   3379     UText               patternText    = UTEXT_INITIALIZER;
   3380     UText               inputText      = UTEXT_INITIALIZER;
   3381     UConverter          *UTF8Converter = NULL;
   3382 
   3383     UErrorCode          status         = U_ZERO_ERROR;
   3384     UParseError         pe;
   3385     RegexPattern        *parsePat      = NULL;
   3386     RegexMatcher        *parseMatcher  = NULL;
   3387     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
   3388     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
   3389     UVector             groupStarts(status);
   3390     UVector             groupEnds(status);
   3391     UVector             groupStartsUTF8(status);
   3392     UVector             groupEndsUTF8(status);
   3393     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
   3394     UBool               failed         = FALSE;
   3395     int32_t             numFinds;
   3396     int32_t             i;
   3397     UBool               useMatchesFunc   = FALSE;
   3398     UBool               useLookingAtFunc = FALSE;
   3399     int32_t             regionStart      = -1;
   3400     int32_t             regionEnd        = -1;
   3401     int32_t             regionStartUTF8  = -1;
   3402     int32_t             regionEndUTF8    = -1;
   3403 
   3404 
   3405     //
   3406     //  Compile the caller's pattern
   3407     //
   3408     uint32_t bflags = 0;
   3409     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
   3410         bflags |= UREGEX_CASE_INSENSITIVE;
   3411     }
   3412     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
   3413         bflags |= UREGEX_COMMENTS;
   3414     }
   3415     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
   3416         bflags |= UREGEX_DOTALL;
   3417     }
   3418     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
   3419         bflags |= UREGEX_MULTILINE;
   3420     }
   3421 
   3422     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
   3423         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
   3424     }
   3425     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
   3426         bflags |= UREGEX_UNIX_LINES;
   3427     }
   3428     if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
   3429         bflags |= UREGEX_LITERAL;
   3430     }
   3431 
   3432 
   3433     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
   3434     if (status != U_ZERO_ERROR) {
   3435         #if UCONFIG_NO_BREAK_ITERATION==1
   3436         // 'v' test flag means that the test pattern should not compile if ICU was configured
   3437         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
   3438         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
   3439             goto cleanupAndReturn;
   3440         }
   3441         #endif
   3442         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
   3443             // Expected pattern compilation error.
   3444             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
   3445                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
   3446             }
   3447             goto cleanupAndReturn;
   3448         } else {
   3449             // Unexpected pattern compilation error.
   3450             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
   3451             goto cleanupAndReturn;
   3452         }
   3453     }
   3454 
   3455     UTF8Converter = ucnv_open("UTF8", &status);
   3456     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
   3457 
   3458     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
   3459     status = U_ZERO_ERROR; // buffer overflow
   3460     patternChars = new char[patternUTF8Length+1];
   3461     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
   3462     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
   3463 
   3464     if (status == U_ZERO_ERROR) {
   3465         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
   3466 
   3467         if (status != U_ZERO_ERROR) {
   3468 #if UCONFIG_NO_BREAK_ITERATION==1
   3469             // 'v' test flag means that the test pattern should not compile if ICU was configured
   3470             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
   3471             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
   3472                 goto cleanupAndReturn;
   3473             }
   3474 #endif
   3475             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
   3476                 // Expected pattern compilation error.
   3477                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
   3478                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
   3479                 }
   3480                 goto cleanupAndReturn;
   3481             } else {
   3482                 // Unexpected pattern compilation error.
   3483                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
   3484                 goto cleanupAndReturn;
   3485             }
   3486         }
   3487     }
   3488 
   3489     if (UTF8Pattern == NULL) {
   3490         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
   3491         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
   3492         status = U_ZERO_ERROR;
   3493     }
   3494 
   3495     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
   3496         callerPattern->dumpPattern();
   3497     }
   3498 
   3499     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
   3500         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
   3501         goto cleanupAndReturn;
   3502     }
   3503 
   3504 
   3505     //
   3506     // Number of times find() should be called on the test string, default to 1
   3507     //
   3508     numFinds = 1;
   3509     for (i=2; i<=9; i++) {
   3510         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
   3511             if (numFinds != 1) {
   3512                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
   3513                 goto cleanupAndReturn;
   3514             }
   3515             numFinds = i;
   3516         }
   3517     }
   3518 
   3519     // 'M' flag.  Use matches() instead of find()
   3520     if (flags.indexOf((UChar)0x4d) >= 0) {
   3521         useMatchesFunc = TRUE;
   3522     }
   3523     if (flags.indexOf((UChar)0x4c) >= 0) {
   3524         useLookingAtFunc = TRUE;
   3525     }
   3526 
   3527     //
   3528     //  Find the tags in the input data, remove them, and record the group boundary
   3529     //    positions.
   3530     //
   3531     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
   3532     REGEX_CHECK_STATUS_L(line);
   3533 
   3534     unEscapedInput = inputString.unescape();
   3535     parseMatcher = parsePat->matcher(unEscapedInput, status);
   3536     REGEX_CHECK_STATUS_L(line);
   3537     while(parseMatcher->find()) {
   3538         parseMatcher->appendReplacement(deTaggedInput, "", status);
   3539         REGEX_CHECK_STATUS;
   3540         UnicodeString groupNum = parseMatcher->group(2, status);
   3541         if (groupNum == "r") {
   3542             // <r> or </r>, a region specification within the string
   3543             if (parseMatcher->group(1, status) == "/") {
   3544                 regionEnd = deTaggedInput.length();
   3545             } else {
   3546                 regionStart = deTaggedInput.length();
   3547             }
   3548         } else {
   3549             // <digits> or </digits>, a group match boundary tag.
   3550             if (parseMatcher->group(1, status) == "/") {
   3551                 set(groupEnds, deTaggedInput.length(), groupNum);
   3552             } else {
   3553                 set(groupStarts, deTaggedInput.length(), groupNum);
   3554             }
   3555         }
   3556     }
   3557     parseMatcher->appendTail(deTaggedInput);
   3558     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
   3559     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
   3560       errln("mismatched <r> tags");
   3561       failed = TRUE;
   3562       goto cleanupAndReturn;
   3563     }
   3564 
   3565     //
   3566     //  Configure the matcher according to the flags specified with this test.
   3567     //
   3568     matcher = callerPattern->matcher(deTaggedInput, status);
   3569     REGEX_CHECK_STATUS_L(line);
   3570     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
   3571         matcher->setTrace(TRUE);
   3572     }
   3573 
   3574     if (UTF8Pattern != NULL) {
   3575         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
   3576         status = U_ZERO_ERROR; // buffer overflow
   3577         inputChars = new char[inputUTF8Length+1];
   3578         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
   3579         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
   3580 
   3581         if (status == U_ZERO_ERROR) {
   3582             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
   3583             REGEX_CHECK_STATUS_L(line);
   3584         }
   3585 
   3586         if (UTF8Matcher == NULL) {
   3587             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
   3588             logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
   3589             status = U_ZERO_ERROR;
   3590         }
   3591     }
   3592 
   3593     //
   3594     //  Generate native indices for UTF8 versions of region and capture group info
   3595     //
   3596     if (UTF8Matcher != NULL) {
   3597         if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
   3598             UTF8Matcher->setTrace(TRUE);
   3599         }
   3600         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
   3601         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
   3602 
   3603         //  Fill out the native index UVector info.
   3604         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
   3605         for (i=0; i<groupStarts.size(); i++) {
   3606             int32_t  start = groupStarts.elementAti(i);
   3607             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
   3608             if (start >= 0) {
   3609                 int32_t  startUTF8;
   3610                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
   3611                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
   3612                     failed = TRUE;
   3613                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3614                 }
   3615                 setInt(groupStartsUTF8, startUTF8, i);
   3616             }
   3617 
   3618             int32_t  end = groupEnds.elementAti(i);
   3619             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
   3620             if (end >= 0) {
   3621                 int32_t  endUTF8;
   3622                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
   3623                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
   3624                     failed = TRUE;
   3625                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3626                 }
   3627                 setInt(groupEndsUTF8, endUTF8, i);
   3628             }
   3629         }
   3630     }
   3631 
   3632     if (regionStart>=0) {
   3633        matcher->region(regionStart, regionEnd, status);
   3634        REGEX_CHECK_STATUS_L(line);
   3635        if (UTF8Matcher != NULL) {
   3636            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
   3637            REGEX_CHECK_STATUS_L(line);
   3638        }
   3639     }
   3640     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
   3641         matcher->useAnchoringBounds(FALSE);
   3642         if (UTF8Matcher != NULL) {
   3643             UTF8Matcher->useAnchoringBounds(FALSE);
   3644         }
   3645     }
   3646     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
   3647         matcher->useTransparentBounds(TRUE);
   3648         if (UTF8Matcher != NULL) {
   3649             UTF8Matcher->useTransparentBounds(TRUE);
   3650         }
   3651     }
   3652 
   3653 
   3654 
   3655     //
   3656     // Do a find on the de-tagged input using the caller's pattern
   3657     //     TODO: error on count>1 and not find().
   3658     //           error on both matches() and lookingAt().
   3659     //
   3660     for (i=0; i<numFinds; i++) {
   3661         if (useMatchesFunc) {
   3662             isMatch = matcher->matches(status);
   3663             if (UTF8Matcher != NULL) {
   3664                isUTF8Match = UTF8Matcher->matches(status);
   3665             }
   3666         } else  if (useLookingAtFunc) {
   3667             isMatch = matcher->lookingAt(status);
   3668             if (UTF8Matcher != NULL) {
   3669                 isUTF8Match = UTF8Matcher->lookingAt(status);
   3670             }
   3671         } else {
   3672             isMatch = matcher->find();
   3673             if (UTF8Matcher != NULL) {
   3674                 isUTF8Match = UTF8Matcher->find();
   3675             }
   3676         }
   3677     }
   3678     matcher->setTrace(FALSE);
   3679     if (UTF8Matcher) {
   3680         UTF8Matcher->setTrace(FALSE);
   3681     }
   3682     if (U_FAILURE(status)) {
   3683         errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
   3684     }
   3685 
   3686     //
   3687     // Match up the groups from the find() with the groups from the tags
   3688     //
   3689 
   3690     // number of tags should match number of groups from find operation.
   3691     // matcher->groupCount does not include group 0, the entire match, hence the +1.
   3692     //   G option in test means that capture group data is not available in the
   3693     //     expected results, so the check needs to be suppressed.
   3694     if (isMatch == FALSE && groupStarts.size() != 0) {
   3695         dataerrln("Error at line %d:  Match expected, but none found.", line);
   3696         failed = TRUE;
   3697         goto cleanupAndReturn;
   3698     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
   3699         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
   3700         failed = TRUE;
   3701         goto cleanupAndReturn;
   3702     }
   3703     if (isMatch && groupStarts.size() == 0) {
   3704         errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
   3705         failed = TRUE;
   3706     }
   3707     if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
   3708         errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
   3709         failed = TRUE;
   3710     }
   3711 
   3712     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
   3713         // Only check for match / no match.  Don't check capture groups.
   3714         goto cleanupAndReturn;
   3715     }
   3716 
   3717     REGEX_CHECK_STATUS_L(line);
   3718     for (i=0; i<=matcher->groupCount(); i++) {
   3719         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
   3720         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
   3721         if (matcher->start(i, status) != expectedStart) {
   3722             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
   3723                 line, i, expectedStart, matcher->start(i, status));
   3724             failed = TRUE;
   3725             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3726         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
   3727             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
   3728                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
   3729             failed = TRUE;
   3730             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3731         }
   3732 
   3733         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
   3734         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
   3735         if (matcher->end(i, status) != expectedEnd) {
   3736             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
   3737                 line, i, expectedEnd, matcher->end(i, status));
   3738             failed = TRUE;
   3739             // Error on end position;  keep going; real error is probably yet to come as group
   3740             //   end positions work from end of the input data towards the front.
   3741         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
   3742             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
   3743                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
   3744             failed = TRUE;
   3745             // Error on end position;  keep going; real error is probably yet to come as group
   3746             //   end positions work from end of the input data towards the front.
   3747         }
   3748     }
   3749     if ( matcher->groupCount()+1 < groupStarts.size()) {
   3750         errln("Error at line %d: Expected %d capture groups, found %d.",
   3751             line, groupStarts.size()-1, matcher->groupCount());
   3752         failed = TRUE;
   3753         }
   3754     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
   3755         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
   3756               line, groupStarts.size()-1, UTF8Matcher->groupCount());
   3757         failed = TRUE;
   3758     }
   3759 
   3760     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3761         matcher->requireEnd() == TRUE) {
   3762         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
   3763         failed = TRUE;
   3764     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3765         UTF8Matcher->requireEnd() == TRUE) {
   3766         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
   3767         failed = TRUE;
   3768     }
   3769 
   3770     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
   3771         matcher->requireEnd() == FALSE) {
   3772         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
   3773         failed = TRUE;
   3774     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3775         UTF8Matcher->requireEnd() == FALSE) {
   3776         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
   3777         failed = TRUE;
   3778     }
   3779 
   3780     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
   3781         matcher->hitEnd() == TRUE) {
   3782         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
   3783         failed = TRUE;
   3784     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
   3785                UTF8Matcher->hitEnd() == TRUE) {
   3786         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
   3787         failed = TRUE;
   3788     }
   3789 
   3790     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
   3791         matcher->hitEnd() == FALSE) {
   3792         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
   3793         failed = TRUE;
   3794     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
   3795                UTF8Matcher->hitEnd() == FALSE) {
   3796         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
   3797         failed = TRUE;
   3798     }
   3799 
   3800 
   3801 cleanupAndReturn:
   3802     if (failed) {
   3803         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
   3804             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
   3805         // callerPattern->dump();
   3806     }
   3807     delete parseMatcher;
   3808     delete parsePat;
   3809     delete UTF8Matcher;
   3810     delete UTF8Pattern;
   3811     delete matcher;
   3812     delete callerPattern;
   3813 
   3814     utext_close(&inputText);
   3815     delete[] inputChars;
   3816     utext_close(&patternText);
   3817     delete[] patternChars;
   3818     ucnv_close(UTF8Converter);
   3819 }
   3820 
   3821 
   3822 
   3823 
   3824 //---------------------------------------------------------------------------
   3825 //
   3826 //      Errors     Check for error handling in patterns.
   3827 //
   3828 //---------------------------------------------------------------------------
   3829 void RegexTest::Errors() {
   3830     // \escape sequences that aren't implemented yet.
   3831     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
   3832 
   3833     // Missing close parentheses
   3834     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
   3835     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
   3836     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
   3837 
   3838     // Extra close paren
   3839     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
   3840     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
   3841     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
   3842 
   3843     // Look-ahead, Look-behind
   3844     //  TODO:  add tests for unbounded length look-behinds.
   3845     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
   3846 
   3847     // Attempt to use non-default flags
   3848     {
   3849         UParseError   pe;
   3850         UErrorCode    status = U_ZERO_ERROR;
   3851         int32_t       flags  = UREGEX_CANON_EQ |
   3852                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
   3853                                UREGEX_MULTILINE;
   3854         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
   3855         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
   3856         delete pat1;
   3857     }
   3858 
   3859 
   3860     // Quantifiers are allowed only after something that can be quantified.
   3861     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
   3862     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
   3863     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
   3864 
   3865     // Mal-formed {min,max} quantifiers
   3866     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
   3867     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
   3868     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
   3869     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
   3870     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
   3871     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
   3872     REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
   3873     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
   3874     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
   3875 
   3876     // Ticket 5389
   3877     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
   3878 
   3879     // Invalid Back Reference \0
   3880     //    For ICU 3.8 and earlier
   3881     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
   3882     //
   3883     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
   3884 
   3885 }
   3886 
   3887 
   3888 //-------------------------------------------------------------------------------
   3889 //
   3890 //  Read a text data file, convert it to UChars, and return the data
   3891 //    in one big UChar * buffer, which the caller must delete.
   3892 //
   3893 //--------------------------------------------------------------------------------
   3894 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
   3895                                      const char *defEncoding, UErrorCode &status) {
   3896     UChar       *retPtr  = NULL;
   3897     char        *fileBuf = NULL;
   3898     UConverter* conv     = NULL;
   3899     FILE        *f       = NULL;
   3900 
   3901     ulen = 0;
   3902     if (U_FAILURE(status)) {
   3903         return retPtr;
   3904     }
   3905 
   3906     //
   3907     //  Open the file.
   3908     //
   3909     f = fopen(fileName, "rb");
   3910     if (f == 0) {
   3911         dataerrln("Error opening test data file %s\n", fileName);
   3912         status = U_FILE_ACCESS_ERROR;
   3913         return NULL;
   3914     }
   3915     //
   3916     //  Read it in
   3917     //
   3918     int32_t            fileSize;
   3919     int32_t            amt_read;
   3920 
   3921     fseek( f, 0, SEEK_END);
   3922     fileSize = ftell(f);
   3923     fileBuf = new char[fileSize];
   3924     fseek(f, 0, SEEK_SET);
   3925     amt_read = fread(fileBuf, 1, fileSize, f);
   3926     if (amt_read != fileSize || fileSize <= 0) {
   3927         errln("Error reading test data file.");
   3928         goto cleanUpAndReturn;
   3929     }
   3930 
   3931     //
   3932     // Look for a Unicode Signature (BOM) on the data just read
   3933     //
   3934     int32_t        signatureLength;
   3935     const char *   fileBufC;
   3936     const char*    encoding;
   3937 
   3938     fileBufC = fileBuf;
   3939     encoding = ucnv_detectUnicodeSignature(
   3940         fileBuf, fileSize, &signatureLength, &status);
   3941     if(encoding!=NULL ){
   3942         fileBufC  += signatureLength;
   3943         fileSize  -= signatureLength;
   3944     } else {
   3945         encoding = defEncoding;
   3946         if (strcmp(encoding, "utf-8") == 0) {
   3947             errln("file %s is missing its BOM", fileName);
   3948         }
   3949     }
   3950 
   3951     //
   3952     // Open a converter to take the rule file to UTF-16
   3953     //
   3954     conv = ucnv_open(encoding, &status);
   3955     if (U_FAILURE(status)) {
   3956         goto cleanUpAndReturn;
   3957     }
   3958 
   3959     //
   3960     // Convert the rules to UChar.
   3961     //  Preflight first to determine required buffer size.
   3962     //
   3963     ulen = ucnv_toUChars(conv,
   3964         NULL,           //  dest,
   3965         0,              //  destCapacity,
   3966         fileBufC,
   3967         fileSize,
   3968         &status);
   3969     if (status == U_BUFFER_OVERFLOW_ERROR) {
   3970         // Buffer Overflow is expected from the preflight operation.
   3971         status = U_ZERO_ERROR;
   3972 
   3973         retPtr = new UChar[ulen+1];
   3974         ucnv_toUChars(conv,
   3975             retPtr,       //  dest,
   3976             ulen+1,
   3977             fileBufC,
   3978             fileSize,
   3979             &status);
   3980     }
   3981 
   3982 cleanUpAndReturn:
   3983     fclose(f);
   3984     delete[] fileBuf;
   3985     ucnv_close(conv);
   3986     if (U_FAILURE(status)) {
   3987         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   3988         delete []retPtr;
   3989         retPtr = 0;
   3990         ulen   = 0;
   3991     };
   3992     return retPtr;
   3993 }
   3994 
   3995 
   3996 //-------------------------------------------------------------------------------
   3997 //
   3998 //   PerlTests  - Run Perl's regular expression tests
   3999 //                The input file for this test is re_tests, the standard regular
   4000 //                expression test data distributed with the Perl source code.
   4001 //
   4002 //                Here is Perl's description of the test data file:
   4003 //
   4004 //        # The tests are in a separate file 't/op/re_tests'.
   4005 //        # Each line in that file is a separate test.
   4006 //        # There are five columns, separated by tabs.
   4007 //        #
   4008 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
   4009 //        # Modifiers can be put after the closing C<'>.
   4010 //        #
   4011 //        # Column 2 contains the string to be matched.
   4012 //        #
   4013 //        # Column 3 contains the expected result:
   4014 //        #     y   expect a match
   4015 //        #     n   expect no match
   4016 //        #     c   expect an error
   4017 //        # B   test exposes a known bug in Perl, should be skipped
   4018 //        # b   test exposes a known bug in Perl, should be skipped if noamp
   4019 //        #
   4020 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
   4021 //        #
   4022 //        # Column 4 contains a string, usually C<$&>.
   4023 //        #
   4024 //        # Column 5 contains the expected result of double-quote
   4025 //        # interpolating that string after the match, or start of error message.
   4026 //        #
   4027 //        # Column 6, if present, contains a reason why the test is skipped.
   4028 //        # This is printed with "skipped", for harness to pick up.
   4029 //        #
   4030 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
   4031 //        #
   4032 //        # If you want to add a regular expression test that can't be expressed
   4033 //        # in this format, don't add it here: put it in op/pat.t instead.
   4034 //
   4035 //        For ICU, if field 3 contains an 'i', the test will be skipped.
   4036 //        The test exposes some known incompatibility between ICU and Perl regexps.
   4037 //        (The i is in addition to whatever was there before.)
   4038 //
   4039 //-------------------------------------------------------------------------------
   4040 void RegexTest::PerlTests() {
   4041     char tdd[2048];
   4042     const char *srcPath;
   4043     UErrorCode  status = U_ZERO_ERROR;
   4044     UParseError pe;
   4045 
   4046     //
   4047     //  Open and read the test data file.
   4048     //
   4049     srcPath=getPath(tdd, "re_tests.txt");
   4050     if(srcPath==NULL) {
   4051         return; /* something went wrong, error already output */
   4052     }
   4053 
   4054     int32_t    len;
   4055     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
   4056     if (U_FAILURE(status)) {
   4057         return; /* something went wrong, error already output */
   4058     }
   4059 
   4060     //
   4061     //  Put the test data into a UnicodeString
   4062     //
   4063     UnicodeString testDataString(FALSE, testData, len);
   4064 
   4065     //
   4066     //  Regex to break the input file into lines, and strip the new lines.
   4067     //     One line per match, capture group one is the desired data.
   4068     //
   4069     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
   4070     if (U_FAILURE(status)) {
   4071         dataerrln("RegexPattern::compile() error");
   4072         return;
   4073     }
   4074     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
   4075 
   4076     //
   4077     //  Regex to split a test file line into fields.
   4078     //    There are six fields, separated by tabs.
   4079     //
   4080     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
   4081 
   4082     //
   4083     //  Regex to identify test patterns with flag settings, and to separate them.
   4084     //    Test patterns with flags look like 'pattern'i
   4085     //    Test patterns without flags are not quoted:   pattern
   4086     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
   4087     //
   4088     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
   4089     RegexMatcher* flagMat = flagPat->matcher(status);
   4090 
   4091     //
   4092     // The Perl tests reference several perl-isms, which are evaluated/substituted
   4093     //   in the test data.  Not being perl, this must be done explicitly.  Here
   4094     //   are string constants and REs for these constructs.
   4095     //
   4096     UnicodeString nulnulSrc("${nulnul}");
   4097     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
   4098     nulnul = nulnul.unescape();
   4099 
   4100     UnicodeString ffffSrc("${ffff}");
   4101     UnicodeString ffff("\\uffff", -1, US_INV);
   4102     ffff = ffff.unescape();
   4103 
   4104     //  regexp for $-[0], $+[2], etc.
   4105     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
   4106     RegexMatcher *groupsMat = groupsPat->matcher(status);
   4107 
   4108     //  regexp for $0, $1, $2, etc.
   4109     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
   4110     RegexMatcher *cgMat = cgPat->matcher(status);
   4111 
   4112 
   4113     //
   4114     // Main Loop for the Perl Tests, runs once per line from the
   4115     //   test data file.
   4116     //
   4117     int32_t  lineNum = 0;
   4118     int32_t  skippedUnimplementedCount = 0;
   4119     while (lineMat->find()) {
   4120         lineNum++;
   4121 
   4122         //
   4123         //  Get a line, break it into its fields, do the Perl
   4124         //    variable substitutions.
   4125         //
   4126         UnicodeString line = lineMat->group(1, status);
   4127         UnicodeString fields[7];
   4128         fieldPat->split(line, fields, 7, status);
   4129 
   4130         flagMat->reset(fields[0]);
   4131         flagMat->matches(status);
   4132         UnicodeString pattern  = flagMat->group(2, status);
   4133         pattern.findAndReplace("${bang}", "!");
   4134         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
   4135         pattern.findAndReplace(ffffSrc, ffff);
   4136 
   4137         //
   4138         //  Identify patterns that include match flag settings,
   4139         //    split off the flags, remove the extra quotes.
   4140         //
   4141         UnicodeString flagStr = flagMat->group(3, status);
   4142         if (U_FAILURE(status)) {
   4143             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   4144             return;
   4145         }
   4146         int32_t flags = 0;
   4147         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
   4148         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
   4149         const UChar UChar_m = 0x6d;
   4150         const UChar UChar_x = 0x78;
   4151         const UChar UChar_y = 0x79;
   4152         if (flagStr.indexOf(UChar_i) != -1) {
   4153             flags |= UREGEX_CASE_INSENSITIVE;
   4154         }
   4155         if (flagStr.indexOf(UChar_m) != -1) {
   4156             flags |= UREGEX_MULTILINE;
   4157         }
   4158         if (flagStr.indexOf(UChar_x) != -1) {
   4159             flags |= UREGEX_COMMENTS;
   4160         }
   4161 
   4162         //
   4163         // Compile the test pattern.
   4164         //
   4165         status = U_ZERO_ERROR;
   4166         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
   4167         if (status == U_REGEX_UNIMPLEMENTED) {
   4168             //
   4169             // Test of a feature that is planned for ICU, but not yet implemented.
   4170             //   skip the test.
   4171             skippedUnimplementedCount++;
   4172             delete testPat;
   4173             status = U_ZERO_ERROR;
   4174             continue;
   4175         }
   4176 
   4177         if (U_FAILURE(status)) {
   4178             // Some tests are supposed to generate errors.
   4179             //   Only report an error for tests that are supposed to succeed.
   4180             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
   4181                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
   4182             {
   4183                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
   4184             }
   4185             status = U_ZERO_ERROR;
   4186             delete testPat;
   4187             continue;
   4188         }
   4189 
   4190         if (fields[2].indexOf(UChar_i) >= 0) {
   4191             // ICU should skip this test.
   4192             delete testPat;
   4193             continue;
   4194         }
   4195 
   4196         if (fields[2].indexOf(UChar_c) >= 0) {
   4197             // This pattern should have caused a compilation error, but didn't/
   4198             errln("line %d: Expected a pattern compile error, got success.", lineNum);
   4199             delete testPat;
   4200             continue;
   4201         }
   4202 
   4203         //
   4204         // replace the Perl variables that appear in some of the
   4205         //   match data strings.
   4206         //
   4207         UnicodeString matchString = fields[1];
   4208         matchString.findAndReplace(nulnulSrc, nulnul);
   4209         matchString.findAndReplace(ffffSrc,   ffff);
   4210 
   4211         // Replace any \n in the match string with an actual new-line char.
   4212         //  Don't do full unescape, as this unescapes more than Perl does, which
   4213         //  causes other spurious failures in the tests.
   4214         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4215 
   4216 
   4217 
   4218         //
   4219         // Run the test, check for expected match/don't match result.
   4220         //
   4221         RegexMatcher *testMat = testPat->matcher(matchString, status);
   4222         UBool found = testMat->find();
   4223         UBool expected = FALSE;
   4224         if (fields[2].indexOf(UChar_y) >=0) {
   4225             expected = TRUE;
   4226         }
   4227         if (expected != found) {
   4228             errln("line %d: Expected %smatch, got %smatch",
   4229                 lineNum, expected?"":"no ", found?"":"no " );
   4230             continue;
   4231         }
   4232 
   4233         // Don't try to check expected results if there is no match.
   4234         //   (Some have stuff in the expected fields)
   4235         if (!found) {
   4236             delete testMat;
   4237             delete testPat;
   4238             continue;
   4239         }
   4240 
   4241         //
   4242         // Interpret the Perl expression from the fourth field of the data file,
   4243         // building up an ICU string from the results of the ICU match.
   4244         //   The Perl expression will contain references to the results of
   4245         //     a regex match, including the matched string, capture group strings,
   4246         //     group starting and ending indicies, etc.
   4247         //
   4248         UnicodeString resultString;
   4249         UnicodeString perlExpr = fields[3];
   4250 #if SUPPORT_MUTATING_INPUT_STRING
   4251         groupsMat->reset(perlExpr);
   4252         cgMat->reset(perlExpr);
   4253 #endif
   4254 
   4255         while (perlExpr.length() > 0) {
   4256 #if !SUPPORT_MUTATING_INPUT_STRING
   4257             //  Perferred usage.  Reset after any modification to input string.
   4258             groupsMat->reset(perlExpr);
   4259             cgMat->reset(perlExpr);
   4260 #endif
   4261 
   4262             if (perlExpr.startsWith("$&")) {
   4263                 resultString.append(testMat->group(status));
   4264                 perlExpr.remove(0, 2);
   4265             }
   4266 
   4267             else if (groupsMat->lookingAt(status)) {
   4268                 // $-[0]   $+[2]  etc.
   4269                 UnicodeString digitString = groupsMat->group(2, status);
   4270                 int32_t t = 0;
   4271                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4272                 UnicodeString plusOrMinus = groupsMat->group(1, status);
   4273                 int32_t matchPosition;
   4274                 if (plusOrMinus.compare("+") == 0) {
   4275                     matchPosition = testMat->end(groupNum, status);
   4276                 } else {
   4277                     matchPosition = testMat->start(groupNum, status);
   4278                 }
   4279                 if (matchPosition != -1) {
   4280                     ICU_Utility::appendNumber(resultString, matchPosition);
   4281                 }
   4282                 perlExpr.remove(0, groupsMat->end(status));
   4283             }
   4284 
   4285             else if (cgMat->lookingAt(status)) {
   4286                 // $1, $2, $3, etc.
   4287                 UnicodeString digitString = cgMat->group(1, status);
   4288                 int32_t t = 0;
   4289                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4290                 if (U_SUCCESS(status)) {
   4291                     resultString.append(testMat->group(groupNum, status));
   4292                     status = U_ZERO_ERROR;
   4293                 }
   4294                 perlExpr.remove(0, cgMat->end(status));
   4295             }
   4296 
   4297             else if (perlExpr.startsWith("@-")) {
   4298                 int32_t i;
   4299                 for (i=0; i<=testMat->groupCount(); i++) {
   4300                     if (i>0) {
   4301                         resultString.append(" ");
   4302                     }
   4303                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
   4304                 }
   4305                 perlExpr.remove(0, 2);
   4306             }
   4307 
   4308             else if (perlExpr.startsWith("@+")) {
   4309                 int32_t i;
   4310                 for (i=0; i<=testMat->groupCount(); i++) {
   4311                     if (i>0) {
   4312                         resultString.append(" ");
   4313                     }
   4314                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
   4315                 }
   4316                 perlExpr.remove(0, 2);
   4317             }
   4318 
   4319             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
   4320                                                      //           or as an escaped sequence (e.g. \n)
   4321                 if (perlExpr.length() > 1) {
   4322                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
   4323                 }
   4324                 UChar c = perlExpr.charAt(0);
   4325                 switch (c) {
   4326                 case 'n':   c = '\n'; break;
   4327                 // add any other escape sequences that show up in the test expected results.
   4328                 }
   4329                 resultString.append(c);
   4330                 perlExpr.remove(0, 1);
   4331             }
   4332 
   4333             else  {
   4334                 // Any characters from the perl expression that we don't explicitly
   4335                 //  recognize before here are assumed to be literals and copied
   4336                 //  as-is to the expected results.
   4337                 resultString.append(perlExpr.charAt(0));
   4338                 perlExpr.remove(0, 1);
   4339             }
   4340 
   4341             if (U_FAILURE(status)) {
   4342                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   4343                 break;
   4344             }
   4345         }
   4346 
   4347         //
   4348         // Expected Results Compare
   4349         //
   4350         UnicodeString expectedS(fields[4]);
   4351         expectedS.findAndReplace(nulnulSrc, nulnul);
   4352         expectedS.findAndReplace(ffffSrc,   ffff);
   4353         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4354 
   4355 
   4356         if (expectedS.compare(resultString) != 0) {
   4357             err("Line %d: Incorrect perl expression results.", lineNum);
   4358             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
   4359         }
   4360 
   4361         delete testMat;
   4362         delete testPat;
   4363     }
   4364 
   4365     //
   4366     // All done.  Clean up allocated stuff.
   4367     //
   4368     delete cgMat;
   4369     delete cgPat;
   4370 
   4371     delete groupsMat;
   4372     delete groupsPat;
   4373 
   4374     delete flagMat;
   4375     delete flagPat;
   4376 
   4377     delete lineMat;
   4378     delete linePat;
   4379 
   4380     delete fieldPat;
   4381     delete [] testData;
   4382 
   4383 
   4384     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
   4385 
   4386 }
   4387 
   4388 
   4389 //-------------------------------------------------------------------------------
   4390 //
   4391 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
   4392 //                  (instead of using UnicodeStrings) to test the alternate engine.
   4393 //                  The input file for this test is re_tests, the standard regular
   4394 //                  expression test data distributed with the Perl source code.
   4395 //                  See PerlTests() for more information.
   4396 //
   4397 //-------------------------------------------------------------------------------
   4398 void RegexTest::PerlTestsUTF8() {
   4399     char tdd[2048];
   4400     const char *srcPath;
   4401     UErrorCode  status = U_ZERO_ERROR;
   4402     UParseError pe;
   4403     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
   4404     UText       patternText = UTEXT_INITIALIZER;
   4405     char       *patternChars = NULL;
   4406     int32_t     patternLength;
   4407     int32_t     patternCapacity = 0;
   4408     UText       inputText = UTEXT_INITIALIZER;
   4409     char       *inputChars = NULL;
   4410     int32_t     inputLength;
   4411     int32_t     inputCapacity = 0;
   4412 
   4413     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
   4414 
   4415     //
   4416     //  Open and read the test data file.
   4417     //
   4418     srcPath=getPath(tdd, "re_tests.txt");
   4419     if(srcPath==NULL) {
   4420         return; /* something went wrong, error already output */
   4421     }
   4422 
   4423     int32_t    len;
   4424     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
   4425     if (U_FAILURE(status)) {
   4426         return; /* something went wrong, error already output */
   4427     }
   4428 
   4429     //
   4430     //  Put the test data into a UnicodeString
   4431     //
   4432     UnicodeString testDataString(FALSE, testData, len);
   4433 
   4434     //
   4435     //  Regex to break the input file into lines, and strip the new lines.
   4436     //     One line per match, capture group one is the desired data.
   4437     //
   4438     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
   4439     if (U_FAILURE(status)) {
   4440         dataerrln("RegexPattern::compile() error");
   4441         return;
   4442     }
   4443     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
   4444 
   4445     //
   4446     //  Regex to split a test file line into fields.
   4447     //    There are six fields, separated by tabs.
   4448     //
   4449     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
   4450 
   4451     //
   4452     //  Regex to identify test patterns with flag settings, and to separate them.
   4453     //    Test patterns with flags look like 'pattern'i
   4454     //    Test patterns without flags are not quoted:   pattern
   4455     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
   4456     //
   4457     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
   4458     RegexMatcher* flagMat = flagPat->matcher(status);
   4459 
   4460     //
   4461     // The Perl tests reference several perl-isms, which are evaluated/substituted
   4462     //   in the test data.  Not being perl, this must be done explicitly.  Here
   4463     //   are string constants and REs for these constructs.
   4464     //
   4465     UnicodeString nulnulSrc("${nulnul}");
   4466     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
   4467     nulnul = nulnul.unescape();
   4468 
   4469     UnicodeString ffffSrc("${ffff}");
   4470     UnicodeString ffff("\\uffff", -1, US_INV);
   4471     ffff = ffff.unescape();
   4472 
   4473     //  regexp for $-[0], $+[2], etc.
   4474     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
   4475     RegexMatcher *groupsMat = groupsPat->matcher(status);
   4476 
   4477     //  regexp for $0, $1, $2, etc.
   4478     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
   4479     RegexMatcher *cgMat = cgPat->matcher(status);
   4480 
   4481 
   4482     //
   4483     // Main Loop for the Perl Tests, runs once per line from the
   4484     //   test data file.
   4485     //
   4486     int32_t  lineNum = 0;
   4487     int32_t  skippedUnimplementedCount = 0;
   4488     while (lineMat->find()) {
   4489         lineNum++;
   4490 
   4491         //
   4492         //  Get a line, break it into its fields, do the Perl
   4493         //    variable substitutions.
   4494         //
   4495         UnicodeString line = lineMat->group(1, status);
   4496         UnicodeString fields[7];
   4497         fieldPat->split(line, fields, 7, status);
   4498 
   4499         flagMat->reset(fields[0]);
   4500         flagMat->matches(status);
   4501         UnicodeString pattern  = flagMat->group(2, status);
   4502         pattern.findAndReplace("${bang}", "!");
   4503         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
   4504         pattern.findAndReplace(ffffSrc, ffff);
   4505 
   4506         //
   4507         //  Identify patterns that include match flag settings,
   4508         //    split off the flags, remove the extra quotes.
   4509         //
   4510         UnicodeString flagStr = flagMat->group(3, status);
   4511         if (U_FAILURE(status)) {
   4512             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   4513             return;
   4514         }
   4515         int32_t flags = 0;
   4516         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
   4517         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
   4518         const UChar UChar_m = 0x6d;
   4519         const UChar UChar_x = 0x78;
   4520         const UChar UChar_y = 0x79;
   4521         if (flagStr.indexOf(UChar_i) != -1) {
   4522             flags |= UREGEX_CASE_INSENSITIVE;
   4523         }
   4524         if (flagStr.indexOf(UChar_m) != -1) {
   4525             flags |= UREGEX_MULTILINE;
   4526         }
   4527         if (flagStr.indexOf(UChar_x) != -1) {
   4528             flags |= UREGEX_COMMENTS;
   4529         }
   4530 
   4531         //
   4532         // Put the pattern in a UTF-8 UText
   4533         //
   4534         status = U_ZERO_ERROR;
   4535         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
   4536         if (status == U_BUFFER_OVERFLOW_ERROR) {
   4537             status = U_ZERO_ERROR;
   4538             delete[] patternChars;
   4539             patternCapacity = patternLength + 1;
   4540             patternChars = new char[patternCapacity];
   4541             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
   4542         }
   4543         utext_openUTF8(&patternText, patternChars, patternLength, &status);
   4544 
   4545         //
   4546         // Compile the test pattern.
   4547         //
   4548         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
   4549         if (status == U_REGEX_UNIMPLEMENTED) {
   4550             //
   4551             // Test of a feature that is planned for ICU, but not yet implemented.
   4552             //   skip the test.
   4553             skippedUnimplementedCount++;
   4554             delete testPat;
   4555             status = U_ZERO_ERROR;
   4556             continue;
   4557         }
   4558 
   4559         if (U_FAILURE(status)) {
   4560             // Some tests are supposed to generate errors.
   4561             //   Only report an error for tests that are supposed to succeed.
   4562             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
   4563                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
   4564             {
   4565                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
   4566             }
   4567             status = U_ZERO_ERROR;
   4568             delete testPat;
   4569             continue;
   4570         }
   4571 
   4572         if (fields[2].indexOf(UChar_i) >= 0) {
   4573             // ICU should skip this test.
   4574             delete testPat;
   4575             continue;
   4576         }
   4577 
   4578         if (fields[2].indexOf(UChar_c) >= 0) {
   4579             // This pattern should have caused a compilation error, but didn't/
   4580             errln("line %d: Expected a pattern compile error, got success.", lineNum);
   4581             delete testPat;
   4582             continue;
   4583         }
   4584 
   4585 
   4586         //
   4587         // replace the Perl variables that appear in some of the
   4588         //   match data strings.
   4589         //
   4590         UnicodeString matchString = fields[1];
   4591         matchString.findAndReplace(nulnulSrc, nulnul);
   4592         matchString.findAndReplace(ffffSrc,   ffff);
   4593 
   4594         // Replace any \n in the match string with an actual new-line char.
   4595         //  Don't do full unescape, as this unescapes more than Perl does, which
   4596         //  causes other spurious failures in the tests.
   4597         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4598 
   4599         //
   4600         // Put the input in a UTF-8 UText
   4601         //
   4602         status = U_ZERO_ERROR;
   4603         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
   4604         if (status == U_BUFFER_OVERFLOW_ERROR) {
   4605             status = U_ZERO_ERROR;
   4606             delete[] inputChars;
   4607             inputCapacity = inputLength + 1;
   4608             inputChars = new char[inputCapacity];
   4609             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
   4610         }
   4611         utext_openUTF8(&inputText, inputChars, inputLength, &status);
   4612 
   4613         //
   4614         // Run the test, check for expected match/don't match result.
   4615         //
   4616         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
   4617         UBool found = testMat->find();
   4618         UBool expected = FALSE;
   4619         if (fields[2].indexOf(UChar_y) >=0) {
   4620             expected = TRUE;
   4621         }
   4622         if (expected != found) {
   4623             errln("line %d: Expected %smatch, got %smatch",
   4624                 lineNum, expected?"":"no ", found?"":"no " );
   4625             continue;
   4626         }
   4627 
   4628         // Don't try to check expected results if there is no match.
   4629         //   (Some have stuff in the expected fields)
   4630         if (!found) {
   4631             delete testMat;
   4632             delete testPat;
   4633             continue;
   4634         }
   4635 
   4636         //
   4637         // Interpret the Perl expression from the fourth field of the data file,
   4638         // building up an ICU string from the results of the ICU match.
   4639         //   The Perl expression will contain references to the results of
   4640         //     a regex match, including the matched string, capture group strings,
   4641         //     group starting and ending indicies, etc.
   4642         //
   4643         UnicodeString resultString;
   4644         UnicodeString perlExpr = fields[3];
   4645 
   4646         while (perlExpr.length() > 0) {
   4647             groupsMat->reset(perlExpr);
   4648             cgMat->reset(perlExpr);
   4649 
   4650             if (perlExpr.startsWith("$&")) {
   4651                 resultString.append(testMat->group(status));
   4652                 perlExpr.remove(0, 2);
   4653             }
   4654 
   4655             else if (groupsMat->lookingAt(status)) {
   4656                 // $-[0]   $+[2]  etc.
   4657                 UnicodeString digitString = groupsMat->group(2, status);
   4658                 int32_t t = 0;
   4659                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4660                 UnicodeString plusOrMinus = groupsMat->group(1, status);
   4661                 int32_t matchPosition;
   4662                 if (plusOrMinus.compare("+") == 0) {
   4663                     matchPosition = testMat->end(groupNum, status);
   4664                 } else {
   4665                     matchPosition = testMat->start(groupNum, status);
   4666                 }
   4667                 if (matchPosition != -1) {
   4668                     ICU_Utility::appendNumber(resultString, matchPosition);
   4669                 }
   4670                 perlExpr.remove(0, groupsMat->end(status));
   4671             }
   4672 
   4673             else if (cgMat->lookingAt(status)) {
   4674                 // $1, $2, $3, etc.
   4675                 UnicodeString digitString = cgMat->group(1, status);
   4676                 int32_t t = 0;
   4677                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4678                 if (U_SUCCESS(status)) {
   4679                     resultString.append(testMat->group(groupNum, status));
   4680                     status = U_ZERO_ERROR;
   4681                 }
   4682                 perlExpr.remove(0, cgMat->end(status));
   4683             }
   4684 
   4685             else if (perlExpr.startsWith("@-")) {
   4686                 int32_t i;
   4687                 for (i=0; i<=testMat->groupCount(); i++) {
   4688                     if (i>0) {
   4689                         resultString.append(" ");
   4690                     }
   4691                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
   4692                 }
   4693                 perlExpr.remove(0, 2);
   4694             }
   4695 
   4696             else if (perlExpr.startsWith("@+")) {
   4697                 int32_t i;
   4698                 for (i=0; i<=testMat->groupCount(); i++) {
   4699                     if (i>0) {
   4700                         resultString.append(" ");
   4701                     }
   4702                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
   4703                 }
   4704                 perlExpr.remove(0, 2);
   4705             }
   4706 
   4707             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
   4708                                                      //           or as an escaped sequence (e.g. \n)
   4709                 if (perlExpr.length() > 1) {
   4710                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
   4711                 }
   4712                 UChar c = perlExpr.charAt(0);
   4713                 switch (c) {
   4714                 case 'n':   c = '\n'; break;
   4715                 // add any other escape sequences that show up in the test expected results.
   4716                 }
   4717                 resultString.append(c);
   4718                 perlExpr.remove(0, 1);
   4719             }
   4720 
   4721             else  {
   4722                 // Any characters from the perl expression that we don't explicitly
   4723                 //  recognize before here are assumed to be literals and copied
   4724                 //  as-is to the expected results.
   4725                 resultString.append(perlExpr.charAt(0));
   4726                 perlExpr.remove(0, 1);
   4727             }
   4728 
   4729             if (U_FAILURE(status)) {
   4730                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   4731                 break;
   4732             }
   4733         }
   4734 
   4735         //
   4736         // Expected Results Compare
   4737         //
   4738         UnicodeString expectedS(fields[4]);
   4739         expectedS.findAndReplace(nulnulSrc, nulnul);
   4740         expectedS.findAndReplace(ffffSrc,   ffff);
   4741         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4742 
   4743 
   4744         if (expectedS.compare(resultString) != 0) {
   4745             err("Line %d: Incorrect perl expression results.", lineNum);
   4746             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
   4747         }
   4748 
   4749         delete testMat;
   4750         delete testPat;
   4751     }
   4752 
   4753     //
   4754     // All done.  Clean up allocated stuff.
   4755     //
   4756     delete cgMat;
   4757     delete cgPat;
   4758 
   4759     delete groupsMat;
   4760     delete groupsPat;
   4761 
   4762     delete flagMat;
   4763     delete flagPat;
   4764 
   4765     delete lineMat;
   4766     delete linePat;
   4767 
   4768     delete fieldPat;
   4769     delete [] testData;
   4770 
   4771     utext_close(&patternText);
   4772     utext_close(&inputText);
   4773 
   4774     delete [] patternChars;
   4775     delete [] inputChars;
   4776 
   4777 
   4778     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
   4779 
   4780 }
   4781 
   4782 
   4783 //--------------------------------------------------------------
   4784 //
   4785 //  Bug6149   Verify limits to heap expansion for backtrack stack.
   4786 //             Use this pattern,
   4787 //                 "(a?){1,8000000}"
   4788 //             Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
   4789 //                   This test is likely to be fragile, as further optimizations stop
   4790 //                   more cases of pointless looping in the match engine.
   4791 //
   4792 //---------------------------------------------------------------
   4793 void RegexTest::Bug6149() {
   4794     UnicodeString pattern("(a?){1,8000000}");
   4795     UnicodeString s("xyz");
   4796     uint32_t flags = 0;
   4797     UErrorCode status = U_ZERO_ERROR;
   4798 
   4799     RegexMatcher  matcher(pattern, s, flags, status);
   4800     UBool result = false;
   4801     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
   4802     REGEX_ASSERT(result == FALSE);
   4803  }
   4804 
   4805 
   4806 //
   4807 //   Callbacks()    Test the callback function.
   4808 //                  When set, callbacks occur periodically during matching operations,
//                  before its normal completion.
   4810 //                  before it's normal completion.
   4811 //
   4812 
   4813 struct callBackContext {
   4814     RegexTest        *test;
   4815     int32_t          maxCalls;
   4816     int32_t          numCalls;
   4817     int32_t          lastSteps;
   4818     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
   4819 };
   4820 
   4821 U_CDECL_BEGIN
   4822 static UBool U_CALLCONV
   4823 testCallBackFn(const void *context, int32_t steps) {
   4824     callBackContext  *info = (callBackContext *)context;
   4825     if (info->lastSteps+1 != steps) {
   4826         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
   4827     }
   4828     info->lastSteps = steps;
   4829     info->numCalls++;
   4830     return (info->numCalls < info->maxCalls);
   4831 }
   4832 U_CDECL_END
   4833 
   4834 void RegexTest::Callbacks() {
   4835    {
   4836         // Getter returns NULLs if no callback has been set
   4837 
   4838         //   The variables that the getter will fill in.
   4839         //   Init to non-null values so that the action of the getter can be seen.
   4840         const void          *returnedContext = &returnedContext;
   4841         URegexMatchCallback *returnedFn = &testCallBackFn;
   4842 
   4843         UErrorCode status = U_ZERO_ERROR;
   4844         RegexMatcher matcher("x", 0, status);
   4845         REGEX_CHECK_STATUS;
   4846         matcher.getMatchCallback(returnedFn, returnedContext, status);
   4847         REGEX_CHECK_STATUS;
   4848         REGEX_ASSERT(returnedFn == NULL);
   4849         REGEX_ASSERT(returnedContext == NULL);
   4850     }
   4851 
   4852    {
   4853         // Set and Get work
   4854         callBackContext cbInfo = {this, 0, 0, 0};
   4855         const void          *returnedContext;
   4856         URegexMatchCallback *returnedFn;
   4857         UErrorCode status = U_ZERO_ERROR;
   4858         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
   4859         REGEX_CHECK_STATUS;
   4860         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
   4861         REGEX_CHECK_STATUS;
   4862         matcher.getMatchCallback(returnedFn, returnedContext, status);
   4863         REGEX_CHECK_STATUS;
   4864         REGEX_ASSERT(returnedFn == testCallBackFn);
   4865         REGEX_ASSERT(returnedContext == &cbInfo);
   4866 
   4867         // A short-running match shouldn't invoke the callback
   4868         status = U_ZERO_ERROR;
   4869         cbInfo.reset(1);
   4870         UnicodeString s = "xxx";
   4871         matcher.reset(s);
   4872         REGEX_ASSERT(matcher.matches(status));
   4873         REGEX_CHECK_STATUS;
   4874         REGEX_ASSERT(cbInfo.numCalls == 0);
   4875 
   4876         // A medium-length match that runs long enough to invoke the
   4877         //   callback, but not so long that the callback aborts it.
   4878         status = U_ZERO_ERROR;
   4879         cbInfo.reset(4);
   4880         s = "aaaaaaaaaaaaaaaaaaab";
   4881         matcher.reset(s);
   4882         REGEX_ASSERT(matcher.matches(status)==FALSE);
   4883         REGEX_CHECK_STATUS;
   4884         REGEX_ASSERT(cbInfo.numCalls > 0);
   4885 
   4886         // A longer running match that the callback function will abort.
   4887         status = U_ZERO_ERROR;
   4888         cbInfo.reset(4);
   4889         s = "aaaaaaaaaaaaaaaaaaaaaaab";
   4890         matcher.reset(s);
   4891         REGEX_ASSERT(matcher.matches(status)==FALSE);
   4892         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
   4893         REGEX_ASSERT(cbInfo.numCalls == 4);
   4894 
   4895         // A longer running find that the callback function will abort.
   4896         status = U_ZERO_ERROR;
   4897         cbInfo.reset(4);
   4898         s = "aaaaaaaaaaaaaaaaaaaaaaab";
   4899         matcher.reset(s);
   4900         REGEX_ASSERT(matcher.find(status)==FALSE);
   4901         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
   4902         REGEX_ASSERT(cbInfo.numCalls == 4);
   4903     }
   4904 
   4905 
   4906 }
   4907 
   4908 
   4909 //
   4910 //   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during a find operation
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
   4914 //
   4915 
   4916 struct progressCallBackContext {
   4917     RegexTest        *test;
   4918     int64_t          lastIndex;
   4919     int32_t          maxCalls;
   4920     int32_t          numCalls;
   4921     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
   4922 };
   4923 
   4924 // call-back function for find().
   4925 // Return TRUE to continue the find().
   4926 // Return FALSE to stop the find().
   4927 U_CDECL_BEGIN
   4928 static UBool U_CALLCONV
   4929 testProgressCallBackFn(const void *context, int64_t matchIndex) {
   4930     progressCallBackContext  *info = (progressCallBackContext *)context;
   4931     info->numCalls++;
   4932     info->lastIndex = matchIndex;
   4933 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
   4934     return (info->numCalls < info->maxCalls);
   4935 }
   4936 U_CDECL_END
   4937 
   4938 void RegexTest::FindProgressCallbacks() {
   4939    {
   4940         // Getter returns NULLs if no callback has been set
   4941 
   4942         //   The variables that the getter will fill in.
   4943         //   Init to non-null values so that the action of the getter can be seen.
   4944         const void                  *returnedContext = &returnedContext;
   4945         URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
   4946 
   4947         UErrorCode status = U_ZERO_ERROR;
   4948         RegexMatcher matcher("x", 0, status);
   4949         REGEX_CHECK_STATUS;
   4950         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
   4951         REGEX_CHECK_STATUS;
   4952         REGEX_ASSERT(returnedFn == NULL);
   4953         REGEX_ASSERT(returnedContext == NULL);
   4954     }
   4955 
   4956    {
   4957         // Set and Get work
   4958         progressCallBackContext cbInfo = {this, 0, 0, 0};
   4959         const void                  *returnedContext;
   4960         URegexFindProgressCallback  *returnedFn;
   4961         UErrorCode status = U_ZERO_ERROR;
   4962         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
   4963         REGEX_CHECK_STATUS;
   4964         matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
   4965         REGEX_CHECK_STATUS;
   4966         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
   4967         REGEX_CHECK_STATUS;
   4968         REGEX_ASSERT(returnedFn == testProgressCallBackFn);
   4969         REGEX_ASSERT(returnedContext == &cbInfo);
   4970 
   4971         // A find that matches on the initial position does NOT invoke the callback.
   4972         status = U_ZERO_ERROR;
   4973         cbInfo.reset(100);
   4974         UnicodeString s = "aaxxx";
   4975         matcher.reset(s);
   4976 #if 0
   4977         matcher.setTrace(TRUE);
   4978 #endif
   4979         REGEX_ASSERT(matcher.find(0, status));
   4980         REGEX_CHECK_STATUS;
   4981         REGEX_ASSERT(cbInfo.numCalls == 0);
   4982 
   4983         // A medium running find() that causes matcher.find() to invoke our callback for each index,
   4984         //   but not so many times that we interrupt the operation.
   4985         status = U_ZERO_ERROR;
   4986         s = "aaaaaaaaaaaaaaaaaaab";
   4987         cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
   4988         matcher.reset(s);
   4989         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   4990         REGEX_CHECK_STATUS;
   4991         REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
   4992 
   4993         // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
   4994         status = U_ZERO_ERROR;
   4995         UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
   4996         cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
   4997         matcher.reset(s1);
   4998         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   4999         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
   5000         REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
   5001 
   5002         // Now a match that will succeed, but after an interruption
   5003         status = U_ZERO_ERROR;
   5004         UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
   5005         cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
   5006         matcher.reset(s2);
   5007         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   5008         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
   5009         // Now retry the match from where left off
   5010         cbInfo.maxCalls = 100; //  No callback limit
   5011         status = U_ZERO_ERROR;
   5012         REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
   5013         REGEX_CHECK_STATUS;
   5014     }
   5015 
   5016 
   5017 }
   5018 
   5019 
   5020 //---------------------------------------------------------------------------
   5021 //
   5022 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
   5023 //                             UTexts. The pure-C implementation of UText
   5024 //                             has no mutable backing stores, but we can
   5025 //                             use UnicodeString here to test the functionality.
   5026 //
   5027 //---------------------------------------------------------------------------
   5028 void RegexTest::PreAllocatedUTextCAPI () {
   5029     UErrorCode           status = U_ZERO_ERROR;
   5030     URegularExpression  *re;
   5031     UText                patternText = UTEXT_INITIALIZER;
   5032     UnicodeString        buffer;
   5033     UText                bufferText = UTEXT_INITIALIZER;
   5034 
   5035     utext_openUnicodeString(&bufferText, &buffer, &status);
   5036 
   5037     /*
   5038      *  getText() and getUText()
   5039      */
   5040     {
   5041         UText  text1 = UTEXT_INITIALIZER;
   5042         UText  text2 = UTEXT_INITIALIZER;
   5043         UChar  text2Chars[20];
   5044         UText  *resultText;
   5045 
   5046         status = U_ZERO_ERROR;
   5047         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
   5048         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
   5049         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
   5050         utext_openUChars(&text2, text2Chars, -1, &status);
   5051 
   5052         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
   5053         re = uregex_openUText(&patternText, 0, NULL, &status);
   5054 
   5055         /* First set a UText */
   5056         uregex_setUText(re, &text1, &status);
   5057         resultText = uregex_getUText(re, &bufferText, &status);
   5058         REGEX_CHECK_STATUS;
   5059         REGEX_ASSERT(resultText == &bufferText);
   5060         utext_setNativeIndex(resultText, 0);
   5061         utext_setNativeIndex(&text1, 0);
   5062         REGEX_ASSERT(testUTextEqual(resultText, &text1));
   5063 
   5064         resultText = uregex_getUText(re, &bufferText, &status);
   5065         REGEX_CHECK_STATUS;
   5066         REGEX_ASSERT(resultText == &bufferText);
   5067         utext_setNativeIndex(resultText, 0);
   5068         utext_setNativeIndex(&text1, 0);
   5069         REGEX_ASSERT(testUTextEqual(resultText, &text1));
   5070 
   5071         /* Then set a UChar * */
   5072         uregex_setText(re, text2Chars, 7, &status);
   5073         resultText = uregex_getUText(re, &bufferText, &status);
   5074         REGEX_CHECK_STATUS;
   5075         REGEX_ASSERT(resultText == &bufferText);
   5076         utext_setNativeIndex(resultText, 0);
   5077         utext_setNativeIndex(&text2, 0);
   5078         REGEX_ASSERT(testUTextEqual(resultText, &text2));
   5079 
   5080         uregex_close(re);
   5081         utext_close(&text1);
   5082         utext_close(&text2);
   5083     }
   5084 
   5085     /*
   5086      *  group()
   5087      */
   5088     {
   5089         UChar    text1[80];
   5090         UText   *actual;
   5091         UBool    result;
   5092         int64_t  length = 0;
   5093 
   5094         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  UPRV_LENGTHOF(text1));
   5095         //                  012345678901234567890123456789012345678901234567
   5096         //                  0         1         2         3         4
   5097 
   5098         status = U_ZERO_ERROR;
   5099         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
   5100         REGEX_CHECK_STATUS;
   5101 
   5102         uregex_setText(re, text1, -1, &status);
   5103         result = uregex_find(re, 0, &status);
   5104         REGEX_ASSERT(result==TRUE);
   5105 
   5106         /*  Capture Group 0, the full match.  Should succeed. "abc interior def" */
   5107         status = U_ZERO_ERROR;
   5108         actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
   5109         REGEX_CHECK_STATUS;
   5110         REGEX_ASSERT(actual == &bufferText);
   5111         REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
   5112         REGEX_ASSERT(length == 16);
   5113         REGEX_ASSERT(utext_nativeLength(actual) == 47);
   5114 
   5115         /*  Capture group #1.  Should succeed, matching " interior ". */
   5116         status = U_ZERO_ERROR;
   5117         actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
   5118         REGEX_CHECK_STATUS;
   5119         REGEX_ASSERT(actual == &bufferText);
   5120         REGEX_ASSERT(utext_getNativeIndex(actual) == 9);   // position of " interior "
   5121         REGEX_ASSERT(length == 10);
   5122         REGEX_ASSERT(utext_nativeLength(actual) == 47);
   5123 
   5124         /*  Capture group out of range.  Error. */
   5125         status = U_ZERO_ERROR;
   5126         actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
   5127         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   5128         REGEX_ASSERT(actual == &bufferText);
   5129         uregex_close(re);
   5130 
   5131     }
   5132 
   5133     /*
   5134      *  replaceFirst()
   5135      */
   5136     {
   5137         UChar    text1[80];
   5138         UChar    text2[80];
   5139         UText    replText = UTEXT_INITIALIZER;
   5140         UText   *result;
   5141         status = U_ZERO_ERROR;
   5142         utext_openUnicodeString(&bufferText, &buffer, &status);
   5143 
   5144         status = U_ZERO_ERROR;
   5145         u_uastrncpy(text1, "Replace xaax x1x x...x.",  UPRV_LENGTHOF(text1));
   5146         u_uastrncpy(text2, "No match here.",  UPRV_LENGTHOF(text2)/2);
   5147         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
   5148 
   5149         re = uregex_openC("x(.*?)x", 0, NULL, &status);
   5150         REGEX_CHECK_STATUS;
   5151 
   5152         /*  Normal case, with match */
   5153         uregex_setText(re, text1, -1, &status);
   5154         REGEX_CHECK_STATUS;
   5155         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5156         REGEX_CHECK_STATUS;
   5157         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5158         REGEX_CHECK_STATUS;
   5159         REGEX_ASSERT(result == &bufferText);
   5160         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
   5161 
   5162         /* No match.  Text should copy to output with no changes.  */
   5163         uregex_setText(re, text2, -1, &status);
   5164         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5165         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5166         REGEX_CHECK_STATUS;
   5167         REGEX_ASSERT(result == &bufferText);
   5168         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
   5169 
   5170         /* Unicode escapes */
   5171         uregex_setText(re, text1, -1, &status);
   5172         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
   5173         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5174         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5175         REGEX_CHECK_STATUS;
   5176         REGEX_ASSERT(result == &bufferText);
   5177         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
   5178 
   5179         uregex_close(re);
   5180         utext_close(&replText);
   5181     }
   5182 
   5183 
   5184     /*
   5185      *  replaceAll()
   5186      */
   5187     {
   5188         UChar    text1[80];
   5189         UChar    text2[80];
   5190         UText    replText = UTEXT_INITIALIZER;
   5191         UText   *result;
   5192 
   5193         status = U_ZERO_ERROR;
   5194         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
   5195         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
   5196         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
   5197 
   5198         re = uregex_openC("x(.*?)x", 0, NULL, &status);
   5199         REGEX_CHECK_STATUS;
   5200 
   5201         /*  Normal case, with match */
   5202         uregex_setText(re, text1, -1, &status);
   5203         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5204         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
   5205         REGEX_CHECK_STATUS;
   5206         REGEX_ASSERT(result == &bufferText);
   5207         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
   5208 
   5209         /* No match.  Text should copy to output with no changes.  */
   5210         uregex_setText(re, text2, -1, &status);
   5211         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5212         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
   5213         REGEX_CHECK_STATUS;
   5214         REGEX_ASSERT(result == &bufferText);
   5215         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
   5216 
   5217         uregex_close(re);
   5218         utext_close(&replText);
   5219     }
   5220 
   5221 
   5222     /*
   5223      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
   5224      *   so we don't need to test it here.
   5225      */
   5226 
   5227     utext_close(&bufferText);
   5228     utext_close(&patternText);
   5229 }
   5230 
   5231 
   5232 //--------------------------------------------------------------
   5233 //
   5234 //  NamedCapture   Check basic named capture group functionality
   5235 //
   5236 //--------------------------------------------------------------
   5237 void RegexTest::NamedCapture() {
   5238     UErrorCode status = U_ZERO_ERROR;
   5239     RegexPattern *pat = RegexPattern::compile(UnicodeString(
   5240             "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
   5241     REGEX_CHECK_STATUS;
   5242     int32_t group = pat->groupNumberFromName("five", -1, status);
   5243     REGEX_CHECK_STATUS;
   5244     REGEX_ASSERT(5 == group);
   5245     group = pat->groupNumberFromName("three", -1, status);
   5246     REGEX_CHECK_STATUS;
   5247     REGEX_ASSERT(3 == group);
   5248 
   5249     status = U_ZERO_ERROR;
   5250     group = pat->groupNumberFromName(UnicodeString("six"), status);
   5251     REGEX_CHECK_STATUS;
   5252     REGEX_ASSERT(6 == group);
   5253 
   5254     status = U_ZERO_ERROR;
   5255     group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
   5256     U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5257 
   5258     status = U_ZERO_ERROR;
   5259 
   5260     // After copying a pattern, named capture should still work in the copy.
   5261     RegexPattern *copiedPat = new RegexPattern(*pat);
   5262     REGEX_ASSERT(*copiedPat == *pat);
   5263     delete pat; pat = NULL;  // Delete original, copy should have no references back to it.
   5264 
   5265     group = copiedPat->groupNumberFromName("five", -1, status);
   5266     REGEX_CHECK_STATUS;
   5267     REGEX_ASSERT(5 == group);
   5268     group = copiedPat->groupNumberFromName("three", -1, status);
   5269     REGEX_CHECK_STATUS;
   5270     REGEX_ASSERT(3 == group);
   5271     delete copiedPat;
   5272 
   5273     // ReplaceAll with named capture group.
   5274     status = U_ZERO_ERROR;
   5275     UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
   5276     RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
   5277     REGEX_CHECK_STATUS;
   5278     // m.pattern().dumpPattern();
   5279     UnicodeString replacedText = m->replaceAll("'${mid}'", status);
   5280     REGEX_CHECK_STATUS;
   5281     REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
   5282     delete m;
   5283 
   5284     // ReplaceAll, allowed capture group numbers.
   5285     text = UnicodeString("abcmxyz");
   5286     m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
   5287     REGEX_CHECK_STATUS;
   5288 
   5289     status = U_ZERO_ERROR;
   5290     replacedText  = m->replaceAll(UnicodeString("<$0>"), status);   // group 0, full match, is allowed.
   5291     REGEX_CHECK_STATUS;
   5292     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
   5293 
   5294     status = U_ZERO_ERROR;
   5295     replacedText  = m->replaceAll(UnicodeString("<$1>"), status);      // group 1 by number.
   5296     REGEX_CHECK_STATUS;
   5297     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
   5298 
   5299     status = U_ZERO_ERROR;
   5300     replacedText  = m->replaceAll(UnicodeString("<${one}>"), status);   // group 1 by name.
   5301     REGEX_CHECK_STATUS;
   5302     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
   5303 
   5304     status = U_ZERO_ERROR;
   5305     replacedText  = m->replaceAll(UnicodeString("<$2>"), status);   // group 2.
   5306     REGEX_CHECK_STATUS;
   5307     REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
   5308 
   5309     status = U_ZERO_ERROR;
   5310     replacedText  = m->replaceAll(UnicodeString("<$3>"), status);
   5311     REGEX_CHECK_STATUS;
   5312     REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
   5313 
   5314     status = U_ZERO_ERROR;
   5315     replacedText  = m->replaceAll(UnicodeString("<$4>"), status);
   5316     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   5317 
   5318     status = U_ZERO_ERROR;
   5319     replacedText  = m->replaceAll(UnicodeString("<$04>"), status);      // group 0, leading 0,
   5320     REGEX_CHECK_STATUS;                                                 //    trailing out-of-range 4 passes through.
   5321     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
   5322 
   5323     status = U_ZERO_ERROR;
   5324     replacedText  = m->replaceAll(UnicodeString("<$000016>"), status);  // Consume leading zeroes. Don't consume digits
   5325     REGEX_CHECK_STATUS;                                                 //   that push group num out of range.
   5326     REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText);              //   This is group 1.
   5327 
   5328     status = U_ZERO_ERROR;
   5329     replacedText  = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
   5330     REGEX_CHECK_STATUS;
   5331     REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
   5332 
   5333     status = U_ZERO_ERROR;
   5334     replacedText  = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
   5335     REGEX_CHECK_STATUS;
   5336     REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
   5337 
   5338     status = U_ZERO_ERROR;
   5339     replacedText  = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
   5340     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5341 
   5342     status = U_ZERO_ERROR;
   5343     replacedText  = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
   5344     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5345 
   5346     status = U_ZERO_ERROR;
   5347     replacedText  = m->replaceAll(UnicodeString("<${one"), status);
   5348     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5349 
   5350     status = U_ZERO_ERROR;
   5351     replacedText  = m->replaceAll(UnicodeString("$not a capture group"), status);
   5352     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5353 
   5354     delete m;
   5355 
   5356     // Repeat the above replaceAll() tests using the plain C API, which
   5357     //  has a separate implementation internally.
   5358     //  TODO: factor out the test data.
   5359 
   5360     status = U_ZERO_ERROR;
   5361     URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
   5362     REGEX_CHECK_STATUS;
   5363     text = UnicodeString("abcmxyz");
   5364     uregex_setText(re, text.getBuffer(), text.length(), &status);
   5365     REGEX_CHECK_STATUS;
   5366 
   5367     UChar resultBuf[100];
   5368     int32_t resultLength;
   5369     UnicodeString repl;
   5370 
   5371     status = U_ZERO_ERROR;
   5372     repl = UnicodeString("<$0>");
   5373     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5374     REGEX_CHECK_STATUS;
   5375     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
   5376 
   5377     status = U_ZERO_ERROR;
   5378     repl = UnicodeString("<$1>");
   5379     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5380     REGEX_CHECK_STATUS;
   5381     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
   5382 
   5383     status = U_ZERO_ERROR;
   5384     repl = UnicodeString("<${one}>");
   5385     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5386     REGEX_CHECK_STATUS;
   5387     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
   5388 
   5389     status = U_ZERO_ERROR;
   5390     repl = UnicodeString("<$2>");
   5391     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5392     REGEX_CHECK_STATUS;
   5393     REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
   5394 
   5395     status = U_ZERO_ERROR;
   5396     repl = UnicodeString("<$3>");
   5397     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5398     REGEX_CHECK_STATUS;
   5399     REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
   5400 
   5401     status = U_ZERO_ERROR;
   5402     repl = UnicodeString("<$4>");
   5403     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5404     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   5405 
   5406     status = U_ZERO_ERROR;
   5407     repl = UnicodeString("<$04>");
   5408     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5409     REGEX_CHECK_STATUS;
   5410     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
   5411 
   5412     status = U_ZERO_ERROR;
   5413     repl = UnicodeString("<$000016>");
   5414     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5415     REGEX_CHECK_STATUS;
   5416     REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
   5417 
   5418     status = U_ZERO_ERROR;
   5419     repl = UnicodeString("<$3$2$1${one}>");
   5420     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5421     REGEX_CHECK_STATUS;
   5422     REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
   5423 
   5424     status = U_ZERO_ERROR;
   5425     repl = UnicodeString("$3$2$1${one}");
   5426     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5427     REGEX_CHECK_STATUS;
   5428     REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
   5429 
   5430     status = U_ZERO_ERROR;
   5431     repl = UnicodeString("<${noSuchName}>");
   5432     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5433     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5434 
   5435     status = U_ZERO_ERROR;
   5436     repl = UnicodeString("<${invalid-name}>");
   5437     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5438     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5439 
   5440     status = U_ZERO_ERROR;
   5441     repl = UnicodeString("<${one");
   5442     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5443     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5444 
   5445     status = U_ZERO_ERROR;
   5446     repl = UnicodeString("$not a capture group");
   5447     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5448     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5449 
   5450     uregex_close(re);
   5451 }
   5452 
   5453 //--------------------------------------------------------------
   5454 //
   5455 //  NamedCaptureLimits   Patterns with huge numbers of named capture groups.
   5456 //                       The point is not so much what the exact limit is,
   5457 //                       but that a largish number doesn't hit bad non-linear performance,
   5458 //                       and that exceeding the limit fails cleanly.
   5459 //
   5460 //--------------------------------------------------------------
   5461 void RegexTest::NamedCaptureLimits() {
   5462     if (quick) {
   5463         logln("Skipping test. Runs in exhuastive mode only.");
   5464         return;
   5465     }
   5466     const int32_t goodLimit = 1000000;     // Pattern w this many groups builds successfully.
   5467     const int32_t failLimit = 10000000;    // Pattern exceeds internal limits, fails to compile.
   5468     char nnbuf[100];
   5469     UnicodeString pattern;
   5470     int32_t nn;
   5471 
   5472     for (nn=1; nn<goodLimit; nn++) {
   5473         sprintf(nnbuf, "(?<nn%d>)", nn);
   5474         pattern.append(UnicodeString(nnbuf, -1, US_INV));
   5475     }
   5476     UErrorCode status = U_ZERO_ERROR;
   5477     RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
   5478     REGEX_CHECK_STATUS;
   5479     for (nn=1; nn<goodLimit; nn++) {
   5480         sprintf(nnbuf, "nn%d", nn);
   5481         int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
   5482         REGEX_ASSERT(nn == groupNum);
   5483         if (nn != groupNum) {
   5484             break;
   5485         }
   5486     }
   5487     delete pat;
   5488 
   5489     pattern.remove();
   5490     for (nn=1; nn<failLimit; nn++) {
   5491         sprintf(nnbuf, "(?<nn%d>)", nn);
   5492         pattern.append(UnicodeString(nnbuf, -1, US_INV));
   5493     }
   5494     status = U_ZERO_ERROR;
   5495     pat = RegexPattern::compile(pattern, 0, status);
   5496     REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
   5497     delete pat;
   5498 }
   5499 
   5500 
   5501 //--------------------------------------------------------------
   5502 //
   5503 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
   5504 //
   5505 //---------------------------------------------------------------
   5506 void RegexTest::Bug7651() {
   5507     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
   5508     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
   5509     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
   5510     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
   5511     UnicodeString s("#ff @abcd This is test");
   5512     RegexPattern  *REPattern = NULL;
   5513     RegexMatcher  *REMatcher = NULL;
   5514     UErrorCode status = U_ZERO_ERROR;
   5515     UParseError pe;
   5516 
   5517     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
   5518     REGEX_CHECK_STATUS;
   5519     REMatcher = REPattern->matcher(s, status);
   5520     REGEX_CHECK_STATUS;
   5521     REGEX_ASSERT(REMatcher->find());
   5522     REGEX_ASSERT(REMatcher->start(status) == 0);
   5523     delete REPattern;
   5524     delete REMatcher;
   5525     status = U_ZERO_ERROR;
   5526 
   5527     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
   5528     REGEX_CHECK_STATUS;
   5529     REMatcher = REPattern->matcher(s, status);
   5530     REGEX_CHECK_STATUS;
   5531     REGEX_ASSERT(REMatcher->find());
   5532     REGEX_ASSERT(REMatcher->start(status) == 0);
   5533     delete REPattern;
   5534     delete REMatcher;
   5535     status = U_ZERO_ERROR;
   5536  }
   5537 
   5538 void RegexTest::Bug7740() {
   5539     UErrorCode status = U_ZERO_ERROR;
   5540     UnicodeString pattern = "(a)";
   5541     UnicodeString text = "abcdef";
   5542     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
   5543     REGEX_CHECK_STATUS;
   5544     REGEX_ASSERT(m->lookingAt(status));
   5545     REGEX_CHECK_STATUS;
   5546     status = U_ILLEGAL_ARGUMENT_ERROR;
   5547     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
   5548     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   5549     REGEX_ASSERT(s == "");
   5550     delete m;
   5551 }
   5552 
   5553 // Bug 8479:  was crashing whith a Bogus UnicodeString as input.
   5554 
   5555 void RegexTest::Bug8479() {
   5556     UErrorCode status = U_ZERO_ERROR;
   5557 
   5558     RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
   5559     REGEX_CHECK_STATUS;
   5560     if (U_SUCCESS(status))
   5561     {
   5562         UnicodeString str;
   5563         str.setToBogus();
   5564         pMatcher->reset(str);
   5565         status = U_ZERO_ERROR;
   5566         pMatcher->matches(status);
   5567         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   5568         delete pMatcher;
   5569     }
   5570 }
   5571 
   5572 
   5573 // Bug 7029
   5574 void RegexTest::Bug7029() {
   5575     UErrorCode status = U_ZERO_ERROR;
   5576 
   5577     RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
   5578     UnicodeString text = "abc.def";
   5579     UnicodeString splits[10];
   5580     REGEX_CHECK_STATUS;
   5581     int32_t numFields = pMatcher->split(text, splits, 10, status);
   5582     REGEX_CHECK_STATUS;
   5583     REGEX_ASSERT(numFields == 8);
   5584     delete pMatcher;
   5585 }
   5586 
   5587 // Bug 9283
   5588 //   This test is checking for the existance of any supplemental characters that case-fold
   5589 //   to a bmp character.
   5590 //
   5591 //   At the time of this writing there are none. If any should appear in a subsequent release
   5592 //   of Unicode, the code in regular expressions compilation that determines the longest
   5593 //   posssible match for a literal string  will need to be enhanced.
   5594 //
   5595 //   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
   5596 //   for details on what to do in case of a failure of this test.
   5597 //
   5598 void RegexTest::Bug9283() {
   5599 #if !UCONFIG_NO_NORMALIZATION
   5600     UErrorCode status = U_ZERO_ERROR;
   5601     UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
   5602     REGEX_CHECK_STATUS;
   5603     int32_t index;
   5604     UChar32 c;
   5605     for (index=0; ; index++) {
   5606         c = supplementalsWithCaseFolding.charAt(index);
   5607         if (c == -1) {
   5608             break;
   5609         }
   5610         UnicodeString cf = UnicodeString(c).foldCase();
   5611         REGEX_ASSERT(cf.length() >= 2);
   5612     }
   5613 #endif /* #if !UCONFIG_NO_NORMALIZATION */
   5614 }
   5615 
   5616 
   5617 void RegexTest::CheckInvBufSize() {
   5618   if(inv_next>=INV_BUFSIZ) {
   5619     errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
   5620           __FILE__, INV_BUFSIZ, inv_next);
   5621   } else {
   5622     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
   5623   }
   5624 }
   5625 
   5626 
   5627 void RegexTest::Bug10459() {
   5628     UErrorCode status = U_ZERO_ERROR;
   5629     UnicodeString patternString("(txt)");
   5630     UnicodeString txtString("txt");
   5631 
   5632     UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
   5633     REGEX_CHECK_STATUS;
   5634     UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
   5635     REGEX_CHECK_STATUS;
   5636 
   5637     URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
   5638     REGEX_CHECK_STATUS;
   5639 
   5640     uregex_setUText(icu_re, utext_txt, &status);
   5641     REGEX_CHECK_STATUS;
   5642 
   5643     // The bug was that calling uregex_group() before doing a matching operation
   5644     //   was causing a segfault. Only for Regular Expressions created from UText.
   5645     //   It should set an U_REGEX_INVALID_STATE.
   5646 
   5647     UChar buf[100];
   5648     int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
   5649     REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
   5650     REGEX_ASSERT(len == 0);
   5651 
   5652     uregex_close(icu_re);
   5653     utext_close(utext_pat);
   5654     utext_close(utext_txt);
   5655 }
   5656 
   5657 void RegexTest::TestCaseInsensitiveStarters() {
   5658     // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
   5659     //  become stale because of new Unicode characters.
   5660     // If it is stale, rerun the generation tool
   5661     //    svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
   5662     // and replace the embedded data in i18n/regexcmp.cpp
   5663 
   5664     for (UChar32 cp=0; cp<=0x10ffff; cp++) {
   5665         if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
   5666             continue;
   5667         }
   5668         UnicodeSet s(cp, cp);
   5669         s.closeOver(USET_CASE_INSENSITIVE);
   5670         UnicodeSetIterator setIter(s);
   5671         while (setIter.next()) {
   5672             if (!setIter.isString()) {
   5673                 continue;
   5674             }
   5675             const UnicodeString &str = setIter.getString();
   5676             UChar32 firstChar = str.char32At(0);
   5677             UnicodeSet starters;
   5678             RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
   5679             if (!starters.contains(cp)) {
   5680                 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
   5681                 return;
   5682             }
   5683         }
   5684     }
   5685 }
   5686 
   5687 
   5688 void RegexTest::TestBug11049() {
   5689     // Original bug report: pattern with match start consisting of one of several individual characters,
   5690     //  and the text being matched ending with a supplementary character. find() would read past the
   5691     //  end of the input text when searching for potential match starting points.
   5692 
   5693     // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
   5694     // detect the bad read.
   5695 
   5696     TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
   5697     TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
   5698 
   5699     // Test again with a pattern starting with a single character,
   5700     // which takes a different code path than starting with an OR expression,
   5701     // but with similar logic.
   5702     TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
   5703     TestCase11049("C", "string matches at end C", TRUE, __LINE__);
   5704 }
   5705 
   5706 // Run a single test case from TestBug11049(). Internal function.
   5707 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
   5708     UErrorCode status = U_ZERO_ERROR;
   5709     UnicodeString patternString = UnicodeString(pattern).unescape();
   5710     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
   5711 
   5712     UnicodeString dataString = UnicodeString(data).unescape();
   5713     UChar *exactBuffer = new UChar[dataString.length()];
   5714     dataString.extract(exactBuffer, dataString.length(), status);
   5715     UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
   5716 
   5717     LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
   5718     REGEX_CHECK_STATUS;
   5719     matcher->reset(ut);
   5720     UBool result = matcher->find();
   5721     if (result != expectMatch) {
   5722         errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
   5723               __FILE__, lineNumber, expectMatch, result, pattern, data);
   5724     }
   5725 
   5726     // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
   5727     //   off-by-one on find() with match at the last code point.
   5728     //   Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
   5729     //   because string.unescape() will only shrink it.
   5730     char * utf8Buffer = new char[uprv_strlen(data)+1];
   5731     u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
   5732     REGEX_CHECK_STATUS;
   5733     ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
   5734     REGEX_CHECK_STATUS;
   5735     matcher->reset(ut);
   5736     result = matcher->find();
   5737     if (result != expectMatch) {
   5738         errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
   5739               __FILE__, lineNumber, expectMatch, result, pattern, data);
   5740     }
   5741     delete [] utf8Buffer;
   5742 
   5743     utext_close(ut);
   5744     delete [] exactBuffer;
   5745 }
   5746 
   5747 
   5748 void RegexTest::TestBug11371() {
   5749     if (quick) {
   5750         logln("Skipping test. Runs in exhuastive mode only.");
   5751         return;
   5752     }
   5753     UErrorCode status = U_ZERO_ERROR;
   5754     UnicodeString patternString;
   5755 
   5756     for (int i=0; i<8000000; i++) {
   5757         patternString.append(UnicodeString("()"));
   5758     }
   5759     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
   5760     if (status != U_REGEX_PATTERN_TOO_BIG) {
   5761         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
   5762               __FILE__, __LINE__, u_errorName(status));
   5763     }
   5764 
   5765     status = U_ZERO_ERROR;
   5766     patternString = "(";
   5767     for (int i=0; i<20000000; i++) {
   5768         patternString.append(UnicodeString("A++"));
   5769     }
   5770     patternString.append(UnicodeString("){0}B++"));
   5771     LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
   5772     if (status != U_REGEX_PATTERN_TOO_BIG) {
   5773         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
   5774               __FILE__, __LINE__, u_errorName(status));
   5775     }
   5776 
   5777     // Pattern with too much string data, such that string indexes overflow operand data field size
   5778     // in compiled instruction.
   5779     status = U_ZERO_ERROR;
   5780     patternString = "";
   5781     while (patternString.length() < 0x00ffffff) {
   5782         patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
   5783     }
   5784     patternString.append(UnicodeString("X? trailing string"));
   5785     LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
   5786     if (status != U_REGEX_PATTERN_TOO_BIG) {
   5787         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
   5788               __FILE__, __LINE__, u_errorName(status));
   5789     }
   5790 }
   5791 
   5792 void RegexTest::TestBug11480() {
   5793     // C API, get capture group of a group that does not participate in the match.
   5794     //        (Returns a zero length string, with nul termination,
   5795     //         indistinguishable from a group with a zero length match.)
   5796 
   5797     UErrorCode status = U_ZERO_ERROR;
   5798     URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
   5799     REGEX_CHECK_STATUS;
   5800     UnicodeString text = UNICODE_STRING_SIMPLE("A");
   5801     uregex_setText(re, text.getBuffer(), text.length(), &status);
   5802     REGEX_CHECK_STATUS;
   5803     REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
   5804     UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
   5805     int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
   5806     REGEX_ASSERT(length == 0);
   5807     REGEX_ASSERT(buf[0] == 13);
   5808     REGEX_ASSERT(buf[1] == 0);
   5809     REGEX_ASSERT(buf[2] == 13);
   5810     uregex_close(re);
   5811 
   5812     // UText C++ API, length of match is 0 for non-participating matches.
   5813     UText ut = UTEXT_INITIALIZER;
   5814     utext_openUnicodeString(&ut, &text, &status);
   5815     RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
   5816     REGEX_CHECK_STATUS;
   5817     matcher.reset(&ut);
   5818     REGEX_ASSERT(matcher.lookingAt(0, status));
   5819 
   5820     // UText C++ API, Capture group 1 matches "A", position 0, length 1.
   5821     int64_t groupLen = -666;
   5822     UText group = UTEXT_INITIALIZER;
   5823     matcher.group(1, &group, groupLen, status);
   5824     REGEX_CHECK_STATUS;
   5825     REGEX_ASSERT(groupLen == 1);
   5826     REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
   5827 
   5828     // Capture group 2, the (B), does not participate in the match.
   5829     matcher.group(2, &group, groupLen, status);
   5830     REGEX_CHECK_STATUS;
   5831     REGEX_ASSERT(groupLen == 0);
   5832     REGEX_ASSERT(matcher.start(2, status) == -1);
   5833     REGEX_CHECK_STATUS;
   5834 }
   5835 
   5836 
   5837 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
   5838