Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 2002-2015, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 
      7 //
      8 //   regextst.cpp
      9 //
     10 //      ICU Regular Expressions test, part of intltest.
     11 //
     12 
     13 /*
     14      NOTE!!
     15 
     16      PLEASE be careful about ASCII assumptions in this test.
     17      This test is one of the worst repeat offenders.
     18      If you have questions, contact someone on the ICU PMC
     19      who has access to an EBCDIC system.
     20 
     21  */
     22 
     23 #include "intltest.h"
     24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     25 
     26 #include "unicode/localpointer.h"
     27 #include "unicode/regex.h"
     28 #include "unicode/uchar.h"
     29 #include "unicode/ucnv.h"
     30 #include "unicode/uniset.h"
     31 #include "unicode/uregex.h"
     32 #include "unicode/usetiter.h"
     33 #include "unicode/ustring.h"
     34 #include "regextst.h"
     35 #include "regexcmp.h"
     36 #include "uvector.h"
     37 #include "util.h"
     38 #include <stdlib.h>
     39 #include <string.h>
     40 #include <stdio.h>
     41 #include "cmemory.h"
     42 #include "cstring.h"
     43 #include "uinvchar.h"
     44 
     45 #define SUPPORT_MUTATING_INPUT_STRING   0
     46 
     47 //---------------------------------------------------------------------------
     48 //
     49 //  Test class boilerplate
     50 //
     51 //---------------------------------------------------------------------------
     52 RegexTest::RegexTest()
     53 {
     54 }
     55 
     56 
     57 RegexTest::~RegexTest()
     58 {
     59 }
     60 
     61 
     62 
     63 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
     64 {
     65     if (exec) logln("TestSuite RegexTest: ");
     66     switch (index) {
     67 
     68         case 0: name = "Basic";
     69             if (exec) Basic();
     70             break;
     71         case 1: name = "API_Match";
     72             if (exec) API_Match();
     73             break;
     74         case 2: name = "API_Replace";
     75             if (exec) API_Replace();
     76             break;
     77         case 3: name = "API_Pattern";
     78             if (exec) API_Pattern();
     79             break;
     80         case 4:
     81 #if !UCONFIG_NO_FILE_IO
     82             name = "Extended";
     83             if (exec) Extended();
     84 #else
     85             name = "skip";
     86 #endif
     87             break;
     88         case 5: name = "Errors";
     89             if (exec) Errors();
     90             break;
     91         case 6: name = "PerlTests";
     92             if (exec) PerlTests();
     93             break;
     94         case 7: name = "Callbacks";
     95             if (exec) Callbacks();
     96             break;
     97         case 8: name = "FindProgressCallbacks";
     98             if (exec) FindProgressCallbacks();
     99             break;
    100         case 9: name = "Bug 6149";
    101              if (exec) Bug6149();
    102              break;
    103         case 10: name = "UTextBasic";
    104           if (exec) UTextBasic();
    105           break;
    106         case 11: name = "API_Match_UTF8";
    107           if (exec) API_Match_UTF8();
    108           break;
    109         case 12: name = "API_Replace_UTF8";
    110           if (exec) API_Replace_UTF8();
    111           break;
    112         case 13: name = "API_Pattern_UTF8";
    113           if (exec) API_Pattern_UTF8();
    114           break;
    115         case 14: name = "PerlTestsUTF8";
    116           if (exec) PerlTestsUTF8();
    117           break;
    118         case 15: name = "PreAllocatedUTextCAPI";
    119           if (exec) PreAllocatedUTextCAPI();
    120           break;
    121         case 16: name = "Bug 7651";
    122              if (exec) Bug7651();
    123              break;
    124         case 17: name = "Bug 7740";
    125             if (exec) Bug7740();
    126             break;
    127         case 18: name = "Bug 8479";
    128             if (exec) Bug8479();
    129             break;
    130         case 19: name = "Bug 7029";
    131             if (exec) Bug7029();
    132             break;
    133         case 20: name = "CheckInvBufSize";
    134             if (exec) CheckInvBufSize();
    135             break;
    136         case 21: name = "Bug 9283";
    137             if (exec) Bug9283();
    138             break;
    139         case 22: name = "Bug10459";
    140             if (exec) Bug10459();
    141             break;
    142         case 23: name = "TestCaseInsensitiveStarters";
    143             if (exec) TestCaseInsensitiveStarters();
    144             break;
    145         case 24: name = "TestBug11049";
    146             if (exec) TestBug11049();
    147             break;
    148         case 25: name = "TestBug11371";
    149             if (exec) TestBug11371();
    150             break;
    151         case 26: name = "TestBug11480";
    152             if (exec) TestBug11480();
    153             break;
    154         case 27: name = "NamedCapture";
    155             if (exec) NamedCapture();
    156             break;
    157         case 28: name = "NamedCaptureLimits";
    158             if (exec) NamedCaptureLimits();
    159             break;
    160         default: name = "";
    161             break; //needed to end loop
    162     }
    163 }
    164 
    165 
    166 
    167 /**
    168  * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
    169  * into ASCII.
    170  * @see utext_openUTF8
    171  */
    172 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
    173 
    174 //---------------------------------------------------------------------------
    175 //
    176 //   Error Checking / Reporting macros used in all of the tests.
    177 //
    178 //---------------------------------------------------------------------------
    179 
    180 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
    181   int64_t oldIndex = utext_getNativeIndex(text);
    182   utext_setNativeIndex(text, 0);
    183   char *bufPtr = buf;
    184   UChar32 c = utext_next32From(text, 0);
    185   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
    186     if (0x000020<=c && c<0x00007e) {
    187       *bufPtr = c;
    188     } else {
    189 #if 0
    190       sprintf(bufPtr,"U+%04X", c);
    191       bufPtr+= strlen(bufPtr)-1;
    192 #else
    193       *bufPtr = '%';
    194 #endif
    195     }
    196     bufPtr++;
    197     c = UTEXT_NEXT32(text);
    198   }
    199   *bufPtr = 0;
    200 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
    201   char *ebuf = (char*)malloc(bufLen);
    202   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
    203   uprv_strncpy(buf, ebuf, bufLen);
    204   free((void*)ebuf);
    205 #endif
    206   utext_setNativeIndex(text, oldIndex);
    207 }
    208 
    209 
    210 static char ASSERT_BUF[1024];
    211 
    212 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
    213   if(message.length()==0) {
    214     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
    215   } else {
    216     UnicodeString buf;
    217     IntlTest::prettify(message,buf);
    218     if(buf.length()==0) {
    219       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
    220     } else {
    221       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
    222       if(ASSERT_BUF[0]==0) {
    223         ASSERT_BUF[0]=0;
    224         for(int32_t i=0;i<buf.length();i++) {
    225           UChar ch = buf[i];
    226           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
    227         }
    228       }
    229     }
    230   }
    231   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
    232   return ASSERT_BUF;
    233 }
    234 
    // Log the printable form of a UText, tagged with the source location and the
    // spelling of the argument expression.
    #define REGEX_VERBOSE_TEXT(text) { \
        char buf[200]; \
        utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), text); \
        logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
    }

    // Report a data error and return from the enclosing (void) function if the
    // variable `status` in the invoking scope holds a failure code.
    #define REGEX_CHECK_STATUS { \
        if (U_FAILURE(status)) { \
            dataerrln("%s:%d: RegexTest failure.  status=%s", \
                      __FILE__, __LINE__, u_errorName(status)); \
            return; \
        } \
    }

    // Report an error (without returning) when expr evaluates equal to FALSE.
    #define REGEX_ASSERT(expr) { \
        if ((expr)==FALSE) { \
            errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
        } \
    }

    // Evaluate expr against a fresh local `status` (which expr is expected to
    // reference) and report a data error unless it ends up equal to errcode.
    #define REGEX_ASSERT_FAIL(expr, errcode) { \
        UErrorCode status=U_ZERO_ERROR; \
        (expr); \
        if (status!=errcode) { \
            dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
                      __LINE__, u_errorName(errcode), u_errorName(status)); \
        } \
    }

    // Like REGEX_CHECK_STATUS, but also reports the caller-supplied line and does not return.
    #define REGEX_CHECK_STATUS_L(line) { \
        if (U_FAILURE(status)) { \
            errln("RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); \
        } \
    }

    // Like REGEX_ASSERT, but also reports the caller-supplied line and returns on failure.
    #define REGEX_ASSERT_L(expr, line) { \
        if ((expr)==FALSE) { \
            errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
            return; \
        } \
    }

    // expected: const char * , restricted to invariant characters.
    // actual: const UnicodeString &
    #define REGEX_ASSERT_UNISTR(expected, actual) { \
        if (UnicodeString(expected, -1, US_INV) != (actual)) { \
            errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
                  __FILE__, __LINE__, expected, extractToAssertBuf(actual)); \
        } \
    }
    258 
    259 
    260 static UBool testUTextEqual(UText *uta, UText *utb) {
    261     UChar32 ca = 0;
    262     UChar32 cb = 0;
    263     utext_setNativeIndex(uta, 0);
    264     utext_setNativeIndex(utb, 0);
    265     do {
    266         ca = utext_next32(uta);
    267         cb = utext_next32(utb);
    268         if (ca != cb) {
    269             break;
    270         }
    271     } while (ca != U_SENTINEL);
    272     return ca == cb;
    273 }
    274 
    275 
    276 /**
    277  * @param expected expected text in UTF-8 (not platform) codepage
    278  */
    279 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
    280     UErrorCode status = U_ZERO_ERROR;
    281     UText expectedText = UTEXT_INITIALIZER;
    282     utext_openUTF8(&expectedText, expected, -1, &status);
    283     if(U_FAILURE(status)) {
    284       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
    285       return;
    286     }
    287     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
    288       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
    289       return;
    290     }
    291     utext_setNativeIndex(actual, 0);
    292     if (!testUTextEqual(&expectedText, actual)) {
    293         char buf[201 /*21*/];
    294         char expectedBuf[201];
    295         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
    296         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
    297         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
    298     }
    299     utext_close(&expectedText);
    300 }
    301 /**
    302  * @param expected invariant (platform local text) input
    303  */
    304 
    305 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
    306     UErrorCode status = U_ZERO_ERROR;
    307     UText expectedText = UTEXT_INITIALIZER;
    308     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
    309     if(U_FAILURE(status)) {
    310       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
    311       return;
    312     }
    313     utext_setNativeIndex(actual, 0);
    314     if (!testUTextEqual(&expectedText, actual)) {
    315         char buf[201 /*21*/];
    316         char expectedBuf[201];
    317         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
    318         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
    319         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
    320     }
    321     utext_close(&expectedText);
    322 }
    323 
    /**
     * Checks a UText against expected text given as UTF-8 bytes; forwards to
     * assertUText() with the invoking file and line.
     */
    #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) \
        assertUText((expected), (actual), __FILE__, __LINE__)
    /**
     * Checks a UText against expected text given as invariant characters; forwards to
     * assertUTextInvariant() with the invoking file and line.
     */
    #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) \
        assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
    332 
    333 /**
    334  * This buffer ( inv_buf ) is used to hold the UTF-8 strings
    335  * passed into utext_openUTF8. An error will be given if
    336  * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
    337  */
    338 
    339 #define INV_BUFSIZ 2048 /* increase this if too small */
    340 
    341 static int64_t inv_next=0;
    342 
    343 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
    344 static char inv_buf[INV_BUFSIZ];
    345 #endif
    346 
    347 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
    348   if(length==-1) length=strlen(inv);
    349 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
    350   inv_next+=length;
    351   return utext_openUTF8(ut, inv, length, status);
    352 #else
    353   if(inv_next+length+1>INV_BUFSIZ) {
    354     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
    355             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
    356     *status = U_MEMORY_ALLOCATION_ERROR;
    357     return NULL;
    358   }
    359 
    360   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
    361   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
    362   inv_next+=length;
    363 
    364 #if 0
    365   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
    366 #endif
    367 
    368   return utext_openUTF8(ut, (const char*)buf, length, status);
    369 #endif
    370 }
    371 
    372 
    373 //---------------------------------------------------------------------------
    374 //
    375 //    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
    376 //                       for the LookingAt() and  Match() functions.
    377 //
    378 //       usage:
    379 //          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
    380 //
    381 //          The expected results are UBool - TRUE or FALSE.
    382 //          The input text is unescaped.  The pattern is not.
    383 //
    384 //
    385 //---------------------------------------------------------------------------
    386 
    387 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
    388 
    389 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    390     const UnicodeString pattern(pat, -1, US_INV);
    391     const UnicodeString inputText(text, -1, US_INV);
    392     UErrorCode          status  = U_ZERO_ERROR;
    393     UParseError         pe;
    394     RegexPattern        *REPattern = NULL;
    395     RegexMatcher        *REMatcher = NULL;
    396     UBool               retVal     = TRUE;
    397 
    398     UnicodeString patString(pat, -1, US_INV);
    399     REPattern = RegexPattern::compile(patString, 0, pe, status);
    400     if (U_FAILURE(status)) {
    401         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
    402             line, u_errorName(status));
    403         return FALSE;
    404     }
    405     if (line==376) { REPattern->dumpPattern();}
    406 
    407     UnicodeString inputString(inputText);
    408     UnicodeString unEscapedInput = inputString.unescape();
    409     REMatcher = REPattern->matcher(unEscapedInput, status);
    410     if (U_FAILURE(status)) {
    411         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
    412             line, u_errorName(status));
    413         return FALSE;
    414     }
    415 
    416     UBool actualmatch;
    417     actualmatch = REMatcher->lookingAt(status);
    418     if (U_FAILURE(status)) {
    419         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
    420             line, u_errorName(status));
    421         retVal =  FALSE;
    422     }
    423     if (actualmatch != looking) {
    424         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
    425         retVal = FALSE;
    426     }
    427 
    428     status = U_ZERO_ERROR;
    429     actualmatch = REMatcher->matches(status);
    430     if (U_FAILURE(status)) {
    431         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
    432             line, u_errorName(status));
    433         retVal = FALSE;
    434     }
    435     if (actualmatch != match) {
    436         errln("RegexTest: wrong return from matches() at line %d.\n", line);
    437         retVal = FALSE;
    438     }
    439 
    440     if (retVal == FALSE) {
    441         REPattern->dumpPattern();
    442     }
    443 
    444     delete REPattern;
    445     delete REMatcher;
    446     return retVal;
    447 }
    448 
    449 
    450 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    451     UText               pattern    = UTEXT_INITIALIZER;
    452     int32_t             inputUTF8Length;
    453     char                *textChars = NULL;
    454     UText               inputText  = UTEXT_INITIALIZER;
    455     UErrorCode          status     = U_ZERO_ERROR;
    456     UParseError         pe;
    457     RegexPattern        *REPattern = NULL;
    458     RegexMatcher        *REMatcher = NULL;
    459     UBool               retVal     = TRUE;
    460 
    461     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
    462     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
    463     if (U_FAILURE(status)) {
    464         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
    465             line, u_errorName(status));
    466         return FALSE;
    467     }
    468 
    469     UnicodeString inputString(text, -1, US_INV);
    470     UnicodeString unEscapedInput = inputString.unescape();
    471     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
    472     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
    473 
    474     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
    475     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
    476         // UTF-8 does not allow unpaired surrogates, so this could actually happen
    477         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
    478         return TRUE; // not a failure of the Regex engine
    479     }
    480     status = U_ZERO_ERROR; // buffer overflow
    481     textChars = new char[inputUTF8Length+1];
    482     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
    483     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
    484 
    485     REMatcher = &REPattern->matcher(status)->reset(&inputText);
    486     if (U_FAILURE(status)) {
    487         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
    488             line, u_errorName(status));
    489         return FALSE;
    490     }
    491 
    492     UBool actualmatch;
    493     actualmatch = REMatcher->lookingAt(status);
    494     if (U_FAILURE(status)) {
    495         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
    496             line, u_errorName(status));
    497         retVal =  FALSE;
    498     }
    499     if (actualmatch != looking) {
    500         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
    501         retVal = FALSE;
    502     }
    503 
    504     status = U_ZERO_ERROR;
    505     actualmatch = REMatcher->matches(status);
    506     if (U_FAILURE(status)) {
    507         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
    508             line, u_errorName(status));
    509         retVal = FALSE;
    510     }
    511     if (actualmatch != match) {
    512         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
    513         retVal = FALSE;
    514     }
    515 
    516     if (retVal == FALSE) {
    517         REPattern->dumpPattern();
    518     }
    519 
    520     delete REPattern;
    521     delete REMatcher;
    522     utext_close(&inputText);
    523     utext_close(&pattern);
    524     delete[] textChars;
    525     return retVal;
    526 }
    527 
    528 
    529 
    530 //---------------------------------------------------------------------------
    531 //
    532 //    REGEX_ERR       Macro + invocation function to simplify writing tests
    533 //                       regex tests for incorrect patterns
    534 //
    535 //       usage:
    536 //          REGEX_ERR("pattern",   expected error line, column, expected status);
    537 //
    538 //---------------------------------------------------------------------------
    539 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
    540 
    541 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
    542                           UErrorCode expectedStatus, int32_t line) {
    543     UnicodeString       pattern(pat);
    544 
    545     UErrorCode          status         = U_ZERO_ERROR;
    546     UParseError         pe;
    547     RegexPattern        *callerPattern = NULL;
    548 
    549     //
    550     //  Compile the caller's pattern
    551     //
    552     UnicodeString patString(pat);
    553     callerPattern = RegexPattern::compile(patString, 0, pe, status);
    554     if (status != expectedStatus) {
    555         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    556     } else {
    557         if (status != U_ZERO_ERROR) {
    558             if (pe.line != errLine || pe.offset != errCol) {
    559                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
    560                     line, errLine, errCol, pe.line, pe.offset);
    561             }
    562         }
    563     }
    564 
    565     delete callerPattern;
    566 
    567     //
    568     //  Compile again, using a UTF-8-based UText
    569     //
    570     UText patternText = UTEXT_INITIALIZER;
    571     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
    572     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
    573     if (status != expectedStatus) {
    574         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    575     } else {
    576         if (status != U_ZERO_ERROR) {
    577             if (pe.line != errLine || pe.offset != errCol) {
    578                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
    579                     line, errLine, errCol, pe.line, pe.offset);
    580             }
    581         }
    582     }
    583 
    584     delete callerPattern;
    585     utext_close(&patternText);
    586 }
    587 
    588 
    589 
    590 //---------------------------------------------------------------------------
    591 //
    592 //      Basic      Check for basic functionality of regex pattern matching.
    593 //                 Avoid the use of REGEX_FIND test macro, which has
    594 //                 substantial dependencies on basic Regex functionality.
    595 //
    596 //---------------------------------------------------------------------------
    597 void RegexTest::Basic() {
    598 
    599 
    600 //
    601 // Debug - slide failing test cases early
    602 //
    603 #if 0
    604     {
    605         // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
    606         UParseError pe;
    607         UErrorCode  status = U_ZERO_ERROR;
    608         RegexPattern *pattern;
    609         pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
    610         pattern->dumpPattern();
    611         RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
    612         UBool result = m->find();
    613         printf("result = %d\n", result);
    614         // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
    615         // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    616     }
    617     exit(1);
    618 #endif
    619 
    620 
    621     //
    622     // Pattern with parentheses
    623     //
    624     REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
    625     REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
    626     REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
    627 
    628     //
    629     // Patterns with *
    630     //
    631     REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    632     REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    633     REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    634     REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    635     REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
    636 
    637     REGEX_TESTLM("a*", "",  TRUE, TRUE);
    638     REGEX_TESTLM("a*", "b", TRUE, FALSE);
    639 
    640 
    641     //
    642     //  Patterns with "."
    643     //
    644     REGEX_TESTLM(".", "abc", TRUE, FALSE);
    645     REGEX_TESTLM("...", "abc", TRUE, TRUE);
    646     REGEX_TESTLM("....", "abc", FALSE, FALSE);
    647     REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    648     REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    649     REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    650     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    651     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
    652 
    653     //
    654     //  Patterns with * applied to chars at end of literal string
    655     //
    656     REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    657     REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
    658 
    659     //
    660     //  Supplemental chars match as single chars, not a pair of surrogates.
    661     //
    662     REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    663     REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    664     REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
    665 
    666 
    667     //
    668     //  UnicodeSets in the pattern
    669     //
    670     REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    671     REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    672     REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    673     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    674     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    675     REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
    676 
    677     REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    678     REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    679     REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    680     REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
    681     REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
    682 
    683     //
    684     //   OR operator in patterns
    685     //
    686     REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    687     REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    688     REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    689     REGEX_TESTLM("a|b", "b", TRUE, TRUE);
    690 
    691     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    692     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    693     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    694     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    695     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    696     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
    697 
    698     //
    699     //  +
    700     //
    701     REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    702     REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    703     REGEX_TESTLM("b+", "", FALSE, FALSE);
    704     REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    705     REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    706     REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
    707 
    708     //
    709     //   ?
    710     //
    711     REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    712     REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    713     REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    714     REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    715     REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    716     REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    717     REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    718     REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    719     REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
    720 
    721     //
    722     //  Escape sequences that become single literal chars, handled internally
    723     //   by ICU's Unescape.
    724     //
    725 
    726     // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
    727     REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
    728     REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
    729     REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
    730     REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
    731     REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
    732     REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
    733     REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
    734     REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    735     REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
    736 
    737     REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
    738     REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
    739 
    740     // Escape of special chars in patterns
    741     REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
    742 }
    743 
    744 
    745 //---------------------------------------------------------------------------
    746 //
    747 //    UTextBasic   Check for quirks that are specific to the UText
    748 //                 implementation.
        //
        //                 Builds a RegexMatcher from a UTF-8 UText pattern, resets it
        //                 onto a UTF-8 UText input with the same content, and checks
        //                 that inputText() reports that content, both after the
        //                 first reset and after resetting the matcher onto its own
        //                 inputText().
    749 //
    750 //---------------------------------------------------------------------------
    751 void RegexTest::UTextBasic() {
            // "abc" written as numeric byte values rather than as a character
            // literal; presumably so the bytes handed to utext_openUTF8() do not
            // depend on the compiler's execution character set (see the
            // ASCII/EBCDIC note at the top of this file).
    752     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
    753     UErrorCode status = U_ZERO_ERROR;
            // Stack-allocated UText wrapping the pattern bytes; -1 = NUL-terminated.
    754     UText pattern = UTEXT_INITIALIZER;
    755     utext_openUTF8(&pattern, str_abc, -1, &status);
    756     RegexMatcher matcher(&pattern, 0, status);
            // NOTE(review): REGEX_CHECK_STATUS is defined outside this view. If it
            // returns from the function on failure, the utext_close() calls at the
            // bottom are skipped on that path - confirm.
    757     REGEX_CHECK_STATUS;
    758
            // The input is a second UText over the same bytes as the pattern.
    759     UText input = UTEXT_INITIALIZER;
    760     utext_openUTF8(&input, str_abc, -1, &status);
    761     REGEX_CHECK_STATUS;
    762     matcher.reset(&input);
    763     REGEX_CHECK_STATUS;
    764     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
    765
            // Reset the matcher onto the very UText it currently reports as its
            // input; the reported input text must be unchanged afterwards.
    766     matcher.reset(matcher.inputText());
    767     REGEX_CHECK_STATUS;
    768     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
    769
            // Release both UTexts opened by this test.
    770     utext_close(&pattern);
    771     utext_close(&input);
    772 }
    773 
    774 
    775 //---------------------------------------------------------------------------
    776 //
    777 //      API_Match   Test that the API for class RegexMatcher
    778 //                  is present and nominally working, but excluding functions
    779 //                  implementing replace operations.
        //
        //                  Sections, in order: matcher creation and reset;
        //                  reset / matches / lookingAt with a start index; capture
        //                  groups; find() (plain, with \G in the pattern, and with
        //                  zero-length matches); matchers with no input; regions
        //                  and bounds flags; hitEnd() / requireEnd(); time limits;
        //                  stack limits.
    780 //
    781 //---------------------------------------------------------------------------
    782 void RegexTest::API_Match() {
            // These three are used by the first block only; every later block
            // declares its own status (and flags / pe where needed), shadowing these.
    783     UParseError         pe;
    784     UErrorCode          status=U_ZERO_ERROR;
    785     int32_t             flags = 0;
    786
    787     //
    788     // Debug - slide failing test cases early
    789     //
    790 #if 0
    791     {
    792     }
    793     return;
    794 #endif
    795
    796     //
    797     // Simple pattern compilation
    798     //
    799     {
    800         UnicodeString       re("abc");
    801         RegexPattern        *pat2;
    802         pat2 = RegexPattern::compile(re, flags, pe, status);
    803         REGEX_CHECK_STATUS;
    804
                // "abc" is at the very start of inStr1, but only at index 4 of instr2.
    805         UnicodeString inStr1 = "abcdef this is a test";
    806         UnicodeString instr2 = "not abc";
    807         UnicodeString empty  = "";
    808
    809
    810         //
    811         // Matcher creation and reset.
    812         //
                // After each reset(newInput) the matcher must report the new input and
                // lookingAt() must reflect whether that input begins with "abc".
    813         RegexMatcher *m1 = pat2->matcher(inStr1, status);
    814         REGEX_CHECK_STATUS;
    815         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    816         REGEX_ASSERT(m1->input() == inStr1);
    817         m1->reset(instr2);
    818         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
    819         REGEX_ASSERT(m1->input() == instr2);
    820         m1->reset(inStr1);
    821         REGEX_ASSERT(m1->input() == inStr1);
    822         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    823         m1->reset(empty);
    824         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
    825         REGEX_ASSERT(m1->input() == empty);
    826         REGEX_ASSERT(&m1->pattern() == pat2);
    827
    828         //
    829         //  reset(pos, status)
    830         //
    831         m1->reset(inStr1);
                // The test expects lookingAt() to succeed after reset(4, ...) even
                // though "abc" sits at index 0 of inStr1; presumably the index-less
                // lookingAt() always starts from the beginning of the input - confirm
                // against the RegexMatcher documentation.
    832         m1->reset(4, status);
    833         REGEX_CHECK_STATUS;
    834         REGEX_ASSERT(m1->input() == inStr1);
    835         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    836
                // Boundary checks: positions 0, len-1 and len are accepted; -1 and
                // len+1 must report U_INDEX_OUTOFBOUNDS_ERROR.
    837         m1->reset(-1, status);
    838         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    839         status = U_ZERO_ERROR;
    840
    841         m1->reset(0, status);
    842         REGEX_CHECK_STATUS;
    843         status = U_ZERO_ERROR;
    844
    845         int32_t len = m1->input().length();
    846         m1->reset(len-1, status);
    847         REGEX_CHECK_STATUS;
    848         status = U_ZERO_ERROR;
    849
    850         m1->reset(len, status);
    851         REGEX_CHECK_STATUS;
    852         status = U_ZERO_ERROR;
    853
    854         m1->reset(len+1, status);
    855         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    856         status = U_ZERO_ERROR;
    857
    858         //
    859         // match(pos, status)
    860         //
                // instr2 is "not abc": "abc" spans indices 4-6 and ends the string, so
                // only start index 4 is expected to succeed.
    861         m1->reset(instr2);
    862         REGEX_ASSERT(m1->matches(4, status) == TRUE);
    863         m1->reset();
    864         REGEX_ASSERT(m1->matches(3, status) == FALSE);
    865         m1->reset();
    866         REGEX_ASSERT(m1->matches(5, status) == FALSE);
    867         REGEX_ASSERT(m1->matches(4, status) == TRUE);
    868         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
    869         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    870
    871         // Match() at end of string should fail, but should not
    872         //  be an error.
    873         status = U_ZERO_ERROR;
    874         len = m1->input().length();
    875         REGEX_ASSERT(m1->matches(len, status) == FALSE);
    876         REGEX_CHECK_STATUS;
    877
    878         // Match beyond end of string should fail with an error.
    879         status = U_ZERO_ERROR;
    880         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
    881         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    882
    883         // Successful match at end of string.
    884         {
    885             status = U_ZERO_ERROR;
    886             RegexMatcher m("A?", 0, status);  // will match zero length string.
    887             REGEX_CHECK_STATUS;
    888             m.reset(inStr1);
    889             len = inStr1.length();
    890             REGEX_ASSERT(m.matches(len, status) == TRUE);
    891             REGEX_CHECK_STATUS;
    892             m.reset(empty);
    893             REGEX_ASSERT(m.matches(0, status) == TRUE);
    894             REGEX_CHECK_STATUS;
    895         }
    896
    897
    898         //
    899         // lookingAt(pos, status)
    900         //
                // Same index expectations as the matches(pos, status) checks above.
    901         status = U_ZERO_ERROR;
    902         m1->reset(instr2);  // "not abc"
    903         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
    904         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
    905         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
    906         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
    907         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
    908         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    909         status = U_ZERO_ERROR;
    910         len = m1->input().length();
    911         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
    912         REGEX_CHECK_STATUS;
    913         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
    914         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    915
    916         delete m1;
    917         delete pat2;
    918     }
    919
    920
    921     //
    922     // Capture Group.
    923     //     RegexMatcher::start();
    924     //     RegexMatcher::end();
    925     //     RegexMatcher::groupCount();
    926     //
    927     {
    928         int32_t             flags=0;
    929         UParseError         pe;
    930         UErrorCode          status=U_ZERO_ERROR;
    931
    932         UnicodeString       re("01(23(45)67)(.*)");
    933         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    934         REGEX_CHECK_STATUS;
    935         UnicodeString data = "0123456789";
    936
    937         RegexMatcher *matcher = pat->matcher(data, status);
    938         REGEX_CHECK_STATUS;
    939         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
                // Expected [start, end) per group, indexed by group number:
                //   0 = whole match "0123456789", 1 = "234567", 2 = "45", 3 = "89".
    940         static const int32_t matchStarts[] = {0,  2, 4, 8};
    941         static const int32_t matchEnds[]   = {10, 8, 6, 10};
    942         int32_t i;
    943         for (i=0; i<4; i++) {
    944             int32_t actualStart = matcher->start(i, status);
    945             REGEX_CHECK_STATUS;
    946             if (actualStart != matchStarts[i]) {
    947                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
    948                     __LINE__, i, matchStarts[i], actualStart);
    949             }
    950             int32_t actualEnd = matcher->end(i, status);
    951             REGEX_CHECK_STATUS;
    952             if (actualEnd != matchEnds[i]) {
    953                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
    954                     __LINE__, i, matchEnds[i], actualEnd);
    955             }
    956         }
    957
                // The index-less start()/end() must agree with group 0.
    958         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
    959         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
    960
                // Group numbers outside 0..3 are out of bounds; asking for a group
                // right after reset() (no match attempted yet) is an invalid state.
                // NOTE(review): status is not cleared between these checks here;
                // presumably REGEX_ASSERT_FAIL (defined outside this view) takes care
                // of that - confirm.
    961         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    962         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
    963         matcher->reset();
    964         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
    965
                // Same checks again through group(), which returns the captured text.
    966         matcher->lookingAt(status);
    967         REGEX_ASSERT(matcher->group(status)    == "0123456789");
    968         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
    969         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
    970         REGEX_ASSERT(matcher->group(2, status) == "45"        );
    971         REGEX_ASSERT(matcher->group(3, status) == "89"        );
    972         REGEX_CHECK_STATUS;
    973         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    974         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
    975         matcher->reset();
    976         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
    977
    978         delete matcher;
    979         delete pat;
    980
    981     }
    982
    983     //
    984     //  find
    985     //
    986     {
    987         int32_t             flags=0;
    988         UParseError         pe;
    989         UErrorCode          status=U_ZERO_ERROR;
    990
    991         UnicodeString       re("abc");
    992         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    993         REGEX_CHECK_STATUS;
    994         UnicodeString data = ".abc..abc...abc..";
    995         //                    012345678901234567
    996
    997         RegexMatcher *matcher = pat->matcher(data, status);
    998         REGEX_CHECK_STATUS;
                // Successive find() calls walk the three occurrences of "abc"
                // (indices 1, 6 and 12) and then keep returning FALSE.
    999         REGEX_ASSERT(matcher->find());
   1000         REGEX_ASSERT(matcher->start(status) == 1);
   1001         REGEX_ASSERT(matcher->find());
   1002         REGEX_ASSERT(matcher->start(status) == 6);
   1003         REGEX_ASSERT(matcher->find());
   1004         REGEX_ASSERT(matcher->start(status) == 12);
   1005         REGEX_ASSERT(matcher->find() == FALSE);
   1006         REGEX_ASSERT(matcher->find() == FALSE);
   1007
   1008         matcher->reset();
   1009         REGEX_ASSERT(matcher->find());
   1010         REGEX_ASSERT(matcher->start(status) == 1);
   1011
                // find(pos, status) searches forward from pos, so the reported start
                // is the first occurrence at or after pos.
   1012         REGEX_ASSERT(matcher->find(0, status));
   1013         REGEX_ASSERT(matcher->start(status) == 1);
   1014         REGEX_ASSERT(matcher->find(1, status));
   1015         REGEX_ASSERT(matcher->start(status) == 1);
   1016         REGEX_ASSERT(matcher->find(2, status));
   1017         REGEX_ASSERT(matcher->start(status) == 6);
   1018         REGEX_ASSERT(matcher->find(12, status));
   1019         REGEX_ASSERT(matcher->start(status) == 12);
   1020         REGEX_ASSERT(matcher->find(13, status) == FALSE);
   1021         REGEX_ASSERT(matcher->find(16, status) == FALSE);
                // data is 17 code units long: starting at 17 is legal (just no match),
                // and after a failed find() start() must report an invalid state.
   1022         REGEX_ASSERT(matcher->find(17, status) == FALSE);
   1023         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
   1024
                // Start positions outside 0..17 are rejected.
   1025         status = U_ZERO_ERROR;
   1026         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   1027         status = U_ZERO_ERROR;
   1028         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
   1029
                // The pattern "abc" has no capture groups.
   1030         REGEX_ASSERT(matcher->groupCount() == 0);
   1031
   1032         delete matcher;
   1033         delete pat;
   1034     }
   1035
   1036
   1037     //
   1038     //  find, with \G in pattern (true if at the end of a previous match).
   1039     //
   1040     {
   1041         int32_t             flags=0;
   1042         UParseError         pe;
   1043         UErrorCode          status=U_ZERO_ERROR;
   1044
   1045         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
   1046         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
   1047         REGEX_CHECK_STATUS;
   1048         UnicodeString data = ".abcabc.abc..";
   1049         //                    012345678901234567
   1050
   1051         RegexMatcher *matcher = pat->matcher(data, status);
   1052         REGEX_CHECK_STATUS;
                // First find(): the "abc" at index 1 is captured by group 2 (the
                // alternative without \G); group 1 did not participate, so its
                // start is reported as -1.
   1053         REGEX_ASSERT(matcher->find());
   1054         REGEX_ASSERT(matcher->start(status) == 0);
   1055         REGEX_ASSERT(matcher->start(1, status) == -1);
   1056         REGEX_ASSERT(matcher->start(2, status) == 1);
   1057
                // Second find(): begins at index 4, where the previous match ended,
                // so this time the \G alternative (group 1) captures "abc".
   1058         REGEX_ASSERT(matcher->find());
   1059         REGEX_ASSERT(matcher->start(status) == 4);
   1060         REGEX_ASSERT(matcher->start(1, status) == 4);
   1061         REGEX_ASSERT(matcher->start(2, status) == -1);
   1062         REGEX_CHECK_STATUS;
   1063
   1064         delete matcher;
   1065         delete pat;
   1066     }
   1067
   1068     //
   1069     //   find with zero length matches, match position should bump ahead
   1070     //     to prevent loops.
   1071     //
   1072     {
   1073         int32_t                 i;
   1074         UErrorCode          status=U_ZERO_ERROR;
   1075         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will produce zero-length matches anywhere,
   1076                                                       //   using an always-true look-ahead.
   1077         REGEX_CHECK_STATUS;
                // Four spaces: expect one empty match at each of the positions 0..4,
                // so the loop exits with i == 5.
   1078         UnicodeString s("    ");
   1079         m.reset(s);
   1080         for (i=0; ; i++) {
   1081             if (m.find() == FALSE) {
   1082                 break;
   1083             }
   1084             REGEX_ASSERT(m.start(status) == i);
   1085             REGEX_ASSERT(m.end(status) == i);
   1086         }
   1087         REGEX_ASSERT(i==5);
   1088
   1089         // Check that the bump goes over surrogate pairs OK
                // Four supplementary characters occupy eight UTF-16 code units; the
                // empty matches are expected at 0, 2, 4, 6 and 8 (never between the
                // two halves of a pair), so the loop exits with i == 10.
   1090         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
   1091         s = s.unescape();
   1092         m.reset(s);
   1093         for (i=0; ; i+=2) {
   1094             if (m.find() == FALSE) {
   1095                 break;
   1096             }
   1097             REGEX_ASSERT(m.start(status) == i);
   1098             REGEX_ASSERT(m.end(status) == i);
   1099         }
   1100         REGEX_ASSERT(i==10);
   1101     }
   1102     {
   1103         // find() loop breaking test.
   1104         //        with pattern of /.?/, should see a series of one char matches, then a single
   1105         //        match of zero length at the end of the input string.
   1106         int32_t                 i;
   1107         UErrorCode          status=U_ZERO_ERROR;
   1108         RegexMatcher        m(".?", 0, status);
   1109         REGEX_CHECK_STATUS;
   1110         UnicodeString s("    ");
   1111         m.reset(s);
   1112         for (i=0; ; i++) {
   1113             if (m.find() == FALSE) {
   1114                 break;
   1115             }
   1116             REGEX_ASSERT(m.start(status) == i);
                    // Matches 0..3 each consume one space; the fifth (i == 4) is the
                    // empty match at the end of the input.
   1117             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
   1118         }
   1119         REGEX_ASSERT(i==5);
   1120     }
   1121
   1122
   1123     //
   1124     // Matchers with no input string behave as if they had an empty input string.
   1125     //
   1126
   1127     {
                // ".?" can match the empty string, so find() succeeds at index 0.
   1128         UErrorCode status = U_ZERO_ERROR;
   1129         RegexMatcher  m(".?", 0, status);
   1130         REGEX_CHECK_STATUS;
   1131         REGEX_ASSERT(m.find());
   1132         REGEX_ASSERT(m.start(status) == 0);
   1133         REGEX_ASSERT(m.input() == "");
   1134     }
   1135     {
                // "." needs one character, so find() on the empty input must fail.
                // NOTE(review): p is dereferenced before status is checked; other
                // blocks in this function check status right after compile() -
                // confirm whether compile() can return NULL here.
   1136         UErrorCode status = U_ZERO_ERROR;
   1137         RegexPattern  *p = RegexPattern::compile(".", 0, status);
   1138         RegexMatcher  *m = p->matcher(status);
   1139         REGEX_CHECK_STATUS;
   1140
   1141         REGEX_ASSERT(m->find() == FALSE);
   1142         REGEX_ASSERT(m->input() == "");
   1143         delete m;
   1144         delete p;
   1145     }
   1146
   1147     //
   1148     // Regions
   1149     //
   1150     {
   1151         UErrorCode status = U_ZERO_ERROR;
   1152         UnicodeString testString("This is test data");
   1153         RegexMatcher m(".*", testString,  0, status);
   1154         REGEX_CHECK_STATUS;
                // Defaults for a fresh matcher: the region covers the whole input,
                // bounds are non-transparent and anchoring.
   1155         REGEX_ASSERT(m.regionStart() == 0);
   1156         REGEX_ASSERT(m.regionEnd() == testString.length());
   1157         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1158         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1159
                // With the region restricted to [2,4), matches() is expected to
                // succeed and to report exactly that span.
   1160         m.region(2,4, status);
   1161         REGEX_CHECK_STATUS;
   1162         REGEX_ASSERT(m.matches(status));
   1163         REGEX_ASSERT(m.start(status)==2);
   1164         REGEX_ASSERT(m.end(status)==4);
   1165         REGEX_CHECK_STATUS;
   1166
                // reset() and reset(newInput) are both expected to restore the region
                // to the whole of the current input.
   1167         m.reset();
   1168         REGEX_ASSERT(m.regionStart() == 0);
   1169         REGEX_ASSERT(m.regionEnd() == testString.length());
   1170
   1171         UnicodeString shorterString("short");
   1172         m.reset(shorterString);
   1173         REGEX_ASSERT(m.regionStart() == 0);
   1174         REGEX_ASSERT(m.regionEnd() == shorterString.length());
   1175
                // The bounds setters and reset() return the matcher itself, and the
                // chosen bounds setting is expected to survive a reset().
   1176         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1177         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
   1178         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   1179         REGEX_ASSERT(&m == &m.reset());
   1180         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   1181
   1182         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
   1183         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1184         REGEX_ASSERT(&m == &m.reset());
   1185         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1186
   1187         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1188         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
   1189         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   1190         REGEX_ASSERT(&m == &m.reset());
   1191         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   1192
   1193         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
   1194         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1195         REGEX_ASSERT(&m == &m.reset());
   1196         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1197
   1198     }
   1199
   1200     //
   1201     // hitEnd() and requireEnd()
   1202     //
   1203     {
   1204         UErrorCode status = U_ZERO_ERROR;
   1205         UnicodeString testString("aabb");
                // ".*" runs to the end of "aabb": hitEnd is expected, requireEnd is not.
   1206         RegexMatcher m1(".*", testString,  0, status);
   1207         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
   1208         REGEX_ASSERT(m1.hitEnd() == TRUE);
   1209         REGEX_ASSERT(m1.requireEnd() == FALSE);
   1210         REGEX_CHECK_STATUS;
   1211
                // "a*" stops at the first 'b', so the end of input is never reached.
   1212         status = U_ZERO_ERROR;
   1213         RegexMatcher m2("a*", testString, 0, status);
   1214         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
   1215         REGEX_ASSERT(m2.hitEnd() == FALSE);
   1216         REGEX_ASSERT(m2.requireEnd() == FALSE);
   1217         REGEX_CHECK_STATUS;
   1218
                // ".*$" is anchored at the end of input: both flags are expected.
   1219         status = U_ZERO_ERROR;
   1220         RegexMatcher m3(".*$", testString, 0, status);
   1221         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
   1222         REGEX_ASSERT(m3.hitEnd() == TRUE);
   1223         REGEX_ASSERT(m3.requireEnd() == TRUE);
   1224         REGEX_CHECK_STATUS;
   1225     }
   1226
   1227
   1228     //
   1229     // Compilation error on reset with UChar *
   1230     //   These were a hazard that people were stumbling over with runtime errors.
   1231     //   Changed them to compiler errors by adding private methods that more closely
   1232     //   matched the incorrect use of the functions.
   1233     //
   1234 #if 0
   1235     {
   1236         UErrorCode status = U_ZERO_ERROR;
   1237         UChar ucharString[20];
   1238         RegexMatcher m(".", 0, status);
   1239         m.reset(ucharString);  // should not compile.
   1240
   1241         RegexPattern *p = RegexPattern::compile(".", 0, status);
   1242         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
   1243
   1244         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
   1245     }
   1246 #endif
   1247
   1248     //
   1249     //  Time Outs.
   1250     //       Note:  These tests will need to be changed when the regexp engine is
   1251     //              able to detect and cut short the exponential time behavior on
   1252     //              this type of match.
   1253     //
   1254     {
   1255         UErrorCode status = U_ZERO_ERROR;
   1256         //    Enough 'a's in the string to cause the match to time out.
   1257         //       (Each additional 'a' doubles the time)
   1258         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
   1259         RegexMatcher matcher("(a+)+b", testString, 0, status);
   1260         REGEX_CHECK_STATUS;
                // The default time limit is expected to read back as 0.
   1261         REGEX_ASSERT(matcher.getTimeLimit() == 0);
   1262         matcher.setTimeLimit(100, status);
   1263         REGEX_ASSERT(matcher.getTimeLimit() == 100);
   1264         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1265         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
   1266     }
   1267     {
   1268         UErrorCode status = U_ZERO_ERROR;
   1269         //   Few enough 'a's to slip in under the time limit.
   1270         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
   1271         RegexMatcher matcher("(a+)+b", testString, 0, status);
   1272         REGEX_CHECK_STATUS;
   1273         matcher.setTimeLimit(100, status);
                // The input has no 'b', so the match fails, but without a time-out.
   1274         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1275         REGEX_CHECK_STATUS;
   1276     }
   1277
   1278     //
   1279     //  Stack Limits
   1280     //
   1281     {
   1282         UErrorCode status = U_ZERO_ERROR;
   1283         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
   1284
   1285         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
   1286         //   of the '+', and makes the stack frames larger.
   1287         RegexMatcher matcher("(A)+A$", testString, 0, status);
   1288
   1289         // With the default stack, this match should fail to run
   1290         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1291         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
   1292
   1293         // With unlimited stack, it should run
                //   (a limit of 0 is what this test uses to mean "unlimited")
   1294         status = U_ZERO_ERROR;
   1295         matcher.setStackLimit(0, status);
   1296         REGEX_CHECK_STATUS;
   1297         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
   1298         REGEX_CHECK_STATUS;
   1299         REGEX_ASSERT(matcher.getStackLimit() == 0);
   1300
   1301         // With a limited stack, the match should fail
   1302         status = U_ZERO_ERROR;
   1303         matcher.setStackLimit(10000, status);
   1304         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1305         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
   1306         REGEX_ASSERT(matcher.getStackLimit() == 10000);
   1307     }
   1308
   1309         // A pattern that doesn't save state should work with
   1310         //   a minimal sized stack
   1311     {
   1312         UErrorCode status = U_ZERO_ERROR;
   1313         UnicodeString testString = "abc";
   1314         RegexMatcher matcher("abc", testString, 0, status);
   1315         REGEX_CHECK_STATUS;
   1316         matcher.setStackLimit(30, status);
   1317         REGEX_CHECK_STATUS;
   1318         REGEX_ASSERT(matcher.matches(status) == TRUE);
   1319         REGEX_CHECK_STATUS;
   1320         REGEX_ASSERT(matcher.getStackLimit() == 30);
   1321
   1322         // Negative stack sizes should fail
                //   and must leave the previously set limit (1000) in place.
   1323         status = U_ZERO_ERROR;
   1324         matcher.setStackLimit(1000, status);
   1325         REGEX_CHECK_STATUS;
   1326         matcher.setStackLimit(-1, status);
   1327         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   1328         REGEX_ASSERT(matcher.getStackLimit() == 1000);
   1329     }
   1330
   1331
   1332 }
   1333 
   1334 
   1335 
   1336 
   1337 
   1338 
   1339 //---------------------------------------------------------------------------
   1340 //
   1341 //      API_Replace        API test for class RegexMatcher, testing the
   1342 //                         Replace family of functions.
        //
        //                         Covers replaceFirst() / replaceAll() on matching,
        //                         non-matching and empty input, empty replacement
        //                         text, capture-group references ($n) and escapes in
        //                         the replacement, and appendReplacement() /
        //                         appendTail() after several find() calls (Bug 4057).
   1343 //
   1344 //---------------------------------------------------------------------------
   1345 void RegexTest::API_Replace() {
   1346     //
   1347     //  Replace
   1348     //
   1349     int32_t             flags=0;
   1350     UParseError         pe;
   1351     UErrorCode          status=U_ZERO_ERROR;
   1352
   1353     UnicodeString       re("abc");
   1354     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
   1355     REGEX_CHECK_STATUS;
   1356     UnicodeString data = ".abc..abc...abc..";
   1357     //                    012345678901234567
            // NOTE(review): status is not checked right after matcher(); the first
            // REGEX_CHECK_STATUS that covers it comes after replaceFirst() below.
   1358     RegexMatcher *matcher = pat->matcher(data, status);
   1359
   1360     //
   1361     //  Plain vanilla matches.
   1362     //
            // replaceFirst() replaces only the first "abc"; replaceAll() replaces
            // all three. Both return the result and leave `data` itself untouched
            // (it is reused unchanged further down).
   1363     UnicodeString  dest;
   1364     dest = matcher->replaceFirst("yz", status);
   1365     REGEX_CHECK_STATUS;
   1366     REGEX_ASSERT(dest == ".yz..abc...abc..");
   1367
   1368     dest = matcher->replaceAll("yz", status);
   1369     REGEX_CHECK_STATUS;
   1370     REGEX_ASSERT(dest == ".yz..yz...yz..");
   1371
   1372     //
   1373     //  Plain vanilla non-matches.
   1374     //
            // With no occurrence of "abc" the input must come back unchanged.
   1375     UnicodeString d2 = ".abx..abx...abx..";
   1376     matcher->reset(d2);
   1377     dest = matcher->replaceFirst("yz", status);
   1378     REGEX_CHECK_STATUS;
   1379     REGEX_ASSERT(dest == ".abx..abx...abx..");
   1380
   1381     dest = matcher->replaceAll("yz", status);
   1382     REGEX_CHECK_STATUS;
   1383     REGEX_ASSERT(dest == ".abx..abx...abx..");
   1384
   1385     //
   1386     // Empty source string
   1387     //
   1388     UnicodeString d3 = "";
   1389     matcher->reset(d3);
   1390     dest = matcher->replaceFirst("yz", status);
   1391     REGEX_CHECK_STATUS;
   1392     REGEX_ASSERT(dest == "");
   1393
   1394     dest = matcher->replaceAll("yz", status);
   1395     REGEX_CHECK_STATUS;
   1396     REGEX_ASSERT(dest == "");
   1397
   1398     //
   1399     // Empty substitution string
   1400     //
            // Replacing with "" simply deletes the matched text.
   1401     matcher->reset(data);              // ".abc..abc...abc.."
   1402     dest = matcher->replaceFirst("", status);
   1403     REGEX_CHECK_STATUS;
   1404     REGEX_ASSERT(dest == "...abc...abc..");
   1405
   1406     dest = matcher->replaceAll("", status);
   1407     REGEX_CHECK_STATUS;
   1408     REGEX_ASSERT(dest == "........");
   1409
   1410     //
   1411     // match whole string
   1412     //
   1413     UnicodeString d4 = "abc";
   1414     matcher->reset(d4);
   1415     dest = matcher->replaceFirst("xyz", status);
   1416     REGEX_CHECK_STATUS;
   1417     REGEX_ASSERT(dest == "xyz");
   1418
   1419     dest = matcher->replaceAll("xyz", status);
   1420     REGEX_CHECK_STATUS;
   1421     REGEX_ASSERT(dest == "xyz");
   1422
   1423     //
   1424     // Capture Group, simple case
   1425     //
            // "a(..)" matches "abc" at the start of "abcdefg" with group 1 = "bc";
            // "$1" in the replacement text refers to that group.
   1426     UnicodeString       re2("a(..)");
   1427     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
   1428     REGEX_CHECK_STATUS;
   1429     UnicodeString d5 = "abcdefg";
   1430     RegexMatcher *matcher2 = pat2->matcher(d5, status);
   1431     REGEX_CHECK_STATUS;
   1432     dest = matcher2->replaceFirst("$1$1", status);
   1433     REGEX_CHECK_STATUS;
   1434     REGEX_ASSERT(dest == "bcbcdefg");
   1435
            // A backslash-escaped "\$1" must come through as the literal text "$1".
   1436     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
   1437     REGEX_CHECK_STATUS;
   1438     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
   1439
            // A '$' that is not followed by a group number is expected to be
            // rejected; the test only requires some failure code, then clears it.
   1440     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
   1441     REGEX_ASSERT(U_FAILURE(status));
   1442     status = U_ZERO_ERROR;
   1443
            // After unescape() the replacement contains '$' followed by U+1D7CF, a
            // digit outside the BMP; the expected result shows that it is taken as
            // a reference to group 1.
   1444     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
   1445     replacement = replacement.unescape();
   1446     dest = matcher2->replaceFirst(replacement, status);
   1447     REGEX_CHECK_STATUS;
   1448     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
   1449
            // The pattern has only one capture group, so $5 is out of bounds.
   1450     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
   1451
   1452
   1453     //
   1454     // Replacement String with \u hex escapes
   1455     //
            // The substitute strings below are NOT unescaped by the test, so the
            // \u / \U sequences reach replaceAll() as literal backslash text; the
            // expected results show them being turned into the characters they name.
   1456     {
   1457         UnicodeString  src = "abc 1 abc 2 abc 3";
   1458         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
   1459         matcher->reset(src);
   1460         UnicodeString  result = matcher->replaceAll(substitute, status);
   1461         REGEX_CHECK_STATUS;
   1462         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
   1463     }
   1464     {
   1465         UnicodeString  src = "abc !";
   1466         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
   1467         matcher->reset(src);
   1468         UnicodeString  result = matcher->replaceAll(substitute, status);
   1469         REGEX_CHECK_STATUS;
                // Build the expected text with the supplementary code point U+10000
                // appended as a single UChar32.
   1470         UnicodeString expected = UnicodeString("--");
   1471         expected.append((UChar32)0x10000);
   1472         expected.append("-- !");
   1473         REGEX_ASSERT(result == expected);
   1474     }
   1475     // TODO:  need more thorough testing of capture substitutions.
   1476
   1477     // Bug 4057
   1478     //
   1479     {
   1480         status = U_ZERO_ERROR;
                // The two matches in s are "ss and end with ee" and "ss stuff ee".
                // In every scenario below appendReplacement() is called after the
                // second match and is expected to copy everything from the start of
                // the input up to that match, followed by the replacement "ooh".
   1481         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
   1482         RegexMatcher m("ss(.*?)ee", 0, status);
   1483         REGEX_CHECK_STATUS;
   1484         UnicodeString result;
   1485
   1486         // Multiple finds do NOT bump up the previous appendReplacement position.
   1487         m.reset(s);
   1488         m.find();
   1489         m.find();
   1490         m.appendReplacement(result, "ooh", status);
   1491         REGEX_CHECK_STATUS;
   1492         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1493
   1494         // After a reset into the interior of a string, appendReplacement still starts at beginning.
   1495         status = U_ZERO_ERROR;
   1496         result.truncate(0);
   1497         m.reset(10, status);
   1498         m.find();
   1499         m.find();
   1500         m.appendReplacement(result, "ooh", status);
   1501         REGEX_CHECK_STATUS;
   1502         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1503
   1504         // find() at interior of string, appendReplacement still starts at beginning.
   1505         status = U_ZERO_ERROR;
   1506         result.truncate(0);
   1507         m.reset();
   1508         m.find(10, status);
   1509         m.find();
   1510         m.appendReplacement(result, "ooh", status);
   1511         REGEX_CHECK_STATUS;
   1512         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1513
                // appendTail() adds the input text that follows the last match.
   1514         m.appendTail(result);
   1515         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
   1516
   1517     }
   1518
            // Release everything created with compile() / matcher() above.
   1519     delete matcher2;
   1520     delete pat2;
   1521     delete matcher;
   1522     delete pat;
   1523 }
   1524 
   1525 
   1526 //---------------------------------------------------------------------------
   1527 //
   1528 //      API_Pattern       Test that the API for class RegexPattern is
   1529 //                        present and nominally working.
   1530 //
   1531 //---------------------------------------------------------------------------
   1532 void RegexTest::API_Pattern() {
   1533     RegexPattern        pata;    // Test default constructor to not crash.
   1534     RegexPattern        patb;
   1535 
   1536     REGEX_ASSERT(pata == patb);
   1537     REGEX_ASSERT(pata == pata);
   1538 
   1539     UnicodeString re1("abc[a-l][m-z]");
   1540     UnicodeString re2("def");
   1541     UErrorCode    status = U_ZERO_ERROR;
   1542     UParseError   pe;
   1543 
   1544     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
   1545     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
   1546     REGEX_CHECK_STATUS;
   1547     REGEX_ASSERT(*pat1 == *pat1);
   1548     REGEX_ASSERT(*pat1 != pata);
   1549 
   1550     // Assign
   1551     patb = *pat1;
   1552     REGEX_ASSERT(patb == *pat1);
   1553 
   1554     // Copy Construct
   1555     RegexPattern patc(*pat1);
   1556     REGEX_ASSERT(patc == *pat1);
   1557     REGEX_ASSERT(patb == patc);
   1558     REGEX_ASSERT(pat1 != pat2);
   1559     patb = *pat2;
   1560     REGEX_ASSERT(patb != patc);
   1561     REGEX_ASSERT(patb == *pat2);
   1562 
   1563     // Compile with no flags.
   1564     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
   1565     REGEX_ASSERT(*pat1a == *pat1);
   1566 
   1567     REGEX_ASSERT(pat1a->flags() == 0);
   1568 
   1569     // Compile with different flags should be not equal
   1570     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
   1571     REGEX_CHECK_STATUS;
   1572 
   1573     REGEX_ASSERT(*pat1b != *pat1a);
   1574     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
   1575     REGEX_ASSERT(pat1a->flags() == 0);
   1576     delete pat1b;
   1577 
   1578     // clone
   1579     RegexPattern *pat1c = pat1->clone();
   1580     REGEX_ASSERT(*pat1c == *pat1);
   1581     REGEX_ASSERT(*pat1c != *pat2);
   1582 
   1583     delete pat1c;
   1584     delete pat1a;
   1585     delete pat1;
   1586     delete pat2;
   1587 
   1588 
   1589     //
   1590     //   Verify that a matcher created from a cloned pattern works.
   1591     //     (Jitterbug 3423)
   1592     //
   1593     {
   1594         UErrorCode     status     = U_ZERO_ERROR;
   1595         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
   1596         RegexPattern  *pClone     = pSource->clone();
   1597         delete         pSource;
   1598         RegexMatcher  *mFromClone = pClone->matcher(status);
   1599         REGEX_CHECK_STATUS;
   1600         UnicodeString s = "Hello World";
   1601         mFromClone->reset(s);
   1602         REGEX_ASSERT(mFromClone->find() == TRUE);
   1603         REGEX_ASSERT(mFromClone->group(status) == "Hello");
   1604         REGEX_ASSERT(mFromClone->find() == TRUE);
   1605         REGEX_ASSERT(mFromClone->group(status) == "World");
   1606         REGEX_ASSERT(mFromClone->find() == FALSE);
   1607         delete mFromClone;
   1608         delete pClone;
   1609     }
   1610 
   1611     //
   1612     //   matches convenience API
   1613     //
   1614     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
   1615     REGEX_CHECK_STATUS;
   1616     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
   1617     REGEX_CHECK_STATUS;
   1618     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
   1619     REGEX_CHECK_STATUS;
   1620     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
   1621     REGEX_CHECK_STATUS;
   1622     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
   1623     REGEX_CHECK_STATUS;
   1624     status = U_INDEX_OUTOFBOUNDS_ERROR;
   1625     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
   1626     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1627 
   1628 
   1629     //
   1630     // Split()
   1631     //
   1632     status = U_ZERO_ERROR;
   1633     pat1 = RegexPattern::compile(" +",  pe, status);
   1634     REGEX_CHECK_STATUS;
   1635     UnicodeString  fields[10];
   1636 
   1637     int32_t n;
   1638     n = pat1->split("Now is the time", fields, 10, status);
   1639     REGEX_CHECK_STATUS;
   1640     REGEX_ASSERT(n==4);
   1641     REGEX_ASSERT(fields[0]=="Now");
   1642     REGEX_ASSERT(fields[1]=="is");
   1643     REGEX_ASSERT(fields[2]=="the");
   1644     REGEX_ASSERT(fields[3]=="time");
   1645     REGEX_ASSERT(fields[4]=="");
   1646 
   1647     n = pat1->split("Now is the time", fields, 2, status);
   1648     REGEX_CHECK_STATUS;
   1649     REGEX_ASSERT(n==2);
   1650     REGEX_ASSERT(fields[0]=="Now");
   1651     REGEX_ASSERT(fields[1]=="is the time");
   1652     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
   1653 
   1654     fields[1] = "*";
   1655     status = U_ZERO_ERROR;
   1656     n = pat1->split("Now is the time", fields, 1, status);
   1657     REGEX_CHECK_STATUS;
   1658     REGEX_ASSERT(n==1);
   1659     REGEX_ASSERT(fields[0]=="Now is the time");
   1660     REGEX_ASSERT(fields[1]=="*");
   1661     status = U_ZERO_ERROR;
   1662 
   1663     n = pat1->split("    Now       is the time   ", fields, 10, status);
   1664     REGEX_CHECK_STATUS;
   1665     REGEX_ASSERT(n==6);
   1666     REGEX_ASSERT(fields[0]=="");
   1667     REGEX_ASSERT(fields[1]=="Now");
   1668     REGEX_ASSERT(fields[2]=="is");
   1669     REGEX_ASSERT(fields[3]=="the");
   1670     REGEX_ASSERT(fields[4]=="time");
   1671     REGEX_ASSERT(fields[5]=="");
   1672 
   1673     n = pat1->split("     ", fields, 10, status);
   1674     REGEX_CHECK_STATUS;
   1675     REGEX_ASSERT(n==2);
   1676     REGEX_ASSERT(fields[0]=="");
   1677     REGEX_ASSERT(fields[1]=="");
   1678 
   1679     fields[0] = "foo";
   1680     n = pat1->split("", fields, 10, status);
   1681     REGEX_CHECK_STATUS;
   1682     REGEX_ASSERT(n==0);
   1683     REGEX_ASSERT(fields[0]=="foo");
   1684 
   1685     delete pat1;
   1686 
   1687     //  split, with a pattern with (capture)
   1688     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
   1689     REGEX_CHECK_STATUS;
   1690 
   1691     status = U_ZERO_ERROR;
   1692     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
   1693     REGEX_CHECK_STATUS;
   1694     REGEX_ASSERT(n==7);
   1695     REGEX_ASSERT(fields[0]=="");
   1696     REGEX_ASSERT(fields[1]=="a");
   1697     REGEX_ASSERT(fields[2]=="Now is ");
   1698     REGEX_ASSERT(fields[3]=="b");
   1699     REGEX_ASSERT(fields[4]=="the time");
   1700     REGEX_ASSERT(fields[5]=="c");
   1701     REGEX_ASSERT(fields[6]=="");
   1702     REGEX_ASSERT(status==U_ZERO_ERROR);
   1703 
   1704     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
   1705     REGEX_CHECK_STATUS;
   1706     REGEX_ASSERT(n==7);
   1707     REGEX_ASSERT(fields[0]=="  ");
   1708     REGEX_ASSERT(fields[1]=="a");
   1709     REGEX_ASSERT(fields[2]=="Now is ");
   1710     REGEX_ASSERT(fields[3]=="b");
   1711     REGEX_ASSERT(fields[4]=="the time");
   1712     REGEX_ASSERT(fields[5]=="c");
   1713     REGEX_ASSERT(fields[6]=="");
   1714 
   1715     status = U_ZERO_ERROR;
   1716     fields[6] = "foo";
   1717     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
   1718     REGEX_CHECK_STATUS;
   1719     REGEX_ASSERT(n==6);
   1720     REGEX_ASSERT(fields[0]=="  ");
   1721     REGEX_ASSERT(fields[1]=="a");
   1722     REGEX_ASSERT(fields[2]=="Now is ");
   1723     REGEX_ASSERT(fields[3]=="b");
   1724     REGEX_ASSERT(fields[4]=="the time");
   1725     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
   1726     REGEX_ASSERT(fields[6]=="foo");
   1727 
   1728     status = U_ZERO_ERROR;
   1729     fields[5] = "foo";
   1730     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
   1731     REGEX_CHECK_STATUS;
   1732     REGEX_ASSERT(n==5);
   1733     REGEX_ASSERT(fields[0]=="  ");
   1734     REGEX_ASSERT(fields[1]=="a");
   1735     REGEX_ASSERT(fields[2]=="Now is ");
   1736     REGEX_ASSERT(fields[3]=="b");
   1737     REGEX_ASSERT(fields[4]=="the time<c>");
   1738     REGEX_ASSERT(fields[5]=="foo");
   1739 
   1740     status = U_ZERO_ERROR;
   1741     fields[5] = "foo";
   1742     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
   1743     REGEX_CHECK_STATUS;
   1744     REGEX_ASSERT(n==5);
   1745     REGEX_ASSERT(fields[0]=="  ");
   1746     REGEX_ASSERT(fields[1]=="a");
   1747     REGEX_ASSERT(fields[2]=="Now is ");
   1748     REGEX_ASSERT(fields[3]=="b");
   1749     REGEX_ASSERT(fields[4]=="the time");
   1750     REGEX_ASSERT(fields[5]=="foo");
   1751 
   1752     status = U_ZERO_ERROR;
   1753     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
   1754     REGEX_CHECK_STATUS;
   1755     REGEX_ASSERT(n==4);
   1756     REGEX_ASSERT(fields[0]=="  ");
   1757     REGEX_ASSERT(fields[1]=="a");
   1758     REGEX_ASSERT(fields[2]=="Now is ");
   1759     REGEX_ASSERT(fields[3]=="the time<c>");
   1760     status = U_ZERO_ERROR;
   1761     delete pat1;
   1762 
   1763     pat1 = RegexPattern::compile("([-,])",  pe, status);
   1764     REGEX_CHECK_STATUS;
   1765     n = pat1->split("1-10,20", fields, 10, status);
   1766     REGEX_CHECK_STATUS;
   1767     REGEX_ASSERT(n==5);
   1768     REGEX_ASSERT(fields[0]=="1");
   1769     REGEX_ASSERT(fields[1]=="-");
   1770     REGEX_ASSERT(fields[2]=="10");
   1771     REGEX_ASSERT(fields[3]==",");
   1772     REGEX_ASSERT(fields[4]=="20");
   1773     delete pat1;
   1774 
   1775     // Test split of string with empty trailing fields
   1776     pat1 = RegexPattern::compile(",", pe, status);
   1777     REGEX_CHECK_STATUS;
   1778     n = pat1->split("a,b,c,", fields, 10, status);
   1779     REGEX_CHECK_STATUS;
   1780     REGEX_ASSERT(n==4);
   1781     REGEX_ASSERT(fields[0]=="a");
   1782     REGEX_ASSERT(fields[1]=="b");
   1783     REGEX_ASSERT(fields[2]=="c");
   1784     REGEX_ASSERT(fields[3]=="");
   1785 
   1786     n = pat1->split("a,,,", fields, 10, status);
   1787     REGEX_CHECK_STATUS;
   1788     REGEX_ASSERT(n==4);
   1789     REGEX_ASSERT(fields[0]=="a");
   1790     REGEX_ASSERT(fields[1]=="");
   1791     REGEX_ASSERT(fields[2]=="");
   1792     REGEX_ASSERT(fields[3]=="");
   1793     delete pat1;
   1794 
   1795     // Split Separator with zero length match.
   1796     pat1 = RegexPattern::compile(":?", pe, status);
   1797     REGEX_CHECK_STATUS;
   1798     n = pat1->split("abc", fields, 10, status);
   1799     REGEX_CHECK_STATUS;
   1800     REGEX_ASSERT(n==5);
   1801     REGEX_ASSERT(fields[0]=="");
   1802     REGEX_ASSERT(fields[1]=="a");
   1803     REGEX_ASSERT(fields[2]=="b");
   1804     REGEX_ASSERT(fields[3]=="c");
   1805     REGEX_ASSERT(fields[4]=="");
   1806 
   1807     delete pat1;
   1808 
   1809     //
   1810     // RegexPattern::pattern()
   1811     //
   1812     pat1 = new RegexPattern();
   1813     REGEX_ASSERT(pat1->pattern() == "");
   1814     delete pat1;
   1815 
   1816     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
   1817     REGEX_CHECK_STATUS;
   1818     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
   1819     delete pat1;
   1820 
   1821 
   1822     //
   1823     // classID functions
   1824     //
   1825     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
   1826     REGEX_CHECK_STATUS;
   1827     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
   1828     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
   1829     UnicodeString Hello("Hello, world.");
   1830     RegexMatcher *m = pat1->matcher(Hello, status);
   1831     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
   1832     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
   1833     REGEX_ASSERT(m->getDynamicClassID() != NULL);
   1834     delete m;
   1835     delete pat1;
   1836 
   1837 }
   1838 
   1839 //---------------------------------------------------------------------------
   1840 //
   1841 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
   1842 //                       is present and working, but excluding functions
   1843 //                       implementing replace operations.
   1844 //
   1845 //---------------------------------------------------------------------------
   1846 void RegexTest::API_Match_UTF8() {
   1847     UParseError         pe;
   1848     UErrorCode          status=U_ZERO_ERROR;
   1849     int32_t             flags = 0;
   1850 
   1851     //
   1852     // Debug - slide failing test cases early
   1853     //
   1854 #if 0
   1855     {
   1856     }
   1857     return;
   1858 #endif
   1859 
   1860     //
   1861     // Simple pattern compilation
   1862     //
   1863     {
   1864         UText               re = UTEXT_INITIALIZER;
   1865         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
   1866         REGEX_VERBOSE_TEXT(&re);
   1867         RegexPattern        *pat2;
   1868         pat2 = RegexPattern::compile(&re, flags, pe, status);
   1869         REGEX_CHECK_STATUS;
   1870 
   1871         UText input1 = UTEXT_INITIALIZER;
   1872         UText input2 = UTEXT_INITIALIZER;
   1873         UText empty  = UTEXT_INITIALIZER;
   1874         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
   1875         REGEX_VERBOSE_TEXT(&input1);
   1876         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
   1877         REGEX_VERBOSE_TEXT(&input2);
   1878         utext_openUChars(&empty, NULL, 0, &status);
   1879 
   1880         int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
   1881         int32_t input2Len = strlen("not abc");
   1882 
   1883 
   1884         //
   1885         // Matcher creation and reset.
   1886         //
   1887         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
   1888         REGEX_CHECK_STATUS;
   1889         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1890         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
   1891         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1892         m1->reset(&input2);
   1893         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
   1894         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
   1895         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
   1896         m1->reset(&input1);
   1897         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1898         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1899         m1->reset(&empty);
   1900         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
   1901         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
   1902 
   1903         //
   1904         //  reset(pos, status)
   1905         //
   1906         m1->reset(&input1);
   1907         m1->reset(4, status);
   1908         REGEX_CHECK_STATUS;
   1909         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1910         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1911 
   1912         m1->reset(-1, status);
   1913         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1914         status = U_ZERO_ERROR;
   1915 
   1916         m1->reset(0, status);
   1917         REGEX_CHECK_STATUS;
   1918         status = U_ZERO_ERROR;
   1919 
   1920         m1->reset(input1Len-1, status);
   1921         REGEX_CHECK_STATUS;
   1922         status = U_ZERO_ERROR;
   1923 
   1924         m1->reset(input1Len, status);
   1925         REGEX_CHECK_STATUS;
   1926         status = U_ZERO_ERROR;
   1927 
   1928         m1->reset(input1Len+1, status);
   1929         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1930         status = U_ZERO_ERROR;
   1931 
   1932         //
   1933         // match(pos, status)
   1934         //
   1935         m1->reset(&input2);
   1936         REGEX_ASSERT(m1->matches(4, status) == TRUE);
   1937         m1->reset();
   1938         REGEX_ASSERT(m1->matches(3, status) == FALSE);
   1939         m1->reset();
   1940         REGEX_ASSERT(m1->matches(5, status) == FALSE);
   1941         REGEX_ASSERT(m1->matches(4, status) == TRUE);
   1942         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
   1943         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1944 
   1945         // Match() at end of string should fail, but should not
   1946         //  be an error.
   1947         status = U_ZERO_ERROR;
   1948         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
   1949         REGEX_CHECK_STATUS;
   1950 
   1951         // Match beyond end of string should fail with an error.
   1952         status = U_ZERO_ERROR;
   1953         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
   1954         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1955 
   1956         // Successful match at end of string.
   1957         {
   1958             status = U_ZERO_ERROR;
   1959             RegexMatcher m("A?", 0, status);  // will match zero length string.
   1960             REGEX_CHECK_STATUS;
   1961             m.reset(&input1);
   1962             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
   1963             REGEX_CHECK_STATUS;
   1964             m.reset(&empty);
   1965             REGEX_ASSERT(m.matches(0, status) == TRUE);
   1966             REGEX_CHECK_STATUS;
   1967         }
   1968 
   1969 
   1970         //
   1971         // lookingAt(pos, status)
   1972         //
   1973         status = U_ZERO_ERROR;
   1974         m1->reset(&input2);  // "not abc"
   1975         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
   1976         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
   1977         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
   1978         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
   1979         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
   1980         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1981         status = U_ZERO_ERROR;
   1982         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
   1983         REGEX_CHECK_STATUS;
   1984         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
   1985         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1986 
   1987         delete m1;
   1988         delete pat2;
   1989 
   1990         utext_close(&re);
   1991         utext_close(&input1);
   1992         utext_close(&input2);
   1993         utext_close(&empty);
   1994     }
   1995 
   1996 
   1997     //
   1998     // Capture Group.
   1999     //     RegexMatcher::start();
   2000     //     RegexMatcher::end();
   2001     //     RegexMatcher::groupCount();
   2002     //
   2003     {
   2004         int32_t             flags=0;
   2005         UParseError         pe;
   2006         UErrorCode          status=U_ZERO_ERROR;
   2007         UText               re=UTEXT_INITIALIZER;
   2008         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
   2009         utext_openUTF8(&re, str_01234567_pat, -1, &status);
   2010 
   2011         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2012         REGEX_CHECK_STATUS;
   2013 
   2014         UText input = UTEXT_INITIALIZER;
   2015         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
   2016         utext_openUTF8(&input, str_0123456789, -1, &status);
   2017 
   2018         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   2019         REGEX_CHECK_STATUS;
   2020         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
   2021         static const int32_t matchStarts[] = {0,  2, 4, 8};
   2022         static const int32_t matchEnds[]   = {10, 8, 6, 10};
   2023         int32_t i;
   2024         for (i=0; i<4; i++) {
   2025             int32_t actualStart = matcher->start(i, status);
   2026             REGEX_CHECK_STATUS;
   2027             if (actualStart != matchStarts[i]) {
   2028                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
   2029                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
   2030             }
   2031             int32_t actualEnd = matcher->end(i, status);
   2032             REGEX_CHECK_STATUS;
   2033             if (actualEnd != matchEnds[i]) {
   2034                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
   2035                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
   2036             }
   2037         }
   2038 
   2039         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
   2040         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
   2041 
   2042         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2043         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2044         matcher->reset();
   2045         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
   2046 
   2047         matcher->lookingAt(status);
   2048 
   2049         UnicodeString dest;
   2050         UText destText = UTEXT_INITIALIZER;
   2051         utext_openUnicodeString(&destText, &dest, &status);
   2052         UText *result;
   2053         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
   2054         //  Test shallow-clone API
   2055         int64_t   group_len;
   2056         result = matcher->group((UText *)NULL, group_len, status);
   2057         REGEX_CHECK_STATUS;
   2058         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2059         utext_close(result);
   2060         result = matcher->group(0, &destText, group_len, status);
   2061         REGEX_CHECK_STATUS;
   2062         REGEX_ASSERT(result == &destText);
   2063         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2064         //  destText is now immutable, reopen it
   2065         utext_close(&destText);
   2066         utext_openUnicodeString(&destText, &dest, &status);
   2067 
   2068         int64_t length;
   2069         result = matcher->group(0, NULL, length, status);
   2070         REGEX_CHECK_STATUS;
   2071         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2072         utext_close(result);
   2073         result = matcher->group(0, &destText, length, status);
   2074         REGEX_CHECK_STATUS;
   2075         REGEX_ASSERT(result == &destText);
   2076         REGEX_ASSERT(utext_getNativeIndex(result) == 0);
   2077         REGEX_ASSERT(length == 10);
   2078         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2079 
   2080         // Capture Group 1 == "234567"
   2081         result = matcher->group(1, NULL, length, status);
   2082         REGEX_CHECK_STATUS;
   2083         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
   2084         REGEX_ASSERT(length == 6);
   2085         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2086         utext_close(result);
   2087 
   2088         result = matcher->group(1, &destText, length, status);
   2089         REGEX_CHECK_STATUS;
   2090         REGEX_ASSERT(result == &destText);
   2091         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
   2092         REGEX_ASSERT(length == 6);
   2093         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2094         utext_close(result);
   2095 
   2096         // Capture Group 2 == "45"
   2097         result = matcher->group(2, NULL, length, status);
   2098         REGEX_CHECK_STATUS;
   2099         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
   2100         REGEX_ASSERT(length == 2);
   2101         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2102         utext_close(result);
   2103 
   2104         result = matcher->group(2, &destText, length, status);
   2105         REGEX_CHECK_STATUS;
   2106         REGEX_ASSERT(result == &destText);
   2107         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
   2108         REGEX_ASSERT(length == 2);
   2109         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2110         utext_close(result);
   2111 
   2112         // Capture Group 3 == "89"
   2113         result = matcher->group(3, NULL, length, status);
   2114         REGEX_CHECK_STATUS;
   2115         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
   2116         REGEX_ASSERT(length == 2);
   2117         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2118         utext_close(result);
   2119 
   2120         result = matcher->group(3, &destText, length, status);
   2121         REGEX_CHECK_STATUS;
   2122         REGEX_ASSERT(result == &destText);
   2123         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
   2124         REGEX_ASSERT(length == 2);
   2125         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2126         utext_close(result);
   2127 
   2128         // Capture Group number out of range.
   2129         status = U_ZERO_ERROR;
   2130         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2131         status = U_ZERO_ERROR;
   2132         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2133         status = U_ZERO_ERROR;
   2134         matcher->reset();
   2135         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
   2136 
   2137         delete matcher;
   2138         delete pat;
   2139 
   2140         utext_close(&destText);
   2141         utext_close(&input);
   2142         utext_close(&re);
   2143     }
   2144 
   2145     //
   2146     //  find
   2147     //
   2148     {
   2149         int32_t             flags=0;
   2150         UParseError         pe;
   2151         UErrorCode          status=U_ZERO_ERROR;
   2152         UText               re=UTEXT_INITIALIZER;
   2153         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2154         utext_openUTF8(&re, str_abc, -1, &status);
   2155 
   2156         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2157         REGEX_CHECK_STATUS;
   2158         UText input = UTEXT_INITIALIZER;
   2159         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
   2160         utext_openUTF8(&input, str_abcabcabc, -1, &status);
   2161         //                      012345678901234567
   2162 
   2163         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   2164         REGEX_CHECK_STATUS;
   2165         REGEX_ASSERT(matcher->find());
   2166         REGEX_ASSERT(matcher->start(status) == 1);
   2167         REGEX_ASSERT(matcher->find());
   2168         REGEX_ASSERT(matcher->start(status) == 6);
   2169         REGEX_ASSERT(matcher->find());
   2170         REGEX_ASSERT(matcher->start(status) == 12);
   2171         REGEX_ASSERT(matcher->find() == FALSE);
   2172         REGEX_ASSERT(matcher->find() == FALSE);
   2173 
   2174         matcher->reset();
   2175         REGEX_ASSERT(matcher->find());
   2176         REGEX_ASSERT(matcher->start(status) == 1);
   2177 
   2178         REGEX_ASSERT(matcher->find(0, status));
   2179         REGEX_ASSERT(matcher->start(status) == 1);
   2180         REGEX_ASSERT(matcher->find(1, status));
   2181         REGEX_ASSERT(matcher->start(status) == 1);
   2182         REGEX_ASSERT(matcher->find(2, status));
   2183         REGEX_ASSERT(matcher->start(status) == 6);
   2184         REGEX_ASSERT(matcher->find(12, status));
   2185         REGEX_ASSERT(matcher->start(status) == 12);
   2186         REGEX_ASSERT(matcher->find(13, status) == FALSE);
   2187         REGEX_ASSERT(matcher->find(16, status) == FALSE);
   2188         REGEX_ASSERT(matcher->find(17, status) == FALSE);
   2189         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
   2190 
   2191         status = U_ZERO_ERROR;
   2192         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2193         status = U_ZERO_ERROR;
   2194         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2195 
   2196         REGEX_ASSERT(matcher->groupCount() == 0);
   2197 
   2198         delete matcher;
   2199         delete pat;
   2200 
   2201         utext_close(&input);
   2202         utext_close(&re);
   2203     }
   2204 
   2205 
   2206     //
   2207     //  find, with \G in pattern (true if at the end of a previous match).
   2208     //
   2209     {
   2210         int32_t             flags=0;
   2211         UParseError         pe;
   2212         UErrorCode          status=U_ZERO_ERROR;
   2213         UText               re=UTEXT_INITIALIZER;
   2214         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
   2215         utext_openUTF8(&re, str_Gabcabc, -1, &status);
   2216 
   2217         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2218 
   2219         REGEX_CHECK_STATUS;
   2220         UText input = UTEXT_INITIALIZER;
   2221         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
   2222         utext_openUTF8(&input, str_abcabcabc, -1, &status);
   2223         //                      012345678901234567
   2224 
   2225         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   2226         REGEX_CHECK_STATUS;
   2227         REGEX_ASSERT(matcher->find());
   2228         REGEX_ASSERT(matcher->start(status) == 0);
   2229         REGEX_ASSERT(matcher->start(1, status) == -1);
   2230         REGEX_ASSERT(matcher->start(2, status) == 1);
   2231 
   2232         REGEX_ASSERT(matcher->find());
   2233         REGEX_ASSERT(matcher->start(status) == 4);
   2234         REGEX_ASSERT(matcher->start(1, status) == 4);
   2235         REGEX_ASSERT(matcher->start(2, status) == -1);
   2236         REGEX_CHECK_STATUS;
   2237 
   2238         delete matcher;
   2239         delete pat;
   2240 
   2241         utext_close(&input);
   2242         utext_close(&re);
   2243     }
   2244 
   2245     //
   2246     //   find with zero length matches, match position should bump ahead
   2247     //     to prevent loops.
   2248     //
   2249     {
   2250         int32_t                 i;
   2251         UErrorCode          status=U_ZERO_ERROR;
   2252         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
   2253                                                       //   using an always-true look-ahead.
   2254         REGEX_CHECK_STATUS;
   2255         UText s = UTEXT_INITIALIZER;
   2256         utext_openUTF8(&s, "    ", -1, &status);
   2257         m.reset(&s);
   2258         for (i=0; ; i++) {
   2259             if (m.find() == FALSE) {
   2260                 break;
   2261             }
   2262             REGEX_ASSERT(m.start(status) == i);
   2263             REGEX_ASSERT(m.end(status) == i);
   2264         }
   2265         REGEX_ASSERT(i==5);
   2266 
   2267         // Check that the bump goes over characters outside the BMP OK
   2268         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
   2269         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
   2270         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
   2271         m.reset(&s);
   2272         for (i=0; ; i+=4) {
   2273             if (m.find() == FALSE) {
   2274                 break;
   2275             }
   2276             REGEX_ASSERT(m.start(status) == i);
   2277             REGEX_ASSERT(m.end(status) == i);
   2278         }
   2279         REGEX_ASSERT(i==20);
   2280 
   2281         utext_close(&s);
   2282     }
   2283     {
   2284         // find() loop breaking test.
   2285         //        with pattern of /.?/, should see a series of one char matches, then a single
   2286         //        match of zero length at the end of the input string.
   2287         int32_t                 i;
   2288         UErrorCode          status=U_ZERO_ERROR;
   2289         RegexMatcher        m(".?", 0, status);
   2290         REGEX_CHECK_STATUS;
   2291         UText s = UTEXT_INITIALIZER;
   2292         utext_openUTF8(&s, "    ", -1, &status);
   2293         m.reset(&s);
   2294         for (i=0; ; i++) {
   2295             if (m.find() == FALSE) {
   2296                 break;
   2297             }
   2298             REGEX_ASSERT(m.start(status) == i);
   2299             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
   2300         }
   2301         REGEX_ASSERT(i==5);
   2302 
   2303         utext_close(&s);
   2304     }
   2305 
   2306 
   2307     //
   2308     // Matchers with no input string behave as if they had an empty input string.
   2309     //
   2310 
   2311     {
   2312         UErrorCode status = U_ZERO_ERROR;
   2313         RegexMatcher  m(".?", 0, status);
   2314         REGEX_CHECK_STATUS;
   2315         REGEX_ASSERT(m.find());
   2316         REGEX_ASSERT(m.start(status) == 0);
   2317         REGEX_ASSERT(m.input() == "");
   2318     }
   2319     {
   2320         UErrorCode status = U_ZERO_ERROR;
   2321         RegexPattern  *p = RegexPattern::compile(".", 0, status);
   2322         RegexMatcher  *m = p->matcher(status);
   2323         REGEX_CHECK_STATUS;
   2324 
   2325         REGEX_ASSERT(m->find() == FALSE);
   2326         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
   2327         delete m;
   2328         delete p;
   2329     }
   2330 
   2331     //
   2332     // Regions
   2333     //
   2334     {
   2335         UErrorCode status = U_ZERO_ERROR;
   2336         UText testPattern = UTEXT_INITIALIZER;
   2337         UText testText    = UTEXT_INITIALIZER;
   2338         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
   2339         REGEX_VERBOSE_TEXT(&testPattern);
   2340         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
   2341         REGEX_VERBOSE_TEXT(&testText);
   2342 
   2343         RegexMatcher m(&testPattern, &testText, 0, status);
   2344         REGEX_CHECK_STATUS;
   2345         REGEX_ASSERT(m.regionStart() == 0);
   2346         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
   2347         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2348         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2349 
   2350         m.region(2,4, status);
   2351         REGEX_CHECK_STATUS;
   2352         REGEX_ASSERT(m.matches(status));
   2353         REGEX_ASSERT(m.start(status)==2);
   2354         REGEX_ASSERT(m.end(status)==4);
   2355         REGEX_CHECK_STATUS;
   2356 
   2357         m.reset();
   2358         REGEX_ASSERT(m.regionStart() == 0);
   2359         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
   2360 
   2361         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
   2362         REGEX_VERBOSE_TEXT(&testText);
   2363         m.reset(&testText);
   2364         REGEX_ASSERT(m.regionStart() == 0);
   2365         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
   2366 
   2367         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2368         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
   2369         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   2370         REGEX_ASSERT(&m == &m.reset());
   2371         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   2372 
   2373         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
   2374         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2375         REGEX_ASSERT(&m == &m.reset());
   2376         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2377 
   2378         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2379         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
   2380         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   2381         REGEX_ASSERT(&m == &m.reset());
   2382         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   2383 
   2384         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
   2385         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2386         REGEX_ASSERT(&m == &m.reset());
   2387         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2388 
   2389         utext_close(&testText);
   2390         utext_close(&testPattern);
   2391     }
   2392 
   2393     //
   2394     // hitEnd() and requireEnd()
   2395     //
   2396     {
   2397         UErrorCode status = U_ZERO_ERROR;
   2398         UText testPattern = UTEXT_INITIALIZER;
   2399         UText testText    = UTEXT_INITIALIZER;
   2400         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
   2401         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
   2402         utext_openUTF8(&testPattern, str_, -1, &status);
   2403         utext_openUTF8(&testText, str_aabb, -1, &status);
   2404 
   2405         RegexMatcher m1(&testPattern, &testText,  0, status);
   2406         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
   2407         REGEX_ASSERT(m1.hitEnd() == TRUE);
   2408         REGEX_ASSERT(m1.requireEnd() == FALSE);
   2409         REGEX_CHECK_STATUS;
   2410 
   2411         status = U_ZERO_ERROR;
   2412         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
   2413         utext_openUTF8(&testPattern, str_a, -1, &status);
   2414         RegexMatcher m2(&testPattern, &testText, 0, status);
   2415         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
   2416         REGEX_ASSERT(m2.hitEnd() == FALSE);
   2417         REGEX_ASSERT(m2.requireEnd() == FALSE);
   2418         REGEX_CHECK_STATUS;
   2419 
   2420         status = U_ZERO_ERROR;
   2421         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
   2422         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
   2423         RegexMatcher m3(&testPattern, &testText, 0, status);
   2424         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
   2425         REGEX_ASSERT(m3.hitEnd() == TRUE);
   2426         REGEX_ASSERT(m3.requireEnd() == TRUE);
   2427         REGEX_CHECK_STATUS;
   2428 
   2429         utext_close(&testText);
   2430         utext_close(&testPattern);
   2431     }
   2432 }
   2433 
   2434 
   2435 //---------------------------------------------------------------------------
   2436 //
   2437 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
   2438 //                         Replace family of functions.
   2439 //
   2440 //---------------------------------------------------------------------------
   2441 void RegexTest::API_Replace_UTF8() {
   2442     //  Replace:  drives the UText overloads of RegexMatcher::replaceFirst() and replaceAll(), each
   2443     //            with a NULL destination and with destText as the destination.  Test strings are
   2444     //            spelled as byte values, presumably to stay ASCII/UTF-8 on EBCDIC builds (see file header).
   2445     int32_t             flags=0;
   2446     UParseError         pe;
   2447     UErrorCode          status=U_ZERO_ERROR;
   2448 
   2449     UText               re=UTEXT_INITIALIZER;
   2450     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
   2451     REGEX_VERBOSE_TEXT(&re);
   2452     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2453     REGEX_CHECK_STATUS;
   2454 
   2455     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
   2456     //             012345678901234567
   2457     UText dataText = UTEXT_INITIALIZER;
   2458     utext_openUTF8(&dataText, data, -1, &status);
   2459     REGEX_CHECK_STATUS;
   2460     REGEX_VERBOSE_TEXT(&dataText);
   2461     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
   2462 
   2463     //
   2464     //  Plain vanilla matches.
   2465     //    Pattern "abc" occurs three times in data; the replacement text is "yz".
   2466     UnicodeString  dest;
   2467     UText destText = UTEXT_INITIALIZER;
   2468     utext_openUnicodeString(&destText, &dest, &status);
   2469     UText *result;
   2470     // With a NULL dest the returned UText is closed by this test; with &destText the call must return &destText.
   2471     UText replText = UTEXT_INITIALIZER;
   2472 
   2473     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
   2474     utext_openUTF8(&replText, str_yz, -1, &status);
   2475     REGEX_VERBOSE_TEXT(&replText);
   2476     result = matcher->replaceFirst(&replText, NULL, status);
   2477     REGEX_CHECK_STATUS;
   2478     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
   2479     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
   2480     utext_close(result);
   2481     result = matcher->replaceFirst(&replText, &destText, status);
   2482     REGEX_CHECK_STATUS;
   2483     REGEX_ASSERT(result == &destText);
   2484     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
   2485 
   2486     result = matcher->replaceAll(&replText, NULL, status);
   2487     REGEX_CHECK_STATUS;
   2488     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
   2489     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
   2490     utext_close(result);
   2491     // Empty destText before reusing it (presumably replaceAll() appends to an existing dest -- TODO confirm).
   2492     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2493     result = matcher->replaceAll(&replText, &destText, status);
   2494     REGEX_CHECK_STATUS;
   2495     REGEX_ASSERT(result == &destText);
   2496     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
   2497 
   2498     //
   2499     //  Plain vanilla non-matches.
   2500     //    The input contains "abx" rather than "abc"; each result is expected to equal the input text.
   2501     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
   2502     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
   2503     matcher->reset(&dataText);
   2504 
   2505     result = matcher->replaceFirst(&replText, NULL, status);
   2506     REGEX_CHECK_STATUS;
   2507     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2508     utext_close(result);
   2509     result = matcher->replaceFirst(&replText, &destText, status);
   2510     REGEX_CHECK_STATUS;
   2511     REGEX_ASSERT(result == &destText);
   2512     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2513 
   2514     result = matcher->replaceAll(&replText, NULL, status);
   2515     REGEX_CHECK_STATUS;
   2516     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2517     utext_close(result);
   2518     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2519     result = matcher->replaceAll(&replText, &destText, status);
   2520     REGEX_CHECK_STATUS;
   2521     REGEX_ASSERT(result == &destText);
   2522     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2523 
   2524     //
   2525     // Empty source string
   2526     //   dataText is reopened from a NULL pointer with length 0; every result is expected to be "".
   2527     utext_openUTF8(&dataText, NULL, 0, &status);
   2528     matcher->reset(&dataText);
   2529 
   2530     result = matcher->replaceFirst(&replText, NULL, status);
   2531     REGEX_CHECK_STATUS;
   2532     REGEX_ASSERT_UTEXT_UTF8("", result);
   2533     utext_close(result);
   2534     result = matcher->replaceFirst(&replText, &destText, status);
   2535     REGEX_CHECK_STATUS;
   2536     REGEX_ASSERT(result == &destText);
   2537     REGEX_ASSERT_UTEXT_UTF8("", result);
   2538 
   2539     result = matcher->replaceAll(&replText, NULL, status);
   2540     REGEX_CHECK_STATUS;
   2541     REGEX_ASSERT_UTEXT_UTF8("", result);
   2542     utext_close(result);
   2543     result = matcher->replaceAll(&replText, &destText, status);
   2544     REGEX_CHECK_STATUS;
   2545     REGEX_ASSERT(result == &destText);
   2546     REGEX_ASSERT_UTEXT_UTF8("", result);
   2547 
   2548     //
   2549     // Empty substitution string
   2550     //   replText is reopened with length 0; the matched "abc" runs are expected to vanish from the output.
   2551     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
   2552     matcher->reset(&dataText);
   2553 
   2554     utext_openUTF8(&replText, NULL, 0, &status);
   2555     result = matcher->replaceFirst(&replText, NULL, status);
   2556     REGEX_CHECK_STATUS;
   2557     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
   2558     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
   2559     utext_close(result);
   2560     result = matcher->replaceFirst(&replText, &destText, status);
   2561     REGEX_CHECK_STATUS;
   2562     REGEX_ASSERT(result == &destText);
   2563     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
   2564 
   2565     result = matcher->replaceAll(&replText, NULL, status);
   2566     REGEX_CHECK_STATUS;
   2567     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
   2568     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
   2569     utext_close(result);
   2570     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2571     result = matcher->replaceAll(&replText, &destText, status);
   2572     REGEX_CHECK_STATUS;
   2573     REGEX_ASSERT(result == &destText);
   2574     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
   2575 
   2576     //
   2577     // match whole string
   2578     //   The input is exactly "abc", so the expected output is just the replacement text "xyz".
   2579     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2580     utext_openUTF8(&dataText, str_abc, -1, &status);
   2581     matcher->reset(&dataText);
   2582 
   2583     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
   2584     utext_openUTF8(&replText, str_xyz, -1, &status);
   2585     result = matcher->replaceFirst(&replText, NULL, status);
   2586     REGEX_CHECK_STATUS;
   2587     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2588     utext_close(result);
   2589     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2590     result = matcher->replaceFirst(&replText, &destText, status);
   2591     REGEX_CHECK_STATUS;
   2592     REGEX_ASSERT(result == &destText);
   2593     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2594 
   2595     result = matcher->replaceAll(&replText, NULL, status);
   2596     REGEX_CHECK_STATUS;
   2597     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2598     utext_close(result);
   2599     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2600     result = matcher->replaceAll(&replText, &destText, status);
   2601     REGEX_CHECK_STATUS;
   2602     REGEX_ASSERT(result == &destText);
   2603     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2604 
   2605     //
   2606     // Capture Group, simple case
   2607     //   A second pattern/matcher pair (pat2, matcher2) for "a(..)" is run against "abcdefg".
   2608     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
   2609     utext_openUTF8(&re, str_add, -1, &status);
   2610     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
   2611     REGEX_CHECK_STATUS;
   2612 
   2613     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
   2614     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
   2615     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
   2616     REGEX_CHECK_STATUS;
   2617 
   2618     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
   2619     utext_openUTF8(&replText, str_11, -1, &status);
   2620     result = matcher2->replaceFirst(&replText, NULL, status);
   2621     REGEX_CHECK_STATUS;
   2622     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
   2623     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
   2624     utext_close(result);
   2625     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2626     result = matcher2->replaceFirst(&replText, &destText, status);
   2627     REGEX_CHECK_STATUS;
   2628     REGEX_ASSERT(result == &destText);
   2629     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
   2630     // A backslash-escaped dollar sign is expected to come through literally; the bare "$1" is substituted.
   2631     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
   2632     utext_openUTF8(&replText, str_v, -1, &status);
   2633     REGEX_VERBOSE_TEXT(&replText);
   2634     result = matcher2->replaceFirst(&replText, NULL, status);
   2635     REGEX_CHECK_STATUS;
   2636     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
   2637     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
   2638     utext_close(result);
   2639     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2640     result = matcher2->replaceFirst(&replText, &destText, status);
   2641     REGEX_CHECK_STATUS;
   2642     REGEX_ASSERT(result == &destText);
   2643     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
   2644     // Escaped dollar signs that are not followed by any group number.
   2645     const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
   2646                0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
   2647                0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
   2648     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
   2649     result = matcher2->replaceFirst(&replText, NULL, status);
   2650     REGEX_CHECK_STATUS;
   2651     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
   2652     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
   2653     utext_close(result);
   2654     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2655     result = matcher2->replaceFirst(&replText, &destText, status);
   2656     REGEX_CHECK_STATUS;
   2657     REGEX_ASSERT(result == &destText);
   2658     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
   2659     // "$" followed by U+1D7CF (four UTF-8 bytes patched over "xxxx" below) is expected to act like "$1".
   2660     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
   2661     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
   2662     //                                 012345678901234567890123456
   2663     supplDigitChars[22] = 0xF0;
   2664     supplDigitChars[23] = 0x9D;
   2665     supplDigitChars[24] = 0x9F;
   2666     supplDigitChars[25] = 0x8F;
   2667     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
   2668 
   2669     result = matcher2->replaceFirst(&replText, NULL, status);
   2670     REGEX_CHECK_STATUS;
   2671     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
   2672     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
   2673     utext_close(result);
   2674     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2675     result = matcher2->replaceFirst(&replText, &destText, status);
   2676     REGEX_CHECK_STATUS;
   2677     REGEX_ASSERT(result == &destText);
   2678     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
   2679     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5... (pat2 has only one group; the calls below must fail) */
   2680     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
   2681     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
   2682 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
   2683     utext_close(result);
   2684     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2685     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
   2686     REGEX_ASSERT(result == &destText);
   2687 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
   2688 
   2689     //
   2690     // Replacement String with \u hex escapes
   2691     //   The escape sequence in the replacement text is expected to appear expanded in the output.
   2692     {
   2693       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
   2694       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
   2695         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
   2696         utext_openUTF8(&replText, str_u0043, -1, &status);
   2697         matcher->reset(&dataText);
   2698 
   2699         result = matcher->replaceAll(&replText, NULL, status);
   2700         REGEX_CHECK_STATUS;
   2701         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
   2702         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
   2703         utext_close(result);
   2704         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2705         result = matcher->replaceAll(&replText, &destText, status);
   2706         REGEX_CHECK_STATUS;
   2707         REGEX_ASSERT(result == &destText);
   2708         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
   2709     }
   2710     {
   2711       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
   2712         utext_openUTF8(&dataText, str_abc, -1, &status);
   2713         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
   2714         utext_openUTF8(&replText, str_U00010000, -1, &status);
   2715         matcher->reset(&dataText);
   2716 
   2717         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
   2718         //                          0123456789   (the "xxxx" placeholder is overwritten with the 4-byte UTF-8 form below)
   2719         expected[2] = 0xF0;
   2720         expected[3] = 0x90;
   2721         expected[4] = 0x80;
   2722         expected[5] = 0x80;
   2723 
   2724         result = matcher->replaceAll(&replText, NULL, status);
   2725         REGEX_CHECK_STATUS;
   2726         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
   2727         utext_close(result);
   2728         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2729         result = matcher->replaceAll(&replText, &destText, status);
   2730         REGEX_CHECK_STATUS;
   2731         REGEX_ASSERT(result == &destText);
   2732         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
   2733     }
   2734     // TODO:  need more thorough testing of capture substitutions.
   2735 
   2736     // Bug 4057
   2737     //   appendReplacement()/appendTail() into a UText after repeated find(), reset(index) and find(index).
   2738     {
   2739         status = U_ZERO_ERROR;
   2740 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
   2741 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
   2742 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
   2743         utext_openUTF8(&re, str_ssee, -1, &status);
   2744         utext_openUTF8(&dataText, str_blah, -1, &status);
   2745         utext_openUTF8(&replText, str_ooh, -1, &status);
   2746 
   2747         RegexMatcher m(&re, 0, status);
   2748         REGEX_CHECK_STATUS;
   2749 
   2750         UnicodeString result;
   2751         UText resultText = UTEXT_INITIALIZER;
   2752         utext_openUnicodeString(&resultText, &result, &status);
   2753 
   2754         // Multiple finds do NOT bump up the previous appendReplacement position.
   2755         m.reset(&dataText);
   2756         m.find();
   2757         m.find();
   2758         m.appendReplacement(&resultText, &replText, status);
   2759         REGEX_CHECK_STATUS;
   2760         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2761         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
   2762 
   2763         // After a reset into the interior of a string, appendReplacement still starts at beginning.
   2764         status = U_ZERO_ERROR;
   2765         result.truncate(0);
   2766         utext_openUnicodeString(&resultText, &result, &status);
   2767         m.reset(10, status);
   2768         m.find();
   2769         m.find();
   2770         m.appendReplacement(&resultText, &replText, status);
   2771         REGEX_CHECK_STATUS;
   2772         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2773         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
   2774 
   2775         // find() at interior of string, appendReplacement still starts at beginning.
   2776         status = U_ZERO_ERROR;
   2777         result.truncate(0);
   2778         utext_openUnicodeString(&resultText, &result, &status);
   2779         m.reset();
   2780         m.find(10, status);
   2781         m.find();
   2782         m.appendReplacement(&resultText, &replText, status);
   2783         REGEX_CHECK_STATUS;
   2784         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2785         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
   2786         // appendTail() is expected to add the rest of the input (" fin") after the last replacement.
   2787         m.appendTail(&resultText, status);
   2788         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
   2789         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
   2790 
   2791         utext_close(&resultText);
   2792     }
   2793     // Release the heap-allocated matchers and patterns, then close the UTexts opened above.
   2794     delete matcher2;
   2795     delete pat2;
   2796     delete matcher;
   2797     delete pat;
   2798 
   2799     utext_close(&dataText);
   2800     utext_close(&replText);
   2801     utext_close(&destText);
   2802     utext_close(&re);
   2803 }
   2804 
   2805 
   2806 //---------------------------------------------------------------------------
   2807 //
   2808 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
   2809 //                        present and nominally working.
   2810 //
   2811 //---------------------------------------------------------------------------
   2812 void RegexTest::API_Pattern_UTF8() {
   2813     RegexPattern        pata;    // Test default constructor to not crash.
   2814     RegexPattern        patb;
   2815 
   2816     REGEX_ASSERT(pata == patb);
   2817     REGEX_ASSERT(pata == pata);
   2818 
   2819     UText         re1 = UTEXT_INITIALIZER;
   2820     UText         re2 = UTEXT_INITIALIZER;
   2821     UErrorCode    status = U_ZERO_ERROR;
   2822     UParseError   pe;
   2823 
   2824     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
   2825     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
   2826     utext_openUTF8(&re1, str_abcalmz, -1, &status);
   2827     utext_openUTF8(&re2, str_def, -1, &status);
   2828 
   2829     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
   2830     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
   2831     REGEX_CHECK_STATUS;
   2832     REGEX_ASSERT(*pat1 == *pat1);
   2833     REGEX_ASSERT(*pat1 != pata);
   2834 
   2835     // Assign
   2836     patb = *pat1;
   2837     REGEX_ASSERT(patb == *pat1);
   2838 
   2839     // Copy Construct
   2840     RegexPattern patc(*pat1);
   2841     REGEX_ASSERT(patc == *pat1);
   2842     REGEX_ASSERT(patb == patc);
   2843     REGEX_ASSERT(pat1 != pat2);
   2844     patb = *pat2;
   2845     REGEX_ASSERT(patb != patc);
   2846     REGEX_ASSERT(patb == *pat2);
   2847 
   2848     // Compile with no flags.
   2849     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
   2850     REGEX_ASSERT(*pat1a == *pat1);
   2851 
   2852     REGEX_ASSERT(pat1a->flags() == 0);
   2853 
   2854     // Compile with different flags should be not equal
   2855     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
   2856     REGEX_CHECK_STATUS;
   2857 
   2858     REGEX_ASSERT(*pat1b != *pat1a);
   2859     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
   2860     REGEX_ASSERT(pat1a->flags() == 0);
   2861     delete pat1b;
   2862 
   2863     // clone
   2864     RegexPattern *pat1c = pat1->clone();
   2865     REGEX_ASSERT(*pat1c == *pat1);
   2866     REGEX_ASSERT(*pat1c != *pat2);
   2867 
   2868     delete pat1c;
   2869     delete pat1a;
   2870     delete pat1;
   2871     delete pat2;
   2872 
   2873     utext_close(&re1);
   2874     utext_close(&re2);
   2875 
   2876 
   2877     //
   2878     //   Verify that a matcher created from a cloned pattern works.
   2879     //     (Jitterbug 3423)
   2880     //
   2881     {
   2882         UErrorCode     status     = U_ZERO_ERROR;
   2883         UText          pattern    = UTEXT_INITIALIZER;
   2884         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
   2885         utext_openUTF8(&pattern, str_pL, -1, &status);
   2886 
   2887         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
   2888         RegexPattern  *pClone     = pSource->clone();
   2889         delete         pSource;
   2890         RegexMatcher  *mFromClone = pClone->matcher(status);
   2891         REGEX_CHECK_STATUS;
   2892 
   2893         UText          input      = UTEXT_INITIALIZER;
   2894         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
   2895         utext_openUTF8(&input, str_HelloWorld, -1, &status);
   2896         mFromClone->reset(&input);
   2897         REGEX_ASSERT(mFromClone->find() == TRUE);
   2898         REGEX_ASSERT(mFromClone->group(status) == "Hello");
   2899         REGEX_ASSERT(mFromClone->find() == TRUE);
   2900         REGEX_ASSERT(mFromClone->group(status) == "World");
   2901         REGEX_ASSERT(mFromClone->find() == FALSE);
   2902         delete mFromClone;
   2903         delete pClone;
   2904 
   2905         utext_close(&input);
   2906         utext_close(&pattern);
   2907     }
   2908 
   2909     //
   2910     //   matches convenience API
   2911     //
   2912     {
   2913         UErrorCode status  = U_ZERO_ERROR;
   2914         UText      pattern = UTEXT_INITIALIZER;
   2915         UText      input   = UTEXT_INITIALIZER;
   2916 
   2917         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
   2918         utext_openUTF8(&input, str_randominput, -1, &status);
   2919 
   2920         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
   2921         utext_openUTF8(&pattern, str_dotstar, -1, &status);
   2922         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
   2923         REGEX_CHECK_STATUS;
   2924 
   2925         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2926         utext_openUTF8(&pattern, str_abc, -1, &status);
   2927         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
   2928         REGEX_CHECK_STATUS;
   2929 
   2930         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
   2931         utext_openUTF8(&pattern, str_nput, -1, &status);
   2932         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
   2933         REGEX_CHECK_STATUS;
   2934 
   2935         utext_openUTF8(&pattern, str_randominput, -1, &status);
   2936         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
   2937         REGEX_CHECK_STATUS;
   2938 
   2939         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
   2940         utext_openUTF8(&pattern, str_u, -1, &status);
   2941         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
   2942         REGEX_CHECK_STATUS;
   2943 
   2944         utext_openUTF8(&input, str_abc, -1, &status);
   2945         utext_openUTF8(&pattern, str_abc, -1, &status);
   2946         status = U_INDEX_OUTOFBOUNDS_ERROR;
   2947         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
   2948         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   2949 
   2950         utext_close(&input);
   2951         utext_close(&pattern);
   2952     }
   2953 
   2954 
   2955     //
   2956     // Split()
   2957     //
   2958     status = U_ZERO_ERROR;
   2959     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
   2960     utext_openUTF8(&re1, str_spaceplus, -1, &status);
   2961     pat1 = RegexPattern::compile(&re1, pe, status);
   2962     REGEX_CHECK_STATUS;
   2963     UnicodeString  fields[10];
   2964 
   2965     int32_t n;
   2966     n = pat1->split("Now is the time", fields, 10, status);
   2967     REGEX_CHECK_STATUS;
   2968     REGEX_ASSERT(n==4);
   2969     REGEX_ASSERT(fields[0]=="Now");
   2970     REGEX_ASSERT(fields[1]=="is");
   2971     REGEX_ASSERT(fields[2]=="the");
   2972     REGEX_ASSERT(fields[3]=="time");
   2973     REGEX_ASSERT(fields[4]=="");
   2974 
   2975     n = pat1->split("Now is the time", fields, 2, status);
   2976     REGEX_CHECK_STATUS;
   2977     REGEX_ASSERT(n==2);
   2978     REGEX_ASSERT(fields[0]=="Now");
   2979     REGEX_ASSERT(fields[1]=="is the time");
   2980     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
   2981 
   2982     fields[1] = "*";
   2983     status = U_ZERO_ERROR;
   2984     n = pat1->split("Now is the time", fields, 1, status);
   2985     REGEX_CHECK_STATUS;
   2986     REGEX_ASSERT(n==1);
   2987     REGEX_ASSERT(fields[0]=="Now is the time");
   2988     REGEX_ASSERT(fields[1]=="*");
   2989     status = U_ZERO_ERROR;
   2990 
   2991     n = pat1->split("    Now       is the time   ", fields, 10, status);
   2992     REGEX_CHECK_STATUS;
   2993     REGEX_ASSERT(n==6);
   2994     REGEX_ASSERT(fields[0]=="");
   2995     REGEX_ASSERT(fields[1]=="Now");
   2996     REGEX_ASSERT(fields[2]=="is");
   2997     REGEX_ASSERT(fields[3]=="the");
   2998     REGEX_ASSERT(fields[4]=="time");
   2999     REGEX_ASSERT(fields[5]=="");
   3000     REGEX_ASSERT(fields[6]=="");
   3001 
   3002     fields[2] = "*";
   3003     n = pat1->split("     ", fields, 10, status);
   3004     REGEX_CHECK_STATUS;
   3005     REGEX_ASSERT(n==2);
   3006     REGEX_ASSERT(fields[0]=="");
   3007     REGEX_ASSERT(fields[1]=="");
   3008     REGEX_ASSERT(fields[2]=="*");
   3009 
   3010     fields[0] = "foo";
   3011     n = pat1->split("", fields, 10, status);
   3012     REGEX_CHECK_STATUS;
   3013     REGEX_ASSERT(n==0);
   3014     REGEX_ASSERT(fields[0]=="foo");
   3015 
   3016     delete pat1;
   3017 
   3018     //  split, with a pattern with (capture)
   3019     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
   3020     pat1 = RegexPattern::compile(&re1,  pe, status);
   3021     REGEX_CHECK_STATUS;
   3022 
   3023     status = U_ZERO_ERROR;
   3024     fields[6] = fields[7] = "*";
   3025     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
   3026     REGEX_CHECK_STATUS;
   3027     REGEX_ASSERT(n==7);
   3028     REGEX_ASSERT(fields[0]=="");
   3029     REGEX_ASSERT(fields[1]=="a");
   3030     REGEX_ASSERT(fields[2]=="Now is ");
   3031     REGEX_ASSERT(fields[3]=="b");
   3032     REGEX_ASSERT(fields[4]=="the time");
   3033     REGEX_ASSERT(fields[5]=="c");
   3034     REGEX_ASSERT(fields[6]=="");
   3035     REGEX_ASSERT(fields[7]=="*");
   3036     REGEX_ASSERT(status==U_ZERO_ERROR);
   3037 
   3038     fields[6] = fields[7] = "*";
   3039     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
   3040     REGEX_CHECK_STATUS;
   3041     REGEX_ASSERT(n==7);
   3042     REGEX_ASSERT(fields[0]=="  ");
   3043     REGEX_ASSERT(fields[1]=="a");
   3044     REGEX_ASSERT(fields[2]=="Now is ");
   3045     REGEX_ASSERT(fields[3]=="b");
   3046     REGEX_ASSERT(fields[4]=="the time");
   3047     REGEX_ASSERT(fields[5]=="c");
   3048     REGEX_ASSERT(fields[6]=="");
   3049     REGEX_ASSERT(fields[7]=="*");
   3050 
   3051     status = U_ZERO_ERROR;
   3052     fields[6] = "foo";
   3053     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
   3054     REGEX_CHECK_STATUS;
   3055     REGEX_ASSERT(n==6);
   3056     REGEX_ASSERT(fields[0]=="  ");
   3057     REGEX_ASSERT(fields[1]=="a");
   3058     REGEX_ASSERT(fields[2]=="Now is ");
   3059     REGEX_ASSERT(fields[3]=="b");
   3060     REGEX_ASSERT(fields[4]=="the time");
   3061     REGEX_ASSERT(fields[5]==" ");
   3062     REGEX_ASSERT(fields[6]=="foo");
   3063 
   3064     status = U_ZERO_ERROR;
   3065     fields[5] = "foo";
   3066     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
   3067     REGEX_CHECK_STATUS;
   3068     REGEX_ASSERT(n==5);
   3069     REGEX_ASSERT(fields[0]=="  ");
   3070     REGEX_ASSERT(fields[1]=="a");
   3071     REGEX_ASSERT(fields[2]=="Now is ");
   3072     REGEX_ASSERT(fields[3]=="b");
   3073     REGEX_ASSERT(fields[4]=="the time<c>");
   3074     REGEX_ASSERT(fields[5]=="foo");
   3075 
   3076     status = U_ZERO_ERROR;
   3077     fields[5] = "foo";
   3078     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
   3079     REGEX_CHECK_STATUS;
   3080     REGEX_ASSERT(n==5);
   3081     REGEX_ASSERT(fields[0]=="  ");
   3082     REGEX_ASSERT(fields[1]=="a");
   3083     REGEX_ASSERT(fields[2]=="Now is ");
   3084     REGEX_ASSERT(fields[3]=="b");
   3085     REGEX_ASSERT(fields[4]=="the time");
   3086     REGEX_ASSERT(fields[5]=="foo");
   3087 
   3088     status = U_ZERO_ERROR;
   3089     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
   3090     REGEX_CHECK_STATUS;
   3091     REGEX_ASSERT(n==4);
   3092     REGEX_ASSERT(fields[0]=="  ");
   3093     REGEX_ASSERT(fields[1]=="a");
   3094     REGEX_ASSERT(fields[2]=="Now is ");
   3095     REGEX_ASSERT(fields[3]=="the time<c>");
   3096     status = U_ZERO_ERROR;
   3097     delete pat1;
   3098 
   3099     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
   3100     pat1 = RegexPattern::compile(&re1, pe, status);
   3101     REGEX_CHECK_STATUS;
   3102     n = pat1->split("1-10,20", fields, 10, status);
   3103     REGEX_CHECK_STATUS;
   3104     REGEX_ASSERT(n==5);
   3105     REGEX_ASSERT(fields[0]=="1");
   3106     REGEX_ASSERT(fields[1]=="-");
   3107     REGEX_ASSERT(fields[2]=="10");
   3108     REGEX_ASSERT(fields[3]==",");
   3109     REGEX_ASSERT(fields[4]=="20");
   3110     delete pat1;
   3111 
   3112 
   3113     //
   3114     // split of a UText based string, with library allocating output UTexts.
   3115     //
   3116     {
   3117         status = U_ZERO_ERROR;
   3118         RegexMatcher matcher(UnicodeString("(:)"), 0, status);
   3119         UnicodeString stringToSplit("first:second:third");
   3120         UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
   3121         REGEX_CHECK_STATUS;
   3122 
   3123         UText *splits[10] = {NULL};
   3124         int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
   3125         REGEX_CHECK_STATUS;
   3126         REGEX_ASSERT(numFields == 5);
   3127         REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
   3128         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
   3129         REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
   3130         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
   3131         REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
   3132         REGEX_ASSERT(splits[5] == NULL);
   3133 
   3134         for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
   3135             if (splits[i]) {
   3136                 utext_close(splits[i]);
   3137                 splits[i] = NULL;
   3138             }
   3139         }
   3140         utext_close(textToSplit);
   3141     }
   3142 
   3143 
   3144     //
   3145     // RegexPattern::pattern() and patternText()
   3146     //
   3147     pat1 = new RegexPattern();
   3148     REGEX_ASSERT(pat1->pattern() == "");
   3149     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
   3150     delete pat1;
   3151     const char *helloWorldInvariant = "(Hello, world)*";
   3152     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
   3153     pat1 = RegexPattern::compile(&re1, pe, status);
   3154     REGEX_CHECK_STATUS;
   3155     REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
   3156     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
   3157     delete pat1;
   3158 
   3159     utext_close(&re1);
   3160 }
   3161 
   3162 
   3163 //---------------------------------------------------------------------------
   3164 //
   3165 //      Extended       A more thorough check for features of regex patterns
   3166 //                     The test cases are in a separate data file,
   3167 //                       source/test/testdata/regextst.txt
   3168 //                     A description of the test data format is included in that file.
   3169 //
   3170 //---------------------------------------------------------------------------
   3171 
   3172 const char *
   3173 RegexTest::getPath(char buffer[2048], const char *filename) {
   3174     UErrorCode status=U_ZERO_ERROR;
   3175     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   3176     if (U_FAILURE(status)) {
   3177         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
   3178         return NULL;
   3179     }
   3180 
   3181     strcpy(buffer, testDataDirectory);
   3182     strcat(buffer, filename);
   3183     return buffer;
   3184 }
   3185 
   3186 void RegexTest::Extended() {
   3187     char tdd[2048];
   3188     const char *srcPath;
   3189     UErrorCode  status  = U_ZERO_ERROR;
   3190     int32_t     lineNum = 0;
   3191 
   3192     //
   3193     //  Open and read the test data file.
   3194     //
   3195     srcPath=getPath(tdd, "regextst.txt");
   3196     if(srcPath==NULL) {
   3197         return; /* something went wrong, error already output */
   3198     }
   3199 
   3200     int32_t    len;
   3201     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
   3202     if (U_FAILURE(status)) {
   3203         return; /* something went wrong, error already output */
   3204     }
   3205 
   3206     //
   3207     //  Put the test data into a UnicodeString
   3208     //
   3209     UnicodeString testString(FALSE, testData, len);
   3210 
   3211     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
   3212     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
   3213     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
   3214 
   3215     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
   3216     UnicodeString   testPattern;   // The pattern for test from the test file.
   3217     UnicodeString   testFlags;     // the flags   for a test.
   3218     UnicodeString   matchString;   // The marked up string to be used as input
   3219 
   3220     if (U_FAILURE(status)){
   3221         dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
   3222         delete [] testData;
   3223         return;
   3224     }
   3225 
   3226     //
   3227     //  Loop over the test data file, once per line.
   3228     //
   3229     while (lineMat.find()) {
   3230         lineNum++;
   3231         if (U_FAILURE(status)) {
   3232           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
   3233         }
   3234 
   3235         status = U_ZERO_ERROR;
   3236         UnicodeString testLine = lineMat.group(1, status);
   3237         if (testLine.length() == 0) {
   3238             continue;
   3239         }
   3240 
   3241         //
   3242         // Parse the test line.  Skip blank and comment only lines.
   3243         // Separate out the three main fields - pattern, flags, target.
   3244         //
   3245 
   3246         commentMat.reset(testLine);
   3247         if (commentMat.lookingAt(status)) {
   3248             // This line is a comment, or blank.
   3249             continue;
   3250         }
   3251 
   3252         //
   3253         //  Pull out the pattern field, remove it from the test file line.
   3254         //
   3255         quotedStuffMat.reset(testLine);
   3256         if (quotedStuffMat.lookingAt(status)) {
   3257             testPattern = quotedStuffMat.group(2, status);
   3258             testLine.remove(0, quotedStuffMat.end(0, status));
   3259         } else {
   3260             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
   3261             continue;
   3262         }
   3263 
   3264 
   3265         //
   3266         //  Pull out the flags from the test file line.
   3267         //
   3268         flagsMat.reset(testLine);
   3269         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
   3270         testFlags = flagsMat.group(1, status);
   3271         if (flagsMat.group(2, status).length() > 0) {
   3272             errln("Bad Match flag at line %d. Scanning %c\n",
   3273                 lineNum, flagsMat.group(2, status).charAt(0));
   3274             continue;
   3275         }
   3276         testLine.remove(0, flagsMat.end(0, status));
   3277 
   3278         //
   3279         //  Pull out the match string, as a whole.
   3280         //    We'll process the <tags> later.
   3281         //
   3282         quotedStuffMat.reset(testLine);
   3283         if (quotedStuffMat.lookingAt(status)) {
   3284             matchString = quotedStuffMat.group(2, status);
   3285             testLine.remove(0, quotedStuffMat.end(0, status));
   3286         } else {
   3287             errln("Bad match string at test file line %d", lineNum);
   3288             continue;
   3289         }
   3290 
   3291         //
   3292         //  The only thing left from the input line should be an optional trailing comment.
   3293         //
   3294         commentMat.reset(testLine);
   3295         if (commentMat.lookingAt(status) == FALSE) {
   3296             errln("Line %d: unexpected characters at end of test line.", lineNum);
   3297             continue;
   3298         }
   3299 
   3300         //
   3301         //  Run the test
   3302         //
   3303         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
   3304     }
   3305 
   3306     delete [] testData;
   3307 
   3308 }
   3309 
   3310 
   3311 
   3312 //---------------------------------------------------------------------------
   3313 //
   3314 //    regex_find(pattern, flags, inputString, lineNumber)
   3315 //
   3316 //         Function to run a single test from the Extended (data driven) tests.
   3317 //         See file test/testdata/regextst.txt for a description of the
   3318 //         pattern and inputString fields, and the allowed flags.
   3319 //         lineNumber is the source line in regextst.txt of the test.
   3320 //
   3321 //---------------------------------------------------------------------------
   3322 
   3323 
   3324 //  Set a value into a UVector at position specified by a decimal number in
   3325 //   a UnicodeString.   This is a utility function needed by the actual test function,
   3326 //   which follows.
   3327 static void set(UVector &vec, int32_t val, UnicodeString index) {
   3328     UErrorCode  status=U_ZERO_ERROR;
   3329     int32_t  idx = 0;
   3330     for (int32_t i=0; i<index.length(); i++) {
   3331         int32_t d=u_charDigitValue(index.charAt(i));
   3332         if (d<0) {return;}
   3333         idx = idx*10 + d;
   3334     }
   3335     while (vec.size()<idx+1) {vec.addElement(-1, status);}
   3336     vec.setElementAt(val, idx);
   3337 }
   3338 
   3339 static void setInt(UVector &vec, int32_t val, int32_t idx) {
   3340     UErrorCode  status=U_ZERO_ERROR;
   3341     while (vec.size()<idx+1) {vec.addElement(-1, status);}
   3342     vec.setElementAt(val, idx);
   3343 }
   3344 
   3345 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
   3346 {
   3347     UBool couldFind = TRUE;
   3348     UTEXT_SETNATIVEINDEX(utext, 0);
   3349     int32_t i = 0;
   3350     while (i < unistrOffset) {
   3351         UChar32 c = UTEXT_NEXT32(utext);
   3352         if (c != U_SENTINEL) {
   3353             i += U16_LENGTH(c);
   3354         } else {
   3355             couldFind = FALSE;
   3356             break;
   3357         }
   3358     }
   3359     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
   3360     return couldFind;
   3361 }
   3362 
   3363 
   3364 void RegexTest::regex_find(const UnicodeString &pattern,
   3365                            const UnicodeString &flags,
   3366                            const UnicodeString &inputString,
   3367                            const char *srcPath,
   3368                            int32_t line) {
   3369     UnicodeString       unEscapedInput;
   3370     UnicodeString       deTaggedInput;
   3371 
   3372     int32_t             patternUTF8Length,      inputUTF8Length;
   3373     char                *patternChars  = NULL, *inputChars = NULL;
   3374     UText               patternText    = UTEXT_INITIALIZER;
   3375     UText               inputText      = UTEXT_INITIALIZER;
   3376     UConverter          *UTF8Converter = NULL;
   3377 
   3378     UErrorCode          status         = U_ZERO_ERROR;
   3379     UParseError         pe;
   3380     RegexPattern        *parsePat      = NULL;
   3381     RegexMatcher        *parseMatcher  = NULL;
   3382     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
   3383     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
   3384     UVector             groupStarts(status);
   3385     UVector             groupEnds(status);
   3386     UVector             groupStartsUTF8(status);
   3387     UVector             groupEndsUTF8(status);
   3388     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
   3389     UBool               failed         = FALSE;
   3390     int32_t             numFinds;
   3391     int32_t             i;
   3392     UBool               useMatchesFunc   = FALSE;
   3393     UBool               useLookingAtFunc = FALSE;
   3394     int32_t             regionStart      = -1;
   3395     int32_t             regionEnd        = -1;
   3396     int32_t             regionStartUTF8  = -1;
   3397     int32_t             regionEndUTF8    = -1;
   3398 
   3399 
   3400     //
   3401     //  Compile the caller's pattern
   3402     //
   3403     uint32_t bflags = 0;
   3404     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
   3405         bflags |= UREGEX_CASE_INSENSITIVE;
   3406     }
   3407     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
   3408         bflags |= UREGEX_COMMENTS;
   3409     }
   3410     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
   3411         bflags |= UREGEX_DOTALL;
   3412     }
   3413     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
   3414         bflags |= UREGEX_MULTILINE;
   3415     }
   3416 
   3417     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
   3418         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
   3419     }
   3420     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
   3421         bflags |= UREGEX_UNIX_LINES;
   3422     }
   3423     if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
   3424         bflags |= UREGEX_LITERAL;
   3425     }
   3426 
   3427 
   3428     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
   3429     if (status != U_ZERO_ERROR) {
   3430         #if UCONFIG_NO_BREAK_ITERATION==1
   3431         // 'v' test flag means that the test pattern should not compile if ICU was configured
   3432         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
   3433         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
   3434             goto cleanupAndReturn;
   3435         }
   3436         #endif
   3437         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
   3438             // Expected pattern compilation error.
   3439             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
   3440                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
   3441             }
   3442             goto cleanupAndReturn;
   3443         } else {
   3444             // Unexpected pattern compilation error.
   3445             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
   3446             goto cleanupAndReturn;
   3447         }
   3448     }
   3449 
   3450     UTF8Converter = ucnv_open("UTF8", &status);
   3451     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
   3452 
   3453     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
   3454     status = U_ZERO_ERROR; // buffer overflow
   3455     patternChars = new char[patternUTF8Length+1];
   3456     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
   3457     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
   3458 
   3459     if (status == U_ZERO_ERROR) {
   3460         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
   3461 
   3462         if (status != U_ZERO_ERROR) {
   3463 #if UCONFIG_NO_BREAK_ITERATION==1
   3464             // 'v' test flag means that the test pattern should not compile if ICU was configured
   3465             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
   3466             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
   3467                 goto cleanupAndReturn;
   3468             }
   3469 #endif
   3470             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
   3471                 // Expected pattern compilation error.
   3472                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
   3473                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
   3474                 }
   3475                 goto cleanupAndReturn;
   3476             } else {
   3477                 // Unexpected pattern compilation error.
   3478                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
   3479                 goto cleanupAndReturn;
   3480             }
   3481         }
   3482     }
   3483 
   3484     if (UTF8Pattern == NULL) {
   3485         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
   3486         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
   3487         status = U_ZERO_ERROR;
   3488     }
   3489 
   3490     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
   3491         callerPattern->dumpPattern();
   3492     }
   3493 
   3494     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
   3495         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
   3496         goto cleanupAndReturn;
   3497     }
   3498 
   3499 
   3500     //
   3501     // Number of times find() should be called on the test string, default to 1
   3502     //
   3503     numFinds = 1;
   3504     for (i=2; i<=9; i++) {
   3505         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
   3506             if (numFinds != 1) {
   3507                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
   3508                 goto cleanupAndReturn;
   3509             }
   3510             numFinds = i;
   3511         }
   3512     }
   3513 
   3514     // 'M' flag.  Use matches() instead of find()
   3515     if (flags.indexOf((UChar)0x4d) >= 0) {
   3516         useMatchesFunc = TRUE;
   3517     }
   3518     if (flags.indexOf((UChar)0x4c) >= 0) {
   3519         useLookingAtFunc = TRUE;
   3520     }
   3521 
   3522     //
   3523     //  Find the tags in the input data, remove them, and record the group boundary
   3524     //    positions.
   3525     //
   3526     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
   3527     REGEX_CHECK_STATUS_L(line);
   3528 
   3529     unEscapedInput = inputString.unescape();
   3530     parseMatcher = parsePat->matcher(unEscapedInput, status);
   3531     REGEX_CHECK_STATUS_L(line);
   3532     while(parseMatcher->find()) {
   3533         parseMatcher->appendReplacement(deTaggedInput, "", status);
   3534         REGEX_CHECK_STATUS;
   3535         UnicodeString groupNum = parseMatcher->group(2, status);
   3536         if (groupNum == "r") {
   3537             // <r> or </r>, a region specification within the string
   3538             if (parseMatcher->group(1, status) == "/") {
   3539                 regionEnd = deTaggedInput.length();
   3540             } else {
   3541                 regionStart = deTaggedInput.length();
   3542             }
   3543         } else {
   3544             // <digits> or </digits>, a group match boundary tag.
   3545             if (parseMatcher->group(1, status) == "/") {
   3546                 set(groupEnds, deTaggedInput.length(), groupNum);
   3547             } else {
   3548                 set(groupStarts, deTaggedInput.length(), groupNum);
   3549             }
   3550         }
   3551     }
   3552     parseMatcher->appendTail(deTaggedInput);
   3553     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
   3554     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
   3555       errln("mismatched <r> tags");
   3556       failed = TRUE;
   3557       goto cleanupAndReturn;
   3558     }
   3559 
   3560     //
   3561     //  Configure the matcher according to the flags specified with this test.
   3562     //
   3563     matcher = callerPattern->matcher(deTaggedInput, status);
   3564     REGEX_CHECK_STATUS_L(line);
   3565     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
   3566         matcher->setTrace(TRUE);
   3567     }
   3568 
   3569     if (UTF8Pattern != NULL) {
   3570         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
   3571         status = U_ZERO_ERROR; // buffer overflow
   3572         inputChars = new char[inputUTF8Length+1];
   3573         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
   3574         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
   3575 
   3576         if (status == U_ZERO_ERROR) {
   3577             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
   3578             REGEX_CHECK_STATUS_L(line);
   3579         }
   3580 
   3581         if (UTF8Matcher == NULL) {
   3582             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
   3583           logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
   3584             status = U_ZERO_ERROR;
   3585         }
   3586     }
   3587 
   3588     //
   3589     //  Generate native indices for UTF8 versions of region and capture group info
   3590     //
   3591     if (UTF8Matcher != NULL) {
   3592         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
   3593         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
   3594 
   3595         //  Fill out the native index UVector info.
   3596         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
   3597         for (i=0; i<groupStarts.size(); i++) {
   3598             int32_t  start = groupStarts.elementAti(i);
   3599             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
   3600             if (start >= 0) {
   3601                 int32_t  startUTF8;
   3602                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
   3603                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
   3604                     failed = TRUE;
   3605                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3606                 }
   3607                 setInt(groupStartsUTF8, startUTF8, i);
   3608             }
   3609 
   3610             int32_t  end = groupEnds.elementAti(i);
   3611             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
   3612             if (end >= 0) {
   3613                 int32_t  endUTF8;
   3614                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
   3615                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
   3616                     failed = TRUE;
   3617                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3618                 }
   3619                 setInt(groupEndsUTF8, endUTF8, i);
   3620             }
   3621         }
   3622     }
   3623 
   3624     if (regionStart>=0) {
   3625        matcher->region(regionStart, regionEnd, status);
   3626        REGEX_CHECK_STATUS_L(line);
   3627        if (UTF8Matcher != NULL) {
   3628            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
   3629            REGEX_CHECK_STATUS_L(line);
   3630        }
   3631     }
   3632     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
   3633         matcher->useAnchoringBounds(FALSE);
   3634         if (UTF8Matcher != NULL) {
   3635             UTF8Matcher->useAnchoringBounds(FALSE);
   3636         }
   3637     }
   3638     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
   3639         matcher->useTransparentBounds(TRUE);
   3640         if (UTF8Matcher != NULL) {
   3641             UTF8Matcher->useTransparentBounds(TRUE);
   3642         }
   3643     }
   3644 
   3645 
   3646 
   3647     //
   3648     // Do a find on the de-tagged input using the caller's pattern
   3649     //     TODO: error on count>1 and not find().
   3650     //           error on both matches() and lookingAt().
   3651     //
   3652     for (i=0; i<numFinds; i++) {
   3653         if (useMatchesFunc) {
   3654             isMatch = matcher->matches(status);
   3655             if (UTF8Matcher != NULL) {
   3656                isUTF8Match = UTF8Matcher->matches(status);
   3657             }
   3658         } else  if (useLookingAtFunc) {
   3659             isMatch = matcher->lookingAt(status);
   3660             if (UTF8Matcher != NULL) {
   3661                 isUTF8Match = UTF8Matcher->lookingAt(status);
   3662             }
   3663         } else {
   3664             isMatch = matcher->find();
   3665             if (UTF8Matcher != NULL) {
   3666                 isUTF8Match = UTF8Matcher->find();
   3667             }
   3668         }
   3669     }
   3670     matcher->setTrace(FALSE);
   3671     if (U_FAILURE(status)) {
   3672         errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
   3673     }
   3674 
   3675     //
   3676     // Match up the groups from the find() with the groups from the tags
   3677     //
   3678 
   3679     // number of tags should match number of groups from find operation.
   3680     // matcher->groupCount does not include group 0, the entire match, hence the +1.
   3681     //   G option in test means that capture group data is not available in the
   3682     //     expected results, so the check needs to be suppressed.
   3683     if (isMatch == FALSE && groupStarts.size() != 0) {
   3684         dataerrln("Error at line %d:  Match expected, but none found.", line);
   3685         failed = TRUE;
   3686         goto cleanupAndReturn;
   3687     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
   3688         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
   3689         failed = TRUE;
   3690         goto cleanupAndReturn;
   3691     }
   3692 
   3693     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
   3694         // Only check for match / no match.  Don't check capture groups.
   3695         if (isMatch && groupStarts.size() == 0) {
   3696             errln("Error at line %d:  No match expected, but one found.", line);
   3697             failed = TRUE;
   3698         } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
   3699             errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
   3700             failed = TRUE;
   3701         }
   3702         goto cleanupAndReturn;
   3703     }
   3704 
   3705     REGEX_CHECK_STATUS_L(line);
   3706     for (i=0; i<=matcher->groupCount(); i++) {
   3707         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
   3708         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
   3709         if (matcher->start(i, status) != expectedStart) {
   3710             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
   3711                 line, i, expectedStart, matcher->start(i, status));
   3712             failed = TRUE;
   3713             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3714         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
   3715             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
   3716                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
   3717             failed = TRUE;
   3718             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3719         }
   3720 
   3721         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
   3722         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
   3723         if (matcher->end(i, status) != expectedEnd) {
   3724             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
   3725                 line, i, expectedEnd, matcher->end(i, status));
   3726             failed = TRUE;
   3727             // Error on end position;  keep going; real error is probably yet to come as group
   3728             //   end positions work from end of the input data towards the front.
   3729         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
   3730             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
   3731                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
   3732             failed = TRUE;
   3733             // Error on end position;  keep going; real error is probably yet to come as group
   3734             //   end positions work from end of the input data towards the front.
   3735         }
   3736     }
   3737     if ( matcher->groupCount()+1 < groupStarts.size()) {
   3738         errln("Error at line %d: Expected %d capture groups, found %d.",
   3739             line, groupStarts.size()-1, matcher->groupCount());
   3740         failed = TRUE;
   3741         }
   3742     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
   3743         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
   3744               line, groupStarts.size()-1, UTF8Matcher->groupCount());
   3745         failed = TRUE;
   3746     }
   3747 
   3748     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3749         matcher->requireEnd() == TRUE) {
   3750         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
   3751         failed = TRUE;
   3752     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3753         UTF8Matcher->requireEnd() == TRUE) {
   3754         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
   3755         failed = TRUE;
   3756     }
   3757 
   3758     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
   3759         matcher->requireEnd() == FALSE) {
   3760         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
   3761         failed = TRUE;
   3762     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3763         UTF8Matcher->requireEnd() == FALSE) {
   3764         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
   3765         failed = TRUE;
   3766     }
   3767 
   3768     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
   3769         matcher->hitEnd() == TRUE) {
   3770         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
   3771         failed = TRUE;
   3772     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
   3773                UTF8Matcher->hitEnd() == TRUE) {
   3774         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
   3775         failed = TRUE;
   3776     }
   3777 
   3778     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
   3779         matcher->hitEnd() == FALSE) {
   3780         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
   3781         failed = TRUE;
   3782     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
   3783                UTF8Matcher->hitEnd() == FALSE) {
   3784         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
   3785         failed = TRUE;
   3786     }
   3787 
   3788 
   3789 cleanupAndReturn:
   3790     if (failed) {
   3791         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
   3792             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
   3793         // callerPattern->dump();
   3794     }
   3795     delete parseMatcher;
   3796     delete parsePat;
   3797     delete UTF8Matcher;
   3798     delete UTF8Pattern;
   3799     delete matcher;
   3800     delete callerPattern;
   3801 
   3802     utext_close(&inputText);
   3803     delete[] inputChars;
   3804     utext_close(&patternText);
   3805     delete[] patternChars;
   3806     ucnv_close(UTF8Converter);
   3807 }
   3808 
   3809 
   3810 
   3811 
   3812 //---------------------------------------------------------------------------
   3813 //
   3814 //      Errors     Check for error handling in patterns.
   3815 //
   3816 //---------------------------------------------------------------------------
   3817 void RegexTest::Errors() {
   3818     // \escape sequences that aren't implemented yet.
   3819     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
   3820 
   3821     // Missing close parentheses
   3822     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
   3823     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
   3824     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
   3825 
   3826     // Extra close paren
   3827     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
   3828     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
   3829     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
   3830 
   3831     // Look-ahead, Look-behind
   3832     //  TODO:  add tests for unbounded length look-behinds.
   3833     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
   3834 
   3835     // Attempt to use non-default flags
   3836     {
   3837         UParseError   pe;
   3838         UErrorCode    status = U_ZERO_ERROR;
   3839         int32_t       flags  = UREGEX_CANON_EQ |
   3840                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
   3841                                UREGEX_MULTILINE;
   3842         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
   3843         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
   3844         delete pat1;
   3845     }
   3846 
   3847 
   3848     // Quantifiers are allowed only after something that can be quantified.
   3849     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
   3850     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
   3851     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
   3852 
   3853     // Mal-formed {min,max} quantifiers
   3854     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
   3855     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
   3856     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
   3857     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
   3858     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
   3859     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
   3860     REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
   3861     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
   3862     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
   3863 
   3864     // Ticket 5389
   3865     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
   3866 
   3867     // Invalid Back Reference \0
   3868     //    For ICU 3.8 and earlier
   3869     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
   3870     //
   3871     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
   3872 
   3873 }
   3874 
   3875 
   3876 //-------------------------------------------------------------------------------
   3877 //
   3878 //  Read a text data file, convert it to UChars, and return the data
   3879 //    in one big UChar * buffer, which the caller must delete.
   3880 //
   3881 //--------------------------------------------------------------------------------
   3882 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
   3883                                      const char *defEncoding, UErrorCode &status) {
   3884     UChar       *retPtr  = NULL;
   3885     char        *fileBuf = NULL;
   3886     UConverter* conv     = NULL;
   3887     FILE        *f       = NULL;
   3888 
   3889     ulen = 0;
   3890     if (U_FAILURE(status)) {
   3891         return retPtr;
   3892     }
   3893 
   3894     //
   3895     //  Open the file.
   3896     //
   3897     f = fopen(fileName, "rb");
   3898     if (f == 0) {
   3899         dataerrln("Error opening test data file %s\n", fileName);
   3900         status = U_FILE_ACCESS_ERROR;
   3901         return NULL;
   3902     }
   3903     //
   3904     //  Read it in
   3905     //
   3906     int32_t            fileSize;
   3907     int32_t            amt_read;
   3908 
   3909     fseek( f, 0, SEEK_END);
   3910     fileSize = ftell(f);
   3911     fileBuf = new char[fileSize];
   3912     fseek(f, 0, SEEK_SET);
   3913     amt_read = fread(fileBuf, 1, fileSize, f);
   3914     if (amt_read != fileSize || fileSize <= 0) {
   3915         errln("Error reading test data file.");
   3916         goto cleanUpAndReturn;
   3917     }
   3918 
   3919     //
   3920     // Look for a Unicode Signature (BOM) on the data just read
   3921     //
   3922     int32_t        signatureLength;
   3923     const char *   fileBufC;
   3924     const char*    encoding;
   3925 
   3926     fileBufC = fileBuf;
   3927     encoding = ucnv_detectUnicodeSignature(
   3928         fileBuf, fileSize, &signatureLength, &status);
   3929     if(encoding!=NULL ){
   3930         fileBufC  += signatureLength;
   3931         fileSize  -= signatureLength;
   3932     } else {
   3933         encoding = defEncoding;
   3934         if (strcmp(encoding, "utf-8") == 0) {
   3935             errln("file %s is missing its BOM", fileName);
   3936         }
   3937     }
   3938 
   3939     //
   3940     // Open a converter to take the rule file to UTF-16
   3941     //
   3942     conv = ucnv_open(encoding, &status);
   3943     if (U_FAILURE(status)) {
   3944         goto cleanUpAndReturn;
   3945     }
   3946 
   3947     //
   3948     // Convert the rules to UChar.
   3949     //  Preflight first to determine required buffer size.
   3950     //
   3951     ulen = ucnv_toUChars(conv,
   3952         NULL,           //  dest,
   3953         0,              //  destCapacity,
   3954         fileBufC,
   3955         fileSize,
   3956         &status);
   3957     if (status == U_BUFFER_OVERFLOW_ERROR) {
   3958         // Buffer Overflow is expected from the preflight operation.
   3959         status = U_ZERO_ERROR;
   3960 
   3961         retPtr = new UChar[ulen+1];
   3962         ucnv_toUChars(conv,
   3963             retPtr,       //  dest,
   3964             ulen+1,
   3965             fileBufC,
   3966             fileSize,
   3967             &status);
   3968     }
   3969 
   3970 cleanUpAndReturn:
   3971     fclose(f);
   3972     delete[] fileBuf;
   3973     ucnv_close(conv);
   3974     if (U_FAILURE(status)) {
   3975         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   3976         delete []retPtr;
   3977         retPtr = 0;
   3978         ulen   = 0;
   3979     };
   3980     return retPtr;
   3981 }
   3982 
   3983 
   3984 //-------------------------------------------------------------------------------
   3985 //
   3986 //   PerlTests  - Run Perl's regular expression tests
   3987 //                The input file for this test is re_tests, the standard regular
   3988 //                expression test data distributed with the Perl source code.
   3989 //
   3990 //                Here is Perl's description of the test data file:
   3991 //
   3992 //        # The tests are in a separate file 't/op/re_tests'.
   3993 //        # Each line in that file is a separate test.
   3994 //        # There are five columns, separated by tabs.
   3995 //        #
   3996 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
   3997 //        # Modifiers can be put after the closing C<'>.
   3998 //        #
   3999 //        # Column 2 contains the string to be matched.
   4000 //        #
   4001 //        # Column 3 contains the expected result:
   4002 //        #     y   expect a match
   4003 //        #     n   expect no match
   4004 //        #     c   expect an error
   4005 //        # B   test exposes a known bug in Perl, should be skipped
   4006 //        # b   test exposes a known bug in Perl, should be skipped if noamp
   4007 //        #
   4008 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
   4009 //        #
   4010 //        # Column 4 contains a string, usually C<$&>.
   4011 //        #
   4012 //        # Column 5 contains the expected result of double-quote
   4013 //        # interpolating that string after the match, or start of error message.
   4014 //        #
   4015 //        # Column 6, if present, contains a reason why the test is skipped.
   4016 //        # This is printed with "skipped", for harness to pick up.
   4017 //        #
   4018 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
   4019 //        #
   4020 //        # If you want to add a regular expression test that can't be expressed
   4021 //        # in this format, don't add it here: put it in op/pat.t instead.
   4022 //
   4023 //        For ICU, if field 3 contains an 'i', the test will be skipped.
   4024 //        The test exposes some known incompatibility between ICU and Perl regexps.
   4025 //        (The i is in addition to whatever was there before.)
   4026 //
   4027 //-------------------------------------------------------------------------------
   4028 void RegexTest::PerlTests() {
   4029     char tdd[2048];
   4030     const char *srcPath;
   4031     UErrorCode  status = U_ZERO_ERROR;
   4032     UParseError pe;
   4033 
   4034     //
   4035     //  Open and read the test data file.
   4036     //
   4037     srcPath=getPath(tdd, "re_tests.txt");
   4038     if(srcPath==NULL) {
   4039         return; /* something went wrong, error already output */
   4040     }
   4041 
   4042     int32_t    len;
   4043     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
   4044     if (U_FAILURE(status)) {
   4045         return; /* something went wrong, error already output */
   4046     }
   4047 
   4048     //
   4049     //  Put the test data into a UnicodeString
   4050     //
   4051     UnicodeString testDataString(FALSE, testData, len);
   4052 
   4053     //
   4054     //  Regex to break the input file into lines, and strip the new lines.
   4055     //     One line per match, capture group one is the desired data.
   4056     //
   4057     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
   4058     if (U_FAILURE(status)) {
   4059         dataerrln("RegexPattern::compile() error");
   4060         return;
   4061     }
   4062     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
   4063 
   4064     //
   4065     //  Regex to split a test file line into fields.
   4066     //    There are six fields, separated by tabs.
   4067     //
   4068     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
   4069 
   4070     //
   4071     //  Regex to identify test patterns with flag settings, and to separate them.
   4072     //    Test patterns with flags look like 'pattern'i
   4073     //    Test patterns without flags are not quoted:   pattern
   4074     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
   4075     //
   4076     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
   4077     RegexMatcher* flagMat = flagPat->matcher(status);
   4078 
   4079     //
   4080     // The Perl tests reference several perl-isms, which are evaluated/substituted
   4081     //   in the test data.  Not being perl, this must be done explicitly.  Here
   4082     //   are string constants and REs for these constructs.
   4083     //
   4084     UnicodeString nulnulSrc("${nulnul}");
   4085     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
   4086     nulnul = nulnul.unescape();
   4087 
   4088     UnicodeString ffffSrc("${ffff}");
   4089     UnicodeString ffff("\\uffff", -1, US_INV);
   4090     ffff = ffff.unescape();
   4091 
   4092     //  regexp for $-[0], $+[2], etc.
   4093     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
   4094     RegexMatcher *groupsMat = groupsPat->matcher(status);
   4095 
   4096     //  regexp for $0, $1, $2, etc.
   4097     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
   4098     RegexMatcher *cgMat = cgPat->matcher(status);
   4099 
   4100 
   4101     //
   4102     // Main Loop for the Perl Tests, runs once per line from the
   4103     //   test data file.
   4104     //
   4105     int32_t  lineNum = 0;
   4106     int32_t  skippedUnimplementedCount = 0;
   4107     while (lineMat->find()) {
   4108         lineNum++;
   4109 
   4110         //
   4111         //  Get a line, break it into its fields, do the Perl
   4112         //    variable substitutions.
   4113         //
   4114         UnicodeString line = lineMat->group(1, status);
   4115         UnicodeString fields[7];
   4116         fieldPat->split(line, fields, 7, status);
   4117 
   4118         flagMat->reset(fields[0]);
   4119         flagMat->matches(status);
   4120         UnicodeString pattern  = flagMat->group(2, status);
   4121         pattern.findAndReplace("${bang}", "!");
   4122         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
   4123         pattern.findAndReplace(ffffSrc, ffff);
   4124 
   4125         //
   4126         //  Identify patterns that include match flag settings,
   4127         //    split off the flags, remove the extra quotes.
   4128         //
   4129         UnicodeString flagStr = flagMat->group(3, status);
   4130         if (U_FAILURE(status)) {
   4131             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   4132             return;
   4133         }
   4134         int32_t flags = 0;
   4135         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
   4136         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
   4137         const UChar UChar_m = 0x6d;
   4138         const UChar UChar_x = 0x78;
   4139         const UChar UChar_y = 0x79;
   4140         if (flagStr.indexOf(UChar_i) != -1) {
   4141             flags |= UREGEX_CASE_INSENSITIVE;
   4142         }
   4143         if (flagStr.indexOf(UChar_m) != -1) {
   4144             flags |= UREGEX_MULTILINE;
   4145         }
   4146         if (flagStr.indexOf(UChar_x) != -1) {
   4147             flags |= UREGEX_COMMENTS;
   4148         }
   4149 
   4150         //
   4151         // Compile the test pattern.
   4152         //
   4153         status = U_ZERO_ERROR;
   4154         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
   4155         if (status == U_REGEX_UNIMPLEMENTED) {
   4156             //
   4157             // Test of a feature that is planned for ICU, but not yet implemented.
   4158             //   skip the test.
   4159             skippedUnimplementedCount++;
   4160             delete testPat;
   4161             status = U_ZERO_ERROR;
   4162             continue;
   4163         }
   4164 
   4165         if (U_FAILURE(status)) {
   4166             // Some tests are supposed to generate errors.
   4167             //   Only report an error for tests that are supposed to succeed.
   4168             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
   4169                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
   4170             {
   4171                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
   4172             }
   4173             status = U_ZERO_ERROR;
   4174             delete testPat;
   4175             continue;
   4176         }
   4177 
   4178         if (fields[2].indexOf(UChar_i) >= 0) {
   4179             // ICU should skip this test.
   4180             delete testPat;
   4181             continue;
   4182         }
   4183 
   4184         if (fields[2].indexOf(UChar_c) >= 0) {
   4185             // This pattern should have caused a compilation error, but didn't/
   4186             errln("line %d: Expected a pattern compile error, got success.", lineNum);
   4187             delete testPat;
   4188             continue;
   4189         }
   4190 
   4191         //
   4192         // replace the Perl variables that appear in some of the
   4193         //   match data strings.
   4194         //
   4195         UnicodeString matchString = fields[1];
   4196         matchString.findAndReplace(nulnulSrc, nulnul);
   4197         matchString.findAndReplace(ffffSrc,   ffff);
   4198 
   4199         // Replace any \n in the match string with an actual new-line char.
   4200         //  Don't do full unescape, as this unescapes more than Perl does, which
   4201         //  causes other spurious failures in the tests.
   4202         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4203 
   4204 
   4205 
   4206         //
   4207         // Run the test, check for expected match/don't match result.
   4208         //
   4209         RegexMatcher *testMat = testPat->matcher(matchString, status);
   4210         UBool found = testMat->find();
   4211         UBool expected = FALSE;
   4212         if (fields[2].indexOf(UChar_y) >=0) {
   4213             expected = TRUE;
   4214         }
   4215         if (expected != found) {
   4216             errln("line %d: Expected %smatch, got %smatch",
   4217                 lineNum, expected?"":"no ", found?"":"no " );
   4218             continue;
   4219         }
   4220 
   4221         // Don't try to check expected results if there is no match.
   4222         //   (Some have stuff in the expected fields)
   4223         if (!found) {
   4224             delete testMat;
   4225             delete testPat;
   4226             continue;
   4227         }
   4228 
   4229         //
   4230         // Interpret the Perl expression from the fourth field of the data file,
   4231         // building up an ICU string from the results of the ICU match.
   4232         //   The Perl expression will contain references to the results of
   4233         //     a regex match, including the matched string, capture group strings,
   4234         //     group starting and ending indicies, etc.
   4235         //
   4236         UnicodeString resultString;
   4237         UnicodeString perlExpr = fields[3];
   4238 #if SUPPORT_MUTATING_INPUT_STRING
   4239         groupsMat->reset(perlExpr);
   4240         cgMat->reset(perlExpr);
   4241 #endif
   4242 
   4243         while (perlExpr.length() > 0) {
   4244 #if !SUPPORT_MUTATING_INPUT_STRING
   4245             //  Perferred usage.  Reset after any modification to input string.
   4246             groupsMat->reset(perlExpr);
   4247             cgMat->reset(perlExpr);
   4248 #endif
   4249 
   4250             if (perlExpr.startsWith("$&")) {
   4251                 resultString.append(testMat->group(status));
   4252                 perlExpr.remove(0, 2);
   4253             }
   4254 
   4255             else if (groupsMat->lookingAt(status)) {
   4256                 // $-[0]   $+[2]  etc.
   4257                 UnicodeString digitString = groupsMat->group(2, status);
   4258                 int32_t t = 0;
   4259                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4260                 UnicodeString plusOrMinus = groupsMat->group(1, status);
   4261                 int32_t matchPosition;
   4262                 if (plusOrMinus.compare("+") == 0) {
   4263                     matchPosition = testMat->end(groupNum, status);
   4264                 } else {
   4265                     matchPosition = testMat->start(groupNum, status);
   4266                 }
   4267                 if (matchPosition != -1) {
   4268                     ICU_Utility::appendNumber(resultString, matchPosition);
   4269                 }
   4270                 perlExpr.remove(0, groupsMat->end(status));
   4271             }
   4272 
   4273             else if (cgMat->lookingAt(status)) {
   4274                 // $1, $2, $3, etc.
   4275                 UnicodeString digitString = cgMat->group(1, status);
   4276                 int32_t t = 0;
   4277                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4278                 if (U_SUCCESS(status)) {
   4279                     resultString.append(testMat->group(groupNum, status));
   4280                     status = U_ZERO_ERROR;
   4281                 }
   4282                 perlExpr.remove(0, cgMat->end(status));
   4283             }
   4284 
   4285             else if (perlExpr.startsWith("@-")) {
   4286                 int32_t i;
   4287                 for (i=0; i<=testMat->groupCount(); i++) {
   4288                     if (i>0) {
   4289                         resultString.append(" ");
   4290                     }
   4291                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
   4292                 }
   4293                 perlExpr.remove(0, 2);
   4294             }
   4295 
   4296             else if (perlExpr.startsWith("@+")) {
   4297                 int32_t i;
   4298                 for (i=0; i<=testMat->groupCount(); i++) {
   4299                     if (i>0) {
   4300                         resultString.append(" ");
   4301                     }
   4302                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
   4303                 }
   4304                 perlExpr.remove(0, 2);
   4305             }
   4306 
   4307             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
   4308                                                      //           or as an escaped sequence (e.g. \n)
   4309                 if (perlExpr.length() > 1) {
   4310                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
   4311                 }
   4312                 UChar c = perlExpr.charAt(0);
   4313                 switch (c) {
   4314                 case 'n':   c = '\n'; break;
   4315                 // add any other escape sequences that show up in the test expected results.
   4316                 }
   4317                 resultString.append(c);
   4318                 perlExpr.remove(0, 1);
   4319             }
   4320 
   4321             else  {
   4322                 // Any characters from the perl expression that we don't explicitly
   4323                 //  recognize before here are assumed to be literals and copied
   4324                 //  as-is to the expected results.
   4325                 resultString.append(perlExpr.charAt(0));
   4326                 perlExpr.remove(0, 1);
   4327             }
   4328 
   4329             if (U_FAILURE(status)) {
   4330                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   4331                 break;
   4332             }
   4333         }
   4334 
   4335         //
   4336         // Expected Results Compare
   4337         //
   4338         UnicodeString expectedS(fields[4]);
   4339         expectedS.findAndReplace(nulnulSrc, nulnul);
   4340         expectedS.findAndReplace(ffffSrc,   ffff);
   4341         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4342 
   4343 
   4344         if (expectedS.compare(resultString) != 0) {
   4345             err("Line %d: Incorrect perl expression results.", lineNum);
   4346             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
   4347         }
   4348 
   4349         delete testMat;
   4350         delete testPat;
   4351     }
   4352 
   4353     //
   4354     // All done.  Clean up allocated stuff.
   4355     //
   4356     delete cgMat;
   4357     delete cgPat;
   4358 
   4359     delete groupsMat;
   4360     delete groupsPat;
   4361 
   4362     delete flagMat;
   4363     delete flagPat;
   4364 
   4365     delete lineMat;
   4366     delete linePat;
   4367 
   4368     delete fieldPat;
   4369     delete [] testData;
   4370 
   4371 
   4372     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
   4373 
   4374 }
   4375 
   4376 
   4377 //-------------------------------------------------------------------------------
   4378 //
   4379 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
   4380 //                  (instead of using UnicodeStrings) to test the alternate engine.
   4381 //                  The input file for this test is re_tests, the standard regular
   4382 //                  expression test data distributed with the Perl source code.
   4383 //                  See PerlTests() for more information.
   4384 //
   4385 //-------------------------------------------------------------------------------
   4386 void RegexTest::PerlTestsUTF8() {
   4387     char tdd[2048];
   4388     const char *srcPath;
   4389     UErrorCode  status = U_ZERO_ERROR;
   4390     UParseError pe;
   4391     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
   4392     UText       patternText = UTEXT_INITIALIZER;
   4393     char       *patternChars = NULL;
   4394     int32_t     patternLength;
   4395     int32_t     patternCapacity = 0;
   4396     UText       inputText = UTEXT_INITIALIZER;
   4397     char       *inputChars = NULL;
   4398     int32_t     inputLength;
   4399     int32_t     inputCapacity = 0;
   4400 
   4401     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
   4402 
   4403     //
   4404     //  Open and read the test data file.
   4405     //
   4406     srcPath=getPath(tdd, "re_tests.txt");
   4407     if(srcPath==NULL) {
   4408         return; /* something went wrong, error already output */
   4409     }
   4410 
   4411     int32_t    len;
   4412     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
   4413     if (U_FAILURE(status)) {
   4414         return; /* something went wrong, error already output */
   4415     }
   4416 
   4417     //
   4418     //  Put the test data into a UnicodeString
   4419     //
   4420     UnicodeString testDataString(FALSE, testData, len);
   4421 
   4422     //
   4423     //  Regex to break the input file into lines, and strip the new lines.
   4424     //     One line per match, capture group one is the desired data.
   4425     //
   4426     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
   4427     if (U_FAILURE(status)) {
   4428         dataerrln("RegexPattern::compile() error");
   4429         return;
   4430     }
   4431     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
   4432 
   4433     //
   4434     //  Regex to split a test file line into fields.
   4435     //    There are six fields, separated by tabs.
   4436     //
   4437     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
   4438 
   4439     //
   4440     //  Regex to identify test patterns with flag settings, and to separate them.
   4441     //    Test patterns with flags look like 'pattern'i
   4442     //    Test patterns without flags are not quoted:   pattern
   4443     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
   4444     //
   4445     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
   4446     RegexMatcher* flagMat = flagPat->matcher(status);
   4447 
   4448     //
   4449     // The Perl tests reference several perl-isms, which are evaluated/substituted
   4450     //   in the test data.  Not being perl, this must be done explicitly.  Here
   4451     //   are string constants and REs for these constructs.
   4452     //
   4453     UnicodeString nulnulSrc("${nulnul}");
   4454     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
   4455     nulnul = nulnul.unescape();
   4456 
   4457     UnicodeString ffffSrc("${ffff}");
   4458     UnicodeString ffff("\\uffff", -1, US_INV);
   4459     ffff = ffff.unescape();
   4460 
   4461     //  regexp for $-[0], $+[2], etc.
   4462     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
   4463     RegexMatcher *groupsMat = groupsPat->matcher(status);
   4464 
   4465     //  regexp for $0, $1, $2, etc.
   4466     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
   4467     RegexMatcher *cgMat = cgPat->matcher(status);
   4468 
   4469 
   4470     //
   4471     // Main Loop for the Perl Tests, runs once per line from the
   4472     //   test data file.
   4473     //
   4474     int32_t  lineNum = 0;
   4475     int32_t  skippedUnimplementedCount = 0;
   4476     while (lineMat->find()) {
   4477         lineNum++;
   4478 
   4479         //
   4480         //  Get a line, break it into its fields, do the Perl
   4481         //    variable substitutions.
   4482         //
   4483         UnicodeString line = lineMat->group(1, status);
   4484         UnicodeString fields[7];
   4485         fieldPat->split(line, fields, 7, status);
   4486 
   4487         flagMat->reset(fields[0]);
   4488         flagMat->matches(status);
   4489         UnicodeString pattern  = flagMat->group(2, status);
   4490         pattern.findAndReplace("${bang}", "!");
   4491         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
   4492         pattern.findAndReplace(ffffSrc, ffff);
   4493 
   4494         //
   4495         //  Identify patterns that include match flag settings,
   4496         //    split off the flags, remove the extra quotes.
   4497         //
   4498         UnicodeString flagStr = flagMat->group(3, status);
   4499         if (U_FAILURE(status)) {
   4500             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   4501             return;
   4502         }
   4503         int32_t flags = 0;
   4504         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
   4505         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
   4506         const UChar UChar_m = 0x6d;
   4507         const UChar UChar_x = 0x78;
   4508         const UChar UChar_y = 0x79;
   4509         if (flagStr.indexOf(UChar_i) != -1) {
   4510             flags |= UREGEX_CASE_INSENSITIVE;
   4511         }
   4512         if (flagStr.indexOf(UChar_m) != -1) {
   4513             flags |= UREGEX_MULTILINE;
   4514         }
   4515         if (flagStr.indexOf(UChar_x) != -1) {
   4516             flags |= UREGEX_COMMENTS;
   4517         }
   4518 
   4519         //
   4520         // Put the pattern in a UTF-8 UText
   4521         //
   4522         status = U_ZERO_ERROR;
   4523         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
   4524         if (status == U_BUFFER_OVERFLOW_ERROR) {
   4525             status = U_ZERO_ERROR;
   4526             delete[] patternChars;
   4527             patternCapacity = patternLength + 1;
   4528             patternChars = new char[patternCapacity];
   4529             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
   4530         }
   4531         utext_openUTF8(&patternText, patternChars, patternLength, &status);
   4532 
   4533         //
   4534         // Compile the test pattern.
   4535         //
   4536         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
   4537         if (status == U_REGEX_UNIMPLEMENTED) {
   4538             //
   4539             // Test of a feature that is planned for ICU, but not yet implemented.
   4540             //   skip the test.
   4541             skippedUnimplementedCount++;
   4542             delete testPat;
   4543             status = U_ZERO_ERROR;
   4544             continue;
   4545         }
   4546 
   4547         if (U_FAILURE(status)) {
   4548             // Some tests are supposed to generate errors.
   4549             //   Only report an error for tests that are supposed to succeed.
   4550             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
   4551                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
   4552             {
   4553                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
   4554             }
   4555             status = U_ZERO_ERROR;
   4556             delete testPat;
   4557             continue;
   4558         }
   4559 
   4560         if (fields[2].indexOf(UChar_i) >= 0) {
   4561             // ICU should skip this test.
   4562             delete testPat;
   4563             continue;
   4564         }
   4565 
   4566         if (fields[2].indexOf(UChar_c) >= 0) {
   4567             // This pattern should have caused a compilation error, but didn't/
   4568             errln("line %d: Expected a pattern compile error, got success.", lineNum);
   4569             delete testPat;
   4570             continue;
   4571         }
   4572 
   4573 
   4574         //
   4575         // replace the Perl variables that appear in some of the
   4576         //   match data strings.
   4577         //
   4578         UnicodeString matchString = fields[1];
   4579         matchString.findAndReplace(nulnulSrc, nulnul);
   4580         matchString.findAndReplace(ffffSrc,   ffff);
   4581 
   4582         // Replace any \n in the match string with an actual new-line char.
   4583         //  Don't do full unescape, as this unescapes more than Perl does, which
   4584         //  causes other spurious failures in the tests.
   4585         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4586 
   4587         //
   4588         // Put the input in a UTF-8 UText
   4589         //
   4590         status = U_ZERO_ERROR;
   4591         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
   4592         if (status == U_BUFFER_OVERFLOW_ERROR) {
   4593             status = U_ZERO_ERROR;
   4594             delete[] inputChars;
   4595             inputCapacity = inputLength + 1;
   4596             inputChars = new char[inputCapacity];
   4597             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
   4598         }
   4599         utext_openUTF8(&inputText, inputChars, inputLength, &status);
   4600 
   4601         //
   4602         // Run the test, check for expected match/don't match result.
   4603         //
   4604         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
   4605         UBool found = testMat->find();
   4606         UBool expected = FALSE;
   4607         if (fields[2].indexOf(UChar_y) >=0) {
   4608             expected = TRUE;
   4609         }
   4610         if (expected != found) {
   4611             errln("line %d: Expected %smatch, got %smatch",
   4612                 lineNum, expected?"":"no ", found?"":"no " );
   4613             continue;
   4614         }
   4615 
   4616         // Don't try to check expected results if there is no match.
   4617         //   (Some have stuff in the expected fields)
   4618         if (!found) {
   4619             delete testMat;
   4620             delete testPat;
   4621             continue;
   4622         }
   4623 
   4624         //
   4625         // Interpret the Perl expression from the fourth field of the data file,
   4626         // building up an ICU string from the results of the ICU match.
   4627         //   The Perl expression will contain references to the results of
   4628         //     a regex match, including the matched string, capture group strings,
   4629         //     group starting and ending indicies, etc.
   4630         //
   4631         UnicodeString resultString;
   4632         UnicodeString perlExpr = fields[3];
   4633 
   4634         while (perlExpr.length() > 0) {
   4635             groupsMat->reset(perlExpr);
   4636             cgMat->reset(perlExpr);
   4637 
   4638             if (perlExpr.startsWith("$&")) {
   4639                 resultString.append(testMat->group(status));
   4640                 perlExpr.remove(0, 2);
   4641             }
   4642 
   4643             else if (groupsMat->lookingAt(status)) {
   4644                 // $-[0]   $+[2]  etc.
   4645                 UnicodeString digitString = groupsMat->group(2, status);
   4646                 int32_t t = 0;
   4647                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4648                 UnicodeString plusOrMinus = groupsMat->group(1, status);
   4649                 int32_t matchPosition;
   4650                 if (plusOrMinus.compare("+") == 0) {
   4651                     matchPosition = testMat->end(groupNum, status);
   4652                 } else {
   4653                     matchPosition = testMat->start(groupNum, status);
   4654                 }
   4655                 if (matchPosition != -1) {
   4656                     ICU_Utility::appendNumber(resultString, matchPosition);
   4657                 }
   4658                 perlExpr.remove(0, groupsMat->end(status));
   4659             }
   4660 
   4661             else if (cgMat->lookingAt(status)) {
   4662                 // $1, $2, $3, etc.
   4663                 UnicodeString digitString = cgMat->group(1, status);
   4664                 int32_t t = 0;
   4665                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4666                 if (U_SUCCESS(status)) {
   4667                     resultString.append(testMat->group(groupNum, status));
   4668                     status = U_ZERO_ERROR;
   4669                 }
   4670                 perlExpr.remove(0, cgMat->end(status));
   4671             }
   4672 
   4673             else if (perlExpr.startsWith("@-")) {
   4674                 int32_t i;
   4675                 for (i=0; i<=testMat->groupCount(); i++) {
   4676                     if (i>0) {
   4677                         resultString.append(" ");
   4678                     }
   4679                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
   4680                 }
   4681                 perlExpr.remove(0, 2);
   4682             }
   4683 
   4684             else if (perlExpr.startsWith("@+")) {
   4685                 int32_t i;
   4686                 for (i=0; i<=testMat->groupCount(); i++) {
   4687                     if (i>0) {
   4688                         resultString.append(" ");
   4689                     }
   4690                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
   4691                 }
   4692                 perlExpr.remove(0, 2);
   4693             }
   4694 
   4695             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
   4696                                                      //           or as an escaped sequence (e.g. \n)
   4697                 if (perlExpr.length() > 1) {
   4698                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
   4699                 }
   4700                 UChar c = perlExpr.charAt(0);
   4701                 switch (c) {
   4702                 case 'n':   c = '\n'; break;
   4703                 // add any other escape sequences that show up in the test expected results.
   4704                 }
   4705                 resultString.append(c);
   4706                 perlExpr.remove(0, 1);
   4707             }
   4708 
   4709             else  {
   4710                 // Any characters from the perl expression that we don't explicitly
   4711                 //  recognize before here are assumed to be literals and copied
   4712                 //  as-is to the expected results.
   4713                 resultString.append(perlExpr.charAt(0));
   4714                 perlExpr.remove(0, 1);
   4715             }
   4716 
   4717             if (U_FAILURE(status)) {
   4718                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   4719                 break;
   4720             }
   4721         }
   4722 
   4723         //
   4724         // Expected Results Compare
   4725         //
   4726         UnicodeString expectedS(fields[4]);
   4727         expectedS.findAndReplace(nulnulSrc, nulnul);
   4728         expectedS.findAndReplace(ffffSrc,   ffff);
   4729         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4730 
   4731 
   4732         if (expectedS.compare(resultString) != 0) {
   4733             err("Line %d: Incorrect perl expression results.", lineNum);
   4734             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
   4735         }
   4736 
   4737         delete testMat;
   4738         delete testPat;
   4739     }
   4740 
   4741     //
   4742     // All done.  Clean up allocated stuff.
   4743     //
   4744     delete cgMat;
   4745     delete cgPat;
   4746 
   4747     delete groupsMat;
   4748     delete groupsPat;
   4749 
   4750     delete flagMat;
   4751     delete flagPat;
   4752 
   4753     delete lineMat;
   4754     delete linePat;
   4755 
   4756     delete fieldPat;
   4757     delete [] testData;
   4758 
   4759     utext_close(&patternText);
   4760     utext_close(&inputText);
   4761 
   4762     delete [] patternChars;
   4763     delete [] inputChars;
   4764 
   4765 
   4766     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
   4767 
   4768 }
   4769 
   4770 
   4771 //--------------------------------------------------------------
   4772 //
   4773 //  Bug6149   Verify limits to heap expansion for backtrack stack.
   4774 //             Use this pattern,
   4775 //                 "(a?){1,8000000}"
   4776 //             Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
   4777 //                   This test is likely to be fragile, as further optimizations stop
   4778 //                   more cases of pointless looping in the match engine.
   4779 //
   4780 //---------------------------------------------------------------
   4781 void RegexTest::Bug6149() {
   4782     UnicodeString pattern("(a?){1,8000000}");
   4783     UnicodeString s("xyz");
   4784     uint32_t flags = 0;
   4785     UErrorCode status = U_ZERO_ERROR;
   4786 
   4787     RegexMatcher  matcher(pattern, s, flags, status);
   4788     UBool result = false;
   4789     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
   4790     REGEX_ASSERT(result == FALSE);
   4791  }
   4792 
   4793 
   4794 //
   4795 //   Callbacks()    Test the callback function.
   4796 //                  When set, callbacks occur periodically during matching operations,
   4797 //                  giving the application code the ability to abort the operation
   4798 //                  before it's normal completion.
//                  before its normal completion.
   4800 
   4801 struct callBackContext {
   4802     RegexTest        *test;
   4803     int32_t          maxCalls;
   4804     int32_t          numCalls;
   4805     int32_t          lastSteps;
   4806     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
   4807 };
   4808 
   4809 U_CDECL_BEGIN
   4810 static UBool U_CALLCONV
   4811 testCallBackFn(const void *context, int32_t steps) {
   4812     callBackContext  *info = (callBackContext *)context;
   4813     if (info->lastSteps+1 != steps) {
   4814         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
   4815     }
   4816     info->lastSteps = steps;
   4817     info->numCalls++;
   4818     return (info->numCalls < info->maxCalls);
   4819 }
   4820 U_CDECL_END
   4821 
// Exercises RegexMatcher's match callback: the getter with nothing installed,
//   set/get round-tripping, and matches that run for too short a time to
//   trigger the callback, long enough to trigger it, and long enough for the
//   callback to stop them.
void RegexTest::Callbacks() {
   {
        // Getter returns NULLs if no callback has been set

        //   The variables that the getter will fill in.
        //   Init to non-null values so that the action of the getter can be seen.
        const void          *returnedContext = &returnedContext;
        URegexMatchCallback *returnedFn = &testCallBackFn;

        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher("x", 0, status);
        REGEX_CHECK_STATUS;
        matcher.getMatchCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == NULL);
        REGEX_ASSERT(returnedContext == NULL);
    }

   {
        // Set and Get work
        //   cbInfo is the context handed to testCallBackFn; it counts the
        //   invocations and makes the callback return FALSE once numCalls
        //   reaches maxCalls.
        callBackContext cbInfo = {this, 0, 0, 0};
        const void          *returnedContext;
        URegexMatchCallback *returnedFn;
        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
        REGEX_CHECK_STATUS;
        matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
        REGEX_CHECK_STATUS;
        matcher.getMatchCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == testCallBackFn);
        REGEX_ASSERT(returnedContext == &cbInfo);

        // A short-running match shouldn't invoke the callback
        status = U_ZERO_ERROR;
        cbInfo.reset(1);
        UnicodeString s = "xxx";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status));
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == 0);

        // A medium-length match that runs long enough to invoke the
        //   callback, but not so long that the callback aborts it.
        //   (With a limit of 4 the callback only asks for a stop on its
        //   fourth invocation.)
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls > 0);

        // A longer running match that the callback function will abort.
        //   The abort is expected to surface as U_REGEX_STOPPED_BY_CALLER
        //   after exactly four callback invocations.
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == 4);

        // A longer running find that the callback function will abort.
        //   Same input and limit as above, but driven through find().
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.find(status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == 4);
    }


}
   4895 
   4896 
   4897 //
   4898 //   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during find operations
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
   4902 //
   4903 
   4904 struct progressCallBackContext {
   4905     RegexTest        *test;
   4906     int64_t          lastIndex;
   4907     int32_t          maxCalls;
   4908     int32_t          numCalls;
   4909     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
   4910 };
   4911 
   4912 // call-back function for find().
   4913 // Return TRUE to continue the find().
   4914 // Return FALSE to stop the find().
   4915 U_CDECL_BEGIN
   4916 static UBool U_CALLCONV
   4917 testProgressCallBackFn(const void *context, int64_t matchIndex) {
   4918     progressCallBackContext  *info = (progressCallBackContext *)context;
   4919     info->numCalls++;
   4920     info->lastIndex = matchIndex;
   4921 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
   4922     return (info->numCalls < info->maxCalls);
   4923 }
   4924 U_CDECL_END
   4925 
// Exercises RegexMatcher's find-progress callback: the getter with nothing
//   installed, set/get round-tripping, a find that succeeds without any
//   callback, a find that fails with callbacks but no interruption, a find
//   that the callback interrupts, and resuming a find after an interruption.
void RegexTest::FindProgressCallbacks() {
   {
        // Getter returns NULLs if no callback has been set

        //   The variables that the getter will fill in.
        //   Init to non-null values so that the action of the getter can be seen.
        const void                  *returnedContext = &returnedContext;
        URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;

        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher("x", 0, status);
        REGEX_CHECK_STATUS;
        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == NULL);
        REGEX_ASSERT(returnedContext == NULL);
    }

   {
        // Set and Get work
        //   cbInfo is the context handed to testProgressCallBackFn; it counts
        //   the invocations, remembers the last match index, and makes the
        //   callback return FALSE once numCalls reaches maxCalls.
        progressCallBackContext cbInfo = {this, 0, 0, 0};
        const void                  *returnedContext;
        URegexFindProgressCallback  *returnedFn;
        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
        REGEX_CHECK_STATUS;
        matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
        REGEX_CHECK_STATUS;
        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == testProgressCallBackFn);
        REGEX_ASSERT(returnedContext == &cbInfo);

        // A find that matches on the initial position does NOT invoke the callback.
        status = U_ZERO_ERROR;
        cbInfo.reset(100);
        UnicodeString s = "aaxxx";
        matcher.reset(s);
        // Flip the #if to 1 to turn on matcher tracing while debugging this test.
#if 0
        matcher.setTrace(TRUE);
#endif
        REGEX_ASSERT(matcher.find(0, status));
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == 0);

        // A medium running find() that causes matcher.find() to invoke our callback for each index,
        //   but not so many times that we interrupt the operation.
        // NOTE(review): the limit passed to reset() is exactly s.length(), not
        //   something greater as the trailing comment below says - confirm intent.
        status = U_ZERO_ERROR;
        s = "aaaaaaaaaaaaaaaaaaab";
        cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
        matcher.reset(s);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);

        // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
        //   The callback returns FALSE on invocation number (length - 5), and
        //   the find is expected to report U_REGEX_STOPPED_BY_CALLER right then.
        status = U_ZERO_ERROR;
        UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
        cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
        matcher.reset(s1);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);

        // Now a match that will succeed, but after an interruption
        status = U_ZERO_ERROR;
        UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
        cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
        matcher.reset(s2);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        // Now retry the match from where left off
        //   (cbInfo.lastIndex is the index reported to the last callback;
        //   numCalls is not reset, only the limit is raised.)
        cbInfo.maxCalls = 100; //  No callback limit
        status = U_ZERO_ERROR;
        REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
        REGEX_CHECK_STATUS;
    }


}
   5006 
   5007 
   5008 //---------------------------------------------------------------------------
   5009 //
   5010 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
   5011 //                             UTexts. The pure-C implementation of UText
   5012 //                             has no mutable backing stores, but we can
   5013 //                             use UnicodeString here to test the functionality.
   5014 //
   5015 //---------------------------------------------------------------------------
   5016 void RegexTest::PreAllocatedUTextCAPI () {
   5017     UErrorCode           status = U_ZERO_ERROR;
   5018     URegularExpression  *re;
   5019     UText                patternText = UTEXT_INITIALIZER;
   5020     UnicodeString        buffer;
   5021     UText                bufferText = UTEXT_INITIALIZER;
   5022 
   5023     utext_openUnicodeString(&bufferText, &buffer, &status);
   5024 
   5025     /*
   5026      *  getText() and getUText()
   5027      */
   5028     {
   5029         UText  text1 = UTEXT_INITIALIZER;
   5030         UText  text2 = UTEXT_INITIALIZER;
   5031         UChar  text2Chars[20];
   5032         UText  *resultText;
   5033 
   5034         status = U_ZERO_ERROR;
   5035         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
   5036         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
   5037         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
   5038         utext_openUChars(&text2, text2Chars, -1, &status);
   5039 
   5040         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
   5041         re = uregex_openUText(&patternText, 0, NULL, &status);
   5042 
   5043         /* First set a UText */
   5044         uregex_setUText(re, &text1, &status);
   5045         resultText = uregex_getUText(re, &bufferText, &status);
   5046         REGEX_CHECK_STATUS;
   5047         REGEX_ASSERT(resultText == &bufferText);
   5048         utext_setNativeIndex(resultText, 0);
   5049         utext_setNativeIndex(&text1, 0);
   5050         REGEX_ASSERT(testUTextEqual(resultText, &text1));
   5051 
   5052         resultText = uregex_getUText(re, &bufferText, &status);
   5053         REGEX_CHECK_STATUS;
   5054         REGEX_ASSERT(resultText == &bufferText);
   5055         utext_setNativeIndex(resultText, 0);
   5056         utext_setNativeIndex(&text1, 0);
   5057         REGEX_ASSERT(testUTextEqual(resultText, &text1));
   5058 
   5059         /* Then set a UChar * */
   5060         uregex_setText(re, text2Chars, 7, &status);
   5061         resultText = uregex_getUText(re, &bufferText, &status);
   5062         REGEX_CHECK_STATUS;
   5063         REGEX_ASSERT(resultText == &bufferText);
   5064         utext_setNativeIndex(resultText, 0);
   5065         utext_setNativeIndex(&text2, 0);
   5066         REGEX_ASSERT(testUTextEqual(resultText, &text2));
   5067 
   5068         uregex_close(re);
   5069         utext_close(&text1);
   5070         utext_close(&text2);
   5071     }
   5072 
   5073     /*
   5074      *  group()
   5075      */
   5076     {
   5077         UChar    text1[80];
   5078         UText   *actual;
   5079         UBool    result;
   5080         int64_t  length = 0;
   5081 
   5082         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  UPRV_LENGTHOF(text1));
   5083         //                  012345678901234567890123456789012345678901234567
   5084         //                  0         1         2         3         4
   5085 
   5086         status = U_ZERO_ERROR;
   5087         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
   5088         REGEX_CHECK_STATUS;
   5089 
   5090         uregex_setText(re, text1, -1, &status);
   5091         result = uregex_find(re, 0, &status);
   5092         REGEX_ASSERT(result==TRUE);
   5093 
   5094         /*  Capture Group 0, the full match.  Should succeed. "abc interior def" */
   5095         status = U_ZERO_ERROR;
   5096         actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
   5097         REGEX_CHECK_STATUS;
   5098         REGEX_ASSERT(actual == &bufferText);
   5099         REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
   5100         REGEX_ASSERT(length == 16);
   5101         REGEX_ASSERT(utext_nativeLength(actual) == 47);
   5102 
   5103         /*  Capture group #1.  Should succeed, matching " interior ". */
   5104         status = U_ZERO_ERROR;
   5105         actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
   5106         REGEX_CHECK_STATUS;
   5107         REGEX_ASSERT(actual == &bufferText);
   5108         REGEX_ASSERT(utext_getNativeIndex(actual) == 9);   // position of " interior "
   5109         REGEX_ASSERT(length == 10);
   5110         REGEX_ASSERT(utext_nativeLength(actual) == 47);
   5111 
   5112         /*  Capture group out of range.  Error. */
   5113         status = U_ZERO_ERROR;
   5114         actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
   5115         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   5116         REGEX_ASSERT(actual == &bufferText);
   5117         uregex_close(re);
   5118 
   5119     }
   5120 
   5121     /*
   5122      *  replaceFirst()
   5123      */
   5124     {
   5125         UChar    text1[80];
   5126         UChar    text2[80];
   5127         UText    replText = UTEXT_INITIALIZER;
   5128         UText   *result;
   5129         status = U_ZERO_ERROR;
   5130         utext_openUnicodeString(&bufferText, &buffer, &status);
   5131 
   5132         status = U_ZERO_ERROR;
   5133         u_uastrncpy(text1, "Replace xaax x1x x...x.",  UPRV_LENGTHOF(text1));
   5134         u_uastrncpy(text2, "No match here.",  UPRV_LENGTHOF(text2)/2);
   5135         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
   5136 
   5137         re = uregex_openC("x(.*?)x", 0, NULL, &status);
   5138         REGEX_CHECK_STATUS;
   5139 
   5140         /*  Normal case, with match */
   5141         uregex_setText(re, text1, -1, &status);
   5142         REGEX_CHECK_STATUS;
   5143         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5144         REGEX_CHECK_STATUS;
   5145         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5146         REGEX_CHECK_STATUS;
   5147         REGEX_ASSERT(result == &bufferText);
   5148         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
   5149 
   5150         /* No match.  Text should copy to output with no changes.  */
   5151         uregex_setText(re, text2, -1, &status);
   5152         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5153         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5154         REGEX_CHECK_STATUS;
   5155         REGEX_ASSERT(result == &bufferText);
   5156         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
   5157 
   5158         /* Unicode escapes */
   5159         uregex_setText(re, text1, -1, &status);
   5160         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
   5161         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5162         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5163         REGEX_CHECK_STATUS;
   5164         REGEX_ASSERT(result == &bufferText);
   5165         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
   5166 
   5167         uregex_close(re);
   5168         utext_close(&replText);
   5169     }
   5170 
   5171 
   5172     /*
   5173      *  replaceAll()
   5174      */
   5175     {
   5176         UChar    text1[80];
   5177         UChar    text2[80];
   5178         UText    replText = UTEXT_INITIALIZER;
   5179         UText   *result;
   5180 
   5181         status = U_ZERO_ERROR;
   5182         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
   5183         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
   5184         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
   5185 
   5186         re = uregex_openC("x(.*?)x", 0, NULL, &status);
   5187         REGEX_CHECK_STATUS;
   5188 
   5189         /*  Normal case, with match */
   5190         uregex_setText(re, text1, -1, &status);
   5191         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5192         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
   5193         REGEX_CHECK_STATUS;
   5194         REGEX_ASSERT(result == &bufferText);
   5195         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
   5196 
   5197         /* No match.  Text should copy to output with no changes.  */
   5198         uregex_setText(re, text2, -1, &status);
   5199         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5200         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
   5201         REGEX_CHECK_STATUS;
   5202         REGEX_ASSERT(result == &bufferText);
   5203         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
   5204 
   5205         uregex_close(re);
   5206         utext_close(&replText);
   5207     }
   5208 
   5209 
   5210     /*
   5211      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
   5212      *   so we don't need to test it here.
   5213      */
   5214 
   5215     utext_close(&bufferText);
   5216     utext_close(&patternText);
   5217 }
   5218 
   5219 
   5220 //--------------------------------------------------------------
   5221 //
   5222 //  NamedCapture   Check basic named capture group functionality
   5223 //
   5224 //--------------------------------------------------------------
   5225 void RegexTest::NamedCapture() {
   5226     UErrorCode status = U_ZERO_ERROR;
   5227     RegexPattern *pat = RegexPattern::compile(UnicodeString(
   5228             "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
   5229     REGEX_CHECK_STATUS;
   5230     int32_t group = pat->groupNumberFromName("five", -1, status);
   5231     REGEX_CHECK_STATUS;
   5232     REGEX_ASSERT(5 == group);
   5233     group = pat->groupNumberFromName("three", -1, status);
   5234     REGEX_CHECK_STATUS;
   5235     REGEX_ASSERT(3 == group);
   5236 
   5237     status = U_ZERO_ERROR;
   5238     group = pat->groupNumberFromName(UnicodeString("six"), status);
   5239     REGEX_CHECK_STATUS;
   5240     REGEX_ASSERT(6 == group);
   5241 
   5242     status = U_ZERO_ERROR;
   5243     group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
   5244     U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5245 
   5246     status = U_ZERO_ERROR;
   5247 
   5248     // After copying a pattern, named capture should still work in the copy.
   5249     RegexPattern *copiedPat = new RegexPattern(*pat);
   5250     REGEX_ASSERT(*copiedPat == *pat);
   5251     delete pat; pat = NULL;  // Delete original, copy should have no references back to it.
   5252 
   5253     group = copiedPat->groupNumberFromName("five", -1, status);
   5254     REGEX_CHECK_STATUS;
   5255     REGEX_ASSERT(5 == group);
   5256     group = copiedPat->groupNumberFromName("three", -1, status);
   5257     REGEX_CHECK_STATUS;
   5258     REGEX_ASSERT(3 == group);
   5259     delete copiedPat;
   5260 
   5261     // ReplaceAll with named capture group.
   5262     status = U_ZERO_ERROR;
   5263     UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
   5264     RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
   5265     REGEX_CHECK_STATUS;
   5266     // m.pattern().dumpPattern();
   5267     UnicodeString replacedText = m->replaceAll("'${mid}'", status);
   5268     REGEX_CHECK_STATUS;
   5269     REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
   5270     delete m;
   5271 
   5272     // ReplaceAll, allowed capture group numbers.
   5273     text = UnicodeString("abcmxyz");
   5274     m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
   5275     REGEX_CHECK_STATUS;
   5276 
   5277     status = U_ZERO_ERROR;
   5278     replacedText  = m->replaceAll(UnicodeString("<$0>"), status);   // group 0, full match, is allowed.
   5279     REGEX_CHECK_STATUS;
   5280     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
   5281 
   5282     status = U_ZERO_ERROR;
   5283     replacedText  = m->replaceAll(UnicodeString("<$1>"), status);      // group 1 by number.
   5284     REGEX_CHECK_STATUS;
   5285     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
   5286 
   5287     status = U_ZERO_ERROR;
   5288     replacedText  = m->replaceAll(UnicodeString("<${one}>"), status);   // group 1 by name.
   5289     REGEX_CHECK_STATUS;
   5290     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
   5291 
   5292     status = U_ZERO_ERROR;
   5293     replacedText  = m->replaceAll(UnicodeString("<$2>"), status);   // group 2.
   5294     REGEX_CHECK_STATUS;
   5295     REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
   5296 
   5297     status = U_ZERO_ERROR;
   5298     replacedText  = m->replaceAll(UnicodeString("<$3>"), status);
   5299     REGEX_CHECK_STATUS;
   5300     REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
   5301 
   5302     status = U_ZERO_ERROR;
   5303     replacedText  = m->replaceAll(UnicodeString("<$4>"), status);
   5304     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   5305 
   5306     status = U_ZERO_ERROR;
   5307     replacedText  = m->replaceAll(UnicodeString("<$04>"), status);      // group 0, leading 0,
   5308     REGEX_CHECK_STATUS;                                                 //    trailing out-of-range 4 passes through.
   5309     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
   5310 
   5311     status = U_ZERO_ERROR;
   5312     replacedText  = m->replaceAll(UnicodeString("<$000016>"), status);  // Consume leading zeroes. Don't consume digits
   5313     REGEX_CHECK_STATUS;                                                 //   that push group num out of range.
   5314     REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText);              //   This is group 1.
   5315 
   5316     status = U_ZERO_ERROR;
   5317     replacedText  = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
   5318     REGEX_CHECK_STATUS;
   5319     REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
   5320 
   5321     status = U_ZERO_ERROR;
   5322     replacedText  = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
   5323     REGEX_CHECK_STATUS;
   5324     REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
   5325 
   5326     status = U_ZERO_ERROR;
   5327     replacedText  = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
   5328     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5329 
   5330     status = U_ZERO_ERROR;
   5331     replacedText  = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
   5332     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5333 
   5334     status = U_ZERO_ERROR;
   5335     replacedText  = m->replaceAll(UnicodeString("<${one"), status);
   5336     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5337 
   5338     status = U_ZERO_ERROR;
   5339     replacedText  = m->replaceAll(UnicodeString("$not a capture group"), status);
   5340     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5341 
   5342     delete m;
   5343 
   5344     // Repeat the above replaceAll() tests using the plain C API, which
   5345     //  has a separate implementation internally.
   5346     //  TODO: factor out the test data.
   5347 
   5348     status = U_ZERO_ERROR;
   5349     URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
   5350     REGEX_CHECK_STATUS;
   5351     text = UnicodeString("abcmxyz");
   5352     uregex_setText(re, text.getBuffer(), text.length(), &status);
   5353     REGEX_CHECK_STATUS;
   5354 
   5355     UChar resultBuf[100];
   5356     int32_t resultLength;
   5357     UnicodeString repl;
   5358 
   5359     status = U_ZERO_ERROR;
   5360     repl = UnicodeString("<$0>");
   5361     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5362     REGEX_CHECK_STATUS;
   5363     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
   5364 
   5365     status = U_ZERO_ERROR;
   5366     repl = UnicodeString("<$1>");
   5367     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5368     REGEX_CHECK_STATUS;
   5369     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
   5370 
   5371     status = U_ZERO_ERROR;
   5372     repl = UnicodeString("<${one}>");
   5373     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5374     REGEX_CHECK_STATUS;
   5375     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
   5376 
   5377     status = U_ZERO_ERROR;
   5378     repl = UnicodeString("<$2>");
   5379     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5380     REGEX_CHECK_STATUS;
   5381     REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
   5382 
   5383     status = U_ZERO_ERROR;
   5384     repl = UnicodeString("<$3>");
   5385     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5386     REGEX_CHECK_STATUS;
   5387     REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
   5388 
   5389     status = U_ZERO_ERROR;
   5390     repl = UnicodeString("<$4>");
   5391     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5392     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   5393 
   5394     status = U_ZERO_ERROR;
   5395     repl = UnicodeString("<$04>");
   5396     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5397     REGEX_CHECK_STATUS;
   5398     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
   5399 
   5400     status = U_ZERO_ERROR;
   5401     repl = UnicodeString("<$000016>");
   5402     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5403     REGEX_CHECK_STATUS;
   5404     REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
   5405 
   5406     status = U_ZERO_ERROR;
   5407     repl = UnicodeString("<$3$2$1${one}>");
   5408     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5409     REGEX_CHECK_STATUS;
   5410     REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
   5411 
   5412     status = U_ZERO_ERROR;
   5413     repl = UnicodeString("$3$2$1${one}");
   5414     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5415     REGEX_CHECK_STATUS;
   5416     REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
   5417 
   5418     status = U_ZERO_ERROR;
   5419     repl = UnicodeString("<${noSuchName}>");
   5420     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5421     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5422 
   5423     status = U_ZERO_ERROR;
   5424     repl = UnicodeString("<${invalid-name}>");
   5425     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5426     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5427 
   5428     status = U_ZERO_ERROR;
   5429     repl = UnicodeString("<${one");
   5430     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5431     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5432 
   5433     status = U_ZERO_ERROR;
   5434     repl = UnicodeString("$not a capture group");
   5435     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5436     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5437 
   5438     uregex_close(re);
   5439 }
   5440 
   5441 //--------------------------------------------------------------
   5442 //
   5443 //  NamedCaptureLimits   Patterns with huge numbers of named capture groups.
   5444 //                       The point is not so much what the exact limit is,
   5445 //                       but that a largish number doesn't hit bad non-linear performance,
   5446 //                       and that exceeding the limit fails cleanly.
   5447 //
   5448 //--------------------------------------------------------------
   5449 void RegexTest::NamedCaptureLimits() {
   5450     if (quick) {
   5451         logln("Skipping test. Runs in exhuastive mode only.");
   5452         return;
   5453     }
   5454     const int32_t goodLimit = 1000000;     // Pattern w this many groups builds successfully.
   5455     const int32_t failLimit = 10000000;    // Pattern exceeds internal limits, fails to compile.
   5456     char nnbuf[100];
   5457     UnicodeString pattern;
   5458     int32_t nn;
   5459 
   5460     for (nn=1; nn<goodLimit; nn++) {
   5461         sprintf(nnbuf, "(?<nn%d>)", nn);
   5462         pattern.append(UnicodeString(nnbuf, -1, US_INV));
   5463     }
   5464     UErrorCode status = U_ZERO_ERROR;
   5465     RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
   5466     REGEX_CHECK_STATUS;
   5467     for (nn=1; nn<goodLimit; nn++) {
   5468         sprintf(nnbuf, "nn%d", nn);
   5469         int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
   5470         REGEX_ASSERT(nn == groupNum);
   5471         if (nn != groupNum) {
   5472             break;
   5473         }
   5474     }
   5475     delete pat;
   5476 
   5477     pattern.remove();
   5478     for (nn=1; nn<failLimit; nn++) {
   5479         sprintf(nnbuf, "(?<nn%d>)", nn);
   5480         pattern.append(UnicodeString(nnbuf, -1, US_INV));
   5481     }
   5482     status = U_ZERO_ERROR;
   5483     pat = RegexPattern::compile(pattern, 0, status);
   5484     REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
   5485     delete pat;
   5486 }
   5487 
   5488 
   5489 //--------------------------------------------------------------
   5490 //
   5491 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
   5492 //
   5493 //---------------------------------------------------------------
   5494 void RegexTest::Bug7651() {
   5495     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
   5496     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
   5497     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
   5498     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
   5499     UnicodeString s("#ff @abcd This is test");
   5500     RegexPattern  *REPattern = NULL;
   5501     RegexMatcher  *REMatcher = NULL;
   5502     UErrorCode status = U_ZERO_ERROR;
   5503     UParseError pe;
   5504 
   5505     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
   5506     REGEX_CHECK_STATUS;
   5507     REMatcher = REPattern->matcher(s, status);
   5508     REGEX_CHECK_STATUS;
   5509     REGEX_ASSERT(REMatcher->find());
   5510     REGEX_ASSERT(REMatcher->start(status) == 0);
   5511     delete REPattern;
   5512     delete REMatcher;
   5513     status = U_ZERO_ERROR;
   5514 
   5515     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
   5516     REGEX_CHECK_STATUS;
   5517     REMatcher = REPattern->matcher(s, status);
   5518     REGEX_CHECK_STATUS;
   5519     REGEX_ASSERT(REMatcher->find());
   5520     REGEX_ASSERT(REMatcher->start(status) == 0);
   5521     delete REPattern;
   5522     delete REMatcher;
   5523     status = U_ZERO_ERROR;
   5524  }
   5525 
   5526 void RegexTest::Bug7740() {
   5527     UErrorCode status = U_ZERO_ERROR;
   5528     UnicodeString pattern = "(a)";
   5529     UnicodeString text = "abcdef";
   5530     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
   5531     REGEX_CHECK_STATUS;
   5532     REGEX_ASSERT(m->lookingAt(status));
   5533     REGEX_CHECK_STATUS;
   5534     status = U_ILLEGAL_ARGUMENT_ERROR;
   5535     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
   5536     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   5537     REGEX_ASSERT(s == "");
   5538     delete m;
   5539 }
   5540 
   5541 // Bug 8479:  was crashing whith a Bogus UnicodeString as input.
   5542 
   5543 void RegexTest::Bug8479() {
   5544     UErrorCode status = U_ZERO_ERROR;
   5545 
   5546     RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
   5547     REGEX_CHECK_STATUS;
   5548     if (U_SUCCESS(status))
   5549     {
   5550         UnicodeString str;
   5551         str.setToBogus();
   5552         pMatcher->reset(str);
   5553         status = U_ZERO_ERROR;
   5554         pMatcher->matches(status);
   5555         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   5556         delete pMatcher;
   5557     }
   5558 }
   5559 
   5560 
   5561 // Bug 7029
   5562 void RegexTest::Bug7029() {
   5563     UErrorCode status = U_ZERO_ERROR;
   5564 
   5565     RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
   5566     UnicodeString text = "abc.def";
   5567     UnicodeString splits[10];
   5568     REGEX_CHECK_STATUS;
   5569     int32_t numFields = pMatcher->split(text, splits, 10, status);
   5570     REGEX_CHECK_STATUS;
   5571     REGEX_ASSERT(numFields == 8);
   5572     delete pMatcher;
   5573 }
   5574 
   5575 // Bug 9283
   5576 //   This test is checking for the existance of any supplemental characters that case-fold
   5577 //   to a bmp character.
   5578 //
   5579 //   At the time of this writing there are none. If any should appear in a subsequent release
   5580 //   of Unicode, the code in regular expressions compilation that determines the longest
   5581 //   posssible match for a literal string  will need to be enhanced.
   5582 //
   5583 //   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
   5584 //   for details on what to do in case of a failure of this test.
   5585 //
   5586 void RegexTest::Bug9283() {
   5587 #if !UCONFIG_NO_NORMALIZATION
   5588     UErrorCode status = U_ZERO_ERROR;
   5589     UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
   5590     REGEX_CHECK_STATUS;
   5591     int32_t index;
   5592     UChar32 c;
   5593     for (index=0; ; index++) {
   5594         c = supplementalsWithCaseFolding.charAt(index);
   5595         if (c == -1) {
   5596             break;
   5597         }
   5598         UnicodeString cf = UnicodeString(c).foldCase();
   5599         REGEX_ASSERT(cf.length() >= 2);
   5600     }
   5601 #endif /* #if !UCONFIG_NO_NORMALIZATION */
   5602 }
   5603 
   5604 
   5605 void RegexTest::CheckInvBufSize() {
   5606   if(inv_next>=INV_BUFSIZ) {
   5607     errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
   5608           __FILE__, INV_BUFSIZ, inv_next);
   5609   } else {
   5610     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
   5611   }
   5612 }
   5613 
   5614 
   5615 void RegexTest::Bug10459() {
   5616     UErrorCode status = U_ZERO_ERROR;
   5617     UnicodeString patternString("(txt)");
   5618     UnicodeString txtString("txt");
   5619 
   5620     UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
   5621     REGEX_CHECK_STATUS;
   5622     UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
   5623     REGEX_CHECK_STATUS;
   5624 
   5625     URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
   5626     REGEX_CHECK_STATUS;
   5627 
   5628     uregex_setUText(icu_re, utext_txt, &status);
   5629     REGEX_CHECK_STATUS;
   5630 
   5631     // The bug was that calling uregex_group() before doing a matching operation
   5632     //   was causing a segfault. Only for Regular Expressions created from UText.
   5633     //   It should set an U_REGEX_INVALID_STATE.
   5634 
   5635     UChar buf[100];
   5636     int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
   5637     REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
   5638     REGEX_ASSERT(len == 0);
   5639 
   5640     uregex_close(icu_re);
   5641     utext_close(utext_pat);
   5642     utext_close(utext_txt);
   5643 }
   5644 
   5645 void RegexTest::TestCaseInsensitiveStarters() {
   5646     // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
   5647     //  become stale because of new Unicode characters.
   5648     // If it is stale, rerun the generation tool
   5649     //    svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
   5650     // and replace the embedded data in i18n/regexcmp.cpp
   5651 
   5652     for (UChar32 cp=0; cp<=0x10ffff; cp++) {
   5653         if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
   5654             continue;
   5655         }
   5656         UnicodeSet s(cp, cp);
   5657         s.closeOver(USET_CASE_INSENSITIVE);
   5658         UnicodeSetIterator setIter(s);
   5659         while (setIter.next()) {
   5660             if (!setIter.isString()) {
   5661                 continue;
   5662             }
   5663             const UnicodeString &str = setIter.getString();
   5664             UChar32 firstChar = str.char32At(0);
   5665             UnicodeSet starters;
   5666             RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
   5667             if (!starters.contains(cp)) {
   5668                 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
   5669                 return;
   5670             }
   5671         }
   5672     }
   5673 }
   5674 
   5675 
   5676 void RegexTest::TestBug11049() {
   5677     // Original bug report: pattern with match start consisting of one of several individual characters,
   5678     //  and the text being matched ending with a supplementary character. find() would read past the
   5679     //  end of the input text when searching for potential match starting points.
   5680 
   5681     // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
   5682     // detect the bad read.
   5683 
   5684     TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
   5685     TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
   5686 
   5687     // Test again with a pattern starting with a single character,
   5688     // which takes a different code path than starting with an OR expression,
   5689     // but with similar logic.
   5690     TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
   5691     TestCase11049("C", "string matches at end C", TRUE, __LINE__);
   5692 }
   5693 
   5694 // Run a single test case from TestBug11049(). Internal function.
   5695 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
   5696     UErrorCode status = U_ZERO_ERROR;
   5697     UnicodeString patternString = UnicodeString(pattern).unescape();
   5698     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
   5699 
   5700     UnicodeString dataString = UnicodeString(data).unescape();
   5701     UChar *exactBuffer = new UChar[dataString.length()];
   5702     dataString.extract(exactBuffer, dataString.length(), status);
   5703     UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
   5704 
   5705     LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
   5706     REGEX_CHECK_STATUS;
   5707     matcher->reset(ut);
   5708     UBool result = matcher->find();
   5709     if (result != expectMatch) {
   5710         errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
   5711               __FILE__, lineNumber, expectMatch, result, pattern, data);
   5712     }
   5713 
   5714     // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
   5715     //   off-by-one on find() with match at the last code point.
   5716     //   Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
   5717     //   because string.unescape() will only shrink it.
   5718     char * utf8Buffer = new char[uprv_strlen(data)+1];
   5719     u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
   5720     REGEX_CHECK_STATUS;
   5721     ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
   5722     REGEX_CHECK_STATUS;
   5723     matcher->reset(ut);
   5724     result = matcher->find();
   5725     if (result != expectMatch) {
   5726         errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
   5727               __FILE__, lineNumber, expectMatch, result, pattern, data);
   5728     }
   5729     delete [] utf8Buffer;
   5730 
   5731     utext_close(ut);
   5732     delete [] exactBuffer;
   5733 }
   5734 
   5735 
   5736 void RegexTest::TestBug11371() {
   5737     if (quick) {
   5738         logln("Skipping test. Runs in exhuastive mode only.");
   5739         return;
   5740     }
   5741     UErrorCode status = U_ZERO_ERROR;
   5742     UnicodeString patternString;
   5743 
   5744     for (int i=0; i<8000000; i++) {
   5745         patternString.append(UnicodeString("()"));
   5746     }
   5747     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
   5748     if (status != U_REGEX_PATTERN_TOO_BIG) {
   5749         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
   5750               __FILE__, __LINE__, u_errorName(status));
   5751     }
   5752 
   5753     status = U_ZERO_ERROR;
   5754     patternString = "(";
   5755     for (int i=0; i<20000000; i++) {
   5756         patternString.append(UnicodeString("A++"));
   5757     }
   5758     patternString.append(UnicodeString("){0}B++"));
   5759     LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
   5760     if (status != U_REGEX_PATTERN_TOO_BIG) {
   5761         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
   5762               __FILE__, __LINE__, u_errorName(status));
   5763     }
   5764 
   5765     // Pattern with too much string data, such that string indexes overflow operand data field size
   5766     // in compiled instruction.
   5767     status = U_ZERO_ERROR;
   5768     patternString = "";
   5769     while (patternString.length() < 0x00ffffff) {
   5770         patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
   5771     }
   5772     patternString.append(UnicodeString("X? trailing string"));
   5773     LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
   5774     if (status != U_REGEX_PATTERN_TOO_BIG) {
   5775         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
   5776               __FILE__, __LINE__, u_errorName(status));
   5777     }
   5778 }
   5779 
   5780 void RegexTest::TestBug11480() {
   5781     // C API, get capture group of a group that does not participate in the match.
   5782     //        (Returns a zero length string, with nul termination,
   5783     //         indistinguishable from a group with a zero lenght match.)
   5784 
   5785     UErrorCode status = U_ZERO_ERROR;
   5786     URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
   5787     REGEX_CHECK_STATUS;
   5788     UnicodeString text = UNICODE_STRING_SIMPLE("A");
   5789     uregex_setText(re, text.getBuffer(), text.length(), &status);
   5790     REGEX_CHECK_STATUS;
   5791     REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
   5792     UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
   5793     int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
   5794     REGEX_ASSERT(length == 0);
   5795     REGEX_ASSERT(buf[0] == 13);
   5796     REGEX_ASSERT(buf[1] == 0);
   5797     REGEX_ASSERT(buf[2] == 13);
   5798     uregex_close(re);
   5799 }
   5800 
   5801 
   5802 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
   5803