Home | History | Annotate | Download | only in intltest
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 2002-2015, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ********************************************************************/
      6 
      7 //
      8 //   regextst.cpp
      9 //
     10 //      ICU Regular Expressions test, part of intltest.
     11 //
     12 
     13 /*
     14      NOTE!!
     15 
     16      PLEASE be careful about ASCII assumptions in this test.
     17      This test is one of the worst repeat offenders.
     18      If you have questions, contact someone on the ICU PMC
     19      who has access to an EBCDIC system.
     20 
     21  */
     22 
     23 #include "intltest.h"
     24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     25 
     26 #include <stdlib.h>
     27 #include <stdio.h>
     28 #include <string.h>
     29 
     30 #include "unicode/localpointer.h"
     31 #include "unicode/regex.h"
     32 #include "unicode/uchar.h"
     33 #include "unicode/ucnv.h"
     34 #include "unicode/uniset.h"
     35 #include "unicode/uregex.h"
     36 #include "unicode/usetiter.h"
     37 #include "unicode/ustring.h"
     38 #include "unicode/utext.h"
     39 
     40 #include "regextst.h"
     41 #include "regexcmp.h"
     42 #include "uvector.h"
     43 #include "util.h"
     44 #include "cmemory.h"
     45 #include "cstring.h"
     46 #include "uinvchar.h"
     47 
     48 #define SUPPORT_MUTATING_INPUT_STRING   0
     49 
     50 //---------------------------------------------------------------------------
     51 //
     52 //  Test class boilerplate
     53 //
     54 //---------------------------------------------------------------------------
     55 RegexTest::RegexTest()
     56 {
     57 }
     58 
     59 
     60 RegexTest::~RegexTest()
     61 {
     62 }
     63 
     64 
     65 
     66 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
     67 {
     68     if (exec) logln("TestSuite RegexTest: ");
     69     switch (index) {
     70 
     71         case 0: name = "Basic";
     72             if (exec) Basic();
     73             break;
     74         case 1: name = "API_Match";
     75             if (exec) API_Match();
     76             break;
     77         case 2: name = "API_Replace";
     78             if (exec) API_Replace();
     79             break;
     80         case 3: name = "API_Pattern";
     81             if (exec) API_Pattern();
     82             break;
     83         case 4:
     84 #if !UCONFIG_NO_FILE_IO
     85             name = "Extended";
     86             if (exec) Extended();
     87 #else
     88             name = "skip";
     89 #endif
     90             break;
     91         case 5: name = "Errors";
     92             if (exec) Errors();
     93             break;
     94         case 6: name = "PerlTests";
     95             if (exec) PerlTests();
     96             break;
     97         case 7: name = "Callbacks";
     98             if (exec) Callbacks();
     99             break;
    100         case 8: name = "FindProgressCallbacks";
    101             if (exec) FindProgressCallbacks();
    102             break;
    103         case 9: name = "Bug 6149";
    104              if (exec) Bug6149();
    105              break;
    106         case 10: name = "UTextBasic";
    107           if (exec) UTextBasic();
    108           break;
    109         case 11: name = "API_Match_UTF8";
    110           if (exec) API_Match_UTF8();
    111           break;
    112         case 12: name = "API_Replace_UTF8";
    113           if (exec) API_Replace_UTF8();
    114           break;
    115         case 13: name = "API_Pattern_UTF8";
    116           if (exec) API_Pattern_UTF8();
    117           break;
    118         case 14: name = "PerlTestsUTF8";
    119           if (exec) PerlTestsUTF8();
    120           break;
    121         case 15: name = "PreAllocatedUTextCAPI";
    122           if (exec) PreAllocatedUTextCAPI();
    123           break;
    124         case 16: name = "Bug 7651";
    125              if (exec) Bug7651();
    126              break;
    127         case 17: name = "Bug 7740";
    128             if (exec) Bug7740();
    129             break;
    130         case 18: name = "Bug 8479";
    131             if (exec) Bug8479();
    132             break;
    133         case 19: name = "Bug 7029";
    134             if (exec) Bug7029();
    135             break;
    136         case 20: name = "CheckInvBufSize";
    137             if (exec) CheckInvBufSize();
    138             break;
    139         case 21: name = "Bug 9283";
    140             if (exec) Bug9283();
    141             break;
    142         case 22: name = "Bug10459";
    143             if (exec) Bug10459();
    144             break;
    145         case 23: name = "TestCaseInsensitiveStarters";
    146             if (exec) TestCaseInsensitiveStarters();
    147             break;
    148         case 24: name = "TestBug11049";
    149             if (exec) TestBug11049();
    150             break;
    151         case 25: name = "TestBug11371";
    152             if (exec) TestBug11371();
    153             break;
    154         case 26: name = "TestBug11480";
    155             if (exec) TestBug11480();
    156             break;
    157         case 27: name = "NamedCapture";
    158             if (exec) NamedCapture();
    159             break;
    160         case 28: name = "NamedCaptureLimits";
    161             if (exec) NamedCaptureLimits();
    162             break;
    163         default: name = "";
    164             break; //needed to end loop
    165     }
    166 }
    167 
    168 
    169 
    170 /**
    171  * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
    172  * into ASCII.
    173  * @see utext_openUTF8
    174  */
    175 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
    176 
    177 //---------------------------------------------------------------------------
    178 //
    179 //   Error Checking / Reporting macros used in all of the tests.
    180 //
    181 //---------------------------------------------------------------------------
    182 
    183 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
    184   int64_t oldIndex = utext_getNativeIndex(text);
    185   utext_setNativeIndex(text, 0);
    186   char *bufPtr = buf;
    187   UChar32 c = utext_next32From(text, 0);
    188   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
    189     if (0x000020<=c && c<0x00007e) {
    190       *bufPtr = c;
    191     } else {
    192 #if 0
    193       sprintf(bufPtr,"U+%04X", c);
    194       bufPtr+= strlen(bufPtr)-1;
    195 #else
    196       *bufPtr = '%';
    197 #endif
    198     }
    199     bufPtr++;
    200     c = UTEXT_NEXT32(text);
    201   }
    202   *bufPtr = 0;
    203 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
    204   char *ebuf = (char*)malloc(bufLen);
    205   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
    206   uprv_strncpy(buf, ebuf, bufLen);
    207   free((void*)ebuf);
    208 #endif
    209   utext_setNativeIndex(text, oldIndex);
    210 }
    211 
    212 
    213 static char ASSERT_BUF[1024];
    214 
    215 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
    216   if(message.length()==0) {
    217     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
    218   } else {
    219     UnicodeString buf;
    220     IntlTest::prettify(message,buf);
    221     if(buf.length()==0) {
    222       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
    223     } else {
    224       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
    225       if(ASSERT_BUF[0]==0) {
    226         ASSERT_BUF[0]=0;
    227         for(int32_t i=0;i<buf.length();i++) {
    228           UChar ch = buf[i];
    229           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
    230         }
    231       }
    232     }
    233   }
    234   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
    235   return ASSERT_BUF;
    236 }
    237 
// Logs (via logln) a printable rendering of the UText `text`, prefixed with the
// current file and line and the source spelling of the argument.  At most the
// 200 chars of the local buffer are shown.
#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}

// If the in-scope variable `status` holds a failure code, reports it with
// dataerrln and returns from the enclosing function.  The bare `return;` means
// this can only be used inside functions returning void.
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure.  status=%s", \
                                                              __FILE__, __LINE__, u_errorName(status)); return;}}

// Reports an error (errln) when `expr` evaluates to FALSE.  Execution continues
// after a failed assertion.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}

// Evaluates `expr` with a fresh local `status` (initialised to U_ZERO_ERROR) in
// scope, then reports a failure unless that status ended up equal to `errcode`.
// The local shadows any `status` variable of the enclosing function.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
    __LINE__, u_errorName(errcode), u_errorName(status));};}

// Like REGEX_CHECK_STATUS, but also reports the caller-supplied `line` and does
// not return from the enclosing function.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}

// Like REGEX_ASSERT, but also reports the caller-supplied `line` and returns
// from the enclosing (void) function when the assertion fails.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}

// Reports an error when the UnicodeString `actual` differs from `expected`.
// expected: const char * , restricted to invariant characters.
// actual: const UnicodeString &
#define REGEX_ASSERT_UNISTR(expected, actual) { \
    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n",  \
                __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
    261 
    262 
    263 static UBool testUTextEqual(UText *uta, UText *utb) {
    264     UChar32 ca = 0;
    265     UChar32 cb = 0;
    266     utext_setNativeIndex(uta, 0);
    267     utext_setNativeIndex(utb, 0);
    268     do {
    269         ca = utext_next32(uta);
    270         cb = utext_next32(utb);
    271         if (ca != cb) {
    272             break;
    273         }
    274     } while (ca != U_SENTINEL);
    275     return ca == cb;
    276 }
    277 
    278 
    279 /**
    280  * @param expected expected text in UTF-8 (not platform) codepage
    281  */
    282 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
    283     UErrorCode status = U_ZERO_ERROR;
    284     UText expectedText = UTEXT_INITIALIZER;
    285     utext_openUTF8(&expectedText, expected, -1, &status);
    286     if(U_FAILURE(status)) {
    287       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
    288       return;
    289     }
    290     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
    291       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
    292       return;
    293     }
    294     utext_setNativeIndex(actual, 0);
    295     if (!testUTextEqual(&expectedText, actual)) {
    296         char buf[201 /*21*/];
    297         char expectedBuf[201];
    298         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
    299         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
    300         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
    301     }
    302     utext_close(&expectedText);
    303 }
    304 /**
    305  * @param expected invariant (platform local text) input
    306  */
    307 
    308 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
    309     UErrorCode status = U_ZERO_ERROR;
    310     UText expectedText = UTEXT_INITIALIZER;
    311     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
    312     if(U_FAILURE(status)) {
    313       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
    314       return;
    315     }
    316     utext_setNativeIndex(actual, 0);
    317     if (!testUTextEqual(&expectedText, actual)) {
    318         char buf[201 /*21*/];
    319         char expectedBuf[201];
    320         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
    321         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
    322         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
    323     }
    324     utext_close(&expectedText);
    325 }
    326 
/**
 * Asserts that the UText `actual` has the contents `expected`, forwarding the
 * current file and line to assertUText().
 * Assumes utf-8 input
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
/**
 * Asserts that the UText `actual` has the contents `expected`, forwarding the
 * current file and line to assertUTextInvariant().
 * Assumes Invariant input
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
    335 
    336 /**
    337  * This buffer ( inv_buf ) is used to hold the UTF-8 strings
    338  * passed into utext_openUTF8. An error will be given if
    339  * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
    340  */
    341 
    342 #define INV_BUFSIZ 2048 /* increase this if too small */
    343 
    344 static int64_t inv_next=0;
    345 
    346 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
    347 static char inv_buf[INV_BUFSIZ];
    348 #endif
    349 
    350 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
    351   if(length==-1) length=strlen(inv);
    352 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
    353   inv_next+=length;
    354   return utext_openUTF8(ut, inv, length, status);
    355 #else
    356   if(inv_next+length+1>INV_BUFSIZ) {
    357     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
    358             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
    359     *status = U_MEMORY_ALLOCATION_ERROR;
    360     return NULL;
    361   }
    362 
    363   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
    364   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
    365   inv_next+=length;
    366 
    367 #if 0
    368   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
    369 #endif
    370 
    371   return utext_openUTF8(ut, (const char*)buf, length, status);
    372 #endif
    373 }
    374 
    375 
//---------------------------------------------------------------------------
//
//    REGEX_TESTLM       Macro + invocation functions to simplify writing quick tests
//                       for the lookingAt() and matches() functions.
//
//       usage:
//          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
//
//          The expected results are UBool - TRUE or FALSE.
//          The input text is unescaped.  The pattern is not.
//
//          Each use runs the check twice, through doRegexLMTest() and through
//          doRegexLMTestUTF8(), passing the invoking source line to both.
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
    391 
    392 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    393     const UnicodeString pattern(pat, -1, US_INV);
    394     const UnicodeString inputText(text, -1, US_INV);
    395     UErrorCode          status  = U_ZERO_ERROR;
    396     UParseError         pe;
    397     RegexPattern        *REPattern = NULL;
    398     RegexMatcher        *REMatcher = NULL;
    399     UBool               retVal     = TRUE;
    400 
    401     UnicodeString patString(pat, -1, US_INV);
    402     REPattern = RegexPattern::compile(patString, 0, pe, status);
    403     if (U_FAILURE(status)) {
    404         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
    405             line, u_errorName(status));
    406         return FALSE;
    407     }
    408     if (line==376) { REPattern->dumpPattern();}
    409 
    410     UnicodeString inputString(inputText);
    411     UnicodeString unEscapedInput = inputString.unescape();
    412     REMatcher = REPattern->matcher(unEscapedInput, status);
    413     if (U_FAILURE(status)) {
    414         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
    415             line, u_errorName(status));
    416         return FALSE;
    417     }
    418 
    419     UBool actualmatch;
    420     actualmatch = REMatcher->lookingAt(status);
    421     if (U_FAILURE(status)) {
    422         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
    423             line, u_errorName(status));
    424         retVal =  FALSE;
    425     }
    426     if (actualmatch != looking) {
    427         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
    428         retVal = FALSE;
    429     }
    430 
    431     status = U_ZERO_ERROR;
    432     actualmatch = REMatcher->matches(status);
    433     if (U_FAILURE(status)) {
    434         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
    435             line, u_errorName(status));
    436         retVal = FALSE;
    437     }
    438     if (actualmatch != match) {
    439         errln("RegexTest: wrong return from matches() at line %d.\n", line);
    440         retVal = FALSE;
    441     }
    442 
    443     if (retVal == FALSE) {
    444         REPattern->dumpPattern();
    445     }
    446 
    447     delete REPattern;
    448     delete REMatcher;
    449     return retVal;
    450 }
    451 
    452 
    453 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    454     UText               pattern    = UTEXT_INITIALIZER;
    455     int32_t             inputUTF8Length;
    456     char                *textChars = NULL;
    457     UText               inputText  = UTEXT_INITIALIZER;
    458     UErrorCode          status     = U_ZERO_ERROR;
    459     UParseError         pe;
    460     RegexPattern        *REPattern = NULL;
    461     RegexMatcher        *REMatcher = NULL;
    462     UBool               retVal     = TRUE;
    463 
    464     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
    465     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
    466     if (U_FAILURE(status)) {
    467         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
    468             line, u_errorName(status));
    469         return FALSE;
    470     }
    471 
    472     UnicodeString inputString(text, -1, US_INV);
    473     UnicodeString unEscapedInput = inputString.unescape();
    474     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
    475     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
    476 
    477     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
    478     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
    479         // UTF-8 does not allow unpaired surrogates, so this could actually happen
    480         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
    481         return TRUE; // not a failure of the Regex engine
    482     }
    483     status = U_ZERO_ERROR; // buffer overflow
    484     textChars = new char[inputUTF8Length+1];
    485     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
    486     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
    487 
    488     REMatcher = &REPattern->matcher(status)->reset(&inputText);
    489     if (U_FAILURE(status)) {
    490         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
    491             line, u_errorName(status));
    492         return FALSE;
    493     }
    494 
    495     UBool actualmatch;
    496     actualmatch = REMatcher->lookingAt(status);
    497     if (U_FAILURE(status)) {
    498         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
    499             line, u_errorName(status));
    500         retVal =  FALSE;
    501     }
    502     if (actualmatch != looking) {
    503         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
    504         retVal = FALSE;
    505     }
    506 
    507     status = U_ZERO_ERROR;
    508     actualmatch = REMatcher->matches(status);
    509     if (U_FAILURE(status)) {
    510         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
    511             line, u_errorName(status));
    512         retVal = FALSE;
    513     }
    514     if (actualmatch != match) {
    515         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
    516         retVal = FALSE;
    517     }
    518 
    519     if (retVal == FALSE) {
    520         REPattern->dumpPattern();
    521     }
    522 
    523     delete REPattern;
    524     delete REMatcher;
    525     utext_close(&inputText);
    526     utext_close(&pattern);
    527     delete[] textChars;
    528     return retVal;
    529 }
    530 
    531 
    532 
//---------------------------------------------------------------------------
//
//    REGEX_ERR       Macro + invocation function to simplify writing
//                       regex tests for incorrect patterns
//
//       usage:
//          REGEX_ERR("pattern",   expected error line, column, expected status);
//
//       The macro forwards to regex_err(), adding the invoking source line.
//       Its expansion already ends in a semicolon.
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
    543 
    544 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
    545                           UErrorCode expectedStatus, int32_t line) {
    546     UnicodeString       pattern(pat);
    547 
    548     UErrorCode          status         = U_ZERO_ERROR;
    549     UParseError         pe;
    550     RegexPattern        *callerPattern = NULL;
    551 
    552     //
    553     //  Compile the caller's pattern
    554     //
    555     UnicodeString patString(pat);
    556     callerPattern = RegexPattern::compile(patString, 0, pe, status);
    557     if (status != expectedStatus) {
    558         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    559     } else {
    560         if (status != U_ZERO_ERROR) {
    561             if (pe.line != errLine || pe.offset != errCol) {
    562                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
    563                     line, errLine, errCol, pe.line, pe.offset);
    564             }
    565         }
    566     }
    567 
    568     delete callerPattern;
    569 
    570     //
    571     //  Compile again, using a UTF-8-based UText
    572     //
    573     UText patternText = UTEXT_INITIALIZER;
    574     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
    575     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
    576     if (status != expectedStatus) {
    577         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    578     } else {
    579         if (status != U_ZERO_ERROR) {
    580             if (pe.line != errLine || pe.offset != errCol) {
    581                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
    582                     line, errLine, errCol, pe.line, pe.offset);
    583             }
    584         }
    585     }
    586 
    587     delete callerPattern;
    588     utext_close(&patternText);
    589 }
    590 
    591 
    592 
    593 //---------------------------------------------------------------------------
    594 //
    595 //      Basic      Check for basic functionality of regex pattern matching.
    596 //                 Avoid the use of REGEX_FIND test macro, which has
    597 //                 substantial dependencies on basic Regex functionality.
    598 //
    599 //---------------------------------------------------------------------------
    600 void RegexTest::Basic() {
    601 
    602 
    603 //
    604 // Debug - slide failing test cases early
    605 //
    606 #if 0
    607     {
    608         // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
    609         UParseError pe;
    610         UErrorCode  status = U_ZERO_ERROR;
    611         RegexPattern *pattern;
    612         pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
    613         pattern->dumpPattern();
    614         RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
    615         UBool result = m->find();
    616         printf("result = %d\n", result);
    617         // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
    618         // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    619     }
    620     exit(1);
    621 #endif
    622 
    623 
    624     //
    625     // Pattern with parentheses
    626     //
    627     REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
    628     REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
    629     REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
    630 
    631     //
    632     // Patterns with *
    633     //
    634     REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    635     REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    636     REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    637     REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    638     REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
    639 
    640     REGEX_TESTLM("a*", "",  TRUE, TRUE);
    641     REGEX_TESTLM("a*", "b", TRUE, FALSE);
    642 
    643 
    644     //
    645     //  Patterns with "."
    646     //
    647     REGEX_TESTLM(".", "abc", TRUE, FALSE);
    648     REGEX_TESTLM("...", "abc", TRUE, TRUE);
    649     REGEX_TESTLM("....", "abc", FALSE, FALSE);
    650     REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    651     REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    652     REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    653     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    654     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
    655 
    656     //
    657     //  Patterns with * applied to chars at end of literal string
    658     //
    659     REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    660     REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
    661 
    662     //
    663     //  Supplemental chars match as single chars, not a pair of surrogates.
    664     //
    665     REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    666     REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    667     REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
    668 
    669 
    670     //
    671     //  UnicodeSets in the pattern
    672     //
    673     REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    674     REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    675     REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    676     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    677     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    678     REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
    679 
    680     REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    681     REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    682     REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    683     REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
    684     REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
    685 
    686     //
    687     //   OR operator in patterns
    688     //
    689     REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    690     REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    691     REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    692     REGEX_TESTLM("a|b", "b", TRUE, TRUE);
    693 
    694     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    695     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    696     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    697     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    698     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    699     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
    700 
    701     //
    702     //  +
    703     //
    704     REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    705     REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    706     REGEX_TESTLM("b+", "", FALSE, FALSE);
    707     REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    708     REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    709     REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
    710 
    711     //
    712     //   ?
    713     //
    714     REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    715     REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    716     REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    717     REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    718     REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    719     REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    720     REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    721     REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    722     REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
    723 
    724     //
    725     //  Escape sequences that become single literal chars, handled internally
    726     //   by ICU's Unescape.
    727     //
    728 
    729     // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
    730     REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
    731     REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
    732     REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
    733     REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
    734     REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
    735     REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
    736     REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
    737     REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    738     REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
    739 
    740     REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
    741     REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
    742 
    743     // Escape of special chars in patterns
    744     REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
    745 }
    746 
    747 
    748 //---------------------------------------------------------------------------
    749 //
    750 //    UTextBasic   Check for quirks that are specific to the UText
    751 //                 implementation.
    752 //
    753 //---------------------------------------------------------------------------
    754 void RegexTest::UTextBasic() {
            // Builds a matcher from a UText pattern and checks that reset(UText *)
            // reports the expected input text, including when the matcher is reset
            // with the UText returned by its own inputText().
    755     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
    756     UErrorCode status = U_ZERO_ERROR;
    757     UText pattern = UTEXT_INITIALIZER;
    758     utext_openUTF8(&pattern, str_abc, -1, &status);
            // The UText itself lives on the stack; the guard only closes it, and does so
            // on every exit path (REGEX_CHECK_STATUS can leave the function early).
            LocalUTextPointer patternCloser(&pattern);
    759     RegexMatcher matcher(&pattern, 0, status);
    760     REGEX_CHECK_STATUS;
    761 
    762     UText input = UTEXT_INITIALIZER;
    763     utext_openUTF8(&input, str_abc, -1, &status);
            LocalUTextPointer inputCloser(&input);
    764     REGEX_CHECK_STATUS;
    765     matcher.reset(&input);
    766     REGEX_CHECK_STATUS;
    767     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
    768 
            // Resetting with the matcher's own input text must leave that text intact.
    769     matcher.reset(matcher.inputText());
    770     REGEX_CHECK_STATUS;
    771     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
    775 }
    776 
    777 
    778 //---------------------------------------------------------------------------
    779 //
    780 //      API_Match   Test that the API for class RegexMatcher
    781 //                  is present and nominally working, but excluding functions
    782 //                  implementing replace operations.
    783 //
    784 //---------------------------------------------------------------------------
    785 void RegexTest::API_Match() {
            // Each braced section below is an independent scenario exercising one part of
            // the RegexMatcher API. Heap-allocated patterns and matchers are held in
            // LocalPointers so they are released on every exit path, including an early
            // return from a status check.
    786     UParseError         pe;
    787     UErrorCode          status=U_ZERO_ERROR;
    788     int32_t             flags = 0;
    789 
    790     //
    791     // Debug - scratch area: move failing test cases here so that they run early.
    792     //
    793 #if 0
    794     {
    795     }
    796     return;
    797 #endif
    798 
    799     //
    800     // Simple pattern compilation
    801     //
    802     {
    803         UnicodeString       re("abc");
    804         LocalPointer<RegexPattern> pat2;
    805         pat2.adoptInstead(RegexPattern::compile(re, flags, pe, status));
    806         REGEX_CHECK_STATUS;
    807 
    808         UnicodeString inStr1 = "abcdef this is a test";
    809         UnicodeString instr2 = "not abc";
    810         UnicodeString empty  = "";
    811 
    812 
    813         //
    814         // Matcher creation and reset.
    815         //
    816         LocalPointer<RegexMatcher> m1(pat2->matcher(inStr1, status));
    817         REGEX_CHECK_STATUS;
    818         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    819         REGEX_ASSERT(m1->input() == inStr1);
    820         m1->reset(instr2);
    821         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
    822         REGEX_ASSERT(m1->input() == instr2);
    823         m1->reset(inStr1);
    824         REGEX_ASSERT(m1->input() == inStr1);
    825         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    826         m1->reset(empty);
    827         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
    828         REGEX_ASSERT(m1->input() == empty);
    829         REGEX_ASSERT(&m1->pattern() == pat2.getAlias());
    830 
    831         //
    832         //  reset(pos, status)
    833         //
    834         m1->reset(inStr1);
    835         m1->reset(4, status);
    836         REGEX_CHECK_STATUS;
    837         REGEX_ASSERT(m1->input() == inStr1);
    838         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    839 
    840         m1->reset(-1, status);
    841         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    842         status = U_ZERO_ERROR;
    843 
    844         m1->reset(0, status);
    845         REGEX_CHECK_STATUS;
    846         status = U_ZERO_ERROR;
    847 
    848         int32_t len = m1->input().length();
    849         m1->reset(len-1, status);
    850         REGEX_CHECK_STATUS;
    851         status = U_ZERO_ERROR;
    852 
    853         m1->reset(len, status);
    854         REGEX_CHECK_STATUS;
    855         status = U_ZERO_ERROR;
    856 
    857         m1->reset(len+1, status);
    858         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    859         status = U_ZERO_ERROR;
    860 
    861         //
    862         // matches(pos, status)
    863         //
    864         m1->reset(instr2);
    865         REGEX_ASSERT(m1->matches(4, status) == TRUE);
    866         m1->reset();
    867         REGEX_ASSERT(m1->matches(3, status) == FALSE);
    868         m1->reset();
    869         REGEX_ASSERT(m1->matches(5, status) == FALSE);
    870         REGEX_ASSERT(m1->matches(4, status) == TRUE);
    871         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
    872         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    873 
    874         // Match() at end of string should fail, but should not
    875         //  be an error.
    876         status = U_ZERO_ERROR;
    877         len = m1->input().length();
    878         REGEX_ASSERT(m1->matches(len, status) == FALSE);
    879         REGEX_CHECK_STATUS;
    880 
    881         // Match beyond end of string should fail with an error.
    882         status = U_ZERO_ERROR;
    883         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
    884         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    885 
    886         // Successful match at end of string.
    887         {
    888             status = U_ZERO_ERROR;
    889             RegexMatcher m("A?", 0, status);  // will match zero length string.
    890             REGEX_CHECK_STATUS;
    891             m.reset(inStr1);
    892             len = inStr1.length();
    893             REGEX_ASSERT(m.matches(len, status) == TRUE);
    894             REGEX_CHECK_STATUS;
    895             m.reset(empty);
    896             REGEX_ASSERT(m.matches(0, status) == TRUE);
    897             REGEX_CHECK_STATUS;
    898         }
    899 
    900 
    901         //
    902         // lookingAt(pos, status)
    903         //
    904         status = U_ZERO_ERROR;
    905         m1->reset(instr2);  // "not abc"
    906         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
    907         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
    908         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
    909         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
    910         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
    911         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    912         status = U_ZERO_ERROR;
    913         len = m1->input().length();
    914         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
    915         REGEX_CHECK_STATUS;
    916         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
    917         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    921     }
    922 
    923 
    924     //
    925     // Capture Group.
    926     //     RegexMatcher::start();
    927     //     RegexMatcher::end();
    928     //     RegexMatcher::groupCount();
    929     //
    930     {
    931         int32_t             flags=0;
    932         UParseError         pe;
    933         UErrorCode          status=U_ZERO_ERROR;
    934 
    935         UnicodeString       re("01(23(45)67)(.*)");
    936         LocalPointer<RegexPattern> pat(RegexPattern::compile(re, flags, pe, status));
    937         REGEX_CHECK_STATUS;
    938         UnicodeString data = "0123456789";
    939 
    940         LocalPointer<RegexMatcher> matcher(pat->matcher(data, status));
    941         REGEX_CHECK_STATUS;
    942         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
                // Expected start/end offsets, indexed by capture group number (0 = whole match).
    943         static const int32_t matchStarts[] = {0,  2, 4, 8};
    944         static const int32_t matchEnds[]   = {10, 8, 6, 10};
    945         int32_t i;
    946         for (i=0; i<4; i++) {
    947             int32_t actualStart = matcher->start(i, status);
    948             REGEX_CHECK_STATUS;
    949             if (actualStart != matchStarts[i]) {
    950                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
    951                     __LINE__, i, matchStarts[i], actualStart);
    952             }
    953             int32_t actualEnd = matcher->end(i, status);
    954             REGEX_CHECK_STATUS;
    955             if (actualEnd != matchEnds[i]) {
    956                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
    957                     __LINE__, i, matchEnds[i], actualEnd);
    958             }
    959         }
    960 
    961         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
    962         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
    963 
    964         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    965         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
    966         matcher->reset();
    967         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
    968 
    969         matcher->lookingAt(status);
    970         REGEX_ASSERT(matcher->group(status)    == "0123456789");
    971         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
    972         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
    973         REGEX_ASSERT(matcher->group(2, status) == "45"        );
    974         REGEX_ASSERT(matcher->group(3, status) == "89"        );
    975         REGEX_CHECK_STATUS;
    976         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    977         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
    978         matcher->reset();
    979         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
    984     }
    985 
    986     //
    987     //  find
    988     //
    989     {
    990         int32_t             flags=0;
    991         UParseError         pe;
    992         UErrorCode          status=U_ZERO_ERROR;
    993 
    994         UnicodeString       re("abc");
    995         LocalPointer<RegexPattern> pat(RegexPattern::compile(re, flags, pe, status));
    996         REGEX_CHECK_STATUS;
    997         UnicodeString data = ".abc..abc...abc..";
    998         //                    012345678901234567
    999 
   1000         LocalPointer<RegexMatcher> matcher(pat->matcher(data, status));
   1001         REGEX_CHECK_STATUS;
   1002         REGEX_ASSERT(matcher->find());
   1003         REGEX_ASSERT(matcher->start(status) == 1);
   1004         REGEX_ASSERT(matcher->find());
   1005         REGEX_ASSERT(matcher->start(status) == 6);
   1006         REGEX_ASSERT(matcher->find());
   1007         REGEX_ASSERT(matcher->start(status) == 12);
   1008         REGEX_ASSERT(matcher->find() == FALSE);
   1009         REGEX_ASSERT(matcher->find() == FALSE);
   1010 
   1011         matcher->reset();
   1012         REGEX_ASSERT(matcher->find());
   1013         REGEX_ASSERT(matcher->start(status) == 1);
   1014 
   1015         REGEX_ASSERT(matcher->find(0, status));
   1016         REGEX_ASSERT(matcher->start(status) == 1);
   1017         REGEX_ASSERT(matcher->find(1, status));
   1018         REGEX_ASSERT(matcher->start(status) == 1);
   1019         REGEX_ASSERT(matcher->find(2, status));
   1020         REGEX_ASSERT(matcher->start(status) == 6);
   1021         REGEX_ASSERT(matcher->find(12, status));
   1022         REGEX_ASSERT(matcher->start(status) == 12);
   1023         REGEX_ASSERT(matcher->find(13, status) == FALSE);
   1024         REGEX_ASSERT(matcher->find(16, status) == FALSE);
   1025         REGEX_ASSERT(matcher->find(17, status) == FALSE);
   1026         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
   1027 
   1028         status = U_ZERO_ERROR;
   1029         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   1030         status = U_ZERO_ERROR;
   1031         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
   1032 
   1033         REGEX_ASSERT(matcher->groupCount() == 0);
   1037     }
   1038 
   1039 
   1040     //
   1041     //  find, with \G in pattern (true if at the end of a previous match).
   1042     //
   1043     {
   1044         int32_t             flags=0;
   1045         UParseError         pe;
   1046         UErrorCode          status=U_ZERO_ERROR;
   1047 
   1048         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
   1049         LocalPointer<RegexPattern> pat(RegexPattern::compile(re, flags, pe, status));
   1050         REGEX_CHECK_STATUS;
   1051         UnicodeString data = ".abcabc.abc..";
   1052         //                    012345678901234567
   1053 
   1054         LocalPointer<RegexMatcher> matcher(pat->matcher(data, status));
   1055         REGEX_CHECK_STATUS;
   1056         REGEX_ASSERT(matcher->find());
   1057         REGEX_ASSERT(matcher->start(status) == 0);
   1058         REGEX_ASSERT(matcher->start(1, status) == -1);
   1059         REGEX_ASSERT(matcher->start(2, status) == 1);
   1060 
   1061         REGEX_ASSERT(matcher->find());
   1062         REGEX_ASSERT(matcher->start(status) == 4);
   1063         REGEX_ASSERT(matcher->start(1, status) == 4);
   1064         REGEX_ASSERT(matcher->start(2, status) == -1);
   1065         REGEX_CHECK_STATUS;
   1069     }
   1070 
   1071     //
   1072     //   find with zero length matches, match position should bump ahead
   1073     //     to prevent loops.
   1074     //
   1075     {
   1076         int32_t                 i;
   1077         UErrorCode          status=U_ZERO_ERROR;
   1078         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will produce zero-length matches anywhere,
   1079                                                       //   using an always-true look-ahead.
   1080         REGEX_CHECK_STATUS;
   1081         UnicodeString s("    ");
   1082         m.reset(s);
   1083         for (i=0; ; i++) {
   1084             if (m.find() == FALSE) {
   1085                 break;
   1086             }
   1087             REGEX_ASSERT(m.start(status) == i);
   1088             REGEX_ASSERT(m.end(status) == i);
   1089         }
   1090         REGEX_ASSERT(i==5);
   1091 
   1092         // Check that the bump goes over surrogate pairs OK
   1093         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
   1094         s = s.unescape();
   1095         m.reset(s);
                // Each supplementary character occupies two UTF-16 code units, hence i+=2.
   1096         for (i=0; ; i+=2) {
   1097             if (m.find() == FALSE) {
   1098                 break;
   1099             }
   1100             REGEX_ASSERT(m.start(status) == i);
   1101             REGEX_ASSERT(m.end(status) == i);
   1102         }
   1103         REGEX_ASSERT(i==10);
   1104     }
   1105     {
   1106         // find() loop breaking test.
   1107         //        with pattern of /.?/, should see a series of one char matches, then a single
   1108         //        match of zero length at the end of the input string.
   1109         int32_t                 i;
   1110         UErrorCode          status=U_ZERO_ERROR;
   1111         RegexMatcher        m(".?", 0, status);
   1112         REGEX_CHECK_STATUS;
   1113         UnicodeString s("    ");
   1114         m.reset(s);
   1115         for (i=0; ; i++) {
   1116             if (m.find() == FALSE) {
   1117                 break;
   1118             }
   1119             REGEX_ASSERT(m.start(status) == i);
   1120             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
   1121         }
   1122         REGEX_ASSERT(i==5);
   1123     }
   1124 
   1125 
   1126     //
   1127     // Matchers with no input string behave as if they had an empty input string.
   1128     //
   1129 
   1130     {
   1131         UErrorCode status = U_ZERO_ERROR;
   1132         RegexMatcher  m(".?", 0, status);
   1133         REGEX_CHECK_STATUS;
   1134         REGEX_ASSERT(m.find());
   1135         REGEX_ASSERT(m.start(status) == 0);
   1136         REGEX_ASSERT(m.input() == "");
   1137     }
   1138     {
   1139         UErrorCode status = U_ZERO_ERROR;
   1140         LocalPointer<RegexPattern> p(RegexPattern::compile(".", 0, status));
                // Check before dereferencing: a failed compile leaves no pattern to call matcher() on.
                REGEX_CHECK_STATUS;
   1141         LocalPointer<RegexMatcher> m(p->matcher(status));
   1142         REGEX_CHECK_STATUS;
   1143 
   1144         REGEX_ASSERT(m->find() == FALSE);
   1145         REGEX_ASSERT(m->input() == "");
   1148     }
   1149 
   1150     //
   1151     // Regions
   1152     //
   1153     {
   1154         UErrorCode status = U_ZERO_ERROR;
   1155         UnicodeString testString("This is test data");
   1156         RegexMatcher m(".*", testString,  0, status);
   1157         REGEX_CHECK_STATUS;
   1158         REGEX_ASSERT(m.regionStart() == 0);
   1159         REGEX_ASSERT(m.regionEnd() == testString.length());
   1160         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1161         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1162 
   1163         m.region(2,4, status);
   1164         REGEX_CHECK_STATUS;
   1165         REGEX_ASSERT(m.matches(status));
   1166         REGEX_ASSERT(m.start(status)==2);
   1167         REGEX_ASSERT(m.end(status)==4);
   1168         REGEX_CHECK_STATUS;
   1169 
   1170         m.reset();
   1171         REGEX_ASSERT(m.regionStart() == 0);
   1172         REGEX_ASSERT(m.regionEnd() == testString.length());
   1173 
   1174         UnicodeString shorterString("short");
   1175         m.reset(shorterString);
   1176         REGEX_ASSERT(m.regionStart() == 0);
   1177         REGEX_ASSERT(m.regionEnd() == shorterString.length());
   1178 
                // The bounds setters return the matcher itself, and reset() must not
                // change the anchoring / transparent-bounds settings.
   1179         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1180         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
   1181         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   1182         REGEX_ASSERT(&m == &m.reset());
   1183         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   1184 
   1185         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
   1186         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1187         REGEX_ASSERT(&m == &m.reset());
   1188         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1189 
   1190         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1191         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
   1192         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   1193         REGEX_ASSERT(&m == &m.reset());
   1194         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   1195 
   1196         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
   1197         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1198         REGEX_ASSERT(&m == &m.reset());
   1199         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1200 
   1201     }
   1202 
   1203     //
   1204     // hitEnd() and requireEnd()
   1205     //
   1206     {
   1207         UErrorCode status = U_ZERO_ERROR;
   1208         UnicodeString testString("aabb");
   1209         RegexMatcher m1(".*", testString,  0, status);
   1210         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
   1211         REGEX_ASSERT(m1.hitEnd() == TRUE);
   1212         REGEX_ASSERT(m1.requireEnd() == FALSE);
   1213         REGEX_CHECK_STATUS;
   1214 
   1215         status = U_ZERO_ERROR;
   1216         RegexMatcher m2("a*", testString, 0, status);
   1217         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
   1218         REGEX_ASSERT(m2.hitEnd() == FALSE);
   1219         REGEX_ASSERT(m2.requireEnd() == FALSE);
   1220         REGEX_CHECK_STATUS;
   1221 
   1222         status = U_ZERO_ERROR;
   1223         RegexMatcher m3(".*$", testString, 0, status);
   1224         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
   1225         REGEX_ASSERT(m3.hitEnd() == TRUE);
   1226         REGEX_ASSERT(m3.requireEnd() == TRUE);
   1227         REGEX_CHECK_STATUS;
   1228     }
   1229 
   1230 
   1231     //
   1232     // Compilation error on reset with UChar *
   1233     //   These were a hazard that people were stumbling over with runtime errors.
   1234     //   Changed them to compiler errors by adding private methods that more closely
   1235     //   matched the incorrect use of the functions.
   1236     //
   1237 #if 0
   1238     {
   1239         UErrorCode status = U_ZERO_ERROR;
   1240         UChar ucharString[20];
   1241         RegexMatcher m(".", 0, status);
   1242         m.reset(ucharString);  // should not compile.
   1243 
   1244         RegexPattern *p = RegexPattern::compile(".", 0, status);
   1245         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
   1246 
   1247         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
   1248     }
   1249 #endif
   1250 
   1251     //
   1252     //  Time Outs.
   1253     //       Note:  These tests will need to be changed when the regexp engine is
   1254     //              able to detect and cut short the exponential time behavior on
   1255     //              this type of match.
   1256     //
   1257     {
   1258         UErrorCode status = U_ZERO_ERROR;
   1259         //    Enough 'a's in the string to cause the match to time out.
   1260         //       (Each additional 'a' doubles the time)
   1261         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
   1262         RegexMatcher matcher("(a+)+b", testString, 0, status);
   1263         REGEX_CHECK_STATUS;
   1264         REGEX_ASSERT(matcher.getTimeLimit() == 0);
   1265         matcher.setTimeLimit(100, status);
   1266         REGEX_ASSERT(matcher.getTimeLimit() == 100);
   1267         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1268         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
   1269     }
   1270     {
   1271         UErrorCode status = U_ZERO_ERROR;
   1272         //   Few enough 'a's to slip in under the time limit.
   1273         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
   1274         RegexMatcher matcher("(a+)+b", testString, 0, status);
   1275         REGEX_CHECK_STATUS;
   1276         matcher.setTimeLimit(100, status);
   1277         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1278         REGEX_CHECK_STATUS;
   1279     }
   1280 
   1281     //
   1282     //  Stack Limits
   1283     //
   1284     {
   1285         UErrorCode status = U_ZERO_ERROR;
   1286         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
   1287 
   1288         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
   1289         //   of the '+', and makes the stack frames larger.
   1290         RegexMatcher matcher("(A)+A$", testString, 0, status);
   1291 
   1292         // With the default stack, this match should fail to run
   1293         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1294         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
   1295 
   1296         // With unlimited stack, it should run
   1297         status = U_ZERO_ERROR;
   1298         matcher.setStackLimit(0, status);
   1299         REGEX_CHECK_STATUS;
   1300         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
   1301         REGEX_CHECK_STATUS;
   1302         REGEX_ASSERT(matcher.getStackLimit() == 0);
   1303 
   1304         // With a limited stack, the match should fail
   1305         status = U_ZERO_ERROR;
   1306         matcher.setStackLimit(10000, status);
   1307         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1308         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
   1309         REGEX_ASSERT(matcher.getStackLimit() == 10000);
   1310     }
   1311 
   1312         // A pattern that doesn't save state should work with
   1313         //   a minimal sized stack
   1314     {
   1315         UErrorCode status = U_ZERO_ERROR;
   1316         UnicodeString testString = "abc";
   1317         RegexMatcher matcher("abc", testString, 0, status);
   1318         REGEX_CHECK_STATUS;
   1319         matcher.setStackLimit(30, status);
   1320         REGEX_CHECK_STATUS;
   1321         REGEX_ASSERT(matcher.matches(status) == TRUE);
   1322         REGEX_CHECK_STATUS;
   1323         REGEX_ASSERT(matcher.getStackLimit() == 30);
   1324 
   1325         // Negative stack sizes should fail
   1326         status = U_ZERO_ERROR;
   1327         matcher.setStackLimit(1000, status);
   1328         REGEX_CHECK_STATUS;
   1329         matcher.setStackLimit(-1, status);
   1330         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   1331         REGEX_ASSERT(matcher.getStackLimit() == 1000);
   1332     }
   1333 
   1334 
   1335 }
   1336 
   1337 
   1338 
   1339 
   1340 
   1341 
   1342 //---------------------------------------------------------------------------
   1343 //
   1344 //      API_Replace        API test for class RegexMatcher, testing the
   1345 //                         Replace family of functions.
   1346 //
   1347 //---------------------------------------------------------------------------
   1348 void RegexTest::API_Replace() {
            // Exercises replaceFirst(), replaceAll(), appendReplacement() and appendTail().
            // The two pattern/matcher pairs are owned by LocalPointers so that they are
            // released on every exit path, including an early return from a status check.
   1349     //
   1350     //  Replace
   1351     //
   1352     int32_t             flags=0;
   1353     UParseError         pe;
   1354     UErrorCode          status=U_ZERO_ERROR;
   1355 
   1356     UnicodeString       re("abc");
   1357     LocalPointer<RegexPattern> pat(RegexPattern::compile(re, flags, pe, status));
   1358     REGEX_CHECK_STATUS;
   1359     UnicodeString data = ".abc..abc...abc..";
   1360     //                    012345678901234567
   1361     LocalPointer<RegexMatcher> matcher(pat->matcher(data, status));
            // Check before first use: a failed matcher creation must not be dereferenced below.
            REGEX_CHECK_STATUS;
   1362 
   1363     //
   1364     //  Plain vanilla matches.
   1365     //
   1366     UnicodeString  dest;
   1367     dest = matcher->replaceFirst("yz", status);
   1368     REGEX_CHECK_STATUS;
   1369     REGEX_ASSERT(dest == ".yz..abc...abc..");
   1370 
   1371     dest = matcher->replaceAll("yz", status);
   1372     REGEX_CHECK_STATUS;
   1373     REGEX_ASSERT(dest == ".yz..yz...yz..");
   1374 
   1375     //
   1376     //  Plain vanilla non-matches.
   1377     //
   1378     UnicodeString d2 = ".abx..abx...abx..";
   1379     matcher->reset(d2);
   1380     dest = matcher->replaceFirst("yz", status);
   1381     REGEX_CHECK_STATUS;
   1382     REGEX_ASSERT(dest == ".abx..abx...abx..");
   1383 
   1384     dest = matcher->replaceAll("yz", status);
   1385     REGEX_CHECK_STATUS;
   1386     REGEX_ASSERT(dest == ".abx..abx...abx..");
   1387 
   1388     //
   1389     // Empty source string
   1390     //
   1391     UnicodeString d3 = "";
   1392     matcher->reset(d3);
   1393     dest = matcher->replaceFirst("yz", status);
   1394     REGEX_CHECK_STATUS;
   1395     REGEX_ASSERT(dest == "");
   1396 
   1397     dest = matcher->replaceAll("yz", status);
   1398     REGEX_CHECK_STATUS;
   1399     REGEX_ASSERT(dest == "");
   1400 
   1401     //
   1402     // Empty substitution string
   1403     //
   1404     matcher->reset(data);              // ".abc..abc...abc.."
   1405     dest = matcher->replaceFirst("", status);
   1406     REGEX_CHECK_STATUS;
   1407     REGEX_ASSERT(dest == "...abc...abc..");
   1408 
   1409     dest = matcher->replaceAll("", status);
   1410     REGEX_CHECK_STATUS;
   1411     REGEX_ASSERT(dest == "........");
   1412 
   1413     //
   1414     // match whole string
   1415     //
   1416     UnicodeString d4 = "abc";
   1417     matcher->reset(d4);
   1418     dest = matcher->replaceFirst("xyz", status);
   1419     REGEX_CHECK_STATUS;
   1420     REGEX_ASSERT(dest == "xyz");
   1421 
   1422     dest = matcher->replaceAll("xyz", status);
   1423     REGEX_CHECK_STATUS;
   1424     REGEX_ASSERT(dest == "xyz");
   1425 
   1426     //
   1427     // Capture Group, simple case
   1428     //
   1429     UnicodeString       re2("a(..)");
   1430     LocalPointer<RegexPattern> pat2(RegexPattern::compile(re2, flags, pe, status));
   1431     REGEX_CHECK_STATUS;
   1432     UnicodeString d5 = "abcdefg";
   1433     LocalPointer<RegexMatcher> matcher2(pat2->matcher(d5, status));
   1434     REGEX_CHECK_STATUS;
   1435     dest = matcher2->replaceFirst("$1$1", status);
   1436     REGEX_CHECK_STATUS;
   1437     REGEX_ASSERT(dest == "bcbcdefg");
   1438 
   1439     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
   1440     REGEX_CHECK_STATUS;
   1441     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
   1442 
            // A '$' that is not followed by a group reference is expected to be an error.
   1443     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
   1444     REGEX_ASSERT(U_FAILURE(status));
   1445     status = U_ZERO_ERROR;
   1446 
   1447     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
   1448     replacement = replacement.unescape();
   1449     dest = matcher2->replaceFirst(replacement, status);
   1450     REGEX_CHECK_STATUS;
   1451     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
   1452 
   1453     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
   1454 
   1455 
   1456     //
   1457     // Replacement String with \u hex escapes
   1458     //
   1459     {
   1460         UnicodeString  src = "abc 1 abc 2 abc 3";
   1461         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
   1462         matcher->reset(src);
   1463         UnicodeString  result = matcher->replaceAll(substitute, status);
   1464         REGEX_CHECK_STATUS;
   1465         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
   1466     }
   1467     {
   1468         UnicodeString  src = "abc !";
   1469         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
   1470         matcher->reset(src);
   1471         UnicodeString  result = matcher->replaceAll(substitute, status);
   1472         REGEX_CHECK_STATUS;
   1473         UnicodeString expected = UnicodeString("--");
   1474         expected.append((UChar32)0x10000);
   1475         expected.append("-- !");
   1476         REGEX_ASSERT(result == expected);
   1477     }
   1478     // TODO:  need more thorough testing of capture substitutions.
   1479 
   1480     // Bug 4057
   1481     //
   1482     {
   1483         status = U_ZERO_ERROR;
   1484         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
   1485         RegexMatcher m("ss(.*?)ee", 0, status);
   1486         REGEX_CHECK_STATUS;
   1487         UnicodeString result;
   1488 
   1489         // Multiple finds do NOT bump up the previous appendReplacement position.
   1490         m.reset(s);
   1491         m.find();
   1492         m.find();
   1493         m.appendReplacement(result, "ooh", status);
   1494         REGEX_CHECK_STATUS;
   1495         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1496 
   1497         // After a reset into the interior of a string, appendReplacement still starts at beginning.
   1498         status = U_ZERO_ERROR;
   1499         result.truncate(0);
   1500         m.reset(10, status);
   1501         m.find();
   1502         m.find();
   1503         m.appendReplacement(result, "ooh", status);
   1504         REGEX_CHECK_STATUS;
   1505         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1506 
   1507         // find() at interior of string, appendReplacement still starts at beginning.
   1508         status = U_ZERO_ERROR;
   1509         result.truncate(0);
   1510         m.reset();
   1511         m.find(10, status);
   1512         m.find();
   1513         m.appendReplacement(result, "ooh", status);
   1514         REGEX_CHECK_STATUS;
   1515         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1516 
   1517         m.appendTail(result);
   1518         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
   1519 
   1520     }
   1526 }
   1527 
   1528 
   1529 //---------------------------------------------------------------------------
   1530 //
   1531 //      API_Pattern       Test that the API for class RegexPattern is
   1532 //                        present and nominally working.
        //
        //      Covers, in order: default construction and equality, assignment,
        //      copy construction, compile() with and without flags, clone(),
        //      a matcher built from a cloned pattern, the static matches()
        //      convenience function, split() (plain separators, capturing
        //      separators, trailing empty fields, zero-length separators),
        //      pattern(), and the class ID functions.
        //
        //      NOTE(review): REGEX_ASSERT and REGEX_CHECK_STATUS are defined
        //      outside this function; presumably they report a failure and, for
        //      REGEX_CHECK_STATUS, leave the test early -- confirm at the macro
        //      definitions.
   1533 //
   1534 //---------------------------------------------------------------------------
   1535 void RegexTest::API_Pattern() {
   1536     RegexPattern        pata;    // Test default constructor to not crash.
   1537     RegexPattern        patb;
   1538 
   1539     REGEX_ASSERT(pata == patb);
   1540     REGEX_ASSERT(pata == pata);
   1541 
   1542     UnicodeString re1("abc[a-l][m-z]");
   1543     UnicodeString re2("def");
   1544     UErrorCode    status = U_ZERO_ERROR;
   1545     UParseError   pe;
   1546 
   1547     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
   1548     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
   1549     REGEX_CHECK_STATUS;
   1550     REGEX_ASSERT(*pat1 == *pat1);
   1551     REGEX_ASSERT(*pat1 != pata);
   1552 
   1553     // Assign
   1554     patb = *pat1;
   1555     REGEX_ASSERT(patb == *pat1);
   1556 
   1557     // Copy Construct
   1558     RegexPattern patc(*pat1);
   1559     REGEX_ASSERT(patc == *pat1);
   1560     REGEX_ASSERT(patb == patc);
            // Compare the pattern objects, not the pointers: pat1 and pat2 are two
            // separately allocated objects, so a pointer comparison is always true
            // and would verify nothing about RegexPattern::operator!=.
   1561     REGEX_ASSERT(*pat1 != *pat2);
   1562     patb = *pat2;
   1563     REGEX_ASSERT(patb != patc);
   1564     REGEX_ASSERT(patb == *pat2);
   1565 
   1566     // Compile with no flags.
   1567     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
            // Check the compile result before pat1a is dereferenced below.
            REGEX_CHECK_STATUS;
   1568     REGEX_ASSERT(*pat1a == *pat1);
   1569 
   1570     REGEX_ASSERT(pat1a->flags() == 0);
   1571 
   1572     // Compile with different flags should be not equal
   1573     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
   1574     REGEX_CHECK_STATUS;
   1575 
   1576     REGEX_ASSERT(*pat1b != *pat1a);
   1577     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
   1578     REGEX_ASSERT(pat1a->flags() == 0);
   1579     delete pat1b;
   1580 
   1581     // clone
   1582     RegexPattern *pat1c = pat1->clone();
   1583     REGEX_ASSERT(*pat1c == *pat1);
   1584     REGEX_ASSERT(*pat1c != *pat2);
   1585 
   1586     delete pat1c;
   1587     delete pat1a;
   1588     delete pat1;
   1589     delete pat2;
   1590 
   1591 
   1592     //
   1593     //   Verify that a matcher created from a cloned pattern works.
   1594     //     (Jitterbug 3423)
   1595     //
   1596     {
                // This block uses its own status, shadowing the function-level one.
   1597         UErrorCode     status     = U_ZERO_ERROR;
   1598         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
                // Check the compile result before pSource is dereferenced by clone().
                REGEX_CHECK_STATUS;
   1599         RegexPattern  *pClone     = pSource->clone();
                // The source pattern is deleted first, so the matcher below can only
                // be working from the clone.
   1600         delete         pSource;
   1601         RegexMatcher  *mFromClone = pClone->matcher(status);
   1602         REGEX_CHECK_STATUS;
   1603         UnicodeString s = "Hello World";
   1604         mFromClone->reset(s);
   1605         REGEX_ASSERT(mFromClone->find() == TRUE);
   1606         REGEX_ASSERT(mFromClone->group(status) == "Hello");
   1607         REGEX_ASSERT(mFromClone->find() == TRUE);
   1608         REGEX_ASSERT(mFromClone->group(status) == "World");
   1609         REGEX_ASSERT(mFromClone->find() == FALSE);
   1610         delete mFromClone;
   1611         delete pClone;
   1612     }
   1613 
   1614     //
   1615     //   matches convenience API
   1616     //
   1617     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
   1618     REGEX_CHECK_STATUS;
   1619     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
   1620     REGEX_CHECK_STATUS;
   1621     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
   1622     REGEX_CHECK_STATUS;
   1623     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
   1624     REGEX_CHECK_STATUS;
   1625     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
   1626     REGEX_CHECK_STATUS;
            // Enter with a failure already set: matches() is expected to return
            // FALSE even for a pattern that would match, and to leave the incoming
            // error code unchanged.
   1627     status = U_INDEX_OUTOFBOUNDS_ERROR;
   1628     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
   1629     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1630 
   1631 
   1632     //
   1633     // Split()
   1634     //
   1635     status = U_ZERO_ERROR;
   1636     pat1 = RegexPattern::compile(" +",  pe, status);
   1637     REGEX_CHECK_STATUS;
            // The same output array is reused by every split() call below, so slots
            // beyond the returned count may still hold values from earlier calls.
   1638     UnicodeString  fields[10];
   1639 
   1640     int32_t n;
   1641     n = pat1->split("Now is the time", fields, 10, status);
   1642     REGEX_CHECK_STATUS;
   1643     REGEX_ASSERT(n==4);
   1644     REGEX_ASSERT(fields[0]=="Now");
   1645     REGEX_ASSERT(fields[1]=="is");
   1646     REGEX_ASSERT(fields[2]=="the");
   1647     REGEX_ASSERT(fields[3]=="time");
   1648     REGEX_ASSERT(fields[4]=="");
   1649 
            // Capacity 2: the last slot receives the whole unsplit remainder.
   1650     n = pat1->split("Now is the time", fields, 2, status);
   1651     REGEX_CHECK_STATUS;
   1652     REGEX_ASSERT(n==2);
   1653     REGEX_ASSERT(fields[0]=="Now");
   1654     REGEX_ASSERT(fields[1]=="is the time");
   1655     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
   1656 
            // Sentinel in fields[1] checks that a capacity of 1 is not overrun.
   1657     fields[1] = "*";
   1658     status = U_ZERO_ERROR;
   1659     n = pat1->split("Now is the time", fields, 1, status);
   1660     REGEX_CHECK_STATUS;
   1661     REGEX_ASSERT(n==1);
   1662     REGEX_ASSERT(fields[0]=="Now is the time");
   1663     REGEX_ASSERT(fields[1]=="*");
   1664     status = U_ZERO_ERROR;
   1665 
            // Leading and trailing separators each yield an empty field.
   1666     n = pat1->split("    Now       is the time   ", fields, 10, status);
   1667     REGEX_CHECK_STATUS;
   1668     REGEX_ASSERT(n==6);
   1669     REGEX_ASSERT(fields[0]=="");
   1670     REGEX_ASSERT(fields[1]=="Now");
   1671     REGEX_ASSERT(fields[2]=="is");
   1672     REGEX_ASSERT(fields[3]=="the");
   1673     REGEX_ASSERT(fields[4]=="time");
   1674     REGEX_ASSERT(fields[5]=="");
   1675 
            // Input that consists only of the separator.
   1676     n = pat1->split("     ", fields, 10, status);
   1677     REGEX_CHECK_STATUS;
   1678     REGEX_ASSERT(n==2);
   1679     REGEX_ASSERT(fields[0]=="");
   1680     REGEX_ASSERT(fields[1]=="");
   1681 
            // Empty input: zero fields, and the sentinel in fields[0] is untouched.
   1682     fields[0] = "foo";
   1683     n = pat1->split("", fields, 10, status);
   1684     REGEX_CHECK_STATUS;
   1685     REGEX_ASSERT(n==0);
   1686     REGEX_ASSERT(fields[0]=="foo");
   1687 
   1688     delete pat1;
   1689 
   1690     //  split, with a pattern with (capture)
            //  The text matched by the capture group appears as its own output field
            //  between the surrounding pieces.
   1691     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
   1692     REGEX_CHECK_STATUS;
   1693 
   1694     status = U_ZERO_ERROR;
   1695     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
   1696     REGEX_CHECK_STATUS;
   1697     REGEX_ASSERT(n==7);
   1698     REGEX_ASSERT(fields[0]=="");
   1699     REGEX_ASSERT(fields[1]=="a");
   1700     REGEX_ASSERT(fields[2]=="Now is ");
   1701     REGEX_ASSERT(fields[3]=="b");
   1702     REGEX_ASSERT(fields[4]=="the time");
   1703     REGEX_ASSERT(fields[5]=="c");
   1704     REGEX_ASSERT(fields[6]=="");
   1705     REGEX_ASSERT(status==U_ZERO_ERROR);
   1706 
   1707     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
   1708     REGEX_CHECK_STATUS;
   1709     REGEX_ASSERT(n==7);
   1710     REGEX_ASSERT(fields[0]=="  ");
   1711     REGEX_ASSERT(fields[1]=="a");
   1712     REGEX_ASSERT(fields[2]=="Now is ");
   1713     REGEX_ASSERT(fields[3]=="b");
   1714     REGEX_ASSERT(fields[4]=="the time");
   1715     REGEX_ASSERT(fields[5]=="c");
   1716     REGEX_ASSERT(fields[6]=="");
   1717 
            // The following cases shrink the capacity one step at a time; a "foo"
            // sentinel just past the capacity checks that nothing is written there.
   1718     status = U_ZERO_ERROR;
   1719     fields[6] = "foo";
   1720     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
   1721     REGEX_CHECK_STATUS;
   1722     REGEX_ASSERT(n==6);
   1723     REGEX_ASSERT(fields[0]=="  ");
   1724     REGEX_ASSERT(fields[1]=="a");
   1725     REGEX_ASSERT(fields[2]=="Now is ");
   1726     REGEX_ASSERT(fields[3]=="b");
   1727     REGEX_ASSERT(fields[4]=="the time");
   1728     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
   1729     REGEX_ASSERT(fields[6]=="foo");
   1730 
   1731     status = U_ZERO_ERROR;
   1732     fields[5] = "foo";
   1733     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
   1734     REGEX_CHECK_STATUS;
   1735     REGEX_ASSERT(n==5);
   1736     REGEX_ASSERT(fields[0]=="  ");
   1737     REGEX_ASSERT(fields[1]=="a");
   1738     REGEX_ASSERT(fields[2]=="Now is ");
   1739     REGEX_ASSERT(fields[3]=="b");
   1740     REGEX_ASSERT(fields[4]=="the time<c>");
   1741     REGEX_ASSERT(fields[5]=="foo");
   1742 
   1743     status = U_ZERO_ERROR;
   1744     fields[5] = "foo";
   1745     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
   1746     REGEX_CHECK_STATUS;
   1747     REGEX_ASSERT(n==5);
   1748     REGEX_ASSERT(fields[0]=="  ");
   1749     REGEX_ASSERT(fields[1]=="a");
   1750     REGEX_ASSERT(fields[2]=="Now is ");
   1751     REGEX_ASSERT(fields[3]=="b");
   1752     REGEX_ASSERT(fields[4]=="the time");
   1753     REGEX_ASSERT(fields[5]=="foo");
   1754 
   1755     status = U_ZERO_ERROR;
   1756     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
   1757     REGEX_CHECK_STATUS;
   1758     REGEX_ASSERT(n==4);
   1759     REGEX_ASSERT(fields[0]=="  ");
   1760     REGEX_ASSERT(fields[1]=="a");
   1761     REGEX_ASSERT(fields[2]=="Now is ");
   1762     REGEX_ASSERT(fields[3]=="the time<c>");
   1763     status = U_ZERO_ERROR;
   1764     delete pat1;
   1765 
            // Separator that is entirely a capture group: each delimiter character
            // is returned as a field of its own.
   1766     pat1 = RegexPattern::compile("([-,])",  pe, status);
   1767     REGEX_CHECK_STATUS;
   1768     n = pat1->split("1-10,20", fields, 10, status);
   1769     REGEX_CHECK_STATUS;
   1770     REGEX_ASSERT(n==5);
   1771     REGEX_ASSERT(fields[0]=="1");
   1772     REGEX_ASSERT(fields[1]=="-");
   1773     REGEX_ASSERT(fields[2]=="10");
   1774     REGEX_ASSERT(fields[3]==",");
   1775     REGEX_ASSERT(fields[4]=="20");
   1776     delete pat1;
   1777 
   1778     // Test split of string with empty trailing fields
   1779     pat1 = RegexPattern::compile(",", pe, status);
   1780     REGEX_CHECK_STATUS;
   1781     n = pat1->split("a,b,c,", fields, 10, status);
   1782     REGEX_CHECK_STATUS;
   1783     REGEX_ASSERT(n==4);
   1784     REGEX_ASSERT(fields[0]=="a");
   1785     REGEX_ASSERT(fields[1]=="b");
   1786     REGEX_ASSERT(fields[2]=="c");
   1787     REGEX_ASSERT(fields[3]=="");
   1788 
   1789     n = pat1->split("a,,,", fields, 10, status);
   1790     REGEX_CHECK_STATUS;
   1791     REGEX_ASSERT(n==4);
   1792     REGEX_ASSERT(fields[0]=="a");
   1793     REGEX_ASSERT(fields[1]=="");
   1794     REGEX_ASSERT(fields[2]=="");
   1795     REGEX_ASSERT(fields[3]=="");
   1796     delete pat1;
   1797 
   1798     // Split Separator with zero length match.
   1799     pat1 = RegexPattern::compile(":?", pe, status);
   1800     REGEX_CHECK_STATUS;
   1801     n = pat1->split("abc", fields, 10, status);
   1802     REGEX_CHECK_STATUS;
   1803     REGEX_ASSERT(n==5);
   1804     REGEX_ASSERT(fields[0]=="");
   1805     REGEX_ASSERT(fields[1]=="a");
   1806     REGEX_ASSERT(fields[2]=="b");
   1807     REGEX_ASSERT(fields[3]=="c");
   1808     REGEX_ASSERT(fields[4]=="");
   1809 
   1810     delete pat1;
   1811 
   1812     //
   1813     // RegexPattern::pattern()
   1814     //
            // A default-constructed pattern reports an empty pattern string.
   1815     pat1 = new RegexPattern();
   1816     REGEX_ASSERT(pat1->pattern() == "");
   1817     delete pat1;
   1818 
   1819     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
   1820     REGEX_CHECK_STATUS;
   1821     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
   1822     delete pat1;
   1823 
   1824 
   1825     //
   1826     // classID functions
   1827     //
   1828     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
   1829     REGEX_CHECK_STATUS;
   1830     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
   1831     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
   1832     UnicodeString Hello("Hello, world.");
   1833     RegexMatcher *m = pat1->matcher(Hello, status);
            // Check matcher creation before m is dereferenced below.
            REGEX_CHECK_STATUS;
   1834     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
   1835     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
   1836     REGEX_ASSERT(m->getDynamicClassID() != NULL);
   1837     delete m;
   1838     delete pat1;
   1839 
   1840 }
   1841 
   1842 //---------------------------------------------------------------------------
   1843 //
   1844 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
   1845 //                       is present and working, but excluding functions
   1846 //                       implementing replace operations.
   1847 //
   1848 //---------------------------------------------------------------------------
   1849 void RegexTest::API_Match_UTF8() {
   1850     UParseError         pe;
   1851     UErrorCode          status=U_ZERO_ERROR;
   1852     int32_t             flags = 0;
   1853 
   1854     //
   1855     // Debug - slide failing test cases early
   1856     //
   1857 #if 0
   1858     {
   1859     }
   1860     return;
   1861 #endif
   1862 
   1863     //
   1864     // Simple pattern compilation
   1865     //
   1866     {
   1867         UText               re = UTEXT_INITIALIZER;
   1868         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
   1869         REGEX_VERBOSE_TEXT(&re);
   1870         RegexPattern        *pat2;
   1871         pat2 = RegexPattern::compile(&re, flags, pe, status);
   1872         REGEX_CHECK_STATUS;
   1873 
   1874         UText input1 = UTEXT_INITIALIZER;
   1875         UText input2 = UTEXT_INITIALIZER;
   1876         UText empty  = UTEXT_INITIALIZER;
   1877         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
   1878         REGEX_VERBOSE_TEXT(&input1);
   1879         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
   1880         REGEX_VERBOSE_TEXT(&input2);
   1881         utext_openUChars(&empty, NULL, 0, &status);
   1882 
   1883         int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
   1884         int32_t input2Len = strlen("not abc");
   1885 
   1886 
   1887         //
   1888         // Matcher creation and reset.
   1889         //
   1890         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
   1891         REGEX_CHECK_STATUS;
   1892         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1893         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
   1894         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1895         m1->reset(&input2);
   1896         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
   1897         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
   1898         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
   1899         m1->reset(&input1);
   1900         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1901         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1902         m1->reset(&empty);
   1903         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
   1904         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
   1905 
   1906         //
   1907         //  reset(pos, status)
   1908         //
   1909         m1->reset(&input1);
   1910         m1->reset(4, status);
   1911         REGEX_CHECK_STATUS;
   1912         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1913         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1914 
   1915         m1->reset(-1, status);
   1916         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1917         status = U_ZERO_ERROR;
   1918 
   1919         m1->reset(0, status);
   1920         REGEX_CHECK_STATUS;
   1921         status = U_ZERO_ERROR;
   1922 
   1923         m1->reset(input1Len-1, status);
   1924         REGEX_CHECK_STATUS;
   1925         status = U_ZERO_ERROR;
   1926 
   1927         m1->reset(input1Len, status);
   1928         REGEX_CHECK_STATUS;
   1929         status = U_ZERO_ERROR;
   1930 
   1931         m1->reset(input1Len+1, status);
   1932         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1933         status = U_ZERO_ERROR;
   1934 
   1935         //
   1936         // match(pos, status)
   1937         //
   1938         m1->reset(&input2);
   1939         REGEX_ASSERT(m1->matches(4, status) == TRUE);
   1940         m1->reset();
   1941         REGEX_ASSERT(m1->matches(3, status) == FALSE);
   1942         m1->reset();
   1943         REGEX_ASSERT(m1->matches(5, status) == FALSE);
   1944         REGEX_ASSERT(m1->matches(4, status) == TRUE);
   1945         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
   1946         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1947 
   1948         // Match() at end of string should fail, but should not
   1949         //  be an error.
   1950         status = U_ZERO_ERROR;
   1951         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
   1952         REGEX_CHECK_STATUS;
   1953 
   1954         // Match beyond end of string should fail with an error.
   1955         status = U_ZERO_ERROR;
   1956         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
   1957         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1958 
   1959         // Successful match at end of string.
   1960         {
   1961             status = U_ZERO_ERROR;
   1962             RegexMatcher m("A?", 0, status);  // will match zero length string.
   1963             REGEX_CHECK_STATUS;
   1964             m.reset(&input1);
   1965             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
   1966             REGEX_CHECK_STATUS;
   1967             m.reset(&empty);
   1968             REGEX_ASSERT(m.matches(0, status) == TRUE);
   1969             REGEX_CHECK_STATUS;
   1970         }
   1971 
   1972 
   1973         //
   1974         // lookingAt(pos, status)
   1975         //
   1976         status = U_ZERO_ERROR;
   1977         m1->reset(&input2);  // "not abc"
   1978         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
   1979         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
   1980         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
   1981         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
   1982         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
   1983         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1984         status = U_ZERO_ERROR;
   1985         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
   1986         REGEX_CHECK_STATUS;
   1987         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
   1988         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1989 
   1990         delete m1;
   1991         delete pat2;
   1992 
   1993         utext_close(&re);
   1994         utext_close(&input1);
   1995         utext_close(&input2);
   1996         utext_close(&empty);
   1997     }
   1998 
   1999 
   2000     //
   2001     // Capture Group.
   2002     //     RegexMatcher::start();
   2003     //     RegexMatcher::end();
   2004     //     RegexMatcher::groupCount();
   2005     //
   2006     {
   2007         int32_t             flags=0;
   2008         UParseError         pe;
   2009         UErrorCode          status=U_ZERO_ERROR;
   2010         UText               re=UTEXT_INITIALIZER;
   2011         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
   2012         utext_openUTF8(&re, str_01234567_pat, -1, &status);
   2013 
   2014         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2015         REGEX_CHECK_STATUS;
   2016 
   2017         UText input = UTEXT_INITIALIZER;
   2018         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
   2019         utext_openUTF8(&input, str_0123456789, -1, &status);
   2020 
   2021         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   2022         REGEX_CHECK_STATUS;
   2023         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
   2024         static const int32_t matchStarts[] = {0,  2, 4, 8};
   2025         static const int32_t matchEnds[]   = {10, 8, 6, 10};
   2026         int32_t i;
   2027         for (i=0; i<4; i++) {
   2028             int32_t actualStart = matcher->start(i, status);
   2029             REGEX_CHECK_STATUS;
   2030             if (actualStart != matchStarts[i]) {
   2031                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
   2032                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
   2033             }
   2034             int32_t actualEnd = matcher->end(i, status);
   2035             REGEX_CHECK_STATUS;
   2036             if (actualEnd != matchEnds[i]) {
   2037                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
   2038                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
   2039             }
   2040         }
   2041 
   2042         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
   2043         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
   2044 
   2045         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2046         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2047         matcher->reset();
   2048         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
   2049 
   2050         matcher->lookingAt(status);
   2051 
   2052         UnicodeString dest;
   2053         UText destText = UTEXT_INITIALIZER;
   2054         utext_openUnicodeString(&destText, &dest, &status);
   2055         UText *result;
   2056         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
   2057         //  Test shallow-clone API
   2058         int64_t   group_len;
   2059         result = matcher->group((UText *)NULL, group_len, status);
   2060         REGEX_CHECK_STATUS;
   2061         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2062         utext_close(result);
   2063         result = matcher->group(0, &destText, group_len, status);
   2064         REGEX_CHECK_STATUS;
   2065         REGEX_ASSERT(result == &destText);
   2066         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2067         //  destText is now immutable, reopen it
   2068         utext_close(&destText);
   2069         utext_openUnicodeString(&destText, &dest, &status);
   2070 
   2071         int64_t length;
   2072         result = matcher->group(0, NULL, length, status);
   2073         REGEX_CHECK_STATUS;
   2074         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2075         utext_close(result);
   2076         result = matcher->group(0, &destText, length, status);
   2077         REGEX_CHECK_STATUS;
   2078         REGEX_ASSERT(result == &destText);
   2079         REGEX_ASSERT(utext_getNativeIndex(result) == 0);
   2080         REGEX_ASSERT(length == 10);
   2081         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2082 
   2083         // Capture Group 1 == "234567"
   2084         result = matcher->group(1, NULL, length, status);
   2085         REGEX_CHECK_STATUS;
   2086         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
   2087         REGEX_ASSERT(length == 6);
   2088         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2089         utext_close(result);
   2090 
   2091         result = matcher->group(1, &destText, length, status);
   2092         REGEX_CHECK_STATUS;
   2093         REGEX_ASSERT(result == &destText);
   2094         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
   2095         REGEX_ASSERT(length == 6);
   2096         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2097         utext_close(result);
   2098 
   2099         // Capture Group 2 == "45"
   2100         result = matcher->group(2, NULL, length, status);
   2101         REGEX_CHECK_STATUS;
   2102         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
   2103         REGEX_ASSERT(length == 2);
   2104         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2105         utext_close(result);
   2106 
   2107         result = matcher->group(2, &destText, length, status);
   2108         REGEX_CHECK_STATUS;
   2109         REGEX_ASSERT(result == &destText);
   2110         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
   2111         REGEX_ASSERT(length == 2);
   2112         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2113         utext_close(result);
   2114 
   2115         // Capture Group 3 == "89"
   2116         result = matcher->group(3, NULL, length, status);
   2117         REGEX_CHECK_STATUS;
   2118         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
   2119         REGEX_ASSERT(length == 2);
   2120         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2121         utext_close(result);
   2122 
   2123         result = matcher->group(3, &destText, length, status);
   2124         REGEX_CHECK_STATUS;
   2125         REGEX_ASSERT(result == &destText);
   2126         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
   2127         REGEX_ASSERT(length == 2);
   2128         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2129         utext_close(result);
   2130 
   2131         // Capture Group number out of range.
   2132         status = U_ZERO_ERROR;
   2133         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2134         status = U_ZERO_ERROR;
   2135         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2136         status = U_ZERO_ERROR;
   2137         matcher->reset();
   2138         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
   2139 
   2140         delete matcher;
   2141         delete pat;
   2142 
   2143         utext_close(&destText);
   2144         utext_close(&input);
   2145         utext_close(&re);
   2146     }
   2147 
   2148     //
   2149     //  find
   2150     //
   2151     {
   2152         int32_t             flags=0;
   2153         UParseError         pe;
   2154         UErrorCode          status=U_ZERO_ERROR;
   2155         UText               re=UTEXT_INITIALIZER;
   2156         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2157         utext_openUTF8(&re, str_abc, -1, &status);
   2158 
   2159         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2160         REGEX_CHECK_STATUS;
   2161         UText input = UTEXT_INITIALIZER;
   2162         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
   2163         utext_openUTF8(&input, str_abcabcabc, -1, &status);
   2164         //                      012345678901234567
   2165 
   2166         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   2167         REGEX_CHECK_STATUS;
   2168         REGEX_ASSERT(matcher->find());
   2169         REGEX_ASSERT(matcher->start(status) == 1);
   2170         REGEX_ASSERT(matcher->find());
   2171         REGEX_ASSERT(matcher->start(status) == 6);
   2172         REGEX_ASSERT(matcher->find());
   2173         REGEX_ASSERT(matcher->start(status) == 12);
   2174         REGEX_ASSERT(matcher->find() == FALSE);
   2175         REGEX_ASSERT(matcher->find() == FALSE);
   2176 
   2177         matcher->reset();
   2178         REGEX_ASSERT(matcher->find());
   2179         REGEX_ASSERT(matcher->start(status) == 1);
   2180 
   2181         REGEX_ASSERT(matcher->find(0, status));
   2182         REGEX_ASSERT(matcher->start(status) == 1);
   2183         REGEX_ASSERT(matcher->find(1, status));
   2184         REGEX_ASSERT(matcher->start(status) == 1);
   2185         REGEX_ASSERT(matcher->find(2, status));
   2186         REGEX_ASSERT(matcher->start(status) == 6);
   2187         REGEX_ASSERT(matcher->find(12, status));
   2188         REGEX_ASSERT(matcher->start(status) == 12);
   2189         REGEX_ASSERT(matcher->find(13, status) == FALSE);
   2190         REGEX_ASSERT(matcher->find(16, status) == FALSE);
   2191         REGEX_ASSERT(matcher->find(17, status) == FALSE);
   2192         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
   2193 
   2194         status = U_ZERO_ERROR;
   2195         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2196         status = U_ZERO_ERROR;
   2197         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2198 
   2199         REGEX_ASSERT(matcher->groupCount() == 0);
   2200 
   2201         delete matcher;
   2202         delete pat;
   2203 
   2204         utext_close(&input);
   2205         utext_close(&re);
   2206     }
   2207 
   2208 
   2209     //
   2210     //  find, with \G in pattern (true if at the end of a previous match).
   2211     //
   2212     {
   2213         int32_t             flags=0;
   2214         UParseError         pe;
   2215         UErrorCode          status=U_ZERO_ERROR;
   2216         UText               re=UTEXT_INITIALIZER;
   2217         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
   2218         utext_openUTF8(&re, str_Gabcabc, -1, &status);
   2219 
   2220         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2221 
   2222         REGEX_CHECK_STATUS;
   2223         UText input = UTEXT_INITIALIZER;
   2224         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
   2225         utext_openUTF8(&input, str_abcabcabc, -1, &status);
   2226         //                      012345678901234567
   2227 
   2228         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   2229         REGEX_CHECK_STATUS;
   2230         REGEX_ASSERT(matcher->find());
   2231         REGEX_ASSERT(matcher->start(status) == 0);
   2232         REGEX_ASSERT(matcher->start(1, status) == -1);
   2233         REGEX_ASSERT(matcher->start(2, status) == 1);
   2234 
   2235         REGEX_ASSERT(matcher->find());
   2236         REGEX_ASSERT(matcher->start(status) == 4);
   2237         REGEX_ASSERT(matcher->start(1, status) == 4);
   2238         REGEX_ASSERT(matcher->start(2, status) == -1);
   2239         REGEX_CHECK_STATUS;
   2240 
   2241         delete matcher;
   2242         delete pat;
   2243 
   2244         utext_close(&input);
   2245         utext_close(&re);
   2246     }
   2247 
   2248     //
   2249     //   find with zero length matches, match position should bump ahead
   2250     //     to prevent loops.
   2251     //
   2252     {
   2253         int32_t                 i;
   2254         UErrorCode          status=U_ZERO_ERROR;
   2255         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
   2256                                                       //   using an always-true look-ahead.
   2257         REGEX_CHECK_STATUS;
   2258         UText s = UTEXT_INITIALIZER;
   2259         utext_openUTF8(&s, "    ", -1, &status);
   2260         m.reset(&s);
   2261         for (i=0; ; i++) {
   2262             if (m.find() == FALSE) {
   2263                 break;
   2264             }
   2265             REGEX_ASSERT(m.start(status) == i);
   2266             REGEX_ASSERT(m.end(status) == i);
   2267         }
   2268         REGEX_ASSERT(i==5);
   2269 
   2270         // Check that the bump goes over characters outside the BMP OK
   2271         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
   2272         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
   2273         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
   2274         m.reset(&s);
   2275         for (i=0; ; i+=4) {
   2276             if (m.find() == FALSE) {
   2277                 break;
   2278             }
   2279             REGEX_ASSERT(m.start(status) == i);
   2280             REGEX_ASSERT(m.end(status) == i);
   2281         }
   2282         REGEX_ASSERT(i==20);
   2283 
   2284         utext_close(&s);
   2285     }
   2286     {
   2287         // find() loop breaking test.
   2288         //        with pattern of /.?/, should see a series of one char matches, then a single
   2289         //        match of zero length at the end of the input string.
   2290         int32_t                 i;
   2291         UErrorCode          status=U_ZERO_ERROR;
   2292         RegexMatcher        m(".?", 0, status);
   2293         REGEX_CHECK_STATUS;
   2294         UText s = UTEXT_INITIALIZER;
   2295         utext_openUTF8(&s, "    ", -1, &status);
   2296         m.reset(&s);
   2297         for (i=0; ; i++) {
   2298             if (m.find() == FALSE) {
   2299                 break;
   2300             }
   2301             REGEX_ASSERT(m.start(status) == i);
   2302             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
   2303         }
   2304         REGEX_ASSERT(i==5);
   2305 
   2306         utext_close(&s);
   2307     }
   2308 
   2309 
   2310     //
   2311     // Matchers with no input string behave as if they had an empty input string.
   2312     //
   2313 
   2314     {
   2315         UErrorCode status = U_ZERO_ERROR;
   2316         RegexMatcher  m(".?", 0, status);
   2317         REGEX_CHECK_STATUS;
   2318         REGEX_ASSERT(m.find());
   2319         REGEX_ASSERT(m.start(status) == 0);
   2320         REGEX_ASSERT(m.input() == "");
   2321     }
   2322     {
   2323         UErrorCode status = U_ZERO_ERROR;
   2324         RegexPattern  *p = RegexPattern::compile(".", 0, status);
   2325         RegexMatcher  *m = p->matcher(status);
   2326         REGEX_CHECK_STATUS;
   2327 
   2328         REGEX_ASSERT(m->find() == FALSE);
   2329         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
   2330         delete m;
   2331         delete p;
   2332     }
   2333 
   2334     //
   2335     // Regions
   2336     //
   2337     {
   2338         UErrorCode status = U_ZERO_ERROR;
   2339         UText testPattern = UTEXT_INITIALIZER;
   2340         UText testText    = UTEXT_INITIALIZER;
   2341         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
   2342         REGEX_VERBOSE_TEXT(&testPattern);
   2343         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
   2344         REGEX_VERBOSE_TEXT(&testText);
   2345 
   2346         RegexMatcher m(&testPattern, &testText, 0, status);
   2347         REGEX_CHECK_STATUS;
   2348         REGEX_ASSERT(m.regionStart() == 0);
   2349         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
   2350         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2351         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2352 
   2353         m.region(2,4, status);
   2354         REGEX_CHECK_STATUS;
   2355         REGEX_ASSERT(m.matches(status));
   2356         REGEX_ASSERT(m.start(status)==2);
   2357         REGEX_ASSERT(m.end(status)==4);
   2358         REGEX_CHECK_STATUS;
   2359 
   2360         m.reset();
   2361         REGEX_ASSERT(m.regionStart() == 0);
   2362         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
   2363 
   2364         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
   2365         REGEX_VERBOSE_TEXT(&testText);
   2366         m.reset(&testText);
   2367         REGEX_ASSERT(m.regionStart() == 0);
   2368         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
   2369 
   2370         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2371         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
   2372         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   2373         REGEX_ASSERT(&m == &m.reset());
   2374         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   2375 
   2376         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
   2377         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2378         REGEX_ASSERT(&m == &m.reset());
   2379         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2380 
   2381         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2382         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
   2383         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   2384         REGEX_ASSERT(&m == &m.reset());
   2385         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   2386 
   2387         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
   2388         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2389         REGEX_ASSERT(&m == &m.reset());
   2390         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2391 
   2392         utext_close(&testText);
   2393         utext_close(&testPattern);
   2394     }
   2395 
   2396     //
   2397     // hitEnd() and requireEnd()
   2398     //
   2399     {
   2400         UErrorCode status = U_ZERO_ERROR;
   2401         UText testPattern = UTEXT_INITIALIZER;
   2402         UText testText    = UTEXT_INITIALIZER;
   2403         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
   2404         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
   2405         utext_openUTF8(&testPattern, str_, -1, &status);
   2406         utext_openUTF8(&testText, str_aabb, -1, &status);
   2407 
   2408         RegexMatcher m1(&testPattern, &testText,  0, status);
   2409         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
   2410         REGEX_ASSERT(m1.hitEnd() == TRUE);
   2411         REGEX_ASSERT(m1.requireEnd() == FALSE);
   2412         REGEX_CHECK_STATUS;
   2413 
   2414         status = U_ZERO_ERROR;
   2415         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
   2416         utext_openUTF8(&testPattern, str_a, -1, &status);
   2417         RegexMatcher m2(&testPattern, &testText, 0, status);
   2418         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
   2419         REGEX_ASSERT(m2.hitEnd() == FALSE);
   2420         REGEX_ASSERT(m2.requireEnd() == FALSE);
   2421         REGEX_CHECK_STATUS;
   2422 
   2423         status = U_ZERO_ERROR;
   2424         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
   2425         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
   2426         RegexMatcher m3(&testPattern, &testText, 0, status);
   2427         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
   2428         REGEX_ASSERT(m3.hitEnd() == TRUE);
   2429         REGEX_ASSERT(m3.requireEnd() == TRUE);
   2430         REGEX_CHECK_STATUS;
   2431 
   2432         utext_close(&testText);
   2433         utext_close(&testPattern);
   2434     }
   2435 }
   2436 
   2437 
   2438 //---------------------------------------------------------------------------
   2439 //
   2440 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
   2441 //                         Replace family of functions (replaceFirst, replaceAll,
   2442 //                         appendReplacement, appendTail) through their UText overloads.
   2443 //---------------------------------------------------------------------------
   2444 void RegexTest::API_Replace_UTF8() {
   2445     //
   2446     //  Replace.  Pattern, input and replacement text are all supplied as UText objects.  Most replace calls are made
   2447     //  twice: with a NULL destination (the test closes the returned UText) and with &destText (asserted to be returned).
   2448     int32_t             flags=0;                  // 0: no option flags passed to compile()
   2449     UParseError         pe;
   2450     UErrorCode          status=U_ZERO_ERROR;
   2451     // NOTE(review): the REGEX_* macros are defined outside this function and appear to use the local named 'status' - confirm before renaming.
   2452     UText               re=UTEXT_INITIALIZER;
   2453     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
   2454     REGEX_VERBOSE_TEXT(&re);
   2455     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);   // pattern /abc/, used by 'matcher' below
   2456     REGEX_CHECK_STATUS;
   2457     // Test strings are written as explicit byte values; the trailing /* */ comment on each shows the text they spell.
   2458     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
   2459     //             012345678901234567
   2460     UText dataText = UTEXT_INITIALIZER;
   2461     utext_openUTF8(&dataText, data, -1, &status);
   2462     REGEX_CHECK_STATUS;
   2463     REGEX_VERBOSE_TEXT(&dataText);
   2464     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);   // matcher for /abc/, input set to dataText
   2465 
   2466     //
   2467     //  Plain vanilla matches.
   2468     //
   2469     UnicodeString  dest;
   2470     UText destText = UTEXT_INITIALIZER;
   2471     utext_openUnicodeString(&destText, &dest, &status);   // destText is a UText over 'dest'; reused as the caller-supplied destination
   2472     UText *result;                                        // return value of the replace call under test
   2473 
   2474     UText replText = UTEXT_INITIALIZER;                   // replacement text; re-opened on new data for each scenario
   2475 
   2476     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
   2477     utext_openUTF8(&replText, str_yz, -1, &status);
   2478     REGEX_VERBOSE_TEXT(&replText);
   2479     result = matcher->replaceFirst(&replText, NULL, status);
   2480     REGEX_CHECK_STATUS;
   2481     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
   2482     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
   2483     utext_close(result);                                  // result came from the NULL-destination call; the test closes it
   2484     result = matcher->replaceFirst(&replText, &destText, status);
   2485     REGEX_CHECK_STATUS;
   2486     REGEX_ASSERT(result == &destText);
   2487     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
   2488 
   2489     result = matcher->replaceAll(&replText, NULL, status);
   2490     REGEX_CHECK_STATUS;
   2491     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
   2492     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
   2493     utext_close(result);
   2494 
   2495     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);   // empty destText before it is used as a destination again
   2496     result = matcher->replaceAll(&replText, &destText, status);
   2497     REGEX_CHECK_STATUS;
   2498     REGEX_ASSERT(result == &destText);
   2499     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
   2500 
   2501     //
   2502     //  Plain vanilla non-matches.  /abc/ does not occur in the input, so the expected output equals the input.
   2503     //
   2504     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
   2505     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
   2506     matcher->reset(&dataText);
   2507 
   2508     result = matcher->replaceFirst(&replText, NULL, status);
   2509     REGEX_CHECK_STATUS;
   2510     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2511     utext_close(result);
   2512     result = matcher->replaceFirst(&replText, &destText, status);
   2513     REGEX_CHECK_STATUS;
   2514     REGEX_ASSERT(result == &destText);
   2515     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2516 
   2517     result = matcher->replaceAll(&replText, NULL, status);
   2518     REGEX_CHECK_STATUS;
   2519     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2520     utext_close(result);
   2521     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2522     result = matcher->replaceAll(&replText, &destText, status);
   2523     REGEX_CHECK_STATUS;
   2524     REGEX_ASSERT(result == &destText);
   2525     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2526 
   2527     //
   2528     // Empty source string.  dataText is opened on a NULL pointer with length 0; the expected output is "".
   2529     //
   2530     utext_openUTF8(&dataText, NULL, 0, &status);
   2531     matcher->reset(&dataText);
   2532 
   2533     result = matcher->replaceFirst(&replText, NULL, status);
   2534     REGEX_CHECK_STATUS;
   2535     REGEX_ASSERT_UTEXT_UTF8("", result);
   2536     utext_close(result);
   2537     result = matcher->replaceFirst(&replText, &destText, status);
   2538     REGEX_CHECK_STATUS;
   2539     REGEX_ASSERT(result == &destText);
   2540     REGEX_ASSERT_UTEXT_UTF8("", result);
   2541 
   2542     result = matcher->replaceAll(&replText, NULL, status);
   2543     REGEX_CHECK_STATUS;
   2544     REGEX_ASSERT_UTEXT_UTF8("", result);
   2545     utext_close(result);
   2546     result = matcher->replaceAll(&replText, &destText, status);
   2547     REGEX_CHECK_STATUS;
   2548     REGEX_ASSERT(result == &destText);
   2549     REGEX_ASSERT_UTEXT_UTF8("", result);
   2550 
   2551     //
   2552     // Empty substitution string.  Each replaced "abc" is expected to disappear from the output.
   2553     //
   2554     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
   2555     matcher->reset(&dataText);
   2556 
   2557     utext_openUTF8(&replText, NULL, 0, &status);   // replacement opened on a NULL pointer with length 0
   2558     result = matcher->replaceFirst(&replText, NULL, status);
   2559     REGEX_CHECK_STATUS;
   2560     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
   2561     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
   2562     utext_close(result);
   2563     result = matcher->replaceFirst(&replText, &destText, status);
   2564     REGEX_CHECK_STATUS;
   2565     REGEX_ASSERT(result == &destText);
   2566     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
   2567 
   2568     result = matcher->replaceAll(&replText, NULL, status);
   2569     REGEX_CHECK_STATUS;
   2570     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
   2571     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
   2572     utext_close(result);
   2573     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2574     result = matcher->replaceAll(&replText, &destText, status);
   2575     REGEX_CHECK_STATUS;
   2576     REGEX_ASSERT(result == &destText);
   2577     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
   2578 
   2579     //
   2580     // match whole string: input "abc" against /abc/, so the expected output is exactly the replacement "xyz".
   2581     //
   2582     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2583     utext_openUTF8(&dataText, str_abc, -1, &status);
   2584     matcher->reset(&dataText);
   2585 
   2586     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
   2587     utext_openUTF8(&replText, str_xyz, -1, &status);
   2588     result = matcher->replaceFirst(&replText, NULL, status);
   2589     REGEX_CHECK_STATUS;
   2590     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2591     utext_close(result);
   2592     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2593     result = matcher->replaceFirst(&replText, &destText, status);
   2594     REGEX_CHECK_STATUS;
   2595     REGEX_ASSERT(result == &destText);
   2596     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2597 
   2598     result = matcher->replaceAll(&replText, NULL, status);
   2599     REGEX_CHECK_STATUS;
   2600     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2601     utext_close(result);
   2602     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2603     result = matcher->replaceAll(&replText, &destText, status);
   2604     REGEX_CHECK_STATUS;
   2605     REGEX_ASSERT(result == &destText);
   2606     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2607 
   2608     //
   2609     // Capture Group, simple case.  A second pattern, a(..), is run against "abcdefg" with replacements that use $1.
   2610     //
   2611     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
   2612     utext_openUTF8(&re, str_add, -1, &status);
   2613     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
   2614     REGEX_CHECK_STATUS;
   2615 
   2616     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
   2617     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
   2618     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
   2619     REGEX_CHECK_STATUS;
   2620 
   2621     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
   2622     utext_openUTF8(&replText, str_11, -1, &status);
   2623     result = matcher2->replaceFirst(&replText, NULL, status);
   2624     REGEX_CHECK_STATUS;
   2625     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
   2626     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
   2627     utext_close(result);
   2628     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2629     result = matcher2->replaceFirst(&replText, &destText, status);
   2630     REGEX_CHECK_STATUS;
   2631     REGEX_ASSERT(result == &destText);
   2632     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
   2633     // Backslash-escaped "\$1" is expected to come out as the literal text "$1"; the unescaped "$1" as the group text.
   2634     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
   2635     utext_openUTF8(&replText, str_v, -1, &status);
   2636     REGEX_VERBOSE_TEXT(&replText);
   2637     result = matcher2->replaceFirst(&replText, NULL, status);
   2638     REGEX_CHECK_STATUS;
   2639     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
   2640     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
   2641     utext_close(result);
   2642     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2643     result = matcher2->replaceFirst(&replText, &destText, status);
   2644     REGEX_CHECK_STATUS;
   2645     REGEX_ASSERT(result == &destText);
   2646     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
   2647     // Escaped dollar signs that are not followed by a group number: each "\$" is expected to yield a plain "$".
   2648     const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
   2649                0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
   2650                0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
   2651     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
   2652     result = matcher2->replaceFirst(&replText, NULL, status);
   2653     REGEX_CHECK_STATUS;
   2654     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
   2655     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
   2656     utext_close(result);
   2657     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2658     result = matcher2->replaceFirst(&replText, &destText, status);
   2659     REGEX_CHECK_STATUS;
   2660     REGEX_ASSERT(result == &destText);
   2661     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
   2662     // '$' followed by a supplementary-plane digit: the expected output below has the group 1 text ("bc") in its place.
   2663     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
   2664     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
   2665     //                                 012345678901234567890123456
   2666     supplDigitChars[22] = 0xF0;   // bytes 22..25 (the "xxxx" placeholder) become F0 9D 9F 8F, the UTF-8 form of U+1D7CF
   2667     supplDigitChars[23] = 0x9D;
   2668     supplDigitChars[24] = 0x9F;
   2669     supplDigitChars[25] = 0x8F;
   2670     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
   2671 
   2672     result = matcher2->replaceFirst(&replText, NULL, status);
   2673     REGEX_CHECK_STATUS;
   2674     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
   2675     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
   2676     utext_close(result);
   2677     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2678     result = matcher2->replaceFirst(&replText, &destText, status);
   2679     REGEX_CHECK_STATUS;
   2680     REGEX_ASSERT(result == &destText);
   2681     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
   2682     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5... */
   2683     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
   2684     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);   // "$5", but a(..) has a single group; 'result' is assigned inside the macro argument
   2685 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
   2686     utext_close(result);
   2687     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2688     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
   2689     REGEX_ASSERT(result == &destText);
   2690 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
   2691 
   2692     //
   2693     // Replacement String with \u hex escapes.  These two cases go back to 'matcher' (pattern /abc/).
   2694     //
   2695     {
   2696       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
   2697       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
   2698         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
   2699         utext_openUTF8(&replText, str_u0043, -1, &status);
   2700         matcher->reset(&dataText);
   2701 
   2702         result = matcher->replaceAll(&replText, NULL, status);
   2703         REGEX_CHECK_STATUS;
   2704         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
   2705         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
   2706         utext_close(result);
   2707         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2708         result = matcher->replaceAll(&replText, &destText, status);
   2709         REGEX_CHECK_STATUS;
   2710         REGEX_ASSERT(result == &destText);
   2711         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
   2712     }
   2713     {
   2714       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */   // shadows the function-level str_abc ("abc")
   2715         utext_openUTF8(&dataText, str_abc, -1, &status);
   2716         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
   2717         utext_openUTF8(&replText, str_U00010000, -1, &status);
   2718         matcher->reset(&dataText);
   2719 
   2720         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
   2721         //                          0123456789
   2722         expected[2] = 0xF0;   // bytes 2..5 (the "xxxx" placeholder) become F0 90 80 80, the UTF-8 form of U+10000
   2723         expected[3] = 0x90;
   2724         expected[4] = 0x80;
   2725         expected[5] = 0x80;
   2726 
   2727         result = matcher->replaceAll(&replText, NULL, status);
   2728         REGEX_CHECK_STATUS;
   2729         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
   2730         utext_close(result);
   2731         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2732         result = matcher->replaceAll(&replText, &destText, status);
   2733         REGEX_CHECK_STATUS;
   2734         REGEX_ASSERT(result == &destText);
   2735         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
   2736     }
   2737     // TODO:  need more thorough testing of capture substitutions.
   2738 
   2739     // Bug 4057
   2740     //   appendReplacement() is expected to copy input from the start of the string, whatever find()/reset() calls came before it.
   2741     {
   2742         status = U_ZERO_ERROR;
   2743 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
   2744 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
   2745 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
   2746         utext_openUTF8(&re, str_ssee, -1, &status);
   2747         utext_openUTF8(&dataText, str_blah, -1, &status);
   2748         utext_openUTF8(&replText, str_ooh, -1, &status);
   2749 
   2750         RegexMatcher m(&re, 0, status);
   2751         REGEX_CHECK_STATUS;
   2752 
   2753         UnicodeString result;                      // NOTE: shadows the outer 'UText *result' for the rest of this block
   2754         UText resultText = UTEXT_INITIALIZER;
   2755         utext_openUnicodeString(&resultText, &result, &status);   // resultText is a UText over the local 'result' string
   2756 
   2757         // Multiple finds do NOT bump up the previous appendReplacement position.
   2758         m.reset(&dataText);
   2759         m.find();
   2760         m.find();
   2761         m.appendReplacement(&resultText, &replText, status);
   2762         REGEX_CHECK_STATUS;
   2763         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2764         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
   2765 
   2766         // After a reset into the interior of a string, appendReplacement still starts at beginning.
   2767         status = U_ZERO_ERROR;
   2768         result.truncate(0);                        // clear the output string, then re-open resultText on it
   2769         utext_openUnicodeString(&resultText, &result, &status);
   2770         m.reset(10, status);
   2771         m.find();
   2772         m.find();
   2773         m.appendReplacement(&resultText, &replText, status);
   2774         REGEX_CHECK_STATUS;
   2775         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2776         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
   2777 
   2778         // find() at interior of string, appendReplacement still starts at beginning.
   2779         status = U_ZERO_ERROR;
   2780         result.truncate(0);
   2781         utext_openUnicodeString(&resultText, &result, &status);
   2782         m.reset();
   2783         m.find(10, status);
   2784         m.find();
   2785         m.appendReplacement(&resultText, &replText, status);
   2786         REGEX_CHECK_STATUS;
   2787         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2788         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
   2789 
   2790         m.appendTail(&resultText, status);         // expected to add the rest of the input after the last match (" fin")
   2791         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
   2792         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
   2793 
   2794         utext_close(&resultText);
   2795     }
   2796     // Release the matchers and patterns created above, then close the stack-allocated UTexts.
   2797     delete matcher2;
   2798     delete pat2;
   2799     delete matcher;
   2800     delete pat;
   2801 
   2802     utext_close(&dataText);
   2803     utext_close(&replText);
   2804     utext_close(&destText);
   2805     utext_close(&re);
   2806 }
   2807 
   2808 
   2809 //---------------------------------------------------------------------------
   2810 //
   2811 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
   2812 //                        present and nominally working.
   2813 //
   2814 //---------------------------------------------------------------------------
   2815 void RegexTest::API_Pattern_UTF8() {
   2816     RegexPattern        pata;    // Test default constructor to not crash.
   2817     RegexPattern        patb;
   2818 
   2819     REGEX_ASSERT(pata == patb);
   2820     REGEX_ASSERT(pata == pata);
   2821 
   2822     UText         re1 = UTEXT_INITIALIZER;
   2823     UText         re2 = UTEXT_INITIALIZER;
   2824     UErrorCode    status = U_ZERO_ERROR;
   2825     UParseError   pe;
   2826 
   2827     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
   2828     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
   2829     utext_openUTF8(&re1, str_abcalmz, -1, &status);
   2830     utext_openUTF8(&re2, str_def, -1, &status);
   2831 
   2832     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
   2833     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
   2834     REGEX_CHECK_STATUS;
   2835     REGEX_ASSERT(*pat1 == *pat1);
   2836     REGEX_ASSERT(*pat1 != pata);
   2837 
   2838     // Assign
   2839     patb = *pat1;
   2840     REGEX_ASSERT(patb == *pat1);
   2841 
   2842     // Copy Construct
   2843     RegexPattern patc(*pat1);
   2844     REGEX_ASSERT(patc == *pat1);
   2845     REGEX_ASSERT(patb == patc);
   2846     REGEX_ASSERT(pat1 != pat2);
   2847     patb = *pat2;
   2848     REGEX_ASSERT(patb != patc);
   2849     REGEX_ASSERT(patb == *pat2);
   2850 
   2851     // Compile with no flags.
   2852     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
   2853     REGEX_ASSERT(*pat1a == *pat1);
   2854 
   2855     REGEX_ASSERT(pat1a->flags() == 0);
   2856 
   2857     // Compile with different flags should be not equal
   2858     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
   2859     REGEX_CHECK_STATUS;
   2860 
   2861     REGEX_ASSERT(*pat1b != *pat1a);
   2862     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
   2863     REGEX_ASSERT(pat1a->flags() == 0);
   2864     delete pat1b;
   2865 
   2866     // clone
   2867     RegexPattern *pat1c = pat1->clone();
   2868     REGEX_ASSERT(*pat1c == *pat1);
   2869     REGEX_ASSERT(*pat1c != *pat2);
   2870 
   2871     delete pat1c;
   2872     delete pat1a;
   2873     delete pat1;
   2874     delete pat2;
   2875 
   2876     utext_close(&re1);
   2877     utext_close(&re2);
   2878 
   2879 
   2880     //
   2881     //   Verify that a matcher created from a cloned pattern works.
   2882     //     (Jitterbug 3423)
   2883     //
   2884     {
   2885         UErrorCode     status     = U_ZERO_ERROR;
   2886         UText          pattern    = UTEXT_INITIALIZER;
   2887         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
   2888         utext_openUTF8(&pattern, str_pL, -1, &status);
   2889 
   2890         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
   2891         RegexPattern  *pClone     = pSource->clone();
   2892         delete         pSource;
   2893         RegexMatcher  *mFromClone = pClone->matcher(status);
   2894         REGEX_CHECK_STATUS;
   2895 
   2896         UText          input      = UTEXT_INITIALIZER;
   2897         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
   2898         utext_openUTF8(&input, str_HelloWorld, -1, &status);
   2899         mFromClone->reset(&input);
   2900         REGEX_ASSERT(mFromClone->find() == TRUE);
   2901         REGEX_ASSERT(mFromClone->group(status) == "Hello");
   2902         REGEX_ASSERT(mFromClone->find() == TRUE);
   2903         REGEX_ASSERT(mFromClone->group(status) == "World");
   2904         REGEX_ASSERT(mFromClone->find() == FALSE);
   2905         delete mFromClone;
   2906         delete pClone;
   2907 
   2908         utext_close(&input);
   2909         utext_close(&pattern);
   2910     }
   2911 
   2912     //
   2913     //   matches convenience API
   2914     //
   2915     {
   2916         UErrorCode status  = U_ZERO_ERROR;
   2917         UText      pattern = UTEXT_INITIALIZER;
   2918         UText      input   = UTEXT_INITIALIZER;
   2919 
   2920         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
   2921         utext_openUTF8(&input, str_randominput, -1, &status);
   2922 
   2923         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
   2924         utext_openUTF8(&pattern, str_dotstar, -1, &status);
   2925         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
   2926         REGEX_CHECK_STATUS;
   2927 
   2928         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2929         utext_openUTF8(&pattern, str_abc, -1, &status);
   2930         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
   2931         REGEX_CHECK_STATUS;
   2932 
   2933         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
   2934         utext_openUTF8(&pattern, str_nput, -1, &status);
   2935         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
   2936         REGEX_CHECK_STATUS;
   2937 
   2938         utext_openUTF8(&pattern, str_randominput, -1, &status);
   2939         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
   2940         REGEX_CHECK_STATUS;
   2941 
   2942         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
   2943         utext_openUTF8(&pattern, str_u, -1, &status);
   2944         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
   2945         REGEX_CHECK_STATUS;
   2946 
   2947         utext_openUTF8(&input, str_abc, -1, &status);
   2948         utext_openUTF8(&pattern, str_abc, -1, &status);
   2949         status = U_INDEX_OUTOFBOUNDS_ERROR;
   2950         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
   2951         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   2952 
   2953         utext_close(&input);
   2954         utext_close(&pattern);
   2955     }
   2956 
   2957 
   2958     //
   2959     // Split()
   2960     //
   2961     status = U_ZERO_ERROR;
   2962     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
   2963     utext_openUTF8(&re1, str_spaceplus, -1, &status);
   2964     pat1 = RegexPattern::compile(&re1, pe, status);
   2965     REGEX_CHECK_STATUS;
   2966     UnicodeString  fields[10];
   2967 
   2968     int32_t n;
   2969     n = pat1->split("Now is the time", fields, 10, status);
   2970     REGEX_CHECK_STATUS;
   2971     REGEX_ASSERT(n==4);
   2972     REGEX_ASSERT(fields[0]=="Now");
   2973     REGEX_ASSERT(fields[1]=="is");
   2974     REGEX_ASSERT(fields[2]=="the");
   2975     REGEX_ASSERT(fields[3]=="time");
   2976     REGEX_ASSERT(fields[4]=="");
   2977 
   2978     n = pat1->split("Now is the time", fields, 2, status);
   2979     REGEX_CHECK_STATUS;
   2980     REGEX_ASSERT(n==2);
   2981     REGEX_ASSERT(fields[0]=="Now");
   2982     REGEX_ASSERT(fields[1]=="is the time");
   2983     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
   2984 
   2985     fields[1] = "*";
   2986     status = U_ZERO_ERROR;
   2987     n = pat1->split("Now is the time", fields, 1, status);
   2988     REGEX_CHECK_STATUS;
   2989     REGEX_ASSERT(n==1);
   2990     REGEX_ASSERT(fields[0]=="Now is the time");
   2991     REGEX_ASSERT(fields[1]=="*");
   2992     status = U_ZERO_ERROR;
   2993 
   2994     n = pat1->split("    Now       is the time   ", fields, 10, status);
   2995     REGEX_CHECK_STATUS;
   2996     REGEX_ASSERT(n==6);
   2997     REGEX_ASSERT(fields[0]=="");
   2998     REGEX_ASSERT(fields[1]=="Now");
   2999     REGEX_ASSERT(fields[2]=="is");
   3000     REGEX_ASSERT(fields[3]=="the");
   3001     REGEX_ASSERT(fields[4]=="time");
   3002     REGEX_ASSERT(fields[5]=="");
   3003     REGEX_ASSERT(fields[6]=="");
   3004 
   3005     fields[2] = "*";
   3006     n = pat1->split("     ", fields, 10, status);
   3007     REGEX_CHECK_STATUS;
   3008     REGEX_ASSERT(n==2);
   3009     REGEX_ASSERT(fields[0]=="");
   3010     REGEX_ASSERT(fields[1]=="");
   3011     REGEX_ASSERT(fields[2]=="*");
   3012 
   3013     fields[0] = "foo";
   3014     n = pat1->split("", fields, 10, status);
   3015     REGEX_CHECK_STATUS;
   3016     REGEX_ASSERT(n==0);
   3017     REGEX_ASSERT(fields[0]=="foo");
   3018 
   3019     delete pat1;
   3020 
   3021     //  split, with a pattern with (capture)
   3022     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
   3023     pat1 = RegexPattern::compile(&re1,  pe, status);
   3024     REGEX_CHECK_STATUS;
   3025 
   3026     status = U_ZERO_ERROR;
   3027     fields[6] = fields[7] = "*";
   3028     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
   3029     REGEX_CHECK_STATUS;
   3030     REGEX_ASSERT(n==7);
   3031     REGEX_ASSERT(fields[0]=="");
   3032     REGEX_ASSERT(fields[1]=="a");
   3033     REGEX_ASSERT(fields[2]=="Now is ");
   3034     REGEX_ASSERT(fields[3]=="b");
   3035     REGEX_ASSERT(fields[4]=="the time");
   3036     REGEX_ASSERT(fields[5]=="c");
   3037     REGEX_ASSERT(fields[6]=="");
   3038     REGEX_ASSERT(fields[7]=="*");
   3039     REGEX_ASSERT(status==U_ZERO_ERROR);
   3040 
   3041     fields[6] = fields[7] = "*";
   3042     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
   3043     REGEX_CHECK_STATUS;
   3044     REGEX_ASSERT(n==7);
   3045     REGEX_ASSERT(fields[0]=="  ");
   3046     REGEX_ASSERT(fields[1]=="a");
   3047     REGEX_ASSERT(fields[2]=="Now is ");
   3048     REGEX_ASSERT(fields[3]=="b");
   3049     REGEX_ASSERT(fields[4]=="the time");
   3050     REGEX_ASSERT(fields[5]=="c");
   3051     REGEX_ASSERT(fields[6]=="");
   3052     REGEX_ASSERT(fields[7]=="*");
   3053 
   3054     status = U_ZERO_ERROR;
   3055     fields[6] = "foo";
   3056     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
   3057     REGEX_CHECK_STATUS;
   3058     REGEX_ASSERT(n==6);
   3059     REGEX_ASSERT(fields[0]=="  ");
   3060     REGEX_ASSERT(fields[1]=="a");
   3061     REGEX_ASSERT(fields[2]=="Now is ");
   3062     REGEX_ASSERT(fields[3]=="b");
   3063     REGEX_ASSERT(fields[4]=="the time");
   3064     REGEX_ASSERT(fields[5]==" ");
   3065     REGEX_ASSERT(fields[6]=="foo");
   3066 
   3067     status = U_ZERO_ERROR;
   3068     fields[5] = "foo";
   3069     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
   3070     REGEX_CHECK_STATUS;
   3071     REGEX_ASSERT(n==5);
   3072     REGEX_ASSERT(fields[0]=="  ");
   3073     REGEX_ASSERT(fields[1]=="a");
   3074     REGEX_ASSERT(fields[2]=="Now is ");
   3075     REGEX_ASSERT(fields[3]=="b");
   3076     REGEX_ASSERT(fields[4]=="the time<c>");
   3077     REGEX_ASSERT(fields[5]=="foo");
   3078 
   3079     status = U_ZERO_ERROR;
   3080     fields[5] = "foo";
   3081     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
   3082     REGEX_CHECK_STATUS;
   3083     REGEX_ASSERT(n==5);
   3084     REGEX_ASSERT(fields[0]=="  ");
   3085     REGEX_ASSERT(fields[1]=="a");
   3086     REGEX_ASSERT(fields[2]=="Now is ");
   3087     REGEX_ASSERT(fields[3]=="b");
   3088     REGEX_ASSERT(fields[4]=="the time");
   3089     REGEX_ASSERT(fields[5]=="foo");
   3090 
   3091     status = U_ZERO_ERROR;
   3092     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
   3093     REGEX_CHECK_STATUS;
   3094     REGEX_ASSERT(n==4);
   3095     REGEX_ASSERT(fields[0]=="  ");
   3096     REGEX_ASSERT(fields[1]=="a");
   3097     REGEX_ASSERT(fields[2]=="Now is ");
   3098     REGEX_ASSERT(fields[3]=="the time<c>");
   3099     status = U_ZERO_ERROR;
   3100     delete pat1;
   3101 
   3102     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
   3103     pat1 = RegexPattern::compile(&re1, pe, status);
   3104     REGEX_CHECK_STATUS;
   3105     n = pat1->split("1-10,20", fields, 10, status);
   3106     REGEX_CHECK_STATUS;
   3107     REGEX_ASSERT(n==5);
   3108     REGEX_ASSERT(fields[0]=="1");
   3109     REGEX_ASSERT(fields[1]=="-");
   3110     REGEX_ASSERT(fields[2]=="10");
   3111     REGEX_ASSERT(fields[3]==",");
   3112     REGEX_ASSERT(fields[4]=="20");
   3113     delete pat1;
   3114 
   3115 
   3116     //
   3117     // split of a UText based string, with library allocating output UTexts.
   3118     //
   3119     {
   3120         status = U_ZERO_ERROR;
   3121         RegexMatcher matcher(UnicodeString("(:)"), 0, status);
   3122         UnicodeString stringToSplit("first:second:third");
   3123         UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
   3124         REGEX_CHECK_STATUS;
   3125 
   3126         UText *splits[10] = {NULL};
   3127         int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
   3128         REGEX_CHECK_STATUS;
   3129         REGEX_ASSERT(numFields == 5);
   3130         REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
   3131         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
   3132         REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
   3133         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
   3134         REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
   3135         REGEX_ASSERT(splits[5] == NULL);
   3136 
   3137         for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
   3138             if (splits[i]) {
   3139                 utext_close(splits[i]);
   3140                 splits[i] = NULL;
   3141             }
   3142         }
   3143         utext_close(textToSplit);
   3144     }
   3145 
   3146 
   3147     //
   3148     // RegexPattern::pattern() and patternText()
   3149     //
   3150     pat1 = new RegexPattern();
   3151     REGEX_ASSERT(pat1->pattern() == "");
   3152     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
   3153     delete pat1;
   3154     const char *helloWorldInvariant = "(Hello, world)*";
   3155     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
   3156     pat1 = RegexPattern::compile(&re1, pe, status);
   3157     REGEX_CHECK_STATUS;
   3158     REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
   3159     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
   3160     delete pat1;
   3161 
   3162     utext_close(&re1);
   3163 }
   3164 
   3165 
   3166 //---------------------------------------------------------------------------
   3167 //
   3168 //      Extended       A more thorough check for features of regex patterns
   3169 //                     The test cases are in a separate data file,
   3170 //                       source/tests/testdata/regextst.txt
   3171 //                     A description of the test data format is included in that file.
   3172 //
   3173 //---------------------------------------------------------------------------
   3174 
   3175 const char *
   3176 RegexTest::getPath(char buffer[2048], const char *filename) {
   3177     UErrorCode status=U_ZERO_ERROR;
   3178     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   3179     if (U_FAILURE(status)) {
   3180         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
   3181         return NULL;
   3182     }
   3183 
   3184     strcpy(buffer, testDataDirectory);
   3185     strcat(buffer, filename);
   3186     return buffer;
   3187 }
   3188 
   3189 void RegexTest::Extended() {
   3190     char tdd[2048];
   3191     const char *srcPath;
   3192     UErrorCode  status  = U_ZERO_ERROR;
   3193     int32_t     lineNum = 0;
   3194 
   3195     //
   3196     //  Open and read the test data file.
   3197     //
   3198     srcPath=getPath(tdd, "regextst.txt");
   3199     if(srcPath==NULL) {
   3200         return; /* something went wrong, error already output */
   3201     }
   3202 
   3203     int32_t    len;
   3204     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
   3205     if (U_FAILURE(status)) {
   3206         return; /* something went wrong, error already output */
   3207     }
   3208 
   3209     //
   3210     //  Put the test data into a UnicodeString
   3211     //
   3212     UnicodeString testString(FALSE, testData, len);
   3213 
   3214     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
   3215     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
   3216     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
   3217 
   3218     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
   3219     UnicodeString   testPattern;   // The pattern for test from the test file.
   3220     UnicodeString   testFlags;     // the flags   for a test.
   3221     UnicodeString   matchString;   // The marked up string to be used as input
   3222 
   3223     if (U_FAILURE(status)){
   3224         dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
   3225         delete [] testData;
   3226         return;
   3227     }
   3228 
   3229     //
   3230     //  Loop over the test data file, once per line.
   3231     //
   3232     while (lineMat.find()) {
   3233         lineNum++;
   3234         if (U_FAILURE(status)) {
   3235           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
   3236         }
   3237 
   3238         status = U_ZERO_ERROR;
   3239         UnicodeString testLine = lineMat.group(1, status);
   3240         if (testLine.length() == 0) {
   3241             continue;
   3242         }
   3243 
   3244         //
   3245         // Parse the test line.  Skip blank and comment only lines.
   3246         // Separate out the three main fields - pattern, flags, target.
   3247         //
   3248 
   3249         commentMat.reset(testLine);
   3250         if (commentMat.lookingAt(status)) {
   3251             // This line is a comment, or blank.
   3252             continue;
   3253         }
   3254 
   3255         //
   3256         //  Pull out the pattern field, remove it from the test file line.
   3257         //
   3258         quotedStuffMat.reset(testLine);
   3259         if (quotedStuffMat.lookingAt(status)) {
   3260             testPattern = quotedStuffMat.group(2, status);
   3261             testLine.remove(0, quotedStuffMat.end(0, status));
   3262         } else {
   3263             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
   3264             continue;
   3265         }
   3266 
   3267 
   3268         //
   3269         //  Pull out the flags from the test file line.
   3270         //
   3271         flagsMat.reset(testLine);
   3272         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
   3273         testFlags = flagsMat.group(1, status);
   3274         if (flagsMat.group(2, status).length() > 0) {
   3275             errln("Bad Match flag at line %d. Scanning %c\n",
   3276                 lineNum, flagsMat.group(2, status).charAt(0));
   3277             continue;
   3278         }
   3279         testLine.remove(0, flagsMat.end(0, status));
   3280 
   3281         //
   3282         //  Pull out the match string, as a whole.
   3283         //    We'll process the <tags> later.
   3284         //
   3285         quotedStuffMat.reset(testLine);
   3286         if (quotedStuffMat.lookingAt(status)) {
   3287             matchString = quotedStuffMat.group(2, status);
   3288             testLine.remove(0, quotedStuffMat.end(0, status));
   3289         } else {
   3290             errln("Bad match string at test file line %d", lineNum);
   3291             continue;
   3292         }
   3293 
   3294         //
   3295         //  The only thing left from the input line should be an optional trailing comment.
   3296         //
   3297         commentMat.reset(testLine);
   3298         if (commentMat.lookingAt(status) == FALSE) {
   3299             errln("Line %d: unexpected characters at end of test line.", lineNum);
   3300             continue;
   3301         }
   3302 
   3303         //
   3304         //  Run the test
   3305         //
   3306         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
   3307     }
   3308 
   3309     delete [] testData;
   3310 
   3311 }
   3312 
   3313 
   3314 
//---------------------------------------------------------------------------
//
//    regex_find(pattern, flags, inputString, srcPath, lineNumber)
//
//         Function to run a single test from the Extended (data driven) tests.
//         See file test/testdata/regextst.txt for a description of the
//         pattern and inputString fields, and the allowed flags.
//         srcPath is the path of the test data file, used in messages.
//         lineNumber is the source line in regextst.txt of the test.
//
//---------------------------------------------------------------------------
   3325 
   3326 
   3327 //  Set a value into a UVector at position specified by a decimal number in
   3328 //   a UnicodeString.   This is a utility function needed by the actual test function,
   3329 //   which follows.
   3330 static void set(UVector &vec, int32_t val, UnicodeString index) {
   3331     UErrorCode  status=U_ZERO_ERROR;
   3332     int32_t  idx = 0;
   3333     for (int32_t i=0; i<index.length(); i++) {
   3334         int32_t d=u_charDigitValue(index.charAt(i));
   3335         if (d<0) {return;}
   3336         idx = idx*10 + d;
   3337     }
   3338     while (vec.size()<idx+1) {vec.addElement(-1, status);}
   3339     vec.setElementAt(val, idx);
   3340 }
   3341 
   3342 static void setInt(UVector &vec, int32_t val, int32_t idx) {
   3343     UErrorCode  status=U_ZERO_ERROR;
   3344     while (vec.size()<idx+1) {vec.addElement(-1, status);}
   3345     vec.setElementAt(val, idx);
   3346 }
   3347 
   3348 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
   3349 {
   3350     UBool couldFind = TRUE;
   3351     UTEXT_SETNATIVEINDEX(utext, 0);
   3352     int32_t i = 0;
   3353     while (i < unistrOffset) {
   3354         UChar32 c = UTEXT_NEXT32(utext);
   3355         if (c != U_SENTINEL) {
   3356             i += U16_LENGTH(c);
   3357         } else {
   3358             couldFind = FALSE;
   3359             break;
   3360         }
   3361     }
   3362     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
   3363     return couldFind;
   3364 }
   3365 
   3366 
   3367 void RegexTest::regex_find(const UnicodeString &pattern,
   3368                            const UnicodeString &flags,
   3369                            const UnicodeString &inputString,
   3370                            const char *srcPath,
   3371                            int32_t line) {
   3372     UnicodeString       unEscapedInput;
   3373     UnicodeString       deTaggedInput;
   3374 
   3375     int32_t             patternUTF8Length,      inputUTF8Length;
   3376     char                *patternChars  = NULL, *inputChars = NULL;
   3377     UText               patternText    = UTEXT_INITIALIZER;
   3378     UText               inputText      = UTEXT_INITIALIZER;
   3379     UConverter          *UTF8Converter = NULL;
   3380 
   3381     UErrorCode          status         = U_ZERO_ERROR;
   3382     UParseError         pe;
   3383     RegexPattern        *parsePat      = NULL;
   3384     RegexMatcher        *parseMatcher  = NULL;
   3385     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
   3386     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
   3387     UVector             groupStarts(status);
   3388     UVector             groupEnds(status);
   3389     UVector             groupStartsUTF8(status);
   3390     UVector             groupEndsUTF8(status);
   3391     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
   3392     UBool               failed         = FALSE;
   3393     int32_t             numFinds;
   3394     int32_t             i;
   3395     UBool               useMatchesFunc   = FALSE;
   3396     UBool               useLookingAtFunc = FALSE;
   3397     int32_t             regionStart      = -1;
   3398     int32_t             regionEnd        = -1;
   3399     int32_t             regionStartUTF8  = -1;
   3400     int32_t             regionEndUTF8    = -1;
   3401 
   3402 
   3403     //
   3404     //  Compile the caller's pattern
   3405     //
   3406     uint32_t bflags = 0;
   3407     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
   3408         bflags |= UREGEX_CASE_INSENSITIVE;
   3409     }
   3410     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
   3411         bflags |= UREGEX_COMMENTS;
   3412     }
   3413     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
   3414         bflags |= UREGEX_DOTALL;
   3415     }
   3416     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
   3417         bflags |= UREGEX_MULTILINE;
   3418     }
   3419 
   3420     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
   3421         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
   3422     }
   3423     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
   3424         bflags |= UREGEX_UNIX_LINES;
   3425     }
   3426     if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
   3427         bflags |= UREGEX_LITERAL;
   3428     }
   3429 
   3430 
   3431     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
   3432     if (status != U_ZERO_ERROR) {
   3433         #if UCONFIG_NO_BREAK_ITERATION==1
   3434         // 'v' test flag means that the test pattern should not compile if ICU was configured
   3435         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
   3436         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
   3437             goto cleanupAndReturn;
   3438         }
   3439         #endif
   3440         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
   3441             // Expected pattern compilation error.
   3442             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
   3443                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
   3444             }
   3445             goto cleanupAndReturn;
   3446         } else {
   3447             // Unexpected pattern compilation error.
   3448             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
   3449             goto cleanupAndReturn;
   3450         }
   3451     }
   3452 
   3453     UTF8Converter = ucnv_open("UTF8", &status);
   3454     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
   3455 
   3456     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
   3457     status = U_ZERO_ERROR; // buffer overflow
   3458     patternChars = new char[patternUTF8Length+1];
   3459     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
   3460     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
   3461 
   3462     if (status == U_ZERO_ERROR) {
   3463         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
   3464 
   3465         if (status != U_ZERO_ERROR) {
   3466 #if UCONFIG_NO_BREAK_ITERATION==1
   3467             // 'v' test flag means that the test pattern should not compile if ICU was configured
   3468             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
   3469             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
   3470                 goto cleanupAndReturn;
   3471             }
   3472 #endif
   3473             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
   3474                 // Expected pattern compilation error.
   3475                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
   3476                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
   3477                 }
   3478                 goto cleanupAndReturn;
   3479             } else {
   3480                 // Unexpected pattern compilation error.
   3481                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
   3482                 goto cleanupAndReturn;
   3483             }
   3484         }
   3485     }
   3486 
   3487     if (UTF8Pattern == NULL) {
   3488         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
   3489         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
   3490         status = U_ZERO_ERROR;
   3491     }
   3492 
   3493     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
   3494         callerPattern->dumpPattern();
   3495     }
   3496 
   3497     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
   3498         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
   3499         goto cleanupAndReturn;
   3500     }
   3501 
   3502 
   3503     //
   3504     // Number of times find() should be called on the test string, default to 1
   3505     //
   3506     numFinds = 1;
   3507     for (i=2; i<=9; i++) {
   3508         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
   3509             if (numFinds != 1) {
   3510                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
   3511                 goto cleanupAndReturn;
   3512             }
   3513             numFinds = i;
   3514         }
   3515     }
   3516 
   3517     // 'M' flag.  Use matches() instead of find()
   3518     if (flags.indexOf((UChar)0x4d) >= 0) {
   3519         useMatchesFunc = TRUE;
   3520     }
   3521     if (flags.indexOf((UChar)0x4c) >= 0) {
   3522         useLookingAtFunc = TRUE;
   3523     }
   3524 
   3525     //
   3526     //  Find the tags in the input data, remove them, and record the group boundary
   3527     //    positions.
   3528     //
   3529     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
   3530     REGEX_CHECK_STATUS_L(line);
   3531 
   3532     unEscapedInput = inputString.unescape();
   3533     parseMatcher = parsePat->matcher(unEscapedInput, status);
   3534     REGEX_CHECK_STATUS_L(line);
   3535     while(parseMatcher->find()) {
   3536         parseMatcher->appendReplacement(deTaggedInput, "", status);
   3537         REGEX_CHECK_STATUS;
   3538         UnicodeString groupNum = parseMatcher->group(2, status);
   3539         if (groupNum == "r") {
   3540             // <r> or </r>, a region specification within the string
   3541             if (parseMatcher->group(1, status) == "/") {
   3542                 regionEnd = deTaggedInput.length();
   3543             } else {
   3544                 regionStart = deTaggedInput.length();
   3545             }
   3546         } else {
   3547             // <digits> or </digits>, a group match boundary tag.
   3548             if (parseMatcher->group(1, status) == "/") {
   3549                 set(groupEnds, deTaggedInput.length(), groupNum);
   3550             } else {
   3551                 set(groupStarts, deTaggedInput.length(), groupNum);
   3552             }
   3553         }
   3554     }
   3555     parseMatcher->appendTail(deTaggedInput);
   3556     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
   3557     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
   3558       errln("mismatched <r> tags");
   3559       failed = TRUE;
   3560       goto cleanupAndReturn;
   3561     }
   3562 
   3563     //
   3564     //  Configure the matcher according to the flags specified with this test.
   3565     //
   3566     matcher = callerPattern->matcher(deTaggedInput, status);
   3567     REGEX_CHECK_STATUS_L(line);
   3568     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
   3569         matcher->setTrace(TRUE);
   3570     }
   3571 
   3572     if (UTF8Pattern != NULL) {
   3573         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
   3574         status = U_ZERO_ERROR; // buffer overflow
   3575         inputChars = new char[inputUTF8Length+1];
   3576         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
   3577         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
   3578 
   3579         if (status == U_ZERO_ERROR) {
   3580             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
   3581             REGEX_CHECK_STATUS_L(line);
   3582         }
   3583 
   3584         if (UTF8Matcher == NULL) {
   3585             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
   3586           logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
   3587             status = U_ZERO_ERROR;
   3588         }
   3589     }
   3590 
   3591     //
   3592     //  Generate native indices for UTF8 versions of region and capture group info
   3593     //
   3594     if (UTF8Matcher != NULL) {
   3595         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
   3596         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
   3597 
   3598         //  Fill out the native index UVector info.
   3599         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
   3600         for (i=0; i<groupStarts.size(); i++) {
   3601             int32_t  start = groupStarts.elementAti(i);
   3602             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
   3603             if (start >= 0) {
   3604                 int32_t  startUTF8;
   3605                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
   3606                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
   3607                     failed = TRUE;
   3608                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3609                 }
   3610                 setInt(groupStartsUTF8, startUTF8, i);
   3611             }
   3612 
   3613             int32_t  end = groupEnds.elementAti(i);
   3614             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
   3615             if (end >= 0) {
   3616                 int32_t  endUTF8;
   3617                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
   3618                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
   3619                     failed = TRUE;
   3620                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3621                 }
   3622                 setInt(groupEndsUTF8, endUTF8, i);
   3623             }
   3624         }
   3625     }
   3626 
   3627     if (regionStart>=0) {
   3628        matcher->region(regionStart, regionEnd, status);
   3629        REGEX_CHECK_STATUS_L(line);
   3630        if (UTF8Matcher != NULL) {
   3631            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
   3632            REGEX_CHECK_STATUS_L(line);
   3633        }
   3634     }
   3635     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
   3636         matcher->useAnchoringBounds(FALSE);
   3637         if (UTF8Matcher != NULL) {
   3638             UTF8Matcher->useAnchoringBounds(FALSE);
   3639         }
   3640     }
   3641     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
   3642         matcher->useTransparentBounds(TRUE);
   3643         if (UTF8Matcher != NULL) {
   3644             UTF8Matcher->useTransparentBounds(TRUE);
   3645         }
   3646     }
   3647 
   3648 
   3649 
   3650     //
   3651     // Do a find on the de-tagged input using the caller's pattern
   3652     //     TODO: error on count>1 and not find().
   3653     //           error on both matches() and lookingAt().
   3654     //
   3655     for (i=0; i<numFinds; i++) {
   3656         if (useMatchesFunc) {
   3657             isMatch = matcher->matches(status);
   3658             if (UTF8Matcher != NULL) {
   3659                isUTF8Match = UTF8Matcher->matches(status);
   3660             }
   3661         } else  if (useLookingAtFunc) {
   3662             isMatch = matcher->lookingAt(status);
   3663             if (UTF8Matcher != NULL) {
   3664                 isUTF8Match = UTF8Matcher->lookingAt(status);
   3665             }
   3666         } else {
   3667             isMatch = matcher->find();
   3668             if (UTF8Matcher != NULL) {
   3669                 isUTF8Match = UTF8Matcher->find();
   3670             }
   3671         }
   3672     }
   3673     matcher->setTrace(FALSE);
   3674     if (U_FAILURE(status)) {
   3675         errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
   3676     }
   3677 
   3678     //
   3679     // Match up the groups from the find() with the groups from the tags
   3680     //
   3681 
   3682     // number of tags should match number of groups from find operation.
   3683     // matcher->groupCount does not include group 0, the entire match, hence the +1.
   3684     //   G option in test means that capture group data is not available in the
   3685     //     expected results, so the check needs to be suppressed.
   3686     if (isMatch == FALSE && groupStarts.size() != 0) {
   3687         dataerrln("Error at line %d:  Match expected, but none found.", line);
   3688         failed = TRUE;
   3689         goto cleanupAndReturn;
   3690     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
   3691         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
   3692         failed = TRUE;
   3693         goto cleanupAndReturn;
   3694     }
   3695 
   3696     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
   3697         // Only check for match / no match.  Don't check capture groups.
   3698         if (isMatch && groupStarts.size() == 0) {
   3699             errln("Error at line %d:  No match expected, but one found.", line);
   3700             failed = TRUE;
   3701         } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
   3702             errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
   3703             failed = TRUE;
   3704         }
   3705         goto cleanupAndReturn;
   3706     }
   3707 
   3708     REGEX_CHECK_STATUS_L(line);
   3709     for (i=0; i<=matcher->groupCount(); i++) {
   3710         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
   3711         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
   3712         if (matcher->start(i, status) != expectedStart) {
   3713             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
   3714                 line, i, expectedStart, matcher->start(i, status));
   3715             failed = TRUE;
   3716             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3717         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
   3718             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
   3719                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
   3720             failed = TRUE;
   3721             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3722         }
   3723 
   3724         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
   3725         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
   3726         if (matcher->end(i, status) != expectedEnd) {
   3727             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
   3728                 line, i, expectedEnd, matcher->end(i, status));
   3729             failed = TRUE;
   3730             // Error on end position;  keep going; real error is probably yet to come as group
   3731             //   end positions work from end of the input data towards the front.
   3732         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
   3733             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
   3734                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
   3735             failed = TRUE;
   3736             // Error on end position;  keep going; real error is probably yet to come as group
   3737             //   end positions work from end of the input data towards the front.
   3738         }
   3739     }
   3740     if ( matcher->groupCount()+1 < groupStarts.size()) {
   3741         errln("Error at line %d: Expected %d capture groups, found %d.",
   3742             line, groupStarts.size()-1, matcher->groupCount());
   3743         failed = TRUE;
   3744         }
   3745     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
   3746         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
   3747               line, groupStarts.size()-1, UTF8Matcher->groupCount());
   3748         failed = TRUE;
   3749     }
   3750 
   3751     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3752         matcher->requireEnd() == TRUE) {
   3753         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
   3754         failed = TRUE;
   3755     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3756         UTF8Matcher->requireEnd() == TRUE) {
   3757         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
   3758         failed = TRUE;
   3759     }
   3760 
   3761     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
   3762         matcher->requireEnd() == FALSE) {
   3763         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
   3764         failed = TRUE;
   3765     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3766         UTF8Matcher->requireEnd() == FALSE) {
   3767         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
   3768         failed = TRUE;
   3769     }
   3770 
   3771     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
   3772         matcher->hitEnd() == TRUE) {
   3773         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
   3774         failed = TRUE;
   3775     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
   3776                UTF8Matcher->hitEnd() == TRUE) {
   3777         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
   3778         failed = TRUE;
   3779     }
   3780 
   3781     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
   3782         matcher->hitEnd() == FALSE) {
   3783         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
   3784         failed = TRUE;
   3785     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
   3786                UTF8Matcher->hitEnd() == FALSE) {
   3787         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
   3788         failed = TRUE;
   3789     }
   3790 
   3791 
   3792 cleanupAndReturn:
   3793     if (failed) {
   3794         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
   3795             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
   3796         // callerPattern->dump();
   3797     }
   3798     delete parseMatcher;
   3799     delete parsePat;
   3800     delete UTF8Matcher;
   3801     delete UTF8Pattern;
   3802     delete matcher;
   3803     delete callerPattern;
   3804 
   3805     utext_close(&inputText);
   3806     delete[] inputChars;
   3807     utext_close(&patternText);
   3808     delete[] patternChars;
   3809     ucnv_close(UTF8Converter);
   3810 }
   3811 
   3812 
   3813 
   3814 
   3815 //---------------------------------------------------------------------------
   3816 //
   3817 //      Errors     Check for error handling in patterns.
   3818 //
   3819 //---------------------------------------------------------------------------
   3820 void RegexTest::Errors() {
   3821     // \escape sequences that aren't implemented yet.
   3822     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
   3823 
   3824     // Missing close parentheses
   3825     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
   3826     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
   3827     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
   3828 
   3829     // Extra close paren
   3830     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
   3831     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
   3832     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
   3833 
   3834     // Look-ahead, Look-behind
   3835     //  TODO:  add tests for unbounded length look-behinds.
   3836     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
   3837 
   3838     // Attempt to use non-default flags
   3839     {
   3840         UParseError   pe;
   3841         UErrorCode    status = U_ZERO_ERROR;
   3842         int32_t       flags  = UREGEX_CANON_EQ |
   3843                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
   3844                                UREGEX_MULTILINE;
   3845         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
   3846         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
   3847         delete pat1;
   3848     }
   3849 
   3850 
   3851     // Quantifiers are allowed only after something that can be quantified.
   3852     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
   3853     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
   3854     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
   3855 
   3856     // Mal-formed {min,max} quantifiers
   3857     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
   3858     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
   3859     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
   3860     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
   3861     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
   3862     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
   3863     REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
   3864     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
   3865     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
   3866 
   3867     // Ticket 5389
   3868     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
   3869 
   3870     // Invalid Back Reference \0
   3871     //    For ICU 3.8 and earlier
   3872     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
   3873     //
   3874     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
   3875 
   3876 }
   3877 
   3878 
   3879 //-------------------------------------------------------------------------------
   3880 //
   3881 //  Read a text data file, convert it to UChars, and return the data
   3882 //    in one big UChar * buffer, which the caller must delete.
   3883 //
   3884 //--------------------------------------------------------------------------------
   3885 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
   3886                                      const char *defEncoding, UErrorCode &status) {
   3887     UChar       *retPtr  = NULL;
   3888     char        *fileBuf = NULL;
   3889     UConverter* conv     = NULL;
   3890     FILE        *f       = NULL;
   3891 
   3892     ulen = 0;
   3893     if (U_FAILURE(status)) {
   3894         return retPtr;
   3895     }
   3896 
   3897     //
   3898     //  Open the file.
   3899     //
   3900     f = fopen(fileName, "rb");
   3901     if (f == 0) {
   3902         dataerrln("Error opening test data file %s\n", fileName);
   3903         status = U_FILE_ACCESS_ERROR;
   3904         return NULL;
   3905     }
   3906     //
   3907     //  Read it in
   3908     //
   3909     int32_t            fileSize;
   3910     int32_t            amt_read;
   3911 
   3912     fseek( f, 0, SEEK_END);
   3913     fileSize = ftell(f);
   3914     fileBuf = new char[fileSize];
   3915     fseek(f, 0, SEEK_SET);
   3916     amt_read = fread(fileBuf, 1, fileSize, f);
   3917     if (amt_read != fileSize || fileSize <= 0) {
   3918         errln("Error reading test data file.");
   3919         goto cleanUpAndReturn;
   3920     }
   3921 
   3922     //
   3923     // Look for a Unicode Signature (BOM) on the data just read
   3924     //
   3925     int32_t        signatureLength;
   3926     const char *   fileBufC;
   3927     const char*    encoding;
   3928 
   3929     fileBufC = fileBuf;
   3930     encoding = ucnv_detectUnicodeSignature(
   3931         fileBuf, fileSize, &signatureLength, &status);
   3932     if(encoding!=NULL ){
   3933         fileBufC  += signatureLength;
   3934         fileSize  -= signatureLength;
   3935     } else {
   3936         encoding = defEncoding;
   3937         if (strcmp(encoding, "utf-8") == 0) {
   3938             errln("file %s is missing its BOM", fileName);
   3939         }
   3940     }
   3941 
   3942     //
   3943     // Open a converter to take the rule file to UTF-16
   3944     //
   3945     conv = ucnv_open(encoding, &status);
   3946     if (U_FAILURE(status)) {
   3947         goto cleanUpAndReturn;
   3948     }
   3949 
   3950     //
   3951     // Convert the rules to UChar.
   3952     //  Preflight first to determine required buffer size.
   3953     //
   3954     ulen = ucnv_toUChars(conv,
   3955         NULL,           //  dest,
   3956         0,              //  destCapacity,
   3957         fileBufC,
   3958         fileSize,
   3959         &status);
   3960     if (status == U_BUFFER_OVERFLOW_ERROR) {
   3961         // Buffer Overflow is expected from the preflight operation.
   3962         status = U_ZERO_ERROR;
   3963 
   3964         retPtr = new UChar[ulen+1];
   3965         ucnv_toUChars(conv,
   3966             retPtr,       //  dest,
   3967             ulen+1,
   3968             fileBufC,
   3969             fileSize,
   3970             &status);
   3971     }
   3972 
   3973 cleanUpAndReturn:
   3974     fclose(f);
   3975     delete[] fileBuf;
   3976     ucnv_close(conv);
   3977     if (U_FAILURE(status)) {
   3978         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   3979         delete []retPtr;
   3980         retPtr = 0;
   3981         ulen   = 0;
   3982     };
   3983     return retPtr;
   3984 }
   3985 
   3986 
   3987 //-------------------------------------------------------------------------------
   3988 //
   3989 //   PerlTests  - Run Perl's regular expression tests
   3990 //                The input file for this test is re_tests, the standard regular
   3991 //                expression test data distributed with the Perl source code.
   3992 //
   3993 //                Here is Perl's description of the test data file:
   3994 //
   3995 //        # The tests are in a separate file 't/op/re_tests'.
   3996 //        # Each line in that file is a separate test.
   3997 //        # There are five columns, separated by tabs.
   3998 //        #
   3999 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
   4000 //        # Modifiers can be put after the closing C<'>.
   4001 //        #
   4002 //        # Column 2 contains the string to be matched.
   4003 //        #
   4004 //        # Column 3 contains the expected result:
   4005 //        #     y   expect a match
   4006 //        #     n   expect no match
   4007 //        #     c   expect an error
   4008 //        # B   test exposes a known bug in Perl, should be skipped
   4009 //        # b   test exposes a known bug in Perl, should be skipped if noamp
   4010 //        #
   4011 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
   4012 //        #
   4013 //        # Column 4 contains a string, usually C<$&>.
   4014 //        #
   4015 //        # Column 5 contains the expected result of double-quote
   4016 //        # interpolating that string after the match, or start of error message.
   4017 //        #
   4018 //        # Column 6, if present, contains a reason why the test is skipped.
   4019 //        # This is printed with "skipped", for harness to pick up.
   4020 //        #
   4021 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
   4022 //        #
   4023 //        # If you want to add a regular expression test that can't be expressed
   4024 //        # in this format, don't add it here: put it in op/pat.t instead.
   4025 //
   4026 //        For ICU, if field 3 contains an 'i', the test will be skipped.
   4027 //        The test exposes some known incompatibility between ICU and Perl regexps.
   4028 //        (The i is in addition to whatever was there before.)
   4029 //
   4030 //-------------------------------------------------------------------------------
   4031 void RegexTest::PerlTests() {
   4032     char tdd[2048];
   4033     const char *srcPath;
   4034     UErrorCode  status = U_ZERO_ERROR;
   4035     UParseError pe;
   4036 
   4037     //
   4038     //  Open and read the test data file.
   4039     //
   4040     srcPath=getPath(tdd, "re_tests.txt");
   4041     if(srcPath==NULL) {
   4042         return; /* something went wrong, error already output */
   4043     }
   4044 
   4045     int32_t    len;
   4046     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
   4047     if (U_FAILURE(status)) {
   4048         return; /* something went wrong, error already output */
   4049     }
   4050 
   4051     //
   4052     //  Put the test data into a UnicodeString
   4053     //
   4054     UnicodeString testDataString(FALSE, testData, len);
   4055 
   4056     //
   4057     //  Regex to break the input file into lines, and strip the new lines.
   4058     //     One line per match, capture group one is the desired data.
   4059     //
   4060     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
   4061     if (U_FAILURE(status)) {
   4062         dataerrln("RegexPattern::compile() error");
   4063         return;
   4064     }
   4065     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
   4066 
   4067     //
   4068     //  Regex to split a test file line into fields.
   4069     //    There are six fields, separated by tabs.
   4070     //
   4071     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
   4072 
   4073     //
   4074     //  Regex to identify test patterns with flag settings, and to separate them.
   4075     //    Test patterns with flags look like 'pattern'i
   4076     //    Test patterns without flags are not quoted:   pattern
   4077     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
   4078     //
   4079     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
   4080     RegexMatcher* flagMat = flagPat->matcher(status);
   4081 
   4082     //
   4083     // The Perl tests reference several perl-isms, which are evaluated/substituted
   4084     //   in the test data.  Not being perl, this must be done explicitly.  Here
   4085     //   are string constants and REs for these constructs.
   4086     //
   4087     UnicodeString nulnulSrc("${nulnul}");
   4088     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
   4089     nulnul = nulnul.unescape();
   4090 
   4091     UnicodeString ffffSrc("${ffff}");
   4092     UnicodeString ffff("\\uffff", -1, US_INV);
   4093     ffff = ffff.unescape();
   4094 
   4095     //  regexp for $-[0], $+[2], etc.
   4096     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
   4097     RegexMatcher *groupsMat = groupsPat->matcher(status);
   4098 
   4099     //  regexp for $0, $1, $2, etc.
   4100     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
   4101     RegexMatcher *cgMat = cgPat->matcher(status);
   4102 
   4103 
   4104     //
   4105     // Main Loop for the Perl Tests, runs once per line from the
   4106     //   test data file.
   4107     //
   4108     int32_t  lineNum = 0;
   4109     int32_t  skippedUnimplementedCount = 0;
   4110     while (lineMat->find()) {
   4111         lineNum++;
   4112 
   4113         //
   4114         //  Get a line, break it into its fields, do the Perl
   4115         //    variable substitutions.
   4116         //
   4117         UnicodeString line = lineMat->group(1, status);
   4118         UnicodeString fields[7];
   4119         fieldPat->split(line, fields, 7, status);
   4120 
   4121         flagMat->reset(fields[0]);
   4122         flagMat->matches(status);
   4123         UnicodeString pattern  = flagMat->group(2, status);
   4124         pattern.findAndReplace("${bang}", "!");
   4125         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
   4126         pattern.findAndReplace(ffffSrc, ffff);
   4127 
   4128         //
   4129         //  Identify patterns that include match flag settings,
   4130         //    split off the flags, remove the extra quotes.
   4131         //
   4132         UnicodeString flagStr = flagMat->group(3, status);
   4133         if (U_FAILURE(status)) {
   4134             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   4135             return;
   4136         }
   4137         int32_t flags = 0;
   4138         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
   4139         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
   4140         const UChar UChar_m = 0x6d;
   4141         const UChar UChar_x = 0x78;
   4142         const UChar UChar_y = 0x79;
   4143         if (flagStr.indexOf(UChar_i) != -1) {
   4144             flags |= UREGEX_CASE_INSENSITIVE;
   4145         }
   4146         if (flagStr.indexOf(UChar_m) != -1) {
   4147             flags |= UREGEX_MULTILINE;
   4148         }
   4149         if (flagStr.indexOf(UChar_x) != -1) {
   4150             flags |= UREGEX_COMMENTS;
   4151         }
   4152 
   4153         //
   4154         // Compile the test pattern.
   4155         //
   4156         status = U_ZERO_ERROR;
   4157         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
   4158         if (status == U_REGEX_UNIMPLEMENTED) {
   4159             //
   4160             // Test of a feature that is planned for ICU, but not yet implemented.
   4161             //   skip the test.
   4162             skippedUnimplementedCount++;
   4163             delete testPat;
   4164             status = U_ZERO_ERROR;
   4165             continue;
   4166         }
   4167 
   4168         if (U_FAILURE(status)) {
   4169             // Some tests are supposed to generate errors.
   4170             //   Only report an error for tests that are supposed to succeed.
   4171             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
   4172                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
   4173             {
   4174                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
   4175             }
   4176             status = U_ZERO_ERROR;
   4177             delete testPat;
   4178             continue;
   4179         }
   4180 
   4181         if (fields[2].indexOf(UChar_i) >= 0) {
   4182             // ICU should skip this test.
   4183             delete testPat;
   4184             continue;
   4185         }
   4186 
   4187         if (fields[2].indexOf(UChar_c) >= 0) {
   4188             // This pattern should have caused a compilation error, but didn't/
   4189             errln("line %d: Expected a pattern compile error, got success.", lineNum);
   4190             delete testPat;
   4191             continue;
   4192         }
   4193 
   4194         //
   4195         // replace the Perl variables that appear in some of the
   4196         //   match data strings.
   4197         //
   4198         UnicodeString matchString = fields[1];
   4199         matchString.findAndReplace(nulnulSrc, nulnul);
   4200         matchString.findAndReplace(ffffSrc,   ffff);
   4201 
   4202         // Replace any \n in the match string with an actual new-line char.
   4203         //  Don't do full unescape, as this unescapes more than Perl does, which
   4204         //  causes other spurious failures in the tests.
   4205         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4206 
   4207 
   4208 
   4209         //
   4210         // Run the test, check for expected match/don't match result.
   4211         //
   4212         RegexMatcher *testMat = testPat->matcher(matchString, status);
   4213         UBool found = testMat->find();
   4214         UBool expected = FALSE;
   4215         if (fields[2].indexOf(UChar_y) >=0) {
   4216             expected = TRUE;
   4217         }
   4218         if (expected != found) {
   4219             errln("line %d: Expected %smatch, got %smatch",
   4220                 lineNum, expected?"":"no ", found?"":"no " );
   4221             continue;
   4222         }
   4223 
   4224         // Don't try to check expected results if there is no match.
   4225         //   (Some have stuff in the expected fields)
   4226         if (!found) {
   4227             delete testMat;
   4228             delete testPat;
   4229             continue;
   4230         }
   4231 
   4232         //
   4233         // Interpret the Perl expression from the fourth field of the data file,
   4234         // building up an ICU string from the results of the ICU match.
   4235         //   The Perl expression will contain references to the results of
   4236         //     a regex match, including the matched string, capture group strings,
   4237         //     group starting and ending indicies, etc.
   4238         //
   4239         UnicodeString resultString;
   4240         UnicodeString perlExpr = fields[3];
   4241 #if SUPPORT_MUTATING_INPUT_STRING
   4242         groupsMat->reset(perlExpr);
   4243         cgMat->reset(perlExpr);
   4244 #endif
   4245 
   4246         while (perlExpr.length() > 0) {
   4247 #if !SUPPORT_MUTATING_INPUT_STRING
   4248             //  Perferred usage.  Reset after any modification to input string.
   4249             groupsMat->reset(perlExpr);
   4250             cgMat->reset(perlExpr);
   4251 #endif
   4252 
   4253             if (perlExpr.startsWith("$&")) {
   4254                 resultString.append(testMat->group(status));
   4255                 perlExpr.remove(0, 2);
   4256             }
   4257 
   4258             else if (groupsMat->lookingAt(status)) {
   4259                 // $-[0]   $+[2]  etc.
   4260                 UnicodeString digitString = groupsMat->group(2, status);
   4261                 int32_t t = 0;
   4262                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4263                 UnicodeString plusOrMinus = groupsMat->group(1, status);
   4264                 int32_t matchPosition;
   4265                 if (plusOrMinus.compare("+") == 0) {
   4266                     matchPosition = testMat->end(groupNum, status);
   4267                 } else {
   4268                     matchPosition = testMat->start(groupNum, status);
   4269                 }
   4270                 if (matchPosition != -1) {
   4271                     ICU_Utility::appendNumber(resultString, matchPosition);
   4272                 }
   4273                 perlExpr.remove(0, groupsMat->end(status));
   4274             }
   4275 
   4276             else if (cgMat->lookingAt(status)) {
   4277                 // $1, $2, $3, etc.
   4278                 UnicodeString digitString = cgMat->group(1, status);
   4279                 int32_t t = 0;
   4280                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4281                 if (U_SUCCESS(status)) {
   4282                     resultString.append(testMat->group(groupNum, status));
   4283                     status = U_ZERO_ERROR;
   4284                 }
   4285                 perlExpr.remove(0, cgMat->end(status));
   4286             }
   4287 
   4288             else if (perlExpr.startsWith("@-")) {
   4289                 int32_t i;
   4290                 for (i=0; i<=testMat->groupCount(); i++) {
   4291                     if (i>0) {
   4292                         resultString.append(" ");
   4293                     }
   4294                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
   4295                 }
   4296                 perlExpr.remove(0, 2);
   4297             }
   4298 
   4299             else if (perlExpr.startsWith("@+")) {
   4300                 int32_t i;
   4301                 for (i=0; i<=testMat->groupCount(); i++) {
   4302                     if (i>0) {
   4303                         resultString.append(" ");
   4304                     }
   4305                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
   4306                 }
   4307                 perlExpr.remove(0, 2);
   4308             }
   4309 
   4310             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
   4311                                                      //           or as an escaped sequence (e.g. \n)
   4312                 if (perlExpr.length() > 1) {
   4313                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
   4314                 }
   4315                 UChar c = perlExpr.charAt(0);
   4316                 switch (c) {
   4317                 case 'n':   c = '\n'; break;
   4318                 // add any other escape sequences that show up in the test expected results.
   4319                 }
   4320                 resultString.append(c);
   4321                 perlExpr.remove(0, 1);
   4322             }
   4323 
   4324             else  {
   4325                 // Any characters from the perl expression that we don't explicitly
   4326                 //  recognize before here are assumed to be literals and copied
   4327                 //  as-is to the expected results.
   4328                 resultString.append(perlExpr.charAt(0));
   4329                 perlExpr.remove(0, 1);
   4330             }
   4331 
   4332             if (U_FAILURE(status)) {
   4333                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   4334                 break;
   4335             }
   4336         }
   4337 
   4338         //
   4339         // Expected Results Compare
   4340         //
   4341         UnicodeString expectedS(fields[4]);
   4342         expectedS.findAndReplace(nulnulSrc, nulnul);
   4343         expectedS.findAndReplace(ffffSrc,   ffff);
   4344         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4345 
   4346 
   4347         if (expectedS.compare(resultString) != 0) {
   4348             err("Line %d: Incorrect perl expression results.", lineNum);
   4349             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
   4350         }
   4351 
   4352         delete testMat;
   4353         delete testPat;
   4354     }
   4355 
   4356     //
   4357     // All done.  Clean up allocated stuff.
   4358     //
   4359     delete cgMat;
   4360     delete cgPat;
   4361 
   4362     delete groupsMat;
   4363     delete groupsPat;
   4364 
   4365     delete flagMat;
   4366     delete flagPat;
   4367 
   4368     delete lineMat;
   4369     delete linePat;
   4370 
   4371     delete fieldPat;
   4372     delete [] testData;
   4373 
   4374 
   4375     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
   4376 
   4377 }
   4378 
   4379 
   4380 //-------------------------------------------------------------------------------
   4381 //
   4382 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
   4383 //                  (instead of using UnicodeStrings) to test the alternate engine.
   4384 //                  The input file for this test is re_tests, the standard regular
   4385 //                  expression test data distributed with the Perl source code.
   4386 //                  See PerlTests() for more information.
   4387 //
   4388 //-------------------------------------------------------------------------------
   4389 void RegexTest::PerlTestsUTF8() {
   4390     char tdd[2048];
   4391     const char *srcPath;
   4392     UErrorCode  status = U_ZERO_ERROR;
   4393     UParseError pe;
   4394     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
   4395     UText       patternText = UTEXT_INITIALIZER;
   4396     char       *patternChars = NULL;
   4397     int32_t     patternLength;
   4398     int32_t     patternCapacity = 0;
   4399     UText       inputText = UTEXT_INITIALIZER;
   4400     char       *inputChars = NULL;
   4401     int32_t     inputLength;
   4402     int32_t     inputCapacity = 0;
   4403 
   4404     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
   4405 
   4406     //
   4407     //  Open and read the test data file.
   4408     //
   4409     srcPath=getPath(tdd, "re_tests.txt");
   4410     if(srcPath==NULL) {
   4411         return; /* something went wrong, error already output */
   4412     }
   4413 
   4414     int32_t    len;
   4415     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
   4416     if (U_FAILURE(status)) {
   4417         return; /* something went wrong, error already output */
   4418     }
   4419 
   4420     //
   4421     //  Put the test data into a UnicodeString
   4422     //
   4423     UnicodeString testDataString(FALSE, testData, len);
   4424 
   4425     //
   4426     //  Regex to break the input file into lines, and strip the new lines.
   4427     //     One line per match, capture group one is the desired data.
   4428     //
   4429     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
   4430     if (U_FAILURE(status)) {
   4431         dataerrln("RegexPattern::compile() error");
   4432         return;
   4433     }
   4434     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
   4435 
   4436     //
   4437     //  Regex to split a test file line into fields.
   4438     //    There are six fields, separated by tabs.
   4439     //
   4440     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
   4441 
   4442     //
   4443     //  Regex to identify test patterns with flag settings, and to separate them.
   4444     //    Test patterns with flags look like 'pattern'i
   4445     //    Test patterns without flags are not quoted:   pattern
   4446     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
   4447     //
   4448     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
   4449     RegexMatcher* flagMat = flagPat->matcher(status);
   4450 
   4451     //
   4452     // The Perl tests reference several perl-isms, which are evaluated/substituted
   4453     //   in the test data.  Not being perl, this must be done explicitly.  Here
   4454     //   are string constants and REs for these constructs.
   4455     //
   4456     UnicodeString nulnulSrc("${nulnul}");
   4457     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
   4458     nulnul = nulnul.unescape();
   4459 
   4460     UnicodeString ffffSrc("${ffff}");
   4461     UnicodeString ffff("\\uffff", -1, US_INV);
   4462     ffff = ffff.unescape();
   4463 
   4464     //  regexp for $-[0], $+[2], etc.
   4465     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
   4466     RegexMatcher *groupsMat = groupsPat->matcher(status);
   4467 
   4468     //  regexp for $0, $1, $2, etc.
   4469     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
   4470     RegexMatcher *cgMat = cgPat->matcher(status);
   4471 
   4472 
   4473     //
   4474     // Main Loop for the Perl Tests, runs once per line from the
   4475     //   test data file.
   4476     //
   4477     int32_t  lineNum = 0;
   4478     int32_t  skippedUnimplementedCount = 0;
   4479     while (lineMat->find()) {
   4480         lineNum++;
   4481 
   4482         //
   4483         //  Get a line, break it into its fields, do the Perl
   4484         //    variable substitutions.
   4485         //
   4486         UnicodeString line = lineMat->group(1, status);
   4487         UnicodeString fields[7];
   4488         fieldPat->split(line, fields, 7, status);
   4489 
   4490         flagMat->reset(fields[0]);
   4491         flagMat->matches(status);
   4492         UnicodeString pattern  = flagMat->group(2, status);
   4493         pattern.findAndReplace("${bang}", "!");
   4494         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
   4495         pattern.findAndReplace(ffffSrc, ffff);
   4496 
   4497         //
   4498         //  Identify patterns that include match flag settings,
   4499         //    split off the flags, remove the extra quotes.
   4500         //
   4501         UnicodeString flagStr = flagMat->group(3, status);
   4502         if (U_FAILURE(status)) {
   4503             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   4504             return;
   4505         }
   4506         int32_t flags = 0;
   4507         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
   4508         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
   4509         const UChar UChar_m = 0x6d;
   4510         const UChar UChar_x = 0x78;
   4511         const UChar UChar_y = 0x79;
   4512         if (flagStr.indexOf(UChar_i) != -1) {
   4513             flags |= UREGEX_CASE_INSENSITIVE;
   4514         }
   4515         if (flagStr.indexOf(UChar_m) != -1) {
   4516             flags |= UREGEX_MULTILINE;
   4517         }
   4518         if (flagStr.indexOf(UChar_x) != -1) {
   4519             flags |= UREGEX_COMMENTS;
   4520         }
   4521 
   4522         //
   4523         // Put the pattern in a UTF-8 UText
   4524         //
   4525         status = U_ZERO_ERROR;
   4526         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
   4527         if (status == U_BUFFER_OVERFLOW_ERROR) {
   4528             status = U_ZERO_ERROR;
   4529             delete[] patternChars;
   4530             patternCapacity = patternLength + 1;
   4531             patternChars = new char[patternCapacity];
   4532             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
   4533         }
   4534         utext_openUTF8(&patternText, patternChars, patternLength, &status);
   4535 
   4536         //
   4537         // Compile the test pattern.
   4538         //
   4539         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
   4540         if (status == U_REGEX_UNIMPLEMENTED) {
   4541             //
   4542             // Test of a feature that is planned for ICU, but not yet implemented.
   4543             //   skip the test.
   4544             skippedUnimplementedCount++;
   4545             delete testPat;
   4546             status = U_ZERO_ERROR;
   4547             continue;
   4548         }
   4549 
   4550         if (U_FAILURE(status)) {
   4551             // Some tests are supposed to generate errors.
   4552             //   Only report an error for tests that are supposed to succeed.
   4553             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
   4554                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
   4555             {
   4556                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
   4557             }
   4558             status = U_ZERO_ERROR;
   4559             delete testPat;
   4560             continue;
   4561         }
   4562 
   4563         if (fields[2].indexOf(UChar_i) >= 0) {
   4564             // ICU should skip this test.
   4565             delete testPat;
   4566             continue;
   4567         }
   4568 
   4569         if (fields[2].indexOf(UChar_c) >= 0) {
   4570             // This pattern should have caused a compilation error, but didn't/
   4571             errln("line %d: Expected a pattern compile error, got success.", lineNum);
   4572             delete testPat;
   4573             continue;
   4574         }
   4575 
   4576 
   4577         //
   4578         // replace the Perl variables that appear in some of the
   4579         //   match data strings.
   4580         //
   4581         UnicodeString matchString = fields[1];
   4582         matchString.findAndReplace(nulnulSrc, nulnul);
   4583         matchString.findAndReplace(ffffSrc,   ffff);
   4584 
   4585         // Replace any \n in the match string with an actual new-line char.
   4586         //  Don't do full unescape, as this unescapes more than Perl does, which
   4587         //  causes other spurious failures in the tests.
   4588         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4589 
   4590         //
   4591         // Put the input in a UTF-8 UText
   4592         //
   4593         status = U_ZERO_ERROR;
   4594         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
   4595         if (status == U_BUFFER_OVERFLOW_ERROR) {
   4596             status = U_ZERO_ERROR;
   4597             delete[] inputChars;
   4598             inputCapacity = inputLength + 1;
   4599             inputChars = new char[inputCapacity];
   4600             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
   4601         }
   4602         utext_openUTF8(&inputText, inputChars, inputLength, &status);
   4603 
   4604         //
   4605         // Run the test, check for expected match/don't match result.
   4606         //
   4607         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
   4608         UBool found = testMat->find();
   4609         UBool expected = FALSE;
   4610         if (fields[2].indexOf(UChar_y) >=0) {
   4611             expected = TRUE;
   4612         }
   4613         if (expected != found) {
   4614             errln("line %d: Expected %smatch, got %smatch",
   4615                 lineNum, expected?"":"no ", found?"":"no " );
   4616             continue;
   4617         }
   4618 
   4619         // Don't try to check expected results if there is no match.
   4620         //   (Some have stuff in the expected fields)
   4621         if (!found) {
   4622             delete testMat;
   4623             delete testPat;
   4624             continue;
   4625         }
   4626 
   4627         //
   4628         // Interpret the Perl expression from the fourth field of the data file,
   4629         // building up an ICU string from the results of the ICU match.
   4630         //   The Perl expression will contain references to the results of
   4631         //     a regex match, including the matched string, capture group strings,
   4632         //     group starting and ending indicies, etc.
   4633         //
   4634         UnicodeString resultString;
   4635         UnicodeString perlExpr = fields[3];
   4636 
   4637         while (perlExpr.length() > 0) {
   4638             groupsMat->reset(perlExpr);
   4639             cgMat->reset(perlExpr);
   4640 
   4641             if (perlExpr.startsWith("$&")) {
   4642                 resultString.append(testMat->group(status));
   4643                 perlExpr.remove(0, 2);
   4644             }
   4645 
   4646             else if (groupsMat->lookingAt(status)) {
   4647                 // $-[0]   $+[2]  etc.
   4648                 UnicodeString digitString = groupsMat->group(2, status);
   4649                 int32_t t = 0;
   4650                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4651                 UnicodeString plusOrMinus = groupsMat->group(1, status);
   4652                 int32_t matchPosition;
   4653                 if (plusOrMinus.compare("+") == 0) {
   4654                     matchPosition = testMat->end(groupNum, status);
   4655                 } else {
   4656                     matchPosition = testMat->start(groupNum, status);
   4657                 }
   4658                 if (matchPosition != -1) {
   4659                     ICU_Utility::appendNumber(resultString, matchPosition);
   4660                 }
   4661                 perlExpr.remove(0, groupsMat->end(status));
   4662             }
   4663 
   4664             else if (cgMat->lookingAt(status)) {
   4665                 // $1, $2, $3, etc.
   4666                 UnicodeString digitString = cgMat->group(1, status);
   4667                 int32_t t = 0;
   4668                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4669                 if (U_SUCCESS(status)) {
   4670                     resultString.append(testMat->group(groupNum, status));
   4671                     status = U_ZERO_ERROR;
   4672                 }
   4673                 perlExpr.remove(0, cgMat->end(status));
   4674             }
   4675 
   4676             else if (perlExpr.startsWith("@-")) {
   4677                 int32_t i;
   4678                 for (i=0; i<=testMat->groupCount(); i++) {
   4679                     if (i>0) {
   4680                         resultString.append(" ");
   4681                     }
   4682                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
   4683                 }
   4684                 perlExpr.remove(0, 2);
   4685             }
   4686 
   4687             else if (perlExpr.startsWith("@+")) {
   4688                 int32_t i;
   4689                 for (i=0; i<=testMat->groupCount(); i++) {
   4690                     if (i>0) {
   4691                         resultString.append(" ");
   4692                     }
   4693                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
   4694                 }
   4695                 perlExpr.remove(0, 2);
   4696             }
   4697 
   4698             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
   4699                                                      //           or as an escaped sequence (e.g. \n)
   4700                 if (perlExpr.length() > 1) {
   4701                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
   4702                 }
   4703                 UChar c = perlExpr.charAt(0);
   4704                 switch (c) {
   4705                 case 'n':   c = '\n'; break;
   4706                 // add any other escape sequences that show up in the test expected results.
   4707                 }
   4708                 resultString.append(c);
   4709                 perlExpr.remove(0, 1);
   4710             }
   4711 
   4712             else  {
   4713                 // Any characters from the perl expression that we don't explicitly
   4714                 //  recognize before here are assumed to be literals and copied
   4715                 //  as-is to the expected results.
   4716                 resultString.append(perlExpr.charAt(0));
   4717                 perlExpr.remove(0, 1);
   4718             }
   4719 
   4720             if (U_FAILURE(status)) {
   4721                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   4722                 break;
   4723             }
   4724         }
   4725 
   4726         //
   4727         // Expected Results Compare
   4728         //
   4729         UnicodeString expectedS(fields[4]);
   4730         expectedS.findAndReplace(nulnulSrc, nulnul);
   4731         expectedS.findAndReplace(ffffSrc,   ffff);
   4732         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4733 
   4734 
   4735         if (expectedS.compare(resultString) != 0) {
   4736             err("Line %d: Incorrect perl expression results.", lineNum);
   4737             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
   4738         }
   4739 
   4740         delete testMat;
   4741         delete testPat;
   4742     }
   4743 
   4744     //
   4745     // All done.  Clean up allocated stuff.
   4746     //
   4747     delete cgMat;
   4748     delete cgPat;
   4749 
   4750     delete groupsMat;
   4751     delete groupsPat;
   4752 
   4753     delete flagMat;
   4754     delete flagPat;
   4755 
   4756     delete lineMat;
   4757     delete linePat;
   4758 
   4759     delete fieldPat;
   4760     delete [] testData;
   4761 
   4762     utext_close(&patternText);
   4763     utext_close(&inputText);
   4764 
   4765     delete [] patternChars;
   4766     delete [] inputChars;
   4767 
   4768 
   4769     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
   4770 
   4771 }
   4772 
   4773 
   4774 //--------------------------------------------------------------
   4775 //
   4776 //  Bug6149   Verify limits to heap expansion for backtrack stack.
   4777 //             Use this pattern,
   4778 //                 "(a?){1,8000000}"
   4779 //             Note: the pattern originally had an unbounded upper bound, but that form now has loop-breaking enabled.
   4780 //                   This test is likely to be fragile, as further optimizations stop
   4781 //                   more cases of pointless looping in the match engine.
   4782 //
   4783 //---------------------------------------------------------------
   4784 void RegexTest::Bug6149() {
   4785     UnicodeString pattern("(a?){1,8000000}");
   4786     UnicodeString s("xyz");
   4787     uint32_t flags = 0;
   4788     UErrorCode status = U_ZERO_ERROR;
   4789 
   4790     RegexMatcher  matcher(pattern, s, flags, status);
   4791     UBool result = false;
   4792     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
   4793     REGEX_ASSERT(result == FALSE);
   4794  }
   4795 
   4796 
   4797 //
   4798 //   Callbacks()    Test the callback function.
   4799 //                  When set, callbacks occur periodically during matching operations,
   4800 //                  giving the application code the ability to abort the operation
   4801 //                  before it's normal completion.
   4802 //                  before its normal completion.
   4803 
   4804 struct callBackContext {
   4805     RegexTest        *test;
   4806     int32_t          maxCalls;
   4807     int32_t          numCalls;
   4808     int32_t          lastSteps;
   4809     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
   4810 };
   4811 
   4812 U_CDECL_BEGIN
   4813 static UBool U_CALLCONV
   4814 testCallBackFn(const void *context, int32_t steps) {
   4815     callBackContext  *info = (callBackContext *)context;
   4816     if (info->lastSteps+1 != steps) {
   4817         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
   4818     }
   4819     info->lastSteps = steps;
   4820     info->numCalls++;
   4821     return (info->numCalls < info->maxCalls);
   4822 }
   4823 U_CDECL_END
   4824 
   4825 void RegexTest::Callbacks() {
   4826    {
   4827         // Getter returns NULLs if no callback has been set
   4828 
   4829         //   The variables that the getter will fill in.
   4830         //   Init to non-null values so that the action of the getter can be seen.
   4831         const void          *returnedContext = &returnedContext;
   4832         URegexMatchCallback *returnedFn = &testCallBackFn;
   4833 
   4834         UErrorCode status = U_ZERO_ERROR;
   4835         RegexMatcher matcher("x", 0, status);
   4836         REGEX_CHECK_STATUS;
   4837         matcher.getMatchCallback(returnedFn, returnedContext, status);
   4838         REGEX_CHECK_STATUS;
   4839         REGEX_ASSERT(returnedFn == NULL);
   4840         REGEX_ASSERT(returnedContext == NULL);
   4841     }
   4842 
   4843    {
   4844         // Set and Get work
   4845         callBackContext cbInfo = {this, 0, 0, 0};
   4846         const void          *returnedContext;
   4847         URegexMatchCallback *returnedFn;
   4848         UErrorCode status = U_ZERO_ERROR;
   4849         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
   4850         REGEX_CHECK_STATUS;
   4851         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
   4852         REGEX_CHECK_STATUS;
   4853         matcher.getMatchCallback(returnedFn, returnedContext, status);
   4854         REGEX_CHECK_STATUS;
   4855         REGEX_ASSERT(returnedFn == testCallBackFn);
   4856         REGEX_ASSERT(returnedContext == &cbInfo);
   4857 
   4858         // A short-running match shouldn't invoke the callback
   4859         status = U_ZERO_ERROR;
   4860         cbInfo.reset(1);
   4861         UnicodeString s = "xxx";
   4862         matcher.reset(s);
   4863         REGEX_ASSERT(matcher.matches(status));
   4864         REGEX_CHECK_STATUS;
   4865         REGEX_ASSERT(cbInfo.numCalls == 0);
   4866 
   4867         // A medium-length match that runs long enough to invoke the
   4868         //   callback, but not so long that the callback aborts it.
   4869         status = U_ZERO_ERROR;
   4870         cbInfo.reset(4);
   4871         s = "aaaaaaaaaaaaaaaaaaab";
   4872         matcher.reset(s);
   4873         REGEX_ASSERT(matcher.matches(status)==FALSE);
   4874         REGEX_CHECK_STATUS;
   4875         REGEX_ASSERT(cbInfo.numCalls > 0);
   4876 
   4877         // A longer running match that the callback function will abort.
   4878         status = U_ZERO_ERROR;
   4879         cbInfo.reset(4);
   4880         s = "aaaaaaaaaaaaaaaaaaaaaaab";
   4881         matcher.reset(s);
   4882         REGEX_ASSERT(matcher.matches(status)==FALSE);
   4883         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
   4884         REGEX_ASSERT(cbInfo.numCalls == 4);
   4885 
   4886         // A longer running find that the callback function will abort.
   4887         status = U_ZERO_ERROR;
   4888         cbInfo.reset(4);
   4889         s = "aaaaaaaaaaaaaaaaaaaaaaab";
   4890         matcher.reset(s);
   4891         REGEX_ASSERT(matcher.find(status)==FALSE);
   4892         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
   4893         REGEX_ASSERT(cbInfo.numCalls == 4);
   4894     }
   4895 
   4896 
   4897 }
   4898 
   4899 
   4900 //
   4901 //   FindProgressCallbacks()    Test the find "progress" callback function.
   4902 //                  When set, the find progress callback will be invoked during a find operations
   4903 //                  When set, the find progress callback will be invoked during find operations
   4904 //                  after each return from a match attempt, giving the application the opportunity
   4905 //                  to terminate a long-running find operation before its normal completion.
   4906 
   4907 struct progressCallBackContext {
   4908     RegexTest        *test;
   4909     int64_t          lastIndex;
   4910     int32_t          maxCalls;
   4911     int32_t          numCalls;
   4912     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
   4913 };
   4914 
   4915 // call-back function for find().
   4916 // Return TRUE to continue the find().
   4917 // Return FALSE to stop the find().
   4918 U_CDECL_BEGIN
   4919 static UBool U_CALLCONV
   4920 testProgressCallBackFn(const void *context, int64_t matchIndex) {
   4921     progressCallBackContext  *info = (progressCallBackContext *)context;
   4922     info->numCalls++;
   4923     info->lastIndex = matchIndex;
   4924 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
   4925     return (info->numCalls < info->maxCalls);
   4926 }
   4927 U_CDECL_END
   4928 
   4929 void RegexTest::FindProgressCallbacks() {
   4930    {
   4931         // Getter returns NULLs if no callback has been set
   4932 
   4933         //   The variables that the getter will fill in.
   4934         //   Init to non-null values so that the action of the getter can be seen.
   4935         const void                  *returnedContext = &returnedContext;
   4936         URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
   4937 
   4938         UErrorCode status = U_ZERO_ERROR;
   4939         RegexMatcher matcher("x", 0, status);
   4940         REGEX_CHECK_STATUS;
   4941         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
   4942         REGEX_CHECK_STATUS;
   4943         REGEX_ASSERT(returnedFn == NULL);
   4944         REGEX_ASSERT(returnedContext == NULL);
   4945     }
   4946 
   4947    {
   4948         // Set and Get work
   4949         progressCallBackContext cbInfo = {this, 0, 0, 0};
   4950         const void                  *returnedContext;
   4951         URegexFindProgressCallback  *returnedFn;
   4952         UErrorCode status = U_ZERO_ERROR;
   4953         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
   4954         REGEX_CHECK_STATUS;
   4955         matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
   4956         REGEX_CHECK_STATUS;
   4957         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
   4958         REGEX_CHECK_STATUS;
   4959         REGEX_ASSERT(returnedFn == testProgressCallBackFn);
   4960         REGEX_ASSERT(returnedContext == &cbInfo);
   4961 
   4962         // A find that matches on the initial position does NOT invoke the callback.
   4963         status = U_ZERO_ERROR;
   4964         cbInfo.reset(100);
   4965         UnicodeString s = "aaxxx";
   4966         matcher.reset(s);
   4967 #if 0
   4968         matcher.setTrace(TRUE);
   4969 #endif
   4970         REGEX_ASSERT(matcher.find(0, status));
   4971         REGEX_CHECK_STATUS;
   4972         REGEX_ASSERT(cbInfo.numCalls == 0);
   4973 
   4974         // A medium running find() that causes matcher.find() to invoke our callback for each index,
   4975         //   but not so many times that we interrupt the operation.
   4976         status = U_ZERO_ERROR;
   4977         s = "aaaaaaaaaaaaaaaaaaab";
   4978         cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
   4979         matcher.reset(s);
   4980         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   4981         REGEX_CHECK_STATUS;
   4982         REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
   4983 
   4984         // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
   4985         status = U_ZERO_ERROR;
   4986         UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
   4987         cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
   4988         matcher.reset(s1);
   4989         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   4990         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
   4991         REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
   4992 
   4993         // Now a match that will succeed, but after an interruption
   4994         status = U_ZERO_ERROR;
   4995         UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
   4996         cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
   4997         matcher.reset(s2);
   4998         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   4999         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
   5000         // Now retry the match from where left off
   5001         cbInfo.maxCalls = 100; //  No callback limit
   5002         status = U_ZERO_ERROR;
   5003         REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
   5004         REGEX_CHECK_STATUS;
   5005     }
   5006 
   5007 
   5008 }
   5009 
   5010 
   5011 //---------------------------------------------------------------------------
   5012 //
   5013 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
   5014 //                             UTexts. The pure-C implementation of UText
   5015 //                             has no mutable backing stores, but we can
   5016 //                             use UnicodeString here to test the functionality.
   5017 //
   5018 //---------------------------------------------------------------------------
   5019 void RegexTest::PreAllocatedUTextCAPI () {
   5020     UErrorCode           status = U_ZERO_ERROR;
   5021     URegularExpression  *re;
   5022     UText                patternText = UTEXT_INITIALIZER;
   5023     UnicodeString        buffer;
   5024     UText                bufferText = UTEXT_INITIALIZER;
   5025 
   5026     utext_openUnicodeString(&bufferText, &buffer, &status);
   5027 
   5028     /*
   5029      *  getText() and getUText()
   5030      */
   5031     {
   5032         UText  text1 = UTEXT_INITIALIZER;
   5033         UText  text2 = UTEXT_INITIALIZER;
   5034         UChar  text2Chars[20];
   5035         UText  *resultText;
   5036 
   5037         status = U_ZERO_ERROR;
   5038         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
   5039         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
   5040         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
   5041         utext_openUChars(&text2, text2Chars, -1, &status);
   5042 
   5043         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
   5044         re = uregex_openUText(&patternText, 0, NULL, &status);
   5045 
   5046         /* First set a UText */
   5047         uregex_setUText(re, &text1, &status);
   5048         resultText = uregex_getUText(re, &bufferText, &status);
   5049         REGEX_CHECK_STATUS;
   5050         REGEX_ASSERT(resultText == &bufferText);
   5051         utext_setNativeIndex(resultText, 0);
   5052         utext_setNativeIndex(&text1, 0);
   5053         REGEX_ASSERT(testUTextEqual(resultText, &text1));
   5054 
   5055         resultText = uregex_getUText(re, &bufferText, &status);
   5056         REGEX_CHECK_STATUS;
   5057         REGEX_ASSERT(resultText == &bufferText);
   5058         utext_setNativeIndex(resultText, 0);
   5059         utext_setNativeIndex(&text1, 0);
   5060         REGEX_ASSERT(testUTextEqual(resultText, &text1));
   5061 
   5062         /* Then set a UChar * */
   5063         uregex_setText(re, text2Chars, 7, &status);
   5064         resultText = uregex_getUText(re, &bufferText, &status);
   5065         REGEX_CHECK_STATUS;
   5066         REGEX_ASSERT(resultText == &bufferText);
   5067         utext_setNativeIndex(resultText, 0);
   5068         utext_setNativeIndex(&text2, 0);
   5069         REGEX_ASSERT(testUTextEqual(resultText, &text2));
   5070 
   5071         uregex_close(re);
   5072         utext_close(&text1);
   5073         utext_close(&text2);
   5074     }
   5075 
   5076     /*
   5077      *  group()
   5078      */
   5079     {
   5080         UChar    text1[80];
   5081         UText   *actual;
   5082         UBool    result;
   5083         int64_t  length = 0;
   5084 
   5085         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  UPRV_LENGTHOF(text1));
   5086         //                  012345678901234567890123456789012345678901234567
   5087         //                  0         1         2         3         4
   5088 
   5089         status = U_ZERO_ERROR;
   5090         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
   5091         REGEX_CHECK_STATUS;
   5092 
   5093         uregex_setText(re, text1, -1, &status);
   5094         result = uregex_find(re, 0, &status);
   5095         REGEX_ASSERT(result==TRUE);
   5096 
   5097         /*  Capture Group 0, the full match.  Should succeed. "abc interior def" */
   5098         status = U_ZERO_ERROR;
   5099         actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
   5100         REGEX_CHECK_STATUS;
   5101         REGEX_ASSERT(actual == &bufferText);
   5102         REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
   5103         REGEX_ASSERT(length == 16);
   5104         REGEX_ASSERT(utext_nativeLength(actual) == 47);
   5105 
   5106         /*  Capture group #1.  Should succeed, matching " interior ". */
   5107         status = U_ZERO_ERROR;
   5108         actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
   5109         REGEX_CHECK_STATUS;
   5110         REGEX_ASSERT(actual == &bufferText);
   5111         REGEX_ASSERT(utext_getNativeIndex(actual) == 9);   // position of " interior "
   5112         REGEX_ASSERT(length == 10);
   5113         REGEX_ASSERT(utext_nativeLength(actual) == 47);
   5114 
   5115         /*  Capture group out of range.  Error. */
   5116         status = U_ZERO_ERROR;
   5117         actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
   5118         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   5119         REGEX_ASSERT(actual == &bufferText);
   5120         uregex_close(re);
   5121 
   5122     }
   5123 
   5124     /*
   5125      *  replaceFirst()
   5126      */
   5127     {
   5128         UChar    text1[80];
   5129         UChar    text2[80];
   5130         UText    replText = UTEXT_INITIALIZER;
   5131         UText   *result;
   5132         status = U_ZERO_ERROR;
   5133         utext_openUnicodeString(&bufferText, &buffer, &status);
   5134 
   5135         status = U_ZERO_ERROR;
   5136         u_uastrncpy(text1, "Replace xaax x1x x...x.",  UPRV_LENGTHOF(text1));
   5137         u_uastrncpy(text2, "No match here.",  UPRV_LENGTHOF(text2)/2);
   5138         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
   5139 
   5140         re = uregex_openC("x(.*?)x", 0, NULL, &status);
   5141         REGEX_CHECK_STATUS;
   5142 
   5143         /*  Normal case, with match */
   5144         uregex_setText(re, text1, -1, &status);
   5145         REGEX_CHECK_STATUS;
   5146         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5147         REGEX_CHECK_STATUS;
   5148         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5149         REGEX_CHECK_STATUS;
   5150         REGEX_ASSERT(result == &bufferText);
   5151         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
   5152 
   5153         /* No match.  Text should copy to output with no changes.  */
   5154         uregex_setText(re, text2, -1, &status);
   5155         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5156         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5157         REGEX_CHECK_STATUS;
   5158         REGEX_ASSERT(result == &bufferText);
   5159         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
   5160 
   5161         /* Unicode escapes */
   5162         uregex_setText(re, text1, -1, &status);
   5163         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
   5164         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5165         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5166         REGEX_CHECK_STATUS;
   5167         REGEX_ASSERT(result == &bufferText);
   5168         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
   5169 
   5170         uregex_close(re);
   5171         utext_close(&replText);
   5172     }
   5173 
   5174 
   5175     /*
   5176      *  replaceAll()
   5177      */
   5178     {
   5179         UChar    text1[80];
   5180         UChar    text2[80];
   5181         UText    replText = UTEXT_INITIALIZER;
   5182         UText   *result;
   5183 
   5184         status = U_ZERO_ERROR;
   5185         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
   5186         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
   5187         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
   5188 
   5189         re = uregex_openC("x(.*?)x", 0, NULL, &status);
   5190         REGEX_CHECK_STATUS;
   5191 
   5192         /*  Normal case, with match */
   5193         uregex_setText(re, text1, -1, &status);
   5194         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5195         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
   5196         REGEX_CHECK_STATUS;
   5197         REGEX_ASSERT(result == &bufferText);
   5198         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
   5199 
   5200         /* No match.  Text should copy to output with no changes.  */
   5201         uregex_setText(re, text2, -1, &status);
   5202         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5203         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
   5204         REGEX_CHECK_STATUS;
   5205         REGEX_ASSERT(result == &bufferText);
   5206         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
   5207 
   5208         uregex_close(re);
   5209         utext_close(&replText);
   5210     }
   5211 
   5212 
   5213     /*
   5214      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
   5215      *   so we don't need to test it here.
   5216      */
   5217 
   5218     utext_close(&bufferText);
   5219     utext_close(&patternText);
   5220 }
   5221 
   5222 
   5223 //--------------------------------------------------------------
   5224 //
   5225 //  NamedCapture   Check basic named capture group functionality
   5226 //
   5227 //--------------------------------------------------------------
   5228 void RegexTest::NamedCapture() {
   5229     UErrorCode status = U_ZERO_ERROR;
   5230     RegexPattern *pat = RegexPattern::compile(UnicodeString(
   5231             "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
   5232     REGEX_CHECK_STATUS;
   5233     int32_t group = pat->groupNumberFromName("five", -1, status);
   5234     REGEX_CHECK_STATUS;
   5235     REGEX_ASSERT(5 == group);
   5236     group = pat->groupNumberFromName("three", -1, status);
   5237     REGEX_CHECK_STATUS;
   5238     REGEX_ASSERT(3 == group);
   5239 
   5240     status = U_ZERO_ERROR;
   5241     group = pat->groupNumberFromName(UnicodeString("six"), status);
   5242     REGEX_CHECK_STATUS;
   5243     REGEX_ASSERT(6 == group);
   5244 
   5245     status = U_ZERO_ERROR;
   5246     group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
   5247     U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5248 
   5249     status = U_ZERO_ERROR;
   5250 
   5251     // After copying a pattern, named capture should still work in the copy.
   5252     RegexPattern *copiedPat = new RegexPattern(*pat);
   5253     REGEX_ASSERT(*copiedPat == *pat);
   5254     delete pat; pat = NULL;  // Delete original, copy should have no references back to it.
   5255 
   5256     group = copiedPat->groupNumberFromName("five", -1, status);
   5257     REGEX_CHECK_STATUS;
   5258     REGEX_ASSERT(5 == group);
   5259     group = copiedPat->groupNumberFromName("three", -1, status);
   5260     REGEX_CHECK_STATUS;
   5261     REGEX_ASSERT(3 == group);
   5262     delete copiedPat;
   5263 
   5264     // ReplaceAll with named capture group.
   5265     status = U_ZERO_ERROR;
   5266     UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
   5267     RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
   5268     REGEX_CHECK_STATUS;
   5269     // m.pattern().dumpPattern();
   5270     UnicodeString replacedText = m->replaceAll("'${mid}'", status);
   5271     REGEX_CHECK_STATUS;
   5272     REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
   5273     delete m;
   5274 
   5275     // ReplaceAll, allowed capture group numbers.
   5276     text = UnicodeString("abcmxyz");
   5277     m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
   5278     REGEX_CHECK_STATUS;
   5279 
   5280     status = U_ZERO_ERROR;
   5281     replacedText  = m->replaceAll(UnicodeString("<$0>"), status);   // group 0, full match, is allowed.
   5282     REGEX_CHECK_STATUS;
   5283     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
   5284 
   5285     status = U_ZERO_ERROR;
   5286     replacedText  = m->replaceAll(UnicodeString("<$1>"), status);      // group 1 by number.
   5287     REGEX_CHECK_STATUS;
   5288     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
   5289 
   5290     status = U_ZERO_ERROR;
   5291     replacedText  = m->replaceAll(UnicodeString("<${one}>"), status);   // group 1 by name.
   5292     REGEX_CHECK_STATUS;
   5293     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
   5294 
   5295     status = U_ZERO_ERROR;
   5296     replacedText  = m->replaceAll(UnicodeString("<$2>"), status);   // group 2.
   5297     REGEX_CHECK_STATUS;
   5298     REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
   5299 
   5300     status = U_ZERO_ERROR;
   5301     replacedText  = m->replaceAll(UnicodeString("<$3>"), status);
   5302     REGEX_CHECK_STATUS;
   5303     REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
   5304 
   5305     status = U_ZERO_ERROR;
   5306     replacedText  = m->replaceAll(UnicodeString("<$4>"), status);
   5307     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   5308 
   5309     status = U_ZERO_ERROR;
   5310     replacedText  = m->replaceAll(UnicodeString("<$04>"), status);      // group 0, leading 0,
   5311     REGEX_CHECK_STATUS;                                                 //    trailing out-of-range 4 passes through.
   5312     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
   5313 
   5314     status = U_ZERO_ERROR;
   5315     replacedText  = m->replaceAll(UnicodeString("<$000016>"), status);  // Consume leading zeroes. Don't consume digits
   5316     REGEX_CHECK_STATUS;                                                 //   that push group num out of range.
   5317     REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText);              //   This is group 1.
   5318 
   5319     status = U_ZERO_ERROR;
   5320     replacedText  = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
   5321     REGEX_CHECK_STATUS;
   5322     REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
   5323 
   5324     status = U_ZERO_ERROR;
   5325     replacedText  = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
   5326     REGEX_CHECK_STATUS;
   5327     REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
   5328 
   5329     status = U_ZERO_ERROR;
   5330     replacedText  = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
   5331     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5332 
   5333     status = U_ZERO_ERROR;
   5334     replacedText  = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
   5335     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5336 
   5337     status = U_ZERO_ERROR;
   5338     replacedText  = m->replaceAll(UnicodeString("<${one"), status);
   5339     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5340 
   5341     status = U_ZERO_ERROR;
   5342     replacedText  = m->replaceAll(UnicodeString("$not a capture group"), status);
   5343     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5344 
   5345     delete m;
   5346 
   5347     // Repeat the above replaceAll() tests using the plain C API, which
   5348     //  has a separate implementation internally.
   5349     //  TODO: factor out the test data.
   5350 
   5351     status = U_ZERO_ERROR;
   5352     URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
   5353     REGEX_CHECK_STATUS;
   5354     text = UnicodeString("abcmxyz");
   5355     uregex_setText(re, text.getBuffer(), text.length(), &status);
   5356     REGEX_CHECK_STATUS;
   5357 
   5358     UChar resultBuf[100];
   5359     int32_t resultLength;
   5360     UnicodeString repl;
   5361 
   5362     status = U_ZERO_ERROR;
   5363     repl = UnicodeString("<$0>");
   5364     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5365     REGEX_CHECK_STATUS;
   5366     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
   5367 
   5368     status = U_ZERO_ERROR;
   5369     repl = UnicodeString("<$1>");
   5370     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5371     REGEX_CHECK_STATUS;
   5372     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
   5373 
   5374     status = U_ZERO_ERROR;
   5375     repl = UnicodeString("<${one}>");
   5376     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5377     REGEX_CHECK_STATUS;
   5378     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
   5379 
   5380     status = U_ZERO_ERROR;
   5381     repl = UnicodeString("<$2>");
   5382     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5383     REGEX_CHECK_STATUS;
   5384     REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
   5385 
   5386     status = U_ZERO_ERROR;
   5387     repl = UnicodeString("<$3>");
   5388     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5389     REGEX_CHECK_STATUS;
   5390     REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
   5391 
   5392     status = U_ZERO_ERROR;
   5393     repl = UnicodeString("<$4>");
   5394     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5395     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   5396 
   5397     status = U_ZERO_ERROR;
   5398     repl = UnicodeString("<$04>");
   5399     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5400     REGEX_CHECK_STATUS;
   5401     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
   5402 
   5403     status = U_ZERO_ERROR;
   5404     repl = UnicodeString("<$000016>");
   5405     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5406     REGEX_CHECK_STATUS;
   5407     REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
   5408 
   5409     status = U_ZERO_ERROR;
   5410     repl = UnicodeString("<$3$2$1${one}>");
   5411     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5412     REGEX_CHECK_STATUS;
   5413     REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
   5414 
   5415     status = U_ZERO_ERROR;
   5416     repl = UnicodeString("$3$2$1${one}");
   5417     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5418     REGEX_CHECK_STATUS;
   5419     REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
   5420 
   5421     status = U_ZERO_ERROR;
   5422     repl = UnicodeString("<${noSuchName}>");
   5423     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5424     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5425 
   5426     status = U_ZERO_ERROR;
   5427     repl = UnicodeString("<${invalid-name}>");
   5428     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5429     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5430 
   5431     status = U_ZERO_ERROR;
   5432     repl = UnicodeString("<${one");
   5433     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5434     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5435 
   5436     status = U_ZERO_ERROR;
   5437     repl = UnicodeString("$not a capture group");
   5438     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5439     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5440 
   5441     uregex_close(re);
   5442 }
   5443 
   5444 //--------------------------------------------------------------
   5445 //
   5446 //  NamedCaptureLimits   Patterns with huge numbers of named capture groups.
   5447 //                       The point is not so much what the exact limit is,
   5448 //                       but that a largish number doesn't hit bad non-linear performance,
   5449 //                       and that exceeding the limit fails cleanly.
   5450 //
   5451 //--------------------------------------------------------------
   5452 void RegexTest::NamedCaptureLimits() {
   5453     if (quick) {
   5454         logln("Skipping test. Runs in exhuastive mode only.");
   5455         return;
   5456     }
   5457     const int32_t goodLimit = 1000000;     // Pattern w this many groups builds successfully.
   5458     const int32_t failLimit = 10000000;    // Pattern exceeds internal limits, fails to compile.
   5459     char nnbuf[100];
   5460     UnicodeString pattern;
   5461     int32_t nn;
   5462 
   5463     for (nn=1; nn<goodLimit; nn++) {
   5464         sprintf(nnbuf, "(?<nn%d>)", nn);
   5465         pattern.append(UnicodeString(nnbuf, -1, US_INV));
   5466     }
   5467     UErrorCode status = U_ZERO_ERROR;
   5468     RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
   5469     REGEX_CHECK_STATUS;
   5470     for (nn=1; nn<goodLimit; nn++) {
   5471         sprintf(nnbuf, "nn%d", nn);
   5472         int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
   5473         REGEX_ASSERT(nn == groupNum);
   5474         if (nn != groupNum) {
   5475             break;
   5476         }
   5477     }
   5478     delete pat;
   5479 
   5480     pattern.remove();
   5481     for (nn=1; nn<failLimit; nn++) {
   5482         sprintf(nnbuf, "(?<nn%d>)", nn);
   5483         pattern.append(UnicodeString(nnbuf, -1, US_INV));
   5484     }
   5485     status = U_ZERO_ERROR;
   5486     pat = RegexPattern::compile(pattern, 0, status);
   5487     REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
   5488     delete pat;
   5489 }
   5490 
   5491 
   5492 //--------------------------------------------------------------
   5493 //
   5494 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
   5495 //
   5496 //---------------------------------------------------------------
   5497 void RegexTest::Bug7651() {
   5498     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
   5499     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
   5500     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
   5501     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
   5502     UnicodeString s("#ff @abcd This is test");
   5503     RegexPattern  *REPattern = NULL;
   5504     RegexMatcher  *REMatcher = NULL;
   5505     UErrorCode status = U_ZERO_ERROR;
   5506     UParseError pe;
   5507 
   5508     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
   5509     REGEX_CHECK_STATUS;
   5510     REMatcher = REPattern->matcher(s, status);
   5511     REGEX_CHECK_STATUS;
   5512     REGEX_ASSERT(REMatcher->find());
   5513     REGEX_ASSERT(REMatcher->start(status) == 0);
   5514     delete REPattern;
   5515     delete REMatcher;
   5516     status = U_ZERO_ERROR;
   5517 
   5518     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
   5519     REGEX_CHECK_STATUS;
   5520     REMatcher = REPattern->matcher(s, status);
   5521     REGEX_CHECK_STATUS;
   5522     REGEX_ASSERT(REMatcher->find());
   5523     REGEX_ASSERT(REMatcher->start(status) == 0);
   5524     delete REPattern;
   5525     delete REMatcher;
   5526     status = U_ZERO_ERROR;
   5527  }
   5528 
   5529 void RegexTest::Bug7740() {
   5530     UErrorCode status = U_ZERO_ERROR;
   5531     UnicodeString pattern = "(a)";
   5532     UnicodeString text = "abcdef";
   5533     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
   5534     REGEX_CHECK_STATUS;
   5535     REGEX_ASSERT(m->lookingAt(status));
   5536     REGEX_CHECK_STATUS;
   5537     status = U_ILLEGAL_ARGUMENT_ERROR;
   5538     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
   5539     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   5540     REGEX_ASSERT(s == "");
   5541     delete m;
   5542 }
   5543 
   5544 // Bug 8479:  was crashing whith a Bogus UnicodeString as input.
   5545 
   5546 void RegexTest::Bug8479() {
   5547     UErrorCode status = U_ZERO_ERROR;
   5548 
   5549     RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
   5550     REGEX_CHECK_STATUS;
   5551     if (U_SUCCESS(status))
   5552     {
   5553         UnicodeString str;
   5554         str.setToBogus();
   5555         pMatcher->reset(str);
   5556         status = U_ZERO_ERROR;
   5557         pMatcher->matches(status);
   5558         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   5559         delete pMatcher;
   5560     }
   5561 }
   5562 
   5563 
   5564 // Bug 7029
   5565 void RegexTest::Bug7029() {
   5566     UErrorCode status = U_ZERO_ERROR;
   5567 
   5568     RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
   5569     UnicodeString text = "abc.def";
   5570     UnicodeString splits[10];
   5571     REGEX_CHECK_STATUS;
   5572     int32_t numFields = pMatcher->split(text, splits, 10, status);
   5573     REGEX_CHECK_STATUS;
   5574     REGEX_ASSERT(numFields == 8);
   5575     delete pMatcher;
   5576 }
   5577 
   5578 // Bug 9283
   5579 //   This test is checking for the existance of any supplemental characters that case-fold
   5580 //   to a bmp character.
   5581 //
   5582 //   At the time of this writing there are none. If any should appear in a subsequent release
   5583 //   of Unicode, the code in regular expressions compilation that determines the longest
   5584 //   posssible match for a literal string  will need to be enhanced.
   5585 //
   5586 //   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
   5587 //   for details on what to do in case of a failure of this test.
   5588 //
   5589 void RegexTest::Bug9283() {
   5590 #if !UCONFIG_NO_NORMALIZATION
   5591     UErrorCode status = U_ZERO_ERROR;
   5592     UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
   5593     REGEX_CHECK_STATUS;
   5594     int32_t index;
   5595     UChar32 c;
   5596     for (index=0; ; index++) {
   5597         c = supplementalsWithCaseFolding.charAt(index);
   5598         if (c == -1) {
   5599             break;
   5600         }
   5601         UnicodeString cf = UnicodeString(c).foldCase();
   5602         REGEX_ASSERT(cf.length() >= 2);
   5603     }
   5604 #endif /* #if !UCONFIG_NO_NORMALIZATION */
   5605 }
   5606 
   5607 
   5608 void RegexTest::CheckInvBufSize() {
   5609   if(inv_next>=INV_BUFSIZ) {
   5610     errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
   5611           __FILE__, INV_BUFSIZ, inv_next);
   5612   } else {
   5613     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
   5614   }
   5615 }
   5616 
   5617 
   5618 void RegexTest::Bug10459() {
   5619     UErrorCode status = U_ZERO_ERROR;
   5620     UnicodeString patternString("(txt)");
   5621     UnicodeString txtString("txt");
   5622 
   5623     UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
   5624     REGEX_CHECK_STATUS;
   5625     UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
   5626     REGEX_CHECK_STATUS;
   5627 
   5628     URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
   5629     REGEX_CHECK_STATUS;
   5630 
   5631     uregex_setUText(icu_re, utext_txt, &status);
   5632     REGEX_CHECK_STATUS;
   5633 
   5634     // The bug was that calling uregex_group() before doing a matching operation
   5635     //   was causing a segfault. Only for Regular Expressions created from UText.
   5636     //   It should set an U_REGEX_INVALID_STATE.
   5637 
   5638     UChar buf[100];
   5639     int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
   5640     REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
   5641     REGEX_ASSERT(len == 0);
   5642 
   5643     uregex_close(icu_re);
   5644     utext_close(utext_pat);
   5645     utext_close(utext_txt);
   5646 }
   5647 
   5648 void RegexTest::TestCaseInsensitiveStarters() {
   5649     // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
   5650     //  become stale because of new Unicode characters.
   5651     // If it is stale, rerun the generation tool
   5652     //    svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
   5653     // and replace the embedded data in i18n/regexcmp.cpp
   5654 
   5655     for (UChar32 cp=0; cp<=0x10ffff; cp++) {
   5656         if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
   5657             continue;
   5658         }
   5659         UnicodeSet s(cp, cp);
   5660         s.closeOver(USET_CASE_INSENSITIVE);
   5661         UnicodeSetIterator setIter(s);
   5662         while (setIter.next()) {
   5663             if (!setIter.isString()) {
   5664                 continue;
   5665             }
   5666             const UnicodeString &str = setIter.getString();
   5667             UChar32 firstChar = str.char32At(0);
   5668             UnicodeSet starters;
   5669             RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
   5670             if (!starters.contains(cp)) {
   5671                 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
   5672                 return;
   5673             }
   5674         }
   5675     }
   5676 }
   5677 
   5678 
   5679 void RegexTest::TestBug11049() {
   5680     // Original bug report: pattern with match start consisting of one of several individual characters,
   5681     //  and the text being matched ending with a supplementary character. find() would read past the
   5682     //  end of the input text when searching for potential match starting points.
   5683 
   5684     // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
   5685     // detect the bad read.
   5686 
   5687     TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
   5688     TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
   5689 
   5690     // Test again with a pattern starting with a single character,
   5691     // which takes a different code path than starting with an OR expression,
   5692     // but with similar logic.
   5693     TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
   5694     TestCase11049("C", "string matches at end C", TRUE, __LINE__);
   5695 }
   5696 
   5697 // Run a single test case from TestBug11049(). Internal function.
   5698 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
   5699     UErrorCode status = U_ZERO_ERROR;
   5700     UnicodeString patternString = UnicodeString(pattern).unescape();
   5701     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
   5702 
   5703     UnicodeString dataString = UnicodeString(data).unescape();
   5704     UChar *exactBuffer = new UChar[dataString.length()];
   5705     dataString.extract(exactBuffer, dataString.length(), status);
   5706     UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
   5707 
   5708     LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
   5709     REGEX_CHECK_STATUS;
   5710     matcher->reset(ut);
   5711     UBool result = matcher->find();
   5712     if (result != expectMatch) {
   5713         errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
   5714               __FILE__, lineNumber, expectMatch, result, pattern, data);
   5715     }
   5716 
   5717     // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
   5718     //   off-by-one on find() with match at the last code point.
   5719     //   Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
   5720     //   because string.unescape() will only shrink it.
   5721     char * utf8Buffer = new char[uprv_strlen(data)+1];
   5722     u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
   5723     REGEX_CHECK_STATUS;
   5724     ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
   5725     REGEX_CHECK_STATUS;
   5726     matcher->reset(ut);
   5727     result = matcher->find();
   5728     if (result != expectMatch) {
   5729         errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
   5730               __FILE__, lineNumber, expectMatch, result, pattern, data);
   5731     }
   5732     delete [] utf8Buffer;
   5733 
   5734     utext_close(ut);
   5735     delete [] exactBuffer;
   5736 }
   5737 
   5738 
   5739 void RegexTest::TestBug11371() {
   5740     if (quick) {
   5741         logln("Skipping test. Runs in exhuastive mode only.");
   5742         return;
   5743     }
   5744     UErrorCode status = U_ZERO_ERROR;
   5745     UnicodeString patternString;
   5746 
   5747     for (int i=0; i<8000000; i++) {
   5748         patternString.append(UnicodeString("()"));
   5749     }
   5750     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
   5751     if (status != U_REGEX_PATTERN_TOO_BIG) {
   5752         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
   5753               __FILE__, __LINE__, u_errorName(status));
   5754     }
   5755 
   5756     status = U_ZERO_ERROR;
   5757     patternString = "(";
   5758     for (int i=0; i<20000000; i++) {
   5759         patternString.append(UnicodeString("A++"));
   5760     }
   5761     patternString.append(UnicodeString("){0}B++"));
   5762     LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
   5763     if (status != U_REGEX_PATTERN_TOO_BIG) {
   5764         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
   5765               __FILE__, __LINE__, u_errorName(status));
   5766     }
   5767 
   5768     // Pattern with too much string data, such that string indexes overflow operand data field size
   5769     // in compiled instruction.
   5770     status = U_ZERO_ERROR;
   5771     patternString = "";
   5772     while (patternString.length() < 0x00ffffff) {
   5773         patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
   5774     }
   5775     patternString.append(UnicodeString("X? trailing string"));
   5776     LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
   5777     if (status != U_REGEX_PATTERN_TOO_BIG) {
   5778         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
   5779               __FILE__, __LINE__, u_errorName(status));
   5780     }
   5781 }
   5782 
   5783 void RegexTest::TestBug11480() {
   5784     // C API, get capture group of a group that does not participate in the match.
   5785     //        (Returns a zero length string, with nul termination,
   5786     //         indistinguishable from a group with a zero length match.)
   5787 
   5788     UErrorCode status = U_ZERO_ERROR;
   5789     URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
   5790     REGEX_CHECK_STATUS;
   5791     UnicodeString text = UNICODE_STRING_SIMPLE("A");
   5792     uregex_setText(re, text.getBuffer(), text.length(), &status);
   5793     REGEX_CHECK_STATUS;
   5794     REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
   5795     UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
   5796     int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
   5797     REGEX_ASSERT(length == 0);
   5798     REGEX_ASSERT(buf[0] == 13);
   5799     REGEX_ASSERT(buf[1] == 0);
   5800     REGEX_ASSERT(buf[2] == 13);
   5801     uregex_close(re);
   5802 
   5803     // UText C++ API, length of match is 0 for non-participating matches.
   5804     UText ut = UTEXT_INITIALIZER;
   5805     utext_openUnicodeString(&ut, &text, &status);
   5806     RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
   5807     REGEX_CHECK_STATUS;
   5808     matcher.reset(&ut);
   5809     REGEX_ASSERT(matcher.lookingAt(0, status));
   5810 
   5811     // UText C++ API, Capture group 1 matches "A", position 0, length 1.
   5812     int64_t groupLen = -666;
   5813     UText group = UTEXT_INITIALIZER;
   5814     matcher.group(1, &group, groupLen, status);
   5815     REGEX_CHECK_STATUS;
   5816     REGEX_ASSERT(groupLen == 1);
   5817     REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
   5818 
   5819     // Capture group 2, the (B), does not participate in the match.
   5820     matcher.group(2, &group, groupLen, status);
   5821     REGEX_CHECK_STATUS;
   5822     REGEX_ASSERT(groupLen == 0);
   5823     REGEX_ASSERT(matcher.start(2, status) == -1);
   5824     REGEX_CHECK_STATUS;
   5825 }
   5826 
   5827 
   5828 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
   5829