Home | History | Annotate | Download | only in intltest
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /********************************************************************
      4  * COPYRIGHT:
      5  * Copyright (c) 2002-2016, International Business Machines Corporation and
      6  * others. All Rights Reserved.
      7  ********************************************************************/
      8 
      9 //
     10 //   regextst.cpp
     11 //
     12 //      ICU Regular Expressions test, part of intltest.
     13 //
     14 
     15 /*
     16      NOTE!!
     17 
     18      PLEASE be careful about ASCII assumptions in this test.
     19      This test is one of the worst repeat offenders.
     20      If you have questions, contact someone on the ICU PMC
     21      who has access to an EBCDIC system.
     22 
     23  */
     24 
     25 #include "intltest.h"
     26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     27 
     28 #include <stdlib.h>
     29 #include <stdio.h>
     30 #include <string.h>
     31 
     32 #include "unicode/localpointer.h"
     33 #include "unicode/regex.h"
     34 #include "unicode/uchar.h"
     35 #include "unicode/ucnv.h"
     36 #include "unicode/uniset.h"
     37 #include "unicode/uregex.h"
     38 #include "unicode/usetiter.h"
     39 #include "unicode/ustring.h"
     40 #include "unicode/utext.h"
     41 #include "unicode/utf16.h"
     42 #include "cstr.h"
     43 #include "regextst.h"
     44 #include "regexcmp.h"
     45 #include "uvector.h"
     46 #include "util.h"
     47 #include "cmemory.h"
     48 #include "cstring.h"
     49 #include "uinvchar.h"
     50 
     51 #define SUPPORT_MUTATING_INPUT_STRING   0
     52 
     53 //---------------------------------------------------------------------------
     54 //
     55 //  Test class boilerplate
     56 //
     57 //---------------------------------------------------------------------------
     58 RegexTest::RegexTest()
     59 {
     60 }
     61 
     62 
     63 RegexTest::~RegexTest()
     64 {
     65 }
     66 
     67 
     68 
     69 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
     70 {
     71     if (exec) logln("TestSuite RegexTest: ");
     72     TESTCASE_AUTO_BEGIN;
     73     TESTCASE_AUTO(Basic);
     74     TESTCASE_AUTO(API_Match);
     75     TESTCASE_AUTO(API_Replace);
     76     TESTCASE_AUTO(API_Pattern);
     77 #if !UCONFIG_NO_FILE_IO
     78     TESTCASE_AUTO(Extended);
     79 #endif
     80     TESTCASE_AUTO(Errors);
     81     TESTCASE_AUTO(PerlTests);
     82     TESTCASE_AUTO(Callbacks);
     83     TESTCASE_AUTO(FindProgressCallbacks);
     84     TESTCASE_AUTO(Bug6149);
     85     TESTCASE_AUTO(UTextBasic);
     86     TESTCASE_AUTO(API_Match_UTF8);
     87     TESTCASE_AUTO(API_Replace_UTF8);
     88     TESTCASE_AUTO(API_Pattern_UTF8);
     89     TESTCASE_AUTO(PerlTestsUTF8);
     90     TESTCASE_AUTO(PreAllocatedUTextCAPI);
     91     TESTCASE_AUTO(Bug7651);
     92     TESTCASE_AUTO(Bug7740);
     93     TESTCASE_AUTO(Bug8479);
     94     TESTCASE_AUTO(Bug7029);
     95     TESTCASE_AUTO(CheckInvBufSize);
     96     TESTCASE_AUTO(Bug9283);
     97     TESTCASE_AUTO(Bug10459);
     98     TESTCASE_AUTO(TestCaseInsensitiveStarters);
     99     TESTCASE_AUTO(TestBug11049);
    100     TESTCASE_AUTO(TestBug11371);
    101     TESTCASE_AUTO(TestBug11480);
    102     TESTCASE_AUTO(NamedCapture);
    103     TESTCASE_AUTO(NamedCaptureLimits);
    104     TESTCASE_AUTO(TestBug12884);
    105     TESTCASE_AUTO(TestBug13631);
    106     TESTCASE_AUTO_END;
    107 }
    108 
    109 
    110 /**
    111  * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
    112  * into ASCII.
    113  * @see utext_openUTF8
    114  */
    115 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
    116 
    117 //---------------------------------------------------------------------------
    118 //
    119 //   Error Checking / Reporting macros used in all of the tests.
    120 //
    121 //---------------------------------------------------------------------------
    122 
    123 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
    124   int64_t oldIndex = utext_getNativeIndex(text);
    125   utext_setNativeIndex(text, 0);
    126   char *bufPtr = buf;
    127   UChar32 c = utext_next32From(text, 0);
    128   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
    129     if (0x000020<=c && c<0x00007e) {
    130       *bufPtr = c;
    131     } else {
    132 #if 0
    133       sprintf(bufPtr,"U+%04X", c);
    134       bufPtr+= strlen(bufPtr)-1;
    135 #else
    136       *bufPtr = '%';
    137 #endif
    138     }
    139     bufPtr++;
    140     c = UTEXT_NEXT32(text);
    141   }
    142   *bufPtr = 0;
    143 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
    144   char *ebuf = (char*)malloc(bufLen);
    145   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
    146   uprv_strncpy(buf, ebuf, bufLen);
    147   free((void*)ebuf);
    148 #endif
    149   utext_setNativeIndex(text, oldIndex);
    150 }
    151 
    152 
    153 static char ASSERT_BUF[1024];
    154 
    155 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
    156   if(message.length()==0) {
    157     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
    158   } else {
    159     UnicodeString buf;
    160     IntlTest::prettify(message,buf);
    161     if(buf.length()==0) {
    162       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
    163     } else {
    164       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
    165       if(ASSERT_BUF[0]==0) {
    166         ASSERT_BUF[0]=0;
    167         for(int32_t i=0;i<buf.length();i++) {
    168           UChar ch = buf[i];
    169           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
    170         }
    171       }
    172     }
    173   }
    174   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
    175   return ASSERT_BUF;
    176 }
    177 
    // Logs the printable form of a UText, tagged with the call site and the
    // spelling of the argument expression.
    #define REGEX_VERBOSE_TEXT(text) { \
        char buf[200]; \
        utextToPrintable(buf, UPRV_LENGTHOF(buf), text); \
        logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
    }

    // Reports a data error and returns from the enclosing (void) function when
    // the local variable `status` holds a failure code.
    #define REGEX_CHECK_STATUS { \
        if (U_FAILURE(status)) { \
            dataerrln("%s:%d: RegexTest failure.  status=%s", \
                      __FILE__, __LINE__, u_errorName(status)); \
            return; \
        } \
    }

    // Reports an error when expr evaluates to FALSE; execution continues.
    #define REGEX_ASSERT(expr) { \
        if ((expr)==FALSE) { \
            errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
        }; \
    }

    // Evaluates expr against a fresh local `status` (shadowing any outer one) and
    // reports a data error unless that status ends up equal to errcode.
    #define REGEX_ASSERT_FAIL(expr, errcode) { \
        UErrorCode status=U_ZERO_ERROR; \
        (expr); \
        if (status!=errcode) { \
            dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
                      __LINE__, u_errorName(errcode), u_errorName(status)); \
        }; \
    }

    // Like REGEX_CHECK_STATUS but also reports a caller-supplied line number, and
    // does not return.
    #define REGEX_CHECK_STATUS_L(line) { \
        if (U_FAILURE(status)) { \
            errln("RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); \
        } \
    }

    // Reports an error with a caller-supplied line number and returns from the
    // enclosing (void) function when expr evaluates to FALSE.
    #define REGEX_ASSERT_L(expr, line) { \
        if ((expr)==FALSE) { \
            errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
            return; \
        } \
    }

    // Compares an expected string with an actual UnicodeString.
    //   expected: const char *, restricted to invariant characters.
    //   actual:   const UnicodeString &
    #define REGEX_ASSERT_UNISTR(expected, actual) { \
        if (UnicodeString(expected, -1, US_INV) != (actual)) { \
            errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
                  __FILE__, __LINE__, expected, extractToAssertBuf(actual)); \
        }; \
    }
    201 
    202 
    203 static UBool testUTextEqual(UText *uta, UText *utb) {
    204     UChar32 ca = 0;
    205     UChar32 cb = 0;
    206     utext_setNativeIndex(uta, 0);
    207     utext_setNativeIndex(utb, 0);
    208     do {
    209         ca = utext_next32(uta);
    210         cb = utext_next32(utb);
    211         if (ca != cb) {
    212             break;
    213         }
    214     } while (ca != U_SENTINEL);
    215     return ca == cb;
    216 }
    217 
    218 
    219 /**
    220  * @param expected expected text in UTF-8 (not platform) codepage
    221  */
    222 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
    223     UErrorCode status = U_ZERO_ERROR;
    224     UText expectedText = UTEXT_INITIALIZER;
    225     utext_openUTF8(&expectedText, expected, -1, &status);
    226     if(U_FAILURE(status)) {
    227       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
    228       return;
    229     }
    230     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
    231       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
    232       return;
    233     }
    234     utext_setNativeIndex(actual, 0);
    235     if (!testUTextEqual(&expectedText, actual)) {
    236         char buf[201 /*21*/];
    237         char expectedBuf[201];
    238         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
    239         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
    240         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
    241     }
    242     utext_close(&expectedText);
    243 }
    244 /**
    245  * @param expected invariant (platform local text) input
    246  */
    247 
    248 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
    249     UErrorCode status = U_ZERO_ERROR;
    250     UText expectedText = UTEXT_INITIALIZER;
    251     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
    252     if(U_FAILURE(status)) {
    253       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
    254       return;
    255     }
    256     utext_setNativeIndex(actual, 0);
    257     if (!testUTextEqual(&expectedText, actual)) {
    258         char buf[201 /*21*/];
    259         char expectedBuf[201];
    260         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
    261         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
    262         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
    263     }
    264     utext_close(&expectedText);
    265 }
    266 
    /**
     * Asserts that a UText equals an expected string given as UTF-8 bytes.
     * The call site's file and line are passed along for the failure message.
     */
    #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) \
        assertUText((expected), (actual), __FILE__, __LINE__)

    /**
     * Asserts that a UText equals an expected string given as invariant
     * characters.  The call site's file and line are passed along as above.
     */
    #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) \
        assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
    275 
    276 /**
    277  * This buffer ( inv_buf ) is used to hold the UTF-8 strings
    278  * passed into utext_openUTF8. An error will be given if
    279  * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
    280  */
    281 
    282 #define INV_BUFSIZ 2048 /* increase this if too small */
    283 
    284 static int64_t inv_next=0;
    285 
    286 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
    287 static char inv_buf[INV_BUFSIZ];
    288 #endif
    289 
    290 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
    291   if(length==-1) length=strlen(inv);
    292 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
    293   inv_next+=length;
    294   return utext_openUTF8(ut, inv, length, status);
    295 #else
    296   if(inv_next+length+1>INV_BUFSIZ) {
    297     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
    298             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
    299     *status = U_MEMORY_ALLOCATION_ERROR;
    300     return NULL;
    301   }
    302 
    303   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
    304   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
    305   inv_next+=length;
    306 
    307 #if 0
    308   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
    309 #endif
    310 
    311   return utext_openUTF8(ut, (const char*)buf, length, status);
    312 #endif
    313 }
    314 
    315 
    316 //---------------------------------------------------------------------------
    317 //
    318 //    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
    319 //                       for the LookingAt() and  Match() functions.
    320 //
    321 //       usage:
    322 //          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
    323 //
    324 //          The expected results are UBool - TRUE or FALSE.
    325 //          The input text is unescaped.  The pattern is not.
    326 //
    327 //
    328 //---------------------------------------------------------------------------
    329 
    330 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
    331 
    332 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    333     const UnicodeString pattern(pat, -1, US_INV);
    334     const UnicodeString inputText(text, -1, US_INV);
    335     UErrorCode          status  = U_ZERO_ERROR;
    336     UParseError         pe;
    337     RegexPattern        *REPattern = NULL;
    338     RegexMatcher        *REMatcher = NULL;
    339     UBool               retVal     = TRUE;
    340 
    341     UnicodeString patString(pat, -1, US_INV);
    342     REPattern = RegexPattern::compile(patString, 0, pe, status);
    343     if (U_FAILURE(status)) {
    344         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
    345             line, u_errorName(status));
    346         return FALSE;
    347     }
    348     if (line==376) { REPattern->dumpPattern();}
    349 
    350     UnicodeString inputString(inputText);
    351     UnicodeString unEscapedInput = inputString.unescape();
    352     REMatcher = REPattern->matcher(unEscapedInput, status);
    353     if (U_FAILURE(status)) {
    354         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
    355             line, u_errorName(status));
    356         return FALSE;
    357     }
    358 
    359     UBool actualmatch;
    360     actualmatch = REMatcher->lookingAt(status);
    361     if (U_FAILURE(status)) {
    362         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
    363             line, u_errorName(status));
    364         retVal =  FALSE;
    365     }
    366     if (actualmatch != looking) {
    367         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
    368         retVal = FALSE;
    369     }
    370 
    371     status = U_ZERO_ERROR;
    372     actualmatch = REMatcher->matches(status);
    373     if (U_FAILURE(status)) {
    374         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
    375             line, u_errorName(status));
    376         retVal = FALSE;
    377     }
    378     if (actualmatch != match) {
    379         errln("RegexTest: wrong return from matches() at line %d.\n", line);
    380         retVal = FALSE;
    381     }
    382 
    383     if (retVal == FALSE) {
    384         REPattern->dumpPattern();
    385     }
    386 
    387     delete REPattern;
    388     delete REMatcher;
    389     return retVal;
    390 }
    391 
    392 
    393 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
    394     UText               pattern    = UTEXT_INITIALIZER;
    395     int32_t             inputUTF8Length;
    396     char                *textChars = NULL;
    397     UText               inputText  = UTEXT_INITIALIZER;
    398     UErrorCode          status     = U_ZERO_ERROR;
    399     UParseError         pe;
    400     RegexPattern        *REPattern = NULL;
    401     RegexMatcher        *REMatcher = NULL;
    402     UBool               retVal     = TRUE;
    403 
    404     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
    405     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
    406     if (U_FAILURE(status)) {
    407         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
    408             line, u_errorName(status));
    409         return FALSE;
    410     }
    411 
    412     UnicodeString inputString(text, -1, US_INV);
    413     UnicodeString unEscapedInput = inputString.unescape();
    414     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
    415     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
    416 
    417     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
    418     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
    419         // UTF-8 does not allow unpaired surrogates, so this could actually happen
    420         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
    421         return TRUE; // not a failure of the Regex engine
    422     }
    423     status = U_ZERO_ERROR; // buffer overflow
    424     textChars = new char[inputUTF8Length+1];
    425     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
    426     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
    427 
    428     REMatcher = &REPattern->matcher(status)->reset(&inputText);
    429     if (U_FAILURE(status)) {
    430         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
    431             line, u_errorName(status));
    432         return FALSE;
    433     }
    434 
    435     UBool actualmatch;
    436     actualmatch = REMatcher->lookingAt(status);
    437     if (U_FAILURE(status)) {
    438         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
    439             line, u_errorName(status));
    440         retVal =  FALSE;
    441     }
    442     if (actualmatch != looking) {
    443         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
    444         retVal = FALSE;
    445     }
    446 
    447     status = U_ZERO_ERROR;
    448     actualmatch = REMatcher->matches(status);
    449     if (U_FAILURE(status)) {
    450         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
    451             line, u_errorName(status));
    452         retVal = FALSE;
    453     }
    454     if (actualmatch != match) {
    455         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
    456         retVal = FALSE;
    457     }
    458 
    459     if (retVal == FALSE) {
    460         REPattern->dumpPattern();
    461     }
    462 
    463     delete REPattern;
    464     delete REMatcher;
    465     utext_close(&inputText);
    466     utext_close(&pattern);
    467     delete[] textChars;
    468     return retVal;
    469 }
    470 
    471 
    472 
    473 //---------------------------------------------------------------------------
    474 //
    475 //    REGEX_ERR       Macro + invocation function to simplify writing tests
    476 //                       regex tests for incorrect patterns
    477 //
    478 //       usage:
    479 //          REGEX_ERR("pattern",   expected error line, column, expected status);
    480 //
    481 //---------------------------------------------------------------------------
    482 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
    483 
    484 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
    485                           UErrorCode expectedStatus, int32_t line) {
    486     UnicodeString       pattern(pat);
    487 
    488     UErrorCode          status         = U_ZERO_ERROR;
    489     UParseError         pe;
    490     RegexPattern        *callerPattern = NULL;
    491 
    492     //
    493     //  Compile the caller's pattern
    494     //
    495     UnicodeString patString(pat);
    496     callerPattern = RegexPattern::compile(patString, 0, pe, status);
    497     if (status != expectedStatus) {
    498         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    499     } else {
    500         if (status != U_ZERO_ERROR) {
    501             if (pe.line != errLine || pe.offset != errCol) {
    502                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
    503                     line, errLine, errCol, pe.line, pe.offset);
    504             }
    505         }
    506     }
    507 
    508     delete callerPattern;
    509 
    510     //
    511     //  Compile again, using a UTF-8-based UText
    512     //
    513     UText patternText = UTEXT_INITIALIZER;
    514     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
    515     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
    516     if (status != expectedStatus) {
    517         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
    518     } else {
    519         if (status != U_ZERO_ERROR) {
    520             if (pe.line != errLine || pe.offset != errCol) {
    521                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
    522                     line, errLine, errCol, pe.line, pe.offset);
    523             }
    524         }
    525     }
    526 
    527     delete callerPattern;
    528     utext_close(&patternText);
    529 }
    530 
    531 
    532 
    533 //---------------------------------------------------------------------------
    534 //
    535 //      Basic      Check for basic functionality of regex pattern matching.
    536 //                 Avoid the use of REGEX_FIND test macro, which has
    537 //                 substantial dependencies on basic Regex functionality.
    538 //
    539 //---------------------------------------------------------------------------
    540 void RegexTest::Basic() {
    541 
    542 
    543 //
    544 // Debug - slide failing test cases early
    545 //
    546 #if 0
    547     {
    548         // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
    549         UParseError pe;
    550         UErrorCode  status = U_ZERO_ERROR;
    551         RegexPattern *pattern;
    552         pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
    553         pattern->dumpPattern();
    554         RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
    555         UBool result = m->find();
    556         printf("result = %d\n", result);
    557         // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
    558         // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    559     }
    560     exit(1);
    561 #endif
    562 
    563 
    564     //
    565     // Pattern with parentheses
    566     //
    567     REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
    568     REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
    569     REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
    570 
    571     //
    572     // Patterns with *
    573     //
    574     REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    575     REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    576     REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    577     REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    578     REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
    579 
    580     REGEX_TESTLM("a*", "",  TRUE, TRUE);
    581     REGEX_TESTLM("a*", "b", TRUE, FALSE);
    582 
    583 
    584     //
    585     //  Patterns with "."
    586     //
    587     REGEX_TESTLM(".", "abc", TRUE, FALSE);
    588     REGEX_TESTLM("...", "abc", TRUE, TRUE);
    589     REGEX_TESTLM("....", "abc", FALSE, FALSE);
    590     REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    591     REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    592     REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    593     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    594     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
    595 
    596     //
    597     //  Patterns with * applied to chars at end of literal string
    598     //
    599     REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    600     REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
    601 
    602     //
    603     //  Supplemental chars match as single chars, not a pair of surrogates.
    604     //
    605     REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    606     REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    607     REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
    608 
    609 
    610     //
    611     //  UnicodeSets in the pattern
    612     //
    613     REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    614     REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    615     REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    616     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    617     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    618     REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
    619 
    620     REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    621     REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    622     REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    623     REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
    624     REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
    625 
    626     //
    627     //   OR operator in patterns
    628     //
    629     REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    630     REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    631     REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    632     REGEX_TESTLM("a|b", "b", TRUE, TRUE);
    633 
    634     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    635     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    636     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    637     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    638     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    639     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
    640 
    641     //
    642     //  +
    643     //
    644     REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    645     REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    646     REGEX_TESTLM("b+", "", FALSE, FALSE);
    647     REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    648     REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    649     REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
    650 
    651     //
    652     //   ?
    653     //
    654     REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    655     REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    656     REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    657     REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    658     REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    659     REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    660     REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    661     REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    662     REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
    663 
    664     //
    665     //  Escape sequences that become single literal chars, handled internally
    666     //   by ICU's Unescape.
    667     //
    668 
    669     // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
    670     REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
    671     REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
    672     REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
    673     REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
    674     REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
    675     REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
    676     REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
    677     REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    678     REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
    679 
    680     REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
    681     REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
    682 
    683     // Escape of special chars in patterns
    684     REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
    685 }
    686 
    687 
    688 //---------------------------------------------------------------------------
    689 //
    690 //    UTextBasic   Check for quirks that are specific to the UText
    691 //                 implementation.
    692 //
    693 //---------------------------------------------------------------------------
    694 void RegexTest::UTextBasic() {
    695     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
    696     UErrorCode status = U_ZERO_ERROR;
    697     UText pattern = UTEXT_INITIALIZER;
    698     utext_openUTF8(&pattern, str_abc, -1, &status);
    699     RegexMatcher matcher(&pattern, 0, status);
    700     REGEX_CHECK_STATUS;
    701 
    702     UText input = UTEXT_INITIALIZER;
    703     utext_openUTF8(&input, str_abc, -1, &status);
    704     REGEX_CHECK_STATUS;
    705     matcher.reset(&input);
    706     REGEX_CHECK_STATUS;
    707     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
    708 
    709     matcher.reset(matcher.inputText());
    710     REGEX_CHECK_STATUS;
    711     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
    712 
    713     utext_close(&pattern);
    714     utext_close(&input);
    715 }
    716 
    717 
    718 //---------------------------------------------------------------------------
    719 //
    720 //      API_Match   Test that the API for class RegexMatcher
    721 //                  is present and nominally working, but excluding functions
    722 //                  implementing replace operations.
    723 //
    724 //---------------------------------------------------------------------------
    725 void RegexTest::API_Match() {
            // Walks through the matching side of the RegexMatcher API in a series of
            // independent scoped sections: matcher creation and reset, matches() and
            // lookingAt() with explicit start positions, capture-group offsets and text,
            // find() in its several forms, regions and bounds flags, hitEnd() and
            // requireEnd(), time limits and stack limits.
            // Each section's assertions depend on the matcher state left by the calls
            // just before them, so statement order within a section matters.
    726     UParseError         pe;
    727     UErrorCode          status=U_ZERO_ERROR;
    728     int32_t             flags = 0;
    729 
    730     //
    731     // Debug - slide failing test cases early
    732     //
    733 #if 0
    734     {
    735     }
    736     return;
    737 #endif
    738 
    739     //
    740     // Simple pattern compilation
    741     //
    742     {
    743         UnicodeString       re("abc");
    744         RegexPattern        *pat2;
    745         pat2 = RegexPattern::compile(re, flags, pe, status);
    746         REGEX_CHECK_STATUS;
    747 
    748         UnicodeString inStr1 = "abcdef this is a test";
    749         UnicodeString instr2 = "not abc";
    750         UnicodeString empty  = "";
    751 
    752 
    753         //
    754         // Matcher creation and reset.
    755         //
                // m1 is reused, re-seated on different inputs, for the rest of this section.
    756         RegexMatcher *m1 = pat2->matcher(inStr1, status);
    757         REGEX_CHECK_STATUS;
    758         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    759         REGEX_ASSERT(m1->input() == inStr1);
    760         m1->reset(instr2);
    761         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
    762         REGEX_ASSERT(m1->input() == instr2);
    763         m1->reset(inStr1);
    764         REGEX_ASSERT(m1->input() == inStr1);
    765         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    766         m1->reset(empty);
    767         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
    768         REGEX_ASSERT(m1->input() == empty);
    769         REGEX_ASSERT(&m1->pattern() == pat2);
    770 
    771         //
    772         //  reset(pos, status)
    773         //
    774         m1->reset(inStr1);
    775         m1->reset(4, status);
    776         REGEX_CHECK_STATUS;
    777         REGEX_ASSERT(m1->input() == inStr1);
                // Expected to succeed even though "abc" does not begin at index 4 of
                // inStr1; presumably lookingAt() without a position starts from the
                // beginning of the input rather than from the reset position.
    778         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
    779 
                // Boundary checks: positions 0, len-1 and len are accepted, while -1 and
                // len+1 must report U_INDEX_OUTOFBOUNDS_ERROR.
    780         m1->reset(-1, status);
    781         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    782         status = U_ZERO_ERROR;
    783 
    784         m1->reset(0, status);
    785         REGEX_CHECK_STATUS;
    786         status = U_ZERO_ERROR;
    787 
    788         int32_t len = m1->input().length();
    789         m1->reset(len-1, status);
    790         REGEX_CHECK_STATUS;
    791         status = U_ZERO_ERROR;
    792 
    793         m1->reset(len, status);
    794         REGEX_CHECK_STATUS;
    795         status = U_ZERO_ERROR;
    796 
    797         m1->reset(len+1, status);
    798         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    799         status = U_ZERO_ERROR;
    800 
    801         //
    802         // matches(pos, status)
    803         //
                // instr2 is "not abc": the text "abc" occupies indices 4..6.
    804         m1->reset(instr2);
    805         REGEX_ASSERT(m1->matches(4, status) == TRUE);
    806         m1->reset();
    807         REGEX_ASSERT(m1->matches(3, status) == FALSE);
    808         m1->reset();
    809         REGEX_ASSERT(m1->matches(5, status) == FALSE);
    810         REGEX_ASSERT(m1->matches(4, status) == TRUE);
    811         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
    812         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    813 
    814         // Match() at end of string should fail, but should not
    815         //  be an error.
    816         status = U_ZERO_ERROR;
    817         len = m1->input().length();
    818         REGEX_ASSERT(m1->matches(len, status) == FALSE);
    819         REGEX_CHECK_STATUS;
    820 
    821         // Match beyond end of string should fail with an error.
    822         status = U_ZERO_ERROR;
    823         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
    824         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    825 
    826         // Successful match at end of string.
    827         {
    828             status = U_ZERO_ERROR;
    829             RegexMatcher m("A?", 0, status);  // will match zero length string.
    830             REGEX_CHECK_STATUS;
    831             m.reset(inStr1);
    832             len = inStr1.length();
    833             REGEX_ASSERT(m.matches(len, status) == TRUE);
    834             REGEX_CHECK_STATUS;
    835             m.reset(empty);
    836             REGEX_ASSERT(m.matches(0, status) == TRUE);
    837             REGEX_CHECK_STATUS;
    838         }
    839 
    840 
    841         //
    842         // lookingAt(pos, status)
    843         //
                // Same position checks as for matches(pos, status) above.
    844         status = U_ZERO_ERROR;
    845         m1->reset(instr2);  // "not abc"
    846         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
    847         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
    848         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
    849         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
    850         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
    851         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    852         status = U_ZERO_ERROR;
    853         len = m1->input().length();
    854         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
    855         REGEX_CHECK_STATUS;
    856         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
    857         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
    858 
    859         delete m1;
    860         delete pat2;
    861     }
    862 
    863 
    864     //
    865     // Capture Group.
    866     //     RegexMatcher::start();
    867     //     RegexMatcher::end();
    868     //     RegexMatcher::groupCount();
    869     //
    870     {
                // These locals shadow the function-level flags/pe/status, so this section
                // starts from a fresh U_ZERO_ERROR status.
    871         int32_t             flags=0;
    872         UParseError         pe;
    873         UErrorCode          status=U_ZERO_ERROR;
    874 
    875         UnicodeString       re("01(23(45)67)(.*)");
    876         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    877         REGEX_CHECK_STATUS;
    878         UnicodeString data = "0123456789";
    879 
    880         RegexMatcher *matcher = pat->matcher(data, status);
    881         REGEX_CHECK_STATUS;
    882         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
                // Expected offsets, indexed by group number; entry 0 is the whole match.
    883         static const int32_t matchStarts[] = {0,  2, 4, 8};
    884         static const int32_t matchEnds[]   = {10, 8, 6, 10};
    885         int32_t i;
    886         for (i=0; i<4; i++) {
    887             int32_t actualStart = matcher->start(i, status);
    888             REGEX_CHECK_STATUS;
    889             if (actualStart != matchStarts[i]) {
    890                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
    891                     __LINE__, i, matchStarts[i], actualStart);
    892             }
    893             int32_t actualEnd = matcher->end(i, status);
    894             REGEX_CHECK_STATUS;
    895             if (actualEnd != matchEnds[i]) {
    896                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
    897                     __LINE__, i, matchEnds[i], actualEnd);
    898             }
    899         }
    900 
                // The no-argument start()/end() must agree with group 0.
    901         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
    902         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
    903 
                // The pattern has three groups, so -1 and 4 are invalid group numbers.
    904         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    905         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
                // After reset() no match is current, so start() must report an invalid state.
    906         matcher->reset();
    907         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
    908 
                // Same checks again, this time for the group text returned by group().
    909         matcher->lookingAt(status);
    910         REGEX_ASSERT(matcher->group(status)    == "0123456789");
    911         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
    912         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
    913         REGEX_ASSERT(matcher->group(2, status) == "45"        );
    914         REGEX_ASSERT(matcher->group(3, status) == "89"        );
    915         REGEX_CHECK_STATUS;
    916         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    917         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
    918         matcher->reset();
    919         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
    920 
    921         delete matcher;
    922         delete pat;
    923 
    924     }
    925 
    926     //
    927     //  find
    928     //
    929     {
    930         int32_t             flags=0;
    931         UParseError         pe;
    932         UErrorCode          status=U_ZERO_ERROR;
    933 
    934         UnicodeString       re("abc");
    935         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    936         REGEX_CHECK_STATUS;
                // "abc" occurs at indices 1, 6 and 12; the string is 17 code units long.
    937         UnicodeString data = ".abc..abc...abc..";
    938         //                    012345678901234567
    939 
    940         RegexMatcher *matcher = pat->matcher(data, status);
    941         REGEX_CHECK_STATUS;
                // Successive find() calls walk through the three occurrences, then keep failing.
    942         REGEX_ASSERT(matcher->find());
    943         REGEX_ASSERT(matcher->start(status) == 1);
    944         REGEX_ASSERT(matcher->find());
    945         REGEX_ASSERT(matcher->start(status) == 6);
    946         REGEX_ASSERT(matcher->find());
    947         REGEX_ASSERT(matcher->start(status) == 12);
    948         REGEX_ASSERT(matcher->find() == FALSE);
    949         REGEX_ASSERT(matcher->find() == FALSE);
    950 
    951         matcher->reset();
    952         REGEX_ASSERT(matcher->find());
    953         REGEX_ASSERT(matcher->start(status) == 1);
    954 
                // find(pos, status): the search begins at pos, regardless of earlier matches.
    955         REGEX_ASSERT(matcher->find(0, status));
    956         REGEX_ASSERT(matcher->start(status) == 1);
    957         REGEX_ASSERT(matcher->find(1, status));
    958         REGEX_ASSERT(matcher->start(status) == 1);
    959         REGEX_ASSERT(matcher->find(2, status));
    960         REGEX_ASSERT(matcher->start(status) == 6);
    961         REGEX_ASSERT(matcher->find(12, status));
    962         REGEX_ASSERT(matcher->start(status) == 12);
    963         REGEX_ASSERT(matcher->find(13, status) == FALSE);
    964         REGEX_ASSERT(matcher->find(16, status) == FALSE);
                // Position 17 equals the input length: expected to fail without an error.
    965         REGEX_ASSERT(matcher->find(17, status) == FALSE);
                // The last find failed, so there is no current match to query.
    966         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
    967 
                // Positions before the start or beyond the end of the input are errors.
    968         status = U_ZERO_ERROR;
    969         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
    970         status = U_ZERO_ERROR;
    971         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
    972 
                // The pattern "abc" contains no capture groups.
    973         REGEX_ASSERT(matcher->groupCount() == 0);
    974 
    975         delete matcher;
    976         delete pat;
    977     }
    978 
    979 
    980     //
    981     //  find, with \G in pattern (true if at the end of a previous match).
    982     //
    983     {
    984         int32_t             flags=0;
    985         UParseError         pe;
    986         UErrorCode          status=U_ZERO_ERROR;
    987 
                // Group 1 can only match "abc" where \G holds; group 2 matches "abc" anywhere.
    988         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
    989         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
    990         REGEX_CHECK_STATUS;
    991         UnicodeString data = ".abcabc.abc..";
    992         //                    012345678901234567
    993 
    994         RegexMatcher *matcher = pat->matcher(data, status);
    995         REGEX_CHECK_STATUS;
                // First find: the "abc" at index 1 is taken by group 2; group 1 did not
                // participate, which start() reports as -1.
    996         REGEX_ASSERT(matcher->find());
    997         REGEX_ASSERT(matcher->start(status) == 0);
    998         REGEX_ASSERT(matcher->start(1, status) == -1);
    999         REGEX_ASSERT(matcher->start(2, status) == 1);
   1000 
                // Second find starts at 4, where the previous match ended, so the \G
                // alternative (group 1) is the one that matches this time.
   1001         REGEX_ASSERT(matcher->find());
   1002         REGEX_ASSERT(matcher->start(status) == 4);
   1003         REGEX_ASSERT(matcher->start(1, status) == 4);
   1004         REGEX_ASSERT(matcher->start(2, status) == -1);
   1005         REGEX_CHECK_STATUS;
   1006 
   1007         delete matcher;
   1008         delete pat;
   1009     }
   1010 
   1011     //
   1012     //   find with zero length matches, match position should bump ahead
   1013     //     to prevent loops.
   1014     //
   1015     {
   1016         int32_t                 i;
   1017         UErrorCode          status=U_ZERO_ERROR;
   1018         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will produce zero-length matches anywhere,
   1019                                                       //   using an always-true look-ahead.
   1020         REGEX_CHECK_STATUS;
   1021         UnicodeString s("    ");
   1022         m.reset(s);
                // Four spaces: an empty match is expected at each of the positions 0..4,
                // so the loop ends with i == 5.
   1023         for (i=0; ; i++) {
   1024             if (m.find() == FALSE) {
   1025                 break;
   1026             }
   1027             REGEX_ASSERT(m.start(status) == i);
   1028             REGEX_ASSERT(m.end(status) == i);
   1029         }
   1030         REGEX_ASSERT(i==5);
   1031 
   1032         // Check that the bump goes over surrogate pairs OK
   1033         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
   1034         s = s.unescape();
   1035         m.reset(s);
                // Four supplementary characters: each match position is expected to advance
                // by two, giving matches at 0, 2, 4, 6, 8 and a final i of 10.
   1036         for (i=0; ; i+=2) {
   1037             if (m.find() == FALSE) {
   1038                 break;
   1039             }
   1040             REGEX_ASSERT(m.start(status) == i);
   1041             REGEX_ASSERT(m.end(status) == i);
   1042         }
   1043         REGEX_ASSERT(i==10);
   1044     }
   1045     {
   1046         // find() loop breaking test.
   1047         //        with pattern of /.?/, should see a series of one char matches, then a single
   1048         //        match of zero length at the end of the input string.
   1049         int32_t                 i;
   1050         UErrorCode          status=U_ZERO_ERROR;
   1051         RegexMatcher        m(".?", 0, status);
   1052         REGEX_CHECK_STATUS;
   1053         UnicodeString s("    ");
   1054         m.reset(s);
   1055         for (i=0; ; i++) {
   1056             if (m.find() == FALSE) {
   1057                 break;
   1058             }
   1059             REGEX_ASSERT(m.start(status) == i);
                    // Matches 0..3 each consume one character; the fifth (i == 4) is empty.
   1060             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
   1061         }
   1062         REGEX_ASSERT(i==5);
   1063     }
   1064 
   1065 
   1066     //
   1067     // Matchers with no input string behave as if they had an empty input string.
   1068     //
   1069 
   1070     {
                // Matcher built directly from a pattern string, with no input supplied.
   1071         UErrorCode status = U_ZERO_ERROR;
   1072         RegexMatcher  m(".?", 0, status);
   1073         REGEX_CHECK_STATUS;
   1074         REGEX_ASSERT(m.find());
   1075         REGEX_ASSERT(m.start(status) == 0);
   1076         REGEX_ASSERT(m.input() == "");
   1077     }
   1078     {
                // Matcher obtained from a compiled pattern, again with no input supplied.
                // NOTE(review): p is dereferenced before status is checked - confirm that
                // compile() cannot return NULL here.
   1079         UErrorCode status = U_ZERO_ERROR;
   1080         RegexPattern  *p = RegexPattern::compile(".", 0, status);
   1081         RegexMatcher  *m = p->matcher(status);
   1082         REGEX_CHECK_STATUS;
   1083 
   1084         REGEX_ASSERT(m->find() == FALSE);
   1085         REGEX_ASSERT(m->input() == "");
   1086         delete m;
   1087         delete p;
   1088     }
   1089 
   1090     //
   1091     // Regions
   1092     //
   1093     {
   1094         UErrorCode status = U_ZERO_ERROR;
   1095         UnicodeString testString("This is test data");
   1096         RegexMatcher m(".*", testString,  0, status);
   1097         REGEX_CHECK_STATUS;
                // Defaults: region covers the whole input, bounds are anchoring and
                // not transparent.
   1098         REGEX_ASSERT(m.regionStart() == 0);
   1099         REGEX_ASSERT(m.regionEnd() == testString.length());
   1100         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1101         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1102 
                // With the region restricted to [2, 4), ".*" must match exactly that span.
   1103         m.region(2,4, status);
   1104         REGEX_CHECK_STATUS;
   1105         REGEX_ASSERT(m.matches(status));
   1106         REGEX_ASSERT(m.start(status)==2);
   1107         REGEX_ASSERT(m.end(status)==4);
   1108         REGEX_CHECK_STATUS;
   1109 
                // reset() restores the region to the full input.
   1110         m.reset();
   1111         REGEX_ASSERT(m.regionStart() == 0);
   1112         REGEX_ASSERT(m.regionEnd() == testString.length());
   1113 
                // reset(newInput) sizes the region to the new input.
   1114         UnicodeString shorterString("short");
   1115         m.reset(shorterString);
   1116         REGEX_ASSERT(m.regionStart() == 0);
   1117         REGEX_ASSERT(m.regionEnd() == shorterString.length());
   1118 
                // The bounds setters and reset() return the matcher itself, and the
                // anchoring / transparent flags must survive a reset().
   1119         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1120         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
   1121         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   1122         REGEX_ASSERT(&m == &m.reset());
   1123         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   1124 
   1125         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
   1126         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1127         REGEX_ASSERT(&m == &m.reset());
   1128         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   1129 
   1130         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1131         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
   1132         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   1133         REGEX_ASSERT(&m == &m.reset());
   1134         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   1135 
   1136         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
   1137         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1138         REGEX_ASSERT(&m == &m.reset());
   1139         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   1140 
   1141     }
   1142 
   1143     //
   1144     // hitEnd() and requireEnd()
   1145     //
   1146     {
   1147         UErrorCode status = U_ZERO_ERROR;
   1148         UnicodeString testString("aabb");
                // ".*" on "aabb": hitEnd() expected TRUE, requireEnd() expected FALSE.
   1149         RegexMatcher m1(".*", testString,  0, status);
   1150         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
   1151         REGEX_ASSERT(m1.hitEnd() == TRUE);
   1152         REGEX_ASSERT(m1.requireEnd() == FALSE);
   1153         REGEX_CHECK_STATUS;
   1154 
                // "a*" on "aabb": neither hitEnd() nor requireEnd() is expected to be set.
   1155         status = U_ZERO_ERROR;
   1156         RegexMatcher m2("a*", testString, 0, status);
   1157         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
   1158         REGEX_ASSERT(m2.hitEnd() == FALSE);
   1159         REGEX_ASSERT(m2.requireEnd() == FALSE);
   1160         REGEX_CHECK_STATUS;
   1161 
                // ".*$" on "aabb": both hitEnd() and requireEnd() are expected to be set.
   1162         status = U_ZERO_ERROR;
   1163         RegexMatcher m3(".*$", testString, 0, status);
   1164         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
   1165         REGEX_ASSERT(m3.hitEnd() == TRUE);
   1166         REGEX_ASSERT(m3.requireEnd() == TRUE);
   1167         REGEX_CHECK_STATUS;
   1168     }
   1169 
   1170 
   1171     //
   1172     // Compilation error on reset with UChar *
   1173     //   These were a hazard that people were stumbling over with runtime errors.
   1174     //   Changed them to compiler errors by adding private methods that more closely
   1175     //   matched the incorrect use of the functions.
   1176     //
   1177 #if 0
   1178     {
   1179         UErrorCode status = U_ZERO_ERROR;
   1180         UChar ucharString[20];
   1181         RegexMatcher m(".", 0, status);
   1182         m.reset(ucharString);  // should not compile.
   1183 
   1184         RegexPattern *p = RegexPattern::compile(".", 0, status);
   1185         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
   1186 
   1187         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
   1188     }
   1189 #endif
   1190 
   1191     //
   1192     //  Time Outs.
   1193     //       Note:  These tests will need to be changed when the regexp engine is
   1194     //              able to detect and cut short the exponential time behavior on
   1195     //              this type of match.
   1196     //
   1197     {
   1198         UErrorCode status = U_ZERO_ERROR;
   1199         //    Enough 'a's in the string to cause the match to time out.
   1200         //       (Each additional 'a' doubles the time)
   1201         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
   1202         RegexMatcher matcher("(a+)+b", testString, 0, status);
   1203         REGEX_CHECK_STATUS;
                // A new matcher reports a time limit of 0 until one is set.
   1204         REGEX_ASSERT(matcher.getTimeLimit() == 0);
   1205         matcher.setTimeLimit(100, status);
   1206         REGEX_ASSERT(matcher.getTimeLimit() == 100);
   1207         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1208         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
   1209     }
   1210     {
   1211         UErrorCode status = U_ZERO_ERROR;
   1212         //   Few enough 'a's to slip in under the time limit.
   1213         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
   1214         RegexMatcher matcher("(a+)+b", testString, 0, status);
   1215         REGEX_CHECK_STATUS;
   1216         matcher.setTimeLimit(100, status);
                // The match still fails (there is no 'b'), but without a time-out error.
   1217         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1218         REGEX_CHECK_STATUS;
   1219     }
   1220 
   1221     //
   1222     //  Stack Limits
   1223     //
   1224     {
   1225         UErrorCode status = U_ZERO_ERROR;
   1226         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
   1227 
   1228         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
   1229         //   of the '+', and makes the stack frames larger.
   1230         RegexMatcher matcher("(A)+A$", testString, 0, status);
   1231 
   1232         // With the default stack, this match should fail to run
   1233         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1234         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
   1235 
   1236         // With unlimited stack, it should run
   1237         status = U_ZERO_ERROR;
   1238         matcher.setStackLimit(0, status);
   1239         REGEX_CHECK_STATUS;
   1240         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
   1241         REGEX_CHECK_STATUS;
   1242         REGEX_ASSERT(matcher.getStackLimit() == 0);
   1243 
   1244         // With a limited stack, the match should fail
   1245         status = U_ZERO_ERROR;
   1246         matcher.setStackLimit(10000, status);
   1247         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
   1248         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
   1249         REGEX_ASSERT(matcher.getStackLimit() == 10000);
   1250     }
   1251 
   1252         // A pattern that doesn't save state should work with
   1253         //   a minimal sized stack
   1254     {
   1255         UErrorCode status = U_ZERO_ERROR;
   1256         UnicodeString testString = "abc";
   1257         RegexMatcher matcher("abc", testString, 0, status);
   1258         REGEX_CHECK_STATUS;
   1259         matcher.setStackLimit(30, status);
   1260         REGEX_CHECK_STATUS;
   1261         REGEX_ASSERT(matcher.matches(status) == TRUE);
   1262         REGEX_CHECK_STATUS;
   1263         REGEX_ASSERT(matcher.getStackLimit() == 30);
   1264 
   1265         // Negative stack sizes should fail
                // ... and must leave the previously set limit (1000) in place.
   1266         status = U_ZERO_ERROR;
   1267         matcher.setStackLimit(1000, status);
   1268         REGEX_CHECK_STATUS;
   1269         matcher.setStackLimit(-1, status);
   1270         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   1271         REGEX_ASSERT(matcher.getStackLimit() == 1000);
   1272     }
   1273 
   1274 
   1275 }
   1276 
   1277 
   1278 
   1279 
   1280 
   1281 
   1282 //---------------------------------------------------------------------------
   1283 //
   1284 //      API_Replace        API test for class RegexMatcher, testing the
   1285 //                         Replace family of functions.
   1286 //
   1287 //---------------------------------------------------------------------------
   1288 void RegexTest::API_Replace() {
            // Exercises RegexMatcher::replaceFirst() and replaceAll() on matching,
            // non-matching and empty inputs, with capture-group references and escapes in
            // the replacement text, and finally appendReplacement()/appendTail().
            // `matcher` (pattern "abc") and `matcher2` (pattern "a(..)") are re-seated on
            // new inputs throughout, so statement order matters.
   1289     //
   1290     //  Replace
   1291     //
   1292     int32_t             flags=0;
   1293     UParseError         pe;
   1294     UErrorCode          status=U_ZERO_ERROR;
   1295 
   1296     UnicodeString       re("abc");
   1297     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
   1298     REGEX_CHECK_STATUS;
   1299     UnicodeString data = ".abc..abc...abc..";
   1300     //                    012345678901234567
   1301     RegexMatcher *matcher = pat->matcher(data, status);
   1302 
   1303     //
   1304     //  Plain vanilla matches.
   1305     //
   1306     UnicodeString  dest;
   1307     dest = matcher->replaceFirst("yz", status);
   1308     REGEX_CHECK_STATUS;
   1309     REGEX_ASSERT(dest == ".yz..abc...abc..");
   1310 
   1311     dest = matcher->replaceAll("yz", status);
   1312     REGEX_CHECK_STATUS;
   1313     REGEX_ASSERT(dest == ".yz..yz...yz..");
   1314 
   1315     //
   1316     //  Plain vanilla non-matches.
   1317     //
            // With no match, the result must equal the input unchanged.
   1318     UnicodeString d2 = ".abx..abx...abx..";
   1319     matcher->reset(d2);
   1320     dest = matcher->replaceFirst("yz", status);
   1321     REGEX_CHECK_STATUS;
   1322     REGEX_ASSERT(dest == ".abx..abx...abx..");
   1323 
   1324     dest = matcher->replaceAll("yz", status);
   1325     REGEX_CHECK_STATUS;
   1326     REGEX_ASSERT(dest == ".abx..abx...abx..");
   1327 
   1328     //
   1329     // Empty source string
   1330     //
   1331     UnicodeString d3 = "";
   1332     matcher->reset(d3);
   1333     dest = matcher->replaceFirst("yz", status);
   1334     REGEX_CHECK_STATUS;
   1335     REGEX_ASSERT(dest == "");
   1336 
   1337     dest = matcher->replaceAll("yz", status);
   1338     REGEX_CHECK_STATUS;
   1339     REGEX_ASSERT(dest == "");
   1340 
   1341     //
   1342     // Empty substitution string
   1343     //
            // Replacing with "" deletes the matched text.
   1344     matcher->reset(data);              // ".abc..abc...abc.."
   1345     dest = matcher->replaceFirst("", status);
   1346     REGEX_CHECK_STATUS;
   1347     REGEX_ASSERT(dest == "...abc...abc..");
   1348 
   1349     dest = matcher->replaceAll("", status);
   1350     REGEX_CHECK_STATUS;
   1351     REGEX_ASSERT(dest == "........");
   1352 
   1353     //
   1354     // match whole string
   1355     //
   1356     UnicodeString d4 = "abc";
   1357     matcher->reset(d4);
   1358     dest = matcher->replaceFirst("xyz", status);
   1359     REGEX_CHECK_STATUS;
   1360     REGEX_ASSERT(dest == "xyz");
   1361 
   1362     dest = matcher->replaceAll("xyz", status);
   1363     REGEX_CHECK_STATUS;
   1364     REGEX_ASSERT(dest == "xyz");
   1365 
   1366     //
   1367     // Capture Group, simple case
   1368     //
            // On "abcdefg" the pattern "a(..)" matches "abc", with group 1 == "bc".
   1369     UnicodeString       re2("a(..)");
   1370     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
   1371     REGEX_CHECK_STATUS;
   1372     UnicodeString d5 = "abcdefg";
   1373     RegexMatcher *matcher2 = pat2->matcher(d5, status);
   1374     REGEX_CHECK_STATUS;
   1375     dest = matcher2->replaceFirst("$1$1", status);
   1376     REGEX_CHECK_STATUS;
   1377     REGEX_ASSERT(dest == "bcbcdefg");
   1378 
            // A backslash-escaped "$" is copied literally; the unescaped "$1" is substituted.
   1379     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
   1380     REGEX_CHECK_STATUS;
   1381     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
   1382 
            // A "$" that is not followed by a group number must be reported as an error.
   1383     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
   1384     REGEX_ASSERT(U_FAILURE(status));
   1385     status = U_ZERO_ERROR;
   1386 
            // "$" followed by U+1D7CF is expected to be taken as a reference to group 1.
   1387     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
   1388     replacement = replacement.unescape();
   1389     dest = matcher2->replaceFirst(replacement, status);
   1390     REGEX_CHECK_STATUS;
   1391     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
   1392 
            // The pattern has only one capture group, so "$5" is out of range.
   1393     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
   1394 
   1395 
   1396     //
   1397     // Replacement String with \u hex escapes
   1398     //
   1399     {
                // \u0043 in the replacement text is expected to produce "C".
   1400         UnicodeString  src = "abc 1 abc 2 abc 3";
   1401         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
   1402         matcher->reset(src);
   1403         UnicodeString  result = matcher->replaceAll(substitute, status);
   1404         REGEX_CHECK_STATUS;
   1405         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
   1406     }
   1407     {
                // Same check with an eight-digit \U escape for the supplementary code
                // point U+10000.
   1408         UnicodeString  src = "abc !";
   1409         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
   1410         matcher->reset(src);
   1411         UnicodeString  result = matcher->replaceAll(substitute, status);
   1412         REGEX_CHECK_STATUS;
   1413         UnicodeString expected = UnicodeString("--");
   1414         expected.append((UChar32)0x10000);
   1415         expected.append("-- !");
   1416         REGEX_ASSERT(result == expected);
   1417     }
   1418     // TODO:  need more thorough testing of capture substitutions.
   1419 
   1420     // Bug 4057
   1421     //
   1422     {
                // The input contains two "ss...ee" matches. In each case below the second
                // match is the current one when appendReplacement() is called, and the
                // appended text must still begin at the start of the input.
   1423         status = U_ZERO_ERROR;
   1424         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
   1425         RegexMatcher m("ss(.*?)ee", 0, status);
   1426         REGEX_CHECK_STATUS;
   1427         UnicodeString result;
   1428 
   1429         // Multiple finds do NOT bump up the previous appendReplacement position.
   1430         m.reset(s);
   1431         m.find();
   1432         m.find();
   1433         m.appendReplacement(result, "ooh", status);
   1434         REGEX_CHECK_STATUS;
   1435         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1436 
   1437         // After a reset into the interior of a string, appendReplacement still starts at beginning.
   1438         status = U_ZERO_ERROR;
   1439         result.truncate(0);
   1440         m.reset(10, status);
   1441         m.find();
   1442         m.find();
   1443         m.appendReplacement(result, "ooh", status);
   1444         REGEX_CHECK_STATUS;
   1445         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1446 
   1447         // find() at interior of string, appendReplacement still starts at beginning.
   1448         status = U_ZERO_ERROR;
   1449         result.truncate(0);
   1450         m.reset();
   1451         m.find(10, status);
   1452         m.find();
   1453         m.appendReplacement(result, "ooh", status);
   1454         REGEX_CHECK_STATUS;
   1455         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
   1456 
                // appendTail() adds the text that follows the last match (" fin").
   1457         m.appendTail(result);
   1458         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
   1459 
   1460     }
   1461 
            // NOTE(review): if REGEX_CHECK_STATUS returns early on failure (the macro is
            // not visible here), these deletes are skipped on that path - confirm.
   1462     delete matcher2;
   1463     delete pat2;
   1464     delete matcher;
   1465     delete pat;
   1466 }
   1467 
   1468 
   1469 //---------------------------------------------------------------------------
   1470 //
   1471 //      API_Pattern       Test that the API for class RegexPattern is
   1472 //                        present and nominally working.
   1473 //
   1474 //---------------------------------------------------------------------------
   1475 void RegexTest::API_Pattern() {
            // Walks the RegexPattern API in order: default construction and equality,
            // assignment / copy construction / clone(), the matches() convenience call,
            // split() with plain, capturing and zero-length separators, pattern(), and
            // the class-ID functions.  Checks go through the REGEX_ASSERT and
            // REGEX_CHECK_STATUS macros, which are defined outside this function.
            //
            // The split() checks all share the one fields[] array and the one status
            // variable, so several assertions deliberately depend on values left behind
            // by earlier calls; the statement order must be preserved.
   1476     RegexPattern        pata;    // Test default constructor to not crash.
   1477     RegexPattern        patb;
   1478 
   1479     REGEX_ASSERT(pata == patb);      // two default-constructed patterns compare equal
   1480     REGEX_ASSERT(pata == pata);
   1481 
   1482     UnicodeString re1("abc[a-l][m-z]");
   1483     UnicodeString re2("def");
   1484     UErrorCode    status = U_ZERO_ERROR;
   1485     UParseError   pe;
   1486 
   1487     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
   1488     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
   1489     REGEX_CHECK_STATUS;
   1490     REGEX_ASSERT(*pat1 == *pat1);
   1491     REGEX_ASSERT(*pat1 != pata);
   1492 
   1493     // Assign: operator= must yield a pattern equal to its source.
   1494     patb = *pat1;
   1495     REGEX_ASSERT(patb == *pat1);
   1496 
   1497     // Copy Construct: the copy must equal both the original and the assigned patb.
   1498     RegexPattern patc(*pat1);
   1499     REGEX_ASSERT(patc == *pat1);
   1500     REGEX_ASSERT(patb == patc);
   1501     REGEX_ASSERT(*pat1 != *pat2);    // compare pattern contents, not pointer values
   1502     patb = *pat2;
   1503     REGEX_ASSERT(patb != patc);
   1504     REGEX_ASSERT(patb == *pat2);
   1505 
   1506     // Compile with no flags: the overload without a flags argument must match flags == 0.
   1507     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
   1508     REGEX_ASSERT(*pat1a == *pat1);
   1509 
   1510     REGEX_ASSERT(pat1a->flags() == 0);
   1511 
   1512     // Compile with different flags should be not equal
   1513     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
   1514     REGEX_CHECK_STATUS;
   1515 
   1516     REGEX_ASSERT(*pat1b != *pat1a);
   1517     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
   1518     REGEX_ASSERT(pat1a->flags() == 0);
   1519     delete pat1b;
   1520 
   1521     // clone: the clone equals its source and differs from an unrelated pattern.
   1522     RegexPattern *pat1c = pat1->clone();
   1523     REGEX_ASSERT(*pat1c == *pat1);
   1524     REGEX_ASSERT(*pat1c != *pat2);
   1525 
   1526     delete pat1c;
   1527     delete pat1a;
   1528     delete pat1;
   1529     delete pat2;
   1530 
   1531 
   1532     //
   1533     //   Verify that a matcher created from a cloned pattern works.
   1534     //     (Jitterbug 3423)
   1535     //
   1536     {
   1537         UErrorCode     status     = U_ZERO_ERROR;
   1538         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
   1539         RegexPattern  *pClone     = pSource->clone();
   1540         delete         pSource;    // the source is deleted before the clone is used
   1541         RegexMatcher  *mFromClone = pClone->matcher(status);
   1542         REGEX_CHECK_STATUS;
   1543         UnicodeString s = "Hello World";
   1544         mFromClone->reset(s);
   1545         REGEX_ASSERT(mFromClone->find() == TRUE);
   1546         REGEX_ASSERT(mFromClone->group(status) == "Hello");
   1547         REGEX_ASSERT(mFromClone->find() == TRUE);
   1548         REGEX_ASSERT(mFromClone->group(status) == "World");
   1549         REGEX_ASSERT(mFromClone->find() == FALSE);
   1550         delete mFromClone;
   1551         delete pClone;
   1552     }
   1553 
   1554     //
   1555     //   matches convenience API: must match the entire input (".*u" fails below).
   1556     //
   1557     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
   1558     REGEX_CHECK_STATUS;
   1559     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
   1560     REGEX_CHECK_STATUS;
   1561     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
   1562     REGEX_CHECK_STATUS;
   1563     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
   1564     REGEX_CHECK_STATUS;
   1565     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
   1566     REGEX_CHECK_STATUS;
   1567     status = U_INDEX_OUTOFBOUNDS_ERROR;   // error on entry: expect FALSE and an untouched status
   1568     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
   1569     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1570 
   1571 
   1572     //
   1573     // Split()
   1574     //
   1575     status = U_ZERO_ERROR;
   1576     pat1 = RegexPattern::compile(" +",  pe, status);   // pat1 was deleted above; the variable is reused
   1577     REGEX_CHECK_STATUS;
   1578     UnicodeString  fields[10];
   1579 
   1580     int32_t n;
   1581     n = pat1->split("Now is the time", fields, 10, status);
   1582     REGEX_CHECK_STATUS;
   1583     REGEX_ASSERT(n==4);
   1584     REGEX_ASSERT(fields[0]=="Now");
   1585     REGEX_ASSERT(fields[1]=="is");
   1586     REGEX_ASSERT(fields[2]=="the");
   1587     REGEX_ASSERT(fields[3]=="time");
   1588     REGEX_ASSERT(fields[4]=="");
   1589 
   1590     n = pat1->split("Now is the time", fields, 2, status);
   1591     REGEX_CHECK_STATUS;
   1592     REGEX_ASSERT(n==2);
   1593     REGEX_ASSERT(fields[0]=="Now");
   1594     REGEX_ASSERT(fields[1]=="is the time");
   1595     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
   1596 
   1597     fields[1] = "*";                  // sentinel: must survive the capacity-1 split below
   1598     status = U_ZERO_ERROR;
   1599     n = pat1->split("Now is the time", fields, 1, status);
   1600     REGEX_CHECK_STATUS;
   1601     REGEX_ASSERT(n==1);
   1602     REGEX_ASSERT(fields[0]=="Now is the time");
   1603     REGEX_ASSERT(fields[1]=="*");
   1604     status = U_ZERO_ERROR;
   1605 
   1606     n = pat1->split("    Now       is the time   ", fields, 10, status);
   1607     REGEX_CHECK_STATUS;
   1608     REGEX_ASSERT(n==6);
   1609     REGEX_ASSERT(fields[0]=="");
   1610     REGEX_ASSERT(fields[1]=="Now");
   1611     REGEX_ASSERT(fields[2]=="is");
   1612     REGEX_ASSERT(fields[3]=="the");
   1613     REGEX_ASSERT(fields[4]=="time");
   1614     REGEX_ASSERT(fields[5]=="");
   1615 
   1616     n = pat1->split("     ", fields, 10, status);
   1617     REGEX_CHECK_STATUS;
   1618     REGEX_ASSERT(n==2);
   1619     REGEX_ASSERT(fields[0]=="");
   1620     REGEX_ASSERT(fields[1]=="");
   1621 
   1622     fields[0] = "foo";                // sentinel: splitting empty input must not write any field
   1623     n = pat1->split("", fields, 10, status);
   1624     REGEX_CHECK_STATUS;
   1625     REGEX_ASSERT(n==0);
   1626     REGEX_ASSERT(fields[0]=="foo");
   1627 
   1628     delete pat1;
   1629 
   1630     //  split, with a pattern with (capture): captured text shows up as its own field.
   1631     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
   1632     REGEX_CHECK_STATUS;
   1633 
   1634     status = U_ZERO_ERROR;
   1635     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
   1636     REGEX_CHECK_STATUS;
   1637     REGEX_ASSERT(n==7);
   1638     REGEX_ASSERT(fields[0]=="");
   1639     REGEX_ASSERT(fields[1]=="a");
   1640     REGEX_ASSERT(fields[2]=="Now is ");
   1641     REGEX_ASSERT(fields[3]=="b");
   1642     REGEX_ASSERT(fields[4]=="the time");
   1643     REGEX_ASSERT(fields[5]=="c");
   1644     REGEX_ASSERT(fields[6]=="");
   1645     REGEX_ASSERT(status==U_ZERO_ERROR);
   1646 
   1647     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
   1648     REGEX_CHECK_STATUS;
   1649     REGEX_ASSERT(n==7);
   1650     REGEX_ASSERT(fields[0]=="  ");
   1651     REGEX_ASSERT(fields[1]=="a");
   1652     REGEX_ASSERT(fields[2]=="Now is ");
   1653     REGEX_ASSERT(fields[3]=="b");
   1654     REGEX_ASSERT(fields[4]=="the time");
   1655     REGEX_ASSERT(fields[5]=="c");
   1656     REGEX_ASSERT(fields[6]=="");
   1657 
   1658     status = U_ZERO_ERROR;
   1659     fields[6] = "foo";                // sentinel just past the requested capacity of 6
   1660     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
   1661     REGEX_CHECK_STATUS;
   1662     REGEX_ASSERT(n==6);
   1663     REGEX_ASSERT(fields[0]=="  ");
   1664     REGEX_ASSERT(fields[1]=="a");
   1665     REGEX_ASSERT(fields[2]=="Now is ");
   1666     REGEX_ASSERT(fields[3]=="b");
   1667     REGEX_ASSERT(fields[4]=="the time");
   1668     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
   1669     REGEX_ASSERT(fields[6]=="foo");
   1670 
   1671     status = U_ZERO_ERROR;
   1672     fields[5] = "foo";                // sentinel just past the requested capacity of 5
   1673     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
   1674     REGEX_CHECK_STATUS;
   1675     REGEX_ASSERT(n==5);
   1676     REGEX_ASSERT(fields[0]=="  ");
   1677     REGEX_ASSERT(fields[1]=="a");
   1678     REGEX_ASSERT(fields[2]=="Now is ");
   1679     REGEX_ASSERT(fields[3]=="b");
   1680     REGEX_ASSERT(fields[4]=="the time<c>");
   1681     REGEX_ASSERT(fields[5]=="foo");
   1682 
   1683     status = U_ZERO_ERROR;
   1684     fields[5] = "foo";                // same capacity, but the input has no trailing delimiter
   1685     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
   1686     REGEX_CHECK_STATUS;
   1687     REGEX_ASSERT(n==5);
   1688     REGEX_ASSERT(fields[0]=="  ");
   1689     REGEX_ASSERT(fields[1]=="a");
   1690     REGEX_ASSERT(fields[2]=="Now is ");
   1691     REGEX_ASSERT(fields[3]=="b");
   1692     REGEX_ASSERT(fields[4]=="the time");
   1693     REGEX_ASSERT(fields[5]=="foo");
   1694 
   1695     status = U_ZERO_ERROR;
   1696     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
   1697     REGEX_CHECK_STATUS;
   1698     REGEX_ASSERT(n==4);
   1699     REGEX_ASSERT(fields[0]=="  ");
   1700     REGEX_ASSERT(fields[1]=="a");
   1701     REGEX_ASSERT(fields[2]=="Now is ");
   1702     REGEX_ASSERT(fields[3]=="the time<c>");
   1703     status = U_ZERO_ERROR;
   1704     delete pat1;
   1705 
   1706     pat1 = RegexPattern::compile("([-,])",  pe, status);   // the whole delimiter is a capture group
   1707     REGEX_CHECK_STATUS;
   1708     n = pat1->split("1-10,20", fields, 10, status);
   1709     REGEX_CHECK_STATUS;
   1710     REGEX_ASSERT(n==5);
   1711     REGEX_ASSERT(fields[0]=="1");
   1712     REGEX_ASSERT(fields[1]=="-");
   1713     REGEX_ASSERT(fields[2]=="10");
   1714     REGEX_ASSERT(fields[3]==",");
   1715     REGEX_ASSERT(fields[4]=="20");
   1716     delete pat1;
   1717 
   1718     // Test split of string with empty trailing fields
   1719     pat1 = RegexPattern::compile(",", pe, status);
   1720     REGEX_CHECK_STATUS;
   1721     n = pat1->split("a,b,c,", fields, 10, status);
   1722     REGEX_CHECK_STATUS;
   1723     REGEX_ASSERT(n==4);
   1724     REGEX_ASSERT(fields[0]=="a");
   1725     REGEX_ASSERT(fields[1]=="b");
   1726     REGEX_ASSERT(fields[2]=="c");
   1727     REGEX_ASSERT(fields[3]=="");
   1728 
   1729     n = pat1->split("a,,,", fields, 10, status);
   1730     REGEX_CHECK_STATUS;
   1731     REGEX_ASSERT(n==4);
   1732     REGEX_ASSERT(fields[0]=="a");
   1733     REGEX_ASSERT(fields[1]=="");
   1734     REGEX_ASSERT(fields[2]=="");
   1735     REGEX_ASSERT(fields[3]=="");
   1736     delete pat1;
   1737 
   1738     // Split Separator with zero length match.
   1739     pat1 = RegexPattern::compile(":?", pe, status);
   1740     REGEX_CHECK_STATUS;
   1741     n = pat1->split("abc", fields, 10, status);
   1742     REGEX_CHECK_STATUS;
   1743     REGEX_ASSERT(n==5);
   1744     REGEX_ASSERT(fields[0]=="");
   1745     REGEX_ASSERT(fields[1]=="a");
   1746     REGEX_ASSERT(fields[2]=="b");
   1747     REGEX_ASSERT(fields[3]=="c");
   1748     REGEX_ASSERT(fields[4]=="");
   1749 
   1750     delete pat1;
   1751 
   1752     //
   1753     // RegexPattern::pattern()
   1754     //
   1755     pat1 = new RegexPattern();        // a default-constructed pattern reports an empty pattern string
   1756     REGEX_ASSERT(pat1->pattern() == "");
   1757     delete pat1;
   1758 
   1759     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
   1760     REGEX_CHECK_STATUS;
   1761     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
   1762     delete pat1;
   1763 
   1764 
   1765     //
   1766     // classID functions: pattern and matcher each report their own non-NULL class ID.
   1767     //
   1768     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
   1769     REGEX_CHECK_STATUS;
   1770     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
   1771     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
   1772     UnicodeString Hello("Hello, world.");
   1773     RegexMatcher *m = pat1->matcher(Hello, status);
   1774     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
   1775     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
   1776     REGEX_ASSERT(m->getDynamicClassID() != NULL);
   1777     delete m;
   1778     delete pat1;
   1779 
   1780 }
   1781 
   1782 //---------------------------------------------------------------------------
   1783 //
   1784 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
   1785 //                       is present and working, but excluding functions
   1786 //                       implementing replace operations.
   1787 //
   1788 //---------------------------------------------------------------------------
   1789 void RegexTest::API_Match_UTF8() {
   1790     UParseError         pe;
   1791     UErrorCode          status=U_ZERO_ERROR;
   1792     int32_t             flags = 0;
   1793 
   1794     //
   1795     // Debug - slide failing test cases early
   1796     //
   1797 #if 0
   1798     {
   1799     }
   1800     return;
   1801 #endif
   1802 
   1803     //
   1804     // Simple pattern compilation
   1805     //
   1806     {
   1807         UText               re = UTEXT_INITIALIZER;
   1808         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
   1809         REGEX_VERBOSE_TEXT(&re);
   1810         RegexPattern        *pat2;
   1811         pat2 = RegexPattern::compile(&re, flags, pe, status);
   1812         REGEX_CHECK_STATUS;
   1813 
   1814         UText input1 = UTEXT_INITIALIZER;
   1815         UText input2 = UTEXT_INITIALIZER;
   1816         UText empty  = UTEXT_INITIALIZER;
   1817         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
   1818         REGEX_VERBOSE_TEXT(&input1);
   1819         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
   1820         REGEX_VERBOSE_TEXT(&input2);
   1821         utext_openUChars(&empty, NULL, 0, &status);
   1822 
   1823         int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
   1824         int32_t input2Len = strlen("not abc");
   1825 
   1826 
   1827         //
   1828         // Matcher creation and reset.
   1829         //
   1830         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
   1831         REGEX_CHECK_STATUS;
   1832         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1833         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
   1834         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1835         m1->reset(&input2);
   1836         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
   1837         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
   1838         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
   1839         m1->reset(&input1);
   1840         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1841         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1842         m1->reset(&empty);
   1843         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
   1844         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
   1845 
   1846         //
   1847         //  reset(pos, status)
   1848         //
   1849         m1->reset(&input1);
   1850         m1->reset(4, status);
   1851         REGEX_CHECK_STATUS;
   1852         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
   1853         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
   1854 
   1855         m1->reset(-1, status);
   1856         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1857         status = U_ZERO_ERROR;
   1858 
   1859         m1->reset(0, status);
   1860         REGEX_CHECK_STATUS;
   1861         status = U_ZERO_ERROR;
   1862 
   1863         m1->reset(input1Len-1, status);
   1864         REGEX_CHECK_STATUS;
   1865         status = U_ZERO_ERROR;
   1866 
   1867         m1->reset(input1Len, status);
   1868         REGEX_CHECK_STATUS;
   1869         status = U_ZERO_ERROR;
   1870 
   1871         m1->reset(input1Len+1, status);
   1872         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1873         status = U_ZERO_ERROR;
   1874 
   1875         //
   1876         // match(pos, status)
   1877         //
   1878         m1->reset(&input2);
   1879         REGEX_ASSERT(m1->matches(4, status) == TRUE);
   1880         m1->reset();
   1881         REGEX_ASSERT(m1->matches(3, status) == FALSE);
   1882         m1->reset();
   1883         REGEX_ASSERT(m1->matches(5, status) == FALSE);
   1884         REGEX_ASSERT(m1->matches(4, status) == TRUE);
   1885         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
   1886         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1887 
   1888         // Match() at end of string should fail, but should not
   1889         //  be an error.
   1890         status = U_ZERO_ERROR;
   1891         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
   1892         REGEX_CHECK_STATUS;
   1893 
   1894         // Match beyond end of string should fail with an error.
   1895         status = U_ZERO_ERROR;
   1896         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
   1897         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1898 
   1899         // Successful match at end of string.
   1900         {
   1901             status = U_ZERO_ERROR;
   1902             RegexMatcher m("A?", 0, status);  // will match zero length string.
   1903             REGEX_CHECK_STATUS;
   1904             m.reset(&input1);
   1905             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
   1906             REGEX_CHECK_STATUS;
   1907             m.reset(&empty);
   1908             REGEX_ASSERT(m.matches(0, status) == TRUE);
   1909             REGEX_CHECK_STATUS;
   1910         }
   1911 
   1912 
   1913         //
   1914         // lookingAt(pos, status)
   1915         //
   1916         status = U_ZERO_ERROR;
   1917         m1->reset(&input2);  // "not abc"
   1918         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
   1919         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
   1920         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
   1921         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
   1922         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
   1923         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1924         status = U_ZERO_ERROR;
   1925         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
   1926         REGEX_CHECK_STATUS;
   1927         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
   1928         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   1929 
   1930         delete m1;
   1931         delete pat2;
   1932 
   1933         utext_close(&re);
   1934         utext_close(&input1);
   1935         utext_close(&input2);
   1936         utext_close(&empty);
   1937     }
   1938 
   1939 
   1940     //
   1941     // Capture Group.
   1942     //     RegexMatcher::start();
   1943     //     RegexMatcher::end();
   1944     //     RegexMatcher::groupCount();
   1945     //
   1946     {
   1947         int32_t             flags=0;
   1948         UParseError         pe;
   1949         UErrorCode          status=U_ZERO_ERROR;
   1950         UText               re=UTEXT_INITIALIZER;
   1951         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
   1952         utext_openUTF8(&re, str_01234567_pat, -1, &status);
   1953 
   1954         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   1955         REGEX_CHECK_STATUS;
   1956 
   1957         UText input = UTEXT_INITIALIZER;
   1958         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
   1959         utext_openUTF8(&input, str_0123456789, -1, &status);
   1960 
   1961         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   1962         REGEX_CHECK_STATUS;
   1963         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
   1964         static const int32_t matchStarts[] = {0,  2, 4, 8};
   1965         static const int32_t matchEnds[]   = {10, 8, 6, 10};
   1966         int32_t i;
   1967         for (i=0; i<4; i++) {
   1968             int32_t actualStart = matcher->start(i, status);
   1969             REGEX_CHECK_STATUS;
   1970             if (actualStart != matchStarts[i]) {
   1971                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
   1972                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
   1973             }
   1974             int32_t actualEnd = matcher->end(i, status);
   1975             REGEX_CHECK_STATUS;
   1976             if (actualEnd != matchEnds[i]) {
   1977                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
   1978                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
   1979             }
   1980         }
   1981 
   1982         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
   1983         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
   1984 
   1985         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   1986         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
   1987         matcher->reset();
   1988         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
   1989 
   1990         matcher->lookingAt(status);
   1991 
   1992         UnicodeString dest;
   1993         UText destText = UTEXT_INITIALIZER;
   1994         utext_openUnicodeString(&destText, &dest, &status);
   1995         UText *result;
   1996         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
   1997         //  Test shallow-clone API
   1998         int64_t   group_len;
   1999         result = matcher->group((UText *)NULL, group_len, status);
   2000         REGEX_CHECK_STATUS;
   2001         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2002         utext_close(result);
   2003         result = matcher->group(0, &destText, group_len, status);
   2004         REGEX_CHECK_STATUS;
   2005         REGEX_ASSERT(result == &destText);
   2006         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2007         //  destText is now immutable, reopen it
   2008         utext_close(&destText);
   2009         utext_openUnicodeString(&destText, &dest, &status);
   2010 
   2011         int64_t length;
   2012         result = matcher->group(0, NULL, length, status);
   2013         REGEX_CHECK_STATUS;
   2014         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
   2015         utext_close(result);
   2016         result = matcher->group(0, &destText, length, status);
   2017         REGEX_CHECK_STATUS;
   2018         REGEX_ASSERT(result == &destText);
   2019         REGEX_ASSERT(utext_getNativeIndex(result) == 0);
   2020         REGEX_ASSERT(length == 10);
   2021         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2022 
   2023         // Capture Group 1 == "234567"
   2024         result = matcher->group(1, NULL, length, status);
   2025         REGEX_CHECK_STATUS;
   2026         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
   2027         REGEX_ASSERT(length == 6);
   2028         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2029         utext_close(result);
   2030 
   2031         result = matcher->group(1, &destText, length, status);
   2032         REGEX_CHECK_STATUS;
   2033         REGEX_ASSERT(result == &destText);
   2034         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
   2035         REGEX_ASSERT(length == 6);
   2036         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2037         utext_close(result);
   2038 
   2039         // Capture Group 2 == "45"
   2040         result = matcher->group(2, NULL, length, status);
   2041         REGEX_CHECK_STATUS;
   2042         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
   2043         REGEX_ASSERT(length == 2);
   2044         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2045         utext_close(result);
   2046 
   2047         result = matcher->group(2, &destText, length, status);
   2048         REGEX_CHECK_STATUS;
   2049         REGEX_ASSERT(result == &destText);
   2050         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
   2051         REGEX_ASSERT(length == 2);
   2052         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2053         utext_close(result);
   2054 
   2055         // Capture Group 3 == "89"
   2056         result = matcher->group(3, NULL, length, status);
   2057         REGEX_CHECK_STATUS;
   2058         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
   2059         REGEX_ASSERT(length == 2);
   2060         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2061         utext_close(result);
   2062 
   2063         result = matcher->group(3, &destText, length, status);
   2064         REGEX_CHECK_STATUS;
   2065         REGEX_ASSERT(result == &destText);
   2066         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
   2067         REGEX_ASSERT(length == 2);
   2068         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
   2069         utext_close(result);
   2070 
   2071         // Capture Group number out of range.
   2072         status = U_ZERO_ERROR;
   2073         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2074         status = U_ZERO_ERROR;
   2075         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2076         status = U_ZERO_ERROR;
   2077         matcher->reset();
   2078         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
   2079 
   2080         delete matcher;
   2081         delete pat;
   2082 
   2083         utext_close(&destText);
   2084         utext_close(&input);
   2085         utext_close(&re);
   2086     }
   2087 
   2088     //
   2089     //  find
   2090     //
   2091     {
   2092         int32_t             flags=0;
   2093         UParseError         pe;
   2094         UErrorCode          status=U_ZERO_ERROR;
   2095         UText               re=UTEXT_INITIALIZER;
   2096         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2097         utext_openUTF8(&re, str_abc, -1, &status);
   2098 
   2099         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2100         REGEX_CHECK_STATUS;
   2101         UText input = UTEXT_INITIALIZER;
   2102         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
   2103         utext_openUTF8(&input, str_abcabcabc, -1, &status);
   2104         //                      012345678901234567
   2105 
   2106         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   2107         REGEX_CHECK_STATUS;
   2108         REGEX_ASSERT(matcher->find());
   2109         REGEX_ASSERT(matcher->start(status) == 1);
   2110         REGEX_ASSERT(matcher->find());
   2111         REGEX_ASSERT(matcher->start(status) == 6);
   2112         REGEX_ASSERT(matcher->find());
   2113         REGEX_ASSERT(matcher->start(status) == 12);
   2114         REGEX_ASSERT(matcher->find() == FALSE);
   2115         REGEX_ASSERT(matcher->find() == FALSE);
   2116 
   2117         matcher->reset();
   2118         REGEX_ASSERT(matcher->find());
   2119         REGEX_ASSERT(matcher->start(status) == 1);
   2120 
   2121         REGEX_ASSERT(matcher->find(0, status));
   2122         REGEX_ASSERT(matcher->start(status) == 1);
   2123         REGEX_ASSERT(matcher->find(1, status));
   2124         REGEX_ASSERT(matcher->start(status) == 1);
   2125         REGEX_ASSERT(matcher->find(2, status));
   2126         REGEX_ASSERT(matcher->start(status) == 6);
   2127         REGEX_ASSERT(matcher->find(12, status));
   2128         REGEX_ASSERT(matcher->start(status) == 12);
   2129         REGEX_ASSERT(matcher->find(13, status) == FALSE);
   2130         REGEX_ASSERT(matcher->find(16, status) == FALSE);
   2131         REGEX_ASSERT(matcher->find(17, status) == FALSE);
   2132         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
   2133 
   2134         status = U_ZERO_ERROR;
   2135         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2136         status = U_ZERO_ERROR;
   2137         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
   2138 
   2139         REGEX_ASSERT(matcher->groupCount() == 0);
   2140 
   2141         delete matcher;
   2142         delete pat;
   2143 
   2144         utext_close(&input);
   2145         utext_close(&re);
   2146     }
   2147 
   2148 
   2149     //
   2150     //  find, with \G in pattern (true if at the end of a previous match).
   2151     //
   2152     {
   2153         int32_t             flags=0;
   2154         UParseError         pe;
   2155         UErrorCode          status=U_ZERO_ERROR;
   2156         UText               re=UTEXT_INITIALIZER;
   2157         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
   2158         utext_openUTF8(&re, str_Gabcabc, -1, &status);
   2159 
   2160         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2161 
   2162         REGEX_CHECK_STATUS;
   2163         UText input = UTEXT_INITIALIZER;
   2164         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
   2165         utext_openUTF8(&input, str_abcabcabc, -1, &status);
   2166         //                      012345678901234567
   2167 
   2168         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
   2169         REGEX_CHECK_STATUS;
   2170         REGEX_ASSERT(matcher->find());
   2171         REGEX_ASSERT(matcher->start(status) == 0);
   2172         REGEX_ASSERT(matcher->start(1, status) == -1);
   2173         REGEX_ASSERT(matcher->start(2, status) == 1);
   2174 
   2175         REGEX_ASSERT(matcher->find());
   2176         REGEX_ASSERT(matcher->start(status) == 4);
   2177         REGEX_ASSERT(matcher->start(1, status) == 4);
   2178         REGEX_ASSERT(matcher->start(2, status) == -1);
   2179         REGEX_CHECK_STATUS;
   2180 
   2181         delete matcher;
   2182         delete pat;
   2183 
   2184         utext_close(&input);
   2185         utext_close(&re);
   2186     }
   2187 
   2188     //
   2189     //   find with zero length matches, match position should bump ahead
   2190     //     to prevent loops.
   2191     //
   2192     {
   2193         int32_t                 i;
   2194         UErrorCode          status=U_ZERO_ERROR;
   2195         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
   2196                                                       //   using an always-true look-ahead.
   2197         REGEX_CHECK_STATUS;
   2198         UText s = UTEXT_INITIALIZER;
   2199         utext_openUTF8(&s, "    ", -1, &status);
   2200         m.reset(&s);
   2201         for (i=0; ; i++) {
   2202             if (m.find() == FALSE) {
   2203                 break;
   2204             }
   2205             REGEX_ASSERT(m.start(status) == i);
   2206             REGEX_ASSERT(m.end(status) == i);
   2207         }
   2208         REGEX_ASSERT(i==5);
   2209 
   2210         // Check that the bump goes over characters outside the BMP OK
   2211         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
   2212         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
   2213         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
   2214         m.reset(&s);
   2215         for (i=0; ; i+=4) {
   2216             if (m.find() == FALSE) {
   2217                 break;
   2218             }
   2219             REGEX_ASSERT(m.start(status) == i);
   2220             REGEX_ASSERT(m.end(status) == i);
   2221         }
   2222         REGEX_ASSERT(i==20);
   2223 
   2224         utext_close(&s);
   2225     }
   2226     {
   2227         // find() loop breaking test.
   2228         //        with pattern of /.?/, should see a series of one char matches, then a single
   2229         //        match of zero length at the end of the input string.
   2230         int32_t                 i;
   2231         UErrorCode          status=U_ZERO_ERROR;
   2232         RegexMatcher        m(".?", 0, status);
   2233         REGEX_CHECK_STATUS;
   2234         UText s = UTEXT_INITIALIZER;
   2235         utext_openUTF8(&s, "    ", -1, &status);
   2236         m.reset(&s);
   2237         for (i=0; ; i++) {
   2238             if (m.find() == FALSE) {
   2239                 break;
   2240             }
   2241             REGEX_ASSERT(m.start(status) == i);
   2242             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
   2243         }
   2244         REGEX_ASSERT(i==5);
   2245 
   2246         utext_close(&s);
   2247     }
   2248 
   2249 
   2250     //
   2251     // Matchers with no input string behave as if they had an empty input string.
   2252     //
   2253 
   2254     {
   2255         UErrorCode status = U_ZERO_ERROR;
   2256         RegexMatcher  m(".?", 0, status);
   2257         REGEX_CHECK_STATUS;
   2258         REGEX_ASSERT(m.find());
   2259         REGEX_ASSERT(m.start(status) == 0);
   2260         REGEX_ASSERT(m.input() == "");
   2261     }
   2262     {
   2263         UErrorCode status = U_ZERO_ERROR;
   2264         RegexPattern  *p = RegexPattern::compile(".", 0, status);
   2265         RegexMatcher  *m = p->matcher(status);
   2266         REGEX_CHECK_STATUS;
   2267 
   2268         REGEX_ASSERT(m->find() == FALSE);
   2269         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
   2270         delete m;
   2271         delete p;
   2272     }
   2273 
   2274     //
   2275     // Regions
   2276     //
   2277     {
   2278         UErrorCode status = U_ZERO_ERROR;
   2279         UText testPattern = UTEXT_INITIALIZER;
   2280         UText testText    = UTEXT_INITIALIZER;
   2281         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
   2282         REGEX_VERBOSE_TEXT(&testPattern);
   2283         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
   2284         REGEX_VERBOSE_TEXT(&testText);
   2285 
   2286         RegexMatcher m(&testPattern, &testText, 0, status);
   2287         REGEX_CHECK_STATUS;
   2288         REGEX_ASSERT(m.regionStart() == 0);
   2289         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
   2290         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2291         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2292 
   2293         m.region(2,4, status);
   2294         REGEX_CHECK_STATUS;
   2295         REGEX_ASSERT(m.matches(status));
   2296         REGEX_ASSERT(m.start(status)==2);
   2297         REGEX_ASSERT(m.end(status)==4);
   2298         REGEX_CHECK_STATUS;
   2299 
   2300         m.reset();
   2301         REGEX_ASSERT(m.regionStart() == 0);
   2302         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
   2303 
   2304         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
   2305         REGEX_VERBOSE_TEXT(&testText);
   2306         m.reset(&testText);
   2307         REGEX_ASSERT(m.regionStart() == 0);
   2308         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
   2309 
   2310         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2311         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
   2312         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   2313         REGEX_ASSERT(&m == &m.reset());
   2314         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
   2315 
   2316         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
   2317         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2318         REGEX_ASSERT(&m == &m.reset());
   2319         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
   2320 
   2321         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2322         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
   2323         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   2324         REGEX_ASSERT(&m == &m.reset());
   2325         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
   2326 
   2327         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
   2328         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2329         REGEX_ASSERT(&m == &m.reset());
   2330         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
   2331 
   2332         utext_close(&testText);
   2333         utext_close(&testPattern);
   2334     }
   2335 
   2336     //
   2337     // hitEnd() and requireEnd()
   2338     //
   2339     {
   2340         UErrorCode status = U_ZERO_ERROR;
   2341         UText testPattern = UTEXT_INITIALIZER;
   2342         UText testText    = UTEXT_INITIALIZER;
   2343         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
   2344         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
   2345         utext_openUTF8(&testPattern, str_, -1, &status);
   2346         utext_openUTF8(&testText, str_aabb, -1, &status);
   2347 
   2348         RegexMatcher m1(&testPattern, &testText,  0, status);
   2349         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
   2350         REGEX_ASSERT(m1.hitEnd() == TRUE);
   2351         REGEX_ASSERT(m1.requireEnd() == FALSE);
   2352         REGEX_CHECK_STATUS;
   2353 
   2354         status = U_ZERO_ERROR;
   2355         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
   2356         utext_openUTF8(&testPattern, str_a, -1, &status);
   2357         RegexMatcher m2(&testPattern, &testText, 0, status);
   2358         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
   2359         REGEX_ASSERT(m2.hitEnd() == FALSE);
   2360         REGEX_ASSERT(m2.requireEnd() == FALSE);
   2361         REGEX_CHECK_STATUS;
   2362 
   2363         status = U_ZERO_ERROR;
   2364         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
   2365         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
   2366         RegexMatcher m3(&testPattern, &testText, 0, status);
   2367         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
   2368         REGEX_ASSERT(m3.hitEnd() == TRUE);
   2369         REGEX_ASSERT(m3.requireEnd() == TRUE);
   2370         REGEX_CHECK_STATUS;
   2371 
   2372         utext_close(&testText);
   2373         utext_close(&testPattern);
   2374     }
   2375 }
   2376 
   2377 
   2378 //---------------------------------------------------------------------------
   2379 //
   2380 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
   2381 //                         Replace family of functions.
        //
        //      Exercises replaceFirst() and replaceAll() through their UText
        //      overloads.  Each scenario is run twice: once with a NULL
        //      destination (the test closes the returned UText itself) and once
        //      with the caller-supplied destText (the call must hand back
        //      &destText).  The function ends with the Bug 4057 checks of
        //      appendReplacement() / appendTail().
        //
        //      Test strings are written as explicit byte values with the intended
        //      text in a trailing comment; see the note about ASCII assumptions at
        //      the top of this file.
   2382 //
   2383 //---------------------------------------------------------------------------
   2384 void RegexTest::API_Replace_UTF8() {
   2385     //
   2386     //  Replace
   2387     //
   2388     int32_t             flags=0;
   2389     UParseError         pe;
   2390     UErrorCode          status=U_ZERO_ERROR;
   2391 
            // First pattern: the literal "abc".  pat is deleted at the end of the function.
   2392     UText               re=UTEXT_INITIALIZER;
   2393     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
   2394     REGEX_VERBOSE_TEXT(&re);
   2395     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
   2396     REGEX_CHECK_STATUS;
   2397 
   2398     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
   2399     //             012345678901234567
   2400     UText dataText = UTEXT_INITIALIZER;
   2401     utext_openUTF8(&dataText, data, -1, &status);
   2402     REGEX_CHECK_STATUS;
   2403     REGEX_VERBOSE_TEXT(&dataText);
            // Matcher obtained from pat and pointed at dataText; the pointer is kept
            // so the matcher can be deleted at the end of the function.
   2404     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
   2405 
   2406     //
   2407     //  Plain vanilla matches.
   2408     //
            // destText wraps the UnicodeString dest and serves as the caller-supplied
            // destination throughout; result receives whatever each replace call returns.
   2409     UnicodeString  dest;
   2410     UText destText = UTEXT_INITIALIZER;
   2411     utext_openUnicodeString(&destText, &dest, &status);
   2412     UText *result;
   2413 
   2414     UText replText = UTEXT_INITIALIZER;
   2415 
   2416     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
   2417     utext_openUTF8(&replText, str_yz, -1, &status);
   2418     REGEX_VERBOSE_TEXT(&replText);
            // NULL destination: the returned UText is closed by the test after checking it.
   2419     result = matcher->replaceFirst(&replText, NULL, status);
   2420     REGEX_CHECK_STATUS;
   2421     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
   2422     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
   2423     utext_close(result);
            // Caller-supplied destination: the call must return that same UText.
   2424     result = matcher->replaceFirst(&replText, &destText, status);
   2425     REGEX_CHECK_STATUS;
   2426     REGEX_ASSERT(result == &destText);
   2427     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
   2428 
   2429     result = matcher->replaceAll(&replText, NULL, status);
   2430     REGEX_CHECK_STATUS;
   2431     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
   2432     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
   2433     utext_close(result);
   2434 
            // Empty destText (replace its whole range with nothing) before reusing it.
            // NOTE(review): presumably needed because the replace calls add to a
            // caller-supplied destination rather than overwrite it -- confirm against
            // the RegexMatcher API documentation.
   2435     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2436     result = matcher->replaceAll(&replText, &destText, status);
   2437     REGEX_CHECK_STATUS;
   2438     REGEX_ASSERT(result == &destText);
   2439     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
   2440 
   2441     //
   2442     //  Plain vanilla non-matches.
   2443     //
            // The input contains "abx" but never "abc", so every call is expected to
            // produce a copy of the input.
   2444     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
   2445     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
   2446     matcher->reset(&dataText);
   2447 
   2448     result = matcher->replaceFirst(&replText, NULL, status);
   2449     REGEX_CHECK_STATUS;
   2450     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2451     utext_close(result);
            // NOTE(review): destText is not emptied before this call although it still
            // holds the previous result; the assertion below therefore relies on
            // replaceFirst() overwriting the destination when nothing matches -- confirm.
   2452     result = matcher->replaceFirst(&replText, &destText, status);
   2453     REGEX_CHECK_STATUS;
   2454     REGEX_ASSERT(result == &destText);
   2455     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2456 
   2457     result = matcher->replaceAll(&replText, NULL, status);
   2458     REGEX_CHECK_STATUS;
   2459     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2460     utext_close(result);
   2461     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2462     result = matcher->replaceAll(&replText, &destText, status);
   2463     REGEX_CHECK_STATUS;
   2464     REGEX_ASSERT(result == &destText);
   2465     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
   2466 
   2467     //
   2468     // Empty source string
   2469     //
            // A NULL pointer with length 0 stands in for the empty input here.
   2470     utext_openUTF8(&dataText, NULL, 0, &status);
   2471     matcher->reset(&dataText);
   2472 
   2473     result = matcher->replaceFirst(&replText, NULL, status);
   2474     REGEX_CHECK_STATUS;
   2475     REGEX_ASSERT_UTEXT_UTF8("", result);
   2476     utext_close(result);
   2477     result = matcher->replaceFirst(&replText, &destText, status);
   2478     REGEX_CHECK_STATUS;
   2479     REGEX_ASSERT(result == &destText);
   2480     REGEX_ASSERT_UTEXT_UTF8("", result);
   2481 
   2482     result = matcher->replaceAll(&replText, NULL, status);
   2483     REGEX_CHECK_STATUS;
   2484     REGEX_ASSERT_UTEXT_UTF8("", result);
   2485     utext_close(result);
            // No explicit emptying of destText here; the preceding replaceFirst()
            // check has already asserted that it is "".
   2486     result = matcher->replaceAll(&replText, &destText, status);
   2487     REGEX_CHECK_STATUS;
   2488     REGEX_ASSERT(result == &destText);
   2489     REGEX_ASSERT_UTEXT_UTF8("", result);
   2490 
   2491     //
   2492     // Empty substitution string
   2493     //
   2494     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
   2495     matcher->reset(&dataText);
   2496 
            // Replacement text is empty, so each replaced "abc" simply disappears.
   2497     utext_openUTF8(&replText, NULL, 0, &status);
   2498     result = matcher->replaceFirst(&replText, NULL, status);
   2499     REGEX_CHECK_STATUS;
   2500     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
   2501     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
   2502     utext_close(result);
   2503     result = matcher->replaceFirst(&replText, &destText, status);
   2504     REGEX_CHECK_STATUS;
   2505     REGEX_ASSERT(result == &destText);
   2506     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
   2507 
   2508     result = matcher->replaceAll(&replText, NULL, status);
   2509     REGEX_CHECK_STATUS;
   2510     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
   2511     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
   2512     utext_close(result);
   2513     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2514     result = matcher->replaceAll(&replText, &destText, status);
   2515     REGEX_CHECK_STATUS;
   2516     REGEX_ASSERT(result == &destText);
   2517     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
   2518 
   2519     //
   2520     // match whole string
   2521     //
            // Input is exactly "abc", so the whole input is replaced by "xyz".
   2522     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2523     utext_openUTF8(&dataText, str_abc, -1, &status);
   2524     matcher->reset(&dataText);
   2525 
   2526     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
   2527     utext_openUTF8(&replText, str_xyz, -1, &status);
   2528     result = matcher->replaceFirst(&replText, NULL, status);
   2529     REGEX_CHECK_STATUS;
   2530     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2531     utext_close(result);
   2532     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2533     result = matcher->replaceFirst(&replText, &destText, status);
   2534     REGEX_CHECK_STATUS;
   2535     REGEX_ASSERT(result == &destText);
   2536     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2537 
   2538     result = matcher->replaceAll(&replText, NULL, status);
   2539     REGEX_CHECK_STATUS;
   2540     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2541     utext_close(result);
   2542     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2543     result = matcher->replaceAll(&replText, &destText, status);
   2544     REGEX_CHECK_STATUS;
   2545     REGEX_ASSERT(result == &destText);
   2546     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
   2547 
   2548     //
   2549     // Capture Group, simple case
   2550     //
            // Second pattern a(..) with its own matcher (matcher2) over "abcdefg";
            // group 1 captures "bc".  pat2 and matcher2 are deleted at the end.
   2551     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
   2552     utext_openUTF8(&re, str_add, -1, &status);
   2553     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
   2554     REGEX_CHECK_STATUS;
   2555 
   2556     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
   2557     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
   2558     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
   2559     REGEX_CHECK_STATUS;
   2560 
            // "$1$1": the matched "abc" becomes group 1 twice, giving "bcbc" + "defg".
   2561     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
   2562     utext_openUTF8(&replText, str_11, -1, &status);
   2563     result = matcher2->replaceFirst(&replText, NULL, status);
   2564     REGEX_CHECK_STATUS;
   2565     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
   2566     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
   2567     utext_close(result);
   2568     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2569     result = matcher2->replaceFirst(&replText, &destText, status);
   2570     REGEX_CHECK_STATUS;
   2571     REGEX_ASSERT(result == &destText);
   2572     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
   2573 
            // Backslash-escaped "\$1" must come out as a literal "$1", while the
            // unescaped "$1" is expanded to the captured "bc" (see expected text below).
   2574     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
   2575     utext_openUTF8(&replText, str_v, -1, &status);
   2576     REGEX_VERBOSE_TEXT(&replText);
   2577     result = matcher2->replaceFirst(&replText, NULL, status);
   2578     REGEX_CHECK_STATUS;
   2579     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
   2580     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
   2581     utext_close(result);
   2582     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2583     result = matcher2->replaceFirst(&replText, &destText, status);
   2584     REGEX_CHECK_STATUS;
   2585     REGEX_ASSERT(result == &destText);
   2586     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
   2587 
            // Escaped dollars that are not followed by a group number: each "\$" is
            // expected to yield a plain "$".
   2588     const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
   2589                0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
   2590                0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
   2591     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
   2592     result = matcher2->replaceFirst(&replText, NULL, status);
   2593     REGEX_CHECK_STATUS;
   2594     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
   2595     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
   2596     utext_close(result);
   2597     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2598     result = matcher2->replaceFirst(&replText, &destText, status);
   2599     REGEX_CHECK_STATUS;
   2600     REGEX_ASSERT(result == &destText);
   2601     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
   2602 
            // Group reference written with a supplementary-plane digit: the four 'x'
            // placeholder bytes at offsets 22..25 (right after the '$') are overwritten
            // with F0 9D 9F 8F, the UTF-8 form of U+1D7CF.  The expected text below
            // shows it being treated as a reference to group 1.
   2603     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
   2604     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
   2605     //                                 012345678901234567890123456
   2606     supplDigitChars[22] = 0xF0;
   2607     supplDigitChars[23] = 0x9D;
   2608     supplDigitChars[24] = 0x9F;
   2609     supplDigitChars[25] = 0x8F;
   2610     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
   2611 
   2612     result = matcher2->replaceFirst(&replText, NULL, status);
   2613     REGEX_CHECK_STATUS;
   2614     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
   2615     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
   2616     utext_close(result);
   2617     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2618     result = matcher2->replaceFirst(&replText, &destText, status);
   2619     REGEX_CHECK_STATUS;
   2620     REGEX_ASSERT(result == &destText);
   2621     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
            // "$5" names a group the pattern a(..) does not have; both forms of the
            // call are expected to fail with U_INDEX_OUTOFBOUNDS_ERROR.
            // NOTE(review): the statements after each REGEX_ASSERT_FAIL keep passing
            // status to ICU calls, so they depend on how that macro (defined outside
            // this function) leaves status -- confirm.
   2622     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5... */
   2623     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
   2624     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
   2625 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
   2626     utext_close(result);
   2627     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2628     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
   2629     REGEX_ASSERT(result == &destText);
   2630 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
   2631 
   2632     //
   2633     // Replacement String with \u hex escapes
   2634     //
            // Both sub-blocks go back to the first matcher (pattern "abc").
   2635     {
              // \u0043 in the replacement is expected to come out as 'C'.
   2636       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
   2637       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
   2638         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
   2639         utext_openUTF8(&replText, str_u0043, -1, &status);
   2640         matcher->reset(&dataText);
   2641 
   2642         result = matcher->replaceAll(&replText, NULL, status);
   2643         REGEX_CHECK_STATUS;
   2644         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
   2645         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
   2646         utext_close(result);
   2647         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2648         result = matcher->replaceAll(&replText, &destText, status);
   2649         REGEX_CHECK_STATUS;
   2650         REGEX_ASSERT(result == &destText);
   2651         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
   2652     }
   2653     {
              // This str_abc ("abc !") shadows the function-scope str_abc ("abc")
              // for the rest of this block.
   2654       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
   2655         utext_openUTF8(&dataText, str_abc, -1, &status);
   2656         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
   2657         utext_openUTF8(&replText, str_U00010000, -1, &status);
   2658         matcher->reset(&dataText);
   2659 
                // The 'x' placeholders at offsets 2..5 are overwritten with
                // F0 90 80 80, the UTF-8 form of U+10000.
   2660         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
   2661         //                          0123456789
   2662         expected[2] = 0xF0;
   2663         expected[3] = 0x90;
   2664         expected[4] = 0x80;
   2665         expected[5] = 0x80;
   2666 
   2667         result = matcher->replaceAll(&replText, NULL, status);
   2668         REGEX_CHECK_STATUS;
   2669         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
   2670         utext_close(result);
   2671         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
   2672         result = matcher->replaceAll(&replText, &destText, status);
   2673         REGEX_CHECK_STATUS;
   2674         REGEX_ASSERT(result == &destText);
   2675         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
   2676     }
   2677     // TODO:  need more thorough testing of capture substitutions.
   2678 
   2679     // Bug 4057
   2680     //
            // Checks where appendReplacement() starts copying from after several
            // find()/reset() variations.  In each case the expected output contains
            // the input from its very beginning up to the second match, with only
            // that second match replaced by "ooh".
   2681     {
   2682         status = U_ZERO_ERROR;
   2683 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
   2684 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
   2685 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
   2686         utext_openUTF8(&re, str_ssee, -1, &status);
   2687         utext_openUTF8(&dataText, str_blah, -1, &status);
   2688         utext_openUTF8(&replText, str_ooh, -1, &status);
   2689 
   2690         RegexMatcher m(&re, 0, status);
   2691         REGEX_CHECK_STATUS;
   2692 
                // This UnicodeString named result shadows the function-scope UText *result
                // inside this block; resultText is the UText wrapped around it.
   2693         UnicodeString result;
   2694         UText resultText = UTEXT_INITIALIZER;
   2695         utext_openUnicodeString(&resultText, &result, &status);
   2696 
   2697         // Multiple finds do NOT bump up the previous appendReplacement position.
   2698         m.reset(&dataText);
   2699         m.find();
   2700         m.find();
   2701         m.appendReplacement(&resultText, &replText, status);
   2702         REGEX_CHECK_STATUS;
   2703         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2704         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
   2705 
   2706         // After a reset into the interior of a string, appendReplacement still starts at beginning.
   2707         status = U_ZERO_ERROR;
   2708         result.truncate(0);
   2709         utext_openUnicodeString(&resultText, &result, &status);
   2710         m.reset(10, status);
   2711         m.find();
   2712         m.find();
   2713         m.appendReplacement(&resultText, &replText, status);
   2714         REGEX_CHECK_STATUS;
   2715         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2716         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
   2717 
   2718         // find() at interior of string, appendReplacement still starts at beginning.
   2719         status = U_ZERO_ERROR;
   2720         result.truncate(0);
   2721         utext_openUnicodeString(&resultText, &result, &status);
   2722         m.reset();
   2723         m.find(10, status);
   2724         m.find();
   2725         m.appendReplacement(&resultText, &replText, status);
   2726         REGEX_CHECK_STATUS;
   2727         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
   2728         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
   2729 
                // appendTail() is expected to add the remaining input (" fin") after
                // the last replacement.
   2730         m.appendTail(&resultText, status);
   2731         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
   2732         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
   2733 
   2734         utext_close(&resultText);
   2735     }
   2736 
            // Release the heap-allocated matchers and patterns, then close the
            // stack-based UText objects opened in this function.
   2737     delete matcher2;
   2738     delete pat2;
   2739     delete matcher;
   2740     delete pat;
   2741 
   2742     utext_close(&dataText);
   2743     utext_close(&replText);
   2744     utext_close(&destText);
   2745     utext_close(&re);
   2746 }
   2747 
   2748 
   2749 //---------------------------------------------------------------------------
   2750 //
   2751 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
   2752 //                        present and nominally working.
   2753 //
   2754 //---------------------------------------------------------------------------
   2755 void RegexTest::API_Pattern_UTF8() {
   2756     RegexPattern        pata;    // Test default constructor to not crash.
   2757     RegexPattern        patb;
   2758 
   2759     REGEX_ASSERT(pata == patb);
   2760     REGEX_ASSERT(pata == pata);
   2761 
   2762     UText         re1 = UTEXT_INITIALIZER;
   2763     UText         re2 = UTEXT_INITIALIZER;
   2764     UErrorCode    status = U_ZERO_ERROR;
   2765     UParseError   pe;
   2766 
   2767     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
   2768     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
   2769     utext_openUTF8(&re1, str_abcalmz, -1, &status);
   2770     utext_openUTF8(&re2, str_def, -1, &status);
   2771 
   2772     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
   2773     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
   2774     REGEX_CHECK_STATUS;
   2775     REGEX_ASSERT(*pat1 == *pat1);
   2776     REGEX_ASSERT(*pat1 != pata);
   2777 
   2778     // Assign
   2779     patb = *pat1;
   2780     REGEX_ASSERT(patb == *pat1);
   2781 
   2782     // Copy Construct
   2783     RegexPattern patc(*pat1);
   2784     REGEX_ASSERT(patc == *pat1);
   2785     REGEX_ASSERT(patb == patc);
   2786     REGEX_ASSERT(pat1 != pat2);
   2787     patb = *pat2;
   2788     REGEX_ASSERT(patb != patc);
   2789     REGEX_ASSERT(patb == *pat2);
   2790 
   2791     // Compile with no flags.
   2792     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
   2793     REGEX_ASSERT(*pat1a == *pat1);
   2794 
   2795     REGEX_ASSERT(pat1a->flags() == 0);
   2796 
   2797     // Compile with different flags should be not equal
   2798     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
   2799     REGEX_CHECK_STATUS;
   2800 
   2801     REGEX_ASSERT(*pat1b != *pat1a);
   2802     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
   2803     REGEX_ASSERT(pat1a->flags() == 0);
   2804     delete pat1b;
   2805 
   2806     // clone
   2807     RegexPattern *pat1c = pat1->clone();
   2808     REGEX_ASSERT(*pat1c == *pat1);
   2809     REGEX_ASSERT(*pat1c != *pat2);
   2810 
   2811     delete pat1c;
   2812     delete pat1a;
   2813     delete pat1;
   2814     delete pat2;
   2815 
   2816     utext_close(&re1);
   2817     utext_close(&re2);
   2818 
   2819 
   2820     //
   2821     //   Verify that a matcher created from a cloned pattern works.
   2822     //     (Jitterbug 3423)
   2823     //
   2824     {
   2825         UErrorCode     status     = U_ZERO_ERROR;
   2826         UText          pattern    = UTEXT_INITIALIZER;
   2827         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
   2828         utext_openUTF8(&pattern, str_pL, -1, &status);
   2829 
   2830         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
   2831         RegexPattern  *pClone     = pSource->clone();
   2832         delete         pSource;
   2833         RegexMatcher  *mFromClone = pClone->matcher(status);
   2834         REGEX_CHECK_STATUS;
   2835 
   2836         UText          input      = UTEXT_INITIALIZER;
   2837         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
   2838         utext_openUTF8(&input, str_HelloWorld, -1, &status);
   2839         mFromClone->reset(&input);
   2840         REGEX_ASSERT(mFromClone->find() == TRUE);
   2841         REGEX_ASSERT(mFromClone->group(status) == "Hello");
   2842         REGEX_ASSERT(mFromClone->find() == TRUE);
   2843         REGEX_ASSERT(mFromClone->group(status) == "World");
   2844         REGEX_ASSERT(mFromClone->find() == FALSE);
   2845         delete mFromClone;
   2846         delete pClone;
   2847 
   2848         utext_close(&input);
   2849         utext_close(&pattern);
   2850     }
   2851 
   2852     //
   2853     //   matches convenience API
   2854     //
   2855     {
   2856         UErrorCode status  = U_ZERO_ERROR;
   2857         UText      pattern = UTEXT_INITIALIZER;
   2858         UText      input   = UTEXT_INITIALIZER;
   2859 
   2860         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
   2861         utext_openUTF8(&input, str_randominput, -1, &status);
   2862 
   2863         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
   2864         utext_openUTF8(&pattern, str_dotstar, -1, &status);
   2865         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
   2866         REGEX_CHECK_STATUS;
   2867 
   2868         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
   2869         utext_openUTF8(&pattern, str_abc, -1, &status);
   2870         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
   2871         REGEX_CHECK_STATUS;
   2872 
   2873         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
   2874         utext_openUTF8(&pattern, str_nput, -1, &status);
   2875         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
   2876         REGEX_CHECK_STATUS;
   2877 
   2878         utext_openUTF8(&pattern, str_randominput, -1, &status);
   2879         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
   2880         REGEX_CHECK_STATUS;
   2881 
   2882         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
   2883         utext_openUTF8(&pattern, str_u, -1, &status);
   2884         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
   2885         REGEX_CHECK_STATUS;
   2886 
   2887         utext_openUTF8(&input, str_abc, -1, &status);
   2888         utext_openUTF8(&pattern, str_abc, -1, &status);
   2889         status = U_INDEX_OUTOFBOUNDS_ERROR;
   2890         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
   2891         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   2892 
   2893         utext_close(&input);
   2894         utext_close(&pattern);
   2895     }
   2896 
   2897 
   2898     //
   2899     // Split()
   2900     //
   2901     status = U_ZERO_ERROR;
   2902     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
   2903     utext_openUTF8(&re1, str_spaceplus, -1, &status);
   2904     pat1 = RegexPattern::compile(&re1, pe, status);
   2905     REGEX_CHECK_STATUS;
   2906     UnicodeString  fields[10];
   2907 
   2908     int32_t n;
   2909     n = pat1->split("Now is the time", fields, 10, status);
   2910     REGEX_CHECK_STATUS;
   2911     REGEX_ASSERT(n==4);
   2912     REGEX_ASSERT(fields[0]=="Now");
   2913     REGEX_ASSERT(fields[1]=="is");
   2914     REGEX_ASSERT(fields[2]=="the");
   2915     REGEX_ASSERT(fields[3]=="time");
   2916     REGEX_ASSERT(fields[4]=="");
   2917 
   2918     n = pat1->split("Now is the time", fields, 2, status);
   2919     REGEX_CHECK_STATUS;
   2920     REGEX_ASSERT(n==2);
   2921     REGEX_ASSERT(fields[0]=="Now");
   2922     REGEX_ASSERT(fields[1]=="is the time");
   2923     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
   2924 
   2925     fields[1] = "*";
   2926     status = U_ZERO_ERROR;
   2927     n = pat1->split("Now is the time", fields, 1, status);
   2928     REGEX_CHECK_STATUS;
   2929     REGEX_ASSERT(n==1);
   2930     REGEX_ASSERT(fields[0]=="Now is the time");
   2931     REGEX_ASSERT(fields[1]=="*");
   2932     status = U_ZERO_ERROR;
   2933 
   2934     n = pat1->split("    Now       is the time   ", fields, 10, status);
   2935     REGEX_CHECK_STATUS;
   2936     REGEX_ASSERT(n==6);
   2937     REGEX_ASSERT(fields[0]=="");
   2938     REGEX_ASSERT(fields[1]=="Now");
   2939     REGEX_ASSERT(fields[2]=="is");
   2940     REGEX_ASSERT(fields[3]=="the");
   2941     REGEX_ASSERT(fields[4]=="time");
   2942     REGEX_ASSERT(fields[5]=="");
   2943     REGEX_ASSERT(fields[6]=="");
   2944 
   2945     fields[2] = "*";
   2946     n = pat1->split("     ", fields, 10, status);
   2947     REGEX_CHECK_STATUS;
   2948     REGEX_ASSERT(n==2);
   2949     REGEX_ASSERT(fields[0]=="");
   2950     REGEX_ASSERT(fields[1]=="");
   2951     REGEX_ASSERT(fields[2]=="*");
   2952 
   2953     fields[0] = "foo";
   2954     n = pat1->split("", fields, 10, status);
   2955     REGEX_CHECK_STATUS;
   2956     REGEX_ASSERT(n==0);
   2957     REGEX_ASSERT(fields[0]=="foo");
   2958 
   2959     delete pat1;
   2960 
   2961     //  split, with a pattern with (capture)
   2962     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
   2963     pat1 = RegexPattern::compile(&re1,  pe, status);
   2964     REGEX_CHECK_STATUS;
   2965 
   2966     status = U_ZERO_ERROR;
   2967     fields[6] = fields[7] = "*";
   2968     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
   2969     REGEX_CHECK_STATUS;
   2970     REGEX_ASSERT(n==7);
   2971     REGEX_ASSERT(fields[0]=="");
   2972     REGEX_ASSERT(fields[1]=="a");
   2973     REGEX_ASSERT(fields[2]=="Now is ");
   2974     REGEX_ASSERT(fields[3]=="b");
   2975     REGEX_ASSERT(fields[4]=="the time");
   2976     REGEX_ASSERT(fields[5]=="c");
   2977     REGEX_ASSERT(fields[6]=="");
   2978     REGEX_ASSERT(fields[7]=="*");
   2979     REGEX_ASSERT(status==U_ZERO_ERROR);
   2980 
   2981     fields[6] = fields[7] = "*";
   2982     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
   2983     REGEX_CHECK_STATUS;
   2984     REGEX_ASSERT(n==7);
   2985     REGEX_ASSERT(fields[0]=="  ");
   2986     REGEX_ASSERT(fields[1]=="a");
   2987     REGEX_ASSERT(fields[2]=="Now is ");
   2988     REGEX_ASSERT(fields[3]=="b");
   2989     REGEX_ASSERT(fields[4]=="the time");
   2990     REGEX_ASSERT(fields[5]=="c");
   2991     REGEX_ASSERT(fields[6]=="");
   2992     REGEX_ASSERT(fields[7]=="*");
   2993 
   2994     status = U_ZERO_ERROR;
   2995     fields[6] = "foo";
   2996     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
   2997     REGEX_CHECK_STATUS;
   2998     REGEX_ASSERT(n==6);
   2999     REGEX_ASSERT(fields[0]=="  ");
   3000     REGEX_ASSERT(fields[1]=="a");
   3001     REGEX_ASSERT(fields[2]=="Now is ");
   3002     REGEX_ASSERT(fields[3]=="b");
   3003     REGEX_ASSERT(fields[4]=="the time");
   3004     REGEX_ASSERT(fields[5]==" ");
   3005     REGEX_ASSERT(fields[6]=="foo");
   3006 
   3007     status = U_ZERO_ERROR;
   3008     fields[5] = "foo";
   3009     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
   3010     REGEX_CHECK_STATUS;
   3011     REGEX_ASSERT(n==5);
   3012     REGEX_ASSERT(fields[0]=="  ");
   3013     REGEX_ASSERT(fields[1]=="a");
   3014     REGEX_ASSERT(fields[2]=="Now is ");
   3015     REGEX_ASSERT(fields[3]=="b");
   3016     REGEX_ASSERT(fields[4]=="the time<c>");
   3017     REGEX_ASSERT(fields[5]=="foo");
   3018 
   3019     status = U_ZERO_ERROR;
   3020     fields[5] = "foo";
   3021     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
   3022     REGEX_CHECK_STATUS;
   3023     REGEX_ASSERT(n==5);
   3024     REGEX_ASSERT(fields[0]=="  ");
   3025     REGEX_ASSERT(fields[1]=="a");
   3026     REGEX_ASSERT(fields[2]=="Now is ");
   3027     REGEX_ASSERT(fields[3]=="b");
   3028     REGEX_ASSERT(fields[4]=="the time");
   3029     REGEX_ASSERT(fields[5]=="foo");
   3030 
   3031     status = U_ZERO_ERROR;
   3032     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
   3033     REGEX_CHECK_STATUS;
   3034     REGEX_ASSERT(n==4);
   3035     REGEX_ASSERT(fields[0]=="  ");
   3036     REGEX_ASSERT(fields[1]=="a");
   3037     REGEX_ASSERT(fields[2]=="Now is ");
   3038     REGEX_ASSERT(fields[3]=="the time<c>");
   3039     status = U_ZERO_ERROR;
   3040     delete pat1;
   3041 
   3042     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
   3043     pat1 = RegexPattern::compile(&re1, pe, status);
   3044     REGEX_CHECK_STATUS;
   3045     n = pat1->split("1-10,20", fields, 10, status);
   3046     REGEX_CHECK_STATUS;
   3047     REGEX_ASSERT(n==5);
   3048     REGEX_ASSERT(fields[0]=="1");
   3049     REGEX_ASSERT(fields[1]=="-");
   3050     REGEX_ASSERT(fields[2]=="10");
   3051     REGEX_ASSERT(fields[3]==",");
   3052     REGEX_ASSERT(fields[4]=="20");
   3053     delete pat1;
   3054 
   3055 
   3056     //
   3057     // split of a UText based string, with library allocating output UTexts.
   3058     //
   3059     {
   3060         status = U_ZERO_ERROR;
   3061         RegexMatcher matcher(UnicodeString("(:)"), 0, status);
   3062         UnicodeString stringToSplit("first:second:third");
   3063         UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
   3064         REGEX_CHECK_STATUS;
   3065 
   3066         UText *splits[10] = {NULL};
   3067         int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
   3068         REGEX_CHECK_STATUS;
   3069         REGEX_ASSERT(numFields == 5);
   3070         REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
   3071         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
   3072         REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
   3073         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
   3074         REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
   3075         REGEX_ASSERT(splits[5] == NULL);
   3076 
   3077         for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
   3078             if (splits[i]) {
   3079                 utext_close(splits[i]);
   3080                 splits[i] = NULL;
   3081             }
   3082         }
   3083         utext_close(textToSplit);
   3084     }
   3085 
   3086 
   3087     //
   3088     // RegexPattern::pattern() and patternText()
   3089     //
   3090     pat1 = new RegexPattern();
   3091     REGEX_ASSERT(pat1->pattern() == "");
   3092     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
   3093     delete pat1;
   3094     const char *helloWorldInvariant = "(Hello, world)*";
   3095     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
   3096     pat1 = RegexPattern::compile(&re1, pe, status);
   3097     REGEX_CHECK_STATUS;
   3098     REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
   3099     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
   3100     delete pat1;
   3101 
   3102     utext_close(&re1);
   3103 }
   3104 
   3105 
   3106 //---------------------------------------------------------------------------
   3107 //
   3108 //      Extended       A more thorough check for features of regex patterns
   3109 //                     The test cases are in a separate data file,
   3110 //                       source/test/testdata/regextst.txt
   3111 //                     A description of the test data format is included in that file.
   3112 //
   3113 //---------------------------------------------------------------------------
   3114 
   3115 const char *
   3116 RegexTest::getPath(char buffer[2048], const char *filename) {
   3117     UErrorCode status=U_ZERO_ERROR;
   3118     const char *testDataDirectory = IntlTest::getSourceTestData(status);
   3119     if (U_FAILURE(status)) {
   3120         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
   3121         return NULL;
   3122     }
   3123 
   3124     strcpy(buffer, testDataDirectory);
   3125     strcat(buffer, filename);
   3126     return buffer;
   3127 }
   3128 
   3129 void RegexTest::Extended() {
   3130     char tdd[2048];
   3131     const char *srcPath;
   3132     UErrorCode  status  = U_ZERO_ERROR;
   3133     int32_t     lineNum = 0;
   3134 
   3135     //
   3136     //  Open and read the test data file.
   3137     //
   3138     srcPath=getPath(tdd, "regextst.txt");
   3139     if(srcPath==NULL) {
   3140         return; /* something went wrong, error already output */
   3141     }
   3142 
   3143     int32_t    len;
   3144     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
   3145     if (U_FAILURE(status)) {
   3146         return; /* something went wrong, error already output */
   3147     }
   3148 
   3149     //
   3150     //  Put the test data into a UnicodeString
   3151     //
   3152     UnicodeString testString(FALSE, testData, len);
   3153 
   3154     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
   3155     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
   3156     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
   3157 
   3158     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
   3159     UnicodeString   testPattern;   // The pattern for test from the test file.
   3160     UnicodeString   testFlags;     // the flags   for a test.
   3161     UnicodeString   matchString;   // The marked up string to be used as input
   3162 
   3163     if (U_FAILURE(status)){
   3164         dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
   3165         delete [] testData;
   3166         return;
   3167     }
   3168 
   3169     //
   3170     //  Loop over the test data file, once per line.
   3171     //
   3172     while (lineMat.find()) {
   3173         lineNum++;
   3174         if (U_FAILURE(status)) {
   3175           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
   3176         }
   3177 
   3178         status = U_ZERO_ERROR;
   3179         UnicodeString testLine = lineMat.group(1, status);
   3180         if (testLine.length() == 0) {
   3181             continue;
   3182         }
   3183 
   3184         //
   3185         // Parse the test line.  Skip blank and comment only lines.
   3186         // Separate out the three main fields - pattern, flags, target.
   3187         //
   3188 
   3189         commentMat.reset(testLine);
   3190         if (commentMat.lookingAt(status)) {
   3191             // This line is a comment, or blank.
   3192             continue;
   3193         }
   3194 
   3195         //
   3196         //  Pull out the pattern field, remove it from the test file line.
   3197         //
   3198         quotedStuffMat.reset(testLine);
   3199         if (quotedStuffMat.lookingAt(status)) {
   3200             testPattern = quotedStuffMat.group(2, status);
   3201             testLine.remove(0, quotedStuffMat.end(0, status));
   3202         } else {
   3203             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
   3204             continue;
   3205         }
   3206 
   3207 
   3208         //
   3209         //  Pull out the flags from the test file line.
   3210         //
   3211         flagsMat.reset(testLine);
   3212         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
   3213         testFlags = flagsMat.group(1, status);
   3214         if (flagsMat.group(2, status).length() > 0) {
   3215             errln("Bad Match flag at line %d. Scanning %c\n",
   3216                 lineNum, flagsMat.group(2, status).charAt(0));
   3217             continue;
   3218         }
   3219         testLine.remove(0, flagsMat.end(0, status));
   3220 
   3221         //
   3222         //  Pull out the match string, as a whole.
   3223         //    We'll process the <tags> later.
   3224         //
   3225         quotedStuffMat.reset(testLine);
   3226         if (quotedStuffMat.lookingAt(status)) {
   3227             matchString = quotedStuffMat.group(2, status);
   3228             testLine.remove(0, quotedStuffMat.end(0, status));
   3229         } else {
   3230             errln("Bad match string at test file line %d", lineNum);
   3231             continue;
   3232         }
   3233 
   3234         //
   3235         //  The only thing left from the input line should be an optional trailing comment.
   3236         //
   3237         commentMat.reset(testLine);
   3238         if (commentMat.lookingAt(status) == FALSE) {
   3239             errln("Line %d: unexpected characters at end of test line.", lineNum);
   3240             continue;
   3241         }
   3242 
   3243         //
   3244         //  Run the test
   3245         //
   3246         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
   3247     }
   3248 
   3249     delete [] testData;
   3250 
   3251 }
   3252 
   3253 
   3254 
   3255 //---------------------------------------------------------------------------
   3256 //
   3257 //    regex_find(pattern, flags, inputString, lineNumber)
   3258 //
   3259 //         Function to run a single test from the Extended (data driven) tests.
   3260 //         See file test/testdata/regextst.txt for a description of the
   3261 //         pattern and inputString fields, and the allowed flags.
   3262 //         lineNumber is the source line in regextst.txt of the test.
   3263 //
   3264 //---------------------------------------------------------------------------
   3265 
   3266 
   3267 //  Set a value into a UVector at position specified by a decimal number in
   3268 //   a UnicodeString.   This is a utility function needed by the actual test function,
   3269 //   which follows.
   3270 static void set(UVector &vec, int32_t val, UnicodeString index) {
   3271     UErrorCode  status=U_ZERO_ERROR;
   3272     int32_t  idx = 0;
   3273     for (int32_t i=0; i<index.length(); i++) {
   3274         int32_t d=u_charDigitValue(index.charAt(i));
   3275         if (d<0) {return;}
   3276         idx = idx*10 + d;
   3277     }
   3278     while (vec.size()<idx+1) {vec.addElement(-1, status);}
   3279     vec.setElementAt(val, idx);
   3280 }
   3281 
   3282 static void setInt(UVector &vec, int32_t val, int32_t idx) {
   3283     UErrorCode  status=U_ZERO_ERROR;
   3284     while (vec.size()<idx+1) {vec.addElement(-1, status);}
   3285     vec.setElementAt(val, idx);
   3286 }
   3287 
   3288 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
   3289 {
   3290     UBool couldFind = TRUE;
   3291     UTEXT_SETNATIVEINDEX(utext, 0);
   3292     int32_t i = 0;
   3293     while (i < unistrOffset) {
   3294         UChar32 c = UTEXT_NEXT32(utext);
   3295         if (c != U_SENTINEL) {
   3296             i += U16_LENGTH(c);
   3297         } else {
   3298             couldFind = FALSE;
   3299             break;
   3300         }
   3301     }
   3302     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
   3303     return couldFind;
   3304 }
   3305 
   3306 
   3307 void RegexTest::regex_find(const UnicodeString &pattern,
   3308                            const UnicodeString &flags,
   3309                            const UnicodeString &inputString,
   3310                            const char *srcPath,
   3311                            int32_t line) {
   3312     UnicodeString       unEscapedInput;
   3313     UnicodeString       deTaggedInput;
   3314 
   3315     int32_t             patternUTF8Length,      inputUTF8Length;
   3316     char                *patternChars  = NULL, *inputChars = NULL;
   3317     UText               patternText    = UTEXT_INITIALIZER;
   3318     UText               inputText      = UTEXT_INITIALIZER;
   3319     UConverter          *UTF8Converter = NULL;
   3320 
   3321     UErrorCode          status         = U_ZERO_ERROR;
   3322     UParseError         pe;
   3323     RegexPattern        *parsePat      = NULL;
   3324     RegexMatcher        *parseMatcher  = NULL;
   3325     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
   3326     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
   3327     UVector             groupStarts(status);
   3328     UVector             groupEnds(status);
   3329     UVector             groupStartsUTF8(status);
   3330     UVector             groupEndsUTF8(status);
   3331     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
   3332     UBool               failed         = FALSE;
   3333     int32_t             numFinds;
   3334     int32_t             i;
   3335     UBool               useMatchesFunc   = FALSE;
   3336     UBool               useLookingAtFunc = FALSE;
   3337     int32_t             regionStart      = -1;
   3338     int32_t             regionEnd        = -1;
   3339     int32_t             regionStartUTF8  = -1;
   3340     int32_t             regionEndUTF8    = -1;
   3341 
   3342 
   3343     //
   3344     //  Compile the caller's pattern
   3345     //
   3346     uint32_t bflags = 0;
   3347     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
   3348         bflags |= UREGEX_CASE_INSENSITIVE;
   3349     }
   3350     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
   3351         bflags |= UREGEX_COMMENTS;
   3352     }
   3353     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
   3354         bflags |= UREGEX_DOTALL;
   3355     }
   3356     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
   3357         bflags |= UREGEX_MULTILINE;
   3358     }
   3359 
   3360     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
   3361         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
   3362     }
   3363     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
   3364         bflags |= UREGEX_UNIX_LINES;
   3365     }
   3366     if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
   3367         bflags |= UREGEX_LITERAL;
   3368     }
   3369 
   3370 
   3371     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
   3372     if (status != U_ZERO_ERROR) {
   3373         #if UCONFIG_NO_BREAK_ITERATION==1
   3374         // 'v' test flag means that the test pattern should not compile if ICU was configured
   3375         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
   3376         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
   3377             goto cleanupAndReturn;
   3378         }
   3379         #endif
   3380         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
   3381             // Expected pattern compilation error.
   3382             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
   3383                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
   3384             }
   3385             goto cleanupAndReturn;
   3386         } else {
   3387             // Unexpected pattern compilation error.
   3388             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
   3389             goto cleanupAndReturn;
   3390         }
   3391     }
   3392 
   3393     UTF8Converter = ucnv_open("UTF8", &status);
   3394     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
   3395 
   3396     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
   3397     status = U_ZERO_ERROR; // buffer overflow
   3398     patternChars = new char[patternUTF8Length+1];
   3399     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
   3400     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
   3401 
   3402     if (status == U_ZERO_ERROR) {
   3403         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
   3404 
   3405         if (status != U_ZERO_ERROR) {
   3406 #if UCONFIG_NO_BREAK_ITERATION==1
   3407             // 'v' test flag means that the test pattern should not compile if ICU was configured
   3408             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
   3409             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
   3410                 goto cleanupAndReturn;
   3411             }
   3412 #endif
   3413             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
   3414                 // Expected pattern compilation error.
   3415                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
   3416                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
   3417                 }
   3418                 goto cleanupAndReturn;
   3419             } else {
   3420                 // Unexpected pattern compilation error.
   3421                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
   3422                 goto cleanupAndReturn;
   3423             }
   3424         }
   3425     }
   3426 
   3427     if (UTF8Pattern == NULL) {
   3428         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
   3429         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
   3430         status = U_ZERO_ERROR;
   3431     }
   3432 
   3433     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
   3434         callerPattern->dumpPattern();
   3435     }
   3436 
   3437     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
   3438         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
   3439         goto cleanupAndReturn;
   3440     }
   3441 
   3442 
   3443     //
   3444     // Number of times find() should be called on the test string, default to 1
   3445     //
   3446     numFinds = 1;
   3447     for (i=2; i<=9; i++) {
   3448         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
   3449             if (numFinds != 1) {
   3450                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
   3451                 goto cleanupAndReturn;
   3452             }
   3453             numFinds = i;
   3454         }
   3455     }
   3456 
   3457     // 'M' flag.  Use matches() instead of find()
   3458     if (flags.indexOf((UChar)0x4d) >= 0) {
   3459         useMatchesFunc = TRUE;
   3460     }
   3461     if (flags.indexOf((UChar)0x4c) >= 0) {
   3462         useLookingAtFunc = TRUE;
   3463     }
   3464 
   3465     //
   3466     //  Find the tags in the input data, remove them, and record the group boundary
   3467     //    positions.
   3468     //
   3469     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
   3470     REGEX_CHECK_STATUS_L(line);
   3471 
   3472     unEscapedInput = inputString.unescape();
   3473     parseMatcher = parsePat->matcher(unEscapedInput, status);
   3474     REGEX_CHECK_STATUS_L(line);
   3475     while(parseMatcher->find()) {
   3476         parseMatcher->appendReplacement(deTaggedInput, "", status);
   3477         REGEX_CHECK_STATUS;
   3478         UnicodeString groupNum = parseMatcher->group(2, status);
   3479         if (groupNum == "r") {
   3480             // <r> or </r>, a region specification within the string
   3481             if (parseMatcher->group(1, status) == "/") {
   3482                 regionEnd = deTaggedInput.length();
   3483             } else {
   3484                 regionStart = deTaggedInput.length();
   3485             }
   3486         } else {
   3487             // <digits> or </digits>, a group match boundary tag.
   3488             if (parseMatcher->group(1, status) == "/") {
   3489                 set(groupEnds, deTaggedInput.length(), groupNum);
   3490             } else {
   3491                 set(groupStarts, deTaggedInput.length(), groupNum);
   3492             }
   3493         }
   3494     }
   3495     parseMatcher->appendTail(deTaggedInput);
   3496     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
   3497     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
   3498       errln("mismatched <r> tags");
   3499       failed = TRUE;
   3500       goto cleanupAndReturn;
   3501     }
   3502 
   3503     //
   3504     //  Configure the matcher according to the flags specified with this test.
   3505     //
   3506     matcher = callerPattern->matcher(deTaggedInput, status);
   3507     REGEX_CHECK_STATUS_L(line);
   3508     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
   3509         matcher->setTrace(TRUE);
   3510     }
   3511 
   3512     if (UTF8Pattern != NULL) {
   3513         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
   3514         status = U_ZERO_ERROR; // buffer overflow
   3515         inputChars = new char[inputUTF8Length+1];
   3516         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
   3517         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
   3518 
   3519         if (status == U_ZERO_ERROR) {
   3520             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
   3521             REGEX_CHECK_STATUS_L(line);
   3522         }
   3523 
   3524         if (UTF8Matcher == NULL) {
   3525             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
   3526             logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
   3527             status = U_ZERO_ERROR;
   3528         }
   3529     }
   3530 
   3531     //
   3532     //  Generate native indices for UTF8 versions of region and capture group info
   3533     //
   3534     if (UTF8Matcher != NULL) {
   3535         if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
   3536             UTF8Matcher->setTrace(TRUE);
   3537         }
   3538         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
   3539         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
   3540 
   3541         //  Fill out the native index UVector info.
   3542         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
   3543         for (i=0; i<groupStarts.size(); i++) {
   3544             int32_t  start = groupStarts.elementAti(i);
   3545             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
   3546             if (start >= 0) {
   3547                 int32_t  startUTF8;
   3548                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
   3549                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
   3550                     failed = TRUE;
   3551                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3552                 }
   3553                 setInt(groupStartsUTF8, startUTF8, i);
   3554             }
   3555 
   3556             int32_t  end = groupEnds.elementAti(i);
   3557             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
   3558             if (end >= 0) {
   3559                 int32_t  endUTF8;
   3560                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
   3561                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
   3562                     failed = TRUE;
   3563                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3564                 }
   3565                 setInt(groupEndsUTF8, endUTF8, i);
   3566             }
   3567         }
   3568     }
   3569 
   3570     if (regionStart>=0) {
   3571        matcher->region(regionStart, regionEnd, status);
   3572        REGEX_CHECK_STATUS_L(line);
   3573        if (UTF8Matcher != NULL) {
   3574            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
   3575            REGEX_CHECK_STATUS_L(line);
   3576        }
   3577     }
   3578     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
   3579         matcher->useAnchoringBounds(FALSE);
   3580         if (UTF8Matcher != NULL) {
   3581             UTF8Matcher->useAnchoringBounds(FALSE);
   3582         }
   3583     }
   3584     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
   3585         matcher->useTransparentBounds(TRUE);
   3586         if (UTF8Matcher != NULL) {
   3587             UTF8Matcher->useTransparentBounds(TRUE);
   3588         }
   3589     }
   3590 
   3591 
   3592 
   3593     //
   3594     // Do a find on the de-tagged input using the caller's pattern
   3595     //     TODO: error on count>1 and not find().
   3596     //           error on both matches() and lookingAt().
   3597     //
   3598     for (i=0; i<numFinds; i++) {
   3599         if (useMatchesFunc) {
   3600             isMatch = matcher->matches(status);
   3601             if (UTF8Matcher != NULL) {
   3602                isUTF8Match = UTF8Matcher->matches(status);
   3603             }
   3604         } else  if (useLookingAtFunc) {
   3605             isMatch = matcher->lookingAt(status);
   3606             if (UTF8Matcher != NULL) {
   3607                 isUTF8Match = UTF8Matcher->lookingAt(status);
   3608             }
   3609         } else {
   3610             isMatch = matcher->find();
   3611             if (UTF8Matcher != NULL) {
   3612                 isUTF8Match = UTF8Matcher->find();
   3613             }
   3614         }
   3615     }
   3616     matcher->setTrace(FALSE);
   3617     if (UTF8Matcher) {
   3618         UTF8Matcher->setTrace(FALSE);
   3619     }
   3620     if (U_FAILURE(status)) {
   3621         errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
   3622     }
   3623 
   3624     //
   3625     // Match up the groups from the find() with the groups from the tags
   3626     //
   3627 
   3628     // number of tags should match number of groups from find operation.
   3629     // matcher->groupCount does not include group 0, the entire match, hence the +1.
   3630     //   G option in test means that capture group data is not available in the
   3631     //     expected results, so the check needs to be suppressed.
   3632     if (isMatch == FALSE && groupStarts.size() != 0) {
   3633         dataerrln("Error at line %d:  Match expected, but none found.", line);
   3634         failed = TRUE;
   3635         goto cleanupAndReturn;
   3636     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
   3637         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
   3638         failed = TRUE;
   3639         goto cleanupAndReturn;
   3640     }
   3641     if (isMatch && groupStarts.size() == 0) {
   3642         errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
   3643         failed = TRUE;
   3644     }
   3645     if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
   3646         errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
   3647         failed = TRUE;
   3648     }
   3649 
   3650     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
   3651         // Only check for match / no match.  Don't check capture groups.
   3652         goto cleanupAndReturn;
   3653     }
   3654 
   3655     REGEX_CHECK_STATUS_L(line);
   3656     for (i=0; i<=matcher->groupCount(); i++) {
   3657         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
   3658         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
   3659         if (matcher->start(i, status) != expectedStart) {
   3660             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
   3661                 line, i, expectedStart, matcher->start(i, status));
   3662             failed = TRUE;
   3663             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3664         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
   3665             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
   3666                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
   3667             failed = TRUE;
   3668             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
   3669         }
   3670 
   3671         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
   3672         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
   3673         if (matcher->end(i, status) != expectedEnd) {
   3674             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
   3675                 line, i, expectedEnd, matcher->end(i, status));
   3676             failed = TRUE;
   3677             // Error on end position;  keep going; real error is probably yet to come as group
   3678             //   end positions work from end of the input data towards the front.
   3679         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
   3680             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
   3681                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
   3682             failed = TRUE;
   3683             // Error on end position;  keep going; real error is probably yet to come as group
   3684             //   end positions work from end of the input data towards the front.
   3685         }
   3686     }
   3687     if ( matcher->groupCount()+1 < groupStarts.size()) {
   3688         errln("Error at line %d: Expected %d capture groups, found %d.",
   3689             line, groupStarts.size()-1, matcher->groupCount());
   3690         failed = TRUE;
   3691         }
   3692     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
   3693         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
   3694               line, groupStarts.size()-1, UTF8Matcher->groupCount());
   3695         failed = TRUE;
   3696     }
   3697 
   3698     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3699         matcher->requireEnd() == TRUE) {
   3700         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
   3701         failed = TRUE;
   3702     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3703         UTF8Matcher->requireEnd() == TRUE) {
   3704         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
   3705         failed = TRUE;
   3706     }
   3707 
   3708     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
   3709         matcher->requireEnd() == FALSE) {
   3710         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
   3711         failed = TRUE;
   3712     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
   3713         UTF8Matcher->requireEnd() == FALSE) {
   3714         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
   3715         failed = TRUE;
   3716     }
   3717 
   3718     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
   3719         matcher->hitEnd() == TRUE) {
   3720         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
   3721         failed = TRUE;
   3722     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
   3723                UTF8Matcher->hitEnd() == TRUE) {
   3724         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
   3725         failed = TRUE;
   3726     }
   3727 
   3728     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
   3729         matcher->hitEnd() == FALSE) {
   3730         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
   3731         failed = TRUE;
   3732     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
   3733                UTF8Matcher->hitEnd() == FALSE) {
   3734         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
   3735         failed = TRUE;
   3736     }
   3737 
   3738 
   3739 cleanupAndReturn:
   3740     if (failed) {
   3741         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
   3742             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
   3743         // callerPattern->dump();
   3744     }
   3745     delete parseMatcher;
   3746     delete parsePat;
   3747     delete UTF8Matcher;
   3748     delete UTF8Pattern;
   3749     delete matcher;
   3750     delete callerPattern;
   3751 
   3752     utext_close(&inputText);
   3753     delete[] inputChars;
   3754     utext_close(&patternText);
   3755     delete[] patternChars;
   3756     ucnv_close(UTF8Converter);
   3757 }
   3758 
   3759 
   3760 
   3761 
   3762 //---------------------------------------------------------------------------
   3763 //
   3764 //      Errors     Check for error handling in patterns.
   3765 //
   3766 //---------------------------------------------------------------------------
   3767 void RegexTest::Errors() {
   3768     // \escape sequences that aren't implemented yet.
   3769     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
   3770 
   3771     // Missing close parentheses
   3772     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
   3773     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
   3774     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
   3775 
   3776     // Extra close paren
   3777     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
   3778     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
   3779     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
   3780 
   3781     // Look-ahead, Look-behind
   3782     //  TODO:  add tests for unbounded length look-behinds.
   3783     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
   3784 
   3785     // Attempt to use non-default flags
   3786     {
   3787         UParseError   pe;
   3788         UErrorCode    status = U_ZERO_ERROR;
   3789         int32_t       flags  = UREGEX_CANON_EQ |
   3790                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
   3791                                UREGEX_MULTILINE;
   3792         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
   3793         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
   3794         delete pat1;
   3795     }
   3796 
   3797 
   3798     // Quantifiers are allowed only after something that can be quantified.
   3799     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
   3800     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
   3801     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
   3802 
   3803     // Mal-formed {min,max} quantifiers
   3804     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
   3805     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
   3806     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
   3807     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
   3808     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
   3809     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
   3810     REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
   3811     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
   3812     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
   3813 
   3814     // Ticket 5389
   3815     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
   3816 
   3817     // Invalid Back Reference \0
   3818     //    For ICU 3.8 and earlier
   3819     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
   3820     //
   3821     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
   3822 
   3823 }
   3824 
   3825 
   3826 //-------------------------------------------------------------------------------
   3827 //
   3828 //  Read a text data file, convert it to UChars, and return the data
   3829 //    in one big UChar * buffer, which the caller must delete.
   3830 //
   3831 //--------------------------------------------------------------------------------
   3832 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
   3833                                      const char *defEncoding, UErrorCode &status) {
   3834     UChar       *retPtr  = NULL;
   3835     char        *fileBuf = NULL;
   3836     UConverter* conv     = NULL;
   3837     FILE        *f       = NULL;
   3838 
   3839     ulen = 0;
   3840     if (U_FAILURE(status)) {
   3841         return retPtr;
   3842     }
   3843 
   3844     //
   3845     //  Open the file.
   3846     //
   3847     f = fopen(fileName, "rb");
   3848     if (f == 0) {
   3849         dataerrln("Error opening test data file %s\n", fileName);
   3850         status = U_FILE_ACCESS_ERROR;
   3851         return NULL;
   3852     }
   3853     //
   3854     //  Read it in
   3855     //
   3856     int32_t            fileSize;
   3857     int32_t            amt_read;
   3858 
   3859     fseek( f, 0, SEEK_END);
   3860     fileSize = ftell(f);
   3861     fileBuf = new char[fileSize];
   3862     fseek(f, 0, SEEK_SET);
   3863     amt_read = fread(fileBuf, 1, fileSize, f);
   3864     if (amt_read != fileSize || fileSize <= 0) {
   3865         errln("Error reading test data file.");
   3866         goto cleanUpAndReturn;
   3867     }
   3868 
   3869     //
   3870     // Look for a Unicode Signature (BOM) on the data just read
   3871     //
   3872     int32_t        signatureLength;
   3873     const char *   fileBufC;
   3874     const char*    encoding;
   3875 
   3876     fileBufC = fileBuf;
   3877     encoding = ucnv_detectUnicodeSignature(
   3878         fileBuf, fileSize, &signatureLength, &status);
   3879     if(encoding!=NULL ){
   3880         fileBufC  += signatureLength;
   3881         fileSize  -= signatureLength;
   3882     } else {
   3883         encoding = defEncoding;
   3884         if (strcmp(encoding, "utf-8") == 0) {
   3885             errln("file %s is missing its BOM", fileName);
   3886         }
   3887     }
   3888 
   3889     //
   3890     // Open a converter to take the rule file to UTF-16
   3891     //
   3892     conv = ucnv_open(encoding, &status);
   3893     if (U_FAILURE(status)) {
   3894         goto cleanUpAndReturn;
   3895     }
   3896 
   3897     //
   3898     // Convert the rules to UChar.
   3899     //  Preflight first to determine required buffer size.
   3900     //
   3901     ulen = ucnv_toUChars(conv,
   3902         NULL,           //  dest,
   3903         0,              //  destCapacity,
   3904         fileBufC,
   3905         fileSize,
   3906         &status);
   3907     if (status == U_BUFFER_OVERFLOW_ERROR) {
   3908         // Buffer Overflow is expected from the preflight operation.
   3909         status = U_ZERO_ERROR;
   3910 
   3911         retPtr = new UChar[ulen+1];
   3912         ucnv_toUChars(conv,
   3913             retPtr,       //  dest,
   3914             ulen+1,
   3915             fileBufC,
   3916             fileSize,
   3917             &status);
   3918     }
   3919 
   3920 cleanUpAndReturn:
   3921     fclose(f);
   3922     delete[] fileBuf;
   3923     ucnv_close(conv);
   3924     if (U_FAILURE(status)) {
   3925         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   3926         delete []retPtr;
   3927         retPtr = 0;
   3928         ulen   = 0;
   3929     };
   3930     return retPtr;
   3931 }
   3932 
   3933 
   3934 //-------------------------------------------------------------------------------
   3935 //
   3936 //   PerlTests  - Run Perl's regular expression tests
   3937 //                The input file for this test is re_tests, the standard regular
   3938 //                expression test data distributed with the Perl source code.
   3939 //
   3940 //                Here is Perl's description of the test data file:
   3941 //
   3942 //        # The tests are in a separate file 't/op/re_tests'.
   3943 //        # Each line in that file is a separate test.
   3944 //        # There are five columns, separated by tabs.
   3945 //        #
   3946 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
   3947 //        # Modifiers can be put after the closing C<'>.
   3948 //        #
   3949 //        # Column 2 contains the string to be matched.
   3950 //        #
   3951 //        # Column 3 contains the expected result:
   3952 //        #     y   expect a match
   3953 //        #     n   expect no match
   3954 //        #     c   expect an error
   3955 //        # B   test exposes a known bug in Perl, should be skipped
   3956 //        # b   test exposes a known bug in Perl, should be skipped if noamp
   3957 //        #
   3958 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
   3959 //        #
   3960 //        # Column 4 contains a string, usually C<$&>.
   3961 //        #
   3962 //        # Column 5 contains the expected result of double-quote
   3963 //        # interpolating that string after the match, or start of error message.
   3964 //        #
   3965 //        # Column 6, if present, contains a reason why the test is skipped.
   3966 //        # This is printed with "skipped", for harness to pick up.
   3967 //        #
   3968 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
   3969 //        #
   3970 //        # If you want to add a regular expression test that can't be expressed
   3971 //        # in this format, don't add it here: put it in op/pat.t instead.
   3972 //
   3973 //        For ICU, if field 3 contains an 'i', the test will be skipped.
   3974 //        The test exposes some known incompatibility between ICU and Perl regexps.
   3975 //        (The i is in addition to whatever was there before.)
   3976 //
   3977 //-------------------------------------------------------------------------------
   3978 void RegexTest::PerlTests() {
   3979     char tdd[2048];
   3980     const char *srcPath;
   3981     UErrorCode  status = U_ZERO_ERROR;
   3982     UParseError pe;
   3983 
   3984     //
   3985     //  Open and read the test data file.
   3986     //
   3987     srcPath=getPath(tdd, "re_tests.txt");
   3988     if(srcPath==NULL) {
   3989         return; /* something went wrong, error already output */
   3990     }
   3991 
   3992     int32_t    len;
   3993     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
   3994     if (U_FAILURE(status)) {
   3995         return; /* something went wrong, error already output */
   3996     }
   3997 
   3998     //
   3999     //  Put the test data into a UnicodeString
   4000     //
   4001     UnicodeString testDataString(FALSE, testData, len);
   4002 
   4003     //
   4004     //  Regex to break the input file into lines, and strip the new lines.
   4005     //     One line per match, capture group one is the desired data.
   4006     //
   4007     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
   4008     if (U_FAILURE(status)) {
   4009         dataerrln("RegexPattern::compile() error");
   4010         return;
   4011     }
   4012     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
   4013 
   4014     //
   4015     //  Regex to split a test file line into fields.
   4016     //    There are six fields, separated by tabs.
   4017     //
   4018     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
   4019 
   4020     //
   4021     //  Regex to identify test patterns with flag settings, and to separate them.
   4022     //    Test patterns with flags look like 'pattern'i
   4023     //    Test patterns without flags are not quoted:   pattern
   4024     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
   4025     //
   4026     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
   4027     RegexMatcher* flagMat = flagPat->matcher(status);
   4028 
   4029     //
   4030     // The Perl tests reference several perl-isms, which are evaluated/substituted
   4031     //   in the test data.  Not being perl, this must be done explicitly.  Here
   4032     //   are string constants and REs for these constructs.
   4033     //
   4034     UnicodeString nulnulSrc("${nulnul}");
   4035     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
   4036     nulnul = nulnul.unescape();
   4037 
   4038     UnicodeString ffffSrc("${ffff}");
   4039     UnicodeString ffff("\\uffff", -1, US_INV);
   4040     ffff = ffff.unescape();
   4041 
   4042     //  regexp for $-[0], $+[2], etc.
   4043     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
   4044     RegexMatcher *groupsMat = groupsPat->matcher(status);
   4045 
   4046     //  regexp for $0, $1, $2, etc.
   4047     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
   4048     RegexMatcher *cgMat = cgPat->matcher(status);
   4049 
   4050 
   4051     //
   4052     // Main Loop for the Perl Tests, runs once per line from the
   4053     //   test data file.
   4054     //
   4055     int32_t  lineNum = 0;
   4056     int32_t  skippedUnimplementedCount = 0;
   4057     while (lineMat->find()) {
   4058         lineNum++;
   4059 
   4060         //
   4061         //  Get a line, break it into its fields, do the Perl
   4062         //    variable substitutions.
   4063         //
   4064         UnicodeString line = lineMat->group(1, status);
   4065         UnicodeString fields[7];
   4066         fieldPat->split(line, fields, 7, status);
   4067 
   4068         flagMat->reset(fields[0]);
   4069         flagMat->matches(status);
   4070         UnicodeString pattern  = flagMat->group(2, status);
   4071         pattern.findAndReplace("${bang}", "!");
   4072         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
   4073         pattern.findAndReplace(ffffSrc, ffff);
   4074 
   4075         //
   4076         //  Identify patterns that include match flag settings,
   4077         //    split off the flags, remove the extra quotes.
   4078         //
   4079         UnicodeString flagStr = flagMat->group(3, status);
   4080         if (U_FAILURE(status)) {
   4081             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   4082             return;
   4083         }
   4084         int32_t flags = 0;
   4085         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
   4086         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
   4087         const UChar UChar_m = 0x6d;
   4088         const UChar UChar_x = 0x78;
   4089         const UChar UChar_y = 0x79;
   4090         if (flagStr.indexOf(UChar_i) != -1) {
   4091             flags |= UREGEX_CASE_INSENSITIVE;
   4092         }
   4093         if (flagStr.indexOf(UChar_m) != -1) {
   4094             flags |= UREGEX_MULTILINE;
   4095         }
   4096         if (flagStr.indexOf(UChar_x) != -1) {
   4097             flags |= UREGEX_COMMENTS;
   4098         }
   4099 
   4100         //
   4101         // Compile the test pattern.
   4102         //
   4103         status = U_ZERO_ERROR;
   4104         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
   4105         if (status == U_REGEX_UNIMPLEMENTED) {
   4106             //
   4107             // Test of a feature that is planned for ICU, but not yet implemented.
   4108             //   skip the test.
   4109             skippedUnimplementedCount++;
   4110             delete testPat;
   4111             status = U_ZERO_ERROR;
   4112             continue;
   4113         }
   4114 
   4115         if (U_FAILURE(status)) {
   4116             // Some tests are supposed to generate errors.
   4117             //   Only report an error for tests that are supposed to succeed.
   4118             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
   4119                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
   4120             {
   4121                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
   4122             }
   4123             status = U_ZERO_ERROR;
   4124             delete testPat;
   4125             continue;
   4126         }
   4127 
   4128         if (fields[2].indexOf(UChar_i) >= 0) {
   4129             // ICU should skip this test.
   4130             delete testPat;
   4131             continue;
   4132         }
   4133 
   4134         if (fields[2].indexOf(UChar_c) >= 0) {
   4135             // This pattern should have caused a compilation error, but didn't/
   4136             errln("line %d: Expected a pattern compile error, got success.", lineNum);
   4137             delete testPat;
   4138             continue;
   4139         }
   4140 
   4141         //
   4142         // replace the Perl variables that appear in some of the
   4143         //   match data strings.
   4144         //
   4145         UnicodeString matchString = fields[1];
   4146         matchString.findAndReplace(nulnulSrc, nulnul);
   4147         matchString.findAndReplace(ffffSrc,   ffff);
   4148 
   4149         // Replace any \n in the match string with an actual new-line char.
   4150         //  Don't do full unescape, as this unescapes more than Perl does, which
   4151         //  causes other spurious failures in the tests.
   4152         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4153 
   4154 
   4155 
   4156         //
   4157         // Run the test, check for expected match/don't match result.
   4158         //
   4159         RegexMatcher *testMat = testPat->matcher(matchString, status);
   4160         UBool found = testMat->find();
   4161         UBool expected = FALSE;
   4162         if (fields[2].indexOf(UChar_y) >=0) {
   4163             expected = TRUE;
   4164         }
   4165         if (expected != found) {
   4166             errln("line %d: Expected %smatch, got %smatch",
   4167                 lineNum, expected?"":"no ", found?"":"no " );
   4168             continue;
   4169         }
   4170 
   4171         // Don't try to check expected results if there is no match.
   4172         //   (Some have stuff in the expected fields)
   4173         if (!found) {
   4174             delete testMat;
   4175             delete testPat;
   4176             continue;
   4177         }
   4178 
   4179         //
   4180         // Interpret the Perl expression from the fourth field of the data file,
   4181         // building up an ICU string from the results of the ICU match.
   4182         //   The Perl expression will contain references to the results of
   4183         //     a regex match, including the matched string, capture group strings,
   4184         //     group starting and ending indicies, etc.
   4185         //
   4186         UnicodeString resultString;
   4187         UnicodeString perlExpr = fields[3];
   4188 #if SUPPORT_MUTATING_INPUT_STRING
   4189         groupsMat->reset(perlExpr);
   4190         cgMat->reset(perlExpr);
   4191 #endif
   4192 
   4193         while (perlExpr.length() > 0) {
   4194 #if !SUPPORT_MUTATING_INPUT_STRING
   4195             //  Perferred usage.  Reset after any modification to input string.
   4196             groupsMat->reset(perlExpr);
   4197             cgMat->reset(perlExpr);
   4198 #endif
   4199 
   4200             if (perlExpr.startsWith("$&")) {
   4201                 resultString.append(testMat->group(status));
   4202                 perlExpr.remove(0, 2);
   4203             }
   4204 
   4205             else if (groupsMat->lookingAt(status)) {
   4206                 // $-[0]   $+[2]  etc.
   4207                 UnicodeString digitString = groupsMat->group(2, status);
   4208                 int32_t t = 0;
   4209                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4210                 UnicodeString plusOrMinus = groupsMat->group(1, status);
   4211                 int32_t matchPosition;
   4212                 if (plusOrMinus.compare("+") == 0) {
   4213                     matchPosition = testMat->end(groupNum, status);
   4214                 } else {
   4215                     matchPosition = testMat->start(groupNum, status);
   4216                 }
   4217                 if (matchPosition != -1) {
   4218                     ICU_Utility::appendNumber(resultString, matchPosition);
   4219                 }
   4220                 perlExpr.remove(0, groupsMat->end(status));
   4221             }
   4222 
   4223             else if (cgMat->lookingAt(status)) {
   4224                 // $1, $2, $3, etc.
   4225                 UnicodeString digitString = cgMat->group(1, status);
   4226                 int32_t t = 0;
   4227                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4228                 if (U_SUCCESS(status)) {
   4229                     resultString.append(testMat->group(groupNum, status));
   4230                     status = U_ZERO_ERROR;
   4231                 }
   4232                 perlExpr.remove(0, cgMat->end(status));
   4233             }
   4234 
   4235             else if (perlExpr.startsWith("@-")) {
   4236                 int32_t i;
   4237                 for (i=0; i<=testMat->groupCount(); i++) {
   4238                     if (i>0) {
   4239                         resultString.append(" ");
   4240                     }
   4241                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
   4242                 }
   4243                 perlExpr.remove(0, 2);
   4244             }
   4245 
   4246             else if (perlExpr.startsWith("@+")) {
   4247                 int32_t i;
   4248                 for (i=0; i<=testMat->groupCount(); i++) {
   4249                     if (i>0) {
   4250                         resultString.append(" ");
   4251                     }
   4252                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
   4253                 }
   4254                 perlExpr.remove(0, 2);
   4255             }
   4256 
   4257             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
   4258                                                      //           or as an escaped sequence (e.g. \n)
   4259                 if (perlExpr.length() > 1) {
   4260                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
   4261                 }
   4262                 UChar c = perlExpr.charAt(0);
   4263                 switch (c) {
   4264                 case 'n':   c = '\n'; break;
   4265                 // add any other escape sequences that show up in the test expected results.
   4266                 }
   4267                 resultString.append(c);
   4268                 perlExpr.remove(0, 1);
   4269             }
   4270 
   4271             else  {
   4272                 // Any characters from the perl expression that we don't explicitly
   4273                 //  recognize before here are assumed to be literals and copied
   4274                 //  as-is to the expected results.
   4275                 resultString.append(perlExpr.charAt(0));
   4276                 perlExpr.remove(0, 1);
   4277             }
   4278 
   4279             if (U_FAILURE(status)) {
   4280                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   4281                 break;
   4282             }
   4283         }
   4284 
   4285         //
   4286         // Expected Results Compare
   4287         //
   4288         UnicodeString expectedS(fields[4]);
   4289         expectedS.findAndReplace(nulnulSrc, nulnul);
   4290         expectedS.findAndReplace(ffffSrc,   ffff);
   4291         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4292 
   4293 
   4294         if (expectedS.compare(resultString) != 0) {
   4295             err("Line %d: Incorrect perl expression results.", lineNum);
   4296             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
   4297         }
   4298 
   4299         delete testMat;
   4300         delete testPat;
   4301     }
   4302 
   4303     //
   4304     // All done.  Clean up allocated stuff.
   4305     //
   4306     delete cgMat;
   4307     delete cgPat;
   4308 
   4309     delete groupsMat;
   4310     delete groupsPat;
   4311 
   4312     delete flagMat;
   4313     delete flagPat;
   4314 
   4315     delete lineMat;
   4316     delete linePat;
   4317 
   4318     delete fieldPat;
   4319     delete [] testData;
   4320 
   4321 
   4322     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
   4323 
   4324 }
   4325 
   4326 
   4327 //-------------------------------------------------------------------------------
   4328 //
   4329 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
   4330 //                  (instead of using UnicodeStrings) to test the alternate engine.
   4331 //                  The input file for this test is re_tests, the standard regular
   4332 //                  expression test data distributed with the Perl source code.
   4333 //                  See PerlTests() for more information.
   4334 //
   4335 //-------------------------------------------------------------------------------
   4336 void RegexTest::PerlTestsUTF8() {
   4337     char tdd[2048];
   4338     const char *srcPath;
   4339     UErrorCode  status = U_ZERO_ERROR;
   4340     UParseError pe;
   4341     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
   4342     UText       patternText = UTEXT_INITIALIZER;
   4343     char       *patternChars = NULL;
   4344     int32_t     patternLength;
   4345     int32_t     patternCapacity = 0;
   4346     UText       inputText = UTEXT_INITIALIZER;
   4347     char       *inputChars = NULL;
   4348     int32_t     inputLength;
   4349     int32_t     inputCapacity = 0;
   4350 
   4351     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
   4352 
   4353     //
   4354     //  Open and read the test data file.
   4355     //
   4356     srcPath=getPath(tdd, "re_tests.txt");
   4357     if(srcPath==NULL) {
   4358         return; /* something went wrong, error already output */
   4359     }
   4360 
   4361     int32_t    len;
   4362     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
   4363     if (U_FAILURE(status)) {
   4364         return; /* something went wrong, error already output */
   4365     }
   4366 
   4367     //
   4368     //  Put the test data into a UnicodeString
   4369     //
   4370     UnicodeString testDataString(FALSE, testData, len);
   4371 
   4372     //
   4373     //  Regex to break the input file into lines, and strip the new lines.
   4374     //     One line per match, capture group one is the desired data.
   4375     //
   4376     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
   4377     if (U_FAILURE(status)) {
   4378         dataerrln("RegexPattern::compile() error");
   4379         return;
   4380     }
   4381     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
   4382 
   4383     //
   4384     //  Regex to split a test file line into fields.
   4385     //    There are six fields, separated by tabs.
   4386     //
   4387     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
   4388 
   4389     //
   4390     //  Regex to identify test patterns with flag settings, and to separate them.
   4391     //    Test patterns with flags look like 'pattern'i
   4392     //    Test patterns without flags are not quoted:   pattern
   4393     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
   4394     //
   4395     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
   4396     RegexMatcher* flagMat = flagPat->matcher(status);
   4397 
   4398     //
   4399     // The Perl tests reference several perl-isms, which are evaluated/substituted
   4400     //   in the test data.  Not being perl, this must be done explicitly.  Here
   4401     //   are string constants and REs for these constructs.
   4402     //
   4403     UnicodeString nulnulSrc("${nulnul}");
   4404     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
   4405     nulnul = nulnul.unescape();
   4406 
   4407     UnicodeString ffffSrc("${ffff}");
   4408     UnicodeString ffff("\\uffff", -1, US_INV);
   4409     ffff = ffff.unescape();
   4410 
   4411     //  regexp for $-[0], $+[2], etc.
   4412     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
   4413     RegexMatcher *groupsMat = groupsPat->matcher(status);
   4414 
   4415     //  regexp for $0, $1, $2, etc.
   4416     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
   4417     RegexMatcher *cgMat = cgPat->matcher(status);
   4418 
   4419 
   4420     //
   4421     // Main Loop for the Perl Tests, runs once per line from the
   4422     //   test data file.
   4423     //
   4424     int32_t  lineNum = 0;
   4425     int32_t  skippedUnimplementedCount = 0;
   4426     while (lineMat->find()) {
   4427         lineNum++;
   4428 
   4429         //
   4430         //  Get a line, break it into its fields, do the Perl
   4431         //    variable substitutions.
   4432         //
   4433         UnicodeString line = lineMat->group(1, status);
   4434         UnicodeString fields[7];
   4435         fieldPat->split(line, fields, 7, status);
   4436 
   4437         flagMat->reset(fields[0]);
   4438         flagMat->matches(status);
   4439         UnicodeString pattern  = flagMat->group(2, status);
   4440         pattern.findAndReplace("${bang}", "!");
   4441         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
   4442         pattern.findAndReplace(ffffSrc, ffff);
   4443 
   4444         //
   4445         //  Identify patterns that include match flag settings,
   4446         //    split off the flags, remove the extra quotes.
   4447         //
   4448         UnicodeString flagStr = flagMat->group(3, status);
   4449         if (U_FAILURE(status)) {
   4450             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
   4451             return;
   4452         }
   4453         int32_t flags = 0;
   4454         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
   4455         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
   4456         const UChar UChar_m = 0x6d;
   4457         const UChar UChar_x = 0x78;
   4458         const UChar UChar_y = 0x79;
   4459         if (flagStr.indexOf(UChar_i) != -1) {
   4460             flags |= UREGEX_CASE_INSENSITIVE;
   4461         }
   4462         if (flagStr.indexOf(UChar_m) != -1) {
   4463             flags |= UREGEX_MULTILINE;
   4464         }
   4465         if (flagStr.indexOf(UChar_x) != -1) {
   4466             flags |= UREGEX_COMMENTS;
   4467         }
   4468 
   4469         //
   4470         // Put the pattern in a UTF-8 UText
   4471         //
   4472         status = U_ZERO_ERROR;
   4473         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
   4474         if (status == U_BUFFER_OVERFLOW_ERROR) {
   4475             status = U_ZERO_ERROR;
   4476             delete[] patternChars;
   4477             patternCapacity = patternLength + 1;
   4478             patternChars = new char[patternCapacity];
   4479             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
   4480         }
   4481         utext_openUTF8(&patternText, patternChars, patternLength, &status);
   4482 
   4483         //
   4484         // Compile the test pattern.
   4485         //
   4486         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
   4487         if (status == U_REGEX_UNIMPLEMENTED) {
   4488             //
   4489             // Test of a feature that is planned for ICU, but not yet implemented.
   4490             //   skip the test.
   4491             skippedUnimplementedCount++;
   4492             delete testPat;
   4493             status = U_ZERO_ERROR;
   4494             continue;
   4495         }
   4496 
   4497         if (U_FAILURE(status)) {
   4498             // Some tests are supposed to generate errors.
   4499             //   Only report an error for tests that are supposed to succeed.
   4500             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
   4501                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
   4502             {
   4503                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
   4504             }
   4505             status = U_ZERO_ERROR;
   4506             delete testPat;
   4507             continue;
   4508         }
   4509 
   4510         if (fields[2].indexOf(UChar_i) >= 0) {
   4511             // ICU should skip this test.
   4512             delete testPat;
   4513             continue;
   4514         }
   4515 
   4516         if (fields[2].indexOf(UChar_c) >= 0) {
   4517             // This pattern should have caused a compilation error, but didn't/
   4518             errln("line %d: Expected a pattern compile error, got success.", lineNum);
   4519             delete testPat;
   4520             continue;
   4521         }
   4522 
   4523 
   4524         //
   4525         // replace the Perl variables that appear in some of the
   4526         //   match data strings.
   4527         //
   4528         UnicodeString matchString = fields[1];
   4529         matchString.findAndReplace(nulnulSrc, nulnul);
   4530         matchString.findAndReplace(ffffSrc,   ffff);
   4531 
   4532         // Replace any \n in the match string with an actual new-line char.
   4533         //  Don't do full unescape, as this unescapes more than Perl does, which
   4534         //  causes other spurious failures in the tests.
   4535         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4536 
   4537         //
   4538         // Put the input in a UTF-8 UText
   4539         //
   4540         status = U_ZERO_ERROR;
   4541         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
   4542         if (status == U_BUFFER_OVERFLOW_ERROR) {
   4543             status = U_ZERO_ERROR;
   4544             delete[] inputChars;
   4545             inputCapacity = inputLength + 1;
   4546             inputChars = new char[inputCapacity];
   4547             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
   4548         }
   4549         utext_openUTF8(&inputText, inputChars, inputLength, &status);
   4550 
   4551         //
   4552         // Run the test, check for expected match/don't match result.
   4553         //
   4554         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
   4555         UBool found = testMat->find();
   4556         UBool expected = FALSE;
   4557         if (fields[2].indexOf(UChar_y) >=0) {
   4558             expected = TRUE;
   4559         }
   4560         if (expected != found) {
   4561             errln("line %d: Expected %smatch, got %smatch",
   4562                 lineNum, expected?"":"no ", found?"":"no " );
   4563             continue;
   4564         }
   4565 
   4566         // Don't try to check expected results if there is no match.
   4567         //   (Some have stuff in the expected fields)
   4568         if (!found) {
   4569             delete testMat;
   4570             delete testPat;
   4571             continue;
   4572         }
   4573 
   4574         //
   4575         // Interpret the Perl expression from the fourth field of the data file,
   4576         // building up an ICU string from the results of the ICU match.
   4577         //   The Perl expression will contain references to the results of
   4578         //     a regex match, including the matched string, capture group strings,
   4579         //     group starting and ending indicies, etc.
   4580         //
   4581         UnicodeString resultString;
   4582         UnicodeString perlExpr = fields[3];
   4583 
   4584         while (perlExpr.length() > 0) {
   4585             groupsMat->reset(perlExpr);
   4586             cgMat->reset(perlExpr);
   4587 
   4588             if (perlExpr.startsWith("$&")) {
   4589                 resultString.append(testMat->group(status));
   4590                 perlExpr.remove(0, 2);
   4591             }
   4592 
   4593             else if (groupsMat->lookingAt(status)) {
   4594                 // $-[0]   $+[2]  etc.
   4595                 UnicodeString digitString = groupsMat->group(2, status);
   4596                 int32_t t = 0;
   4597                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4598                 UnicodeString plusOrMinus = groupsMat->group(1, status);
   4599                 int32_t matchPosition;
   4600                 if (plusOrMinus.compare("+") == 0) {
   4601                     matchPosition = testMat->end(groupNum, status);
   4602                 } else {
   4603                     matchPosition = testMat->start(groupNum, status);
   4604                 }
   4605                 if (matchPosition != -1) {
   4606                     ICU_Utility::appendNumber(resultString, matchPosition);
   4607                 }
   4608                 perlExpr.remove(0, groupsMat->end(status));
   4609             }
   4610 
   4611             else if (cgMat->lookingAt(status)) {
   4612                 // $1, $2, $3, etc.
   4613                 UnicodeString digitString = cgMat->group(1, status);
   4614                 int32_t t = 0;
   4615                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
   4616                 if (U_SUCCESS(status)) {
   4617                     resultString.append(testMat->group(groupNum, status));
   4618                     status = U_ZERO_ERROR;
   4619                 }
   4620                 perlExpr.remove(0, cgMat->end(status));
   4621             }
   4622 
   4623             else if (perlExpr.startsWith("@-")) {
   4624                 int32_t i;
   4625                 for (i=0; i<=testMat->groupCount(); i++) {
   4626                     if (i>0) {
   4627                         resultString.append(" ");
   4628                     }
   4629                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
   4630                 }
   4631                 perlExpr.remove(0, 2);
   4632             }
   4633 
   4634             else if (perlExpr.startsWith("@+")) {
   4635                 int32_t i;
   4636                 for (i=0; i<=testMat->groupCount(); i++) {
   4637                     if (i>0) {
   4638                         resultString.append(" ");
   4639                     }
   4640                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
   4641                 }
   4642                 perlExpr.remove(0, 2);
   4643             }
   4644 
   4645             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
   4646                                                      //           or as an escaped sequence (e.g. \n)
   4647                 if (perlExpr.length() > 1) {
   4648                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
   4649                 }
   4650                 UChar c = perlExpr.charAt(0);
   4651                 switch (c) {
   4652                 case 'n':   c = '\n'; break;
   4653                 // add any other escape sequences that show up in the test expected results.
   4654                 }
   4655                 resultString.append(c);
   4656                 perlExpr.remove(0, 1);
   4657             }
   4658 
   4659             else  {
   4660                 // Any characters from the perl expression that we don't explicitly
   4661                 //  recognize before here are assumed to be literals and copied
   4662                 //  as-is to the expected results.
   4663                 resultString.append(perlExpr.charAt(0));
   4664                 perlExpr.remove(0, 1);
   4665             }
   4666 
   4667             if (U_FAILURE(status)) {
   4668                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
   4669                 break;
   4670             }
   4671         }
   4672 
   4673         //
   4674         // Expected Results Compare
   4675         //
   4676         UnicodeString expectedS(fields[4]);
   4677         expectedS.findAndReplace(nulnulSrc, nulnul);
   4678         expectedS.findAndReplace(ffffSrc,   ffff);
   4679         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
   4680 
   4681 
   4682         if (expectedS.compare(resultString) != 0) {
   4683             err("Line %d: Incorrect perl expression results.", lineNum);
   4684             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
   4685         }
   4686 
   4687         delete testMat;
   4688         delete testPat;
   4689     }
   4690 
   4691     //
   4692     // All done.  Clean up allocated stuff.
   4693     //
   4694     delete cgMat;
   4695     delete cgPat;
   4696 
   4697     delete groupsMat;
   4698     delete groupsPat;
   4699 
   4700     delete flagMat;
   4701     delete flagPat;
   4702 
   4703     delete lineMat;
   4704     delete linePat;
   4705 
   4706     delete fieldPat;
   4707     delete [] testData;
   4708 
   4709     utext_close(&patternText);
   4710     utext_close(&inputText);
   4711 
   4712     delete [] patternChars;
   4713     delete [] inputChars;
   4714 
   4715 
   4716     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
   4717 
   4718 }
   4719 
   4720 
   4721 //--------------------------------------------------------------
   4722 //
   4723 //  Bug6149   Verify limits to heap expansion for backtrack stack.
   4724 //             Use this pattern,
   4725 //                 "(a?){1,8000000}"
   4726 //             Note: this was originally an unbounded upper bound, but that case now has loop-breaking enabled.
   4727 //                   This test is likely to be fragile, as further optimizations stop
   4728 //                   more cases of pointless looping in the match engine.
   4729 //
   4730 //---------------------------------------------------------------
   4731 void RegexTest::Bug6149() {
   4732     UnicodeString pattern("(a?){1,8000000}");
   4733     UnicodeString s("xyz");
   4734     uint32_t flags = 0;
   4735     UErrorCode status = U_ZERO_ERROR;
   4736 
   4737     RegexMatcher  matcher(pattern, s, flags, status);
   4738     UBool result = false;
   4739     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
   4740     REGEX_ASSERT(result == FALSE);
   4741  }
   4742 
   4743 
   4744 //
   4745 //   Callbacks()    Test the callback function.
   4746 //                  When set, callbacks occur periodically during matching operations,
   4747 //                  giving the application code the ability to abort the operation
   4748 //                  before its normal completion.
   4749 //
   4750 
   4751 struct callBackContext {
   4752     RegexTest        *test;
   4753     int32_t          maxCalls;
   4754     int32_t          numCalls;
   4755     int32_t          lastSteps;
   4756     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
   4757 };
   4758 
   4759 U_CDECL_BEGIN
   4760 static UBool U_CALLCONV
   4761 testCallBackFn(const void *context, int32_t steps) {
   4762     callBackContext  *info = (callBackContext *)context;
   4763     if (info->lastSteps+1 != steps) {
   4764         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
   4765     }
   4766     info->lastSteps = steps;
   4767     info->numCalls++;
   4768     return (info->numCalls < info->maxCalls);
   4769 }
   4770 U_CDECL_END
   4771 
   4772 void RegexTest::Callbacks() {
            // Exercises RegexMatcher::getMatchCallback() / setMatchCallback(), using
            //   testCallBackFn as the callback and a callBackContext as its context.
            //   The first block checks the getter on a matcher that has no callback;
            //   the second installs the callback and runs matches of increasing cost.
            // testCallBackFn returns FALSE on the invocation that brings numCalls up
            //   to maxCalls, so the argument of cbInfo.reset() is the number of
            //   invocations after which the callback asks for the operation to stop.
   4773    {
   4774         // Getter returns NULLs if no callback has been set
   4775 
   4776         //   The variables that the getter will fill in.
   4777         //   Init to non-null values so that the action of the getter can be seen.
   4778         const void          *returnedContext = &returnedContext;
   4779         URegexMatchCallback *returnedFn = &testCallBackFn;
   4780 
   4781         UErrorCode status = U_ZERO_ERROR;
   4782         RegexMatcher matcher("x", 0, status);
   4783         REGEX_CHECK_STATUS;
   4784         matcher.getMatchCallback(returnedFn, returnedContext, status);
   4785         REGEX_CHECK_STATUS;
   4786         REGEX_ASSERT(returnedFn == NULL);
   4787         REGEX_ASSERT(returnedContext == NULL);
   4788     }
   4789 
   4790    {
   4791         // Set and Get work
   4792         callBackContext cbInfo = {this, 0, 0, 0};
   4793         const void          *returnedContext;
   4794         URegexMatchCallback *returnedFn;
   4795         UErrorCode status = U_ZERO_ERROR;
   4796         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
   4797         REGEX_CHECK_STATUS;
   4798         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
   4799         REGEX_CHECK_STATUS;
   4800         matcher.getMatchCallback(returnedFn, returnedContext, status);
   4801         REGEX_CHECK_STATUS;
   4802         REGEX_ASSERT(returnedFn == testCallBackFn);
   4803         REGEX_ASSERT(returnedContext == &cbInfo);
   4804 
   4805         // A short-running match shouldn't invoke the callback
                //   (with a limit of 1, a single invocation would already return FALSE).
   4806         status = U_ZERO_ERROR;
   4807         cbInfo.reset(1);
   4808         UnicodeString s = "xxx";
   4809         matcher.reset(s);
   4810         REGEX_ASSERT(matcher.matches(status));
   4811         REGEX_CHECK_STATUS;
   4812         REGEX_ASSERT(cbInfo.numCalls == 0);
   4813 
   4814         // A medium-length match that runs long enough to invoke the
   4815         //   callback, but not so long that the callback aborts it.
   4816         status = U_ZERO_ERROR;
   4817         cbInfo.reset(4);
   4818         s = "aaaaaaaaaaaaaaaaaaab";
   4819         matcher.reset(s);
   4820         REGEX_ASSERT(matcher.matches(status)==FALSE);
   4821         REGEX_CHECK_STATUS;
   4822         REGEX_ASSERT(cbInfo.numCalls > 0);
   4823 
   4824         // A longer running match that the callback function will abort.
                //   Same limit as above, but a longer input; the callback is expected to be
                //   invoked exactly four times, the last of which returns FALSE.
   4825         status = U_ZERO_ERROR;
   4826         cbInfo.reset(4);
   4827         s = "aaaaaaaaaaaaaaaaaaaaaaab";
   4828         matcher.reset(s);
   4829         REGEX_ASSERT(matcher.matches(status)==FALSE);
   4830         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
   4831         REGEX_ASSERT(cbInfo.numCalls == 4);
   4832 
   4833         // A longer running find that the callback function will abort.
                //   Same input and limit as the previous case, but using find() instead of matches().
   4834         status = U_ZERO_ERROR;
   4835         cbInfo.reset(4);
   4836         s = "aaaaaaaaaaaaaaaaaaaaaaab";
   4837         matcher.reset(s);
   4838         REGEX_ASSERT(matcher.find(status)==FALSE);
   4839         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
   4840         REGEX_ASSERT(cbInfo.numCalls == 4);
   4841     }
   4842 
   4843 
   4844 }
   4845 
   4846 
   4847 //
   4848 //   FindProgressCallbacks()    Test the find "progress" callback function.
   4849 //                  When set, the find progress callback will be invoked during find operations
   4850 //                  after each return from a match attempt, giving the application the opportunity
   4851 //                  to terminate a long-running find operation before its normal completion.
   4852 //
   4853 
   4854 struct progressCallBackContext {
   4855     RegexTest        *test;
   4856     int64_t          lastIndex;
   4857     int32_t          maxCalls;
   4858     int32_t          numCalls;
   4859     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
   4860 };
   4861 
   4862 // call-back function for find().
   4863 // Return TRUE to continue the find().
   4864 // Return FALSE to stop the find().
   4865 U_CDECL_BEGIN
   4866 static UBool U_CALLCONV
   4867 testProgressCallBackFn(const void *context, int64_t matchIndex) {
   4868     progressCallBackContext  *info = (progressCallBackContext *)context;
   4869     info->numCalls++;
   4870     info->lastIndex = matchIndex;
   4871 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
   4872     return (info->numCalls < info->maxCalls);
   4873 }
   4874 U_CDECL_END
   4875 
   4876 void RegexTest::FindProgressCallbacks() {
            // Exercises RegexMatcher::getFindProgressCallback() / setFindProgressCallback(),
            //   using testProgressCallBackFn as the callback and a progressCallBackContext
            //   as its context.  The first block checks the getter on a matcher that has no
            //   callback; the second installs the callback and runs a series of find()s.
            // testProgressCallBackFn returns FALSE on the invocation that brings numCalls
            //   up to maxCalls, so the argument of cbInfo.reset() is the number of
            //   invocations after which the callback asks for the find to stop.
   4877    {
   4878         // Getter returns NULLs if no callback has been set
   4879 
   4880         //   The variables that the getter will fill in.
   4881         //   Init to non-null values so that the action of the getter can be seen.
   4882         const void                  *returnedContext = &returnedContext;
   4883         URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
   4884 
   4885         UErrorCode status = U_ZERO_ERROR;
   4886         RegexMatcher matcher("x", 0, status);
   4887         REGEX_CHECK_STATUS;
   4888         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
   4889         REGEX_CHECK_STATUS;
   4890         REGEX_ASSERT(returnedFn == NULL);
   4891         REGEX_ASSERT(returnedContext == NULL);
   4892     }
   4893 
   4894    {
   4895         // Set and Get work
   4896         progressCallBackContext cbInfo = {this, 0, 0, 0};
   4897         const void                  *returnedContext;
   4898         URegexFindProgressCallback  *returnedFn;
   4899         UErrorCode status = U_ZERO_ERROR;
   4900         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
   4901         REGEX_CHECK_STATUS;
   4902         matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
   4903         REGEX_CHECK_STATUS;
   4904         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
   4905         REGEX_CHECK_STATUS;
   4906         REGEX_ASSERT(returnedFn == testProgressCallBackFn);
   4907         REGEX_ASSERT(returnedContext == &cbInfo);
   4908 
   4909         // A find that matches on the initial position does NOT invoke the callback.
   4910         status = U_ZERO_ERROR;
   4911         cbInfo.reset(100);
   4912         UnicodeString s = "aaxxx";
   4913         matcher.reset(s);
                // Disabled debugging aid: change to "#if 1" to turn on matcher tracing.
   4914 #if 0
   4915         matcher.setTrace(TRUE);
   4916 #endif
   4917         REGEX_ASSERT(matcher.find(0, status));
   4918         REGEX_CHECK_STATUS;
   4919         REGEX_ASSERT(cbInfo.numCalls == 0);
   4920 
   4921         // A medium running find() that causes matcher.find() to invoke our callback for each index,
   4922         //   but not so many times that we interrupt the operation.
   4923         status = U_ZERO_ERROR;
   4924         s = "aaaaaaaaaaaaaaaaaaab";
   4925         cbInfo.reset(s.length()); //  Limit on the number of calls, set to the length of our input string
   4926         matcher.reset(s);
   4927         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   4928         REGEX_CHECK_STATUS;
   4929         REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
   4930 
   4931         // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
   4932         status = U_ZERO_ERROR;
   4933         UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
   4934         cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
   4935         matcher.reset(s1);
   4936         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   4937         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
   4938         REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
   4939 
   4940         // Now a match that will succeed, but after an interruption
   4941         status = U_ZERO_ERROR;
   4942         UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
   4943         cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
   4944         matcher.reset(s2);
   4945         REGEX_ASSERT(matcher.find(0, status)==FALSE);
   4946         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
   4947         // Now retry the match from where left off
                //   (cbInfo.lastIndex is the index most recently passed to the callback).
   4948         cbInfo.maxCalls = 100; //  Raise the limit; numCalls is not reset, so the callback returns TRUE again
   4949         status = U_ZERO_ERROR;
   4950         REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
   4951         REGEX_CHECK_STATUS;
   4952     }
   4953 
   4954 
   4955 }
   4956 
   4957 
   4958 //---------------------------------------------------------------------------
   4959 //
   4960 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
   4961 //                             UTexts. The pure-C implementation of UText
   4962 //                             has no mutable backing stores, but we can
   4963 //                             use UnicodeString here to test the functionality.
   4964 //
   4965 //---------------------------------------------------------------------------
   4966 void RegexTest::PreAllocatedUTextCAPI () {
   4967     UErrorCode           status = U_ZERO_ERROR;
   4968     URegularExpression  *re;
   4969     UText                patternText = UTEXT_INITIALIZER;
   4970     UnicodeString        buffer;
   4971     UText                bufferText = UTEXT_INITIALIZER;
   4972 
   4973     utext_openUnicodeString(&bufferText, &buffer, &status);
   4974 
   4975     /*
   4976      *  getText() and getUText()
   4977      */
   4978     {
   4979         UText  text1 = UTEXT_INITIALIZER;
   4980         UText  text2 = UTEXT_INITIALIZER;
   4981         UChar  text2Chars[20];
   4982         UText  *resultText;
   4983 
   4984         status = U_ZERO_ERROR;
   4985         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
   4986         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
   4987         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
   4988         utext_openUChars(&text2, text2Chars, -1, &status);
   4989 
   4990         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
   4991         re = uregex_openUText(&patternText, 0, NULL, &status);
   4992 
   4993         /* First set a UText */
   4994         uregex_setUText(re, &text1, &status);
   4995         resultText = uregex_getUText(re, &bufferText, &status);
   4996         REGEX_CHECK_STATUS;
   4997         REGEX_ASSERT(resultText == &bufferText);
   4998         utext_setNativeIndex(resultText, 0);
   4999         utext_setNativeIndex(&text1, 0);
   5000         REGEX_ASSERT(testUTextEqual(resultText, &text1));
   5001 
   5002         resultText = uregex_getUText(re, &bufferText, &status);
   5003         REGEX_CHECK_STATUS;
   5004         REGEX_ASSERT(resultText == &bufferText);
   5005         utext_setNativeIndex(resultText, 0);
   5006         utext_setNativeIndex(&text1, 0);
   5007         REGEX_ASSERT(testUTextEqual(resultText, &text1));
   5008 
   5009         /* Then set a UChar * */
   5010         uregex_setText(re, text2Chars, 7, &status);
   5011         resultText = uregex_getUText(re, &bufferText, &status);
   5012         REGEX_CHECK_STATUS;
   5013         REGEX_ASSERT(resultText == &bufferText);
   5014         utext_setNativeIndex(resultText, 0);
   5015         utext_setNativeIndex(&text2, 0);
   5016         REGEX_ASSERT(testUTextEqual(resultText, &text2));
   5017 
   5018         uregex_close(re);
   5019         utext_close(&text1);
   5020         utext_close(&text2);
   5021     }
   5022 
   5023     /*
   5024      *  group()
   5025      */
   5026     {
   5027         UChar    text1[80];
   5028         UText   *actual;
   5029         UBool    result;
   5030         int64_t  length = 0;
   5031 
   5032         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  UPRV_LENGTHOF(text1));
   5033         //                  012345678901234567890123456789012345678901234567
   5034         //                  0         1         2         3         4
   5035 
   5036         status = U_ZERO_ERROR;
   5037         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
   5038         REGEX_CHECK_STATUS;
   5039 
   5040         uregex_setText(re, text1, -1, &status);
   5041         result = uregex_find(re, 0, &status);
   5042         REGEX_ASSERT(result==TRUE);
   5043 
   5044         /*  Capture Group 0, the full match.  Should succeed. "abc interior def" */
   5045         status = U_ZERO_ERROR;
   5046         actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
   5047         REGEX_CHECK_STATUS;
   5048         REGEX_ASSERT(actual == &bufferText);
   5049         REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
   5050         REGEX_ASSERT(length == 16);
   5051         REGEX_ASSERT(utext_nativeLength(actual) == 47);
   5052 
   5053         /*  Capture group #1.  Should succeed, matching " interior ". */
   5054         status = U_ZERO_ERROR;
   5055         actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
   5056         REGEX_CHECK_STATUS;
   5057         REGEX_ASSERT(actual == &bufferText);
   5058         REGEX_ASSERT(utext_getNativeIndex(actual) == 9);   // position of " interior "
   5059         REGEX_ASSERT(length == 10);
   5060         REGEX_ASSERT(utext_nativeLength(actual) == 47);
   5061 
   5062         /*  Capture group out of range.  Error. */
   5063         status = U_ZERO_ERROR;
   5064         actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
   5065         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   5066         REGEX_ASSERT(actual == &bufferText);
   5067         uregex_close(re);
   5068 
   5069     }
   5070 
   5071     /*
   5072      *  replaceFirst()
   5073      */
   5074     {
   5075         UChar    text1[80];
   5076         UChar    text2[80];
   5077         UText    replText = UTEXT_INITIALIZER;
   5078         UText   *result;
   5079         status = U_ZERO_ERROR;
   5080         utext_openUnicodeString(&bufferText, &buffer, &status);
   5081 
   5082         status = U_ZERO_ERROR;
   5083         u_uastrncpy(text1, "Replace xaax x1x x...x.",  UPRV_LENGTHOF(text1));
   5084         u_uastrncpy(text2, "No match here.",  UPRV_LENGTHOF(text2)/2);
   5085         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
   5086 
   5087         re = uregex_openC("x(.*?)x", 0, NULL, &status);
   5088         REGEX_CHECK_STATUS;
   5089 
   5090         /*  Normal case, with match */
   5091         uregex_setText(re, text1, -1, &status);
   5092         REGEX_CHECK_STATUS;
   5093         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5094         REGEX_CHECK_STATUS;
   5095         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5096         REGEX_CHECK_STATUS;
   5097         REGEX_ASSERT(result == &bufferText);
   5098         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
   5099 
   5100         /* No match.  Text should copy to output with no changes.  */
   5101         uregex_setText(re, text2, -1, &status);
   5102         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5103         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5104         REGEX_CHECK_STATUS;
   5105         REGEX_ASSERT(result == &bufferText);
   5106         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
   5107 
   5108         /* Unicode escapes */
   5109         uregex_setText(re, text1, -1, &status);
   5110         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
   5111         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5112         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
   5113         REGEX_CHECK_STATUS;
   5114         REGEX_ASSERT(result == &bufferText);
   5115         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
   5116 
   5117         uregex_close(re);
   5118         utext_close(&replText);
   5119     }
   5120 
   5121 
   5122     /*
   5123      *  replaceAll()
   5124      */
   5125     {
   5126         UChar    text1[80];
   5127         UChar    text2[80];
   5128         UText    replText = UTEXT_INITIALIZER;
   5129         UText   *result;
   5130 
   5131         status = U_ZERO_ERROR;
   5132         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
   5133         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
   5134         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
   5135 
   5136         re = uregex_openC("x(.*?)x", 0, NULL, &status);
   5137         REGEX_CHECK_STATUS;
   5138 
   5139         /*  Normal case, with match */
   5140         uregex_setText(re, text1, -1, &status);
   5141         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5142         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
   5143         REGEX_CHECK_STATUS;
   5144         REGEX_ASSERT(result == &bufferText);
   5145         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
   5146 
   5147         /* No match.  Text should copy to output with no changes.  */
   5148         uregex_setText(re, text2, -1, &status);
   5149         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
   5150         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
   5151         REGEX_CHECK_STATUS;
   5152         REGEX_ASSERT(result == &bufferText);
   5153         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
   5154 
   5155         uregex_close(re);
   5156         utext_close(&replText);
   5157     }
   5158 
   5159 
   5160     /*
   5161      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
   5162      *   so we don't need to test it here.
   5163      */
   5164 
   5165     utext_close(&bufferText);
   5166     utext_close(&patternText);
   5167 }
   5168 
   5169 
   5170 //--------------------------------------------------------------
   5171 //
   5172 //  NamedCapture   Check basic named capture group functionality
   5173 //
   5174 //--------------------------------------------------------------
   5175 void RegexTest::NamedCapture() {
   5176     UErrorCode status = U_ZERO_ERROR;
   5177     RegexPattern *pat = RegexPattern::compile(UnicodeString(
   5178             "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
   5179     REGEX_CHECK_STATUS;
   5180     int32_t group = pat->groupNumberFromName("five", -1, status);
   5181     REGEX_CHECK_STATUS;
   5182     REGEX_ASSERT(5 == group);
   5183     group = pat->groupNumberFromName("three", -1, status);
   5184     REGEX_CHECK_STATUS;
   5185     REGEX_ASSERT(3 == group);
   5186 
   5187     status = U_ZERO_ERROR;
   5188     group = pat->groupNumberFromName(UnicodeString("six"), status);
   5189     REGEX_CHECK_STATUS;
   5190     REGEX_ASSERT(6 == group);
   5191 
   5192     status = U_ZERO_ERROR;
   5193     group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
   5194     U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5195 
   5196     status = U_ZERO_ERROR;
   5197 
   5198     // After copying a pattern, named capture should still work in the copy.
   5199     RegexPattern *copiedPat = new RegexPattern(*pat);
   5200     REGEX_ASSERT(*copiedPat == *pat);
   5201     delete pat; pat = NULL;  // Delete original, copy should have no references back to it.
   5202 
   5203     group = copiedPat->groupNumberFromName("five", -1, status);
   5204     REGEX_CHECK_STATUS;
   5205     REGEX_ASSERT(5 == group);
   5206     group = copiedPat->groupNumberFromName("three", -1, status);
   5207     REGEX_CHECK_STATUS;
   5208     REGEX_ASSERT(3 == group);
   5209     delete copiedPat;
   5210 
   5211     // ReplaceAll with named capture group.
   5212     status = U_ZERO_ERROR;
   5213     UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
   5214     RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
   5215     REGEX_CHECK_STATUS;
   5216     // m.pattern().dumpPattern();
   5217     UnicodeString replacedText = m->replaceAll("'${mid}'", status);
   5218     REGEX_CHECK_STATUS;
   5219     REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
   5220     delete m;
   5221 
   5222     // ReplaceAll, allowed capture group numbers.
   5223     text = UnicodeString("abcmxyz");
   5224     m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
   5225     REGEX_CHECK_STATUS;
   5226 
   5227     status = U_ZERO_ERROR;
   5228     replacedText  = m->replaceAll(UnicodeString("<$0>"), status);   // group 0, full match, is allowed.
   5229     REGEX_CHECK_STATUS;
   5230     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
   5231 
   5232     status = U_ZERO_ERROR;
   5233     replacedText  = m->replaceAll(UnicodeString("<$1>"), status);      // group 1 by number.
   5234     REGEX_CHECK_STATUS;
   5235     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
   5236 
   5237     status = U_ZERO_ERROR;
   5238     replacedText  = m->replaceAll(UnicodeString("<${one}>"), status);   // group 1 by name.
   5239     REGEX_CHECK_STATUS;
   5240     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
   5241 
   5242     status = U_ZERO_ERROR;
   5243     replacedText  = m->replaceAll(UnicodeString("<$2>"), status);   // group 2.
   5244     REGEX_CHECK_STATUS;
   5245     REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
   5246 
   5247     status = U_ZERO_ERROR;
   5248     replacedText  = m->replaceAll(UnicodeString("<$3>"), status);
   5249     REGEX_CHECK_STATUS;
   5250     REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
   5251 
   5252     status = U_ZERO_ERROR;
   5253     replacedText  = m->replaceAll(UnicodeString("<$4>"), status);
   5254     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   5255 
   5256     status = U_ZERO_ERROR;
   5257     replacedText  = m->replaceAll(UnicodeString("<$04>"), status);      // group 0, leading 0,
   5258     REGEX_CHECK_STATUS;                                                 //    trailing out-of-range 4 passes through.
   5259     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
   5260 
   5261     status = U_ZERO_ERROR;
   5262     replacedText  = m->replaceAll(UnicodeString("<$000016>"), status);  // Consume leading zeroes. Don't consume digits
   5263     REGEX_CHECK_STATUS;                                                 //   that push group num out of range.
   5264     REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText);              //   This is group 1.
   5265 
   5266     status = U_ZERO_ERROR;
   5267     replacedText  = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
   5268     REGEX_CHECK_STATUS;
   5269     REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
   5270 
   5271     status = U_ZERO_ERROR;
   5272     replacedText  = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
   5273     REGEX_CHECK_STATUS;
   5274     REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
   5275 
   5276     status = U_ZERO_ERROR;
   5277     replacedText  = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
   5278     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5279 
   5280     status = U_ZERO_ERROR;
   5281     replacedText  = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
   5282     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5283 
   5284     status = U_ZERO_ERROR;
   5285     replacedText  = m->replaceAll(UnicodeString("<${one"), status);
   5286     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5287 
   5288     status = U_ZERO_ERROR;
   5289     replacedText  = m->replaceAll(UnicodeString("$not a capture group"), status);
   5290     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5291 
   5292     delete m;
   5293 
   5294     // Repeat the above replaceAll() tests using the plain C API, which
   5295     //  has a separate implementation internally.
   5296     //  TODO: factor out the test data.
   5297 
   5298     status = U_ZERO_ERROR;
   5299     URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
   5300     REGEX_CHECK_STATUS;
   5301     text = UnicodeString("abcmxyz");
   5302     uregex_setText(re, text.getBuffer(), text.length(), &status);
   5303     REGEX_CHECK_STATUS;
   5304 
   5305     UChar resultBuf[100];
   5306     int32_t resultLength;
   5307     UnicodeString repl;
   5308 
   5309     status = U_ZERO_ERROR;
   5310     repl = UnicodeString("<$0>");
   5311     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5312     REGEX_CHECK_STATUS;
   5313     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
   5314 
   5315     status = U_ZERO_ERROR;
   5316     repl = UnicodeString("<$1>");
   5317     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5318     REGEX_CHECK_STATUS;
   5319     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
   5320 
   5321     status = U_ZERO_ERROR;
   5322     repl = UnicodeString("<${one}>");
   5323     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5324     REGEX_CHECK_STATUS;
   5325     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
   5326 
   5327     status = U_ZERO_ERROR;
   5328     repl = UnicodeString("<$2>");
   5329     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5330     REGEX_CHECK_STATUS;
   5331     REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
   5332 
   5333     status = U_ZERO_ERROR;
   5334     repl = UnicodeString("<$3>");
   5335     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5336     REGEX_CHECK_STATUS;
   5337     REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
   5338 
   5339     status = U_ZERO_ERROR;
   5340     repl = UnicodeString("<$4>");
   5341     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5342     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
   5343 
   5344     status = U_ZERO_ERROR;
   5345     repl = UnicodeString("<$04>");
   5346     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5347     REGEX_CHECK_STATUS;
   5348     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
   5349 
   5350     status = U_ZERO_ERROR;
   5351     repl = UnicodeString("<$000016>");
   5352     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5353     REGEX_CHECK_STATUS;
   5354     REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
   5355 
   5356     status = U_ZERO_ERROR;
   5357     repl = UnicodeString("<$3$2$1${one}>");
   5358     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5359     REGEX_CHECK_STATUS;
   5360     REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
   5361 
   5362     status = U_ZERO_ERROR;
   5363     repl = UnicodeString("$3$2$1${one}");
   5364     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5365     REGEX_CHECK_STATUS;
   5366     REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
   5367 
   5368     status = U_ZERO_ERROR;
   5369     repl = UnicodeString("<${noSuchName}>");
   5370     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5371     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5372 
   5373     status = U_ZERO_ERROR;
   5374     repl = UnicodeString("<${invalid-name}>");
   5375     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5376     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5377 
   5378     status = U_ZERO_ERROR;
   5379     repl = UnicodeString("<${one");
   5380     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5381     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5382 
   5383     status = U_ZERO_ERROR;
   5384     repl = UnicodeString("$not a capture group");
   5385     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
   5386     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
   5387 
   5388     uregex_close(re);
   5389 }
   5390 
   5391 //--------------------------------------------------------------
   5392 //
   5393 //  NamedCaptureLimits   Patterns with huge numbers of named capture groups.
   5394 //                       The point is not so much what the exact limit is,
   5395 //                       but that a largish number doesn't hit bad non-linear performance,
   5396 //                       and that exceeding the limit fails cleanly.
   5397 //
   5398 //--------------------------------------------------------------
   5399 void RegexTest::NamedCaptureLimits() {
   5400     if (quick) {
   5401         logln("Skipping test. Runs in exhuastive mode only.");
   5402         return;
   5403     }
   5404     const int32_t goodLimit = 1000000;     // Pattern w this many groups builds successfully.
   5405     const int32_t failLimit = 10000000;    // Pattern exceeds internal limits, fails to compile.
   5406     char nnbuf[100];
   5407     UnicodeString pattern;
   5408     int32_t nn;
   5409 
   5410     for (nn=1; nn<goodLimit; nn++) {
   5411         sprintf(nnbuf, "(?<nn%d>)", nn);
   5412         pattern.append(UnicodeString(nnbuf, -1, US_INV));
   5413     }
   5414     UErrorCode status = U_ZERO_ERROR;
   5415     RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
   5416     REGEX_CHECK_STATUS;
   5417     for (nn=1; nn<goodLimit; nn++) {
   5418         sprintf(nnbuf, "nn%d", nn);
   5419         int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
   5420         REGEX_ASSERT(nn == groupNum);
   5421         if (nn != groupNum) {
   5422             break;
   5423         }
   5424     }
   5425     delete pat;
   5426 
   5427     pattern.remove();
   5428     for (nn=1; nn<failLimit; nn++) {
   5429         sprintf(nnbuf, "(?<nn%d>)", nn);
   5430         pattern.append(UnicodeString(nnbuf, -1, US_INV));
   5431     }
   5432     status = U_ZERO_ERROR;
   5433     pat = RegexPattern::compile(pattern, 0, status);
   5434     REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
   5435     delete pat;
   5436 }
   5437 
   5438 
   5439 //--------------------------------------------------------------
   5440 //
   5441 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
   5442 //
   5443 //---------------------------------------------------------------
   5444 void RegexTest::Bug7651() {
   5445     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
   5446     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
   5447     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
   5448     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
   5449     UnicodeString s("#ff @abcd This is test");
   5450     RegexPattern  *REPattern = NULL;
   5451     RegexMatcher  *REMatcher = NULL;
   5452     UErrorCode status = U_ZERO_ERROR;
   5453     UParseError pe;
   5454 
   5455     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
   5456     REGEX_CHECK_STATUS;
   5457     REMatcher = REPattern->matcher(s, status);
   5458     REGEX_CHECK_STATUS;
   5459     REGEX_ASSERT(REMatcher->find());
   5460     REGEX_ASSERT(REMatcher->start(status) == 0);
   5461     delete REPattern;
   5462     delete REMatcher;
   5463     status = U_ZERO_ERROR;
   5464 
   5465     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
   5466     REGEX_CHECK_STATUS;
   5467     REMatcher = REPattern->matcher(s, status);
   5468     REGEX_CHECK_STATUS;
   5469     REGEX_ASSERT(REMatcher->find());
   5470     REGEX_ASSERT(REMatcher->start(status) == 0);
   5471     delete REPattern;
   5472     delete REMatcher;
   5473     status = U_ZERO_ERROR;
   5474  }
   5475 
   5476 void RegexTest::Bug7740() {
   5477     UErrorCode status = U_ZERO_ERROR;
   5478     UnicodeString pattern = "(a)";
   5479     UnicodeString text = "abcdef";
   5480     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
   5481     REGEX_CHECK_STATUS;
   5482     REGEX_ASSERT(m->lookingAt(status));
   5483     REGEX_CHECK_STATUS;
   5484     status = U_ILLEGAL_ARGUMENT_ERROR;
   5485     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
   5486     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   5487     REGEX_ASSERT(s == "");
   5488     delete m;
   5489 }
   5490 
   5491 // Bug 8479:  was crashing whith a Bogus UnicodeString as input.
   5492 
   5493 void RegexTest::Bug8479() {
   5494     UErrorCode status = U_ZERO_ERROR;
   5495 
   5496     RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
   5497     REGEX_CHECK_STATUS;
   5498     if (U_SUCCESS(status))
   5499     {
   5500         UnicodeString str;
   5501         str.setToBogus();
   5502         pMatcher->reset(str);
   5503         status = U_ZERO_ERROR;
   5504         pMatcher->matches(status);
   5505         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
   5506         delete pMatcher;
   5507     }
   5508 }
   5509 
   5510 
   5511 // Bug 7029
   5512 void RegexTest::Bug7029() {
   5513     UErrorCode status = U_ZERO_ERROR;
   5514 
   5515     RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
   5516     UnicodeString text = "abc.def";
   5517     UnicodeString splits[10];
   5518     REGEX_CHECK_STATUS;
   5519     int32_t numFields = pMatcher->split(text, splits, 10, status);
   5520     REGEX_CHECK_STATUS;
   5521     REGEX_ASSERT(numFields == 8);
   5522     delete pMatcher;
   5523 }
   5524 
   5525 // Bug 9283
   5526 //   This test is checking for the existance of any supplemental characters that case-fold
   5527 //   to a bmp character.
   5528 //
   5529 //   At the time of this writing there are none. If any should appear in a subsequent release
   5530 //   of Unicode, the code in regular expressions compilation that determines the longest
   5531 //   posssible match for a literal string  will need to be enhanced.
   5532 //
   5533 //   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
   5534 //   for details on what to do in case of a failure of this test.
   5535 //
   5536 void RegexTest::Bug9283() {
   5537 #if !UCONFIG_NO_NORMALIZATION
   5538     UErrorCode status = U_ZERO_ERROR;
   5539     UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
   5540     REGEX_CHECK_STATUS;
   5541     int32_t index;
   5542     UChar32 c;
   5543     for (index=0; ; index++) {
   5544         c = supplementalsWithCaseFolding.charAt(index);
   5545         if (c == -1) {
   5546             break;
   5547         }
   5548         UnicodeString cf = UnicodeString(c).foldCase();
   5549         REGEX_ASSERT(cf.length() >= 2);
   5550     }
   5551 #endif /* #if !UCONFIG_NO_NORMALIZATION */
   5552 }
   5553 
   5554 
   5555 void RegexTest::CheckInvBufSize() {
   5556   if(inv_next>=INV_BUFSIZ) {
   5557     errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
   5558           __FILE__, INV_BUFSIZ, inv_next);
   5559   } else {
   5560     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
   5561   }
   5562 }
   5563 
   5564 
   5565 void RegexTest::Bug10459() {
   5566     UErrorCode status = U_ZERO_ERROR;
   5567     UnicodeString patternString("(txt)");
   5568     UnicodeString txtString("txt");
   5569 
   5570     UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
   5571     REGEX_CHECK_STATUS;
   5572     UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
   5573     REGEX_CHECK_STATUS;
   5574 
   5575     URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
   5576     REGEX_CHECK_STATUS;
   5577 
   5578     uregex_setUText(icu_re, utext_txt, &status);
   5579     REGEX_CHECK_STATUS;
   5580 
   5581     // The bug was that calling uregex_group() before doing a matching operation
   5582     //   was causing a segfault. Only for Regular Expressions created from UText.
   5583     //   It should set an U_REGEX_INVALID_STATE.
   5584 
   5585     UChar buf[100];
   5586     int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
   5587     REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
   5588     REGEX_ASSERT(len == 0);
   5589 
   5590     uregex_close(icu_re);
   5591     utext_close(utext_pat);
   5592     utext_close(utext_txt);
   5593 }
   5594 
   5595 void RegexTest::TestCaseInsensitiveStarters() {
   5596     // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
   5597     //  become stale because of new Unicode characters.
   5598     // If it is stale, rerun the generation tool
   5599     //    svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
   5600     // and replace the embedded data in i18n/regexcmp.cpp
   5601 
   5602     for (UChar32 cp=0; cp<=0x10ffff; cp++) {
   5603         if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
   5604             continue;
   5605         }
   5606         UnicodeSet s(cp, cp);
   5607         s.closeOver(USET_CASE_INSENSITIVE);
   5608         UnicodeSetIterator setIter(s);
   5609         while (setIter.next()) {
   5610             if (!setIter.isString()) {
   5611                 continue;
   5612             }
   5613             const UnicodeString &str = setIter.getString();
   5614             UChar32 firstChar = str.char32At(0);
   5615             UnicodeSet starters;
   5616             RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
   5617             if (!starters.contains(cp)) {
   5618                 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
   5619                 return;
   5620             }
   5621         }
   5622     }
   5623 }
   5624 
   5625 
   5626 void RegexTest::TestBug11049() {
   5627     // Original bug report: pattern with match start consisting of one of several individual characters,
   5628     //  and the text being matched ending with a supplementary character. find() would read past the
   5629     //  end of the input text when searching for potential match starting points.
   5630 
   5631     // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
   5632     // detect the bad read.
   5633 
   5634     TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
   5635     TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
   5636 
   5637     // Test again with a pattern starting with a single character,
   5638     // which takes a different code path than starting with an OR expression,
   5639     // but with similar logic.
   5640     TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
   5641     TestCase11049("C", "string matches at end C", TRUE, __LINE__);
   5642 }
   5643 
   5644 // Run a single test case from TestBug11049(). Internal function.
   5645 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
   5646     UErrorCode status = U_ZERO_ERROR;
   5647     UnicodeString patternString = UnicodeString(pattern).unescape();
   5648     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
   5649 
   5650     UnicodeString dataString = UnicodeString(data).unescape();
   5651     UChar *exactBuffer = new UChar[dataString.length()];
   5652     dataString.extract(exactBuffer, dataString.length(), status);
   5653     UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
   5654 
   5655     LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
   5656     REGEX_CHECK_STATUS;
   5657     matcher->reset(ut);
   5658     UBool result = matcher->find();
   5659     if (result != expectMatch) {
   5660         errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
   5661               __FILE__, lineNumber, expectMatch, result, pattern, data);
   5662     }
   5663 
   5664     // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
   5665     //   off-by-one on find() with match at the last code point.
   5666     //   Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
   5667     //   because string.unescape() will only shrink it.
   5668     char * utf8Buffer = new char[uprv_strlen(data)+1];
   5669     u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
   5670     REGEX_CHECK_STATUS;
   5671     ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
   5672     REGEX_CHECK_STATUS;
   5673     matcher->reset(ut);
   5674     result = matcher->find();
   5675     if (result != expectMatch) {
   5676         errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
   5677               __FILE__, lineNumber, expectMatch, result, pattern, data);
   5678     }
   5679     delete [] utf8Buffer;
   5680 
   5681     utext_close(ut);
   5682     delete [] exactBuffer;
   5683 }
   5684 
   5685 
   5686 void RegexTest::TestBug11371() {
   5687     if (quick) {
   5688         logln("Skipping test. Runs in exhuastive mode only.");
   5689         return;
   5690     }
   5691     UErrorCode status = U_ZERO_ERROR;
   5692     UnicodeString patternString;
   5693 
   5694     for (int i=0; i<8000000; i++) {
   5695         patternString.append(UnicodeString("()"));
   5696     }
   5697     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
   5698     if (status != U_REGEX_PATTERN_TOO_BIG) {
   5699         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
   5700               __FILE__, __LINE__, u_errorName(status));
   5701     }
   5702 
   5703     status = U_ZERO_ERROR;
   5704     patternString = "(";
   5705     for (int i=0; i<20000000; i++) {
   5706         patternString.append(UnicodeString("A++"));
   5707     }
   5708     patternString.append(UnicodeString("){0}B++"));
   5709     LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
   5710     if (status != U_REGEX_PATTERN_TOO_BIG) {
   5711         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
   5712               __FILE__, __LINE__, u_errorName(status));
   5713     }
   5714 
   5715     // Pattern with too much string data, such that string indexes overflow operand data field size
   5716     // in compiled instruction.
   5717     status = U_ZERO_ERROR;
   5718     patternString = "";
   5719     while (patternString.length() < 0x00ffffff) {
   5720         patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
   5721     }
   5722     patternString.append(UnicodeString("X? trailing string"));
   5723     LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
   5724     if (status != U_REGEX_PATTERN_TOO_BIG) {
   5725         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
   5726               __FILE__, __LINE__, u_errorName(status));
   5727     }
   5728 }
   5729 
   5730 void RegexTest::TestBug11480() {
   5731     // C API, get capture group of a group that does not participate in the match.
   5732     //        (Returns a zero length string, with nul termination,
   5733     //         indistinguishable from a group with a zero length match.)
   5734 
   5735     UErrorCode status = U_ZERO_ERROR;
   5736     URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
   5737     REGEX_CHECK_STATUS;
   5738     UnicodeString text = UNICODE_STRING_SIMPLE("A");
   5739     uregex_setText(re, text.getBuffer(), text.length(), &status);
   5740     REGEX_CHECK_STATUS;
   5741     REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
   5742     UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
   5743     int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
   5744     REGEX_ASSERT(length == 0);
   5745     REGEX_ASSERT(buf[0] == 13);
   5746     REGEX_ASSERT(buf[1] == 0);
   5747     REGEX_ASSERT(buf[2] == 13);
   5748     uregex_close(re);
   5749 
   5750     // UText C++ API, length of match is 0 for non-participating matches.
   5751     UText ut = UTEXT_INITIALIZER;
   5752     utext_openUnicodeString(&ut, &text, &status);
   5753     RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
   5754     REGEX_CHECK_STATUS;
   5755     matcher.reset(&ut);
   5756     REGEX_ASSERT(matcher.lookingAt(0, status));
   5757 
   5758     // UText C++ API, Capture group 1 matches "A", position 0, length 1.
   5759     int64_t groupLen = -666;
   5760     UText group = UTEXT_INITIALIZER;
   5761     matcher.group(1, &group, groupLen, status);
   5762     REGEX_CHECK_STATUS;
   5763     REGEX_ASSERT(groupLen == 1);
   5764     REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
   5765 
   5766     // Capture group 2, the (B), does not participate in the match.
   5767     matcher.group(2, &group, groupLen, status);
   5768     REGEX_CHECK_STATUS;
   5769     REGEX_ASSERT(groupLen == 0);
   5770     REGEX_ASSERT(matcher.start(2, status) == -1);
   5771     REGEX_CHECK_STATUS;
   5772 }
   5773 
   5774 void RegexTest::TestBug12884() {
   5775     // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
   5776     UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}");
   5777     UnicodeString text(u"hello");
   5778     UErrorCode status = U_ZERO_ERROR;
   5779     RegexMatcher m(pattern, text, 0, status);
   5780     REGEX_CHECK_STATUS;
   5781     m.setTimeLimit(5, status);
   5782     m.find(status);
   5783     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
   5784 
   5785     // Non-greedy loops. They take a different code path during matching.
   5786     UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
   5787     status = U_ZERO_ERROR;
   5788     RegexMatcher ngM(ngPattern, text, 0, status);
   5789     REGEX_CHECK_STATUS;
   5790     ngM.setTimeLimit(5, status);
   5791     ngM.find(status);
   5792     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
   5793 
   5794     // UText, wrapping non-UTF-16 text, also takes a different execution path.
   5795     const char *text8 = u8"Qu es Unicode?  Unicode proporciona un nmero nico para cada"
   5796                           "carcter, sin importar la plataforma, sin importar el programa,"
   5797                           "sin importar el idioma.";
   5798     status = U_ZERO_ERROR;
   5799     LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status));
   5800     REGEX_CHECK_STATUS;
   5801     m.reset(ut.getAlias());
   5802     m.find(status);
   5803     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
   5804 
   5805     status = U_ZERO_ERROR;
   5806     ngM.reset(ut.getAlias());
   5807     ngM.find(status);
   5808     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
   5809 }
   5810 
   5811 // Bug 13631. A find() of a pattern with a zero length look-behind assertions
   5812 //            can cause a read past the end of the input text.
   5813 //            The failure is seen when running this test with Clang's Addresss Sanitizer.
   5814 
   5815 void RegexTest::TestBug13631() {
   5816     const UChar *pats[] = { u"(?<!^)",
   5817                             u"(?<=^)",
   5818                             nullptr
   5819                           };
   5820     for (const UChar **pat=pats; *pat; ++pat) {
   5821         UErrorCode status = U_ZERO_ERROR;
   5822         UnicodeString upat(*pat);
   5823         RegexMatcher matcher(upat, 0, status);
   5824         const UChar s =u'a';
   5825         UText *ut = utext_openUChars(nullptr, &s, 1, &status);
   5826         REGEX_CHECK_STATUS;
   5827         matcher.reset(ut);
   5828         while (matcher.find()) {
   5829         }
   5830         utext_close(ut);
   5831     }
   5832 }
   5833 
   5834 
   5835 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
   5836